├── test
│   ├── __init__.py
│   ├── test_group.py
│   ├── test_base.py
│   ├── test_summarize.py
│   ├── test_transform.py
│   ├── test_vector.py
│   ├── test_join.py
│   ├── test_subset.py
│   ├── test_reshape.py
│   ├── test_window_functions.py
│   ├── test_select.py
│   └── test_summary_functions.py
├── dfply.egg-info
│   ├── dependency_links.txt
│   ├── requires.txt
│   ├── top_level.txt
│   ├── PKG-INFO
│   └── SOURCES.txt
├── requirements.txt
├── MANIFEST.in
├── .travis.yml~
├── dfply
│   ├── data
│   │   └── __init__.py
│   ├── group.py
│   ├── __init__.py
│   ├── summarize.py
│   ├── subset.py
│   ├── transform.py
│   ├── summary_functions.py
│   ├── select.py
│   ├── window_functions.py
│   ├── set_ops.py
│   ├── vector.py
│   ├── join.py
│   ├── base.py
│   └── reshape.py
├── .gitignore
├── setup.py
├── .travis.yml
├── RELEASES.txt
└── examples
    ├── basics-extending-functionality.ipynb
    └── .ipynb_checkpoints
        └── basics-extending-functionality-checkpoint.ipynb
/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/dfply.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/dfply.egg-info/requires.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pandas
3 |
--------------------------------------------------------------------------------
/dfply.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | dfply
2 | test
3 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.11.1
2 | pandas>=0.18.1
3 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # Include the license file
2 | include LICENSE.md
3 |
--------------------------------------------------------------------------------
/.travis.yml~:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - 2.6
4 | - 2.7
5 | install:
6 | - pip install .
7 | - pip install -r requirements.txt
8 | script: python -m pytest test/
9 |
--------------------------------------------------------------------------------
/dfply/data/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 |
4 | root = os.path.abspath(os.path.dirname(__file__))
5 | diamonds = pd.read_csv(os.path.join(root, "diamonds.csv"))
6 |
--------------------------------------------------------------------------------
/dfply/group.py:
--------------------------------------------------------------------------------
1 | from .base import *
2 |
3 |
4 | @pipe
5 | @symbolic_evaluation(eval_as_label=True)
6 | def group_by(df, *args):
7 | df._grouped_by = list(args)
8 | return df
9 |
10 |
11 | @pipe
12 | def ungroup(df):
13 | df._grouped_by = None
14 | return df
15 |
--------------------------------------------------------------------------------
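A minimal usage sketch of the grouping verbs above, mirroring the patterns in test_group.py and test_subset.py (`diamonds` and the `X` symbol are exported by `dfply/__init__.py`):

```python
from dfply import *

# group_by tags the DataFrame; subsequent piped verbs run once per group.
by_cut = diamonds >> group_by(X.cut) >> head(2)   # first 2 rows of each cut

# ungroup clears the grouping so later verbs see the whole DataFrame again.
flat = diamonds >> group_by('cut') >> ungroup() >> head(2)
```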
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # Unit test / coverage reports
7 | .cache
8 |
9 | # Annoying Mac File
10 | .DS_Store
11 |
12 | # workbook test files
13 | test/feature_workbook.ipynb
14 | test/.ipynb_checkpoints/*
15 | test/worksheet.py
16 |
17 | # distribution
18 | dist
19 | build
20 |
21 | # egg_info
22 | dfply.egg-info
23 |
--------------------------------------------------------------------------------
/test/test_group.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from dfply import *
4 |
5 | ##==============================================================================
6 | ## grouping test functions
7 | ##==============================================================================
8 |
9 |
10 | def test_group_attributes():
11 | d = diamonds >> group_by('cut')
12 | assert hasattr(d, '_grouped_by')
13 | assert d._grouped_by == ['cut',]
14 |
--------------------------------------------------------------------------------
/dfply.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 1.0
2 | Name: dfply
3 | Version: 0.3.0
4 | Summary: dplyr-style piping operations for pandas dataframes
5 | Home-page: https://github.com/kieferk/dfply
6 | Author: Kiefer Katovich
7 | Author-email: kiefer.katovich@gmail.com
8 | License: GNU General Public License v3.0
9 | Description: See https://github.com/kieferk/dfply/blob/master/README.md for details.
10 | Keywords: pandas dplyr
11 | Platform: UNKNOWN
12 |
--------------------------------------------------------------------------------
/dfply/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import *
2 | from .group import *
3 | from .join import *
4 | from .reshape import *
5 | from .select import *
6 | from .set_ops import *
7 | from .subset import *
8 | from .summarize import *
9 | from .transform import *
10 | from .data import diamonds
11 | from .summary_functions import *
12 | from .window_functions import *
13 | from .vector import *
14 |
15 | # Create British-spelling aliases for verbs, e.g. summarize -> summarise.
16 | for verb in dir():
17 |     if 'ize' in verb:
18 |         exec(verb.replace('ize', 'ise') + '=' + verb)
19 |
--------------------------------------------------------------------------------
/test/test_base.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from dfply import *
4 |
5 |
6 | ##==============================================================================
7 | ## pipe tests
8 | ##==============================================================================
9 |
10 | @dfpipe
11 | def blank_function(df):
12 | return df
13 |
14 |
15 | def test_pipe():
16 | d = diamonds >> blank_function()
17 | assert diamonds.equals(d)
18 | d = diamonds >> blank_function() >> blank_function()
19 | assert diamonds.equals(d)
20 |
21 |
22 | def test_inplace_pipe():
23 | df = diamonds[['price','carat']].head(5)
24 | d = diamonds.copy()
25 | d >>= select(X.price, X.carat) >> head(5)
26 | print(df)
27 | print(d)
28 | assert df.equals(d)
29 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | setup(
4 | name = 'dfply',
5 | version = '0.3.3',
6 | author = 'Kiefer Katovich',
7 | author_email = 'kiefer.katovich@gmail.com',
8 | keywords = 'pandas dplyr',
9 | packages = find_packages(),
10 | include_package_data=True,
11 | package_data={'dfply': ['data/diamonds.csv']},
12 | package_dir={'dfply':'dfply'},
13 | install_requires=['numpy', 'pandas'],
14 | description = 'dplyr-style piping operations for pandas dataframes',
15 | long_description = 'See https://github.com/kieferk/dfply/blob/master/README.md for details.',
16 | license = 'GNU General Public License v3.0',
17 | url = 'https://github.com/kieferk/dfply',
18 | test_suite='test',
19 | )
20 |
--------------------------------------------------------------------------------
/dfply/summarize.py:
--------------------------------------------------------------------------------
1 | from .base import *
2 |
3 |
4 | @dfpipe
5 | def summarize(df, **kwargs):
6 | return pd.DataFrame({k: [v] for k, v in kwargs.items()})
7 |
8 |
9 | @dfpipe
10 | def summarize_each(df, functions, *args):
11 | columns, values = [], []
12 | for arg in args:
13 | if isinstance(arg, pd.Series):
14 | varname = arg.name
15 | col = arg
16 | elif isinstance(arg, str):
17 | varname = arg
18 | col = df[varname]
19 | elif isinstance(arg, int):
20 | varname = df.columns[arg]
21 | col = df.iloc[:, arg]
22 |
23 | for f in functions:
24 | fname = f.__name__
25 | columns.append('_'.join([varname, fname]))
26 | values.append(f(col))
27 |
28 | return pd.DataFrame([values], columns=columns)
29 |
--------------------------------------------------------------------------------
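A short sketch of the two verbs above, as exercised in test_summarize.py:

```python
import numpy as np
from dfply import *

# One row of aggregates; with a preceding group_by, one row per group.
means = diamonds >> group_by(X.cut) >> summarize(price_mean=X.price.mean())

# Apply every function to every listed column; columns may be given as a
# symbolic series, a string label, or an integer position.
stats = diamonds >> summarize_each([np.mean, np.var], X.price, 'depth')
```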
/dfply.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | LICENSE.md
2 | MANIFEST.in
3 | setup.py
4 | dfply/__init__.py
5 | dfply/base.py
6 | dfply/group.py
7 | dfply/join.py
8 | dfply/reshape.py
9 | dfply/select.py
10 | dfply/set_ops.py
11 | dfply/subset.py
12 | dfply/summarize.py
13 | dfply/summary_functions.py
14 | dfply/transform.py
15 | dfply/vector.py
16 | dfply/window_functions.py
17 | dfply.egg-info/PKG-INFO
18 | dfply.egg-info/SOURCES.txt
19 | dfply.egg-info/dependency_links.txt
20 | dfply.egg-info/requires.txt
21 | dfply.egg-info/top_level.txt
22 | dfply/data/__init__.py
23 | test/__init__.py
24 | test/test_base.py
25 | test/test_group.py
26 | test/test_join.py
27 | test/test_reshape.py
28 | test/test_select.py
29 | test/test_subset.py
30 | test/test_summarize.py
31 | test/test_summary_functions.py
32 | test/test_transform.py
33 | test/test_vector.py
34 | test/test_window_functions.py
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - '3.6'
4 | - '3.7'
5 | install:
6 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
7 | - bash Miniconda3-latest-Linux-x86_64.sh -b -p $HOME/miniconda
8 | - export PATH="$HOME/miniconda/bin:$PATH"
9 | - hash -r
10 | - conda config --set always_yes yes --set changeps1 no
11 | - conda update -q conda
12 | - conda info -a
13 | - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION pip pytest numpy
14 | pandas>=0.18.1
15 | - source activate test-environment
16 | - python setup.py install
17 | script: python -m pytest test/
18 | deploy:
19 | provider: pypi
20 | user: TokenNobody
21 | password:
22 | secure: XazMtrRpb6i/jtdeBIDV5mWZNQr2dPlmspgF/qqt9AbZRCu/Y28DaI/12KGSFgVJc2lzREp+cxKNq60bDT8mB3t0+YtYeHsmQXawInyXAFACfmRI5/nigiYLMhQ1OV/RHtXQcXeHJF1MbKeF2WjWdBKh9m9cBi5NVxGot/knGOALkwyiPG4Ykf5fVD4bCeJTkdrBav/XLYqYPntpw6GT0PA8yvt3E1lQfL+uTV8+ZcwsqXh8ebWNI0aU86lurE6b1cJn6xpTZYzSqiJqHuikCZC7alqd311kpm/sKuHMb2V9tKiHiJFN7fcKfdaVuAjQE22Tc7R7uC2ph9tBvL8xHnzi48Wj9Ri5QYLATN2u28d3rkCS+zN+tC3MT9bjDcyuqPdbx3Sx5bFJC6P0HFcof5lpnan80TW4VQSM2GV8rqwPgm0kLi0k/DG5yvRWecNdlvvCDZ5e6M9eiOcer9guimDYITtQCfuiUZLUbzgw+u7QE3jY9Exnv7Ekdi150Zd+ubPS+yU1ZG5tgB2ijw7n2bTxEy77d6Zm0quDnQ6gVBi7STp2si3397TTQH/nV+eaX51VOxTufZXDW0eiaVoRhH32xUllhFeAzJSezAVJ0WuLEuSLXGkxep7VNofK0Kyjxg4S2ED41lV7LtucdQe7L/LlGTfmYCgzSDaDW98CqIM=
23 | on:
24 | tags: true
25 | distributions: sdist bdist_wheel
26 | repo: kieferk/dfply
27 | branch: master
28 |
--------------------------------------------------------------------------------
/dfply/subset.py:
--------------------------------------------------------------------------------
1 | from .base import *
2 | import warnings
3 | import numpy as np
4 |
5 |
6 | # ------------------------------------------------------------------------------
7 | # `head` and `tail`
8 | # ------------------------------------------------------------------------------
9 |
10 | @dfpipe
11 | def head(df, n=5):
12 | return df.head(n)
13 |
14 |
15 | @dfpipe
16 | def tail(df, n=5):
17 | return df.tail(n)
18 |
19 |
20 | # ------------------------------------------------------------------------------
21 | # Sampling
22 | # ------------------------------------------------------------------------------
23 |
24 |
25 | @dfpipe
26 | def sample(df, *args, **kwargs):
27 | return df.sample(*args, **kwargs)
28 |
29 |
30 | @pipe
31 | @group_delegation
32 | @symbolic_evaluation(eval_as_label=['*'])
33 | def distinct(df, *args, **kwargs):
34 | if not args:
35 | return df.drop_duplicates(**kwargs)
36 | return df.drop_duplicates(list(args), **kwargs)
37 |
38 |
39 | @dfpipe
40 | def row_slice(df, indices):
41 | if isinstance(indices, (tuple, list)):
42 | indices = np.array(indices)
43 | if isinstance(indices, int):
44 | indices = np.array([indices])
45 | if isinstance(indices, pd.Series):
46 | indices = indices.values
47 |
48 | if indices.dtype == bool:
49 | return df.loc[indices, :]
50 | else:
51 | return df.iloc[indices, :]
52 |
53 |
54 | # ------------------------------------------------------------------------------
55 | # Filtering/masking
56 | # ------------------------------------------------------------------------------
57 |
58 | @dfpipe
59 | def mask(df, *args):
60 | mask = pd.Series(np.ones(df.shape[0], dtype=bool))
61 | for arg in args:
62 | if arg.dtype != bool:
63 | raise Exception("Arguments must be boolean.")
64 | mask = mask & arg.reset_index(drop=True)
65 | return df[mask.values]
66 |
67 |
68 | filter_by = mask # alias for mask()
69 |
70 |
71 | @dfpipe
72 | def top_n(df, n=None, ascending=True, col=None):
73 | if not n:
74 | raise ValueError('n must be specified')
75 | if not isinstance(col, pd.Series):
76 | col = df.columns[-1]
77 | else:
78 |         col = col.name
79 | index = df[[col]].copy()
80 | index['ranks'] = index[col].rank(ascending=ascending)
81 | index = index[index['ranks'] >= index['ranks'].nlargest(n).min()]
82 | return df.reindex(index.index)
83 |
84 |
85 | @dfpipe
86 | def pull(df, column=-1):
87 |     if isinstance(column, str):
88 |         return df.loc[:, column]
89 |     return df.iloc[:, column]
90 |
--------------------------------------------------------------------------------
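A brief sketch of the row-subsetting verbs above, following the calls made in test_subset.py:

```python
from dfply import *

cheap_ideal = diamonds >> mask(X.cut == 'Ideal', X.price < 500)  # conditions are AND-ed
first_two   = diamonds >> row_slice([0, 1])                      # positional selection
largest_x   = diamonds >> top_n(n=5, col=X.x)                    # 5 rows with the largest x
```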
/test/test_summarize.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from dfply import *
4 |
5 |
6 | ##==============================================================================
7 | ## summarization test functions
8 | ##==============================================================================
9 |
10 | def test_summarize():
11 | p = pd.DataFrame({
12 | 'price_mean':[diamonds.price.mean()],
13 | 'price_std':[diamonds.price.std()]
14 | })
15 | assert p.equals(diamonds >> summarize(price_mean=X.price.mean(),
16 | price_std=X.price.std()))
17 |
18 | pcut = pd.DataFrame({
19 | 'cut':['Fair','Good','Ideal','Premium','Very Good']
20 | })
21 | pcut['price_mean'] = [diamonds[diamonds.cut == c].price.mean() for c in pcut.cut.values]
22 | pcut['price_std'] = [diamonds[diamonds.cut == c].price.std() for c in pcut.cut.values]
23 | assert pcut.equals(diamonds >> group_by('cut') >>
24 | summarize(price_mean=X.price.mean(), price_std=X.price.std()))
25 |
26 |
27 | def test_summarize_each():
28 | to_match = pd.DataFrame({
29 | 'price_mean':[np.mean(diamonds.price)],
30 | 'price_var':[np.var(diamonds.price)],
31 | 'depth_mean':[np.mean(diamonds.depth)],
32 | 'depth_var':[np.var(diamonds.depth)]
33 | })
34 | to_match = to_match[['price_mean','price_var','depth_mean','depth_var']]
35 |
36 | test1 = diamonds >> summarize_each([np.mean, np.var], X.price, 4)
37 | test2 = diamonds >> summarize_each([np.mean, np.var], X.price, 'depth')
38 | assert to_match.equals(test1)
39 | assert to_match.equals(test2)
40 |
41 | group = pd.DataFrame({
42 | 'cut':['Fair','Good','Ideal','Premium','Very Good']
43 | })
44 | group['price_mean'] = [np.mean(diamonds[diamonds.cut == c].price) for c in group.cut.values]
45 | group['price_var'] = [np.var(diamonds[diamonds.cut == c].price) for c in group.cut.values]
46 | group['depth_mean'] = [np.mean(diamonds[diamonds.cut == c].depth) for c in group.cut.values]
47 | group['depth_var'] = [np.var(diamonds[diamonds.cut == c].depth) for c in group.cut.values]
48 |
49 | group = group[['cut','price_mean','price_var','depth_mean','depth_var']]
50 |
51 | test1 = (diamonds >> group_by(X.cut) >>
52 | summarize_each([np.mean, np.var], X.price, 4))
53 | test2 = (diamonds >> group_by('cut') >>
54 | summarize_each([np.mean, np.var], X.price, 'depth'))
55 |
56 | assert group.equals(test1)
57 | assert group.equals(test2)
58 |
--------------------------------------------------------------------------------
/test/test_transform.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from dfply import *
4 |
5 |
6 | ##==============================================================================
7 | ## transform test functions
8 | ##==============================================================================
9 |
10 | def test_mutate():
11 | df = diamonds.copy()
12 | df['testcol'] = 1
13 | assert df.equals(diamonds >> mutate(testcol=1))
14 | df['testcol'] = df['x']
15 | assert df.equals(diamonds >> mutate(testcol=X.x))
16 | df['testcol'] = df['x'] * df['y']
17 | assert df.equals(diamonds >> mutate(testcol=X.x * X.y))
18 | df['testcol'] = df['x'].mean()
19 | assert df.equals(diamonds >> mutate(testcol=np.mean(X.x)))
20 |
21 |
22 | def group_mutate_helper(df):
23 | df['testcol'] = df['x']*df.shape[0]
24 | return df
25 |
26 |
27 | def test_group_mutate():
28 | df = diamonds.copy()
29 | df = df.groupby('cut').apply(group_mutate_helper)
30 | d = diamonds >> group_by('cut') >> mutate(testcol=X.x*X.shape[0]) >> ungroup()
31 | assert df.equals(d.sort_index())
32 |
33 |
34 | def test_transmute():
35 | df = diamonds.copy()
36 | df['testcol'] = df['x'] * df['y']
37 | df = df[['testcol']]
38 | assert df.equals(diamonds >> transmute(testcol=X.x * X.y))
39 |
40 |
41 | def test_group_transmute():
42 | df = diamonds.copy()
43 | df = df.groupby('cut').apply(group_mutate_helper).reset_index(drop=True)
44 | df = df[['cut','testcol']]
45 | d = diamonds >> group_by('cut') >> transmute(testcol=X.x*X.shape[0])
46 | print(d.head())
47 | print(df.head())
48 | assert df.equals(d.sort_index())
49 |
50 |
51 | def test_mutate_if():
52 | df = diamonds.copy()
53 | for col in df:
54 | try:
55 | if max(df[col]) < 10:
56 | df[col] *= 2
57 | except:
58 | pass
59 | assert df.equals(diamonds >> mutate_if(lambda col: max(col) < 10, lambda row: row * 2))
60 | df = diamonds.copy()
61 | for col in df:
62 | try:
63 | if any(df[col].str.contains('.')):
64 | df[col] = df[col].str.lower()
65 | except:
66 | pass
67 | assert df.equals(diamonds >> mutate_if(lambda col: any(col.str.contains('.')), lambda row: row.str.lower()))
68 | df = diamonds.copy()
69 | for col in df:
70 | try:
71 | if min(df[col]) < 1 and mean(df[col]) < 4:
72 | df[col] *= -1
73 | except:
74 | pass
75 | assert df.equals(diamonds >> mutate_if(lambda col: min(col) < 1 and mean(col) < 4, lambda row: -row))
76 |
--------------------------------------------------------------------------------
/dfply/transform.py:
--------------------------------------------------------------------------------
1 | from .base import *
2 |
3 |
4 | @dfpipe
5 | def mutate(df, **kwargs):
6 | """
7 | Creates new variables (columns) in the DataFrame specified by keyword
8 | argument pairs, where the key is the column name and the value is the
9 | new column value(s).
10 |
11 | Args:
12 | df (pandas.DataFrame): data passed in through the pipe.
13 |
14 | Kwargs:
15 | **kwargs: keys are the names of the new columns, values indicate
16 | what the new column values will be.
17 |
18 | Example:
19 |         diamonds >> mutate(x_plus_y=X.x + X.y) >> select(columns_from('x')) >> head(3)
20 |
21 |               x     y     z  x_plus_y
22 |         0  3.95  3.98  2.43      7.93
23 |         1  3.89  3.84  2.31      7.73
24 |         2  4.05  4.07  2.31      8.12
25 | """
26 |
27 | return df.assign(**kwargs)
28 |
29 |
30 | @dfpipe
31 | def mutate_if(df, predicate, fun):
32 | """
33 | Modifies columns in place if the specified predicate is true.
34 | Args:
35 | df (pandas.DataFrame): data passed in through the pipe.
36 | predicate: a function applied to columns that returns a boolean value
37 | fun: a function that will be applied to columns where predicate returns True
38 |
39 | Example:
40 | diamonds >> mutate_if(lambda col: min(col) < 1 and mean(col) < 4, lambda row: 2 * row) >> head(3)
41 |            carat      cut color clarity  depth  table  price     x     y     z
42 |         0   0.46    Ideal     E     SI2   61.5   55.0    326  3.95  3.98  4.86
43 |         1   0.42  Premium     E     SI1   59.8   61.0    326  3.89  3.84  4.62
44 |         2   0.46     Good     E     VS1   56.9   65.0    327  4.05  4.07  4.62
45 |         (columns 'carat' and 'z', both having a min < 1 and mean < 4, are doubled,
46 |         while the other columns remain as they were)
47 | """
48 | cols = list()
49 | for col in df:
50 | try:
51 | if predicate(df[col]):
52 | cols.append(col)
53 | except:
54 | pass
55 | df[cols] = df[cols].apply(fun)
56 | return df
57 |
58 | # df2 = df.copy()
59 | # df2[cols] = df2[cols].apply(fun)
60 | # return df2
61 |
62 |
63 | @dfpipe
64 | def transmute(df, *keep_columns, **kwargs):
65 | """
66 | Creates columns and then returns those new columns and optionally specified
67 | original columns from the DataFrame.
68 |
69 | This works like `mutate`, but designed to discard the original columns used
70 | to create the new ones.
71 |
72 | Args:
73 | *keep_columns: Column labels to keep. Can be string, symbolic, or
74 | integer position.
75 |
76 | Kwargs:
77 | **kwargs: keys are the names of the new columns, values indicate
78 | what the new column values will be.
79 |
80 | Example:
81 | diamonds >> transmute(x_plus_y=X.x + X.y, y_div_z=(X.y / X.z)) >> head(3)
82 |
83 |            x_plus_y   y_div_z
84 |         0      7.93  1.637860
85 |         1      7.73  1.662338
86 |         2      8.12  1.761905
87 | """
88 |
89 | keep_cols = []
90 | for col in flatten(keep_columns):
91 | try:
92 | keep_cols.append(col.name)
93 | except:
94 | if isinstance(col, str):
95 | keep_cols.append(col)
96 | elif isinstance(col, int):
97 | keep_cols.append(df.columns[col])
98 |
99 | df = df.assign(**kwargs)
100 | columns = [k for k in kwargs.keys()] + list(keep_cols)
101 | return df[columns]
102 |
--------------------------------------------------------------------------------
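A minimal sketch contrasting the two verbs, matching the assertions in test_transform.py:

```python
from dfply import *

# mutate keeps all existing columns and appends the new one.
with_sum = diamonds >> mutate(x_plus_y=X.x + X.y)

# transmute keeps only the newly created columns (plus any explicitly listed).
only_sum = diamonds >> transmute(x_plus_y=X.x + X.y)
```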
/test/test_vector.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from dfply import *
3 |
4 | ##==============================================================================
5 | ## desc, order by tests
6 | ##==============================================================================
7 |
8 | def test_desc():
9 |
10 | df = diamonds >> select(X.cut, X.x) >> head(10)
11 | t = df >> summarize(last=nth(X.x, -1, order_by=[desc(X.cut), desc(X.x)]))
12 |
13 | series_num = pd.Series([4,1,3,2])
14 | series_bool = pd.Series([True,False,True,False])
15 | series_str = pd.Series(['d','a','c','b'])
16 |
17 | num_truth = series_num.rank(method='min',ascending=False)
18 | bool_truth = series_bool.rank(method='min',ascending=False)
19 | str_truth = series_str.rank(method='min',ascending=False)
20 |
21 | assert desc(series_num).equals(num_truth)
22 | assert desc(series_bool).equals(bool_truth)
23 | assert desc(series_str).equals(str_truth)
24 |
25 |
26 | def test_order_series_by():
27 | series = pd.Series([1,2,3,4,5,6,7,8])
28 | order1 = pd.Series(['A','B','A','B','A','B','A','B'])
29 | ordered1 = order_series_by(series, order1).reset_index(drop=True)
30 | true1 = pd.Series([1,3,5,7,2,4,6,8])
31 | assert ordered1.equals(true1)
32 |
33 | order2 = pd.Series([2,2,2,2,1,1,1,1])
34 | ordered2 = order_series_by(series, [order1, order2]).reset_index(drop=True)
35 | true2 = pd.Series([5,7,1,3,6,8,2,4])
36 | assert ordered2.equals(true2)
37 |
38 |
39 | ##==============================================================================
40 | ## coalesce test
41 | ##==============================================================================
42 |
43 | def test_coalesce():
44 | df = pd.DataFrame({
45 | 'a':[1,np.nan,np.nan,np.nan,np.nan],
46 | 'b':[2,3,np.nan,np.nan,np.nan],
47 | 'c':[np.nan,np.nan,4,5,np.nan],
48 | 'd':[6,7,8,9,np.nan]
49 | })
50 | truth_df = df.assign(coal=[1,3,4,5,np.nan])
51 | d = df >> mutate(coal=coalesce(X.a, X.b, X.c, X.d))
52 | assert truth_df.equals(d)
53 |
54 |
55 | ##==============================================================================
56 | ## case_when test
57 | ##==============================================================================
58 |
59 | def test_case_when():
60 | df = pd.DataFrame({
61 | 'num':np.arange(31)
62 | })
63 | df_truth = df.assign(strnum=['fizzbuzz' if (i % 15 == 0) else
64 | 'fizz' if (i % 3 == 0) else
65 | 'buzz' if (i % 5 == 0) else
66 | str(i) for i in np.arange(31)])
67 | d = df >> mutate(strnum=case_when([X.num % 15 == 0, 'fizzbuzz'],
68 | [X.num % 3 == 0, 'fizz'],
69 | [X.num % 5 == 0, 'buzz'],
70 | [True, X.num.astype(str)]))
71 | print(df_truth)
72 | print(d)
73 | assert df_truth.equals(d)
74 |
75 |
76 | ##==============================================================================
77 | ## if_else test
78 | ##==============================================================================
79 |
80 | def test_if_else():
81 | df = pd.DataFrame({
82 | 'a':[1,2,3,4,5,6,7,8,9]
83 | })
84 | b_truth = ['odd','even','odd','even','odd','even','odd','even','odd']
85 | d = df >> mutate(b=if_else(X.a % 2 == 0, 'even', 'odd'))
86 | assert d.equals(df.assign(b=b_truth))
87 |
88 | df = pd.DataFrame({
89 | 'a':[0,0,0,1,1,1,2,2,2]
90 | })
91 | b_truth = [5,5,5,5,5,5,9,9,9]
92 | d = df >> mutate(b=if_else(X.a < 2, [5,5,5,5,5,5,5,5,5], [9,9,9,9,9,9,9,9,9]))
93 | assert d.equals(df.assign(b=b_truth))
94 |
95 |
96 | ##==============================================================================
97 | ## na_if test
98 | ##==============================================================================
99 |
100 | def test_na_if():
101 | df = pd.DataFrame({
102 | 'a':[1,2,3,4,5]
103 | })
104 | d = df >> mutate(b=na_if(X.a, 3), c=na_if(X.a,1,2,3))
105 | d = d[['a','b','c']]
106 | df_true = df.assign(b=[1,2,np.nan,4,5], c=[np.nan,np.nan,np.nan,4,5])
107 | assert df_true.equals(d)
108 |
--------------------------------------------------------------------------------
/test/test_join.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from dfply import *
4 |
5 | ##==============================================================================
6 | ## join test functions
7 | ##==============================================================================
8 |
9 | @pytest.fixture
10 | def dfA():
11 | a = pd.DataFrame({
12 | 'x1':['A','B','C'],
13 | 'x2':[1,2,3]
14 | })
15 | return a
16 |
17 |
18 | @pytest.fixture
19 | def dfB():
20 | b = pd.DataFrame({
21 | 'x1':['A','B','D'],
22 | 'x3':[True,False,True]
23 | })
24 | return b
25 |
26 | @pytest.fixture
27 | def dfC():
28 | c = pd.DataFrame({
29 | 'x1':['B','C','D'],
30 | 'x2':[2,3,4]
31 | })
32 | return c
33 |
34 |
35 | def test_inner_join(dfA, dfB):
36 | ab = pd.DataFrame({
37 | 'x1':['A','B'],
38 | 'x2':[1,2],
39 | 'x3':[True, False]
40 | })
41 |
42 | c = dfA >> inner_join(dfB, by='x1')
43 | assert c.equals(ab)
44 |
45 |
46 | def test_outer_join(dfA, dfB):
47 | ab = pd.DataFrame({
48 | 'x1':['A','B','C','D'],
49 | 'x2':[1,2,3,np.nan],
50 | 'x3':[True, False,np.nan,True]
51 | })
52 |
53 | c = dfA >> outer_join(dfB, by='x1')
54 | assert c.equals(ab)
55 | c = dfA >> full_join(dfB, by='x1')
56 | assert c.equals(ab)
57 |
58 |
59 | def test_left_join(dfA, dfB):
60 | ab = pd.DataFrame({
61 | 'x1':['A','B','C'],
62 | 'x2':[1,2,3],
63 | 'x3':[True, False, np.nan]
64 | })
65 |
66 | c = dfA >> left_join(dfB, by='x1')
67 | assert c.equals(ab)
68 |
69 |
70 | def test_right_join(dfA, dfB):
71 | ab = pd.DataFrame({
72 | 'x1':['A','B','D'],
73 | 'x2':[1,2,np.nan],
74 | 'x3':[True, False, True]
75 | })
76 |
77 | c = dfA >> right_join(dfB, by='x1')
78 | assert c.equals(ab)
79 |
80 | def test_semi_join(dfA, dfB):
81 | ab = pd.DataFrame({
82 | 'x1':['A', 'B'],
83 | 'x2':[1, 2]
84 | })
85 |
86 | c = dfA >> semi_join(dfB, by='x1')
87 | assert c.equals(ab)
88 |
89 |
90 | def test_anti_join(dfA, dfB):
91 | ab = pd.DataFrame({
92 | 'x1':['C'],
93 | 'x2':[3]
94 | }, index=[2])
95 |
96 | c = dfA >> anti_join(dfB, by='x1')
97 | assert c.equals(ab)
98 |
99 |
100 | ##==============================================================================
101 | ## set operation (row join) test functions
102 | ##==============================================================================
103 |
104 | def test_union(dfA, dfC):
105 | ac = pd.DataFrame({
106 | 'x1': ['A', 'B', 'C', 'D'],
107 | 'x2': [1, 2, 3, 4]
108 | }, index=[0, 1, 2, 2])
109 |
110 | d = dfA >> union(dfC)
111 | assert d.equals(ac)
112 |
113 |
114 | def test_intersect(dfA, dfC):
115 | ac = pd.DataFrame({
116 | 'x1': ['B', 'C'],
117 | 'x2': [2, 3]
118 | })
119 |
120 | d = dfA >> intersect(dfC)
121 | assert d.equals(ac)
122 |
123 |
124 | def test_set_diff(dfA, dfC):
125 | ac = pd.DataFrame({
126 | 'x1': ['A'],
127 | 'x2': [1]
128 | })
129 |
130 | d = dfA >> set_diff(dfC)
131 | assert d.equals(ac)
132 |
133 |
134 | ##==============================================================================
135 | ## bind rows, cols
136 | ##==============================================================================
137 |
138 | def test_bind_rows(dfA, dfB):
139 | inner = pd.DataFrame({
140 | 'x1':['A','B','C','A','B','D']
141 | })
142 | outer = pd.DataFrame({
143 | 'x1':['A','B','C','A','B','D'],
144 | 'x2':[1,2,3,np.nan,np.nan,np.nan],
145 | 'x3':[np.nan,np.nan,np.nan,True,False,True]
146 | })
147 | ab_inner = dfA >> bind_rows(dfB, join='inner')
148 | ab_outer = dfA >> bind_rows(dfB, join='outer')
149 | assert inner.equals(ab_inner.reset_index(drop=True))
150 | assert outer.equals(ab_outer.reset_index(drop=True))
151 |
152 |
153 | def test_bind_cols(dfA, dfB):
154 | dfB.columns = ['x3','x4']
155 | df = pd.DataFrame({
156 | 'x1':['A','B','C'],
157 | 'x2':[1,2,3],
158 | 'x3':['A','B','D'],
159 | 'x4':[True,False,True]
160 | })
161 | d = dfA >> bind_cols(dfB)
162 | assert df.equals(d)
163 |
--------------------------------------------------------------------------------
/test/test_subset.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from dfply import *
4 |
5 |
6 | ##==============================================================================
7 | ## subset test functions
8 | ##==============================================================================
9 |
10 | def test_head():
11 | df = diamonds.head(2)
12 | d = diamonds >> head(2)
13 | assert df.equals(d)
14 |
15 |
16 | def test_grouped_head():
17 | df = diamonds.groupby(['cut','color']).apply(lambda x: x.head(2)).reset_index(drop=True)
18 | d = diamonds >> group_by('cut','color') >> head(2)
19 | assert df.equals(d.reset_index(drop=True))
20 |
21 |
22 | def test_tail():
23 | df = diamonds.tail(2)
24 | d = diamonds >> tail(2)
25 | assert df.equals(d)
26 |
27 |
28 | def test_grouped_tail():
29 | df = diamonds.groupby(['cut','color']).apply(lambda x: x.tail(2)).reset_index(drop=True)
30 | d = diamonds >> group_by('cut','color') >> tail(2)
31 | assert df.equals(d.reset_index(drop=True))
32 |
33 |
34 | def test_distinct():
35 | d = diamonds >> distinct('depth')
36 | df = diamonds.drop_duplicates('depth')
37 | assert df.equals(d)
38 |
39 | d = diamonds >> distinct(X.cut, 'depth')
40 | df = diamonds.drop_duplicates(['cut','depth'])
41 | assert df.equals(d)
42 |
43 | df = diamonds[['carat', 'cut']].drop_duplicates()
44 | d = diamonds >> select(X.carat, X.cut) >> distinct()
45 | assert df.equals(d)
46 |
47 | df = diamonds[['carat', 'cut']].drop_duplicates(keep='last')
48 | d = diamonds >> select(X.carat, X.cut) >> distinct(keep='last')
49 | assert df.equals(d)
50 |
51 |
52 | def test_sample():
53 | random_state = 55
54 |
55 | d = diamonds >> sample(n=10, random_state=random_state)
56 | df = diamonds.sample(n=10, random_state=random_state)
57 | assert df.equals(d)
58 |
59 | d = diamonds >> sample(frac=0.001, random_state=random_state)
60 | df = diamonds.sample(frac=0.001, random_state=random_state)
61 | assert df.equals(d)
62 |
63 | d = diamonds >> group_by(X.cut) >> sample(n=10, random_state=random_state)
64 | d = d.reset_index(drop=True)
65 | df = diamonds.groupby('cut').apply(lambda x: x.sample(n=10, random_state=random_state))
66 | df = df.reset_index(drop=True)
67 | assert df.equals(d)
68 |
69 |
70 | def test_row_slice():
71 | df = diamonds.iloc[[0,1],:]
72 | assert df.equals(diamonds >> row_slice([0,1]))
73 | df = diamonds.groupby('cut').apply(lambda df: df.iloc[0,:]).reset_index(drop=True)
74 | d = diamonds >> group_by(X.cut) >> row_slice(0)
75 | assert df.equals(d.reset_index(drop=True))
76 | df = diamonds.loc[diamonds.table > 61, :]
77 | assert df.equals(diamonds >> row_slice(X.table > 61))
78 |
79 |
80 | def test_mask():
81 | test1 = diamonds >> mask(X.cut == 'Ideal')
82 | df = diamonds[diamonds.cut == 'Ideal']
83 | assert df.equals(test1)
84 |
85 | test2 = diamonds >> mask(X.cut == 'Ideal', X.color == 'E',
86 | X.table < 55, X.price < 500)
87 | df_mask = (diamonds.cut == 'Ideal') & (diamonds.color == 'E')
88 | df_mask = df_mask & (diamonds.table < 55) & (diamonds.price < 500)
89 | df = diamonds[df_mask]
90 | assert df.equals(test2)
91 |
92 |
93 | # def test_mask_small():
94 | # a = (diamonds >> group_by(X.cut) >> arrange(X.price) >>
95 | # head(3) >> ungroup() >> mask(X.carat < 0.23))
96 | # print(a)
97 | # assert False
98 |
99 | # d = diamonds >> group_by(X.cut) >> mutate(price_lag=lag(X.price)) >> head(2) >> select(X.cut, X.price_lag)
100 |
101 | def test_top_n():
102 | with pytest.raises(ValueError):
103 | diamonds >> top_n()
104 | test2 = diamonds >> top_n(n=6)
105 | df2 = diamonds.sort_values('z', ascending=False).head(6).sort_index()
106 | assert test2.equals(df2)
107 | test3 = diamonds >> top_n(col=X.x, n=5)
108 | df3 = diamonds.sort_values('x', ascending=False).head(5).sort_index()
109 | assert test3.equals(df3)
110 | test4 = diamonds >> top_n(col=X.cut, n=1)
111 | df4 = diamonds[diamonds.cut == 'Very Good']
112 | assert test4.equals(df4)
113 | test5 = diamonds >> group_by(X.cut) >> top_n(n=2)
114 |     df5 = diamonds.loc[[27415, 27630, 23539, 27517, 27518, 24297, 24328, 24067, 25999, 26444, 48410]]
115 | assert test5.equals(df5)
116 | test6 = diamonds >> top_n(col=X.x, ascending=False, n=5)
117 | df6 = diamonds.sort_values('x', ascending=True).head(8).sort_index()
118 | assert test6.equals(df6)
119 |
--------------------------------------------------------------------------------
/dfply/summary_functions.py:
--------------------------------------------------------------------------------
1 | from .base import *
2 | from .vector import *
3 |
4 |
5 | # ------------------------------------------------------------------------------
6 | # Series summary functions
7 | # ------------------------------------------------------------------------------
8 |
9 |
10 | @make_symbolic
11 | def mean(series):
12 | """
13 | Returns the mean of a series.
14 |
15 | Args:
16 | series (pandas.Series): column to summarize.
17 | """
18 |
19 | if np.issubdtype(series.dtype, np.number):
20 | return series.mean()
21 | else:
22 | return np.nan
23 |
24 |
25 | @make_symbolic
26 | def first(series, order_by=None):
27 | """
28 | Returns the first value of a series.
29 |
30 | Args:
31 | series (pandas.Series): column to summarize.
32 |
33 | Kwargs:
34 | order_by: a pandas.Series or list of series (can be symbolic) to order
35 | the input series by before summarization.
36 | """
37 |
38 | if order_by is not None:
39 | series = order_series_by(series, order_by)
40 | first_s = series.iloc[0]
41 | return first_s
42 |
43 |
44 | @make_symbolic
45 | def last(series, order_by=None):
46 | """
47 | Returns the last value of a series.
48 |
49 | Args:
50 | series (pandas.Series): column to summarize.
51 |
52 | Kwargs:
53 | order_by: a pandas.Series or list of series (can be symbolic) to order
54 | the input series by before summarization.
55 | """
56 |
57 | if order_by is not None:
58 | series = order_series_by(series, order_by)
59 | last_s = series.iloc[series.size - 1]
60 | return last_s
61 |
62 |
63 | @make_symbolic
64 | def nth(series, n, order_by=None):
65 | """
66 | Returns the nth value of a series.
67 |
68 | Args:
69 | series (pandas.Series): column to summarize.
70 | n (integer): position of desired value. Returns `NaN` if out of range.
71 |
72 | Kwargs:
73 | order_by: a pandas.Series or list of series (can be symbolic) to order
74 | the input series by before summarization.
75 | """
76 |
77 | if order_by is not None:
78 | series = order_series_by(series, order_by)
79 | try:
80 | return series.iloc[n]
81 | except:
82 | return np.nan
83 |
84 |
85 | @make_symbolic
86 | def n(series):
87 | """
88 | Returns the length of a series.
89 |
90 | Args:
91 | series (pandas.Series): column to summarize.
92 | """
93 |
94 | n_s = series.size
95 | return n_s
96 |
97 |
98 | @make_symbolic
99 | def n_distinct(series):
100 | """
101 | Returns the number of distinct values in a series.
102 |
103 | Args:
104 | series (pandas.Series): column to summarize.
105 | """
106 |
107 | n_distinct_s = series.unique().size
108 | return n_distinct_s
109 |
110 |
111 | @make_symbolic
112 | def IQR(series):
113 | """
114 | Returns the inter-quartile range (IQR) of a series.
115 |
116 |     The IQR is defined as the 75th percentile minus the 25th percentile.
117 |
118 | Args:
119 | series (pandas.Series): column to summarize.
120 | """
121 |
122 | iqr_s = series.quantile(.75) - series.quantile(.25)
123 | return iqr_s
124 |
125 |
126 | @make_symbolic
127 | def colmin(series):
128 | """
129 | Returns the minimum value of a series.
130 |
131 | Args:
132 | series (pandas.Series): column to summarize.
133 | """
134 |
135 | min_s = series.min()
136 | return min_s
137 |
138 |
139 | @make_symbolic
140 | def colmax(series):
141 | """
142 | Returns the maximum value of a series.
143 |
144 | Args:
145 | series (pandas.Series): column to summarize.
146 | """
147 |
148 | max_s = series.max()
149 | return max_s
150 |
151 |
152 | @make_symbolic
153 | def median(series):
154 | """
155 | Returns the median value of a series.
156 |
157 | Args:
158 | series (pandas.Series): column to summarize.
159 | """
160 |
161 | if np.issubdtype(series.dtype, np.number):
162 | return series.median()
163 | else:
164 | return np.nan
165 |
166 |
167 | @make_symbolic
168 | def var(series):
169 | """
170 | Returns the variance of values in a series.
171 |
172 | Args:
173 | series (pandas.Series): column to summarize.
174 | """
175 | if np.issubdtype(series.dtype, np.number):
176 | return series.var()
177 | else:
178 | return np.nan
179 |
180 |
181 | @make_symbolic
182 | def sd(series):
183 | """
184 | Returns the standard deviation of values in a series.
185 |
186 | Args:
187 | series (pandas.Series): column to summarize.
188 | """
189 |
190 | if np.issubdtype(series.dtype, np.number):
191 | return series.std()
192 | else:
193 | return np.nan
194 |
--------------------------------------------------------------------------------
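A small sketch combining these summary functions with `summarize` from summarize.py, following the grouped-aggregation pattern used throughout the tests:

```python
from dfply import *

per_cut = (diamonds >> group_by(X.cut) >>
           summarize(count=n(X.price),
                     price_median=median(X.price),
                     price_iqr=IQR(X.price)))
```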
/RELEASES.txt:
--------------------------------------------------------------------------------
1 | v0.3.3
2 | - Hotfix for parsing left_on and right_on
3 |
4 | TODO: Need to figure out fix to the inversion of symbol issue. Somewhat complicated.
5 |
6 | v0.3.2
7 | Various PRs added fixing bugs, etc.
8 |
9 | v0.3.1
10 | This update consists almost solely of the pull requests by @bleearmstrong that
11 | had been sitting in the repo waiting. There are some minor bug fixes and changes too.
12 |
13 | - `select_if` and `drop_if` are now available to perform selection according to a function
14 | - `mutate_if` allows variable creation if a criterion function is evaluated as True
15 | - `row_number` window function is available (same as rank(method='first'))
16 | - `distinct` can take no arguments, making it equivalent to `drop_duplicates` with no arguments
17 |
18 | v0.3.0
19 | Lots and lots of big changes here. Total reworking of the internal functionality.
20 | The good news is that it should (basically) work the same as before, but
21 | hopefully better. Obviously keep in mind that this is still beta and there will
22 | be plenty of bugs to work out on the horizon, but the preexisting tests pass for now...
23 | There is no backward compatibility with old versions, as the decorator names
24 | have changed, but again, the functionality is otherwise the same.
25 |
26 | Some major things (see readme for details):
27 | - Moved entirely to Python 3 support. It may still work in Python 2, but don't count on it.
28 | - pandas-ply is no longer required; it was brittle, so I rolled my own implementation.
29 | - Selection "subfunctions" are now working and the selection functions have been changed in light of this.
30 |
31 |
32 | v0.2.4
33 | - Bug fixed in semi-join and anti-join
34 | - top_n added
35 |
36 | v0.2.3
37 | Inplace piping added using the `>>=` operator. The `pipe` decorator internals
38 | have been changed to make this possible through the addition of an `__rshift__`
39 | implementation and chaining pipes together until evaluated against a
40 | DataFrame.
41 |
42 |
43 | v0.2.2
44 | - Added docstrings to functions and classes.
45 | - Added the `case_when` function.
46 | - Fixed `arrange` to work with symbolic functions like `desc` in the function
47 | call.
48 | - Added `cumany` and `cumall` window functions.
49 | - Added `if_else` function.
50 | - Added `na_if` function.
51 | - Added `percent_rank` function.
52 | - Reorganization of decorator functions (better subclassing).
53 |
54 |
55 | v0.2.1
56 | Fixed an issue with the way the decorators were structured, particularly
57 | the @make_symbolic, that would cause problems with returning Call objects
58 | that would not evaluate properly. Hopefully this is now resolved.
59 |
60 | The "coalesce" function was added from dplyr.
61 |
62 | Some code was moved from base.py to the new vector.py file. The vector.py
63 | file now contains functions that specifically perform operations on
64 | series or numpy arrays (such as coalesce, desc, etc.). Test files have been
65 | reorganized accordingly.
66 |
67 |
68 | v0.2.0
69 | This release now introduces the @make_symbolic decorator, which can wrap
70 | functions to allow them to evaluate later. This is particularly (and perhaps
71 | only) useful when you embed functions as arguments to other functions. For
72 | example, the summary and window functions.
73 |
74 | The code for the symbolic handling decorators has been reworked. They now
75 | inherit from a common class since they shared patterns in their code.
76 |
77 | - @make_symbolic decorator
78 | - README updates
79 | - desc() and order_series_by() functions
80 | - re-imagining of the code for @symbolic_evaluation, @symbolic_reference
81 | (the functionality remains unchanged)
82 | - window and summary functions, along with their tests, have been moved around
83 | to other files.
84 |
85 |
86 | v0.1.10
87 | - `separate` and `unite` functions added.
88 | - Summary functions added for series operations.
89 | - README improved dramatically.
90 | - Function docstrings added to more functions (still not all).
91 |
92 | v0.1.9
93 | Moved unit tests into individual files that reflect the categories of the
94 | functions/features they are testing. Some small bugs have been fixed as well.
95 |
96 | v0.1.8
97 | The pipe decorator now copies the dataframe upon each chained function, along
98 | with the `_grouped_by` attribute, if any. Before, operations with the pipe
99 | functions were modifying the original dataframe (such as `mutate`).
100 |
101 | v0.1.7
102 | Restructuring of package to include `diamonds.csv` with pip installation
103 | and require `six` and `pandas-ply` rather than coming pre-packaged with them.
104 |
105 | v0.1.6
106 | Added window functions:
107 | `dense_rank`
108 | `min_rank`
109 | `cumsum`
110 | `cummean`
111 | `cummax`
112 | `cummin`
113 | `cumprod`
114 |
115 |
116 | v0.1.5
117 | dplyr set operations added thanks to bleearmstrong.
118 |
119 | `df >> union(other)`
120 | Rows that appear in either `df` or `other`.
121 |
122 | `df >> intersect(other)`
123 | Rows that appear in both `df` and `other`
124 |
125 | `df >> set_diff(other)`
126 | Rows that appear in `df` but not `other`.
127 |
--------------------------------------------------------------------------------
/dfply/select.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from .base import *
4 |
5 |
6 | # ------------------------------------------------------------------------------
7 | # Select and drop operators
8 | # ------------------------------------------------------------------------------
9 |
10 | def selection_context(arg, context):
11 | if isinstance(arg, Intention):
12 | arg = arg.evaluate(context)
13 | if isinstance(arg, pd.Index):
14 | arg = list(arg)
15 | if isinstance(arg, pd.Series):
16 | arg = arg.name
17 | return arg
18 |
19 |
20 | def selection_filter(f):
21 | def wrapper(*args, **kwargs):
22 | return Intention(lambda x: f(list(x.columns),
23 | *(selection_context(a, x) for a in args),
24 | **{k: selection_context(v, x) for k, v in kwargs.items()}))
25 |
26 | return wrapper
27 |
28 |
29 | def resolve_selection(df, *args, drop=False):
30 | if len(args) > 0:
31 | args = [a for a in flatten(args)]
32 | ordering = []
33 | column_indices = np.zeros(df.shape[1])
34 | for selector in args:
35 | visible = np.where(selector != 0)[0]
36 | if not drop:
37 | column_indices[visible] = selector[visible]
38 | else:
39 | column_indices[visible] = selector[visible] * -1
40 | for selection in np.where(selector == 1)[0]:
41 | if not df.columns[selection] in ordering:
42 | ordering.append(df.columns[selection])
43 | else:
44 | ordering = list(df.columns)
45 | column_indices = np.ones(df.shape[1])
46 | return ordering, column_indices
47 |
48 |
49 | @pipe
50 | @group_delegation
51 | @symbolic_evaluation(eval_as_selector=True)
52 | def select(df, *args):
53 | ordering, column_indices = resolve_selection(df, *args)
54 | if (column_indices == 0).all():
55 | return df[[]]
56 | selection = np.where((column_indices == np.max(column_indices)) &
57 | (column_indices >= 0))[0]
58 | df = df.iloc[:, selection]
59 | if all([col in ordering for col in df.columns]):
60 | ordering = [c for c in ordering if c in df.columns]
61 | return df[ordering]
62 | else:
63 | return df
64 |
65 |
66 | @pipe
67 | @group_delegation
68 | @symbolic_evaluation(eval_as_selector=True)
69 | def drop(df, *args):
70 | _, column_indices = resolve_selection(df, *args, drop=True)
71 | if (column_indices == 0).all():
72 | return df[[]]
73 | selection = np.where((column_indices == np.max(column_indices)) &
74 | (column_indices >= 0))[0]
75 | return df.iloc[:, selection]
76 |
77 |
78 | @pipe
79 | def select_if(df, fun):
80 | """Selects columns where fun(ction) is true
81 | Args:
82 | fun: a function that will be applied to columns
83 | """
84 |
85 | def _filter_f(col):
86 | try:
87 | return fun(df[col])
88 | except:
89 | return False
90 |
91 | cols = list(filter(_filter_f, df.columns))
92 | return df[cols]
93 |
94 |
95 | @pipe
96 | def drop_if(df, fun):
97 | """Drops columns where fun(ction) is true
98 | Args:
99 | fun: a function that will be applied to columns
100 | """
101 |
102 | def _filter_f(col):
103 | try:
104 | return fun(df[col])
105 | except:
106 | return False
107 |
108 | cols = list(filter(_filter_f, df.columns))
109 | return df.drop(cols, axis=1)
110 |
111 |
112 | @selection_filter
113 | def starts_with(columns, prefix):
114 | return [c for c in columns if c.startswith(prefix)]
115 |
116 |
117 | @selection_filter
118 | def ends_with(columns, suffix):
119 | return [c for c in columns if c.endswith(suffix)]
120 |
121 |
122 | @selection_filter
123 | def contains(columns, substr):
124 | return [c for c in columns if substr in c]
125 |
126 |
127 | @selection_filter
128 | def matches(columns, pattern):
129 | return [c for c in columns if re.search(pattern, c)]
130 |
131 |
132 | @selection_filter
133 | def everything(columns):
134 | return columns
135 |
136 |
137 | @selection_filter
138 | def num_range(columns, prefix, range):
139 | colnames = [prefix + str(i) for i in range]
140 | return [c for c in columns if c in colnames]
141 |
142 |
143 | @selection_filter
144 | def one_of(columns, specified):
145 | return [c for c in columns if c in specified]
146 |
147 |
148 | @selection_filter
149 | def columns_between(columns, start_col, end_col, inclusive=True):
150 | if isinstance(start_col, str):
151 | start_col = columns.index(start_col)
152 | if isinstance(end_col, str):
153 | end_col = columns.index(end_col)
154 | return columns[start_col:end_col + int(inclusive)]
155 |
156 |
157 | @selection_filter
158 | def columns_from(columns, start_col):
159 | if isinstance(start_col, str):
160 | start_col = columns.index(start_col)
161 | return columns[start_col:]
162 |
163 |
164 | @selection_filter
165 | def columns_to(columns, end_col, inclusive=False):
166 | if isinstance(end_col, str):
167 | end_col = columns.index(end_col)
168 | return columns[:end_col + int(inclusive)]
169 |
--------------------------------------------------------------------------------
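A short sketch of column selection with the filters defined above (column names are from the bundled diamonds data):

```python
from dfply import *

d1 = diamonds >> select(X.carat, X.cut)        # select by symbol
d2 = diamonds >> select(starts_with('c'))      # carat, cut, color, clarity
d3 = diamonds >> drop(columns_from('x'))       # drop x, y, z
```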
/dfply/window_functions.py:
--------------------------------------------------------------------------------
1 | from .base import *
2 |
3 |
4 | # ------------------------------------------------------------------------------
5 | # Window functions
6 | # ------------------------------------------------------------------------------
7 |
8 | @make_symbolic
9 | def lead(series, i=1):
10 | """
11 |     Returns a series shifted forward by `i` positions. `NaN` values are
12 |     filled in at the end.
13 |
14 |     Same as a call to `series.shift(-i)`.
15 |
16 | Args:
17 | series: column to shift forward.
18 | i (int): number of positions to shift forward.
19 | """
20 |
21 | shifted = series.shift(i * -1)
22 | return shifted
23 |
24 |
25 | @make_symbolic
26 | def lag(series, i=1):
27 | """
28 |     Returns a series shifted backward by `i` positions. `NaN` values are
29 |     filled in at the beginning.
30 |
31 |     Same as a call to `series.shift(i)`.
32 |
33 | Args:
34 | series: column to shift backward.
35 | i (int): number of positions to shift backward.
36 | """
37 |
38 | shifted = series.shift(i)
39 | return shifted
40 |
41 |
42 | @make_symbolic
43 | def between(series, a, b, inclusive=False):
44 | """
45 | Returns a boolean series specifying whether rows of the input series
46 | are between values `a` and `b`.
47 |
48 | Args:
49 | series: column to compare, typically symbolic.
50 | a: value series must be greater than (or equal to if `inclusive=True`)
51 | for the output series to be `True` at that position.
52 | b: value series must be less than (or equal to if `inclusive=True`) for
53 | the output series to be `True` at that position.
54 |
55 | Kwargs:
56 | inclusive (bool): If `True`, comparison is done with `>=` and `<=`.
57 | If `False` (the default), comparison uses `>` and `<`.
58 | """
59 |
60 |     if inclusive:
61 |         met_condition = (series >= a) & (series <= b)
62 |     else:
63 |         met_condition = (series > a) & (series < b)
64 | return met_condition
65 |
66 |
67 | @make_symbolic
68 | def dense_rank(series, ascending=True):
69 | """
70 | Equivalent to `series.rank(method='dense', ascending=ascending)`.
71 |
72 | Args:
73 | series: column to rank.
74 |
75 | Kwargs:
76 | ascending (bool): whether to rank in ascending order (default is `True`).
77 | """
78 |
79 | ranks = series.rank(method='dense', ascending=ascending)
80 | return ranks
81 |
82 |
83 | @make_symbolic
84 | def min_rank(series, ascending=True):
85 | """
86 | Equivalent to `series.rank(method='min', ascending=ascending)`.
87 |
88 | Args:
89 | series: column to rank.
90 |
91 | Kwargs:
92 | ascending (bool): whether to rank in ascending order (default is `True`).
93 | """
94 |
95 | ranks = series.rank(method='min', ascending=ascending)
96 | return ranks
97 |
98 |
99 | @make_symbolic
100 | def cumsum(series):
101 | """
102 | Calculates cumulative sum of values. Equivalent to `series.cumsum()`.
103 |
104 | Args:
105 | series: column to compute cumulative sum for.
106 | """
107 |
108 | sums = series.cumsum()
109 | return sums
110 |
111 |
112 | @make_symbolic
113 | def cummean(series):
114 | """
115 | Calculates cumulative mean of values. Equivalent to
116 | `series.expanding().mean()`.
117 |
118 | Args:
119 | series: column to compute cumulative mean for.
120 | """
121 |
122 | means = series.expanding().mean()
123 | return means
124 |
125 |
126 | @make_symbolic
127 | def cummax(series):
128 | """
129 | Calculates cumulative maximum of values. Equivalent to
130 | `series.expanding().max()`.
131 |
132 | Args:
133 | series: column to compute cumulative maximum for.
134 | """
135 |
136 | maxes = series.expanding().max()
137 | return maxes
138 |
139 |
140 | @make_symbolic
141 | def cummin(series):
142 | """
143 | Calculates cumulative minimum of values. Equivalent to
144 | `series.expanding().min()`.
145 |
146 | Args:
147 | series: column to compute cumulative minimum for.
148 | """
149 |
150 | mins = series.expanding().min()
151 | return mins
152 |
153 |
154 | @make_symbolic
155 | def cumprod(series):
156 | """
157 | Calculates cumulative product of values. Equivalent to
158 | `series.cumprod()`.
159 |
160 | Args:
161 | series: column to compute cumulative product for.
162 | """
163 |
164 | prods = series.cumprod()
165 | return prods
166 |
167 |
168 | @make_symbolic
169 | def cumany(series):
170 | """
171 | Calculates cumulative any of values. Equivalent to
172 | `series.expanding().apply(np.any).astype(bool)`.
173 |
174 | Args:
175 | series: column to compute cumulative any for.
176 | """
177 |
178 | anys = series.expanding().apply(np.any).astype(bool)
179 | return anys
180 |
181 |
182 | @make_symbolic
183 | def cumall(series):
184 | """
185 | Calculates cumulative all of values. Equivalent to
186 | `series.expanding().apply(np.all).astype(bool)`.
187 |
188 | Args:
189 | series: column to compute cumulative all for.
190 | """
191 |
192 | alls = series.expanding().apply(np.all).astype(bool)
193 | return alls
194 |
195 |
196 | @make_symbolic
197 | def percent_rank(series, ascending=True):
198 |     """
199 |     Returns the percentile rank of values, scaled between 0 and 1.
200 |     Equivalent to `(series.rank(method='min', ascending=ascending) - 1) /
201 |     (series.size - 1)`; a single-element series is given rank 0.
202 |
203 |     Args:
204 |         series: column to rank.
205 |
206 |     Kwargs:
207 |         ascending (bool): whether to rank in ascending order (default is `True`).
208 |     """
209 |
210 |     if series.size == 1:
211 |         return 0
212 |     percents = (series.rank(method='min', ascending=ascending) - 1) / (series.size - 1)
213 |     return percents
214 |
215 |
216 | @make_symbolic
217 | def row_number(series, ascending=True):
218 |     """
219 |     Returns row number based on column rank.
220 |     Equivalent to `series.rank(method='first', ascending=ascending)`.
221 |
222 |     Args:
223 |         series: column to rank.
224 |
225 |     Kwargs:
226 |         ascending (bool): whether to rank in ascending order (default is `True`).
227 |
228 |     Usage:
229 |         diamonds >> head() >> mutate(rn=row_number(X.x))
230 |
231 |            carat      cut color clarity  depth  table  price     x     y     z   rn
232 |         0   0.23    Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43  2.0
233 |         1   0.21  Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31  1.0
234 |         2   0.23     Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31  3.0
235 |         3   0.29  Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63  4.0
236 |         4   0.31     Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75  5.0
237 |     """
238 |
239 |     series_rank = series.rank(method='first', ascending=ascending)
240 |     return series_rank
241 |
--------------------------------------------------------------------------------
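A minimal sketch of window functions inside a grouped `mutate` (the pattern noted in the comment near the bottom of test_subset.py):

```python
from dfply import *

shifted = (diamonds >> group_by(X.cut) >>
           mutate(price_lag=lag(X.price, 1),        # previous price within each cut
                  price_rank=min_rank(X.price)) >>  # min-rank of price within each cut
           ungroup())
```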
/dfply/set_ops.py:
--------------------------------------------------------------------------------
1 | from .base import *
2 | import warnings
3 | import pandas as pd
4 |
5 |
6 | def validate_set_ops(df, other):
7 | """
8 | Helper function to ensure that DataFrames are valid for set operations.
9 | Columns must be the same name in the same order, and indices must be of the
10 | same dimension with the same names.
11 | """
12 |
13 | if df.columns.values.tolist() != other.columns.values.tolist():
14 | not_in_df = [col for col in other.columns if col not in df.columns]
15 | not_in_other = [col for col in df.columns if col not in other.columns]
16 | error_string = 'Error: not compatible.'
17 | if len(not_in_df):
18 | error_string += ' Cols in y but not x: ' + str(not_in_df) + '.'
19 | if len(not_in_other):
20 | error_string += ' Cols in x but not y: ' + str(not_in_other) + '.'
21 | raise ValueError(error_string)
22 | if len(df.index.names) != len(other.index.names):
23 | raise ValueError('Index dimension mismatch')
24 | if df.index.names != other.index.names:
25 | raise ValueError('Index mismatch')
26 | else:
27 | return
28 |
29 |
30 | # ------------------------------------------------------------------------------
31 | # `union`
32 | # ------------------------------------------------------------------------------
33 |
34 | @pipe
35 | def union(df, other, index=False, keep='first'):
36 | """
37 | Returns rows that appear in either DataFrame.
38 |
39 | Args:
40 | df (pandas.DataFrame): data passed in through the pipe.
41 | other (pandas.DataFrame): other DataFrame to use for set operation with
42 | the first.
43 |
44 | Kwargs:
45 | index (bool): Boolean indicating whether to consider the pandas index
46 | as part of the set operation (default `False`).
47 | keep (str): Indicates which duplicate should be kept. Options are `'first'`
48 | and `'last'`.
49 | """
50 | validate_set_ops(df, other)
51 |     stacked = pd.concat([df, other])  # DataFrame.append was removed in pandas 2.x
52 | if index:
53 | stacked_reset_indexes = stacked.reset_index()
54 | index_cols = [col for col in stacked_reset_indexes.columns if col not in df.columns]
55 | index_name = df.index.names
56 | return_df = stacked_reset_indexes.drop_duplicates(keep=keep).set_index(index_cols)
57 | return_df.index.names = index_name
58 | return return_df
59 | else:
60 | return stacked.drop_duplicates(keep=keep)
61 |
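    | # A minimal usage sketch (hypothetical DataFrames `a` and `b` with identical
    | # columns):
    | #
    | #   a >> union(b)               # rows appearing in either a or b, deduplicated
    | #   a >> union(b, keep='last')  # keep the later occurrence of each duplicate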
62 |
63 | # ------------------------------------------------------------------------------
64 | # `intersect`
65 | # ------------------------------------------------------------------------------
66 |
67 |
68 | @pipe
69 | def intersect(df, other, index=False, keep='first'):
70 | """
71 | Returns rows that appear in both DataFrames.
72 |
73 | Args:
74 | df (pandas.DataFrame): data passed in through the pipe.
75 | other (pandas.DataFrame): other DataFrame to use for set operation with
76 | the first.
77 |
78 | Kwargs:
79 | index (bool): Boolean indicating whether to consider the pandas index
80 | as part of the set operation (default `False`).
81 | keep (str): Indicates which duplicate should be kept. Options are `'first'`
82 | and `'last'`.
83 | """
84 |
85 | validate_set_ops(df, other)
86 | if index:
87 | df_reset_index = df.reset_index()
88 | other_reset_index = other.reset_index()
89 | index_cols = [col for col in df_reset_index.columns if col not in df.columns]
90 | df_index_names = df.index.names
91 | return_df = (pd.merge(df_reset_index, other_reset_index,
92 | how='inner',
93 | left_on=df_reset_index.columns.values.tolist(),
94 | right_on=df_reset_index.columns.values.tolist())
95 | .set_index(index_cols))
96 | return_df.index.names = df_index_names
97 | return_df = return_df.drop_duplicates(keep=keep)
98 | return return_df
99 | else:
100 | return_df = pd.merge(df, other,
101 | how='inner',
102 | left_on=df.columns.values.tolist(),
103 | right_on=df.columns.values.tolist())
104 | return_df = return_df.drop_duplicates(keep=keep)
105 | return return_df
106 |
107 |
108 | # ------------------------------------------------------------------------------
109 | # `set_diff`
110 | # ------------------------------------------------------------------------------
111 |
112 |
113 | @pipe
114 | def set_diff(df, other, index=False, keep='first'):
115 | """
116 | Returns rows that appear in the first DataFrame but not the second.
117 |
118 | Args:
119 | df (pandas.DataFrame): data passed in through the pipe.
120 | other (pandas.DataFrame): other DataFrame to use for set operation with
121 | the first.
122 |
123 | Kwargs:
124 | index (bool): Boolean indicating whether to consider the pandas index
125 | as part of the set operation (default `False`).
126 | keep (str): Indicates which duplicate should be kept. Options are `'first'`
127 | and `'last'`.
128 | """
129 |
130 | validate_set_ops(df, other)
131 | if index:
132 | df_reset_index = df.reset_index()
133 | other_reset_index = other.reset_index()
134 | index_cols = [col for col in df_reset_index.columns if col not in df.columns]
135 | df_index_names = df.index.names
136 | return_df = (pd.merge(df_reset_index, other_reset_index,
137 | how='left',
138 | left_on=df_reset_index.columns.values.tolist(),
139 | right_on=other_reset_index.columns.values.tolist(),
140 | indicator=True)
141 | .set_index(index_cols))
142 | return_df = return_df[return_df._merge == 'left_only']
143 | return_df.index.names = df_index_names
144 | return_df = return_df.drop_duplicates(keep=keep)[df.columns]
145 | return return_df
146 | else:
147 | return_df = pd.merge(df, other,
148 | how='left',
149 | left_on=df.columns.values.tolist(),
150 | right_on=df.columns.values.tolist(),
151 | indicator=True)
152 | return_df = return_df[return_df._merge == 'left_only']
153 | return_df = return_df.drop_duplicates(keep=keep)[df.columns]
154 | return return_df
155 |
--------------------------------------------------------------------------------
/test/test_reshape.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from dfply import *
4 |
5 | ##==============================================================================
6 | ## reshape test functions
7 | ##==============================================================================
8 |
9 |
10 | def arrange_apply_helperfunc(df):
11 | df = df.sort_values('depth', ascending=False)
12 | df = df.head(5)
13 | return df
14 |
15 | # def test_arrange_small():
16 | # d = diamonds >> arrange(desc(X.cut), desc(X.price))
17 | # print(d.head(25))
18 | # assert False
19 |
20 |
21 | def test_arrange():
22 | df = diamonds.groupby('cut').apply(arrange_apply_helperfunc).reset_index(drop=True)
23 | d = (diamonds >> group_by('cut') >> arrange('depth', ascending=False) >>
24 | head(5) >> ungroup()).reset_index(drop=True)
25 | #print('df', df, df.shape)
26 | #print('d', d, d.shape)
27 | assert df.equals(d)
28 |
29 | d = (diamonds >> group_by('cut') >> arrange(X.depth, ascending=False) >>
30 | head(5) >> ungroup()).reset_index(drop=True)
31 | assert df.equals(d)
32 |
33 | print(type(d), type(df), type(diamonds))
34 |
35 | df = diamonds.sort_values(['cut','price'], ascending=False)
36 | d = diamonds >> arrange(desc(X.cut), desc(X.price))
37 | print('df', df >> head(5))
38 | print('d', d >> head(5))
39 | assert df.equals(d)
40 |
41 |
42 | def test_rename():
43 | df = diamonds.rename(columns={'cut':'Cut','table':'Table','carat':'Carat'})
44 | d = diamonds >> rename(Cut=X.cut, Table=X.table, Carat='carat')
45 | assert df.equals(d)
46 |
47 |
48 | @pytest.fixture
49 | def elongated():
50 | elongated = diamonds >> gather('variable', 'value', add_id=True)
51 | return elongated
52 |
53 |
54 | def test_gather(elongated):
55 | d = diamonds >> gather('variable', 'value', ['price', 'depth','x','y','z'])
56 |
57 | variables = ['price','depth','x','y','z']
58 | id_vars = [c for c in diamonds.columns if c not in variables]
59 | df = pd.melt(diamonds, id_vars, variables, 'variable', 'value')
60 |
61 | assert df.equals(d)
62 |
63 | d = diamonds >> gather('variable', 'value')
64 |
65 | variables = diamonds.columns.tolist()
66 | id_vars = []
67 | df = pd.melt(diamonds, id_vars, variables, 'variable', 'value')
68 |
69 | assert df.equals(d)
70 |
71 | df = diamonds.copy()
72 | df['_ID'] = np.arange(df.shape[0])
73 | df = pd.melt(df, ['_ID'], variables, 'variable', 'value')
74 |
75 | assert df.equals(elongated)
76 |
77 |
78 | def test_spread(elongated):
79 |
80 | columns = elongated.columns.tolist()
81 | id_cols = ['_ID']
82 |
83 | df = elongated.copy()
84 | df['temp_index'] = df['_ID'].values
85 | df = df.set_index('temp_index')
86 | spread_data = df[['variable','value']]
87 |
88 | spread_data = spread_data.pivot(columns='variable', values='value')
89 | converted_spread = spread_data.copy()
90 |
91 | columns_to_convert = [col for col in spread_data if col not in columns]
92 | converted_spread = convert_type(converted_spread, columns_to_convert)
93 |
94 | df = df[['_ID']].drop_duplicates()
95 |
96 | df_spread = df.merge(spread_data, left_index=True, right_index=True).reset_index(drop=True)
97 | df_conv = df.merge(converted_spread, left_index=True, right_index=True).reset_index(drop=True)
98 |
99 | d_spread = elongated >> spread('variable', 'value')
100 | d_spread_conv = elongated >> spread('variable', 'value', convert=True)
101 |
102 | assert df_spread.equals(d_spread)
103 | assert df_conv.equals(d_spread_conv)
104 |
105 |
106 | def test_separate():
107 |
108 | d = pd.DataFrame({
109 | 'a':['1-a-3','1-b','1-c-3-4','9-d-1','10']
110 | })
111 |
112 | test1 = d >> separate(X.a, ['a1','a2','a3'],
113 | remove=True, convert=False,
114 | extra='merge', fill='right')
115 |
116 | true1 = pd.DataFrame({
117 | 'a1':['1','1','1','9','10'],
118 | 'a2':['a','b','c','d',np.nan],
119 | 'a3':['3',np.nan,'3-4','1',np.nan]
120 | })
121 | print(test1)
122 | print(true1)
123 | assert true1.equals(test1)
124 |
125 | test2 = d >> separate(X.a, ['a1','a2','a3'],
126 | remove=True, convert=False,
127 | extra='merge', fill='left')
128 |
129 | true2 = pd.DataFrame({
130 | 'a1':['1',np.nan,'1','9',np.nan],
131 | 'a2':['a','1','c','d',np.nan],
132 | 'a3':['3','b','3-4','1','10']
133 | })
134 | assert true2.equals(test2)
135 |
136 | test3 = d >> separate(X.a, ['a1','a2','a3'],
137 | remove=True, convert=True,
138 | extra='merge', fill='right')
139 |
140 | true3 = pd.DataFrame({
141 | 'a1':[1,1,1,9,10],
142 | 'a2':['a','b','c','d',np.nan],
143 | 'a3':['3',np.nan,'3-4','1',np.nan]
144 | })
145 | assert true3.equals(test3)
146 |
147 | test4 = d >> separate(X.a, ['col1','col2'], sep=[1,3],
148 | remove=True, convert=False, extra='drop', fill='left')
149 |
150 | true4 = pd.DataFrame({
151 | 'col1':['1','1','1','9','1'],
152 | 'col2':['-a','-b','-c','-d','0']
153 | })
154 | assert true4.equals(test4)
155 |
156 | test5 = d >> separate(X.a, ['col1','col2'], sep=[1,3],
157 | remove=False, convert=False, extra='drop', fill='left')
158 |
159 | true5 = pd.DataFrame({
160 | 'a':['1-a-3','1-b','1-c-3-4','9-d-1','10'],
161 | 'col1':['1','1','1','9','1'],
162 | 'col2':['-a','-b','-c','-d','0']
163 | })
164 | assert true5.equals(test5)
165 |
166 | test6 = d >> separate(X.a, ['col1','col2','col3'], sep=[30],
167 | remove=True, convert=False, extra='drop', fill='left')
168 |
169 | true6 = pd.DataFrame({
170 | 'col1':['1-a-3','1-b','1-c-3-4','9-d-1','10'],
171 | 'col2':[np.nan,np.nan,np.nan,np.nan,np.nan],
172 | 'col3':[np.nan,np.nan,np.nan,np.nan,np.nan]
173 | })
174 | assert true6.equals(test6)
175 |
176 |
177 | def test_unite():
178 | d = pd.DataFrame({
179 | 'a':[1,2,3],
180 | 'b':['a','b','c'],
181 | 'c':[True, False, np.nan]
182 | })
183 |
184 | test1 = d >> unite('united', X.a, 'b', 2, remove=True, na_action='maintain')
185 | true1 = pd.DataFrame({
186 | 'united':['1_a_True','2_b_False',np.nan]
187 | })
188 | assert true1.equals(test1)
189 |
190 | test2 = d >> unite('united', ['a','b','c'], remove=True, na_action='ignore',
191 | sep='*')
192 | true2 = pd.DataFrame({
193 | 'united':['1*a*True','2*b*False','3*c']
194 | })
195 | assert test2.equals(true2)
196 |
197 | test3 = d >> unite('united', d.columns, remove=True, na_action='as_string')
198 | true3 = pd.DataFrame({
199 | 'united':['1_a_True','2_b_False','3_c_nan']
200 | })
201 | assert true3.equals(test3)
202 |
203 | test4 = d >> unite('united', d.columns, remove=False, na_action='as_string')
204 | true4 = pd.DataFrame({
205 | 'a':[1,2,3],
206 | 'b':['a','b','c'],
207 | 'c':[True, False, np.nan],
208 | 'united':['1_a_True','2_b_False','3_c_nan']
209 | })
210 |
211 | print(true4)
212 | print(test4)
213 | assert true4.equals(test4)
214 |
--------------------------------------------------------------------------------
/test/test_window_functions.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from dfply import *
4 |
5 | ##==============================================================================
6 | ## window function tests
7 | ##==============================================================================
8 |
9 |
10 | def test_lead():
11 |     d = diamonds >> mutate(price_lead = lead(X.price, i=2))
12 |     df = diamonds.assign(price_lead = diamonds.price.shift(-2))
13 | assert df.equals(d)
14 |
15 |
16 | def test_lag():
17 | d = diamonds >> mutate(price_lag = lag(X.price, i=2))
18 | df = diamonds.assign(price_lag = diamonds.price.shift(2))
19 | assert df.equals(d)
20 |
21 |
22 | def test_between():
23 | d = diamonds >> mutate(z_btwn_x_y = between(X.z, X.x, X.y))
24 | df = diamonds.copy()
25 | df['z_btwn_x_y'] = (df.z > df.x) & (df.z < df.y)
26 | assert df.equals(d)
27 |
28 |
29 | def test_dense_rank():
30 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x)
31 | df_dr = df >> mutate(dr=dense_rank(X.x))
32 | df_truth = df
33 | df_truth['dr'] = pd.Series([2.0, 1.0, 3.0, 4.0, 5.0])
34 | assert df_dr.equals(df_truth)
35 | df_dr = df >> mutate(dr=dense_rank(X.cut))
36 | df_truth['dr'] = pd.Series([2.0, 3.0, 1.0, 3.0, 1.0])
37 | assert df_dr.equals(df_truth)
38 | df_dr = df >> group_by(X.cut) >> mutate(dr=dense_rank(X.x))
39 | df_truth['dr'] = pd.Series([1.0, 1.0, 1.0, 2.0, 2.0])
40 | assert df_dr.sort_index().equals(df_truth)
41 | df_dr = df >> mutate(dr=dense_rank(X.x, ascending=False))
42 | df_truth['dr'] = pd.Series([4.0, 5.0, 3.0, 2.0, 1.0])
43 | assert df_dr.equals(df_truth)
44 |
45 |
46 | def test_min_rank():
47 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x)
48 | df_mr = df >> mutate(mr=min_rank(X.x))
49 | df_truth = df
50 | df_truth['mr'] = pd.Series([2.0, 1.0, 3.0, 4.0, 5.0])
51 | assert df_mr.equals(df_truth)
52 | df_mr = df >> mutate(mr=min_rank(X.cut))
53 | df_truth['mr'] = pd.Series([3.0, 4.0, 1.0, 4.0, 1.0])
54 | assert df_mr.equals(df_truth)
55 | df_mr = df >> group_by(X.cut) >> mutate(mr=min_rank(X.x))
56 | df_truth['mr'] = pd.Series([1.0, 1.0, 1.0, 2.0, 2.0])
57 | assert df_mr.sort_index().equals(df_truth)
58 | df_mr = df >> mutate(mr=min_rank(X.x, ascending=False))
59 | df_truth['mr'] = pd.Series([4.0, 5.0, 3.0, 2.0, 1.0])
60 | assert df_mr.equals(df_truth)
61 |
62 |
63 | def test_cumsum():
64 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x)
65 | df_cs = df >> mutate(cs=cumsum(X.x))
66 | df_truth = df
67 | df_truth['cs'] = pd.Series([3.95, 7.84, 11.89, 16.09, 20.43])
68 | pd.util.testing.assert_frame_equal(df_cs, df_truth)
69 | #assert df_cs.equals(df_truth)
70 | df_cs = df >> group_by(X.cut) >> mutate(cs=cumsum(X.x))
71 | df_truth['cs'] = pd.Series([3.95, 3.89, 4.05, 8.09, 8.39])
72 | pd.util.testing.assert_frame_equal(df_cs.sort_index(), df_truth)
73 | #assert df_cs.equals(df_truth)
74 |
75 |
76 | def test_cummean():
77 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x)
78 | df_cm = df >> mutate(cm=cummean(X.x))
79 | df_truth = df
80 | df_truth['cm'] = pd.Series([3.950000, 3.920000, 3.963333, 4.022500, 4.086000])
81 | pd.util.testing.assert_frame_equal(df_cm, df_truth)
82 | #assert df_cm.equals(df_truth)
83 | df_cm = df >> group_by(X.cut) >> mutate(cm=cummean(X.x))
84 | df_truth['cm'] = pd.Series([3.950, 3.890, 4.050, 4.045, 4.195])
85 | pd.util.testing.assert_frame_equal(df_cm.sort_index(), df_truth)
86 | #assert df_cm.equals(df_truth)
87 |
88 |
89 | def test_cummax():
90 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x)
91 | df_cm = df >> mutate(cm=cummax(X.x))
92 | df_truth = df
93 | df_truth['cm'] = pd.Series([3.95, 3.95, 4.05, 4.20, 4.34])
94 | pd.util.testing.assert_frame_equal(df_cm, df_truth)
95 | #assert df_cm.equals(df_truth)
96 | df_cm = df >> group_by(X.cut) >> mutate(cm=cummax(X.x))
97 | df_truth['cm'] = pd.Series([3.95, 3.89, 4.05, 4.20, 4.34])
98 | pd.util.testing.assert_frame_equal(df_cm.sort_index(), df_truth)
99 | #assert df_cm.equals(df_truth)
100 |
101 |
102 | def test_cummin():
103 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x)
104 | df_cm = df >> mutate(cm=cummin(X.x))
105 | df_truth = df
106 | df_truth['cm'] = pd.Series([3.95, 3.89, 3.89, 3.89, 3.89])
107 | pd.util.testing.assert_frame_equal(df_cm, df_truth)
108 | #assert df_cm.equals(df_truth)
109 | df_cm = df >> group_by(X.cut) >> mutate(cm=cummin(X.x))
110 | df_truth['cm'] = pd.Series([3.95, 3.89, 4.05, 3.89, 4.05])
111 | pd.util.testing.assert_frame_equal(df_cm.sort_index(), df_truth)
112 | #assert df_cm.equals(df_truth)
113 |
114 |
115 | def test_cumprod():
116 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x)
117 | df_cp = df >> mutate(cp=cumprod(X.x))
118 | df_truth = df.copy()
119 | df_truth['cp'] = pd.Series([3.950000, 15.365500, 62.230275, 261.367155, 1134.333453])
120 | pd.util.testing.assert_frame_equal(df_cp, df_truth)
121 | #assert df_cp.equals(df_truth)
122 | df_cp = df >> group_by(X.cut) >> mutate(cp=cumprod(X.x))
123 | df_truth['cp'] = pd.Series([3.950, 3.890, 4.050, 16.338, 17.577])
124 | # some tricky floating point stuff going on here
125 | diffs = df_cp.sort_index().cp - df_truth.cp
126 | assert all(diffs < .0000001)
127 |
128 |
129 | def test_cumany():
130 | df = pd.DataFrame({
131 | 'a':[False,False,True,True,False,True],
132 | 'b':['x','x','x','x','y','y']
133 | })
134 |
135 | d = df >> mutate(ca=cumany(X.a))
136 | assert d.equals(df.assign(ca=[False,False,True,True,True,True]))
137 |
138 | d = df >> group_by(X.b) >> mutate(ca=cumany(X.a))
139 | assert d.sort_index().equals(df.assign(ca=[False,False,True,True,False,True]))
140 |
141 |
142 | def test_cumall():
143 | df = pd.DataFrame({
144 | 'a':[True,True,False,True,False,True],
145 | 'b':['x','x','x','y','y','y']
146 | })
147 |
148 | d = df >> mutate(ca=cumall(X.a))
149 | assert d.equals(df.assign(ca=[True,True,False,False,False,False]))
150 |
151 | d = df >> group_by(X.b) >> mutate(ca=cumall(X.a))
152 | assert d.sort_index().equals(df.assign(ca=[True,True,False,True,False,False]))
153 |
154 |
155 | def test_percent_rank():
156 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x)
157 | df_pr = df >> mutate(pr=percent_rank(X.x))
158 | df_truth = df.copy()
159 | assert df_pr.equals(df_truth.assign(pr=[.25, 0.00, 0.50, 0.75, 1.00]))
160 | df_pr = df >> mutate(pr=percent_rank(X.cut))
161 | assert df_pr.equals(df_truth.assign(pr=[0.50, 0.75, 0.00, 0.75, 0.00]))
162 | df_pr = df >> group_by(X.cut) >> mutate(pr=percent_rank(X.x))
163 | assert df_pr.sort_index().equals(df_truth.assign(pr=[0.0, 0.0, 0.0, 1.0, 1.0]))
164 | df_pr = df >> mutate(pr=percent_rank(X.x, ascending=False))
165 | assert df_pr.equals(df_truth.assign(pr=[0.75, 1.0, 0.50, 0.25, 0.00]))
166 |
167 |
168 | def test_row_number():
169 | df = diamonds.copy().head(5).sort_values(by='x')
170 | df['rn'] = range(1, df.shape[0] + 1)
171 | df['rn'] = df['rn'].astype(float)
172 | df.sort_index(inplace=True)
173 | assert df.equals(diamonds >> head(5) >> mutate(rn=row_number(X.x)))
174 | # test 2: row number with desc() option
175 | df = diamonds.copy().head(5).sort_values(by='x', ascending=False)
176 | df['rn'] = range(1, df.shape[0] + 1)
177 | df['rn'] = df['rn'].astype(float)
178 | df.sort_index(inplace=True)
179 | assert df.equals(diamonds >> head(5) >> mutate(rn=row_number(desc(X.x))))
180 | # test 3: row number with ascending keyword
181 | df = diamonds.copy().head(5).sort_values(by='x', ascending=False)
182 | df['rn'] = range(1, df.shape[0] + 1)
183 | df['rn'] = df['rn'].astype(float)
184 | df.sort_index(inplace=True)
185 | assert df.equals(diamonds >> head(5) >> mutate(rn=row_number(X.x, ascending=False)))
186 | # test 4: with a group by
187 | df = diamonds.copy().head(5)
188 | df['rn'] = [1, 1, 1, 2, 2]
189 | df['rn'] = df['rn'].astype(float)
190 | assert df.equals((diamonds >> head(5) >> group_by(X.cut) >> mutate(rn=row_number(X.x))).sort_index())
191 |
--------------------------------------------------------------------------------
/dfply/vector.py:
--------------------------------------------------------------------------------
1 | from .base import *
2 | import collections.abc
3 |
4 |
5 | # ------------------------------------------------------------------------------
6 | # series ordering
7 | # ------------------------------------------------------------------------------
8 |
9 | @make_symbolic
10 | def order_series_by(series, order_series):
11 | """
12 |     Orders one series according to another series, or a list of other
13 |     series. If a list of other series is specified, ordering is done
14 |     hierarchically, as when a list of columns is supplied to `.sort_values()`.
15 |
16 | Args:
17 | series (:obj:`pandas.Series`): the pandas Series object to be reordered.
18 | order_series: either a pandas Series object or a list of pandas Series
19 | objects. These will be sorted using `.sort_values()` with
20 | `ascending=True`, and the new order will be used to reorder the
21 | Series supplied in the first argument.
22 |
23 | Returns:
24 | reordered `pandas.Series` object
25 | """
26 |
27 | if isinstance(order_series, (list, tuple)):
28 | sorter = pd.concat(order_series, axis=1)
29 | sorter_columns = ['_sorter' + str(i) for i in range(len(order_series))]
30 | sorter.columns = sorter_columns
31 | sorter['series'] = series.values
32 | sorted_series = sorter.sort_values(sorter_columns)['series']
33 | return sorted_series
34 | else:
35 | sorted_series = pd.DataFrame({
36 | 'series': series.values,
37 | 'order': order_series.values
38 | }).sort_values('order', ascending=True)['series']
39 | return sorted_series
40 |
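    | # A small sketch (hypothetical series):
    | #
    | #   s = pd.Series([10, 20, 30])
    | #   by = pd.Series([3, 1, 2])
    | #   order_series_by(s, by)   # -> 20, 30, 10 (s reordered by ascending `by`)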
41 |
42 | @make_symbolic
43 | def desc(series):
44 | """
45 | Mimics the functionality of the R desc function. Essentially inverts a
46 | series object to make ascending sort act like descending sort.
47 |
48 | Args:
49 | series (:obj:`pandas.Series`): pandas series to be inverted prior to
50 | ordering/sorting.
51 |
52 | Returns:
53 |         inverted `pandas.Series`. The returned series contains numeric ranks
54 |         (floats), regardless of the type of the original series.
55 |
56 | Example:
57 |
58 |     First group by cut, then take the first value of price ordered ascending
59 |     (price_low) and ordered descending via the `desc` function (price_high).
60 | 
61 |     diamonds >> group_by(X.cut) >> summarize(price_low=first(X.price, order_by=X.price),
62 |                                              price_high=first(X.price, order_by=desc(X.price)))
63 | 
64 |     cut  price_high  price_low
65 | 0 Fair 18574 337
66 | 1 Good 18788 327
67 | 2 Ideal 18806 326
68 | 3 Premium 18823 326
69 | 4 Very Good 18818 336
70 | """
71 |
72 | return series.rank(method='min', ascending=False)
73 |
74 |
75 | # ------------------------------------------------------------------------------
76 | # coalesce
77 | # ------------------------------------------------------------------------------
78 |
79 | @make_symbolic
80 | def coalesce(*series):
81 | """
82 | Takes the first non-NaN value in order across the specified series,
83 | returning a new series. Mimics the coalesce function in dplyr and SQL.
84 |
85 | Args:
86 | *series: Series objects, typically represented in their symbolic form
87 | (like X.series).
88 |
89 | Example:
90 | df = pd.DataFrame({
91 | 'a':[1,np.nan,np.nan,np.nan,np.nan],
92 | 'b':[2,3,np.nan,np.nan,np.nan],
93 | 'c':[np.nan,np.nan,4,5,np.nan],
94 | 'd':[6,7,8,9,np.nan]
95 | })
96 | df >> transmute(coal=coalesce(X.a, X.b, X.c, X.d))
97 |
98 | coal
99 | 0 1
100 | 1 3
101 | 2 4
102 | 3 5
103 |     4     NaN
104 | """
105 |
106 | series = [pd.Series(s) for s in series]
107 | coalescer = pd.concat(series, axis=1)
108 | min_nonna = np.argmin(pd.isnull(coalescer).values, axis=1)
109 | min_nonna = [coalescer.columns[i] for i in min_nonna]
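    |     # note: DataFrame.lookup was removed in pandas 2.x; on modern pandas an
    |     # equivalent is coalescer.to_numpy()[np.arange(len(coalescer)),
    |     #     [coalescer.columns.get_loc(c) for c in min_nonna]]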
110 | return coalescer.lookup(np.arange(coalescer.shape[0]), min_nonna)
111 |
112 |
113 | # ------------------------------------------------------------------------------
114 | # case_when
115 | # ------------------------------------------------------------------------------
116 |
117 | @make_symbolic
118 | def case_when(*conditions):
119 | """
120 | Functions as a switch statement, creating a new series out of logical
121 | conditions specified by 2-item lists where the left-hand item is the
122 | logical condition and the right-hand item is the value where that
123 | condition is true.
124 |
125 | Conditions should go from the most specific to the most general. A
126 | conditional that appears earlier in the series will "overwrite" one that
127 | appears later. Think of it like a series of if-else statements.
128 |
129 | The logicals and values of the condition pairs must be all the same
130 | length, or length 1. Logicals can be vectors of booleans or a single
131 | boolean (`True`, for example, can be the logical statement for the
132 |     final conditional to catch all remaining cases).
133 |
134 | Args:
135 | *conditions: Each condition should be a list with two values. The first
136 | value is a boolean or vector of booleans that specify indices in
137 | which the condition is met. The second value is a vector of values
138 | or single value specifying the outcome where that condition is met.
139 |
140 | Example:
141 | df = pd.DataFrame({
142 | 'num':np.arange(16)
143 | })
144 | df >> mutate(strnum=case_when([X.num % 15 == 0, 'fizzbuzz'],
145 | [X.num % 3 == 0, 'fizz'],
146 | [X.num % 5 == 0, 'buzz'],
147 | [True, X.num.astype(str)]))
148 |
149 | num strnum
150 | 0 0 fizzbuzz
151 | 1 1 1
152 | 2 2 2
153 | 3 3 fizz
154 | 4 4 4
155 | 5 5 buzz
156 | 6 6 fizz
157 | 7 7 7
158 | 8 8 8
159 | 9 9 fizz
160 | 10 10 buzz
161 | 11 11 11
162 | 12 12 fizz
163 | 13 13 13
164 | 14 14 14
165 | 15 15 fizzbuzz
166 | """
167 |
168 | lengths = []
169 | for logical, outcome in conditions:
170 |         if isinstance(logical, collections.abc.Iterable):
171 |             lengths.append(len(logical))
172 |         if isinstance(outcome, collections.abc.Iterable) and not isinstance(outcome, str):
173 | lengths.append(len(outcome))
174 | unique_lengths = np.unique(lengths)
175 | assert len(unique_lengths) == 1
176 | output_len = unique_lengths[0]
177 |
178 | output = []
179 | for logical, outcome in conditions:
180 | if isinstance(logical, bool):
181 | logical = np.repeat(logical, output_len)
182 | if isinstance(logical, pd.Series):
183 | logical = logical.values
184 |         if not isinstance(outcome, collections.abc.Iterable) or isinstance(outcome, str):
185 | outcome = pd.Series(np.repeat(outcome, output_len))
186 | outcome[~logical] = np.nan
187 | output.append(outcome)
188 |
189 | return coalesce(*output)
190 |
191 |
192 | # ------------------------------------------------------------------------------
193 | # if_else
194 | # ------------------------------------------------------------------------------
195 |
196 | @make_symbolic
197 | def if_else(condition, when_true, otherwise):
198 | """
199 | Wraps creation of a series based on if-else conditional logic into a function
200 | call.
201 |
202 | Provide a boolean vector condition, value(s) when true, and value(s)
203 | when false, and a vector will be returned the same length as the conditional
204 | vector according to the logical statement.
205 |
206 | Args:
207 | condition: A boolean vector representing the condition. This is often
208 | a logical statement with a symbolic series.
209 | when_true: A vector the same length as the condition vector or a single
210 | value to apply when the condition is `True`.
211 | otherwise: A vector the same length as the condition vector or a single
212 | value to apply when the condition is `False`.
213 |
214 |     Example:
215 |         diamonds >> mutate(size=if_else(X.carat > 1, 'big', 'small'))
216 |     """
217 |
218 |     if not isinstance(when_true, collections.abc.Iterable) or isinstance(when_true, str):
219 |         when_true = np.repeat(when_true, len(condition))
220 |     if not isinstance(otherwise, collections.abc.Iterable) or isinstance(otherwise, str):
221 | otherwise = np.repeat(otherwise, len(condition))
222 | assert (len(condition) == len(when_true)) and (len(condition) == len(otherwise))
223 |
224 | if isinstance(when_true, pd.Series):
225 | when_true = when_true.values
226 | if isinstance(otherwise, pd.Series):
227 | otherwise = otherwise.values
228 |
229 | output = np.array([when_true[i] if c else otherwise[i]
230 | for i, c in enumerate(condition)])
231 | return output
232 |
233 |
234 | # ------------------------------------------------------------------------------
235 | # na_if
236 | # ------------------------------------------------------------------------------
237 |
238 | @make_symbolic
239 | def na_if(series, *values):
240 | """
241 | If values in a series match a specified value, change them to `np.nan`.
242 |
243 | Args:
244 | series: Series or vector, often symbolic.
245 | *values: Value(s) to convert to `np.nan` in the series.
246 | """
247 |
248 | series = pd.Series(series)
249 | series[series.isin(values)] = np.nan
250 | return series
251 |
--------------------------------------------------------------------------------
/dfply/join.py:
--------------------------------------------------------------------------------
1 | from .base import *
2 |
3 |
4 | # ------------------------------------------------------------------------------
5 | # SQL-style joins
6 | # ------------------------------------------------------------------------------
7 |
8 | def get_join_parameters(join_kwargs):
9 | """
10 | Convenience function to determine the columns to join the right and
11 | left DataFrames on, as well as any suffixes for the columns.
12 | """
13 |
14 | by = join_kwargs.get('by', None)
15 | suffixes = join_kwargs.get('suffixes', ('_x', '_y'))
16 | if isinstance(by, tuple):
17 | left_on, right_on = by
18 | elif isinstance(by, list):
19 | by = [x if isinstance(x, tuple) else (x, x) for x in by]
20 | left_on, right_on = (list(x) for x in zip(*by))
21 | else:
22 | left_on, right_on = by, by
23 | return left_on, right_on, suffixes
24 |
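    | # Illustrative (hypothetical) `by` forms and the parameters they map to
    | # (suffixes default to ('_x', '_y')):
    | #
    | #   {'by': 'x1'}               -> left_on='x1', right_on='x1'
    | #   {'by': ('lhs', 'rhs')}     -> left_on='lhs', right_on='rhs'
    | #   {'by': ['x1', ('a', 'b')]} -> left_on=['x1', 'a'], right_on=['x1', 'b']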
25 |
26 | @pipe
27 | def inner_join(df, other, **kwargs):
28 | """
29 | Joins on values present in both DataFrames.
30 |
31 | Args:
32 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
33 | other (pandas.DataFrame): Right DataFrame
34 |
35 | Kwargs:
36 |         by (str, tuple, or list): Columns to join on. A single string joins on
37 |             that column in both DataFrames; a 2-tuple gives the left and right
38 |             column names; a list may mix strings and 2-tuples.
39 | suffixes (list): String suffixes to append to column names in left
40 | and right DataFrames.
41 |
42 | Example:
43 | a >> inner_join(b, by='x1')
44 |
45 | x1 x2 x3
46 | 0 A 1 True
47 | 1 B 2 False
48 | """
49 |
50 | left_on, right_on, suffixes = get_join_parameters(kwargs)
51 | joined = df.merge(other, how='inner', left_on=left_on,
52 | right_on=right_on, suffixes=suffixes)
53 | return joined
54 |
55 |
56 | @pipe
57 | def full_join(df, other, **kwargs):
58 | """
59 | Joins on values present in either DataFrame. (Alternate to `outer_join`)
60 |
61 | Args:
62 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
63 | other (pandas.DataFrame): Right DataFrame
64 |
65 | Kwargs:
66 |         by (str, tuple, or list): Columns to join on. A single string joins on
67 |             that column in both DataFrames; a 2-tuple gives the left and right
68 |             column names; a list may mix strings and 2-tuples.
69 | suffixes (list): String suffixes to append to column names in left
70 | and right DataFrames.
71 |
72 | Example:
73 |         a >> full_join(b, by='x1')
74 |
75 | x1 x2 x3
76 | 0 A 1.0 True
77 | 1 B 2.0 False
78 | 2 C 3.0 NaN
79 | 3 D NaN True
80 | """
81 |
82 | left_on, right_on, suffixes = get_join_parameters(kwargs)
83 | joined = df.merge(other, how='outer', left_on=left_on,
84 | right_on=right_on, suffixes=suffixes)
85 | return joined
86 |
87 |
88 | @pipe
89 | def outer_join(df, other, **kwargs):
90 | """
91 | Joins on values present in either DataFrame. (Alternate to `full_join`)
92 |
93 | Args:
94 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
95 | other (pandas.DataFrame): Right DataFrame
96 |
97 | Kwargs:
98 |         by (str, tuple, or list): Columns to join on. A single string joins on
99 |             that column in both DataFrames; a 2-tuple gives the left and right
100 |             column names; a list may mix strings and 2-tuples.
101 | suffixes (list): String suffixes to append to column names in left
102 | and right DataFrames.
103 |
104 | Example:
105 |         a >> outer_join(b, by='x1')
106 |
107 | x1 x2 x3
108 | 0 A 1.0 True
109 | 1 B 2.0 False
110 | 2 C 3.0 NaN
111 | 3 D NaN True
112 | """
113 |
114 | left_on, right_on, suffixes = get_join_parameters(kwargs)
115 | joined = df.merge(other, how='outer', left_on=left_on,
116 | right_on=right_on, suffixes=suffixes)
117 | return joined
118 |
119 |
120 | @pipe
121 | def left_join(df, other, **kwargs):
122 | """
123 |     Joins on values present in the left DataFrame.
124 |
125 | Args:
126 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
127 | other (pandas.DataFrame): Right DataFrame
128 |
129 | Kwargs:
130 |         by (str, tuple, or list): Columns to join on. A single string joins on
131 |             that column in both DataFrames; a 2-tuple gives the left and right
132 |             column names; a list may mix strings and 2-tuples.
133 | suffixes (list): String suffixes to append to column names in left
134 | and right DataFrames.
135 |
136 | Example:
137 | a >> left_join(b, by='x1')
138 |
139 | x1 x2 x3
140 | 0 A 1 True
141 | 1 B 2 False
142 | 2 C 3 NaN
143 | """
144 |
145 | left_on, right_on, suffixes = get_join_parameters(kwargs)
146 | joined = df.merge(other, how='left', left_on=left_on,
147 | right_on=right_on, suffixes=suffixes)
148 | return joined
149 |
150 |
151 | @pipe
152 | def right_join(df, other, **kwargs):
153 | """
154 |     Joins on values present in the right DataFrame.
155 |
156 | Args:
157 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
158 | other (pandas.DataFrame): Right DataFrame
159 |
160 | Kwargs:
161 |         by (str, tuple, or list): Columns to join on. A single string joins on
162 |             that column in both DataFrames; a 2-tuple gives the left and right
163 |             column names; a list may mix strings and 2-tuples.
164 | suffixes (list): String suffixes to append to column names in left
165 | and right DataFrames.
166 |
167 | Example:
168 | a >> right_join(b, by='x1')
169 |
170 | x1 x2 x3
171 | 0 A 1.0 True
172 | 1 B 2.0 False
173 | 2 D NaN True
174 | """
175 |
176 | left_on, right_on, suffixes = get_join_parameters(kwargs)
177 | joined = df.merge(other, how='right', left_on=left_on,
178 | right_on=right_on, suffixes=suffixes)
179 | return joined
180 |
181 |
182 | @pipe
183 | def semi_join(df, other, **kwargs):
184 | """
185 | Returns all of the rows in the left DataFrame that have a match
186 | in the right DataFrame.
187 |
188 | Args:
189 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
190 | other (pandas.DataFrame): Right DataFrame
191 |
192 | Kwargs:
193 |         by (str, tuple, or list): Columns to join on. A single string joins on
194 |             that column in both DataFrames; a 2-tuple gives the left and right
195 |             column names; a list may mix strings and 2-tuples.
196 |
197 | Example:
198 | a >> semi_join(b, by='x1')
199 |
200 | x1 x2
201 | 0 A 1
202 | 1 B 2
203 | """
204 |
205 | left_on, right_on, suffixes = get_join_parameters(kwargs)
206 | if not right_on:
207 | right_on = [col_name for col_name in df.columns.values.tolist() if col_name in other.columns.values.tolist()]
208 | left_on = right_on
209 | elif not isinstance(right_on, (list, tuple)):
210 | right_on = [right_on]
211 | other_reduced = other[right_on].drop_duplicates()
212 | joined = df.merge(other_reduced, how='inner', left_on=left_on,
213 | right_on=right_on, suffixes=('', '_y'),
214 | indicator=True).query('_merge=="both"')[df.columns.values.tolist()]
215 | return joined
216 |
217 |
218 | @pipe
219 | def anti_join(df, other, **kwargs):
220 | """
221 | Returns all of the rows in the left DataFrame that do not have a
222 | match in the right DataFrame.
223 |
224 | Args:
225 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
226 | other (pandas.DataFrame): Right DataFrame
227 |
228 | Kwargs:
229 |         by (str, tuple, or list): Columns to join on. A single string joins on
230 |             that column in both DataFrames; a 2-tuple gives the left and right
231 |             column names; a list may mix strings and 2-tuples.
232 |
233 | Example:
234 | a >> anti_join(b, by='x1')
235 |
236 | x1 x2
237 | 2 C 3
238 | """
239 |
240 | left_on, right_on, suffixes = get_join_parameters(kwargs)
241 | if not right_on:
242 | right_on = [col_name for col_name in df.columns.values.tolist() if col_name in other.columns.values.tolist()]
243 | left_on = right_on
244 | elif not isinstance(right_on, (list, tuple)):
245 | right_on = [right_on]
246 | other_reduced = other[right_on].drop_duplicates()
247 | joined = df.merge(other_reduced, how='left', left_on=left_on,
248 | right_on=right_on, suffixes=('', '_y'),
249 | indicator=True).query('_merge=="left_only"')[df.columns.values.tolist()]
250 | return joined
251 |
252 |
253 | # ------------------------------------------------------------------------------
254 | # Binding
255 | # ------------------------------------------------------------------------------
256 |
257 | @pipe
258 | def bind_rows(df, other, join='outer', ignore_index=False):
259 | """
260 | Binds DataFrames "vertically", stacking them together. This is equivalent
261 | to `pd.concat` with `axis=0`.
262 |
263 | Args:
264 | df (pandas.DataFrame): Top DataFrame (passed in via pipe).
265 | other (pandas.DataFrame): Bottom DataFrame.
266 |
267 | Kwargs:
268 | join (str): One of `"outer"` or `"inner"`. Outer join will preserve
269 | columns not present in both DataFrames, whereas inner joining will
270 | drop them.
271 |         ignore_index (bool): If `True`, discard the original row indices and
272 |             renumber the stacked rows from 0 (defaults to `False`).
273 | """
274 |
275 | df = pd.concat([df, other], join=join, ignore_index=ignore_index, axis=0)
276 | return df
277 |
278 |
279 | @pipe
280 | def bind_cols(df, other, join='outer', ignore_index=False):
281 | """
282 | Binds DataFrames "horizontally". This is equivalent to `pd.concat` with
283 | `axis=1`.
284 |
285 | Args:
286 | df (pandas.DataFrame): Left DataFrame (passed in via pipe).
287 | other (pandas.DataFrame): Right DataFrame.
288 |
289 | Kwargs:
290 | join (str): One of `"outer"` or `"inner"`. Outer join will preserve
291 | rows not present in both DataFrames, whereas inner joining will
292 | drop them.
293 |         ignore_index (bool): If `True`, discard the original column labels and
294 |             renumber the combined columns from 0 (defaults to `False`).
295 | """
296 |
297 | df = pd.concat([df, other], join=join, ignore_index=ignore_index, axis=1)
298 | return df
299 |
--------------------------------------------------------------------------------
/test/test_select.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from dfply import *
4 |
5 | ##==============================================================================
6 | ## select and drop test functions
7 | ##==============================================================================
8 |
9 | # 0 1 2 3 4 5 6 7 8 9
10 | # carat cut color clarity depth table price x y z
11 | # 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
12 |
13 | def test_select():
14 | df = diamonds[['carat','cut','price']]
15 | assert df.equals(diamonds >> select('carat','cut','price'))
16 | assert df.equals(diamonds >> select(0, 1, 6))
17 | assert df.equals(diamonds >> select(0, 1, 'price'))
18 | assert df.equals(diamonds >> select([0, X.cut], X.price))
19 | assert df.equals(diamonds >> select(X.carat, X['cut'], X.price))
20 | assert df.equals(diamonds >> select(X[['carat','cut','price']]))
21 | assert df.equals(diamonds >> select(X[['carat','cut']], X.price))
22 | assert df.equals(diamonds >> select(X.iloc[:,[0,1,6]]))
23 | assert df.equals(diamonds >> select([X.loc[:, ['carat','cut','price']]]))
24 |
25 |
26 | def test_select_inversion():
27 | df = diamonds.iloc[:, 3:]
28 | d = diamonds >> select(~X.carat, ~X.cut, ~X.color)
29 | print(df.head())
30 | print(d.head())
31 | assert df.equals(d)
32 |
33 |
34 | def test_drop():
35 | df = diamonds.drop(['carat','cut','price'], axis=1)
36 | assert df.equals(diamonds >> drop('carat','cut','price'))
37 | assert df.equals(diamonds >> drop(0, 1, 6))
38 | assert df.equals(diamonds >> drop(0, 1, 'price'))
39 | assert df.equals(diamonds >> drop([0, X.cut], X.price))
40 | assert df.equals(diamonds >> drop(X.carat, X['cut'], X.price))
41 | assert df.equals(diamonds >> drop(X[['carat','cut','price']]))
42 | assert df.equals(diamonds >> drop(X[['carat','cut']], X.price))
43 | assert df.equals(diamonds >> drop(X.iloc[:,[0,1,6]]))
44 | assert df.equals(diamonds >> drop([X.loc[:, ['carat','cut','price']]]))
45 |
46 |
47 | def test_select_containing():
48 | df = diamonds[['carat','cut','color','clarity','price']]
49 | assert df.equals(diamonds >> select(contains('c')))
50 |
51 |
52 | def test_drop_containing():
53 | df = diamonds[['depth','table','x','y','z']]
54 | assert df.equals(diamonds >> drop(contains('c')))
55 |
56 |
57 | def test_select_matches():
58 | df = diamonds[['carat','cut','color','clarity','price']]
59 | assert df.equals(diamonds >> select(matches('^c[auol]|pri')))
60 |
61 |
62 | def test_drop_matches():
63 | df = diamonds[['depth','table','x','y','z']]
64 | assert df.equals(diamonds >> drop(matches('^c[auol]|p.i')))
65 |
66 |
67 | def test_select_startswith():
68 | df = diamonds[['carat','cut','color','clarity']]
69 | assert df.equals(diamonds >> select(starts_with('c')))
70 |
71 |
72 | def test_drop_startswith():
73 | df = diamonds[['depth','table','price','x','y','z']]
74 | assert df.equals(diamonds >> drop(starts_with('c')))
75 |
76 |
77 | def test_select_endswith():
78 | df = diamonds[['table','price']]
79 | assert df.equals(diamonds >> select(ends_with('e')))
80 |
81 |
82 | def test_drop_endswith():
83 | df = diamonds.drop('z', axis=1)
84 | assert df.equals(diamonds >> drop(ends_with('z')))
85 |
86 |
87 | def test_select_between():
88 | df = diamonds[['cut','color','clarity']]
89 | assert df.equals(diamonds >> select(columns_between(X.cut, X.clarity)))
90 | assert df.equals(diamonds >> select(columns_between('cut', 'clarity')))
91 | assert df.equals(diamonds >> select(columns_between(1, 3)))
92 |
93 | df = diamonds[['x','y','z']]
94 | assert df.equals(diamonds >> select(columns_between('x', 20)))
95 |
96 |
97 |
98 | def test_drop_between():
99 | df = diamonds[['carat','z']]
100 | assert df.equals(diamonds >> drop(columns_between('cut','y')))
101 | assert df.equals(diamonds >> drop(columns_between(X.cut, 8)))
102 |
103 | df = diamonds[['carat','cut']]
104 | assert df.equals(diamonds >> drop(columns_between(X.color, 20)))
105 |
106 |
107 | def test_select_from():
108 | df = diamonds[['x','y','z']]
109 | assert df.equals(diamonds >> select(columns_from('x')))
110 | assert df.equals(diamonds >> select(columns_from(X.x)))
111 | assert df.equals(diamonds >> select(columns_from(7)))
112 |
113 | assert diamonds[[]].equals(diamonds >> select(columns_from(100)))
114 |
115 |
116 | def test_drop_from():
117 | df = diamonds[['carat','cut']]
118 | assert df.equals(diamonds >> drop(columns_from('color')))
119 | assert df.equals(diamonds >> drop(columns_from(X.color)))
120 | assert df.equals(diamonds >> drop(columns_from(2)))
121 |
122 | #print(diamonds >> drop(columns_from(0)))
123 | assert diamonds[[]].equals(diamonds >> drop(columns_from(0)))
124 |
125 |
126 | def test_select_to():
127 | df = diamonds[['carat','cut']]
128 | assert df.equals(diamonds >> select(columns_to('color')))
129 | assert df.equals(diamonds >> select(columns_to(X.color)))
130 | assert df.equals(diamonds >> select(columns_to(2)))
131 |
132 |
133 | def test_drop_to():
134 | df = diamonds[['x','y','z']]
135 | assert df.equals(diamonds >> drop(columns_to('x')))
136 | assert df.equals(diamonds >> drop(columns_to(X.x)))
137 | assert df.equals(diamonds >> drop(columns_to(7)))
138 |
139 |
140 | def test_select_through():
141 | df = diamonds[['carat','cut','color']]
142 | assert df.equals(diamonds >> select(columns_to('color', inclusive=True)))
143 | assert df.equals(diamonds >> select(columns_to(X.color, inclusive=True)))
144 | assert df.equals(diamonds >> select(columns_to(2, inclusive=True)))
145 |
146 |
147 | def test_drop_through():
148 | df = diamonds[['y','z']]
149 | assert df.equals(diamonds >> drop(columns_to('x', inclusive=True)))
150 | assert df.equals(diamonds >> drop(columns_to(X.x, inclusive=True)))
151 | assert df.equals(diamonds >> drop(columns_to(7, inclusive=True)))
152 |
153 |
154 |
155 | def test_select_if():
156 | # test 1: manually build diamonds subset where columns are numeric and
157 | # mean is greater than 3
158 | cols = list()
159 | for col in diamonds:
160 | try:
161 | if mean(diamonds[col]) > 3:
162 | cols.append(col)
163 | except:
164 | pass
165 | df_if = diamonds[cols]
166 | assert df_if.equals(diamonds >> select_if(lambda col: mean(col) > 3))
167 | # test 2: use and
168 | cols = list()
169 | for col in diamonds:
170 | try:
171 | if mean(diamonds[col]) > 3 and max(diamonds[col]) < 50:
172 | cols.append(col)
173 | except:
174 | pass
175 | df_if = diamonds[cols]
176 | assert df_if.equals(diamonds >> select_if(lambda col: mean(col) > 3 and max(col) < 50))
177 | # test 3: use or
178 | cols = list()
179 | for col in diamonds:
180 | try:
181 | if mean(diamonds[col]) > 3 or max(diamonds[col]) < 6:
182 | cols.append(col)
183 | except:
184 | pass
185 | df_if = diamonds[cols]
186 | assert df_if.equals(diamonds >> select_if(lambda col: mean(col) > 3 or max(col) < 6))
187 | # test 4: string operations - contain a specific string
188 | cols = list()
189 | for col in diamonds:
190 | try:
191 | if any(diamonds[col].str.contains('Ideal')):
192 | cols.append(col)
193 | except:
194 | pass
195 | df_if = diamonds[cols]
196 | assert df_if.equals(diamonds >> select_if(lambda col: any(col.str.contains('Ideal'))))
197 | # test 5: get any text columns
198 | # uses the special '.' regex symbol to find any text value
199 | cols = list()
200 | for col in diamonds:
201 | try:
202 | if any(diamonds[col].str.contains('.')):
203 | cols.append(col)
204 | except:
205 | pass
206 | df_if = diamonds[cols]
207 | assert df_if.equals(diamonds >> select_if(lambda col: any(col.str.contains('.'))))
208 |
209 |
210 | def test_drop_if():
211 | # test 1: returns a dataframe where any column does not have a mean greater than 3
212 | # this means numeric columns with mean less than 3, and also any non-numeric column
213 | # (since it does not have a mean)
214 | cols = list()
215 | for col in diamonds:
216 | try:
217 | if mean(diamonds[col]) > 3:
218 | cols.append(col)
219 | except:
220 | pass
221 | inverse_cols = [col for col in diamonds if col not in cols]
222 | df_if = diamonds[inverse_cols]
223 | assert df_if.equals(diamonds >> drop_if(lambda col: mean(col) > 3))
224 | # test 2: use and
225 | # return colums where both conditions are false:
226 | # the mean greater than 3, and max < 50
227 | # again, this will include non-numeric columns
228 | cols = list()
229 | for col in diamonds:
230 | try:
231 | if mean(diamonds[col]) > 3 and max(diamonds[col]) < 50:
232 | cols.append(col)
233 | except:
234 | pass
235 | inverse_cols = [col for col in diamonds if col not in cols]
236 | df_if = diamonds[inverse_cols]
237 | assert df_if.equals(diamonds >> drop_if(lambda col: mean(col) > 3 and max(col) < 50))
238 | # test 3: use or
239 | # this will return a dataframe where either of the two conditions are false:
240 | # the mean is greater than 3, or the max < 6
241 | cols = list()
242 | for col in diamonds:
243 | try:
244 | if mean(diamonds[col]) > 3 or max(diamonds[col]) < 6:
245 | cols.append(col)
246 | except:
247 | pass
248 | inverse_cols = [col for col in diamonds if col not in cols]
249 | df_if = diamonds[inverse_cols]
250 | assert df_if.equals(diamonds >> drop_if(lambda col: mean(col) > 3 or max(col) < 6))
251 | # test 4: string operations - contain a specific string
252 | # this will drop any columns if they contain the word 'Ideal'
253 | cols = list()
254 | for col in diamonds:
255 | try:
256 | if any(diamonds[col].str.contains('Ideal')):
257 | cols.append(col)
258 | except:
259 | pass
260 | inverse_cols = [col for col in diamonds if col not in cols]
261 | df_if = diamonds[inverse_cols]
262 | assert df_if.equals(diamonds >> drop_if(lambda col: any(col.str.contains('Ideal'))))
263 | # test 5: drop any text columns
264 | # uses the special '.' regex symbol to find any text value
265 | cols = list()
266 | for col in diamonds:
267 | try:
268 | if any(diamonds[col].str.contains('.')):
269 | cols.append(col)
270 | except:
271 | pass
272 | inverse_cols = [col for col in diamonds if col not in cols]
273 | df_if = diamonds[inverse_cols]
274 | assert df_if.equals(diamonds >> drop_if(lambda col: any(col.str.contains('.'))))
275 |
--------------------------------------------------------------------------------
/dfply/base.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import warnings
4 | from functools import partial, wraps
5 |
6 |
7 | def _recursive_apply(f, l):
8 | if isinstance(l, (list, tuple)):
9 | out = [_recursive_apply(f, l_) for l_ in l]
10 | if isinstance(l, tuple):
11 | out = tuple(out)
12 | return out
13 | else:
14 | return f(l)
15 |
16 |
17 | def contextualize(arg, context):
18 | if isinstance(arg, Intention):
19 | arg = arg.evaluate(context)
20 | return arg
21 |
22 |
23 | def flatten(l):
24 | for el in l:
25 | if isinstance(el, (tuple, list)):
26 | yield from flatten(el)
27 | else:
28 | yield el
29 |
30 |
31 | def _check_delayed_eval(args, kwargs):
32 | check = lambda x: isinstance(x, Intention)
33 | delay = any([a for a in flatten(_recursive_apply(check, args))])
34 | delay = delay or any([v for v in flatten(_recursive_apply(check, list(kwargs.values())))])
35 | return delay
36 |
37 |
38 | def _context_args(args):
39 | return lambda x: _recursive_apply(partial(contextualize, context=x), args)
40 |
41 |
42 | def _context_kwargs(kwargs):
43 | values_ = lambda x: _recursive_apply(partial(contextualize, context=x),
44 | list(kwargs.values()))
45 | return lambda x: {k: v for k, v in zip(kwargs.keys(), values_(x))}
46 |
47 |
48 | def _delayed_function(function, args, kwargs):
49 | return lambda x: function(*_context_args(args)(x),
50 | **_context_kwargs(kwargs)(x))
51 |
52 |
53 | def make_symbolic(f):
54 | def wrapper(*args, **kwargs):
55 | delay = _check_delayed_eval(args, kwargs)
56 | if delay:
57 | delayed = _delayed_function(f, args, kwargs)
58 | return Intention(delayed)
59 | else:
60 | return f(*args, **kwargs)
61 |
62 | return wrapper
63 |
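    | # A minimal sketch of what make_symbolic enables (function name hypothetical):
    | #
    | #   @make_symbolic
    | #   def str_len(series):
    | #       return series.str.len()
    | #
    | #   str_len(X.cut)        # any Intention argument -> returns a new Intention
    | #   str_len(df['cut'])    # concrete arguments -> evaluates immediately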
64 |
65 | class Intention(object):
66 | def __init__(self, function=lambda x: x, invert=False):
67 | self.function = function
68 | self.inverted = invert
69 |
70 | def evaluate(self, context):
71 | return self.function(context)
72 |
73 | def __getattr__(self, attribute):
74 | return Intention(lambda x: getattr(self.function(x), attribute),
75 | invert=self.inverted)
76 |
77 | def __invert__(self):
78 | return Intention(self.function, invert=not self.inverted)
79 |
80 | def __call__(self, *args, **kwargs):
81 | return Intention(lambda x: self.function(x)(*_context_args(args)(x),
82 | **_context_kwargs(kwargs)(x)),
83 | invert=self.inverted)
84 |
85 |
86 | _magic_method_names = [
87 | '__abs__', '__add__', '__and__', '__cmp__', '__complex__', '__contains__',
88 | '__delattr__', '__delete__', '__delitem__', '__delslice__', '__div__',
89 | '__divmod__', '__enter__', '__eq__', '__exit__', '__float__',
90 | '__floordiv__', '__ge__', '__get__', '__getitem__', '__getslice__',
91 | '__gt__', '__hash__', '__hex__', '__iadd__', '__iand__', '__idiv__',
92 | '__ifloordiv__', '__ilshift__', '__imod__', '__imul__', '__index__',
93 | '__int__', '__ior__', '__ipow__', '__irshift__', '__isub__',
94 | '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__long__',
95 | '__lshift__', '__lt__', '__mod__', '__mul__', '__ne__', '__neg__',
96 | '__nonzero__', '__oct__', '__or__', '__pos__', '__pow__', '__radd__',
97 | '__rand__', '__rcmp__', '__rdiv__', '__rdivmod__', # '__repr__',
98 | '__reversed__', '__rfloordiv__', '__rlshift__', '__rmod__', '__rmul__',
99 | '__ror__', '__rpow__', '__rrshift__', '__rshift__', '__rsub__',
100 | '__rtruediv__', '__rxor__', '__set__', '__setitem__', '__setslice__',
101 | '__sub__', '__truediv__', '__unicode__', '__xor__', '__str__',
102 | ]
103 |
104 |
105 | def _set_magic_method(name):
106 | def magic_method(self, *args, **kwargs):
107 | return Intention(lambda x: getattr(self.function(x), name)(*_context_args(args)(x),
108 | **_context_kwargs(kwargs)(x)),
109 | invert=self.inverted)
110 |
111 | return magic_method
112 |
113 |
114 | for name in _magic_method_names:
115 | setattr(Intention, name, _set_magic_method(name))
116 |
117 | # Initialize the global X symbol
118 | X = Intention()
119 |
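    | # X defers attribute access and operators: (X.price * 2) builds an Intention
    | # whose .evaluate(df) returns df.price * 2.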
120 |
121 | class pipe(object):
122 | __name__ = "pipe"
123 |
124 | def __init__(self, function):
125 | self.function = function
126 | self.__doc__ = function.__doc__
127 |
128 | self.chained_pipes = []
129 |
130 | def __rshift__(self, other):
131 | assert isinstance(other, pipe)
132 | self.chained_pipes.append(other)
133 | return self
134 |
135 | def __rrshift__(self, other):
136 | other_copy = other.copy()
137 |
138 | with warnings.catch_warnings():
139 | warnings.simplefilter("ignore")
140 | other_copy._grouped_by = getattr(other, '_grouped_by', None)
141 |
142 | result = self.function(other_copy)
143 |
144 | for p in self.chained_pipes:
145 | result = p.__rrshift__(result)
146 | return result
147 |
148 | def __call__(self, *args, **kwargs):
149 | return pipe(lambda x: self.function(x, *args, **kwargs))
150 |
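    | # In `df >> head(5)`, the DataFrame does not handle `>>` against a pipe object,
    | # so Python falls back to pipe.__rrshift__: the DataFrame is copied, _grouped_by
    | # is carried over, and the wrapped function runs on the copy. pipe.__rshift__
    | # only fires when two pipes are composed directly, e.g.
    | # `saved = head(5) >> select(X.cut)`.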
151 |
152 | class IntentionEvaluator(object):
153 | """
154 | Parent class for symbolic argument decorators.
155 |     Default behavior is to recursively evaluate the arguments and keyword
156 |     arguments of a decorated function, resolving any `Intention` objects
157 |     against the pandas DataFrame as it comes down the pipe.
158 | """
159 |
160 | __name__ = "IntentionEvaluator"
161 |
162 | def __init__(self, function, eval_symbols=True, eval_as_label=[],
163 | eval_as_selector=[]):
164 | super(IntentionEvaluator, self).__init__()
165 | self.function = function
166 | self.__doc__ = function.__doc__
167 |
168 | self.eval_symbols = eval_symbols
169 | self.eval_as_label = eval_as_label
170 | self.eval_as_selector = eval_as_selector
171 |
172 | def _evaluate(self, df, arg):
173 | if isinstance(arg, Intention):
174 | negate = arg.inverted
175 | arg = arg.evaluate(df)
176 | if negate:
177 | arg = ~arg
178 | return arg
179 |
180 | def _evaluate_label(self, df, arg):
181 | arg = self._evaluate(df, arg)
182 |
183 | cols = list(df.columns)
184 | if isinstance(arg, pd.Series):
185 | arg = arg.name
186 | if isinstance(arg, pd.Index):
187 | arg = list(arg)
188 | if isinstance(arg, int):
189 | arg = cols[arg]
190 | return arg
191 |
192 | def _evaluate_selector(self, df, arg):
193 | negate = False
194 | if isinstance(arg, Intention):
195 | negate = arg.inverted
196 | arg = arg.evaluate(df)
197 |
198 | cols = list(df.columns)
199 | if isinstance(arg, pd.Series):
200 | arg = [cols.index(arg.name)]
201 | if isinstance(arg, pd.Index):
202 | arg = [cols.index(i) for i in list(arg)]
203 | if isinstance(arg, pd.DataFrame):
204 | arg = [cols.index(i) for i in arg.columns]
205 | if isinstance(arg, int):
206 | arg = [arg]
207 | if isinstance(arg, str):
208 | arg = [cols.index(arg)]
209 | if isinstance(arg, (list, tuple)):
210 | arg = [cols.index(i) if isinstance(i, str) else i for i in arg]
211 |
212 | selection_vector = np.zeros(df.shape[1])
213 | col_idx = np.array(arg)
214 |
215 | if negate and len(col_idx) > 0:
216 | selection_vector[col_idx] = -1
217 | elif len(col_idx) > 0:
218 | selection_vector[col_idx] = 1
219 | return selection_vector
220 |
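    |     # _evaluate_selector encodes a column selection as a vector over df.columns:
    |     # 1 marks columns explicitly selected, -1 columns explicitly dropped
    |     # (inverted Intentions), and 0 columns left untouched.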
221 | def _evaluator_loop(self, df, arg, eval_func):
222 | if isinstance(arg, (list, tuple)):
223 | return [self._evaluator_loop(df, a_, eval_func) for a_ in arg]
224 | else:
225 | return eval_func(df, arg)
226 |
227 | def _symbolic_eval(self, df, arg):
228 | return self._evaluator_loop(df, arg, self._evaluate)
229 |
230 | def _symbolic_to_label(self, df, arg):
231 | return self._evaluator_loop(df, arg, self._evaluate_label)
232 |
233 | def _symbolic_to_selector(self, df, arg):
234 | return self._evaluator_loop(df, arg, self._evaluate_selector)
235 |
236 | def _recursive_arg_eval(self, df, args):
237 | eval_symbols = self._find_eval_args(self.eval_symbols, args)
238 | eval_as_label = self._find_eval_args(self.eval_as_label, args)
239 | eval_as_selector = self._find_eval_args(self.eval_as_selector, args)
240 |
241 | return [
242 | self._symbolic_to_label(df, a) if i in eval_as_label
243 | else self._symbolic_to_selector(df, a) if i in eval_as_selector
244 | else self._symbolic_eval(df, a) if i in eval_symbols
245 | else a
246 | for i, a in enumerate(args)
247 | ]
248 |
249 | def _recursive_kwarg_eval(self, df, kwargs):
250 | eval_symbols = self._find_eval_kwargs(self.eval_symbols, kwargs)
251 | eval_as_label = self._find_eval_kwargs(self.eval_as_label, kwargs)
252 | eval_as_selector = self._find_eval_kwargs(self.eval_as_selector, kwargs)
253 |
254 | return {
255 | k: (self._symbolic_to_label(df, v) if k in eval_as_label
256 | else self._symbolic_to_selector(df, v) if k in eval_as_selector
257 | else self._symbolic_eval(df, v) if k in eval_symbols
258 | else v)
259 | for k, v in kwargs.items()
260 | }
261 |
262 | def _find_eval_args(self, request, args):
263 | if (request == True) or ('*' in request):
264 | return [i for i in range(len(args))]
265 | elif request in [None, False]:
266 | return []
267 | return request
268 |
269 | def _find_eval_kwargs(self, request, kwargs):
270 | if (request == True) or ('**' in request):
271 | return [k for k in kwargs.keys()]
272 | elif request in [None, False]:
273 | return []
274 | return request
275 |
276 | def __call__(self, *args, **kwargs):
277 | df = args[0]
278 |
279 | args = self._recursive_arg_eval(df, args[1:])
280 | kwargs = self._recursive_kwarg_eval(df, kwargs)
281 |
282 | return self.function(df, *args, **kwargs)
283 |
284 |
285 | def symbolic_evaluation(function=None, eval_symbols=True, eval_as_label=[],
286 | eval_as_selector=[]):
287 | if function:
288 | return IntentionEvaluator(function)
289 | else:
290 |         # `function` is None on this branch; `wrapper` receives the target function when applied.
291 | def wrapper(function):
292 | return IntentionEvaluator(function, eval_symbols=eval_symbols,
293 | eval_as_label=eval_as_label,
294 | eval_as_selector=eval_as_selector)
295 |
296 | return wrapper
297 |
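# Illustrative sketch (not part of the original source) of the two ways the
# decorator above can be applied; `my_verb` is a hypothetical function:
#
#     @symbolic_evaluation
#     def my_verb(df, series):          # X-expressions are evaluated to Series
#         ...
#
#     @symbolic_evaluation(eval_as_label=[0])
#     def my_verb(df, col):             # argument 0 is resolved to a column label
#         ...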
298 |
299 | class group_delegation(object):
300 | __name__ = "group_delegation"
301 |
302 | def __init__(self, function):
303 | self.function = function
304 | self.__doc__ = function.__doc__
305 |
306 | def _apply(self, df, *args, **kwargs):
307 | grouped = df.groupby(df._grouped_by)
308 |
309 | dff = grouped.apply(self.function, *args, **kwargs)
310 | # Save all the metadata attributes back into the new data frame
311 | for field in df._metadata:
312 | setattr(dff, field, getattr(df, field))
313 | df = dff
314 |
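        # groupby.apply prepends the group keys as extra index levels; peel
        # them off one level at a time, dropping the level when a column of
        # the same name already exists in the result and restoring it as a
        # column otherwise.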
315 | for name in df.index.names[:-1]:
316 | if name in df:
317 | df.reset_index(level=0, drop=True, inplace=True)
318 | else:
319 | df.reset_index(level=0, inplace=True)
320 |
321 | if (df.index == 0).all():
322 | df.reset_index(drop=True, inplace=True)
323 |
324 | return df
325 |
326 | def __call__(self, *args, **kwargs):
327 | grouped_by = getattr(args[0], '_grouped_by', None)
328 | if (grouped_by is None) or not all([g in args[0].columns for g in grouped_by]):
329 | return self.function(*args, **kwargs)
330 | else:
331 | applied = self._apply(args[0], *args[1:], **kwargs)
332 |
333 | with warnings.catch_warnings():
334 | warnings.simplefilter("ignore")
335 | applied._grouped_by = grouped_by
336 |
337 | return applied
338 |
339 |
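# `dfpipe` stacks the three decorators most dfply verbs need: innermost,
# `symbolic_evaluation` resolves X-expressions against the incoming DataFrame;
# `group_delegation` re-applies the function per group when a `group_by` is
# active; and `pipe` makes the result usable with the `>>` operator.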
340 | def dfpipe(f):
341 | return pipe(
342 | group_delegation(
343 | symbolic_evaluation(f)
344 | )
345 | )
346 |
--------------------------------------------------------------------------------
/dfply/reshape.py:
--------------------------------------------------------------------------------
1 | from .base import *
2 | import re
3 |
4 |
5 | # ------------------------------------------------------------------------------
6 | # Sorting
7 | # ------------------------------------------------------------------------------
8 |
9 | @dfpipe
10 | def arrange(df, *args, **kwargs):
11 | """Calls `pandas.DataFrame.sort_values` to sort a DataFrame according to
12 | criteria.
13 |
14 |     See
15 |     http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html
16 |     for the keyword arguments accepted by `sort_values` (`arrange` passes
17 |     its keyword arguments through unchanged).
19 |
20 | Args:
21 | *args: Symbolic, string, integer or lists of those types indicating
22 | columns to sort the DataFrame by.
23 |
24 | Kwargs:
25 | **kwargs: Any keyword arguments will be passed through to the pandas
26 | `DataFrame.sort_values` function.
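
    Example (illustrative; assumes the `diamonds` sample data):
        diamonds >> arrange(X.cut, desc(X.price)) >> head(5)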
27 | """
28 |
29 | flat_args = [a for a in flatten(args)]
30 |
31 | series = [df[arg] if isinstance(arg, str) else
32 | df.iloc[:, arg] if isinstance(arg, int) else
33 | pd.Series(arg) for arg in flat_args]
34 |
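    # Concatenate the sort keys into a throwaway frame with a fresh 0..n-1
    # index, sort that frame, and use the resulting positional order to
    # reorder the original rows.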
35 | sorter = pd.concat(series, axis=1).reset_index(drop=True)
36 | sorter = sorter.sort_values(sorter.columns.tolist(), **kwargs)
37 | return df.iloc[sorter.index, :]
38 |
39 |
40 | # ------------------------------------------------------------------------------
41 | # Renaming
42 | # ------------------------------------------------------------------------------
43 |
44 | @pipe
45 | @symbolic_evaluation(eval_as_label=True)
46 | def rename(df, **kwargs):
47 | """Renames columns, where keyword argument values are the current names
48 | of columns and keys are the new names.
49 |
50 | Args:
51 | df (:obj:`pandas.DataFrame`): DataFrame passed in via `>>` pipe.
52 |
53 | Kwargs:
54 | **kwargs: key:value pairs where keys are new names for columns and
55 | values are current names of columns.
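
    Example (illustrative; assumes the `diamonds` sample data):
        diamonds >> rename(CUT=X.cut, COLOR='color') >> head(2)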
56 | """
57 |
58 | return df.rename(columns={v: k for k, v in kwargs.items()})
59 |
60 |
61 | # ------------------------------------------------------------------------------
62 | # Elongate
63 | # ------------------------------------------------------------------------------
64 |
65 | @pipe
66 | @symbolic_evaluation(eval_as_label=['*'])
67 | def gather(df, key, values, *args, **kwargs):
68 | """
69 | Melts the specified columns in your DataFrame into two key:value columns.
70 |
71 | Args:
72 | key (str): Name of identifier column.
73 | values (str): Name of column that will contain values for the key.
74 | *args (str, int, symbolic): Columns to "melt" into the new key and
75 | value columns. If no args are specified, all columns are melted
76 |             into the key and value columns.
77 |
78 | Kwargs:
79 | add_id (bool): Boolean value indicating whether to add a `"_ID"`
80 | column that will preserve information about the original rows
81 | (useful for being able to re-widen the data later).
82 |
83 | Example:
84 | diamonds >> gather('variable', 'value', ['price', 'depth','x','y','z']) >> head(5)
85 |
86 | carat cut color clarity table variable value
87 | 0 0.23 Ideal E SI2 55.0 price 326.0
88 | 1 0.21 Premium E SI1 61.0 price 326.0
89 | 2 0.23 Good E VS1 65.0 price 327.0
90 | 3 0.29 Premium I VS2 58.0 price 334.0
91 | 4 0.31 Good J SI2 58.0 price 335.0
92 | """
93 |
94 | if len(args) == 0:
95 | args = df.columns.tolist()
96 | else:
97 | args = [a for a in flatten(args)]
98 |
99 | if kwargs.get('add_id', False):
100 | df = df.assign(_ID=np.arange(df.shape[0]))
101 |
102 | columns = df.columns.tolist()
103 | id_vars = [col for col in columns if col not in args]
104 | return pd.melt(df, id_vars, list(args), key, values)
105 |
106 |
107 | # ------------------------------------------------------------------------------
108 | # Widen
109 | # ------------------------------------------------------------------------------
110 |
111 | def convert_type(df, columns):
112 | """
113 | Helper function that attempts to convert columns into their appropriate
114 | data type.
115 | """
116 | # taken in part from the dplython package
117 | out_df = df.copy()
118 | for col in columns:
119 | column_values = pd.Series(out_df[col].unique())
120 | column_values = column_values[~column_values.isnull()]
121 | # empty
122 | if len(column_values) == 0:
123 | continue
124 | # boolean
125 |         if set(column_values.values) <= {'True', 'False'}:
126 | out_df[col] = out_df[col].map({'True': True, 'False': False})
127 | continue
128 | # numeric
129 | if pd.to_numeric(column_values, errors='coerce').isnull().sum() == 0:
130 | out_df[col] = pd.to_numeric(out_df[col], errors='ignore')
131 | continue
132 | # datetime
133 | if pd.to_datetime(column_values, errors='coerce').isnull().sum() == 0:
134 | out_df[col] = pd.to_datetime(out_df[col], errors='ignore',
135 | infer_datetime_format=True)
136 | continue
137 |
138 | return out_df
139 |
140 |
141 | @pipe
142 | @symbolic_evaluation(eval_as_label=['*'])
143 | def spread(df, key, values, convert=False):
144 | """
145 | Transforms a "long" DataFrame into a "wide" format using a key and value
146 | column.
147 |
148 | If you have a mixed datatype column in your long-format DataFrame then the
149 | default behavior is for the spread columns to be of type `object`, or
150 | string. If you want to try to convert dtypes when spreading, you can set
151 | the convert keyword argument in spread to True.
152 |
153 | Args:
154 | key (str, int, or symbolic): Label for the key column.
155 | values (str, int, or symbolic): Label for the values column.
156 |
157 | Kwargs:
158 | convert (bool): Boolean indicating whether or not to try and convert
159 | the spread columns to more appropriate data types.
160 |
161 |
162 | Example:
163 | widened = elongated >> spread(X.variable, X.value)
164 | widened >> head(5)
165 |
166 | _ID carat clarity color cut depth price table x y z
167 | 0 0 0.23 SI2 E Ideal 61.5 326 55 3.95 3.98 2.43
168 | 1 1 0.21 SI1 E Premium 59.8 326 61 3.89 3.84 2.31
169 | 2 10 0.3 SI1 J Good 64 339 55 4.25 4.28 2.73
170 | 3 100 0.75 SI1 D Very Good 63.2 2760 56 5.8 5.75 3.65
171 | 4 1000 0.75 SI1 D Ideal 62.3 2898 55 5.83 5.8 3.62
172 | """
173 |
174 | # Taken mostly from dplython package
175 | columns = df.columns.tolist()
176 | id_cols = [col for col in columns if not col in [key, values]]
177 |
178 |     temp_index = pd.Series('', index=df.index)  # Series so `+=` below concatenates element-wise
179 | for id_col in id_cols:
180 | temp_index += df[id_col].map(str)
181 |
182 | out_df = df.assign(temp_index=temp_index)
183 | out_df = out_df.set_index('temp_index')
184 | spread_data = out_df[[key, values]]
185 |
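    # Every (row-identifier, key) pair must be unique; otherwise the pivot
    # below would have to collapse multiple values into a single cell.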
186 | if not all(spread_data.groupby([spread_data.index, key]).agg(
187 | 'count').reset_index()[values] < 2):
188 | raise ValueError('Duplicate identifiers')
189 |
190 | spread_data = spread_data.pivot(columns=key, values=values)
191 |
192 | if convert and (out_df[values].dtype.kind in 'OSaU'):
193 | columns_to_convert = [col for col in spread_data if col not in columns]
194 | spread_data = convert_type(spread_data, columns_to_convert)
195 |
196 | out_df = out_df[id_cols].drop_duplicates()
197 | out_df = out_df.merge(spread_data, left_index=True, right_index=True).reset_index(drop=True)
198 |
199 | out_df = (out_df >> arrange(id_cols)).reset_index(drop=True)
200 |
201 | return out_df
202 |
203 |
204 | # ------------------------------------------------------------------------------
205 | # Separate columns
206 | # ------------------------------------------------------------------------------
207 |
208 | @pipe
209 | @symbolic_evaluation(eval_as_label=['*'])
210 | def separate(df, column, into, sep=r"[\W_]+", remove=True, convert=False,
211 | extra='drop', fill='right'):
212 | """
213 | Splits columns into multiple columns.
214 |
215 | Args:
216 | df (pandas.DataFrame): DataFrame passed in through the pipe.
217 | column (str, symbolic): Label of column to split.
218 | into (list): List of string names for new columns.
219 |
220 | Kwargs:
221 | sep (str or list): If a string, the regex string used to split the
222 | column. If a list, a list of integer positions to split strings
223 | on.
224 | remove (bool): Boolean indicating whether to remove the original column.
225 | convert (bool): Boolean indicating whether the new columns should be
226 | converted to the appropriate type.
227 | extra (str): either `'drop'`, where split pieces beyond the specified
228 | new columns are dropped, or `'merge'`, where the final split piece
229 | contains the remainder of the original column.
230 | fill (str): either `'right'`, where `np.nan` values are filled in the
231 | right-most columns for missing pieces, or `'left'` where `np.nan`
232 | values are filled in the left-most columns.
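
    Example (illustrative, using a hypothetical DataFrame `d`):
        d = pd.DataFrame({'a': ['1-a', '1-b-2', '1-c-3-4']})
        d >> separate(X.a, ['col1', 'col2'], sep='-', extra='merge')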
233 | """
234 |
235 | assert isinstance(into, (tuple, list))
236 |
237 | if isinstance(sep, (tuple, list)):
238 | inds = [0] + list(sep)
239 | if len(inds) > len(into):
240 | if extra == 'drop':
241 | inds = inds[:len(into) + 1]
242 | elif extra == 'merge':
243 | inds = inds[:len(into)] + [None]
244 | else:
245 | inds = inds + [None]
246 |
247 | splits = df[column].map(lambda x: [str(x)[slice(inds[i], inds[i + 1])]
248 | if i < len(inds) - 1 else np.nan
249 | for i in range(len(into))])
250 |
251 | else:
252 | maxsplit = len(into) - 1 if extra == 'merge' else 0
253 | splits = df[column].map(lambda x: re.split(sep, x, maxsplit))
254 |
255 | right_filler = lambda x: x + [np.nan for i in range(len(into) - len(x))]
256 | left_filler = lambda x: [np.nan for i in range(len(into) - len(x))] + x
257 |
258 | if fill == 'right':
259 | splits = [right_filler(x) for x in splits]
260 | elif fill == 'left':
261 | splits = [left_filler(x) for x in splits]
262 |
263 | for i, split_col in enumerate(into):
264 | df[split_col] = [x[i] if not x[i] == '' else np.nan for x in splits]
265 |
266 | if convert:
267 | df = convert_type(df, into)
268 |
269 | if remove:
270 | df.drop(column, axis=1, inplace=True)
271 |
272 | return df
273 |
274 |
275 | # ------------------------------------------------------------------------------
276 | # Unite columns
277 | # ------------------------------------------------------------------------------
278 |
279 | @pipe
280 | @symbolic_evaluation(eval_as_label=['*'])
281 | def unite(df, colname, *args, **kwargs):
282 | """
283 | Does the inverse of `separate`, joining columns together by a specified
284 | separator.
285 |
286 | Any columns that are not strings will be converted to strings.
287 |
288 | Args:
289 | df (pandas.DataFrame): DataFrame passed in through the pipe.
290 | colname (str): the name of the new joined column.
291 | *args: list of columns to be joined, which can be strings, symbolic, or
292 | integer positions.
293 |
294 | Kwargs:
295 | sep (str): the string separator to join the columns with.
296 | remove (bool): Boolean indicating whether or not to remove the
297 | original columns.
298 |         na_action (str): can be one of `'maintain'` (the default),
299 |             `'ignore'`, or `'as_string'`. The default will make the new column
300 |             row a `NaN` value if any of the original column cells at that
301 |             row contained `NaN`. `'ignore'` will treat any `NaN` value as an
302 |             empty string during joining. `'as_string'` will convert any `NaN`
303 |             value to the string `'nan'` prior to joining.
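
    Example (illustrative, using hypothetical columns `first` and `last`
    of a DataFrame `d`):
        d >> unite('fullname', X.first, X.last, sep=' ', remove=False)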
304 | """
305 |
306 |     to_unite = list(flatten(args))
307 | sep = kwargs.get('sep', '_')
308 | remove = kwargs.get('remove', True)
309 | # possible na_action values
310 | # ignore: empty string
311 | # maintain: keep as np.nan (default)
312 | # as_string: becomes string 'nan'
313 | na_action = kwargs.get('na_action', 'maintain')
314 |
316 |
317 | if na_action == 'maintain':
318 | df[colname] = df[to_unite].apply(lambda x: np.nan if any(x.isnull())
319 | else sep.join(x.map(str)), axis=1)
320 | elif na_action == 'ignore':
321 | df[colname] = df[to_unite].apply(lambda x: sep.join(x[~x.isnull()].map(str)),
322 | axis=1)
323 | elif na_action == 'as_string':
324 | df[colname] = df[to_unite].astype(str).apply(lambda x: sep.join(x), axis=1)
325 |
326 | if remove:
327 | df.drop(to_unite, axis=1, inplace=True)
328 |
329 | return df
330 |
--------------------------------------------------------------------------------
/test/test_summary_functions.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from dfply import *
3 |
4 |
5 | ##==============================================================================
6 | ## transform summary functions
7 | ##==============================================================================
8 |
9 | def test_mean():
10 | df = diamonds >> select(X.cut, X.x) >> head(5)
11 | # straight summarize
12 | t = df >> summarize(m=mean(X.x))
13 | df_truth = pd.DataFrame({'m': [4.086]})
14 | assert t.equals(df_truth)
15 | # grouped summarize
16 | t = df >> group_by(X.cut) >> summarize(m=mean(X.x))
17 | df_truth = pd.DataFrame({'cut': ['Good', 'Ideal', 'Premium'],
18 | 'm': [4.195, 3.950, 4.045]})
19 | assert t.equals(df_truth)
20 | # straight mutate
21 | t = df >> mutate(m=mean(X.x))
22 | df_truth = df.copy()
23 | df_truth['m'] = df_truth.x.mean()
24 | assert t.equals(df_truth)
25 | # grouped mutate
26 | t = df >> group_by(X.cut) >> mutate(m=mean(X.x))
27 | df_truth['m'] = pd.Series([3.950, 4.045, 4.195, 4.045, 4.195])
28 | assert t.sort_index().equals(df_truth)
29 |
30 |
31 | def test_first():
32 | df = diamonds >> select(X.cut, X.x) >> head(5)
33 | # straight summarize
34 | t = df >> summarize(f=first(X.x))
35 | df_truth = pd.DataFrame({'f': [3.95]})
36 | assert t.equals(df_truth)
37 | # grouped summarize
38 | t = df >> group_by(X.cut) >> summarize(f=first(X.x))
39 | df_truth = pd.DataFrame({'cut': ['Good', 'Ideal', 'Premium'],
40 | 'f': [4.05, 3.95, 3.89]})
41 | assert t.equals(df_truth)
42 |     # summarize with order_by
43 |     t = df >> summarize(f=first(X.x, order_by=[desc(X.cut), desc(X.x)]))
44 |     assert t.equals(pd.DataFrame({'f': [4.20]}))
45 | # straight mutate
46 | t = df >> mutate(f=first(X.x))
47 | df_truth = df.copy()
48 | df_truth['f'] = df_truth.x.iloc[0]
49 | assert t.equals(df_truth)
50 | # grouped mutate
51 | t = df >> group_by(X.cut) >> mutate(f=first(X.x))
52 | df_truth['f'] = pd.Series([3.95, 3.89, 4.05, 3.89, 4.05])
53 | assert t.sort_index().equals(df_truth)
54 |
55 |
56 | def test_last():
57 | df = diamonds >> select(X.cut, X.x) >> head(5)
58 | # straight summarize
59 | t = df >> summarize(l=last(X.x))
60 | df_truth = pd.DataFrame({'l': [4.34]})
61 | assert t.equals(df_truth)
62 | # grouped summarize
63 | t = df >> group_by(X.cut) >> summarize(l=last(X.x))
64 | df_truth = pd.DataFrame({'cut': ['Good', 'Ideal', 'Premium'],
65 | 'l': [4.34, 3.95, 4.20]})
66 | assert t.equals(df_truth)
67 | # summarize with order_by
68 | #t = df >> summarize(f=last(X.x, order_by=desc(X.cut)))
69 | t = df >> summarize(f=last(X.x, order_by=[desc(X.cut), desc(X.x)]))
70 | df_truth = pd.DataFrame({'f':[4.05]})
71 | assert df_truth.equals(t)
72 | # straight mutate
73 | t = df >> mutate(l=last(X.x))
74 | df_truth = df.copy()
75 | df_truth['l'] = df_truth.x.iloc[4]
76 | assert t.equals(df_truth)
77 | # grouped mutate
78 | t = df >> group_by(X.cut) >> mutate(l=last(X.x))
79 | df_truth['l'] = pd.Series([3.95, 4.20, 4.34, 4.20, 4.34])
80 | assert t.sort_index().equals(df_truth)
81 |
82 |
83 | def test_nth():
84 | df = diamonds >> select(X.cut, X.x) >> head(10)
85 | # straight summarize
86 | t = df >> summarize(second=nth(X.x, 1))
87 | df_truth = pd.DataFrame({'second': [3.89]})
88 | assert t.equals(df_truth)
89 | # grouped summarize
90 | t = df >> group_by(X.cut) >> summarize(first=nth(X.x, 0))
91 | df_truth = pd.DataFrame({'cut': ['Fair','Good', 'Ideal', 'Premium','Very Good'],
92 | 'first': [3.87,4.05,3.95,3.89,3.94]})
93 | assert t.equals(df_truth)
94 | # summarize with order_by
95 | t = df >> summarize(last=nth(X.x, -1, order_by=[desc(X.cut), desc(X.x)]))
97 |     df_truth = pd.DataFrame({'last':[3.87]})
100 | assert df_truth.equals(t)
101 | # straight mutate
102 | t = df >> mutate(out_of_range=nth(X.x, 500))
103 | df_truth = df.copy()
104 | df_truth['out_of_range'] = np.nan
105 | assert t.equals(df_truth)
106 | # grouped mutate
107 | t = df >> group_by(X.cut) >> mutate(penultimate=nth(X.x, -2))
108 | df_truth = df.copy()
109 | df_truth['penultimate'] = pd.Series([np.nan,3.89,4.05,3.89,4.05,4.07,
110 | 4.07,4.07,np.nan,4.07])
113 | assert t.sort_index().equals(df_truth)
114 |
115 |
116 | def test_n():
117 | df = diamonds >> select(X.cut, X.x) >> head(5)
118 | # straight summarize
119 | t = df >> summarize(n=n(X.x))
120 | df_truth = pd.DataFrame({'n': [5]})
121 | assert t.equals(df_truth)
122 | # grouped summarize
123 | t = df >> group_by(X.cut) >> summarize(n=n(X.x))
124 | df_truth = pd.DataFrame({'cut': ['Good', 'Ideal', 'Premium'],
125 | 'n': [2, 1, 2]})
126 | assert t.equals(df_truth)
127 | # straight mutate
128 | t = df >> mutate(n=n(X.x))
129 | df_truth = df.copy()
130 | df_truth['n'] = 5
131 | assert t.equals(df_truth)
132 | # grouped mutate
133 | t = df >> group_by(X.cut) >> mutate(n=n(X.x))
134 | df_truth['n'] = pd.Series([1, 2, 2, 2, 2, 2])
137 | assert t.sort_index().equals(df_truth)
138 |
139 |
140 | def test_n_distinct():
141 | df = pd.DataFrame({'col_1': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c'],
142 | 'col_2': [1, 1, 1, 2, 3, 3, 4, 5]})
143 | # straight summarize
144 | t = df >> summarize(n=n_distinct(X.col_2))
145 | df_truth = pd.DataFrame({'n': [5]})
146 | assert t.equals(df_truth)
147 | # grouped summarize
148 | t = df >> group_by(X.col_1) >> summarize(n=n_distinct(X.col_2))
149 | df_truth = pd.DataFrame({'col_1': ['a', 'b', 'c'],
150 | 'n': [1, 2, 2]})
151 | assert t.equals(df_truth)
152 | # straight mutate
153 | t = df >> mutate(n=n_distinct(X.col_2))
154 | df_truth = df.copy()
155 | df_truth['n'] = 5
156 | assert t.equals(df_truth)
157 | # grouped mutate
158 | t = df >> group_by(X.col_1) >> mutate(n=n_distinct(X.col_2))
159 | df_truth['n'] = pd.Series([1, 1, 1, 2, 2, 2, 2, 2])
160 | assert t.equals(df_truth)
161 |
162 |
163 | def test_IQR():
164 | df = diamonds >> select(X.cut, X.x) >> head(5)
165 | # straight summarize
166 | t = df >> summarize(i=IQR(X.x))
167 | df_truth = pd.DataFrame({'i': [.25]})
168 | assert t.equals(df_truth)
169 | # grouped summarize
170 | t = df >> group_by(X.cut) >> summarize(i=IQR(X.x))
171 | df_truth = pd.DataFrame({'cut': ['Good', 'Ideal', 'Premium'],
172 | 'i': [0.145, 0.000, 0.155]})
173 | test_vector = abs(t.i - df_truth.i)
174 | assert all(test_vector < 0.000000001)
175 | # straight mutate
176 | t = df >> mutate(i=IQR(X.x))
177 | df_truth = df.copy()
178 | df_truth['i'] = 0.25
179 | assert t.equals(df_truth)
180 | # grouped mutate
181 | t = df >> group_by(X.cut) >> mutate(i=IQR(X.x))
182 | df_truth['i'] = pd.Series([0.000, 0.155, 0.145, 0.155, 0.145])
183 | test_vector = abs(t.i - df_truth.i)
184 | assert all(test_vector < 0.000000001)
185 |
186 |
187 | def test_colmin():
188 | df = diamonds >> select(X.cut, X.x) >> head(5)
189 | # straight summarize
190 | t = df >> summarize(m=colmin(X.x))
191 | df_truth = pd.DataFrame({'m': [3.89]})
192 | assert t.equals(df_truth)
193 | # grouped summarize
194 | t = df >> group_by(X.cut) >> summarize(m=colmin(X.x))
195 | df_truth = pd.DataFrame({'cut': ['Good', 'Ideal', 'Premium'],
196 | 'm': [4.05, 3.95, 3.89]})
197 | assert t.equals(df_truth)
198 | # straight mutate
199 | t = df >> mutate(m=colmin(X.x))
200 | df_truth = df.copy()
201 | df_truth['m'] = 3.89
202 | assert t.equals(df_truth)
203 | # grouped mutate
204 | t = df >> group_by(X.cut) >> mutate(m=colmin(X.x))
205 | df_truth['m'] = pd.Series([3.95, 3.89, 4.05, 3.89, 4.05])
206 | assert t.sort_index().equals(df_truth)
207 |
208 |
209 | def test_colmax():
210 | df = diamonds >> select(X.cut, X.x) >> head(5)
211 | # straight summarize
212 | t = df >> summarize(m=colmax(X.x))
213 | df_truth = pd.DataFrame({'m': [4.34]})
214 | assert t.equals(df_truth)
215 | # grouped summarize
216 | t = df >> group_by(X.cut) >> summarize(m=colmax(X.x))
217 | df_truth = pd.DataFrame({'cut': ['Good', 'Ideal', 'Premium'],
218 | 'm': [4.34, 3.95, 4.20]})
219 | assert t.equals(df_truth)
220 | # straight mutate
221 | t = df >> mutate(m=colmax(X.x))
222 | df_truth = df.copy()
223 | df_truth['m'] = 4.34
224 | assert t.equals(df_truth)
225 | # grouped mutate
228 | t = df >> group_by(X.cut) >> mutate(m=colmax(X.x))
229 | df_truth['m'] = pd.Series([3.95, 4.20, 4.34, 4.20, 4.34])
232 | assert t.sort_index().equals(df_truth)
233 |
234 |
235 | def test_median():
236 | df = diamonds >> group_by(X.cut) >> head(3) >> select(X.cut, X.x) >> ungroup()
237 | # straight summarize
238 | t = df >> summarize(m=median(X.x))
239 | df_truth = pd.DataFrame({'m': [4.05]})
240 | assert t.equals(df_truth)
241 |
242 | # grouped summarize
243 | t = df >> group_by(X.cut) >> summarize(m=median(X.x))
244 | df_truth = pd.DataFrame({'cut': ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'],
245 | 'm': [6.27, 4.25, 3.95, 3.89, 3.95]})
246 | assert t.equals(df_truth)
247 | # straight mutate
248 | t = df >> mutate(m=median(X.x))
249 | df_truth = df.copy()
250 | df_truth['m'] = 4.05
251 | assert t.equals(df_truth)
252 | # grouped mutate
253 | # t = df >> group_by(X.cut) >> mutate(m=median(X.x))
254 | # df_truth['m'] = pd.Series(
255 | # [6.27, 6.27, 6.27, 4.25, 4.25, 4.25, 3.95, 3.95, 3.95, 3.89, 3.89, 3.89, 3.95, 3.95, 3.95],
256 | # index=t.index)
257 | # assert t.equals(df_truth)
258 | # make sure it handles case with even counts properly
259 | df = diamonds >> group_by(X.cut) >> head(2) >> select(X.cut, X.x)
260 | t = df >> group_by(X.cut) >> summarize(m=median(X.x))
261 | df_truth = pd.DataFrame({'cut': ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'],
262 | 'm': [5.160, 4.195, 3.940, 4.045, 3.945]})
263 | test_vector = abs(t.m - df_truth.m)
264 | assert all(test_vector < .000000001)
265 |
266 |
267 | def test_var():
268 | df = diamonds >> group_by(X.cut) >> head(3) >> select(X.cut, X.x) >> ungroup()
269 |
270 | # straight summarize
271 | t = df >> summarize(v=var(X.x))
272 | df_truth = pd.DataFrame({'v': [0.687392]})
273 | test_vector = abs(t.v - df_truth.v)
276 | assert all(test_vector < .00001)
277 |
278 | # grouped summarize
279 | t = df >> group_by(X.cut) >> summarize(v=var(X.x))
280 | df_truth = pd.DataFrame({'cut': ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'],
281 | 'v': [2.074800, 0.022033, 0.056133, 0.033100, 0.005233]})
282 | test_vector = abs(t.v - df_truth.v)
283 | assert all(test_vector < .00001)
284 | # straight mutate
285 | t = df >> mutate(v=var(X.x))
286 | df_truth = df.copy()
287 | df_truth['v'] = 0.687392
288 | test_vector = abs(t.v - df_truth.v)
289 | assert all(test_vector < .00001)
290 | # grouped mutate
291 | # t = df >> group_by(X.cut) >> mutate(v=var(X.x))
292 | # df_truth['v'] = pd.Series([2.074800, 2.074800, 2.074800, 0.022033, 0.022033, 0.022033,
293 | # 0.056133, 0.056133, 0.056133, 0.033100, 0.033100, 0.033100,
294 | # 0.005233, 0.005233, 0.005233],
295 | # index=t.index)
296 | # test_vector = abs(t.v - df_truth.v)
297 | # assert all(test_vector < .00001)
298 | # test with single value (var undefined)
299 | df = diamonds >> group_by(X.cut) >> head(1) >> select(X.cut, X.x)
300 | t = df >> group_by(X.cut) >> summarize(v=var(X.x))
301 | df_truth = pd.DataFrame({'cut': ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'],
302 | 'v': [np.nan, np.nan, np.nan, np.nan, np.nan]})
303 | assert t.equals(df_truth)
304 |
305 |
306 | def test_sd():
307 | df = diamonds >> group_by(X.cut) >> head(3) >> select(X.cut, X.x) >> ungroup()
308 | # straight summarize
309 | t = df >> summarize(s=sd(X.x))
310 | df_truth = pd.DataFrame({'s': [0.829091]})
311 | test_vector = abs(t.s - df_truth.s)
315 | assert all(test_vector < .00001)
316 | # grouped summarize
317 | t = df >> group_by(X.cut) >> summarize(s=sd(X.x))
318 | df_truth = pd.DataFrame({'cut': ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'],
319 | 's': [1.440417, 0.148436, 0.236925, 0.181934, 0.072342]})
320 | test_vector = abs(t.s - df_truth.s)
321 | assert all(test_vector < .00001)
322 | # straight mutate
323 | t = df >> mutate(s=sd(X.x))
324 | df_truth = df.copy()
325 | df_truth['s'] = 0.829091
326 | test_vector = abs(t.s - df_truth.s)
327 | assert all(test_vector < .00001)
328 | # grouped mutate
329 | t = df >> group_by(X.cut) >> mutate(s=sd(X.x))
330 | # df_truth['s'] = pd.Series([1.440417, 1.440417, 1.440417, 0.148436, 0.148436, 0.148436,
331 | # 0.236925, 0.236925, 0.236925, 0.181934, 0.181934, 0.181934,
332 | # 0.072342, 0.072342, 0.072342],
333 | # index=t.index)
334 | # test_vector = abs(t.s - df_truth.s)
335 |     # assert all(test_vector < .00001)
338 | # test with single value (var undefined)
339 | df = diamonds >> group_by(X.cut) >> head(1) >> select(X.cut, X.x)
340 | t = df >> group_by(X.cut) >> summarize(s=sd(X.x))
341 | df_truth = pd.DataFrame({'cut': ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'],
342 | 's': [np.nan, np.nan, np.nan, np.nan, np.nan]})
343 | assert t.equals(df_truth)
344 |
--------------------------------------------------------------------------------
/examples/basics-extending-functionality.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "\n",
14 | "from dfply import *"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "### Case #1: A custom pipe function\n",
22 | "---\n",
23 | "\n",
24 | "Pandas has a function `pd.crosstab` which can generate a cross-tabluation of factors. Let's say we wanted to build a pipe function that wrapped around this. The docstring of the Pandas function is below:\n",
25 | "\n",
26 | "Compute a simple cross-tabulation of two (or more) factors. By default\n",
27 | "computes a frequency table of the factors unless an array of values and an\n",
28 | "aggregation function are passed\n",
29 | "\n",
30 | " Parameters\n",
31 | " ----------\n",
32 | " index : array-like, Series, or list of arrays/Series\n",
33 | " Values to group by in the rows\n",
34 | " columns : array-like, Series, or list of arrays/Series\n",
35 | " Values to group by in the columns\n",
36 | " values : array-like, optional\n",
37 | " Array of values to aggregate according to the factors.\n",
38 | " Requires `aggfunc` be specified.\n",
39 | " aggfunc : function, optional\n",
40 | " If specified, requires `values` be specified as well\n",
41 | " rownames : sequence, default None\n",
42 | " If passed, must match number of row arrays passed\n",
43 | " colnames : sequence, default None\n",
44 | " If passed, must match number of column arrays passed\n",
45 | " margins : boolean, default False\n",
46 | " Add row/column margins (subtotals)\n",
47 | " dropna : boolean, default True\n",
48 | " Do not include columns whose entries are all NaN\n",
49 | " normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False\n",
50 | " Normalize by dividing all values by the sum of values.\n",
51 | "\n",
52 | " - If passed 'all' or `True`, will normalize over all values.\n",
53 | " - If passed 'index' will normalize over each row.\n",
54 | " - If passed 'columns' will normalize over each column.\n",
55 | " - If margins is `True`, will also normalize margin values.\n",
56 | " \n",
57 | "\n",
58 | "**To keep it simple, let's build a reduced version of this that takes only:**\n",
59 | "- `index`\n",
60 | "- `columns`\n",
61 | "- `values`\n",
62 | "- `aggfunc`\n",
63 | "\n",
64 | "Below is a function that wraps around the call to `pd.crosstab`. "
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 5,
70 | "metadata": {
71 | "collapsed": true
72 | },
73 | "outputs": [],
74 | "source": [
75 | "def crosstab(index, columns, values=None, aggfunc=None):\n",
76 | " return pd.crosstab(index, columns, values=values, aggfunc=aggfunc)"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 6,
82 | "metadata": {},
83 | "outputs": [
84 | {
85 | "data": {
148 | "text/plain": [
149 | " carat cut color clarity depth table price x y z\n",
150 | "0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43\n",
151 | "1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31"
152 | ]
153 | },
154 | "execution_count": 6,
155 | "metadata": {},
156 | "output_type": "execute_result"
157 | }
158 | ],
159 | "source": [
160 | "diamonds.head(2)"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": 7,
166 | "metadata": {},
167 | "outputs": [
168 | {
169 | "data": {
263 | "text/plain": [
264 | "color D E F G H I J\n",
265 | "cut \n",
266 | "Fair 163 224 312 314 303 175 119\n",
267 | "Good 662 933 909 871 702 522 307\n",
268 | "Ideal 2834 3903 3826 4884 3115 2093 896\n",
269 | "Premium 1603 2337 2331 2924 2360 1428 808\n",
270 | "Very Good 1513 2400 2164 2299 1824 1204 678"
271 | ]
272 | },
273 | "execution_count": 7,
274 | "metadata": {},
275 | "output_type": "execute_result"
276 | }
277 | ],
278 | "source": [
279 | "crosstab(diamonds.cut, diamonds.color)"
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {},
285 | "source": [
286 | "If you want your function to be part of a dfply pipe chain, the first argument _must_ be a dataframe, which is implicitly passed through during the evaluation of the chain! We will need to redefine the function to have the implicit `df` passed in as the first argument.\n",
287 | "\n",
288 | "The most common and straightforward way to convert a custom function to a dfply piping function is to use the `@dfpipe` decorator. \n",
289 | "\n",
290 | "> Note: the `@dfpipe` decorator is in fact a convenience decorator that stacks three dfply decorators together: \n",
291 | "\n",
292 | "> `@pipe` \n",
293 | "`@group_delegation` \n",
294 | "`@symbolic_evaluation` \n",
295 | "\n",
296 | "> `@pipe` ensures that the function will work in the dfply piping syntax and take an implicit DataFrame, `@group_delegation` makes the function work with groupings applied prior in the chain, and `@symbolic_evaluation` enables you to use and evaluate symbolic arguments like `X.cut` that are placeholders for incoming data."
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 8,
302 | "metadata": {
303 | "collapsed": true
304 | },
305 | "outputs": [],
306 | "source": [
307 | "@dfpipe\n",
308 | "def crosstab(df, index, columns, values=None, aggfunc=None):\n",
309 | " return pd.crosstab(index, columns, values=values, aggfunc=aggfunc)"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 9,
315 | "metadata": {},
316 | "outputs": [
317 | {
318 | "data": {
412 | "text/plain": [
413 | "color D E F G H I J\n",
414 | "cut \n",
415 | "Fair 163 224 312 314 303 175 119\n",
416 | "Good 662 933 909 871 702 522 307\n",
417 | "Ideal 2834 3903 3826 4884 3115 2093 896\n",
418 | "Premium 1603 2337 2331 2924 2360 1428 808\n",
419 | "Very Good 1513 2400 2164 2299 1824 1204 678"
420 | ]
421 | },
422 | "execution_count": 9,
423 | "metadata": {},
424 | "output_type": "execute_result"
425 | }
426 | ],
427 | "source": [
428 | "diamonds >> crosstab(X.cut, X.color)"
429 | ]
430 | },
431 | {
432 | "cell_type": "markdown",
433 | "metadata": {},
434 | "source": [
435 | "### Case #2: A function that works with symbolic arguments\n",
436 | "---\n",
437 | "\n",
438 | "Many tasks are simpler and do not require the capacity to work as a pipe function. The `dfply` window functions are the common examples of this: functions that take a Series (or _symbolic_ Series) and return a modified version.\n",
439 | "\n",
440 | "Let's say we had a dataframe with dates represented by strings that we wanted to convert to pandas datetime objects using the `pd.to_datetime` function. Below is a tiny example dataframe with this issue."
441 | ]
442 | },
443 | {
444 | "cell_type": "code",
445 | "execution_count": 10,
446 | "metadata": {},
447 | "outputs": [
448 | {
449 | "data": {
503 | "text/plain": [
504 | " date sales\n",
505 | "0 7/10/17 1220\n",
506 | "1 7/11/17 1592\n",
507 | "2 7/12/17 908\n",
508 | "3 7/13/17 1102\n",
509 | "4 7/14/17 1395"
510 | ]
511 | },
512 | "execution_count": 10,
513 | "metadata": {},
514 | "output_type": "execute_result"
515 | }
516 | ],
517 | "source": [
518 | "sales = pd.DataFrame(dict(date=['7/10/17','7/11/17','7/12/17','7/13/17','7/14/17'],\n",
519 | " sales=[1220, 1592, 908, 1102, 1395]))\n",
520 | "sales"
521 | ]
522 | },
523 | {
524 | "cell_type": "code",
525 | "execution_count": 11,
526 | "metadata": {},
527 | "outputs": [
528 | {
529 | "data": {
530 | "text/plain": [
531 | "date object\n",
532 | "sales int64\n",
533 | "dtype: object"
534 | ]
535 | },
536 | "execution_count": 11,
537 | "metadata": {},
538 | "output_type": "execute_result"
539 | }
540 | ],
541 | "source": [
542 | "sales.dtypes"
543 | ]
544 | },
545 | {
546 | "cell_type": "markdown",
547 | "metadata": {},
548 | "source": [
549 | "In pandas we would use the `pd.to_datetime` function to convert the strings to date objects, and add it as a new column like so:"
550 | ]
551 | },
552 | {
553 | "cell_type": "code",
554 | "execution_count": 12,
555 | "metadata": {},
556 | "outputs": [
557 | {
558 | "data": {
618 | "text/plain": [
619 | " date sales pd_date\n",
620 | "0 7/10/17 1220 2017-07-10\n",
621 | "1 7/11/17 1592 2017-07-11\n",
622 | "2 7/12/17 908 2017-07-12\n",
623 | "3 7/13/17 1102 2017-07-13\n",
624 | "4 7/14/17 1395 2017-07-14"
625 | ]
626 | },
627 | "execution_count": 12,
628 | "metadata": {},
629 | "output_type": "execute_result"
630 | }
631 | ],
632 | "source": [
633 | "sales['pd_date'] = pd.to_datetime(sales['date'], infer_datetime_format=True)\n",
634 | "sales"
635 | ]
636 | },
637 | {
638 | "cell_type": "code",
639 | "execution_count": 13,
640 | "metadata": {
641 | "collapsed": true
642 | },
643 | "outputs": [],
644 | "source": [
645 | "sales.drop('pd_date', axis=1, inplace=True)"
646 | ]
647 | },
648 | {
649 | "cell_type": "markdown",
650 | "metadata": {},
651 | "source": [
652 | "What if you tried to use the `pd.to_datetime` function inside of a call to mutate, like so?\n",
653 | "\n",
654 | "```python\n",
655 | "sales >> mutate(pd_date=pd.to_datetime(X.date, infer_datetime_format=True))\n",
656 | "```\n",
657 | "\n",
658 | "This will unfortunately break. The `dfply` functions are special in that they \"know\" to delay their evaluation until the data is at that point in the chain. `pd.to_datetime` is not such a function, and will immediately try to evaluate `X.date`. With a symbolic `Intention` argument passed in, the function will fail as it does not know what to do with that.\n",
659 | "\n",
660 | "Instead, we will need to make a wrapper around `pd.to_datetime` that can handle these symbolic arguments and delay evaluation until the right time. \n",
661 | "\n",
662 | "This is quite simple: all you need to do is decorate a function with the `@make_symbolic` decorator, like so:"
663 | ]
664 | },
665 | {
666 | "cell_type": "code",
667 | "execution_count": 14,
668 | "metadata": {
669 | "collapsed": true
670 | },
671 | "outputs": [],
672 | "source": [
673 | "@make_symbolic\n",
674 | "def to_datetime(series, infer_datetime_format=True):\n",
675 | " return pd.to_datetime(series, infer_datetime_format=infer_datetime_format)"
676 | ]
677 | },
678 | {
679 | "cell_type": "code",
680 | "execution_count": 15,
681 | "metadata": {},
682 | "outputs": [
683 | {
684 | "data": {
743 | ],
744 | "text/plain": [
745 | " date sales pd_date\n",
746 | "0 7/10/17 1220 2017-07-10\n",
747 | "1 7/11/17 1592 2017-07-11\n",
748 | "2 7/12/17 908 2017-07-12\n",
749 | "3 7/13/17 1102 2017-07-13\n",
750 | "4 7/14/17 1395 2017-07-14"
751 | ]
752 | },
753 | "execution_count": 15,
754 | "metadata": {},
755 | "output_type": "execute_result"
756 | }
757 | ],
758 | "source": [
759 | "sales >> mutate(pd_date=to_datetime(X.date))"
760 | ]
761 | },
762 | {
763 | "cell_type": "markdown",
764 | "metadata": {},
765 | "source": [
766 | "And there you go. Able to delay the evaluation.\n",
767 | "\n",
768 | "What's particularly nice about the `@make_symbolic` decorator is that it has no trouble working with non-symbolic arguments too. If we were to pass in the series itself the function evaluates without a problem:"
769 | ]
770 | },
771 | {
772 | "cell_type": "code",
773 | "execution_count": 16,
774 | "metadata": {},
775 | "outputs": [
776 | {
777 | "data": {
778 | "text/plain": [
779 | "0 2017-07-10\n",
780 | "1 2017-07-11\n",
781 | "2 2017-07-12\n",
782 | "3 2017-07-13\n",
783 | "4 2017-07-14\n",
784 | "Name: date, dtype: datetime64[ns]"
785 | ]
786 | },
787 | "execution_count": 16,
788 | "metadata": {},
789 | "output_type": "execute_result"
790 | }
791 | ],
792 | "source": [
793 | "to_datetime(sales.date)"
794 | ]
795 | },
796 | {
797 | "cell_type": "markdown",
798 | "metadata": {},
799 | "source": [
800 | "Keep in mind, though, that if _any_ of the arguments or keyword arguments are symbolic `Intention` objects, the return will itself be an `Intention` object representing the function awaiting evaluation by a dataframe:"
801 | ]
802 | },
803 | {
804 | "cell_type": "code",
805 | "execution_count": 17,
806 | "metadata": {},
807 | "outputs": [
808 | {
809 | "data": {
810 | "text/plain": [
811 | ""
812 | ]
813 | },
814 | "execution_count": 17,
815 | "metadata": {},
816 | "output_type": "execute_result"
817 | }
818 | ],
819 | "source": [
820 | "to_datetime(X.date)"
821 | ]
822 | },
823 | {
824 | "cell_type": "code",
825 | "execution_count": 19,
826 | "metadata": {},
827 | "outputs": [
828 | {
829 | "data": {
830 | "text/plain": [
831 | "0 2017-07-10\n",
832 | "1 2017-07-11\n",
833 | "2 2017-07-12\n",
834 | "3 2017-07-13\n",
835 | "4 2017-07-14\n",
836 | "Name: date, dtype: datetime64[ns]"
837 | ]
838 | },
839 | "execution_count": 19,
840 | "metadata": {},
841 | "output_type": "execute_result"
842 | }
843 | ],
844 | "source": [
845 | "awaiting = to_datetime(X.date)\n",
846 | "awaiting.evaluate(sales)"
847 | ]
848 |   }
858 | ],
859 | "metadata": {
860 | "kernelspec": {
861 | "display_name": "Python 3",
862 | "language": "python",
863 | "name": "python3"
864 | },
865 | "language_info": {
866 | "codemirror_mode": {
867 | "name": "ipython",
868 | "version": 3
869 | },
870 | "file_extension": ".py",
871 | "mimetype": "text/x-python",
872 | "name": "python",
873 | "nbconvert_exporter": "python",
874 | "pygments_lexer": "ipython3",
875 | "version": "3.6.1"
876 | }
877 | },
878 | "nbformat": 4,
879 | "nbformat_minor": 2
880 | }
881 |
--------------------------------------------------------------------------------
/examples/.ipynb_checkpoints/basics-extending-functionality-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "\n",
14 | "from dfply import *"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "### Case #1: A custom pipe function\n",
22 | "---\n",
23 | "\n",
24 | "Pandas has a function `pd.crosstab` which can generate a cross-tabluation of factors. Let's say we wanted to build a pipe function that wrapped around this. The docstring of the Pandas function is below:\n",
25 | "\n",
26 | "Compute a simple cross-tabulation of two (or more) factors. By default\n",
27 | "computes a frequency table of the factors unless an array of values and an\n",
28 | "aggregation function are passed\n",
29 | "\n",
30 | " Parameters\n",
31 | " ----------\n",
32 | " index : array-like, Series, or list of arrays/Series\n",
33 | " Values to group by in the rows\n",
34 | " columns : array-like, Series, or list of arrays/Series\n",
35 | " Values to group by in the columns\n",
36 | " values : array-like, optional\n",
37 | " Array of values to aggregate according to the factors.\n",
38 | " Requires `aggfunc` be specified.\n",
39 | " aggfunc : function, optional\n",
40 | " If specified, requires `values` be specified as well\n",
41 | " rownames : sequence, default None\n",
42 | " If passed, must match number of row arrays passed\n",
43 | " colnames : sequence, default None\n",
44 | " If passed, must match number of column arrays passed\n",
45 | " margins : boolean, default False\n",
46 | " Add row/column margins (subtotals)\n",
47 | " dropna : boolean, default True\n",
48 | " Do not include columns whose entries are all NaN\n",
49 | " normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False\n",
50 | " Normalize by dividing all values by the sum of values.\n",
51 | "\n",
52 | " - If passed 'all' or `True`, will normalize over all values.\n",
53 | " - If passed 'index' will normalize over each row.\n",
54 | " - If passed 'columns' will normalize over each column.\n",
55 | " - If margins is `True`, will also normalize margin values.\n",
56 | " \n",
57 | "\n",
58 | "**To keep it simple, let's build a reduced version of this that takes only:**\n",
59 | "- `index`\n",
60 | "- `columns`\n",
61 | "- `values`\n",
62 | "- `aggfunc`\n",
63 | "\n",
64 | "Below is a function that wraps around the call to `pd.crosstab`. "
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 5,
70 | "metadata": {
71 | "collapsed": true
72 | },
73 | "outputs": [],
74 | "source": [
75 | "def crosstab(index, columns, values=None, aggfunc=None):\n",
76 | " return pd.crosstab(index, columns, values=values, aggfunc=aggfunc)"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 6,
82 | "metadata": {},
83 | "outputs": [
84 | {
85 | "data": {
86 | "text/html": [
87 | "\n",
88 | "\n",
101 | "
\n",
102 | " \n",
103 | " \n",
104 | " | \n",
105 | " carat | \n",
106 | " cut | \n",
107 | " color | \n",
108 | " clarity | \n",
109 | " depth | \n",
110 | " table | \n",
111 | " price | \n",
112 | " x | \n",
113 | " y | \n",
114 | " z | \n",
115 | "
\n",
116 | " \n",
117 | " \n",
118 | " \n",
119 | " | 0 | \n",
120 | " 0.23 | \n",
121 | " Ideal | \n",
122 | " E | \n",
123 | " SI2 | \n",
124 | " 61.5 | \n",
125 | " 55.0 | \n",
126 | " 326 | \n",
127 | " 3.95 | \n",
128 | " 3.98 | \n",
129 | " 2.43 | \n",
130 | "
\n",
131 | " \n",
132 | " | 1 | \n",
133 | " 0.21 | \n",
134 | " Premium | \n",
135 | " E | \n",
136 | " SI1 | \n",
137 | " 59.8 | \n",
138 | " 61.0 | \n",
139 | " 326 | \n",
140 | " 3.89 | \n",
141 | " 3.84 | \n",
142 | " 2.31 | \n",
143 | "
\n",
144 | " \n",
145 | "
\n",
146 | "
"
147 | ],
148 | "text/plain": [
149 | " carat cut color clarity depth table price x y z\n",
150 | "0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43\n",
151 | "1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31"
152 | ]
153 | },
154 | "execution_count": 6,
155 | "metadata": {},
156 | "output_type": "execute_result"
157 | }
158 | ],
159 | "source": [
160 | "diamonds.head(2)"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": 7,
166 | "metadata": {},
167 | "outputs": [
168 | {
169 | "data": {
170 | "text/html": [
171 | "\n",
172 | "\n",
185 | "
\n",
186 | " \n",
187 | " \n",
188 | " | color | \n",
189 | " D | \n",
190 | " E | \n",
191 | " F | \n",
192 | " G | \n",
193 | " H | \n",
194 | " I | \n",
195 | " J | \n",
196 | "
\n",
197 | " \n",
198 | " | cut | \n",
199 | " | \n",
200 | " | \n",
201 | " | \n",
202 | " | \n",
203 | " | \n",
204 | " | \n",
205 | " | \n",
206 | "
\n",
207 | " \n",
208 | " \n",
209 | " \n",
210 | " | Fair | \n",
211 | " 163 | \n",
212 | " 224 | \n",
213 | " 312 | \n",
214 | " 314 | \n",
215 | " 303 | \n",
216 | " 175 | \n",
217 | " 119 | \n",
218 | "
\n",
219 | " \n",
220 | " | Good | \n",
221 | " 662 | \n",
222 | " 933 | \n",
223 | " 909 | \n",
224 | " 871 | \n",
225 | " 702 | \n",
226 | " 522 | \n",
227 | " 307 | \n",
228 | "
\n",
229 | " \n",
230 | " | Ideal | \n",
231 | " 2834 | \n",
232 | " 3903 | \n",
233 | " 3826 | \n",
234 | " 4884 | \n",
235 | " 3115 | \n",
236 | " 2093 | \n",
237 | " 896 | \n",
238 | "
\n",
239 | " \n",
240 | " | Premium | \n",
241 | " 1603 | \n",
242 | " 2337 | \n",
243 | " 2331 | \n",
244 | " 2924 | \n",
245 | " 2360 | \n",
246 | " 1428 | \n",
247 | " 808 | \n",
248 | "
\n",
249 | " \n",
250 | " | Very Good | \n",
251 | " 1513 | \n",
252 | " 2400 | \n",
253 | " 2164 | \n",
254 | " 2299 | \n",
255 | " 1824 | \n",
256 | " 1204 | \n",
257 | " 678 | \n",
258 | "
\n",
259 | " \n",
260 | "
\n",
261 | "
"
262 | ],
263 | "text/plain": [
264 | "color D E F G H I J\n",
265 | "cut \n",
266 | "Fair 163 224 312 314 303 175 119\n",
267 | "Good 662 933 909 871 702 522 307\n",
268 | "Ideal 2834 3903 3826 4884 3115 2093 896\n",
269 | "Premium 1603 2337 2331 2924 2360 1428 808\n",
270 | "Very Good 1513 2400 2164 2299 1824 1204 678"
271 | ]
272 | },
273 | "execution_count": 7,
274 | "metadata": {},
275 | "output_type": "execute_result"
276 | }
277 | ],
278 | "source": [
279 | "crosstab(diamonds.cut, diamonds.color)"
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {},
285 | "source": [
286 | "If you want your function to be part of a dfply pipe chain, the first argument _must_ be a dataframe, which is implicitly passed through during the evaluation of the chain! We will need to redefine the function to have the implicit `df` passed in as the first argument.\n",
287 | "\n",
288 | "The most common and straightforward way to convert a custom function to a dfply piping function is to use the `@dfpipe` decorator. \n",
289 | "\n",
290 | "> Note: the `@dfpipe` decorator is in fact a convenience decorator that stacks three dfply decorators together: \n",
291 | "\n",
292 | "> `@pipe` \n",
293 | "`@group_delegation` \n",
294 | "`@symbolic_evaluation` \n",
295 | "\n",
296 | "> `@pipe` ensures that the function will work in the dfply piping syntax and take an implicit DataFrame, `@group_delegation` makes the function work with groupings applied prior in the chain, and `@symbolic_evaluation` enables you to use and evaluate symbolic arguments like `X.cut` that are placeholders for incoming data."
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 8,
302 | "metadata": {
303 | "collapsed": true
304 | },
305 | "outputs": [],
306 | "source": [
307 | "@dfpipe\n",
308 | "def crosstab(df, index, columns, values=None, aggfunc=None):\n",
309 | " return pd.crosstab(index, columns, values=values, aggfunc=aggfunc)"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 9,
315 | "metadata": {},
316 | "outputs": [
317 | {
318 | "data": {
319 | "text/html": [
320 | "\n",
321 | "\n",
334 | "
\n",
335 | " \n",
336 | " \n",
337 | " | color | \n",
338 | " D | \n",
339 | " E | \n",
340 | " F | \n",
341 | " G | \n",
342 | " H | \n",
343 | " I | \n",
344 | " J | \n",
345 | "
\n",
346 | " \n",
347 | " | cut | \n",
348 | " | \n",
349 | " | \n",
350 | " | \n",
351 | " | \n",
352 | " | \n",
353 | " | \n",
354 | " | \n",
355 | "
\n",
356 | " \n",
357 | " \n",
358 | " \n",
359 | " | Fair | \n",
360 | " 163 | \n",
361 | " 224 | \n",
362 | " 312 | \n",
363 | " 314 | \n",
364 | " 303 | \n",
365 | " 175 | \n",
366 | " 119 | \n",
367 | "
\n",
368 | " \n",
369 | " | Good | \n",
370 | " 662 | \n",
371 | " 933 | \n",
372 | " 909 | \n",
373 | " 871 | \n",
374 | " 702 | \n",
375 | " 522 | \n",
376 | " 307 | \n",
377 | "
\n",
378 | " \n",
379 | " | Ideal | \n",
380 | " 2834 | \n",
381 | " 3903 | \n",
382 | " 3826 | \n",
383 | " 4884 | \n",
384 | " 3115 | \n",
385 | " 2093 | \n",
386 | " 896 | \n",
387 | "
\n",
388 | " \n",
389 | " | Premium | \n",
390 | " 1603 | \n",
391 | " 2337 | \n",
392 | " 2331 | \n",
393 | " 2924 | \n",
394 | " 2360 | \n",
395 | " 1428 | \n",
396 | " 808 | \n",
397 | "
\n",
398 | " \n",
399 | " | Very Good | \n",
400 | " 1513 | \n",
401 | " 2400 | \n",
402 | " 2164 | \n",
403 | " 2299 | \n",
404 | " 1824 | \n",
405 | " 1204 | \n",
406 | " 678 | \n",
407 | "
\n",
408 | " \n",
409 | "
\n",
410 | "
"
411 | ],
412 | "text/plain": [
413 | "color D E F G H I J\n",
414 | "cut \n",
415 | "Fair 163 224 312 314 303 175 119\n",
416 | "Good 662 933 909 871 702 522 307\n",
417 | "Ideal 2834 3903 3826 4884 3115 2093 896\n",
418 | "Premium 1603 2337 2331 2924 2360 1428 808\n",
419 | "Very Good 1513 2400 2164 2299 1824 1204 678"
420 | ]
421 | },
422 | "execution_count": 9,
423 | "metadata": {},
424 | "output_type": "execute_result"
425 | }
426 | ],
427 | "source": [
428 | "diamonds >> crosstab(X.cut, X.color)"
429 | ]
430 | },
431 | {
432 | "cell_type": "markdown",
433 | "metadata": {},
434 | "source": [
435 | "### Case #2: A function that works with symbolic arguments\n",
436 | "---\n",
437 | "\n",
438 | "Many tasks are simpler and do not require the capacity to work as a pipe function. The `dfply` window functions are the common examples of this: functions that take a Series (or _symbolic_ Series) and return a modified version.\n",
439 | "\n",
440 | "Let's say we had a dataframe with dates represented by strings that we wanted to convert to pandas datetime objects using the `pd.to_datetime` function. Below is a tiny example dataframe with this issue."
441 | ]
442 | },
443 | {
444 | "cell_type": "code",
445 | "execution_count": 10,
446 | "metadata": {},
447 | "outputs": [
448 | {
449 | "data": {
450 | "text/html": [
451 | "\n",
452 | "\n",
465 | "
\n",
466 | " \n",
467 | " \n",
468 | " | \n",
469 | " date | \n",
470 | " sales | \n",
471 | "
\n",
472 | " \n",
473 | " \n",
474 | " \n",
475 | " | 0 | \n",
476 | " 7/10/17 | \n",
477 | " 1220 | \n",
478 | "
\n",
479 | " \n",
480 | " | 1 | \n",
481 | " 7/11/17 | \n",
482 | " 1592 | \n",
483 | "
\n",
484 | " \n",
485 | " | 2 | \n",
486 | " 7/12/17 | \n",
487 | " 908 | \n",
488 | "
\n",
489 | " \n",
490 | " | 3 | \n",
491 | " 7/13/17 | \n",
492 | " 1102 | \n",
493 | "
\n",
494 | " \n",
495 | " | 4 | \n",
496 | " 7/14/17 | \n",
497 | " 1395 | \n",
498 | "
\n",
499 | " \n",
500 | "
\n",
501 | "
"
502 | ],
503 | "text/plain": [
504 | " date sales\n",
505 | "0 7/10/17 1220\n",
506 | "1 7/11/17 1592\n",
507 | "2 7/12/17 908\n",
508 | "3 7/13/17 1102\n",
509 | "4 7/14/17 1395"
510 | ]
511 | },
512 | "execution_count": 10,
513 | "metadata": {},
514 | "output_type": "execute_result"
515 | }
516 | ],
517 | "source": [
518 | "sales = pd.DataFrame(dict(date=['7/10/17','7/11/17','7/12/17','7/13/17','7/14/17'],\n",
519 | " sales=[1220, 1592, 908, 1102, 1395]))\n",
520 | "sales"
521 | ]
522 | },
523 | {
524 | "cell_type": "code",
525 | "execution_count": 11,
526 | "metadata": {},
527 | "outputs": [
528 | {
529 | "data": {
530 | "text/plain": [
531 | "date object\n",
532 | "sales int64\n",
533 | "dtype: object"
534 | ]
535 | },
536 | "execution_count": 11,
537 | "metadata": {},
538 | "output_type": "execute_result"
539 | }
540 | ],
541 | "source": [
542 | "sales.dtypes"
543 | ]
544 | },
545 | {
546 | "cell_type": "markdown",
547 | "metadata": {},
548 | "source": [
549 | "In pandas we would use the `pd.to_datetime` function to convert the strings to date objects, and add it as a new column like so:"
550 | ]
551 | },
552 | {
553 | "cell_type": "code",
554 | "execution_count": 12,
555 | "metadata": {},
556 | "outputs": [
557 | {
558 | "data": {
559 | "text/html": [
560 | "\n",
561 | "\n",
574 | "
\n",
575 | " \n",
576 | " \n",
577 | " | \n",
578 | " date | \n",
579 | " sales | \n",
580 | " pd_date | \n",
581 | "
\n",
582 | " \n",
583 | " \n",
584 | " \n",
585 | " | 0 | \n",
586 | " 7/10/17 | \n",
587 | " 1220 | \n",
588 | " 2017-07-10 | \n",
589 | "
\n",
590 | " \n",
591 | " | 1 | \n",
592 | " 7/11/17 | \n",
593 | " 1592 | \n",
594 | " 2017-07-11 | \n",
595 | "
\n",
596 | " \n",
597 | " | 2 | \n",
598 | " 7/12/17 | \n",
599 | " 908 | \n",
600 | " 2017-07-12 | \n",
601 | "
\n",
602 | " \n",
603 | " | 3 | \n",
604 | " 7/13/17 | \n",
605 | " 1102 | \n",
606 | " 2017-07-13 | \n",
607 | "
\n",
608 | " \n",
609 | " | 4 | \n",
610 | " 7/14/17 | \n",
611 | " 1395 | \n",
612 | " 2017-07-14 | \n",
613 | "
\n",
614 | " \n",
615 | "
\n",
616 | "
"
617 | ],
618 | "text/plain": [
619 | " date sales pd_date\n",
620 | "0 7/10/17 1220 2017-07-10\n",
621 | "1 7/11/17 1592 2017-07-11\n",
622 | "2 7/12/17 908 2017-07-12\n",
623 | "3 7/13/17 1102 2017-07-13\n",
624 | "4 7/14/17 1395 2017-07-14"
625 | ]
626 | },
627 | "execution_count": 12,
628 | "metadata": {},
629 | "output_type": "execute_result"
630 | }
631 | ],
632 | "source": [
633 | "sales['pd_date'] = pd.to_datetime(sales['date'], infer_datetime_format=True)\n",
634 | "sales"
635 | ]
636 | },
637 | {
638 | "cell_type": "code",
639 | "execution_count": 13,
640 | "metadata": {
641 | "collapsed": true
642 | },
643 | "outputs": [],
644 | "source": [
645 | "sales.drop('pd_date', axis=1, inplace=True)"
646 | ]
647 | },
648 | {
649 | "cell_type": "markdown",
650 | "metadata": {},
651 | "source": [
652 | "What if you tried to use the `pd.to_datetime` function inside of a call to mutate, like so?\n",
653 | "\n",
654 | "```python\n",
655 | "sales >> mutate(pd_date=pd.to_datetime(X.date, infer_datetime_format=True))\n",
656 | "```\n",
657 | "\n",
658 | "This will unfortunately break. The `dfply` functions are special in that they \"know\" to delay their evaluation until the data is at that point in the chain. `pd.to_datetime` is not such a function, and will immediately try to evaluate `X.date`. With a symbolic `Intention` argument passed in, the function will fail as it does not know what to do with that.\n",
659 | "\n",
660 | "Instead, we will need to make a wrapper around `pd.to_datetime` that can handle these symbolic arguments and delay evaluation until the right time. \n",
661 | "\n",
662 | "This is quite simple: all you need to do is decorate a function with the `@make_symbolic` decorator, like so:"
663 | ]
664 | },
665 | {
666 | "cell_type": "code",
667 | "execution_count": 14,
668 | "metadata": {
669 | "collapsed": true
670 | },
671 | "outputs": [],
672 | "source": [
673 | "@make_symbolic\n",
674 | "def to_datetime(series, infer_datetime_format=True):\n",
675 | " return pd.to_datetime(series, infer_datetime_format=infer_datetime_format)"
676 | ]
677 | },
678 | {
679 | "cell_type": "code",
680 | "execution_count": 15,
681 | "metadata": {},
682 | "outputs": [
683 | {
684 | "data": {
685 | "text/html": [
686 | "\n",
687 | "\n",
700 | "
\n",
701 | " \n",
702 | " \n",
703 | " | \n",
704 | " date | \n",
705 | " sales | \n",
706 | " pd_date | \n",
707 | "
\n",
708 | " \n",
709 | " \n",
710 | " \n",
711 | " | 0 | \n",
712 | " 7/10/17 | \n",
713 | " 1220 | \n",
714 | " 2017-07-10 | \n",
715 | "
\n",
716 | " \n",
717 | " | 1 | \n",
718 | " 7/11/17 | \n",
719 | " 1592 | \n",
720 | " 2017-07-11 | \n",
721 | "
\n",
722 | " \n",
723 | " | 2 | \n",
724 | " 7/12/17 | \n",
725 | " 908 | \n",
726 | " 2017-07-12 | \n",
727 | "
\n",
728 | " \n",
729 | " | 3 | \n",
730 | " 7/13/17 | \n",
731 | " 1102 | \n",
732 | " 2017-07-13 | \n",
733 | "
\n",
734 | " \n",
735 | " | 4 | \n",
736 | " 7/14/17 | \n",
737 | " 1395 | \n",
738 | " 2017-07-14 | \n",
739 | "
\n",
740 | " \n",
741 | "
\n",
742 | "
"
743 | ],
744 | "text/plain": [
745 | " date sales pd_date\n",
746 | "0 7/10/17 1220 2017-07-10\n",
747 | "1 7/11/17 1592 2017-07-11\n",
748 | "2 7/12/17 908 2017-07-12\n",
749 | "3 7/13/17 1102 2017-07-13\n",
750 | "4 7/14/17 1395 2017-07-14"
751 | ]
752 | },
753 | "execution_count": 15,
754 | "metadata": {},
755 | "output_type": "execute_result"
756 | }
757 | ],
758 | "source": [
759 | "sales >> mutate(pd_date=to_datetime(X.date))"
760 | ]
761 | },
762 | {
763 | "cell_type": "markdown",
764 | "metadata": {},
765 | "source": [
766 | "And there you go. Able to delay the evaluation.\n",
767 | "\n",
768 | "What's particularly nice about the `@make_symbolic` decorator is that it has no trouble working with non-symbolic arguments too. If we were to pass in the series itself the function evaluates without a problem:"
769 | ]
770 | },
771 | {
772 | "cell_type": "code",
773 | "execution_count": 16,
774 | "metadata": {},
775 | "outputs": [
776 | {
777 | "data": {
778 | "text/plain": [
779 | "0 2017-07-10\n",
780 | "1 2017-07-11\n",
781 | "2 2017-07-12\n",
782 | "3 2017-07-13\n",
783 | "4 2017-07-14\n",
784 | "Name: date, dtype: datetime64[ns]"
785 | ]
786 | },
787 | "execution_count": 16,
788 | "metadata": {},
789 | "output_type": "execute_result"
790 | }
791 | ],
792 | "source": [
793 | "to_datetime(sales.date)"
794 | ]
795 | },
796 | {
797 | "cell_type": "markdown",
798 | "metadata": {},
799 | "source": [
800 | "Keep in mind, though, that if _any_ of the arguments or keyword arguments are symbolic `Intention` objects, the return will itself be an `Intention` object representing the function awaiting evaluation by a dataframe:"
801 | ]
802 | },
803 | {
804 | "cell_type": "code",
805 | "execution_count": 17,
806 | "metadata": {},
807 | "outputs": [
808 | {
809 | "data": {
810 | "text/plain": [
811 | ""
812 | ]
813 | },
814 | "execution_count": 17,
815 | "metadata": {},
816 | "output_type": "execute_result"
817 | }
818 | ],
819 | "source": [
820 | "to_datetime(X.date)"
821 | ]
822 | },
823 | {
824 | "cell_type": "code",
825 | "execution_count": 19,
826 | "metadata": {},
827 | "outputs": [
828 | {
829 | "data": {
830 | "text/plain": [
831 | "0 2017-07-10\n",
832 | "1 2017-07-11\n",
833 | "2 2017-07-12\n",
834 | "3 2017-07-13\n",
835 | "4 2017-07-14\n",
836 | "Name: date, dtype: datetime64[ns]"
837 | ]
838 | },
839 | "execution_count": 19,
840 | "metadata": {},
841 | "output_type": "execute_result"
842 | }
843 | ],
844 | "source": [
845 | "awaiting = to_datetime(X.date)\n",
846 | "awaiting.evaluate(sales)"
847 | ]
848 | },
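    | {
    | "cell_type": "markdown",
    | "metadata": {},
    | "source": [
    | "Conceptually, `@make_symbolic` behaves something like the simplified sketch below. This is an illustration only, not dfply's actual implementation (which lives in `dfply/base.py`); it assumes only that `Intention` can wrap a function to be called against the eventual DataFrame, matching the `.evaluate()` behavior shown above:\n",
    | "\n",
    | "```python\n",
    | "from functools import wraps\n",
    | "\n",
    | "def make_symbolic_sketch(f):\n",
    | "    # Simplified illustration of @make_symbolic -- not the real code.\n",
    | "    @wraps(f)\n",
    | "    def wrapper(*args, **kwargs):\n",
    | "        symbolic = any(isinstance(a, Intention)\n",
    | "                       for a in list(args) + list(kwargs.values()))\n",
    | "        if not symbolic:\n",
    | "            return f(*args, **kwargs)  # nothing to delay; evaluate now\n",
    | "        # Otherwise return an Intention that resolves each symbolic\n",
    | "        # argument once a DataFrame is available, then calls f.\n",
    | "        return Intention(lambda df: f(\n",
    | "            *[a.evaluate(df) if isinstance(a, Intention) else a for a in args],\n",
    | "            **{k: v.evaluate(df) if isinstance(v, Intention) else v\n",
    | "               for k, v in kwargs.items()}))\n",
    | "    return wrapper\n",
    | "```"
    | ]
    | }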
858 | ],
859 | "metadata": {
860 | "kernelspec": {
861 | "display_name": "Python 3",
862 | "language": "python",
863 | "name": "python3"
864 | },
865 | "language_info": {
866 | "codemirror_mode": {
867 | "name": "ipython",
868 | "version": 3
869 | },
870 | "file_extension": ".py",
871 | "mimetype": "text/x-python",
872 | "name": "python",
873 | "nbconvert_exporter": "python",
874 | "pygments_lexer": "ipython3",
875 | "version": "3.6.1"
876 | }
877 | },
878 | "nbformat": 4,
879 | "nbformat_minor": 2
880 | }
881 |
--------------------------------------------------------------------------------