├── .github ├── FUNDING.yml └── workflows │ └── python-publish.yml ├── .gitignore ├── CHANGELOG ├── LICENSE ├── Makefile ├── README.md ├── TODO ├── images ├── bars.png └── redframes.png ├── mypy.ini ├── redframes ├── __init__.py ├── checks.py ├── core.py ├── io │ ├── __init__.py │ ├── convert.py │ ├── load.py │ └── save.py ├── stat.py ├── types.py ├── verbs │ ├── __init__.py │ ├── accumulate.py │ ├── append.py │ ├── combine.py │ ├── cross.py │ ├── dedupe.py │ ├── denix.py │ ├── drop.py │ ├── fill.py │ ├── filter.py │ ├── gather.py │ ├── group.py │ ├── join.py │ ├── mutate.py │ ├── pack.py │ ├── rank.py │ ├── rename.py │ ├── replace.py │ ├── rollup.py │ ├── sample.py │ ├── select.py │ ├── shuffle.py │ ├── sort.py │ ├── split.py │ ├── spread.py │ ├── take.py │ └── unpack.py └── version.py ├── setup.py └── tests ├── __init__.py ├── test_deprecations.py ├── test_docstrings.py ├── test_dupe_columns.py ├── test_index.py ├── test_interchange.py ├── test_io.py ├── test_ladybugs.py ├── test_readme.py ├── test_side_effects.py └── test_type_hints.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [maxhumber] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 13 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 14 | 
-------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload to PyPI 5 | on: 6 | release: 7 | types: [published] 8 | permissions: 9 | contents: read 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Set up Python 16 | uses: actions/setup-python@v3 17 | with: 18 | python-version: '3.x' 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install -e ".[test]" 23 | - name: Run tests 24 | run: python -m unittest 25 | - name: Install build dependencies 26 | run: pip install build 27 | - name: Build package 28 | run: python -m build 29 | - name: Publish package 30 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 31 | with: 32 | user: __token__ 33 | password: ${{ secrets.PYPI_API_TOKEN }} 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # custom 2 | playground 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before 
PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | - 1.4.1 2 | - BUMP: support for pandas 2.0+ 3 | - NEW: support for Python 3.8 4 | - 1.4 5 | - NEW: pandas dependency pinned to below 2.0 6 | - NEW: `pack` verb 7 | - NEW: `unpack` verb 8 | - NEW: `group` + `gather` compatibility 9 | - NEW: `make loc` (for development) 10 | - IMPROVED: README Quickstart + "Verb Table" 11 | - IMPROVED: GroupedFrame `__repr__` 12 | - IMPROVED: `group` performance optimizations 13 | - BUGFIX: `rf.wrap` now properly throws an error on "MultiIndex" columns 14 | - BUGFIX: sort order is now retained in `group` operations 15 | - BUGFIX: some `TypeError`s have been changed to `ValueError`s 16 | - DEPRECATED: `gather(beside=...)` ...whoops! please use `group` + `gather`! 17 | - 1.3 18 | - NEW: `gather(beside=...)` argument! 19 | - IMPROVED: `sample` errors are more explicit 20 | - 1.2 21 | - NEW: `cross` join verb! 
22 | - NEW: `join(..., postfix=("_lhs, "_rhs"))` argument 23 | - NEW: `memory` property to check DataFrame memory footprint 24 | - NEW: Makefile (for development) 25 | - BUGFIX: `combine` drop=True argument now works as intended 26 | - BUGFIX: `summarize` deprecation warning now displays properly 27 | - BREAKING: `combine` now explicitly requires a `sep` argument 28 | - 1.1 29 | - BUMP: pandas 1.5+ 30 | - NEW: `__dataframe__` interchange format support 31 | - NEW: `rollup` verb (fka `summarize`) 32 | - NEW: `__version__` 33 | - BUGFIX: `select` verb now requires unique column keys 34 | - BUGFIX: `types` property is now more robust to mixed types within a column 35 | - DEPRECATED: `summarize` (please use `rollup`!) 36 | 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2022, Max Humber 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | python -m unittest 3 | 4 | format: 5 | isort redframes tests 6 | black redframes tests 7 | 8 | types: 9 | mypy redframes 10 | pyright redframes 11 | 12 | loc: 13 | find redframes -name '*.py' | xargs wc -l | sort -nr 14 | find tests -name '*.py' | xargs wc -l | sort -nr 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | redframes 3 |
4 |
5 | Pandas Version 6 | PyPI 7 | Downloads 8 |
9 |
10 |
11 | 12 | 13 | 14 | ### About 15 | 16 | **redframes** (**re**ctangular **d**ata **frames**) is a general purpose data manipulation library that prioritizes syntax, simplicity, and speed (to a solution). Importantly, the library is fully interoperable with [pandas](https://github.com/pandas-dev/pandas), compatible with [scikit-learn](https://github.com/scikit-learn/scikit-learn), and works great with [matplotlib](https://github.com/matplotlib/matplotlib). 17 | 18 | 19 | 20 | ### Install & Import 21 | 22 | ```sh 23 | pip install redframes 24 | ``` 25 | 26 | ```python 27 | import redframes as rf 28 | ``` 29 | 30 | 31 | 32 | ### Quickstart 33 | 34 | Copy-and-paste this to get started: 35 | 36 | ```python 37 | import redframes as rf 38 | 39 | df = rf.DataFrame({ 40 | 'bear': ['Brown bear', 'Polar bear', 'Asian black bear', 'American black bear', 'Sun bear', 'Sloth bear', 'Spectacled bear', 'Giant panda'], 41 | 'genus': ['Ursus', 'Ursus', 'Ursus', 'Ursus', 'Helarctos', 'Melursus', 'Tremarctos', 'Ailuropoda'], 42 | 'weight (male, lbs)': ['300-860', '880-1320', '220-440', '125-500', '60-150', '175-310', '220-340', '190-275'], 43 | 'weight (female, lbs)': ['205-455', '330-550', '110-275', '90-300', '45-90', '120-210', '140-180', '155-220'] 44 | }) 45 | 46 | # | bear | genus | weight (male, lbs) | weight (female, lbs) | 47 | # |:--------------------|:-----------|:---------------------|:-----------------------| 48 | # | Brown bear | Ursus | 300-860 | 205-455 | 49 | # | Polar bear | Ursus | 880-1320 | 330-550 | 50 | # | Asian black bear | Ursus | 220-440 | 110-275 | 51 | # | American black bear | Ursus | 125-500 | 90-300 | 52 | # | Sun bear | Helarctos | 60-150 | 45-90 | 53 | # | Sloth bear | Melursus | 175-310 | 120-210 | 54 | # | Spectacled bear | Tremarctos | 220-340 | 140-180 | 55 | # | Giant panda | Ailuropoda | 190-275 | 155-220 | 56 | 57 | ( 58 | df 59 | .rename({"weight (male, lbs)": "male", "weight (female, lbs)": "female"}) 60 | .gather(["male", "female"], 
into=("sex", "weight")) 61 | .split("weight", into=["min", "max"], sep="-") 62 | .gather(["min", "max"], into=("stat", "weight")) 63 | .mutate({"weight": lambda row: float(row["weight"])}) 64 | .group(["genus", "sex"]) 65 | .rollup({"weight": ("weight", rf.stat.mean)}) 66 | .spread("sex", using="weight") 67 | .mutate({"dimorphism": lambda row: round(row["male"] / row["female"], 2)}) 68 | .drop(["male", "female"]) 69 | .sort("dimorphism", descending=True) 70 | ) 71 | 72 | # | genus | dimorphism | 73 | # |:-----------|-------------:| 74 | # | Ursus | 2.01 | 75 | # | Tremarctos | 1.75 | 76 | # | Helarctos | 1.56 | 77 | # | Melursus | 1.47 | 78 | # | Ailuropoda | 1.24 | 79 | ``` 80 | 81 | 82 | 83 | For comparison, here's the equivalent pandas: 84 | 85 | ```python 86 | import pandas as pd 87 | 88 | # df = pd.DataFrame({...}) 89 | 90 | df = df.rename(columns={"weight (male, lbs)": "male", "weight (female, lbs)": "female"}) 91 | df = pd.melt(df, id_vars=['bear', 'genus'], value_vars=['male', 'female'], var_name='sex', value_name='weight') 92 | df[["min", "max"]] = df["weight"].str.split("-", expand=True) 93 | df = df.drop("weight", axis=1) 94 | df = pd.melt(df, id_vars=['bear', 'genus', 'sex'], value_vars=['min', 'max'], var_name='stat', value_name='weight') 95 | df['weight'] = df["weight"].astype('float') 96 | df = df.groupby(["genus", "sex"])["weight"].mean() 97 | df = df.reset_index() 98 | df = pd.pivot_table(df, index=['genus'], columns=['sex'], values='weight') 99 | df = df.reset_index() 100 | df = df.rename_axis(None, axis=1) 101 | df["dimorphism"] = round(df["male"] / df["female"], 2) 102 | df = df.drop(["female", "male"], axis=1) 103 | df = df.sort_values("dimorphism", ascending=False) 104 | df = df.reset_index(drop=True) 105 | 106 | # 🤮 107 | ``` 108 | 109 | 110 | 111 | ### IO 112 | 113 | Save, load, and convert `rf.DataFrame` objects: 114 | 115 | ```python 116 | # save .csv 117 | rf.save(df, "bears.csv") 118 | 119 | # load .csv 120 | df = rf.load("bears.csv") 
121 | 122 | # convert redframes → pandas 123 | pandas_df = rf.unwrap(df) 124 | 125 | # convert pandas → redframes 126 | df = rf.wrap(pandas_df) 127 | ``` 128 | 129 | 130 | 131 | ### Verbs 132 | 133 | Verbs are [pure](https://en.wikipedia.org/wiki/Pure_function) and "chain-able" methods that manipulate `rf.DataFrame` objects. Here is the complete list (see *docstrings* for examples and more details): 134 | 135 | | Verb | Description | 136 | | ------------------------------------------------ | ------------------------------------------------------------ | 137 | | `accumulate` | Run a cumulative sum over a column | 138 | | `append` | Append rows from another DataFrame | 139 | | `combine` | Combine multiple columns into a single column (opposite of `split`) | 140 | | `cross` | Cross join columns from another DataFrame | 141 | | `dedupe` | Remove duplicate rows | 142 | | [`denix`](https://www.dictionary.com/browse/nix) | Remove rows with missing values | 143 | | `drop` | Drop entire columns (opposite of `select`) | 144 | | `fill` | Fill missing values "down", "up", or with a constant | 145 | | `filter` | Keep rows matching specific conditions | 146 | | `gather` | Gather columns into rows (opposite of `spread`) | 147 | | `group` | Prepare groups for compatible verbs | 148 | | `join` | Join columns from another DataFrame | 149 | | `mutate` | Create a new, or overwrite an existing column | 150 | | `pack` | Collate and concatenate row values for a target column (opposite of `unpack`) | 151 | | `rank` | Rank order values in a column | 152 | | `rename` | Rename column keys | 153 | | `replace` | Replace matching values within columns | 154 | | `rollup` | Apply summary functions and/or statistics to target columns | 155 | | `sample` | Randomly sample any number of rows | 156 | | `select` | Select specific columns (opposite of `drop`) | 157 | | `shuffle` | Shuffle the order of all rows | 158 | | `sort` | Sort rows by specific columns | 159 | | `split` | Split a single column 
into multiple columns (opposite of `combine`) | 160 | | `spread` | Spread rows into columns (opposite of `gather`) | 161 | | `take` | Take any number of rows (from the top/bottom) | 162 | | `unpack` | "Explode" concatenated row values into multiple rows (opposite of `pack`) | 163 | 164 | 165 | 166 | ### Properties 167 | 168 | In addition to all of the verbs there are several properties attached to each `DataFrame` object: 169 | 170 | ```python 171 | df["genus"] 172 | # ['Ursus', 'Ursus', 'Ursus', 'Ursus', 'Helarctos', 'Melursus', 'Tremarctos', 'Ailuropoda'] 173 | 174 | df.columns 175 | # ['bear', 'genus', 'weight (male, lbs)', 'weight (female, lbs)'] 176 | 177 | df.dimensions 178 | # {'rows': 8, 'columns': 4} 179 | 180 | df.empty 181 | # False 182 | 183 | df.memory 184 | # '2 KB' 185 | 186 | df.types 187 | # {'bear': object, 'genus': object, 'weight (male, lbs)': object, 'weight (female, lbs)': object} 188 | ``` 189 | 190 | 191 | 192 | ### matplotlib 193 | 194 | `rf.DataFrame` objects integrate seamlessly with `matplotlib`: 195 | 196 | ```python 197 | import redframes as rf 198 | import matplotlib.pyplot as plt 199 | 200 | football = rf.DataFrame({ 201 | 'position': ['TE', 'K', 'RB', 'WR', 'QB'], 202 | 'avp': [116.98, 131.15, 180, 222.22, 272.91] 203 | }) 204 | 205 | df = ( 206 | football 207 | .mutate({"color": lambda row: row["position"] in ["WR", "RB"]}) 208 | .replace({"color": {False: "orange", True: "red"}}) 209 | ) 210 | 211 | plt.barh(df["position"], df["avp"], color=df["color"]); 212 | ``` 213 | 214 | redframes 215 | 216 | 217 | 218 | ### scikit-learn 219 | 220 | `rf.DataFrame` objects are fully compatible with `sklearn` functions, estimators, and transformers: 221 | 222 | ```python 223 | import redframes as rf 224 | from sklearn.model_selection import train_test_split 225 | from sklearn.linear_model import LinearRegression 226 | 227 | df = rf.DataFrame({ 228 | "touchdowns": [15, 19, 5, 7, 9, 10, 12, 22, 16, 10], 229 | "age": [21, 22, 21, 24, 26, 28, 30, 
35, 28, 21], 230 | "mvp": [1, 1, 0, 0, 0, 0, 0, 1, 0, 0] 231 | }) 232 | 233 | target = "touchdowns" 234 | y = df[target] 235 | X = df.drop(target) 236 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) 237 | 238 | model = LinearRegression() 239 | model.fit(X_train, y_train) 240 | model.score(X_test, y_test) 241 | # 0.5083194901655527 242 | 243 | print(X_train.take(1)) 244 | # rf.DataFrame({'age': [21], 'mvp': [0]}) 245 | 246 | X_new = rf.DataFrame({'age': [22], 'mvp': [1]}) 247 | model.predict(X_new) 248 | # array([19.]) 249 | ``` 250 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | 1.5 2 | - docstrings examples to bears 3 | - pd.DataFrame().to_redframes() 4 | - to_pandas / (to_csv / to_dict) 5 | - rf.read_csv() / rf.from_csv() 6 | - deprecate io functions 7 | - replace `__str__` 8 | - from_dict, from_pandas, from_csv, from_excel? 9 | 10 | 1.6 11 | - reorder/move columns to front / end / before / after 12 | - tally verb 13 | - complete verb (tidyr) 14 | 15 | 1.7 16 | - warning on multiple columns in mutate (override) or fix? 17 | - expose @extension 18 | 19 | 2.0 20 | - explicit * keyword arguments 21 | - remove deprecated functions/methods 22 | 23 | Later 24 | - complete verb (tidyr) 25 | - hide/protect/private methods/attributes 26 | - cheatsheet & tutorial 27 | 28 | Maybe 29 | - slice verb 30 | - log verb (Untitled12) 31 | - builtin datasets 32 | - vectorized mutate support (`.assign` mutate(..., vectorized=True))? 33 | - polars/arrow backend 34 | - class RedList(list): ...? 
35 | - speedtests 36 | -------------------------------------------------------------------------------- /images/bars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxhumber/redframes/6e3f1226358ad4e67f4343cbc4b1ee4b63475034/images/bars.png -------------------------------------------------------------------------------- /images/redframes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxhumber/redframes/6e3f1226358ad4e67f4343cbc4b1ee4b63475034/images/redframes.png -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.9 3 | warn_return_any = True 4 | warn_unused_configs = True 5 | 6 | [mypy-pandas.*] 7 | ignore_missing_imports = True -------------------------------------------------------------------------------- /redframes/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import stat 2 | from .core import DataFrame 3 | from .io import load, save, unwrap, wrap 4 | from .version import __version__ 5 | -------------------------------------------------------------------------------- /redframes/checks.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .types import ( 4 | Any, 5 | Columns, 6 | LazyColumns, 7 | PandasDataFrame, 8 | PandasIndex, 9 | PandasRangeIndex, 10 | ) 11 | 12 | 13 | def _check_type(argument: Any, against: type | set[type | None]) -> None: 14 | if isinstance(against, set): 15 | if len(against) == 0: 16 | against = {against} # type: ignore 17 | if not isinstance(against, set): 18 | against = {against} 19 | optional = None in against 20 | just_types = against.difference({None}) 21 | checks = [isinstance(argument, t) for t in just_types] # type: ignore 22 | if optional: 23 | checks += [argument == None] 24 | if not any(checks): 25 | str_types = " | ".join([t.__name__ for t in just_types]) # type: ignore 26 | if optional: 27 | str_types += " | None" 28 | raise TypeError(f"must be {str_types}") 29 | 30 | 31 | def _check_values(values: Any, type: type) -> None: 32 | if not all(isinstance(value, type) for value in values): 33 | raise TypeError(f"must be {type.__name__}") 34 | 35 | 36 | def _check_keys(columns: LazyColumns | None, against: Columns | PandasIndex) -> None: 37 | if isinstance(columns, str): 38 | columns = [columns] 39 | columns = [] if (columns == None) else columns 40 | bad_keys = set(columns).difference(against) # type: ignore 41 | if bad_keys: 42 | if len(bad_keys) == 1: 43 | raise KeyError(f"invalid key {bad_keys}") 44 | else: 45 | raise KeyError(f"invalid keys {bad_keys}") 46 | 47 | 48 | def _check_index(df: PandasDataFrame) -> None: 49 | if not (df.index.name == None): 50 | raise IndexError("must be unnamed") 51 | if not isinstance(df.index, PandasRangeIndex): 52 | raise IndexError("must be range") 53 | if not 
(df.index.start == 0): 54 | raise IndexError("must start at 0") 55 | if not (df.index.step == 1): 56 | raise IndexError("must step by 1") 57 | 58 | 59 | def _check_columns(df: PandasDataFrame) -> None: 60 | if type(df.columns) != PandasIndex: 61 | raise KeyError("must be flat") 62 | if df.columns.has_duplicates: 63 | raise KeyError("must not contain duplicate keys") 64 | 65 | 66 | def _check_file(path: str) -> None: 67 | if not path.endswith(".csv"): 68 | raise TypeError("must end in .csv") 69 | -------------------------------------------------------------------------------- /redframes/core.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pprint 4 | import warnings 5 | 6 | from .checks import _check_type 7 | from .types import ( 8 | Any, 9 | Column, 10 | Columns, 11 | DateTime, 12 | Direction, 13 | Func, 14 | Join, 15 | LazyColumns, 16 | NewColumn, 17 | NewValue, 18 | NumpyArray, 19 | NumpyType, 20 | OldColumn, 21 | OldValue, 22 | PandasDataFrame, 23 | PandasGroupedFrame, 24 | Value, 25 | Values, 26 | ) 27 | from .verbs import ( 28 | accumulate, 29 | append, 30 | combine, 31 | cross, 32 | dedupe, 33 | denix, 34 | drop, 35 | fill, 36 | filter, 37 | gather, 38 | group, 39 | join, 40 | mutate, 41 | pack, 42 | rank, 43 | rename, 44 | replace, 45 | rollup, 46 | sample, 47 | select, 48 | shuffle, 49 | sort, 50 | split, 51 | spread, 52 | take, 53 | unpack, 54 | ) 55 | 56 | 57 | def _wrap(data: PandasDataFrame) -> DataFrame: 58 | """Unsafe version of redframes.io.wrap()""" 59 | df = DataFrame() 60 | df._data = data 61 | return df 62 | 63 | 64 | class _TakeMixin: 65 | def __init__(self, data: PandasDataFrame | PandasGroupedFrame) -> None: 66 | self._data = data 67 | 68 | def take(self, rows: int, **kwargs) -> DataFrame: 69 | """Take any number of rows (from the top/bottom) 70 | 71 | Examples: 72 | 73 | ```python 74 | df = rf.DataFrame({"foo": range(10)}) 75 | ``` 76 | | foo | 77 | 
|------:| 78 | | 0 | 79 | | 1 | 80 | | 2 | 81 | | 3 | 82 | | 4 | 83 | | 5 | 84 | | 6 | 85 | | 7 | 86 | | 8 | 87 | | 9 | 88 | 89 | From "head": 90 | 91 | ```python 92 | df.take(1) 93 | ``` 94 | | foo | 95 | |------:| 96 | | 0 | 97 | 98 | From "tail": 99 | 100 | ```python 101 | df.take(-2) 102 | ``` 103 | | foo | 104 | |------:| 105 | | 8 | 106 | | 9 | 107 | """ 108 | return _wrap(take(self._data, rows, **kwargs)) 109 | 110 | 111 | class _InterchangeMixin(_TakeMixin): 112 | def __init__(self, data: PandasDataFrame) -> None: 113 | self._data = data 114 | 115 | def __array__(self) -> NumpyArray: 116 | return self._data.__array__() 117 | 118 | def __dataframe__(self, nan_as_null=False, allow_copy=True) -> "PandasDataFrameXchg": # type: ignore 119 | return self._data.__dataframe__(nan_as_null, allow_copy) 120 | 121 | def __len__(self) -> int: 122 | return self._data.__len__() 123 | 124 | @property 125 | def iloc(self): 126 | return self._data.iloc 127 | 128 | 129 | class _CommonMixin(_TakeMixin): 130 | def __init__(self, data: PandasDataFrame | PandasGroupedFrame) -> None: 131 | self._data = data 132 | 133 | def accumulate(self, column: Column, into: Column) -> DataFrame: 134 | """Run a cumulative sum over a column 135 | 136 | Example: 137 | 138 | ```python 139 | df = rf.DataFrame({"foo": [1, 2, 3, 4]}) 140 | ``` 141 | | foo | 142 | |------:| 143 | | 1 | 144 | | 2 | 145 | | 3 | 146 | | 4 | 147 | 148 | ```python 149 | df.accumulate("foo", into="cumsum") 150 | ``` 151 | | foo | cumsum | 152 | |------:|---------:| 153 | | 1 | 1 | 154 | | 2 | 3 | 155 | | 3 | 6 | 156 | | 4 | 10 | 157 | """ 158 | return _wrap(accumulate(self._data, column, into)) 159 | 160 | def gather( 161 | self, 162 | columns: Columns | None = None, 163 | beside: LazyColumns | None = None, 164 | into: tuple[Column, Column] = ("variable", "value"), 165 | ): 166 | """Gather columns into rows (opposite of spread) 167 | 168 | Examples: 169 | 170 | ```python 171 | df = rf.DataFrame({ 172 | "foo": [1, 2, 1, 2], 
173 | "bar": ["A", "B", "C", "D"], 174 | "baz": ["!", "@", "#", "$"], 175 | "jaz": range(4) 176 | }) 177 | ``` 178 | | foo | bar | baz | jaz | 179 | |------:|:------|:------|------:| 180 | | 1 | A | ! | 0 | 181 | | 2 | B | @ | 1 | 182 | | 1 | C | # | 2 | 183 | | 2 | D | $ | 3 | 184 | 185 | All columns: 186 | 187 | ```python 188 | df.gather() 189 | ``` 190 | | variable | value | 191 | |:-----------|:--------| 192 | | foo | 1 | 193 | | foo | 2 | 194 | | foo | 1 | 195 | | foo | 2 | 196 | | bar | A | 197 | | bar | B | 198 | | bar | C | 199 | | bar | D | 200 | | baz | ! | 201 | | baz | @ | 202 | | baz | # | 203 | | baz | $ | 204 | | jaz | 0 | 205 | | jaz | 1 | 206 | | jaz | 2 | 207 | | jaz | 3 | 208 | 209 | Multiple columns: 210 | 211 | ```python 212 | df.gather(["foo", "bar"], into=("var", "val")) 213 | ``` 214 | | baz | jaz | var | val | 215 | |:------|------:|:------|:------| 216 | | ! | 0 | foo | 1 | 217 | | @ | 1 | foo | 2 | 218 | | # | 2 | foo | 1 | 219 | | $ | 3 | foo | 2 | 220 | | ! | 0 | bar | A | 221 | | @ | 1 | bar | B | 222 | | # | 2 | bar | C | 223 | | $ | 3 | bar | D | 224 | 225 | All columns beside: 226 | 227 | ```python 228 | df.group(["foo", "bar"]).gather(into=("variable", "value")) 229 | ``` 230 | | foo | bar | variable | value | 231 | |------:|:------|:-----------|:--------| 232 | | 1 | A | baz | ! 
| 233 | | 2 | B | baz | @ | 234 | | 1 | C | baz | # | 235 | | 2 | D | baz | $ | 236 | | 1 | A | jaz | 0 | 237 | | 2 | B | jaz | 1 | 238 | | 1 | C | jaz | 2 | 239 | | 2 | D | jaz | 3 | 240 | """ 241 | return _wrap(gather(self._data, columns, beside, into)) 242 | 243 | def pack(self, column: Column, sep: str) -> DataFrame: 244 | """Collate and concatenate row values for a target column (opposite of unpack) 245 | 246 | Examples: 247 | 248 | ```python 249 | df = rf.DataFrame({ 250 | "foo": ["A", "A", "B", "A", "B", "C"], 251 | "bar": [1, 2, 3, 4, 5, 6] 252 | }) 253 | ``` 254 | | foo | bar | 255 | |:------|------:| 256 | | A | 1 | 257 | | A | 2 | 258 | | B | 3 | 259 | | A | 4 | 260 | | B | 5 | 261 | | C | 6 | 262 | 263 | Pack all rows: 264 | 265 | ```python 266 | df.pack("foo", sep="+") 267 | ``` 268 | | foo | 269 | |:------------| 270 | | A+A+B+A+B+C | 271 | 272 | Pack rows by Group: 273 | 274 | ```python 275 | df.group("foo").pack("bar", sep="|") 276 | ``` 277 | | foo | bar | 278 | |:------|:------| 279 | | A | 1|2|4 | 280 | | B | 3|5 | 281 | | C | 6 | 282 | """ 283 | return _wrap(pack(self._data, column, sep)) 284 | 285 | def rank( 286 | self, 287 | column: Column, 288 | into: Column, 289 | descending: bool = False, 290 | ) -> DataFrame: 291 | """Rank order values in a column 292 | 293 | Example: 294 | 295 | ```python 296 | df = rf.DataFrame({"foo": [2, 3, 3, 99, 1000, 1, -6, 4]}) 297 | ``` 298 | | foo | 299 | |------:| 300 | | 2 | 301 | | 3 | 302 | | 3 | 303 | | 99 | 304 | | 1000 | 305 | | 1 | 306 | | -6 | 307 | | 4 | 308 | 309 | ```python 310 | df.rank("foo", into="rank", descending=True) 311 | ``` 312 | | foo | rank | 313 | |------:|-------:| 314 | | 2 | 5 | 315 | | 3 | 4 | 316 | | 3 | 4 | 317 | | 99 | 2 | 318 | | 1000 | 1 | 319 | | 1 | 6 | 320 | | -6 | 7 | 321 | | 4 | 3 | 322 | """ 323 | return _wrap(rank(self._data, column, into, descending)) 324 | 325 | def rollup(self, over: dict[Column, tuple[Column, Func]]) -> DataFrame: 326 | """Apply summary functions 
and/or statistics to target columns 327 | 328 | Example: 329 | 330 | ```python 331 | df = rf.DataFrame({"foo": [1, 2, 3, 4, 5], "bar": [99, 100, 1, -5, 2]}) 332 | ``` 333 | | foo | bar | 334 | |------:|------:| 335 | | 1 | 99 | 336 | | 2 | 100 | 337 | | 3 | 1 | 338 | | 4 | -5 | 339 | | 5 | 2 | 340 | 341 | ```python 342 | df.rollup({ 343 | "fcount": ("foo", rf.stat.count), 344 | "fmean": ("foo", rf.stat.mean), 345 | "fsum": ("foo", rf.stat.sum), 346 | "fmax": ("foo", rf.stat.max), 347 | "bmedian": ("bar", rf.stat.median), 348 | "bmin": ("bar", rf.stat.min), 349 | "bstd": ("bar", rf.stat.std) 350 | }) 351 | ``` 352 | | fcount | fmean | fsum | fmax | bmedian | bmin | bstd | 353 | |---------:|--------:|-------:|-------:|----------:|-------:|-------:| 354 | | 5 | 3 | 15 | 5 | 2 | -5 | 54.93 | 355 | """ 356 | return _wrap(rollup(self._data, over)) 357 | 358 | def summarize(self, over: dict[Column, tuple[Column, Func]]) -> DataFrame: 359 | message = "Marked for removal, please use `rollup` instead" 360 | warnings.warn(message, FutureWarning) 361 | return self.rollup(over) 362 | 363 | 364 | class GroupedFrame(_CommonMixin): 365 | """GroupedFrame compatible with: `accumulate`, `gather`, `pack`, `rank`, `rollup`, `take`""" 366 | 367 | def __repr__(self) -> str: 368 | return self._data.obj.__repr__() # type: ignore 369 | 370 | def _repr_html_(self) -> str: 371 | return self._data.obj.to_html(index=True) # type: ignore 372 | 373 | 374 | class DataFrame(_CommonMixin, _InterchangeMixin): 375 | def __init__(self, data: dict[Column, Values] | None = None) -> None: 376 | """Initialize a DataFrame with a standard dictionary 377 | 378 | Example: 379 | 380 | ```python 381 | df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]}) 382 | ``` 383 | | foo | bar | 384 | |------:|:------| 385 | | 1 | A | 386 | | 2 | B | 387 | """ 388 | _check_type(data, {dict, None}) 389 | if not data: 390 | self._data = PandasDataFrame() 391 | if isinstance(data, dict): 392 | self._data = 
    def __str__(self) -> str:
        """Return string constructor (for copy-and-pasting)

        Example:

        ```python
        df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]})
        str(df)
        # "rf.DataFrame({'foo': [1, 2], 'bar': ['A', 'B']})"
        ```
        """
        # Round-trip the underlying pandas frame into a plain dict of lists
        data = self._data.to_dict(orient="list")
        string = pprint.pformat(data, indent=4, sort_dicts=False, compact=True)
        if "\n" in string:
            # Multi-line output: strip pprint's outer braces ([1:-1]) and
            # re-add them on their own lines so the result pastes cleanly
            string = " " + string[1:-1]
            string = f"rf.DataFrame({{\n{string}\n}})"
        else:
            # Single-line output: embed the dict literal as-is
            string = f"rf.DataFrame({string})"
        return string
    @property
    def memory(self) -> str:
        """Interrogate DataFrame (deep) memory usage

        Example:

        ```python
        df = rf.DataFrame({"foo": [1, 2, 3], "bar": ["A", "B", "C"]})
        df.memory
        # '326 B'
        ```
        """
        # deep=True counts the contents of object (e.g. string) cells,
        # not just the pointers to them
        size = self._data.memory_usage(deep=True).sum()
        # Largest units first (dict preserves insertion order)
        power_labels = {40: "TB", 30: "GB", 20: "MB", 10: "KB"}
        for power, label in power_labels.items():
            if size >= (2**power):
                # Floor division: the reported size is an approximation
                approx_size = size // 2**power
                return f"{approx_size} {label}"
        return f"{size} B"
"B"]}) 548 | ``` 549 | | foo | bar | 550 | |------:|:------| 551 | | 1 | A | 552 | | 2 | B | 553 | 554 | ```python 555 | df2 = rf.DataFrame({"bar": ["C", "D"], "foo": [3, 4], "baz": ["$", "@"]}) 556 | ``` 557 | | bar | foo | baz | 558 | |:------|------:|:------| 559 | | C | 3 | $ | 560 | | D | 4 | @ | 561 | 562 | ```python 563 | df1.append(df2) 564 | ``` 565 | | foo | bar | baz | 566 | |------:|:------|:------| 567 | | 1 | A | nan | 568 | | 2 | B | nan | 569 | | 3 | C | $ | 570 | | 4 | D | @ | 571 | """ 572 | _check_type(other, DataFrame) 573 | return _wrap(append(self._data, other._data)) 574 | 575 | def combine( 576 | self, columns: Columns, into: Column, sep: str, drop: bool = True 577 | ) -> DataFrame: 578 | """Combine multiple columns into a single column (opposite of `split`) 579 | 580 | Example: 581 | 582 | ```python 583 | df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]}) 584 | ``` 585 | | foo | bar | 586 | |------:|:------| 587 | | 1 | A | 588 | | 2 | B | 589 | 590 | ```python 591 | df.combine(["bar", "foo"], into="baz", sep="::", drop=True) 592 | ``` 593 | | baz | 594 | |:------| 595 | | A::1 | 596 | | B::2 | 597 | """ 598 | return _wrap(combine(self._data, columns, into, sep, drop)) 599 | 600 | def cross( 601 | self, rhs: DataFrame | None = None, postfix: tuple[str, str] = ("_lhs", "_rhs") 602 | ) -> DataFrame: 603 | """Cross join columns from another DataFrame 604 | 605 | Examples: 606 | 607 | ```python 608 | df = rf.DataFrame({"foo": ["a", "b", "c"], "bar": [1, 2, 3]}) 609 | ``` 610 | | foo | bar | 611 | |:------|------:| 612 | | a | 1 | 613 | | b | 2 | 614 | | c | 3 | 615 | 616 | Self: 617 | 618 | ```python 619 | df.cross() 620 | ``` 621 | 622 | | foo_lhs | bar_lhs | foo_rhs | bar_rhs | 623 | |:----------|----------:|:----------|----------:| 624 | | a | 1 | a | 1 | 625 | | a | 1 | b | 2 | 626 | | a | 1 | c | 3 | 627 | | b | 2 | a | 1 | 628 | | b | 2 | b | 2 | 629 | | b | 2 | c | 3 | 630 | | c | 3 | a | 1 | 631 | | c | 3 | b | 2 | 632 | | c | 3 | c 
| 3 | 633 | 634 | Two DataFrames: 635 | 636 | ```python 637 | dfa = rf.DataFrame({"foo": [1, 2, 3]}) 638 | dfb = rf.DataFrame({"bar": [1, 2, 3]}) 639 | dfa.cross(dfb, postfix=("_a", "_b")) 640 | ``` 641 | 642 | | foo | bar | 643 | |------:|------:| 644 | | 1 | 1 | 645 | | 1 | 2 | 646 | | 1 | 3 | 647 | | 2 | 1 | 648 | | 2 | 2 | 649 | | 2 | 3 | 650 | | 3 | 1 | 651 | | 3 | 2 | 652 | | 3 | 3 | 653 | """ 654 | rhs = self if (rhs == None) else rhs 655 | _check_type(rhs, DataFrame) 656 | return _wrap(cross(self._data, rhs._data, postfix)) # type: ignore 657 | 658 | def dedupe(self, columns: LazyColumns | None = None) -> DataFrame: 659 | """Remove duplicate rows 660 | 661 | Examples: 662 | 663 | ```python 664 | df = rf.DataFrame({"foo": [1, 1, 2, 2], "bar": ["A", "A", "B", "A"]}) 665 | ``` 666 | | foo | bar | 667 | |------:|:------| 668 | | 1 | A | 669 | | 1 | A | 670 | | 2 | B | 671 | | 2 | A | 672 | 673 | All columns: 674 | 675 | ```python 676 | df.dedupe() 677 | ``` 678 | | foo | bar | 679 | |------:|:------| 680 | | 1 | A | 681 | | 2 | B | 682 | | 2 | A | 683 | 684 | Single column: 685 | 686 | ```python 687 | df.dedupe("foo") 688 | ``` 689 | | foo | bar | 690 | |------:|:------| 691 | | 1 | A | 692 | | 2 | B | 693 | 694 | Multiple columns: 695 | 696 | ```python 697 | df.dedupe(["foo", "bar"]) 698 | ``` 699 | | foo | bar | 700 | |------:|:------| 701 | | 1 | A | 702 | | 2 | B | 703 | | 2 | A | 704 | """ 705 | return _wrap(dedupe(self._data, columns)) 706 | 707 | def denix(self, columns: LazyColumns | None = None) -> DataFrame: 708 | """Remove rows with *NaN/None* values 709 | 710 | Example: 711 | 712 | ```python 713 | df = rf.DataFrame({"foo": [1, None, 3, None, 5, 6], "bar": [1, None, 3, 4, None, None]}) 714 | ``` 715 | | foo | bar | 716 | |------:|------:| 717 | | 1 | 1 | 718 | | nan | nan | 719 | | 3 | 3 | 720 | | nan | 4 | 721 | | 5 | nan | 722 | | 6 | nan | 723 | 724 | All columns: 725 | 726 | ```python 727 | df.denix() 728 | ``` 729 | | foo | bar | 730 | 
|------:|------:| 731 | | 1 | 1 | 732 | | 3 | 3 | 733 | 734 | Single column: 735 | 736 | ```python 737 | df.denix("bar") 738 | ``` 739 | | foo | bar | 740 | |------:|------:| 741 | | 1 | 1 | 742 | | 3 | 3 | 743 | | nan | 4 | 744 | 745 | Multiple columns: 746 | 747 | ```python 748 | df.denix(["foo", "bar"]) 749 | ``` 750 | | foo | bar | 751 | |------:|------:| 752 | | 1 | 1 | 753 | | 3 | 3 | 754 | """ 755 | return _wrap(denix(self._data, columns)) 756 | 757 | def drop(self, columns: LazyColumns) -> DataFrame: 758 | """Drop entire columns 759 | 760 | Examples: 761 | 762 | ```python 763 | df = rf.DataFrame({"foo": [1, 2], "bar": [3, 4], "baz": [5, 6]}) 764 | ``` 765 | | foo | bar | baz | 766 | |------:|------:|------:| 767 | | 1 | 3 | 5 | 768 | | 2 | 4 | 6 | 769 | 770 | ```python 771 | df.drop("baz") 772 | ``` 773 | | foo | bar | 774 | |------:|------:| 775 | | 1 | 3 | 776 | | 2 | 4 | 777 | 778 | ```python 779 | df.drop(["foo", "baz"]) 780 | ``` 781 | | bar | 782 | |------:| 783 | | 3 | 784 | | 4 | 785 | """ 786 | return _wrap(drop(self._data, columns)) 787 | 788 | def fill( 789 | self, 790 | columns: LazyColumns | None = None, 791 | direction: Direction | None = None, 792 | constant: Value | None = None, 793 | ) -> DataFrame: 794 | """Fill missing values "down", "up", or with a constant 795 | 796 | Examples: 797 | 798 | ```python 799 | df = rf.DataFrame({"foo": [1, None, None, 2, None], "bar": [None, "A", None, "B", None]}) 800 | ``` 801 | | foo | bar | 802 | |------:|:------| 803 | | 1 | | 804 | | nan | A | 805 | | nan | | 806 | | 2 | B | 807 | | nan | | 808 | 809 | Constant (all columns): 810 | 811 | ```python 812 | df.fill(constant=0) 813 | ``` 814 | | foo | bar | 815 | |------:|:------| 816 | | 1 | 0 | 817 | | 0 | A | 818 | | 0 | 0 | 819 | | 2 | B | 820 | | 0 | 0 | 821 | 822 | Down (all columns): 823 | 824 | ```python 825 | df.fill(direction="down") 826 | ``` 827 | | foo | bar | 828 | |------:|:------| 829 | | 1 | | 830 | | 1 | A | 831 | | 1 | A | 832 | | 2 | B | 
833 | | 2 | B | 834 | 835 | Down (single column): 836 | 837 | ```python 838 | df.fill("foo", direction="down") 839 | ``` 840 | | foo | bar | 841 | |------:|:------| 842 | | 1 | | 843 | | 1 | A | 844 | | 1 | | 845 | | 2 | B | 846 | | 2 | | 847 | 848 | Up (single/mutiple columns): 849 | 850 | ```python 851 | df.fill(["foo"], direction="up") 852 | ``` 853 | | foo | bar | 854 | |------:|:------| 855 | | 1 | | 856 | | 2 | A | 857 | | 2 | | 858 | | 2 | B | 859 | | nan | | 860 | """ 861 | return _wrap(fill(self._data, columns, direction, constant)) 862 | 863 | def filter(self, func: Func) -> DataFrame: 864 | """Keep rows matching specific conditions 865 | 866 | Compatible operators: `|`, `&`, `< <= == != >= >`, `isin` 867 | 868 | Examples: 869 | 870 | ```python 871 | df = rf.DataFrame({"foo": ["A", "A", "A", "B"], "bar": [1, 2, 3, 4]}) 872 | ``` 873 | | foo | bar | 874 | |:------|------:| 875 | | A | 1 | 876 | | A | 2 | 877 | | A | 3 | 878 | | B | 4 | 879 | 880 | Single condition: 881 | 882 | ```python 883 | df.filter(lambda row: row["foo"].isin(["A"])) 884 | ``` 885 | | foo | bar | 886 | |:------|------:| 887 | | A | 1 | 888 | | A | 2 | 889 | | A | 3 | 890 | 891 | And (multiple conditions): 892 | 893 | ```python 894 | df.filter(lambda row: (row["foo"] == "A") & (row["bar"] <= 2)) 895 | ``` 896 | | foo | bar | 897 | |:------|------:| 898 | | A | 1 | 899 | | A | 2 | 900 | 901 | Or (multiple conditions): 902 | 903 | ```python 904 | df.filter(lambda row: (row["foo"] == "B") | (row["bar"] == 1)) 905 | ``` 906 | | foo | bar | 907 | |:------|------:| 908 | | A | 1 | 909 | | B | 4 | 910 | """ 911 | return _wrap(filter(self._data, func)) 912 | 913 | def group(self, by: LazyColumns) -> GroupedFrame: 914 | """Prepare groups for compatible verbs 915 | 916 | Compatible verbs: `accumulate`, `gather`, `pack`, `rank`, `rollup`, `take` 917 | 918 | Example: 919 | 920 | ```python 921 | df = rf.DataFrame({"foo": ["A", "A", "A", "B", "B"], "bar": [1, 2, 3, 4, 5], "baz": [9, 7, 7, 5, 6]}) 922 
| ``` 923 | | foo | bar | baz | 924 | |:------|------:|------:| 925 | | A | 1 | 9 | 926 | | A | 2 | 7 | 927 | | A | 3 | 7 | 928 | | B | 4 | 5 | 929 | | B | 5 | 6 | 930 | 931 | + `accumulate`: 932 | 933 | ```python 934 | df.group("foo").accumulate("bar", into="bar_cumsum") 935 | ``` 936 | | foo | bar | baz | bar_cumsum | 937 | |:------|------:|------:|-------------:| 938 | | A | 1 | 9 | 1 | 939 | | A | 2 | 7 | 3 | 940 | | A | 3 | 7 | 6 | 941 | | B | 4 | 5 | 4 | 942 | | B | 5 | 6 | 9 | 943 | 944 | + `gather`: 945 | 946 | ```python 947 | df.group("foo").gather() 948 | ``` 949 | | foo | variable | value | 950 | |:------|:-----------|--------:| 951 | | A | bar | 1 | 952 | | A | bar | 2 | 953 | | A | bar | 3 | 954 | | B | bar | 4 | 955 | | B | bar | 5 | 956 | | A | baz | 9 | 957 | | A | baz | 7 | 958 | | A | baz | 7 | 959 | | B | baz | 5 | 960 | | B | baz | 6 | 961 | 962 | + `pack`: 963 | 964 | ```python 965 | df.group("foo").pack("bar", sep=":") 966 | ``` 967 | | foo | bar | 968 | |:------|:------| 969 | | A | 1:2:3 | 970 | | B | 4:5 | 971 | 972 | 973 | + `rank`: 974 | 975 | ```python 976 | df.group("foo").rank("baz", into="baz_rank", descending=True) 977 | ``` 978 | | foo | bar | baz | baz_rank | 979 | |:------|------:|------:|-----------:| 980 | | A | 1 | 9 | 1 | 981 | | A | 2 | 7 | 2 | 982 | | A | 3 | 7 | 2 | 983 | | B | 4 | 5 | 2 | 984 | | B | 5 | 6 | 1 | 985 | 986 | + `rollup`: 987 | 988 | ```python 989 | df.group("foo").rollup({ 990 | "bar_mean": ("bar", rf.stat.mean), 991 | "baz_min": ("baz", rf.stat.min) 992 | }) 993 | ``` 994 | | foo | bar_mean | baz_min | 995 | |:------|-----------:|----------:| 996 | | A | 2 | 7 | 997 | | B | 4.5 | 5 | 998 | 999 | + `take`: 1000 | 1001 | ```python 1002 | df.group("foo").take(1) 1003 | ``` 1004 | | foo | bar | baz | 1005 | |:------|------:|------:| 1006 | | A | 1 | 9 | 1007 | | B | 4 | 5 | 1008 | 1009 | """ 1010 | return GroupedFrame(group(self._data, by)) 1011 | 1012 | def join( 1013 | self, 1014 | rhs: DataFrame, 1015 | on: 
LazyColumns, 1016 | how: Join = "left", 1017 | postfix: tuple[str, str] = ("_lhs", "_rhs"), 1018 | ) -> DataFrame: 1019 | """Join columns from another DataFrame 1020 | 1021 | Examples: 1022 | 1023 | ```python 1024 | adf = rf.DataFrame({"foo": ["A", "B", "C"], "bar": [1, 2, 3]}) 1025 | ``` 1026 | | foo | bar | 1027 | |:------|------:| 1028 | | A | 1 | 1029 | | B | 2 | 1030 | | C | 3 | 1031 | 1032 | ```python 1033 | bdf = rf.DataFrame({"foo": ["A", "B", "D"], "baz": ["!", "@", "#"]}) 1034 | ``` 1035 | | foo | baz | 1036 | |:------|:------| 1037 | | A | ! | 1038 | | B | @ | 1039 | | D | # | 1040 | 1041 | Left join: 1042 | 1043 | ```python 1044 | adf.join(bdf, on="foo", how="left") 1045 | ``` 1046 | | foo | bar | baz | 1047 | |:------|------:|:------| 1048 | | A | 1 | ! | 1049 | | B | 2 | @ | 1050 | | C | 3 | nan | 1051 | 1052 | Right join: 1053 | 1054 | ```python 1055 | adf.join(bdf, on="foo", how="right") 1056 | ``` 1057 | | foo | bar | baz | 1058 | |:------|------:|:------| 1059 | | A | 1 | ! | 1060 | | B | 2 | @ | 1061 | | D | nan | # | 1062 | 1063 | Inner join: 1064 | 1065 | ```python 1066 | adf.join(bdf, on="foo", how="inner") 1067 | ``` 1068 | | foo | bar | baz | 1069 | |:------|------:|:------| 1070 | | A | 1 | ! | 1071 | | B | 2 | @ | 1072 | 1073 | Full join: 1074 | 1075 | ```python 1076 | adf.join(bdf, on="foo", how="full") 1077 | ``` 1078 | | foo | bar | baz | 1079 | |:------|------:|:------| 1080 | | A | 1 | ! 
| 1081 | | B | 2 | @ | 1082 | | C | 3 | nan | 1083 | | D | nan | # | 1084 | """ 1085 | _check_type(rhs, DataFrame) 1086 | return _wrap(join(self._data, rhs._data, on, how, postfix)) 1087 | 1088 | def mutate(self, over: dict[Column, Func]) -> DataFrame: 1089 | """Create a new, or overwrite an existing column 1090 | 1091 | Example: 1092 | 1093 | ```python 1094 | df = rf.DataFrame({"foo": [1, 2, 3]}) 1095 | ``` 1096 | | foo | 1097 | |------:| 1098 | | 1 | 1099 | | 2 | 1100 | | 3 | 1101 | 1102 | ```python 1103 | df.mutate({ 1104 | "bar": lambda row: float(row["foo"]), 1105 | "baz": lambda row: "X" + str(row["bar"] * 2), 1106 | "jaz": lambda _: "Jazz" 1107 | }) 1108 | ``` 1109 | | foo | bar | baz | jaz | 1110 | |------:|------:|:------|:------| 1111 | | 1 | 1 | X2.0 | Jazz | 1112 | | 2 | 2 | X4.0 | Jazz | 1113 | | 3 | 3 | X6.0 | Jazz | 1114 | """ 1115 | return _wrap(mutate(self._data, over)) 1116 | 1117 | def rename(self, columns: dict[OldColumn, NewColumn]) -> DataFrame: 1118 | """Rename column keys (from "old" to "new") 1119 | 1120 | Example: 1121 | 1122 | ```python 1123 | df = rf.DataFrame({"foo": [1, 2], "bar": [3, 4]}) 1124 | ``` 1125 | | foo | bar | 1126 | |------:|------:| 1127 | | 1 | 3 | 1128 | | 2 | 4 | 1129 | 1130 | ```python 1131 | df.rename({"foo": "oof", "bar": "rab"}) 1132 | ``` 1133 | | oof | rab | 1134 | |------:|------:| 1135 | | 1 | 3 | 1136 | | 2 | 4 | 1137 | 1138 | """ 1139 | return _wrap(rename(self._data, columns)) 1140 | 1141 | def replace(self, over: dict[Column, dict[OldValue, NewValue]]) -> DataFrame: 1142 | """Replace matching values within columns (from "old" to "new") 1143 | 1144 | Example: 1145 | 1146 | ```python 1147 | df = rf.DataFrame({"foo": [1, 2, 2, 2, 1], "bar": [1, "A", "B", True, False]}) 1148 | ``` 1149 | | foo | bar | 1150 | |------:|:------| 1151 | | 1 | 1 | 1152 | | 2 | A | 1153 | | 2 | B | 1154 | | 2 | True | 1155 | | 1 | False | 1156 | 1157 | ```python 1158 | df.replace({ 1159 | "foo": {2: 222}, 1160 | "bar": {False: 0, 
True: 1, "A": 2, "B": 3} 1161 | }) 1162 | ``` 1163 | | foo | bar | 1164 | |------:|------:| 1165 | | 1 | 1 | 1166 | | 222 | 2 | 1167 | | 222 | 3 | 1168 | | 222 | 1 | 1169 | | 1 | 0 | 1170 | """ 1171 | return _wrap(replace(self._data, over)) 1172 | 1173 | def sample(self, rows: int | float, seed: int | None = None) -> DataFrame: 1174 | """Randomly sample any number of rows 1175 | 1176 | Examples: 1177 | 1178 | ```python 1179 | df = rf.DataFrame({"foo": range(10), "bar": range(10, 20)}) 1180 | ``` 1181 | | foo | bar | 1182 | |------:|------:| 1183 | | 0 | 10 | 1184 | | 1 | 11 | 1185 | | 2 | 12 | 1186 | | 3 | 13 | 1187 | | 4 | 14 | 1188 | | 5 | 15 | 1189 | | 6 | 16 | 1190 | | 7 | 17 | 1191 | | 8 | 18 | 1192 | | 9 | 19 | 1193 | 1194 | Single row: 1195 | 1196 | ```python 1197 | df.sample(1) 1198 | ``` 1199 | | foo | bar | 1200 | |------:|------:| 1201 | | 7 | 17 | 1202 | 1203 | Multiple rows: 1204 | 1205 | ```python 1206 | df.sample(3) 1207 | ``` 1208 | | foo | bar | 1209 | |------:|------:| 1210 | | 4 | 14 | 1211 | | 1 | 11 | 1212 | | 6 | 16 | 1213 | 1214 | Percentage of total rows (30%): 1215 | 1216 | ```python 1217 | df.sample(0.3) 1218 | ``` 1219 | | foo | bar | 1220 | |------:|------:| 1221 | | 4 | 14 | 1222 | | 3 | 13 | 1223 | | 1 | 11 | 1224 | """ 1225 | return _wrap(sample(self._data, rows, seed)) 1226 | 1227 | def select(self, columns: LazyColumns) -> DataFrame: 1228 | """Select specific columns 1229 | 1230 | Examples: 1231 | 1232 | ```python 1233 | df = rf.DataFrame({"foo": [1, 2], "bar": [3, 4], "baz": [5, 6]}) 1234 | ``` 1235 | | foo | bar | baz | 1236 | |------:|------:|------:| 1237 | | 1 | 3 | 5 | 1238 | | 2 | 4 | 6 | 1239 | 1240 | Single column: 1241 | 1242 | ```python 1243 | df.select("foo") 1244 | ``` 1245 | | foo | 1246 | |------:| 1247 | | 1 | 1248 | | 2 | 1249 | 1250 | Multiple columns: 1251 | 1252 | ```python 1253 | df.select(["foo", "baz"]) 1254 | ``` 1255 | | foo | baz | 1256 | |------:|------:| 1257 | | 1 | 5 | 1258 | | 2 | 6 | 1259 | """ 1260 | 
return _wrap(select(self._data, columns)) 1261 | 1262 | def shuffle(self, seed: int | None = None) -> DataFrame: 1263 | """Shuffle the order of all rows 1264 | 1265 | Example: 1266 | 1267 | ```python 1268 | df = rf.DataFrame({"foo": range(5), "bar": range(5, 10)}) 1269 | ``` 1270 | | foo | bar | 1271 | |------:|------:| 1272 | | 0 | 5 | 1273 | | 1 | 6 | 1274 | | 2 | 7 | 1275 | | 3 | 8 | 1276 | | 4 | 9 | 1277 | 1278 | ```python 1279 | df.shuffle() 1280 | ``` 1281 | | foo | bar | 1282 | |------:|------:| 1283 | | 4 | 9 | 1284 | | 2 | 7 | 1285 | | 3 | 8 | 1286 | | 0 | 5 | 1287 | | 1 | 6 | 1288 | """ 1289 | return _wrap(shuffle(self._data, seed)) 1290 | 1291 | def sort(self, columns: LazyColumns, descending: bool = False) -> DataFrame: 1292 | """Sort rows by specific columns 1293 | 1294 | Examples: 1295 | 1296 | ```python 1297 | df = rf.DataFrame({"foo": ["Z", "X", "A", "A"], "bar": [2, -2, 4, -4]}) 1298 | ``` 1299 | | foo | bar | 1300 | |:------|------:| 1301 | | Z | 2 | 1302 | | X | -2 | 1303 | | A | 4 | 1304 | | A | -4 | 1305 | 1306 | Single column: 1307 | 1308 | ```python 1309 | df.sort("bar") 1310 | ``` 1311 | | foo | bar | 1312 | |:------|------:| 1313 | | A | -4 | 1314 | | X | -2 | 1315 | | Z | 2 | 1316 | | A | 4 | 1317 | 1318 | Descending order: 1319 | 1320 | ```python 1321 | df.sort("bar", descending=True) 1322 | ``` 1323 | | foo | bar | 1324 | |:------|------:| 1325 | | A | 4 | 1326 | | Z | 2 | 1327 | | X | -2 | 1328 | | A | -4 | 1329 | 1330 | Multiple columns: 1331 | 1332 | ```python 1333 | df.sort(["foo", "bar"], descending=False) 1334 | ``` 1335 | | foo | bar | 1336 | |:------|------:| 1337 | | A | -4 | 1338 | | A | 4 | 1339 | | X | -2 | 1340 | | Z | 2 | 1341 | """ 1342 | return _wrap(sort(self._data, columns, descending)) 1343 | 1344 | def split( 1345 | self, column: Column, into: Columns, sep: str, drop: bool = True 1346 | ) -> DataFrame: 1347 | """Split a single column into multiple columns (opposite of `combine`) 1348 | 1349 | Example: 1350 | 1351 | 
```python 1352 | df = rf.DataFrame({"foo": ["A::1", "B::2", "C:3"]}) 1353 | ``` 1354 | | foo | 1355 | |:------| 1356 | | A::1 | 1357 | | B::2 | 1358 | | C:3 | 1359 | 1360 | ```python 1361 | df.split("foo", into=["foo", "bar"], sep="::", drop=True) 1362 | ``` 1363 | | foo | bar | 1364 | |:------|------:| 1365 | | A | 1 | 1366 | | B | 2 | 1367 | | C:3 | | 1368 | """ 1369 | return _wrap(split(self._data, column, into, sep, drop)) 1370 | 1371 | def spread(self, column: Column, using: Column) -> DataFrame: 1372 | """Spread rows into columns (opposite of `gather`) 1373 | 1374 | Example: 1375 | 1376 | ```python 1377 | df = rf.DataFrame({"foo": ["A", "A", "A", "B", "B", "B", "B"], "bar": [1, 2, 3, 4, 5, 6, 7]}) 1378 | ``` 1379 | | foo | bar | 1380 | |:------|------:| 1381 | | A | 1 | 1382 | | A | 2 | 1383 | | A | 3 | 1384 | | B | 4 | 1385 | | B | 5 | 1386 | | B | 6 | 1387 | | B | 7 | 1388 | 1389 | ```python 1390 | df.spread("foo", using="bar") 1391 | ``` 1392 | | A | B | 1393 | |----:|----:| 1394 | | 1 | 4 | 1395 | | 2 | 5 | 1396 | | 3 | 6 | 1397 | | nan | 7 | 1398 | """ 1399 | return _wrap(spread(self._data, column, using)) 1400 | 1401 | def unpack(self, column: Column, sep: str) -> DataFrame: 1402 | """'Explode' concatenated row values into multiple rows (opposite of `pack`) 1403 | 1404 | Example: 1405 | 1406 | ```python 1407 | df = rf.DataFrame({ 1408 | "foo": [1, 2, 3, 4], 1409 | "bar": ["A:B", "B:C:D", "D:E", "F"] 1410 | }) 1411 | ``` 1412 | | foo | bar | 1413 | |------:|:------| 1414 | | 1 | A:B | 1415 | | 2 | B:C:D | 1416 | | 3 | D:E | 1417 | | 4 | F | 1418 | 1419 | ```python 1420 | df.unpack("bar", sep=":") 1421 | ``` 1422 | | foo | bar | 1423 | |------:|:------| 1424 | | 1 | A | 1425 | | 1 | B | 1426 | | 2 | B | 1427 | | 2 | C | 1428 | | 2 | D | 1429 | | 3 | D | 1430 | | 3 | E | 1431 | | 4 | F | 1432 | """ 1433 | return _wrap(unpack(self._data, column, sep)) 1434 | -------------------------------------------------------------------------------- 
def convert(df: DataFrame | PandasDataFrame) -> PandasDataFrame | DataFrame:
    """Convert a rf.DataFrame into a pd.DataFrame (and/or vice versa)

    Example:

    ```python
    redf = rf.DataFrame({"foo": range(10)})
    padf = rf.convert(redf)  # now a pd.DataFrame
    redf = rf.convert(padf)  # now a rf.DataFrame
    ```
    """
    # Dispatch on the input type; anything else is rejected
    for kind, action in ((DataFrame, unwrap), (PandasDataFrame, wrap)):
        if isinstance(df, kind):
            return action(df)
    raise TypeError("must be rf.DataFrame | pd.DataFrame")
"""Common summary functions/statistics"""

import numpy as np  # pyright: ignore[reportMissingImports]

# Aliases intended for use as `rf.stat.*` inside `rollup` calls.
# `sum`, `max`, and `min` deliberately shadow the builtins within
# this module's namespace.
count = len
mean = np.mean
sum = np.sum
max = np.max
median = np.median
min = np.min
# NOTE(review): np.std defaults to population std (ddof=0), but when
# routed through pandas aggregation it maps to Series.std (ddof=1) —
# confirm which variant callers observe.
std = np.std
from __future__ import annotations

import datetime
from typing import Any, Callable, Literal, Union

import numpy as np  # pyright: ignore[reportMissingImports]
import pandas as pd  # pyright: ignore[reportMissingImports]
import pandas.core.groupby.generic as pg  # pyright: ignore[reportMissingImports]

# --- cell values -------------------------------------------------------
Value = Any  # any single cell value
Values = list[Value]  # one column's worth of values
OldValue = Value  # pre-replacement value (see `replace`)
NewValue = Value  # post-replacement value (see `replace`)

# --- column keys -------------------------------------------------------
Column = str  # a single column name
Columns = list[Column]  # multiple column names
LazyColumns = Union[Column, Columns]  # APIs accept one name or a list
OldColumn = Column  # pre-rename key (see `rename`)
NewColumn = Column  # post-rename key (see `rename`)

# --- verb arguments ----------------------------------------------------
Direction = Literal["up", "down"]  # fill direction
Func = Callable[..., Any]  # row-wise callables (mutate/filter/rollup)
Join = Literal["left", "right", "inner", "full"]  # join strategies

# --- third-party aliases ----------------------------------------------
NumpyArray = np.ndarray
NumpyType = np.dtype
PandasDataFrame = pd.DataFrame
PandasGroupedFrame = pg.DataFrameGroupBy
PandasIndex = pd.Index
PandasRangeIndex = pd.RangeIndex
DateTime = datetime.datetime
def append(top: PandasDataFrame, bottom: PandasDataFrame) -> PandasDataFrame:
    """Stack the rows of `bottom` under `top`, renumbering the index from zero

    Columns are aligned by name; missing cells become NaN.
    """
    # ignore_index=True is equivalent to concat + reset_index(drop=True)
    return pd.concat([top, bottom], ignore_index=True)
def dedupe(df: PandasDataFrame, columns: LazyColumns | None = None) -> PandasDataFrame:
    """Drop duplicate rows (keeping the first occurrence) and reindex

    `columns` may be a single name, a list of names, or None to compare
    whole rows.
    """
    _check_type(columns, {list, str, None})
    _check_keys(columns, df.columns)
    deduped = df.drop_duplicates(subset=columns, keep="first")
    return deduped.reset_index(drop=True)
-------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from ..checks import _check_type 4 | from ..types import LazyColumns, PandasDataFrame 5 | 6 | 7 | def denix(df: PandasDataFrame, columns: LazyColumns | None = None) -> PandasDataFrame: 8 | _check_type(columns, {list, str, None}) 9 | columns = [columns] if isinstance(columns, str) else columns 10 | if isinstance(columns, list): 11 | bad_keys = set(columns).difference(df.columns) 12 | if bad_keys: 13 | if len(bad_keys) == 1: 14 | message = f"columns argument contains invalid key {bad_keys}" 15 | else: 16 | message = f"columns argument contains invalid keys {bad_keys}" 17 | raise KeyError(message) 18 | df = df.dropna(subset=columns) 19 | df = df.reset_index(drop=True) 20 | return df 21 | -------------------------------------------------------------------------------- /redframes/verbs/drop.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from ..checks import _check_type 4 | from ..types import LazyColumns, PandasDataFrame 5 | 6 | 7 | def drop(df: PandasDataFrame, columns: LazyColumns) -> PandasDataFrame: 8 | _check_type(columns, {list, str}) 9 | df = df.drop(columns, axis=1) 10 | return df 11 | -------------------------------------------------------------------------------- /redframes/verbs/fill.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from ..checks import _check_type 4 | from ..types import Direction, LazyColumns, PandasDataFrame, Value 5 | 6 | 7 | def fill( 8 | df: PandasDataFrame, 9 | columns: LazyColumns | None = None, 10 | direction: Direction | None = None, 11 | constant: Value | None = None, 12 | ) -> PandasDataFrame: 13 | _check_type(columns, {list, str, None}) 14 | _check_type(direction, {str, None}) 15 | columns = [columns] if isinstance(columns, str) else 
columns 16 | if (direction != None) and (constant != None): 17 | raise ValueError("either direction OR constant must be None") 18 | if (direction == None) and (constant == None): 19 | raise ValueError("either direction OR constant must not be None") 20 | if direction != None: 21 | if not (direction in ["down", "up"]): 22 | raise ValueError("must be one of {'down', 'up'}") 23 | method = {"down": "ffill", "up": "bfill"}.get(direction) 24 | value = None 25 | if constant != None: 26 | value = constant 27 | method = None 28 | df = df.copy() 29 | if columns: 30 | df[columns] = df[columns].fillna(value=value, method=method) # type: ignore 31 | else: 32 | df = df.fillna(value=value, method=method) # type: ignore 33 | return df 34 | -------------------------------------------------------------------------------- /redframes/verbs/filter.py: -------------------------------------------------------------------------------- 1 | from ..types import Func, PandasDataFrame 2 | 3 | 4 | def filter(df: PandasDataFrame, func: Func) -> PandasDataFrame: 5 | if not callable(func): 6 | raise TypeError("must be Func") 7 | df = df.loc[func] # type: ignore 8 | df = df.reset_index(drop=True) 9 | return df 10 | -------------------------------------------------------------------------------- /redframes/verbs/gather.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | 5 | import pandas as pd # pyright: ignore[reportMissingImports] 6 | 7 | from ..checks import _check_type 8 | from ..types import Column, Columns, LazyColumns, PandasDataFrame, PandasGroupedFrame 9 | 10 | 11 | def _melt( 12 | df: PandasDataFrame, 13 | cols_to_keep: list[str], 14 | cols_to_gather: list[str], 15 | into: tuple[str, str], 16 | ) -> PandasDataFrame: 17 | df = pd.melt( 18 | df, 19 | id_vars=cols_to_keep, 20 | value_vars=cols_to_gather, 21 | var_name=into[0], 22 | value_name=into[1], 23 | ) 24 | df = df.dropna(subset=into[1]) # 
type: ignore 25 | df = df.reset_index(drop=True) 26 | return df 27 | 28 | 29 | def _grouped_melt(df: PandasGroupedFrame, into: tuple[str, str]) -> PandasDataFrame: 30 | cols_to_keep = df.grouper.names # type: ignore 31 | cols_to_gather = [col for col in df.obj.columns if col not in cols_to_keep] # type: ignore 32 | df = _melt(df.obj, cols_to_keep, cols_to_gather, into) # type: ignore 33 | return df 34 | 35 | 36 | def gather( 37 | df: PandasDataFrame | PandasGroupedFrame, 38 | columns: Columns | None = None, 39 | beside: LazyColumns | None = None, 40 | into: tuple[Column, Column] = ("variable", "value"), 41 | ) -> PandasDataFrame: 42 | _check_type(columns, {list, None}) 43 | _check_type(beside, {str, list, None}) 44 | _check_type(into, tuple) 45 | if (columns == None) and (beside != None) and isinstance(df, PandasDataFrame): 46 | warnings.warn( 47 | "Marked for removal, please use `df.group(...).gather(...)` instead", 48 | FutureWarning, 49 | ) 50 | if not (isinstance(into, tuple) and (len(into) == 2)): 51 | raise TypeError("must be tuple[str, str]") 52 | if into[0] == into[1]: 53 | raise TypeError("must be unique") 54 | if isinstance(df, PandasGroupedFrame): 55 | if (into[0] in df.obj.columns) or (into[1] in df.obj.columns): # type: ignore 56 | raise ValueError("must not be an existing column key") 57 | if columns != None: 58 | raise ValueError("columns is incompatible with group+gather") 59 | if beside != None: 60 | raise ValueError("beside is incompatible with group+gather") 61 | df = _grouped_melt(df, into) 62 | return df 63 | if (into[0] in df.columns) or (into[1] in df.columns): 64 | raise ValueError("must not be an existing column key") 65 | if (columns != None) and (beside != None): 66 | raise ValueError("columns OR beside must be None") 67 | if (columns == None) and (beside == None): 68 | id_vars = [] 69 | value_vars = list(df.columns) 70 | if isinstance(beside, str): 71 | beside = [beside] 72 | if isinstance(beside, list): 73 | id_vars = beside 74 | 
value_vars = [col for col in df.columns if col not in id_vars] 75 | if isinstance(columns, list): 76 | id_vars = [col for col in df.columns if col not in columns] 77 | value_vars = columns 78 | df = _melt(df, id_vars, value_vars, into) # pyright: ignore[reportUnboundVariable] 79 | return df 80 | -------------------------------------------------------------------------------- /redframes/verbs/group.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from ..checks import _check_type 4 | from ..types import LazyColumns, PandasDataFrame, PandasGroupedFrame 5 | 6 | 7 | def group(df: PandasDataFrame, by: LazyColumns) -> PandasGroupedFrame: 8 | _check_type(by, {list, str}) 9 | gdf = df.groupby(by, as_index=False, sort=False) 10 | return gdf 11 | -------------------------------------------------------------------------------- /redframes/verbs/join.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd # pyright: ignore[reportMissingImports] 4 | 5 | from ..checks import _check_type 6 | from ..types import Join, LazyColumns, PandasDataFrame 7 | 8 | 9 | def join( 10 | lhs: PandasDataFrame, 11 | rhs: PandasDataFrame, 12 | on: LazyColumns, 13 | how: Join = "left", 14 | postfix: tuple[str, str] = ("_lhs", "_rhs"), 15 | ) -> PandasDataFrame: 16 | _check_type(on, {list, str}) 17 | _check_type(how, str) 18 | _check_type(postfix, tuple) 19 | if not how in ["left", "right", "inner", "full"]: 20 | message = ( 21 | "on argument is invalid, must be one of {'left', 'right', 'inner', 'full'}" 22 | ) 23 | raise ValueError(message) 24 | how = "outer" if (how == "full") else how # type: ignore 25 | df = pd.merge(lhs, rhs, on=on, how=how, suffixes=postfix) 26 | df = df.reset_index(drop=True) 27 | return df 28 | -------------------------------------------------------------------------------- /redframes/verbs/mutate.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from ..checks import _check_type 4 | from ..types import Column, Func, PandasDataFrame 5 | 6 | 7 | def mutate(df: PandasDataFrame, over: dict[Column, Func]) -> PandasDataFrame: 8 | _check_type(over, dict) 9 | df = df.copy() 10 | for column, mutation in over.items(): 11 | df[column] = df.apply(mutation, axis=1) 12 | return df # type: ignore 13 | -------------------------------------------------------------------------------- /redframes/verbs/pack.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from ..checks import _check_type 4 | from ..types import Column, PandasDataFrame, PandasGroupedFrame 5 | 6 | 7 | def pack( 8 | df: PandasDataFrame | PandasGroupedFrame, column: Column, sep: str 9 | ) -> PandasDataFrame: 10 | _check_type(column, str) 11 | _check_type(sep, str) 12 | order = df.obj.columns if isinstance(df, PandasGroupedFrame) else df.columns # type: ignore 13 | df = df.agg(**{column: (column, lambda x: x.astype(str).str.cat(sep=sep))}) # type: ignore 14 | df = df[[col for col in df.columns if col in order]] 15 | df = df.reset_index(drop=True) 16 | return df 17 | -------------------------------------------------------------------------------- /redframes/verbs/rank.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | 5 | from ..checks import _check_type 6 | from ..types import Column, PandasDataFrame, PandasGroupedFrame 7 | 8 | 9 | def rank( 10 | df: PandasDataFrame | PandasGroupedFrame, 11 | column: Column, 12 | into: Column, 13 | descending: bool = False, 14 | ) -> PandasDataFrame: 15 | _check_type(column, str) 16 | _check_type(into, str) 17 | _check_type(descending, bool) 18 | if isinstance(df, PandasDataFrame): 19 | into_is_not_column = into != column 20 
| into_is_in_df_columns = into in df.columns 21 | if into_is_not_column and into_is_in_df_columns: 22 | message = f"overwriting existing column '{into}'" 23 | warnings.warn(message) 24 | df = df.copy() 25 | result = df[column].rank(method="dense", ascending=not descending) 26 | if isinstance(df, PandasGroupedFrame): 27 | df = df.obj.copy() # type: ignore 28 | df[into] = result # type: ignore 29 | return df # type: ignore 30 | -------------------------------------------------------------------------------- /redframes/verbs/rename.py: -------------------------------------------------------------------------------- 1 | from ..checks import _check_type, _check_values 2 | from ..types import NewColumn, OldColumn, PandasDataFrame 3 | 4 | 5 | def rename(df: PandasDataFrame, columns: dict[OldColumn, NewColumn]) -> PandasDataFrame: 6 | _check_type(columns, dict) 7 | cv = columns.values() 8 | _check_values(cv, str) 9 | if len(set(cv)) != len(cv): 10 | raise KeyError("columns must be unique") 11 | missing_keys = set(columns.keys()) - set(df.columns) 12 | if missing_keys and len(missing_keys) == 1: 13 | raise KeyError(f"column key ({missing_keys}) is invalid") 14 | if missing_keys and len(missing_keys) > 1: 15 | raise KeyError(f"column keys ({missing_keys}) are invalid") 16 | df = df.rename(columns=columns) 17 | return df 18 | -------------------------------------------------------------------------------- /redframes/verbs/replace.py: -------------------------------------------------------------------------------- 1 | from ..checks import _check_type 2 | from ..types import Column, NewValue, OldValue, PandasDataFrame 3 | 4 | 5 | def replace( 6 | df: PandasDataFrame, over: dict[Column, dict[OldValue, NewValue]] 7 | ) -> PandasDataFrame: 8 | _check_type(over, dict) 9 | bad_columns = list(set(over.keys()) - set(df.columns)) 10 | if bad_columns and len(bad_columns) == 1: 11 | raise KeyError(f"column key: {bad_columns} is invalid") 12 | if bad_columns and len(bad_columns) > 1: 13 | 
raise KeyError(f"column keys: {bad_columns} are invalid") 14 | df = df.replace(over) 15 | return df 16 | -------------------------------------------------------------------------------- /redframes/verbs/rollup.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from ..checks import _check_type 4 | from ..types import Column, Func, PandasDataFrame, PandasGroupedFrame 5 | 6 | 7 | def rollup( 8 | df: PandasDataFrame | PandasGroupedFrame, 9 | over: dict[Column, tuple[Column, Func]], 10 | ) -> PandasDataFrame: 11 | _check_type(over, dict) 12 | if isinstance(df, PandasGroupedFrame): 13 | groups = set(df.grouper.names) # type: ignore 14 | keys = set(over.keys()) 15 | if groups.intersection(keys): 16 | raise KeyError("unable to overwrite group keys") 17 | df = df.agg(**over) 18 | df = df.reset_index(drop=True) 19 | else: 20 | df = df.agg(**over) # type: ignore 21 | df = df.T # type: ignore 22 | df = df.reset_index(drop=True) # type: ignore 23 | df = df.fillna(method="ffill") # type: ignore 24 | df = df.fillna(method="bfill") # type: ignore 25 | df = df.head(1) # type: ignore 26 | return df 27 | -------------------------------------------------------------------------------- /redframes/verbs/sample.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from ..checks import _check_type 4 | from ..types import PandasDataFrame 5 | 6 | 7 | def sample( 8 | df: PandasDataFrame, rows: int | float, seed: int | None = None 9 | ) -> PandasDataFrame: 10 | _check_type(rows, {int, float}) 11 | if rows >= 1: 12 | if isinstance(rows, float): 13 | raise ValueError("must be int if > 1") 14 | df = df.sample(rows, random_state=seed) 15 | elif 0 < rows < 1: 16 | df = df.sample(frac=rows, random_state=seed) 17 | else: 18 | raise ValueError("must be > 0") 19 | df = df.reset_index(drop=True) 20 | return df 21 | 
-------------------------------------------------------------------------------- /redframes/verbs/select.py: -------------------------------------------------------------------------------- 1 | import pandas as pd # pyright: ignore[reportMissingImports] 2 | 3 | from ..checks import _check_type 4 | from ..types import LazyColumns, PandasDataFrame 5 | 6 | 7 | def select(df: PandasDataFrame, columns: LazyColumns) -> PandasDataFrame: 8 | _check_type(columns, {list, str}) 9 | columns = [columns] if isinstance(columns, str) else columns 10 | if len(set(columns)) != len(columns): 11 | raise KeyError(f"column keys must be unique") 12 | bad_columns = list(set(columns) - set(df.columns)) 13 | if bad_columns and len(bad_columns) == 1: 14 | raise KeyError(f"column key: {bad_columns} is invalid") 15 | if bad_columns and len(bad_columns) > 1: 16 | raise KeyError(f"column keys: {bad_columns} are invalid") 17 | df = df[columns] 18 | return df 19 | -------------------------------------------------------------------------------- /redframes/verbs/shuffle.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from ..checks import _check_type 4 | from ..types import PandasDataFrame 5 | 6 | 7 | def shuffle(df: PandasDataFrame, seed: int | None = None) -> PandasDataFrame: 8 | _check_type(seed, {int, None}) 9 | df = df.sample(frac=1, random_state=seed) 10 | df = df.reset_index(drop=True) 11 | return df 12 | -------------------------------------------------------------------------------- /redframes/verbs/sort.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from ..checks import _check_keys, _check_type 4 | from ..types import LazyColumns, PandasDataFrame 5 | 6 | 7 | def sort( 8 | df: PandasDataFrame, columns: LazyColumns, descending: bool = False 9 | ) -> PandasDataFrame: 10 | _check_type(columns, {list, str}) 11 | 
_check_type(descending, bool) 12 | _check_keys(columns, df.columns) 13 | df = df.sort_values(by=columns, ascending=not descending) 14 | df = df.reset_index(drop=True) 15 | return df 16 | -------------------------------------------------------------------------------- /redframes/verbs/split.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | from ..checks import _check_type 4 | from ..types import Column, Columns, PandasDataFrame 5 | 6 | 7 | def split( 8 | df: PandasDataFrame, column: Column, into: Columns, sep: str, drop: bool = True 9 | ) -> PandasDataFrame: 10 | _check_type(column, str) 11 | _check_type(into, list) 12 | _check_type(sep, str) 13 | _check_type(drop, bool) 14 | if len(into) != len(set(into)): 15 | raise KeyError("into keys must be unique") 16 | if (column in into) and (not drop): 17 | raise KeyError("into keys must be unique") 18 | bad_keys = set(df.columns).difference(set([column])).intersection(set(into)) 19 | if bad_keys: 20 | raise KeyError("into keys must be unique") 21 | columns = {uuid.uuid4().hex: col for col in into} 22 | temp = list(columns.keys()) 23 | df = df.copy() 24 | df[temp] = df[column].str.split(sep, expand=True) 25 | if drop: 26 | df = df.drop(column, axis=1) 27 | df = df.rename(columns=columns) 28 | return df 29 | -------------------------------------------------------------------------------- /redframes/verbs/spread.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | import pandas as pd # pyright: ignore[reportMissingImports] 4 | 5 | from ..checks import _check_type 6 | from ..types import Column, PandasDataFrame 7 | 8 | 9 | def spread(df: PandasDataFrame, column: Column, using: Column) -> PandasDataFrame: 10 | _check_type(column, str) 11 | _check_type(using, str) 12 | if column == using: 13 | raise KeyError("column and using must be unique") 14 | original_shape = df.shape[1] 15 | if original_shape == 2: 16 | temp = 
uuid.uuid4().hex 17 | df[temp] = df.groupby(column).cumcount() 18 | index = [col for col in df.columns if col not in [column, using]] 19 | df = pd.pivot_table(df, index=index, columns=[column], values=[using], aggfunc="first") # type: ignore 20 | df.columns = [col for col in df.columns.get_level_values(1)] # type: ignore 21 | df = df.reset_index().rename_axis(None, axis=0) 22 | if original_shape == 2: 23 | df = df.drop(temp, axis=1) # type: ignore 24 | return df 25 | -------------------------------------------------------------------------------- /redframes/verbs/take.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from ..checks import _check_type 4 | from ..types import PandasDataFrame, PandasGroupedFrame 5 | 6 | 7 | def take( 8 | df: PandasDataFrame | PandasGroupedFrame, rows: int = 1, **kwargs 9 | ) -> PandasDataFrame: 10 | if kwargs: # compatibility: sklearn / train_test_split 11 | df = df.take(rows, **kwargs) # type: ignore 12 | df = df.reset_index(drop=True) 13 | return df 14 | _check_type(rows, int) 15 | if isinstance(df, PandasDataFrame): 16 | if rows > df.shape[0]: 17 | raise ValueError("rows argument is invalid, exceeds total size") 18 | if rows == 0: 19 | raise ValueError("rows argument is invalid, must not be 0") 20 | if rows <= -1: 21 | df = df.tail(rows * -1) 22 | else: 23 | df = df.head(rows) 24 | if isinstance(df, PandasGroupedFrame): 25 | df = df.reset_index() 26 | else: 27 | df = df.reset_index(drop=True) 28 | return df 29 | -------------------------------------------------------------------------------- /redframes/verbs/unpack.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from ..checks import _check_type 4 | from ..types import Column, PandasDataFrame 5 | 6 | 7 | def unpack(df: PandasDataFrame, column: Column, sep: str) -> PandasDataFrame: 8 | _check_type(column, str) 9 | 
_check_type(sep, str) 10 | df = df.assign(**{column: df[column].str.split(sep)}) 11 | df = df.explode(column) 12 | df = df.reset_index(drop=True) 13 | return df 14 | -------------------------------------------------------------------------------- /redframes/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.4.1" 2 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | exec(open("redframes/version.py").read()) 4 | 5 | with open("README.md", "r", encoding="utf-8") as f: 6 | long_description = f.read() 7 | 8 | setup( 9 | name="redframes", 10 | version=__version__, # type: ignore 11 | url="https://github.com/maxhumber/redframes", 12 | description="General Purpose Data Manipulation Library", 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | author="Max Humber", 16 | author_email="max.humber@gmail.com", 17 | license="BSD 2", 18 | packages=find_packages(), 19 | python_requires=">=3.8", 20 | install_requires=["pandas>=1.5,<3.0"], 21 | extras_require={ 22 | "test": [ 23 | "matplotlib", 24 | "scikit-learn", 25 | ], 26 | "dev": [ 27 | "black", 28 | "ipykernel", 29 | "isort", 30 | "lxml", 31 | "matplotlib", 32 | "mypy", 33 | "pandas-stubs", 34 | "pyright", 35 | "scikit-learn", 36 | "tabulate", 37 | ], 38 | }, 39 | classifiers=[ 40 | "Intended Audience :: Developers", 41 | "Programming Language :: Python :: 3.8", 42 | "Programming Language :: Python :: 3.9", 43 | "Programming Language :: Python :: 3.10", 44 | "Programming Language :: Python :: 3.11", 45 | ], 46 | ) 47 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/maxhumber/redframes/6e3f1226358ad4e67f4343cbc4b1ee4b63475034/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_deprecations.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import redframes as rf 4 | 5 | 6 | class TestDeprecations(unittest.TestCase): 7 | def test_summarize_deprecation(self): 8 | df = rf.DataFrame({"foo": range(10)}) 9 | expected = rf.DataFrame({"foo": [4.5]}) 10 | message = "Marked for removal, please use `rollup` instead" 11 | with self.assertWarnsRegex(FutureWarning, message): 12 | result = df.summarize({"foo": ("foo", rf.stat.mean)}) 13 | self.assertEqual(result, expected) 14 | 15 | def test_gather_beside_deprecation(self): 16 | df = rf.DataFrame({"foo": [1, 1, 2, 2], "bar": [1, 2, 3, 4]}) 17 | expected = rf.DataFrame( 18 | { 19 | "foo": [1, 1, 2, 2], 20 | "variable": ["bar", "bar", "bar", "bar"], 21 | "value": [1, 2, 3, 4], 22 | } 23 | ) 24 | with self.assertWarnsRegex(FutureWarning, "Marked for removal*"): 25 | result = df.gather(beside="foo") 26 | self.assertEqual(result, expected) 27 | -------------------------------------------------------------------------------- /tests/test_docstrings.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import redframes as rf 4 | 5 | 6 | class TestDocstrings(unittest.TestCase): 7 | def test_take(self): 8 | df = rf.DataFrame({"foo": range(10)}) 9 | result1 = df.take(1) 10 | result2 = df.take(-2) 11 | expected1 = rf.DataFrame({"foo": [0]}) 12 | expected2 = rf.DataFrame({"foo": [8, 9]}) 13 | self.assertEqual(result1, expected1) 14 | self.assertEqual(result2, expected2) 15 | 16 | def test_accumulate(self): 17 | df = rf.DataFrame({"foo": [1, 2, 3, 4]}) 18 | result = df.accumulate("foo", into="cumsum") 19 | expected = rf.DataFrame({"foo": [1, 2, 3, 4], "cumsum": [1, 3, 6, 10]}) 20 | 
self.assertEqual(result, expected) 21 | 22 | def test_gather(self): 23 | df = rf.DataFrame( 24 | { 25 | "foo": [1, 2, 1, 2], 26 | "bar": ["A", "B", "C", "D"], 27 | "baz": ["!", "@", "#", "$"], 28 | "jaz": range(4), 29 | } 30 | ) 31 | result1 = df.gather() 32 | result2 = df.gather(["foo", "bar"], into=("var", "val")) 33 | result3 = df.group(["foo", "bar"]).gather(into=("variable", "value")) 34 | expected1 = rf.DataFrame( 35 | { 36 | "variable": [ 37 | "foo", 38 | "foo", 39 | "foo", 40 | "foo", 41 | "bar", 42 | "bar", 43 | "bar", 44 | "bar", 45 | "baz", 46 | "baz", 47 | "baz", 48 | "baz", 49 | "jaz", 50 | "jaz", 51 | "jaz", 52 | "jaz", 53 | ], 54 | "value": [ 55 | 1, 56 | 2, 57 | 1, 58 | 2, 59 | "A", 60 | "B", 61 | "C", 62 | "D", 63 | "!", 64 | "@", 65 | "#", 66 | "$", 67 | 0, 68 | 1, 69 | 2, 70 | 3, 71 | ], 72 | } 73 | ) 74 | expected2 = rf.DataFrame( 75 | { 76 | "baz": ["!", "@", "#", "$", "!", "@", "#", "$"], 77 | "jaz": [0, 1, 2, 3, 0, 1, 2, 3], 78 | "var": ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], 79 | "val": [1, 2, 1, 2, "A", "B", "C", "D"], 80 | } 81 | ) 82 | expected3 = rf.DataFrame( 83 | { 84 | "foo": [1, 2, 1, 2, 1, 2, 1, 2], 85 | "bar": ["A", "B", "C", "D", "A", "B", "C", "D"], 86 | "variable": ["baz", "baz", "baz", "baz", "jaz", "jaz", "jaz", "jaz"], 87 | "value": ["!", "@", "#", "$", 0, 1, 2, 3], 88 | } 89 | ) 90 | self.assertEqual(result1, expected1) 91 | self.assertEqual(result2, expected2) 92 | self.assertEqual(result3, expected3) 93 | 94 | def test_pack(self): 95 | df = rf.DataFrame( 96 | {"foo": ["A", "A", "B", "A", "B", "C"], "bar": [1, 2, 3, 4, 5, 6]} 97 | ) 98 | result1 = df.pack("foo", sep="+") 99 | result2 = df.group("foo").pack("bar", sep="|") 100 | expected1 = rf.DataFrame({"foo": ["A+A+B+A+B+C"]}) 101 | expected2 = rf.DataFrame({"foo": ["A", "B", "C"], "bar": ["1|2|4", "3|5", "6"]}) 102 | self.assertEqual(result1, expected1) 103 | self.assertEqual(result2, expected2) 104 | 105 | def test_rank(self): 106 | df = 
rf.DataFrame({"foo": [2, 3, 3, 99, 1000, 1, -6, 4]}) 107 | result = df.rank("foo", into="rank", descending=True) 108 | expected = rf.DataFrame( 109 | {"foo": [2, 3, 3, 99, 1000, 1, -6, 4], "rank": [5.0, 4, 4, 2, 1, 6, 7, 3]} 110 | ) 111 | self.assertEqual(result, expected) 112 | 113 | def test_rollup(self): 114 | df = rf.DataFrame({"foo": [1, 2, 3, 4, 5], "bar": [99, 100, 1, -5, 2]}) 115 | result = df.rollup( 116 | { 117 | "fcount": ("foo", rf.stat.count), 118 | "fmean": ("foo", rf.stat.mean), 119 | "fsum": ("foo", rf.stat.sum), 120 | "fmax": ("foo", rf.stat.max), 121 | "bmedian": ("bar", rf.stat.median), 122 | "bmin": ("bar", rf.stat.min), 123 | "bstd": ("bar", rf.stat.std), 124 | } 125 | ) 126 | expected = rf.DataFrame( 127 | { 128 | "fcount": [5.0], 129 | "fmean": [3.0], 130 | "fsum": [15.0], 131 | "fmax": [5.0], 132 | "bmedian": [2.0], 133 | "bmin": [-5.0], 134 | "bstd": [54.929955397760885], 135 | } 136 | ) 137 | self.assertEqual(result, expected) 138 | 139 | def test_init(self): 140 | rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]}) 141 | self.assertTrue(True) 142 | 143 | def test_eq(self): 144 | adf = rf.DataFrame({"foo": [1]}) 145 | bdf = rf.DataFrame({"bar": [1]}) 146 | cdf = rf.DataFrame({"foo": [1]}) 147 | self.assertFalse(adf == bdf) 148 | self.assertTrue(adf == cdf) 149 | 150 | def test_getitem(self): 151 | df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]}) 152 | result = df["foo"] 153 | expected = [1, 2] 154 | self.assertEqual(result, expected) 155 | 156 | def test_str(self): 157 | df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]}) 158 | result = str(df) 159 | expected = "rf.DataFrame({'foo': [1, 2], 'bar': ['A', 'B']})" 160 | self.assertEqual(result, expected) 161 | 162 | def test_columns(self): 163 | df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"], "baz": [True, False]}) 164 | result = df.columns 165 | expected = ["foo", "bar", "baz"] 166 | self.assertEqual(result, expected) 167 | 168 | def test_dimensions(self): 169 | df = 
rf.DataFrame({"foo": range(10), "bar": range(10, 20)}) 170 | result = df.dimensions 171 | expected = {"rows": 10, "columns": 2} 172 | self.assertEqual(result, expected) 173 | 174 | def test_empty(self): 175 | df = rf.DataFrame() 176 | result = df.empty 177 | expected = True 178 | self.assertEqual(result, expected) 179 | 180 | def test_memory(self): 181 | df = rf.DataFrame({"foo": [1, 2, 3], "bar": ["A", "B", "C"]}) 182 | result = df.memory 183 | is_small = result.startswith("3") and result.endswith("B") 184 | self.assertTrue(is_small) 185 | 186 | def test_types(self): 187 | df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"], "baz": [True, False]}) 188 | result = df.types 189 | expected = {"foo": int, "bar": object, "baz": bool} 190 | self.assertEqual(result, expected) 191 | 192 | def test_append(self): 193 | df1 = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]}) 194 | df2 = rf.DataFrame({"bar": ["C", "D"], "foo": [3, 4], "baz": ["$", "@"]}) 195 | result = df1.append(df2) 196 | expected = rf.DataFrame( 197 | { 198 | "foo": [1, 2, 3, 4], 199 | "bar": ["A", "B", "C", "D"], 200 | "baz": [None, None, "$", "@"], 201 | } 202 | ) 203 | self.assertEqual(result, expected) 204 | 205 | def test_combine(self): 206 | df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]}) 207 | result = df.combine(["bar", "foo"], into="baz", sep="::", drop=True) 208 | expected = rf.DataFrame({"baz": ["A::1", "B::2"]}) 209 | self.assertEqual(result, expected) 210 | 211 | def test_cross(self): 212 | df = rf.DataFrame({"foo": ["a", "b", "c"], "bar": [1, 2, 3]}) 213 | dfa = rf.DataFrame({"foo": [1, 2, 3]}) 214 | dfb = rf.DataFrame({"bar": [1, 2, 3]}) 215 | result1 = df.cross() 216 | result2 = dfa.cross(dfb, postfix=("_a", "_b")) 217 | expected1 = rf.DataFrame( 218 | { 219 | "foo_lhs": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], 220 | "bar_lhs": [1, 1, 1, 2, 2, 2, 3, 3, 3], 221 | "foo_rhs": ["a", "b", "c", "a", "b", "c", "a", "b", "c"], 222 | "bar_rhs": [1, 2, 3, 1, 2, 3, 1, 2, 3], 223 | } 224 | ) 
225 | expected2 = rf.DataFrame( 226 | {"foo": [1, 1, 1, 2, 2, 2, 3, 3, 3], "bar": [1, 2, 3, 1, 2, 3, 1, 2, 3]} 227 | ) 228 | self.assertEqual(result1, expected1) 229 | self.assertEqual(result2, expected2) 230 | 231 | def test_dedupe(self): 232 | df = rf.DataFrame({"foo": [1, 1, 2, 2], "bar": ["A", "A", "B", "A"]}) 233 | result1 = df.dedupe() 234 | result2 = df.dedupe("foo") 235 | result3 = df.dedupe(["foo", "bar"]) 236 | expected1 = rf.DataFrame({"foo": [1, 2, 2], "bar": ["A", "B", "A"]}) 237 | expected2 = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]}) 238 | expected3 = rf.DataFrame({"foo": [1, 2, 2], "bar": ["A", "B", "A"]}) 239 | self.assertEqual(result1, expected1) 240 | self.assertEqual(result2, expected2) 241 | self.assertEqual(result3, expected3) 242 | 243 | def test_denix(self): 244 | df = rf.DataFrame( 245 | {"foo": [1, None, 3, None, 5, 6], "bar": [1, None, 3, 4, None, None]} 246 | ) 247 | result1 = df.denix() 248 | result2 = df.denix("bar") 249 | result3 = df.denix(["foo", "bar"]) 250 | expected1 = rf.DataFrame({"foo": [1.0, 3.0], "bar": [1.0, 3.0]}) 251 | expected2 = rf.DataFrame({"foo": [1.0, 3.0, None], "bar": [1.0, 3.0, 4.0]}) 252 | expected3 = rf.DataFrame({"foo": [1.0, 3.0], "bar": [1.0, 3.0]}) 253 | self.assertEqual(result1, expected1) 254 | self.assertEqual(result2, expected2) 255 | self.assertEqual(result3, expected3) 256 | 257 | def test_drop(self): 258 | df = rf.DataFrame({"foo": [1, 2], "bar": [3, 4], "baz": [5, 6]}) 259 | result1 = df.drop("baz") 260 | result2 = df.drop(["foo", "baz"]) 261 | expected1 = rf.DataFrame({"foo": [1, 2], "bar": [3, 4]}) 262 | expected2 = rf.DataFrame({"bar": [3, 4]}) 263 | self.assertEqual(result1, expected1) 264 | self.assertEqual(result2, expected2) 265 | 266 | def test_fill(self): 267 | df = rf.DataFrame( 268 | {"foo": [1, None, None, 2, None], "bar": [None, "A", None, "B", None]} 269 | ) 270 | result1 = df.fill(constant=0) 271 | result2 = df.fill(direction="down") 272 | result3 = df.fill("foo", 
direction="down") 273 | result4 = df.fill(["foo"], direction="up") 274 | expected1 = rf.DataFrame( 275 | {"foo": [1.0, 0.0, 0.0, 2.0, 0.0], "bar": [0, "A", 0, "B", 0]} 276 | ) 277 | expected2 = rf.DataFrame( 278 | {"foo": [1.0, 1.0, 1.0, 2.0, 2.0], "bar": [None, "A", "A", "B", "B"]} 279 | ) 280 | expected3 = rf.DataFrame( 281 | {"foo": [1.0, 1.0, 1.0, 2.0, 2.0], "bar": [None, "A", None, "B", None]} 282 | ) 283 | expected4 = rf.DataFrame( 284 | {"foo": [1.0, 2.0, 2.0, 2.0, None], "bar": [None, "A", None, "B", None]} 285 | ) 286 | self.assertEqual(result1, expected1) 287 | self.assertEqual(result2, expected2) 288 | self.assertEqual(result3, expected3) 289 | self.assertEqual(result4, expected4) 290 | 291 | def test_filter(self): 292 | df = rf.DataFrame({"foo": ["A", "A", "A", "B"], "bar": [1, 2, 3, 4]}) 293 | result1 = df.filter(lambda row: row["foo"].isin(["A"])) 294 | result2 = df.filter(lambda row: (row["foo"] == "A") & (row["bar"] <= 2)) 295 | result3 = df.filter(lambda row: (row["foo"] == "B") | (row["bar"] == 1)) 296 | expected1 = rf.DataFrame({"foo": ["A", "A", "A"], "bar": [1, 2, 3]}) 297 | expected2 = rf.DataFrame({"foo": ["A", "A"], "bar": [1, 2]}) 298 | expected3 = rf.DataFrame({"foo": ["A", "B"], "bar": [1, 4]}) 299 | self.assertEqual(result1, expected1) 300 | self.assertEqual(result2, expected2) 301 | self.assertEqual(result3, expected3) 302 | 303 | def test_group(self): 304 | df = rf.DataFrame( 305 | { 306 | "foo": ["A", "A", "A", "B", "B"], 307 | "bar": [1, 2, 3, 4, 5], 308 | "baz": [9, 7, 7, 5, 6], 309 | } 310 | ) 311 | result1 = df.group("foo").accumulate("bar", into="bar_cumsum") 312 | result2 = df.group("foo").gather() 313 | result3 = df.group("foo").pack("bar", sep=":") 314 | result4 = df.group("foo").rank("baz", into="baz_rank", descending=True) 315 | result5 = df.group("foo").rollup( 316 | {"bar_mean": ("bar", rf.stat.mean), "baz_min": ("baz", rf.stat.min)} 317 | ) 318 | result6 = df.group("foo").take(1) 319 | expected1 = rf.DataFrame( 320 | { 
321 | "foo": ["A", "A", "A", "B", "B"], 322 | "bar": [1, 2, 3, 4, 5], 323 | "baz": [9, 7, 7, 5, 6], 324 | "bar_cumsum": [1, 3, 6, 4, 9], 325 | } 326 | ) 327 | expected2 = rf.DataFrame( 328 | { 329 | "foo": ["A", "A", "A", "B", "B", "A", "A", "A", "B", "B"], 330 | "variable": [ 331 | "bar", 332 | "bar", 333 | "bar", 334 | "bar", 335 | "bar", 336 | "baz", 337 | "baz", 338 | "baz", 339 | "baz", 340 | "baz", 341 | ], 342 | "value": [1, 2, 3, 4, 5, 9, 7, 7, 5, 6], 343 | } 344 | ) 345 | expected3 = rf.DataFrame({"foo": ["A", "B"], "bar": ["1:2:3", "4:5"]}) 346 | expected4 = rf.DataFrame( 347 | { 348 | "foo": ["A", "A", "A", "B", "B"], 349 | "bar": [1, 2, 3, 4, 5], 350 | "baz": [9, 7, 7, 5, 6], 351 | "baz_rank": [1.0, 2.0, 2.0, 2.0, 1.0], 352 | } 353 | ) 354 | expected5 = rf.DataFrame( 355 | {"foo": ["A", "B"], "bar_mean": [2.0, 4.5], "baz_min": [7, 5]} 356 | ) 357 | expected6 = rf.DataFrame({"foo": ["A", "B"], "bar": [1, 4], "baz": [9, 5]}) 358 | self.assertEqual(result1, expected1) 359 | self.assertEqual(result2, expected2) 360 | self.assertEqual(result3, expected3) 361 | self.assertEqual(result4, expected4) 362 | self.assertEqual(result5, expected5) 363 | self.assertEqual(result6, expected6) 364 | 365 | def test_join(self): 366 | adf = rf.DataFrame({"foo": ["A", "B", "C"], "bar": [1, 2, 3]}) 367 | bdf = rf.DataFrame({"foo": ["A", "B", "D"], "baz": ["!", "@", "#"]}) 368 | result1 = adf.join(bdf, on="foo", how="left") 369 | result2 = adf.join(bdf, on="foo", how="right") 370 | result3 = adf.join(bdf, on="foo", how="inner") 371 | result4 = adf.join(bdf, on="foo", how="full") 372 | expected1 = rf.DataFrame( 373 | {"foo": ["A", "B", "C"], "bar": [1, 2, 3], "baz": ["!", "@", None]} 374 | ) 375 | expected2 = rf.DataFrame( 376 | {"foo": ["A", "B", "D"], "bar": [1.0, 2.0, None], "baz": ["!", "@", "#"]} 377 | ) 378 | expected3 = rf.DataFrame({"foo": ["A", "B"], "bar": [1, 2], "baz": ["!", "@"]}) 379 | expected4 = rf.DataFrame( 380 | { 381 | "foo": ["A", "B", "C", "D"], 382 | 
"bar": [1.0, 2.0, 3.0, None], 383 | "baz": ["!", "@", None, "#"], 384 | } 385 | ) 386 | self.assertEqual(result1, expected1) 387 | self.assertEqual(result2, expected2) 388 | self.assertEqual(result3, expected3) 389 | self.assertEqual(result4, expected4) 390 | 391 | def test_mutate(self): 392 | df = rf.DataFrame({"foo": [1, 2, 3]}) 393 | result = df.mutate( 394 | { 395 | "bar": lambda row: float(row["foo"]), 396 | "baz": lambda row: "X" + str(row["bar"] * 2), 397 | "jaz": lambda _: "Jazz", 398 | } 399 | ) 400 | expected = rf.DataFrame( 401 | { 402 | "foo": [1, 2, 3], 403 | "bar": [1.0, 2.0, 3.0], 404 | "baz": ["X2.0", "X4.0", "X6.0"], 405 | "jaz": ["Jazz", "Jazz", "Jazz"], 406 | } 407 | ) 408 | self.assertEqual(result, expected) 409 | 410 | def test_rename(self): 411 | df = rf.DataFrame({"foo": [1, 2], "bar": [3, 4]}) 412 | result = df.rename({"foo": "oof", "bar": "rab"}) 413 | expected = rf.DataFrame({"oof": [1, 2], "rab": [3, 4]}) 414 | self.assertEqual(result, expected) 415 | 416 | def test_replace(self): 417 | df = rf.DataFrame({"foo": [1, 2, 2, 2, 1], "bar": [1, "A", "B", True, False]}) 418 | result = df.replace( 419 | {"foo": {2: 222}, "bar": {False: 0, True: 1, "A": 2, "B": 3}} 420 | ) 421 | expected = rf.DataFrame({"foo": [1, 222, 222, 222, 1], "bar": [1, 2, 3, 1, 0]}) 422 | self.assertEqual(result, expected) 423 | 424 | def test_sample(self): 425 | df = rf.DataFrame({"foo": range(10), "bar": range(10, 20)}) 426 | result1 = df.sample(1) 427 | result2 = df.sample(3) 428 | result3 = df.sample(0.3) 429 | self.assertEqual(len(result1), 1) 430 | self.assertEqual(len(result2), 3) 431 | self.assertEqual(len(result3), 3) 432 | 433 | def test_select(self): 434 | df = rf.DataFrame({"foo": [1, 2], "bar": [3, 4], "baz": [5, 6]}) 435 | result1 = df.select("foo") 436 | result2 = df.select(["foo", "baz"]) 437 | expected1 = rf.DataFrame({"foo": [1, 2]}) 438 | expected2 = rf.DataFrame({"foo": [1, 2], "baz": [5, 6]}) 439 | self.assertEqual(result1, expected1) 440 | 
self.assertEqual(result2, expected2) 441 | 442 | def test_shuffle(self): 443 | df = rf.DataFrame({"foo": range(5), "bar": range(5, 10)}) 444 | result = df.shuffle() 445 | self.assertNotEqual(df, result) 446 | 447 | def test_sort(self): 448 | df = rf.DataFrame({"foo": ["Z", "X", "A", "A"], "bar": [2, -2, 4, -4]}) 449 | result1 = df.sort("bar") 450 | result2 = df.sort("bar", descending=True) 451 | result3 = df.sort(["foo", "bar"], descending=False) 452 | expected1 = rf.DataFrame({"foo": ["A", "X", "Z", "A"], "bar": [-4, -2, 2, 4]}) 453 | expected2 = rf.DataFrame({"foo": ["A", "Z", "X", "A"], "bar": [4, 2, -2, -4]}) 454 | expected3 = rf.DataFrame({"foo": ["A", "A", "X", "Z"], "bar": [-4, 4, -2, 2]}) 455 | self.assertEqual(result1, expected1) 456 | self.assertEqual(result2, expected2) 457 | self.assertEqual(result3, expected3) 458 | 459 | def test_split(self): 460 | df = rf.DataFrame({"foo": ["A::1", "B::2", "C:3"]}) 461 | result = df.split("foo", into=["foo", "bar"], sep="::", drop=True) 462 | expected = rf.DataFrame({"foo": ["A", "B", "C:3"], "bar": ["1", "2", None]}) 463 | self.assertEqual(result, expected) 464 | 465 | def test_spread(self): 466 | df = rf.DataFrame( 467 | {"foo": ["A", "A", "A", "B", "B", "B", "B"], "bar": [1, 2, 3, 4, 5, 6, 7]} 468 | ) 469 | result = df.spread("foo", using="bar") 470 | expected = rf.DataFrame({"A": [1.0, 2.0, 3.0, None], "B": [4.0, 5.0, 6.0, 7.0]}) 471 | self.assertEqual(result, expected) 472 | 473 | def test_unpack(self): 474 | df = rf.DataFrame({"foo": [1, 2, 3, 4], "bar": ["A:B", "B:C:D", "D:E", "F"]}) 475 | result = df.unpack("bar", sep=":") 476 | expected = rf.DataFrame( 477 | { 478 | "foo": [1, 1, 2, 2, 2, 3, 3, 4], 479 | "bar": ["A", "B", "B", "C", "D", "D", "E", "F"], 480 | } 481 | ) 482 | self.assertEqual(result, expected) 483 | -------------------------------------------------------------------------------- /tests/test_dupe_columns.py: -------------------------------------------------------------------------------- 1 | 
import unittest

import redframes as rf


class TestDupeColumns(unittest.TestCase):
    """Verbs must reject (or warn about) operations that would create
    duplicate/conflicting column keys."""

    def setUp(self):
        self.df = rf.DataFrame(
            {
                "foo": range(10),
                "bar": [1, 3.2, 4.5, 2, -1, 30, None, 1.1, 1.1, 9],
                "baz": ["A", "A", None, "B", "B", "A", "B", "C", "C", "A"],
                "jaz": [
                    "1::1",
                    "2::2",
                    "3:3",
                    "4::4",
                    "5::5",
                    "6::7",
                    "7::8",
                    "8::9",
                    "9::0",
                    "0::-1",
                ],
                "raz": [1, 2, 3, None, None, None, 9, 9, None, None],
            }
        )

    # fixed typo in test name: "unqiue" -> "unique"
    def test_accumulate_not_unique(self):
        # overwriting the *same* column is allowed silently
        self.df.accumulate("foo", into="foo")
        self.assertTrue(True)

    def test_accumulate_overwrite_existing(self):
        with self.assertWarnsRegex(UserWarning, "overwriting existing column *"):
            self.df.accumulate("foo", into="bar")

    def test_combine_into_overwrite(self):
        self.df.combine(["foo", "bar"], into="foo", sep="-")
        self.assertTrue(True)

    def test_combine_overwrite_existing(self):
        with self.assertWarnsRegex(UserWarning, "overwriting existing column *"):
            self.df.combine(["foo", "bar"], into="baz", sep="-")

    def test_combine_overwrite_no_drop(self):
        self.df.combine(["foo", "bar"], into="foo", sep="-", drop=False)
        self.assertTrue(True)

    def test_gather_same_column_names(self):
        with self.assertRaisesRegex(TypeError, "must be unique"):
            self.df.gather(into=("foo", "foo"))

    # fixed typo in test name: "exising" -> "existing"
    def test_gather_existing_column_name_for_variable(self):
        with self.assertRaisesRegex(ValueError, "must not be an existing column key"):
            self.df.gather(into=("foo", "value"))

    # fixed typo in test name: "exising" -> "existing"
    def test_gather_existing_column_name_for_value(self):
        with self.assertRaisesRegex(ValueError, "must not be an existing column key"):
            self.df.gather(into=("variable", "foo"))

    # fixed typo in test name: "exising" -> "existing"
    def test_gather_existing_column_key(self):
        with self.assertRaisesRegex(ValueError, "must not be an existing column key"):
            self.df.gather(["foo", "bar"], into=("raz", "baz"))

    def test_gather_group_into_conflict(self):
        with self.assertRaisesRegex(ValueError, "must not be an existing column key"):
            self.df.group("foo").gather(into=("foo", "bar"))

    def test_rank_into_overwrite(self):
        self.df.rank("bar", into="bar", descending=True)
        self.assertTrue(True)

    def test_rank_overwrite_existing(self):
        with self.assertWarnsRegex(UserWarning, "overwriting existing column *"):
            self.df.rank("bar", into="baz", descending=True)

    def test_rename_duplicated_dict_values(self):
        with self.assertRaisesRegex(KeyError, "columns must be unique"):
            self.df.rename({"foo": "oof", "bar": "oof"})

    def test_rollup_group_existing_column(self):
        with self.assertRaisesRegex(KeyError, "unable to overwrite group key"):
            self.df.group("baz").rollup({"baz": ("foo", rf.stat.max)})

    def test_select_duplicate_keys(self):
        with self.assertRaisesRegex(KeyError, "column keys must be unique"):
            self.df.select(["foo", "foo"])

    def test_split_overwrite_into_one(self):
        self.df.split("jaz", into=["jaz", "paz"], sep="::")
        self.assertTrue(True)

    def test_split_overwrite_into_existing(self):
        with self.assertRaisesRegex(KeyError, "into keys must be unique"):
            self.df.split("jaz", into=["jaz", "foo"], sep="::")

    def test_split_duplicated_into_keys(self):
        with self.assertRaisesRegex(KeyError, "into keys must be unique"):
            self.df.split("jaz", into=["paz", "paz"], sep="::")

    def test_spread_duplicated_column_names(self):
        with self.assertRaisesRegex(KeyError, "column and using must be unique"):
            self.df.gather().spread("variable", "variable")
# --------------------------------------------------------------------------------
# /tests/test_index.py:
# --------------------------------------------------------------------------------
import unittest

import pandas as pd
import redframes as rf


def index_is_okay(df: rf.DataFrame) -> bool:
    """Return True if df's underlying pandas index is a pristine default index.

    "Pristine" means: unnamed, a RangeIndex, starting at 0, stepping by 1 —
    i.e. no verb leaked a modified index into the wrapped data.
    """
    index = df._data.index
    is_unnamed = index.name is None  # fixed: identity check, not `== None` (PEP 8)
    is_range = isinstance(index, pd.RangeIndex)
    is_zero_start = index.start == 0
    is_one_step = index.step == 1
    return all([is_unnamed, is_range, is_zero_start, is_one_step])


class TestIndex(unittest.TestCase):
    """Every verb must return a DataFrame with a clean default index."""

    def setUp(self):
        self.df = rf.DataFrame(
            {
                "foo": range(10),
                "bar": [1, 3.2, 4.5, 2, -1, 30, None, 1.1, 1.1, 9],
                "baz": ["A", "A", None, "B", "B", "A", "B", "C", "C", "A"],
                "jaz": [
                    "1::1",
                    "2::2",
                    "3:3",
                    "4::4",
                    "5::5",
                    "6::7",
                    "7::8",
                    "8::9",
                    "9::0",
                    "0::-1",
                ],
                "raz": [1, 2, 3, None, None, None, 9, 9, None, None],
            }
        )

    def test_accumulate(self):
        new = self.df.accumulate("foo", into="foo")
        self.assertTrue(index_is_okay(new))

    def test_append(self):
        df_bottom = rf.DataFrame({"foo": [10]})
        new = self.df.append(df_bottom)
        self.assertTrue(index_is_okay(new))

    def test_combine(self):
        new = self.df.combine(["foo", "bar"], into="foo", sep="-")
        self.assertTrue(index_is_okay(new))

    def test_cross(self):
        new = self.df.cross()
        self.assertTrue(index_is_okay(new))

    def test_dedupe(self):
        new = self.df.dedupe("baz")
        self.assertTrue(index_is_okay(new))

    def test_denix(self):
        new = self.df.denix()
        self.assertTrue(index_is_okay(new))

    def test_drop(self):
        new = self.df.drop("foo")
        self.assertTrue(index_is_okay(new))

    def test_fill(self):
        new = self.df.fill("baz", direction="down")
        self.assertTrue(index_is_okay(new))

    def test_filter(self):
        new = self.df.filter(lambda row: row["bar"] > 5)
        self.assertTrue(index_is_okay(new))

    def test_gather(self):
        new = self.df.gather()
        self.assertTrue(index_is_okay(new))
80 | 81 | def test_group(self): 82 | new = self.df.group("baz").rollup({"foo": ("foo", rf.stat.mean)}) 83 | self.assertTrue(index_is_okay(new)) 84 | 85 | def test_join(self): 86 | df_right = rf.DataFrame({"baz": ["A", "B"], "haz": ["Apple", "Banana"]}) 87 | new = self.df.join(df_right, on="baz") 88 | self.assertTrue(index_is_okay(new)) 89 | 90 | def test_mutate(self): 91 | new = self.df.mutate({"foo": lambda row: row["foo"] * 10}) 92 | self.assertTrue(index_is_okay(new)) 93 | 94 | def test_pack(self): 95 | new = self.df.pack("baz", sep="|") 96 | self.assertTrue(index_is_okay(new)) 97 | 98 | def test_rank(self): 99 | new = self.df.rank("bar", into="bar_rank", descending=True) 100 | self.assertTrue(index_is_okay(new)) 101 | 102 | def test_rename(self): 103 | new = self.df.rename({"foo": "oof"}) 104 | self.assertTrue(index_is_okay(new)) 105 | 106 | def test_replace(self): 107 | new = self.df.replace({"baz": {"B": "Banana"}}) 108 | self.assertTrue(index_is_okay(new)) 109 | 110 | def test_rollup(self): 111 | new = self.df.rollup({"bar_mean": ("bar", rf.stat.mean)}) 112 | self.assertTrue(index_is_okay(new)) 113 | 114 | def test_sample(self): 115 | new = self.df.sample(5) 116 | self.assertTrue(index_is_okay(new)) 117 | 118 | def test_select(self): 119 | new = self.df.select(["foo", "bar"]) 120 | self.assertTrue(index_is_okay(new)) 121 | 122 | def test_shuffle(self): 123 | new = self.df.shuffle() 124 | self.assertTrue(index_is_okay(new)) 125 | 126 | def test_sort(self): 127 | new = self.df.sort("bar", descending=True) 128 | self.assertTrue(index_is_okay(new)) 129 | 130 | def test_split(self): 131 | new = self.df.split("jaz", into=["jaz_1", "jaz_2"], sep="::") 132 | self.assertTrue(index_is_okay(new)) 133 | 134 | def test_spread(self): 135 | new = self.df.denix("baz").select(["baz", "foo"]).spread("baz", "foo") 136 | self.assertTrue(index_is_okay(new)) 137 | 138 | def test_take(self): 139 | new = self.df.take(-3) 140 | self.assertTrue(index_is_okay(new)) 141 | 142 | def 
test_unpack(self): 143 | new = self.df.unpack("jaz", sep="::") 144 | self.assertTrue(index_is_okay(new)) 145 | -------------------------------------------------------------------------------- /tests/test_interchange.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | import redframes as rf 6 | 7 | 8 | class TestInterchange(unittest.TestCase): 9 | def test_wrap_no_side_effect(self): 10 | rdf = rf.DataFrame({"foo": [1, 2], "bar": [3, 4]}) 11 | result = pd.api.interchange.from_dataframe(rdf) 12 | expected = pd.DataFrame({"foo": [1, 2], "bar": [3, 4]}) 13 | self.assertTrue(result.equals(expected)) 14 | -------------------------------------------------------------------------------- /tests/test_io.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from pathlib import Path 3 | from shutil import rmtree as delete 4 | from tempfile import mkdtemp as make_temp_dir 5 | 6 | import pandas as pd 7 | 8 | import redframes as rf 9 | 10 | 11 | class TestIO(unittest.TestCase): 12 | def setUp(self): 13 | self.tempdir = tempdir = make_temp_dir() 14 | self.df = rf.DataFrame({"foo": [1, 2], "bar": [3, 4]}) 15 | self.pdf = pd.DataFrame({"foo": [1, 2], "bar": [3, 4]}) 16 | self.path = str(Path(tempdir) / "example.csv") 17 | 18 | def tearDown(self): 19 | delete(self.tempdir) 20 | 21 | def test_load_missing_file(self): 22 | with self.assertRaises(FileNotFoundError): 23 | rf.load("test_missing_file.csv") 24 | 25 | def test_load_bad_format(self): 26 | with self.assertRaisesRegex(TypeError, "must end in .csv"): 27 | rf.load("test_bad_file_format.json") 28 | 29 | def test_save_bad_path_format(self): 30 | with self.assertRaisesRegex(TypeError, "must end in .csv"): 31 | rf.save(self.df, "example.json") 32 | 33 | def test_save_bad_type(self): 34 | with self.assertRaisesRegex(TypeError, "must be DataFrame"): 35 | rf.save(1, "example.json") 36 | 37 | def 
test_unwrap_bad_type(self): 38 | with self.assertRaisesRegex(TypeError, "must be DataFrame"): 39 | rf.unwrap(1) 40 | 41 | def test_wrap_bad_type(self): 42 | with self.assertRaisesRegex(TypeError, "must be DataFrame"): 43 | rf.wrap(1) 44 | 45 | def test_unwrap_wrong_direction(self): 46 | with self.assertRaisesRegex(TypeError, "must be DataFrame"): 47 | rf.unwrap(self.pdf) 48 | 49 | def test_wrap_wrong_direction(self): 50 | with self.assertRaisesRegex(TypeError, "must be DataFrame"): 51 | rf.wrap(self.df) 52 | 53 | def test_unwrap_no_side_effect(self): 54 | pdf = rf.unwrap(self.df) 55 | pdf.columns = ["oof", "rab"] 56 | expected = rf.DataFrame({"foo": [1, 2], "bar": [3, 4]}) 57 | self.assertEqual(self.df, expected) 58 | 59 | def test_wrap_no_side_effect(self): 60 | df = rf.wrap(self.pdf) 61 | df = df.rename({"foo": "oof"}) 62 | expected = pd.DataFrame({"foo": [1, 2], "bar": [3, 4]}) 63 | self.assertTrue(self.pdf.equals(expected)) 64 | 65 | def test_round_trip_save_load(self): 66 | rf.save(self.df, self.path) 67 | result = rf.load(self.path) 68 | expected = rf.DataFrame({"foo": [1, 2], "bar": [3, 4]}) 69 | self.assertEqual(result, expected) 70 | 71 | def test_round_trip_unwrap_wrap(self): 72 | pdf = rf.unwrap(self.df) 73 | result = rf.wrap(pdf) 74 | expected = rf.DataFrame({"foo": [1, 2], "bar": [3, 4]}) 75 | self.assertEqual(result, expected) 76 | -------------------------------------------------------------------------------- /tests/test_ladybugs.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | import redframes as rf 6 | 7 | 8 | class TestLadyBugs(unittest.TestCase): 9 | def test_gather_spread_string_values(self): 10 | df = rf.DataFrame( 11 | {"foo": ["A", "B", "C"], "bar": ["D", "E", "F"], "baz": ["G", "H", "I"]} 12 | ) 13 | result = df.gather().spread("variable", "value") 14 | expected = rf.DataFrame( 15 | {"bar": ["D", "E", "F"], "baz": ["G", "H", "I"], "foo": ["A", "B", 
"C"]} 16 | ) 17 | self.assertEqual(result, expected) 18 | 19 | def test_types_mixed_column(self): 20 | df = rf.DataFrame({"foo": [1, None, 2.0, "3"]}) 21 | result = df.types 22 | expected = {"foo": object} 23 | self.assertEqual(result, expected) 24 | 25 | def test_comine_overwrite_and_drop_other(self): 26 | df = rf.DataFrame({"foo": [1, 2, 3], "bar": [1, 2, 3]}) 27 | result = df.combine(["foo", "bar"], into="foo", sep="-", drop=True) 28 | expected = rf.DataFrame({"foo": ["1-1", "2-2", "3-3"]}) 29 | self.assertEqual(result, expected) 30 | 31 | def test_sample_float_1_point_0(self): 32 | df = rf.DataFrame({"foo": range(100)}) 33 | with self.assertRaisesRegex(ValueError, "must be int if > 1"): 34 | df.sample(1.0) 35 | 36 | def test_sample_negative_1(self): 37 | df = rf.DataFrame({"foo": range(100)}) 38 | with self.assertRaisesRegex(ValueError, "must be > 0"): 39 | df.sample(-1) 40 | 41 | def test_io_wrap_multi_columns(self): 42 | columns = pd.MultiIndex.from_arrays( 43 | [["route", "action", "action"], ["type", "source", "destination"]] 44 | ) 45 | pdf = pd.DataFrame([[1, 2, 3]], columns=columns) 46 | with self.assertRaisesRegex(KeyError, "must be flat"): 47 | rf.wrap(pdf) 48 | 49 | def test_group_gather_beside_conflict(self): 50 | df = rf.DataFrame( 51 | { 52 | "foo": [1, 1, 1, 2, 2, 1, 3, 3], 53 | "bar": range(8), 54 | "baz": range(8), 55 | "jaz": range(8), 56 | } 57 | ) 58 | with self.assertRaisesRegex(ValueError, "beside is incompatible*"): 59 | df.group("foo").gather(beside="bar") 60 | 61 | def test_group_gather_columns_conflict(self): 62 | df = rf.DataFrame( 63 | { 64 | "foo": [1, 1, 1, 2, 2, 1, 3, 3], 65 | "bar": range(8), 66 | "baz": range(8), 67 | "jaz": range(8), 68 | } 69 | ) 70 | with self.assertRaisesRegex(ValueError, "columns is incompatible*"): 71 | df.group("foo").gather(columns=["foo", "bar"]) 72 | -------------------------------------------------------------------------------- /tests/test_readme.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | from pathlib import Path 3 | from shutil import rmtree as delete 4 | from tempfile import mkdtemp as make_temp_dir 5 | 6 | 7 | class TestReadme(unittest.TestCase): 8 | def setUp(self): 9 | self.tempdir = tempdir = make_temp_dir() 10 | self.path = str(Path(tempdir) / "bears.csv") 11 | 12 | def tearDown(self): 13 | delete(self.tempdir) 14 | 15 | def test_quick_start(self): 16 | import redframes as rf 17 | 18 | df = rf.DataFrame( 19 | { 20 | "bear": [ 21 | "Brown bear", 22 | "Polar bear", 23 | "Asian black bear", 24 | "American black bear", 25 | "Sun bear", 26 | "Sloth bear", 27 | "Spectacled bear", 28 | "Giant panda", 29 | ], 30 | "genus": [ 31 | "Ursus", 32 | "Ursus", 33 | "Ursus", 34 | "Ursus", 35 | "Helarctos", 36 | "Melursus", 37 | "Tremarctos", 38 | "Ailuropoda", 39 | ], 40 | "weight (male, lbs)": [ 41 | "300-860", 42 | "880-1320", 43 | "220-440", 44 | "125-500", 45 | "60-150", 46 | "175-310", 47 | "220-340", 48 | "190-275", 49 | ], 50 | "weight (female, lbs)": [ 51 | "205-455", 52 | "330-550", 53 | "110-275", 54 | "90-300", 55 | "45-90", 56 | "120-210", 57 | "140-180", 58 | "155-220", 59 | ], 60 | } 61 | ) 62 | 63 | # | bear | genus | weight (male, lbs) | weight (female, lbs) | 64 | # |:--------------------|:-----------|:---------------------|:-----------------------| 65 | # | Brown bear | Ursus | 300-860 | 205-455 | 66 | # | Polar bear | Ursus | 880-1320 | 330-550 | 67 | # | Asian black bear | Ursus | 220-440 | 110-275 | 68 | # | American black bear | Ursus | 125-500 | 90-300 | 69 | # | Sun bear | Helarctos | 60-150 | 45-90 | 70 | # | Sloth bear | Melursus | 175-310 | 120-210 | 71 | # | Spectacled bear | Tremarctos | 220-340 | 140-180 | 72 | # | Giant panda | Ailuropoda | 190-275 | 155-220 | 73 | 74 | ( 75 | df.rename({"weight (male, lbs)": "male", "weight (female, lbs)": "female"}) 76 | .gather(["male", "female"], into=("sex", "weight")) 77 | 
.split("weight", into=["min", "max"], sep="-") 78 | .gather(["min", "max"], into=("stat", "weight")) 79 | .mutate({"weight": lambda row: float(row["weight"])}) 80 | .group(["genus", "sex"]) 81 | .rollup({"weight": ("weight", rf.stat.mean)}) 82 | .spread("sex", using="weight") 83 | .mutate({"dimorphism": lambda row: round(row["male"] / row["female"], 2)}) 84 | .drop(["male", "female"]) 85 | .sort("dimorphism", descending=True) 86 | ) 87 | 88 | # | genus | dimorphism | 89 | # |:-----------|-------------:| 90 | # | Ursus | 2.01 | 91 | # | Tremarctos | 1.75 | 92 | # | Helarctos | 1.56 | 93 | # | Melursus | 1.47 | 94 | # | Ailuropoda | 1.24 | 95 | 96 | self.assertTrue(True) 97 | 98 | def test_pandas_comparison(self): 99 | import pandas as pd 100 | 101 | df = pd.DataFrame( 102 | { 103 | "bear": [ 104 | "Brown bear", 105 | "Polar bear", 106 | "Asian black bear", 107 | "American black bear", 108 | "Sun bear", 109 | "Sloth bear", 110 | "Spectacled bear", 111 | "Giant panda", 112 | ], 113 | "genus": [ 114 | "Ursus", 115 | "Ursus", 116 | "Ursus", 117 | "Ursus", 118 | "Helarctos", 119 | "Melursus", 120 | "Tremarctos", 121 | "Ailuropoda", 122 | ], 123 | "weight (male, lbs)": [ 124 | "300-860", 125 | "880-1320", 126 | "220-440", 127 | "125-500", 128 | "60-150", 129 | "175-310", 130 | "220-340", 131 | "190-275", 132 | ], 133 | "weight (female, lbs)": [ 134 | "205-455", 135 | "330-550", 136 | "110-275", 137 | "90-300", 138 | "45-90", 139 | "120-210", 140 | "140-180", 141 | "155-220", 142 | ], 143 | } 144 | ) 145 | 146 | df = df.rename( 147 | columns={"weight (male, lbs)": "male", "weight (female, lbs)": "female"} 148 | ) 149 | df = pd.melt( 150 | df, 151 | id_vars=["bear", "genus"], 152 | value_vars=["male", "female"], 153 | var_name="sex", 154 | value_name="weight", 155 | ) 156 | df[["min", "max"]] = df["weight"].str.split("-", expand=True) 157 | df = df.drop("weight", axis=1) 158 | df = pd.melt( 159 | df, 160 | id_vars=["bear", "genus", "sex"], 161 | value_vars=["min", "max"], 
162 | var_name="stat", 163 | value_name="weight", 164 | ) 165 | df["weight"] = df["weight"].astype("float") 166 | df = df.groupby(["genus", "sex"])["weight"].mean() 167 | df = df.reset_index() 168 | df = pd.pivot_table(df, index=["genus"], columns=["sex"], values="weight") 169 | df = df.reset_index() 170 | df = df.rename_axis(None, axis=1) 171 | df["dimorphism"] = round(df["male"] / df["female"], 2) 172 | df = df.drop(["female", "male"], axis=1) 173 | df = df.sort_values("dimorphism", ascending=False) 174 | df = df.reset_index(drop=True) 175 | 176 | self.assertTrue(True) 177 | 178 | def test_io(self): 179 | import redframes as rf 180 | 181 | df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]}) 182 | 183 | # save .csv 184 | rf.save(df, self.path) 185 | 186 | # load .csv 187 | df = rf.load(self.path) 188 | 189 | # convert redframes → pandas 190 | pandas_df = rf.unwrap(df) 191 | 192 | # convert pandas → redframes 193 | df = rf.wrap(pandas_df) 194 | 195 | self.assertTrue(True) 196 | 197 | def test_properties(self): 198 | import redframes as rf 199 | 200 | df = rf.DataFrame({"genus": [1]}) 201 | 202 | df["genus"] 203 | # ['Ursus', 'Ursus', 'Ursus', 'Ursus', 'Helarctos', 'Melursus', 'Tremarctos', 'Ailuropoda'] 204 | 205 | df.columns 206 | # ['bear', 'genus', 'weight (male, lbs)', 'weight (female, lbs)'] 207 | 208 | df.dimensions 209 | # {'rows': 8, 'columns': 4} 210 | 211 | df.empty 212 | # False 213 | 214 | df.memory 215 | # '2 KB' 216 | 217 | df.types 218 | # {'bear': object, 'genus': object, 'weight (male, lbs)': object, 'weight (female, lbs)': object} 219 | 220 | self.assertTrue(True) 221 | 222 | def test_matplotlib(self): 223 | import matplotlib.pyplot as plt 224 | 225 | import redframes as rf 226 | 227 | football = rf.DataFrame( 228 | { 229 | "position": ["TE", "K", "RB", "WR", "QB"], 230 | "avp": [116.98, 131.15, 180, 222.22, 272.91], 231 | } 232 | ) 233 | 234 | df = football.mutate( 235 | {"color": lambda row: row["position"] in ["WR", "RB"]} 236 | 
).replace({"color": {False: "orange", True: "red"}}) 237 | 238 | plt.barh(df["position"], df["avp"], color=df["color"]) 239 | 240 | self.assertTrue(True) 241 | 242 | def test_sklearn(self): 243 | from sklearn.linear_model import LinearRegression 244 | from sklearn.model_selection import train_test_split 245 | 246 | import redframes as rf 247 | 248 | df = rf.DataFrame( 249 | { 250 | "touchdowns": [15, 19, 5, 7, 9, 10, 12, 22, 16, 10], 251 | "age": [21, 22, 21, 24, 26, 28, 30, 35, 28, 21], 252 | "mvp": [1, 1, 0, 0, 0, 0, 0, 1, 0, 0], 253 | } 254 | ) 255 | 256 | target = "touchdowns" 257 | y = df[target] 258 | X = df.drop(target) 259 | X_train, X_test, y_train, y_test = train_test_split( 260 | X, y, test_size=0.3, random_state=1 261 | ) 262 | 263 | model = LinearRegression() 264 | model.fit(X_train, y_train) 265 | model.score(X_test, y_test) 266 | # 0.5083194901655527 267 | 268 | # print(X_train.take(1)) 269 | # rf.DataFrame({'age': [21], 'mvp': [0]}) 270 | 271 | X_new = rf.DataFrame({"age": [22], "mvp": [1]}) 272 | model.predict(X_new) 273 | # array([19.]) 274 | 275 | self.assertTrue(True) 276 | -------------------------------------------------------------------------------- /tests/test_side_effects.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import redframes as rf 4 | 5 | 6 | class TestSideEffects(unittest.TestCase): 7 | def setUp(self): 8 | self.df = rf.DataFrame( 9 | { 10 | "foo": range(10), 11 | "bar": [1, 3.2, 4.5, 2, -1, 30, None, 1.1, 1.1, 9], 12 | "baz": ["A", "A", None, "B", "B", "A", "B", "C", "C", "A"], 13 | "jaz": [ 14 | "1::1", 15 | "2::2", 16 | "3:3", 17 | "4::4", 18 | "5::5", 19 | "6::7", 20 | "7::8", 21 | "8::9", 22 | "9::0", 23 | "0::-1", 24 | ], 25 | "raz": [1, 2, 3, None, None, None, 9, 9, None, None], 26 | } 27 | ) 28 | self.expected = rf.DataFrame( 29 | { 30 | "foo": range(10), 31 | "bar": [1, 3.2, 4.5, 2, -1, 30, None, 1.1, 1.1, 9], 32 | "baz": ["A", "A", None, "B", "B", "A", 
"B", "C", "C", "A"], 33 | "jaz": [ 34 | "1::1", 35 | "2::2", 36 | "3:3", 37 | "4::4", 38 | "5::5", 39 | "6::7", 40 | "7::8", 41 | "8::9", 42 | "9::0", 43 | "0::-1", 44 | ], 45 | "raz": [1, 2, 3, None, None, None, 9, 9, None, None], 46 | } 47 | ) 48 | 49 | def test_accumulate(self): 50 | _ = self.df.accumulate("foo", into="foo") 51 | self.assertEqual(self.df, self.expected) 52 | 53 | def test_append(self): 54 | df_bottom = rf.DataFrame({"foo": [10]}) 55 | df_bottom_expected = rf.DataFrame({"foo": [10]}) 56 | _ = self.df.append(df_bottom) 57 | self.assertEqual(self.df, self.expected) 58 | self.assertEqual(df_bottom, df_bottom_expected) 59 | 60 | def test_combine(self): 61 | _ = self.df.combine(["foo", "bar"], into="foo", sep="-") 62 | self.assertEqual(self.df, self.expected) 63 | 64 | def test_cross(self): 65 | _ = self.df.cross(postfix=("_a", "_b")) 66 | self.assertEqual(self.df, self.expected) 67 | 68 | def test_dedupe(self): 69 | _ = self.df.dedupe("baz") 70 | self.assertEqual(self.df, self.expected) 71 | 72 | def test_denix(self): 73 | _ = self.df.denix() 74 | self.assertEqual(self.df, self.expected) 75 | 76 | def test_drop(self): 77 | _ = self.df.drop("foo") 78 | self.assertEqual(self.df, self.expected) 79 | 80 | def test_fill(self): 81 | _ = self.df.fill("baz", direction="down") 82 | self.assertEqual(self.df, self.expected) 83 | 84 | def test_filter(self): 85 | _ = self.df.filter(lambda row: row["bar"] > 5) 86 | self.assertEqual(self.df, self.expected) 87 | 88 | def test_gather(self): 89 | _ = self.df.gather() 90 | self.assertEqual(self.df, self.expected) 91 | 92 | def test_group(self): 93 | _ = self.df.group("baz").rollup({"foo": ("foo", rf.stat.mean)}) 94 | self.assertEqual(self.df, self.expected) 95 | 96 | def test_join(self): 97 | df_right = rf.DataFrame({"baz": ["A", "B"], "haz": ["Apple", "Banana"]}) 98 | df_right_expected = rf.DataFrame( 99 | {"baz": ["A", "B"], "haz": ["Apple", "Banana"]} 100 | ) 101 | _ = self.df.join(df_right, on="baz") 102 | 
self.assertEqual(self.df, self.expected) 103 | self.assertEqual(df_right, df_right_expected) 104 | 105 | def test_mutate(self): 106 | _ = self.df.mutate({"foo": lambda row: row["foo"] * 10}) 107 | self.assertEqual(self.df, self.expected) 108 | 109 | def test_pack(self): 110 | _ = self.df.pack("baz", sep="|") 111 | self.assertEqual(self.df, self.expected) 112 | 113 | def test_rank(self): 114 | _ = self.df.rank("bar", into="bar_rank", descending=True) 115 | self.assertEqual(self.df, self.expected) 116 | 117 | def test_rename(self): 118 | _ = self.df.rename({"foo": "oof"}) 119 | self.assertEqual(self.df, self.expected) 120 | 121 | def test_replace(self): 122 | _ = self.df.replace({"baz": {"B": "Banana"}}) 123 | self.assertEqual(self.df, self.expected) 124 | 125 | def test_rollup(self): 126 | _ = self.df.rollup({"bar_mean": ("bar", rf.stat.mean)}) 127 | self.assertEqual(self.df, self.expected) 128 | 129 | def test_sample(self): 130 | _ = self.df.sample(5) 131 | self.assertEqual(self.df, self.expected) 132 | 133 | def test_select(self): 134 | _ = self.df.select(["foo", "bar"]) 135 | self.assertEqual(self.df, self.expected) 136 | 137 | def test_shuffle(self): 138 | _ = self.df.shuffle() 139 | self.assertEqual(self.df, self.expected) 140 | 141 | def test_sort(self): 142 | _ = self.df.sort("bar", descending=True) 143 | self.assertEqual(self.df, self.expected) 144 | 145 | def test_split(self): 146 | _ = self.df.split("jaz", into=["jaz_1", "jaz_2"], sep="::") 147 | self.assertEqual(self.df, self.expected) 148 | 149 | def test_spread(self): 150 | _ = self.df.denix("baz").select(["baz", "foo"]).spread("baz", "foo") 151 | self.assertEqual(self.df, self.expected) 152 | 153 | def test_take(self): 154 | _ = self.df.take(-3) 155 | self.assertEqual(self.df, self.expected) 156 | 157 | def test_take(self): 158 | _ = self.df.unpack("jaz", sep="::") 159 | self.assertEqual(self.df, self.expected) 160 | -------------------------------------------------------------------------------- 
/tests/test_type_hints.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import redframes as rf 4 | 5 | 6 | class TestTypeHints(unittest.TestCase): 7 | def setUp(self): 8 | self.df = rf.DataFrame( 9 | { 10 | "foo": range(10), 11 | "bar": [1, 3.2, 4.5, 2, -1, 30, None, 1.1, 1.1, 9], 12 | "baz": ["A", "A", None, "B", "B", "A", "B", "C", "C", "A"], 13 | "jaz": [ 14 | "1::1", 15 | "2::2", 16 | "3:3", 17 | "4::4", 18 | "5::5", 19 | "6::7", 20 | "7::8", 21 | "8::9", 22 | "9::0", 23 | "0::-1", 24 | ], 25 | "raz": [1, 2, 3, None, None, None, 9, 9, None, None], 26 | } 27 | ) 28 | 29 | def test_io_load_bad_path(self): 30 | with self.assertRaisesRegex(TypeError, "must be str"): 31 | rf.load(1) 32 | 33 | def test_io_load_bad_file_type(self): 34 | with self.assertRaisesRegex(TypeError, "must end in .csv"): 35 | rf.load("example.json") 36 | 37 | def test_io_save_bad_object(self): 38 | with self.assertRaisesRegex(TypeError, "must be DataFrame"): 39 | rf.save(1, "example.csv") 40 | 41 | def test_io_save_bad_path(self): 42 | with self.assertRaisesRegex(TypeError, "must be str"): 43 | rf.save(self.df, 1) 44 | 45 | def test_io_save_bad_format(self): 46 | with self.assertRaisesRegex(TypeError, "must end in .csv"): 47 | rf.save(self.df, "example.json") 48 | 49 | def test_io_unwrap_bad_object(self): 50 | with self.assertRaisesRegex(TypeError, "must be DataFrame"): 51 | rf.unwrap(1) 52 | 53 | def test_io_wrap_bad_object(self): 54 | with self.assertRaisesRegex(TypeError, "must be DataFrame"): 55 | rf.unwrap(1) 56 | 57 | def test_take_bad_rows(self): 58 | with self.assertRaisesRegex(TypeError, "must be int"): 59 | self.df.take("A") 60 | 61 | def test_accumulate_bad_column(self): 62 | with self.assertRaisesRegex(TypeError, "must be str"): 63 | self.df.accumulate(1, "foo") 64 | 65 | def test_accumulate_bad_into_column(self): 66 | with self.assertRaisesRegex(TypeError, "must be str"): 67 | self.df.accumulate("foo", 1) 68 | 69 | 
def test_rank_bad_column(self): 70 | with self.assertRaisesRegex(TypeError, "must be str"): 71 | self.df.rank(1, "bar2") 72 | 73 | def test_rank_bad_into_column(self): 74 | with self.assertRaisesRegex(TypeError, "must be str"): 75 | self.df.rank("bar", 1) 76 | 77 | def test_rank_bad_descending_argument(self): 78 | with self.assertRaisesRegex(TypeError, "must be bool"): 79 | self.df.rank("bar", "bar", descending="bar") 80 | 81 | def test_rollup_bad_over(self): 82 | with self.assertRaisesRegex(TypeError, "must be dict"): 83 | self.df.rollup(1) 84 | 85 | def test_rollup_bad_over_values(self): 86 | with self.assertRaises(TypeError): 87 | self.df.rollup({"bar_mean": 1}) 88 | 89 | def test_init_bad_data(self): 90 | with self.assertRaisesRegex(TypeError, "must be dict | None"): 91 | rf.DataFrame(1) 92 | 93 | def test_eq_bad_rhs_object(self): 94 | self.assertFalse(self.df == 1) 95 | 96 | def test_getitem_bad_key(self): 97 | pass 98 | 99 | def test_append_bad_other(self): 100 | with self.assertRaisesRegex(TypeError, "must be DataFrame"): 101 | self.df.append(1) 102 | 103 | def test_combine_bad_columns(self): 104 | with self.assertRaisesRegex(TypeError, "must be list"): 105 | self.df.combine(1, "foo", sep="-") 106 | 107 | def test_combine_bad_into_column(self): 108 | with self.assertRaisesRegex(TypeError, "must be str"): 109 | self.df.combine(["foo", "bar"], 1, sep="-") 110 | 111 | def test_combine_bad_sep_argument(self): 112 | with self.assertRaisesRegex(TypeError, "must be str"): 113 | self.df.combine(["foo", "bar"], "foo", sep=1) 114 | 115 | def test_combine_bad_drop_argument(self): 116 | with self.assertRaisesRegex(TypeError, "must be bool"): 117 | self.df.combine(["foo", "bar"], "foo", sep=":::", drop="A") 118 | 119 | def test_dedupe_bad_columns(self): 120 | with self.assertRaisesRegex(TypeError, "must be list | str | None"): 121 | self.df.dedupe(1) 122 | 123 | def test_denix_bad_columns(self): 124 | with self.assertRaisesRegex(TypeError, "must be list | str | None"): 
125 | self.df.denix(1) 126 | 127 | def test_drop_bad_columns(self): 128 | with self.assertRaisesRegex(TypeError, "must be list | str | None"): 129 | self.df.drop(1) 130 | 131 | def test_fill_bad_columns(self): 132 | with self.assertRaisesRegex(TypeError, "must be list | str | None"): 133 | self.df.fill(1) 134 | 135 | def test_fill_bad_direction(self): 136 | with self.assertRaisesRegex(ValueError, "must be one of {'down', 'up'}"): 137 | self.df.fill("bar", direction="sideways") 138 | 139 | def test_fill_bad_constant_and_direction(self): 140 | with self.assertRaisesRegex( 141 | ValueError, "either direction OR constant must not be None" 142 | ): 143 | self.df.fill("bar") 144 | 145 | def test_fill_bad_no_constant_nor_direction(self): 146 | with self.assertRaisesRegex( 147 | ValueError, "either direction OR constant must be None" 148 | ): 149 | self.df.fill("bar", direction="down", constant="X") 150 | 151 | def test_filter_bad_func(self): 152 | with self.assertRaisesRegex(TypeError, "must be Func"): 153 | self.df.filter(1) 154 | 155 | def test_gather_bad_columns(self): 156 | with self.assertRaisesRegex(TypeError, "must be list | None"): 157 | self.df.gather(1) 158 | 159 | def test_gather_bad_beside(self): 160 | with self.assertRaisesRegex(TypeError, "must be str | list | None"): 161 | self.df.gather(beside=1) 162 | 163 | def test_gather_bad_into_column(self): 164 | with self.assertRaisesRegex(TypeError, "must be tuple"): 165 | self.df.gather(["foo", "bar"], into=1) 166 | 167 | def test_gather_bad_into_tuple(self): 168 | with self.assertRaisesRegex(TypeError, "must be tuple*"): 169 | self.df.gather(into=("one", "two", "three")) 170 | 171 | def test_gather_bad_both_not_none(self): 172 | with self.assertRaisesRegex(ValueError, "columns OR beside must be None"): 173 | self.df.gather(columns=["foo", "bar"], beside=["baz"]) 174 | 175 | def test_group_bad_by_columns(self): 176 | with self.assertRaisesRegex(TypeError, "must be list | str"): 177 | self.df.group(1) 178 | 179 | 
def test_join_bad_rhs_object(self): 180 | with self.assertRaisesRegex(TypeError, "must be DataFrame"): 181 | self.df.join(1, on="baz") 182 | 183 | def test_join_bad_on_type(self): 184 | rhs = rf.DataFrame() 185 | with self.assertRaisesRegex(TypeError, "must be list | str"): 186 | self.df.join(rhs, on=1) 187 | 188 | def test_join_bad_how_argument(self): 189 | rhs = rf.DataFrame() 190 | message = ( 191 | "on argument is invalid, must be one of {'left', 'right', 'inner', 'full'}" 192 | ) 193 | with self.assertRaisesRegex(ValueError, message): 194 | self.df.join(rhs, on="baz", how="inside") 195 | 196 | def test_mutate_bad_over(self): 197 | with self.assertRaisesRegex(TypeError, "must be dict"): 198 | self.df.mutate(1) 199 | 200 | def test_pack_bad_column(self): 201 | with self.assertRaisesRegex(TypeError, "must be str"): 202 | self.df.pack(1, sep="|") 203 | 204 | def test_pack_bad_sep(self): 205 | with self.assertRaisesRegex(TypeError, "must be str"): 206 | self.df.pack("baz", sep=1) 207 | 208 | def test_rename_bad_columns(self): 209 | with self.assertRaisesRegex(TypeError, "must be dict"): 210 | self.df.rename(1) 211 | 212 | def test_rename_bad_columns_values(self): 213 | with self.assertRaisesRegex(TypeError, "must be str"): 214 | self.df.rename({"foo": 1}) 215 | 216 | def test_replace_bad_over(self): 217 | with self.assertRaisesRegex(TypeError, "must be dict"): 218 | self.df.replace(1) 219 | 220 | def test_sample_bad_rows(self): 221 | with self.assertRaisesRegex(TypeError, "must be int | float"): 222 | self.df.sample("A") 223 | 224 | def test_select_bad_columns(self): 225 | with self.assertRaisesRegex(TypeError, "must be list | str"): 226 | self.df.select(1) 227 | 228 | def test_shuffle(self): 229 | pass 230 | 231 | def test_sort_bad_columns(self): 232 | with self.assertRaisesRegex(TypeError, "must be list | str"): 233 | self.df.sort(1) 234 | 235 | def test_sort_bad_descending_argument(self): 236 | with self.assertRaisesRegex(TypeError, "must be bool"): 237 | 
self.df.sort("bar", descending="A") 238 | 239 | def test_split_bad_column(self): 240 | with self.assertRaisesRegex(TypeError, "must be str"): 241 | self.df.split(1, into=["jaz1", "jaz2"], sep="::") 242 | 243 | def test_split_bad_into_column(self): 244 | with self.assertRaisesRegex(TypeError, "must be list"): 245 | self.df.split("jaz", into=1, sep="::") 246 | 247 | def test_split_bad_sep_argument(self): 248 | with self.assertRaisesRegex(TypeError, "must be str"): 249 | self.df.split("jaz", into=["jaz1", "jaz2"], sep=1) 250 | 251 | def test_split_bad_drop_argument(self): 252 | with self.assertRaisesRegex(TypeError, "must be bool"): 253 | self.df.split("jaz", into=["jaz1", "jaz2"], sep="::", drop="A") 254 | 255 | def test_spread_bad_column(self): 256 | with self.assertRaisesRegex(TypeError, "must be str"): 257 | self.df.spread(1, using="bar") 258 | 259 | def test_spread_bad_using_column(self): 260 | with self.assertRaisesRegex(TypeError, "must be str"): 261 | self.df.spread("foo", using=1) 262 | 263 | def test_unpack_bad_column(self): 264 | with self.assertRaisesRegex(TypeError, "must be str"): 265 | self.df.unpack(1, sep="|") 266 | 267 | def test_unpack_bad_sep(self): 268 | with self.assertRaisesRegex(TypeError, "must be str"): 269 | self.df.unpack("jaz", sep=1) 270 | --------------------------------------------------------------------------------