├── .github ├── FUNDING.yml └── workflows │ └── python-publish.yml ├── .gitignore ├── CHANGELOG ├── LICENSE ├── Makefile ├── README.md ├── TODO ├── images ├── bars.png └── redframes.png ├── mypy.ini ├── redframes ├── __init__.py ├── checks.py ├── core.py ├── io │ ├── __init__.py │ ├── convert.py │ ├── load.py │ └── save.py ├── stat.py ├── types.py ├── verbs │ ├── __init__.py │ ├── accumulate.py │ ├── append.py │ ├── combine.py │ ├── cross.py │ ├── dedupe.py │ ├── denix.py │ ├── drop.py │ ├── fill.py │ ├── filter.py │ ├── gather.py │ ├── group.py │ ├── join.py │ ├── mutate.py │ ├── pack.py │ ├── rank.py │ ├── rename.py │ ├── replace.py │ ├── rollup.py │ ├── sample.py │ ├── select.py │ ├── shuffle.py │ ├── sort.py │ ├── split.py │ ├── spread.py │ ├── take.py │ └── unpack.py └── version.py ├── setup.py └── tests ├── __init__.py ├── test_deprecations.py ├── test_docstrings.py ├── test_dupe_columns.py ├── test_index.py ├── test_interchange.py ├── test_io.py ├── test_ladybugs.py ├── test_readme.py ├── test_side_effects.py └── test_type_hints.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [maxhumber] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 13 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 14 | 
-------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload to PyPI 5 | on: 6 | release: 7 | types: [published] 8 | permissions: 9 | contents: read 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Set up Python 16 | uses: actions/setup-python@v3 17 | with: 18 | python-version: '3.x' 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install -e ".[test]" 23 | - name: Run tests 24 | run: python -m unittest 25 | - name: Install build dependencies 26 | run: pip install build 27 | - name: Build package 28 | run: python -m build 29 | - name: Publish package 30 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 31 | with: 32 | user: __token__ 33 | password: ${{ secrets.PYPI_API_TOKEN }} 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # custom 2 | playground 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before 
PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | - 1.4.1 2 | - BUMP: support for pandas 2.0+ 3 | - NEW: support for Python 3.8 4 | - 1.4 5 | - NEW: pandas dependency pinned to below 2.0 6 | - NEW: `pack` verb 7 | - NEW: `unpack` verb 8 | - NEW: `group` + `gather` compatibility 9 | - NEW: `make loc` (for development) 10 | - IMPROVED: README Quickstart + "Verb Table" 11 | - IMPROVED: GroupedFrame `__repr__` 12 | - IMPROVED: `group` performance optimizations 13 | - BUGFIX: `rf.wrap` now properly throws an error on "MultiIndex" columns 14 | - BUGFIX: sort order is now retained in `group` operations 15 | - BUGFIX: some `TypeError`s have been changed to `ValueError`s 16 | - DEPRECATED: `gather(beside=...)` ...whoops! please use `group` + `gather`! 17 | - 1.3 18 | - NEW: `gather(beside=...)` argument! 19 | - IMPROVED: `sample` errors are more explicit 20 | - 1.2 21 | - NEW: `cross` join verb! 
22 | - NEW: `join(..., postfix=("_lhs", "_rhs"))` argument 23 | - NEW: `memory` property to check DataFrame memory footprint 24 | - NEW: Makefile (for development) 25 | - BUGFIX: `combine` drop=True argument now works as intended 26 | - BUGFIX: `summarize` deprecation warning now displays properly 27 | - BREAKING: `combine` now explicitly requires a `sep` argument 28 | - 1.1 29 | - BUMP: pandas 1.5+ 30 | - NEW: `__dataframe__` interchange format support 31 | - NEW: `rollup` verb (fka `summarize`) 32 | - NEW: `__version__` 33 | - BUGFIX: `select` verb now requires unique column keys 34 | - BUGFIX: `types` property is now more robust to mixed types within a column 35 | - DEPRECATED: `summarize` (please use `rollup`!) 36 | 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2022, Max Humber 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | python -m unittest 3 | 4 | format: 5 | isort redframes tests 6 | black redframes tests 7 | 8 | types: 9 | mypy redframes 10 | pyright redframes 11 | 12 | loc: 13 | find redframes -name '*.py' | xargs wc -l | sort -nr 14 | find tests -name '*.py' | xargs wc -l | sort -nr 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
11 | 12 | 13 | 14 | ### About 15 | 16 | **redframes** (**re**ctangular **d**ata **frames**) is a general purpose data manipulation library that prioritizes syntax, simplicity, and speed (to a solution). Importantly, the library is fully interoperable with [pandas](https://github.com/pandas-dev/pandas), compatible with [scikit-learn](https://github.com/scikit-learn/scikit-learn), and works great with [matplotlib](https://github.com/matplotlib/matplotlib). 17 | 18 | 19 | 20 | ### Install & Import 21 | 22 | ```sh 23 | pip install redframes 24 | ``` 25 | 26 | ```python 27 | import redframes as rf 28 | ``` 29 | 30 | 31 | 32 | ### Quickstart 33 | 34 | Copy-and-paste this to get started: 35 | 36 | ```python 37 | import redframes as rf 38 | 39 | df = rf.DataFrame({ 40 | 'bear': ['Brown bear', 'Polar bear', 'Asian black bear', 'American black bear', 'Sun bear', 'Sloth bear', 'Spectacled bear', 'Giant panda'], 41 | 'genus': ['Ursus', 'Ursus', 'Ursus', 'Ursus', 'Helarctos', 'Melursus', 'Tremarctos', 'Ailuropoda'], 42 | 'weight (male, lbs)': ['300-860', '880-1320', '220-440', '125-500', '60-150', '175-310', '220-340', '190-275'], 43 | 'weight (female, lbs)': ['205-455', '330-550', '110-275', '90-300', '45-90', '120-210', '140-180', '155-220'] 44 | }) 45 | 46 | # | bear | genus | weight (male, lbs) | weight (female, lbs) | 47 | # |:--------------------|:-----------|:---------------------|:-----------------------| 48 | # | Brown bear | Ursus | 300-860 | 205-455 | 49 | # | Polar bear | Ursus | 880-1320 | 330-550 | 50 | # | Asian black bear | Ursus | 220-440 | 110-275 | 51 | # | American black bear | Ursus | 125-500 | 90-300 | 52 | # | Sun bear | Helarctos | 60-150 | 45-90 | 53 | # | Sloth bear | Melursus | 175-310 | 120-210 | 54 | # | Spectacled bear | Tremarctos | 220-340 | 140-180 | 55 | # | Giant panda | Ailuropoda | 190-275 | 155-220 | 56 | 57 | ( 58 | df 59 | .rename({"weight (male, lbs)": "male", "weight (female, lbs)": "female"}) 60 | .gather(["male", "female"], 
into=("sex", "weight")) 61 | .split("weight", into=["min", "max"], sep="-") 62 | .gather(["min", "max"], into=("stat", "weight")) 63 | .mutate({"weight": lambda row: float(row["weight"])}) 64 | .group(["genus", "sex"]) 65 | .rollup({"weight": ("weight", rf.stat.mean)}) 66 | .spread("sex", using="weight") 67 | .mutate({"dimorphism": lambda row: round(row["male"] / row["female"], 2)}) 68 | .drop(["male", "female"]) 69 | .sort("dimorphism", descending=True) 70 | ) 71 | 72 | # | genus | dimorphism | 73 | # |:-----------|-------------:| 74 | # | Ursus | 2.01 | 75 | # | Tremarctos | 1.75 | 76 | # | Helarctos | 1.56 | 77 | # | Melursus | 1.47 | 78 | # | Ailuropoda | 1.24 | 79 | ``` 80 | 81 | 82 | 83 | For comparison, here's the equivalent pandas: 84 | 85 | ```python 86 | import pandas as pd 87 | 88 | # df = pd.DataFrame({...}) 89 | 90 | df = df.rename(columns={"weight (male, lbs)": "male", "weight (female, lbs)": "female"}) 91 | df = pd.melt(df, id_vars=['bear', 'genus'], value_vars=['male', 'female'], var_name='sex', value_name='weight') 92 | df[["min", "max"]] = df["weight"].str.split("-", expand=True) 93 | df = df.drop("weight", axis=1) 94 | df = pd.melt(df, id_vars=['bear', 'genus', 'sex'], value_vars=['min', 'max'], var_name='stat', value_name='weight') 95 | df['weight'] = df["weight"].astype('float') 96 | df = df.groupby(["genus", "sex"])["weight"].mean() 97 | df = df.reset_index() 98 | df = pd.pivot_table(df, index=['genus'], columns=['sex'], values='weight') 99 | df = df.reset_index() 100 | df = df.rename_axis(None, axis=1) 101 | df["dimorphism"] = round(df["male"] / df["female"], 2) 102 | df = df.drop(["female", "male"], axis=1) 103 | df = df.sort_values("dimorphism", ascending=False) 104 | df = df.reset_index(drop=True) 105 | 106 | # 🤮 107 | ``` 108 | 109 | 110 | 111 | ### IO 112 | 113 | Save, load, and convert `rf.DataFrame` objects: 114 | 115 | ```python 116 | # save .csv 117 | rf.save(df, "bears.csv") 118 | 119 | # load .csv 120 | df = rf.load("bears.csv") 
121 | 122 | # convert redframes → pandas 123 | pandas_df = rf.unwrap(df) 124 | 125 | # convert pandas → redframes 126 | df = rf.wrap(pandas_df) 127 | ``` 128 | 129 | 130 | 131 | ### Verbs 132 | 133 | Verbs are [pure](https://en.wikipedia.org/wiki/Pure_function) and "chain-able" methods that manipulate `rf.DataFrame` objects. Here is the complete list (see *docstrings* for examples and more details): 134 | 135 | | Verb | Description | 136 | | ------------------------------------------------ | ------------------------------------------------------------ | 137 | | `accumulate`‡ | Run a cumulative sum over a column | 138 | | `append` | Append rows from another DataFrame | 139 | | `combine` | Combine multiple columns into a single column (opposite of `split`) | 140 | | `cross` | Cross join columns from another DataFrame | 141 | | `dedupe` | Remove duplicate rows | 142 | | [`denix`](https://www.dictionary.com/browse/nix) | Remove rows with missing values | 143 | | `drop` | Drop entire columns (opposite of `select`) | 144 | | `fill` | Fill missing values "down", "up", or with a constant | 145 | | `filter` | Keep rows matching specific conditions | 146 | | `gather`‡ | Gather columns into rows (opposite of `spread`) | 147 | | `group` | Prepare groups for compatible verbs‡ | 148 | | `join` | Join columns from another DataFrame | 149 | | `mutate` | Create a new, or overwrite an existing column | 150 | | `pack`‡ | Collate and concatenate row values for a target column (opposite of `unpack`) | 151 | | `rank`‡ | Rank order values in a column | 152 | | `rename` | Rename column keys | 153 | | `replace` | Replace matching values within columns | 154 | | `rollup`‡ | Apply summary functions and/or statistics to target columns | 155 | | `sample` | Randomly sample any number of rows | 156 | | `select` | Select specific columns (opposite of `drop`) | 157 | | `shuffle` | Shuffle the order of all rows | 158 | | `sort` | Sort rows by specific columns | 159 | | `split` | Split a single 
column into multiple columns (opposite of `combine`) | 160 | | `spread` | Spread rows into columns (opposite of `gather`) | 161 | | `take`‡ | Take any number of rows (from the top/bottom) | 162 | | `unpack` | "Explode" concatenated row values into multiple rows (opposite of `pack`) | 163 | 164 | 165 | 166 | ### Properties 167 | 168 | In addition to all of the verbs there are several properties attached to each `DataFrame` object: 169 | 170 | ```python 171 | df["genus"] 172 | # ['Ursus', 'Ursus', 'Ursus', 'Ursus', 'Helarctos', 'Melursus', 'Tremarctos', 'Ailuropoda'] 173 | 174 | df.columns 175 | # ['bear', 'genus', 'weight (male, lbs)', 'weight (female, lbs)'] 176 | 177 | df.dimensions 178 | # {'rows': 8, 'columns': 4} 179 | 180 | df.empty 181 | # False 182 | 183 | df.memory 184 | # '2 KB' 185 | 186 | df.types 187 | # {'bear': object, 'genus': object, 'weight (male, lbs)': object, 'weight (female, lbs)': object} 188 | ``` 189 | 190 | 191 | 192 | ### matplotlib 193 | 194 | `rf.DataFrame` objects integrate seamlessly with `matplotlib`: 195 | 196 | ```python 197 | import redframes as rf 198 | import matplotlib.pyplot as plt 199 | 200 | football = rf.DataFrame({ 201 | 'position': ['TE', 'K', 'RB', 'WR', 'QB'], 202 | 'avp': [116.98, 131.15, 180, 222.22, 272.91] 203 | }) 204 | 205 | df = ( 206 | football 207 | .mutate({"color": lambda row: row["position"] in ["WR", "RB"]}) 208 | .replace({"color": {False: "orange", True: "red"}}) 209 | ) 210 | 211 | plt.barh(df["position"], df["avp"], color=df["color"]); 212 | ``` 213 | 214 |
215 |
216 |
217 |
218 | ### scikit-learn
219 |
220 | `rf.DataFrame` objects are fully compatible with `sklearn` functions, estimators, and transformers:
221 |
222 | ```python
223 | import redframes as rf
224 | from sklearn.model_selection import train_test_split
225 | from sklearn.linear_model import LinearRegression
226 |
227 | df = rf.DataFrame({
228 | "touchdowns": [15, 19, 5, 7, 9, 10, 12, 22, 16, 10],
229 | "age": [21, 22, 21, 24, 26, 28, 30, 35, 28, 21],
230 | "mvp": [1, 1, 0, 0, 0, 0, 0, 1, 0, 0]
231 | })
232 |
233 | target = "touchdowns"
234 | y = df[target]
235 | X = df.drop(target)
236 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
237 |
238 | model = LinearRegression()
239 | model.fit(X_train, y_train)
240 | model.score(X_test, y_test)
241 | # 0.5083194901655527
242 |
243 | print(X_train.take(1))
244 | # rf.DataFrame({'age': [21], 'mvp': [0]})
245 |
246 | X_new = rf.DataFrame({'age': [22], 'mvp': [1]})
247 | model.predict(X_new)
248 | # array([19.])
249 | ```
250 |
--------------------------------------------------------------------------------
/TODO:
--------------------------------------------------------------------------------
1 | 1.5
2 | - docstrings examples to bears
3 | - pd.DataFrame().to_redframes()
4 | - to_pandas / (to_csv / to_dict)
5 | - rf.read_csv() / rf.from_csv()
6 | - deprecate io functions
7 | - replace `__str__`
8 | - from_dict, from_pandas, from_csv, from_excel?
9 |
10 | 1.6
11 | - reorder/move columns to front / end / before / after
12 | - tally verb
13 | - complete verb (tidyr)
14 |
15 | 1.7
16 | - warning on multiple columns in mutate (override) or fix?
17 | - expose @extension
18 |
19 | 2.0
20 | - explicit * keyword arguments
21 | - remove deprecated functions/methods
22 |
23 | Later
24 | - complete verb (tidyr)
25 | - hide/protect/private methods/attributes
26 | - cheatsheet & tutorial
27 |
28 | Maybe
29 | - slice verb
30 | - log verb (Untitled12)
31 | - builtin datasets
32 | - vectorized mutate support (`.assign` mutate(..., vectorized=True))?
33 | - polars/arrow backend
34 | - class RedList(list): ...?
35 | - speedtests
36 |
--------------------------------------------------------------------------------
/images/bars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maxhumber/redframes/6e3f1226358ad4e67f4343cbc4b1ee4b63475034/images/bars.png
--------------------------------------------------------------------------------
/images/redframes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maxhumber/redframes/6e3f1226358ad4e67f4343cbc4b1ee4b63475034/images/redframes.png
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | python_version = 3.9
3 | warn_return_any = True
4 | warn_unused_configs = True
5 |
6 | [mypy-pandas.*]
7 | ignore_missing_imports = True
--------------------------------------------------------------------------------
/redframes/__init__.py:
--------------------------------------------------------------------------------
1 | from . import stat
2 | from .core import DataFrame
3 | from .io import load, save, unwrap, wrap
4 | from .version import __version__
5 |
--------------------------------------------------------------------------------
/redframes/checks.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from .types import (
4 | Any,
5 | Columns,
6 | LazyColumns,
7 | PandasDataFrame,
8 | PandasIndex,
9 | PandasRangeIndex,
10 | )
11 |
12 |
13 | def _check_type(argument: Any, against: type | set[type | None]) -> None:
14 | if isinstance(against, set):
15 | if len(against) == 0:
16 | against = {against} # type: ignore
17 | if not isinstance(against, set):
18 | against = {against}
19 | optional = None in against
20 | just_types = against.difference({None})
21 | checks = [isinstance(argument, t) for t in just_types] # type: ignore
22 | if optional:
23 | checks += [argument == None]
24 | if not any(checks):
25 | str_types = " | ".join([t.__name__ for t in just_types]) # type: ignore
26 | if optional:
27 | str_types += " | None"
28 | raise TypeError(f"must be {str_types}")
29 |
30 |
31 | def _check_values(values: Any, type: type) -> None:
32 | if not all(isinstance(value, type) for value in values):
33 | raise TypeError(f"must be {type.__name__}")
34 |
35 |
36 | def _check_keys(columns: LazyColumns | None, against: Columns | PandasIndex) -> None:
37 | if isinstance(columns, str):
38 | columns = [columns]
39 | columns = [] if (columns == None) else columns
40 | bad_keys = set(columns).difference(against) # type: ignore
41 | if bad_keys:
42 | if len(bad_keys) == 1:
43 | raise KeyError(f"invalid key {bad_keys}")
44 | else:
45 | raise KeyError(f"invalid keys {bad_keys}")
46 |
47 |
def _check_index(df: PandasDataFrame) -> None:
    """Raise an IndexError unless `df` has a default index.

    A valid redframes index is an unnamed RangeIndex starting at 0 with
    step 1 (i.e. exactly what a freshly-constructed pandas DataFrame has).
    """
    # `is not None` (not `== None`): Index.name can be any hashable object
    if df.index.name is not None:
        raise IndexError("must be unnamed")
    if not isinstance(df.index, PandasRangeIndex):
        raise IndexError("must be range")
    if df.index.start != 0:
        raise IndexError("must start at 0")
    if df.index.step != 1:
        raise IndexError("must step by 1")
57 |
58 |
def _check_columns(df: PandasDataFrame) -> None:
    """Raise a KeyError unless `df` has flat, duplicate-free columns."""
    # exact type comparison (not isinstance) is deliberate: a MultiIndex is
    # a subclass of Index and must be rejected as "not flat" here
    if type(df.columns) != PandasIndex:
        raise KeyError("must be flat")
    if df.columns.has_duplicates:
        raise KeyError("must not contain duplicate keys")
64 |
65 |
66 | def _check_file(path: str) -> None:
67 | if not path.endswith(".csv"):
68 | raise TypeError("must end in .csv")
69 |
--------------------------------------------------------------------------------
/redframes/core.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pprint
4 | import warnings
5 |
6 | from .checks import _check_type
7 | from .types import (
8 | Any,
9 | Column,
10 | Columns,
11 | DateTime,
12 | Direction,
13 | Func,
14 | Join,
15 | LazyColumns,
16 | NewColumn,
17 | NewValue,
18 | NumpyArray,
19 | NumpyType,
20 | OldColumn,
21 | OldValue,
22 | PandasDataFrame,
23 | PandasGroupedFrame,
24 | Value,
25 | Values,
26 | )
27 | from .verbs import (
28 | accumulate,
29 | append,
30 | combine,
31 | cross,
32 | dedupe,
33 | denix,
34 | drop,
35 | fill,
36 | filter,
37 | gather,
38 | group,
39 | join,
40 | mutate,
41 | pack,
42 | rank,
43 | rename,
44 | replace,
45 | rollup,
46 | sample,
47 | select,
48 | shuffle,
49 | sort,
50 | split,
51 | spread,
52 | take,
53 | unpack,
54 | )
55 |
56 |
def _wrap(data: PandasDataFrame) -> DataFrame:
    """Unsafe version of redframes.io.wrap()"""
    # skips the validation checks that the public wrap() performs
    frame = DataFrame()
    frame._data = data
    return frame
62 |
63 |
class _TakeMixin:
    """Shared base for DataFrame and GroupedFrame: holds the wrapped pandas
    object and provides the `take` verb."""

    def __init__(self, data: PandasDataFrame | PandasGroupedFrame) -> None:
        # underlying pandas (possibly grouped) frame; all verbs delegate to it
        self._data = data

    def take(self, rows: int, **kwargs) -> DataFrame:
        """Take any number of rows (from the top/bottom)

        Examples:

        ```python
        df = rf.DataFrame({"foo": range(10)})
        ```
        | foo |
        |------:|
        | 0 |
        | 1 |
        | 2 |
        | 3 |
        | 4 |
        | 5 |
        | 6 |
        | 7 |
        | 8 |
        | 9 |

        From "head":

        ```python
        df.take(1)
        ```
        | foo |
        |------:|
        | 0 |

        From "tail":

        ```python
        df.take(-2)
        ```
        | foo |
        |------:|
        | 8 |
        | 9 |
        """
        # delegates to the `take` verb; a negative `rows` selects from the tail
        return _wrap(take(self._data, rows, **kwargs))
109 |
110 |
class _InterchangeMixin(_TakeMixin):
    """Interop surface delegated to the underlying pandas DataFrame:
    numpy conversion, the dataframe interchange protocol, `len()`, and
    positional `iloc` access."""

    def __init__(self, data: PandasDataFrame) -> None:
        self._data = data

    def __array__(self) -> NumpyArray:
        # lets np.asarray(df) / np.array(df) work directly on a DataFrame
        return self._data.__array__()

    def __dataframe__(self, nan_as_null=False, allow_copy=True) -> "PandasDataFrameXchg":  # type: ignore
        # dataframe interchange protocol support (added with pandas 1.5+)
        return self._data.__dataframe__(nan_as_null, allow_copy)

    def __len__(self) -> int:
        # number of rows
        return self._data.__len__()

    @property
    def iloc(self):
        # raw pandas positional indexer; presumably needed for sklearn
        # train/test splitting — TODO confirm against tests
        return self._data.iloc
127 |
128 |
class _CommonMixin(_TakeMixin):
    """Verbs available on both DataFrame and GroupedFrame objects:
    `accumulate`, `gather`, `pack`, `rank`, `rollup` (and the deprecated
    `summarize` alias)."""

    def __init__(self, data: PandasDataFrame | PandasGroupedFrame) -> None:
        # wrapped pandas (possibly grouped) frame; every verb below is pure:
        # it returns a new DataFrame and never mutates self._data
        self._data = data

    def accumulate(self, column: Column, into: Column) -> DataFrame:
        """Run a cumulative sum over a column

        Example:

        ```python
        df = rf.DataFrame({"foo": [1, 2, 3, 4]})
        ```
        | foo |
        |------:|
        | 1 |
        | 2 |
        | 3 |
        | 4 |

        ```python
        df.accumulate("foo", into="cumsum")
        ```
        | foo | cumsum |
        |------:|---------:|
        | 1 | 1 |
        | 2 | 3 |
        | 3 | 6 |
        | 4 | 10 |
        """
        return _wrap(accumulate(self._data, column, into))

    def gather(
        self,
        columns: Columns | None = None,
        beside: LazyColumns | None = None,
        into: tuple[Column, Column] = ("variable", "value"),
    ) -> DataFrame:
        """Gather columns into rows (opposite of spread)

        Examples:

        ```python
        df = rf.DataFrame({
            "foo": [1, 2, 1, 2],
            "bar": ["A", "B", "C", "D"],
            "baz": ["!", "@", "#", "$"],
            "jaz": range(4)
        })
        ```
        | foo | bar | baz | jaz |
        |------:|:------|:------|------:|
        | 1 | A | ! | 0 |
        | 2 | B | @ | 1 |
        | 1 | C | # | 2 |
        | 2 | D | $ | 3 |

        All columns:

        ```python
        df.gather()
        ```
        | variable | value |
        |:-----------|:--------|
        | foo | 1 |
        | foo | 2 |
        | foo | 1 |
        | foo | 2 |
        | bar | A |
        | bar | B |
        | bar | C |
        | bar | D |
        | baz | ! |
        | baz | @ |
        | baz | # |
        | baz | $ |
        | jaz | 0 |
        | jaz | 1 |
        | jaz | 2 |
        | jaz | 3 |

        Multiple columns:

        ```python
        df.gather(["foo", "bar"], into=("var", "val"))
        ```
        | baz | jaz | var | val |
        |:------|------:|:------|:------|
        | ! | 0 | foo | 1 |
        | @ | 1 | foo | 2 |
        | # | 2 | foo | 1 |
        | $ | 3 | foo | 2 |
        | ! | 0 | bar | A |
        | @ | 1 | bar | B |
        | # | 2 | bar | C |
        | $ | 3 | bar | D |

        All columns beside:

        ```python
        df.group(["foo", "bar"]).gather(into=("variable", "value"))
        ```
        | foo | bar | variable | value |
        |------:|:------|:-----------|:--------|
        | 1 | A | baz | ! |
        | 2 | B | baz | @ |
        | 1 | C | baz | # |
        | 2 | D | baz | $ |
        | 1 | A | jaz | 0 |
        | 2 | B | jaz | 1 |
        | 1 | C | jaz | 2 |
        | 2 | D | jaz | 3 |
        """
        # note: `beside` is deprecated at the call site (see CHANGELOG 1.4);
        # grouped "gather everything else" goes through group() + gather()
        return _wrap(gather(self._data, columns, beside, into))

    def pack(self, column: Column, sep: str) -> DataFrame:
        """Collate and concatenate row values for a target column (opposite of unpack)

        Examples:

        ```python
        df = rf.DataFrame({
            "foo": ["A", "A", "B", "A", "B", "C"],
            "bar": [1, 2, 3, 4, 5, 6]
        })
        ```
        | foo | bar |
        |:------|------:|
        | A | 1 |
        | A | 2 |
        | B | 3 |
        | A | 4 |
        | B | 5 |
        | C | 6 |

        Pack all rows:

        ```python
        df.pack("foo", sep="+")
        ```
        | foo |
        |:------------|
        | A+A+B+A+B+C |

        Pack rows by Group:

        ```python
        df.group("foo").pack("bar", sep="|")
        ```
        | foo | bar |
        |:------|:------|
        | A | 1|2|4 |
        | B | 3|5 |
        | C | 6 |
        """
        return _wrap(pack(self._data, column, sep))

    def rank(
        self,
        column: Column,
        into: Column,
        descending: bool = False,
    ) -> DataFrame:
        """Rank order values in a column

        Example:

        ```python
        df = rf.DataFrame({"foo": [2, 3, 3, 99, 1000, 1, -6, 4]})
        ```
        | foo |
        |------:|
        | 2 |
        | 3 |
        | 3 |
        | 99 |
        | 1000 |
        | 1 |
        | -6 |
        | 4 |

        ```python
        df.rank("foo", into="rank", descending=True)
        ```
        | foo | rank |
        |------:|-------:|
        | 2 | 5 |
        | 3 | 4 |
        | 3 | 4 |
        | 99 | 2 |
        | 1000 | 1 |
        | 1 | 6 |
        | -6 | 7 |
        | 4 | 3 |
        """
        return _wrap(rank(self._data, column, into, descending))

    def rollup(self, over: dict[Column, tuple[Column, Func]]) -> DataFrame:
        """Apply summary functions and/or statistics to target columns

        Example:

        ```python
        df = rf.DataFrame({"foo": [1, 2, 3, 4, 5], "bar": [99, 100, 1, -5, 2]})
        ```
        | foo | bar |
        |------:|------:|
        | 1 | 99 |
        | 2 | 100 |
        | 3 | 1 |
        | 4 | -5 |
        | 5 | 2 |

        ```python
        df.rollup({
            "fcount": ("foo", rf.stat.count),
            "fmean": ("foo", rf.stat.mean),
            "fsum": ("foo", rf.stat.sum),
            "fmax": ("foo", rf.stat.max),
            "bmedian": ("bar", rf.stat.median),
            "bmin": ("bar", rf.stat.min),
            "bstd": ("bar", rf.stat.std)
        })
        ```
        | fcount | fmean | fsum | fmax | bmedian | bmin | bstd |
        |---------:|--------:|-------:|-------:|----------:|-------:|-------:|
        | 5 | 3 | 15 | 5 | 2 | -5 | 54.93 |
        """
        return _wrap(rollup(self._data, over))

    def summarize(self, over: dict[Column, tuple[Column, Func]]) -> DataFrame:
        # deprecated alias for `rollup` (renamed in 1.1, see CHANGELOG);
        # warns with FutureWarning, then delegates unchanged
        message = "Marked for removal, please use `rollup` instead"
        warnings.warn(message, FutureWarning)
        return self.rollup(over)
362 |
363 |
class GroupedFrame(_CommonMixin):
    """GroupedFrame compatible with: `accumulate`, `gather`, `pack`, `rank`, `rollup`, `take`"""

    def __repr__(self) -> str:
        # display the underlying (ungrouped) pandas frame
        return repr(self._data.obj)  # type: ignore

    def _repr_html_(self) -> str:
        # notebook-friendly rendering of the underlying pandas frame
        return self._data.obj.to_html(index=True)  # type: ignore
372 |
373 |
374 | class DataFrame(_CommonMixin, _InterchangeMixin):
375 | def __init__(self, data: dict[Column, Values] | None = None) -> None:
376 | """Initialize a DataFrame with a standard dictionary
377 |
378 | Example:
379 |
380 | ```python
381 | df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]})
382 | ```
383 | | foo | bar |
384 | |------:|:------|
385 | | 1 | A |
386 | | 2 | B |
387 | """
388 | _check_type(data, {dict, None})
389 | if not data:
390 | self._data = PandasDataFrame()
391 | if isinstance(data, dict):
392 | self._data = PandasDataFrame(data)
393 |
394 | def __eq__(self, rhs: Any) -> bool:
395 | """Check if two DataFrames are equal to each other
396 |
397 | Example:
398 |
399 | ```python
400 | adf = rf.DataFrame({"foo": [1]})
401 | bdf = rf.DataFrame({"bar": [1]})
402 | cdf = rf.DataFrame({"foo": [1]})
403 | print(adf == bdf)
404 | print(adf == cdf)
405 | # False
406 | # True
407 | ```
408 | """
409 | if not isinstance(rhs, DataFrame):
410 | return False
411 | return self._data.equals(rhs._data)
412 |
413 | def __getitem__(self, key: Column) -> Values:
414 | """Retrive values (as a python list) from a specified column
415 |
416 | Example:
417 |
418 | ```python
419 | df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]})
420 | df["foo"]
421 | # [1, 2]
422 | ```
423 | """
424 | return list(self._data[key])
425 |
426 | def __repr__(self) -> str:
427 | return self._data.__repr__()
428 |
    def _repr_html_(self) -> str:
        # Jupyter rich-display hook: render the frame as an HTML table (with index)
        return self._data.to_html(index=True)
431 |
432 | def __str__(self) -> str:
433 | """Return string constructor (for copy-and-pasting)
434 |
435 | Example:
436 |
437 | ```python
438 | df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]})
439 | str(df)
440 | # "rf.DataFrame({'foo': [1, 2], 'bar': ['A', 'B']})"
441 | ```
442 | """
443 | data = self._data.to_dict(orient="list")
444 | string = pprint.pformat(data, indent=4, sort_dicts=False, compact=True)
445 | if "\n" in string:
446 | string = " " + string[1:-1]
447 | string = f"rf.DataFrame({{\n{string}\n}})"
448 | else:
449 | string = f"rf.DataFrame({string})"
450 | return string
451 |
452 | @property
453 | def columns(self) -> Columns:
454 | """Inspect column keys (names)
455 |
456 | Example:
457 |
458 | ```python
459 | df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"], "baz": [True, False]})
460 | df.columns
461 | # ['foo', 'bar', 'baz']
462 | ```
463 | """
464 | return list(self._data.columns)
465 |
466 | @property
467 | def dimensions(self) -> dict[str, int]:
468 | """Inspect DataFrame shape
469 |
470 | Example:
471 |
472 | ```python
473 | df = rf.DataFrame({"foo": range(10), "bar": range(10, 20)})
474 | df.dimensions
475 | # {'rows': 10, 'columns': 2}
476 | ```
477 | """
478 | return dict(zip(["rows", "columns"], self._data.shape))
479 |
    @property
    def empty(self) -> bool:
        """Inspect if DataFrame is "empty" (holds no values)

        Example:

        ```python
        df = rf.DataFrame()
        df.empty
        # True
        ```
        """
        return self._data.empty
493 |
494 | @property
495 | def memory(self) -> str:
496 | """Interrogate DataFrame (deep) memory usage
497 |
498 | Example:
499 |
500 | ```python
501 | df = rf.DataFrame({"foo": [1, 2, 3], "bar": ["A", "B", "C"]})
502 | df.memory
503 | # '326B'
504 | ```
505 | """
506 | size = self._data.memory_usage(deep=True).sum()
507 | power_labels = {40: "TB", 30: "GB", 20: "MB", 10: "KB"}
508 | for power, label in power_labels.items():
509 | if size >= (2**power):
510 | approx_size = size // 2**power
511 | return f"{approx_size} {label}"
512 | return f"{size} B"
513 |
514 | @property
515 | def types(self) -> dict[Column, type]:
516 | """Inspect column types
517 |
518 | Example:
519 |
520 | ```python
521 | df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"], "baz": [True, False]})
522 | df.types
523 | # {'foo': int, 'bar': object, 'baz': bool}
524 | ```
525 | """
526 | numpy_types = {
527 | NumpyType("O"): object,
528 | NumpyType("int64"): int,
529 | NumpyType("float64"): float,
530 | NumpyType("bool"): bool,
531 | NumpyType("datetime64"): DateTime,
532 | }
533 | raw_types = dict(self._data.dtypes)
534 | clean_types = {}
535 | for column in self.columns:
536 | current = raw_types[column]
537 | clean = numpy_types.get(current, current) # type: ignore
538 | clean_types[column] = clean
539 | return clean_types
540 |
541 | def append(self, other: DataFrame) -> DataFrame:
542 | """Append rows from another DataFrame
543 |
544 | Example:
545 |
546 | ```python
547 | df1 = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]})
548 | ```
549 | | foo | bar |
550 | |------:|:------|
551 | | 1 | A |
552 | | 2 | B |
553 |
554 | ```python
555 | df2 = rf.DataFrame({"bar": ["C", "D"], "foo": [3, 4], "baz": ["$", "@"]})
556 | ```
557 | | bar | foo | baz |
558 | |:------|------:|:------|
559 | | C | 3 | $ |
560 | | D | 4 | @ |
561 |
562 | ```python
563 | df1.append(df2)
564 | ```
565 | | foo | bar | baz |
566 | |------:|:------|:------|
567 | | 1 | A | nan |
568 | | 2 | B | nan |
569 | | 3 | C | $ |
570 | | 4 | D | @ |
571 | """
572 | _check_type(other, DataFrame)
573 | return _wrap(append(self._data, other._data))
574 |
575 | def combine(
576 | self, columns: Columns, into: Column, sep: str, drop: bool = True
577 | ) -> DataFrame:
578 | """Combine multiple columns into a single column (opposite of `split`)
579 |
580 | Example:
581 |
582 | ```python
583 | df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]})
584 | ```
585 | | foo | bar |
586 | |------:|:------|
587 | | 1 | A |
588 | | 2 | B |
589 |
590 | ```python
591 | df.combine(["bar", "foo"], into="baz", sep="::", drop=True)
592 | ```
593 | | baz |
594 | |:------|
595 | | A::1 |
596 | | B::2 |
597 | """
598 | return _wrap(combine(self._data, columns, into, sep, drop))
599 |
600 | def cross(
601 | self, rhs: DataFrame | None = None, postfix: tuple[str, str] = ("_lhs", "_rhs")
602 | ) -> DataFrame:
603 | """Cross join columns from another DataFrame
604 |
605 | Examples:
606 |
607 | ```python
608 | df = rf.DataFrame({"foo": ["a", "b", "c"], "bar": [1, 2, 3]})
609 | ```
610 | | foo | bar |
611 | |:------|------:|
612 | | a | 1 |
613 | | b | 2 |
614 | | c | 3 |
615 |
616 | Self:
617 |
618 | ```python
619 | df.cross()
620 | ```
621 |
622 | | foo_lhs | bar_lhs | foo_rhs | bar_rhs |
623 | |:----------|----------:|:----------|----------:|
624 | | a | 1 | a | 1 |
625 | | a | 1 | b | 2 |
626 | | a | 1 | c | 3 |
627 | | b | 2 | a | 1 |
628 | | b | 2 | b | 2 |
629 | | b | 2 | c | 3 |
630 | | c | 3 | a | 1 |
631 | | c | 3 | b | 2 |
632 | | c | 3 | c | 3 |
633 |
634 | Two DataFrames:
635 |
636 | ```python
637 | dfa = rf.DataFrame({"foo": [1, 2, 3]})
638 | dfb = rf.DataFrame({"bar": [1, 2, 3]})
639 | dfa.cross(dfb, postfix=("_a", "_b"))
640 | ```
641 |
642 | | foo | bar |
643 | |------:|------:|
644 | | 1 | 1 |
645 | | 1 | 2 |
646 | | 1 | 3 |
647 | | 2 | 1 |
648 | | 2 | 2 |
649 | | 2 | 3 |
650 | | 3 | 1 |
651 | | 3 | 2 |
652 | | 3 | 3 |
653 | """
654 | rhs = self if (rhs == None) else rhs
655 | _check_type(rhs, DataFrame)
656 | return _wrap(cross(self._data, rhs._data, postfix)) # type: ignore
657 |
658 | def dedupe(self, columns: LazyColumns | None = None) -> DataFrame:
659 | """Remove duplicate rows
660 |
661 | Examples:
662 |
663 | ```python
664 | df = rf.DataFrame({"foo": [1, 1, 2, 2], "bar": ["A", "A", "B", "A"]})
665 | ```
666 | | foo | bar |
667 | |------:|:------|
668 | | 1 | A |
669 | | 1 | A |
670 | | 2 | B |
671 | | 2 | A |
672 |
673 | All columns:
674 |
675 | ```python
676 | df.dedupe()
677 | ```
678 | | foo | bar |
679 | |------:|:------|
680 | | 1 | A |
681 | | 2 | B |
682 | | 2 | A |
683 |
684 | Single column:
685 |
686 | ```python
687 | df.dedupe("foo")
688 | ```
689 | | foo | bar |
690 | |------:|:------|
691 | | 1 | A |
692 | | 2 | B |
693 |
694 | Multiple columns:
695 |
696 | ```python
697 | df.dedupe(["foo", "bar"])
698 | ```
699 | | foo | bar |
700 | |------:|:------|
701 | | 1 | A |
702 | | 2 | B |
703 | | 2 | A |
704 | """
705 | return _wrap(dedupe(self._data, columns))
706 |
707 | def denix(self, columns: LazyColumns | None = None) -> DataFrame:
708 | """Remove rows with *NaN/None* values
709 |
710 | Example:
711 |
712 | ```python
713 | df = rf.DataFrame({"foo": [1, None, 3, None, 5, 6], "bar": [1, None, 3, 4, None, None]})
714 | ```
715 | | foo | bar |
716 | |------:|------:|
717 | | 1 | 1 |
718 | | nan | nan |
719 | | 3 | 3 |
720 | | nan | 4 |
721 | | 5 | nan |
722 | | 6 | nan |
723 |
724 | All columns:
725 |
726 | ```python
727 | df.denix()
728 | ```
729 | | foo | bar |
730 | |------:|------:|
731 | | 1 | 1 |
732 | | 3 | 3 |
733 |
734 | Single column:
735 |
736 | ```python
737 | df.denix("bar")
738 | ```
739 | | foo | bar |
740 | |------:|------:|
741 | | 1 | 1 |
742 | | 3 | 3 |
743 | | nan | 4 |
744 |
745 | Multiple columns:
746 |
747 | ```python
748 | df.denix(["foo", "bar"])
749 | ```
750 | | foo | bar |
751 | |------:|------:|
752 | | 1 | 1 |
753 | | 3 | 3 |
754 | """
755 | return _wrap(denix(self._data, columns))
756 |
757 | def drop(self, columns: LazyColumns) -> DataFrame:
758 | """Drop entire columns
759 |
760 | Examples:
761 |
762 | ```python
763 | df = rf.DataFrame({"foo": [1, 2], "bar": [3, 4], "baz": [5, 6]})
764 | ```
765 | | foo | bar | baz |
766 | |------:|------:|------:|
767 | | 1 | 3 | 5 |
768 | | 2 | 4 | 6 |
769 |
770 | ```python
771 | df.drop("baz")
772 | ```
773 | | foo | bar |
774 | |------:|------:|
775 | | 1 | 3 |
776 | | 2 | 4 |
777 |
778 | ```python
779 | df.drop(["foo", "baz"])
780 | ```
781 | | bar |
782 | |------:|
783 | | 3 |
784 | | 4 |
785 | """
786 | return _wrap(drop(self._data, columns))
787 |
788 | def fill(
789 | self,
790 | columns: LazyColumns | None = None,
791 | direction: Direction | None = None,
792 | constant: Value | None = None,
793 | ) -> DataFrame:
794 | """Fill missing values "down", "up", or with a constant
795 |
796 | Examples:
797 |
798 | ```python
799 | df = rf.DataFrame({"foo": [1, None, None, 2, None], "bar": [None, "A", None, "B", None]})
800 | ```
801 | | foo | bar |
802 | |------:|:------|
803 | | 1 | |
804 | | nan | A |
805 | | nan | |
806 | | 2 | B |
807 | | nan | |
808 |
809 | Constant (all columns):
810 |
811 | ```python
812 | df.fill(constant=0)
813 | ```
814 | | foo | bar |
815 | |------:|:------|
816 | | 1 | 0 |
817 | | 0 | A |
818 | | 0 | 0 |
819 | | 2 | B |
820 | | 0 | 0 |
821 |
822 | Down (all columns):
823 |
824 | ```python
825 | df.fill(direction="down")
826 | ```
827 | | foo | bar |
828 | |------:|:------|
829 | | 1 | |
830 | | 1 | A |
831 | | 1 | A |
832 | | 2 | B |
833 | | 2 | B |
834 |
835 | Down (single column):
836 |
837 | ```python
838 | df.fill("foo", direction="down")
839 | ```
840 | | foo | bar |
841 | |------:|:------|
842 | | 1 | |
843 | | 1 | A |
844 | | 1 | |
845 | | 2 | B |
846 | | 2 | |
847 |
848 | Up (single/mutiple columns):
849 |
850 | ```python
851 | df.fill(["foo"], direction="up")
852 | ```
853 | | foo | bar |
854 | |------:|:------|
855 | | 1 | |
856 | | 2 | A |
857 | | 2 | |
858 | | 2 | B |
859 | | nan | |
860 | """
861 | return _wrap(fill(self._data, columns, direction, constant))
862 |
863 | def filter(self, func: Func) -> DataFrame:
864 | """Keep rows matching specific conditions
865 |
866 | Compatible operators: `|`, `&`, `< <= == != >= >`, `isin`
867 |
868 | Examples:
869 |
870 | ```python
871 | df = rf.DataFrame({"foo": ["A", "A", "A", "B"], "bar": [1, 2, 3, 4]})
872 | ```
873 | | foo | bar |
874 | |:------|------:|
875 | | A | 1 |
876 | | A | 2 |
877 | | A | 3 |
878 | | B | 4 |
879 |
880 | Single condition:
881 |
882 | ```python
883 | df.filter(lambda row: row["foo"].isin(["A"]))
884 | ```
885 | | foo | bar |
886 | |:------|------:|
887 | | A | 1 |
888 | | A | 2 |
889 | | A | 3 |
890 |
891 | And (multiple conditions):
892 |
893 | ```python
894 | df.filter(lambda row: (row["foo"] == "A") & (row["bar"] <= 2))
895 | ```
896 | | foo | bar |
897 | |:------|------:|
898 | | A | 1 |
899 | | A | 2 |
900 |
901 | Or (multiple conditions):
902 |
903 | ```python
904 | df.filter(lambda row: (row["foo"] == "B") | (row["bar"] == 1))
905 | ```
906 | | foo | bar |
907 | |:------|------:|
908 | | A | 1 |
909 | | B | 4 |
910 | """
911 | return _wrap(filter(self._data, func))
912 |
913 | def group(self, by: LazyColumns) -> GroupedFrame:
914 | """Prepare groups for compatible verbs
915 |
916 | Compatible verbs: `accumulate`, `gather`, `pack`, `rank`, `rollup`, `take`
917 |
918 | Example:
919 |
920 | ```python
921 | df = rf.DataFrame({"foo": ["A", "A", "A", "B", "B"], "bar": [1, 2, 3, 4, 5], "baz": [9, 7, 7, 5, 6]})
922 | ```
923 | | foo | bar | baz |
924 | |:------|------:|------:|
925 | | A | 1 | 9 |
926 | | A | 2 | 7 |
927 | | A | 3 | 7 |
928 | | B | 4 | 5 |
929 | | B | 5 | 6 |
930 |
931 | + `accumulate`:
932 |
933 | ```python
934 | df.group("foo").accumulate("bar", into="bar_cumsum")
935 | ```
936 | | foo | bar | baz | bar_cumsum |
937 | |:------|------:|------:|-------------:|
938 | | A | 1 | 9 | 1 |
939 | | A | 2 | 7 | 3 |
940 | | A | 3 | 7 | 6 |
941 | | B | 4 | 5 | 4 |
942 | | B | 5 | 6 | 9 |
943 |
944 | + `gather`:
945 |
946 | ```python
947 | df.group("foo").gather()
948 | ```
949 | | foo | variable | value |
950 | |:------|:-----------|--------:|
951 | | A | bar | 1 |
952 | | A | bar | 2 |
953 | | A | bar | 3 |
954 | | B | bar | 4 |
955 | | B | bar | 5 |
956 | | A | baz | 9 |
957 | | A | baz | 7 |
958 | | A | baz | 7 |
959 | | B | baz | 5 |
960 | | B | baz | 6 |
961 |
962 | + `pack`:
963 |
964 | ```python
965 | df.group("foo").pack("bar", sep=":")
966 | ```
967 | | foo | bar |
968 | |:------|:------|
969 | | A | 1:2:3 |
970 | | B | 4:5 |
971 |
972 |
973 | + `rank`:
974 |
975 | ```python
976 | df.group("foo").rank("baz", into="baz_rank", descending=True)
977 | ```
978 | | foo | bar | baz | baz_rank |
979 | |:------|------:|------:|-----------:|
980 | | A | 1 | 9 | 1 |
981 | | A | 2 | 7 | 2 |
982 | | A | 3 | 7 | 2 |
983 | | B | 4 | 5 | 2 |
984 | | B | 5 | 6 | 1 |
985 |
986 | + `rollup`:
987 |
988 | ```python
989 | df.group("foo").rollup({
990 | "bar_mean": ("bar", rf.stat.mean),
991 | "baz_min": ("baz", rf.stat.min)
992 | })
993 | ```
994 | | foo | bar_mean | baz_min |
995 | |:------|-----------:|----------:|
996 | | A | 2 | 7 |
997 | | B | 4.5 | 5 |
998 |
999 | + `take`:
1000 |
1001 | ```python
1002 | df.group("foo").take(1)
1003 | ```
1004 | | foo | bar | baz |
1005 | |:------|------:|------:|
1006 | | A | 1 | 9 |
1007 | | B | 4 | 5 |
1008 |
1009 | """
1010 | return GroupedFrame(group(self._data, by))
1011 |
1012 | def join(
1013 | self,
1014 | rhs: DataFrame,
1015 | on: LazyColumns,
1016 | how: Join = "left",
1017 | postfix: tuple[str, str] = ("_lhs", "_rhs"),
1018 | ) -> DataFrame:
1019 | """Join columns from another DataFrame
1020 |
1021 | Examples:
1022 |
1023 | ```python
1024 | adf = rf.DataFrame({"foo": ["A", "B", "C"], "bar": [1, 2, 3]})
1025 | ```
1026 | | foo | bar |
1027 | |:------|------:|
1028 | | A | 1 |
1029 | | B | 2 |
1030 | | C | 3 |
1031 |
1032 | ```python
1033 | bdf = rf.DataFrame({"foo": ["A", "B", "D"], "baz": ["!", "@", "#"]})
1034 | ```
1035 | | foo | baz |
1036 | |:------|:------|
1037 | | A | ! |
1038 | | B | @ |
1039 | | D | # |
1040 |
1041 | Left join:
1042 |
1043 | ```python
1044 | adf.join(bdf, on="foo", how="left")
1045 | ```
1046 | | foo | bar | baz |
1047 | |:------|------:|:------|
1048 | | A | 1 | ! |
1049 | | B | 2 | @ |
1050 | | C | 3 | nan |
1051 |
1052 | Right join:
1053 |
1054 | ```python
1055 | adf.join(bdf, on="foo", how="right")
1056 | ```
1057 | | foo | bar | baz |
1058 | |:------|------:|:------|
1059 | | A | 1 | ! |
1060 | | B | 2 | @ |
1061 | | D | nan | # |
1062 |
1063 | Inner join:
1064 |
1065 | ```python
1066 | adf.join(bdf, on="foo", how="inner")
1067 | ```
1068 | | foo | bar | baz |
1069 | |:------|------:|:------|
1070 | | A | 1 | ! |
1071 | | B | 2 | @ |
1072 |
1073 | Full join:
1074 |
1075 | ```python
1076 | adf.join(bdf, on="foo", how="full")
1077 | ```
1078 | | foo | bar | baz |
1079 | |:------|------:|:------|
1080 | | A | 1 | ! |
1081 | | B | 2 | @ |
1082 | | C | 3 | nan |
1083 | | D | nan | # |
1084 | """
1085 | _check_type(rhs, DataFrame)
1086 | return _wrap(join(self._data, rhs._data, on, how, postfix))
1087 |
1088 | def mutate(self, over: dict[Column, Func]) -> DataFrame:
1089 | """Create a new, or overwrite an existing column
1090 |
1091 | Example:
1092 |
1093 | ```python
1094 | df = rf.DataFrame({"foo": [1, 2, 3]})
1095 | ```
1096 | | foo |
1097 | |------:|
1098 | | 1 |
1099 | | 2 |
1100 | | 3 |
1101 |
1102 | ```python
1103 | df.mutate({
1104 | "bar": lambda row: float(row["foo"]),
1105 | "baz": lambda row: "X" + str(row["bar"] * 2),
1106 | "jaz": lambda _: "Jazz"
1107 | })
1108 | ```
1109 | | foo | bar | baz | jaz |
1110 | |------:|------:|:------|:------|
1111 | | 1 | 1 | X2.0 | Jazz |
1112 | | 2 | 2 | X4.0 | Jazz |
1113 | | 3 | 3 | X6.0 | Jazz |
1114 | """
1115 | return _wrap(mutate(self._data, over))
1116 |
1117 | def rename(self, columns: dict[OldColumn, NewColumn]) -> DataFrame:
1118 | """Rename column keys (from "old" to "new")
1119 |
1120 | Example:
1121 |
1122 | ```python
1123 | df = rf.DataFrame({"foo": [1, 2], "bar": [3, 4]})
1124 | ```
1125 | | foo | bar |
1126 | |------:|------:|
1127 | | 1 | 3 |
1128 | | 2 | 4 |
1129 |
1130 | ```python
1131 | df.rename({"foo": "oof", "bar": "rab"})
1132 | ```
1133 | | oof | rab |
1134 | |------:|------:|
1135 | | 1 | 3 |
1136 | | 2 | 4 |
1137 |
1138 | """
1139 | return _wrap(rename(self._data, columns))
1140 |
1141 | def replace(self, over: dict[Column, dict[OldValue, NewValue]]) -> DataFrame:
1142 | """Replace matching values within columns (from "old" to "new")
1143 |
1144 | Example:
1145 |
1146 | ```python
1147 | df = rf.DataFrame({"foo": [1, 2, 2, 2, 1], "bar": [1, "A", "B", True, False]})
1148 | ```
1149 | | foo | bar |
1150 | |------:|:------|
1151 | | 1 | 1 |
1152 | | 2 | A |
1153 | | 2 | B |
1154 | | 2 | True |
1155 | | 1 | False |
1156 |
1157 | ```python
1158 | df.replace({
1159 | "foo": {2: 222},
1160 | "bar": {False: 0, True: 1, "A": 2, "B": 3}
1161 | })
1162 | ```
1163 | | foo | bar |
1164 | |------:|------:|
1165 | | 1 | 1 |
1166 | | 222 | 2 |
1167 | | 222 | 3 |
1168 | | 222 | 1 |
1169 | | 1 | 0 |
1170 | """
1171 | return _wrap(replace(self._data, over))
1172 |
1173 | def sample(self, rows: int | float, seed: int | None = None) -> DataFrame:
1174 | """Randomly sample any number of rows
1175 |
1176 | Examples:
1177 |
1178 | ```python
1179 | df = rf.DataFrame({"foo": range(10), "bar": range(10, 20)})
1180 | ```
1181 | | foo | bar |
1182 | |------:|------:|
1183 | | 0 | 10 |
1184 | | 1 | 11 |
1185 | | 2 | 12 |
1186 | | 3 | 13 |
1187 | | 4 | 14 |
1188 | | 5 | 15 |
1189 | | 6 | 16 |
1190 | | 7 | 17 |
1191 | | 8 | 18 |
1192 | | 9 | 19 |
1193 |
1194 | Single row:
1195 |
1196 | ```python
1197 | df.sample(1)
1198 | ```
1199 | | foo | bar |
1200 | |------:|------:|
1201 | | 7 | 17 |
1202 |
1203 | Multiple rows:
1204 |
1205 | ```python
1206 | df.sample(3)
1207 | ```
1208 | | foo | bar |
1209 | |------:|------:|
1210 | | 4 | 14 |
1211 | | 1 | 11 |
1212 | | 6 | 16 |
1213 |
1214 | Percentage of total rows (30%):
1215 |
1216 | ```python
1217 | df.sample(0.3)
1218 | ```
1219 | | foo | bar |
1220 | |------:|------:|
1221 | | 4 | 14 |
1222 | | 3 | 13 |
1223 | | 1 | 11 |
1224 | """
1225 | return _wrap(sample(self._data, rows, seed))
1226 |
1227 | def select(self, columns: LazyColumns) -> DataFrame:
1228 | """Select specific columns
1229 |
1230 | Examples:
1231 |
1232 | ```python
1233 | df = rf.DataFrame({"foo": [1, 2], "bar": [3, 4], "baz": [5, 6]})
1234 | ```
1235 | | foo | bar | baz |
1236 | |------:|------:|------:|
1237 | | 1 | 3 | 5 |
1238 | | 2 | 4 | 6 |
1239 |
1240 | Single column:
1241 |
1242 | ```python
1243 | df.select("foo")
1244 | ```
1245 | | foo |
1246 | |------:|
1247 | | 1 |
1248 | | 2 |
1249 |
1250 | Multiple columns:
1251 |
1252 | ```python
1253 | df.select(["foo", "baz"])
1254 | ```
1255 | | foo | baz |
1256 | |------:|------:|
1257 | | 1 | 5 |
1258 | | 2 | 6 |
1259 | """
1260 | return _wrap(select(self._data, columns))
1261 |
1262 | def shuffle(self, seed: int | None = None) -> DataFrame:
1263 | """Shuffle the order of all rows
1264 |
1265 | Example:
1266 |
1267 | ```python
1268 | df = rf.DataFrame({"foo": range(5), "bar": range(5, 10)})
1269 | ```
1270 | | foo | bar |
1271 | |------:|------:|
1272 | | 0 | 5 |
1273 | | 1 | 6 |
1274 | | 2 | 7 |
1275 | | 3 | 8 |
1276 | | 4 | 9 |
1277 |
1278 | ```python
1279 | df.shuffle()
1280 | ```
1281 | | foo | bar |
1282 | |------:|------:|
1283 | | 4 | 9 |
1284 | | 2 | 7 |
1285 | | 3 | 8 |
1286 | | 0 | 5 |
1287 | | 1 | 6 |
1288 | """
1289 | return _wrap(shuffle(self._data, seed))
1290 |
1291 | def sort(self, columns: LazyColumns, descending: bool = False) -> DataFrame:
1292 | """Sort rows by specific columns
1293 |
1294 | Examples:
1295 |
1296 | ```python
1297 | df = rf.DataFrame({"foo": ["Z", "X", "A", "A"], "bar": [2, -2, 4, -4]})
1298 | ```
1299 | | foo | bar |
1300 | |:------|------:|
1301 | | Z | 2 |
1302 | | X | -2 |
1303 | | A | 4 |
1304 | | A | -4 |
1305 |
1306 | Single column:
1307 |
1308 | ```python
1309 | df.sort("bar")
1310 | ```
1311 | | foo | bar |
1312 | |:------|------:|
1313 | | A | -4 |
1314 | | X | -2 |
1315 | | Z | 2 |
1316 | | A | 4 |
1317 |
1318 | Descending order:
1319 |
1320 | ```python
1321 | df.sort("bar", descending=True)
1322 | ```
1323 | | foo | bar |
1324 | |:------|------:|
1325 | | A | 4 |
1326 | | Z | 2 |
1327 | | X | -2 |
1328 | | A | -4 |
1329 |
1330 | Multiple columns:
1331 |
1332 | ```python
1333 | df.sort(["foo", "bar"], descending=False)
1334 | ```
1335 | | foo | bar |
1336 | |:------|------:|
1337 | | A | -4 |
1338 | | A | 4 |
1339 | | X | -2 |
1340 | | Z | 2 |
1341 | """
1342 | return _wrap(sort(self._data, columns, descending))
1343 |
1344 | def split(
1345 | self, column: Column, into: Columns, sep: str, drop: bool = True
1346 | ) -> DataFrame:
1347 | """Split a single column into multiple columns (opposite of `combine`)
1348 |
1349 | Example:
1350 |
1351 | ```python
1352 | df = rf.DataFrame({"foo": ["A::1", "B::2", "C:3"]})
1353 | ```
1354 | | foo |
1355 | |:------|
1356 | | A::1 |
1357 | | B::2 |
1358 | | C:3 |
1359 |
1360 | ```python
1361 | df.split("foo", into=["foo", "bar"], sep="::", drop=True)
1362 | ```
1363 | | foo | bar |
1364 | |:------|------:|
1365 | | A | 1 |
1366 | | B | 2 |
1367 | | C:3 | |
1368 | """
1369 | return _wrap(split(self._data, column, into, sep, drop))
1370 |
1371 | def spread(self, column: Column, using: Column) -> DataFrame:
1372 | """Spread rows into columns (opposite of `gather`)
1373 |
1374 | Example:
1375 |
1376 | ```python
1377 | df = rf.DataFrame({"foo": ["A", "A", "A", "B", "B", "B", "B"], "bar": [1, 2, 3, 4, 5, 6, 7]})
1378 | ```
1379 | | foo | bar |
1380 | |:------|------:|
1381 | | A | 1 |
1382 | | A | 2 |
1383 | | A | 3 |
1384 | | B | 4 |
1385 | | B | 5 |
1386 | | B | 6 |
1387 | | B | 7 |
1388 |
1389 | ```python
1390 | df.spread("foo", using="bar")
1391 | ```
1392 | | A | B |
1393 | |----:|----:|
1394 | | 1 | 4 |
1395 | | 2 | 5 |
1396 | | 3 | 6 |
1397 | | nan | 7 |
1398 | """
1399 | return _wrap(spread(self._data, column, using))
1400 |
1401 | def unpack(self, column: Column, sep: str) -> DataFrame:
1402 | """'Explode' concatenated row values into multiple rows (opposite of `pack`)
1403 |
1404 | Example:
1405 |
1406 | ```python
1407 | df = rf.DataFrame({
1408 | "foo": [1, 2, 3, 4],
1409 | "bar": ["A:B", "B:C:D", "D:E", "F"]
1410 | })
1411 | ```
1412 | | foo | bar |
1413 | |------:|:------|
1414 | | 1 | A:B |
1415 | | 2 | B:C:D |
1416 | | 3 | D:E |
1417 | | 4 | F |
1418 |
1419 | ```python
1420 | df.unpack("bar", sep=":")
1421 | ```
1422 | | foo | bar |
1423 | |------:|:------|
1424 | | 1 | A |
1425 | | 1 | B |
1426 | | 2 | B |
1427 | | 2 | C |
1428 | | 2 | D |
1429 | | 3 | D |
1430 | | 3 | E |
1431 | | 4 | F |
1432 | """
1433 | return _wrap(unpack(self._data, column, sep))
1434 |
--------------------------------------------------------------------------------
/redframes/io/__init__.py:
--------------------------------------------------------------------------------
1 | from .convert import unwrap, wrap
2 | from .load import load
3 | from .save import save
4 |
--------------------------------------------------------------------------------
/redframes/io/convert.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from ..checks import _check_columns, _check_index, _check_type
4 | from ..core import DataFrame
5 | from ..types import PandasDataFrame
6 |
7 |
def unwrap(rdf: DataFrame) -> PandasDataFrame:
    """Convert a rf.DataFrame into a pd.DataFrame (opposite of `wrap`)

    Returns a copy, so mutating the pandas handle can't corrupt `rdf`.

    Example:

    ```python
    rdf = rf.DataFrame({"foo": range(10)})
    pdf = rf.unwrap(rdf)
    ```
    """
    _check_type(rdf, DataFrame)
    inner = rdf._data
    return inner.copy()
21 |
def wrap(pdf: PandasDataFrame) -> DataFrame:
    """Convert a pd.DataFrame into a rf.DataFrame (opposite of `unwrap`)

    Validates the index and columns first; stores a copy, so later mutation
    of `pdf` can't corrupt the result.

    Example:

    ```python
    pdf = pd.DataFrame({"foo": range(10)})
    rdf = rf.wrap(pdf)
    ```
    """
    _check_type(pdf, PandasDataFrame)
    _check_index(pdf)
    _check_columns(pdf)
    out = DataFrame()
    out._data = pdf.copy()
    return out
39 |
def convert(df: DataFrame | PandasDataFrame) -> PandasDataFrame | DataFrame:
    """Convert a rf.DataFrame into a pd.DataFrame (and/or vice versa)

    Example:

    ```python
    redf = rf.DataFrame({"foo": range(10)})
    padf = rf.convert(redf) # now a pd.DataFrame
    redf = rf.convert(padf) # now a rf.DataFrame
    ```
    """
    # the two branches are mutually exclusive, so order doesn't matter
    if isinstance(df, PandasDataFrame):
        return wrap(df)
    if isinstance(df, DataFrame):
        return unwrap(df)
    raise TypeError("must be rf.DataFrame | pd.DataFrame")
56 |
--------------------------------------------------------------------------------
/redframes/io/load.py:
--------------------------------------------------------------------------------
1 | import pandas as pd # pyright: ignore[reportMissingImports]
2 |
3 | from redframes.types import PandasDataFrame
4 |
5 | from ..checks import _check_columns, _check_file, _check_index, _check_type
6 | from ..core import DataFrame, _wrap
7 |
8 |
def load(path: str, **kwargs) -> DataFrame:
    """Load a csv file into a rf.DataFrame (opposite of `save`)

    Extra `kwargs` are forwarded to `pandas.read_csv`.

    Example:

    ```python
    df = rf.load("example.csv")
    ```
    """
    _check_type(path, str)
    _check_file(path)
    pdf: PandasDataFrame = pd.read_csv(path, **kwargs)  # type: ignore
    _check_index(pdf)
    _check_columns(pdf)
    return _wrap(pdf)
--------------------------------------------------------------------------------
/redframes/io/save.py:
--------------------------------------------------------------------------------
1 | from ..checks import _check_file, _check_type
2 | from ..core import DataFrame
3 |
4 |
def save(df: DataFrame, path: str, **kwargs) -> None:
    """Save a rf.DataFrame to a csv file (opposite of `load`)

    Extra `kwargs` are forwarded to `pandas.DataFrame.to_csv`.

    Example:

    ```python
    rf.save(df, "example.csv")
    ```
    """
    _check_type(df, DataFrame)
    _check_type(path, str)
    _check_file(path)
    # index=False: the positional index is never part of redframes data
    df._data.to_csv(path, index=False, **kwargs)
--------------------------------------------------------------------------------
/redframes/stat.py:
--------------------------------------------------------------------------------
1 | """Common summary functions/statistics"""
2 |
3 | import numpy as np # pyright: ignore[reportMissingImports]
4 |
count = len  # number of values
mean = np.mean  # arithmetic mean
sum = np.sum  # total (intentionally shadows the builtin inside this namespace)
max = np.max  # largest value
median = np.median  # middle value
min = np.min  # smallest value
std = np.std  # standard deviation (numpy default: ddof=0, i.e. population)
--------------------------------------------------------------------------------
/redframes/types.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import datetime
4 | from typing import Any, Callable, Literal, Union
5 |
6 | import numpy as np # pyright: ignore[reportMissingImports]
7 | import pandas as pd # pyright: ignore[reportMissingImports]
8 | import pandas.core.groupby.generic as pg # pyright: ignore[reportMissingImports]
9 |
# scalar cell values (anything pandas can hold)
Value = Any
Values = list[Value]
OldValue = Value
NewValue = Value
# column labels
Column = str
Columns = list[Column]
# a single column name OR a list of them (accepted interchangeably by verbs)
LazyColumns = Union[Column, Columns]
OldColumn = Column
NewColumn = Column
# fill direction accepted by `fill`
Direction = Literal["up", "down"]
Func = Callable[..., Any]
# join strategies accepted by `join`
Join = Literal["left", "right", "inner", "full"]
# aliases that keep numpy/pandas names behind a single import site
NumpyArray = np.ndarray
NumpyType = np.dtype
PandasDataFrame = pd.DataFrame
PandasGroupedFrame = pg.DataFrameGroupBy
PandasIndex = pd.Index
PandasRangeIndex = pd.RangeIndex
DateTime = datetime.datetime
29 |
--------------------------------------------------------------------------------
/redframes/verbs/__init__.py:
--------------------------------------------------------------------------------
1 | from .accumulate import accumulate
2 | from .append import append
3 | from .combine import combine
4 | from .cross import cross
5 | from .dedupe import dedupe
6 | from .denix import denix
7 | from .drop import drop
8 | from .fill import fill
9 | from .filter import filter
10 | from .gather import gather
11 | from .group import group
12 | from .join import join
13 | from .mutate import mutate
14 | from .pack import pack
15 | from .rank import rank
16 | from .rename import rename
17 | from .replace import replace
18 | from .rollup import rollup
19 | from .sample import sample
20 | from .select import select
21 | from .shuffle import shuffle
22 | from .sort import sort
23 | from .split import split
24 | from .spread import spread
25 | from .take import take
26 | from .unpack import unpack
27 |
--------------------------------------------------------------------------------
/redframes/verbs/accumulate.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import warnings
4 |
5 | from ..checks import _check_type
6 | from ..types import Column, PandasDataFrame, PandasGroupedFrame
7 |
8 |
def accumulate(
    df: PandasDataFrame | PandasGroupedFrame, column: Column, into: Column
) -> PandasDataFrame:
    """Add a running (cumulative) sum of `column` as a new column `into`.

    Accepts a plain frame or a grouped frame; for the grouped case the
    cumulative sum restarts within each group.
    """
    _check_type(column, str)
    _check_type(into, str)
    if isinstance(df, PandasDataFrame):
        # warn (don't fail) when an unrelated existing column would be clobbered
        into_is_not_column = into != column
        into_is_in_df_columns = into in df.columns
        if into_is_not_column and into_is_in_df_columns:
            message = f"overwriting existing column '{into}'"
            warnings.warn(message)
        df = df.copy()
    # NOTE: cumsum runs BEFORE the grouped frame is unwrapped below, so
    # per-group boundaries are respected
    result = df[column].cumsum()
    if isinstance(df, PandasGroupedFrame):
        df = df.obj.copy()  # type: ignore
    df[into] = result  # type: ignore
    return df  # type: ignore
26 |
--------------------------------------------------------------------------------
/redframes/verbs/append.py:
--------------------------------------------------------------------------------
1 | import pandas as pd # pyright: ignore[reportMissingImports]
2 |
3 | from ..types import PandasDataFrame
4 |
5 |
def append(top: PandasDataFrame, bottom: PandasDataFrame) -> PandasDataFrame:
    """Stack `bottom` under `top` and renumber the rows 0..n-1."""
    stacked = pd.concat([top, bottom])
    return stacked.reset_index(drop=True)
10 |
--------------------------------------------------------------------------------
/redframes/verbs/combine.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import warnings
4 |
5 | from ..checks import _check_type
6 | from ..types import Column, Columns, PandasDataFrame
7 |
8 |
def combine(
    df: PandasDataFrame, columns: Columns, into: Column, sep: str, drop: bool = True
) -> PandasDataFrame:
    """Join the string forms of several columns into one `sep`-delimited column."""
    _check_type(columns, list)
    _check_type(into, str)
    _check_type(sep, str)
    _check_type(drop, bool)
    # warn only when clobbering a column that is NOT one of the inputs
    if (into not in columns) and (into in df.columns):
        warnings.warn(f"overwriting existing column '{into}'")
    df = df.copy()
    glued = df[columns].apply(lambda row: sep.join(row.values.astype(str)), axis=1)
    if drop:
        df = df.drop(columns, axis=1)
    df[into] = glued
    return df
28 |
--------------------------------------------------------------------------------
/redframes/verbs/cross.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pandas as pd # pyright: ignore[reportMissingImports]
4 |
5 | from ..checks import _check_type
6 | from ..types import PandasDataFrame
7 |
8 |
def cross(
    lhs: PandasDataFrame,
    rhs: PandasDataFrame,
    postfix: tuple[str, str] = ("_lhs", "_rhs"),
) -> PandasDataFrame:
    """Cartesian product of two frames; clashing column names get `postfix`."""
    _check_type(postfix, tuple)
    product = pd.merge(lhs, rhs, how="cross", suffixes=postfix)
    return product.reset_index(drop=True)
18 |
--------------------------------------------------------------------------------
/redframes/verbs/dedupe.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from ..checks import _check_keys, _check_type
4 | from ..types import LazyColumns, PandasDataFrame
5 |
6 |
def dedupe(df: PandasDataFrame, columns: LazyColumns | None = None) -> PandasDataFrame:
    """Drop duplicate rows (judged on `columns` if given), keeping the first."""
    _check_type(columns, {list, str, None})
    _check_keys(columns, df.columns)
    unique_rows = df.drop_duplicates(subset=columns, keep="first")
    return unique_rows.reset_index(drop=True)
13 |
--------------------------------------------------------------------------------
/redframes/verbs/denix.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from ..checks import _check_type
4 | from ..types import LazyColumns, PandasDataFrame
5 |
6 |
def denix(df: PandasDataFrame, columns: LazyColumns | None = None) -> PandasDataFrame:
    """Drop rows containing missing values (judged on `columns` if given)."""
    _check_type(columns, {list, str, None})
    if isinstance(columns, str):
        columns = [columns]
    if isinstance(columns, list):
        bad_keys = set(columns).difference(df.columns)
        if bad_keys:
            # singular/plural wording matches the number of offending keys
            noun = "key" if len(bad_keys) == 1 else "keys"
            raise KeyError(f"columns argument contains invalid {noun} {bad_keys}")
    df = df.dropna(subset=columns)
    return df.reset_index(drop=True)
21 |
--------------------------------------------------------------------------------
/redframes/verbs/drop.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from ..checks import _check_type
4 | from ..types import LazyColumns, PandasDataFrame
5 |
6 |
def drop(df: PandasDataFrame, columns: LazyColumns) -> PandasDataFrame:
    """Remove the given column(s) from the frame."""
    _check_type(columns, {list, str})
    return df.drop(columns, axis=1)
11 |
--------------------------------------------------------------------------------
/redframes/verbs/fill.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from ..checks import _check_type
4 | from ..types import Direction, LazyColumns, PandasDataFrame, Value
5 |
6 |
def fill(
    df: PandasDataFrame,
    columns: LazyColumns | None = None,
    direction: Direction | None = None,
    constant: Value | None = None,
) -> PandasDataFrame:
    """Fill missing values, directionally ("down"/"up") or with a constant.

    Exactly one of `direction` or `constant` must be provided; `columns`
    optionally restricts the fill to a subset of columns.

    Raises:
        ValueError: if both or neither of direction/constant are given, or
            if direction is not one of {"down", "up"}.
    """
    _check_type(columns, {list, str, None})
    _check_type(direction, {str, None})
    columns = [columns] if isinstance(columns, str) else columns
    # `is`/`is not` for None checks (identity, not equality — was `!=`/`==`)
    if (direction is not None) and (constant is not None):
        raise ValueError("either direction OR constant must be None")
    if (direction is None) and (constant is None):
        raise ValueError("either direction OR constant must not be None")
    if direction is not None:
        if direction not in ["down", "up"]:
            raise ValueError("must be one of {'down', 'up'}")
        df = df.copy()
        # .ffill()/.bfill() replace the deprecated fillna(method=...) API
        if columns:
            df[columns] = df[columns].ffill() if direction == "down" else df[columns].bfill()
        else:
            df = df.ffill() if direction == "down" else df.bfill()
    else:
        df = df.copy()
        if columns:
            df[columns] = df[columns].fillna(value=constant)
        else:
            df = df.fillna(value=constant)
    return df
34 |
--------------------------------------------------------------------------------
/redframes/verbs/filter.py:
--------------------------------------------------------------------------------
1 | from ..types import Func, PandasDataFrame
2 |
3 |
4 | def filter(df: PandasDataFrame, func: Func) -> PandasDataFrame:
5 | if not callable(func):
6 | raise TypeError("must be Func")
7 | df = df.loc[func] # type: ignore
8 | df = df.reset_index(drop=True)
9 | return df
10 |
--------------------------------------------------------------------------------
/redframes/verbs/gather.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import warnings
4 |
5 | import pandas as pd # pyright: ignore[reportMissingImports]
6 |
7 | from ..checks import _check_type
8 | from ..types import Column, Columns, LazyColumns, PandasDataFrame, PandasGroupedFrame
9 |
10 |
def _melt(
    df: PandasDataFrame,
    cols_to_keep: list[str],
    cols_to_gather: list[str],
    into: tuple[str, str],
) -> PandasDataFrame:
    """Unpivot `cols_to_gather` into (variable, value) columns named by `into`."""
    longer = pd.melt(
        df,
        id_vars=cols_to_keep,
        value_vars=cols_to_gather,
        var_name=into[0],
        value_name=into[1],
    )
    # rows whose gathered value is missing are discarded
    longer = longer.dropna(subset=into[1])  # type: ignore
    return longer.reset_index(drop=True)
27 |
28 |
def _grouped_melt(df: PandasGroupedFrame, into: tuple[str, str]) -> PandasDataFrame:
    """Melt a grouped frame: group keys are kept, every other column is gathered."""
    keep = df.grouper.names  # type: ignore
    gather_these = [col for col in df.obj.columns if col not in keep]  # type: ignore
    return _melt(df.obj, keep, gather_these, into)  # type: ignore
34 |
35 |
def gather(
    df: PandasDataFrame | PandasGroupedFrame,
    columns: Columns | None = None,
    beside: LazyColumns | None = None,
    into: tuple[Column, Column] = ("variable", "value"),
) -> PandasDataFrame:
    """Gather ("melt") columns into two new (variable, value) columns.

    Give either `columns` (what to gather) or `beside` (what to keep) — not
    both. On a grouped frame neither is allowed: the group keys are kept and
    everything else is gathered.

    Raises:
        TypeError: if `into` is not a 2-tuple of distinct names.
        ValueError: if `into` collides with existing columns, or on
            incompatible columns/beside combinations.
    """
    _check_type(columns, {list, None})
    _check_type(beside, {str, list, None})
    _check_type(into, tuple)
    # fix: compare to None with `is`/`is not` (identity), not `==`/`!=`
    if (columns is None) and (beside is not None) and isinstance(df, PandasDataFrame):
        warnings.warn(
            "Marked for removal, please use `df.group(...).gather(...)` instead",
            FutureWarning,
        )
    if not (isinstance(into, tuple) and (len(into) == 2)):
        raise TypeError("must be tuple[str, str]")
    if into[0] == into[1]:
        raise TypeError("must be unique")
    if isinstance(df, PandasGroupedFrame):
        if (into[0] in df.obj.columns) or (into[1] in df.obj.columns):  # type: ignore
            raise ValueError("must not be an existing column key")
        if columns is not None:
            raise ValueError("columns is incompatible with group+gather")
        if beside is not None:
            raise ValueError("beside is incompatible with group+gather")
        return _grouped_melt(df, into)
    if (into[0] in df.columns) or (into[1] in df.columns):
        raise ValueError("must not be an existing column key")
    if (columns is not None) and (beside is not None):
        raise ValueError("columns OR beside must be None")
    if (columns is None) and (beside is None):
        # gather everything: no id columns
        id_vars = []
        value_vars = list(df.columns)
    if isinstance(beside, str):
        beside = [beside]
    if isinstance(beside, list):
        id_vars = beside
        value_vars = [col for col in df.columns if col not in id_vars]
    if isinstance(columns, list):
        id_vars = [col for col in df.columns if col not in columns]
        value_vars = columns
    return _melt(df, id_vars, value_vars, into)  # pyright: ignore[reportUnboundVariable]
80 |
--------------------------------------------------------------------------------
/redframes/verbs/group.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from ..checks import _check_type
4 | from ..types import LazyColumns, PandasDataFrame, PandasGroupedFrame
5 |
6 |
def group(df: PandasDataFrame, by: LazyColumns) -> PandasGroupedFrame:
    """Group the frame by one or more columns (unsorted, flat index)."""
    _check_type(by, {list, str})
    return df.groupby(by, as_index=False, sort=False)
11 |
--------------------------------------------------------------------------------
/redframes/verbs/join.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pandas as pd # pyright: ignore[reportMissingImports]
4 |
5 | from ..checks import _check_type
6 | from ..types import Join, LazyColumns, PandasDataFrame
7 |
8 |
def join(
    lhs: PandasDataFrame,
    rhs: PandasDataFrame,
    on: LazyColumns,
    how: Join = "left",
    postfix: tuple[str, str] = ("_lhs", "_rhs"),
) -> PandasDataFrame:
    """Join two frames on shared key column(s).

    `how` is one of {"left", "right", "inner", "full"}; "full" maps onto
    pandas' "outer" join. Overlapping non-key names get `postfix` suffixes.
    """
    _check_type(on, {list, str})
    _check_type(how, str)
    _check_type(postfix, tuple)
    if how not in ["left", "right", "inner", "full"]:
        # bug fix: this validates `how`, but the message used to say "on"
        message = (
            "how argument is invalid, must be one of {'left', 'right', 'inner', 'full'}"
        )
        raise ValueError(message)
    how = "outer" if (how == "full") else how  # type: ignore
    df = pd.merge(lhs, rhs, on=on, how=how, suffixes=postfix)
    df = df.reset_index(drop=True)
    return df
28 |
--------------------------------------------------------------------------------
/redframes/verbs/mutate.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from ..checks import _check_type
4 | from ..types import Column, Func, PandasDataFrame
5 |
6 |
def mutate(df: PandasDataFrame, over: dict[Column, Func]) -> PandasDataFrame:
    """Create or overwrite columns by applying each function row-wise.

    Mutations run in dict order, so later ones see earlier results.
    """
    _check_type(over, dict)
    out = df.copy()
    for target, fn in over.items():
        out[target] = out.apply(fn, axis=1)
    return out  # type: ignore
13 |
--------------------------------------------------------------------------------
/redframes/verbs/pack.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from ..checks import _check_type
4 | from ..types import Column, PandasDataFrame, PandasGroupedFrame
5 |
6 |
def pack(
    df: PandasDataFrame | PandasGroupedFrame, column: Column, sep: str
) -> PandasDataFrame:
    """Collapse `column` into `sep`-joined strings (one per group, if grouped)."""
    _check_type(column, str)
    _check_type(sep, str)
    # remember the incoming column order so the result can be re-filtered to it
    original_order = df.obj.columns if isinstance(df, PandasGroupedFrame) else df.columns  # type: ignore
    packed = df.agg(**{column: (column, lambda x: x.astype(str).str.cat(sep=sep))})  # type: ignore
    packed = packed[[col for col in packed.columns if col in original_order]]
    return packed.reset_index(drop=True)
17 |
--------------------------------------------------------------------------------
/redframes/verbs/rank.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import warnings
4 |
5 | from ..checks import _check_type
6 | from ..types import Column, PandasDataFrame, PandasGroupedFrame
7 |
8 |
def rank(
    df: PandasDataFrame | PandasGroupedFrame,
    column: Column,
    into: Column,
    descending: bool = False,
) -> PandasDataFrame:
    """Dense-rank `column` into a new column `into` (per group, if grouped)."""
    _check_type(column, str)
    _check_type(into, str)
    _check_type(descending, bool)
    if isinstance(df, PandasDataFrame):
        # warn when clobbering an unrelated existing column (re-ranking the
        # same column in place is deliberate and warning-free)
        into_is_not_column = into != column
        into_is_in_df_columns = into in df.columns
        if into_is_not_column and into_is_in_df_columns:
            message = f"overwriting existing column '{into}'"
            warnings.warn(message)
        df = df.copy()
    # for a grouped frame `df[column]` is a SeriesGroupBy, so ranks are
    # computed within each group; this must run BEFORE `df` is rebound below
    result = df[column].rank(method="dense", ascending=not descending)
    if isinstance(df, PandasGroupedFrame):
        df = df.obj.copy()  # type: ignore
    df[into] = result  # type: ignore
    return df  # type: ignore
30 |
--------------------------------------------------------------------------------
/redframes/verbs/rename.py:
--------------------------------------------------------------------------------
1 | from ..checks import _check_type, _check_values
2 | from ..types import NewColumn, OldColumn, PandasDataFrame
3 |
4 |
def rename(df: PandasDataFrame, columns: dict[OldColumn, NewColumn]) -> PandasDataFrame:
    """Rename columns via an {old: new} mapping; every old key must exist."""
    _check_type(columns, dict)
    new_names = columns.values()
    _check_values(new_names, str)
    if len(set(new_names)) != len(new_names):
        raise KeyError("columns must be unique")
    missing_keys = set(columns.keys()) - set(df.columns)
    if missing_keys:
        if len(missing_keys) == 1:
            raise KeyError(f"column key ({missing_keys}) is invalid")
        raise KeyError(f"column keys ({missing_keys}) are invalid")
    return df.rename(columns=columns)
18 |
--------------------------------------------------------------------------------
/redframes/verbs/replace.py:
--------------------------------------------------------------------------------
1 | from ..checks import _check_type
2 | from ..types import Column, NewValue, OldValue, PandasDataFrame
3 |
4 |
def replace(
    df: PandasDataFrame, over: dict[Column, dict[OldValue, NewValue]]
) -> PandasDataFrame:
    """Swap old values for new ones, per column, via nested mappings."""
    _check_type(over, dict)
    unknown = list(set(over.keys()) - set(df.columns))
    if unknown:
        if len(unknown) == 1:
            raise KeyError(f"column key: {unknown} is invalid")
        raise KeyError(f"column keys: {unknown} are invalid")
    return df.replace(over)
16 |
--------------------------------------------------------------------------------
/redframes/verbs/rollup.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from ..checks import _check_type
4 | from ..types import Column, Func, PandasDataFrame, PandasGroupedFrame
5 |
6 |
def rollup(
    df: PandasDataFrame | PandasGroupedFrame,
    over: dict[Column, tuple[Column, Func]],
) -> PandasDataFrame:
    """Aggregate columns into summary values via {into: (column, func)}.

    On a grouped frame this yields one row per group; on a plain frame it
    yields a single summary row.

    Raises:
        KeyError: if an `over` key would overwrite a group key.
    """
    _check_type(over, dict)
    if isinstance(df, PandasGroupedFrame):
        groups = set(df.grouper.names)  # type: ignore
        keys = set(over.keys())
        if groups.intersection(keys):
            raise KeyError("unable to overwrite group keys")
        df = df.agg(**over)
        df = df.reset_index(drop=True)
    else:
        # each named agg lands on its own row after transposing; forward +
        # backward filling merges them so head(1) is one complete summary row
        df = df.agg(**over)  # type: ignore
        df = df.T  # type: ignore
        df = df.reset_index(drop=True)  # type: ignore
        df = df.ffill()  # type: ignore  # replaces deprecated fillna(method="ffill")
        df = df.bfill()  # type: ignore  # replaces deprecated fillna(method="bfill")
        df = df.head(1)  # type: ignore
    return df
27 |
--------------------------------------------------------------------------------
/redframes/verbs/sample.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from ..checks import _check_type
4 | from ..types import PandasDataFrame
5 |
6 |
def sample(
    df: PandasDataFrame, rows: int | float, seed: int | None = None
) -> PandasDataFrame:
    """Randomly sample rows: an int count if rows >= 1, a fraction if 0 < rows < 1.

    Raises:
        ValueError: if rows <= 0, or if rows is a float >= 1.
    """
    _check_type(rows, {int, float})
    _check_type(seed, {int, None})  # consistency: shuffle() validates seed too
    if rows >= 1:
        if isinstance(rows, float):
            raise ValueError("must be int if > 1")
        df = df.sample(rows, random_state=seed)
    elif 0 < rows < 1:
        df = df.sample(frac=rows, random_state=seed)
    else:
        raise ValueError("must be > 0")
    df = df.reset_index(drop=True)
    return df
21 |
--------------------------------------------------------------------------------
/redframes/verbs/select.py:
--------------------------------------------------------------------------------
1 | import pandas as pd # pyright: ignore[reportMissingImports]
2 |
3 | from ..checks import _check_type
4 | from ..types import LazyColumns, PandasDataFrame
5 |
6 |
def select(df: PandasDataFrame, columns: LazyColumns) -> PandasDataFrame:
    """Keep only the given column(s), in the given order.

    Raises:
        KeyError: if keys are duplicated or not present in the frame.
    """
    _check_type(columns, {list, str})
    columns = [columns] if isinstance(columns, str) else columns
    if len(set(columns)) != len(columns):
        raise KeyError("column keys must be unique")  # fix: dropped stray f-prefix
    bad_columns = list(set(columns) - set(df.columns))
    if bad_columns and len(bad_columns) == 1:
        raise KeyError(f"column key: {bad_columns} is invalid")
    if bad_columns and len(bad_columns) > 1:
        raise KeyError(f"column keys: {bad_columns} are invalid")
    df = df[columns]
    return df
19 |
--------------------------------------------------------------------------------
/redframes/verbs/shuffle.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from ..checks import _check_type
4 | from ..types import PandasDataFrame
5 |
6 |
def shuffle(df: PandasDataFrame, seed: int | None = None) -> PandasDataFrame:
    """Randomly reorder every row (reproducible via `seed`)."""
    _check_type(seed, {int, None})
    reordered = df.sample(frac=1, random_state=seed)
    return reordered.reset_index(drop=True)
12 |
--------------------------------------------------------------------------------
/redframes/verbs/sort.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from ..checks import _check_keys, _check_type
4 | from ..types import LazyColumns, PandasDataFrame
5 |
6 |
def sort(
    df: PandasDataFrame, columns: LazyColumns, descending: bool = False
) -> PandasDataFrame:
    """Sort rows by the given column(s), ascending unless `descending`."""
    _check_type(columns, {list, str})
    _check_type(descending, bool)
    _check_keys(columns, df.columns)
    ordered = df.sort_values(by=columns, ascending=not descending)
    return ordered.reset_index(drop=True)
16 |
--------------------------------------------------------------------------------
/redframes/verbs/split.py:
--------------------------------------------------------------------------------
1 | import uuid
2 |
3 | from ..checks import _check_type
4 | from ..types import Column, Columns, PandasDataFrame
5 |
6 |
def split(
    df: PandasDataFrame, column: Column, into: Columns, sep: str, drop: bool = True
) -> PandasDataFrame:
    """Split a string column on `sep` into several new columns named by `into`."""
    _check_type(column, str)
    _check_type(into, list)
    _check_type(sep, str)
    _check_type(drop, bool)
    if len(into) != len(set(into)):
        raise KeyError("into keys must be unique")
    if (column in into) and (not drop):
        raise KeyError("into keys must be unique")
    clashes = set(df.columns).difference(set([column])).intersection(set(into))
    if clashes:
        raise KeyError("into keys must be unique")
    # write through throwaway uuid names first so `into` may safely reuse `column`
    placeholder_map = {uuid.uuid4().hex: name for name in into}
    placeholders = list(placeholder_map.keys())
    df = df.copy()
    df[placeholders] = df[column].str.split(sep, expand=True)
    if drop:
        df = df.drop(column, axis=1)
    return df.rename(columns=placeholder_map)
29 |
--------------------------------------------------------------------------------
/redframes/verbs/spread.py:
--------------------------------------------------------------------------------
1 | import uuid
2 |
3 | import pandas as pd # pyright: ignore[reportMissingImports]
4 |
5 | from ..checks import _check_type
6 | from ..types import Column, PandasDataFrame
7 |
8 |
def spread(df: PandasDataFrame, column: Column, using: Column) -> PandasDataFrame:
    """Pivot: spread `column`'s values into new columns filled from `using`.

    Raises:
        KeyError: if `column` and `using` name the same column.
    """
    _check_type(column, str)
    _check_type(using, str)
    if column == using:
        raise KeyError("column and using must be unique")
    original_shape = df.shape[1]
    temp = None
    if original_shape == 2:
        # with no other columns to index on, synthesize a temporary counter;
        # bug fix: copy first so the caller's frame isn't mutated in place
        df = df.copy()
        temp = uuid.uuid4().hex
        df[temp] = df.groupby(column).cumcount()
    index = [col for col in df.columns if col not in [column, using]]
    df = pd.pivot_table(df, index=index, columns=[column], values=[using], aggfunc="first")  # type: ignore
    # flatten the (using, value) MultiIndex down to the pivoted value names
    df.columns = [col for col in df.columns.get_level_values(1)]  # type: ignore
    df = df.reset_index().rename_axis(None, axis=0)
    if original_shape == 2:
        df = df.drop(temp, axis=1)  # type: ignore
    return df
25 |
--------------------------------------------------------------------------------
/redframes/verbs/take.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from ..checks import _check_type
4 | from ..types import PandasDataFrame, PandasGroupedFrame
5 |
6 |
def take(
    df: PandasDataFrame | PandasGroupedFrame, rows: int = 1, **kwargs
) -> PandasDataFrame:
    """Take the first `rows` rows (or the last, when `rows` is negative).

    When extra kwargs arrive, the call is routed straight to pandas'
    positional `.take` for sklearn compatibility (e.g. train_test_split).
    """
    if kwargs:  # compatibility: sklearn / train_test_split
        df = df.take(rows, **kwargs)  # type: ignore
        df = df.reset_index(drop=True)
        return df
    _check_type(rows, int)
    if isinstance(df, PandasDataFrame):
        # the size bound is only checkable on a plain (ungrouped) frame
        if rows > df.shape[0]:
            raise ValueError("rows argument is invalid, exceeds total size")
    if rows == 0:
        raise ValueError("rows argument is invalid, must not be 0")
    if rows <= -1:
        df = df.tail(rows * -1)  # negative: take from the end
    else:
        df = df.head(rows)
    # NOTE(review): head/tail on a GroupBy appear to return a plain DataFrame,
    # which would make this grouped branch unreachable — confirm against pandas
    if isinstance(df, PandasGroupedFrame):
        df = df.reset_index()
    else:
        df = df.reset_index(drop=True)
    return df
29 |
--------------------------------------------------------------------------------
/redframes/verbs/unpack.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from ..checks import _check_type
4 | from ..types import Column, PandasDataFrame
5 |
6 |
def unpack(df: PandasDataFrame, column: Column, sep: str) -> PandasDataFrame:
    """Split `column` on `sep` and explode each piece onto its own row."""
    _check_type(column, str)
    _check_type(sep, str)
    exploded = df.assign(**{column: df[column].str.split(sep)})
    exploded = exploded.explode(column)
    return exploded.reset_index(drop=True)
14 |
--------------------------------------------------------------------------------
/redframes/version.py:
--------------------------------------------------------------------------------
1 | __version__ = "1.4.1"
2 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import find_packages, setup

# pull __version__ into scope without importing the package itself
# (importing redframes would require pandas at build time)
exec(open("redframes/version.py").read())

# README.md doubles as the PyPI long description
with open("README.md", "r", encoding="utf-8") as f:
    long_description = f.read()

setup(
    name="redframes",
    version=__version__,  # type: ignore
    url="https://github.com/maxhumber/redframes",
    description="General Purpose Data Manipulation Library",
    long_description=long_description,
    long_description_content_type="text/markdown",
    author="Max Humber",
    author_email="max.humber@gmail.com",
    license="BSD 2",
    packages=find_packages(),
    python_requires=">=3.8",
    install_requires=["pandas>=1.5,<3.0"],
    extras_require={
        # "test" mirrors what the CI workflow installs before `unittest`
        "test": [
            "matplotlib",
            "scikit-learn",
        ],
        "dev": [
            "black",
            "ipykernel",
            "isort",
            "lxml",
            "matplotlib",
            "mypy",
            "pandas-stubs",
            "pyright",
            "scikit-learn",
            "tabulate",
        ],
    },
    classifiers=[
        "Intended Audience :: Developers",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
    ],
)
47 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maxhumber/redframes/6e3f1226358ad4e67f4343cbc4b1ee4b63475034/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_deprecations.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import redframes as rf
4 |
5 |
class TestDeprecations(unittest.TestCase):
    """Deprecated APIs must keep working while emitting a FutureWarning."""

    def test_summarize_deprecation(self):
        # summarize is a rollup alias slated for removal
        df = rf.DataFrame({"foo": range(10)})
        expected = rf.DataFrame({"foo": [4.5]})
        message = "Marked for removal, please use `rollup` instead"
        with self.assertWarnsRegex(FutureWarning, message):
            result = df.summarize({"foo": ("foo", rf.stat.mean)})
        self.assertEqual(result, expected)

    def test_gather_beside_deprecation(self):
        # gather(beside=...) on an ungrouped frame is deprecated in favor of
        # df.group(...).gather(...)
        df = rf.DataFrame({"foo": [1, 1, 2, 2], "bar": [1, 2, 3, 4]})
        expected = rf.DataFrame(
            {
                "foo": [1, 1, 2, 2],
                "variable": ["bar", "bar", "bar", "bar"],
                "value": [1, 2, 3, 4],
            }
        )
        with self.assertWarnsRegex(FutureWarning, "Marked for removal*"):
            result = df.gather(beside="foo")
        self.assertEqual(result, expected)
27 |
--------------------------------------------------------------------------------
/tests/test_docstrings.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import redframes as rf
4 |
5 |
6 | class TestDocstrings(unittest.TestCase):
7 | def test_take(self):
8 | df = rf.DataFrame({"foo": range(10)})
9 | result1 = df.take(1)
10 | result2 = df.take(-2)
11 | expected1 = rf.DataFrame({"foo": [0]})
12 | expected2 = rf.DataFrame({"foo": [8, 9]})
13 | self.assertEqual(result1, expected1)
14 | self.assertEqual(result2, expected2)
15 |
16 | def test_accumulate(self):
17 | df = rf.DataFrame({"foo": [1, 2, 3, 4]})
18 | result = df.accumulate("foo", into="cumsum")
19 | expected = rf.DataFrame({"foo": [1, 2, 3, 4], "cumsum": [1, 3, 6, 10]})
20 | self.assertEqual(result, expected)
21 |
22 | def test_gather(self):
23 | df = rf.DataFrame(
24 | {
25 | "foo": [1, 2, 1, 2],
26 | "bar": ["A", "B", "C", "D"],
27 | "baz": ["!", "@", "#", "$"],
28 | "jaz": range(4),
29 | }
30 | )
31 | result1 = df.gather()
32 | result2 = df.gather(["foo", "bar"], into=("var", "val"))
33 | result3 = df.group(["foo", "bar"]).gather(into=("variable", "value"))
34 | expected1 = rf.DataFrame(
35 | {
36 | "variable": [
37 | "foo",
38 | "foo",
39 | "foo",
40 | "foo",
41 | "bar",
42 | "bar",
43 | "bar",
44 | "bar",
45 | "baz",
46 | "baz",
47 | "baz",
48 | "baz",
49 | "jaz",
50 | "jaz",
51 | "jaz",
52 | "jaz",
53 | ],
54 | "value": [
55 | 1,
56 | 2,
57 | 1,
58 | 2,
59 | "A",
60 | "B",
61 | "C",
62 | "D",
63 | "!",
64 | "@",
65 | "#",
66 | "$",
67 | 0,
68 | 1,
69 | 2,
70 | 3,
71 | ],
72 | }
73 | )
74 | expected2 = rf.DataFrame(
75 | {
76 | "baz": ["!", "@", "#", "$", "!", "@", "#", "$"],
77 | "jaz": [0, 1, 2, 3, 0, 1, 2, 3],
78 | "var": ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
79 | "val": [1, 2, 1, 2, "A", "B", "C", "D"],
80 | }
81 | )
82 | expected3 = rf.DataFrame(
83 | {
84 | "foo": [1, 2, 1, 2, 1, 2, 1, 2],
85 | "bar": ["A", "B", "C", "D", "A", "B", "C", "D"],
86 | "variable": ["baz", "baz", "baz", "baz", "jaz", "jaz", "jaz", "jaz"],
87 | "value": ["!", "@", "#", "$", 0, 1, 2, 3],
88 | }
89 | )
90 | self.assertEqual(result1, expected1)
91 | self.assertEqual(result2, expected2)
92 | self.assertEqual(result3, expected3)
93 |
94 | def test_pack(self):
95 | df = rf.DataFrame(
96 | {"foo": ["A", "A", "B", "A", "B", "C"], "bar": [1, 2, 3, 4, 5, 6]}
97 | )
98 | result1 = df.pack("foo", sep="+")
99 | result2 = df.group("foo").pack("bar", sep="|")
100 | expected1 = rf.DataFrame({"foo": ["A+A+B+A+B+C"]})
101 | expected2 = rf.DataFrame({"foo": ["A", "B", "C"], "bar": ["1|2|4", "3|5", "6"]})
102 | self.assertEqual(result1, expected1)
103 | self.assertEqual(result2, expected2)
104 |
105 | def test_rank(self):
106 | df = rf.DataFrame({"foo": [2, 3, 3, 99, 1000, 1, -6, 4]})
107 | result = df.rank("foo", into="rank", descending=True)
108 | expected = rf.DataFrame(
109 | {"foo": [2, 3, 3, 99, 1000, 1, -6, 4], "rank": [5.0, 4, 4, 2, 1, 6, 7, 3]}
110 | )
111 | self.assertEqual(result, expected)
112 |
113 | def test_rollup(self):
114 | df = rf.DataFrame({"foo": [1, 2, 3, 4, 5], "bar": [99, 100, 1, -5, 2]})
115 | result = df.rollup(
116 | {
117 | "fcount": ("foo", rf.stat.count),
118 | "fmean": ("foo", rf.stat.mean),
119 | "fsum": ("foo", rf.stat.sum),
120 | "fmax": ("foo", rf.stat.max),
121 | "bmedian": ("bar", rf.stat.median),
122 | "bmin": ("bar", rf.stat.min),
123 | "bstd": ("bar", rf.stat.std),
124 | }
125 | )
126 | expected = rf.DataFrame(
127 | {
128 | "fcount": [5.0],
129 | "fmean": [3.0],
130 | "fsum": [15.0],
131 | "fmax": [5.0],
132 | "bmedian": [2.0],
133 | "bmin": [-5.0],
134 | "bstd": [54.929955397760885],
135 | }
136 | )
137 | self.assertEqual(result, expected)
138 |
139 | def test_init(self):
140 | rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]})
141 | self.assertTrue(True)
142 |
143 | def test_eq(self):
144 | adf = rf.DataFrame({"foo": [1]})
145 | bdf = rf.DataFrame({"bar": [1]})
146 | cdf = rf.DataFrame({"foo": [1]})
147 | self.assertFalse(adf == bdf)
148 | self.assertTrue(adf == cdf)
149 |
150 | def test_getitem(self):
151 | df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]})
152 | result = df["foo"]
153 | expected = [1, 2]
154 | self.assertEqual(result, expected)
155 |
156 | def test_str(self):
157 | df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]})
158 | result = str(df)
159 | expected = "rf.DataFrame({'foo': [1, 2], 'bar': ['A', 'B']})"
160 | self.assertEqual(result, expected)
161 |
162 | def test_columns(self):
163 | df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"], "baz": [True, False]})
164 | result = df.columns
165 | expected = ["foo", "bar", "baz"]
166 | self.assertEqual(result, expected)
167 |
168 | def test_dimensions(self):
169 | df = rf.DataFrame({"foo": range(10), "bar": range(10, 20)})
170 | result = df.dimensions
171 | expected = {"rows": 10, "columns": 2}
172 | self.assertEqual(result, expected)
173 |
174 | def test_empty(self):
175 | df = rf.DataFrame()
176 | result = df.empty
177 | expected = True
178 | self.assertEqual(result, expected)
179 |
180 | def test_memory(self):
181 | df = rf.DataFrame({"foo": [1, 2, 3], "bar": ["A", "B", "C"]})
182 | result = df.memory
183 | is_small = result.startswith("3") and result.endswith("B")
184 | self.assertTrue(is_small)
185 |
186 | def test_types(self):
187 | df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"], "baz": [True, False]})
188 | result = df.types
189 | expected = {"foo": int, "bar": object, "baz": bool}
190 | self.assertEqual(result, expected)
191 |
192 | def test_append(self):
193 | df1 = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]})
194 | df2 = rf.DataFrame({"bar": ["C", "D"], "foo": [3, 4], "baz": ["$", "@"]})
195 | result = df1.append(df2)
196 | expected = rf.DataFrame(
197 | {
198 | "foo": [1, 2, 3, 4],
199 | "bar": ["A", "B", "C", "D"],
200 | "baz": [None, None, "$", "@"],
201 | }
202 | )
203 | self.assertEqual(result, expected)
204 |
205 | def test_combine(self):
206 | df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]})
207 | result = df.combine(["bar", "foo"], into="baz", sep="::", drop=True)
208 | expected = rf.DataFrame({"baz": ["A::1", "B::2"]})
209 | self.assertEqual(result, expected)
210 |
211 | def test_cross(self):
212 | df = rf.DataFrame({"foo": ["a", "b", "c"], "bar": [1, 2, 3]})
213 | dfa = rf.DataFrame({"foo": [1, 2, 3]})
214 | dfb = rf.DataFrame({"bar": [1, 2, 3]})
215 | result1 = df.cross()
216 | result2 = dfa.cross(dfb, postfix=("_a", "_b"))
217 | expected1 = rf.DataFrame(
218 | {
219 | "foo_lhs": ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
220 | "bar_lhs": [1, 1, 1, 2, 2, 2, 3, 3, 3],
221 | "foo_rhs": ["a", "b", "c", "a", "b", "c", "a", "b", "c"],
222 | "bar_rhs": [1, 2, 3, 1, 2, 3, 1, 2, 3],
223 | }
224 | )
225 | expected2 = rf.DataFrame(
226 | {"foo": [1, 1, 1, 2, 2, 2, 3, 3, 3], "bar": [1, 2, 3, 1, 2, 3, 1, 2, 3]}
227 | )
228 | self.assertEqual(result1, expected1)
229 | self.assertEqual(result2, expected2)
230 |
231 | def test_dedupe(self):
232 | df = rf.DataFrame({"foo": [1, 1, 2, 2], "bar": ["A", "A", "B", "A"]})
233 | result1 = df.dedupe()
234 | result2 = df.dedupe("foo")
235 | result3 = df.dedupe(["foo", "bar"])
236 | expected1 = rf.DataFrame({"foo": [1, 2, 2], "bar": ["A", "B", "A"]})
237 | expected2 = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]})
238 | expected3 = rf.DataFrame({"foo": [1, 2, 2], "bar": ["A", "B", "A"]})
239 | self.assertEqual(result1, expected1)
240 | self.assertEqual(result2, expected2)
241 | self.assertEqual(result3, expected3)
242 |
243 | def test_denix(self):
244 | df = rf.DataFrame(
245 | {"foo": [1, None, 3, None, 5, 6], "bar": [1, None, 3, 4, None, None]}
246 | )
247 | result1 = df.denix()
248 | result2 = df.denix("bar")
249 | result3 = df.denix(["foo", "bar"])
250 | expected1 = rf.DataFrame({"foo": [1.0, 3.0], "bar": [1.0, 3.0]})
251 | expected2 = rf.DataFrame({"foo": [1.0, 3.0, None], "bar": [1.0, 3.0, 4.0]})
252 | expected3 = rf.DataFrame({"foo": [1.0, 3.0], "bar": [1.0, 3.0]})
253 | self.assertEqual(result1, expected1)
254 | self.assertEqual(result2, expected2)
255 | self.assertEqual(result3, expected3)
256 |
257 | def test_drop(self):
258 | df = rf.DataFrame({"foo": [1, 2], "bar": [3, 4], "baz": [5, 6]})
259 | result1 = df.drop("baz")
260 | result2 = df.drop(["foo", "baz"])
261 | expected1 = rf.DataFrame({"foo": [1, 2], "bar": [3, 4]})
262 | expected2 = rf.DataFrame({"bar": [3, 4]})
263 | self.assertEqual(result1, expected1)
264 | self.assertEqual(result2, expected2)
265 |
266 | def test_fill(self):
267 | df = rf.DataFrame(
268 | {"foo": [1, None, None, 2, None], "bar": [None, "A", None, "B", None]}
269 | )
270 | result1 = df.fill(constant=0)
271 | result2 = df.fill(direction="down")
272 | result3 = df.fill("foo", direction="down")
273 | result4 = df.fill(["foo"], direction="up")
274 | expected1 = rf.DataFrame(
275 | {"foo": [1.0, 0.0, 0.0, 2.0, 0.0], "bar": [0, "A", 0, "B", 0]}
276 | )
277 | expected2 = rf.DataFrame(
278 | {"foo": [1.0, 1.0, 1.0, 2.0, 2.0], "bar": [None, "A", "A", "B", "B"]}
279 | )
280 | expected3 = rf.DataFrame(
281 | {"foo": [1.0, 1.0, 1.0, 2.0, 2.0], "bar": [None, "A", None, "B", None]}
282 | )
283 | expected4 = rf.DataFrame(
284 | {"foo": [1.0, 2.0, 2.0, 2.0, None], "bar": [None, "A", None, "B", None]}
285 | )
286 | self.assertEqual(result1, expected1)
287 | self.assertEqual(result2, expected2)
288 | self.assertEqual(result3, expected3)
289 | self.assertEqual(result4, expected4)
290 |
291 | def test_filter(self):
292 | df = rf.DataFrame({"foo": ["A", "A", "A", "B"], "bar": [1, 2, 3, 4]})
293 | result1 = df.filter(lambda row: row["foo"].isin(["A"]))
294 | result2 = df.filter(lambda row: (row["foo"] == "A") & (row["bar"] <= 2))
295 | result3 = df.filter(lambda row: (row["foo"] == "B") | (row["bar"] == 1))
296 | expected1 = rf.DataFrame({"foo": ["A", "A", "A"], "bar": [1, 2, 3]})
297 | expected2 = rf.DataFrame({"foo": ["A", "A"], "bar": [1, 2]})
298 | expected3 = rf.DataFrame({"foo": ["A", "B"], "bar": [1, 4]})
299 | self.assertEqual(result1, expected1)
300 | self.assertEqual(result2, expected2)
301 | self.assertEqual(result3, expected3)
302 |
303 | def test_group(self):
304 | df = rf.DataFrame(
305 | {
306 | "foo": ["A", "A", "A", "B", "B"],
307 | "bar": [1, 2, 3, 4, 5],
308 | "baz": [9, 7, 7, 5, 6],
309 | }
310 | )
311 | result1 = df.group("foo").accumulate("bar", into="bar_cumsum")
312 | result2 = df.group("foo").gather()
313 | result3 = df.group("foo").pack("bar", sep=":")
314 | result4 = df.group("foo").rank("baz", into="baz_rank", descending=True)
315 | result5 = df.group("foo").rollup(
316 | {"bar_mean": ("bar", rf.stat.mean), "baz_min": ("baz", rf.stat.min)}
317 | )
318 | result6 = df.group("foo").take(1)
319 | expected1 = rf.DataFrame(
320 | {
321 | "foo": ["A", "A", "A", "B", "B"],
322 | "bar": [1, 2, 3, 4, 5],
323 | "baz": [9, 7, 7, 5, 6],
324 | "bar_cumsum": [1, 3, 6, 4, 9],
325 | }
326 | )
327 | expected2 = rf.DataFrame(
328 | {
329 | "foo": ["A", "A", "A", "B", "B", "A", "A", "A", "B", "B"],
330 | "variable": [
331 | "bar",
332 | "bar",
333 | "bar",
334 | "bar",
335 | "bar",
336 | "baz",
337 | "baz",
338 | "baz",
339 | "baz",
340 | "baz",
341 | ],
342 | "value": [1, 2, 3, 4, 5, 9, 7, 7, 5, 6],
343 | }
344 | )
345 | expected3 = rf.DataFrame({"foo": ["A", "B"], "bar": ["1:2:3", "4:5"]})
346 | expected4 = rf.DataFrame(
347 | {
348 | "foo": ["A", "A", "A", "B", "B"],
349 | "bar": [1, 2, 3, 4, 5],
350 | "baz": [9, 7, 7, 5, 6],
351 | "baz_rank": [1.0, 2.0, 2.0, 2.0, 1.0],
352 | }
353 | )
354 | expected5 = rf.DataFrame(
355 | {"foo": ["A", "B"], "bar_mean": [2.0, 4.5], "baz_min": [7, 5]}
356 | )
357 | expected6 = rf.DataFrame({"foo": ["A", "B"], "bar": [1, 4], "baz": [9, 5]})
358 | self.assertEqual(result1, expected1)
359 | self.assertEqual(result2, expected2)
360 | self.assertEqual(result3, expected3)
361 | self.assertEqual(result4, expected4)
362 | self.assertEqual(result5, expected5)
363 | self.assertEqual(result6, expected6)
364 |
365 | def test_join(self):
366 | adf = rf.DataFrame({"foo": ["A", "B", "C"], "bar": [1, 2, 3]})
367 | bdf = rf.DataFrame({"foo": ["A", "B", "D"], "baz": ["!", "@", "#"]})
368 | result1 = adf.join(bdf, on="foo", how="left")
369 | result2 = adf.join(bdf, on="foo", how="right")
370 | result3 = adf.join(bdf, on="foo", how="inner")
371 | result4 = adf.join(bdf, on="foo", how="full")
372 | expected1 = rf.DataFrame(
373 | {"foo": ["A", "B", "C"], "bar": [1, 2, 3], "baz": ["!", "@", None]}
374 | )
375 | expected2 = rf.DataFrame(
376 | {"foo": ["A", "B", "D"], "bar": [1.0, 2.0, None], "baz": ["!", "@", "#"]}
377 | )
378 | expected3 = rf.DataFrame({"foo": ["A", "B"], "bar": [1, 2], "baz": ["!", "@"]})
379 | expected4 = rf.DataFrame(
380 | {
381 | "foo": ["A", "B", "C", "D"],
382 | "bar": [1.0, 2.0, 3.0, None],
383 | "baz": ["!", "@", None, "#"],
384 | }
385 | )
386 | self.assertEqual(result1, expected1)
387 | self.assertEqual(result2, expected2)
388 | self.assertEqual(result3, expected3)
389 | self.assertEqual(result4, expected4)
390 |
391 | def test_mutate(self):
392 | df = rf.DataFrame({"foo": [1, 2, 3]})
393 | result = df.mutate(
394 | {
395 | "bar": lambda row: float(row["foo"]),
396 | "baz": lambda row: "X" + str(row["bar"] * 2),
397 | "jaz": lambda _: "Jazz",
398 | }
399 | )
400 | expected = rf.DataFrame(
401 | {
402 | "foo": [1, 2, 3],
403 | "bar": [1.0, 2.0, 3.0],
404 | "baz": ["X2.0", "X4.0", "X6.0"],
405 | "jaz": ["Jazz", "Jazz", "Jazz"],
406 | }
407 | )
408 | self.assertEqual(result, expected)
409 |
410 | def test_rename(self):
411 | df = rf.DataFrame({"foo": [1, 2], "bar": [3, 4]})
412 | result = df.rename({"foo": "oof", "bar": "rab"})
413 | expected = rf.DataFrame({"oof": [1, 2], "rab": [3, 4]})
414 | self.assertEqual(result, expected)
415 |
416 | def test_replace(self):
417 | df = rf.DataFrame({"foo": [1, 2, 2, 2, 1], "bar": [1, "A", "B", True, False]})
418 | result = df.replace(
419 | {"foo": {2: 222}, "bar": {False: 0, True: 1, "A": 2, "B": 3}}
420 | )
421 | expected = rf.DataFrame({"foo": [1, 222, 222, 222, 1], "bar": [1, 2, 3, 1, 0]})
422 | self.assertEqual(result, expected)
423 |
424 | def test_sample(self):
425 | df = rf.DataFrame({"foo": range(10), "bar": range(10, 20)})
426 | result1 = df.sample(1)
427 | result2 = df.sample(3)
428 | result3 = df.sample(0.3)
429 | self.assertEqual(len(result1), 1)
430 | self.assertEqual(len(result2), 3)
431 | self.assertEqual(len(result3), 3)
432 |
433 | def test_select(self):
434 | df = rf.DataFrame({"foo": [1, 2], "bar": [3, 4], "baz": [5, 6]})
435 | result1 = df.select("foo")
436 | result2 = df.select(["foo", "baz"])
437 | expected1 = rf.DataFrame({"foo": [1, 2]})
438 | expected2 = rf.DataFrame({"foo": [1, 2], "baz": [5, 6]})
439 | self.assertEqual(result1, expected1)
440 | self.assertEqual(result2, expected2)
441 |
442 | def test_shuffle(self):
443 | df = rf.DataFrame({"foo": range(5), "bar": range(5, 10)})
444 | result = df.shuffle()
445 | self.assertNotEqual(df, result)
446 |
447 | def test_sort(self):
448 | df = rf.DataFrame({"foo": ["Z", "X", "A", "A"], "bar": [2, -2, 4, -4]})
449 | result1 = df.sort("bar")
450 | result2 = df.sort("bar", descending=True)
451 | result3 = df.sort(["foo", "bar"], descending=False)
452 | expected1 = rf.DataFrame({"foo": ["A", "X", "Z", "A"], "bar": [-4, -2, 2, 4]})
453 | expected2 = rf.DataFrame({"foo": ["A", "Z", "X", "A"], "bar": [4, 2, -2, -4]})
454 | expected3 = rf.DataFrame({"foo": ["A", "A", "X", "Z"], "bar": [-4, 4, -2, 2]})
455 | self.assertEqual(result1, expected1)
456 | self.assertEqual(result2, expected2)
457 | self.assertEqual(result3, expected3)
458 |
459 | def test_split(self):
460 | df = rf.DataFrame({"foo": ["A::1", "B::2", "C:3"]})
461 | result = df.split("foo", into=["foo", "bar"], sep="::", drop=True)
462 | expected = rf.DataFrame({"foo": ["A", "B", "C:3"], "bar": ["1", "2", None]})
463 | self.assertEqual(result, expected)
464 |
465 | def test_spread(self):
466 | df = rf.DataFrame(
467 | {"foo": ["A", "A", "A", "B", "B", "B", "B"], "bar": [1, 2, 3, 4, 5, 6, 7]}
468 | )
469 | result = df.spread("foo", using="bar")
470 | expected = rf.DataFrame({"A": [1.0, 2.0, 3.0, None], "B": [4.0, 5.0, 6.0, 7.0]})
471 | self.assertEqual(result, expected)
472 |
473 | def test_unpack(self):
474 | df = rf.DataFrame({"foo": [1, 2, 3, 4], "bar": ["A:B", "B:C:D", "D:E", "F"]})
475 | result = df.unpack("bar", sep=":")
476 | expected = rf.DataFrame(
477 | {
478 | "foo": [1, 1, 2, 2, 2, 3, 3, 4],
479 | "bar": ["A", "B", "B", "C", "D", "D", "E", "F"],
480 | }
481 | )
482 | self.assertEqual(result, expected)
483 |
--------------------------------------------------------------------------------
/tests/test_dupe_columns.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import redframes as rf
4 |
5 |
class TestDupeColumns(unittest.TestCase):
    """Verbs must warn when overwriting columns and raise on duplicate keys.

    Fixes: corrected typo'd test names (`unqiue` → `unique`, `exising` →
    `existing`) so the suite reads correctly; behavior of every test is
    unchanged (unittest discovers any `test_*` name).
    """

    def setUp(self):
        # one frame covering numeric, float-with-nix, categorical,
        # "a::b"-splittable, and sparse columns
        self.df = rf.DataFrame(
            {
                "foo": range(10),
                "bar": [1, 3.2, 4.5, 2, -1, 30, None, 1.1, 1.1, 9],
                "baz": ["A", "A", None, "B", "B", "A", "B", "C", "C", "A"],
                "jaz": [
                    "1::1",
                    "2::2",
                    "3:3",
                    "4::4",
                    "5::5",
                    "6::7",
                    "7::8",
                    "8::9",
                    "9::0",
                    "0::-1",
                ],
                "raz": [1, 2, 3, None, None, None, 9, 9, None, None],
            }
        )

    def test_accumulate_not_unique(self):
        # overwriting the source column itself is silently allowed
        self.df.accumulate("foo", into="foo")
        self.assertTrue(True)

    def test_accumulate_overwrite_existing(self):
        # overwriting a *different* existing column warns
        with self.assertWarnsRegex(UserWarning, "overwriting existing column *"):
            self.df.accumulate("foo", into="bar")

    def test_combine_into_overwrite(self):
        # combining into one of the sources is allowed when sources are dropped
        self.df.combine(["foo", "bar"], into="foo", sep="-")
        self.assertTrue(True)

    def test_combine_overwrite_existing(self):
        with self.assertWarnsRegex(UserWarning, "overwriting existing column *"):
            self.df.combine(["foo", "bar"], into="baz", sep="-")

    def test_combine_overwrite_no_drop(self):
        self.df.combine(["foo", "bar"], into="foo", sep="-", drop=False)
        self.assertTrue(True)

    def test_gather_same_column_names(self):
        with self.assertRaisesRegex(TypeError, "must be unique"):
            self.df.gather(into=("foo", "foo"))

    def test_gather_existing_column_name_for_variable(self):
        with self.assertRaisesRegex(ValueError, "must not be an existing column key"):
            self.df.gather(into=("foo", "value"))

    def test_gather_existing_column_name_for_value(self):
        with self.assertRaisesRegex(ValueError, "must not be an existing column key"):
            self.df.gather(into=("variable", "foo"))

    def test_gather_existing_column_key(self):
        with self.assertRaisesRegex(ValueError, "must not be an existing column key"):
            self.df.gather(["foo", "bar"], into=("raz", "baz"))

    def test_gather_group_into_conflict(self):
        with self.assertRaisesRegex(ValueError, "must not be an existing column key"):
            self.df.group("foo").gather(into=("foo", "bar"))

    def test_rank_into_overwrite(self):
        self.df.rank("bar", into="bar", descending=True)
        self.assertTrue(True)

    def test_rank_overwrite_existing(self):
        with self.assertWarnsRegex(UserWarning, "overwriting existing column *"):
            self.df.rank("bar", into="baz", descending=True)

    def test_rename_duplicated_dict_values(self):
        with self.assertRaisesRegex(KeyError, "columns must be unique"):
            self.df.rename({"foo": "oof", "bar": "oof"})

    def test_rollup_group_existing_column(self):
        with self.assertRaisesRegex(KeyError, "unable to overwrite group key"):
            self.df.group("baz").rollup({"baz": ("foo", rf.stat.max)})

    def test_select_duplicate_keys(self):
        with self.assertRaisesRegex(KeyError, "column keys must be unique"):
            self.df.select(["foo", "foo"])

    def test_split_overwrite_into_one(self):
        # reusing the split column's own name as one of the targets is fine
        self.df.split("jaz", into=["jaz", "paz"], sep="::")
        self.assertTrue(True)

    def test_split_overwrite_into_existing(self):
        with self.assertRaisesRegex(KeyError, "into keys must be unique"):
            self.df.split("jaz", into=["jaz", "foo"], sep="::")

    def test_split_duplicated_into_keys(self):
        with self.assertRaisesRegex(KeyError, "into keys must be unique"):
            self.df.split("jaz", into=["paz", "paz"], sep="::")

    def test_spread_duplicated_column_names(self):
        with self.assertRaisesRegex(KeyError, "column and using must be unique"):
            self.df.gather().spread("variable", "variable")
104 |
--------------------------------------------------------------------------------
/tests/test_index.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import pandas as pd
4 |
5 | import redframes as rf
6 |
7 |
def index_is_okay(df: "rf.DataFrame") -> bool:
    """Return True iff df's underlying pandas index is a pristine RangeIndex.

    "Pristine" means: unnamed, a pd.RangeIndex, starting at 0, stepping by 1 —
    i.e. what a freshly built redframes DataFrame should carry after any verb.

    Fixes:
    - `index.name == None` → `index.name is None` (PEP 8 / E711 identity check)
    - short-circuit: the original read `index.start`/`index.step`
      unconditionally, which raises AttributeError on a non-RangeIndex
      instead of returning False
    """
    index = df._data.index
    return (
        index.name is None
        and isinstance(index, pd.RangeIndex)
        and index.start == 0
        and index.step == 1
    )
15 |
16 |
class TestIndex(unittest.TestCase):
    """Every verb must hand back a frame with a pristine, zero-based RangeIndex."""

    def setUp(self):
        # a frame exercising numbers, missing values, strings, and "a::b" pairs
        self.df = rf.DataFrame(
            {
                "foo": range(10),
                "bar": [1, 3.2, 4.5, 2, -1, 30, None, 1.1, 1.1, 9],
                "baz": ["A", "A", None, "B", "B", "A", "B", "C", "C", "A"],
                "jaz": [
                    "1::1",
                    "2::2",
                    "3:3",
                    "4::4",
                    "5::5",
                    "6::7",
                    "7::8",
                    "8::9",
                    "9::0",
                    "0::-1",
                ],
                "raz": [1, 2, 3, None, None, None, 9, 9, None, None],
            }
        )

    def test_accumulate(self):
        out = self.df.accumulate("foo", into="foo")
        self.assertTrue(index_is_okay(out))

    def test_append(self):
        out = self.df.append(rf.DataFrame({"foo": [10]}))
        self.assertTrue(index_is_okay(out))

    def test_combine(self):
        out = self.df.combine(["foo", "bar"], into="foo", sep="-")
        self.assertTrue(index_is_okay(out))

    def test_cross(self):
        out = self.df.cross()
        self.assertTrue(index_is_okay(out))

    def test_dedupe(self):
        out = self.df.dedupe("baz")
        self.assertTrue(index_is_okay(out))

    def test_denix(self):
        out = self.df.denix()
        self.assertTrue(index_is_okay(out))

    def test_drop(self):
        out = self.df.drop("foo")
        self.assertTrue(index_is_okay(out))

    def test_fill(self):
        out = self.df.fill("baz", direction="down")
        self.assertTrue(index_is_okay(out))

    def test_filter(self):
        out = self.df.filter(lambda row: row["bar"] > 5)
        self.assertTrue(index_is_okay(out))

    def test_gather(self):
        out = self.df.gather()
        self.assertTrue(index_is_okay(out))

    def test_group(self):
        out = self.df.group("baz").rollup({"foo": ("foo", rf.stat.mean)})
        self.assertTrue(index_is_okay(out))

    def test_join(self):
        partner = rf.DataFrame({"baz": ["A", "B"], "haz": ["Apple", "Banana"]})
        out = self.df.join(partner, on="baz")
        self.assertTrue(index_is_okay(out))

    def test_mutate(self):
        out = self.df.mutate({"foo": lambda row: row["foo"] * 10})
        self.assertTrue(index_is_okay(out))

    def test_pack(self):
        out = self.df.pack("baz", sep="|")
        self.assertTrue(index_is_okay(out))

    def test_rank(self):
        out = self.df.rank("bar", into="bar_rank", descending=True)
        self.assertTrue(index_is_okay(out))

    def test_rename(self):
        out = self.df.rename({"foo": "oof"})
        self.assertTrue(index_is_okay(out))

    def test_replace(self):
        out = self.df.replace({"baz": {"B": "Banana"}})
        self.assertTrue(index_is_okay(out))

    def test_rollup(self):
        out = self.df.rollup({"bar_mean": ("bar", rf.stat.mean)})
        self.assertTrue(index_is_okay(out))

    def test_sample(self):
        out = self.df.sample(5)
        self.assertTrue(index_is_okay(out))

    def test_select(self):
        out = self.df.select(["foo", "bar"])
        self.assertTrue(index_is_okay(out))

    def test_shuffle(self):
        out = self.df.shuffle()
        self.assertTrue(index_is_okay(out))

    def test_sort(self):
        out = self.df.sort("bar", descending=True)
        self.assertTrue(index_is_okay(out))

    def test_split(self):
        out = self.df.split("jaz", into=["jaz_1", "jaz_2"], sep="::")
        self.assertTrue(index_is_okay(out))

    def test_spread(self):
        out = self.df.denix("baz").select(["baz", "foo"]).spread("baz", "foo")
        self.assertTrue(index_is_okay(out))

    def test_take(self):
        out = self.df.take(-3)
        self.assertTrue(index_is_okay(out))

    def test_unpack(self):
        out = self.df.unpack("jaz", sep="::")
        self.assertTrue(index_is_okay(out))
145 |
--------------------------------------------------------------------------------
/tests/test_interchange.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import pandas as pd
4 |
5 | import redframes as rf
6 |
7 |
class TestInterchange(unittest.TestCase):
    """redframes frames participate in the DataFrame interchange protocol."""

    def test_wrap_no_side_effect(self):
        """pd.api.interchange.from_dataframe consumes a redframes frame cleanly."""
        rdf = rf.DataFrame({"foo": [1, 2], "bar": [3, 4]})
        out = pd.api.interchange.from_dataframe(rdf)
        want = pd.DataFrame({"foo": [1, 2], "bar": [3, 4]})
        self.assertTrue(out.equals(want))
14 |
--------------------------------------------------------------------------------
/tests/test_io.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from pathlib import Path
3 | from shutil import rmtree as delete
4 | from tempfile import mkdtemp as make_temp_dir
5 |
6 | import pandas as pd
7 |
8 | import redframes as rf
9 |
10 |
class TestIO(unittest.TestCase):
    """save/load and wrap/unwrap: validation, side effects, round-trips."""

    def setUp(self):
        self.tempdir = make_temp_dir()
        self.df = rf.DataFrame({"foo": [1, 2], "bar": [3, 4]})
        self.pdf = pd.DataFrame({"foo": [1, 2], "bar": [3, 4]})
        self.path = str(Path(self.tempdir) / "example.csv")

    def tearDown(self):
        delete(self.tempdir)

    def test_load_missing_file(self):
        with self.assertRaises(FileNotFoundError):
            rf.load("test_missing_file.csv")

    def test_load_bad_format(self):
        with self.assertRaisesRegex(TypeError, "must end in .csv"):
            rf.load("test_bad_file_format.json")

    def test_save_bad_path_format(self):
        with self.assertRaisesRegex(TypeError, "must end in .csv"):
            rf.save(self.df, "example.json")

    def test_save_bad_type(self):
        with self.assertRaisesRegex(TypeError, "must be DataFrame"):
            rf.save(1, "example.json")

    def test_unwrap_bad_type(self):
        with self.assertRaisesRegex(TypeError, "must be DataFrame"):
            rf.unwrap(1)

    def test_wrap_bad_type(self):
        with self.assertRaisesRegex(TypeError, "must be DataFrame"):
            rf.wrap(1)

    def test_unwrap_wrong_direction(self):
        # unwrap expects a redframes frame, not a pandas one
        with self.assertRaisesRegex(TypeError, "must be DataFrame"):
            rf.unwrap(self.pdf)

    def test_wrap_wrong_direction(self):
        # wrap expects a pandas frame, not a redframes one
        with self.assertRaisesRegex(TypeError, "must be DataFrame"):
            rf.wrap(self.df)

    def test_unwrap_no_side_effect(self):
        # mutating the unwrapped pandas copy must not touch the original
        out = rf.unwrap(self.df)
        out.columns = ["oof", "rab"]
        self.assertEqual(self.df, rf.DataFrame({"foo": [1, 2], "bar": [3, 4]}))

    def test_wrap_no_side_effect(self):
        # renaming the wrapped copy must not touch the original pandas frame
        wrapped = rf.wrap(self.pdf)
        wrapped = wrapped.rename({"foo": "oof"})
        self.assertTrue(self.pdf.equals(pd.DataFrame({"foo": [1, 2], "bar": [3, 4]})))

    def test_round_trip_save_load(self):
        rf.save(self.df, self.path)
        out = rf.load(self.path)
        self.assertEqual(out, rf.DataFrame({"foo": [1, 2], "bar": [3, 4]}))

    def test_round_trip_unwrap_wrap(self):
        out = rf.wrap(rf.unwrap(self.df))
        self.assertEqual(out, rf.DataFrame({"foo": [1, 2], "bar": [3, 4]}))
76 |
--------------------------------------------------------------------------------
/tests/test_ladybugs.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import pandas as pd
4 |
5 | import redframes as rf
6 |
7 |
class TestLadyBugs(unittest.TestCase):
    """Regression tests for previously reported bugs ("ladybugs").

    Fixes: renamed typo'd `test_comine_…` → `test_combine_…`; removed the
    stray trailing `*` from two assertRaisesRegex patterns — in a regex the
    `*` quantified the preceding literal `e` ("incompatible*" matches
    "incompatibl", "incompatible", "incompatiblee", …), which was clearly
    unintended. `assertRaisesRegex` uses re.search, so the plain substring
    still matches the same messages.
    """

    def test_gather_spread_string_values(self):
        # spread after gather must round-trip string values (columns sort)
        df = rf.DataFrame(
            {"foo": ["A", "B", "C"], "bar": ["D", "E", "F"], "baz": ["G", "H", "I"]}
        )
        result = df.gather().spread("variable", "value")
        expected = rf.DataFrame(
            {"bar": ["D", "E", "F"], "baz": ["G", "H", "I"], "foo": ["A", "B", "C"]}
        )
        self.assertEqual(result, expected)

    def test_types_mixed_column(self):
        # a column mixing int/None/float/str reports dtype object
        df = rf.DataFrame({"foo": [1, None, 2.0, "3"]})
        result = df.types
        expected = {"foo": object}
        self.assertEqual(result, expected)

    def test_combine_overwrite_and_drop_other(self):
        # combining into one source while dropping must also drop the other
        df = rf.DataFrame({"foo": [1, 2, 3], "bar": [1, 2, 3]})
        result = df.combine(["foo", "bar"], into="foo", sep="-", drop=True)
        expected = rf.DataFrame({"foo": ["1-1", "2-2", "3-3"]})
        self.assertEqual(result, expected)

    def test_sample_float_1_point_0(self):
        # 1.0 is ambiguous (100% vs 1 row) and must be rejected
        df = rf.DataFrame({"foo": range(100)})
        with self.assertRaisesRegex(ValueError, "must be int if > 1"):
            df.sample(1.0)

    def test_sample_negative_1(self):
        df = rf.DataFrame({"foo": range(100)})
        with self.assertRaisesRegex(ValueError, "must be > 0"):
            df.sample(-1)

    def test_io_wrap_multi_columns(self):
        # redframes refuses hierarchical (MultiIndex) columns
        columns = pd.MultiIndex.from_arrays(
            [["route", "action", "action"], ["type", "source", "destination"]]
        )
        pdf = pd.DataFrame([[1, 2, 3]], columns=columns)
        with self.assertRaisesRegex(KeyError, "must be flat"):
            rf.wrap(pdf)

    def test_group_gather_beside_conflict(self):
        df = rf.DataFrame(
            {
                "foo": [1, 1, 1, 2, 2, 1, 3, 3],
                "bar": range(8),
                "baz": range(8),
                "jaz": range(8),
            }
        )
        with self.assertRaisesRegex(ValueError, "beside is incompatible"):
            df.group("foo").gather(beside="bar")

    def test_group_gather_columns_conflict(self):
        df = rf.DataFrame(
            {
                "foo": [1, 1, 1, 2, 2, 1, 3, 3],
                "bar": range(8),
                "baz": range(8),
                "jaz": range(8),
            }
        )
        with self.assertRaisesRegex(ValueError, "columns is incompatible"):
            df.group("foo").gather(columns=["foo", "bar"])
72 |
--------------------------------------------------------------------------------
/tests/test_readme.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from pathlib import Path
3 | from shutil import rmtree as delete
4 | from tempfile import mkdtemp as make_temp_dir
5 |
6 |
class TestReadme(unittest.TestCase):
    """Execute the README.md code samples verbatim so the docs never rot.

    The statements below intentionally mirror the README; most tests only
    assert that the sample runs without raising.
    """

    def setUp(self):
        # scratch directory for the io example
        self.tempdir = make_temp_dir()
        self.path = str(Path(self.tempdir) / "bears.csv")

    def tearDown(self):
        delete(self.tempdir)

    def test_quick_start(self):
        """README "Quick Start": the bear sexual-dimorphism pipeline."""
        import redframes as rf

        df = rf.DataFrame(
            {
                "bear": [
                    "Brown bear",
                    "Polar bear",
                    "Asian black bear",
                    "American black bear",
                    "Sun bear",
                    "Sloth bear",
                    "Spectacled bear",
                    "Giant panda",
                ],
                "genus": [
                    "Ursus",
                    "Ursus",
                    "Ursus",
                    "Ursus",
                    "Helarctos",
                    "Melursus",
                    "Tremarctos",
                    "Ailuropoda",
                ],
                "weight (male, lbs)": [
                    "300-860",
                    "880-1320",
                    "220-440",
                    "125-500",
                    "60-150",
                    "175-310",
                    "220-340",
                    "190-275",
                ],
                "weight (female, lbs)": [
                    "205-455",
                    "330-550",
                    "110-275",
                    "90-300",
                    "45-90",
                    "120-210",
                    "140-180",
                    "155-220",
                ],
            }
        )

        # rename the unwieldy weight columns, melt twice to get one row per
        # (genus, sex, stat), average weight by genus/sex, pivot sexes wide,
        # and compute the male/female dimorphism ratio:
        #   Ursus 2.01, Tremarctos 1.75, Helarctos 1.56,
        #   Melursus 1.47, Ailuropoda 1.24
        (
            df.rename({"weight (male, lbs)": "male", "weight (female, lbs)": "female"})
            .gather(["male", "female"], into=("sex", "weight"))
            .split("weight", into=["min", "max"], sep="-")
            .gather(["min", "max"], into=("stat", "weight"))
            .mutate({"weight": lambda row: float(row["weight"])})
            .group(["genus", "sex"])
            .rollup({"weight": ("weight", rf.stat.mean)})
            .spread("sex", using="weight")
            .mutate({"dimorphism": lambda row: round(row["male"] / row["female"], 2)})
            .drop(["male", "female"])
            .sort("dimorphism", descending=True)
        )

        # reaching here without raising is the whole test
        self.assertTrue(True)

    def test_pandas_comparison(self):
        """README comparison: the same pipeline written in raw pandas."""
        import pandas as pd

        df = pd.DataFrame(
            {
                "bear": [
                    "Brown bear",
                    "Polar bear",
                    "Asian black bear",
                    "American black bear",
                    "Sun bear",
                    "Sloth bear",
                    "Spectacled bear",
                    "Giant panda",
                ],
                "genus": [
                    "Ursus",
                    "Ursus",
                    "Ursus",
                    "Ursus",
                    "Helarctos",
                    "Melursus",
                    "Tremarctos",
                    "Ailuropoda",
                ],
                "weight (male, lbs)": [
                    "300-860",
                    "880-1320",
                    "220-440",
                    "125-500",
                    "60-150",
                    "175-310",
                    "220-340",
                    "190-275",
                ],
                "weight (female, lbs)": [
                    "205-455",
                    "330-550",
                    "110-275",
                    "90-300",
                    "45-90",
                    "120-210",
                    "140-180",
                    "155-220",
                ],
            }
        )

        df = df.rename(
            columns={"weight (male, lbs)": "male", "weight (female, lbs)": "female"}
        )
        df = pd.melt(
            df,
            id_vars=["bear", "genus"],
            value_vars=["male", "female"],
            var_name="sex",
            value_name="weight",
        )
        df[["min", "max"]] = df["weight"].str.split("-", expand=True)
        df = df.drop("weight", axis=1)
        df = pd.melt(
            df,
            id_vars=["bear", "genus", "sex"],
            value_vars=["min", "max"],
            var_name="stat",
            value_name="weight",
        )
        df["weight"] = df["weight"].astype("float")
        df = df.groupby(["genus", "sex"])["weight"].mean()
        df = df.reset_index()
        df = pd.pivot_table(df, index=["genus"], columns=["sex"], values="weight")
        df = df.reset_index()
        df = df.rename_axis(None, axis=1)
        df["dimorphism"] = round(df["male"] / df["female"], 2)
        df = df.drop(["female", "male"], axis=1)
        df = df.sort_values("dimorphism", ascending=False)
        df = df.reset_index(drop=True)

        self.assertTrue(True)

    def test_io(self):
        """README io example: save/load csv and wrap/unwrap pandas."""
        import redframes as rf

        df = rf.DataFrame({"foo": [1, 2], "bar": ["A", "B"]})

        # save .csv
        rf.save(df, self.path)

        # load .csv
        df = rf.load(self.path)

        # convert redframes → pandas
        pandas_df = rf.unwrap(df)

        # convert pandas → redframes
        df = rf.wrap(pandas_df)

        self.assertTrue(True)

    def test_properties(self):
        """README properties example: every documented accessor resolves."""
        import redframes as rf

        df = rf.DataFrame({"genus": [1]})

        df["genus"]      # column access
        df.columns       # list of column names
        df.dimensions    # {'rows': ..., 'columns': ...}
        df.empty         # bool
        df.memory        # human-readable size, e.g. '2 KB'
        df.types         # {column: dtype}

        self.assertTrue(True)

    def test_matplotlib(self):
        """README matplotlib example: frame columns feed plt.barh directly."""
        import matplotlib.pyplot as plt

        import redframes as rf

        football = rf.DataFrame(
            {
                "position": ["TE", "K", "RB", "WR", "QB"],
                "avp": [116.98, 131.15, 180, 222.22, 272.91],
            }
        )

        df = football.mutate(
            {"color": lambda row: row["position"] in ["WR", "RB"]}
        ).replace({"color": {False: "orange", True: "red"}})

        plt.barh(df["position"], df["avp"], color=df["color"])

        self.assertTrue(True)

    def test_sklearn(self):
        """README sklearn example: frames drop into train_test_split/fit."""
        from sklearn.linear_model import LinearRegression
        from sklearn.model_selection import train_test_split

        import redframes as rf

        df = rf.DataFrame(
            {
                "touchdowns": [15, 19, 5, 7, 9, 10, 12, 22, 16, 10],
                "age": [21, 22, 21, 24, 26, 28, 30, 35, 28, 21],
                "mvp": [1, 1, 0, 0, 0, 0, 0, 1, 0, 0],
            }
        )

        target = "touchdowns"
        y = df[target]
        X = df.drop(target)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=1
        )

        model = LinearRegression()
        model.fit(X_train, y_train)
        model.score(X_test, y_test)

        X_new = rf.DataFrame({"age": [22], "mvp": [1]})
        model.predict(X_new)

        self.assertTrue(True)
276 |
--------------------------------------------------------------------------------
/tests/test_side_effects.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import redframes as rf
4 |
5 |
class TestSideEffects(unittest.TestCase):
    """Verify that every DataFrame verb is side-effect free.

    Each test invokes one verb, discards the returned frame, and then
    asserts that the original ``self.df`` (and any other DataFrame passed
    as an argument) is still byte-for-byte equal to a pristine copy built
    in ``setUp`` — i.e. no verb mutates its inputs in place.
    """

    def setUp(self):
        # `df` is the frame each verb operates on; `expected` is an
        # independent, identical copy used to detect in-place mutation.
        self.df = rf.DataFrame(
            {
                "foo": range(10),
                "bar": [1, 3.2, 4.5, 2, -1, 30, None, 1.1, 1.1, 9],
                "baz": ["A", "A", None, "B", "B", "A", "B", "C", "C", "A"],
                "jaz": [
                    "1::1",
                    "2::2",
                    "3:3",
                    "4::4",
                    "5::5",
                    "6::7",
                    "7::8",
                    "8::9",
                    "9::0",
                    "0::-1",
                ],
                "raz": [1, 2, 3, None, None, None, 9, 9, None, None],
            }
        )
        self.expected = rf.DataFrame(
            {
                "foo": range(10),
                "bar": [1, 3.2, 4.5, 2, -1, 30, None, 1.1, 1.1, 9],
                "baz": ["A", "A", None, "B", "B", "A", "B", "C", "C", "A"],
                "jaz": [
                    "1::1",
                    "2::2",
                    "3:3",
                    "4::4",
                    "5::5",
                    "6::7",
                    "7::8",
                    "8::9",
                    "9::0",
                    "0::-1",
                ],
                "raz": [1, 2, 3, None, None, None, 9, 9, None, None],
            }
        )

    def test_accumulate(self):
        _ = self.df.accumulate("foo", into="foo")
        self.assertEqual(self.df, self.expected)

    def test_append(self):
        # Both the left (self.df) and right (df_bottom) frames must
        # survive the append untouched.
        df_bottom = rf.DataFrame({"foo": [10]})
        df_bottom_expected = rf.DataFrame({"foo": [10]})
        _ = self.df.append(df_bottom)
        self.assertEqual(self.df, self.expected)
        self.assertEqual(df_bottom, df_bottom_expected)

    def test_combine(self):
        _ = self.df.combine(["foo", "bar"], into="foo", sep="-")
        self.assertEqual(self.df, self.expected)

    def test_cross(self):
        _ = self.df.cross(postfix=("_a", "_b"))
        self.assertEqual(self.df, self.expected)

    def test_dedupe(self):
        _ = self.df.dedupe("baz")
        self.assertEqual(self.df, self.expected)

    def test_denix(self):
        _ = self.df.denix()
        self.assertEqual(self.df, self.expected)

    def test_drop(self):
        _ = self.df.drop("foo")
        self.assertEqual(self.df, self.expected)

    def test_fill(self):
        _ = self.df.fill("baz", direction="down")
        self.assertEqual(self.df, self.expected)

    def test_filter(self):
        _ = self.df.filter(lambda row: row["bar"] > 5)
        self.assertEqual(self.df, self.expected)

    def test_gather(self):
        _ = self.df.gather()
        self.assertEqual(self.df, self.expected)

    def test_group(self):
        # group() alone returns a lazy group object; chain rollup() so the
        # grouped computation actually executes before checking for mutation.
        _ = self.df.group("baz").rollup({"foo": ("foo", rf.stat.mean)})
        self.assertEqual(self.df, self.expected)

    def test_join(self):
        # Both sides of the join must remain unmodified.
        df_right = rf.DataFrame({"baz": ["A", "B"], "haz": ["Apple", "Banana"]})
        df_right_expected = rf.DataFrame(
            {"baz": ["A", "B"], "haz": ["Apple", "Banana"]}
        )
        _ = self.df.join(df_right, on="baz")
        self.assertEqual(self.df, self.expected)
        self.assertEqual(df_right, df_right_expected)

    def test_mutate(self):
        _ = self.df.mutate({"foo": lambda row: row["foo"] * 10})
        self.assertEqual(self.df, self.expected)

    def test_pack(self):
        _ = self.df.pack("baz", sep="|")
        self.assertEqual(self.df, self.expected)

    def test_rank(self):
        _ = self.df.rank("bar", into="bar_rank", descending=True)
        self.assertEqual(self.df, self.expected)

    def test_rename(self):
        _ = self.df.rename({"foo": "oof"})
        self.assertEqual(self.df, self.expected)

    def test_replace(self):
        _ = self.df.replace({"baz": {"B": "Banana"}})
        self.assertEqual(self.df, self.expected)

    def test_rollup(self):
        _ = self.df.rollup({"bar_mean": ("bar", rf.stat.mean)})
        self.assertEqual(self.df, self.expected)

    def test_sample(self):
        _ = self.df.sample(5)
        self.assertEqual(self.df, self.expected)

    def test_select(self):
        _ = self.df.select(["foo", "bar"])
        self.assertEqual(self.df, self.expected)

    def test_shuffle(self):
        _ = self.df.shuffle()
        self.assertEqual(self.df, self.expected)

    def test_sort(self):
        _ = self.df.sort("bar", descending=True)
        self.assertEqual(self.df, self.expected)

    def test_split(self):
        _ = self.df.split("jaz", into=["jaz_1", "jaz_2"], sep="::")
        self.assertEqual(self.df, self.expected)

    def test_spread(self):
        _ = self.df.denix("baz").select(["baz", "foo"]).spread("baz", "foo")
        self.assertEqual(self.df, self.expected)

    def test_take(self):
        _ = self.df.take(-3)
        self.assertEqual(self.df, self.expected)

    # BUG FIX: this method was previously also named `test_take`, which
    # shadowed the real take() test above so it never ran. Renamed to
    # match the verb it actually exercises.
    def test_unpack(self):
        _ = self.df.unpack("jaz", sep="::")
        self.assertEqual(self.df, self.expected)
--------------------------------------------------------------------------------
/tests/test_type_hints.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import redframes as rf
4 |
5 |
class TestTypeHints(unittest.TestCase):
    """Verify that verbs and I/O helpers reject arguments of the wrong type.

    Each test passes a deliberately mistyped argument and asserts that the
    raised TypeError/ValueError message matches an expected pattern.

    NOTE(review): the patterns below are regexes, so an unescaped ``|``
    (e.g. in "must be list | str | None") is alternation — the assertion
    passes if *either* side matches, which is looser than it looks.
    Escaping (``r"must be list \\| str \\| None"``) would pin the exact
    message; left as-is to avoid changing what the tests accept — confirm
    against the actual error messages before tightening.
    """

    def setUp(self):
        # A representative frame with int, float, str, delimited-str, and
        # nullable columns, reused by every verb test below.
        self.df = rf.DataFrame(
            {
                "foo": range(10),
                "bar": [1, 3.2, 4.5, 2, -1, 30, None, 1.1, 1.1, 9],
                "baz": ["A", "A", None, "B", "B", "A", "B", "C", "C", "A"],
                "jaz": [
                    "1::1",
                    "2::2",
                    "3:3",
                    "4::4",
                    "5::5",
                    "6::7",
                    "7::8",
                    "8::9",
                    "9::0",
                    "0::-1",
                ],
                "raz": [1, 2, 3, None, None, None, 9, 9, None, None],
            }
        )

    def test_io_load_bad_path(self):
        with self.assertRaisesRegex(TypeError, "must be str"):
            rf.load(1)

    def test_io_load_bad_file_type(self):
        with self.assertRaisesRegex(TypeError, "must end in .csv"):
            rf.load("example.json")

    def test_io_save_bad_object(self):
        with self.assertRaisesRegex(TypeError, "must be DataFrame"):
            rf.save(1, "example.csv")

    def test_io_save_bad_path(self):
        with self.assertRaisesRegex(TypeError, "must be str"):
            rf.save(self.df, 1)

    def test_io_save_bad_format(self):
        with self.assertRaisesRegex(TypeError, "must end in .csv"):
            rf.save(self.df, "example.json")

    def test_io_unwrap_bad_object(self):
        with self.assertRaisesRegex(TypeError, "must be DataFrame"):
            rf.unwrap(1)

    def test_io_wrap_bad_object(self):
        # BUG FIX: this test previously called rf.unwrap(1) (copy-paste
        # from the test above), so rf.wrap was never exercised.
        with self.assertRaisesRegex(TypeError, "must be DataFrame"):
            rf.wrap(1)

    def test_take_bad_rows(self):
        with self.assertRaisesRegex(TypeError, "must be int"):
            self.df.take("A")

    def test_accumulate_bad_column(self):
        with self.assertRaisesRegex(TypeError, "must be str"):
            self.df.accumulate(1, "foo")

    def test_accumulate_bad_into_column(self):
        with self.assertRaisesRegex(TypeError, "must be str"):
            self.df.accumulate("foo", 1)

    def test_rank_bad_column(self):
        with self.assertRaisesRegex(TypeError, "must be str"):
            self.df.rank(1, "bar2")

    def test_rank_bad_into_column(self):
        with self.assertRaisesRegex(TypeError, "must be str"):
            self.df.rank("bar", 1)

    def test_rank_bad_descending_argument(self):
        with self.assertRaisesRegex(TypeError, "must be bool"):
            self.df.rank("bar", "bar", descending="bar")

    def test_rollup_bad_over(self):
        with self.assertRaisesRegex(TypeError, "must be dict"):
            self.df.rollup(1)

    def test_rollup_bad_over_values(self):
        # Values must be (column, func) tuples; only the exception type is
        # asserted here since the message is implementation-defined.
        with self.assertRaises(TypeError):
            self.df.rollup({"bar_mean": 1})

    def test_init_bad_data(self):
        with self.assertRaisesRegex(TypeError, "must be dict | None"):
            rf.DataFrame(1)

    def test_eq_bad_rhs_object(self):
        # Comparing against a non-DataFrame should return False, not raise.
        self.assertFalse(self.df == 1)

    def test_getitem_bad_key(self):
        pass  # placeholder: __getitem__ key validation not yet covered

    def test_append_bad_other(self):
        with self.assertRaisesRegex(TypeError, "must be DataFrame"):
            self.df.append(1)

    def test_combine_bad_columns(self):
        with self.assertRaisesRegex(TypeError, "must be list"):
            self.df.combine(1, "foo", sep="-")

    def test_combine_bad_into_column(self):
        with self.assertRaisesRegex(TypeError, "must be str"):
            self.df.combine(["foo", "bar"], 1, sep="-")

    def test_combine_bad_sep_argument(self):
        with self.assertRaisesRegex(TypeError, "must be str"):
            self.df.combine(["foo", "bar"], "foo", sep=1)

    def test_combine_bad_drop_argument(self):
        with self.assertRaisesRegex(TypeError, "must be bool"):
            self.df.combine(["foo", "bar"], "foo", sep=":::", drop="A")

    def test_dedupe_bad_columns(self):
        with self.assertRaisesRegex(TypeError, "must be list | str | None"):
            self.df.dedupe(1)

    def test_denix_bad_columns(self):
        with self.assertRaisesRegex(TypeError, "must be list | str | None"):
            self.df.denix(1)

    def test_drop_bad_columns(self):
        with self.assertRaisesRegex(TypeError, "must be list | str | None"):
            self.df.drop(1)

    def test_fill_bad_columns(self):
        with self.assertRaisesRegex(TypeError, "must be list | str | None"):
            self.df.fill(1)

    def test_fill_bad_direction(self):
        with self.assertRaisesRegex(ValueError, "must be one of {'down', 'up'}"):
            self.df.fill("bar", direction="sideways")

    def test_fill_bad_constant_and_direction(self):
        with self.assertRaisesRegex(
            ValueError, "either direction OR constant must not be None"
        ):
            self.df.fill("bar")

    def test_fill_bad_no_constant_nor_direction(self):
        with self.assertRaisesRegex(
            ValueError, "either direction OR constant must be None"
        ):
            self.df.fill("bar", direction="down", constant="X")

    def test_filter_bad_func(self):
        with self.assertRaisesRegex(TypeError, "must be Func"):
            self.df.filter(1)

    def test_gather_bad_columns(self):
        with self.assertRaisesRegex(TypeError, "must be list | None"):
            self.df.gather(1)

    def test_gather_bad_beside(self):
        with self.assertRaisesRegex(TypeError, "must be str | list | None"):
            self.df.gather(beside=1)

    def test_gather_bad_into_column(self):
        with self.assertRaisesRegex(TypeError, "must be tuple"):
            self.df.gather(["foo", "bar"], into=1)

    def test_gather_bad_into_tuple(self):
        # `into` must be a 2-tuple; a 3-tuple should be rejected.
        with self.assertRaisesRegex(TypeError, "must be tuple*"):
            self.df.gather(into=("one", "two", "three"))

    def test_gather_bad_both_not_none(self):
        with self.assertRaisesRegex(ValueError, "columns OR beside must be None"):
            self.df.gather(columns=["foo", "bar"], beside=["baz"])

    def test_group_bad_by_columns(self):
        with self.assertRaisesRegex(TypeError, "must be list | str"):
            self.df.group(1)

    def test_join_bad_rhs_object(self):
        with self.assertRaisesRegex(TypeError, "must be DataFrame"):
            self.df.join(1, on="baz")

    def test_join_bad_on_type(self):
        rhs = rf.DataFrame()
        with self.assertRaisesRegex(TypeError, "must be list | str"):
            self.df.join(rhs, on=1)

    def test_join_bad_how_argument(self):
        rhs = rf.DataFrame()
        # NOTE(review): the library's message says "on argument" even though
        # the invalid argument is `how` — pattern mirrors the actual message.
        message = (
            "on argument is invalid, must be one of {'left', 'right', 'inner', 'full'}"
        )
        with self.assertRaisesRegex(ValueError, message):
            self.df.join(rhs, on="baz", how="inside")

    def test_mutate_bad_over(self):
        with self.assertRaisesRegex(TypeError, "must be dict"):
            self.df.mutate(1)

    def test_pack_bad_column(self):
        with self.assertRaisesRegex(TypeError, "must be str"):
            self.df.pack(1, sep="|")

    def test_pack_bad_sep(self):
        with self.assertRaisesRegex(TypeError, "must be str"):
            self.df.pack("baz", sep=1)

    def test_rename_bad_columns(self):
        with self.assertRaisesRegex(TypeError, "must be dict"):
            self.df.rename(1)

    def test_rename_bad_columns_values(self):
        with self.assertRaisesRegex(TypeError, "must be str"):
            self.df.rename({"foo": 1})

    def test_replace_bad_over(self):
        with self.assertRaisesRegex(TypeError, "must be dict"):
            self.df.replace(1)

    def test_sample_bad_rows(self):
        with self.assertRaisesRegex(TypeError, "must be int | float"):
            self.df.sample("A")

    def test_select_bad_columns(self):
        with self.assertRaisesRegex(TypeError, "must be list | str"):
            self.df.select(1)

    def test_shuffle(self):
        pass  # placeholder: shuffle takes no typed arguments to validate

    def test_sort_bad_columns(self):
        with self.assertRaisesRegex(TypeError, "must be list | str"):
            self.df.sort(1)

    def test_sort_bad_descending_argument(self):
        with self.assertRaisesRegex(TypeError, "must be bool"):
            self.df.sort("bar", descending="A")

    def test_split_bad_column(self):
        with self.assertRaisesRegex(TypeError, "must be str"):
            self.df.split(1, into=["jaz1", "jaz2"], sep="::")

    def test_split_bad_into_column(self):
        with self.assertRaisesRegex(TypeError, "must be list"):
            self.df.split("jaz", into=1, sep="::")

    def test_split_bad_sep_argument(self):
        with self.assertRaisesRegex(TypeError, "must be str"):
            self.df.split("jaz", into=["jaz1", "jaz2"], sep=1)

    def test_split_bad_drop_argument(self):
        with self.assertRaisesRegex(TypeError, "must be bool"):
            self.df.split("jaz", into=["jaz1", "jaz2"], sep="::", drop="A")

    def test_spread_bad_column(self):
        with self.assertRaisesRegex(TypeError, "must be str"):
            self.df.spread(1, using="bar")

    def test_spread_bad_using_column(self):
        with self.assertRaisesRegex(TypeError, "must be str"):
            self.df.spread("foo", using=1)

    def test_unpack_bad_column(self):
        with self.assertRaisesRegex(TypeError, "must be str"):
            self.df.unpack(1, sep="|")

    def test_unpack_bad_sep(self):
        with self.assertRaisesRegex(TypeError, "must be str"):
            self.df.unpack("jaz", sep=1)
--------------------------------------------------------------------------------