├── .github └── FUNDING.yml ├── .gitignore ├── LICENSE ├── README.md ├── gifs ├── p_describe.gif └── ram_usage.png ├── parallel_pandas ├── __init__.py ├── core │ ├── __init__.py │ ├── _numba.py │ ├── parallel_dataframe.py │ ├── parallel_groupby.py │ ├── parallel_series.py │ ├── parallel_window.py │ ├── progress_imap.py │ └── tools.py └── main.py ├── requirements.txt ├── setup.cfg └── setup.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [dubovikmaster] 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | # Byte-compiled / optimized / DLL files 3 | pycache/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 96 | pypackages/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Pavel Dubovik 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Parallel-pandas 2 | 3 | [![PyPI version fury.io](https://badge.fury.io/py/parallel-pandas.svg)](https://pypi.org/project/parallel-pandas/) 4 | [![PyPI license](https://img.shields.io/pypi/l/parallel-pandas.svg)](https://pypi.org/project/parallel-pandas/) 5 | [![PyPI download month](https://img.shields.io/pypi/dm/parallel-pandas.svg)](https://pypi.org/project/parallel-pandas/) 6 | 7 | 8 | Makes it easy to parallelize your calculations in pandas on all your CPUs. 9 | 10 | ## Installation 11 | 12 | ```python 13 | pip install --upgrade parallel-pandas 14 | ``` 15 | 16 | ## Quickstart 17 | ```python 18 | import pandas as pd 19 | import numpy as np 20 | from parallel_pandas import ParallelPandas 21 | 22 | #initialize parallel-pandas 23 | ParallelPandas.initialize(n_cpu=16, split_factor=4, disable_pr_bar=True) 24 | 25 | # create big DataFrame 26 | df = pd.DataFrame(np.random.random((1_000_000, 100))) 27 | 28 | # calculate multiple quantiles. Pandas only uses one core of CPU 29 | %%timeit 30 | res = df.quantile(q=[.25, .5, .95], axis=1) 31 | ``` 32 | `3.66 s ± 31.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)` 33 | ```python 34 | #p_quantile is parallel analogue of quantile methods. Can use all cores of your CPU. 35 | %%timeit 36 | res = df.p_quantile(q=[.25, .5, .95], axis=1) 37 | ``` 38 | `679 ms ± 10.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)` 39 | 40 | As you can see the `p_quantile` method is **5 times faster**! 41 | 42 | ## Usage 43 | 44 | Under the hood, **parallel-pandas** works very simply. The Dataframe or Series is split into chunks along the first or second axis. Then these chunks are passed to a pool of processes or threads where the desired method is executed on each part. At the end, the parts are concatenated to get the final result. 
45 | 46 | 47 | When initializing parallel-pandas you can specify the following options: 48 | 1. `n_cpu` - the number of cores of your CPU that you want to use (default `None` - use all cores of CPU) 49 | 2. `split_factor` - Affects the number of chunks into which the DataFrame/Series is split according to the formula `chunks_number = split_factor*n_cpu` (default 1). 50 | 3. `show_vmem` - Shows a progress bar with available RAM (default `False`) 51 | 4. `disable_pr_bar` - Disable the progress bar for parallel tasks (default `False`) 52 | 53 | For example 54 | 55 | ```python 56 | import pandas as pd 57 | import numpy as np 58 | from parallel_pandas import ParallelPandas 59 | 60 | #initialize parallel-pandas 61 | ParallelPandas.initialize(n_cpu=16, split_factor=4, disable_pr_bar=False) 62 | 63 | # create big DataFrame 64 | df = pd.DataFrame(np.random.random((1_000_000, 100))) 65 | ``` 66 | ![](https://raw.githubusercontent.com/dubovikmaster/parallel-pandas/master/gifs/p_describe.gif) 67 | 68 | During initialization, we specified `split_factor=4` and `n_cpu = 16`, so the DataFrame will be split into 64 chunks (in the case of the `describe` method, axis = 1) and the progress bar shows the progress for each chunk 69 | 70 | You can parallelize any expression with pandas Dataframe. For example, let's do a z-score normalization of columns in a dataframe. Let's look at the execution time and memory consumption. 
Compare with synchronous execution and with Dask.DataFrame 71 | ```python 72 | import pandas as pd 73 | import numpy as np 74 | from parallel_pandas import ParallelPandas 75 | import dask.dataframe as dd 76 | from time import monotonic 77 | 78 | #initialize parallel-pandas 79 | ParallelPandas.initialize(n_cpu=16, split_factor=8, disable_pr_bar=True) 80 | 81 | # create big DataFrame 82 | df = pd.DataFrame(np.random.random((1_000_000, 1000))) 83 | 84 | # create dask DataFrame 85 | ddf = dd.from_pandas(df, npartitions=128) 86 | 87 | start = monotonic() 88 | res=(df-df.mean())/df.std() 89 | print(f'synchronous z-score normalization time took: {monotonic()-start:.1f} s.') 90 | ``` 91 | ```python 92 | synchronous z-score normalization time took: 21.7 s. 93 | ``` 94 | ```python 95 | #parallel-pandas 96 | start = monotonic() 97 | res=(df-df.p_mean())/df.p_std() 98 | print(f'parallel z-score normalization time took: {monotonic()-start:.1f} s.') 99 | ``` 100 | ```python 101 | parallel z-score normalization time took: 11.7 s. 102 | ``` 103 | ```python 104 | #dask dataframe 105 | start = monotonic() 106 | res=((ddf-ddf.mean())/ddf.std()).compute() 107 | print(f'dask parallel z-score normalization time took: {monotonic()-start:.1f} s.') 108 | ``` 109 | ```python 110 | dask parallel z-score normalization time took: 12.5 s. 111 | ``` 112 | 113 | Pay attention to memory consumption. 
`parallel-pandas` and `dask` use almost half as much RAM as `pandas` 114 | 115 | ![](https://raw.githubusercontent.com/dubovikmaster/parallel-pandas/master/gifs/ram_usage.png) 116 | 117 | For some methods `parallel-pandas` is faster than `dask.DataFrame`: 118 | ```python 119 | #dask 120 | %%time 121 | res = ddf.nunique().compute() 122 | Wall time: 42.9 s 123 | 124 | %%time 125 | res = ddf.rolling(10).mean().compute() 126 | Wall time: 19.1 s 127 | 128 | #parallel-pandas 129 | %%time 130 | res = df.p_nunique() 131 | Wall time: 12.9 s 132 | 133 | %%time 134 | res = df.rolling(10).p_mean() 135 | Wall time: 12.5 s 136 | ``` 137 | 138 | ## API 139 | 140 | ### Parallel counterparts for pandas Series methods 141 | 142 | | methods | parallel analogue | executor | 143 | |-------------------|---------------------|----------------------| 144 | | pd.Series.apply() | pd.Series.p_apply() | threads / processes | 145 | | pd.Series.map() | pd.Series.p_map() | threads / processes | 146 | 147 | 148 | ### Parallel counterparts for pandas SeriesGroupBy methods 149 | 150 | | methods | parallel analogue | executor | 151 | |--------------------------|----------------------------|-------------------------| 152 | | pd.SeriesGroupBy.apply() | pd.SeriesGroupBy.p_apply() | threads / processes | 153 | 154 | ### Parallel counterparts for pandas DataFrame methods 155 | 156 | | methods | parallel analogue | executor | 157 | |----------------|-------------------|---------------------| 158 | | df.mean() | df.p_mean() | threads | 159 | | df.min() | df.p_min() | threads | 160 | | df.max() | df.p_max() | threads | 161 | | df.median() | df.p_median() | threads | 162 | | df.kurt() | df.p_kurt() | threads | 163 | | df.skew() | df.p_skew() | threads | 164 | | df.sum() | df.p_sum() | threads | 165 | | df.prod() | df.p_prod() | threads | 166 | | df.var() | df.p_var() | threads | 167 | | df.sem() | df.p_sem() | threads | 168 | | df.std() | df.p_std() | threads | 169 | | df.cummin() | df.p_cummin() | threads | 170 
| | df.cumsum() | df.p_cumsum() | threads | 171 | | df.cummax() | df.p_cummax() | threads | 172 | | df.cumprod() | df.p_cumprod() | threads | 173 | | df.apply() | df.p_apply() | threads / processes | 174 | | df.applymap() | df.p_applymap() | processes | 175 | | df.replace() | df.p_replace() | threads | 176 | | df.describe() | df.p_describe() | threads | 177 | | df.nunique() | df.p_nunique() | threads / processes | 178 | | df.mad() | df.p_mad() | threads | 179 | | df.idxmin() | df.p_idxmin() | threads | 180 | | df.idxmax() | df.p_idxmax() | threads | 181 | | df.rank() | df.p_rank() | threads | 182 | | df.mode() | df.p_mode() | threads/processes | 183 | | df.agg() | df.p_agg() | threads/processes | 184 | | df.aggregate() | df.p_aggregate() | threads/processes | 185 | | df.quantile() | df.p_quantile() | threads/processes | 186 | | df.corr() | df.p_corr() | threads/processes | 187 | 188 | ### Parallel counterparts for pandas DataframeGroupBy methods 189 | 190 | | methods | parallel analogue | executor | 191 | |--------------------------|----------------------------|----------------------| 192 | | DataFrameGroupBy.apply() | DataFrameGroupBy.p_apply() | threads / processes | 193 | 194 | ### Parallel counterparts for pandas window methods 195 | 196 | #### Rolling 197 | 198 | | methods | parallel analogue | executor | 199 | |------------------------------------|--------------------------------------|---------------------| 200 | | pd.core.window.Rolling.apply() | pd.core.window.Rolling.p_apply() | threads / processes | 201 | | pd.core.window.Rolling.min() | pd.core.window.Rolling.p_min() | threads / processes | 202 | | pd.core.window.Rolling.max() | pd.core.window.Rolling.p_max() | threads / processes | 203 | | pd.core.window.Rolling.mean() | pd.core.window.Rolling.p_mean() | threads / processes | 204 | | pd.core.window.Rolling.sum() | pd.core.window.Rolling.p_sum() | threads / processes | 205 | | pd.core.window.Rolling.var() | pd.core.window.Rolling.p_var() | threads / 
processes | 206 | | pd.core.window.Rolling.sem() | pd.core.window.Rolling.p_sem() | threads / processes | 207 | | pd.core.window.Rolling.skew() | pd.core.window.Rolling.p_skew() | threads / processes | 208 | | pd.core.window.Rolling.kurt() | pd.core.window.Rolling.p_kurt() | threads / processes | 209 | | pd.core.window.Rolling.median() | pd.core.window.Rolling.p_median() | threads / processes | 210 | | pd.core.window.Rolling.quantile() | pd.core.window.Rolling.p_quantile() | threads / processes | 211 | | pd.core.window.Rolling.rank() | pd.core.window.Rolling.p_rank() | threads / processes | 212 | | pd.core.window.Rolling.agg() | pd.core.window.Rolling.p_agg() | threads / processes | 213 | | pd.core.window.Rolling.aggregate() | pd.core.window.Rolling.p_aggregate() | threads / processes | 214 | 215 | 216 | #### Window 217 | 218 | | methods | parallel analogue | executor | 219 | |-----------------------------------|-------------------------------------|---------------------| 220 | | pd.core.window.Window.mean() | pd.core.window.Window.p_mean() | threads / processes | 221 | | pd.core.window.Window.sum() | pd.core.window.Window.p_sum() | threads / processes | 222 | | pd.core.window.Window.var() | pd.core.window.Window.p_var() | threads / processes | 223 | | pd.core.window.Window.std() | pd.core.window.Window.p_std() | threads / processes | 224 | 225 | 226 | #### RollingGroupby 227 | 228 | | methods | parallel analogue | executor | 229 | |-------------------------------------------|---------------------------------------------|---------------------| 230 | | pd.core.window.RollingGroupby.apply() | pd.core.window.RollingGroupby.p_apply() | threads / processes | 231 | | pd.core.window.RollingGroupby.min() | pd.core.window.RollingGroupby.p_min() | threads / processes | 232 | | pd.core.window.RollingGroupby.max() | pd.core.window.RollingGroupby.p_max() | threads / processes | 233 | | pd.core.window.RollingGroupby.mean() | pd.core.window.RollingGroupby.p_mean() | threads / 
processes | 234 | | pd.core.window.RollingGroupby.sum() | pd.core.window.RollingGroupby.p_sum() | threads / processes | 235 | | pd.core.window.RollingGroupby.var() | pd.core.window.RollingGroupby.p_var() | threads / processes | 236 | | pd.core.window.RollingGroupby.sem() | pd.core.window.RollingGroupby.p_sem() | threads / processes | 237 | | pd.core.window.RollingGroupby.skew() | pd.core.window.RollingGroupby.p_skew() | threads / processes | 238 | | pd.core.window.RollingGroupby.kurt() | pd.core.window.RollingGroupby.p_kurt() | threads / processes | 239 | | pd.core.window.RollingGroupby.median() | pd.core.window.RollingGroupby.p_median() | threads / processes | 240 | | pd.core.window.RollingGroupby.quantile() | pd.core.window.RollingGroupby.p_quantile() | threads / processes | 241 | | pd.core.window.RollingGroupby.rank() | pd.core.window.RollingGroupby.p_rank() | threads / processes | 242 | | pd.core.window.RollingGroupby.agg() | pd.core.window.RollingGroupby.p_agg() | threads / processes | 243 | | pd.core.window.RollingGroupby.aggregate() | pd.core.window.RollingGroupby.p_aggregate() | threads / processes | 244 | 245 | #### Expanding 246 | 247 | | methods | parallel analogue | executor | 248 | |--------------------------------------|----------------------------------------|---------------------| 249 | | pd.core.window.Expanding.apply() | pd.core.window.Expanding.p_apply() | threads / processes | 250 | | pd.core.window.Expanding.min() | pd.core.window.Expanding.p_min() | threads / processes | 251 | | pd.core.window.Expanding.max() | pd.core.window.Expanding.p_max() | threads / processes | 252 | | pd.core.window.Expanding.mean() | pd.core.window.Expanding.p_mean() | threads / processes | 253 | | pd.core.window.Expanding.sum() | pd.core.window.Expanding.p_sum() | threads / processes | 254 | | pd.core.window.Expanding.var() | pd.core.window.Expanding.p_var() | threads / processes | 255 | | pd.core.window.Expanding.sem() | pd.core.window.Expanding.p_sem() | threads / 
processes | 256 | | pd.core.window.Expanding.skew() | pd.core.window.Expanding.p_skew() | threads / processes | 257 | | pd.core.window.Expanding.kurt() | pd.core.window.Expanding.p_kurt() | threads / processes | 258 | | pd.core.window.Expanding.median() | pd.core.window.Expanding.p_median() | threads / processes | 259 | | pd.core.window.Expanding.quantile() | pd.core.window.Expanding.p_quantile() | threads / processes | 260 | | pd.core.window.Expanding.rank() | pd.core.window.Expanding.p_rank() | threads / processes | 261 | | pd.core.window.Expanding.agg() | pd.core.window.Expanding.p_agg() | threads / processes | 262 | | pd.core.window.Expanding.aggregate() | pd.core.window.Expanding.p_aggregate() | threads / processes | 263 | 264 | 265 | #### ExpandingGroupby 266 | 267 | | methods | parallel analogue | executor | 268 | |---------------------------------------------|-----------------------------------------------|---------------------| 269 | | pd.core.window.ExpandingGroupby.apply() | pd.core.window.ExpandingGroupby.p_apply() | threads / processes | 270 | | pd.core.window.ExpandingGroupby.min() | pd.core.window.ExpandingGroupby.p_min() | threads / processes | 271 | | pd.core.window.ExpandingGroupby.max() | pd.core.window.ExpandingGroupby.p_max() | threads / processes | 272 | | pd.core.window.ExpandingGroupby.mean() | pd.core.window.ExpandingGroupby.p_mean() | threads / processes | 273 | | pd.core.window.ExpandingGroupby.sum() | pd.core.window.ExpandingGroupby.p_sum() | threads / processes | 274 | | pd.core.window.ExpandingGroupby.var() | pd.core.window.ExpandingGroupby.p_var() | threads / processes | 275 | | pd.core.window.ExpandingGroupby.sem() | pd.core.window.ExpandingGroupby.p_sem() | threads / processes | 276 | | pd.core.window.ExpandingGroupby.skew() | pd.core.window.ExpandingGroupby.p_skew() | threads / processes | 277 | | pd.core.window.ExpandingGroupby.kurt() | pd.core.window.ExpandingGroupby.p_kurt() | threads / processes | 278 | | 
pd.core.window.ExpandingGroupby.median() | pd.core.window.ExpandingGroupby.p_median() | threads / processes | 279 | | pd.core.window.ExpandingGroupby.quantile() | pd.core.window.ExpandingGroupby.p_quantile() | threads / processes | 280 | | pd.core.window.ExpandingGroupby.rank() | pd.core.window.ExpandingGroupby.p_rank() | threads / processes | 281 | | pd.core.window.ExpandingGroupby.agg() | pd.core.window.ExpandingGroupby.p_agg() | threads / processes | 282 | | pd.core.window.ExpandingGroupby.aggregate() | pd.core.window.ExpandingGroupby.p_aggregate() | threads / processes | 283 | 284 | #### ExponentialMovingWindow 285 | 286 | | methods | parallel analogue | executor | 287 | |-----------------------------------------------|-------------------------------------------------|---------------------| 288 | | pd.core.window.ExponentialMovingWindow.mean() | pd.core.window.ExponentialMovingWindow.p_mean() | threads / processes | 289 | | pd.core.window.ExponentialMovingWindow.sum() | pd.core.window.ExponentialMovingWindow.p_sum() | threads / processes | 290 | | pd.core.window.ExponentialMovingWindow.var() | pd.core.window.ExponentialMovingWindow.p_var() | threads / processes | 291 | | pd.core.window.ExponentialMovingWindow.std() | pd.core.window.ExponentialMovingWindow.p_std() | threads / processes | 292 | 293 | #### ExponentialMovingWindowGroupby 294 | 295 | | methods | parallel analogue | executor | 296 | |------------------------------------------------------|--------------------------------------------------------|---------------------| 297 | | pd.core.window.ExponentialMovingWindowGroupby.mean() | pd.core.window.ExponentialMovingWindowGroupby.p_mean() | threads / processes | 298 | | pd.core.window.ExponentialMovingWindowGroupby.sum() | pd.core.window.ExponentialMovingWindowGroupby.p_sum() | threads / processes | 299 | | pd.core.window.ExponentialMovingWindowGroupby.var() | pd.core.window.ExponentialMovingWindowGroupby.p_var() | threads / processes | 300 | | 
pd.core.window.ExponentialMovingWindowGroupby.std() | pd.core.window.ExponentialMovingWindowGroupby.p_std() | threads / processes | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | -------------------------------------------------------------------------------- /gifs/p_describe.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dubovikmaster/parallel-pandas/ec0ce813f62b3576f52ae755d35b29d7e75b3d74/gifs/p_describe.gif -------------------------------------------------------------------------------- /gifs/ram_usage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dubovikmaster/parallel-pandas/ec0ce813f62b3576f52ae755d35b29d7e75b3d74/gifs/ram_usage.png -------------------------------------------------------------------------------- /parallel_pandas/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import ParallelPandas 2 | 3 | __all__ = ['ParallelPandas'] -------------------------------------------------------------------------------- /parallel_pandas/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .parallel_series import series_parallelize_apply 2 | from .parallel_series import series_parallelize_map 3 | from .parallel_groupby import parallelize_groupby_apply 4 | from .parallel_dataframe import parallelize_apply 5 | from .parallel_dataframe import parallelize_replace 6 | from .parallel_dataframe import parallelize_applymap 7 | from .parallel_dataframe import parallelize_describe 8 | from .parallel_dataframe import parallelize_nunique 9 | from .parallel_dataframe import parallelize_mad 10 | from .parallel_dataframe import parallelize_idxmax 11 | from .parallel_dataframe import parallelize_idxmin 12 | from .parallel_dataframe import parallelize_rank 13 | from .parallel_dataframe import 
ParallelizeStatFunc 14 | from .parallel_dataframe import ParallelizeStatFuncDdof 15 | from .parallel_dataframe import ParallelizeMinCountStatFunc 16 | from .parallel_dataframe import ParallelizeAccumFunc 17 | from .parallel_dataframe import parallelize_quantile 18 | from .parallel_dataframe import parallelize_mode 19 | from .parallel_dataframe import parallelize_chunk_apply 20 | from .parallel_dataframe import parallelize_merge 21 | from .parallel_dataframe import parallelize_pct_change 22 | from .parallel_dataframe import parallelize_isin 23 | from .parallel_dataframe import parallelize_aggregate 24 | from .parallel_dataframe import parallelize_corr 25 | from .parallel_dataframe import parallelize_map 26 | from .parallel_window import ParallelRolling 27 | from .parallel_window import ParallelExpanding 28 | from .parallel_window import ParallelEWM 29 | from .parallel_window import ParallelRollingGroupby 30 | from .parallel_window import ParallelExpandingGroupby 31 | from .parallel_window import ParallelEWMGroupby 32 | from .parallel_window import ParallelWindow 33 | 34 | -------------------------------------------------------------------------------- /parallel_pandas/core/_numba.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | try: 4 | from numba import njit, prange 5 | 6 | 7 | @njit 8 | def _pearson_corr(x, y): 9 | return np.corrcoef(x, y)[0, 1] 10 | 11 | 12 | @njit 13 | def calculate_rank(arr): 14 | n = len(arr) 15 | result = np.zeros(n, dtype=np.int64) 16 | indexes = np.argsort(arr) 17 | for i in range(n): 18 | result[indexes[i]] = i + 1 19 | 20 | return result 21 | 22 | 23 | @njit(fastmath=True) 24 | def _calculate_rank_dot_product(a, b): 25 | n = len(a) 26 | b_rank = calculate_rank(b) 27 | result = 0.0 28 | indexes = np.argsort(a) 29 | for i in range(n): 30 | result += (i + 1 - b_rank[indexes[i]]) * (i + 1 - b_rank[indexes[i]]) 31 | 32 | return result 33 | 34 | 35 | @njit 36 | def 
_spearman_corr_with_rank(x, y): 37 | n = len(x) 38 | sum_d_squared = _calculate_rank_dot_product(x, y) 39 | correlation = 1 - (6 * sum_d_squared) / (n * (n ** 2 - 1)) 40 | 41 | return correlation 42 | 43 | 44 | @njit(parallel=True) 45 | def _parallel_spearman_corr(mat, min_periods): 46 | n = mat.shape[0] 47 | c = mat.shape[1] 48 | rows, cols = np.triu_indices(c, k=1) 49 | corr_mat = np.zeros((c, c), dtype=np.float64) 50 | ranks = np.zeros((n, c), dtype=np.int64) 51 | isnan_flags = np.zeros(c, dtype=np.bool_) 52 | isnan = np.zeros((n, c), dtype=np.bool_) 53 | for i in prange(c): 54 | isnan[:, i] = np.isnan(mat[:, i]) 55 | if isnan[:, i].any(): 56 | isnan_flags[i] = True 57 | # 58 | else: 59 | ranks[:, i] = calculate_rank(mat[:, i]) 60 | for i in prange(len(rows)): 61 | row, col = rows[i], cols[i] 62 | if isnan_flags[row] or isnan_flags[col]: 63 | valid = ~isnan[:, row] & ~isnan[:, col] 64 | if valid.sum() < min_periods: 65 | corr_mat[row, col] = np.nan 66 | else: 67 | corr_mat[row, col] = _spearman_corr_with_rank(mat[valid, row], mat[valid, col]) 68 | else: 69 | d = ranks[:, row] - ranks[:, col] 70 | sum_d_squared = np.sum(d * d) 71 | corr_mat[row, col] = 1 - (6 * sum_d_squared) / (n * (n ** 2 - 1)) 72 | corr_mat = corr_mat + corr_mat.transpose() 73 | np.fill_diagonal(corr_mat, 1) 74 | return corr_mat 75 | 76 | 77 | @njit(parallel=True) 78 | def _parallel_pearson_corr(mat, min_periods): 79 | n = mat.shape[0] 80 | c = mat.shape[1] 81 | rows, cols = np.triu_indices(c, k=1) 82 | corr_mat = np.zeros((mat.shape[1], mat.shape[1]), dtype=np.float64) 83 | isnan = np.isnan(mat) 84 | isnan_flags = isnan.sum(axis=0) 85 | sums = np.sum(mat, axis=0) 86 | sum_squareds = np.sum((mat * mat), axis=0) 87 | for i in prange(len(rows)): 88 | row, col = rows[i], cols[i] 89 | if isnan_flags[row] or isnan_flags[col]: 90 | valid = ~isnan[:, row] & ~isnan[:, col] 91 | if valid.sum() < min_periods: 92 | corr_mat[row, col] = np.nan 93 | else: 94 | corr_mat[row, col] = _pearson_corr(mat[valid, 
row], mat[valid, col]) 95 | else: 96 | sum_xy = np.sum(mat[:, row] * mat[:, col]) 97 | numerator = n * sum_xy - sums[row] * sums[col] 98 | denominator = np.sqrt( 99 | (n * sum_squareds[row] - sums[row] * sums[row]) * (n * sum_squareds[col] - sums[col] * sums[col])) 100 | corr_mat[row, col] = numerator / denominator if denominator != 0 else 0 101 | corr_mat = corr_mat + corr_mat.transpose() 102 | np.fill_diagonal(corr_mat, 1) 103 | return corr_mat 104 | 105 | 106 | @njit(fastmath=True) 107 | def _kendall_tau(x, y): 108 | concordant = 0 109 | discordant = 0 110 | n = len(x) 111 | for i in range(n - 1): 112 | for j in range(i + 1, n): 113 | # Check whether the pair's ordering is concordant or discordant 114 | if (x[i] - x[j]) * (y[i] - y[j]) > 0: 115 | concordant += 1 116 | elif (x[i] - x[j]) * (y[i] - y[j]) < 0: 117 | discordant += 1 118 | # Compute the Kendall correlation 119 | tau = (concordant - discordant) / np.sqrt((concordant + discordant) * (n * (n - 1) / 2)) 120 | 121 | return tau 122 | 123 | 124 | @njit(parallel=True) 125 | def _parallel_kendall(mat, min_periods): 126 | n = mat.shape[0] 127 | c = mat.shape[1] 128 | rows, cols = np.triu_indices(c, k=1) 129 | corr_mat = np.zeros((c, c), dtype=np.float64) 130 | isnan = np.isnan(mat) 131 | isnan_flags = isnan.sum(axis=0) 132 | for i in prange(len(rows)): 133 | row, col = rows[i], cols[i] 134 | a = mat[:, row] 135 | b = mat[:, col] 136 | if isnan_flags[row] or isnan_flags[col]: 137 | valid = ~isnan[:, row] & ~isnan[:, col] 138 | if valid.sum() < min_periods: 139 | corr_mat[row, col] = np.nan 140 | else: 141 | corr_mat[row, col] = _kendall_tau(a[valid], b[valid]) 142 | else: 143 | corr_mat[row, col] = _kendall_tau(a, b) 144 | corr_mat = corr_mat + corr_mat.transpose() 145 | np.fill_diagonal(corr_mat, 1) 146 | return corr_mat 147 | 148 | 149 | @njit 150 | def _do_parallel_corr(mat, method, min_periods): 151 | if method == 'pearson': 152 | return _parallel_pearson_corr(mat, min_periods) 153 | elif method == 'spearman': 154 | 
return _parallel_spearman_corr(mat, min_periods) 155 | elif method == 'kendall': 156 | return _parallel_kendall(mat, min_periods) 157 | else: 158 | raise ValueError(f'Unknown method {method}') 159 | 160 | except ImportError as e: 161 | def _do_parallel_corr(*args, **kwargs): 162 | raise ImportError('Numba not installed. Please install numba to use parallel pandas with numba engine.') 163 | -------------------------------------------------------------------------------- /parallel_pandas/core/parallel_dataframe.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor 5 | from multiprocessing import cpu_count 6 | 7 | from functools import partial 8 | from multiprocessing import Manager 9 | import numpy as np 10 | import pandas as pd 11 | from pandas._libs import lib 12 | from pandas.core import nanops 13 | from pandas.util._decorators import doc 14 | from pandas._typing import ( 15 | IndexLabel, 16 | Suffixes 17 | ) 18 | 19 | from scipy.stats import spearmanr, kendalltau 20 | 21 | import dill 22 | 23 | from ._numba import _do_parallel_corr 24 | from .progress_imap import progress_imap 25 | from .progress_imap import progress_udf_wrapper 26 | from .tools import ( 27 | get_pandas_version, 28 | parallel_rank, 29 | get_split_data, 30 | get_split_size, 31 | ) 32 | 33 | DOC = 'Parallel analogue of the DataFrame.{func} method\nSee pandas DataFrame docstring for more ' \ 34 | 'information\nhttps://pandas.pydata.org/docs/reference/frame.html ' 35 | 36 | MAJOR_PANDAS_VERSION, _ = get_pandas_version() 37 | 38 | 39 | def _do_apply(data, dill_func, workers_queue, axis, raw, result_type, args, kwargs): 40 | func = dill.loads(dill_func) 41 | return data.apply(progress_udf_wrapper(func, workers_queue, data.shape[1 - axis]), axis=axis, raw=raw, 42 | result_type=result_type, args=args, **kwargs) 43 | 44 | 45 | def 
parallelize_apply(n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1): 46 | @doc(DOC, func='apply') 47 | def p_apply(data, func, executor='processes', axis=0, raw=False, result_type=None, args=(), **kwargs): 48 | workers_queue = Manager().Queue() 49 | split_size = get_split_size(n_cpu, split_factor) 50 | tasks = get_split_data(data, axis, split_size) 51 | dill_func = dill.dumps(func) 52 | result = progress_imap(partial(_do_apply, axis=axis, raw=raw, result_type=result_type, dill_func=dill_func, 53 | workers_queue=workers_queue, args=args, kwargs=kwargs), 54 | tasks, workers_queue, n_cpu=n_cpu, total=data.shape[1 - axis], disable=disable_pr_bar, 55 | show_vmem=show_vmem, executor=executor, desc=func.__name__.upper()) 56 | concat_axis = 0 57 | if result: 58 | if isinstance(result[0], pd.DataFrame): 59 | concat_axis = 1 - axis 60 | return pd.concat(result, axis=concat_axis) 61 | 62 | return p_apply 63 | 64 | 65 | def _do_chunk_apply(data, dill_func, workers_queue, args, kwargs): 66 | func = dill.loads(dill_func) 67 | 68 | def foo(): 69 | return func(data, *args, **kwargs) 70 | 71 | return progress_udf_wrapper(foo, workers_queue, 1)() 72 | 73 | 74 | def parallelize_chunk_apply(n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1): 75 | def chunk_apply(data, func, executor='processes', axis=0, split_by_col=None, 76 | concat_result=True, args=(), **kwargs): 77 | workers_queue = Manager().Queue() 78 | split_size = get_split_size(n_cpu, split_factor) 79 | if split_by_col: 80 | idx_split = np.array_split(data[split_by_col].unique(), split_size) 81 | group = data.groupby(split_by_col) 82 | tasks = (pd.concat([group.get_group(j) for j in i], copy=False) for i in idx_split) 83 | else: 84 | tasks = get_split_data(data, 1 - axis, split_size) 85 | dill_func = dill.dumps(func) 86 | result = progress_imap(partial(_do_chunk_apply, dill_func=dill_func, 87 | workers_queue=workers_queue, args=args, kwargs=kwargs), 88 | tasks, workers_queue, n_cpu=n_cpu, 
total=split_size, disable=disable_pr_bar, 89 | show_vmem=show_vmem, executor=executor, desc='chunk_apply'.upper()) 90 | if concat_result: 91 | return pd.concat(result, axis=axis) 92 | else: 93 | return result 94 | 95 | return chunk_apply 96 | 97 | 98 | def _np_pearson_corr(a, b=None): 99 | if b is None: 100 | return np.corrcoef(a, rowvar=False) 101 | return np.corrcoef(a, y=b)[0, 1] 102 | 103 | 104 | def _run_in_pool(func, tasks, n_cpu, executor): 105 | chunk_size, extra = divmod(len(tasks), n_cpu) 106 | if extra: 107 | chunk_size += 1 108 | if executor == 'processes': 109 | pool = ProcessPoolExecutor(n_cpu) 110 | else: 111 | pool = ThreadPoolExecutor(n_cpu) 112 | with pool as p: 113 | result = list(p.map(func, tasks, chunksize=chunk_size)) 114 | return result 115 | 116 | 117 | def _pearson_corr(idx, mat, min_periods, isnan, isnan_flags, sums, sum_squareds): 118 | row, col = idx 119 | n = mat.shape[0] 120 | if isnan_flags[row] or isnan_flags[col]: 121 | valid = ~isnan[:, row] & ~isnan[:, col] 122 | if valid.sum() < min_periods: 123 | return np.nan 124 | else: 125 | return _np_pearson_corr(mat[valid, row], mat[valid, col]) 126 | else: 127 | sum_xy = np.sum(mat[:, row] * mat[:, col]) 128 | numerator = n * sum_xy - sums[row] * sums[col] 129 | denominator = np.sqrt( 130 | (n * sum_squareds[row] - sums[row] * sums[row]) * (n * sum_squareds[col] - sums[col] * sums[col])) 131 | return numerator / denominator if denominator != 0 else 0 132 | 133 | 134 | def _parallel_pearson_corr(mat, min_periods, executor, n_cpu): 135 | comb = [(i, j) for i in range(mat.shape[1]) for j in range(i + 1, mat.shape[1])] 136 | isnan = np.isnan(mat) 137 | isnan_flags = isnan.sum(axis=0) 138 | sums = np.sum(mat, axis=0) 139 | sum_squareds = np.sum((mat * mat), axis=0) 140 | func = partial(_pearson_corr, mat=mat, min_periods=min_periods, isnan=isnan, isnan_flags=isnan_flags, sums=sums, 141 | sum_squareds=sum_squareds) 142 | result = _run_in_pool(func, comb, n_cpu, executor) 143 | return result 
144 | 145 | 146 | def _spearman_corr(idx, mat, min_periods, isnan, isnan_flags, ranks): 147 | row, col = idx 148 | n = mat.shape[0] 149 | if isnan_flags[row] or isnan_flags[col]: 150 | valid = ~isnan[:, row] & ~isnan[:, col] 151 | if valid.sum() < min_periods: 152 | return np.nan 153 | else: 154 | return _scipy_spearman_corr(mat[valid, row], mat[valid, col]) 155 | d = ranks[:, row] - ranks[:, col] 156 | sum_d_squared = np.sum(d * d) 157 | return 1 - (6 * sum_d_squared) / (n * (n ** 2 - 1)) 158 | 159 | 160 | def _parallel_spearman_corr(mat, min_periods, executor, n_cpu): 161 | comb = [(i, j) for i in range(mat.shape[1]) for j in range(i + 1, mat.shape[1])] 162 | isnan = np.isnan(mat) 163 | isnan_flags = isnan.sum(axis=0) 164 | ranks = parallel_rank(mat, n_cpu) 165 | func = partial(_spearman_corr, mat=mat, min_periods=min_periods, isnan=isnan, isnan_flags=isnan_flags, ranks=ranks) 166 | result = _run_in_pool(func, comb, n_cpu, executor) 167 | return result 168 | 169 | 170 | def _kendall_tau(x, y): 171 | return kendalltau(x, y)[0] 172 | 173 | 174 | def _scipy_spearman_corr(x, y): 175 | return spearmanr(x, y)[0] 176 | 177 | 178 | def _do_corr(idx, mat, min_periods, corrf): 179 | a, b = mat[:, idx[0]], mat[:, idx[1]] 180 | valid = np.isfinite(a) & np.isfinite(b) 181 | if valid.sum() < min_periods: 182 | c = np.nan 183 | elif not valid.all(): 184 | c = corrf(a[valid], b[valid]) 185 | else: 186 | c = corrf(a, b) 187 | return c 188 | 189 | 190 | def _parallel_do_corr(mat, min_periods, corrf, executor, n_cpu): 191 | comb = [(i, j) for i in range(mat.shape[1]) for j in range(i + 1, mat.shape[1])] 192 | func = partial(_do_corr, min_periods=min_periods, mat=mat, corrf=corrf) 193 | result = _run_in_pool(func, comb, n_cpu, executor) 194 | return result 195 | 196 | 197 | def parallelize_corr(n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1): 198 | if n_cpu is None: 199 | n_cpu = cpu_count() 200 | 201 | def p_corr(data, method='pearson', min_periods=1, 
numeric_only=False, executor='threads', engine=None): 202 | data = data._get_numeric_data() if numeric_only else data 203 | cols = data.columns 204 | idx = cols.copy() 205 | if method in ['pearson', 'spearman']: 206 | if not data.isna().any().any(): 207 | if method == 'pearson': 208 | result = _np_pearson_corr(data.values) 209 | elif method == 'spearman': 210 | ranks = parallel_rank(data.values, n_cpu) 211 | result = _np_pearson_corr(ranks) 212 | return pd.DataFrame(result, index=idx, columns=idx) 213 | mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) 214 | if engine == 'numba': 215 | return pd.DataFrame(_do_parallel_corr(mat, method=method, min_periods=min_periods), index=idx, columns=cols) 216 | if method == 'pearson': 217 | result = _parallel_pearson_corr(mat, min_periods, executor, n_cpu) 218 | elif method == 'spearman': 219 | result = _parallel_spearman_corr(mat, min_periods, executor, n_cpu) 220 | elif method == 'kendall': 221 | result = _parallel_do_corr(mat, min_periods, _kendall_tau, executor, n_cpu) 222 | elif callable(method): 223 | result = _parallel_do_corr(mat, min_periods, method, executor, n_cpu) 224 | else: 225 | raise ValueError(f'Unknown method {method}') 226 | 227 | corr_mat = np.zeros((data.shape[1], data.shape[1])) 228 | z = 0 229 | for j in range(0, data.shape[1]): 230 | for i in range(j + 1, data.shape[1]): 231 | corr_mat[i, j] = result[z] 232 | z += 1 233 | result_mat = corr_mat + corr_mat.transpose() 234 | np.fill_diagonal(result_mat, 1) 235 | return pd.DataFrame(result_mat, index=idx, columns=idx) 236 | 237 | return p_corr 238 | 239 | 240 | def _do_aggregate(data, func, workers_queue, dilled_func, axis, args, kwargs): 241 | if dilled_func: 242 | func = dill.loads(func) 243 | if isinstance(func, dict): 244 | _axis = data._get_axis_number(axis) 245 | func = {k: v for k, v in func.items() if k in data._get_axis(1 - _axis)} 246 | 247 | def foo(): 248 | return data.agg(func, axis=axis, *args, **kwargs) 249 | 250 | return 
progress_udf_wrapper(foo, workers_queue, 1)() 251 | 252 | 253 | def parallelize_aggregate(n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1): 254 | @doc(DOC, func='aggregate') 255 | def p_aggregate(data, func, executor='threads', axis=0, args=(), **kwargs): 256 | workers_queue = Manager().Queue() 257 | split_size = get_split_size(n_cpu, split_factor) 258 | tasks = get_split_data(data, axis, split_size) 259 | dilled_func = False 260 | if callable(func): 261 | dilled_func = True 262 | func = dill.dumps(func) 263 | result = progress_imap(partial(_do_aggregate, axis=axis, func=func, 264 | workers_queue=workers_queue, dilled_func=dilled_func, args=args, kwargs=kwargs), 265 | tasks, workers_queue, n_cpu=n_cpu, total=split_size, disable=disable_pr_bar, 266 | show_vmem=show_vmem, executor=executor, desc='agg'.upper()) 267 | concat_axis = 0 268 | if result: 269 | if isinstance(result[0], pd.DataFrame): 270 | concat_axis = 1 - axis 271 | return pd.concat(result, axis=concat_axis) 272 | 273 | return p_aggregate 274 | 275 | 276 | def _do_replace(df, workers_queue, **kwargs): 277 | def foo(): 278 | return df.replace(**kwargs) 279 | 280 | return progress_udf_wrapper(foo, workers_queue, 1)() 281 | 282 | 283 | def parallelize_replace(n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1): 284 | @doc(DOC, func='replace') 285 | def p_replace(data, to_replace=None, value=lib.no_default, limit=None, 286 | regex: bool = False, method: str | lib.NoDefault = lib.no_default): 287 | workers_queue = Manager().Queue() 288 | split_size = get_split_size(n_cpu, split_factor) 289 | tasks = get_split_data(data, 1, split_size) 290 | result = progress_imap(partial(_do_replace, to_replace=to_replace, value=value, limit=limit, regex=regex, 291 | method=method, workers_queue=workers_queue), tasks, workers_queue, 292 | total=split_size, n_cpu=n_cpu, disable=disable_pr_bar, show_vmem=show_vmem, 293 | desc='REPLACE') 294 | 295 | return pd.concat(result) 296 | 297 | return 
p_replace 298 | 299 | 300 | def do_applymap(df, workers_queue, dill_func, na_action, kwargs): 301 | func = dill.loads(dill_func) 302 | 303 | return df.applymap(progress_udf_wrapper(func, workers_queue, df.size), na_action=na_action, **kwargs) 304 | 305 | 306 | def parallelize_applymap(n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1): 307 | @doc(DOC, func='applymap') 308 | def p_applymap(data, func, na_action=None, **kwargs): 309 | workers_queue = Manager().Queue() 310 | split_size = get_split_size(n_cpu, split_factor) 311 | tasks = get_split_data(data, 1, split_size) 312 | dill_func = dill.dumps(func) 313 | result = progress_imap( 314 | partial(do_applymap, workers_queue=workers_queue, dill_func=dill_func, na_action=na_action, 315 | kwargs=kwargs), tasks, workers_queue, n_cpu=n_cpu, disable=disable_pr_bar, show_vmem=show_vmem, 316 | total=data.size, executor='processes', desc='APPLYMAP') 317 | return pd.concat(result) 318 | 319 | return p_applymap 320 | 321 | 322 | def do_map(df, workers_queue, dill_func, na_action, kwargs): 323 | func = dill.loads(dill_func) 324 | 325 | return df.map(progress_udf_wrapper(func, workers_queue, df.size), na_action=na_action, **kwargs) 326 | 327 | 328 | def parallelize_map(n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1): 329 | @doc(DOC, func='map') 330 | def p_map(data, func, na_action=None, **kwargs): 331 | workers_queue = Manager().Queue() 332 | split_size = get_split_size(n_cpu, split_factor) 333 | tasks = get_split_data(data, 1, split_size) 334 | dill_func = dill.dumps(func) 335 | result = progress_imap( 336 | partial(do_map, workers_queue=workers_queue, dill_func=dill_func, na_action=na_action, 337 | kwargs=kwargs), tasks, workers_queue, n_cpu=n_cpu, disable=disable_pr_bar, show_vmem=show_vmem, 338 | total=data.size, executor='processes', desc='MAP') 339 | return pd.concat(result) 340 | 341 | return p_map 342 | 343 | 344 | def do_describe(df, workers_queue, percentiles, include, exclude, 
datetime_is_numeric): 345 | if isinstance(df, pd.Series): 346 | df = df.to_frame() 347 | if MAJOR_PANDAS_VERSION < 2: 348 | def foo(): 349 | return df.describe(percentiles=percentiles, include=include, exclude=exclude, 350 | datetime_is_numeric=datetime_is_numeric) 351 | else: 352 | def foo(): 353 | return df.describe(percentiles=percentiles, include=include, exclude=exclude) 354 | 355 | return progress_udf_wrapper(foo, workers_queue, 1)() 356 | 357 | 358 | def parallelize_describe(n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1): 359 | @doc(DOC, func='describe') 360 | def p_describe(data, percentiles=None, include=None, exclude=None, 361 | datetime_is_numeric=False): 362 | if MAJOR_PANDAS_VERSION > 1: 363 | if datetime_is_numeric: 364 | warnings.warn('datetime_is_numeric is deprecated since pandas 2.0.0') 365 | workers_queue = Manager().Queue() 366 | split_size = get_split_size(n_cpu, split_factor) 367 | tasks = get_split_data(data, 0, split_size) 368 | result = progress_imap( 369 | partial(do_describe, workers_queue=workers_queue, percentiles=percentiles, include=include, exclude=exclude, 370 | datetime_is_numeric=datetime_is_numeric), tasks, workers_queue, n_cpu=n_cpu, disable=disable_pr_bar, 371 | show_vmem=show_vmem, total=min(split_size, data.shape[1]), desc='DESCRIBE') 372 | return pd.concat(result, axis=1) 373 | 374 | return p_describe 375 | 376 | 377 | def do_pct_change(df, workers_queue, periods, fill_method, limit, freq, kwargs): 378 | def foo(): 379 | return df.pct_change(periods=periods, fill_method=fill_method, limit=limit, freq=freq, **kwargs) 380 | 381 | return progress_udf_wrapper(foo, workers_queue, 1)() 382 | 383 | 384 | def parallelize_pct_change(n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1): 385 | @doc(DOC, func='pct_change') 386 | def p_pct_change(data, periods=1, 387 | fill_method="pad", 388 | limit=None, 389 | freq=None, 390 | **kwargs, ): 391 | axis = kwargs.get('axis', 0) 392 | workers_queue = 
Manager().Queue() 393 | split_size = get_split_size(n_cpu, split_factor) 394 | tasks = get_split_data(data, axis, split_size) 395 | result = progress_imap( 396 | partial(do_pct_change, workers_queue=workers_queue, periods=periods, fill_method=fill_method, limit=limit, 397 | freq=freq, kwargs=kwargs), tasks, workers_queue, n_cpu=n_cpu, disable=disable_pr_bar, 398 | show_vmem=show_vmem, total=min(split_size, data.shape[1]), desc='PCT_CHANGE') 399 | return pd.concat(result, axis=1 - axis) 400 | 401 | return p_pct_change 402 | 403 | 404 | def parallelize_nunique(n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1): 405 | @doc(DOC, func='nunique') 406 | def p_nunique(data, executor='threads', axis=0, dropna=True): 407 | return parallelize_apply(n_cpu=n_cpu, disable_pr_bar=disable_pr_bar, show_vmem=show_vmem, 408 | split_factor=split_factor)(data, pd.Series.nunique, executor=executor, 409 | axis=axis, dropna=dropna) 410 | 411 | return p_nunique 412 | 413 | 414 | def do_mad(df, workers_queue, axis, skipna, level): 415 | def foo(): 416 | return df.mad(axis=axis, skipna=skipna, level=level) 417 | 418 | return progress_udf_wrapper(foo, workers_queue, 1)() 419 | 420 | 421 | def parallelize_mad(n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1): 422 | @doc(DOC, func='mad') 423 | def p_mad(data, axis=0, skipna=True, level=None): 424 | workers_queue = Manager().Queue() 425 | split_size = get_split_size(n_cpu, split_factor) 426 | tasks = get_split_data(data, axis, split_size) 427 | total = min(split_size, data.shape[1 - axis]) 428 | result = progress_imap( 429 | partial(do_mad, workers_queue=workers_queue, axis=axis, skipna=skipna, level=level), tasks, workers_queue, 430 | n_cpu=n_cpu, disable=disable_pr_bar, show_vmem=show_vmem, total=total, desc='MAD' 431 | ) 432 | return pd.concat(result) 433 | 434 | return p_mad 435 | 436 | 437 | def do_idxmax(df, workers_queue, axis, skipna): 438 | def foo(): 439 | return df.idxmax(axis=axis, skipna=skipna) 440 
| 441 | return progress_udf_wrapper(foo, workers_queue, 1)() 442 | 443 | 444 | def parallelize_idxmax(n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1): 445 | @doc(DOC, func='idxmax') 446 | def p_idxmax(data, axis=0, skipna=True): 447 | workers_queue = Manager().Queue() 448 | split_size = get_split_size(n_cpu, split_factor) 449 | tasks = get_split_data(data, axis, split_size) 450 | total = min(split_size, data.shape[1 - axis]) 451 | result = progress_imap( 452 | partial(do_idxmax, workers_queue=workers_queue, axis=axis, skipna=skipna), tasks, workers_queue, 453 | n_cpu=n_cpu, disable=disable_pr_bar, show_vmem=show_vmem, total=total, desc='IDXMAX' 454 | ) 455 | return pd.concat(result) 456 | 457 | return p_idxmax 458 | 459 | 460 | def do_idxmin(df, workers_queue, axis, skipna): 461 | def foo(): 462 | return df.idxmin(axis=axis, skipna=skipna) 463 | 464 | return progress_udf_wrapper(foo, workers_queue, 1)() 465 | 466 | 467 | def parallelize_idxmin(n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1): 468 | @doc(DOC, func='idxmin') 469 | def p_idxmin(data, axis=0, skipna=True): 470 | workers_queue = Manager().Queue() 471 | split_size = get_split_size(n_cpu, split_factor) 472 | tasks = get_split_data(data, axis, split_size) 473 | total = min(split_size, data.shape[1 - axis]) 474 | result = progress_imap( 475 | partial(do_idxmin, workers_queue=workers_queue, axis=axis, skipna=skipna), tasks, workers_queue, 476 | n_cpu=n_cpu, disable=disable_pr_bar, show_vmem=show_vmem, total=total, desc='IDXMIN' 477 | ) 478 | return pd.concat(result) 479 | 480 | return p_idxmin 481 | 482 | 483 | def do_rank(df, workers_queue, axis, method, numeric_only, na_option, ascending, pct): 484 | def foo(): 485 | return df.rank(axis=axis, method=method, numeric_only=numeric_only, na_option=na_option, ascending=ascending, 486 | pct=pct) 487 | 488 | return progress_udf_wrapper(foo, workers_queue, 1)() 489 | 490 | 491 | def parallelize_rank(n_cpu=None, 
disable_pr_bar=False, split_factor=1, 492 | show_vmem=False): 493 | @doc(DOC, func='rank') 494 | def p_rank(data, axis=0, method: str = "average", numeric_only=lib.no_default, na_option="keep", ascending=True, 495 | pct=False): 496 | workers_queue = Manager().Queue() 497 | split_size = get_split_size(n_cpu, split_factor) 498 | tasks = get_split_data(data, axis, split_size) 499 | total = min(split_size, data.shape[1 - axis]) 500 | result = progress_imap( 501 | partial(do_rank, workers_queue=workers_queue, axis=axis, method=method, numeric_only=numeric_only, 502 | na_option=na_option, ascending=ascending, pct=pct), tasks, workers_queue, 503 | n_cpu=n_cpu, disable=disable_pr_bar, show_vmem=show_vmem, total=total, desc='RANK' 504 | ) 505 | return pd.concat(result, axis=1 - axis) 506 | 507 | return p_rank 508 | 509 | 510 | def do_quantile(df, workers_queue, axis, q, numeric_only, interpolation): 511 | def foo(): 512 | return df.quantile(axis=axis, q=q, numeric_only=numeric_only, interpolation=interpolation) 513 | 514 | return progress_udf_wrapper(foo, workers_queue, 1)() 515 | 516 | 517 | def parallelize_quantile(n_cpu=None, disable_pr_bar=False, split_factor=1, 518 | show_vmem=False): 519 | @doc(DOC, func='quantile') 520 | def p_quantile(data, q=0.5, axis=0, numeric_only: bool = True, interpolation: str = "linear"): 521 | workers_queue = Manager().Queue() 522 | split_size = get_split_size(n_cpu, split_factor) 523 | tasks = get_split_data(data, axis, split_size) 524 | total = min(split_size, data.shape[1 - axis]) 525 | result = progress_imap( 526 | partial(do_quantile, workers_queue=workers_queue, axis=axis, numeric_only=numeric_only, q=q, 527 | interpolation=interpolation), tasks, workers_queue, 528 | n_cpu=n_cpu, disable=disable_pr_bar, show_vmem=show_vmem, total=total, desc='QUANTILE' 529 | ) 530 | if not lib.is_list_like(q): 531 | return pd.concat(result) 532 | return pd.concat(result, axis=1) 533 | 534 | return p_quantile 535 | 536 | 537 | def do_mode(df, 
workers_queue, axis, numeric_only, dropna): 538 | def foo(): 539 | return df.mode(axis=axis, numeric_only=numeric_only, dropna=dropna) 540 | 541 | return progress_udf_wrapper(foo, workers_queue, 1)() 542 | 543 | 544 | def parallelize_mode(n_cpu=None, disable_pr_bar=False, split_factor=1, 545 | show_vmem=False): 546 | @doc(DOC, func='mode') 547 | def p_mode(data, executor='processes', axis=0, numeric_only: bool = False, dropna=True): 548 | workers_queue = Manager().Queue() 549 | split_size = get_split_size(n_cpu, split_factor) 550 | tasks = get_split_data(data, axis, split_size) 551 | total = min(split_size, data.shape[1 - axis]) 552 | result = progress_imap( 553 | partial(do_mode, workers_queue=workers_queue, axis=axis, numeric_only=numeric_only, dropna=dropna 554 | ), tasks, workers_queue, 555 | n_cpu=n_cpu, disable=disable_pr_bar, show_vmem=show_vmem, total=total, executor=executor, desc='MODE' 556 | ) 557 | return pd.concat(result, axis=1 - axis) 558 | 559 | return p_mode 560 | 561 | 562 | def do_merge(df, workers_queue, right, **kwargs): 563 | def foo(): 564 | return df.merge(right, **kwargs) 565 | 566 | return progress_udf_wrapper(foo, workers_queue, 1)() 567 | 568 | 569 | def parallelize_merge(n_cpu=None, disable_pr_bar=False, split_factor=1, 570 | show_vmem=False): 571 | @doc(DOC, func='mode') 572 | def p_merge(data, 573 | right: pd.DataFrame | pd.Series, 574 | how: str = "inner", 575 | on: IndexLabel | None = None, 576 | left_on: IndexLabel | None = None, 577 | right_on: IndexLabel | None = None, 578 | left_index: bool = False, 579 | right_index: bool = False, 580 | sort: bool = False, 581 | suffixes: Suffixes = ("_x", "_y"), 582 | copy: bool = True, 583 | indicator: bool = False, 584 | validate: str | None = None, ): 585 | workers_queue = Manager().Queue() 586 | split_size = get_split_size(n_cpu, split_factor) 587 | tasks = get_split_data(data, 1, split_size) 588 | total = min(split_size, data.shape[0]) 589 | result = progress_imap( 590 | partial(do_merge, 
workers_queue=workers_queue, how=how, right=right, on=on, left_on=left_on, 591 | right_on=right_on, 592 | left_index=left_index, right_index=right_index, sort=sort, suffixes=suffixes, copy=copy, 593 | indicator=indicator, 594 | validate=validate), tasks, workers_queue, n_cpu=n_cpu, disable=disable_pr_bar, show_vmem=show_vmem, 595 | total=total, executor='threads', desc='MERGE' 596 | ) 597 | return pd.concat(result) 598 | 599 | return p_merge 600 | 601 | 602 | def do_isin(df, workers_queue, values): 603 | def foo(): 604 | return df.isin(values) 605 | 606 | return progress_udf_wrapper(foo, workers_queue, 1)() 607 | 608 | 609 | def parallelize_isin(n_cpu=None, disable_pr_bar=False, split_factor=1, 610 | show_vmem=False): 611 | @doc(DOC, func='isin') 612 | def p_isin(data, values): 613 | workers_queue = Manager().Queue() 614 | split_size = get_split_size(n_cpu, split_factor) 615 | tasks = get_split_data(data, 1, split_size) 616 | total = min(split_size, data.shape[0]) 617 | result = progress_imap( 618 | partial(do_isin, workers_queue=workers_queue, values=values), tasks, workers_queue, 619 | n_cpu=n_cpu, disable=disable_pr_bar, show_vmem=show_vmem, total=total, executor='threads', desc='ISIN' 620 | ) 621 | return pd.concat(result) 622 | 623 | return p_isin 624 | 625 | 626 | class ParallelizeStatFunc: 627 | def __init__(self, n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1): 628 | self.n_cpu = n_cpu 629 | self.disable_pr_bar = disable_pr_bar 630 | self.show_vmem = show_vmem 631 | self.split_factor = split_factor 632 | 633 | @staticmethod 634 | def get_nanops_arg(name): 635 | if name == 'min': 636 | return nanops.nanmin 637 | if name == 'max': 638 | return nanops.nanmax 639 | if name == 'mean': 640 | return nanops.nanmean 641 | if name == 'median': 642 | return nanops.nanmedian 643 | if name == 'skew': 644 | return nanops.nanskew 645 | if name == 'kurt': 646 | return nanops.nankurt 647 | 648 | def _stat_func(self, df, workers_queue, name, axis, skipna, 
level, numeric_only, kwargs): 649 | def closure(): 650 | if MAJOR_PANDAS_VERSION < 2: 651 | return df._stat_function(name, self.get_nanops_arg(name), axis, skipna, level, numeric_only, **kwargs) 652 | return df._stat_function(name, self.get_nanops_arg(name), axis, skipna, numeric_only, **kwargs) 653 | 654 | return progress_udf_wrapper(closure, workers_queue, 1)() 655 | 656 | def _parallel_stat_func(self, data, name, kwargs, axis=0, skipna=True, level=None, numeric_only=None): 657 | workers_queue = Manager().Queue() 658 | split_size = get_split_size(self.n_cpu, self.split_factor) 659 | tasks = get_split_data(data, axis, split_size) 660 | total = min(split_size, data.shape[1 - axis]) 661 | result = progress_imap( 662 | partial(self._stat_func, workers_queue=workers_queue, name=name, axis=axis, skipna=skipna, 663 | level=level, numeric_only=numeric_only, kwargs=kwargs), tasks, workers_queue, 664 | total=total, n_cpu=self.n_cpu, disable=self.disable_pr_bar, show_vmem=self.show_vmem, desc=name.upper()) 665 | return pd.concat(result) 666 | 667 | def do_parallel(self, name): 668 | if name == 'min': 669 | @doc(DOC, func=name) 670 | def p_min(data, axis=0, skipna=True, level=None, numeric_only=None, **kwargs): 671 | return self._parallel_stat_func(data, name=name, axis=axis, skipna=skipna, 672 | level=level, numeric_only=numeric_only, kwargs=kwargs) 673 | 674 | return p_min 675 | if name == 'max': 676 | @doc(DOC, func=name) 677 | def p_max(data, axis=0, skipna=True, level=None, numeric_only=None, **kwargs): 678 | return self._parallel_stat_func(data, name=name, axis=axis, skipna=skipna, 679 | level=level, numeric_only=numeric_only, kwargs=kwargs) 680 | 681 | return p_max 682 | if name == 'mean': 683 | @doc(DOC, func=name) 684 | def p_mean(data, axis=0, skipna=True, level=None, numeric_only=None, **kwargs): 685 | return self._parallel_stat_func(data, name=name, axis=axis, skipna=skipna, 686 | level=level, numeric_only=numeric_only, kwargs=kwargs) 687 | 688 | return p_mean 
689 | if name == 'median': 690 | @doc(DOC, func=name) 691 | def p_median(data, axis=0, skipna=True, level=None, numeric_only=None, **kwargs): 692 | return self._parallel_stat_func(data, name=name, axis=axis, skipna=skipna, 693 | level=level, numeric_only=numeric_only, kwargs=kwargs) 694 | 695 | return p_median 696 | if name == 'kurt': 697 | @doc(DOC, func=name) 698 | def p_kurt(data, axis=0, skipna=True, level=None, numeric_only=None, **kwargs): 699 | return self._parallel_stat_func(data, name=name, axis=axis, skipna=skipna, 700 | level=level, numeric_only=numeric_only, kwargs=kwargs) 701 | 702 | return p_kurt 703 | 704 | if name == 'skew': 705 | @doc(DOC, func=name) 706 | def p_skew(data, axis=0, skipna=True, level=None, numeric_only=None, **kwargs): 707 | return self._parallel_stat_func(data, name=name, axis=axis, skipna=skipna, 708 | level=level, numeric_only=numeric_only, kwargs=kwargs) 709 | 710 | return p_skew 711 | 712 | 713 | class ParallelizeStatFuncDdof: 714 | def __init__(self, n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1): 715 | self.n_cpu = n_cpu 716 | self.disable_pr_bar = disable_pr_bar 717 | self.show_vmem = show_vmem 718 | self.split_factor = split_factor 719 | 720 | @staticmethod 721 | def get_nanops_arg(name): 722 | if name == 'sem': 723 | return nanops.nansem 724 | if name == 'var': 725 | return nanops.nanvar 726 | if name == 'std': 727 | return nanops.nanstd 728 | 729 | def _stat_func_ddof(self, df, workers_queue, name, axis, skipna, level, ddof, numeric_only, kwargs): 730 | def closure(): 731 | if MAJOR_PANDAS_VERSION < 2: 732 | return df._stat_function_ddof(name, self.get_nanops_arg(name), axis, skipna, level, ddof, numeric_only, 733 | **kwargs) 734 | return df._stat_function_ddof(name, self.get_nanops_arg(name), axis, skipna, ddof, numeric_only, 735 | **kwargs) 736 | 737 | return progress_udf_wrapper(closure, workers_queue, 1)() 738 | 739 | def _parallel_stat_func_ddof(self, data, name, kwargs, axis=0, skipna=True, 
level=None, ddof=1, 740 | numeric_only=None): 741 | workers_queue = Manager().Queue() 742 | split_size = get_split_size(self.n_cpu, self.split_factor) 743 | tasks = get_split_data(data, axis, split_size) 744 | total = min(split_size, data.shape[1 - axis]) 745 | result = progress_imap( 746 | partial(self._stat_func_ddof, workers_queue=workers_queue, name=name, axis=axis, skipna=skipna, 747 | level=level, ddof=ddof, numeric_only=numeric_only, kwargs=kwargs), tasks, workers_queue, 748 | total=total, n_cpu=self.n_cpu, disable=self.disable_pr_bar, show_vmem=self.show_vmem, desc=name.upper()) 749 | return pd.concat(result) 750 | 751 | def do_parallel(self, name): 752 | if name == 'var': 753 | @doc(DOC, func=name) 754 | def p_var(data, axis=0, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs): 755 | return self._parallel_stat_func_ddof(data, name=name, axis=axis, 756 | skipna=skipna, level=level, ddof=ddof, numeric_only=numeric_only, 757 | kwargs=kwargs) 758 | 759 | return p_var 760 | if name == 'std': 761 | @doc(DOC, func=name) 762 | def p_std(data, axis=0, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs): 763 | return self._parallel_stat_func_ddof(data, name=name, axis=axis, 764 | skipna=skipna, level=level, ddof=ddof, numeric_only=numeric_only, 765 | kwargs=kwargs) 766 | 767 | return p_std 768 | if name == 'sem': 769 | @doc(DOC, func=name) 770 | def p_sem(data, axis=0, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs): 771 | return self._parallel_stat_func_ddof(data, name=name, axis=axis, 772 | skipna=skipna, level=level, ddof=ddof, numeric_only=numeric_only, 773 | kwargs=kwargs) 774 | 775 | return p_sem 776 | 777 | 778 | class ParallelizeMinCountStatFunc: 779 | def __init__(self, n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1): 780 | self.n_cpu = n_cpu 781 | self.disable_pr_bar = disable_pr_bar 782 | self.show_vmem = show_vmem 783 | self.split_factor = split_factor 784 | 785 | @staticmethod 786 | def 
get_nanops_arg(name): 787 | if name == 'sum': 788 | return nanops.nansum 789 | if name == 'prod': 790 | return nanops.nanprod 791 | 792 | def _min_count_stat_func(self, df, workers_queue, name, axis, skipna, level, numeric_only, min_count, kwargs): 793 | def closure(): 794 | if MAJOR_PANDAS_VERSION < 2: 795 | return df._min_count_stat_function(name, self.get_nanops_arg(name), axis, skipna, level, numeric_only, 796 | min_count, **kwargs 797 | ) 798 | return df._min_count_stat_function(name, self.get_nanops_arg(name), axis, skipna, numeric_only, 799 | min_count, **kwargs 800 | ) 801 | 802 | return progress_udf_wrapper(closure, workers_queue, 1)() 803 | 804 | def _parallel_min_count_stat_func(self, data, name, kwargs, axis=0, skipna=True, level=None, 805 | numeric_only=None, min_count=0): 806 | workers_queue = Manager().Queue() 807 | split_size = get_split_size(self.n_cpu, self.split_factor) 808 | tasks = get_split_data(data, axis, split_size) 809 | total = min(split_size, data.shape[1 - axis]) 810 | result = progress_imap( 811 | partial(self._min_count_stat_func, workers_queue=workers_queue, name=name, axis=axis, skipna=skipna, 812 | level=level, min_count=min_count, numeric_only=numeric_only, kwargs=kwargs), tasks, workers_queue, 813 | total=total, n_cpu=self.n_cpu, disable=self.disable_pr_bar, show_vmem=self.show_vmem, desc=name.upper()) 814 | return pd.concat(result) 815 | 816 | def do_parallel(self, name): 817 | if name == 'sum': 818 | @doc(DOC, func=name) 819 | def p_sum(data, axis=0, skipna=True, level=None, numeric_only=None, 820 | min_count=0, **kwargs): 821 | return self._parallel_min_count_stat_func(data, name=name, axis=axis, 822 | skipna=skipna, level=level, min_count=min_count, 823 | numeric_only=numeric_only, kwargs=kwargs) 824 | 825 | return p_sum 826 | if name == 'prod': 827 | @doc(DOC, func=name) 828 | def p_prod(data, axis=0, skipna=True, level=None, numeric_only=None, 829 | min_count=0, **kwargs): 830 | return 
self._parallel_min_count_stat_func(data, name=name, axis=axis, 831 | skipna=skipna, level=level, min_count=min_count, 832 | numeric_only=numeric_only, kwargs=kwargs) 833 | 834 | return p_prod 835 | 836 | 837 | class ParallelizeAccumFunc: 838 | def __init__(self, n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1): 839 | self.n_cpu = n_cpu 840 | self.disable_pr_bar = disable_pr_bar 841 | self.show_vmem = show_vmem 842 | self.split_factor = split_factor 843 | 844 | @staticmethod 845 | def get_func(name): 846 | if name == 'cummin': 847 | return np.minimum.accumulate 848 | if name == 'cummax': 849 | return np.maximum.accumulate 850 | if name == 'cumsum': 851 | return np.cumsum 852 | if name == 'cumprod': 853 | return np.cumprod 854 | 855 | def _concat_by_columns(self, columns): 856 | first = columns[0] 857 | for d in columns[1:]: 858 | first.update(d) 859 | return pd.DataFrame(first) 860 | 861 | def _accum_func(self, df, workers_queue, name, axis, skipna, args, kwargs): 862 | def closure(): 863 | if not axis: 864 | return df._accum_func(name, self.get_func(name), axis, skipna, *args, **kwargs).to_dict(orient='series') 865 | return df._accum_func(name, self.get_func(name), axis, skipna, *args, **kwargs) 866 | 867 | return progress_udf_wrapper(closure, workers_queue, 1)() 868 | 869 | def _parallel_accum_func(self, data, name, args, kwargs, axis=0, skipna=True): 870 | workers_queue = Manager().Queue() 871 | split_size = get_split_size(self.n_cpu, self.split_factor) 872 | tasks = get_split_data(data, axis, split_size) 873 | total = min(split_size, data.shape[1 - axis]) 874 | result = progress_imap( 875 | partial(self._accum_func, workers_queue=workers_queue, name=name, axis=axis, skipna=skipna, 876 | args=args, kwargs=kwargs), tasks, workers_queue, 877 | total=total, n_cpu=self.n_cpu, disable=self.disable_pr_bar, show_vmem=self.show_vmem, desc=name.upper()) 878 | if not axis: 879 | return self._concat_by_columns(result) 880 | return pd.concat(result, axis=1 
- axis) 881 | 882 | def do_parallel(self, name): 883 | if name == 'cumsum': 884 | @doc(DOC, func=name) 885 | def p_cumsum(data, axis=0, skipna=True, *args, **kwargs): 886 | return self._parallel_accum_func(data, name=name, axis=axis, 887 | skipna=skipna, args=args, kwargs=kwargs) 888 | 889 | return p_cumsum 890 | 891 | if name == 'cumprod': 892 | @doc(DOC, func=name) 893 | def p_cumprod(data, axis=0, skipna=True, *args, **kwargs): 894 | return self._parallel_accum_func(data, name=name, axis=axis, 895 | skipna=skipna, args=args, kwargs=kwargs) 896 | 897 | return p_cumprod 898 | 899 | if name == 'cummin': 900 | @doc(DOC, func=name) 901 | def p_cummin(data, axis=0, skipna=True, *args, **kwargs): 902 | return self._parallel_accum_func(data, name=name, axis=axis, 903 | skipna=skipna, args=args, kwargs=kwargs) 904 | 905 | return p_cummin 906 | 907 | if name == 'cummax': 908 | @doc(DOC, func=name) 909 | def p_cummax(data, axis=0, skipna=True, *args, **kwargs): 910 | return self._parallel_accum_func(data, name=name, axis=axis, 911 | skipna=skipna, args=args, kwargs=kwargs) 912 | 913 | return p_cummax 914 | -------------------------------------------------------------------------------- /parallel_pandas/core/parallel_groupby.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from multiprocessing import Manager 3 | 4 | from .progress_imap import progress_imap 5 | from pandas.core.groupby.ops import _is_indexed_like 6 | from pandas.util._decorators import doc 7 | 8 | import dill 9 | 10 | from .progress_imap import progress_udf_wrapper 11 | from .tools import ( 12 | get_pandas_version, 13 | ) 14 | 15 | DOC = 'Parallel analogue of the GroupBy.{func} method\nSee pandas GroupBy docstring for more ' \ 16 | 'information\nhttps://pandas.pydata.org/docs/reference/groupby.html' 17 | 18 | MAJOR_PANDAS_VERSION, MINOR_PANDAS_VERSION = get_pandas_version() 19 | 20 | 21 | def _do_group_apply(data, dill_func, 
def _do_group_apply(data, dill_func, workers_queue, args, kwargs):
    """Worker entry point: apply the dill-serialised function to one group.

    Returns ``(result, data.axes, 0)``; the axes of the input group travel
    with the result so the parent can check whether the function changed
    the group's index.
    """
    func = dill.loads(dill_func)
    result = progress_udf_wrapper(func, workers_queue, 1)(data, *args, **kwargs)
    return result, data.axes, 0


def _prepare_result(data):
    """Separate worker results from their metadata.

    Returns ``(results, mutated)`` where ``mutated`` is True as soon as one
    result is no longer indexed like the group it was computed from.
    """
    result = [item[0] for item in data]
    mutated = any(not _is_indexed_like(*item) for item in data)
    return result, mutated


def _get_group_iterator(data, include_groups):
    """Return an iterator over the per-group objects of the GroupBy ``data``.

    On pandas 2.2+ with ``include_groups=False`` the grouping columns are
    excluded from what is iterated.
    """
    if MAJOR_PANDAS_VERSION == 2 and MINOR_PANDAS_VERSION >= 2 and not include_groups:
        source = data._obj_with_exclusions
    else:
        source = data._selected_obj
    return iter(data.grouper._get_splitter(source, data.axis))


def parallelize_groupby_apply(n_cpu=None, disable_pr_bar=False):
    """Build the ``p_apply`` method attached to pandas GroupBy objects."""

    # No docstring on the closure: the ``doc`` decorator supplies it.
    @doc(DOC, func='apply')
    def p_apply(data, func, executor='processes', include_groups=True, args=(), **kwargs):
        workers_queue = Manager().Queue()
        gr_count = data.ngroups
        iterator = _get_group_iterator(data, include_groups)
        dill_func = dill.dumps(func)
        # functools.partial objects and callable instances have no
        # ``__name__``; fall back to the type name for the bar label.
        desc = getattr(func, '__name__', type(func).__name__).upper()
        result = progress_imap(
            partial(_do_group_apply, dill_func=dill_func, workers_queue=workers_queue, args=args, kwargs=kwargs),
            iterator, workers_queue, total=gr_count, n_cpu=n_cpu, disable=disable_pr_bar, executor=executor,
            desc=desc
        )
        result, mutated = _prepare_result(result)

        # Due to a bug in the get_iterator method of the BaseGrouper class
        # that was only fixed in pandas 1.4.0, earlier versions are not
        # supported.
        return data._wrap_applied_output(data._selected_obj, result, not_indexed_same=mutated)

    return p_apply
def _do_apply(data, dill_func, workers_queue, convert_dtype, args, kwargs):
    """Worker entry point: run ``Series.apply`` on one chunk.

    Progress is reported per element. When ``convert_dtype`` is false the
    chunk is cast to ``object`` before applying.
    """
    func = dill.loads(dill_func)
    tracked = progress_udf_wrapper(func, workers_queue, data.shape[0])
    if not convert_dtype:
        data = data.astype(object)
    return data.apply(tracked, args=args, **kwargs)


def series_parallelize_apply(n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1):
    """Build the ``p_apply`` method attached to ``pd.Series``."""

    # No docstring on the closure: the ``doc`` decorator supplies it.
    @doc(DOC, func='apply')
    def p_apply(data, func, executor='processes', convert_dtype=True, args=(), **kwargs):
        workers_queue = Manager().Queue()
        split_size = get_split_size(n_cpu, split_factor)
        tasks = get_split_data(data, 1, split_size)
        dill_func = dill.dumps(func)
        # functools.partial objects and callable instances have no
        # ``__name__``; fall back to the type name for the bar label.
        desc = getattr(func, '__name__', type(func).__name__).upper()
        result = progress_imap(partial(_do_apply, convert_dtype=convert_dtype, dill_func=dill_func,
                                       workers_queue=workers_queue, args=args, kwargs=kwargs),
                               tasks, workers_queue, n_cpu=n_cpu, total=data.shape[0], disable=disable_pr_bar,
                               show_vmem=show_vmem, executor=executor, desc=desc)

        return pd.concat(result)

    return p_apply


def _do_map(data, dill_arg, workers_queue, na_action):
    """Worker entry point: run ``Series.map`` on one chunk and report one unit of progress."""
    func = dill.loads(dill_arg)

    def foo():
        return data.map(func, na_action=na_action)

    return progress_udf_wrapper(foo, workers_queue, 1)()


def series_parallelize_map(n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1):
    """Build the ``p_map`` method attached to ``pd.Series``."""

    # No docstring on the closure: the ``doc`` decorator supplies it.
    @doc(DOC, func='map')
    def p_map(data, arg, executor='threads', na_action=None):
        workers_queue = Manager().Queue()
        split_size = get_split_size(n_cpu, split_factor)
        tasks = get_split_data(data, 1, split_size)
        dill_arg = dill.dumps(arg)
        # Each chunk reports exactly one tick and a Series shorter than
        # ``split_size`` produces fewer chunks, so cap the bar total
        # accordingly or it would never reach 100%.
        total = min(split_size, data.shape[0])
        result = progress_imap(partial(_do_map, dill_arg=dill_arg,
                                       workers_queue=workers_queue, na_action=na_action),
                               tasks, workers_queue, n_cpu=n_cpu, total=total, disable=disable_pr_bar,
                               show_vmem=show_vmem, executor=executor, desc='map'.upper())

        return pd.concat(result)

    return p_map
class ParallelRolling:
    """Factory for the parallel ``p_*`` counterparts of pandas ``Rolling`` methods.

    A DataFrame window is split along the axis orthogonal to the window so
    chunks are independent. A Series window is split along its only axis,
    and each chunk carries up to ``window`` look-back rows that are removed
    again when results are concatenated.
    """

    def __init__(self, n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1):
        self.n_cpu = n_cpu if n_cpu else cpu_count()
        self.disable_pr_bar = disable_pr_bar
        self.show_vmem = show_vmem
        self.split_factor = split_factor

    @staticmethod
    def _get_method(df, name, kwargs):
        """Rebuild the window on ``df`` from ``kwargs`` and return its bound method ``name``."""
        return getattr(df.rolling(**kwargs), name)

    def do_method(self, df, workers_queue, name, serialized_flag, window_attr, args, kwargs):
        """Worker entry point: run window method ``name`` on one chunk.

        Reports one unit of progress on ``workers_queue`` when done.
        """
        if name in ['apply', 'aggregate', 'agg']:
            if serialized_flag:
                # need to deserialize the function
                args = (dill.loads(args[0]),)
            elif isinstance(args[0], dict):
                _axis = df._get_axis_number(window_attr['axis'])
                if isinstance(df, pd.DataFrame):
                    # Keep only the entries whose labels are present in this chunk.
                    func = {k: v for k, v in args[0].items() if k in df._get_axis(1 - _axis)}
                    args = (func,)

        def foo():
            method = self._get_method(df, name, window_attr)
            if name in ['aggregate', 'agg']:
                # aggregate receives the user's extra arguments packed under
                # the 'args' / 'kwargs' keys (see do_parallel).
                return method(*args, *kwargs['args'], **kwargs['kwargs'])
            return method(*args, **kwargs)

        return progress_udf_wrapper(foo, workers_queue, 1)()

    @staticmethod
    def _get_axis_and_offset(data):
        """Return ``(split_axis, look_back)`` for the window object ``data``.

        DataFrames use the window's own axis with no look-back; anything
        else is split with axis 1 and ``data.window`` rows of look-back.
        """
        if isinstance(data.obj, pd.DataFrame):
            return data.axis, 0
        return 1, data.window

    def _get_split_data(self, data):
        """Return the iterator of chunks to hand to the workers."""
        axis, offset = self._get_axis_and_offset(data)
        return get_split_data(data.obj, axis, self.n_cpu * self.split_factor, offset=offset)

    def _get_total_tasks(self, data):
        """Return the number of chunks, used as the progress bar total."""
        axis = data.axis
        if isinstance(data.obj, pd.Series):
            axis = 1
        return min(self.n_cpu * self.split_factor, data.obj.shape[1 - axis])

    def _data_reduce(self, result, data):
        """Concatenate the per-chunk results back into one object.

        With a look-back offset, only the rows a chunk was responsible for
        are kept; those are always its trailing rows.
        """
        axis, offset = self._get_axis_and_offset(data)
        if not offset:
            return pd.concat(result, axis=1 - axis)
        # The look-back is clamped at row 0 when chunks are cut, so a chunk
        # near the start may carry fewer than ``offset`` extra rows.
        # Dropping a fixed ``offset`` rows would then discard real results.
        # Chunk sizes follow np.array_split: the first ``extra`` chunks hold
        # ``base + 1`` rows, the rest ``base``.
        base, extra = divmod(data.obj.shape[0], len(result))
        trimmed = []
        for position, part in enumerate(result):
            own_rows = base + 1 if position < extra else base
            trimmed.append(part.iloc[len(part) - own_rows:])
        # NOTE(review): ignore_index=True replaces the original index with a
        # fresh 0..n-1 one; kept as in the previous implementation.
        return pd.concat(trimmed, axis=1 - axis, ignore_index=True)

    @staticmethod
    def _func_serialize(func):
        """Return ``(payload, was_serialised)``; callables are dumped with dill."""
        if callable(func):
            return dill.dumps(func), True
        return func, False

    @staticmethod
    def _get_attributes(data):
        """Collect the keyword arguments needed to rebuild the window in a worker."""
        attributes = {attribute: getattr(data, attribute) for attribute in data._attributes}
        attributes.pop("_grouper", None)
        if attributes['win_type'] == 'freq':
            attributes['win_type'] = None
        return attributes

    def parallelize_method(self, data, name, executor, *args, **kwargs):
        """Run window method ``name`` of ``data`` in parallel and return the combined result."""
        attributes = self._get_attributes(data)
        workers_queue = Manager().Queue()
        serialized_flag = False
        if name in ['apply', 'aggregate', 'agg']:
            # if func is callable need to serialize it
            func, serialized_flag = self._func_serialize(args[0])
            if isinstance(func, dict):
                # Restrict the windowed object to the labels named in the dict.
                # NOTE(review): this rebinds ``data.obj`` on the caller's
                # window object.
                _axis = data.obj._get_axis_number(attributes['axis'])
                labels = list(func.keys())
                if _axis:
                    data.obj = data.obj.loc[labels]
                elif len(func) == 1:
                    data.obj = data.obj[labels[0]]
                else:
                    data.obj = data.obj[labels]
            args = (func,)
        tasks = self._get_split_data(data)
        total = self._get_total_tasks(data)
        result = progress_imap(
            partial(self.do_method, workers_queue=workers_queue, args=args, kwargs=kwargs, name=name,
                    window_attr=attributes, serialized_flag=serialized_flag),
            tasks, workers_queue, n_cpu=self.n_cpu, disable=self.disable_pr_bar, show_vmem=self.show_vmem,
            total=total, desc=name.upper(), executor=executor,
        )
        return self._data_reduce(result, data)

    def do_parallel(self, name):
        """Return the parallel method for ``name``, or ``None`` if it is unsupported.

        The returned function is named ``p_<name>`` (``p_aggregate`` for
        both ``aggregate`` and ``agg``).
        """

        def publish(method, alias=None):
            # The closures carry no docstring of their own; the ``doc``
            # decorator supplies it.
            label = 'p_' + (alias or name)
            method.__name__ = label
            method.__qualname__ = method.__qualname__.rsplit('.', 1)[0] + '.' + label
            return doc(DOC, func=name)(method)

        if name in ('mean', 'sum', 'min', 'max'):
            def method(data, *args, executor='threads', engine=None, engine_kwargs=None, **kwargs):
                return self.parallelize_method(data, name, executor, *args, engine=engine,
                                               engine_kwargs=engine_kwargs, **kwargs)

            return publish(method)

        if name == 'median':
            def method(data, executor='threads', engine=None, engine_kwargs=None, **kwargs):
                return self.parallelize_method(data, name, executor, engine=engine,
                                               engine_kwargs=engine_kwargs, **kwargs)

            return publish(method)

        if name in ('std', 'var'):
            def method(data, executor='threads', ddof=1, *args, engine=None, engine_kwargs=None, **kwargs):
                return self.parallelize_method(data, name, executor, *args, ddof=ddof, engine=engine,
                                               engine_kwargs=engine_kwargs, **kwargs)

            return publish(method)

        if name == 'sem':
            def method(data, executor='threads', ddof=1, *args, **kwargs):
                return self.parallelize_method(data, name, executor, *args, ddof=ddof, **kwargs)

            return publish(method)

        if name in ('skew', 'kurt'):
            def method(data, executor='threads', **kwargs):
                return self.parallelize_method(data, name, executor, **kwargs)

            return publish(method)

        if name == 'rank':
            def method(data, executor='processes', method='average', ascending=True, pct=False, **kwargs):
                return self.parallelize_method(data, name, executor, method=method, ascending=ascending,
                                               pct=pct, **kwargs)

            return publish(method)

        if name == 'quantile':
            def method(data, quantile, executor='threads', interpolation="linear", **kwargs):
                return self.parallelize_method(data, name, executor, quantile, interpolation=interpolation,
                                               **kwargs)

            return publish(method)

        if name == 'cov':
            def method(data, executor='processes', other=None, pairwise=None, ddof=1, numeric_only=False):
                return self.parallelize_method(data, name, executor, other=other, pairwise=pairwise,
                                               ddof=ddof, numeric_only=numeric_only)

            return publish(method)

        if name == 'apply':
            def method(data, func, executor='processes', raw=False, engine=None, engine_kwargs=None,
                       args=None, kwargs=None):
                return self.parallelize_method(data, name, executor, func, raw=raw, engine=engine,
                                               engine_kwargs=engine_kwargs, args=args, kwargs=kwargs)

            return publish(method)

        if name in ('aggregate', 'agg'):
            def method(data, func, *args, executor='processes', **kwargs):
                # Extra arguments are packed so do_method can forward them
                # to the user function untouched.
                return self.parallelize_method(data, name, executor, func, args=args, kwargs=kwargs)

            return publish(method, alias='aggregate')

        return None
class ParallelWindow(ParallelRolling):
    """Parallel ``p_*`` methods for weighted ``Window`` objects (mean, sum, std, var)."""

    @staticmethod
    def _get_attributes(data):
        # Same as the parent, but 'win_type' is passed through unchanged.
        collected = {}
        for attribute in data._attributes:
            collected[attribute] = getattr(data, attribute)
        collected.pop("_grouper", None)
        return collected

    def do_parallel(self, name):
        # All four supported methods share one signature; anything else is
        # unsupported and yields None.
        if name not in ('mean', 'sum', 'std', 'var'):
            return None

        # No docstring on the closure: the ``doc`` decorator supplies it.
        def method(data, executor='threads', **kwargs):
            return self.parallelize_method(data, name, executor, **kwargs)

        label = 'p_' + name
        method.__name__ = label
        method.__qualname__ = method.__qualname__.rsplit('.', 1)[0] + '.' + label
        return doc(DOC, func=name)(method)


class ParallelGroupbyMixin(ParallelRolling):
    """Per-group scheduling for window-over-groupby objects.

    Each task is a ``(group key, group data)`` pair taken from the grouper.
    """

    def do_method(self, data, workers_queue, name, serialized_flag, window_attr, args, kwargs):
        group_key, group_df = data[0], data[1]
        takes_func = name in ['apply', 'aggregate', 'agg']
        if takes_func and serialized_flag:
            # need to deserialize the function
            args = (dill.loads(args[0]),)
        elif takes_func and isinstance(args[0], dict):
            _axis = group_df._get_axis_number(window_attr['axis'])
            if isinstance(group_df, pd.DataFrame):
                labels = group_df._get_axis(1 - _axis)
                args = ({k: v for k, v in args[0].items() if k in labels},)

        def foo():
            method = self._get_method(group_df, name, window_attr)
            if name in ['aggregate', 'agg']:
                result = method(*args, *kwargs['args'], **kwargs['kwargs'])
            else:
                result = method(*args, **kwargs)
            # Prefix the result index with the group key level(s).
            if isinstance(group_key, tuple):
                levels = [[part] for part in group_key]
            else:
                levels = [[group_key]]
            levels.append(result.index.tolist())
            result.index = pd.MultiIndex.from_product(levels)
            return result

        return progress_udf_wrapper(foo, workers_queue, 1)()

    def _get_split_data(self, data):
        # One task per group.
        return data._grouper.get_iterator(data.obj)

    def _get_total_tasks(self, data):
        return data._grouper.ngroups

    def _data_reduce(self, result, data):
        combined = pd.concat(result)
        level_names = data._grouper.names + [data._grouper.axis.name]
        combined.rename_axis(level_names, inplace=True)
        return combined


class ParallelRollingGroupby(ParallelGroupbyMixin, ParallelRolling):
    """Parallel methods for ``RollingGroupby``."""


class ParallelExpanding(ParallelRolling):
    """Parallel methods for ``Expanding`` windows (DataFrame only)."""

    @staticmethod
    def _get_method(df, name, kwargs):
        window = df.expanding(**kwargs)
        return getattr(window, name)

    @staticmethod
    def _get_axis_and_offset(data):
        if not isinstance(data.obj, pd.DataFrame):
            raise NotImplementedError('Parallel methods for Series objects are not implemented.')
        return data.axis, 0


class ParallelExpandingGroupby(ParallelGroupbyMixin, ParallelExpanding):
    """Parallel methods for ``ExpandingGroupby``."""


class ParallelEWM(ParallelRolling):
    """Parallel methods for ``ExponentialMovingWindow`` (DataFrame only)."""

    @staticmethod
    def _get_axis_and_offset(data):
        if not isinstance(data.obj, pd.DataFrame):
            raise NotImplementedError('Parallel methods for Series objects are not implemented.')
        return data.axis, 0

    @staticmethod
    def _get_method(df, name, kwargs):
        window = df.ewm(**kwargs)
        return getattr(window, name)


class ParallelEWMGroupby(ParallelGroupbyMixin, ParallelEWM):
    """Parallel methods for ``ExponentialMovingWindowGroupby``."""
class ProgressBar(tqdm):
    """tqdm bar that switches to the 'warning' style when closed before completion."""

    def close(self):
        super().close()
        # ``disp`` is not present on every tqdm front end, hence the guard.
        if hasattr(self, 'disp') and self.total and self.n < self.total:
            self.disp(bar_style='warning')


class MemoryProgressBar(tqdm):
    """Bar showing a percentage; its colour follows the current value."""

    def refresh(self, **kwargs):
        super().refresh(**kwargs)
        if 70 <= self.n < 90:
            self.colour = 'orange'
        elif self.n >= 90:
            self.colour = 'red'
        else:
            self.colour = 'green'


class ProgressStatus:
    """Mutable bookkeeping for the adaptive reporting in ``progress_udf_wrapper``."""

    def __init__(self):
        self.next_update = 1  # call count at which the next report is due
        self.last_update_t = time.perf_counter()  # time of the last report
        self.last_update_val = 0  # call count at the last report


def progress_udf_wrapper(func, workers_queue, total):
    """Wrap ``func`` so that its calls are reported to ``workers_queue``.

    Reports are ``(1, n_calls_since_last_report)`` tuples. The reporting
    stride adapts to the observed call rate (about a quarter of the calls
    made per second), and a final report is sent on the ``total``-th call
    so the reported counts always sum to ``total``.
    """
    state = ProgressStatus()
    cnt = count(1)

    def wrapped_udf(*args, **kwargs):
        result = func(*args, **kwargs)
        updated = next(cnt)
        if updated == state.next_update:
            time_now = time.perf_counter()

            delta_t = time_now - state.last_update_t
            delta_i = updated - state.last_update_val

            # A coarse clock can report no elapsed time between two calls;
            # keep the current stride instead of dividing by zero.
            stride = int((delta_i / delta_t) * .25) if delta_t > 0 else delta_i
            state.next_update += max(stride, 1)
            state.last_update_val = updated
            state.last_update_t = time_now
            workers_queue.put_nowait((1, delta_i))
        elif updated == total:
            # Flush whatever has not been reported yet on the last call.
            workers_queue.put_nowait((1, updated - state.last_update_val))
        return result

    return wrapped_udf


def _process_status(bar_size, disable, show_vmem, desc, q):
    """Drain ``q`` and drive the progress bar(s) until a falsy flag arrives.

    Queue items are ``(flag, increment)`` tuples; ``(None, None)`` is the
    stop signal.
    """
    bar = ProgressBar(total=bar_size, disable=disable, desc=desc + ' DONE')
    vmem = virtual_memory()
    if show_vmem:
        vmem_pbar = MemoryProgressBar(range(100),
                                      bar_format="{desc}: {percentage:.1f}%|{bar}| " + bytes2human(vmem.total),
                                      initial=vmem.percent, colour='green', position=1, desc='VMEM USAGE',
                                      disable=disable,
                                      )
        vmem_pbar.refresh()
    while True:
        flag, upd_value = q.get()
        if not flag:
            bar.close()
            if show_vmem:
                vmem_pbar.close()
            return
        bar.update(upd_value)
        # Refresh the memory bar at most once per second.
        if show_vmem and time.time() - vmem_pbar.last_print_t >= 1:
            vmem = virtual_memory()
            vmem_pbar.update(vmem.percent - vmem_pbar.n)
            vmem_pbar.refresh()


def _do_parallel(func, tasks, initializer, initargs, n_cpu, total, disable, show_vmem,
                 q, executor, desc):
    """Map ``func`` over ``tasks`` in a pool while a thread renders progress from ``q``.

    Returns the results as a list, in task order.
    """
    # Imported explicitly: ``import multiprocessing`` alone does not
    # guarantee that the ``pool`` submodule is loaded.
    from multiprocessing.pool import ThreadPool

    if not n_cpu:
        n_cpu = mp.cpu_count()
    thread_ = Thread(target=_process_status, args=(total, disable, show_vmem, desc, q))
    thread_.start()
    if executor == 'threads':
        exc_pool = ThreadPool(n_cpu, initializer=initializer, initargs=initargs)
    else:
        exc_pool = mp.Pool(n_cpu, initializer=initializer, initargs=initargs)
    with exc_pool as p:
        result = list(p.imap(func, tasks))
    # Tell the status thread to close the bars, then wait for it.
    q.put((None, None))
    thread_.join()
    return result


def progress_imap(func, tasks, q, executor='threads', initializer=None, initargs=(), n_cpu=None, total=None,
                  disable=False, process_timeout=None, show_vmem=False, desc=''
                  ):
    """Ordered parallel map with a progress bar.

    ``q`` is the queue on which the workers report progress. ``executor``
    must be ``'threads'`` or ``'processes'``, otherwise ``ValueError`` is
    raised. When ``process_timeout`` is set, every call of ``func`` is
    wrapped with that timeout. Any exception from the workers is re-raised
    after the progress thread has been told to stop.
    """
    if executor not in ['threads', 'processes']:
        raise ValueError('Invalid executor value specified. Must be one of the values: "threads", "processes"')
    try:
        if process_timeout:
            func = partial(_wrapped_func, func, process_timeout, True)
        result = _do_parallel(func, tasks, initializer, initargs, n_cpu, total, disable, show_vmem, q, executor,
                              desc)
    except (KeyboardInterrupt, Exception):
        # Make sure the status thread terminates before propagating.
        q.put((None, None))
        raise
    return result
Must be one of the values: "threads", "processes"') 124 | try: 125 | if process_timeout: 126 | func = partial(_wrapped_func, func, process_timeout, True) 127 | result = _do_parallel(func, tasks, initializer, initargs, n_cpu, total, disable, show_vmem, q, executor, desc) 128 | except (KeyboardInterrupt, Exception): 129 | q.put((None, None)) 130 | raise 131 | return result 132 | -------------------------------------------------------------------------------- /parallel_pandas/core/tools.py: -------------------------------------------------------------------------------- 1 | import _thread as thread 2 | import threading 3 | from multiprocessing import cpu_count 4 | from functools import wraps 5 | from concurrent.futures import ThreadPoolExecutor 6 | 7 | import platform 8 | import signal 9 | import os 10 | 11 | import pandas as pd 12 | import numpy as np 13 | import time 14 | 15 | 16 | def time_of_function(function): 17 | def wrapped(*args, **kwargs): 18 | start_time = time.time() 19 | res = function(*args, **kwargs) 20 | print('Time of function {} is {:.3f} s.'.format(function.__name__, time.time() - start_time)) 21 | return res 22 | 23 | return wrapped 24 | 25 | 26 | def get_pandas_version(): 27 | major, minor = pd.__version__.split(".")[:2] 28 | return int(major), int(minor) 29 | 30 | 31 | def _rank(mat): 32 | return np.argsort(np.argsort(mat, axis=0), axis=0) 33 | 34 | 35 | def parallel_rank(mat, n_cpu): 36 | matrix_parts = np.array_split(mat, n_cpu, axis=1) 37 | with ThreadPoolExecutor(n_cpu) as pool: 38 | return np.hstack(list(pool.map(_rank, matrix_parts))) 39 | 40 | 41 | def get_split_size(n_cpu, split_factor): 42 | if n_cpu is None: 43 | n_cpu = cpu_count() 44 | if split_factor is None: 45 | split_factor = 4 46 | return n_cpu * split_factor 47 | 48 | 49 | def iterate_by_df(df, idx, axis, offset): 50 | if axis: 51 | for i in idx: 52 | start = max(0, i[0] - offset) 53 | yield df.iloc[start:i[-1] + 1] 54 | else: 55 | for i in idx: 56 | yield df.iloc[:, i[0]:i[-1] 
+ 1] 57 | 58 | 59 | def get_split_data(df, axis, split_size, offset=0): 60 | split_size = min(split_size, df.shape[1 - axis]) 61 | idx_split = np.array_split(np.arange(df.shape[1 - axis]), split_size) 62 | tasks = iterate_by_df(df, idx_split, axis, offset) 63 | return tasks 64 | 65 | 66 | def stop_function(): 67 | if platform.system() == 'Windows': 68 | thread.interrupt_main() 69 | else: 70 | os.kill(os.getpid(), signal.SIGINT) 71 | 72 | 73 | def stopit_after_timeout(s, raise_exception=True): 74 | def actual_decorator(func): 75 | @wraps(func) 76 | def wrapper(*args, **kwargs): 77 | timer = threading.Timer(s, stop_function) 78 | try: 79 | timer.start() 80 | result = func(*args, **kwargs) 81 | except KeyboardInterrupt: 82 | msg = f'function \"{func.__name__}\" took longer than {s} s.' 83 | if raise_exception: 84 | raise TimeoutError(msg) 85 | result = msg 86 | finally: 87 | timer.cancel() 88 | return result 89 | 90 | return wrapper 91 | 92 | return actual_decorator 93 | 94 | 95 | def _wrapped_func(func, s, raise_exception, *args, **kwargs): 96 | return stopit_after_timeout(s, raise_exception=raise_exception)(func)(*args, **kwargs) 97 | -------------------------------------------------------------------------------- /parallel_pandas/main.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from .core.tools import get_pandas_version 4 | 5 | from .core import series_parallelize_apply 6 | from .core import series_parallelize_map 7 | from .core import parallelize_apply 8 | from .core import parallelize_chunk_apply 9 | from .core import parallelize_replace 10 | from .core import ParallelizeStatFunc 11 | from .core import ParallelizeStatFuncDdof 12 | from .core import parallelize_groupby_apply 13 | from .core import parallelize_applymap 14 | from .core import parallelize_describe 15 | from .core import parallelize_nunique 16 | from .core import parallelize_mad 17 | from .core import parallelize_idxmax 18 | from 
class ParallelPandas:
    """Entry point that attaches the parallel ``p_*`` methods to pandas classes."""

    @staticmethod
    def initialize(n_cpu=None, disable_pr_bar=False, show_vmem=False, split_factor=1):
        # Every factory below is configured with the same options.
        base = dict(n_cpu=n_cpu, disable_pr_bar=disable_pr_bar, show_vmem=show_vmem)
        common = dict(base, split_factor=split_factor)

        # add parallel methods to Series
        pd.Series.p_apply = series_parallelize_apply(**common)
        pd.Series.p_map = series_parallelize_map(**common)
        pd.Series.p_isin = parallelize_isin(**common)

        # add parallel methods to DataFrame: plain factory functions
        frame_factories = {
            'p_corr': parallelize_corr,
            'p_apply': parallelize_apply,
            'p_agg': parallelize_aggregate,
            'p_replace': parallelize_replace,
            'p_describe': parallelize_describe,
            'p_nunique': parallelize_nunique,
            'p_mad': parallelize_mad,
            'p_idxmax': parallelize_idxmax,
            'p_idxmin': parallelize_idxmin,
            'p_rank': parallelize_rank,
            'p_quantile': parallelize_quantile,
            'p_mode': parallelize_mode,
            'chunk_apply': parallelize_chunk_apply,
            'p_isin': parallelize_isin,
        }
        for attr, factory in frame_factories.items():
            setattr(pd.DataFrame, attr, factory(**common))

        # add parallel methods to DataFrame: class-based factories, one
        # fresh instance per method
        frame_families = (
            (ParallelizeStatFunc, ('min', 'max', 'mean', 'median', 'skew', 'kurt')),
            (ParallelizeStatFuncDdof, ('std', 'var', 'sem')),
            (ParallelizeMinCountStatFunc, ('sum', 'prod')),
            (ParallelizeAccumFunc, ('cumprod', 'cummin', 'cummax', 'cumsum')),
        )
        for family, method_names in frame_families:
            for method_name in method_names:
                setattr(pd.DataFrame, 'p_' + method_name, family(**common).do_parallel(method_name))

        # p_map is only attached from pandas 2.1 on; p_applymap always.
        # NOTE(review): these two factories are called without split_factor.
        if PD_VERSION >= 21:
            pd.DataFrame.p_map = parallelize_map(**base)
        pd.DataFrame.p_applymap = parallelize_applymap(**base)

        # Window-like classes: (pandas class, parallel factory, supported ops)
        window_families = (
            (pd.core.window.Rolling, ParallelRolling, ROLL_AND_EXP_OPS),
            (pd.core.window.Window, ParallelWindow, WIN_OPS),
            (pd.core.window.Expanding, ParallelExpanding, ROLL_AND_EXP_OPS),
            (pd.core.window.ExponentialMovingWindow, ParallelEWM, EWM_OPS),
            (pd.core.window.RollingGroupby, ParallelRollingGroupby, ROLL_AND_EXP_OPS),
            (pd.core.window.ExpandingGroupby, ParallelExpandingGroupby, ROLL_AND_EXP_OPS),
            (pd.core.window.ExponentialMovingWindowGroupby, ParallelEWMGroupby, EWM_OPS),
        )
        for target, family, ops in window_families:
            for name in ops:
                setattr(target, 'p_' + name, family(**common).do_parallel(name))

        # add parallel methods to DataFrameGroupBy and SeriesGroupBy
        for target in (pd.core.groupby.DataFrameGroupBy, pd.core.groupby.SeriesGroupBy):
            target.p_apply = parallelize_groupby_apply(n_cpu=n_cpu, disable_pr_bar=disable_pr_bar)
# Register the distribution metadata and dependencies with setuptools.
setup(
    name='parallel-pandas',
    python_requires='>=3.7',
    version='0.6.5',
    packages=find_packages(),
    author='Dubovik Pavel',
    author_email='geometryk@gmail.com',
    description='Parallel processing on pandas with progress bars',
    # README.md text loaded earlier in this script; declared as Markdown below.
    long_description=long_description,
    long_description_content_type='text/markdown',
    keywords=[
        'parallel pandas',
        'progress bar',
        'parallel apply',
        'parallel groupby',
        'multiprocessing bar',
    ],
    url='https://github.com/dubovikmaster/parallel-pandas',
    license='MIT',
    # Lower bounds mirror requirements.txt so that a pip install resolves to
    # the same minimum versions the project declares there.
    install_requires=[
        'pandas >= 1.4.0',
        'dill >= 0.3.5.1',
        'psutil >= 5.9.1',
        'tqdm >= 4.64.0',
        'scipy',
    ],
    # Optional dependency group, installable as ``parallel-pandas[numba]``.
    extras_require={
        'numba': [
            'numba',
        ],
    },
    platforms='any',
)