├── .circleci
│   ├── codecov.yml
│   └── config.yml
├── .coveragerc
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── assets
│   ├── groupby_parallel_v_single_log10.png
│   ├── groupby_parallel_v_single_real.png
│   ├── groupby_parallel_v_single_text_log10.png
│   ├── groupby_parallel_v_single_text_real.png
│   ├── groupby_parallel_vs_single_compatible.png
│   ├── modin_swifter_performance_benchmark_log10.png
│   ├── modin_swifter_performance_benchmark_real.png
│   ├── multiprocessing_v_single_log10.png
│   ├── multiprocessing_v_single_real.png
│   ├── multiprocessing_vs_single_compatible.png
│   ├── vectorizes_when_possible_compatible.png
│   ├── vectorizes_when_possible_log10.png
│   └── vectorizes_when_possible_real.png
├── docker
│   ├── Dockerfile-dev
│   ├── docker-compose.yml
│   ├── requirements-dev.txt
│   └── requirements-windows.txt
├── docs
│   ├── changelog.md
│   └── documentation.md
├── examples
│   ├── modin_dataframe_swifter_performance_benchmark.ipynb
│   ├── swifter_apply_examples.ipynb
│   └── swifter_speed_comparison.ipynb
├── setup.cfg
├── setup.py
├── swifter
│   ├── __init__.py
│   ├── base.py
│   ├── parallel_accessor.py
│   ├── swifter.py
│   ├── swifter_tests.py
│   └── tqdm_dask_progressbar.py
└── tox.ini

--------------------------------------------------------------------------------
/.circleci/codecov.yml:
--------------------------------------------------------------------------------
1 | codecov:
2 |   require_ci_to_pass: yes
3 | 
4 | coverage:
5 |   precision: 2
6 |   round: down
7 |   range: "70...100"
8 | 
9 |   status:
10 |     project: yes
11 |     default:
12 |       target: number
13 |       threshold: 75%
14 |       base: auto
15 |     patch: off
16 |     changes: no
17 | 
18 | parsers:
19 |   gcov:
20 |     branch_detection:
21 |       conditional: yes
22 |       loop: yes
23 |       method: no
24 |       macro: no
25 | 
26 | comment:
27 |   layout: "reach,diff,flags,tree"
28 |   behavior: default
29 |   require_changes: no
30 | 
31 | ignore:
32 |   - "/usr/local/lib/**/*"
33 |   - "/usr/lal/lib/**/*"
--------------------------------------------------------------------------------
/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | version: 2.1
2 | orbs:
3 |   win: circleci/windows@5.0
4 |   codecov: codecov/codecov@1.0.5
5 | jobs:
6 |   unittest-lint-codecov-linux:
7 |     parallelism: 1
8 |     resource_class: xlarge  # 8 vCPU 16GB RAM
9 |     working_directory: ~/repo
10 |     docker:
11 |       - image: python:3.9
12 |         auth:
13 |           username: $DOCKERHUB_USERNAME
14 |           password: $DOCKERHUB_PASSWORD
15 | 
16 |     steps:
17 |       - checkout
18 |       - run:
19 |           name: Install requirements
20 |           command: |
21 |             pip install --upgrade pip
22 |             pip install -r docker/requirements-dev.txt
23 |       - run:
24 |           name: Black lint check
25 |           command: |
26 |             cd swifter && black --line-length 120 --check .
27 | - run: 28 | name: Unit tests 29 | command: | 30 | coverage run -m unittest swifter/swifter_tests.py 31 | - run: 32 | name: Codecov report 33 | command: | 34 | coverage report -i && coverage html -i 35 | codecov --required || (sleep 5 && codecov --required) || (sleep 5 && codecov --required) || (sleep 5 && codecov --required) || (sleep 5 && codecov --required) 36 | - store_artifacts: 37 | path: htmlcov 38 | 39 | unittest-windows: 40 | parallelism: 1 41 | working_directory: ~/repo 42 | executor: 43 | name: win/default 44 | size: large # 8 vCPU 30GB RAM 45 | 46 | steps: 47 | - checkout 48 | - run: 49 | name: Install requirements 50 | command: | 51 | pip install --upgrade pip 52 | pip install -r docker/requirements-windows.txt 53 | shell: bash.exe 54 | - run: 55 | name: Unit tests 56 | command: | 57 | python -m unittest swifter/swifter_tests.py 58 | shell: bash.exe 59 | 60 | workflows: 61 | version: 2 62 | build-and-test: 63 | jobs: 64 | - unittest-lint-codecov-linux 65 | - unittest-windows 66 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | # .coveragerc to control coverage.py 2 | [run] 3 | source = 4 | swifter/ 5 | omit = 6 | /usr/* 7 | 8 | [report] 9 | # Regexes for lines to exclude from consideration 10 | exclude_lines = 11 | # Have to re-enable the standard pragma 12 | pragma: no cover 13 | 14 | # Don't complain about missing debug-only code: 15 | def __repr__ 16 | if self\.debug 17 | 18 | # Don't complain if tests don't hit defensive assertion code: 19 | raise AssertionError 20 | raise NotImplementedError 21 | 22 | # Don't complain if non-runnable code isn't run: 23 | if 0: 24 | if __name__ == .__main__.: 25 | 26 | ignore_errors = True 27 | 28 | [html] 29 | directory = htmlcov -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/* 2 | *.ipynb_checkpoints 3 | */__pycache__/** 4 | *tmp/* 5 | .idea 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Jason Carpenter 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | 
3 | graft assets
4 | graft docs
5 | graft examples
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: help build ci-black ci-flake8 ci-unittest ci black sphinx git-tag release-tag release-production dev-start dev-stop
2 | 
3 | PROJECT=swifter
4 | IMAGE=${PROJECT}
5 | VERSION_FILE:=${PROJECT}/__init__.py
6 | VERSION_TAG:=$(shell cat ${VERSION_FILE} | grep -oEi "[0-9]+\.[0-9]+\.[0-9]+")
7 | 
8 | help: ## Display help
9 | 	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
10 | 
11 | build: ## Build docker image with tag "latest" for unit testing
12 | 	cd docker && DOCKER_BUILDKIT=1 docker build --build-arg GRANT_SUDO="yes" -t $(IMAGE) . -f Dockerfile-dev && cd ..
13 | 
14 | sha256: ## Get the openssl sha256 of the deployed version
15 | 	curl -sL https://url.com/you/want/to/get.tar.gz | openssl sha256
16 | 
17 | git-tag: ## Tag in git from VERSION file then push tag up to origin
18 | 	echo $(VERSION_TAG)
19 | 	git tag $(VERSION_TAG)
20 | 	git push --tag
21 | 
22 | release-production-pypi: ## Builds and publishes the version to PyPI
23 | 	python3 setup.py build sdist
24 | 	twine upload dist/swifter-${VERSION_TAG}.tar.gz
25 | 	rm -r dist && rm -r build
26 | 
27 | release-production: git-tag release-production-pypi
28 | 
29 | ci-black: build ## Test for black requirements
30 | 	docker run --rm -t -v ${PWD}:/mnt $(IMAGE) black --line-length 120 --diff --color --check ${PROJECT}
31 | 
32 | ci-flake8: build ## Test for flake8 requirements
33 | 	docker run --rm -v ${PWD}:/mnt -t $(IMAGE) flake8 ${PROJECT}
34 | 
35 | ci-unittest: build ## Test pytest unittests
36 | 	docker run --rm -v ${PWD}:/mnt -t $(IMAGE) python -m unittest ${PROJECT}/swifter_tests.py
37 | 
38 | ci: ci-black ci-flake8 ci-unittest ## Check black, flake8, and unittests
39 | 	@echo "CI successful"
40 | 
41 | black: build ## Run black, which formats code
42 | 	docker run --rm -v ${PWD}:/mnt -t $(IMAGE) black --line-length 120 ${PROJECT}
43 | 
44 | dev-start: ## Primary make command for dev, spins up containers
45 | 	docker-compose -f docker/docker-compose.yml --project-name ${PROJECT} up -d --build
46 | 
47 | dev-stop: ## Spins down active containers
48 | 	docker-compose -f docker/docker-compose.yml --project-name ${PROJECT} down
49 | 
50 | sphinx: ## Creates docs using sphinx
51 | 	echo "Not implemented"
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # swifter
2 | A package which efficiently applies any function to a pandas dataframe or series in the fastest available manner.
3 | 
4 | [![PyPI version](https://badge.fury.io/py/swifter.svg)](https://badge.fury.io/py/swifter)
5 | [![CircleCI](https://circleci.com/gh/jmcarpenter2/swifter.svg?style=shield)](https://circleci.com/gh/jmcarpenter2/swifter)
6 | [![codecov](https://img.shields.io/codecov/c/github/jmcarpenter2/swifter?label=codecov&logo=codecov&style=flat)](https://codecov.io/gh/jmcarpenter2/swifter)
7 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black)
8 | ![GitHub stars](https://img.shields.io/github/stars/jmcarpenter2/swifter.svg?style=popout)
9 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/swifter.svg)
10 | 
11 | ## Blog posts
12 | * [Release 1.0.0](https://medium.com/@jmcarpenter2/swifter-1-0-0-automatically-efficient-pandas-and-modin-dataframe-applies-cfbd9555e7c8)
13 | * [First release](https://medium.com/@jmcarpenter2/swiftapply-automatically-efficient-pandas-apply-operations-50e1058909f9)
14 | 
15 | ## Documentation
16 | To learn about the latest improvements, please check the [changelog](docs/changelog.md).
17 | 
18 | Further documentation on swifter is available [here](docs/documentation.md).
19 | 
20 | Check out the [examples notebook](examples/swifter_apply_examples.ipynb), along with the [speed benchmark notebook](examples/swifter_speed_comparison.ipynb). The benchmarks are created using the library [perfplot](https://github.com/unutbu/perfplot).
21 | 
22 | ## Installation:
23 | ```
24 | $ pip install -U pandas # upgrade pandas
25 | $ pip install swifter # first time installation
26 | $ pip install swifter[notebook] # first time installation including dependency for rich progress bar in jupyter notebooks
27 | $ pip install swifter[groupby] # first time installation including dependency for groupby.apply functionality
28 | 
29 | $ pip install -U swifter # upgrade to latest version if already installed
30 | $ pip install -U swifter[notebook] # upgrade to latest version to include dependency for rich progress bar in jupyter notebooks
31 | $ pip install -U swifter[groupby] # upgrade to latest version to include dependency for groupby.apply functionality
32 | ```
33 | 
34 | Alternatively, to install on [Anaconda](https://anaconda.org/conda-forge/swifter):
35 | ```
36 | conda install -c conda-forge swifter # Install swifter
37 | conda install -c conda-forge "swifter>=1.3.2" "ray>=1.0.0" # Install swifter with dependency for groupby.apply
38 | ```
39 | 
40 | ...after installing, import `swifter` into your code along with `pandas` using:
41 | ```python
42 | import pandas as pd
43 | import swifter
44 | ```
45 | 
46 | ...alternatively, `swifter` can be used with `modin` dataframes in the same manner:
47 | ```python
48 | import modin.pandas as pd
49 | import swifter
50 | ```
51 | NOTE: if you import swifter before modin, you will have to additionally register modin: `swifter.register_modin()`
52 | 
53 | 
54 | ## Easy to use
55 | ```python
56 | df = pd.DataFrame({'x': [1, 2, 3, 4], 'y': [5, 6, 7, 8]})
57 | 
58 | # runs on single core
59 | df['x2'] = df['x'].apply(lambda x: x**2)
60 | # runs on multiple cores
61 | df['x2'] = df['x'].swifter.apply(lambda x: x**2)
62 | 
63 | # use swifter apply on whole dataframe
64 | df['agg'] = df.swifter.apply(lambda x: x.sum() - x.min())
65 | 
66 | # use swifter apply on specific columns
67 | df['outCol'] = df[['inCol1', 'inCol2']].swifter.apply(my_func)
68 | df['outCol'] = df[['inCol1', 'inCol2', 'inCol3']].swifter.apply(my_func,
69 |                positional_arg, keyword_arg=keyword_argval)
70 | ```
71 | 
72 | ## Vectorizes your function, when possible
73 | ![Alt text](/assets/vectorizes_when_possible_compatible.png?raw=true)
74 | 
75 | ## When vectorization is not possible, it automatically decides which is faster: dask parallel processing or a simple pandas apply
76 | ![Alt text](/assets/multiprocessing_vs_single_compatible.png?raw=true)
77 | 
78 | ## Highly performant, even for groupby applies
79 | ![Alt text](/assets/groupby_parallel_vs_single_compatible.png?raw=true)
80 | 
81 | See the [speed benchmark notebook](examples/swifter_speed_comparison.ipynb) for the source of the above performance plots.
82 | 
83 | ## Notes
84 | 1. The function is documented in the .py file. In Jupyter Notebooks, you can see the docs by pressing Shift+Tab three times. Also, check out the complete documentation [here](docs/documentation.md) along with the [changelog](docs/changelog.md).
85 | 
86 | 2. Please upgrade your version of pandas, as the pandas extension API used in this module is a recent addition to pandas.
87 | 
88 | 3. Import modin before importing swifter, if you wish to use modin with swifter. Otherwise, use `swifter.register_modin()` to access it.
89 | 
90 | 4. Do not use swifter to apply a function that modifies external variables. Under the hood, swifter does sample applies to optimize performance. These sample applies will modify the external variable in addition to the final apply. Thus, you will end up with an erroneously modified external variable.
91 | 
92 | 5. It is advised to disable the progress bar if calling swifter from a forked process, as the progress bar may get confused between various multiprocessing modules.
93 | 
94 | 6. If swifter's return differs from the pandas return, try explicitly casting the type, e.g.: `df.swifter.apply(lambda x: float(np.angle(x)))`
95 | 
--------------------------------------------------------------------------------
/assets/groupby_parallel_v_single_log10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jmcarpenter2/swifter/ea8d30ab1055441cca10cd07913c78370c284352/assets/groupby_parallel_v_single_log10.png
--------------------------------------------------------------------------------
/assets/groupby_parallel_v_single_real.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jmcarpenter2/swifter/ea8d30ab1055441cca10cd07913c78370c284352/assets/groupby_parallel_v_single_real.png
--------------------------------------------------------------------------------
/assets/groupby_parallel_v_single_text_log10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jmcarpenter2/swifter/ea8d30ab1055441cca10cd07913c78370c284352/assets/groupby_parallel_v_single_text_log10.png
--------------------------------------------------------------------------------
/assets/groupby_parallel_v_single_text_real.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jmcarpenter2/swifter/ea8d30ab1055441cca10cd07913c78370c284352/assets/groupby_parallel_v_single_text_real.png
--------------------------------------------------------------------------------
/assets/groupby_parallel_vs_single_compatible.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jmcarpenter2/swifter/ea8d30ab1055441cca10cd07913c78370c284352/assets/groupby_parallel_vs_single_compatible.png
-------------------------------------------------------------------------------- /assets/modin_swifter_performance_benchmark_log10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmcarpenter2/swifter/ea8d30ab1055441cca10cd07913c78370c284352/assets/modin_swifter_performance_benchmark_log10.png -------------------------------------------------------------------------------- /assets/modin_swifter_performance_benchmark_real.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmcarpenter2/swifter/ea8d30ab1055441cca10cd07913c78370c284352/assets/modin_swifter_performance_benchmark_real.png -------------------------------------------------------------------------------- /assets/multiprocessing_v_single_log10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmcarpenter2/swifter/ea8d30ab1055441cca10cd07913c78370c284352/assets/multiprocessing_v_single_log10.png -------------------------------------------------------------------------------- /assets/multiprocessing_v_single_real.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmcarpenter2/swifter/ea8d30ab1055441cca10cd07913c78370c284352/assets/multiprocessing_v_single_real.png -------------------------------------------------------------------------------- /assets/multiprocessing_vs_single_compatible.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmcarpenter2/swifter/ea8d30ab1055441cca10cd07913c78370c284352/assets/multiprocessing_vs_single_compatible.png -------------------------------------------------------------------------------- /assets/vectorizes_when_possible_compatible.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmcarpenter2/swifter/ea8d30ab1055441cca10cd07913c78370c284352/assets/vectorizes_when_possible_compatible.png -------------------------------------------------------------------------------- /assets/vectorizes_when_possible_log10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmcarpenter2/swifter/ea8d30ab1055441cca10cd07913c78370c284352/assets/vectorizes_when_possible_log10.png -------------------------------------------------------------------------------- /assets/vectorizes_when_possible_real.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmcarpenter2/swifter/ea8d30ab1055441cca10cd07913c78370c284352/assets/vectorizes_when_possible_real.png -------------------------------------------------------------------------------- /docker/Dockerfile-dev: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | ADD requirements-windows.txt /build/requirements-windows.txt 3 | ADD requirements-dev.txt /build/requirements-dev.txt 4 | WORKDIR /build/ 5 | RUN pip install --upgrade pip 6 | RUN pip install -r requirements-dev.txt 7 | WORKDIR /mnt/ 8 | ENV PYTHONPATH "${PYTHONPATH}:/mnt" 9 | -------------------------------------------------------------------------------- /docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | bash: 4 | build: 5 | context: 
.
6 |       dockerfile: Dockerfile-dev
7 |     environment:
8 |       GRANT_SUDO: "yes"
9 |     user: root
10 |     volumes:
11 |       - ../:/mnt
12 |     entrypoint: "/bin/bash"
13 |     stdin_open: true
14 |     container_name: "swifter_bash_${USER}"
15 |     tty: true
16 | 
17 |   jupyter:
18 |     build:
19 |       context: .
20 |       dockerfile: Dockerfile-dev
21 |     environment:
22 |       GRANT_SUDO: "yes"
23 |     user: root
24 |     ports:
25 |       - "127.0.0.1::8888"
26 |     volumes:
27 |       - ../:/mnt
28 |     entrypoint: bash -c "cd /mnt && jupyter notebook --NotebookApp.token='' --ip=0.0.0.0 --allow-root && /bin/bash"
29 |     stdin_open: true
30 |     container_name: "swifter_jupyter_${USER}"
31 |     tty: true
--------------------------------------------------------------------------------
/docker/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | -r requirements-windows.txt
2 | ray>=1.0.0
--------------------------------------------------------------------------------
/docker/requirements-windows.txt:
--------------------------------------------------------------------------------
1 | pandas>=1.0.0
2 | psutil>=5.6.6
3 | dask[dataframe]>=2.10.0
4 | modin[dask]>=0.8.1.1
5 | tqdm>=4.33.0
6 | ipywidgets>=7.0.0
7 | black==22.3.0
8 | flake8==3.7.7
9 | perfplot==0.7.3
10 | pytest==6.2.2
11 | jupyterlab==3.6.7
12 | coverage
13 | codecov
14 | nose
--------------------------------------------------------------------------------
/docs/changelog.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 | 
3 | ## Version 1.4.0 -- 2023-07-21
4 | * Significantly reduced core dependencies of the swifter library. See https://github.com/jmcarpenter2/swifter/issues/219 for discussion
5 |   - Big shout out to @PeterJCLaw for starting this discussion, and to @xquyvu for contributions as well
6 | * Removed deprecated `loffset` parameter
7 |   - Thanks to @bnavigator for identifying this bug
8 | * Updated README to be more readable for dark-mode users
9 |   - Thank you to @MemphisMeng for identifying this gap
10 | 
11 | ## Version 1.3.5 -- 2023-06-12
12 | * Add secondary fallback for series applies
13 | * Code refactoring for simplicity
14 | 
15 | ## Version 1.3.4 -- 2022-08-16
16 | * Enable indexing after a groupby, e.g. `df.swifter.groupby(by)[key].apply(func)`
17 | * Improve groupby apply progress bar
18 |   - Previously, the groupby apply progress bar only appeared after the data was distributed across the cores.
19 |   - Now, the groupby apply progress bar appears before the data is distributed, for a more realistic reflection of how long the apply takes
20 | * Additional groupby apply code refactoring and optimizations, including removing the mutability of the data within `ray`
21 | 
22 | ## Version 1.3.3 -- 2022-07-28
23 | * Enable users to pass in `df.index` as the `by` parameter for the `df.swifter.groupby(by).apply(func)` command
24 | 
25 | ## Version 1.3.2 -- 2022-07-23
26 | * Enable users to `df.swifter.groupby.apply`, which requires a new package (ray) that is now available as an extras_require.
27 | * To use groupby apply, install swifter as `pip install -U swifter[groupby]`
28 | * All credit goes to user @diditforlulz273 for writing the performant groupby apply code that is now part of swifter!
29 | 
30 | ## Version 1.2.0 -- 2022-07-06
31 | * Enable users to `force_parallel`, which immediately forces swifter to jump to using dask apply. This enables a simple interface for parallel processing, but disables swifter's algorithm to determine the fastest apply solution possible.
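A minimal usage sketch of this setting (mirroring the accessor example in docs/documentation.md; the dataframe and function here are only illustrative):
```python
import pandas as pd
import swifter  # noqa: F401 -- importing swifter registers the .swifter accessor

df = pd.DataFrame({"x": range(1_000_000)})

# Skip the vectorization attempt and timing heuristics; go straight to dask
result = df.swifter.force_parallel(True).apply(lambda x: x + 1)
```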
32 | 33 | ## Version 1.1.4 -- 2022-06-29 34 | * Enable users to leverage `set_defaults` functionality so they don't have to keep invoking individual settings on a per swifter invocation basis 35 | 36 | ## Version 1.1.3 -- 2022-04-12 37 | * Enhance the robustness of swifter by randomizing the sample index to avoid sparse data impacting the validity of apply validation 38 | * Resolve issue where functions that return a non array-like cause swifter to fail on vectorization 39 | 40 | ## Version 1.1.2 -- 2022-02-07 41 | * Resolve installation issue by removing import from setup.py 42 | 43 | ## Version 1.1.1 -- 2022-02-04 44 | * Resolve installation issues by removing modin dependency, and modin apply route for axis=1 string applies 45 | * apply_dask_on_strings returns to original functionality, 46 | which allows control over whether to use dask or pandas by default for string applies 47 | 48 | ## Version 1.0.7 -- 2020-10-11 49 | * Sample applies now suppress logging in addition to stdout and stderr 50 | * Allow new kwargs `offset` and `origin` for pandas df.resample 51 | 52 | ## Version 1.0.6 -- 2020-09-19 53 | * Fix warnings introduced in 1.0.5 to appropriate warn when using an apply function 54 | 55 | ## Version 1.0.5 -- 2020-09-19 56 | * Added warnings/errors for swifter methods which do not exist when using modin dataframes 57 | * Updated Dask Dataframe dependencies to require a more recent version 58 | * Updated examples/speed benchmark notebooks 59 | 60 | ## Version 1.0.4 -- 2020-09-19 61 | * Broken release 62 | 63 | ## Version 1.0.3 -- 2020-08-03 64 | * Fixed bug with string, axis=1 applies for pandas dataframes that prevented swifter from leveraging modin for parallelization when returning a series instead of a dataframe 65 | 66 | ## Version 1.0.2 -- 2020-07-31 67 | * Remove pickle5 hard dependency 68 | 69 | ## Version 1.0.1 -- 2020-07-27 70 | * Reduce resources consumed by swifter by only importing modin/ray when necessary. 71 | * Added `swifter.register_modin()` function, which gives access to `modin.DataFrame.swifter.apply(...)`, but is only required if modin is imported after swifter. If you import modin before swifter, this is not necessary. 72 | 73 | ## Version 1.0.0 -- 2020-07-27 74 | Two major enhancements are included in this release, both involving the use of modin in swifter. Special thanks to Devin Petersohn for the collaboration. 75 | 76 | 1. Enable compatibility with modin dataframes. Compatibility not only allows modin dataframes to work with `df.swifter.apply(...)`, but still attempts to vectorize the operation which can lead to a performance boost. 77 | 78 | Example: 79 | ```python 80 | 81 | import modin.pandas as pd 82 | df = pd.DataFrame(...) 83 | df.swifter.apply(...) 84 | ``` 85 | 86 | 2. Significantly speed up swifter axis=1 string applies by using Modin, resolving a long-standing issue for swifter. 87 | * Use Modin for axis=1 string applies, unless allow_dask_on_strings(True) is set. If that flag is set, still use Dask. 88 | * NOTE: this means that allow_dask_on_strings() is no longer required to work with text data using swifter. 89 | 90 | ## Version 0.305 91 | (1) Remove Numba hard dependency, but still handle TypingErrors when numba is installed 92 | (2) Only call tqdm's `progress_apply` on transformations (e.g. Resampler, Rolling) when tqdm has an implementation for that object. 
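As an illustration of point (2), a minimal sketch of a transformation apply on a Rolling object (syntax as documented in docs/documentation.md; the data is only illustrative). A progress bar appears only if the installed tqdm provides `progress_apply` for Rolling objects:
```python
import numpy as np
import pandas as pd
import swifter  # noqa: F401 -- importing swifter registers the .swifter accessor

df = pd.DataFrame({"x": np.random.normal(size=10_000)})

# Rolling/Resampler applies show a tqdm progress bar only when tqdm
# implements progress_apply for that object type
result = df.swifter.rolling(10).apply(np.sum)
```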
93 | 
94 | ## Version 0.304
95 | Swifter performance consistency improved in two ways:
96 | 
97 | 
98 | (1) The validation check of the vectorized form of swifter was always failing, because it assumed a dataframe return type, when a vectorized function usually returns an array type.
99 | 
100 | 
101 | (2) When using a dataframe with duplicated indices, swifter was silently failing to leverage dask. Now swifter raises a warning when the dataframe has duplicated indices, with a recommendation to call `df.reset_index(drop=True)`.
102 | 
103 | ## Version 0.301
104 | Following pandas release v1.0.0, removing deprecated keyword args "broadcast" and "reduce"
105 | 
106 | ## Version 0.300
107 | Added new `applymap` method for pandas dataframes. `df.swifter.applymap(...)`
108 | 
109 | ## Version 0.297
110 | Fixed issue causing errors when using swifter on empty dataframes. Now swifter will perform a pandas apply on empty dataframes.
111 | 
112 | ## Version 0.296
113 | Added support for resample objects in syntax that reflects pandas. `df.swifter.resample(...).apply(...)`
114 | 
115 | ## Version 0.295
116 | Context manager to suppress print statements during the sample/test applies. Now if a print is part of the function that is applied, the only print that will occur is the final apply's print.
117 | 
118 | ## Version 0.294
119 | Made several code simplifications, thanks to @ianozsvald's suggestions. One of these code changes avoids the issue where assertions are ignored when the python -O flag is used, which would effectively break swifter.
120 | 
121 | ## Version 0.293
122 | Require tqdm>=4.33.0, which resolves a bug with the progress bar that stems from pandas itself.
123 | 
124 | ## Version 0.292
125 | Fix known security vulnerability in parso <= 0.4.0 by requiring parso > 0.4.0
126 | 
127 | ## Version 0.291
128 | Import from tqdm.auto instead of tqdm.autonotebook. Fewer warnings will show when importing swifter.
129 | 
130 | ## Version 0.290
131 | df.swifter.progress_bar(desc="") now allows for a custom description.
132 | 
133 | ## Versions 0.288 and 0.289
134 | Very minor bug fixes for edge cases, e.g. a KeyError when applying on a dataframe with a dictionary as a nested element
135 | 
136 | ## Version 0.287
137 | Fixed bugs with rolling apply. Added unit test coverage.
138 | 
139 | ## Version 0.286
140 | Fixed a bug that prevented the result_type kwarg from being passed to the dask apply function. Now you can use this functionality and it will rely on dask rather than pandas.
141 | 
142 | Additionally adjusted the speed estimation for data sets < 25000 rows, so that it doesn't spend a lot of time estimating, on the first 1000 rows, how long the full apply would take when the data set is tiny. We want to asymptote to near-pandas performance even on tiny data sets.
143 | 
144 | ## Version 0.285
145 | Uses tqdm.autonotebook to dynamically switch between the beautiful notebook progress bar and the CLI version of the progress bar
146 | 
147 | ## Version 0.284
148 | Minor ipywidgets requirements update
149 | 
150 | ## Version 0.283
151 | Allowed user to override the scheduler default to multithreading if desired.
152 | 
153 | ## Version 0.282
154 | Add an option `allow_dask_on_strings` to `DataFrameAccessor`. This is a non-recommended option if you are doing string processing. It is intended for when the string is used as a lookup for the rest of the dataframe processing. This override is also included in `SeriesAccessor`, but I am not aware of a use case where it makes sense there.
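A minimal sketch of that option (the accessor form shown in docs/documentation.md; the lookup-style dataframe is only illustrative):
```python
import pandas as pd
import swifter  # noqa: F401 -- importing swifter registers the .swifter accessor

df = pd.DataFrame({"key": ["a", "b", "c"] * 100_000, "value": [1.0, 2.0, 3.0] * 100_000})

# The string column is only used as a lookup, so letting dask handle the
# apply is reasonable here
result = df.swifter.allow_dask_on_strings(enable=True).apply(lambda row: row["value"] * 2, axis=1)
```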
155 | 
156 | ## Version 0.280
157 | Swifter now defaults to axis=0, with a NotImplementedError when trying to use dask on large datasets, because dask hasn't implemented axis=0 applies yet.
158 | 
159 | ## Version 0.270
160 | Added documentation and code styling thanks to @msampathkumar. Also included override options for the dask_threshold and npartitions parameters.
161 | 
162 | ## Version 0.260
163 | Added support for rolling objects in syntax that reflects pandas. `df.swifter.rolling(..).apply(...)`
164 | 
165 | ## Version 0.250
166 | Fixed a bug that would call a vectorized form of a function when vectorization was actually the wrong choice. We have to ensure that the output data shape is aligned regardless of apply type.
167 | 
168 | ## Version 0.240
169 | Added TQDM support (to disable, use `df.swifter.progress_bar(False).apply(...)`), removed groupby_apply (because it's too slow), and tweaked some under-the-hood _dask_apply functionality. Specific functionality changes for pd.Series.swifter.apply include falling back to dask apply if dask map_partitions fails. Specific functionality changes for pd.DataFrame.swifter.apply include falling back to pandas apply if dask apply fails.
170 | 
171 | ## Version 0.230
172 | Made a change so that swifter uses pandas apply when the input is a series/dataframe of dtype string. This is a temporary solution to slow dask apply processing of strings.
173 | 
174 | ## Version 0.220
175 | Added a groupby_apply function to utilize dask for groupby apply when it's faster. Simply use as **df.swifter.groupby_apply(groupby_col, func)**. I would've extended the Pandas DataFrameGroupBy object, but pandas hasn't added support for that kind of extension yet. Also, removed the str_object limitation to utilizing dask. Now it will simply determine whether to use dask vs. pandas based on the dask_threshold (default 1 second).
176 | 
177 | ## Version 0.210
178 | Fixed a bug for row-wise applies. Thanks to @slhck for pointing this out.
179 | 
180 | ## Version 0.200
181 | Completely refactored the package as an extension to pandas, rather than an independent function call. This allows for increased user flexibility and simpler swiftapply usage.
182 | **This new update changed the way to use swiftapply. Now the format is df.swifter.apply(func)**
183 | 
184 | ## Version 0.150
185 | Fixed a bug that would allow certain functions to be applied to the entire series/dataframe, rather than to each element. For example, len(x) returned the length of the series, rather than the length of each string within the series. A special thanks to @bharatvem for pointing this out.
186 | 
187 | ## Version 0.140
188 | Added support for vectorized and pandas applies to dataframes.
189 | Converted all string manipulations to pandas apply (unless vectorizable) because dask processes string manipulations slowly.
190 | 
191 | ## Version 0.13
192 | Removed the numba jit function, because it was adding to the total runtime. Will do some experiments and consider re-adding it later.
193 | 
194 | ## Version 0.1
195 | Currently works very well with pandas series; needs some work to optimize dask multiprocessing for pandas dataframes. For now, it is probably best to apply to each series independently, rather than to multiple columns of a dataframe at once.
196 | 
--------------------------------------------------------------------------------
/docs/documentation.md:
--------------------------------------------------------------------------------
1 | # Documentation
2 | 
3 | ## Important Notes
4 | 
5 | 1. 
Please upgrade your version of pandas, as the pandas extension api used in this module is a recent addition to pandas. 6 | 7 | 2. Do not use swifter to apply a function that modifies external variables. Under the hood, swifter does sample applies to optimize performance. These sample applies will modify the external variable in addition to the final apply. Thus, you will end up with an erroneously modified external variable. 8 | 9 | 3. It is advised to disable the progress bar if calling swifter from a forked process as the progress bar may get confused between various multiprocessing modules. 10 | 11 | ## 1. `swifter.set_defaults` 12 | 13 | Allows for upfront configuration of swifter settings. Set once, re-use across all dataframe swifter invocations. 14 | **NOTE: You must set the defaults before creating the dataframe because this entrypoint is part of the `__init__`.** 15 | 16 | ```python 17 | from swifter import set_defaults 18 | set_defaults( 19 | npartitions=None, 20 | dask_threshold=1, 21 | scheduler="processes", 22 | progress_bar=True, 23 | progress_bar_desc=None, 24 | allow_dask_on_strings=False, 25 | force_parallel=False, 26 | ) 27 | ``` 28 | 29 | **Parameters:** 30 | 31 | `npartitions` : Integer. The number of partitions to distribute the data into for dask processing. 32 | Default: `2*cpu_count()` 33 | 34 | `dask_threshold` : Float. The amount of seconds to use for estimating whether to use dask or pandas apply. 35 | Default: `1` second 36 | 37 | `scheduler` : String. Whether to use `threads` or `processes` for the dask scheduler 38 | Default: `processes` 39 | 40 | `progress_bar` : Boolean. Whether to turn the progress bar on or off. 41 | Default: `True` 42 | 43 | `progress_bar_desc` : String. Progress Bar Description 44 | Default: `None` 45 | 46 | `allow_dask_on_strings` : Boolean. Allows user to enable dask parallel processing on string data 47 | Default: `False` 48 | 49 | `force_parallel` : Boolean. Allows user to override swifter algorithm and jump straight to using dask processing 50 | Default: `False` 51 | 52 | 53 | 54 | ## 2. `pandas.Series.swifter.apply` OR `modin.pandas.Series.swifter.apply` 55 | 56 | Efficiently apply any function to a pandas series in the fastest available manner 57 | 58 | ```python 59 | def pandas.Series.swifter.apply(func, convert_dtype=True, args=(), **kwds) 60 | ``` 61 | 62 | **Parameters:** 63 | 64 | `func` : function. Function to apply to each element of the series. 65 | 66 | `convert_dtype` : boolean, default True. Try to find better dtype for elementwise function results. If False, leave as dtype=object 67 | 68 | `args` : tuple. Positional arguments to pass to function in addition to the value 69 | 70 | `kwds` : Additional keyword arguments will be passed as keywords to the function 71 | 72 | NOTE: docstring taken from pandas documentation. 73 | 74 | 75 | ## 3. `pandas.DataFrame.swifter.apply` OR `modin.pandas.DataFrame.swifter.apply` 76 | 77 | Efficiently apply any function to a pandas dataframe in the fastest available manner. 78 | 79 | ```python 80 | def pandas.DataFrame.swifter.apply( 81 | func, 82 | axis=0, 83 | raw=False, 84 | result_type=None, 85 | args=(), 86 | **kwds 87 | ) 88 | ``` 89 | 90 | **Parameters:** 91 | 92 | `func` : function. Function to apply to each column or row. 93 | 94 | `axis` : {0 or 'index', 1 or 'columns'}, default 0. 
**For now, Dask only supports axis=1, and thus swifter is limited to axis=1 on large datasets when the function cannot be vectorized.** Axis along which the function is applied:
95 | 
96 | * 0 or 'index': apply function to each column.
97 | * 1 or 'columns': apply function to each row.
98 | 
99 | `raw` : bool, default False
100 | False : passes each row or column as a Series to the function.
101 | True : the passed function will receive ndarray objects instead. If you are just applying a NumPy reduction function, this will achieve much better performance.
102 | 
103 | `result_type` : {'expand', 'reduce', 'broadcast', None}, default None. These only act when axis=1 (columns):
104 | 
105 | 'expand' : list-like results will be turned into columns.
106 | 'reduce' : returns a Series if possible rather than expanding list-like results. This is the opposite of 'expand'.
107 | 'broadcast' : results will be broadcast to the original shape of the DataFrame; the original index and columns will be retained.
108 | The default behaviour (None) depends on the return value of the applied function: list-like results will be returned as a Series of those. However, if the apply function returns a Series, these are expanded to columns.
109 | 
110 | `args` : tuple. Positional arguments to pass to func in addition to the array/series.
111 | 
112 | `kwds` : Additional keyword arguments to pass as keyword arguments to func.
113 | 
114 | NOTE: docstring taken from pandas documentation.
115 | 
116 | **returns:**
117 | 
118 | The new dataframe/series with the function applied as quickly as possible
119 | 
120 | ## 4. `pandas.DataFrame.swifter.applymap`
121 | 
122 | Efficiently applymap any function to a pandas dataframe in the fastest available manner. Applymap is elementwise.
123 | 
124 | ```python
125 | def pandas.DataFrame.swifter.applymap(func)
126 | ```
127 | 
128 | ## 5. `pandas.DataFrame.swifter.groupby.apply`
129 | 
130 | Applies over a groupby object on the original series/dataframe in the fastest available manner.
131 | 
132 | ```python
133 | def pandas.DataFrame.swifter.groupby(
134 |     by,
135 |     axis=0,
136 |     level=None,
137 |     as_index=True,
138 |     sort=True,
139 |     group_keys=True,
140 |     squeeze=False,
141 |     observed=False,
142 |     dropna=True
143 | ).apply(func, *args, **kwds)
144 | ```
145 | 
146 | ## 6. `pandas.DataFrame.swifter.rolling.apply`
147 | 
148 | Applies over a rolling object on the original series/dataframe in the fastest available manner.
149 | 
150 | ```python
151 | def pandas.DataFrame.swifter.rolling(
152 |     window,
153 |     min_periods=None,
154 |     center=False,
155 |     win_type=None,
156 |     on=None,
157 |     axis=0,
158 |     closed=None
159 | ).apply(func, *args, **kwds)
160 | ```
161 | 
162 | ## 7. `pandas.DataFrame.swifter.resample.apply`
163 | 
164 | Applies over a resampler object on the original series/dataframe in the fastest available manner.
165 | 
166 | ```python
167 | def pandas.DataFrame.swifter.resample(
168 |     rule,
169 |     axis=0,
170 |     closed=None,
171 |     label=None,
172 |     convention="start",
173 |     kind=None,
174 |     loffset=None,
175 |     base=0,
176 |     on=None,
177 |     level=None,
178 |     origin=None,
179 |     offset=None,
180 | ).apply(func, *args, **kwds)
181 | ```
182 | 
183 | ## 8. `pandas.DataFrame.swifter.progress_bar(False).apply`
184 | 
185 | Enable or disable the TQDM progress bar by setting the enable parameter to True/False, respectively. You can also specify a custom description.
186 | 
187 | Note: It is advised to disable the progress bar if calling swifter from a forked process, as the progress bar may get confused between various multiprocessing modules.
188 | 
189 | ```python
190 | def pandas.DataFrame.swifter.progress_bar(enable=True, desc=None)
191 | ```
192 | 
193 | For example, let's say we have a pandas dataframe df. The following will perform a swifter apply, without the TQDM progress bar.
194 | 
195 | ```python
196 | df.swifter.progress_bar(False).apply(lambda x: x+1)
197 | ```
198 | 
199 | ## 9. `pandas.DataFrame.swifter.set_npartitions(npartitions=None).apply`
200 | 
201 | Specify the number of partitions to allocate to swifter, if parallel processing is chosen as the quickest apply.
202 | If npartitions=None, it defaults to cpu_count()*2
203 | 
204 | ```python
205 | def pandas.DataFrame.swifter.set_npartitions(npartitions=None)
206 | ```
207 | 
208 | For example, let's say we have a pandas dataframe df. The following will perform a swifter apply, using 2 partitions
209 | ```python
210 | df.swifter.set_npartitions(2).apply(lambda x: x+1)
211 | ```
212 | 
213 | ## 10. `pandas.DataFrame.swifter.set_dask_threshold(dask_threshold=1).apply`
214 | 
215 | Specify the dask threshold (in seconds) for the max allowable time estimate for a pandas apply on the full dataframe
216 | ```python
217 | def pandas.DataFrame.swifter.set_dask_threshold(dask_threshold=1)
218 | ```
219 | 
220 | For example, let's say we have a pandas dataframe df. The following will perform a swifter apply, with the threshold set to 3 seconds
221 | ```python
222 | df.swifter.set_dask_threshold(dask_threshold=3).apply(lambda x: x+1)
223 | ```
224 | 
225 | ## 11. `pandas.DataFrame.swifter.set_dask_scheduler(scheduler="processes").apply`
226 | 
227 | Set the dask scheduler
228 | 
229 | :param scheduler: String, ["threads", "processes"]
230 | ```python
231 | def pandas.DataFrame.swifter.set_dask_scheduler(scheduler="processes")
232 | ```
233 | 
234 | For example, let's say we have a pandas dataframe df. The following will perform a swifter apply, with the scheduler set to multithreading.
235 | ```python
236 | df.swifter.set_dask_scheduler(scheduler="threads").apply(lambda x: x+1)
237 | ```
238 | 
239 | ## 12. `pandas.DataFrame.swifter.allow_dask_on_strings(enable=True).apply`
240 | 
241 | This flag allows the user to specify whether to allow dask to handle dataframes containing string types. Dask can be particularly slow if you are actually manipulating strings, but if you just have a string column in your data frame, this will allow dask to handle the execution.
242 | ```python
243 | def pandas.DataFrame.swifter.allow_dask_on_strings(enable=True)
244 | ```
245 | 
246 | For example, let's say we have a pandas dataframe df. The following will allow Dask to process a dataframe with string columns.
247 | ```python
248 | df.swifter.allow_dask_on_strings().apply(lambda x: x+1)
249 | ```
250 | 
251 | ## 13. `pandas.DataFrame.swifter.force_parallel(enable=True).apply`
252 | 
253 | This flag allows the user to override swifter's default behavior of trying vectorization and running sample applies to determine the fastest apply possible. Instead, it forces swifter to use dask.
254 | ```python
255 | def pandas.DataFrame.swifter.force_parallel(enable=True)
256 | ```
257 | 
258 | For example, let's say we have a pandas dataframe df. The following will force Dask to process the dataframe.
259 | ```python
260 | df.swifter.force_parallel().apply(lambda x: x+1)
261 | ```
262 | 
263 | ## 14. `swifter.register_modin()`
264 | 
265 | This gives access to `modin.DataFrame.swifter.apply(...)` and `modin.Series.swifter.apply(...)`. This registers modin dataframes and series with swifter as accessors.
266 | * NOTE: This is only necessary if you import swifter BEFORE modin. If you import modin before swifter, you do not need to execute this method.
267 | 
--------------------------------------------------------------------------------
/examples/modin_dataframe_swifter_performance_benchmark.ipynb:
--------------------------------------------------------------------------------
[Jupyter notebook. The JSON scaffolding, progress-bar output, and the base64-encoded plot images it embeds (e.g. the "Modin DataFrame Performance Benchmark" figure saved as modin_swifter_performance_benchmark.png) are omitted here, and the repository dump is truncated partway through this file. The recoverable code cells are listed below.]

# In [14]:
import modin.pandas as pd
import numpy as np
import swifter

import perfplot
import matplotlib.pyplot as plt

# In [106]: (stderr shows "UserWarning: Distributing object. This may take some time.")
df = pd.DataFrame({
    "gaussian": np.random.normal(size=1_000_000),
    "str_date": ["2020-01-01"] * 1_000_000,
})

# In [107]:
def modin_apply(df, func):
    return df.apply(func)._to_pandas()

# In [108]:
def swifter_apply(df, func):
    return df.swifter.apply(func)._to_pandas()

# In [126]:
def perfplot_benchmark(df, field, func):
    benchmark = perfplot.bench(
        setup=lambda n: df.iloc[:n][field],
        kernels=[
            lambda srs: modin_apply(srs, func),
            lambda srs: swifter_apply(srs, func),
        ],
        labels=["Modin Apply", "Swifter Apply"],
        n_range=[2**k for k in range(0, 26, 2)],
        xlabel="n_rows",
        equality_check=lambda left, right: left.equals(right),
    )
    return benchmark

# In [127]: source not recoverable from this snapshot; its stderr begins with a
# benchmark progress bar (" 0%| | 0/13 [00:00...") before the dump's elided region.

# Plotting cell (its display_data output, a matplotlib figure, is omitted):
fig = plt.figure()
bench.xlabel = "n_rows"
bench.plot(logx=False, logy=False)
plt.title("Modin DataFrame Performance Benchmark")
plt.savefig("modin_swifter_performance_benchmark.png")

# In [130]: output is a base64-encoded PNG (omitted); the file, and the
# remainder of the repository dump, are truncated at this point.
viptujHnZd2GVjd71pHxl9ur9TF+RzH3XtGRYz8v8HY5SFcrdcxTR9nuNYl4xPo6rTLQJD+ULK/ac5JkvtjKwXX2evEnvcFJVj8sShTHmbfvjt8aY5c7TRKSvT6NSKkDsO3mOibPW07JeNK9qG06qivKkMvs1oGgjgMWNUyqoON/h9N4YvcNJBRhjIPM0nD0M6fbr3Ano/4TXN+WujqIPcDVQv0g9RU0g1OuRKBVACu9wOnAqk1n36R1OqoI58iHjKKQfgfRDcPYIrfasgpMfWgnh7GFrmiPn4uUkBK56GCK8WzvgrkRRDasuIgyrXqJQOjDUq1EoFWAK73B68c5OXKV3OClvyj1nHeSdSwLph+HsEaeSwXEwFzeE0TSkGmQ0hZpNoVlPqNkEajSx3ms2gRqNIaYhhHq/9WJ3dRQ/AD+IyHRjzH6vb1mpADVn9QGmr0hmfN+WDO+ldzgFtQIHMWd3w75QKMgDh/1y+znXOuMv7ees01bpIDvt0jgia1042Dfs4HTwv5AIlq7exICB/nlawZPUEyEi04A45/mNMfrAnQo6K/ac5OkvttD/8vr86SbtpS4oGQPHtsBPH8Hm+fQ4ewSSyrCekDAICYfQatZZvNvP4VCnFcRdYyWDmk2hpv1eoxFUiy55e+K/Gyk8SRQfA28B7wIO34ajlP8k2204xdWL5rV7uxEW6mkHkKpSSEuBzR/DT/Pg+DbrQN/2BraFjODKXtdZB/PCg7onn/144K5oniSKfGPMVJ9HopQfZeYZ7pu5DoD3xvSgpt7hFByyUmH7Ais5JC8DDDTvDTf/C668A6LrcjwxkStb9vN3pAHNk0SxUEQeBj7j4kYBT/ssqjLSJ7NVWeQ7Cpi6KYfk0wV8OKE3Lep6cBlABa78XNj9P+vS0s7/WncG1W0DA/8EnYZal4BUqXiSKMbY739wGmeAgNvb2h+FKi1jDH/5YiubTzp44c5O9GmtdzhVSsbAwdVWctj6GWSdger1oMc46HwPNOlepS4VeZsnjQK2rIhAlPKHNxP3kLDmALe0CmeE3uFU+Zz42a6UngepByAsCtrfAp2HQasBVl2CKrcSE4WIjC5uvDFmpvfDUarifLYhhX9+vZPbuzbhlw1T/R2O8tTZY7DlEytBHNloPWTWagAM/DNccTNE1ChpDaqUPLn01NPpcyRwPbAe0EShKq0Vu0/yxPyfuKpVHf4xtDMrly31d0jKnZwM2PGllRz2LrEeRmvcBQY/Dx3vsm4xVT7jyaWnR52HRSQW+NBnESnlYzuOpvPgh0m0rBfN26N6EBGmLdIEpAIH7FliJYcdiyAvE2Ivg2t+C53ugQb6nEtFKcuz3plAW28HolRFOJKWxbgP1lI9IpQPxvUiNkqvYQekggL4eAxsXwiRsVaFdOdh0PwqCNHnWyqaJ3UUC7HucgKr/4orsR7CU6pSOZudx7gP1pKelce8h/rQtFaUv0NSrnz7jJUkrvsLXP0ohEX4O6IqzZMSxUtOn/OB/caYFB/Fo5RP5DkKmDhrPbuPZ/D+2J50aKKdWwWspOmw4lXoeT/0+z+9rTUAeFJH8YPzsIiEisivjDGzfReWUt5jjGHSJ5tZtvsk/xzamWsvr+/vkJQre5bAot9Bm0Ew5EVNEgHC5cU+EakpIk+KyOsicoNYHgH2AvdUXIhKlc+/v93FJ+tTeHxQW+7u0dzf4ShXju+AeWOgfjsY+oFPmstWZePuL/EhcAZYCdyH9WR2NeB2Y8zGCohNqXL7aO0BXv1uF3fHN+Ox6/UejICVcQLm3GPVRdz7EUTW9HdEyom7RNHKGNMJQETeBU4ClxljzlZIZDYRaQX8GYg1xmiHScpjiTuP86fPttCvbT2ev7MTopcxAlNeNsy9FzKOw7gvoZY+IR9o3N1nllf4wRjjAPaVNkmIyPsiclxEthQZP0REdorIbhGZ5G4dxpi9xpgJpdmuUlsOpfHr2eu5vGEN3vxVd8K1yfDAZAx88TCkrIE734am8f6OSBXDXYmii4ik258FiLKHBTDGGE/KhtOB13F6iltEQoE3gF8AKcBaEVmA1Q/3C0WWH2+MOe7JF1GqUMqZTMZNX0tsVDjTx/WkhjYZHriWPG81xzFoMlx5u7+jUS646wq13I+rGmN+FJG4IqN7AbuNMXsBRGQuVr3HC8At5d2mqtrSMvMY+8FasvMczJ54NQ1rRvo7JOXKxgT48R/QbST0fdzf0Sg3xBhT8lzl2YCVKBYZYzraw0OBIcaY++zhUUBvY8wjLpavCzyHVQJ5104oxc33APAAQMOGDePnzp1bpngzMjKIiYkp07IVSeO8VF6B4aW12exOLeD3PSJpX9fzcx3dn97nLtbY1K102fQ0abHt+anzM5gQ/5X6Kss+9XWcAwcOTDLG9Ch2ojHGpy+svra3OA3fjXXALxweBbzmzW3Gx8ebslqyZEmZl61IGufFHI4C88ic9abFHxeZzzeklHp53Z/e5zLWk7uNebGFMa/GG5N5uiJDKlZl2ae+jhNYZ1wcU/1Rw5cCON/M3gw47I0Vi8itIjItLS3NG6tTlcg/vt7Jwk2HeWJIO27v2tTf4ShXMk9bt8Ei8Kt5EFXb3xEpD/gjUawF2opISxGpBgwHFnhjxcaYhcaYB2JjtXmGquTDVft564c93Nv7Mib2b+3vcJQr+bkwb7TVwdDwOdolaSXi00QhIglYD+y1E5EUEZlgjMkHHgG+BrYD84wxW30Zhwpe/9t2jGe+2ML1VzRgym0d9FmJQGUMLPotJC+F29+AFn38HZEqBZ8+I2+MGeFi/GJgsbe3JyK3Are2adPG26tWAWjjwVQeTVhPx6axvHZvN8L0WYnAtexl2DgL+k+ymgxXlUpQ/WfppaeqY/+pc0yYvpb6NSJ4b0xPqlfTdoEC1tbP4Lsp0HEoDHD7fK0KUEGVKFTVcPpcLmM/WIvDGKaP60X9GtpXQcBKWQefPQTNe1uXnPTSYKUUVIlC73oKftl5Du6bsZZDqVm8O7oHresH/v3vVVVk1jFIGA4xDa3K63B9+LGyCqpEoZeegpujwPD43I1sOJjKK8O60iOujr9DUq5kp9Fp87PWnU6/+hii6/k7IlUOemFXVRrPfbmd/249ylM3t+emTo39HY5yxZEPH48jKusQjPrU6l9CVWpBVaLQS0/B671l+3h/+T7GXh3HhGta+jsc5Yox8NUTsOc7drV9CFoN8HdEyguCKlHopafg9NXmIzz75TYGd2jIX265Up+VCGSrpsK696DvYxxpcoO/o1FeElSJQgWfpbtO8NhHG+nWvBb/Gd6N0BBNEgFr51fw9Z+g/a1w/WR/R6O8SBOFClgJaw4w9oO1tKwbzbtjehIZXu6W75WvHNkE8ydAk65wxzQI0UNLMNHKbBVwCgoML/53B9N+3Ev/y+vz+r3dtPOhQJZ+GOYMh6haMGIuVKvu74iUlwVVotAmPCq/rFwHj3+0ga+3HmPkVZcx+dYO2jRHIMvJgDnDICcdxn8NNRr5OyLlA0H1H6iV2ZXb8fRshk1byTfbjvGXW67kb7d31C
QRyAoc8On9cGwLDP0AGnX0d0TKR4KqRKEqr+1H0pkwfS2pWXm8M6oHg65s6O+QVEn+9zTsXAw3/hMu1zucgpkmCuV3S3Yc55E564mJDGPeg33o2FRLhH7nyIe8TMjLgrxz1ntupj0uEw6th5WvQ68HofcD/o5W+ZgmCuVXM1cmM3nBVto3rsl7Y3rSKFbbAyqTAgdkHLMqls8ehdxzxR/gCw/+RcflFkkKjtySt3n5EBj8vO+/m/K7oEoUWpldeTgKDH9btI3pK5IZ1L4B/xnejeiIoPo5ek+Bwzr4px+m/vHlsHKrlRDSUqz39MNw9ggYh+t1hIRBeDSER1l3JYUXvqKgel3r3XlcteiLxxVdploM1G2jt8FWEUH1n2mMWQgs7NGjx/3+jkW5lpGTz28SNvD9juOM79uSP9/cvuo+SOfIh4yj9gH/EKQduvA5/dCFEoKdBDoAbAPCoiC2KdRsAi37QU37c2wzq7XWiBoXH+BD9fZiVXZBlShU4DuSlsX46evYeTSdv93egVF94vwdUtkYY12eycuEvGyn6/lZ1ud853GF79mQdfrihJBxFEzBxes+nwSaQsv+dgKwhtf+fISe190GUbW1bwdVYTRRqNIxxj4I2gdARx5grPHO81gfqH4uBU78DBh2HUvnL19soVqug49+eQU94zLh2NaL5nf5ucBhv/Kts+vCYWOPOz+toMh8+UXmLWYdBfm02p8MWV9desAvmgTynaYVPcB7Irz6hbP/VgMulApqNrPfm7hNAucOJ0J1bV5dVSxNFM5+/Cetd2+CvCUQFglh1SA0AsIiILRakfeIi6e7m1beM7+CggsHNvs9LC8DMk9ffAA8/24fLJ0P6M7vxZ7tOp31XjKuyFlyKfQCWGt9bgvMBRDgq/LtEq8KCaOZEThhX6YJi7xwLT48CmIaOF2vj7LO+AunhVe3OuQ5P38xyzvPo5eAVCWkicLZzq9ofHQrHP4vFOR5b70h4U6JJNKqWDSXHvzPH+Cdh11UUF4DsNwLsYVGXHwQdD7ARdW6dFzRg2VoOCB2MrQTotPnrdu3cSykPp9tPMRldaJ5oH9rYiPDXc7v8nNIGISEWi8JdT0soRfGh4QVmRbitFzhNKsy9sfERAYMGOCFHapU8AmqRFHuu57u/55lhQeMggLrGrQjx+qly5ED+TnWOOf3/BzX0y5atvA920oCEmodCJ0PbuffQy68XzTtwvDuPfto0/byIvMXWU9YpNMBvphkEBZpzecj+Y4CXlpakyUH87mxY28euacrUdW0YT+lKpugShRevespJARCIgO2n9+U3ETaXDXA32G4lJ6dx69nr2fpwXwe6t+aJwa3I6Sq3tmkVCUXVIlCBYaDpzOZMGMte0+cY1yHaky68Qp/h6SUKgdNFMqrNh5M5b4Za8nJL2DG+F7kpWzxd0hKqXLSxyqV1yzefIRhb68kqloonz18NX3b1PN3SEopL9AShSo3Ywxv/bCXv/93B/EtajNtVDx1YyL8HZZSyks0Uahyyc0v4C+fb+GjdQe5tUsT/jm0s3ZZqlSQ0UShyiwtM4+Js5NYsecUv7m+Lb8d1BbRZiWUCjqaKFSpHUvPZtaq/cxZfYD07DxevqcLd3Zv5u+wlFI+ElSJQpsZ960NB87wwfJkFm8+gsMYrr+iAY9c15auzWv5OzSllA8FVaLQZsa9L89RwOLNR/hgeTIbD6YSExHG6D5xjLm6BS3qRvs7PKVUBQiqRKG851RGDglrDvDhqv0cS8+hZb1oJt96JUN7NCdGOxhSqkrR/3h1ke1H0vlg+T4+33iY3PwC+rWtx4t3dqb/5fW1CQ6lqihNFApHgeF/247xwfJ9rN53mqjwUO6Ob8bYq+No27CGv8NTSvmZJooqLC0rj3lrDzJjZTIpZ7JoWiuKJ2+8guE9LyO2uvaboJSyaKKognYfz2DGimQ+WZ9CZq6DXnF1+PNN7fnFlQ0JC9VWXZRSF9NEUUUUFBh+2HWCD5Yn8+PPJ6gWGsJtXZsw9uo4OjaN9Xd4SqkApokiyJ3LyeeT9SlMX5HM3hPnqF8jgt/94nLu7X0Z9bQ9JqWUBzRRBCFHgWH/qXPMWX2Aj9Yd5Gx2Pl2a1+I/w7tyY8fGVAvTy0tKKc9poqhksnIdHE3PZvspB6kbDnE0PZujafYrPZtj6dkcP5uDo8AQFiLc2Kkx4/rG0f2y2v4OXSlVSWmiCBDGGE6fy+VImnWwP5qezTH74H80PYejaVkcTcsmPTv/wkJrNwJQIyKMhrGRNKoZSevW9WgUG0Hj2CgGtW9Io9jA7MpVKVV5aKLwAkeBITvPYb3yC8jOc5CTV0B2vuPC5zyHPVxARna+nQAuJIPj6TnkOgouWm+IQL2YCBrFRhJXN5qrWtWlYU0rIRxL3skN/XrTKDZSn5RWSvlUwB9hROSXwM1AA+ANY8w3vtrW2z/sYd32HL4+vZkc+8B+4YBfcCEZ5BWQ4zQtz2FKva2o8FAaxUbSsGYEPVrUplFsFI1qRtjjImkUG0n9mAiXt6smnt1NmwYx5f3KSilVIp8mChF5H7gFOG6M6eg0fgjwHyAUeNcY86KrdRhjPgc+F5HawEuAzxLFFxsPs+9EPjGnjxERFkJkeCiR4SFEhoUSFR5K7erhRISFEhFuTwuzp4eHXjx/eCgR9rQIp3kKp0dHhFEjIkz7blBKVQq+LlFMB14HZhaOEJFQ4A3gF0AKsFZEFmAljReKLD/eGHPc/vyUvZzPLH6sH4mJiQwYMMCXm1FKqUpFjCn9ZZNSbUAkDlhUWKIQkT7AZGPMYHv4SQBjTNEkUbi8AC8C/zPGfOtmOw8ADwA0bNgwfu7cuWWKNyMjg5iYwL+ko3F6l8bpfZUlVo3TMnDgwCRjTI9iJxpjfPoC4oAtTsNDsS43FQ6PAl53s/xvgCTgLeAhT7YZHx9vymrJkiVlXrYiaZzepXF6X2WJVeO0AOuMi2OqPyqzi7sw77JYY4x5FXjVd+EopZRyxx+P6KYAzZ2GmwGHvbFiEblVRKalpaV5Y3VKKaXwT6JYC7QVkZYiUg0YDizwxoqNMQuNMQ/Exmojd0op5S0+TRQikgCsBNqJSIqITDDG5AOPAF8D24F5xpitXtqeliiUUsrLfFpHYYwZ4WL8YmCxD7a3EFjYo0eP+729bqWUqqq0GVGllFJu+fw5Cn8QkRPAfiAWcL4OVTjsPL7ouHrAyVJusuh2SprmKq7ihovGp3EWP82T2DROz6eXNU7ncaWNtSLiLC5mjdPSwhhTv9g5Xd03GwwvYFpxw87ji47Dzb3Enm6npGmu4ipuuJj4NM5S/K01TvdxluVv7y7O8sRaEXF6Y58Gc5yuXsF+6Wmhi+GFJYwr73ZKmuYqruKGi8ancXo2vrjYnD9rnO6nlzVOT7ZXmjhKml7aOJ0/a5weCspLT+UhIuuMq8fYA4jG6V0ap/dVllg1zpIFe4miLKb5OwAPaZzepXF6X2WJVeMsgZYolFJKuaUlCqWUUm5polBKKeWWJgqllFJuaaJwQ0SiRWSGiLwjIr/ydzzuiEgrE
XlPROb7OxZ3ROSX9v78QkRu8Hc8rohIexF5S0Tmi8hEf8fjjv07TRKRW/wdiysiMkBEltr7dIC/43FFREJE5DkReU1Exvg7HldEpJ+9L98VkRW+3l6VSxQi8r6IHBeRLUXGDxGRnSKyW0Qm2aPvBOYbY+4HbgvkWI0xe40xEyo6xjLE+bm9P8cCwwI4zu3GmIeAe4AKvSWxlL9RgD8C8yoyRjue0sRpgAwgEqurgUCN83agKZAXyHEaY5bav89FwAyfB1eaJ/2C4QVcC3Tn4l73QoE9QCugGrAJuBJ4EuhqzzMnkGN1mj6/ksT5L6B7IMeJdXKwArg3UOMEBmE11T8WuCWA4wyxpzcEZgdwnJOAB+15KvR/qYz/R/OAmr6OrcqVKIwxPwKni4zuBew21ll5LjAX68wiBatjJfBD6auUsfpNaeIUy9+Br4wx6wM1Tnv+BcaYq4EKvexYyjgHAlcB9wL3i0iF/U5LE6cxpsCefgaIqKgYoUz/82fseRwVF2Xpf58ichmQZoxJ93Vs/ugKNRA1BQ46DacAvbG6YH1dRG6mfE0oeFOxsYpIXeA5oJuIPGmMecEv0V3gap8+inUWHCsibYwxb/kjOCeu9ucArEuPEfigSfwyKDZOY8wjACIyFjjpdED2F1f7805gMFALeN0fgRXh6vf5H+A1EekH/OiPwIpwFSfABOCDighCE4Wl2H68jTHngHEVHUwJXMV6CnioooNxw1WcgdYHuqs4E4HEig3FLbd9zRtjpldcKG652p+fAp9WdDBuuIozE+sAHChc/t2NMc9UVBBV7tKTCz7rx9sHKkusGqd3aZzepXGWgiYKi8/68faByhKrxuldGqd3aZylUZG1+oHwAhKAI1y4/W2CPf4m4GesOwz+7O84K1OsGqfGqXEGd5zaKKBSSim39NKTUkoptzRRKKWUcksThVJKKbc0USillHJLE4VSSim3NFEopZRySxOFUkoptzRRqCpBRDLKsewjdl8ARkTqOY0XEXnVnvaTiHR3mtZYRBaVN25vE5FbROSv/o5DVS6aKJQq2XKsFm/3Fxl/I9DWfj0ATHWa9jvgnfJuWERCy7uOIr4EbhOR6l5erwpimihUlWKXAv4pIltEZLOIDLPHh4jImyKyVUQWichiERkKYIzZYIxJLmZ1twMzjWUVUEtEGtvT7gL+a697rIh8KiL/FZFdIvKPEmLMEJEpIrIa6CMi14vIBjve90UkQkR6icin9vy3i0iWiFQTkUgR2WuP/42IbLNLO3Pt72KwWsUN2G5TVeDRZsZVVXMn0BXoAtQD1orIj0BfIA7oBDQAtgPvl7Cu4voKaCoikcAZd1QlkgAAAlNJREFUY0yO07SuQDcgB9gpIq8ZY5yXdRaN1cvZ0/a6dgHXG2N+FpGZwESsPh262fP3A7YAPbH+p1fb4ycBLY0xOSJSy2n96+xlKrz7VFU5aYlCVTXXAAnGGIcx5hjwA9YB9hrgY2NMgTHmKLDEg3W56iugMXCiyPjvjDFpxphsYBvQws16HcAn9ud2wD5jzM/28AzgWmNMPrBbRNpj9YL2MlZXmv2Apfa8PwGzRWQkkO+0/uNAEw++n1KAJgpV9RR3cHc33h1XfQVkAZFF5nUuXThwX5rPNsYUdsPpLq6lWPUkecC3WMnuGi70zHYz8AYQDySJSOE2I+0YlfKIJgpV1fwIDBORUBGpj3UWvgZYBtxl11U0BAZ4sK4FwGi73uMqrP6Lj2A1CR3npXh3AHEi0sYeHoVVCir8Lo8DK40xJ4C6wBXAVrvv7ObGmCXAE1hdkMbYy12OdalKKY9oolBVzWdYl2Q2Ad8DT9iXmj7BKiFsAd7Gus6fBucrhVOwSgw/ici79roWA3uB3Vh3OD0MYKwudPc4HdzLzL5UNQ74WEQ2AwVAYT/jq4GGXChB/AT8ZFdYhwKz7GU2AP82xqTa8w3EuvtJKY9ofxRK2UQkxhiTISJ1sUoZfe0kUpZ13QHEG2Oe8mqQ5WSXluYYY673dyyq8tC7npS6YJF9d1A14G9lTRIAxpjP7IQTaC4D/s/fQajKRUsUSvmJ/ZxERJHRo4wxm/0Rj1KuaKJQSinlllZmK6WUcksThVJKKbc0USillHJLE4VSSim3NFEopZRy6/8BELgWs38P86kAAAAASUVORK5CYII=\n", 266 | "text/plain": [ 267 | "
" 268 | ] 269 | }, 270 | "metadata": { 271 | "needs_background": "light" 272 | }, 273 | "output_type": "display_data" 274 | } 275 | ], 276 | "source": [ 277 | "fig = plt.figure()\n", 278 | "bench.xlabel = \"log10(n_rows)\"\n", 279 | "bench.plot(logx=True, logy=True)\n", 280 | "plt.title(\"Modin DataFrame Performance Benchmark\")\n", 281 | "plt.ylabel(\"Runtime [log10(s)]\")\n", 282 | "plt.savefig(\"modin_swifter_performance_benchmark_loglog.png\")" 283 | ] 284 | } 285 | ], 286 | "metadata": { 287 | "kernelspec": { 288 | "display_name": "Python 3", 289 | "language": "python", 290 | "name": "python3" 291 | }, 292 | "language_info": { 293 | "codemirror_mode": { 294 | "name": "ipython", 295 | "version": 3 296 | }, 297 | "file_extension": ".py", 298 | "mimetype": "text/x-python", 299 | "name": "python", 300 | "nbconvert_exporter": "python", 301 | "pygments_lexer": "ipython3", 302 | "version": "3.7.6" 303 | }, 304 | "toc": { 305 | "base_numbering": 1, 306 | "nav_menu": {}, 307 | "number_sections": true, 308 | "sideBar": true, 309 | "skip_h1_title": false, 310 | "title_cell": "Table of Contents", 311 | "title_sidebar": "Contents", 312 | "toc_cell": false, 313 | "toc_position": {}, 314 | "toc_section_display": true, 315 | "toc_window_display": false 316 | } 317 | }, 318 | "nbformat": 4, 319 | "nbformat_minor": 4 320 | } 321 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file=README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name="swifter", 5 | packages=["swifter"], # this must be the same as the name above 6 | version="1.4.0", 7 | description="A package which efficiently applies any function to a pandas dataframe or series in the fastest available manner", 8 | author="Jason Carpenter", 9 | author_email="jcarpenter@manifold.ai", 10 | url="https://github.com/jmcarpenter2/swifter", # use the URL to the github repo 11 | download_url="https://github.com/jmcarpenter2/swifter/archive/1.4.0.tar.gz", 12 | keywords=["pandas", "dask", "apply", "function", "parallelize", "vectorize"], 13 | install_requires=[ 14 | "pandas>=1.0.0", 15 | "psutil>=5.6.6", 16 | "dask[dataframe]>=2.10.0", 17 | "tqdm>=4.33.0", 18 | ], 19 | extras_require={ 20 | "groupby": ["ray>=1.0.0"], 21 | "notebook": ["ipywidgets>=7.0.0"], 22 | }, 23 | classifiers=[], 24 | ) 25 | -------------------------------------------------------------------------------- /swifter/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | import sys 3 | import warnings 4 | from logging import config 5 | from .swifter import SeriesAccessor, DataFrameAccessor, set_defaults 6 | from .parallel_accessor import ( 7 | register_parallel_dataframe_accessor, 8 | register_parallel_series_accessor, 9 | register_modin, 10 | ) 11 | 12 | warnings.filterwarnings("ignore", category=FutureWarning) 13 | 14 | if "modin.pandas" in sys.modules: 15 | register_modin() 16 | 17 | __all__ = [ 18 | "set_defaults", 19 | "SeriesAccessor", 20 | "DataFrameAccessor", 21 | "register_parallel_dataframe_accessor", 22 | "register_parallel_series_accessor", 23 | "register_modin", 24 | ] 25 | __version__ = "1.4.0" 26 | 
-------------------------------------------------------------------------------- /swifter/base.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import importlib 3 | import numpy as np 4 | from os import devnull 5 | from math import ceil 6 | from psutil import cpu_count 7 | from contextlib import contextmanager, redirect_stderr, redirect_stdout 8 | 9 | ERRORS_TO_HANDLE = [AttributeError, ValueError, TypeError, KeyError] 10 | try: 11 | from numba.core.errors import TypingError 12 | 13 | ERRORS_TO_HANDLE.append(TypingError) 14 | except ImportError: 15 | pass 16 | ERRORS_TO_HANDLE = tuple(ERRORS_TO_HANDLE) 17 | 18 | RAY_INSTALLED = importlib.util.find_spec("ray") is not None 19 | 20 | 21 | SAMPLE_SIZE = 1000 22 | N_REPEATS = 3 23 | 24 | 25 | @contextmanager 26 | def suppress_stdout_stderr_logging(): 27 | """ 28 | A context manager that redirects stdout and stderr to devnull 29 | Used for avoiding repeated prints of the data during sample/test applies of Swifter 30 | """ 31 | previous_level = logging.root.manager.disable 32 | 33 | logging.disable(logging.CRITICAL) 34 | try: 35 | with open(devnull, "w") as fnull: 36 | with redirect_stderr(fnull) as err, redirect_stdout(fnull) as out: 37 | yield (err, out) 38 | finally: 39 | logging.disable(previous_level) 40 | 41 | 42 | class _SwifterBaseObject: 43 | def __init__(self, base_obj, npartitions=None): 44 | self._obj = base_obj 45 | self._nrows = self._obj.shape[0] 46 | self._SAMPLE_SIZE = SAMPLE_SIZE if self._nrows > (25 * SAMPLE_SIZE) else int(ceil(self._nrows / 25)) 47 | self._SAMPLE_INDEX = sorted(np.random.choice(range(self._nrows), size=self._SAMPLE_SIZE, replace=False)) 48 | self.set_npartitions(npartitions=npartitions) 49 | 50 | @staticmethod 51 | def _validate_apply(expr, error_message): 52 | if not expr: 53 | raise ValueError(error_message) 54 | 55 | def set_npartitions(self, npartitions=None): 56 | """ 57 | Set the number of partitions to use for dask/modin 58 | """ 59 | if npartitions is None: 60 | self._npartitions = cpu_count() * 2 61 | else: 62 | self._npartitions = npartitions 63 | 64 | return self 65 | -------------------------------------------------------------------------------- /swifter/parallel_accessor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import warnings 3 | from .base import _SwifterBaseObject, ERRORS_TO_HANDLE, suppress_stdout_stderr_logging 4 | 5 | 6 | class _SwifterParallelBaseObject(_SwifterBaseObject): 7 | def set_dask_threshold(self, dask_threshold=1): 8 | """ 9 | Set the threshold (seconds) for maximum allowed estimated duration 10 | of pandas apply before switching to dask 11 | """ 12 | warnings.warn("Parallel Accessor does not use Dask.") 13 | return self 14 | 15 | def set_dask_scheduler(self, scheduler="processes"): 16 | """ 17 | Set the dask scheduler 18 | :param scheduler: String, ["threads", "processes"] 19 | """ 20 | warnings.warn("Parallel Accessor does not use Dask.") 21 | return self 22 | 23 | def progress_bar(self, enable=True, desc=None): 24 | """ 25 | Turn on/off the progress bar, and optionally add a custom description 26 | """ 27 | warnings.warn("Parallel Accessor does not use have a progress bar.") 28 | return self 29 | 30 | def allow_dask_on_strings(self, enable=True): 31 | """ 32 | Override the string processing default, which is to not use dask if 33 | a string is contained in the pandas object 34 | """ 35 | warnings.warn("Parallel Accessor does not use Dask.") 36 | 
return self 37 | 38 | def force_parallel(self, enable=True): 39 | """ 40 | Force swifter to use dask parallel processing, without attempting any 41 | vectorized solution or estimating pandas apply duration to determine 42 | what will be the fastest approach 43 | """ 44 | warnings.warn("Parallel Accessor does not use Dask.") 45 | return self 46 | 47 | def rolling( 48 | self, 49 | window, 50 | min_periods=None, 51 | center=False, 52 | win_type=None, 53 | on=None, 54 | axis=0, 55 | closed=None, 56 | ): 57 | """ 58 | Create a swifter rolling object 59 | """ 60 | raise NotImplementedError("Parallel Accessor cannot create Rolling objects.") 61 | 62 | def resample( 63 | self, 64 | rule, 65 | axis=0, 66 | closed=None, 67 | label=None, 68 | convention="start", 69 | kind=None, 70 | loffset=None, 71 | base=0, 72 | on=None, 73 | level=None, 74 | ): 75 | """ 76 | Create a swifter resampler object 77 | """ 78 | raise NotImplementedError("Parallel Accessor cannot create Resampler objects.") 79 | 80 | 81 | class ParallelSeriesAccessor(_SwifterParallelBaseObject): 82 | def apply(self, func, convert_dtype=True, args=(), **kwds): 83 | """ 84 | Apply the function to the Series using swifter 85 | """ 86 | 87 | # if the series is empty, return early using Pandas 88 | if not self._nrows: 89 | return self._obj.apply(func, convert_dtype=convert_dtype, args=args, **kwds) 90 | 91 | sample = self._obj.iloc[self._SAMPLE_INDEX] 92 | if "axis" in kwds.keys(): 93 | kwds.pop("axis") 94 | warnings.warn("Axis keyword not necessary because applying on a Series.") 95 | 96 | try: # try to vectorize 97 | with suppress_stdout_stderr_logging(): 98 | tmp_df = func(sample, *args, **kwds) 99 | sample_df = sample.apply(func, convert_dtype=convert_dtype, args=args, **kwds) 100 | self._validate_apply( 101 | np.array_equal(sample_df, tmp_df) & (sample_df.shape == tmp_df.shape), 102 | error_message=("Vectorized function sample doesn't " "match parallel series apply sample."), 103 | ) 104 | return func(self._obj, *args, **kwds) 105 | except ERRORS_TO_HANDLE: # if can't vectorize, return regular apply 106 | return self._obj.apply(func, convert_dtype=convert_dtype, args=args, **kwds) 107 | 108 | 109 | class ParallelDataFrameAccessor(_SwifterParallelBaseObject): 110 | def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): 111 | """ 112 | Apply the function to the Parallel DataFrame using swifter 113 | """ 114 | # If there are no rows return early using default 115 | if not self._nrows: 116 | return self._obj.apply(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds) 117 | 118 | sample = self._obj.iloc[self._SAMPLE_INDEX] 119 | 120 | try: # try to vectorize 121 | with suppress_stdout_stderr_logging(): 122 | tmp_df = func(sample, *args, **kwds) 123 | sample_df = sample.apply(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds) 124 | self._validate_apply( 125 | np.array_equal(sample_df, tmp_df) & (sample_df.shape == tmp_df.shape), 126 | error_message=("Vectorized function sample doesn't " "match parallel dataframe apply sample."), 127 | ) 128 | return func(self._obj, *args, **kwds) 129 | except ERRORS_TO_HANDLE: # if can't vectorize, return regular apply 130 | return self._obj.apply(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds) 131 | 132 | 133 | def register_parallel_series_accessor(series_to_register): 134 | """ 135 | Register a parallel series type with swifter attribute, 136 | giving access to automatic vectorization 137 | """ 138 | current_init = 
series_to_register.__init__ 139 | 140 | def new_init(self, *args, **kwds): 141 | current_init(self, *args, **kwds) 142 | self.swifter = ParallelSeriesAccessor(self) 143 | 144 | series_to_register.__init__ = new_init 145 | 146 | 147 | def register_parallel_dataframe_accessor(dataframe_to_register): 148 | """ 149 | Register a parallel dataframe type with swifter attribute, 150 | giving access to automatic vectorization 151 | """ 152 | current_init = dataframe_to_register.__init__ 153 | 154 | def new_init(self, *args, **kwds): 155 | current_init(self, *args, **kwds) 156 | self.swifter = ParallelDataFrameAccessor(self) 157 | 158 | dataframe_to_register.__init__ = new_init 159 | 160 | 161 | def register_modin(): 162 | """ 163 | Register modin's series/dataframe as parallel accessors 164 | """ 165 | from modin.pandas import Series, DataFrame 166 | 167 | register_parallel_series_accessor(Series) 168 | register_parallel_dataframe_accessor(DataFrame) 169 | -------------------------------------------------------------------------------- /swifter/swifter.py: -------------------------------------------------------------------------------- 1 | import timeit 2 | import warnings 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from abc import abstractmethod 8 | from dask import dataframe as dd 9 | from functools import partial 10 | from tqdm.auto import tqdm 11 | from .tqdm_dask_progressbar import TQDMDaskProgressBar 12 | 13 | from .base import ( 14 | _SwifterBaseObject, 15 | suppress_stdout_stderr_logging, 16 | ERRORS_TO_HANDLE, 17 | RAY_INSTALLED, 18 | N_REPEATS, 19 | ) 20 | 21 | DEFAULT_KWARGS = { 22 | "npartitions": None, 23 | "dask_threshold": 1, 24 | "scheduler": "processes", 25 | "progress_bar": True, 26 | "progress_bar_desc": None, 27 | "allow_dask_on_strings": False, 28 | "force_parallel": False, 29 | } 30 | 31 | GROUPBY_MAX_ROWS_PANDAS_DEFAULT = 5000 32 | 33 | 34 | def register_default_config_dataframe_accessor(dataframe_to_register, kwargs): 35 | """ 36 | Register dataframe type with default swifter config 37 | """ 38 | current_init = dataframe_to_register.__init__ 39 | 40 | def new_init(self, *args, **kwds): 41 | current_init(self, *args, **kwds) 42 | self.swifter = ( 43 | self.swifter.set_npartitions(npartitions=kwargs.get("npartitions", DEFAULT_KWARGS["npartitions"])) 44 | .set_dask_threshold(dask_threshold=kwargs.get("dask_threshold", DEFAULT_KWARGS["dask_threshold"])) 45 | .set_dask_scheduler(scheduler=kwargs.get("scheduler", DEFAULT_KWARGS["scheduler"])) 46 | .progress_bar( 47 | enable=kwargs.get("progress_bar", DEFAULT_KWARGS["progress_bar"]), 48 | desc=kwargs.get("progress_bar_desc", DEFAULT_KWARGS["progress_bar_desc"]), 49 | ) 50 | .allow_dask_on_strings(enable=kwargs.get("allow_dask_on_strings", DEFAULT_KWARGS["allow_dask_on_strings"])) 51 | .force_parallel(enable=kwargs.get("force_parallel", DEFAULT_KWARGS["force_parallel"])) 52 | ) 53 | 54 | dataframe_to_register.__init__ = new_init 55 | 56 | 57 | def set_defaults(**kwargs): 58 | """ 59 | Register swifter's default kwargs 60 | npartitions=None, 61 | dask_threshold=1, 62 | scheduler="processes", 63 | progress_bar=True, 64 | progress_bar_desc=None, 65 | allow_dask_on_strings=False, 66 | """ 67 | from pandas import Series, DataFrame 68 | 69 | register_default_config_dataframe_accessor(Series, kwargs) 70 | register_default_config_dataframe_accessor(DataFrame, kwargs) 71 | 72 | 73 | class _SwifterObject(_SwifterBaseObject): 74 | def __init__( 75 | self, 76 | pandas_obj, 77 | npartitions=DEFAULT_KWARGS["npartitions"], 78 | 
dask_threshold=DEFAULT_KWARGS["dask_threshold"], 79 | scheduler=DEFAULT_KWARGS["scheduler"], 80 | progress_bar=DEFAULT_KWARGS["progress_bar"], 81 | progress_bar_desc=DEFAULT_KWARGS["progress_bar_desc"], 82 | allow_dask_on_strings=DEFAULT_KWARGS["allow_dask_on_strings"], 83 | force_parallel=DEFAULT_KWARGS["force_parallel"], 84 | ): 85 | super().__init__(base_obj=pandas_obj, npartitions=npartitions) 86 | if self._obj.index.duplicated().any(): 87 | warnings.warn( 88 | "This pandas object has duplicate indices, " 89 | "and swifter may not be able to improve performance. " 90 | "Consider resetting the indices with `df.reset_index(drop=True)`." 91 | ) 92 | self._dask_threshold = dask_threshold 93 | self._scheduler = scheduler 94 | self._progress_bar = progress_bar 95 | self._progress_bar_desc = progress_bar_desc 96 | self._allow_dask_on_strings = allow_dask_on_strings 97 | self._force_parallel = force_parallel 98 | 99 | def set_dask_threshold(self, dask_threshold=1): 100 | """ 101 | Set the threshold (seconds) for maximum allowed estimated duration 102 | of pandas apply before switching to dask 103 | """ 104 | self._dask_threshold = dask_threshold 105 | return self 106 | 107 | def set_dask_scheduler(self, scheduler="processes"): 108 | """ 109 | Set the dask scheduler 110 | :param scheduler: String, ["threads", "processes"] 111 | """ 112 | self._scheduler = scheduler 113 | return self 114 | 115 | def progress_bar(self, enable=True, desc=None): 116 | """ 117 | Turn on/off the progress bar, and optionally add a custom description 118 | """ 119 | self._progress_bar = enable 120 | self._progress_bar_desc = desc 121 | return self 122 | 123 | def allow_dask_on_strings(self, enable=True): 124 | """ 125 | Override the string processing default, which is to not use dask 126 | if a string is contained in the pandas object 127 | """ 128 | self._allow_dask_on_strings = enable 129 | return self 130 | 131 | def force_parallel(self, enable=True): 132 | """ 133 | Force swifter to use dask parallel processing, without attempting any 134 | vectorized solution or estimating pandas apply duration to determine 135 | what will be the fastest approach 136 | """ 137 | self._force_parallel = enable 138 | return self 139 | 140 | def rolling( 141 | self, 142 | window, 143 | min_periods=None, 144 | center=False, 145 | win_type=None, 146 | on=None, 147 | axis=0, 148 | closed=None, 149 | ): 150 | """ 151 | Create a swifter rolling object 152 | """ 153 | kwds = { 154 | "window": window, 155 | "min_periods": min_periods, 156 | "center": center, 157 | "win_type": win_type, 158 | "on": on, 159 | "axis": axis, 160 | "closed": closed, 161 | } 162 | return Rolling( 163 | self._obj, 164 | npartitions=self._npartitions, 165 | dask_threshold=self._dask_threshold, 166 | scheduler=self._scheduler, 167 | progress_bar=self._progress_bar, 168 | progress_bar_desc=self._progress_bar_desc, 169 | allow_dask_on_strings=self._allow_dask_on_strings, 170 | force_parallel=self._force_parallel, 171 | **kwds, 172 | ) 173 | 174 | def resample( 175 | self, 176 | rule, 177 | axis=0, 178 | closed=None, 179 | label=None, 180 | convention="start", 181 | kind=None, 182 | loffset=None, 183 | base=0, 184 | on=None, 185 | level=None, 186 | origin=None, 187 | offset=None, 188 | ): 189 | """ 190 | Create a swifter resampler object 191 | """ 192 | kwds = { 193 | "rule": rule, 194 | "axis": axis, 195 | "closed": closed, 196 | "label": label, 197 | "convention": convention, 198 | "kind": kind, 199 | "base": base, 200 | "on": on, 201 | "level": level, 202 | "origin": 
origin, 203 | "offset": offset, 204 | } 205 | if not base: 206 | kwds.pop("base") 207 | if loffset is not None: 208 | kwds.update({"loffset": loffset}) 209 | 210 | return Resampler( 211 | self._obj, 212 | npartitions=self._npartitions, 213 | dask_threshold=self._dask_threshold, 214 | scheduler=self._scheduler, 215 | progress_bar=self._progress_bar, 216 | progress_bar_desc=self._progress_bar_desc, 217 | allow_dask_on_strings=self._allow_dask_on_strings, 218 | force_parallel=self._force_parallel, 219 | **kwds, 220 | ) 221 | 222 | 223 | @pd.api.extensions.register_series_accessor("swifter") 224 | class SeriesAccessor(_SwifterObject): 225 | def _wrapped_apply(self, func, convert_dtype=True, args=(), **kwds): 226 | def wrapped(): 227 | with suppress_stdout_stderr_logging(): 228 | self._obj.iloc[self._SAMPLE_INDEX].apply(func, convert_dtype=convert_dtype, args=args, **kwds) 229 | 230 | return wrapped 231 | 232 | def _pandas_apply(self, df, func, convert_dtype, *args, **kwds): 233 | if self._progress_bar: 234 | tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply") 235 | return df.progress_apply(func, convert_dtype=convert_dtype, args=args, **kwds) 236 | else: 237 | return df.apply(func, convert_dtype=convert_dtype, args=args, **kwds) 238 | 239 | def _dask_map_partitions(self, df, func, meta, *args, **kwds): 240 | return ( 241 | dd.from_pandas(df, npartitions=self._npartitions) 242 | .map_partitions(func, *args, meta=meta, **kwds) 243 | .compute(scheduler=self._scheduler) 244 | ) 245 | 246 | def _dask_apply(self, df, func, convert_dtype, meta, *args, **kwds): 247 | return ( 248 | dd.from_pandas(df, npartitions=self._npartitions) 249 | .apply( 250 | lambda x: func(x, *args, **kwds), 251 | convert_dtype=convert_dtype, 252 | meta=meta, 253 | ) 254 | .compute(scheduler=self._scheduler) 255 | ) 256 | 257 | def _parallel_apply(self, func, convert_dtype, *args, **kwds): 258 | sample = self._obj.iloc[self._SAMPLE_INDEX] 259 | with suppress_stdout_stderr_logging(): 260 | meta = sample.apply(func, convert_dtype=convert_dtype, args=args, **kwds) 261 | try: 262 | # check that the dask map partitions matches the pandas apply 263 | with suppress_stdout_stderr_logging(): 264 | tmp_df = self._dask_map_partitions(sample, func, meta, *args, **kwds) 265 | self._validate_apply( 266 | tmp_df.equals(meta), 267 | error_message=("Dask map-partitions sample does not match pandas apply sample."), 268 | ) 269 | if self._progress_bar: 270 | with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"): 271 | return self._dask_map_partitions(self._obj, func, meta, *args, **kwds) 272 | else: 273 | return self._dask_map_partitions(self._obj, func, meta, *args, **kwds) 274 | except ERRORS_TO_HANDLE: 275 | # if map partitions doesn't match pandas apply, 276 | # we can use dask apply, but it will be a bit slower 277 | try: 278 | if self._progress_bar: 279 | with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"): 280 | return self._dask_apply(self._obj, func, convert_dtype, meta, *args, **kwds) 281 | else: 282 | return self._dask_apply(self._obj, func, convert_dtype, meta, *args, **kwds) 283 | except ERRORS_TO_HANDLE: 284 | # Second fallback to pandas if dask apply fails 285 | return self._pandas_apply(self._obj, func, convert_dtype, *args, **kwds) 286 | 287 | def apply(self, func, convert_dtype=True, args=(), **kwds): 288 | """ 289 | Apply the function to the Series using swifter 290 | """ 291 | 292 | # if the series is empty, return early using Pandas 293 | if not self._nrows: 294 | return 
self._obj.apply(func, convert_dtype=convert_dtype, args=args, **kwds) 295 | 296 | # If parallel processing is forced by the user, then skip the logic and apply dask 297 | if self._force_parallel: 298 | return self._parallel_apply(func, convert_dtype, *args, **kwds) 299 | 300 | sample = self._obj.iloc[self._SAMPLE_INDEX] 301 | # check if input is string or 302 | # if the user is overriding the string processing default 303 | allow_dask_processing = True if self._allow_dask_on_strings else (sample.dtype != "object") 304 | 305 | if "axis" in kwds.keys(): 306 | kwds.pop("axis") 307 | warnings.warn("Axis keyword not necessary because applying on a Series.") 308 | 309 | try: # try to vectorize 310 | with suppress_stdout_stderr_logging(): 311 | tmp_df = func(sample, *args, **kwds) 312 | sample_df = sample.apply(func, convert_dtype=convert_dtype, args=args, **kwds) 313 | self._validate_apply( 314 | np.array_equal(sample_df, tmp_df) & (hasattr(tmp_df, "shape")) & (sample_df.shape == tmp_df.shape), 315 | error_message=("Vectorized function sample doesn't match pandas apply sample."), 316 | ) 317 | return func(self._obj, *args, **kwds) 318 | except ERRORS_TO_HANDLE: # if can't vectorize, estimate time to pandas apply 319 | wrapped = self._wrapped_apply(func, convert_dtype=convert_dtype, args=args, **kwds) 320 | timed = timeit.timeit(wrapped, number=N_REPEATS) 321 | sample_proc_est = timed / N_REPEATS 322 | est_apply_duration = sample_proc_est / self._SAMPLE_SIZE * self._nrows 323 | 324 | # if pandas sample apply takes too long and not performing str processing 325 | # then use dask 326 | if (est_apply_duration > self._dask_threshold) and allow_dask_processing: 327 | return self._parallel_apply(func, convert_dtype, *args, **kwds) 328 | else: # use pandas 329 | return self._pandas_apply(self._obj, func, convert_dtype, *args, **kwds) 330 | 331 | 332 | @pd.api.extensions.register_dataframe_accessor("swifter") 333 | class DataFrameAccessor(_SwifterObject): 334 | def _wrapped_apply(self, func, axis=0, raw=None, result_type=None, args=(), **kwds): 335 | def wrapped(): 336 | with suppress_stdout_stderr_logging(): 337 | self._obj.iloc[self._SAMPLE_INDEX].apply( 338 | func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds 339 | ) 340 | 341 | return wrapped 342 | 343 | def _pandas_apply(self, df, func, axis, raw, result_type, *args, **kwds): 344 | if self._progress_bar: 345 | tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply") 346 | apply_func = df.progress_apply 347 | else: 348 | apply_func = df.apply 349 | 350 | return apply_func(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds) 351 | 352 | def _dask_apply(self, df, func, axis, raw, result_type, meta, *args, **kwds): 353 | return ( 354 | dd.from_pandas(df, npartitions=self._npartitions) 355 | .apply( 356 | func, 357 | *args, 358 | axis=axis, 359 | raw=raw, 360 | result_type=result_type, 361 | meta=meta, 362 | **kwds, 363 | ) 364 | .compute(scheduler=self._scheduler) 365 | ) 366 | 367 | def _parallel_apply(self, func, axis=0, raw=None, result_type=None, *args, **kwds): 368 | sample = self._obj.iloc[self._SAMPLE_INDEX] 369 | with suppress_stdout_stderr_logging(): 370 | meta = sample.apply(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds) 371 | try: 372 | with suppress_stdout_stderr_logging(): 373 | # check that the dask apply matches the pandas apply 374 | tmp_df = ( 375 | dd.from_pandas(sample, npartitions=self._npartitions) 376 | .apply( 377 | func, 378 | *args, 379 | axis=axis, 380 | raw=raw, 381 
| result_type=result_type, 382 | meta=meta, 383 | **kwds, 384 | ) 385 | .compute(scheduler=self._scheduler) 386 | ) 387 | self._validate_apply( 388 | tmp_df.equals(meta), 389 | error_message="Dask apply sample does not match pandas apply sample.", 390 | ) 391 | if self._progress_bar: 392 | with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"): 393 | return self._dask_apply(self._obj, func, axis, raw, result_type, meta, *args, **kwds) 394 | else: 395 | return self._dask_apply(self._obj, func, axis, raw, result_type, meta, *args, **kwds) 396 | except ERRORS_TO_HANDLE: 397 | # if dask apply doesn't match pandas apply, fallback to pandas 398 | return self._pandas_apply(self._obj, func, axis, raw, result_type, *args, **kwds) 399 | 400 | def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): 401 | """ 402 | Apply the function to the DataFrame using swifter 403 | """ 404 | # If there are no rows return early using Pandas 405 | if not self._nrows: 406 | return self._obj.apply(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds) 407 | 408 | # If parallel processing is forced by the user, then skip the logic and apply dask 409 | if self._force_parallel: 410 | return self._parallel_apply(func, axis, raw, result_type, *args, **kwds) 411 | 412 | sample = self._obj.iloc[self._SAMPLE_INDEX] 413 | # check if input is string 414 | # or if the user is overriding the string processing default 415 | allow_dask_processing = True if self._allow_dask_on_strings else ("object" not in sample.dtypes.values) 416 | 417 | try: # try to vectorize 418 | with suppress_stdout_stderr_logging(): 419 | tmp_df = func(sample, *args, **kwds) 420 | sample_df = sample.apply(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds) 421 | self._validate_apply( 422 | np.array_equal(sample_df, tmp_df) & (hasattr(tmp_df, "shape")) & (sample_df.shape == tmp_df.shape), 423 | error_message=("Vectorized function sample does not match pandas apply sample."), 424 | ) 425 | return func(self._obj, *args, **kwds) 426 | except ERRORS_TO_HANDLE: # if can't vectorize, estimate time to pandas apply 427 | wrapped = self._wrapped_apply(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds) 428 | timed = timeit.timeit(wrapped, number=N_REPEATS) 429 | sample_proc_est = timed / N_REPEATS 430 | est_apply_duration = sample_proc_est / self._SAMPLE_SIZE * self._nrows 431 | 432 | # if pandas sample apply takes too long 433 | # and not performing str processing, use dask 434 | if (est_apply_duration > self._dask_threshold) and allow_dask_processing and axis == 1: 435 | return self._parallel_apply(func, axis, raw, result_type, *args, **kwds) 436 | else: # use pandas 437 | return self._pandas_apply(self._obj, func, axis, raw, result_type, *args, **kwds) 438 | 439 | def _wrapped_applymap(self, func): 440 | def wrapped(): 441 | with suppress_stdout_stderr_logging(): 442 | self._obj.iloc[self._SAMPLE_INDEX].applymap(func) 443 | 444 | return wrapped 445 | 446 | def _pandas_applymap(self, df, func): 447 | if self._progress_bar: 448 | tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply") 449 | applymap_func = df.progress_applymap 450 | else: 451 | applymap_func = df.applymap 452 | 453 | return applymap_func(func) 454 | 455 | def _dask_applymap(self, df, func, meta): 456 | return ( 457 | dd.from_pandas(df, npartitions=self._npartitions) 458 | .applymap(func, meta=meta) 459 | .compute(scheduler=self._scheduler) 460 | ) 461 | 462 | def _parallel_applymap(self, func): 463 | sample = 
self._obj.iloc[self._SAMPLE_INDEX] 464 | with suppress_stdout_stderr_logging(): 465 | meta = sample.applymap(func) 466 | try: 467 | with suppress_stdout_stderr_logging(): 468 | # check that the dask apply matches the pandas apply 469 | tmp_df = self._dask_applymap(sample, func, meta) 470 | self._validate_apply( 471 | tmp_df.equals(meta), 472 | error_message=("Dask applymap sample does not match pandas applymap sample."), 473 | ) 474 | if self._progress_bar: 475 | with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Applymap"): 476 | return self._dask_applymap(self._obj, func, meta) 477 | else: 478 | return self._dask_applymap(self._obj, func, meta) 479 | except ERRORS_TO_HANDLE: 480 | # if dask apply doesn't match pandas apply, fallback to pandas 481 | return self._pandas_applymap(self._obj, func) 482 | 483 | def applymap(self, func): 484 | """ 485 | Applymap the function to the DataFrame using swifter 486 | """ 487 | 488 | # If there are no rows return early using Pandas 489 | if not self._nrows: 490 | return self._obj.applymap(func) 491 | 492 | # If parallel processing is forced by the user, then skip the logic and apply dask 493 | if self._force_parallel: 494 | return self._parallel_applymap(func) 495 | 496 | sample = self._obj.iloc[self._SAMPLE_INDEX] 497 | # check if input is string 498 | # or if the user is overriding the string processing default 499 | allow_dask_processing = True if self._allow_dask_on_strings else ("object" not in sample.dtypes.values) 500 | 501 | try: # try to vectorize 502 | with suppress_stdout_stderr_logging(): 503 | tmp_df = func(sample) 504 | sample_df = sample.applymap(func) 505 | self._validate_apply( 506 | np.array_equal(sample_df, tmp_df) & (sample_df.shape == tmp_df.shape), 507 | error_message=("Vectorized function sample does not match pandas apply sample."), 508 | ) 509 | return func(self._obj) 510 | except ERRORS_TO_HANDLE: # if can't vectorize, estimate time to pandas apply 511 | wrapped = self._wrapped_applymap(func) 512 | timed = timeit.timeit(wrapped, number=N_REPEATS) 513 | sample_proc_est = timed / N_REPEATS 514 | est_apply_duration = sample_proc_est / self._SAMPLE_SIZE * self._nrows 515 | 516 | # if pandas sample apply takes too long 517 | # and not performing str processing, use dask 518 | if (est_apply_duration > self._dask_threshold) and allow_dask_processing: 519 | return self._parallel_applymap(func) 520 | else: # use pandas 521 | return self._pandas_applymap(self._obj, func) 522 | 523 | def groupby( 524 | self, by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, observed=False, dropna=True 525 | ): 526 | """ 527 | Create a swifter groupby object 528 | """ 529 | grpby_kwargs = { 530 | "level": level, 531 | "as_index": as_index, 532 | "sort": sort, 533 | "group_keys": group_keys, 534 | "observed": observed, 535 | "dropna": dropna, 536 | } 537 | if RAY_INSTALLED: 538 | return GroupBy( 539 | self._obj, 540 | by=[by] if isinstance(by, str) else by, 541 | axis=axis, 542 | progress_bar=self._progress_bar, 543 | progress_bar_desc=self._progress_bar_desc, 544 | **grpby_kwargs, 545 | ) 546 | else: 547 | raise NotImplementedError( 548 | "Ray is required for groupby apply functionality." 549 | "Please install `ray` before continuing and then restart your script or kernel." 
550 | ) 551 | 552 | 553 | if RAY_INSTALLED: # noqa: C901 554 | 555 | class GroupBy(DataFrameAccessor): 556 | import ray 557 | 558 | def __init__( 559 | self, 560 | pandas_obj, 561 | by, 562 | axis=0, 563 | npartitions=DEFAULT_KWARGS["npartitions"], 564 | dask_threshold=DEFAULT_KWARGS["dask_threshold"], 565 | progress_bar=DEFAULT_KWARGS["progress_bar"], 566 | progress_bar_desc="Ray GroupBy Apply", 567 | **grpby_kwargs, 568 | ): 569 | super(GroupBy, self).__init__( 570 | pandas_obj, 571 | npartitions=npartitions, 572 | dask_threshold=dask_threshold, 573 | progress_bar=progress_bar, 574 | progress_bar_desc=progress_bar_desc, 575 | ) 576 | self._obj_pd = pandas_obj 577 | self._nrows = pandas_obj.shape[0] 578 | self._by = by 579 | self._grpby_index = self._obj_pd.index.equals(self._by) 580 | self._axis = axis 581 | self._grpby_kwargs = grpby_kwargs 582 | self._subset_columns = None 583 | 584 | def __getitem__(self, key): 585 | self._subset_columns = key 586 | return self 587 | 588 | # NOTE: All credit for the Ray Groupby Apply logic goes to github user @diditforlulz273 589 | # NOTE: He provided a gist which I adapted to work in swifter's codebase 590 | # NOTE: https://gist.github.com/diditforlulz273/06ffa5f5b1c00830671ce0330851352f 591 | def _get_chunks(self): 592 | subset_df = self._obj_pd.index if self._grpby_index else self._obj_pd[self._by[0]] 593 | unique_groups = subset_df.unique() 594 | n_splits = min(len(unique_groups), self._npartitions) 595 | splits = np.array_split(unique_groups, n_splits) 596 | return [self._obj_pd.loc[subset_df.isin(splits[x])] for x in range(n_splits)] 597 | 598 | @ray.remote 599 | def _ray_groupby_apply_chunk(self, chunk, func, *args, **kwds): 600 | by = chunk.index if self._grpby_index else self._by 601 | grpby = chunk.groupby(by, axis=self._axis, **self._grpby_kwargs) 602 | grpby = grpby if self._subset_columns is None else grpby[self._subset_columns] 603 | return grpby.apply(func, *args, **kwds) 604 | 605 | def _ray_submit_apply(self, chunks, func, *args, **kwds): 606 | import ray 607 | 608 | return [self._ray_groupby_apply_chunk.remote(self, ray.put(chunk), func, *args, **kwds) for chunk in chunks] 609 | 610 | def _ray_progress_apply(self, ray_submit_apply, total_chunks): 611 | import ray 612 | 613 | with tqdm(desc=self._progress_bar_desc, total=total_chunks) as pbar: 614 | apply_chunks = ray_submit_apply() 615 | for complete_chunk in range(total_chunks): 616 | ray.wait(apply_chunks, num_returns=complete_chunk + 1) 617 | pbar.update(1) 618 | return apply_chunks 619 | 620 | def _ray_apply(self, func, *args, **kwds): 621 | import ray 622 | 623 | chunks = self._get_chunks() 624 | ray_submit_apply = partial(self._ray_submit_apply, chunks=chunks, func=func, *args, **kwds) 625 | apply_chunks = ( 626 | self._ray_progress_apply(ray_submit_apply, len(chunks)) if self._progress_bar else ray_submit_apply() 627 | ) 628 | return pd.concat(ray.get(apply_chunks), axis=self._axis).sort_index() 629 | 630 | def apply(self, func, *args, **kwds): 631 | """ 632 | Apply the function to the groupby swifter object 633 | """ 634 | # if the transformed dataframe is empty or very small, return early using Pandas 635 | if not self._nrows or self._nrows <= GROUPBY_MAX_ROWS_PANDAS_DEFAULT: 636 | return self._obj_pd.groupby(self._by, axis=self._axis, **self._grpby_kwargs).apply(func, *args, **kwds) 637 | 638 | # Swifter logic can't accurately estimate groupby applies, so always parallelize 639 | return self._ray_apply(func, *args, **kwds) 640 | 641 | 642 | class 
Transformation(_SwifterObject): 643 | def __init__( 644 | self, 645 | pandas_obj, 646 | npartitions=DEFAULT_KWARGS["npartitions"], 647 | dask_threshold=DEFAULT_KWARGS["dask_threshold"], 648 | scheduler=DEFAULT_KWARGS["scheduler"], 649 | progress_bar=DEFAULT_KWARGS["progress_bar"], 650 | progress_bar_desc=DEFAULT_KWARGS["progress_bar_desc"], 651 | allow_dask_on_strings=DEFAULT_KWARGS["allow_dask_on_strings"], 652 | force_parallel=DEFAULT_KWARGS["force_parallel"], 653 | ): 654 | super(Transformation, self).__init__( 655 | pandas_obj, 656 | npartitions, 657 | dask_threshold, 658 | scheduler, 659 | progress_bar, 660 | progress_bar_desc, 661 | allow_dask_on_strings, 662 | force_parallel, 663 | ) 664 | self._sample_pd = pandas_obj.iloc[: self._SAMPLE_SIZE] 665 | self._obj_pd = pandas_obj 666 | self._obj_dd = dd.from_pandas(pandas_obj, npartitions=npartitions) 667 | self._nrows = pandas_obj.shape[0] 668 | 669 | def _wrapped_apply(self, func, *args, **kwds): 670 | def wrapped(): 671 | with suppress_stdout_stderr_logging(): 672 | self._sample_pd.apply(func, *args, **kwds) 673 | 674 | return wrapped 675 | 676 | @abstractmethod 677 | def _parallel_apply(self, func, *args, **kwds): 678 | raise NotImplementedError("Transformation class does not implement _parallel_apply") 679 | 680 | def apply(self, func, *args, **kwds): 681 | """ 682 | Apply the function to the transformed swifter object 683 | """ 684 | # if the transformed dataframe is empty, return early using Pandas 685 | if not self._nrows: 686 | return self._obj_pd.apply(func, *args, **kwds) 687 | 688 | # If parallel processing is forced by the user, then skip the logic and apply dask 689 | if self._force_parallel: 690 | return self._parallel_apply(func, *args, **kwds) 691 | 692 | # estimate time to pandas apply 693 | wrapped = self._wrapped_apply(func, *args, **kwds) 694 | timed = timeit.timeit(wrapped, number=N_REPEATS) 695 | sample_proc_est = timed / N_REPEATS 696 | est_apply_duration = sample_proc_est / self._SAMPLE_SIZE * self._nrows 697 | 698 | # No `allow_dask_processing` variable here, 699 | # because we don't know the dtypes of the transformation 700 | if est_apply_duration > self._dask_threshold: 701 | return self._parallel_apply(func, *args, **kwds) 702 | else: # use pandas 703 | if self._progress_bar and hasattr(self._obj_pd, "progress_apply"): 704 | tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply") 705 | return self._obj_pd.progress_apply(func, *args, **kwds) 706 | else: 707 | return self._obj_pd.apply(func, *args, **kwds) 708 | 709 | 710 | class Rolling(Transformation): 711 | def __init__( 712 | self, 713 | pandas_obj, 714 | npartitions=None, 715 | dask_threshold=1, 716 | scheduler="processes", 717 | progress_bar=True, 718 | progress_bar_desc=None, 719 | allow_dask_on_strings=False, 720 | force_parallel=False, 721 | **kwds, 722 | ): 723 | super(Rolling, self).__init__( 724 | pandas_obj, 725 | npartitions=npartitions, 726 | dask_threshold=dask_threshold, 727 | scheduler=scheduler, 728 | progress_bar=progress_bar, 729 | progress_bar_desc=progress_bar_desc, 730 | allow_dask_on_strings=allow_dask_on_strings, 731 | force_parallel=force_parallel, 732 | ) 733 | self._rolling_kwds = kwds.copy() 734 | self._comparison_pd = self._obj_pd.iloc[: self._SAMPLE_SIZE] 735 | self._sample_pd = self._sample_pd.rolling(**kwds) 736 | self._obj_pd = self._obj_pd.rolling(**kwds) 737 | self._obj_dd = self._obj_dd.rolling(**{k: v for k, v in kwds.items() if k not in ["on", "closed"]}) 738 | 739 | def _parallel_apply(self, func, *args, **kwds): 740 
| try: 741 | # check that the dask rolling apply matches the pandas apply 742 | with suppress_stdout_stderr_logging(): 743 | tmp_df = ( 744 | dd.from_pandas(self._comparison_pd, npartitions=self._npartitions) 745 | .rolling(**{k: v for k, v in self._rolling_kwds.items() if k not in ["on", "closed"]}) 746 | .apply(func, *args, **kwds) 747 | .compute(scheduler=self._scheduler) 748 | ) 749 | self._validate_apply( 750 | tmp_df.equals(self._comparison_pd.rolling(**self._rolling_kwds).apply(func, *args, **kwds)), 751 | error_message=("Dask rolling apply sample does not match " "pandas rolling apply sample."), 752 | ) 753 | if self._progress_bar: 754 | with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"): 755 | return self._obj_dd.apply(func, *args, **kwds).compute(scheduler=self._scheduler) 756 | else: 757 | return self._obj_dd.apply(func, *args, **kwds).compute(scheduler=self._scheduler) 758 | except ERRORS_TO_HANDLE: 759 | if self._progress_bar: 760 | tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply") 761 | return self._obj_pd.progress_apply(func, *args, **kwds) 762 | else: 763 | return self._obj_pd.apply(func, *args, **kwds) 764 | 765 | 766 | class Resampler(Transformation): 767 | def __init__( 768 | self, 769 | pandas_obj, 770 | npartitions=None, 771 | dask_threshold=1, 772 | scheduler="processes", 773 | progress_bar=True, 774 | progress_bar_desc=None, 775 | allow_dask_on_strings=False, 776 | force_parallel=False, 777 | **kwds, 778 | ): 779 | super(Resampler, self).__init__( 780 | pandas_obj, 781 | npartitions=npartitions, 782 | dask_threshold=dask_threshold, 783 | scheduler=scheduler, 784 | progress_bar=progress_bar, 785 | progress_bar_desc=progress_bar_desc, 786 | allow_dask_on_strings=allow_dask_on_strings, 787 | force_parallel=force_parallel, 788 | ) 789 | self._resampler_kwds = kwds.copy() 790 | self._comparison_pd = self._obj_pd.iloc[: self._SAMPLE_SIZE] 791 | self._sample_pd = self._sample_pd.resample(**kwds) 792 | self._obj_pd = self._obj_pd.resample(**kwds) 793 | # Setting dask dataframe `self._obj_dd` to None when there are 0 `self._nrows` 794 | # because swifter will immediately return the pandas form during the apply 795 | # function if there are 0 `self._nrows` 796 | self._obj_dd = ( 797 | self._obj_dd.resample(**{k: v for k, v in kwds.items() if k in ["rule", "closed", "label"]}) 798 | if self._nrows 799 | else None 800 | ) 801 | 802 | def _parallel_apply(self, func, *args, **kwds): 803 | try: 804 | # check that the dask resampler apply matches the pandas apply 805 | with suppress_stdout_stderr_logging(): 806 | tmp_df = ( 807 | dd.from_pandas(self._comparison_pd, npartitions=self._npartitions) 808 | .resample(**{k: v for k, v in self._resampler_kwds.items() if k in ["rule", "closed", "label"]}) 809 | .agg(func, *args, **kwds) 810 | .compute(scheduler=self._scheduler) 811 | ) 812 | self._validate_apply( 813 | tmp_df.equals(self._comparison_pd.resample(**self._resampler_kwds).apply(func, *args, **kwds)), 814 | error_message=("Dask resampler apply sample does not match " "pandas resampler apply sample."), 815 | ) 816 | 817 | if self._progress_bar: 818 | with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"): 819 | return self._obj_dd.agg(func, *args, **kwds).compute(scheduler=self._scheduler) 820 | else: 821 | return self._obj_dd.agg(func, *args, **kwds).compute(scheduler=self._scheduler) 822 | except ERRORS_TO_HANDLE: 823 | # use pandas -- no progress_apply available for resampler objects 824 | return self._obj_pd.apply(func, *args, 
**kwds) 825 | -------------------------------------------------------------------------------- /swifter/swifter_tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import importlib 4 | import unittest 5 | import subprocess 6 | import time 7 | import logging 8 | import warnings 9 | from psutil import cpu_count 10 | 11 | import numpy as np 12 | import numpy.testing as npt 13 | import pandas as pd 14 | import swifter 15 | 16 | from .swifter import RAY_INSTALLED, GROUPBY_MAX_ROWS_PANDAS_DEFAULT 17 | 18 | from tqdm.auto import tqdm 19 | 20 | WINDOWS_CI = "windows" in os.environ.get("CIRCLE_JOB", "") 21 | 22 | 23 | LOG = logging.getLogger(__name__) 24 | LOG.setLevel(logging.INFO) 25 | ch = logging.StreamHandler() 26 | ch.setLevel(logging.INFO) 27 | formatter = logging.Formatter("%(asctime)-8s.%(msecs)03d %(levelname)-8s %(name)s:%(lineno)-3s %(message)s") 28 | ch.setFormatter(formatter) 29 | LOG.addHandler(ch) 30 | 31 | 32 | def math_vec_square(x): 33 | return x**2 34 | 35 | 36 | def math_foo(x, compare_to=1): 37 | return x**2 if x < compare_to else x ** (1 / 2) 38 | 39 | 40 | def math_vec_multiply(row): 41 | return row["x"] * row["y"] 42 | 43 | 44 | def math_agg_foo(row): 45 | return row.sum() - row.min() 46 | 47 | 48 | def numeric_func(x): 49 | return x["x"].mean() / x["y"].var() 50 | 51 | 52 | def text_foo(row): 53 | if row["letter"] == "A": 54 | return row["value"] * 3 55 | elif row["letter"] == "B": 56 | return row["value"] ** 3 57 | elif row["letter"] == "C": 58 | return row["value"] / 3 59 | elif row["letter"] == "D": 60 | return row["value"] ** (1 / 3) 61 | elif row["letter"] == "E": 62 | return row["value"] 63 | 64 | 65 | def clean_text_foo(row): 66 | text = " ".join(row) 67 | text = text.strip() 68 | text = text.replace(" ", "_") 69 | return text 70 | 71 | 72 | def run_if_modin_installed(cls): 73 | # if modin is installed, run the test/test suite 74 | if importlib.util.find_spec("modin") is not None: 75 | return cls 76 | else: # if modin isnt installed just skip the test(s) 77 | return True 78 | 79 | 80 | def run_if_ray_installed(func): 81 | # if ray is installed, run the test/test suite 82 | if RAY_INSTALLED: 83 | return func 84 | else: # if ray isnt installed just skip the test(s) 85 | return True 86 | 87 | 88 | class TestSwifter(unittest.TestCase): 89 | def assertLessLinux(self, a, b, msg=None): 90 | if WINDOWS_CI: 91 | pass 92 | else: 93 | super().assertLess(a, b, msg=msg) 94 | 95 | def assertSeriesEqual(self, a, b, msg): 96 | try: 97 | pd.testing.assert_series_equal(a, b) 98 | except AssertionError as e: 99 | raise self.failureException(msg) from e 100 | 101 | def assertDataFrameEqual(self, a, b, msg): 102 | try: 103 | pd.testing.assert_frame_equal(a, b) 104 | except AssertionError as e: 105 | raise self.failureException(msg) from e 106 | 107 | def assertModinSeriesEqual(self, a, b, msg): 108 | try: 109 | npt.assert_array_almost_equal(a, b) 110 | except AssertionError as e: 111 | raise self.failureException(msg) from e 112 | 113 | def assertModinDataFrameEqual(self, a, b, msg): 114 | try: 115 | npt.assert_array_almost_equal(a, b) 116 | except AssertionError as e: 117 | raise self.failureException(msg) from e 118 | 119 | def modinSetUp(self): 120 | """ 121 | Imports modin before swifter so that we have access to modin functionality 122 | """ 123 | import os 124 | 125 | os.environ["MODIN_ENGINE"] = "dask" 126 | import modin.pandas as md 127 | import swifter 128 | 129 | swifter.register_modin() 130 | 
self.addTypeEqualityFunc(md.Series, self.assertModinSeriesEqual)
131 |         self.addTypeEqualityFunc(md.DataFrame, self.assertModinDataFrameEqual)
132 |         return md
133 |
134 |     def setUp(self):
135 |         LOG.info(f"Version {swifter.__version__}")
136 |         self.addTypeEqualityFunc(pd.Series, self.assertSeriesEqual)
137 |         self.addTypeEqualityFunc(pd.DataFrame, self.assertDataFrameEqual)
138 |         self.ncores = cpu_count()
139 |
140 |
141 | class TestSetup(TestSwifter):
142 |     def test_set_defaults(self):
143 |         LOG.info("test_set_defaults")
144 |         from swifter import set_defaults
145 |
146 |         expected_npartitions = 2
147 |         expected_dask_threshold = 1.5
148 |         expected_scheduler = "threads"
149 |         expected_progress_bar = False
150 |         expected_progress_bar_desc = "TEST"
151 |         expected_allow_dask_on_strings = True
152 |         expected_force_parallel = True
153 |         set_defaults(
154 |             npartitions=expected_npartitions,
155 |             dask_threshold=expected_dask_threshold,
156 |             scheduler=expected_scheduler,
157 |             progress_bar=expected_progress_bar,
158 |             progress_bar_desc=expected_progress_bar_desc,
159 |             allow_dask_on_strings=expected_allow_dask_on_strings,
160 |             force_parallel=expected_force_parallel,
161 |         )
162 |         for swifter_df in [
163 |             pd.DataFrame().swifter,
164 |             pd.Series().swifter,
165 |             pd.DataFrame(
166 |                 {"x": np.arange(0, 10)},
167 |                 index=pd.date_range("2019-01-1", "2020-01-1", periods=10),
168 |             ).swifter.rolling("1d"),
169 |             pd.DataFrame(
170 |                 {"x": np.arange(0, 10)},
171 |                 index=pd.date_range("2019-01-1", "2020-01-1", periods=10),
172 |             ).swifter.resample("3T"),
173 |         ]:
174 |             self.assertEqual(swifter_df._npartitions, expected_npartitions)
175 |             self.assertEqual(swifter_df._dask_threshold, expected_dask_threshold)
176 |             self.assertEqual(swifter_df._scheduler, expected_scheduler)
177 |             self.assertEqual(swifter_df._progress_bar, expected_progress_bar)
178 |             self.assertEqual(swifter_df._progress_bar_desc, expected_progress_bar_desc)
179 |             self.assertEqual(swifter_df._allow_dask_on_strings, expected_allow_dask_on_strings)
180 |             self.assertEqual(swifter_df._force_parallel, expected_force_parallel)
181 |
182 |     def test_override_defaults(self):
183 |         LOG.info("test_override_defaults")
184 |         from swifter import set_defaults
185 |
186 |         set_npartitions = 2
187 |         set_dask_threshold = 1.5
188 |         set_scheduler = "threads"
189 |         set_progress_bar = False
190 |         set_progress_bar_desc = "TEST"
191 |         set_allow_dask_on_strings = True
192 |         set_force_parallel = True
193 |
194 |         expected_npartitions = 3
195 |         expected_dask_threshold = 4.5
196 |         expected_scheduler = "processes"
197 |         expected_progress_bar = True
198 |         expected_progress_bar_desc = "TEST-AGAIN"
199 |         expected_allow_dask_on_strings = False
200 |         expected_force_parallel = False
201 |         set_defaults(
202 |             npartitions=set_npartitions,
203 |             dask_threshold=set_dask_threshold,
204 |             scheduler=set_scheduler,
205 |             progress_bar=set_progress_bar,
206 |             progress_bar_desc=set_progress_bar_desc,
207 |             allow_dask_on_strings=set_allow_dask_on_strings,
208 |             force_parallel=set_force_parallel,
209 |         )
210 |         for swifter_df_1, swifter_df_2 in [
211 |             [pd.DataFrame().swifter, pd.Series().swifter],
212 |             [
213 |                 pd.Series().swifter,
214 |                 pd.DataFrame(
215 |                     {"x": np.arange(0, 10)},
216 |                     index=pd.date_range("2019-01-1", "2020-01-1", periods=10),
217 |                 ).swifter.rolling("1d"),
218 |             ],
219 |             [
220 |                 pd.DataFrame(
221 |                     {"x": np.arange(0, 10)},
222 |                     index=pd.date_range("2019-01-1", "2020-01-1", periods=10),
223 |                 ).swifter.rolling("1d"),
224 |                 pd.DataFrame(
225 |                     {"x": np.arange(0, 10)},
226 |                     index=pd.date_range("2019-01-1", "2020-01-1", periods=10),
227 |                 ).swifter.resample("3T"),
228 |             ],
229 |             [
230 |                 pd.DataFrame(
231 |                     {"x": np.arange(0, 10)},
232 |                     index=pd.date_range("2019-01-1", "2020-01-1", periods=10),
233 |                 ).swifter.resample("3T"),
234 |                 pd.DataFrame().swifter,
235 |             ],
236 |         ]:
237 |             swifter_df_1 = (
238 |                 swifter_df_1.set_npartitions(npartitions=expected_npartitions)
239 |                 .set_dask_threshold(dask_threshold=expected_dask_threshold)
240 |                 .set_dask_scheduler(scheduler=expected_scheduler)
241 |                 .progress_bar(enable=expected_progress_bar, desc=expected_progress_bar_desc)
242 |                 .allow_dask_on_strings(enable=expected_allow_dask_on_strings)
243 |                 .force_parallel(enable=expected_force_parallel)
244 |             )
245 |
246 |             self.assertEqual(swifter_df_1._npartitions, expected_npartitions)
247 |             self.assertEqual(swifter_df_1._dask_threshold, expected_dask_threshold)
248 |             self.assertEqual(swifter_df_1._scheduler, expected_scheduler)
249 |             self.assertEqual(swifter_df_1._progress_bar, expected_progress_bar)
250 |             self.assertEqual(swifter_df_1._progress_bar_desc, expected_progress_bar_desc)
251 |             self.assertEqual(swifter_df_1._allow_dask_on_strings, expected_allow_dask_on_strings)
252 |             self.assertEqual(swifter_df_1._force_parallel, expected_force_parallel)
253 |
254 |             self.assertEqual(swifter_df_2._npartitions, set_npartitions)
255 |             self.assertEqual(swifter_df_2._dask_threshold, set_dask_threshold)
256 |             self.assertEqual(swifter_df_2._scheduler, set_scheduler)
257 |             self.assertEqual(swifter_df_2._progress_bar, set_progress_bar)
258 |             self.assertEqual(swifter_df_2._progress_bar_desc, set_progress_bar_desc)
259 |             self.assertEqual(swifter_df_2._allow_dask_on_strings, set_allow_dask_on_strings)
260 |             self.assertEqual(swifter_df_2._force_parallel, set_force_parallel)
261 |
262 |     def test_set_npartitions(self):
263 |         LOG.info("test_set_npartitions")
264 |         for swifter_df, set_npartitions, expected in zip(
265 |             [
266 |                 pd.DataFrame().swifter,
267 |                 pd.Series().swifter,
268 |                 pd.DataFrame(
269 |                     {"x": np.arange(0, 10)},
270 |                     index=pd.date_range("2019-01-1", "2020-01-1", periods=10),
271 |                 ).swifter.rolling("1d"),
272 |                 pd.DataFrame(
273 |                     {"x": np.arange(0, 10)},
274 |                     index=pd.date_range("2019-01-1", "2020-01-1", periods=10),
275 |                 ).swifter.resample("3T"),
276 |             ],
277 |             [None, 1000, 1001, 1002],
278 |             [cpu_count() * 2, 1000, 1001, 1002],
279 |         ):
280 |             before = swifter_df._npartitions
281 |             swifter_df.set_npartitions(set_npartitions)
282 |             actual = swifter_df._npartitions
283 |             self.assertEqual(actual, expected)
284 |             if set_npartitions is not None:
285 |                 self.assertNotEqual(before, actual)
286 |
287 |     def test_set_dask_threshold(self):
288 |         LOG.info("test_set_dask_threshold")
289 |         expected = 1000
290 |         for swifter_df in [
291 |             pd.DataFrame().swifter,
292 |             pd.Series().swifter,
293 |             pd.DataFrame(
294 |                 {"x": np.arange(0, 10)},
295 |                 index=pd.date_range("2019-01-1", "2020-01-1", periods=10),
296 |             ).swifter.rolling("1d"),
297 |             pd.DataFrame(
298 |                 {"x": np.arange(0, 10)},
299 |                 index=pd.date_range("2019-01-1", "2020-01-1", periods=10),
300 |             ).swifter.resample("3T"),
301 |         ]:
302 |             before = swifter_df._dask_threshold
303 |             swifter_df.set_dask_threshold(expected)
304 |             actual = swifter_df._dask_threshold
305 |             self.assertEqual(actual, expected)
306 |             self.assertNotEqual(before, actual)
307 |
308 |     def test_set_dask_scheduler(self):
309 |         LOG.info("test_set_dask_scheduler")
310 |         expected = "my-scheduler"
311 |         for swifter_df in [
312 |             pd.DataFrame().swifter,
313 |             pd.Series().swifter,
314 |             pd.DataFrame(
315 |                 {"x": np.arange(0, 10)},
316 |                 index=pd.date_range("2019-01-1", "2020-01-1", periods=10),
317 |             ).swifter.rolling("1d"),
318 |             pd.DataFrame(
319 |                 {"x": np.arange(0, 10)},
320 |                 index=pd.date_range("2019-01-1", "2020-01-1", periods=10),
321 |             ).swifter.resample("3T"),
322 |         ]:
323 |             before =
swifter_df._scheduler 324 | swifter_df.set_dask_scheduler(expected) 325 | actual = swifter_df._scheduler 326 | self.assertEqual(actual, expected) 327 | self.assertNotEqual(before, actual) 328 | 329 | def test_disable_progress_bar(self): 330 | LOG.info("test_disable_progress_bar") 331 | expected = False 332 | for swifter_df in [ 333 | pd.DataFrame().swifter, 334 | pd.Series().swifter, 335 | pd.DataFrame( 336 | {"x": np.arange(0, 10)}, 337 | index=pd.date_range("2019-01-1", "2020-01-1", periods=10), 338 | ).swifter.rolling("1d"), 339 | pd.DataFrame( 340 | {"x": np.arange(0, 10)}, 341 | index=pd.date_range("2019-01-1", "2020-01-1", periods=10), 342 | ).swifter.resample("3T"), 343 | ]: 344 | before = swifter_df._progress_bar 345 | swifter_df.progress_bar(expected) 346 | actual = swifter_df._progress_bar 347 | self.assertEqual(actual, expected) 348 | self.assertNotEqual(before, actual) 349 | 350 | def test_allow_dask_on_strings(self): 351 | LOG.info("test_allow_dask_on_strings") 352 | expected = True 353 | swifter_df = pd.DataFrame().swifter 354 | before = swifter_df._allow_dask_on_strings 355 | swifter_df.allow_dask_on_strings(expected) 356 | actual = swifter_df._allow_dask_on_strings 357 | self.assertEqual(actual, expected) 358 | self.assertNotEqual(before, actual) 359 | 360 | def test_force_parallel(self): 361 | LOG.info("test_force_parallel") 362 | expected = True 363 | swifter_df = pd.DataFrame().swifter 364 | before = swifter_df._force_parallel 365 | swifter_df.force_parallel(expected) 366 | actual = swifter_df._force_parallel 367 | self.assertEqual(actual, expected) 368 | self.assertNotEqual(before, actual) 369 | 370 | def test_stdout_redirected(self): 371 | LOG.info("test_stdout_redirected") 372 | print_messages = subprocess.check_output( 373 | [ 374 | sys.executable, 375 | "-c", 376 | "import pandas as pd; import numpy as np; import swifter; " 377 | + "df = pd.DataFrame({'x': np.random.normal(size=4)}, dtype='float32'); " 378 | + "df.swifter.progress_bar(enable=False)" 379 | + ".apply(lambda x: print(x.values))", 380 | ], 381 | stderr=subprocess.STDOUT, 382 | ) 383 | self.assertEqual(len(print_messages.decode("utf-8").rstrip("\n").split("\n")), 1) 384 | 385 | 386 | class TestPandasSeries(TestSwifter): 387 | def test_apply_on_empty_series(self): 388 | LOG.info("test_apply_on_empty_series") 389 | series = pd.Series() 390 | pd_val = series.apply(math_foo, compare_to=1) 391 | swifter_val = series.swifter.apply(math_foo, compare_to=1) 392 | self.assertEqual(pd_val, swifter_val) # equality test 393 | 394 | def test_nonvectorized_math_apply_on_small_series(self): 395 | LOG.info("test_nonvectorized_math_apply_on_small_series") 396 | df = pd.DataFrame({"x": np.random.normal(size=1000)}) 397 | series = df["x"] 398 | tqdm.pandas(desc="Pandas Vec math apply ~ Series") 399 | pd_val = series.progress_apply(math_foo, compare_to=1) 400 | swifter_val = series.swifter.progress_bar(desc="Vec math apply ~ Series").apply(math_foo, compare_to=1) 401 | self.assertEqual(pd_val, swifter_val) # equality test 402 | 403 | def test_nonvectorized_math_apply_on_small_series_no_progress_bar(self): 404 | LOG.info("test_nonvectorized_math_apply_on_small_series_no_progress_bar") 405 | df = pd.DataFrame({"x": np.random.normal(size=1000)}) 406 | series = df["x"] 407 | pd_val = series.apply(math_foo, compare_to=1) 408 | swifter_val = series.swifter.progress_bar(enable=False).apply(math_foo, compare_to=1) 409 | self.assertEqual(pd_val, swifter_val) # equality test 410 | 411 | def 
test_vectorized_math_apply_on_large_series(self): 412 | LOG.info("test_vectorized_math_apply_on_large_series") 413 | df = pd.DataFrame({"x": np.random.normal(size=10_000_000)}) 414 | series = df["x"] 415 | 416 | tqdm.pandas(desc="Pandas Vec math apply ~ Series") 417 | start_pd = time.time() 418 | pd_val = series.progress_apply(math_vec_square) 419 | end_pd = time.time() 420 | pd_time = end_pd - start_pd 421 | 422 | start_swifter = time.time() 423 | swifter_val = ( 424 | series.swifter.set_npartitions(4).progress_bar(desc="Vec math apply ~ Series").apply(math_vec_square) 425 | ) 426 | end_swifter = time.time() 427 | swifter_time = end_swifter - start_swifter 428 | 429 | self.assertEqual(pd_val, swifter_val) # equality test 430 | if self.ncores > 1: # speed test 431 | self.assertLessLinux(swifter_time, pd_time) 432 | 433 | def test_nonvectorized_math_apply_on_large_series(self): 434 | LOG.info("test_nonvectorized_math_apply_on_large_series") 435 | df = pd.DataFrame({"x": np.random.normal(size=10_000_000)}) 436 | series = df["x"] 437 | 438 | tqdm.pandas(desc="Pandas Nonvec math apply ~ Series") 439 | start_pd = time.time() 440 | pd_val = series.progress_apply(math_foo, compare_to=1) 441 | end_pd = time.time() 442 | pd_time = end_pd - start_pd 443 | 444 | start_swifter = time.time() 445 | swifter_val = ( 446 | series.swifter.set_npartitions(4) 447 | .progress_bar(desc="Nonvec math apply ~ Series") 448 | .apply(math_foo, compare_to=1) 449 | ) 450 | end_swifter = time.time() 451 | swifter_time = end_swifter - start_swifter 452 | 453 | self.assertEqual(pd_val, swifter_val) # equality test 454 | if self.ncores > 1: # speed test 455 | self.assertLessLinux(swifter_time, pd_time) 456 | 457 | def test_vectorized_force_parallel_math_apply_on_large_series(self): 458 | LOG.info("test_vectorized_force_parallel_math_apply_on_large_series") 459 | df = pd.DataFrame({"x": np.random.normal(size=2_000_000)}) 460 | series = df["x"] 461 | 462 | tqdm.pandas(desc="Pandas Vec math apply ~ Series") 463 | start_pd = time.time() 464 | pd_val = series.progress_apply(math_vec_square) 465 | end_pd = time.time() 466 | pd_time = end_pd - start_pd 467 | 468 | start_swifter = time.time() 469 | swifter_val = ( 470 | series.swifter.set_npartitions(4) 471 | .force_parallel(True) 472 | .progress_bar(desc="Force Parallel - Vec math apply ~ Series") 473 | .apply(math_vec_square) 474 | ) 475 | end_swifter = time.time() 476 | swifter_time = end_swifter - start_swifter 477 | 478 | self.assertEqual(pd_val, swifter_val) # equality test 479 | if self.ncores > 1: # speed test 480 | self.assertLessLinux(swifter_time, pd_time) 481 | 482 | 483 | class TestPandasDataFrame(TestSwifter): 484 | def test_apply_on_empty_dataframe(self): 485 | LOG.info("test_apply_on_empty_dataframe") 486 | df = pd.DataFrame(columns=["x", "y"]) 487 | pd_val = df.apply(math_vec_multiply, axis=1) 488 | swifter_val = df.swifter.apply(math_vec_multiply, axis=1) 489 | self.assertEqual(pd_val, swifter_val) # equality test 490 | 491 | def test_applymap_on_empty_dataframe(self): 492 | LOG.info("test_applymap_on_empty_dataframe") 493 | df = pd.DataFrame(columns=["x", "y"]) 494 | pd_val = df.applymap(math_vec_square) 495 | swifter_val = df.swifter.applymap(math_vec_square) 496 | self.assertEqual(pd_val, swifter_val) # equality test 497 | 498 | @run_if_ray_installed 499 | def test_groupby_apply_on_empty_dataframe(self): 500 | LOG.info("test_groupby_apply_on_empty_dataframe") 501 | df = pd.DataFrame(columns=["x", "y"]) 502 | pd_val = df.groupby("x").apply(math_vec_square) 503 
| swifter_val = df.swifter.groupby("x").apply(math_vec_square) 504 | self.assertEqual(pd_val, swifter_val) # equality test 505 | 506 | @run_if_ray_installed 507 | def test_groupby_index_apply(self): 508 | LOG.info("test_groupby_index_apply") 509 | SIZE = GROUPBY_MAX_ROWS_PANDAS_DEFAULT * 2 510 | df = pd.DataFrame( 511 | { 512 | "x": np.random.normal(size=SIZE), 513 | "y": np.random.uniform(size=SIZE), 514 | "g": np.random.choice(np.arange(100), size=SIZE), 515 | } 516 | ) 517 | pd_val = df.groupby("g")["x"].apply(lambda x: x.std()) 518 | swifter_val = df.swifter.groupby("g")["x"].apply(lambda x: x.std()) 519 | self.assertEqual(pd_val, swifter_val) 520 | 521 | def test_nonvectorized_math_apply_on_small_dataframe(self): 522 | LOG.info("test_nonvectorized_math_apply_on_small_dataframe") 523 | df = pd.DataFrame({"x": np.random.normal(size=1000), "y": np.random.uniform(size=1000)}) 524 | tqdm.pandas(desc="Pandas Nonvec math apply ~ DF") 525 | pd_val = df.progress_apply(math_agg_foo) 526 | swifter_val = df.swifter.progress_bar(desc="Vec math apply ~ DF").apply(math_agg_foo) 527 | self.assertEqual(pd_val, swifter_val) # equality test 528 | 529 | def test_nonvectorized_math_apply_on_small_dataframe_no_progress_bar(self): 530 | LOG.info("test_nonvectorized_math_apply_on_small_dataframe_no_progress_bar") 531 | df = pd.DataFrame({"x": np.random.normal(size=1000), "y": np.random.uniform(size=1000)}) 532 | pd_val = df.apply(math_agg_foo) 533 | swifter_val = df.swifter.progress_bar(enable=False).apply(math_agg_foo) 534 | self.assertEqual(pd_val, swifter_val) # equality test 535 | 536 | def test_vectorized_math_apply_on_large_dataframe(self): 537 | LOG.info("test_vectorized_math_apply_on_large_dataframe") 538 | df = pd.DataFrame( 539 | { 540 | "x": np.random.normal(size=1_000_000), 541 | "y": np.random.uniform(size=1_000_000), 542 | } 543 | ) 544 | 545 | tqdm.pandas(desc="Pandas Vec math apply ~ DF") 546 | start_pd = time.time() 547 | pd_val = df.progress_apply(math_vec_multiply, axis=1) 548 | end_pd = time.time() 549 | pd_time = end_pd - start_pd 550 | 551 | start_swifter = time.time() 552 | swifter_val = ( 553 | df.swifter.set_npartitions(4).progress_bar(desc="Vec math apply ~ DF").apply(math_vec_multiply, axis=1) 554 | ) 555 | end_swifter = time.time() 556 | swifter_time = end_swifter - start_swifter 557 | 558 | self.assertEqual(pd_val, swifter_val) # equality test 559 | if self.ncores > 1: # speed test 560 | self.assertLessLinux(swifter_time, pd_time) 561 | 562 | def test_nonvectorized_math_apply_on_large_dataframe_broadcast(self): 563 | LOG.info("test_nonvectorized_math_apply_on_large_dataframe_broadcast") 564 | df = pd.DataFrame({"x": np.random.normal(size=500_000), "y": np.random.uniform(size=500_000)}) 565 | 566 | tqdm.pandas(desc="Pandas Nonvec math apply + broadcast ~ DF") 567 | start_pd = time.time() 568 | pd_val = df.progress_apply(math_agg_foo, axis=1, result_type="broadcast") 569 | end_pd = time.time() 570 | pd_time = end_pd - start_pd 571 | 572 | start_swifter = time.time() 573 | swifter_val = ( 574 | df.swifter.set_npartitions(4) 575 | .progress_bar(desc="Nonvec math apply + broadcast ~ DF") 576 | .apply(math_agg_foo, axis=1, result_type="broadcast") 577 | ) 578 | end_swifter = time.time() 579 | swifter_time = end_swifter - start_swifter 580 | 581 | self.assertEqual(pd_val, swifter_val) # equality test 582 | if self.ncores > 1: # speed test 583 | self.assertLessLinux(swifter_time, pd_time) 584 | 585 | def test_nonvectorized_math_apply_on_large_dataframe_reduce(self): 586 | 
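        # result_type="reduce" asks pandas to collapse the row-wise results
        # into a Series instead of broadcasting them back to the frame's
        # shape, so swifter's parallel output must match that reduced form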
LOG.info("test_nonvectorized_math_apply_on_large_dataframe_reduce") 587 | df = pd.DataFrame({"x": np.random.normal(size=250_000), "y": np.random.uniform(size=250_000)}) 588 | 589 | tqdm.pandas(desc="Pandas Nonvec math apply + reduce ~ DF") 590 | start_pd = time.time() 591 | pd_val = df.progress_apply(math_agg_foo, axis=1, result_type="reduce") 592 | end_pd = time.time() 593 | pd_time = end_pd - start_pd 594 | 595 | start_swifter = time.time() 596 | swifter_val = ( 597 | df.swifter.set_npartitions(4) 598 | .progress_bar(desc="Nonvec math apply + reduce ~ DF") 599 | .apply(math_agg_foo, axis=1, result_type="reduce") 600 | ) 601 | end_swifter = time.time() 602 | swifter_time = end_swifter - start_swifter 603 | 604 | self.assertEqual(pd_val, swifter_val) # equality test 605 | if self.ncores > 1: # speed test 606 | self.assertLessLinux(swifter_time, pd_time) 607 | 608 | def test_nonvectorized_text_dask_apply_on_large_dataframe(self): 609 | LOG.info("test_nonvectorized_text_dask_apply_on_large_dataframe") 610 | df = pd.DataFrame( 611 | { 612 | "letter": ["A", "B", "C", "D", "E"] * 200_000, 613 | "value": np.random.normal(size=1_000_000), 614 | } 615 | ) 616 | 617 | tqdm.pandas(desc="Pandas Nonvec text apply ~ DF") 618 | start_pd = time.time() 619 | pd_val = df.progress_apply(text_foo, axis=1) 620 | end_pd = time.time() 621 | pd_time = end_pd - start_pd 622 | 623 | start_swifter = time.time() 624 | swifter_val = ( 625 | df.swifter.allow_dask_on_strings(True) 626 | .set_npartitions(4) 627 | .progress_bar(desc="Nonvec Dask text apply ~ DF") 628 | .apply(text_foo, axis=1) 629 | ) 630 | end_swifter = time.time() 631 | swifter_time = end_swifter - start_swifter 632 | 633 | self.assertEqual(pd_val, swifter_val) # equality test 634 | if self.ncores > 1: # speed test 635 | self.assertLessLinux(swifter_time, pd_time) 636 | 637 | def test_vectorized_force_parallel_math_apply_on_large_dataframe(self): 638 | LOG.info("test_vectorized_force_parallel_math_apply_on_large_dataframe") 639 | df = pd.DataFrame( 640 | { 641 | "x": np.random.normal(size=1_000_000), 642 | "y": np.random.uniform(size=1_000_000), 643 | } 644 | ) 645 | 646 | tqdm.pandas(desc="Pandas Nonvec math apply ~ DF") 647 | start_pd = time.time() 648 | pd_val = df.progress_apply(math_vec_multiply, axis=1) 649 | end_pd = time.time() 650 | pd_time = end_pd - start_pd 651 | 652 | start_swifter = time.time() 653 | swifter_val = ( 654 | df.swifter.set_npartitions(4) 655 | .force_parallel(True) 656 | .progress_bar(desc="Forced Parallel - Vec math apply ~ DF") 657 | .apply(math_vec_multiply, axis=1) 658 | ) 659 | end_swifter = time.time() 660 | swifter_time = end_swifter - start_swifter 661 | 662 | self.assertEqual(pd_val, swifter_val) # equality test 663 | if self.ncores > 1: # speed test 664 | self.assertLessLinux(swifter_time, pd_time) 665 | 666 | def test_vectorized_math_applymap_on_large_dataframe(self): 667 | LOG.info("test_vectorized_math_applymap_on_large_dataframe") 668 | df = pd.DataFrame( 669 | { 670 | "x": np.random.normal(size=2_000_000), 671 | "y": np.random.uniform(size=2_000_000), 672 | } 673 | ) 674 | 675 | tqdm.pandas(desc="Pandas Vec math applymap ~ DF") 676 | start_pd = time.time() 677 | pd_val = df.progress_applymap(math_vec_square) 678 | end_pd = time.time() 679 | pd_time = end_pd - start_pd 680 | 681 | start_swifter = time.time() 682 | swifter_val = ( 683 | df.swifter.set_npartitions(4).progress_bar(desc="Vec math applymap ~ DF").applymap(math_vec_square) 684 | ) 685 | end_swifter = time.time() 686 | swifter_time = end_swifter - 
start_swifter 687 | 688 | self.assertEqual(pd_val, swifter_val) # equality test 689 | if self.ncores > 1: # speed test 690 | self.assertLessLinux(swifter_time, pd_time) 691 | 692 | def test_vectorized_force_parallel_math_applymap_on_large_dataframe(self): 693 | LOG.info("test_vectorized_force_parallel_math_applymap_on_large_dataframe") 694 | df = pd.DataFrame( 695 | { 696 | "x": np.random.normal(size=2_000_000), 697 | "y": np.random.uniform(size=2_000_000), 698 | } 699 | ) 700 | 701 | tqdm.pandas(desc="Pandas Vec math applymap ~ DF") 702 | start_pd = time.time() 703 | pd_val = df.progress_applymap(math_vec_square) 704 | end_pd = time.time() 705 | pd_time = end_pd - start_pd 706 | 707 | start_swifter = time.time() 708 | swifter_val = ( 709 | df.swifter.set_npartitions(4) 710 | .force_parallel(True) 711 | .progress_bar(desc="Force Parallel ~ Vec math applymap ~ DF") 712 | .applymap(math_vec_square) 713 | ) 714 | end_swifter = time.time() 715 | swifter_time = end_swifter - start_swifter 716 | 717 | self.assertEqual(pd_val, swifter_val) # equality test 718 | if self.ncores > 1: # speed test 719 | self.assertLessLinux(swifter_time, pd_time) 720 | 721 | def test_nonvectorized_math_applymap_on_large_dataframe(self): 722 | LOG.info("test_nonvectorized_math_applymap_on_large_dataframe") 723 | df = pd.DataFrame( 724 | { 725 | "x": np.random.normal(size=5_000_000), 726 | "y": np.random.uniform(size=5_000_000), 727 | } 728 | ) 729 | 730 | tqdm.pandas(desc="Pandas Nonvec math applymap ~ DF") 731 | start_pd = time.time() 732 | pd_val = df.progress_applymap(math_foo) 733 | end_pd = time.time() 734 | pd_time = end_pd - start_pd 735 | 736 | start_swifter = time.time() 737 | swifter_val = df.swifter.set_npartitions(4).progress_bar(desc="Nonvec math applymap ~ DF").applymap(math_foo) 738 | end_swifter = time.time() 739 | swifter_time = end_swifter - start_swifter 740 | 741 | self.assertEqual(pd_val, swifter_val) # equality test 742 | if self.ncores > 1: # speed test 743 | self.assertLessLinux(swifter_time, pd_time) 744 | 745 | def test_nonvectorized_math_applymap_on_small_dataframe(self): 746 | LOG.info("test_nonvectorized_math_applymap_on_small_dataframe") 747 | df = pd.DataFrame({"x": np.random.normal(size=1000), "y": np.random.uniform(size=1000)}) 748 | pd_val = df.applymap(math_foo) 749 | swifter_val = df.swifter.set_npartitions(4).applymap(math_foo) 750 | self.assertEqual(pd_val, swifter_val) # equality test 751 | 752 | def test_nonvectorized_math_applymap_on_small_dataframe_no_progress_bar(self): 753 | LOG.info("test_nonvectorized_math_applymap_on_small_dataframe_no_progress_bar") 754 | df = pd.DataFrame({"x": np.random.normal(size=1000), "y": np.random.uniform(size=1000)}) 755 | pd_val = df.applymap(math_foo) 756 | swifter_val = df.swifter.progress_bar(enable=False).applymap(math_foo) 757 | self.assertEqual(pd_val, swifter_val) # equality test 758 | 759 | @run_if_ray_installed 760 | def test_vectorized_math_groupby_apply_on_small_dataframe(self): 761 | LOG.info("test_vectorized_math_groupby_apply_on_small_dataframe") 762 | df = pd.DataFrame( 763 | { 764 | "g": np.random.choice([0, 1, 2], size=500), 765 | "x": np.random.normal(size=500), 766 | "y": np.random.uniform(size=500), 767 | } 768 | ) 769 | pd_val = df.groupby("g").apply(numeric_func) 770 | swifter_val = df.swifter.groupby("g").apply(numeric_func) 771 | self.assertSeriesEqual(pd_val, swifter_val, "Swifter output does not equal Pandas output") # equality test 772 | 773 | @run_if_ray_installed 774 | def 
test_vectorized_force_parallel_math_groupby_apply_on_small_dataframe(self): 775 | LOG.info("test_vectorized_force_parallel_math_groupby_apply_on_small_dataframe") 776 | df = pd.DataFrame( 777 | { 778 | "g": np.random.choice([0, 1, 2], size=500), 779 | "x": np.random.normal(size=500), 780 | "y": np.random.uniform(size=500), 781 | } 782 | ) 783 | pd_val = df.groupby("g").apply(numeric_func) 784 | swifter_val = df.swifter.force_parallel(True).groupby("g").apply(numeric_func) 785 | self.assertSeriesEqual(pd_val, swifter_val, "Swifter output does not equal Pandas output") # equality test 786 | 787 | @run_if_ray_installed 788 | def test_vectorized_math_groupby_apply_on_large_dataframe(self): 789 | LOG.info("test_vectorized_math_groupby_apply_on_large_dataframe") 790 | df = pd.DataFrame( 791 | { 792 | "g": np.random.choice(np.arange(50000), size=500000), 793 | "x": np.random.normal(size=500000), 794 | "y": np.random.uniform(size=500000), 795 | } 796 | ) 797 | pd_val = df.groupby("g").apply(numeric_func) 798 | swifter_val = df.swifter.groupby("g").apply(numeric_func) 799 | self.assertSeriesEqual(pd_val, swifter_val, "Swifter output does not equal Pandas output") # equality test 800 | 801 | @run_if_ray_installed 802 | def test_vectorized_math_groupby_apply_on_large_dataframe_index(self): 803 | LOG.info("test_vectorized_math_groupby_apply_on_large_dataframe_index") 804 | df = pd.DataFrame( 805 | { 806 | "x": np.random.normal(size=500000), 807 | "y": np.random.uniform(size=500000), 808 | }, 809 | index=np.random.choice(np.arange(50000), size=500000), 810 | ) 811 | pd_val = df.groupby(df.index).apply(numeric_func) 812 | swifter_val = df.swifter.groupby(df.index).apply(numeric_func) 813 | self.assertSeriesEqual(pd_val, swifter_val, "Swifter output does not equal Pandas output") # equality test 814 | 815 | @run_if_ray_installed 816 | def test_vectorized_force_parallel_math_groupby_apply_on_large_dataframe(self): 817 | LOG.info("test_vectorized_force_parallel_math_groupby_apply_on_large_dataframe") 818 | df = pd.DataFrame( 819 | { 820 | "g": np.random.choice(np.arange(50000), size=500000), 821 | "x": np.random.normal(size=500000), 822 | "y": np.random.uniform(size=500000), 823 | } 824 | ) 825 | pd_val = df.groupby("g").apply(numeric_func) 826 | swifter_val = df.swifter.force_parallel(True).groupby("g").apply(numeric_func) 827 | self.assertSeriesEqual(pd_val, swifter_val, "Swifter output does not equal Pandas output") # equality test 828 | 829 | @run_if_ray_installed 830 | def test_vectorized_text_groupby_apply_on_small_dataframe(self): 831 | LOG.info("test_vectorized_text_groupby_apply_on_small_dataframe") 832 | df = pd.DataFrame( 833 | {"g": np.random.choice([0, 1, 2], size=500), "text": np.random.choice(["A", "B", "C"], size=500)} 834 | ) 835 | pd_val = df.groupby("g").apply(clean_text_foo) 836 | swifter_val = df.swifter.groupby("g").apply(clean_text_foo) 837 | self.assertSeriesEqual(pd_val, swifter_val, "Swifter output does not equal Pandas output") # equality test 838 | 839 | @run_if_ray_installed 840 | def test_vectorized_force_parallel_text_groupby_apply_on_small_dataframe(self): 841 | LOG.info("test_vectorized_force_parallel_text_groupby_apply_on_small_dataframe") 842 | df = pd.DataFrame( 843 | {"g": np.random.choice([0, 1, 2], size=500), "text": np.random.choice(["A", "B", "C"], size=500)} 844 | ) 845 | pd_val = df.groupby("g").apply(clean_text_foo) 846 | swifter_val = df.swifter.force_parallel(True).groupby("g").apply(clean_text_foo) 847 | self.assertSeriesEqual(pd_val, swifter_val, "Swifter 
output does not equal Pandas output") # equality test 848 | 849 | @run_if_ray_installed 850 | def test_vectorized_text_groupby_apply_on_large_dataframe(self): 851 | LOG.info("test_vectorized_text_groupby_apply_on_large_dataframe") 852 | df = pd.DataFrame( 853 | { 854 | "g": np.random.choice(np.arange(50000), size=500000), 855 | "text": np.random.choice(["A", "B", "C"], size=500000), 856 | } 857 | ) 858 | pd_val = df.groupby("g").apply(clean_text_foo) 859 | swifter_val = df.swifter.groupby("g").apply(clean_text_foo) 860 | self.assertSeriesEqual(pd_val, swifter_val, "Swifter output does not equal Pandas output") # equality test 861 | 862 | @run_if_ray_installed 863 | def test_vectorized_force_parallel_text_groupby_apply_on_large_dataframe(self): 864 | LOG.info("test_vectorized_force_parallel_text_groupby_apply_on_large_dataframe") 865 | df = pd.DataFrame( 866 | { 867 | "g": np.random.choice(np.arange(50000), size=500000), 868 | "text": np.random.choice(["A", "B", "C"], size=500000), 869 | } 870 | ) 871 | pd_val = df.groupby("g").apply(clean_text_foo) 872 | swifter_val = df.swifter.force_parallel(True).groupby("g").apply(clean_text_foo) 873 | self.assertSeriesEqual(pd_val, swifter_val, "Swifter output does not equal Pandas output") # equality test 874 | 875 | 876 | class TestPandasTransformation(TestSwifter): 877 | def test_rolling_apply_on_empty_dataframe(self): 878 | LOG.info("test_rolling_apply_on_empty_dataframe") 879 | df = pd.DataFrame(columns=["x", "y"]) 880 | pd_val = df.rolling(1).apply(math_agg_foo, raw=True) 881 | swifter_val = df.swifter.set_npartitions(4).rolling(1).apply(math_agg_foo, raw=True) 882 | self.assertEqual(pd_val, swifter_val) # equality test 883 | 884 | def test_resample_apply_on_empty_dataframe(self): 885 | LOG.info("test_resample_apply_on_empty_dataframe") 886 | df = pd.DataFrame(columns=["x", "y"], index=pd.date_range(start="2020/01/01", periods=0)) 887 | pd_val = df.resample("1d").apply(math_agg_foo) 888 | swifter_val = df.swifter.set_npartitions(4).resample("1d").apply(math_agg_foo) 889 | self.assertEqual(pd_val, swifter_val) # equality test 890 | 891 | def test_nonvectorized_math_apply_on_small_rolling_dataframe(self): 892 | LOG.info("test_nonvectorized_math_apply_on_small_rolling_dataframe") 893 | df = pd.DataFrame( 894 | {"x": np.arange(0, 1000)}, 895 | index=pd.date_range("2019-01-1", "2020-01-1", periods=1000), 896 | ) 897 | pd_val = df.rolling("1d").apply(math_agg_foo, raw=True) 898 | swifter_val = ( 899 | df.swifter.set_npartitions(4) 900 | .rolling("1d") 901 | .progress_bar(desc="Nonvec math apply ~ Rolling DF") 902 | .apply(math_agg_foo, raw=True) 903 | ) 904 | self.assertEqual(pd_val, swifter_val) # equality test 905 | 906 | def test_nonvectorized_math_apply_on_small_rolling_dataframe_no_progress_bar(self): 907 | LOG.info("test_nonvectorized_math_apply_on_small_rolling_dataframe_no_progress_bar") 908 | df = pd.DataFrame( 909 | {"x": np.arange(0, 1000)}, 910 | index=pd.date_range("2019-01-1", "2020-01-1", periods=1000), 911 | ) 912 | pd_val = df.rolling("1d").apply(math_agg_foo, raw=True) 913 | swifter_val = ( 914 | df.swifter.set_npartitions(4).rolling("1d").progress_bar(enable=False).apply(math_agg_foo, raw=True) 915 | ) 916 | self.assertEqual(pd_val, swifter_val) # equality test 917 | 918 | def test_vectorized_math_apply_on_large_rolling_dataframe(self): 919 | LOG.info("test_vectorized_math_apply_on_large_rolling_dataframe") 920 | df = pd.DataFrame( 921 | {"x": np.arange(0, 1_000_000)}, 922 | index=pd.date_range("2019-01-1", "2020-01-1", 
periods=1_000_000), 923 | ) 924 | pd_val = df.rolling("1d").apply(max, raw=True) 925 | swifter_val = ( 926 | df.swifter.set_npartitions(4) 927 | .rolling("1d") 928 | .progress_bar(desc="Vec math apply ~ Rolling DF") 929 | .apply(max, raw=True) 930 | ) 931 | self.assertEqual(pd_val, swifter_val) # equality test 932 | 933 | def test_nonvectorized_math_apply_on_large_rolling_dataframe(self): 934 | LOG.info("test_nonvectorized_math_apply_on_large_rolling_dataframe") 935 | df = pd.DataFrame( 936 | {"x": np.arange(0, 10_000_000)}, 937 | index=pd.date_range("2019-01-1", "2020-01-1", periods=10_000_000), 938 | ) 939 | 940 | start_pd = time.time() 941 | pd_val = df.rolling("3T").apply(math_agg_foo, raw=True) 942 | end_pd = time.time() 943 | pd_time = end_pd - start_pd 944 | 945 | start_swifter = time.time() 946 | swifter_val = ( 947 | df.swifter.set_npartitions(10) 948 | .rolling("3T") 949 | .progress_bar(desc="Nonvec math apply ~ Rolling DF") 950 | .apply(math_agg_foo, raw=True) 951 | ) 952 | end_swifter = time.time() 953 | swifter_time = end_swifter - start_swifter 954 | 955 | self.assertEqual(pd_val, swifter_val) # equality test 956 | if self.ncores > 1: # speed test 957 | self.assertLessLinux(swifter_time, pd_time) 958 | 959 | def test_vectorized_force_parallel_math_apply_on_large_rolling_dataframe(self): 960 | LOG.info("test_vectorized_force_parallel_math_apply_on_large_rolling_dataframe") 961 | df = pd.DataFrame( 962 | {"x": np.arange(0, 1_000_000)}, 963 | index=pd.date_range("2019-01-1", "2020-01-1", periods=1_000_000), 964 | ) 965 | pd_val = df.rolling("1d").apply(max, raw=True) 966 | swifter_val = ( 967 | df.swifter.set_npartitions(4) 968 | .force_parallel(True) 969 | .rolling("1d") 970 | .progress_bar(desc="Force Parallel ~ Vec math apply ~ Rolling DF") 971 | .apply(max, raw=True) 972 | ) 973 | self.assertEqual(pd_val, swifter_val) # equality test 974 | 975 | def test_nonvectorized_math_apply_on_small_resampler_dataframe(self): 976 | LOG.info("test_nonvectorized_math_apply_on_small_resampler_dataframe") 977 | df = pd.DataFrame( 978 | {"x": np.arange(0, 1000)}, 979 | index=pd.date_range("2019-01-1", "2020-01-1", periods=1000), 980 | ) 981 | pd_val = df.resample("1M").apply(math_agg_foo) 982 | swifter_val = ( 983 | df.swifter.set_npartitions(4) 984 | .resample("1M") 985 | .progress_bar(desc="Nonvec math apply ~ Resample DF") 986 | .apply(math_agg_foo) 987 | ) 988 | self.assertEqual(pd_val, swifter_val) # equality test 989 | 990 | def test_nonvectorized_math_apply_on_large_resampler_dataframe(self): 991 | LOG.info("test_nonvectorized_math_apply_on_large_resampler_dataframe") 992 | df = pd.DataFrame( 993 | {"x": np.arange(0, 1_000_000)}, 994 | index=pd.date_range("2019-01-1", "2020-01-1", periods=1_000_000), 995 | ) 996 | 997 | start_pd = time.time() 998 | pd_val = df.resample("3T").apply(math_agg_foo) 999 | end_pd = time.time() 1000 | pd_time = end_pd - start_pd 1001 | 1002 | start_swifter = time.time() 1003 | swifter_val = ( 1004 | df.swifter.set_npartitions(4) 1005 | .resample("3T") 1006 | .progress_bar(desc="Nonvec math apply ~ Resample DF") 1007 | .apply(math_agg_foo) 1008 | ) 1009 | end_swifter = time.time() 1010 | swifter_time = end_swifter - start_swifter 1011 | 1012 | self.assertEqual(pd_val, swifter_val) # equality test 1013 | if self.ncores > 1: # speed test 1014 | self.assertLessLinux(swifter_time, pd_time) 1015 | 1016 | def test_nonvectorized_force_parallel_math_apply_on_large_resampler_dataframe(self): 1017 | 
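        # force_parallel(True) makes swifter skip its usual "time a sample,
        # pick the fastest path" heuristic and run the dask code path
        # directly, here on top of a resample transformation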
LOG.info("test_nonvectorized_force_parallel_math_apply_on_large_resampler_dataframe") 1018 | df = pd.DataFrame( 1019 | {"x": np.arange(0, 1_000_000)}, 1020 | index=pd.date_range("2019-01-1", "2020-01-1", periods=1_000_000), 1021 | ) 1022 | 1023 | start_pd = time.time() 1024 | pd_val = df.resample("3T").apply(math_agg_foo) 1025 | end_pd = time.time() 1026 | pd_time = end_pd - start_pd 1027 | 1028 | start_swifter = time.time() 1029 | swifter_val = ( 1030 | df.swifter.set_npartitions(4) 1031 | .force_parallel(True) 1032 | .resample("3T") 1033 | .progress_bar(desc="Force Parallel ~ Nonvec math apply ~ Resample DF") 1034 | .apply(math_agg_foo) 1035 | ) 1036 | end_swifter = time.time() 1037 | swifter_time = end_swifter - start_swifter 1038 | 1039 | self.assertEqual(pd_val, swifter_val) # equality test 1040 | if self.ncores > 1: # speed test 1041 | self.assertLessLinux(swifter_time, pd_time) 1042 | 1043 | 1044 | @run_if_modin_installed 1045 | class TestModinSeries(TestSwifter): 1046 | def test_modin_series_warns_on_missing_attributes(self): 1047 | LOG.info("test_modin_series_warns_on_missing_attributes") 1048 | md = self.modinSetUp() 1049 | series = md.Series() 1050 | with warnings.catch_warnings(record=True) as w: 1051 | warnings.simplefilter("always") 1052 | series.swifter.set_dask_threshold(1) 1053 | self.assertEqual(len(w), 1) 1054 | 1055 | with warnings.catch_warnings(record=True) as w: 1056 | warnings.simplefilter("always") 1057 | series.swifter.set_dask_scheduler("threads") 1058 | self.assertEqual(len(w), 1) 1059 | 1060 | with warnings.catch_warnings(record=True) as w: 1061 | warnings.simplefilter("always") 1062 | series.swifter.allow_dask_on_strings(True) 1063 | self.assertEqual(len(w), 1) 1064 | 1065 | with warnings.catch_warnings(record=True) as w: 1066 | warnings.simplefilter("always") 1067 | series.swifter.progress_bar(False) 1068 | self.assertEqual(len(w), 1) 1069 | 1070 | with warnings.catch_warnings(record=True) as w: 1071 | warnings.simplefilter("always") 1072 | series.swifter.force_parallel(False) 1073 | self.assertEqual(len(w), 1) 1074 | 1075 | def test_modin_series_errors_on_missing_transformations(self): 1076 | LOG.info("test_modin_series_errors_on_missing_transformations") 1077 | md = self.modinSetUp() 1078 | series = md.Series() 1079 | with self.assertRaises(NotImplementedError): 1080 | series.swifter.rolling(1) 1081 | 1082 | with self.assertRaises(NotImplementedError): 1083 | series.swifter.resample(1) 1084 | 1085 | def test_apply_on_empty_modin_series(self): 1086 | LOG.info("test_apply_on_empty_series") 1087 | md = self.modinSetUp() 1088 | series = md.Series() 1089 | md_val = series.apply(math_foo, compare_to=1) 1090 | swifter_val = series.swifter.apply(math_foo, compare_to=1) 1091 | self.assertEqual(md_val, swifter_val) # equality test 1092 | 1093 | def test_nonvectorized_modin_apply_on_small_series(self): 1094 | LOG.info("test_nonvectorized_modin_apply_on_small_series") 1095 | md = self.modinSetUp() 1096 | df = md.Series(np.random.normal(size=200_000), name="x") 1097 | md_val = df.apply(math_foo) 1098 | swifter_val = df.swifter.set_npartitions(4).apply(math_foo) 1099 | self.assertEqual(md_val, swifter_val) # equality test 1100 | 1101 | def test_vectorized_modin_apply_on_large_series(self): 1102 | LOG.info("test_vectorized_modin_apply_on_large_series") 1103 | md = self.modinSetUp() 1104 | df = md.Series(np.random.uniform(size=10_000_000), name="x") 1105 | 1106 | md_val = df.apply(math_vec_square, axis=0) 1107 | md_pd_val = md_val._to_pandas() # We have to bring it into 
pandas to confirm swifter apply speed is quicker 1108 | 1109 | swifter_val = df.swifter.set_npartitions(4).apply(math_vec_square) 1110 | swifter_pd_val = ( 1111 | swifter_val._to_pandas() 1112 | ) # We have to bring it into pandas to confirm swifter apply speed is quicker 1113 | 1114 | self.assertEqual(md_val, swifter_val) # equality test 1115 | self.assertEqual(md_pd_val, swifter_pd_val) # equality test after converting to pandas 1116 | 1117 | 1118 | @run_if_modin_installed 1119 | class TestModinDataFrame(TestSwifter): 1120 | def test_modin_dataframe_warns_on_missing_attributes(self): 1121 | LOG.info("test_modin_dataframe_warns_on_missing_attributes") 1122 | md = self.modinSetUp() 1123 | df = md.DataFrame() 1124 | with warnings.catch_warnings(record=True) as w: 1125 | warnings.simplefilter("always") 1126 | df.swifter.set_dask_threshold(1) 1127 | self.assertEqual(len(w), 1) 1128 | 1129 | with warnings.catch_warnings(record=True) as w: 1130 | warnings.simplefilter("always") 1131 | df.swifter.set_dask_scheduler("threads") 1132 | self.assertEqual(len(w), 1) 1133 | 1134 | with warnings.catch_warnings(record=True) as w: 1135 | warnings.simplefilter("always") 1136 | df.swifter.allow_dask_on_strings(True) 1137 | self.assertEqual(len(w), 1) 1138 | 1139 | with warnings.catch_warnings(record=True) as w: 1140 | warnings.simplefilter("always") 1141 | df.swifter.progress_bar(False) 1142 | self.assertEqual(len(w), 1) 1143 | 1144 | with warnings.catch_warnings(record=True) as w: 1145 | warnings.simplefilter("always") 1146 | df.swifter.force_parallel(False) 1147 | self.assertEqual(len(w), 1) 1148 | 1149 | def test_modin_dataframe_errors_on_missing_transformations(self): 1150 | LOG.info("test_modin_dataframe_errors_on_missing_transformations") 1151 | md = self.modinSetUp() 1152 | df = md.DataFrame() 1153 | with self.assertRaises(NotImplementedError): 1154 | df.swifter.rolling(1) 1155 | 1156 | with self.assertRaises(NotImplementedError): 1157 | df.swifter.resample(1) 1158 | 1159 | def test_apply_on_empty_modin_dataframe(self): 1160 | LOG.info("test_apply_on_empty_series") 1161 | md = self.modinSetUp() 1162 | df = md.DataFrame() 1163 | md_val = df.apply(math_foo, compare_to=1) 1164 | swifter_val = df.swifter.apply(math_foo, compare_to=1) 1165 | self.assertEqual(md_val, swifter_val) # equality test 1166 | 1167 | def test_nonvectorized_modin_apply_on_small_dataframe(self): 1168 | LOG.info("test_nonvectorized_modin_apply_on_small_dataframe") 1169 | md = self.modinSetUp() 1170 | df = md.DataFrame( 1171 | { 1172 | "letter": ["A", "B", "C", "D", "E"] * 200_000, 1173 | "value": np.random.normal(size=1_000_000), 1174 | } 1175 | ) 1176 | md_val = df.apply(text_foo, axis=1) 1177 | swifter_val = df.swifter.set_npartitions(4).apply(text_foo, axis=1) 1178 | self.assertEqual(md_val, swifter_val) # equality test 1179 | 1180 | def test_vectorized_modin_apply_on_large_dataframe(self): 1181 | LOG.info("test_vectorized_modin_apply_on_large_dataframe") 1182 | md = self.modinSetUp() 1183 | df = md.DataFrame( 1184 | { 1185 | "x": np.random.normal(size=1_000_000), 1186 | "y": np.random.uniform(size=1_000_000), 1187 | } 1188 | ) 1189 | start_md = time.time() 1190 | md_val = df.apply(math_vec_square, axis=1) 1191 | md_pd_val = md_val._to_pandas() # We have to bring it into pandas to confirm swifter apply speed is quicker 1192 | end_md = time.time() 1193 | md_time = end_md - start_md 1194 | 1195 | start_swifter = time.time() 1196 | swifter_val = df.swifter.set_npartitions(4).apply(math_vec_square, axis=1) 1197 | swifter_pd_val = ( 
1198 |             swifter_val._to_pandas()
1199 |         )  # We have to bring it into pandas to confirm swifter apply speed is quicker
1200 |         end_swifter = time.time()
1201 |         swifter_time = end_swifter - start_swifter
1202 |
1203 |         self.assertEqual(md_val, swifter_val)  # equality test
1204 |         self.assertEqual(md_pd_val, swifter_pd_val)  # equality test after converting to pandas
1205 |         self.assertLessLinux(swifter_time, md_time)  # speed test
1206 |
--------------------------------------------------------------------------------
/swifter/tqdm_dask_progressbar.py:
--------------------------------------------------------------------------------
1 | from dask.callbacks import Callback
2 | from tqdm.autonotebook import tqdm
3 |
4 |
5 | class TQDMDaskProgressBar(Callback):
6 |     """
7 |     A tqdm progress bar for dask.
8 |
9 |     Usage:
10 |     ```
11 |     with TQDMDaskProgressBar():
12 |         da.compute()
13 |     ```
14 |     Author : wassname
15 |     Source : https://gist.github.com/wassname/1837d0365247430e02abda41f0e7f184
16 |     See: http://dask.pydata.org/en/latest/diagnostics-local.html?highlight=progress
17 |     """
18 |
19 |     def __init__(self, start=None, start_state=None, pretask=None, posttask=None, finish=None, **kwargs):
20 |         super(TQDMDaskProgressBar, self).__init__(
21 |             start=start,
22 |             start_state=start_state,
23 |             pretask=pretask,
24 |             posttask=posttask,
25 |             finish=finish,
26 |         )
27 |         self.tqdm_args = kwargs
28 |         self.states = ["ready", "waiting", "running", "finished"]
29 |
30 |     def _start_state(self, dsk, state):
31 |         self._tqdm = tqdm(total=sum(len(state[k]) for k in self.states), **self.tqdm_args)
32 |
33 |     def _posttask(self, key, result, dsk, state, worker_id):
34 |         self._tqdm.update(1)
35 |
36 |     def _finish(self, dsk, state, errored):
37 |         self._tqdm.close()
38 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [flake8]
2 | # keep in sync with pyproject.toml
3 | exclude =
4 |     .git
5 |     # ignore cached
6 |     __pycache__
7 |     # data folder should only contain data. Don't lint
8 |     data
9 |     # don't lint sphinx docs
10 |     docs
11 | max-line-length = 120
12 | max-complexity = 10
13 | ignore = D203
14 |     # E203 whitespace before ':' (rule not PEP8 compliant)
15 |     E203
16 |     # W503 line break before binary operator (rule not PEP8 compliant)
17 |     W503
18 |
--------------------------------------------------------------------------------
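Taken together, the tests above double as a reference for swifter's public accessor API. Below is a minimal usage sketch distilled from the patterns those tests exercise (plain apply, rolling, and resample), assuming only that pandas, numpy, and swifter are installed; the example data and lambdas are illustrative, not part of the library.

```
import numpy as np
import pandas as pd
import swifter  # noqa: F401 -- importing swifter registers the .swifter accessor

df = pd.DataFrame(
    {"x": np.random.normal(size=10_000)},
    index=pd.date_range("2020-01-01", periods=10_000, freq="T"),
)

# plain apply: swifter vectorizes when it can, otherwise it samples the data
# and picks between pandas and dask, as the Transformation classes above show
squared = df["x"].swifter.progress_bar(enable=False).apply(lambda x: x**2)

# rolling and resample mirror the pandas API, per TestPandasTransformation
rolling_max = df.swifter.rolling("1d").apply(max, raw=True)
monthly = df.swifter.resample("1M").apply(lambda s: s.sum() - s.min())
```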