├── .coveragerc ├── .gitattributes ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.rst ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── ci ├── checks │ └── style.sh ├── cpu │ ├── build.sh │ └── dask-cudf │ │ ├── build_dask_cudf.sh │ │ └── upload-anaconda.sh ├── gpu │ └── build.sh └── release │ └── update-version.sh ├── conda ├── envs │ └── dev-environment.yml └── recipes │ └── dask-cudf │ ├── build.sh │ └── meta.yaml ├── dask_cudf ├── DASK_LICENSE.txt ├── __init__.py ├── _version.py ├── accessor.py ├── backends.py ├── batcher_sortnet.py ├── core.py ├── io │ ├── __init__.py │ ├── csv.py │ ├── json.py │ ├── orc.py │ ├── parquet.py │ └── tests │ │ ├── __init__.py │ │ ├── sample.orc │ │ ├── test_csv.py │ │ ├── test_json.py │ │ ├── test_orc.py │ │ ├── test_parquet.py │ │ └── test_s3.py ├── join_impl.py └── tests │ ├── __init__.py │ ├── test_accessor.py │ ├── test_batcher_sortnet.py │ ├── test_binops.py │ ├── test_core.py │ ├── test_delayed_io.py │ ├── test_distributed.py │ ├── test_groupby.py │ ├── test_join.py │ ├── test_reductions.py │ └── test_sort.py ├── gpuci_build.sh ├── requirements.txt ├── setup.cfg ├── setup.py └── versioneer.py /.coveragerc: -------------------------------------------------------------------------------- 1 | # Configuration file for Python coverage tests 2 | [run] 3 | include = dask_cudf/* 4 | omit = dask_cudf/tests/* 5 | 6 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | dask_cudf/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # vscode 104 | .vscode 105 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: stable 4 | hooks: 5 | - id: black 6 | language_version: python3.6 7 | - repo: https://github.com/pre-commit/pre-commit-hooks 8 | rev: v1.2.3 9 | hooks: 10 | - id: flake8 11 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | dask-cudf 0.8.0 (27 June 2019) 2 | ----------------------------- 3 | 4 | 5 | dask-cudf 0.7.0 (10 May 2019) 6 | ----------------------------- 7 | 8 | - Remove dependency on libgdf_cffi (#228) `Keith Kraus`_ 9 | - Update build process `Rick Ratzel`_ 10 | - Convert query to use standard dask query and update GPUCI to cudf 0.7 (#196) `Nick Becker`_ 11 | - Update GPU CI to use cudf 0.7 (#204) `Nick Becker`_ 12 | - Route single-partition merge cases through dask.dataframe (#194) `Matthew Rocklin`_ 13 | - Avoid compression warning in read_csv if chunksize=None (#192) `Matthew Rocklin`_ 14 | - Fix classifier (#182) `Ray Douglass`_ 15 | - Fix gpuCI build script (#173) `Dillon Cullinan`_ 16 | 17 | 18 | 0.6.1 - 2019-04-09 19 | ------------------ 20 | 21 | - Add cudf.DataFrame.mean = None (#205) `Matthew Rocklin`_ 22 | 23 | 24 | dask-cudf 0.6.0 (22 Mar 2019) 25 | ----------------------------- 26 | 27 | In this release we aligned Dask cuDF to the mainline Dask Dataframe 28 | codebase. This was made possible by an alignment of cuDF to Pandas, and 29 | resulted in us maintaining much less code in this repository. Dask cuDF 30 | dataframes are now just Dask DataFrames that contain cuDF dataframes, and have 31 | a few extra methods. 
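For example, operations on a dask-cudf DataFrame now flow through the regular Dask DataFrame machinery; the sketch below is illustrative only (the CSV glob and the ``key``/``x`` column names are assumed, not taken from this repository)::

    import dask.dataframe as dd
    import dask_cudf

    ddf = dask_cudf.read_csv("data-*.csv")   # hypothetical input files
    assert isinstance(ddf, dd.DataFrame)     # it is a mainline Dask DataFrame
    result = ddf.groupby("key").x.mean().compute()  # uses the mainline groupby/aggregation path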
32 | 33 | - Bump cudf to 0.6 (#162) `Keith Kraus`_ 34 | - Fix upload-anaconda to find the right package (#159) `Ray Douglass`_ 35 | - Add gpuCI (#151) `Mike Wendt`_ 36 | - Skip s3fs tests before importing dask.bytes.s3 (#153) `Matthew Rocklin`_ 37 | - Raise FileNotFoundError if no files found (#145) `Benjamin Zaitlen`_ 38 | - Add tests for repartition and indexed joins (#91) `Matthew Rocklin`_ 39 | - URLs for CSVs (#122) `Benjamin Zaitlen`_ 40 | - Rely on mainline concat function (#126) `Matthew Rocklin`_ 41 | - add test for generic idx test using loc (#121) `Benjamin Zaitlen`_ 42 | - Fix gzip `Benjamin Zaitlen`_ 43 | - Replace custom make_meta with mainline make_meta (#105) `Matthew Rocklin`_ 44 | - Cleanup dead code (#99) `Matthew Rocklin`_ 45 | - Remove from_cudf and from_dask_dataframe functions (#98) `Matthew Rocklin`_ 46 | - Increase default chunk size in read_csv (#95) `Matthew Rocklin`_ 47 | - Remove assertions outlawing inner joins (#89) `Matthew Rocklin`_ 48 | - Fix reset_index(drop=) keyword handling (#94) `Matthew Rocklin`_ 49 | - Add index= keyword to make_meta dispatch functions `Matthew Rocklin`_ 50 | - Use mainline groupby aggregation codebase (#69) `Matthew Rocklin`_ 51 | - remove dtype inference on chunks of data when parsing csv (#86) `Matthew Rocklin`_ 52 | - Avoid format strings to support Python 3.5 `Matthew Rocklin`_ 53 | - use byte_range when reading CSVs (#78) `Benjamin Zaitlen`_ 54 | - Move cudf dask backends code to backends file here (#75) `Matthew Rocklin`_ 55 | - Clean up join code (#70) `Matthew Rocklin`_ 56 | - Replace pygdf with cudf in README (#65) `Matthew Rocklin`_ 57 | - Add dask_cudf.io to setup.py packages (#60) `Matthew Rocklin`_ 58 | - Add basic read_csv implementation (#58) `Matthew Rocklin`_ 59 | - Add tests for repr (#56) `Matthew Rocklin`_ 60 | - Rename gd to cudf in tests `Matthew Rocklin`_ 61 | - add style instructions to README `Matthew Rocklin`_ 62 | - Apply isort to code `Matthew Rocklin`_ 63 | - Add pre-commit-config.yaml including black and flake8 `Matthew Rocklin`_ 64 | - Inherit from Dask Dataframe and respond to cudf update (#48) `Matthew Rocklin`_ 65 | - updating for new cuDF API `Matthew Jones`_ 66 | - add orc reader (#220) `Benjamin Zaitlen`_ 67 | 68 | .. _`Matthew Jones`: https://github.com/mt-jones 69 | .. _`Keith Kraus`: https://github.com/kkraus14 70 | .. _`Ray Douglass`: https://github.com/raydouglass 71 | .. _`Matthew Rocklin`: https://github.com/mrocklin 72 | .. _`Benjamin Zaitlen`: https://github.com/quasiben 73 | .. _`Mike Wendt`: https://github.com/mike-wendt 74 | .. _`Dillon Cullinan`: https://github.com/dillon-cullinan 75 | .. _`Nick Becker`: https://github.com/beckernick 76 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include dask_cudf *.py 2 | include versioneer.py 3 | include setup.py 4 | include README.rst 5 | include LICENSE.txt 6 | include MANIFEST.in 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dask GPU Dataframes 2 | 3 | A partitioned gpu-backed dataframe, using Dask. 4 | 5 | ## Setup from source 6 | 7 | Setup from source repo: 8 | 9 | 1. Install dependencies into a new conda environment where `CUDA_VERSION` is either 9.2 or 10 10 | 11 | conda create -n dask-cudf \ 12 | -c rapidsai -c numba -c conda-forge -c defaults \ 13 | cudf dask cudatoolkit={CUDA_VERSION} 14 | 15 | 2. Activate conda environment: 16 | 17 | source activate dask-cudf 18 | 19 | 3. Clone `dask-cudf` repo: 20 | 21 | git clone https://github.com/rapidsai/dask-cudf 22 | 23 | 4. Install from source: 24 | 25 | cd dask-cudf 26 | pip install . 27 | 28 | ## Test 29 | 30 | 1. Install `pytest` 31 | 32 | conda install pytest 33 | 34 | 2. Run all tests: 35 | 36 | py.test dask_cudf 37 | 38 | 3. Or, run individual tests: 39 | 40 | py.test dask_cudf/tests/test_file.py 41 | 42 | ## Style 43 | 44 | For style we use `black`, `isort`, and `flake8`. These are available as 45 | pre-commit hooks that will run every time you are about to commit code. 46 | 47 | From the root directory of this project run the following: 48 | 49 | ``` 50 | pip install pre-commit 51 | pre-commit install 52 | ``` 53 | -------------------------------------------------------------------------------- /ci/checks/style.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2019, NVIDIA CORPORATION. 3 | ########################## 4 | # dask-cudf Style Tester # 5 | ########################## 6 | 7 | # Ignore errors and set path 8 | set +e 9 | PATH=/conda/bin:$PATH 10 | 11 | # Activate common conda env 12 | source activate gdf 13 | 14 | # Run flake8 and get results/return code 15 | FLAKE=`flake8 python` 16 | RETVAL=$? 17 | 18 | # Output results if failure otherwise show pass 19 | if [ "$FLAKE" != "" ]; then 20 | echo -e "\n\n>>>> FAILED: flake8 style check; begin output\n\n" 21 | echo -e "$FLAKE" 22 | echo -e "\n\n>>>> FAILED: flake8 style check; end output\n\n" 23 | else 24 | echo -e "\n\n>>>> PASSED: flake8 style check\n\n" 25 | fi 26 | 27 | exit $RETVAL -------------------------------------------------------------------------------- /ci/cpu/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2019, NVIDIA CORPORATION. 
3 | ########################################### 4 | # dask-cudf CPU conda build script for CI # 5 | ########################################### 6 | set -e 7 | 8 | # Logger function for build status output 9 | function logger() { 10 | echo -e "\n>>>> $@\n" 11 | } 12 | 13 | # Set path and build parallel level 14 | export PATH=/conda/bin:/usr/local/cuda/bin:$PATH 15 | export PARALLEL_LEVEL=4 16 | 17 | # Set home to the job's workspace 18 | export HOME=$WORKSPACE 19 | 20 | # Switch to project root; also root of repo checkout 21 | cd $WORKSPACE 22 | 23 | # Get latest tag and number of commits since tag 24 | export GIT_DESCRIBE_TAG=`git describe --abbrev=0 --tags` 25 | export GIT_DESCRIBE_NUMBER=`git rev-list ${GIT_DESCRIBE_TAG}..HEAD --count` 26 | 27 | ################################################################################ 28 | # SETUP - Check environment 29 | ################################################################################ 30 | 31 | logger "Get env..." 32 | env 33 | 34 | logger "Activate conda env..." 35 | source activate gdf 36 | 37 | logger "Check versions..." 38 | python --version 39 | gcc --version 40 | g++ --version 41 | conda list 42 | 43 | # FIX Added to deal with Anancoda SSL verification issues during conda builds 44 | conda config --set ssl_verify False 45 | 46 | ################################################################################ 47 | # INSTALL - Install NVIDIA driver 48 | ################################################################################ 49 | 50 | logger "Install NVIDIA driver for CUDA $CUDA..." 51 | apt-get update -q 52 | DRIVER_VER="396.44-1" 53 | LIBCUDA_VER="396" 54 | if [ "$CUDA" == "10.0" ]; then 55 | DRIVER_VER="410.72-1" 56 | LIBCUDA_VER="410" 57 | fi 58 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 59 | cuda-drivers=${DRIVER_VER} libcuda1-${LIBCUDA_VER} 60 | 61 | ################################################################################ 62 | # BUILD - Conda package builds (conda deps: cudf <- dask-cudf) 63 | ################################################################################ 64 | 65 | logger "Build conda pkg for dask-cudf..." 66 | source ci/cpu/dask-cudf/build_dask_cudf.sh 67 | 68 | ################################################################################ 69 | # UPLOAD - Conda packages 70 | ################################################################################ 71 | 72 | logger "Upload conda pkg..." 
73 | source ci/cpu/dask-cudf/upload-anaconda.sh 74 | -------------------------------------------------------------------------------- /ci/cpu/dask-cudf/build_dask_cudf.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Logger function for build status output 5 | function logger() { 6 | echo -e "\n>>>> $@\n" 7 | } 8 | 9 | logger "Building dask_cudf" 10 | conda build conda/recipes/dask-cudf -c nvidia -c rapidsai -c rapidsai-nightly -c numba -c defaults -c conda-forge --python=$PYTHON 11 | -------------------------------------------------------------------------------- /ci/cpu/dask-cudf/upload-anaconda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Adopted from https://github.com/tmcdonell/travis-scripts/blob/dfaac280ac2082cd6bcaba3217428347899f2975/update-accelerate-buildbot.sh 4 | export UPLOADFILE=`conda build conda/recipes/dask-cudf -c nvidia -c rapidsai -c rapidsai-nightly -c numba -c defaults -c conda-forge --python=$PYTHON --output` 5 | 6 | set -e 7 | 8 | SOURCE_BRANCH=master 9 | 10 | test -e ${UPLOADFILE} 11 | 12 | LABEL_OPTION="--label main --label cuda9.2 --label cuda10.0" 13 | 14 | # Restrict uploads to master branch 15 | if [ ${GIT_BRANCH} != ${SOURCE_BRANCH} ]; then 16 | echo "Skipping upload" 17 | return 0 18 | fi 19 | 20 | if [ -z "$MY_UPLOAD_KEY" ]; then 21 | echo "No upload key" 22 | return 0 23 | fi 24 | 25 | echo "LABEL_OPTION=${LABEL_OPTION}" 26 | 27 | echo "Upload" 28 | echo ${UPLOADFILE} 29 | anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --force ${UPLOADFILE} 30 | -------------------------------------------------------------------------------- /ci/gpu/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2019, NVIDIA CORPORATION. 3 | ############################################## 4 | # dask-cudf GPU build and test script for CI # 5 | ############################################## 6 | set -e 7 | 8 | # Logger function for build status output 9 | function logger() { 10 | echo -e "\n>>>> $@\n" 11 | } 12 | 13 | # Set path and build parallel level 14 | export PATH=/conda/bin:/usr/local/cuda/bin:$PATH 15 | export PARALLEL_LEVEL=4 16 | export CUDA_REL=${CUDA_VERSION%.*} 17 | 18 | # Set home to the job's workspace 19 | export HOME=$WORKSPACE 20 | 21 | ################################################################################ 22 | # SETUP - Check environment 23 | ################################################################################ 24 | 25 | logger "Check environment..." 26 | env 27 | 28 | logger "Check GPU usage..." 29 | nvidia-smi 30 | 31 | logger "Activate conda env..." 32 | source activate gdf 33 | 34 | logger "Check versions..." 35 | python --version 36 | $CC --version 37 | $CXX --version 38 | 39 | logger "Setup new environment..." 40 | conda install \ 41 | 'cudf=0.8*' \ 42 | 'pyarrow=0.12.1' \ 43 | 'dask>=1.1.5' 44 | pip install git+https://github.com/dask/dask.git --upgrade --no-deps 45 | 46 | conda list 47 | 48 | logger "Python py.test for dask-cudf..." 49 | cd $WORKSPACE 50 | pip install -e . 
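# Run the dask_cudf test suite with coverage (settings come from the .coveragerc shown above) and write a JUnit XML report for CI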
51 | 52 | py.test --cache-clear --junitxml=${WORKSPACE}/junit-dask-cudf.xml -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:${WORKSPACE}/dask-cudf-coverage.xml --cov-report term 53 | 54 | -------------------------------------------------------------------------------- /ci/release/update-version.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ############################# 3 | # dask-cudf Version Updater # 4 | ############################# 5 | 6 | ## Usage 7 | # bash update-version.sh <release_type> 8 | # where <release_type> is either `major`, `minor`, `patch` 9 | 10 | set -e 11 | 12 | # Grab argument for release type 13 | RELEASE_TYPE=$1 14 | 15 | # Get current version and calculate next versions 16 | CURRENT_TAG=`git tag | grep -xE 'v[0-9\.]+' | sort --version-sort | tail -n 1 | tr -d 'v'` 17 | CURRENT_MAJOR=`echo $CURRENT_TAG | awk '{split($0, a, "."); print a[1]}'` 18 | CURRENT_MINOR=`echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}'` 19 | CURRENT_PATCH=`echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}'` 20 | NEXT_MAJOR=$((CURRENT_MAJOR + 1)) 21 | NEXT_MINOR=$((CURRENT_MINOR + 1)) 22 | NEXT_PATCH=$((CURRENT_PATCH + 1)) 23 | NEXT_FULL_TAG="" 24 | NEXT_SHORT_TAG="" 25 | 26 | # Determine release type 27 | if [ "$RELEASE_TYPE" == "major" ]; then 28 | NEXT_FULL_TAG="${NEXT_MAJOR}.0.0" 29 | NEXT_SHORT_TAG="${NEXT_MAJOR}.0" 30 | elif [ "$RELEASE_TYPE" == "minor" ]; then 31 | NEXT_FULL_TAG="${CURRENT_MAJOR}.${NEXT_MINOR}.0" 32 | NEXT_SHORT_TAG="${CURRENT_MAJOR}.${NEXT_MINOR}" 33 | elif [ "$RELEASE_TYPE" == "patch" ]; then 34 | NEXT_FULL_TAG="${CURRENT_MAJOR}.${CURRENT_MINOR}.${NEXT_PATCH}" 35 | NEXT_SHORT_TAG="${CURRENT_MAJOR}.${CURRENT_MINOR}" 36 | else 37 | echo "Incorrect release type; use 'major', 'minor', or 'patch' as an argument" 38 | exit 1 39 | fi 40 | 41 | echo "Preparing '$RELEASE_TYPE' release [$CURRENT_TAG -> $NEXT_FULL_TAG]" 42 | 43 | # In-place sed replace; workaround for Linux and Mac 44 | function sed_runner() { 45 | sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak 46 | } 47 | 48 | #No-op -------------------------------------------------------------------------------- /conda/envs/dev-environment.yml: -------------------------------------------------------------------------------- 1 | name: dask-cudf-dev 2 | channels: 3 | - nvidia 4 | - rapidsai 5 | - rapidsai-nightly 6 | - conda-forge 7 | - defaults 8 | dependencies: 9 | - cmake>=3.12 10 | - python>=3.6,<3.8 11 | - numba>=0.41 12 | - pandas>=0.23.4 13 | - pyarrow=0.12.1 14 | - notebook>=0.5.0 15 | - nvstrings 16 | - cython>=0.29,<0.30 17 | - pytest 18 | - sphinx 19 | - sphinx_rtd_theme 20 | - sphinxcontrib-websupport 21 | - nbsphinx 22 | - numpydoc 23 | - ipython 24 | - recommonmark 25 | - pytest 26 | - partd 27 | - moto 28 | - boto3 29 | - httpretty 30 | - flake8 31 | - dask 32 | - s3fs 33 | - pip: 34 | - git+https://github.com/dask/dask.git 35 | - git+https://github.com/dask/distributed.git 36 | 37 | -------------------------------------------------------------------------------- /conda/recipes/dask-cudf/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python setup.py install --single-version-externally-managed --record=record.txt 3 | -------------------------------------------------------------------------------- /conda/recipes/dask-cudf/meta.yaml: -------------------------------------------------------------------------------- 1 | # Usage: 2 | # conda build -c defaults -c conda-forge .
3 | {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} 4 | {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} 5 | {% set git_revision_count=environ.get('GIT_DESCRIBE_NUMBER', 0) %} 6 | {% set py_version=environ.get('CONDA_PY', 36) %} 7 | package: 8 | name: dask-cudf 9 | version: {{ version }} 10 | 11 | source: 12 | path: ../../.. 13 | 14 | build: 15 | number: {{ git_revision_count }} 16 | string: py{{ py_version }}_{{ git_revision_count }} 17 | 18 | requirements: 19 | host: 20 | - python x.x 21 | - cudf {{minor_version}}.* 22 | - dask >=1.2.2 23 | - distributed >=1.23.0 24 | run: 25 | - python x.x 26 | - cudf {{minor_version}}.* 27 | - dask >=1.2.2 28 | - distributed >=1.23.0 29 | test: 30 | imports: 31 | - dask_cudf 32 | 33 | about: 34 | home: http://rapids.ai 35 | license: Apache 36 | license_file: ../../../LICENSE.txt 37 | summary: dask-cudf library 38 | -------------------------------------------------------------------------------- /dask_cudf/DASK_LICENSE.txt: -------------------------------------------------------------------------------- 1 | This library contains modified code from the Dask library 2 | (https://github.com/dask/dask). The original Dask license is below. 3 | 4 | Copyright (c) 2014-2017, Continuum Analytics, Inc. and contributors 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | 13 | Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | Neither the name of Continuum Analytics nor the names of any contributors 18 | may be used to endorse or promote products derived from this software 19 | without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 25 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 31 | THE POSSIBILITY OF SUCH DAMAGE. 32 | -------------------------------------------------------------------------------- /dask_cudf/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import ( 2 | DataFrame, 3 | Series, 4 | from_cudf, 5 | from_dask_dataframe, 6 | concat, 7 | from_delayed, 8 | ) 9 | from .io import read_csv, read_orc, read_json, read_parquet 10 | from . 
import backends 11 | 12 | import cudf 13 | from cudf._version import get_versions 14 | 15 | __version__ = get_versions()["version"] 16 | del get_versions 17 | 18 | __all__ = [ 19 | "DataFrame", 20 | "Series", 21 | "from_cudf", 22 | "from_dask_dataframe", 23 | "concat", 24 | "from_delayed", 25 | ] 26 | 27 | if not hasattr(cudf.DataFrame, "mean"): 28 | cudf.DataFrame.mean = None 29 | del cudf 30 | -------------------------------------------------------------------------------- /dask_cudf/_version.py: -------------------------------------------------------------------------------- 1 | # This file helps to compute a version number in source trees obtained from 2 | # git-archive tarball (such as those provided by githubs download-from-tag 3 | # feature). Distribution tarballs (built by setup.py sdist) and build 4 | # directories (produced by setup.py build) will contain a much shorter file 5 | # that just contains the computed version number. 6 | 7 | # This file is released into the public domain. Generated by 8 | # versioneer-0.18 (https://github.com/warner/python-versioneer) 9 | 10 | """Git implementation of _version.py.""" 11 | 12 | import errno 13 | import os 14 | import re 15 | import subprocess 16 | import sys 17 | 18 | 19 | def get_keywords(): 20 | """Get the keywords needed to look up the version information.""" 21 | # these strings will be replaced by git during git-archive. 22 | # setup.py/versioneer.py will grep for the variable names, so they must 23 | # each be defined on a line of their own. _version.py will just call 24 | # get_keywords(). 25 | git_refnames = " (HEAD -> branch-0.9, tag: v0.9.0a1)" 26 | git_full = "b566ab60ea69e6e165533b68b1966875528afb06" 27 | git_date = "2019-06-25 16:45:30 -0400" 28 | keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} 29 | return keywords 30 | 31 | 32 | class VersioneerConfig: 33 | """Container for Versioneer configuration parameters.""" 34 | 35 | 36 | def get_config(): 37 | """Create, populate and return the VersioneerConfig() object.""" 38 | # these strings are filled in when 'setup.py versioneer' creates 39 | # _version.py 40 | cfg = VersioneerConfig() 41 | cfg.VCS = "git" 42 | cfg.style = "pep440" 43 | cfg.tag_prefix = "" 44 | cfg.parentdir_prefix = "dask_cudf-" 45 | cfg.versionfile_source = "dask_cudf/_version.py" 46 | cfg.verbose = False 47 | return cfg 48 | 49 | 50 | class NotThisMethod(Exception): 51 | """Exception raised if a method is not valid for the current scenario.""" 52 | 53 | 54 | LONG_VERSION_PY = {} 55 | HANDLERS = {} 56 | 57 | 58 | def register_vcs_handler(vcs, method): # decorator 59 | """Decorator to mark a method as the handler for a particular VCS.""" 60 | 61 | def decorate(f): 62 | """Store f in HANDLERS[vcs][method].""" 63 | if vcs not in HANDLERS: 64 | HANDLERS[vcs] = {} 65 | HANDLERS[vcs][method] = f 66 | return f 67 | 68 | return decorate 69 | 70 | 71 | def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): 72 | """Call the given command(s).""" 73 | assert isinstance(commands, list) 74 | p = None 75 | for c in commands: 76 | try: 77 | dispcmd = str([c] + args) 78 | # remember shell=False, so use git.cmd on windows, not just git 79 | p = subprocess.Popen( 80 | [c] + args, 81 | cwd=cwd, 82 | env=env, 83 | stdout=subprocess.PIPE, 84 | stderr=(subprocess.PIPE if hide_stderr else None), 85 | ) 86 | break 87 | except EnvironmentError: 88 | e = sys.exc_info()[1] 89 | if e.errno == errno.ENOENT: 90 | continue 91 | if verbose: 92 | print("unable to run %s" % dispcmd) 93 
| print(e) 94 | return None, None 95 | else: 96 | if verbose: 97 | print("unable to find command, tried %s" % (commands,)) 98 | return None, None 99 | stdout = p.communicate()[0].strip() 100 | if sys.version_info[0] >= 3: 101 | stdout = stdout.decode() 102 | if p.returncode != 0: 103 | if verbose: 104 | print("unable to run %s (error)" % dispcmd) 105 | print("stdout was %s" % stdout) 106 | return None, p.returncode 107 | return stdout, p.returncode 108 | 109 | 110 | def versions_from_parentdir(parentdir_prefix, root, verbose): 111 | """Try to determine the version from the parent directory name. 112 | 113 | Source tarballs conventionally unpack into a directory that includes both 114 | the project name and a version string. We will also support searching up 115 | two directory levels for an appropriately named parent directory 116 | """ 117 | rootdirs = [] 118 | 119 | for i in range(3): 120 | dirname = os.path.basename(root) 121 | if dirname.startswith(parentdir_prefix): 122 | return { 123 | "version": dirname[len(parentdir_prefix) :], 124 | "full-revisionid": None, 125 | "dirty": False, 126 | "error": None, 127 | "date": None, 128 | } 129 | else: 130 | rootdirs.append(root) 131 | root = os.path.dirname(root) # up a level 132 | 133 | if verbose: 134 | print( 135 | "Tried directories %s but none started with prefix %s" 136 | % (str(rootdirs), parentdir_prefix) 137 | ) 138 | raise NotThisMethod("rootdir doesn't start with parentdir_prefix") 139 | 140 | 141 | @register_vcs_handler("git", "get_keywords") 142 | def git_get_keywords(versionfile_abs): 143 | """Extract version information from the given file.""" 144 | # the code embedded in _version.py can just fetch the value of these 145 | # keywords. When used from setup.py, we don't want to import _version.py, 146 | # so we do it with a regexp instead. This function is not used from 147 | # _version.py. 148 | keywords = {} 149 | try: 150 | f = open(versionfile_abs, "r") 151 | for line in f.readlines(): 152 | if line.strip().startswith("git_refnames ="): 153 | mo = re.search(r'=\s*"(.*)"', line) 154 | if mo: 155 | keywords["refnames"] = mo.group(1) 156 | if line.strip().startswith("git_full ="): 157 | mo = re.search(r'=\s*"(.*)"', line) 158 | if mo: 159 | keywords["full"] = mo.group(1) 160 | if line.strip().startswith("git_date ="): 161 | mo = re.search(r'=\s*"(.*)"', line) 162 | if mo: 163 | keywords["date"] = mo.group(1) 164 | f.close() 165 | except EnvironmentError: 166 | pass 167 | return keywords 168 | 169 | 170 | @register_vcs_handler("git", "keywords") 171 | def git_versions_from_keywords(keywords, tag_prefix, verbose): 172 | """Get version information from git keywords.""" 173 | if not keywords: 174 | raise NotThisMethod("no keywords at all, weird") 175 | date = keywords.get("date") 176 | if date is not None: 177 | # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant 178 | # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 179 | # -like" string, which we must then edit to make compliant), because 180 | # it's been around since git-1.5.3, and it's too difficult to 181 | # discover which version we're using, or to work around using an 182 | # older one. 
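# For example, the keyword date "2019-06-25 16:45:30 -0400" above becomes "2019-06-25T16:45:30-0400" after the two replace() calls below.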
183 | date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) 184 | refnames = keywords["refnames"].strip() 185 | if refnames.startswith("$Format"): 186 | if verbose: 187 | print("keywords are unexpanded, not using") 188 | raise NotThisMethod("unexpanded keywords, not a git-archive tarball") 189 | refs = set([r.strip() for r in refnames.strip("()").split(",")]) 190 | # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of 191 | # just "foo-1.0". If we see a "tag: " prefix, prefer those. 192 | TAG = "tag: " 193 | tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) 194 | if not tags: 195 | # Either we're using git < 1.8.3, or there really are no tags. We use 196 | # a heuristic: assume all version tags have a digit. The old git %d 197 | # expansion behaves like git log --decorate=short and strips out the 198 | # refs/heads/ and refs/tags/ prefixes that would let us distinguish 199 | # between branches and tags. By ignoring refnames without digits, we 200 | # filter out many common branch names like "release" and 201 | # "stabilization", as well as "HEAD" and "master". 202 | tags = set([r for r in refs if re.search(r"\d", r)]) 203 | if verbose: 204 | print("discarding '%s', no digits" % ",".join(refs - tags)) 205 | if verbose: 206 | print("likely tags: %s" % ",".join(sorted(tags))) 207 | for ref in sorted(tags): 208 | # sorting will prefer e.g. "2.0" over "2.0rc1" 209 | if ref.startswith(tag_prefix): 210 | r = ref[len(tag_prefix) :] 211 | if verbose: 212 | print("picking %s" % r) 213 | return { 214 | "version": r, 215 | "full-revisionid": keywords["full"].strip(), 216 | "dirty": False, 217 | "error": None, 218 | "date": date, 219 | } 220 | # no suitable tags, so version is "0+unknown", but full hex is still there 221 | if verbose: 222 | print("no suitable tags, using unknown + full revision id") 223 | return { 224 | "version": "0+unknown", 225 | "full-revisionid": keywords["full"].strip(), 226 | "dirty": False, 227 | "error": "no suitable tags", 228 | "date": None, 229 | } 230 | 231 | 232 | @register_vcs_handler("git", "pieces_from_vcs") 233 | def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): 234 | """Get version from 'git describe' in the root of the source tree. 235 | 236 | This only gets called if the git-archive 'subst' keywords were *not* 237 | expanded, and _version.py hasn't already been rewritten with a short 238 | version string, meaning we're inside a checked out source tree. 
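The returned *pieces* mapping carries the fields consumed by the render helpers below: "long", "short", "closest-tag", "distance", "dirty", "date" and "error".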
239 | """ 240 | GITS = ["git"] 241 | if sys.platform == "win32": 242 | GITS = ["git.cmd", "git.exe"] 243 | 244 | out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) 245 | if rc != 0: 246 | if verbose: 247 | print("Directory %s not under git control" % root) 248 | raise NotThisMethod("'git rev-parse --git-dir' returned error") 249 | 250 | # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] 251 | # if there isn't one, this yields HEX[-dirty] (no NUM) 252 | describe_out, rc = run_command( 253 | GITS, 254 | [ 255 | "describe", 256 | "--tags", 257 | "--dirty", 258 | "--always", 259 | "--long", 260 | "--match", 261 | "%s*" % tag_prefix, 262 | ], 263 | cwd=root, 264 | ) 265 | # --long was added in git-1.5.5 266 | if describe_out is None: 267 | raise NotThisMethod("'git describe' failed") 268 | describe_out = describe_out.strip() 269 | full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) 270 | if full_out is None: 271 | raise NotThisMethod("'git rev-parse' failed") 272 | full_out = full_out.strip() 273 | 274 | pieces = {} 275 | pieces["long"] = full_out 276 | pieces["short"] = full_out[:7] # maybe improved later 277 | pieces["error"] = None 278 | 279 | # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] 280 | # TAG might have hyphens. 281 | git_describe = describe_out 282 | 283 | # look for -dirty suffix 284 | dirty = git_describe.endswith("-dirty") 285 | pieces["dirty"] = dirty 286 | if dirty: 287 | git_describe = git_describe[: git_describe.rindex("-dirty")] 288 | 289 | # now we have TAG-NUM-gHEX or HEX 290 | 291 | if "-" in git_describe: 292 | # TAG-NUM-gHEX 293 | mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) 294 | if not mo: 295 | # unparseable. Maybe git-describe is misbehaving? 296 | pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out 297 | return pieces 298 | 299 | # tag 300 | full_tag = mo.group(1) 301 | if not full_tag.startswith(tag_prefix): 302 | if verbose: 303 | fmt = "tag '%s' doesn't start with prefix '%s'" 304 | print(fmt % (full_tag, tag_prefix)) 305 | pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( 306 | full_tag, 307 | tag_prefix, 308 | ) 309 | return pieces 310 | pieces["closest-tag"] = full_tag[len(tag_prefix) :] 311 | 312 | # distance: number of commits since tag 313 | pieces["distance"] = int(mo.group(2)) 314 | 315 | # commit: short hex revision ID 316 | pieces["short"] = mo.group(3) 317 | 318 | else: 319 | # HEX: no tags 320 | pieces["closest-tag"] = None 321 | count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) 322 | pieces["distance"] = int(count_out) # total number of commits 323 | 324 | # commit date: see ISO-8601 comment in git_versions_from_keywords() 325 | date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ 326 | 0 327 | ].strip() 328 | pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) 329 | 330 | return pieces 331 | 332 | 333 | def plus_or_dot(pieces): 334 | """Return a + if we don't already have one, else return a .""" 335 | if "+" in pieces.get("closest-tag", ""): 336 | return "." 337 | return "+" 338 | 339 | 340 | def render_pep440(pieces): 341 | """Build up version string, with post-release "local version identifier". 342 | 343 | Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you 344 | get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty 345 | 346 | Exceptions: 347 | 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] 348 | """ 349 | if pieces["closest-tag"]: 350 | rendered = pieces["closest-tag"] 351 | if pieces["distance"] or pieces["dirty"]: 352 | rendered += plus_or_dot(pieces) 353 | rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) 354 | if pieces["dirty"]: 355 | rendered += ".dirty" 356 | else: 357 | # exception #1 358 | rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) 359 | if pieces["dirty"]: 360 | rendered += ".dirty" 361 | return rendered 362 | 363 | 364 | def render_pep440_pre(pieces): 365 | """TAG[.post.devDISTANCE] -- No -dirty. 366 | 367 | Exceptions: 368 | 1: no tags. 0.post.devDISTANCE 369 | """ 370 | if pieces["closest-tag"]: 371 | rendered = pieces["closest-tag"] 372 | if pieces["distance"]: 373 | rendered += ".post.dev%d" % pieces["distance"] 374 | else: 375 | # exception #1 376 | rendered = "0.post.dev%d" % pieces["distance"] 377 | return rendered 378 | 379 | 380 | def render_pep440_post(pieces): 381 | """TAG[.postDISTANCE[.dev0]+gHEX] . 382 | 383 | The ".dev0" means dirty. Note that .dev0 sorts backwards 384 | (a dirty tree will appear "older" than the corresponding clean one), 385 | but you shouldn't be releasing software with -dirty anyways. 386 | 387 | Exceptions: 388 | 1: no tags. 0.postDISTANCE[.dev0] 389 | """ 390 | if pieces["closest-tag"]: 391 | rendered = pieces["closest-tag"] 392 | if pieces["distance"] or pieces["dirty"]: 393 | rendered += ".post%d" % pieces["distance"] 394 | if pieces["dirty"]: 395 | rendered += ".dev0" 396 | rendered += plus_or_dot(pieces) 397 | rendered += "g%s" % pieces["short"] 398 | else: 399 | # exception #1 400 | rendered = "0.post%d" % pieces["distance"] 401 | if pieces["dirty"]: 402 | rendered += ".dev0" 403 | rendered += "+g%s" % pieces["short"] 404 | return rendered 405 | 406 | 407 | def render_pep440_old(pieces): 408 | """TAG[.postDISTANCE[.dev0]] . 409 | 410 | The ".dev0" means dirty. 411 | 412 | Eexceptions: 413 | 1: no tags. 0.postDISTANCE[.dev0] 414 | """ 415 | if pieces["closest-tag"]: 416 | rendered = pieces["closest-tag"] 417 | if pieces["distance"] or pieces["dirty"]: 418 | rendered += ".post%d" % pieces["distance"] 419 | if pieces["dirty"]: 420 | rendered += ".dev0" 421 | else: 422 | # exception #1 423 | rendered = "0.post%d" % pieces["distance"] 424 | if pieces["dirty"]: 425 | rendered += ".dev0" 426 | return rendered 427 | 428 | 429 | def render_git_describe(pieces): 430 | """TAG[-DISTANCE-gHEX][-dirty]. 431 | 432 | Like 'git describe --tags --dirty --always'. 433 | 434 | Exceptions: 435 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) 436 | """ 437 | if pieces["closest-tag"]: 438 | rendered = pieces["closest-tag"] 439 | if pieces["distance"]: 440 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) 441 | else: 442 | # exception #1 443 | rendered = pieces["short"] 444 | if pieces["dirty"]: 445 | rendered += "-dirty" 446 | return rendered 447 | 448 | 449 | def render_git_describe_long(pieces): 450 | """TAG-DISTANCE-gHEX[-dirty]. 451 | 452 | Like 'git describe --tags --dirty --always -long'. 453 | The distance/hash is unconditional. 454 | 455 | Exceptions: 456 | 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) 457 | """ 458 | if pieces["closest-tag"]: 459 | rendered = pieces["closest-tag"] 460 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) 461 | else: 462 | # exception #1 463 | rendered = pieces["short"] 464 | if pieces["dirty"]: 465 | rendered += "-dirty" 466 | return rendered 467 | 468 | 469 | def render(pieces, style): 470 | """Render the given version pieces into the requested style.""" 471 | if pieces["error"]: 472 | return { 473 | "version": "unknown", 474 | "full-revisionid": pieces.get("long"), 475 | "dirty": None, 476 | "error": pieces["error"], 477 | "date": None, 478 | } 479 | 480 | if not style or style == "default": 481 | style = "pep440" # the default 482 | 483 | if style == "pep440": 484 | rendered = render_pep440(pieces) 485 | elif style == "pep440-pre": 486 | rendered = render_pep440_pre(pieces) 487 | elif style == "pep440-post": 488 | rendered = render_pep440_post(pieces) 489 | elif style == "pep440-old": 490 | rendered = render_pep440_old(pieces) 491 | elif style == "git-describe": 492 | rendered = render_git_describe(pieces) 493 | elif style == "git-describe-long": 494 | rendered = render_git_describe_long(pieces) 495 | else: 496 | raise ValueError("unknown style '%s'" % style) 497 | 498 | return { 499 | "version": rendered, 500 | "full-revisionid": pieces["long"], 501 | "dirty": pieces["dirty"], 502 | "error": None, 503 | "date": pieces.get("date"), 504 | } 505 | 506 | 507 | def get_versions(): 508 | """Get version information or return default if unable to do so.""" 509 | # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have 510 | # __file__, we can work backwards from there to the root. Some 511 | # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which 512 | # case we can only use expanded keywords. 513 | 514 | cfg = get_config() 515 | verbose = cfg.verbose 516 | 517 | try: 518 | return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) 519 | except NotThisMethod: 520 | pass 521 | 522 | try: 523 | root = os.path.realpath(__file__) 524 | # versionfile_source is the relative path from the top of the source 525 | # tree (where the .git directory might live) to this file. Invert 526 | # this to find the root from __file__. 527 | for i in cfg.versionfile_source.split("/"): 528 | root = os.path.dirname(root) 529 | except NameError: 530 | return { 531 | "version": "0+unknown", 532 | "full-revisionid": None, 533 | "dirty": None, 534 | "error": "unable to find root of source tree", 535 | "date": None, 536 | } 537 | 538 | try: 539 | pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) 540 | return render(pieces, cfg.style) 541 | except NotThisMethod: 542 | pass 543 | 544 | try: 545 | if cfg.parentdir_prefix: 546 | return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) 547 | except NotThisMethod: 548 | pass 549 | 550 | return { 551 | "version": "0+unknown", 552 | "full-revisionid": None, 553 | "dirty": None, 554 | "error": "unable to compute version", 555 | "date": None, 556 | } 557 | -------------------------------------------------------------------------------- /dask_cudf/accessor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2008-2012, AQR Capital Management, LLC, Lambda Foundry, Inc. 2 | # and PyData Development Team 3 | # All rights reserved. 4 | 5 | # Copyright (c) 2014-2018, Anaconda, Inc. and contributors 6 | # All rights reserved. 
7 | 8 | """ 9 | 10 | accessor.py contains classes for implementing 11 | accessor properties. 12 | 13 | """ 14 | 15 | from toolz import partial 16 | 17 | import cudf 18 | from cudf.dataframe.categorical import CategoricalAccessor as GdfCategoricalAccessor 19 | from cudf.dataframe.series import DatetimeProperties 20 | 21 | # Adapted from 22 | # https://github.com/dask/dask/blob/master/dask/dataframe/accessor.py 23 | 24 | 25 | class Accessor(object): 26 | """ 27 | Base class for Accessor objects dt, str, cat. 28 | 29 | Notes 30 | ----- 31 | Subclasses should define the following attributes: 32 | * _accessor 33 | * _accessor_name 34 | 35 | Subclasses should also implement the following methods: 36 | * _validate() 37 | 38 | """ 39 | 40 | _not_implemented = frozenset([]) 41 | 42 | def __init__(self, series): 43 | from .core import Series 44 | 45 | if not isinstance(series, Series): 46 | raise ValueError("Accessor cannot be initialized") 47 | self._series = series 48 | self._validate(series) 49 | 50 | def _validate(self, series): 51 | """ Validates the data type of series passed to the 52 | accessor. 53 | """ 54 | raise NotImplementedError("Must implement") 55 | 56 | @staticmethod 57 | def _delegate_property(obj, accessor, attr): 58 | out = getattr(getattr(obj, accessor, obj), attr) 59 | return out 60 | 61 | @staticmethod 62 | def _delegate_method(obj, accessor, attr, args, kwargs): 63 | out = getattr(getattr(obj, accessor, obj), attr)(*args, **kwargs) 64 | return out 65 | 66 | def _property_map(self, attr): 67 | meta = self._delegate_property(self._series._meta, self._accessor_name, attr) 68 | token = "%s-%s" % (self._accessor_name, attr) 69 | return self._series.map_partitions( 70 | self._delegate_property, self._accessor_name, attr, token=token, meta=meta 71 | ) 72 | 73 | def _function_map(self, attr, *args, **kwargs): 74 | meta = self._delegate_method( 75 | self._series._meta_nonempty, self._accessor_name, attr, args, kwargs 76 | ) 77 | token = "%s-%s" % (self._accessor_name, attr) 78 | return self._series.map_partitions( 79 | self._delegate_method, 80 | self._accessor_name, 81 | attr, 82 | args, 83 | kwargs, 84 | meta=meta, 85 | token=token, 86 | ) 87 | 88 | @property 89 | def _delegates(self): 90 | return set(dir(self._accessor)).difference(self._not_implemented) 91 | 92 | def __dir__(self): 93 | o = self._delegates 94 | o.update(self.__dict__) 95 | o.update(dir(type(self))) 96 | return list(o) 97 | 98 | def __getattr__(self, key): 99 | if key in self._delegates: 100 | if isinstance(getattr(self._accessor, key), property): 101 | return self._property_map(key) 102 | else: 103 | return partial(self._function_map, key) 104 | else: 105 | raise AttributeError(key) 106 | 107 | 108 | # Adapted from 109 | # https://github.com/pandas-dev/pandas/blob/master/pandas/core/accessor.py 110 | 111 | 112 | class CachedAccessor(object): 113 | """Custom property-like object (descriptor) for caching accessors. 114 | Parameters 115 | ---------- 116 | name : str 117 | The namespace this will be accessed under, e.g. ``df.timestamp.dt`` 118 | accessor : cls 119 | The class with the extension methods. 
The class' __init__ method 120 | should expect a ``Series`` as the single argument ``data`` 121 | """ 122 | 123 | def __init__(self, name, accessor): 124 | self._name = name 125 | self._accessor = accessor 126 | 127 | def __get__(self, obj, cls): 128 | if obj is None: 129 | # we're accessing the attribute of the class, i.e., Dataset.geo 130 | return self._accessor 131 | accessor_obj = self._accessor(obj) 132 | return accessor_obj 133 | 134 | 135 | class DatetimeAccessor(Accessor): 136 | """ Accessor object for datetimelike properties of the Series values. 137 | """ 138 | 139 | _accessor = DatetimeProperties 140 | _accessor_name = "dt" 141 | 142 | def _validate(self, series): 143 | if not isinstance(series._meta._column, cudf.dataframe.DatetimeColumn): 144 | raise AttributeError( 145 | "Can only use .dt accessor with datetimelike " "values" 146 | ) 147 | 148 | 149 | class CategoricalAccessor(Accessor): 150 | """ Accessor object for categorical properties of the Series values 151 | of Categorical type. 152 | """ 153 | 154 | _accessor = GdfCategoricalAccessor 155 | _accessor_name = "cat" 156 | ordered = True 157 | 158 | def _validate(self, series): 159 | if not isinstance( 160 | series._meta._column, cudf.dataframe.categorical.CategoricalColumn 161 | ): 162 | raise AttributeError("Can only use .cat accessor with categorical values") 163 | -------------------------------------------------------------------------------- /dask_cudf/backends.py: -------------------------------------------------------------------------------- 1 | from dask.dataframe.methods import concat_dispatch 2 | from dask.dataframe.core import get_parallel_type, meta_nonempty, make_meta 3 | import cudf 4 | 5 | from .core import DataFrame, Series, Index 6 | 7 | 8 | get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame) 9 | get_parallel_type.register(cudf.Series, lambda _: Series) 10 | get_parallel_type.register(cudf.Index, lambda _: Index) 11 | 12 | 13 | @meta_nonempty.register((cudf.DataFrame, cudf.Series, cudf.Index)) 14 | def meta_nonempty_cudf(x, index=None): 15 | y = meta_nonempty(x.to_pandas()) # TODO: add iloc[:5] 16 | return cudf.from_pandas(y) 17 | 18 | 19 | @make_meta.register((cudf.Series, cudf.DataFrame)) 20 | def make_meta_cudf(x, index=None): 21 | return x.head(0) 22 | 23 | 24 | @make_meta.register(cudf.Index) 25 | def make_meta_cudf_index(x, index=None): 26 | return x[:0] 27 | 28 | 29 | @concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index)) 30 | def concat_cudf(dfs, axis=0, join="outer", uniform=False, filter_warning=True): 31 | assert axis == 0 32 | assert join == "outer" 33 | return cudf.concat(dfs) 34 | -------------------------------------------------------------------------------- /dask_cudf/batcher_sortnet.py: -------------------------------------------------------------------------------- 1 | """ 2 | Batcher's Odd-even sorting network 3 | Adapted from https://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort 4 | """ 5 | import math 6 | 7 | from dask import compute, delayed 8 | 9 | import cudf as gd 10 | 11 | 12 | def get_oversized(length): 13 | """ 14 | The oddeven network requires a power-of-2 length. 15 | This method computes the next power-of-2 from the *length* if 16 | *length* is not a power-of-2 value. 
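    A small illustrative check (assuming the formula in the body below):

    >>> get_oversized(5)
    8
    >>> get_oversized(8)
    8
    >>> is_power_of_2(8)
    True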
17 | """ 18 | return 2 ** math.ceil(math.log2(length)) 19 | 20 | 21 | def is_power_of_2(length): 22 | return math.log2(length).is_integer() 23 | 24 | 25 | def oddeven_merge(lo, hi, r): 26 | step = r * 2 27 | if step < hi - lo: 28 | for each in oddeven_merge(lo, hi, step): 29 | yield each 30 | for each in oddeven_merge(lo + r, hi, step): 31 | yield each 32 | for i in range(lo + r, hi - r, step): 33 | yield (i, i + r) 34 | else: 35 | yield (lo, lo + r) 36 | 37 | 38 | def oddeven_merge_sort_range(lo, hi): 39 | """ sort the part of x with indices between lo and hi. 40 | 41 | Note: endpoints (lo and hi) are included. 42 | """ 43 | if (hi - lo) >= 1: 44 | # if there is more than one element, split the input 45 | # down the middle and first sort the first and second 46 | # half, followed by merging them. 47 | mid = lo + ((hi - lo) // 2) 48 | for each in oddeven_merge_sort_range(lo, mid): 49 | yield each 50 | for each in oddeven_merge_sort_range(mid + 1, hi): 51 | yield each 52 | for each in oddeven_merge(lo, hi, 1): 53 | yield each 54 | 55 | 56 | def oddeven_merge_sort(length): 57 | """ "length" is the length of the list to be sorted. 58 | Returns a list of pairs of indices starting with 0 """ 59 | assert is_power_of_2(length) 60 | for each in oddeven_merge_sort_range(0, length - 1): 61 | yield each 62 | 63 | 64 | def _pad_data_to_length(parts): 65 | parts = list(parts) 66 | needed = get_oversized(len(parts)) 67 | padn = needed - len(parts) 68 | return parts + [None] * padn, len(parts) 69 | 70 | 71 | def _compare_frame(a, b, max_part_size, by): 72 | if a is not None and b is not None: 73 | joint = gd.concat([a, b]) 74 | sorten = joint.sort_values(by=by) 75 | # Split the sorted frame using the *max_part_size* 76 | lhs, rhs = sorten[:max_part_size], sorten[max_part_size:] 77 | # Replace empty frame with None 78 | return lhs or None, rhs or None 79 | elif a is None and b is None: 80 | return None, None 81 | elif a is None: 82 | return b.sort_values(by=by), None 83 | else: 84 | return a.sort_values(by=by), None 85 | 86 | 87 | def _compare_and_swap_frame(parts, a, b, max_part_size, by): 88 | compared = delayed(_compare_frame)(parts[a], parts[b], max_part_size, by=by) 89 | parts[a] = compared[0] 90 | parts[b] = compared[1] 91 | 92 | 93 | def _cleanup(df): 94 | if "__dask_cudf__valid" in df.columns: 95 | out = df.query("__dask_cudf__valid") 96 | del out["__dask_cudf__valid"] 97 | else: 98 | out = df 99 | return out 100 | 101 | 102 | def sort_delayed_frame(parts, by): 103 | """ 104 | Parameters 105 | ---------- 106 | parts : 107 | Delayed partitions of cudf.DataFrame 108 | by : str 109 | Column name by which to sort 110 | 111 | The sort will also rebalance the partition sizes so that all output 112 | partitions has partition size of atmost `max(original_partition_sizes)`. 113 | Therefore, they may be fewer partitions in the output. 114 | """ 115 | # Empty frame? 116 | if len(parts) == 0: 117 | return parts 118 | # Compute maximum paritition size, which is needed 119 | # for non-uniform partition size 120 | max_part_size = delayed(max)(*map(delayed(len), parts)) 121 | # Add empty partitions to match power-of-2 requirement. 122 | parts, valid = _pad_data_to_length(parts) 123 | # More than 1 input? 124 | if len(parts) > 1: 125 | # Build batcher's odd-even sorting network 126 | for a, b in oddeven_merge_sort(len(parts)): 127 | _compare_and_swap_frame(parts, a, b, max_part_size, by=by) 128 | # Single input? 
129 | else: 130 | parts = [delayed(lambda x: x.sort_values(by=by))(parts[0])] 131 | # Count number of non-empty partitions 132 | valid_ct = delayed(sum)( 133 | list(map(delayed(lambda x: int(x is not None)), parts[:valid])) 134 | ) 135 | valid = compute(valid_ct)[0] 136 | validparts = parts[:valid] 137 | return validparts 138 | -------------------------------------------------------------------------------- /dask_cudf/core.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. 2 | import warnings 3 | from collections import OrderedDict 4 | 5 | import pandas as pd 6 | 7 | import dask 8 | import dask.dataframe as dd 9 | import numpy as np 10 | from dask import compute 11 | from dask.base import normalize_token, tokenize 12 | from dask.compatibility import apply 13 | from dask.context import _globals 14 | from dask.core import flatten 15 | from dask.dataframe import from_delayed 16 | from dask.dataframe.core import Scalar, handle_out, map_partitions 17 | from dask.dataframe.utils import raise_on_meta_error 18 | from dask.delayed import delayed 19 | from dask.optimization import cull, fuse 20 | from dask.utils import M, OperatorMethodMixin, funcname, derived_from 21 | from toolz import partition_all 22 | 23 | import cudf 24 | import cudf.bindings.reduce as cpp_reduce 25 | from dask_cudf import batcher_sortnet, join_impl 26 | from dask_cudf.accessor import CachedAccessor, CategoricalAccessor, DatetimeAccessor 27 | 28 | 29 | def optimize(dsk, keys, **kwargs): 30 | flatkeys = list(flatten(keys)) if isinstance(keys, list) else [keys] 31 | dsk, dependencies = cull(dsk, flatkeys) 32 | dsk, dependencies = fuse( 33 | dsk, 34 | keys, 35 | dependencies=dependencies, 36 | ave_width=_globals.get("fuse_ave_width", 1), 37 | ) 38 | dsk, _ = cull(dsk, keys) 39 | return dsk 40 | 41 | 42 | def finalize(results): 43 | return cudf.concat(results) 44 | 45 | 46 | class _Frame(dd.core._Frame, OperatorMethodMixin): 47 | """ Superclass for DataFrame and Series 48 | 49 | Parameters 50 | ---------- 51 | dsk : dict 52 | The dask graph to compute this DataFrame 53 | name : str 54 | The key prefix that specifies which keys in the dask comprise this 55 | particular DataFrame / Series 56 | meta : cudf.DataFrame, cudf.Series, or cudf.Index 57 | An empty cudf object with names, dtypes, and indices matching the 58 | expected output. 
59 | divisions : tuple of index values 60 | Values along which we partition our blocks on the index 61 | """ 62 | 63 | __dask_scheduler__ = staticmethod(dask.get) 64 | __dask_optimize__ = staticmethod(optimize) 65 | 66 | def __dask_postcompute__(self): 67 | return finalize, () 68 | 69 | def __dask_postpersist__(self): 70 | return type(self), (self._name, self._meta, self.divisions) 71 | 72 | def __init__(self, dsk, name, meta, divisions): 73 | self.dask = dsk 74 | self._name = name 75 | meta = dd.core.make_meta(meta) 76 | if not isinstance(meta, self._partition_type): 77 | raise TypeError( 78 | "Expected meta to specify type {0}, got type " 79 | "{1}".format(self._partition_type.__name__, type(meta).__name__) 80 | ) 81 | self._meta = dd.core.make_meta(meta) 82 | self.divisions = tuple(divisions) 83 | 84 | def __getstate__(self): 85 | return (self.dask, self._name, self._meta, self.divisions) 86 | 87 | def __setstate__(self, state): 88 | self.dask, self._name, self._meta, self.divisions = state 89 | 90 | def __repr__(self): 91 | s = "" 92 | return s % (type(self).__name__, len(self.dask), self.npartitions) 93 | 94 | def to_dask_dataframe(self): 95 | """Create a dask.dataframe object from a dask_cudf object""" 96 | return self.map_partitions(M.to_pandas) 97 | 98 | 99 | concat = dd.concat 100 | 101 | 102 | normalize_token.register(_Frame, lambda a: a._name) 103 | 104 | 105 | class DataFrame(_Frame, dd.core.DataFrame): 106 | _partition_type = cudf.DataFrame 107 | 108 | def _assign_column(self, k, v): 109 | def assigner(df, k, v): 110 | out = df.copy() 111 | out[k] = v 112 | return out 113 | 114 | meta = assigner(self._meta, k, dd.core.make_meta(v)) 115 | return self.map_partitions(assigner, k, v, meta=meta) 116 | 117 | def apply_rows(self, func, incols, outcols, kwargs={}, cache_key=None): 118 | import uuid 119 | 120 | if cache_key is None: 121 | cache_key = uuid.uuid4() 122 | 123 | def do_apply_rows(df, func, incols, outcols, kwargs): 124 | return df.apply_rows(func, incols, outcols, kwargs, cache_key=cache_key) 125 | 126 | meta = do_apply_rows(self._meta, func, incols, outcols, kwargs) 127 | return self.map_partitions( 128 | do_apply_rows, func, incols, outcols, kwargs, meta=meta 129 | ) 130 | 131 | def merge( 132 | self, 133 | other, 134 | on=None, 135 | how="left", 136 | left_index=False, 137 | right_index=False, 138 | suffixes=("_x", "_y"), 139 | ): 140 | """Merging two dataframes on the column(s) indicated in *on*. 141 | """ 142 | if ( 143 | left_index 144 | or right_index 145 | or not dask.is_dask_collection(other) 146 | or self.npartitions == 1 147 | and how in ("inner", "right") 148 | or other.npartitions == 1 149 | and how in ("inner", "left") 150 | ): 151 | return dd.merge( 152 | self, 153 | other, 154 | how=how, 155 | suffixes=suffixes, 156 | left_index=left_index, 157 | right_index=right_index, 158 | ) 159 | 160 | if not on and not left_index and not right_index: 161 | on = [c for c in self.columns if c in other.columns] 162 | if not on: 163 | left_index = right_index = True 164 | 165 | return join_impl.join_frames( 166 | left=self, 167 | right=other, 168 | on=on, 169 | how=how, 170 | lsuffix=suffixes[0], 171 | rsuffix=suffixes[1], 172 | ) 173 | 174 | def join(self, other, how="left", lsuffix="", rsuffix=""): 175 | """Join two datatframes 176 | 177 | *on* is not supported. 
178 | """ 179 | if how == "right": 180 | return other.join(other=self, how="left", lsuffix=rsuffix, rsuffix=lsuffix) 181 | 182 | same_names = set(self.columns) & set(other.columns) 183 | if same_names and not (lsuffix or rsuffix): 184 | raise ValueError( 185 | "there are overlapping columns but " 186 | "lsuffix and rsuffix are not defined" 187 | ) 188 | 189 | left, leftuniques = self._align_divisions() 190 | right, rightuniques = other._align_to_indices(leftuniques) 191 | 192 | leftparts = left.to_delayed() 193 | rightparts = right.to_delayed() 194 | 195 | @delayed 196 | def part_join(left, right, how): 197 | return left.join( 198 | right, how=how, sort=True, lsuffix=lsuffix, rsuffix=rsuffix 199 | ) 200 | 201 | def inner_selector(): 202 | pivot = 0 203 | for i in range(len(leftparts)): 204 | for j in range(pivot, len(rightparts)): 205 | if leftuniques[i] & rightuniques[j]: 206 | yield leftparts[i], rightparts[j] 207 | pivot = j + 1 208 | break 209 | 210 | def left_selector(): 211 | pivot = 0 212 | for i in range(len(leftparts)): 213 | for j in range(pivot, len(rightparts)): 214 | if leftuniques[i] & rightuniques[j]: 215 | yield leftparts[i], rightparts[j] 216 | pivot = j + 1 217 | break 218 | else: 219 | yield leftparts[i], None 220 | 221 | selector = {"left": left_selector, "inner": inner_selector}[how] 222 | 223 | rhs_dtypes = [(k, other._meta.dtypes[k]) for k in other._meta.columns] 224 | 225 | @delayed 226 | def fix_column(lhs): 227 | df = cudf.DataFrame() 228 | for k in lhs.columns: 229 | df[k + lsuffix] = lhs[k] 230 | 231 | for k, dtype in rhs_dtypes: 232 | data = np.zeros(len(lhs), dtype=dtype) 233 | mask_size = cudf.utils.utils.calc_chunk_size( 234 | data.size, cudf.utils.utils.mask_bitsize 235 | ) 236 | mask = np.zeros(mask_size, dtype=cudf.utils.utils.mask_dtype) 237 | sr = cudf.Series.from_masked_array( 238 | data=data, mask=mask, null_count=data.size 239 | ) 240 | 241 | df[k + rsuffix] = sr.set_index(df.index) 242 | 243 | return df 244 | 245 | joinedparts = [ 246 | (part_join(lhs, rhs, how=how) if rhs is not None else fix_column(lhs)) 247 | for lhs, rhs in selector() 248 | ] 249 | 250 | meta = self._meta.join(other._meta, how=how, lsuffix=lsuffix, rsuffix=rsuffix) 251 | return from_delayed(joinedparts, meta=meta) 252 | 253 | def _align_divisions(self): 254 | """Align so that the values do not split across partitions 255 | """ 256 | parts = self.to_delayed() 257 | uniques = self._get_unique_indices(parts=parts) 258 | originals = list(map(frozenset, uniques)) 259 | 260 | changed = True 261 | while changed: 262 | changed = False 263 | for i in range(len(uniques))[:-1]: 264 | intersect = uniques[i] & uniques[i + 1] 265 | if intersect: 266 | smaller = min(uniques[i], uniques[i + 1], key=len) 267 | bigger = max(uniques[i], uniques[i + 1], key=len) 268 | smaller |= intersect 269 | bigger -= intersect 270 | changed = True 271 | 272 | # Fix empty partitions 273 | uniques = list(filter(bool, uniques)) 274 | 275 | return self._align_to_indices(uniques, originals=originals, parts=parts) 276 | 277 | def _get_unique_indices(self, parts=None): 278 | if parts is None: 279 | parts = self.to_delayed() 280 | 281 | @delayed 282 | def unique(x): 283 | return set(x.index.as_column().unique().to_array()) 284 | 285 | parts = self.to_delayed() 286 | return compute(*map(unique, parts)) 287 | 288 | def _align_to_indices(self, uniques, originals=None, parts=None): 289 | uniques = list(map(set, uniques)) 290 | 291 | if parts is None: 292 | parts = self.to_delayed() 293 | 294 | if originals is None: 295 | 
originals = self._get_unique_indices(parts=parts) 296 | allindices = set() 297 | for x in originals: 298 | allindices |= x 299 | for us in uniques: 300 | us &= allindices 301 | uniques = list(filter(bool, uniques)) 302 | 303 | extras = originals[-1] - uniques[-1] 304 | extras = {x for x in extras if x > max(uniques[-1])} 305 | 306 | if extras: 307 | uniques.append(extras) 308 | 309 | remap = OrderedDict() 310 | for idxset in uniques: 311 | remap[tuple(sorted(idxset))] = bins = [] 312 | for i, orig in enumerate(originals): 313 | if idxset & orig: 314 | bins.append(parts[i]) 315 | 316 | @delayed 317 | def take(indices, depends): 318 | first = min(indices) 319 | last = max(indices) 320 | others = [] 321 | for d in depends: 322 | # TODO: this can be replaced with searchsorted 323 | # Normalize to index data in range before selection. 324 | firstindex = d.index[0] 325 | lastindex = d.index[-1] 326 | s = max(first, firstindex) 327 | e = min(last, lastindex) 328 | others.append(d.loc[s:e]) 329 | return cudf.concat(others) 330 | 331 | newparts = [] 332 | for idx, depends in remap.items(): 333 | newparts.append(take(idx, depends)) 334 | 335 | divisions = list(map(min, uniques)) 336 | divisions.append(max(uniques[-1])) 337 | 338 | newdd = from_delayed(newparts, meta=self._meta) 339 | return newdd, uniques 340 | 341 | def _compute_divisions(self): 342 | if self.known_divisions: 343 | return self 344 | 345 | @delayed 346 | def first_index(df): 347 | return df.index[0] 348 | 349 | @delayed 350 | def last_index(df): 351 | return df.index[-1] 352 | 353 | parts = self.to_delayed() 354 | divs = [first_index(p) for p in parts] + [last_index(parts[-1])] 355 | divisions = compute(*divs) 356 | return type(self)(self.dask, self._name, self._meta, divisions) 357 | 358 | def set_index(self, index, drop=True, sorted=False): 359 | """Set new index. 360 | 361 | Parameters 362 | ---------- 363 | index : str or Series 364 | If a ``str`` is provided, it is used as the name of the 365 | column to be made into the index. 366 | If a ``Series`` is provided, it is used as the new index 367 | drop : bool 368 | Whether the first original index column is dropped. 369 | sorted : bool 370 | Whether the new index column is already sorted. 371 | """ 372 | if not drop: 373 | raise NotImplementedError("drop=False not supported yet") 374 | 375 | if isinstance(index, str): 376 | tmpdf = self.sort_values(index) 377 | return tmpdf._set_column_as_sorted_index(index, drop=drop) 378 | elif isinstance(index, Series): 379 | indexname = "__dask_cudf.index" 380 | df = self.assign(**{indexname: index}) 381 | return df.set_index(indexname, drop=drop, sorted=sorted) 382 | else: 383 | raise TypeError("cannot set_index from {}".format(type(index))) 384 | 385 | def _set_column_as_sorted_index(self, colname, drop): 386 | def select_index(df, col): 387 | return df.set_index(col) 388 | 389 | return self.map_partitions( 390 | select_index, col=colname, meta=self._meta.set_index(colname) 391 | ) 392 | 393 | def _argsort(self, col, sorted=False): 394 | """ 395 | Returns 396 | ------- 397 | shufidx : Series 398 | Positional indices to be used with .take() to 399 | put the dataframe in order w.r.t ``col``. 
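        A rough sketch of the intended use (``df`` is hypothetical):

        >>> shufidx = df._argsort("x")    # doctest: +SKIP
        >>> reordered = df.take(shufidx)  # doctest: +SKIP

        which is the pattern used by ``_set_index_raw`` and
        ``_shuffle_sort_values`` below.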
400 | """ 401 | # Get subset with just the index and positional value 402 | subset = self[col].to_dask_dataframe() 403 | subset = subset.reset_index(drop=False) 404 | ordered = subset.set_index(0, sorted=sorted) 405 | shufidx = from_dask_dataframe(ordered)["index"] 406 | return shufidx 407 | 408 | def _set_index_raw(self, indexname, drop, sorted): 409 | shufidx = self._argsort(indexname, sorted=sorted) 410 | # Shuffle the GPU data 411 | shuffled = self.take(shufidx, npartitions=self.npartitions) 412 | out = shuffled.map_partitions(lambda df: df.set_index(indexname)) 413 | return out 414 | 415 | def reset_index(self, force=False, drop=False): 416 | """Reset index to range based 417 | """ 418 | if force: 419 | dfs = self.to_delayed() 420 | sizes = np.asarray(compute(*map(delayed(len), dfs))) 421 | prefixes = np.zeros_like(sizes) 422 | prefixes[1:] = np.cumsum(sizes[:-1]) 423 | 424 | @delayed 425 | def fix_index(df, startpos): 426 | stoppos = startpos + len(df) 427 | return df.set_index( 428 | cudf.dataframe.RangeIndex(start=startpos, stop=stoppos) 429 | ) 430 | 431 | outdfs = [fix_index(df, startpos) for df, startpos in zip(dfs, prefixes)] 432 | return from_delayed(outdfs, meta=self._meta.reset_index(drop=True)) 433 | else: 434 | return self.map_partitions(M.reset_index, drop=drop) 435 | 436 | def sort_values(self, by, ignore_index=False): 437 | """Sort by the given column 438 | 439 | Parameter 440 | --------- 441 | by : str 442 | """ 443 | parts = self.to_delayed() 444 | sorted_parts = batcher_sortnet.sort_delayed_frame(parts, by) 445 | return from_delayed(sorted_parts, meta=self._meta).reset_index( 446 | force=not ignore_index 447 | ) 448 | 449 | def sort_values_binned(self, by): 450 | """Sorty by the given column and ensure that the same key 451 | doesn't spread across multiple partitions. 
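        Illustrative example (``ddf`` and the keys below are hypothetical):
        if the sorted partitions hold keys ``{1, 2}`` and ``{2, 3}``, the rows
        with key ``2`` from the second partition are joined into the first,
        so every key ends up in exactly one output partition.

        >>> binned = ddf.sort_values_binned("key")  # doctest: +SKIP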
452 | """ 453 | # Get sorted partitions 454 | parts = self.sort_values(by=by).to_delayed() 455 | 456 | # Get unique keys in each partition 457 | @delayed 458 | def get_unique(p): 459 | return set(p[by].unique()) 460 | 461 | uniques = list(compute(*map(get_unique, parts))) 462 | 463 | joiner = {} 464 | for i in range(len(uniques)): 465 | joiner[i] = to_join = {} 466 | for j in range(i + 1, len(uniques)): 467 | intersect = uniques[i] & uniques[j] 468 | # If the keys intersect 469 | if intersect: 470 | # Remove keys 471 | uniques[j] -= intersect 472 | to_join[j] = frozenset(intersect) 473 | else: 474 | break 475 | 476 | @delayed 477 | def join(df, other, keys): 478 | others = [other.query("{by}==@k".format(by=by)) for k in sorted(keys)] 479 | return cudf.concat([df] + others) 480 | 481 | @delayed 482 | def drop(df, keep_keys): 483 | locvars = locals() 484 | for i, k in enumerate(keep_keys): 485 | locvars["k{}".format(i)] = k 486 | 487 | conds = ["{by}==@k{i}".format(by=by, i=i) for i in range(len(keep_keys))] 488 | expr = " or ".join(conds) 489 | return df.query(expr) 490 | 491 | for i in range(len(parts)): 492 | if uniques[i]: 493 | parts[i] = drop(parts[i], uniques[i]) 494 | for joinee, intersect in joiner[i].items(): 495 | parts[i] = join(parts[i], parts[joinee], intersect) 496 | 497 | results = [p for i, p in enumerate(parts) if uniques[i]] 498 | return from_delayed(results, meta=self._meta).reset_index() 499 | 500 | def _shuffle_sort_values(self, by): 501 | """Slow shuffle based sort by the given column 502 | 503 | Parameter 504 | --------- 505 | by : str 506 | """ 507 | shufidx = self._argsort(by) 508 | return self.take(shufidx) 509 | 510 | @derived_from(pd.DataFrame) 511 | def var(self, axis=None, skipna=True, ddof=1, split_every=False, 512 | dtype=None, out=None): 513 | axis = self._validate_axis(axis) 514 | meta = self._meta_nonempty.var(axis=axis, skipna=skipna) 515 | if axis == 1: 516 | result = map_partitions(M.var, self, meta=meta, 517 | token=self._token_prefix + 'var', 518 | axis=axis, skipna=skipna, ddof=ddof) 519 | return handle_out(out, result) 520 | 521 | else: 522 | num = self._get_numeric_data() 523 | x = 1.0 * num.sum(skipna=skipna, split_every=split_every) 524 | x2 = 1.0 * (num ** 2).sum(skipna=skipna, split_every=split_every) 525 | n = num.count(split_every=split_every) 526 | name = self._token_prefix + 'var' 527 | result = map_partitions(var_aggregate, x2, x, n, 528 | token=name, meta=meta, ddof=ddof) 529 | if isinstance(self, DataFrame): 530 | result.divisions = (min(self.columns), max(self.columns)) 531 | return handle_out(out, result) 532 | 533 | 534 | def sum_of_squares(x): 535 | x = x.astype("f8")._column 536 | outcol = cpp_reduce.apply_reduce("sum_of_squares", x) 537 | return cudf.Series(outcol) 538 | 539 | 540 | def var_aggregate(x2, x, n, ddof): 541 | try: 542 | with warnings.catch_warnings(record=True): 543 | warnings.simplefilter('always') 544 | result = (x2 / n) - (x / n)**2 545 | if ddof != 0: 546 | result = result * n / (n - ddof) 547 | return result 548 | except ZeroDivisionError: 549 | return np.float64(np.nan) 550 | 551 | 552 | def nlargest_agg(x, **kwargs): 553 | return cudf.concat(x).nlargest(**kwargs) 554 | 555 | 556 | def nsmallest_agg(x, **kwargs): 557 | return cudf.concat(x).nsmallest(**kwargs) 558 | 559 | 560 | def unique_k_agg(x, **kwargs): 561 | return cudf.concat(x).unique_k(**kwargs) 562 | 563 | 564 | class Series(_Frame, dd.core.Series): 565 | _partition_type = cudf.Series 566 | 567 | def count(self, split_every=False): 568 | return 
reduction( 569 | self, chunk=M.count, aggregate=np.sum, split_every=split_every, meta="i8" 570 | ) 571 | 572 | def mean(self, split_every=False): 573 | sum = self.sum(split_every=split_every) 574 | n = self.count(split_every=split_every) 575 | return sum / n 576 | 577 | def unique_k(self, k, split_every=None): 578 | return reduction( 579 | self, 580 | chunk=M.unique_k, 581 | aggregate=unique_k_agg, 582 | meta=self._meta, 583 | token="unique-k", 584 | split_every=split_every, 585 | k=k, 586 | ) 587 | 588 | @derived_from(pd.DataFrame) 589 | def var(self, axis=None, skipna=True, ddof=1, split_every=False, dtype=None, out=None): 590 | axis = self._validate_axis(axis) 591 | meta = self._meta_nonempty.var(axis=axis, skipna=skipna) 592 | if axis == 1: 593 | result = map_partitions(M.var, self, meta=meta, 594 | token=self._token_prefix + 'var', 595 | axis=axis, skipna=skipna, ddof=ddof) 596 | return handle_out(out, result) 597 | 598 | else: 599 | num = self._get_numeric_data() 600 | x = 1.0 * num.sum(skipna=skipna, split_every=split_every) 601 | x2 = 1.0 * (num ** 2).sum(skipna=skipna, split_every=split_every) 602 | n = num.count(split_every=split_every) 603 | name = self._token_prefix + 'var' 604 | result = map_partitions(var_aggregate, x2, x, n, 605 | token=name, meta=meta, ddof=ddof) 606 | if isinstance(self, DataFrame): 607 | result.divisions = (min(self.columns), max(self.columns)) 608 | return handle_out(out, result) 609 | 610 | 611 | # ---------------------------------------------------------------------- 612 | # Accessor Methods 613 | # ---------------------------------------------------------------------- 614 | dt = CachedAccessor("dt", DatetimeAccessor) 615 | cat = CachedAccessor("cat", CategoricalAccessor) 616 | 617 | 618 | class Index(Series, dd.core.Index): 619 | _partition_type = cudf.dataframe.index.Index 620 | 621 | 622 | def splits_divisions_sorted_cudf(df, chunksize): 623 | segments = list(df.index.find_segments().to_array()) 624 | segments.append(len(df) - 1) 625 | 626 | splits = [0] 627 | last = current_size = 0 628 | for s in segments: 629 | size = s - last 630 | last = s 631 | current_size += size 632 | if current_size >= chunksize: 633 | splits.append(s) 634 | current_size = 0 635 | # Ensure end is included 636 | if splits[-1] != segments[-1]: 637 | splits.append(segments[-1]) 638 | divisions = tuple(df.index.take(np.array(splits)).values) 639 | splits[-1] += 1 # Offset to extract to end 640 | 641 | return splits, divisions 642 | 643 | 644 | def _extract_meta(x): 645 | """ 646 | Extract internal cache data (``_meta``) from dask_cudf objects 647 | """ 648 | if isinstance(x, (Scalar, _Frame)): 649 | return x._meta 650 | elif isinstance(x, list): 651 | return [_extract_meta(_x) for _x in x] 652 | elif isinstance(x, tuple): 653 | return tuple([_extract_meta(_x) for _x in x]) 654 | elif isinstance(x, dict): 655 | return {k: _extract_meta(v) for k, v in x.items()} 656 | return x 657 | 658 | 659 | def _emulate(func, *args, **kwargs): 660 | """ 661 | Apply a function using args / kwargs. If arguments contain dd.DataFrame / 662 | dd.Series, using internal cache (``_meta``) for calculation 663 | """ 664 | with raise_on_meta_error(funcname(func)): 665 | return func(*_extract_meta(args), **_extract_meta(kwargs)) 666 | 667 | 668 | def align_partitions(args): 669 | """Align partitions between dask_cudf objects. 
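    Frames whose divisions already match are returned unchanged; aligning
    frames with mismatched divisions is not implemented and raises
    ``NotImplementedError``.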
670 | 671 | Note that if all divisions are unknown, but have equal npartitions, then 672 | they will be passed through unchanged.""" 673 | dfs = [df for df in args if isinstance(df, _Frame)] 674 | if not dfs: 675 | return args 676 | 677 | divisions = dfs[0].divisions 678 | if not all(df.divisions == divisions for df in dfs): 679 | raise NotImplementedError("Aligning mismatched partitions") 680 | return args 681 | 682 | 683 | def reduction( 684 | args, 685 | chunk=None, 686 | aggregate=None, 687 | combine=None, 688 | meta=None, 689 | token=None, 690 | chunk_kwargs=None, 691 | aggregate_kwargs=None, 692 | combine_kwargs=None, 693 | split_every=None, 694 | **kwargs 695 | ): 696 | """Generic tree reduction operation. 697 | 698 | Parameters 699 | ---------- 700 | args : 701 | Positional arguments for the `chunk` function. All `dask.dataframe` 702 | objects should be partitioned and indexed equivalently. 703 | chunk : function [block-per-arg] -> block 704 | Function to operate on each block of data 705 | aggregate : function list-of-blocks -> block 706 | Function to operate on the list of results of chunk 707 | combine : function list-of-blocks -> block, optional 708 | Function to operate on intermediate lists of results of chunk 709 | in a tree-reduction. If not provided, defaults to aggregate. 710 | $META 711 | token : str, optional 712 | The name to use for the output keys. 713 | chunk_kwargs : dict, optional 714 | Keywords for the chunk function only. 715 | aggregate_kwargs : dict, optional 716 | Keywords for the aggregate function only. 717 | combine_kwargs : dict, optional 718 | Keywords for the combine function only. 719 | split_every : int, optional 720 | Group partitions into groups of this size while performing a 721 | tree-reduction. If set to False, no tree-reduction will be used, 722 | and all intermediates will be concatenated and passed to ``aggregate``. 723 | Default is 8. 724 | kwargs : 725 | All remaining keywords will be passed to ``chunk``, ``aggregate``, and 726 | ``combine``. 
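    A minimal usage sketch, mirroring how ``Series.count`` above builds on
    this helper (``s`` is a hypothetical dask_cudf Series):

    >>> from dask.utils import M  # doctest: +SKIP
    >>> import numpy as np        # doctest: +SKIP
    >>> n = reduction(s, chunk=M.count, aggregate=np.sum, meta="i8")  # doctest: +SKIP
    >>> n.compute()               # doctest: +SKIP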
727 | """ 728 | if chunk_kwargs is None: 729 | chunk_kwargs = dict() 730 | if aggregate_kwargs is None: 731 | aggregate_kwargs = dict() 732 | chunk_kwargs.update(kwargs) 733 | aggregate_kwargs.update(kwargs) 734 | 735 | if combine is None: 736 | if combine_kwargs: 737 | raise ValueError("`combine_kwargs` provided with no `combine`") 738 | combine = aggregate 739 | combine_kwargs = aggregate_kwargs 740 | else: 741 | if combine_kwargs is None: 742 | combine_kwargs = dict() 743 | combine_kwargs.update(kwargs) 744 | 745 | if not isinstance(args, (tuple, list)): 746 | args = [args] 747 | 748 | npartitions = set(arg.npartitions for arg in args if isinstance(arg, _Frame)) 749 | if len(npartitions) > 1: 750 | raise ValueError("All arguments must have same number of partitions") 751 | npartitions = npartitions.pop() 752 | 753 | if split_every is None: 754 | split_every = 8 755 | elif split_every is False: 756 | split_every = npartitions 757 | elif split_every < 2 or not isinstance(split_every, int): 758 | raise ValueError("split_every must be an integer >= 2") 759 | 760 | token_key = tokenize( 761 | token or (chunk, aggregate), 762 | meta, 763 | args, 764 | chunk_kwargs, 765 | aggregate_kwargs, 766 | combine_kwargs, 767 | split_every, 768 | ) 769 | 770 | # Chunk 771 | a = "{0}-chunk-{1}".format(token or funcname(chunk), token_key) 772 | if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs: 773 | dsk = {(a, 0, i): (chunk, key) for i, key in enumerate(args[0].__dask_keys__())} 774 | else: 775 | dsk = { 776 | (a, 0, i): ( 777 | apply, 778 | chunk, 779 | [(x._name, i) if isinstance(x, _Frame) else x for x in args], 780 | chunk_kwargs, 781 | ) 782 | for i in range(args[0].npartitions) 783 | } 784 | 785 | # Combine 786 | b = "{0}-combine-{1}".format(token or funcname(combine), token_key) 787 | k = npartitions 788 | depth = 0 789 | while k > split_every: 790 | for part_i, inds in enumerate(partition_all(split_every, range(k))): 791 | conc = (list, [(a, depth, i) for i in inds]) 792 | dsk[(b, depth + 1, part_i)] = ( 793 | (apply, combine, [conc], combine_kwargs) 794 | if combine_kwargs 795 | else (combine, conc) 796 | ) 797 | k = part_i + 1 798 | a = b 799 | depth += 1 800 | 801 | # Aggregate 802 | b = "{0}-agg-{1}".format(token or funcname(aggregate), token_key) 803 | conc = (list, [(a, depth, i) for i in range(k)]) 804 | if aggregate_kwargs: 805 | dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs) 806 | else: 807 | dsk[(b, 0)] = (aggregate, conc) 808 | 809 | if meta is None: 810 | meta_chunk = _emulate(apply, chunk, args, chunk_kwargs) 811 | meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs) 812 | meta = dd.core.make_meta(meta) 813 | 814 | for arg in args: 815 | if isinstance(arg, _Frame): 816 | dsk.update(arg.dask) 817 | 818 | return dd.core.new_dd_object(dsk, b, meta, (None, None)) 819 | 820 | 821 | from_cudf = dd.from_pandas 822 | 823 | 824 | def from_dask_dataframe(df): 825 | return df.map_partitions(cudf.from_pandas) 826 | -------------------------------------------------------------------------------- /dask_cudf/io/__init__.py: -------------------------------------------------------------------------------- 1 | from .csv import read_csv 2 | from .orc import read_orc 3 | from .json import read_json 4 | from .parquet import read_parquet 5 | -------------------------------------------------------------------------------- /dask_cudf/io/csv.py: -------------------------------------------------------------------------------- 1 | import os 2 | from glob import 
glob 3 | from warnings import warn 4 | 5 | from dask.base import tokenize 6 | from dask.compatibility import apply 7 | import dask.dataframe as dd 8 | from dask.utils import parse_bytes 9 | from dask.dataframe.io.csv import make_reader 10 | 11 | import cudf 12 | from cudf.bindings.GDFError import GDFError 13 | 14 | 15 | def read_csv(path, chunksize="256 MiB", **kwargs): 16 | if "://" in str(path): 17 | func = make_reader(cudf.read_csv, "read_csv", "CSV") 18 | return func(path, blocksize=chunksize, **kwargs) 19 | else: 20 | return _internal_read_csv(path=path, chunksize=chunksize, **kwargs) 21 | 22 | 23 | def _internal_read_csv(path, chunksize="256 MiB", **kwargs): 24 | if isinstance(chunksize, str): 25 | chunksize = parse_bytes(chunksize) 26 | 27 | filenames = sorted(glob(str(path))) 28 | if not filenames: 29 | msg = f"A file in: {filenames} does not exist." 30 | raise FileNotFoundError(msg) 31 | 32 | name = "read-csv-" + tokenize( 33 | path, tokenize, **kwargs 34 | ) # TODO: get last modified time 35 | 36 | compression = kwargs.get("compression", False) 37 | if compression and chunksize: 38 | # compressed CSVs reading must read the entire file 39 | kwargs.pop("byte_range", None) 40 | warn( 41 | "Warning %s compression does not support breaking apart files\n" 42 | "Please ensure that each individual file can fit in memory and\n" 43 | "use the keyword ``chunksize=None to remove this message``\n" 44 | "Setting ``chunksize=(size of file)``" % compression 45 | ) 46 | chunksize = None 47 | 48 | if chunksize is None: 49 | return read_csv_without_chunksize(path, **kwargs) 50 | 51 | dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV") 52 | meta = dask_reader(filenames[0], **kwargs)._meta 53 | 54 | dsk = {} 55 | i = 0 56 | dtypes = meta.dtypes.values 57 | 58 | for fn in filenames: 59 | size = os.path.getsize(fn) 60 | for start in range(0, size, chunksize): 61 | kwargs2 = kwargs.copy() 62 | kwargs2["byte_range"] = ( 63 | start, 64 | chunksize, 65 | ) # specify which chunk of the file we care about 66 | if start != 0: 67 | kwargs2["names"] = meta.columns # no header in the middle of the file 68 | kwargs2["header"] = None 69 | dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2) 70 | 71 | i += 1 72 | 73 | divisions = [None] * (len(dsk) + 1) 74 | return dd.core.new_dd_object(dsk, name, meta, divisions) 75 | 76 | 77 | def _read_csv(fn, dtypes=None, **kwargs): 78 | try: 79 | cdf = cudf.read_csv(fn, **kwargs) 80 | except GDFError: 81 | # end of file check https://github.com/rapidsai/dask-cudf/issues/103 82 | # this should be removed when CUDF has better dtype/parse_date support 83 | dtypes = dict(zip(kwargs["names"], dtypes)) 84 | df = dd.core.make_meta(dtypes) 85 | cdf = cudf.from_pandas(df) 86 | return cdf 87 | 88 | 89 | def read_csv_without_chunksize(path, **kwargs): 90 | """Read entire CSV with optional compression (gzip/zip) 91 | 92 | Parameters 93 | ---------- 94 | path : str 95 | path to files (support for glob) 96 | """ 97 | filenames = sorted(glob(str(path))) 98 | name = "read-csv-" + tokenize(path, **kwargs) 99 | 100 | meta = cudf.read_csv(filenames[0], **kwargs) 101 | 102 | graph = { 103 | (name, i): (apply, cudf.read_csv, [fn], kwargs) 104 | for i, fn in enumerate(filenames) 105 | } 106 | 107 | divisions = [None] * (len(filenames) + 1) 108 | 109 | return dd.core.new_dd_object(graph, name, meta, divisions) 110 | -------------------------------------------------------------------------------- /dask_cudf/io/json.py: 
-------------------------------------------------------------------------------- 1 | import cudf 2 | import dask 3 | from functools import partial 4 | 5 | 6 | read_json = partial(dask.dataframe.read_json, engine=cudf.read_json) 7 | -------------------------------------------------------------------------------- /dask_cudf/io/orc.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | 3 | from dask.base import tokenize 4 | from dask.compatibility import apply 5 | import dask.dataframe as dd 6 | 7 | import cudf 8 | 9 | 10 | def read_orc(path, **kwargs): 11 | """ Read ORC files into a Dask DataFrame 12 | 13 | This calls the ``cudf.read_orc`` function on many ORC files. 14 | See that function for additional details. 15 | 16 | Examples 17 | -------- 18 | >>> import dask_cudf 19 | >>> df = dask_cudf.read_orc("/path/to/*.orc") # doctest: +SKIP 20 | 21 | See Also 22 | -------- 23 | cudf.read_orc 24 | """ 25 | 26 | filenames = sorted(glob(str(path))) 27 | name = "read-orc-" + tokenize(path, **kwargs) 28 | 29 | meta = cudf.read_orc(filenames[0], **kwargs) 30 | 31 | graph = { 32 | (name, i): (apply, cudf.read_orc, [fn], kwargs) 33 | for i, fn in enumerate(filenames) 34 | } 35 | 36 | divisions = [None] * (len(filenames) + 1) 37 | 38 | return dd.core.new_dd_object(graph, name, meta, divisions) 39 | -------------------------------------------------------------------------------- /dask_cudf/io/parquet.py: -------------------------------------------------------------------------------- 1 | import os 2 | from glob import glob 3 | 4 | from dask.base import tokenize 5 | from dask.compatibility import apply 6 | import dask.dataframe as dd 7 | from dask.utils import natural_sort_key 8 | 9 | import cudf 10 | 11 | 12 | def read_parquet(path, **kwargs): 13 | """ Read parquet files into a Dask DataFrame 14 | 15 | This calls the ``cudf.read_parquet`` function on many parquet files. 16 | See that function for additional details. 
17 | 18 | Examples 19 | -------- 20 | >>> import dask_cudf 21 | >>> df = dask_cudf.read_parquet("/path/to/dataset/") # doctest: +SKIP 22 | 23 | See Also 24 | -------- 25 | cudf.read_parquet 26 | """ 27 | 28 | name = "read-parquet-" + tokenize( 29 | path, 30 | **kwargs 31 | ) 32 | 33 | paths = path 34 | if isinstance(path, str): 35 | paths = sorted(glob(str(path))) 36 | 37 | # Ignore *_metadata files for now 38 | paths = sorted([f for f in paths if not f.endswith('_metadata')], 39 | key=natural_sort_key) 40 | 41 | # Use 0th file to create meta 42 | meta = cudf.read_parquet(paths[0], **kwargs) 43 | graph = { 44 | (name, i): (apply, cudf.read_parquet, [fn], kwargs) 45 | for i, fn in enumerate(paths) 46 | } 47 | divisions = [None] * (len(paths) + 1) 48 | 49 | return dd.core.new_dd_object(graph, name, meta, divisions) 50 | -------------------------------------------------------------------------------- /dask_cudf/io/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/dask-cudf/b566ab60ea69e6e165533b68b1966875528afb06/dask_cudf/io/tests/__init__.py -------------------------------------------------------------------------------- /dask_cudf/io/tests/sample.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/dask-cudf/b566ab60ea69e6e165533b68b1966875528afb06/dask_cudf/io/tests/sample.orc -------------------------------------------------------------------------------- /dask_cudf/io/tests/test_csv.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import dask 4 | import dask_cudf 5 | import dask.dataframe as dd 6 | import pandas as pd 7 | import numpy as np 8 | 9 | import pytest 10 | 11 | 12 | def test_read_csv(tmp_path): 13 | df = dask.datasets.timeseries(dtypes={"x": int, "y": int}, freq="120s").reset_index( 14 | drop=True 15 | ) 16 | 17 | df.to_csv(tmp_path / "data-*.csv", index=False) 18 | 19 | df2 = dask_cudf.read_csv(tmp_path / "data-*.csv") 20 | dd.assert_eq(df, df2) 21 | 22 | # file path test 23 | stmp_path = str(tmp_path / "data-*.csv") 24 | df3 = dask_cudf.read_csv(f"file://{stmp_path}") 25 | dd.assert_eq(df2, df3) 26 | 27 | 28 | def test_raises_FileNotFoundError(): 29 | with pytest.raises(FileNotFoundError): 30 | dask_cudf.read_csv("foo.csv") 31 | 32 | 33 | def test_read_csv_w_bytes(tmp_path): 34 | df = dask.datasets.timeseries(dtypes={"x": int, "y": int}, freq="120s").reset_index( 35 | drop=True 36 | ) 37 | df = pd.DataFrame(dict(x=np.arange(20), y=np.arange(20))) 38 | df.to_csv(tmp_path / "data-*.csv", index=False) 39 | 40 | df2 = dask_cudf.read_csv(tmp_path / "*.csv", chunksize="50 B") 41 | assert df2.npartitions is 3 42 | dd.assert_eq(df2, df, check_index=False) 43 | 44 | 45 | def test_read_csv_compression(tmp_path): 46 | df = pd.DataFrame(dict(x=np.arange(20), y=np.arange(20))) 47 | df.to_csv(tmp_path / "data.csv.gz", index=False, compression="gzip") 48 | 49 | with pytest.warns(UserWarning) as w: 50 | df2 = dask_cudf.read_csv( 51 | tmp_path / "*.csv.gz", chunksize="50 B", compression="gzip" 52 | ) 53 | 54 | assert len(w) == 1 55 | msg = str(w[0].message) 56 | assert "gzip" in msg 57 | 58 | assert df2.npartitions is 1 59 | dd.assert_eq(df2, df, check_index=False) 60 | 61 | with warnings.catch_warnings(record=True) as record: 62 | df2 = dask_cudf.read_csv( 63 | tmp_path / "*.csv.gz", chunksize=None, compression="gzip" 64 | ) 65 | 66 | assert not record 67 | 
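# A small illustrative sketch (not a collected test: no ``test_`` prefix) of the
# byte-range chunking exercised above: ``_internal_read_csv`` splits each file
# with ``range(0, file_size, chunksize)``, so the partition count is
# ``ceil(file_size / chunksize)``.
def _example_chunked_read(tmp_path):
    import os

    df = pd.DataFrame(dict(x=np.arange(20), y=np.arange(20)))
    fn = tmp_path / "example.csv"
    df.to_csv(fn, index=False)

    size = os.path.getsize(fn)
    ddf = dask_cudf.read_csv(fn, chunksize="50 B")
    # ceil division without importing math
    assert ddf.npartitions == (size + 49) // 50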
-------------------------------------------------------------------------------- /dask_cudf/io/tests/test_json.py: -------------------------------------------------------------------------------- 1 | import dask 2 | import dask_cudf 3 | import dask.dataframe as dd 4 | from dask.utils import tmpfile 5 | import pandas as pd 6 | 7 | import pytest 8 | 9 | 10 | def test_read_json(tmp_path): 11 | df1 = dask.datasets.timeseries( 12 | dtypes={"x": int, "y": int}, freq="120s").reset_index(drop=True) 13 | df1.to_json(tmp_path / "data-*.json") 14 | df2 = dask_cudf.read_json(tmp_path / "data-*.json") 15 | dd.assert_eq(df1, df2) 16 | 17 | 18 | @pytest.mark.filterwarnings("ignore:Using CPU") 19 | @pytest.mark.parametrize('orient', ['split', 'index', 'columns', 'values']) 20 | def test_read_json_basic(orient): 21 | df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]}) 22 | with tmpfile('json') as f: 23 | df.to_json(f, orient=orient, lines=False) 24 | actual = dask_cudf.read_json(f, orient=orient, lines=False) 25 | actual_pd = pd.read_json(f, orient=orient, lines=False) 26 | dd.assert_eq(actual, actual_pd) 27 | 28 | 29 | @pytest.mark.filterwarnings("ignore:Using CPU") 30 | @pytest.mark.parametrize('lines', [True, False]) 31 | def test_read_json_lines(lines): 32 | df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]}) 33 | with tmpfile('json') as f: 34 | df.to_json(f, orient='records', lines=lines) 35 | actual = dask_cudf.read_json(f, orient='records', lines=lines) 36 | actual_pd = pd.read_json(f, orient='records', lines=lines) 37 | dd.assert_eq(actual, actual_pd) 38 | -------------------------------------------------------------------------------- /dask_cudf/io/tests/test_orc.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import dask_cudf 4 | import dask.dataframe as dd 5 | import cudf 6 | 7 | import pytest 8 | 9 | # import pyarrow.orc as orc 10 | 11 | cur_dir = os.path.dirname(__file__) 12 | sample_orc = os.path.join(cur_dir, "sample.orc") 13 | 14 | 15 | def test_read_orc_defaults(): 16 | df1 = cudf.read_orc(sample_orc) 17 | df2 = dask_cudf.read_orc(sample_orc) 18 | df2.head().to_pandas() 19 | dd.assert_eq(df1, df2, check_index=False) 20 | 21 | 22 | # engine pyarrow fails 23 | # https://github.com/rapidsai/cudf/issues/1595 24 | @pytest.mark.parametrize("engine", ["cudf"]) 25 | @pytest.mark.parametrize("columns", [["time", "date"], ["time"]]) 26 | def test_read_orc_cols(engine, columns): 27 | df1 = cudf.read_orc(sample_orc, engine=engine, columns=columns) 28 | 29 | df2 = dask_cudf.read_orc(sample_orc, engine=engine, columns=columns) 30 | 31 | dd.assert_eq(df1, df2, check_index=False) 32 | -------------------------------------------------------------------------------- /dask_cudf/io/tests/test_parquet.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import dask_cudf 4 | import pandas as pd 5 | import dask.dataframe as dd 6 | from dask.dataframe.utils import assert_eq 7 | from dask.utils import natural_sort_key 8 | import cudf 9 | 10 | import pytest 11 | 12 | 13 | nrows = 40 14 | npartitions = 15 15 | df = pd.DataFrame({'x': [i * 7 % 5 for i in range(nrows)], # Not sorted 16 | 'y': [i * 2.5 for i in range(nrows)]}) # Sorted 17 | ddf = dd.from_pandas(df, npartitions=npartitions) 18 | 19 | 20 | def test_roundtrip_from_dask(tmpdir): 21 | tmpdir = str(tmpdir) 22 | ddf.to_parquet(tmpdir, engine='pyarrow') 23 | files = sorted([os.path.join(tmpdir, f) 24 | for f in 
os.listdir(tmpdir) 25 | if not f.endswith('_metadata')], 26 | key=natural_sort_key) 27 | 28 | # Read list of parquet files 29 | ddf2 = dask_cudf.read_parquet(files) 30 | assert_eq(ddf, ddf2, check_divisions=False) 31 | 32 | # Specify columns=['x'] 33 | ddf2 = dask_cudf.read_parquet(files, columns=['x']) 34 | assert_eq(ddf[['x']], ddf2, check_divisions=False) 35 | 36 | # Specify columns='y' 37 | ddf2 = dask_cudf.read_parquet(files, columns='y') 38 | assert_eq(ddf[['y']], ddf2, check_divisions=False) 39 | 40 | # Read parquet-dataset directory 41 | # dask_cudf.read_parquet will ignore *_metadata files 42 | ddf2 = dask_cudf.read_parquet(os.path.join(tmpdir, '*')) 43 | assert_eq(ddf, ddf2, check_divisions=False) 44 | 45 | 46 | def test_roundtrip_from_pandas(tmpdir): 47 | fn = str(tmpdir.join('test.parquet')) 48 | df.to_parquet(fn) 49 | ddf2 = dask_cudf.read_parquet(fn) 50 | assert_eq(df, ddf2) 51 | -------------------------------------------------------------------------------- /dask_cudf/io/tests/test_s3.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from contextlib import contextmanager 4 | 5 | 6 | import dask_cudf 7 | 8 | s3fs = pytest.importorskip("s3fs") 9 | boto3 = pytest.importorskip("boto3") 10 | moto = pytest.importorskip("moto") 11 | httpretty = pytest.importorskip("httpretty") 12 | 13 | from dask.bytes.s3 import DaskS3FileSystem 14 | 15 | 16 | @contextmanager 17 | def ensure_safe_environment_variables(): 18 | """ 19 | Get a context manager to safely set environment variables 20 | All changes will be undone on close, hence environment variables set 21 | within this contextmanager will neither persist nor change global state. 22 | """ 23 | saved_environ = dict(os.environ) 24 | try: 25 | yield 26 | finally: 27 | os.environ.clear() 28 | os.environ.update(saved_environ) 29 | 30 | 31 | @contextmanager 32 | def s3_context(bucket, files): 33 | with ensure_safe_environment_variables(): 34 | # temporary workaround as moto fails for botocore >= 1.11 otherwise, 35 | # see https://github.com/spulec/moto/issues/1924 & 1952 36 | os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") 37 | os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") 38 | 39 | with moto.mock_s3(): 40 | client = boto3.client("s3") 41 | client.create_bucket(Bucket=bucket, ACL="public-read-write") 42 | for f, data in files.items(): 43 | client.put_object(Bucket=bucket, Key=f, Body=data) 44 | 45 | yield DaskS3FileSystem(anon=True) 46 | 47 | for f, data in files.items(): 48 | try: 49 | client.delete_object(Bucket=bucket, Key=f, Body=data) 50 | except Exception: 51 | pass 52 | finally: 53 | httpretty.HTTPretty.disable() 54 | httpretty.HTTPretty.reset() 55 | 56 | 57 | def test_read_csv(): 58 | with s3_context("csv", {"a.csv": b"a,b\n1,2\n3,4\n"}) as s3: 59 | df = dask_cudf.read_csv( 60 | "s3://csv/*.csv", chunksize="50 B", storage_options={"s3": s3} 61 | ) 62 | assert df.a.sum().compute() == 4 63 | -------------------------------------------------------------------------------- /dask_cudf/join_impl.py: -------------------------------------------------------------------------------- 1 | from dask import delayed 2 | import dask.dataframe as dd 3 | 4 | import cudf 5 | 6 | 7 | @delayed 8 | def local_shuffle(frame, num_new_parts, key_columns): 9 | """Regroup the frame based on the key column(s) 10 | """ 11 | partitions = frame.partition_by_hash(columns=key_columns, nparts=num_new_parts) 12 | return dict(enumerate(partitions)) 13 | 14 | 15 | @delayed 16 
| def get_subgroup(groups, i): 17 | out = groups.get(i) 18 | if out is None: 19 | return () 20 | return out 21 | 22 | 23 | def group_frame(frame_partitions, num_new_parts, key_columns): 24 | """Group frame to prepare for the join 25 | """ 26 | return [ 27 | local_shuffle(part, num_new_parts, key_columns) for part in frame_partitions 28 | ] 29 | 30 | 31 | def fanout_subgroups(grouped_parts, num_new_parts): 32 | return [ 33 | [get_subgroup(part, j) for part in grouped_parts] for j in range(num_new_parts) 34 | ] 35 | 36 | 37 | def join_frames(left, right, on, how, lsuffix, rsuffix): 38 | """Join two frames on 1 or more columns. 39 | 40 | Parameters 41 | ---------- 42 | left, right : dask_cudf.DataFrame 43 | on : tuple[str] 44 | key column(s) 45 | how : str 46 | Join method 47 | lsuffix, rsuffix : str 48 | """ 49 | 50 | if on: 51 | on = [on] if isinstance(on, str) else list(on) 52 | 53 | empty_frame = left._meta.merge( 54 | right._meta, on=on, how=how, suffixes=(lsuffix, rsuffix) 55 | ) 56 | 57 | def merge(left, right): 58 | return left.merge(right, on=on, how=how, suffixes=(lsuffix, rsuffix)) 59 | 60 | left_val_names = [k for k in left.columns if k not in on] 61 | right_val_names = [k for k in right.columns if k not in on] 62 | same_names = set(left_val_names) & set(right_val_names) 63 | if same_names and not (lsuffix or rsuffix): 64 | raise ValueError( 65 | "there are overlapping columns but " "lsuffix and rsuffix are not defined" 66 | ) 67 | 68 | dtypes = {k: left[k].dtype for k in left.columns} 69 | dtypes.update({k: right[k].dtype for k in right.columns}) 70 | 71 | left_parts = left.to_delayed() 72 | right_parts = right.to_delayed() 73 | 74 | # Add column w/ hash(v) % nparts 75 | nparts = max(len(left_parts), len(right_parts)) 76 | 77 | left_hashed = group_frame(left_parts, nparts, on) 78 | right_hashed = group_frame(right_parts, nparts, on) 79 | 80 | # Fanout each partition into nparts subgroups 81 | left_subgroups = fanout_subgroups(left_hashed, nparts) 82 | right_subgroups = fanout_subgroups(right_hashed, nparts) 83 | 84 | assert len(left_subgroups) == len(right_subgroups) 85 | 86 | # Concat 87 | left_cats = [delayed(cudf.concat, pure=True)(it) for it in left_subgroups] 88 | right_cats = [delayed(cudf.concat, pure=True)(it) for it in right_subgroups] 89 | 90 | # Combine 91 | merged = [ 92 | delayed(merge, pure=True)(left_cats[i], right_cats[i]) for i in range(nparts) 93 | ] 94 | 95 | return dd.from_delayed(merged, prefix="join_result", meta=empty_frame) 96 | -------------------------------------------------------------------------------- /dask_cudf/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/dask-cudf/b566ab60ea69e6e165533b68b1966875528afb06/dask_cudf/tests/__init__.py -------------------------------------------------------------------------------- /dask_cudf/tests/test_accessor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from pandas.util.testing import assert_series_equal 5 | 6 | import dask_cudf as dgd 7 | from cudf.dataframe import Series 8 | 9 | ############################################################################# 10 | # Datetime Accessor # 11 | ############################################################################# 12 | 13 | 14 | def data_dt_1(): 15 | return pd.date_range("20010101", "20020215", freq="400h") 16 | 17 | 18 | def data_dt_2(): 19 | return 
np.random.randn(100) 20 | 21 | 22 | dt_fields = ["year", "month", "day", "hour", "minute", "second"] 23 | 24 | 25 | @pytest.mark.parametrize("data", [data_dt_2()]) 26 | @pytest.mark.xfail(raises=AttributeError) 27 | def test_datetime_accessor_initialization(data): 28 | pdsr = pd.Series(data.copy()) 29 | sr = Series(pdsr) 30 | dsr = dgd.from_cudf(sr, npartitions=5) 31 | dsr.dt 32 | 33 | 34 | @pytest.mark.parametrize("data", [data_dt_1()]) 35 | def test_series(data): 36 | pdsr = pd.Series(data.copy()) 37 | sr = Series(pdsr) 38 | dsr = dgd.from_cudf(sr, npartitions=5) 39 | 40 | np.testing.assert_equal(np.array(pdsr), np.array(dsr.compute())) 41 | 42 | 43 | @pytest.mark.parametrize("data", [data_dt_1()]) 44 | @pytest.mark.parametrize("field", dt_fields) 45 | def test_dt_series(data, field): 46 | pdsr = pd.Series(data.copy()) 47 | sr = Series(pdsr) 48 | dsr = dgd.from_cudf(sr, npartitions=5) 49 | base = getattr(pdsr.dt, field) 50 | test = getattr(dsr.dt, field).compute().to_pandas().astype("int64") 51 | assert_series_equal(base, test) 52 | 53 | 54 | ############################################################################# 55 | # Categorical Accessor # 56 | ############################################################################# 57 | 58 | 59 | def data_cat_1(): 60 | cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) 61 | return cat 62 | 63 | 64 | def data_cat_2(): 65 | return pd.Series([1, 2, 3]) 66 | 67 | 68 | def data_cat_3(): 69 | cat1 = pd.Categorical( 70 | ["a", "a", "b", "c", "a"], categories=["a", "b", "c"], ordered=True 71 | ) 72 | cat2 = pd.Categorical( 73 | ["a", "b", "a", "c", "b"], categories=["a", "b", "c"], ordered=True 74 | ) 75 | return cat1, cat2 76 | 77 | 78 | @pytest.mark.parametrize("data", [data_cat_2()]) 79 | @pytest.mark.xfail(raises=AttributeError) 80 | def test_categorical_accessor_initialization(data): 81 | sr = Series(data.copy()) 82 | dsr = dgd.from_cudf(sr, npartitions=5) 83 | dsr.cat 84 | 85 | 86 | @pytest.mark.xfail(reason="") 87 | @pytest.mark.parametrize("data", [data_cat_1()]) 88 | def test_categorical_basic(data): 89 | cat = data.copy() 90 | pdsr = pd.Series(cat) 91 | sr = Series(cat) 92 | dsr = dgd.from_cudf(sr, npartitions=2) 93 | result = dsr.compute() 94 | np.testing.assert_array_equal(cat.codes, result.to_array()) 95 | assert dsr.dtype == pdsr.dtype 96 | 97 | # Test attributes 98 | assert pdsr.cat.ordered == dsr.cat.ordered 99 | # TODO: Investigate dsr.cat.categories: It raises 100 | # ValueError: Expected iterable of tuples of (name, dtype), 101 | # got ('a', 'b', 'c') 102 | # assert(tuple(pdsr.cat.categories) == tuple(dsr.cat.categories)) 103 | 104 | np.testing.assert_array_equal(pdsr.cat.codes.data, result.to_array()) 105 | np.testing.assert_array_equal(pdsr.cat.codes.dtype, dsr.cat.codes.dtype) 106 | 107 | string = str(result) 108 | expect_str = """ 109 | 0 a 110 | 1 a 111 | 2 b 112 | 3 c 113 | 4 a 114 | """ 115 | assert all(x == y for x, y in zip(string.split(), expect_str.split())) 116 | 117 | 118 | @pytest.mark.xfail(reason="") 119 | @pytest.mark.parametrize("data", [data_cat_1()]) 120 | def test_categorical_compare_unordered(data): 121 | cat = data.copy() 122 | pdsr = pd.Series(cat) 123 | sr = Series(cat) 124 | dsr = dgd.from_cudf(sr, npartitions=2) 125 | 126 | # Test equality 127 | out = dsr == dsr 128 | assert out.dtype == np.bool_ 129 | assert np.all(out.compute()) 130 | assert np.all(pdsr == pdsr) 131 | 132 | # Test inequality 133 | out = dsr != dsr 134 | assert not np.any(out.compute()) 135 | assert not 
np.any(pdsr != pdsr) 136 | 137 | assert not dsr.cat.ordered 138 | assert not pdsr.cat.ordered 139 | 140 | with pytest.raises((TypeError, ValueError)) as raises: 141 | pdsr < pdsr 142 | 143 | raises.match("Unordered Categoricals can only compare equality or not") 144 | 145 | with pytest.raises((TypeError, ValueError)) as raises: 146 | dsr < dsr 147 | 148 | raises.match("Unordered Categoricals can only compare equality or not") 149 | 150 | 151 | @pytest.mark.parametrize("data", [data_cat_3()]) 152 | def test_categorical_compare_ordered(data): 153 | cat1 = data[0] 154 | cat2 = data[1] 155 | pdsr1 = pd.Series(cat1) 156 | pdsr2 = pd.Series(cat2) 157 | sr1 = Series(cat1) 158 | sr2 = Series(cat2) 159 | dsr1 = dgd.from_cudf(sr1, npartitions=2) 160 | dsr2 = dgd.from_cudf(sr2, npartitions=2) 161 | 162 | # Test equality 163 | out = dsr1 == dsr1 164 | assert out.dtype == np.bool_ 165 | assert np.all(out.compute().to_array()) 166 | assert np.all(pdsr1 == pdsr1) 167 | 168 | # Test inequality 169 | out = dsr1 != dsr1 170 | assert not np.any(out.compute().to_array()) 171 | assert not np.any(pdsr1 != pdsr1) 172 | 173 | assert dsr1.cat.ordered 174 | assert pdsr1.cat.ordered 175 | 176 | # Test ordered operators 177 | np.testing.assert_array_equal(pdsr1 < pdsr2, (dsr1 < dsr2).compute()) 178 | np.testing.assert_array_equal(pdsr1 > pdsr2, (dsr1 > dsr2).compute()) 179 | -------------------------------------------------------------------------------- /dask_cudf/tests/test_batcher_sortnet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | import cudf 5 | from dask_cudf import batcher_sortnet 6 | 7 | 8 | @pytest.mark.parametrize("n", list(range(1, 40))) 9 | def test_padding(n): 10 | data = list(range(n)) 11 | padded, valid = batcher_sortnet._pad_data_to_length(data) 12 | assert len(data) == valid 13 | assert batcher_sortnet.is_power_of_2(len(padded)) 14 | assert valid > len(padded) / 2 15 | assert all(x is not None for x in padded[:valid]) 16 | assert all(x is None for x in padded[valid:]) 17 | 18 | 19 | @pytest.mark.parametrize("seed", [43, 120]) 20 | @pytest.mark.parametrize("nelem", [2, 10, 100]) 21 | def test_compare_frame(seed, nelem): 22 | np.random.seed(seed) 23 | max_part_size = nelem 24 | # Make LHS 25 | lhs = cudf.DataFrame() 26 | lhs["a"] = lhs_a = np.random.random(nelem) 27 | lhs["b"] = lhs_b = np.random.random(nelem) 28 | 29 | # Make RHS 30 | rhs = cudf.DataFrame() 31 | rhs["a"] = rhs_a = np.random.random(nelem) 32 | rhs["b"] = rhs_b = np.random.random(nelem) 33 | 34 | # Sort by column "a" 35 | got_a = batcher_sortnet._compare_frame(lhs, rhs, max_part_size, by="a") 36 | # Check 37 | expect_a = np.hstack([lhs_a, rhs_a]) 38 | expect_a.sort() 39 | np.testing.assert_array_equal(got_a[0].a.to_array(), expect_a[:nelem]) 40 | np.testing.assert_array_equal(got_a[1].a.to_array(), expect_a[nelem:]) 41 | 42 | # Sort by column "b" 43 | got_b = batcher_sortnet._compare_frame(lhs, rhs, max_part_size, by="b") 44 | # Check 45 | expect_b = np.hstack([lhs_b, rhs_b]) 46 | expect_b.sort() 47 | np.testing.assert_array_equal(got_b[0].b.to_array(), expect_b[:nelem]) 48 | np.testing.assert_array_equal(got_b[1].b.to_array(), expect_b[nelem:]) 49 | 50 | 51 | def test_compare_frame_with_none(): 52 | df = cudf.DataFrame() 53 | max_part_size = 1 54 | df["a"] = [0] 55 | res = batcher_sortnet._compare_frame(df, None, max_part_size, by="a") 56 | assert res[0] is not None, res[1] is None 57 | res = batcher_sortnet._compare_frame(None, df, max_part_size, 
by="a") 58 | assert res[0] is not None, res[1] is None 59 | res = batcher_sortnet._compare_frame(None, None, max_part_size, by="a") 60 | assert res == (None, None) 61 | -------------------------------------------------------------------------------- /dask_cudf/tests/test_binops.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | 7 | import cudf 8 | import dask.dataframe as dd 9 | 10 | 11 | def _make_empty_frame(npartitions=2): 12 | df = pd.DataFrame({"x": [], "y": []}) 13 | gdf = cudf.DataFrame.from_pandas(df) 14 | dgf = dd.from_pandas(gdf, npartitions=npartitions) 15 | return dgf 16 | 17 | 18 | def _make_random_frame(nelem, npartitions=2): 19 | df = pd.DataFrame( 20 | {"x": np.random.random(size=nelem), "y": np.random.random(size=nelem)} 21 | ) 22 | gdf = cudf.DataFrame.from_pandas(df) 23 | dgf = dd.from_pandas(gdf, npartitions=npartitions) 24 | return df, dgf 25 | 26 | 27 | def _make_random_frame_float(nelem, npartitions=2): 28 | df = pd.DataFrame( 29 | { 30 | "x": np.random.randint(0, 5, size=nelem), 31 | "y": np.random.normal(size=nelem) + 1, 32 | } 33 | ) 34 | gdf = cudf.from_pandas(df) 35 | dgf = dd.from_pandas(gdf, npartitions=npartitions) 36 | return df, dgf 37 | 38 | 39 | _binops = [ 40 | operator.add, 41 | operator.sub, 42 | operator.mul, 43 | operator.truediv, 44 | operator.floordiv, 45 | operator.eq, 46 | operator.ne, 47 | operator.gt, 48 | operator.ge, 49 | operator.lt, 50 | operator.le, 51 | ] 52 | 53 | 54 | @pytest.mark.parametrize("binop", _binops) 55 | def test_series_binops_integer(binop): 56 | np.random.seed(0) 57 | size = 1000 58 | lhs_df, lhs_gdf = _make_random_frame(size) 59 | rhs_df, rhs_gdf = _make_random_frame(size) 60 | got = binop(lhs_gdf.x, rhs_gdf.y) 61 | exp = binop(lhs_df.x, rhs_df.y) 62 | dd.assert_eq(got, exp) 63 | 64 | 65 | @pytest.mark.parametrize("binop", _binops) 66 | def test_series_binops_float(binop): 67 | np.random.seed(0) 68 | size = 1000 69 | lhs_df, lhs_gdf = _make_random_frame_float(size) 70 | rhs_df, rhs_gdf = _make_random_frame_float(size) 71 | got = binop(lhs_gdf.x, rhs_gdf.y) 72 | exp = binop(lhs_df.x, rhs_df.y) 73 | dd.assert_eq(got, exp) 74 | -------------------------------------------------------------------------------- /dask_cudf/tests/test_core.py: -------------------------------------------------------------------------------- 1 | import dask 2 | import dask.dataframe as dd 3 | import numpy as np 4 | import pandas as pd 5 | import pandas.util.testing as tm 6 | import pytest 7 | 8 | import cudf 9 | import dask_cudf as dgd 10 | 11 | 12 | def test_from_cudf(): 13 | np.random.seed(0) 14 | 15 | df = pd.DataFrame( 16 | {"x": np.random.randint(0, 5, size=10000), "y": np.random.normal(size=10000)} 17 | ) 18 | 19 | gdf = cudf.DataFrame.from_pandas(df) 20 | 21 | # Test simple around to/from dask 22 | ingested = dd.from_pandas(gdf, npartitions=2) 23 | dd.assert_eq(ingested, df) 24 | 25 | # Test conversion to dask.dataframe 26 | ddf = ingested.to_dask_dataframe() 27 | dd.assert_eq(ddf, df) 28 | 29 | 30 | def test_from_cudf_with_generic_idx(): 31 | 32 | cdf = cudf.DataFrame( 33 | [ 34 | ("a", list(range(20))), 35 | ("b", list(reversed(range(20)))), 36 | ("c", list(range(20))), 37 | ] 38 | ) 39 | 40 | ddf = dgd.from_cudf(cdf, npartitions=2) 41 | 42 | assert isinstance(ddf.index.compute(), cudf.dataframe.index.GenericIndex) 43 | dd.assert_eq(ddf.loc[1:2, ["a"]], cdf.loc[1:2, ["a"]]) 44 | 45 | 46 | def _fragmented_gdf(df, 
nsplit): 47 | n = len(df) 48 | 49 | # Split dataframe in *nsplit* 50 | subdivsize = n // nsplit 51 | starts = [i * subdivsize for i in range(nsplit)] 52 | ends = starts[1:] + [None] 53 | frags = [df[s:e] for s, e in zip(starts, ends)] 54 | return frags 55 | 56 | 57 | def test_query(): 58 | np.random.seed(0) 59 | 60 | df = pd.DataFrame( 61 | {"x": np.random.randint(0, 5, size=10), "y": np.random.normal(size=10)} 62 | ) 63 | gdf = cudf.DataFrame.from_pandas(df) 64 | expr = "x > 2" 65 | 66 | dd.assert_eq(gdf.query(expr), df.query(expr)) 67 | 68 | queried = dd.from_pandas(gdf, npartitions=2).query(expr) 69 | 70 | got = queried 71 | expect = gdf.query(expr) 72 | 73 | dd.assert_eq(got, expect) 74 | 75 | 76 | def test_query_local_dict(): 77 | np.random.seed(0) 78 | df = pd.DataFrame( 79 | {"x": np.random.randint(0, 5, size=10), "y": np.random.normal(size=10)} 80 | ) 81 | gdf = cudf.DataFrame.from_pandas(df) 82 | ddf = dgd.from_cudf(gdf, npartitions=2) 83 | 84 | val = 2 85 | 86 | gdf_queried = gdf.query("x > @val") 87 | ddf_queried = ddf.query("x > @val", local_dict={"val": val}) 88 | 89 | dd.assert_eq(gdf_queried, ddf_queried) 90 | 91 | 92 | def test_head(): 93 | np.random.seed(0) 94 | df = pd.DataFrame( 95 | {"x": np.random.randint(0, 5, size=100), "y": np.random.normal(size=100)} 96 | ) 97 | gdf = cudf.DataFrame.from_pandas(df) 98 | dgf = dd.from_pandas(gdf, npartitions=2) 99 | 100 | dd.assert_eq(dgf.head(), df.head()) 101 | 102 | 103 | def test_from_dask_dataframe(): 104 | np.random.seed(0) 105 | df = pd.DataFrame( 106 | {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} 107 | ) 108 | ddf = dd.from_pandas(df, npartitions=2) 109 | dgdf = ddf.map_partitions(cudf.from_pandas) 110 | got = dgdf.compute().to_pandas() 111 | expect = df 112 | 113 | dd.assert_eq(got, expect) 114 | 115 | 116 | @pytest.mark.parametrize("nelem", [10, 200, 1333]) 117 | def test_set_index(nelem): 118 | with dask.config.set(scheduler="single-threaded"): 119 | np.random.seed(0) 120 | # Use unique index range as the sort may not be stable-ordering 121 | x = np.arange(nelem) 122 | np.random.shuffle(x) 123 | df = pd.DataFrame({"x": x, "y": np.random.randint(0, nelem, size=nelem)}) 124 | ddf = dd.from_pandas(df, npartitions=2) 125 | dgdf = ddf.map_partitions(cudf.from_pandas) 126 | 127 | expect = ddf.set_index("x") 128 | got = dgdf.set_index("x") 129 | 130 | dd.assert_eq(expect, got, check_index=False, check_divisions=False) 131 | 132 | 133 | def assert_frame_equal_by_index_group(expect, got): 134 | assert sorted(expect.columns) == sorted(got.columns) 135 | assert sorted(set(got.index)) == sorted(set(expect.index)) 136 | # Note the set_index sort is not stable, 137 | unique_values = sorted(set(got.index)) 138 | for iv in unique_values: 139 | sr_expect = expect.loc[[iv]] 140 | sr_got = got.loc[[iv]] 141 | 142 | for k in expect.columns: 143 | # Sort each column before we compare them 144 | sorted_expect = sr_expect.sort_values(k)[k] 145 | sorted_got = sr_got.sort_values(k)[k] 146 | np.testing.assert_array_equal(sorted_expect, sorted_got) 147 | 148 | 149 | @pytest.mark.parametrize("nelem", [10, 200, 1333]) 150 | def test_set_index_2(nelem): 151 | with dask.config.set(scheduler="single-threaded"): 152 | np.random.seed(0) 153 | df = pd.DataFrame( 154 | { 155 | "x": 100 + np.random.randint(0, nelem // 2, size=nelem), 156 | "y": np.random.normal(size=nelem), 157 | } 158 | ) 159 | expect = df.set_index("x").sort_index() 160 | 161 | dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=4) 162 | res = 
dgf.set_index("x") # sort by default 163 | got = res.compute().to_pandas() 164 | 165 | assert_frame_equal_by_index_group(expect, got) 166 | 167 | 168 | def test_set_index_w_series(): 169 | with dask.config.set(scheduler="single-threaded"): 170 | nelem = 20 171 | np.random.seed(0) 172 | df = pd.DataFrame( 173 | { 174 | "x": 100 + np.random.randint(0, nelem // 2, size=nelem), 175 | "y": np.random.normal(size=nelem), 176 | } 177 | ) 178 | expect = df.set_index(df.x).sort_index() 179 | 180 | dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=4) 181 | res = dgf.set_index(dgf.x) # sort by default 182 | got = res.compute().to_pandas() 183 | 184 | expect.index.name = None 185 | dd.assert_eq(expect, got) 186 | 187 | 188 | def test_assign(): 189 | np.random.seed(0) 190 | df = pd.DataFrame( 191 | {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} 192 | ) 193 | 194 | dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) 195 | pdcol = pd.Series(np.arange(20) + 1000) 196 | newcol = dd.from_pandas(cudf.Series(pdcol), npartitions=dgf.npartitions) 197 | out = dgf.assign(z=newcol) 198 | 199 | got = out 200 | dd.assert_eq(got.loc[:, ["x", "y"]], df) 201 | np.testing.assert_array_equal(got["z"], pdcol) 202 | 203 | 204 | @pytest.mark.parametrize("data_type", ["int8", "int16", "int32", "int64"]) 205 | def test_setitem_scalar_integer(data_type): 206 | np.random.seed(0) 207 | scalar = np.random.randint(0, 100, dtype=data_type) 208 | df = pd.DataFrame( 209 | {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} 210 | ) 211 | dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) 212 | 213 | df["z"] = scalar 214 | dgf["z"] = scalar 215 | 216 | got = dgf.compute().to_pandas() 217 | np.testing.assert_array_equal(got["z"], df["z"]) 218 | 219 | 220 | @pytest.mark.parametrize("data_type", ["float32", "float64"]) 221 | def test_setitem_scalar_float(data_type): 222 | np.random.seed(0) 223 | scalar = np.random.randn(1).astype(data_type)[0] 224 | df = pd.DataFrame( 225 | {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} 226 | ) 227 | dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) 228 | 229 | df["z"] = scalar 230 | dgf["z"] = scalar 231 | 232 | got = dgf.compute().to_pandas() 233 | np.testing.assert_array_equal(got["z"], df["z"]) 234 | 235 | 236 | def test_setitem_scalar_datetime(): 237 | np.random.seed(0) 238 | scalar = np.int64(np.random.randint(0, 100)).astype("datetime64[ms]") 239 | df = pd.DataFrame( 240 | {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} 241 | ) 242 | dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) 243 | 244 | df["z"] = scalar 245 | dgf["z"] = scalar 246 | 247 | got = dgf.compute().to_pandas() 248 | np.testing.assert_array_equal(got["z"], df["z"]) 249 | 250 | 251 | @pytest.mark.parametrize( 252 | "func", 253 | [ 254 | lambda: tm.makeDataFrame().reset_index(), 255 | tm.makeDataFrame, 256 | tm.makeMixedDataFrame, 257 | tm.makeObjectSeries, 258 | tm.makeTimeSeries, 259 | ], 260 | ) 261 | def test_repr(func): 262 | pdf = func() 263 | try: 264 | gdf = cudf.from_pandas(pdf) 265 | except Exception: 266 | raise pytest.xfail() 267 | # gddf = dd.from_pandas(gdf, npartitions=3, sort=False) # TODO 268 | gddf = dd.from_pandas(gdf, npartitions=3, sort=False) 269 | 270 | assert repr(gddf) 271 | if hasattr(pdf, "_repr_html_"): 272 | assert gddf._repr_html_() 273 | 274 | 275 | @pytest.mark.skip(reason="datetime indexes not fully supported in cudf") 276 | 
@pytest.mark.parametrize("start", ["1d", "5d", "1w", "12h"]) 277 | @pytest.mark.parametrize("stop", ["1d", "3d", "8h"]) 278 | def test_repartition_timeseries(start, stop): 279 | # This test is currently absurdly slow. It should not be unskipped without 280 | # slimming it down. 281 | pdf = dask.datasets.timeseries( 282 | "2000-01-01", 283 | "2000-01-31", 284 | freq="1s", 285 | partition_freq=start, 286 | dtypes={"x": int, "y": float}, 287 | ) 288 | gdf = pdf.map_partitions(cudf.DataFrame.from_pandas) 289 | 290 | a = pdf.repartition(freq=stop) 291 | b = gdf.repartition(freq=stop) 292 | assert a.divisions == b.divisions 293 | 294 | dd.utils.assert_eq(a, b) 295 | 296 | 297 | @pytest.mark.parametrize("start", [1, 2, 5]) 298 | @pytest.mark.parametrize("stop", [1, 3, 7]) 299 | def test_repartition_simple_divisions(start, stop): 300 | pdf = pd.DataFrame({"x": range(100)}) 301 | 302 | pdf = dd.from_pandas(pdf, npartitions=start) 303 | gdf = pdf.map_partitions(cudf.DataFrame.from_pandas) 304 | 305 | a = pdf.repartition(npartitions=stop) 306 | b = gdf.repartition(npartitions=stop) 307 | assert a.divisions == b.divisions 308 | 309 | dd.utils.assert_eq(a, b) 310 | 311 | 312 | @pytest.fixture 313 | def pdf(): 314 | return pd.DataFrame( 315 | {"x": [1, 2, 3, 4, 5, 6], "y": [11.0, 12.0, 13.0, 14.0, 15.0, 16.0]} 316 | ) 317 | 318 | 319 | @pytest.fixture 320 | def gdf(pdf): 321 | return cudf.from_pandas(pdf) 322 | 323 | 324 | @pytest.fixture 325 | def ddf(pdf): 326 | return dd.from_pandas(pdf, npartitions=3) 327 | 328 | 329 | @pytest.fixture 330 | def gddf(gdf): 331 | return dd.from_pandas(gdf, npartitions=3) 332 | 333 | 334 | @pytest.mark.parametrize( 335 | "func", 336 | [ 337 | lambda df: df + 1, 338 | lambda df: df.index, 339 | lambda df: df.x.sum(), 340 | lambda df: df.x.astype(float), 341 | lambda df: df.assign(z=df.x.astype("int")), 342 | ], 343 | ) 344 | def test_unary_ops(func, gdf, gddf): 345 | p = func(gdf) 346 | g = func(gddf) 347 | 348 | # Fixed in https://github.com/dask/dask/pull/4657 349 | if isinstance(p, cudf.Index): 350 | from packaging import version 351 | if version.parse(dask.__version__) < version.parse("1.1.6"): 352 | pytest.skip("dask.dataframe assert_eq index check hardcoded to " 353 | "pandas prior to 1.1.6 release") 354 | 355 | dd.assert_eq(p, g, check_names=False) 356 | 357 | 358 | @pytest.mark.parametrize("series", [True, False]) 359 | def test_concat(gdf, gddf, series): 360 | if series: 361 | gdf = gdf.x 362 | gddf = gddf.x 363 | a = cudf.concat([gdf, gdf + 1, gdf + 2]).sort_values("x").reset_index(drop=True) 364 | b = ( 365 | dd.concat([gddf, gddf + 1, gddf + 2], interleave_partitions=True) 366 | .compute() 367 | .sort_values("x") 368 | .reset_index(drop=True) 369 | ) 370 | dd.assert_eq(a, b) 371 | 372 | 373 | def test_boolean_index(gdf, gddf): 374 | 375 | gdf2 = gdf[gdf.x > 2] 376 | gddf2 = gddf[gddf.x > 2] 377 | 378 | dd.assert_eq(gdf2, gddf2) 379 | -------------------------------------------------------------------------------- /dask_cudf/tests/test_delayed_io.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test IO with dask.delayed API 3 | """ 4 | import numpy as np 5 | import pytest 6 | from dask.delayed import delayed 7 | from pandas.util.testing import assert_frame_equal 8 | 9 | import cudf as gd 10 | import dask_cudf as dgd 11 | 12 | 13 | @delayed 14 | def load_data(nelem, ident): 15 | df = gd.DataFrame() 16 | df["x"] = np.arange(nelem) 17 | df["ident"] = np.asarray([ident] * nelem) 18 | return df 19 | 20 | 21 | 
@delayed 22 | def get_combined_column(df): 23 | return df.x * df.ident 24 | 25 | 26 | def test_dataframe_from_delayed(): 27 | delays = [load_data(10 * i, i) for i in range(1, 3)] 28 | out = dgd.from_delayed(delays) 29 | res = out.compute() 30 | assert isinstance(res, gd.DataFrame) 31 | 32 | expected = gd.concat([d.compute() for d in delays]) 33 | assert_frame_equal(res.to_pandas(), expected.to_pandas()) 34 | 35 | 36 | def test_series_from_delayed(): 37 | delays = [get_combined_column(load_data(10 * i, i)) for i in range(1, 3)] 38 | out = dgd.from_delayed(delays) 39 | res = out.compute() 40 | assert isinstance(res, gd.Series) 41 | 42 | expected = gd.concat([d.compute() for d in delays]) 43 | np.testing.assert_array_equal(res.to_pandas(), expected.to_pandas()) 44 | 45 | 46 | def test_dataframe_to_delayed(): 47 | nelem = 100 48 | 49 | df = gd.DataFrame() 50 | df["x"] = np.arange(nelem) 51 | df["y"] = np.random.randint(nelem, size=nelem) 52 | 53 | ddf = dgd.from_cudf(df, npartitions=5) 54 | 55 | delays = ddf.to_delayed() 56 | 57 | assert len(delays) == 5 58 | 59 | # Concat the delayed partitions 60 | got = gd.concat([d.compute() for d in delays]) 61 | assert_frame_equal(got.to_pandas(), df.to_pandas()) 62 | 63 | # Check individual partitions 64 | divs = ddf.divisions 65 | assert len(divs) == len(delays) + 1 66 | 67 | for i, part in enumerate(delays): 68 | s = divs[i] 69 | # The last divisions in the last index 70 | e = None if i + 1 == len(delays) else divs[i + 1] 71 | expect = df[s:e].to_pandas() 72 | got = part.compute().to_pandas() 73 | assert_frame_equal(got, expect) 74 | 75 | 76 | def test_series_to_delayed(): 77 | nelem = 100 78 | 79 | sr = gd.Series(np.random.randint(nelem, size=nelem)) 80 | 81 | dsr = dgd.from_cudf(sr, npartitions=5) 82 | 83 | delays = dsr.to_delayed() 84 | 85 | assert len(delays) == 5 86 | 87 | # Concat the delayed partitions 88 | got = gd.concat([d.compute() for d in delays]) 89 | assert isinstance(got, gd.Series) 90 | np.testing.assert_array_equal(got.to_pandas(), sr.to_pandas()) 91 | 92 | # Check individual partitions 93 | divs = dsr.divisions 94 | assert len(divs) == len(delays) + 1 95 | 96 | for i, part in enumerate(delays): 97 | s = divs[i] 98 | # The last divisions in the last index 99 | e = None if i + 1 == len(delays) else divs[i + 1] 100 | expect = sr[s:e].to_pandas() 101 | got = part.compute().to_pandas() 102 | np.testing.assert_array_equal(got, expect) 103 | 104 | 105 | def test_mixing_series_frame_error(): 106 | nelem = 20 107 | 108 | df = gd.DataFrame() 109 | df["x"] = np.arange(nelem) 110 | df["y"] = np.random.randint(nelem, size=nelem) 111 | 112 | ddf = dgd.from_cudf(df, npartitions=5) 113 | 114 | delay_frame = ddf.to_delayed() 115 | delay_series = ddf.x.to_delayed() 116 | combined = dgd.from_delayed(delay_frame + delay_series) 117 | 118 | with pytest.raises(ValueError) as raises: 119 | combined.compute() 120 | 121 | raises.match(r"^Metadata mismatch found in `from_delayed`.") 122 | 123 | 124 | def test_frame_extra_columns_error(): 125 | nelem = 20 126 | 127 | df = gd.DataFrame() 128 | df["x"] = np.arange(nelem) 129 | df["y"] = np.random.randint(nelem, size=nelem) 130 | ddf1 = dgd.from_cudf(df, npartitions=5) 131 | 132 | df["z"] = np.arange(nelem) 133 | ddf2 = dgd.from_cudf(df, npartitions=5) 134 | 135 | combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed()) 136 | 137 | with pytest.raises(ValueError) as raises: 138 | combined.compute() 139 | 140 | raises.match(r"^Metadata mismatch found in `from_delayed`.") 141 | raises.match(r"z") 142 | 
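# Illustrative sketch (not part of the upstream test suite): the metadata
# mismatch exercised above goes away when every delayed partition carries the
# same columns and dtypes. Uses only names already imported in this module;
# the leading underscore keeps pytest from collecting it as a test.
def _example_matching_metadata(nelem=20):
    df1 = gd.DataFrame()
    df1["x"] = np.arange(nelem)
    df1["y"] = np.random.randint(nelem, size=nelem)

    # Same schema as df1, so the inferred metadata agrees across partitions.
    df2 = gd.DataFrame()
    df2["x"] = np.arange(nelem)
    df2["y"] = np.random.randint(nelem, size=nelem)

    ddf1 = dgd.from_cudf(df1, npartitions=5)
    ddf2 = dgd.from_cudf(df2, npartitions=5)

    combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed())
    return combined.compute()  # no metadata mismatch expected here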
143 | 144 | @pytest.mark.xfail(reason="") 145 | def test_frame_dtype_error(): 146 | nelem = 20 147 | 148 | df1 = gd.DataFrame() 149 | df1["bad"] = np.arange(nelem) 150 | df1["bad"] = np.arange(nelem, dtype=np.float64) 151 | 152 | df2 = gd.DataFrame() 153 | df2["bad"] = np.arange(nelem) 154 | df2["bad"] = np.arange(nelem, dtype=np.float32) 155 | 156 | ddf1 = dgd.from_cudf(df1, npartitions=5) 157 | ddf2 = dgd.from_cudf(df2, npartitions=5) 158 | 159 | combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed()) 160 | 161 | with pytest.raises(ValueError) as raises: 162 | combined.compute() 163 | 164 | raises.match(r"same type") 165 | -------------------------------------------------------------------------------- /dask_cudf/tests/test_distributed.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import cudf 4 | import dask 5 | from dask.distributed import Client 6 | import dask.dataframe as dd 7 | from distributed.utils_test import loop # noqa: F401 8 | 9 | dask_cuda = pytest.importorskip("dask_cuda") 10 | 11 | 12 | @pytest.mark.parametrize("delayed", [True, False]) # noqa: F811 13 | def test_basic(loop, delayed): # noqa: F811 14 | with dask_cuda.LocalCUDACluster(loop=loop) as cluster: 15 | with Client(cluster): 16 | pdf = dask.datasets.timeseries(dtypes={"x": int}).reset_index() 17 | gdf = pdf.map_partitions(cudf.DataFrame.from_pandas) 18 | if delayed: 19 | gdf = dd.from_delayed(gdf.to_delayed()) 20 | dd.assert_eq(pdf.head(), gdf.head()) 21 | -------------------------------------------------------------------------------- /dask_cudf/tests/test_groupby.py: -------------------------------------------------------------------------------- 1 | import dask.dataframe as dd 2 | import dask_cudf 3 | import pandas as pd 4 | import cudf 5 | import numpy as np 6 | 7 | import pytest 8 | 9 | 10 | @pytest.mark.parametrize( 11 | 'agg', 12 | [ 13 | 'sum', 14 | 'mean', 15 | 'count', 16 | 'min', 17 | 'max' 18 | ] 19 | ) 20 | def test_groupby_basic_aggs(agg): 21 | pdf = pd.DataFrame( 22 | {"x": np.random.randint(0, 5, size=10000), "y": np.random.normal(size=10000)} 23 | ) 24 | 25 | gdf = cudf.DataFrame.from_pandas(pdf) 26 | 27 | ddf = dask_cudf.from_cudf(gdf, npartitions=5) 28 | 29 | a = getattr(gdf.groupby("x"), agg)().to_pandas() 30 | b = getattr(ddf.groupby("x"), agg)().compute().to_pandas() 31 | 32 | a.index.name = None 33 | a.name = None 34 | b.index.name = None 35 | b.name = None 36 | 37 | if agg == "count": 38 | a["y"] = a["y"].astype(np.int64) 39 | 40 | dd.assert_eq(a, b) 41 | 42 | 43 | @pytest.mark.parametrize( 44 | "func", 45 | [ 46 | lambda df: df.groupby("x").agg({"y": "max"}), 47 | pytest.param( 48 | lambda df: df.groupby("x").y.agg(["sum", "max"]), 49 | marks=pytest.mark.skip 50 | ) 51 | ], 52 | ) 53 | def test_groupby_agg(func): 54 | pdf = pd.DataFrame( 55 | {"x": np.random.randint(0, 5, size=10000), "y": np.random.normal(size=10000)} 56 | ) 57 | 58 | gdf = cudf.DataFrame.from_pandas(pdf) 59 | 60 | ddf = dask_cudf.from_cudf(gdf, npartitions=5) 61 | 62 | a = func(gdf).to_pandas() 63 | b = func(ddf).compute().to_pandas() 64 | 65 | a.index.name = None 66 | a.name = None 67 | b.index.name = None 68 | b.name = None 69 | 70 | dd.assert_eq(a, b) 71 | 72 | 73 | @pytest.mark.xfail(reason="cudf issues") 74 | @pytest.mark.parametrize( 75 | "func", [lambda df: df.groupby("x").std(), lambda df: df.groupby("x").y.std()] 76 | ) 77 | def test_groupby_std(func): 78 | pdf = pd.DataFrame( 79 | {"x": np.random.randint(0, 5, size=10000), "y": 
np.random.normal(size=10000)} 80 | ) 81 | 82 | gdf = cudf.DataFrame.from_pandas(pdf) 83 | 84 | ddf = dask_cudf.from_cudf(gdf, npartitions=5) 85 | 86 | a = func(gdf.to_pandas()) 87 | b = func(ddf).compute().to_pandas() 88 | 89 | a.index.name = None 90 | a.name = None 91 | b.index.name = None 92 | 93 | dd.assert_eq(a, b) 94 | 95 | 96 | # reason gotattr in cudf 97 | @pytest.mark.parametrize( 98 | "func", 99 | [ 100 | pytest.param( 101 | lambda df: df.groupby(["a", "b"]).x.sum(), 102 | marks=pytest.mark.xfail 103 | ), 104 | pytest.param( 105 | lambda df: df.groupby(["a", "b"]).sum(), marks=pytest.mark.xfail 106 | ), 107 | pytest.param( 108 | lambda df: df.groupby(["a", "b"]).agg({'x', "sum"}), marks=pytest.mark.xfail 109 | ) 110 | ], 111 | ) 112 | def test_groupby_multi_column(func): 113 | pdf = pd.DataFrame( 114 | { 115 | "a": np.random.randint(0, 20, size=1000), 116 | "b": np.random.randint(0, 5, size=1000), 117 | "x": np.random.normal(size=1000), 118 | } 119 | ) 120 | 121 | gdf = cudf.DataFrame.from_pandas(pdf) 122 | 123 | ddf = dask_cudf.from_cudf(gdf, npartitions=5) 124 | 125 | a = func(gdf).to_pandas() 126 | b = func(ddf).compute().to_pandas() 127 | 128 | dd.assert_eq(a, b) 129 | -------------------------------------------------------------------------------- /dask_cudf/tests/test_join.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import pytest 6 | 7 | import cudf 8 | import dask_cudf as dgd 9 | import dask.dataframe as dd 10 | 11 | param_nrows = [5, 10, 50, 100] 12 | 13 | 14 | @pytest.mark.parametrize("left_nrows", param_nrows) 15 | @pytest.mark.parametrize("right_nrows", param_nrows) 16 | @pytest.mark.parametrize("left_nkeys", [4, 5]) 17 | @pytest.mark.parametrize("right_nkeys", [4, 5]) 18 | def test_join_inner(left_nrows, right_nrows, left_nkeys, right_nkeys): 19 | chunksize = 50 20 | 21 | np.random.seed(0) 22 | 23 | # cuDF 24 | left = cudf.DataFrame( 25 | { 26 | "x": np.random.randint(0, left_nkeys, size=left_nrows), 27 | "a": np.arange(left_nrows), 28 | }.items() 29 | ) 30 | right = cudf.DataFrame( 31 | { 32 | "x": np.random.randint(0, right_nkeys, size=right_nrows), 33 | "a": 1000 * np.arange(right_nrows), 34 | }.items() 35 | ) 36 | 37 | expect = left.set_index("x").join( 38 | right.set_index("x"), how="inner", sort=True, lsuffix="l", rsuffix="r" 39 | ) 40 | expect = expect.to_pandas() 41 | 42 | # dask_cudf 43 | left = dgd.from_cudf(left, chunksize=chunksize) 44 | right = dgd.from_cudf(right, chunksize=chunksize) 45 | 46 | joined = left.set_index("x").join( 47 | right.set_index("x"), how="inner", lsuffix="l", rsuffix="r" 48 | ) 49 | got = joined.compute().to_pandas() 50 | 51 | # Check index 52 | np.testing.assert_array_equal(expect.index.values, got.index.values) 53 | 54 | # Check rows in each groups 55 | expect_rows = {} 56 | got_rows = {} 57 | 58 | def gather(df, grows): 59 | grows[df["index"].values[0]] = (set(df.al), set(df.ar)) 60 | 61 | expect.reset_index().groupby("index").apply(partial(gather, grows=expect_rows)) 62 | 63 | expect.reset_index().groupby("index").apply(partial(gather, grows=got_rows)) 64 | 65 | assert got_rows == expect_rows 66 | 67 | 68 | @pytest.mark.parametrize("left_nrows", param_nrows) 69 | @pytest.mark.parametrize("right_nrows", param_nrows) 70 | @pytest.mark.parametrize("left_nkeys", [4, 5]) 71 | @pytest.mark.parametrize("right_nkeys", [4, 5]) 72 | @pytest.mark.parametrize("how", ["left", "right"]) 73 | def 
test_join_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how): 74 | chunksize = 50 75 | 76 | np.random.seed(0) 77 | 78 | # cuDF 79 | left = cudf.DataFrame( 80 | { 81 | "x": np.random.randint(0, left_nkeys, size=left_nrows), 82 | "a": np.arange(left_nrows, dtype=np.float64), 83 | }.items() 84 | ) 85 | right = cudf.DataFrame( 86 | { 87 | "x": np.random.randint(0, right_nkeys, size=right_nrows), 88 | "a": 1000 * np.arange(right_nrows, dtype=np.float64), 89 | }.items() 90 | ) 91 | 92 | expect = left.set_index("x").join( 93 | right.set_index("x"), how=how, sort=True, lsuffix="l", rsuffix="r" 94 | ) 95 | expect = expect.to_pandas() 96 | 97 | # dask_cudf 98 | left = dgd.from_cudf(left, chunksize=chunksize) 99 | right = dgd.from_cudf(right, chunksize=chunksize) 100 | 101 | joined = left.set_index("x").join( 102 | right.set_index("x"), how=how, lsuffix="l", rsuffix="r" 103 | ) 104 | got = joined.compute().to_pandas() 105 | 106 | # Check index 107 | np.testing.assert_array_equal(expect.index.values, got.index.values) 108 | 109 | # Check rows in each groups 110 | expect_rows = {} 111 | got_rows = {} 112 | 113 | def gather(df, grows): 114 | cola = np.sort(np.asarray(df.al)) 115 | colb = np.sort(np.asarray(df.ar)) 116 | 117 | grows[df["index"].values[0]] = (cola, colb) 118 | 119 | expect.reset_index().groupby("index").apply(partial(gather, grows=expect_rows)) 120 | 121 | expect.reset_index().groupby("index").apply(partial(gather, grows=got_rows)) 122 | 123 | for k in expect_rows: 124 | np.testing.assert_array_equal(expect_rows[k][0], got_rows[k][0]) 125 | np.testing.assert_array_equal(expect_rows[k][1], got_rows[k][1]) 126 | 127 | 128 | @pytest.mark.parametrize("left_nrows", param_nrows) 129 | @pytest.mark.parametrize("right_nrows", param_nrows) 130 | @pytest.mark.parametrize("left_nkeys", [4, 5]) 131 | @pytest.mark.parametrize("right_nkeys", [4, 5]) 132 | def test_merge_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how="left"): 133 | chunksize = 3 134 | 135 | np.random.seed(0) 136 | 137 | # cuDF 138 | left = cudf.DataFrame( 139 | { 140 | "x": np.random.randint(0, left_nkeys, size=left_nrows), 141 | "y": np.random.randint(0, left_nkeys, size=left_nrows), 142 | "a": np.arange(left_nrows, dtype=np.float64), 143 | }.items() 144 | ) 145 | right = cudf.DataFrame( 146 | { 147 | "x": np.random.randint(0, right_nkeys, size=right_nrows), 148 | "y": np.random.randint(0, right_nkeys, size=right_nrows), 149 | "a": 1000 * np.arange(right_nrows, dtype=np.float64), 150 | }.items() 151 | ) 152 | 153 | expect = left.merge(right, on=("x", "y"), how=how) 154 | 155 | def normalize(df): 156 | return ( 157 | df.to_pandas().sort_values(["x", "y", "a_x", "a_y"]).reset_index(drop=True) 158 | ) 159 | 160 | # dask_cudf 161 | left = dgd.from_cudf(left, chunksize=chunksize) 162 | right = dgd.from_cudf(right, chunksize=chunksize) 163 | 164 | result = left.merge(right, on=("x", "y"), how=how).compute( 165 | scheduler="single-threaded" 166 | ) 167 | 168 | dd.assert_eq(normalize(expect), normalize(result)) 169 | 170 | 171 | @pytest.mark.parametrize("left_nrows", [2, 5]) 172 | @pytest.mark.parametrize("right_nrows", [5, 10]) 173 | @pytest.mark.parametrize("left_nkeys", [4]) 174 | @pytest.mark.parametrize("right_nkeys", [4]) 175 | def test_merge_1col_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how="left"): 176 | chunksize = 3 177 | 178 | np.random.seed(0) 179 | 180 | # cuDF 181 | left = cudf.DataFrame( 182 | { 183 | "x": np.random.randint(0, left_nkeys, size=left_nrows), 184 | "a": np.arange(left_nrows, 
dtype=np.float64), 185 | }.items() 186 | ) 187 | right = cudf.DataFrame( 188 | { 189 | "x": np.random.randint(0, right_nkeys, size=right_nrows), 190 | "a": 1000 * np.arange(right_nrows, dtype=np.float64), 191 | }.items() 192 | ) 193 | 194 | expect = left.merge(right, on=["x"], how=how) 195 | expect = expect.to_pandas().sort_values(["x", "a_x", "a_y"]).reset_index(drop=True) 196 | 197 | # dask_cudf 198 | left = dgd.from_cudf(left, chunksize=chunksize) 199 | right = dgd.from_cudf(right, chunksize=chunksize) 200 | 201 | joined = left.merge(right, on=["x"], how=how) 202 | 203 | got = joined.compute().to_pandas() 204 | 205 | got = got.sort_values(["x", "a_x", "a_y"]).reset_index(drop=True) 206 | 207 | dd.assert_eq(expect, got) 208 | 209 | 210 | @pytest.mark.parametrize("how", ["inner", "left"]) 211 | def test_indexed_join(how): 212 | p_left = pd.DataFrame({"x": np.arange(10)}, index=np.arange(10) * 2) 213 | p_right = pd.DataFrame({"y": 1}, index=np.arange(15)) 214 | 215 | g_left = cudf.from_pandas(p_left) 216 | g_right = cudf.from_pandas(p_right) 217 | 218 | dg_left = dd.from_pandas(g_left, npartitions=4) 219 | dg_right = dd.from_pandas(g_right, npartitions=5) 220 | 221 | d = g_left.merge(g_right, left_index=True, right_index=True, how=how) 222 | dg = dg_left.merge(dg_right, left_index=True, right_index=True, how=how) 223 | 224 | # occasionally order is not correct (possibly due to hashing in the merge) 225 | d = d.sort_values('x') # index is preserved 226 | dg = dg.sort_values('x') # index is reset -- sort_values will slow test down 227 | 228 | dd.assert_eq(d, dg, check_index=False) 229 | 230 | 231 | @pytest.mark.parametrize("how", ["left", "inner"]) 232 | def test_how(how): 233 | left = cudf.DataFrame({"x": [1, 2, 3, 4, None], "y": [1.0, 2.0, 3.0, 4.0, 0.0]}) 234 | right = cudf.DataFrame({"x": [2, 3, None, 2], "y": [20, 30, 0, 20]}) 235 | 236 | dleft = dd.from_pandas(left, npartitions=2) 237 | dright = dd.from_pandas(right, npartitions=3) 238 | 239 | expected = left.merge(right, how=how, on="x") 240 | result = dleft.merge(dright, how=how, on="x") 241 | 242 | dd.assert_eq( 243 | result.compute().to_pandas().sort_values("x"), 244 | expected.to_pandas().sort_values("x"), 245 | check_index=False, 246 | ) 247 | 248 | 249 | @pytest.mark.parametrize("daskify", [True, False]) 250 | def test_single_dataframe_merge(daskify): 251 | right = cudf.DataFrame({"x": [1, 2, 1, 2], "y": [1, 2, 3, 4]}) 252 | left = cudf.DataFrame({"x": np.arange(100) % 10, "z": np.arange(100)}) 253 | 254 | dleft = dd.from_pandas(left, npartitions=10) 255 | 256 | if daskify: 257 | dright = dd.from_pandas(right, npartitions=1) 258 | else: 259 | dright = right 260 | 261 | expected = left.merge(right, how="inner") 262 | result = dd.merge(dleft, dright, how="inner") 263 | assert len(result.dask) < 25 264 | 265 | dd.assert_eq( 266 | result.compute().to_pandas().sort_values(["z", "y"]), 267 | expected.to_pandas().sort_values(["z", "y"]), 268 | check_index=False, 269 | ) 270 | 271 | 272 | @pytest.mark.parametrize("how", ["inner", "left"]) 273 | @pytest.mark.parametrize("on", ["id_1", ["id_1"], ["id_1", "id_2"]]) 274 | def test_on(how, on): 275 | left = cudf.DataFrame({"id_1": [1, 2, 3, 4, 5], "id_2": [1.0, 2.0, 3.0, 4.0, 0.0]}) 276 | right = cudf.DataFrame({"id_1": [2, 3, None, 2], "id_2": [2.0, 3.0, 4.0, 20]}) 277 | 278 | dleft = dd.from_pandas(left, npartitions=2) 279 | dright = dd.from_pandas(right, npartitions=3) 280 | 281 | expected = left.merge(right, how=how, on=on) 282 | result = dleft.merge(dright, how=how, on=on) 283 | 284 | 
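# Partition-wise merges do not guarantee row order, so compare the frames
# after sorting on the join key(s) and ignore the index below.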
dd.assert_eq( 285 | result.compute().to_pandas().sort_values(on), 286 | expected.to_pandas().sort_values(on), 287 | check_index=False, 288 | ) 289 | 290 | 291 | def test_single_partition(): 292 | left = cudf.DataFrame({"x": range(200), "y": range(200)}) 293 | right = cudf.DataFrame({"x": range(100), "z": range(100)}) 294 | 295 | dleft = dd.from_pandas(left, npartitions=1) 296 | dright = dd.from_pandas(right, npartitions=10) 297 | 298 | m = dleft.merge(dright, how="inner") 299 | assert len(m.dask) < len(dleft.dask) + len(dright.dask) * 3 300 | 301 | dleft = dd.from_pandas(left, npartitions=5) 302 | m2 = dleft.merge(right, how="inner") 303 | assert len(m2.dask) < len(dleft.dask) * 3 304 | assert len(m2) == 100 305 | -------------------------------------------------------------------------------- /dask_cudf/tests/test_reductions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from dask.dataframe.utils import assert_eq 5 | 6 | import cudf as gd 7 | import dask_cudf as dgd 8 | 9 | 10 | def _make_random_frame(nelem, npartitions=2): 11 | df = pd.DataFrame( 12 | { 13 | "x": np.random.randint(0, 5, size=nelem), 14 | "y": np.random.normal(size=nelem) + 1, 15 | } 16 | ) 17 | gdf = gd.DataFrame.from_pandas(df) 18 | dgf = dgd.from_cudf(gdf, npartitions=npartitions) 19 | return df, dgf 20 | 21 | 22 | _reducers = ["sum", "count", "mean", "var", "std", "min", "max"] 23 | 24 | 25 | def _get_reduce_fn(name): 26 | def wrapped(series): 27 | fn = getattr(series, name) 28 | return fn() 29 | 30 | return wrapped 31 | 32 | 33 | @pytest.mark.parametrize("reducer", _reducers) 34 | def test_series_reduce(reducer): 35 | reducer = _get_reduce_fn(reducer) 36 | np.random.seed(0) 37 | size = 10 38 | df, gdf = _make_random_frame(size) 39 | 40 | got = reducer(gdf.x) 41 | exp = reducer(df.x) 42 | assert_eq(got, exp) 43 | -------------------------------------------------------------------------------- /dask_cudf/tests/test_sort.py: -------------------------------------------------------------------------------- 1 | import dask 2 | import numpy as np 3 | import pandas as pd 4 | import pytest 5 | 6 | import cudf 7 | import dask.dataframe as dd 8 | 9 | 10 | @pytest.mark.parametrize("by", ["a", "b"]) 11 | @pytest.mark.parametrize("nelem", [10, 100, 1000]) 12 | @pytest.mark.parametrize("nparts", [1, 2, 5, 10]) 13 | def test_sort_values(nelem, nparts, by): 14 | df = cudf.DataFrame() 15 | df["a"] = np.ascontiguousarray(np.arange(nelem)[::-1]) 16 | df["b"] = np.arange(100, nelem + 100) 17 | ddf = dd.from_pandas(df, npartitions=nparts) 18 | 19 | with dask.config.set(scheduler="single-threaded"): 20 | got = ddf.sort_values(by=by).compute().to_pandas() 21 | expect = df.sort_values(by=by).to_pandas().reset_index(drop=True) 22 | pd.util.testing.assert_frame_equal(got, expect) 23 | 24 | 25 | def test_sort_values_binned(): 26 | np.random.seed(43) 27 | nelem = 100 28 | nparts = 5 29 | by = "a" 30 | df = cudf.DataFrame() 31 | df["a"] = np.random.randint(1, 5, nelem) 32 | ddf = dd.from_pandas(df, npartitions=nparts) 33 | 34 | parts = ddf.sort_values_binned(by=by).to_delayed() 35 | part_uniques = [] 36 | for i, p in enumerate(parts): 37 | part = dask.compute(p)[0] 38 | part_uniques.append(set(part.a.unique())) 39 | 40 | # Partitions do not have intersecting keys 41 | for i in range(len(part_uniques)): 42 | for j in range(i + 1, len(part_uniques)): 43 | assert not ( 44 | part_uniques[i] & part_uniques[j] 45 | ), "should have empty 
intersection" 46 | 47 | 48 | def test_sort_binned_meta(): 49 | df = cudf.DataFrame([("a", [0, 1, 2, 3, 4]), ("b", [5, 6, 7, 7, 8])]) 50 | ddf = dd.from_pandas(df, npartitions=2).persist() 51 | 52 | ddf.sort_values_binned(by="b") 53 | -------------------------------------------------------------------------------- /gpuci_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/lib 4 | CC=/usr/bin/gcc 5 | CXX=/usr/bin/g++ 6 | DASKCUDF_REPO=https://github.com/rapidsai/dask_cudf 7 | NUMBA_VERSION=0.40.0 8 | NUMPY_VERSION=1.14.5 9 | PANDAS_VERSION=0.20.3 10 | PYTHON_VERSION=3.5 11 | PYARROW_VERSION=0.10.0 12 | 13 | function logger() { 14 | echo -e "\n>>>> $@\n" 15 | } 16 | 17 | logger "Check environment..." 18 | env 19 | 20 | logger "Check GPU usage..." 21 | nvidia-smi 22 | 23 | logger "Create conda env..." 24 | rm -rf /home/jenkins/.conda/envs/daskcudf 25 | conda create -n daskcudf python=${PYTHON_VERSION} 26 | conda install -n daskcudf -y -c rapidsai -c numba -c conda-forge -c defaults \ 27 | numba=${NUMBA_VERSION} \ 28 | numpy=${NUMPY_VERSION} \ 29 | pandas=${PANDAS_VERSION} \ 30 | pyarrow=${PYARROW_VERSION} \ 31 | pytest \ 32 | dask \ 33 | cudf 34 | 35 | 36 | logger "Activate conda env..." 37 | source activate daskcudf 38 | 39 | logger "Check versions..." 40 | python --version 41 | gcc --version 42 | g++ --version 43 | conda list 44 | 45 | logger "Clone dask_cudf..." 46 | rm -rf $WORKSPACE/daskcudf 47 | git clone --recurse-submodules ${DASKCUDF_REPO} $WORKSPACE/daskcudf 48 | 49 | 50 | logger "Build dask_cudf..." 51 | cd $WORKSPACE 52 | python setup.py install 53 | 54 | logger "Check GPU usage..." 55 | nvidia-smi 56 | 57 | logger "Test dask_cudf..." 
58 | py.test --cache-clear --junitxml=junit.xml --ignore=daskcudf -v 59 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cudf>=0.7* 2 | dask>=1.2.2 3 | distributed>=1.26 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [versioneer] 2 | VCS = git 3 | style = pep440 4 | versionfile_source = dask_cudf/_version.py 5 | versionfile_build = dask_cudf/_version.py 6 | tag_prefix = 7 | parentdir_prefix = dask_cudf- 8 | 9 | [flake8] 10 | exclude = docs, __init__.py 11 | max-line-length = 88 12 | ignore = 13 | # Assigning lambda expression 14 | E731 15 | # Ambiguous variable names 16 | E741 17 | # line break before binary operator 18 | W503 19 | # whitespace before : 20 | E203 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from codecs import open 3 | 4 | from setuptools import find_packages, setup 5 | 6 | # Get the long description from the README file 7 | with open(os.path.join(os.path.dirname(__file__), "README.md")) as f: 8 | long_description = f.read() 9 | 10 | version = os.environ.get("GIT_DESCRIBE_TAG", "0.0.0.dev0").lstrip("v") 11 | setup( 12 | name="dask-cudf", 13 | version=version, 14 | description="Utilities for Dask and cuDF interactions", 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | url="https://github.com/rapidsai/dask-cudf", 18 | author="NVIDIA Corporation", 19 | license="Apache 2.0", 20 | classifiers=[ 21 | "Intended Audience :: Developers", 22 | "Topic :: Database", 23 | "Topic :: Scientific/Engineering", 24 | "License :: OSI Approved :: Apache Software License", 25 | "Programming Language :: Python :: 3.6", 26 | "Programming Language :: Python :: 3.7", 27 | ], 28 | packages=find_packages(exclude=["docs", "tests", "tests.*", "docs.*"]), 29 | install_requires=open("requirements.txt").read().strip().split("\n"), 30 | ) 31 | --------------------------------------------------------------------------------