├── .coveragerc ├── .gitattributes ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.rst ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── ci ├── checks │ └── style.sh ├── cpu │ ├── build.sh │ └── dask-cudf │ │ ├── build_dask_cudf.sh │ │ └── upload-anaconda.sh ├── gpu │ └── build.sh └── release │ └── update-version.sh ├── conda ├── envs │ └── dev-environment.yml └── recipes │ └── dask-cudf │ ├── build.sh │ └── meta.yaml ├── dask_cudf ├── DASK_LICENSE.txt ├── __init__.py ├── _version.py ├── accessor.py ├── backends.py ├── batcher_sortnet.py ├── core.py ├── io │ ├── __init__.py │ ├── csv.py │ ├── json.py │ ├── orc.py │ ├── parquet.py │ └── tests │ │ ├── __init__.py │ │ ├── sample.orc │ │ ├── test_csv.py │ │ ├── test_json.py │ │ ├── test_orc.py │ │ ├── test_parquet.py │ │ └── test_s3.py ├── join_impl.py └── tests │ ├── __init__.py │ ├── test_accessor.py │ ├── test_batcher_sortnet.py │ ├── test_binops.py │ ├── test_core.py │ ├── test_delayed_io.py │ ├── test_distributed.py │ ├── test_groupby.py │ ├── test_join.py │ ├── test_reductions.py │ └── test_sort.py ├── gpuci_build.sh ├── requirements.txt ├── setup.cfg ├── setup.py └── versioneer.py /.coveragerc: -------------------------------------------------------------------------------- 1 | # Configuration file for Python coverage tests 2 | [run] 3 | include = dask_cudf/* 4 | omit = dask_cudf/tests/* 5 | 6 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | dask_cudf/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # vscode 104 | .vscode 105 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: stable 4 | hooks: 5 | - id: black 6 | language_version: python3.6 7 | - repo: https://github.com/pre-commit/pre-commit-hooks 8 | rev: v1.2.3 9 | hooks: 10 | - id: flake8 11 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | dask-cudf 0.8.0 (27 June 2019) 2 | ----------------------------- 3 | 4 | 5 | dask-cudf 0.7.0 (10 May 2019) 6 | ----------------------------- 7 | 8 | - Remove dependency on libgdf_cffi (#228) `Keith Kraus`_ 9 | - Update build process `Rick Ratzel`_ 10 | - Convert query to use standard dask query and update GPUCI to cudf 0.7 (#196) `Nick Becker`_ 11 | - Update GPU CI to use cudf 0.7 (#204) `Nick Becker`_ 12 | - Route single-partition merge cases through dask.dataframe (#194) `Matthew Rocklin`_ 13 | - Avoid compression warning in read_csv if chunksize=None (#192) `Matthew Rocklin`_ 14 | - Fix classifier (#182) `Ray Douglass`_ 15 | - Fix gpuCI build script (#173) `Dillon Cullinan`_ 16 | 17 | 18 | 0.6.1 - 2019-04-09 19 | ------------------ 20 | 21 | - Add cudf.DataFrame.mean = None (#205) `Matthew Rocklin`_ 22 | 23 | 24 | dask-cudf 0.6.0 (22 Mar 2019) 25 | ----------------------------- 26 | 27 | In this release we aligned Dask cuDF to the mainline Dask Dataframe 28 | codebase. This was made possible by an alignment of cuDF to Pandas, and 29 | resulted in us maintaining much less code in this repository. Dask cuDF 30 | dataframes are now just Dask DataFrames that contain cuDF dataframes, and have 31 | a few extra methods. 
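For example, operations on a dask-cudf DataFrame now flow through the regular Dask DataFrame machinery; the sketch below is illustrative only (the CSV glob and the ``key``/``x`` column names are assumed, not taken from this repository)::

    import dask.dataframe as dd
    import dask_cudf

    ddf = dask_cudf.read_csv("data-*.csv")   # hypothetical input files
    assert isinstance(ddf, dd.DataFrame)     # it is a mainline Dask DataFrame
    result = ddf.groupby("key").x.mean().compute()  # uses the mainline groupby/aggregation path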
32 | 33 | - Bump cudf to 0.6 (#162) `Keith Kraus`_ 34 | - Fix upload-anaconda to find the right package (#159) `Ray Douglass`_ 35 | - Add gpuCI (#151) `Mike Wendt`_ 36 | - Skip s3fs tests before importing dask.bytes.s3 (#153) `Matthew Rocklin`_ 37 | - Raise FileNotFoundError if no files found (#145) `Benjamin Zaitlen`_ 38 | - Add tests for repartition and indexed joins (#91) `Matthew Rocklin`_ 39 | - URLs for CSVs (#122) `Benjamin Zaitlen`_ 40 | - Rely on mainline concat function (#126) `Matthew Rocklin`_ 41 | - add test for generic idx test using loc (#121) `Benjamin Zaitlen`_ 42 | - Fix gzip `Benjamin Zaitlen`_ 43 | - Replace custom make_meta with mainline make_meta (#105) `Matthew Rocklin`_ 44 | - Cleanup dead code (#99) `Matthew Rocklin`_ 45 | - Remove from_cudf and from_dask_dataframe functions (#98) `Matthew Rocklin`_ 46 | - Increase default chunk size in read_csv (#95) `Matthew Rocklin`_ 47 | - Remove assertions outlawing inner joins (#89) `Matthew Rocklin`_ 48 | - Fix reset_index(drop=) keyword handling (#94) `Matthew Rocklin`_ 49 | - Add index= keyword to make_meta dispatch functions `Matthew Rocklin`_ 50 | - Use mainline groupby aggregation codebase (#69) `Matthew Rocklin`_ 51 | - remove dtype inference on chunks of data when parsing csv (#86) `Matthew Rocklin`_ 52 | - Avoid format strings to support Python 3.5 `Matthew Rocklin`_ 53 | - use byte_range when reading CSVs (#78) `Benjamin Zaitlen`_ 54 | - Move cudf dask backends code to backends file here (#75) `Matthew Rocklin`_ 55 | - Clean up join code (#70) `Matthew Rocklin`_ 56 | - Replace pygdf with cudf in README (#65) `Matthew Rocklin`_ 57 | - Add dask_cudf.io to setup.py packages (#60) `Matthew Rocklin`_ 58 | - Add basic read_csv implementation (#58) `Matthew Rocklin`_ 59 | - Add tests for repr (#56) `Matthew Rocklin`_ 60 | - Rename gd to cudf in tests `Matthew Rocklin`_ 61 | - add style instructions to README `Matthew Rocklin`_ 62 | - Apply isort to code `Matthew Rocklin`_ 63 | - Add pre-commit-config.yaml including black and flake8 `Matthew Rocklin`_ 64 | - Inherit from Dask Dataframe and respond to cudf update (#48) `Matthew Rocklin`_ 65 | - updating for new cuDF API `Matthew Jones`_ 66 | - add orc reader (#220) `Benjamin Zaitlen`_ 67 | 68 | .. _`Matthew Jones`: https://github.com/mt-jones 69 | .. _`Keith Kraus`: https://github.com/kkraus14 70 | .. _`Ray Douglass`: https://github.com/raydouglass 71 | .. _`Matthew Rocklin`: https://github.com/mrocklin 72 | .. _`Benjamin Zaitlen`: https://github.com/quasiben 73 | .. _`Mike Wendt`: https://github.com/mike-wendt 74 | .. _`Dillon Cullinan`: https://github.com/dillon-cullinan 75 | .. _`Nick Becker`: https://github.com/beckernick 76 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include dask_cudf *.py 2 | include versioneer.py 3 | include setup.py 4 | include README.rst 5 | include LICENSE.txt 6 | include MANIFEST.in 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dask GPU Dataframes 2 | 3 | A partitioned gpu-backed dataframe, using Dask. 4 | 5 | ## Setup from source 6 | 7 | Setup from source repo: 8 | 9 | 1. Install dependencies into a new conda environment where `CUDA_VERSION` is either 9.2 or 10 10 | 11 | conda create -n dask-cudf \ 12 | -c rapidsai -c numba -c conda-forge -c defaults \ 13 | cudf dask cudatoolkit={CUDA_VERSION} 14 | 15 | 2. Activate conda environment: 16 | 17 | source activate dask-cudf 18 | 19 | 3. Clone `dask-cudf` repo: 20 | 21 | git clone https://github.com/rapidsai/dask-cudf 22 | 23 | 4. Install from source: 24 | 25 | cd dask-cudf 26 | pip install . 27 | 28 | ## Test 29 | 30 | 1. Install `pytest` 31 | 32 | conda install pytest 33 | 34 | 2. Run all tests: 35 | 36 | py.test dask_cudf 37 | 38 | 3. Or, run individual tests: 39 | 40 | py.test dask_cudf/tests/test_file.py 41 | 42 | ## Style 43 | 44 | For style we use `black`, `isort`, and `flake8`. These are available as 45 | pre-commit hooks that will run every time you are about to commit code. 46 | 47 | From the root directory of this project run the following: 48 | 49 | ``` 50 | pip install pre-commit 51 | pre-commit install 52 | ``` 53 | -------------------------------------------------------------------------------- /ci/checks/style.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2019, NVIDIA CORPORATION. 3 | ########################## 4 | # dask-cudf Style Tester # 5 | ########################## 6 | 7 | # Ignore errors and set path 8 | set +e 9 | PATH=/conda/bin:$PATH 10 | 11 | # Activate common conda env 12 | source activate gdf 13 | 14 | # Run flake8 and get results/return code 15 | FLAKE=`flake8 python` 16 | RETVAL=$? 17 | 18 | # Output results if failure otherwise show pass 19 | if [ "$FLAKE" != "" ]; then 20 | echo -e "\n\n>>>> FAILED: flake8 style check; begin output\n\n" 21 | echo -e "$FLAKE" 22 | echo -e "\n\n>>>> FAILED: flake8 style check; end output\n\n" 23 | else 24 | echo -e "\n\n>>>> PASSED: flake8 style check\n\n" 25 | fi 26 | 27 | exit $RETVAL -------------------------------------------------------------------------------- /ci/cpu/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2019, NVIDIA CORPORATION. 
3 | ########################################### 4 | # dask-cudf CPU conda build script for CI # 5 | ########################################### 6 | set -e 7 | 8 | # Logger function for build status output 9 | function logger() { 10 | echo -e "\n>>>> $@\n" 11 | } 12 | 13 | # Set path and build parallel level 14 | export PATH=/conda/bin:/usr/local/cuda/bin:$PATH 15 | export PARALLEL_LEVEL=4 16 | 17 | # Set home to the job's workspace 18 | export HOME=$WORKSPACE 19 | 20 | # Switch to project root; also root of repo checkout 21 | cd $WORKSPACE 22 | 23 | # Get latest tag and number of commits since tag 24 | export GIT_DESCRIBE_TAG=`git describe --abbrev=0 --tags` 25 | export GIT_DESCRIBE_NUMBER=`git rev-list ${GIT_DESCRIBE_TAG}..HEAD --count` 26 | 27 | ################################################################################ 28 | # SETUP - Check environment 29 | ################################################################################ 30 | 31 | logger "Get env..." 32 | env 33 | 34 | logger "Activate conda env..." 35 | source activate gdf 36 | 37 | logger "Check versions..." 38 | python --version 39 | gcc --version 40 | g++ --version 41 | conda list 42 | 43 | # FIX Added to deal with Anancoda SSL verification issues during conda builds 44 | conda config --set ssl_verify False 45 | 46 | ################################################################################ 47 | # INSTALL - Install NVIDIA driver 48 | ################################################################################ 49 | 50 | logger "Install NVIDIA driver for CUDA $CUDA..." 51 | apt-get update -q 52 | DRIVER_VER="396.44-1" 53 | LIBCUDA_VER="396" 54 | if [ "$CUDA" == "10.0" ]; then 55 | DRIVER_VER="410.72-1" 56 | LIBCUDA_VER="410" 57 | fi 58 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 59 | cuda-drivers=${DRIVER_VER} libcuda1-${LIBCUDA_VER} 60 | 61 | ################################################################################ 62 | # BUILD - Conda package builds (conda deps: cudf <- dask-cudf) 63 | ################################################################################ 64 | 65 | logger "Build conda pkg for dask-cudf..." 66 | source ci/cpu/dask-cudf/build_dask_cudf.sh 67 | 68 | ################################################################################ 69 | # UPLOAD - Conda packages 70 | ################################################################################ 71 | 72 | logger "Upload conda pkg..." 
73 | source ci/cpu/dask-cudf/upload-anaconda.sh 74 | -------------------------------------------------------------------------------- /ci/cpu/dask-cudf/build_dask_cudf.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Logger function for build status output 5 | function logger() { 6 | echo -e "\n>>>> $@\n" 7 | } 8 | 9 | logger "Building dask_cudf" 10 | conda build conda/recipes/dask-cudf -c nvidia -c rapidsai -c rapidsai-nightly -c numba -c defaults -c conda-forge --python=$PYTHON 11 | -------------------------------------------------------------------------------- /ci/cpu/dask-cudf/upload-anaconda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Adopted from https://github.com/tmcdonell/travis-scripts/blob/dfaac280ac2082cd6bcaba3217428347899f2975/update-accelerate-buildbot.sh 4 | export UPLOADFILE=`conda build conda/recipes/dask-cudf -c nvidia -c rapidsai -c rapidsai-nightly -c numba -c defaults -c conda-forge --python=$PYTHON --output` 5 | 6 | set -e 7 | 8 | SOURCE_BRANCH=master 9 | 10 | test -e ${UPLOADFILE} 11 | 12 | LABEL_OPTION="--label main --label cuda9.2 --label cuda10.0" 13 | 14 | # Restrict uploads to master branch 15 | if [ ${GIT_BRANCH} != ${SOURCE_BRANCH} ]; then 16 | echo "Skipping upload" 17 | return 0 18 | fi 19 | 20 | if [ -z "$MY_UPLOAD_KEY" ]; then 21 | echo "No upload key" 22 | return 0 23 | fi 24 | 25 | echo "LABEL_OPTION=${LABEL_OPTION}" 26 | 27 | echo "Upload" 28 | echo ${UPLOADFILE} 29 | anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --force ${UPLOADFILE} 30 | -------------------------------------------------------------------------------- /ci/gpu/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2019, NVIDIA CORPORATION. 3 | ############################################## 4 | # dask-cudf GPU build and test script for CI # 5 | ############################################## 6 | set -e 7 | 8 | # Logger function for build status output 9 | function logger() { 10 | echo -e "\n>>>> $@\n" 11 | } 12 | 13 | # Set path and build parallel level 14 | export PATH=/conda/bin:/usr/local/cuda/bin:$PATH 15 | export PARALLEL_LEVEL=4 16 | export CUDA_REL=${CUDA_VERSION%.*} 17 | 18 | # Set home to the job's workspace 19 | export HOME=$WORKSPACE 20 | 21 | ################################################################################ 22 | # SETUP - Check environment 23 | ################################################################################ 24 | 25 | logger "Check environment..." 26 | env 27 | 28 | logger "Check GPU usage..." 29 | nvidia-smi 30 | 31 | logger "Activate conda env..." 32 | source activate gdf 33 | 34 | logger "Check versions..." 35 | python --version 36 | $CC --version 37 | $CXX --version 38 | 39 | logger "Setup new environment..." 40 | conda install \ 41 | 'cudf=0.8*' \ 42 | 'pyarrow=0.12.1' \ 43 | 'dask>=1.1.5' 44 | pip install git+https://github.com/dask/dask.git --upgrade --no-deps 45 | 46 | conda list 47 | 48 | logger "Python py.test for dask-cudf..." 49 | cd $WORKSPACE 50 | pip install -e . 
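# Run the dask_cudf test suite with coverage (settings come from the .coveragerc shown above) and write a JUnit XML report for CI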
51 | 52 | py.test --cache-clear --junitxml=${WORKSPACE}/junit-dask-cudf.xml -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:${WORKSPACE}/dask-cudf-coverage.xml --cov-report term 53 | 54 | -------------------------------------------------------------------------------- /ci/release/update-version.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ############################# 3 | # dask-cudf Version Updater # 4 | ############################# 5 | 6 | ## Usage 7 | # bash update-version.sh <release_type> 8 | # where <release_type> is either `major`, `minor`, `patch` 9 | 10 | set -e 11 | 12 | # Grab argument for release type 13 | RELEASE_TYPE=$1 14 | 15 | # Get current version and calculate next versions 16 | CURRENT_TAG=`git tag | grep -xE 'v[0-9\.]+' | sort --version-sort | tail -n 1 | tr -d 'v'` 17 | CURRENT_MAJOR=`echo $CURRENT_TAG | awk '{split($0, a, "."); print a[1]}'` 18 | CURRENT_MINOR=`echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}'` 19 | CURRENT_PATCH=`echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}'` 20 | NEXT_MAJOR=$((CURRENT_MAJOR + 1)) 21 | NEXT_MINOR=$((CURRENT_MINOR + 1)) 22 | NEXT_PATCH=$((CURRENT_PATCH + 1)) 23 | NEXT_FULL_TAG="" 24 | NEXT_SHORT_TAG="" 25 | 26 | # Determine release type 27 | if [ "$RELEASE_TYPE" == "major" ]; then 28 | NEXT_FULL_TAG="${NEXT_MAJOR}.0.0" 29 | NEXT_SHORT_TAG="${NEXT_MAJOR}.0" 30 | elif [ "$RELEASE_TYPE" == "minor" ]; then 31 | NEXT_FULL_TAG="${CURRENT_MAJOR}.${NEXT_MINOR}.0" 32 | NEXT_SHORT_TAG="${CURRENT_MAJOR}.${NEXT_MINOR}" 33 | elif [ "$RELEASE_TYPE" == "patch" ]; then 34 | NEXT_FULL_TAG="${CURRENT_MAJOR}.${CURRENT_MINOR}.${NEXT_PATCH}" 35 | NEXT_SHORT_TAG="${CURRENT_MAJOR}.${CURRENT_MINOR}" 36 | else 37 | echo "Incorrect release type; use 'major', 'minor', or 'patch' as an argument" 38 | exit 1 39 | fi 40 | 41 | echo "Preparing '$RELEASE_TYPE' release [$CURRENT_TAG -> $NEXT_FULL_TAG]" 42 | 43 | # In-place sed replace; workaround for Linux and Mac 44 | function sed_runner() { 45 | sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak 46 | } 47 | 48 | #No-op -------------------------------------------------------------------------------- /conda/envs/dev-environment.yml: -------------------------------------------------------------------------------- 1 | name: dask-cudf-dev 2 | channels: 3 | - nvidia 4 | - rapidsai 5 | - rapidsai-nightly 6 | - conda-forge 7 | - defaults 8 | dependencies: 9 | - cmake>=3.12 10 | - python>=3.6,<3.8 11 | - numba>=0.41 12 | - pandas>=0.23.4 13 | - pyarrow=0.12.1 14 | - notebook>=0.5.0 15 | - nvstrings 16 | - cython>=0.29,<0.30 17 | - pytest 18 | - sphinx 19 | - sphinx_rtd_theme 20 | - sphinxcontrib-websupport 21 | - nbsphinx 22 | - numpydoc 23 | - ipython 24 | - recommonmark 25 | - pytest 26 | - partd 27 | - moto 28 | - boto3 29 | - httpretty 30 | - flake8 31 | - dask 32 | - s3fs 33 | - pip: 34 | - git+https://github.com/dask/dask.git 35 | - git+https://github.com/dask/distributed.git 36 | 37 | -------------------------------------------------------------------------------- /conda/recipes/dask-cudf/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python setup.py install --single-version-externally-managed --record=record.txt 3 | -------------------------------------------------------------------------------- /conda/recipes/dask-cudf/meta.yaml: -------------------------------------------------------------------------------- 1 | # Usage: 2 | # conda build -c defaults -c conda-forge .
3 | {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} 4 | {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} 5 | {% set git_revision_count=environ.get('GIT_DESCRIBE_NUMBER', 0) %} 6 | {% set py_version=environ.get('CONDA_PY', 36) %} 7 | package: 8 | name: dask-cudf 9 | version: {{ version }} 10 | 11 | source: 12 | path: ../../.. 13 | 14 | build: 15 | number: {{ git_revision_count }} 16 | string: py{{ py_version }}_{{ git_revision_count }} 17 | 18 | requirements: 19 | host: 20 | - python x.x 21 | - cudf {{minor_version}}.* 22 | - dask >=1.2.2 23 | - distributed >=1.23.0 24 | run: 25 | - python x.x 26 | - cudf {{minor_version}}.* 27 | - dask >=1.2.2 28 | - distributed >=1.23.0 29 | test: 30 | imports: 31 | - dask_cudf 32 | 33 | about: 34 | home: http://rapids.ai 35 | license: Apache 36 | license_file: ../../../LICENSE.txt 37 | summary: dask-cudf library 38 | -------------------------------------------------------------------------------- /dask_cudf/DASK_LICENSE.txt: -------------------------------------------------------------------------------- 1 | This library contains modified code from the Dask library 2 | (https://github.com/dask/dask). The original Dask license is below. 3 | 4 | Copyright (c) 2014-2017, Continuum Analytics, Inc. and contributors 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | 13 | Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | Neither the name of Continuum Analytics nor the names of any contributors 18 | may be used to endorse or promote products derived from this software 19 | without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 25 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 31 | THE POSSIBILITY OF SUCH DAMAGE. 32 | -------------------------------------------------------------------------------- /dask_cudf/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import ( 2 | DataFrame, 3 | Series, 4 | from_cudf, 5 | from_dask_dataframe, 6 | concat, 7 | from_delayed, 8 | ) 9 | from .io import read_csv, read_orc, read_json, read_parquet 10 | from . 
import backends 11 | 12 | import cudf 13 | from cudf._version import get_versions 14 | 15 | __version__ = get_versions()["version"] 16 | del get_versions 17 | 18 | __all__ = [ 19 | "DataFrame", 20 | "Series", 21 | "from_cudf", 22 | "from_dask_dataframe", 23 | "concat", 24 | "from_delayed", 25 | ] 26 | 27 | if not hasattr(cudf.DataFrame, "mean"): 28 | cudf.DataFrame.mean = None 29 | del cudf 30 | -------------------------------------------------------------------------------- /dask_cudf/_version.py: -------------------------------------------------------------------------------- 1 | # This file helps to compute a version number in source trees obtained from 2 | # git-archive tarball (such as those provided by githubs download-from-tag 3 | # feature). Distribution tarballs (built by setup.py sdist) and build 4 | # directories (produced by setup.py build) will contain a much shorter file 5 | # that just contains the computed version number. 6 | 7 | # This file is released into the public domain. Generated by 8 | # versioneer-0.18 (https://github.com/warner/python-versioneer) 9 | 10 | """Git implementation of _version.py.""" 11 | 12 | import errno 13 | import os 14 | import re 15 | import subprocess 16 | import sys 17 | 18 | 19 | def get_keywords(): 20 | """Get the keywords needed to look up the version information.""" 21 | # these strings will be replaced by git during git-archive. 22 | # setup.py/versioneer.py will grep for the variable names, so they must 23 | # each be defined on a line of their own. _version.py will just call 24 | # get_keywords(). 25 | git_refnames = " (HEAD -> branch-0.9, tag: v0.9.0a1)" 26 | git_full = "b566ab60ea69e6e165533b68b1966875528afb06" 27 | git_date = "2019-06-25 16:45:30 -0400" 28 | keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} 29 | return keywords 30 | 31 | 32 | class VersioneerConfig: 33 | """Container for Versioneer configuration parameters.""" 34 | 35 | 36 | def get_config(): 37 | """Create, populate and return the VersioneerConfig() object.""" 38 | # these strings are filled in when 'setup.py versioneer' creates 39 | # _version.py 40 | cfg = VersioneerConfig() 41 | cfg.VCS = "git" 42 | cfg.style = "pep440" 43 | cfg.tag_prefix = "" 44 | cfg.parentdir_prefix = "dask_cudf-" 45 | cfg.versionfile_source = "dask_cudf/_version.py" 46 | cfg.verbose = False 47 | return cfg 48 | 49 | 50 | class NotThisMethod(Exception): 51 | """Exception raised if a method is not valid for the current scenario.""" 52 | 53 | 54 | LONG_VERSION_PY = {} 55 | HANDLERS = {} 56 | 57 | 58 | def register_vcs_handler(vcs, method): # decorator 59 | """Decorator to mark a method as the handler for a particular VCS.""" 60 | 61 | def decorate(f): 62 | """Store f in HANDLERS[vcs][method].""" 63 | if vcs not in HANDLERS: 64 | HANDLERS[vcs] = {} 65 | HANDLERS[vcs][method] = f 66 | return f 67 | 68 | return decorate 69 | 70 | 71 | def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): 72 | """Call the given command(s).""" 73 | assert isinstance(commands, list) 74 | p = None 75 | for c in commands: 76 | try: 77 | dispcmd = str([c] + args) 78 | # remember shell=False, so use git.cmd on windows, not just git 79 | p = subprocess.Popen( 80 | [c] + args, 81 | cwd=cwd, 82 | env=env, 83 | stdout=subprocess.PIPE, 84 | stderr=(subprocess.PIPE if hide_stderr else None), 85 | ) 86 | break 87 | except EnvironmentError: 88 | e = sys.exc_info()[1] 89 | if e.errno == errno.ENOENT: 90 | continue 91 | if verbose: 92 | print("unable to run %s" % dispcmd) 93 
| print(e) 94 | return None, None 95 | else: 96 | if verbose: 97 | print("unable to find command, tried %s" % (commands,)) 98 | return None, None 99 | stdout = p.communicate()[0].strip() 100 | if sys.version_info[0] >= 3: 101 | stdout = stdout.decode() 102 | if p.returncode != 0: 103 | if verbose: 104 | print("unable to run %s (error)" % dispcmd) 105 | print("stdout was %s" % stdout) 106 | return None, p.returncode 107 | return stdout, p.returncode 108 | 109 | 110 | def versions_from_parentdir(parentdir_prefix, root, verbose): 111 | """Try to determine the version from the parent directory name. 112 | 113 | Source tarballs conventionally unpack into a directory that includes both 114 | the project name and a version string. We will also support searching up 115 | two directory levels for an appropriately named parent directory 116 | """ 117 | rootdirs = [] 118 | 119 | for i in range(3): 120 | dirname = os.path.basename(root) 121 | if dirname.startswith(parentdir_prefix): 122 | return { 123 | "version": dirname[len(parentdir_prefix) :], 124 | "full-revisionid": None, 125 | "dirty": False, 126 | "error": None, 127 | "date": None, 128 | } 129 | else: 130 | rootdirs.append(root) 131 | root = os.path.dirname(root) # up a level 132 | 133 | if verbose: 134 | print( 135 | "Tried directories %s but none started with prefix %s" 136 | % (str(rootdirs), parentdir_prefix) 137 | ) 138 | raise NotThisMethod("rootdir doesn't start with parentdir_prefix") 139 | 140 | 141 | @register_vcs_handler("git", "get_keywords") 142 | def git_get_keywords(versionfile_abs): 143 | """Extract version information from the given file.""" 144 | # the code embedded in _version.py can just fetch the value of these 145 | # keywords. When used from setup.py, we don't want to import _version.py, 146 | # so we do it with a regexp instead. This function is not used from 147 | # _version.py. 148 | keywords = {} 149 | try: 150 | f = open(versionfile_abs, "r") 151 | for line in f.readlines(): 152 | if line.strip().startswith("git_refnames ="): 153 | mo = re.search(r'=\s*"(.*)"', line) 154 | if mo: 155 | keywords["refnames"] = mo.group(1) 156 | if line.strip().startswith("git_full ="): 157 | mo = re.search(r'=\s*"(.*)"', line) 158 | if mo: 159 | keywords["full"] = mo.group(1) 160 | if line.strip().startswith("git_date ="): 161 | mo = re.search(r'=\s*"(.*)"', line) 162 | if mo: 163 | keywords["date"] = mo.group(1) 164 | f.close() 165 | except EnvironmentError: 166 | pass 167 | return keywords 168 | 169 | 170 | @register_vcs_handler("git", "keywords") 171 | def git_versions_from_keywords(keywords, tag_prefix, verbose): 172 | """Get version information from git keywords.""" 173 | if not keywords: 174 | raise NotThisMethod("no keywords at all, weird") 175 | date = keywords.get("date") 176 | if date is not None: 177 | # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant 178 | # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 179 | # -like" string, which we must then edit to make compliant), because 180 | # it's been around since git-1.5.3, and it's too difficult to 181 | # discover which version we're using, or to work around using an 182 | # older one. 
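# For example, the keyword date "2019-06-25 16:45:30 -0400" above becomes "2019-06-25T16:45:30-0400" after the two replace() calls below.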
183 | date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) 184 | refnames = keywords["refnames"].strip() 185 | if refnames.startswith("$Format"): 186 | if verbose: 187 | print("keywords are unexpanded, not using") 188 | raise NotThisMethod("unexpanded keywords, not a git-archive tarball") 189 | refs = set([r.strip() for r in refnames.strip("()").split(",")]) 190 | # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of 191 | # just "foo-1.0". If we see a "tag: " prefix, prefer those. 192 | TAG = "tag: " 193 | tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) 194 | if not tags: 195 | # Either we're using git < 1.8.3, or there really are no tags. We use 196 | # a heuristic: assume all version tags have a digit. The old git %d 197 | # expansion behaves like git log --decorate=short and strips out the 198 | # refs/heads/ and refs/tags/ prefixes that would let us distinguish 199 | # between branches and tags. By ignoring refnames without digits, we 200 | # filter out many common branch names like "release" and 201 | # "stabilization", as well as "HEAD" and "master". 202 | tags = set([r for r in refs if re.search(r"\d", r)]) 203 | if verbose: 204 | print("discarding '%s', no digits" % ",".join(refs - tags)) 205 | if verbose: 206 | print("likely tags: %s" % ",".join(sorted(tags))) 207 | for ref in sorted(tags): 208 | # sorting will prefer e.g. "2.0" over "2.0rc1" 209 | if ref.startswith(tag_prefix): 210 | r = ref[len(tag_prefix) :] 211 | if verbose: 212 | print("picking %s" % r) 213 | return { 214 | "version": r, 215 | "full-revisionid": keywords["full"].strip(), 216 | "dirty": False, 217 | "error": None, 218 | "date": date, 219 | } 220 | # no suitable tags, so version is "0+unknown", but full hex is still there 221 | if verbose: 222 | print("no suitable tags, using unknown + full revision id") 223 | return { 224 | "version": "0+unknown", 225 | "full-revisionid": keywords["full"].strip(), 226 | "dirty": False, 227 | "error": "no suitable tags", 228 | "date": None, 229 | } 230 | 231 | 232 | @register_vcs_handler("git", "pieces_from_vcs") 233 | def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): 234 | """Get version from 'git describe' in the root of the source tree. 235 | 236 | This only gets called if the git-archive 'subst' keywords were *not* 237 | expanded, and _version.py hasn't already been rewritten with a short 238 | version string, meaning we're inside a checked out source tree. 
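The returned *pieces* mapping carries the fields consumed by the render helpers below: "long", "short", "closest-tag", "distance", "dirty", "date" and "error".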
239 | """ 240 | GITS = ["git"] 241 | if sys.platform == "win32": 242 | GITS = ["git.cmd", "git.exe"] 243 | 244 | out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) 245 | if rc != 0: 246 | if verbose: 247 | print("Directory %s not under git control" % root) 248 | raise NotThisMethod("'git rev-parse --git-dir' returned error") 249 | 250 | # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] 251 | # if there isn't one, this yields HEX[-dirty] (no NUM) 252 | describe_out, rc = run_command( 253 | GITS, 254 | [ 255 | "describe", 256 | "--tags", 257 | "--dirty", 258 | "--always", 259 | "--long", 260 | "--match", 261 | "%s*" % tag_prefix, 262 | ], 263 | cwd=root, 264 | ) 265 | # --long was added in git-1.5.5 266 | if describe_out is None: 267 | raise NotThisMethod("'git describe' failed") 268 | describe_out = describe_out.strip() 269 | full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) 270 | if full_out is None: 271 | raise NotThisMethod("'git rev-parse' failed") 272 | full_out = full_out.strip() 273 | 274 | pieces = {} 275 | pieces["long"] = full_out 276 | pieces["short"] = full_out[:7] # maybe improved later 277 | pieces["error"] = None 278 | 279 | # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] 280 | # TAG might have hyphens. 281 | git_describe = describe_out 282 | 283 | # look for -dirty suffix 284 | dirty = git_describe.endswith("-dirty") 285 | pieces["dirty"] = dirty 286 | if dirty: 287 | git_describe = git_describe[: git_describe.rindex("-dirty")] 288 | 289 | # now we have TAG-NUM-gHEX or HEX 290 | 291 | if "-" in git_describe: 292 | # TAG-NUM-gHEX 293 | mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) 294 | if not mo: 295 | # unparseable. Maybe git-describe is misbehaving? 296 | pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out 297 | return pieces 298 | 299 | # tag 300 | full_tag = mo.group(1) 301 | if not full_tag.startswith(tag_prefix): 302 | if verbose: 303 | fmt = "tag '%s' doesn't start with prefix '%s'" 304 | print(fmt % (full_tag, tag_prefix)) 305 | pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( 306 | full_tag, 307 | tag_prefix, 308 | ) 309 | return pieces 310 | pieces["closest-tag"] = full_tag[len(tag_prefix) :] 311 | 312 | # distance: number of commits since tag 313 | pieces["distance"] = int(mo.group(2)) 314 | 315 | # commit: short hex revision ID 316 | pieces["short"] = mo.group(3) 317 | 318 | else: 319 | # HEX: no tags 320 | pieces["closest-tag"] = None 321 | count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) 322 | pieces["distance"] = int(count_out) # total number of commits 323 | 324 | # commit date: see ISO-8601 comment in git_versions_from_keywords() 325 | date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ 326 | 0 327 | ].strip() 328 | pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) 329 | 330 | return pieces 331 | 332 | 333 | def plus_or_dot(pieces): 334 | """Return a + if we don't already have one, else return a .""" 335 | if "+" in pieces.get("closest-tag", ""): 336 | return "." 337 | return "+" 338 | 339 | 340 | def render_pep440(pieces): 341 | """Build up version string, with post-release "local version identifier". 342 | 343 | Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you 344 | get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty 345 | 346 | Exceptions: 347 | 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] 348 | """ 349 | if pieces["closest-tag"]: 350 | rendered = pieces["closest-tag"] 351 | if pieces["distance"] or pieces["dirty"]: 352 | rendered += plus_or_dot(pieces) 353 | rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) 354 | if pieces["dirty"]: 355 | rendered += ".dirty" 356 | else: 357 | # exception #1 358 | rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) 359 | if pieces["dirty"]: 360 | rendered += ".dirty" 361 | return rendered 362 | 363 | 364 | def render_pep440_pre(pieces): 365 | """TAG[.post.devDISTANCE] -- No -dirty. 366 | 367 | Exceptions: 368 | 1: no tags. 0.post.devDISTANCE 369 | """ 370 | if pieces["closest-tag"]: 371 | rendered = pieces["closest-tag"] 372 | if pieces["distance"]: 373 | rendered += ".post.dev%d" % pieces["distance"] 374 | else: 375 | # exception #1 376 | rendered = "0.post.dev%d" % pieces["distance"] 377 | return rendered 378 | 379 | 380 | def render_pep440_post(pieces): 381 | """TAG[.postDISTANCE[.dev0]+gHEX] . 382 | 383 | The ".dev0" means dirty. Note that .dev0 sorts backwards 384 | (a dirty tree will appear "older" than the corresponding clean one), 385 | but you shouldn't be releasing software with -dirty anyways. 386 | 387 | Exceptions: 388 | 1: no tags. 0.postDISTANCE[.dev0] 389 | """ 390 | if pieces["closest-tag"]: 391 | rendered = pieces["closest-tag"] 392 | if pieces["distance"] or pieces["dirty"]: 393 | rendered += ".post%d" % pieces["distance"] 394 | if pieces["dirty"]: 395 | rendered += ".dev0" 396 | rendered += plus_or_dot(pieces) 397 | rendered += "g%s" % pieces["short"] 398 | else: 399 | # exception #1 400 | rendered = "0.post%d" % pieces["distance"] 401 | if pieces["dirty"]: 402 | rendered += ".dev0" 403 | rendered += "+g%s" % pieces["short"] 404 | return rendered 405 | 406 | 407 | def render_pep440_old(pieces): 408 | """TAG[.postDISTANCE[.dev0]] . 409 | 410 | The ".dev0" means dirty. 411 | 412 | Eexceptions: 413 | 1: no tags. 0.postDISTANCE[.dev0] 414 | """ 415 | if pieces["closest-tag"]: 416 | rendered = pieces["closest-tag"] 417 | if pieces["distance"] or pieces["dirty"]: 418 | rendered += ".post%d" % pieces["distance"] 419 | if pieces["dirty"]: 420 | rendered += ".dev0" 421 | else: 422 | # exception #1 423 | rendered = "0.post%d" % pieces["distance"] 424 | if pieces["dirty"]: 425 | rendered += ".dev0" 426 | return rendered 427 | 428 | 429 | def render_git_describe(pieces): 430 | """TAG[-DISTANCE-gHEX][-dirty]. 431 | 432 | Like 'git describe --tags --dirty --always'. 433 | 434 | Exceptions: 435 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) 436 | """ 437 | if pieces["closest-tag"]: 438 | rendered = pieces["closest-tag"] 439 | if pieces["distance"]: 440 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) 441 | else: 442 | # exception #1 443 | rendered = pieces["short"] 444 | if pieces["dirty"]: 445 | rendered += "-dirty" 446 | return rendered 447 | 448 | 449 | def render_git_describe_long(pieces): 450 | """TAG-DISTANCE-gHEX[-dirty]. 451 | 452 | Like 'git describe --tags --dirty --always -long'. 453 | The distance/hash is unconditional. 454 | 455 | Exceptions: 456 | 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) 457 | """ 458 | if pieces["closest-tag"]: 459 | rendered = pieces["closest-tag"] 460 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) 461 | else: 462 | # exception #1 463 | rendered = pieces["short"] 464 | if pieces["dirty"]: 465 | rendered += "-dirty" 466 | return rendered 467 | 468 | 469 | def render(pieces, style): 470 | """Render the given version pieces into the requested style.""" 471 | if pieces["error"]: 472 | return { 473 | "version": "unknown", 474 | "full-revisionid": pieces.get("long"), 475 | "dirty": None, 476 | "error": pieces["error"], 477 | "date": None, 478 | } 479 | 480 | if not style or style == "default": 481 | style = "pep440" # the default 482 | 483 | if style == "pep440": 484 | rendered = render_pep440(pieces) 485 | elif style == "pep440-pre": 486 | rendered = render_pep440_pre(pieces) 487 | elif style == "pep440-post": 488 | rendered = render_pep440_post(pieces) 489 | elif style == "pep440-old": 490 | rendered = render_pep440_old(pieces) 491 | elif style == "git-describe": 492 | rendered = render_git_describe(pieces) 493 | elif style == "git-describe-long": 494 | rendered = render_git_describe_long(pieces) 495 | else: 496 | raise ValueError("unknown style '%s'" % style) 497 | 498 | return { 499 | "version": rendered, 500 | "full-revisionid": pieces["long"], 501 | "dirty": pieces["dirty"], 502 | "error": None, 503 | "date": pieces.get("date"), 504 | } 505 | 506 | 507 | def get_versions(): 508 | """Get version information or return default if unable to do so.""" 509 | # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have 510 | # __file__, we can work backwards from there to the root. Some 511 | # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which 512 | # case we can only use expanded keywords. 513 | 514 | cfg = get_config() 515 | verbose = cfg.verbose 516 | 517 | try: 518 | return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) 519 | except NotThisMethod: 520 | pass 521 | 522 | try: 523 | root = os.path.realpath(__file__) 524 | # versionfile_source is the relative path from the top of the source 525 | # tree (where the .git directory might live) to this file. Invert 526 | # this to find the root from __file__. 527 | for i in cfg.versionfile_source.split("/"): 528 | root = os.path.dirname(root) 529 | except NameError: 530 | return { 531 | "version": "0+unknown", 532 | "full-revisionid": None, 533 | "dirty": None, 534 | "error": "unable to find root of source tree", 535 | "date": None, 536 | } 537 | 538 | try: 539 | pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) 540 | return render(pieces, cfg.style) 541 | except NotThisMethod: 542 | pass 543 | 544 | try: 545 | if cfg.parentdir_prefix: 546 | return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) 547 | except NotThisMethod: 548 | pass 549 | 550 | return { 551 | "version": "0+unknown", 552 | "full-revisionid": None, 553 | "dirty": None, 554 | "error": "unable to compute version", 555 | "date": None, 556 | } 557 | -------------------------------------------------------------------------------- /dask_cudf/accessor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2008-2012, AQR Capital Management, LLC, Lambda Foundry, Inc. 2 | # and PyData Development Team 3 | # All rights reserved. 4 | 5 | # Copyright (c) 2014-2018, Anaconda, Inc. and contributors 6 | # All rights reserved. 
7 | 8 | """ 9 | 10 | accessor.py contains classes for implementing 11 | accessor properties. 12 | 13 | """ 14 | 15 | from toolz import partial 16 | 17 | import cudf 18 | from cudf.dataframe.categorical import CategoricalAccessor as GdfCategoricalAccessor 19 | from cudf.dataframe.series import DatetimeProperties 20 | 21 | # Adapted from 22 | # https://github.com/dask/dask/blob/master/dask/dataframe/accessor.py 23 | 24 | 25 | class Accessor(object): 26 | """ 27 | Base class for Accessor objects dt, str, cat. 28 | 29 | Notes 30 | ----- 31 | Subclasses should define the following attributes: 32 | * _accessor 33 | * _accessor_name 34 | 35 | Subclasses should also implement the following methods: 36 | * _validate() 37 | 38 | """ 39 | 40 | _not_implemented = frozenset([]) 41 | 42 | def __init__(self, series): 43 | from .core import Series 44 | 45 | if not isinstance(series, Series): 46 | raise ValueError("Accessor cannot be initialized") 47 | self._series = series 48 | self._validate(series) 49 | 50 | def _validate(self, series): 51 | """ Validates the data type of series passed to the 52 | accessor. 53 | """ 54 | raise NotImplementedError("Must implement") 55 | 56 | @staticmethod 57 | def _delegate_property(obj, accessor, attr): 58 | out = getattr(getattr(obj, accessor, obj), attr) 59 | return out 60 | 61 | @staticmethod 62 | def _delegate_method(obj, accessor, attr, args, kwargs): 63 | out = getattr(getattr(obj, accessor, obj), attr)(*args, **kwargs) 64 | return out 65 | 66 | def _property_map(self, attr): 67 | meta = self._delegate_property(self._series._meta, self._accessor_name, attr) 68 | token = "%s-%s" % (self._accessor_name, attr) 69 | return self._series.map_partitions( 70 | self._delegate_property, self._accessor_name, attr, token=token, meta=meta 71 | ) 72 | 73 | def _function_map(self, attr, *args, **kwargs): 74 | meta = self._delegate_method( 75 | self._series._meta_nonempty, self._accessor_name, attr, args, kwargs 76 | ) 77 | token = "%s-%s" % (self._accessor_name, attr) 78 | return self._series.map_partitions( 79 | self._delegate_method, 80 | self._accessor_name, 81 | attr, 82 | args, 83 | kwargs, 84 | meta=meta, 85 | token=token, 86 | ) 87 | 88 | @property 89 | def _delegates(self): 90 | return set(dir(self._accessor)).difference(self._not_implemented) 91 | 92 | def __dir__(self): 93 | o = self._delegates 94 | o.update(self.__dict__) 95 | o.update(dir(type(self))) 96 | return list(o) 97 | 98 | def __getattr__(self, key): 99 | if key in self._delegates: 100 | if isinstance(getattr(self._accessor, key), property): 101 | return self._property_map(key) 102 | else: 103 | return partial(self._function_map, key) 104 | else: 105 | raise AttributeError(key) 106 | 107 | 108 | # Adapted from 109 | # https://github.com/pandas-dev/pandas/blob/master/pandas/core/accessor.py 110 | 111 | 112 | class CachedAccessor(object): 113 | """Custom property-like object (descriptor) for caching accessors. 114 | Parameters 115 | ---------- 116 | name : str 117 | The namespace this will be accessed under, e.g. ``df.timestamp.dt`` 118 | accessor : cls 119 | The class with the extension methods. 
The class' __init__ method 120 | should expect a ``Series`` as the single argument ``data`` 121 | """ 122 | 123 | def __init__(self, name, accessor): 124 | self._name = name 125 | self._accessor = accessor 126 | 127 | def __get__(self, obj, cls): 128 | if obj is None: 129 | # we're accessing the attribute of the class, i.e., Dataset.geo 130 | return self._accessor 131 | accessor_obj = self._accessor(obj) 132 | return accessor_obj 133 | 134 | 135 | class DatetimeAccessor(Accessor): 136 | """ Accessor object for datetimelike properties of the Series values. 137 | """ 138 | 139 | _accessor = DatetimeProperties 140 | _accessor_name = "dt" 141 | 142 | def _validate(self, series): 143 | if not isinstance(series._meta._column, cudf.dataframe.DatetimeColumn): 144 | raise AttributeError( 145 | "Can only use .dt accessor with datetimelike " "values" 146 | ) 147 | 148 | 149 | class CategoricalAccessor(Accessor): 150 | """ Accessor object for categorical properties of the Series values 151 | of Categorical type. 152 | """ 153 | 154 | _accessor = GdfCategoricalAccessor 155 | _accessor_name = "cat" 156 | ordered = True 157 | 158 | def _validate(self, series): 159 | if not isinstance( 160 | series._meta._column, cudf.dataframe.categorical.CategoricalColumn 161 | ): 162 | raise AttributeError("Can only use .cat accessor with categorical values") 163 | -------------------------------------------------------------------------------- /dask_cudf/backends.py: -------------------------------------------------------------------------------- 1 | from dask.dataframe.methods import concat_dispatch 2 | from dask.dataframe.core import get_parallel_type, meta_nonempty, make_meta 3 | import cudf 4 | 5 | from .core import DataFrame, Series, Index 6 | 7 | 8 | get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame) 9 | get_parallel_type.register(cudf.Series, lambda _: Series) 10 | get_parallel_type.register(cudf.Index, lambda _: Index) 11 | 12 | 13 | @meta_nonempty.register((cudf.DataFrame, cudf.Series, cudf.Index)) 14 | def meta_nonempty_cudf(x, index=None): 15 | y = meta_nonempty(x.to_pandas()) # TODO: add iloc[:5] 16 | return cudf.from_pandas(y) 17 | 18 | 19 | @make_meta.register((cudf.Series, cudf.DataFrame)) 20 | def make_meta_cudf(x, index=None): 21 | return x.head(0) 22 | 23 | 24 | @make_meta.register(cudf.Index) 25 | def make_meta_cudf_index(x, index=None): 26 | return x[:0] 27 | 28 | 29 | @concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index)) 30 | def concat_cudf(dfs, axis=0, join="outer", uniform=False, filter_warning=True): 31 | assert axis == 0 32 | assert join == "outer" 33 | return cudf.concat(dfs) 34 | -------------------------------------------------------------------------------- /dask_cudf/batcher_sortnet.py: -------------------------------------------------------------------------------- 1 | """ 2 | Batcher's Odd-even sorting network 3 | Adapted from https://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort 4 | """ 5 | import math 6 | 7 | from dask import compute, delayed 8 | 9 | import cudf as gd 10 | 11 | 12 | def get_oversized(length): 13 | """ 14 | The oddeven network requires a power-of-2 length. 15 | This method computes the next power-of-2 from the *length* if 16 | *length* is not a power-of-2 value. 
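    A small illustrative check (assuming the formula in the body below):

    >>> get_oversized(5)
    8
    >>> get_oversized(8)
    8
    >>> is_power_of_2(8)
    True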
17 | """ 18 | return 2 ** math.ceil(math.log2(length)) 19 | 20 | 21 | def is_power_of_2(length): 22 | return math.log2(length).is_integer() 23 | 24 | 25 | def oddeven_merge(lo, hi, r): 26 | step = r * 2 27 | if step < hi - lo: 28 | for each in oddeven_merge(lo, hi, step): 29 | yield each 30 | for each in oddeven_merge(lo + r, hi, step): 31 | yield each 32 | for i in range(lo + r, hi - r, step): 33 | yield (i, i + r) 34 | else: 35 | yield (lo, lo + r) 36 | 37 | 38 | def oddeven_merge_sort_range(lo, hi): 39 | """ sort the part of x with indices between lo and hi. 40 | 41 | Note: endpoints (lo and hi) are included. 42 | """ 43 | if (hi - lo) >= 1: 44 | # if there is more than one element, split the input 45 | # down the middle and first sort the first and second 46 | # half, followed by merging them. 47 | mid = lo + ((hi - lo) // 2) 48 | for each in oddeven_merge_sort_range(lo, mid): 49 | yield each 50 | for each in oddeven_merge_sort_range(mid + 1, hi): 51 | yield each 52 | for each in oddeven_merge(lo, hi, 1): 53 | yield each 54 | 55 | 56 | def oddeven_merge_sort(length): 57 | """ "length" is the length of the list to be sorted. 58 | Returns a list of pairs of indices starting with 0 """ 59 | assert is_power_of_2(length) 60 | for each in oddeven_merge_sort_range(0, length - 1): 61 | yield each 62 | 63 | 64 | def _pad_data_to_length(parts): 65 | parts = list(parts) 66 | needed = get_oversized(len(parts)) 67 | padn = needed - len(parts) 68 | return parts + [None] * padn, len(parts) 69 | 70 | 71 | def _compare_frame(a, b, max_part_size, by): 72 | if a is not None and b is not None: 73 | joint = gd.concat([a, b]) 74 | sorten = joint.sort_values(by=by) 75 | # Split the sorted frame using the *max_part_size* 76 | lhs, rhs = sorten[:max_part_size], sorten[max_part_size:] 77 | # Replace empty frame with None 78 | return lhs or None, rhs or None 79 | elif a is None and b is None: 80 | return None, None 81 | elif a is None: 82 | return b.sort_values(by=by), None 83 | else: 84 | return a.sort_values(by=by), None 85 | 86 | 87 | def _compare_and_swap_frame(parts, a, b, max_part_size, by): 88 | compared = delayed(_compare_frame)(parts[a], parts[b], max_part_size, by=by) 89 | parts[a] = compared[0] 90 | parts[b] = compared[1] 91 | 92 | 93 | def _cleanup(df): 94 | if "__dask_cudf__valid" in df.columns: 95 | out = df.query("__dask_cudf__valid") 96 | del out["__dask_cudf__valid"] 97 | else: 98 | out = df 99 | return out 100 | 101 | 102 | def sort_delayed_frame(parts, by): 103 | """ 104 | Parameters 105 | ---------- 106 | parts : 107 | Delayed partitions of cudf.DataFrame 108 | by : str 109 | Column name by which to sort 110 | 111 | The sort will also rebalance the partition sizes so that all output 112 | partitions has partition size of atmost `max(original_partition_sizes)`. 113 | Therefore, they may be fewer partitions in the output. 114 | """ 115 | # Empty frame? 116 | if len(parts) == 0: 117 | return parts 118 | # Compute maximum paritition size, which is needed 119 | # for non-uniform partition size 120 | max_part_size = delayed(max)(*map(delayed(len), parts)) 121 | # Add empty partitions to match power-of-2 requirement. 122 | parts, valid = _pad_data_to_length(parts) 123 | # More than 1 input? 124 | if len(parts) > 1: 125 | # Build batcher's odd-even sorting network 126 | for a, b in oddeven_merge_sort(len(parts)): 127 | _compare_and_swap_frame(parts, a, b, max_part_size, by=by) 128 | # Single input? 
129 | else: 130 | parts = [delayed(lambda x: x.sort_values(by=by))(parts[0])] 131 | # Count number of non-empty partitions 132 | valid_ct = delayed(sum)( 133 | list(map(delayed(lambda x: int(x is not None)), parts[:valid])) 134 | ) 135 | valid = compute(valid_ct)[0] 136 | validparts = parts[:valid] 137 | return validparts 138 | -------------------------------------------------------------------------------- /dask_cudf/core.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. 2 | import warnings 3 | from collections import OrderedDict 4 | 5 | import pandas as pd 6 | 7 | import dask 8 | import dask.dataframe as dd 9 | import numpy as np 10 | from dask import compute 11 | from dask.base import normalize_token, tokenize 12 | from dask.compatibility import apply 13 | from dask.context import _globals 14 | from dask.core import flatten 15 | from dask.dataframe import from_delayed 16 | from dask.dataframe.core import Scalar, handle_out, map_partitions 17 | from dask.dataframe.utils import raise_on_meta_error 18 | from dask.delayed import delayed 19 | from dask.optimization import cull, fuse 20 | from dask.utils import M, OperatorMethodMixin, funcname, derived_from 21 | from toolz import partition_all 22 | 23 | import cudf 24 | import cudf.bindings.reduce as cpp_reduce 25 | from dask_cudf import batcher_sortnet, join_impl 26 | from dask_cudf.accessor import CachedAccessor, CategoricalAccessor, DatetimeAccessor 27 | 28 | 29 | def optimize(dsk, keys, **kwargs): 30 | flatkeys = list(flatten(keys)) if isinstance(keys, list) else [keys] 31 | dsk, dependencies = cull(dsk, flatkeys) 32 | dsk, dependencies = fuse( 33 | dsk, 34 | keys, 35 | dependencies=dependencies, 36 | ave_width=_globals.get("fuse_ave_width", 1), 37 | ) 38 | dsk, _ = cull(dsk, keys) 39 | return dsk 40 | 41 | 42 | def finalize(results): 43 | return cudf.concat(results) 44 | 45 | 46 | class _Frame(dd.core._Frame, OperatorMethodMixin): 47 | """ Superclass for DataFrame and Series 48 | 49 | Parameters 50 | ---------- 51 | dsk : dict 52 | The dask graph to compute this DataFrame 53 | name : str 54 | The key prefix that specifies which keys in the dask comprise this 55 | particular DataFrame / Series 56 | meta : cudf.DataFrame, cudf.Series, or cudf.Index 57 | An empty cudf object with names, dtypes, and indices matching the 58 | expected output. 
59 | divisions : tuple of index values 60 | Values along which we partition our blocks on the index 61 | """ 62 | 63 | __dask_scheduler__ = staticmethod(dask.get) 64 | __dask_optimize__ = staticmethod(optimize) 65 | 66 | def __dask_postcompute__(self): 67 | return finalize, () 68 | 69 | def __dask_postpersist__(self): 70 | return type(self), (self._name, self._meta, self.divisions) 71 | 72 | def __init__(self, dsk, name, meta, divisions): 73 | self.dask = dsk 74 | self._name = name 75 | meta = dd.core.make_meta(meta) 76 | if not isinstance(meta, self._partition_type): 77 | raise TypeError( 78 | "Expected meta to specify type {0}, got type " 79 | "{1}".format(self._partition_type.__name__, type(meta).__name__) 80 | ) 81 | self._meta = dd.core.make_meta(meta) 82 | self.divisions = tuple(divisions) 83 | 84 | def __getstate__(self): 85 | return (self.dask, self._name, self._meta, self.divisions) 86 | 87 | def __setstate__(self, state): 88 | self.dask, self._name, self._meta, self.divisions = state 89 | 90 | def __repr__(self): 91 | s = "" 92 | return s % (type(self).__name__, len(self.dask), self.npartitions) 93 | 94 | def to_dask_dataframe(self): 95 | """Create a dask.dataframe object from a dask_cudf object""" 96 | return self.map_partitions(M.to_pandas) 97 | 98 | 99 | concat = dd.concat 100 | 101 | 102 | normalize_token.register(_Frame, lambda a: a._name) 103 | 104 | 105 | class DataFrame(_Frame, dd.core.DataFrame): 106 | _partition_type = cudf.DataFrame 107 | 108 | def _assign_column(self, k, v): 109 | def assigner(df, k, v): 110 | out = df.copy() 111 | out[k] = v 112 | return out 113 | 114 | meta = assigner(self._meta, k, dd.core.make_meta(v)) 115 | return self.map_partitions(assigner, k, v, meta=meta) 116 | 117 | def apply_rows(self, func, incols, outcols, kwargs={}, cache_key=None): 118 | import uuid 119 | 120 | if cache_key is None: 121 | cache_key = uuid.uuid4() 122 | 123 | def do_apply_rows(df, func, incols, outcols, kwargs): 124 | return df.apply_rows(func, incols, outcols, kwargs, cache_key=cache_key) 125 | 126 | meta = do_apply_rows(self._meta, func, incols, outcols, kwargs) 127 | return self.map_partitions( 128 | do_apply_rows, func, incols, outcols, kwargs, meta=meta 129 | ) 130 | 131 | def merge( 132 | self, 133 | other, 134 | on=None, 135 | how="left", 136 | left_index=False, 137 | right_index=False, 138 | suffixes=("_x", "_y"), 139 | ): 140 | """Merging two dataframes on the column(s) indicated in *on*. 141 | """ 142 | if ( 143 | left_index 144 | or right_index 145 | or not dask.is_dask_collection(other) 146 | or self.npartitions == 1 147 | and how in ("inner", "right") 148 | or other.npartitions == 1 149 | and how in ("inner", "left") 150 | ): 151 | return dd.merge( 152 | self, 153 | other, 154 | how=how, 155 | suffixes=suffixes, 156 | left_index=left_index, 157 | right_index=right_index, 158 | ) 159 | 160 | if not on and not left_index and not right_index: 161 | on = [c for c in self.columns if c in other.columns] 162 | if not on: 163 | left_index = right_index = True 164 | 165 | return join_impl.join_frames( 166 | left=self, 167 | right=other, 168 | on=on, 169 | how=how, 170 | lsuffix=suffixes[0], 171 | rsuffix=suffixes[1], 172 | ) 173 | 174 | def join(self, other, how="left", lsuffix="", rsuffix=""): 175 | """Join two datatframes 176 | 177 | *on* is not supported. 
178 | """ 179 | if how == "right": 180 | return other.join(other=self, how="left", lsuffix=rsuffix, rsuffix=lsuffix) 181 | 182 | same_names = set(self.columns) & set(other.columns) 183 | if same_names and not (lsuffix or rsuffix): 184 | raise ValueError( 185 | "there are overlapping columns but " 186 | "lsuffix and rsuffix are not defined" 187 | ) 188 | 189 | left, leftuniques = self._align_divisions() 190 | right, rightuniques = other._align_to_indices(leftuniques) 191 | 192 | leftparts = left.to_delayed() 193 | rightparts = right.to_delayed() 194 | 195 | @delayed 196 | def part_join(left, right, how): 197 | return left.join( 198 | right, how=how, sort=True, lsuffix=lsuffix, rsuffix=rsuffix 199 | ) 200 | 201 | def inner_selector(): 202 | pivot = 0 203 | for i in range(len(leftparts)): 204 | for j in range(pivot, len(rightparts)): 205 | if leftuniques[i] & rightuniques[j]: 206 | yield leftparts[i], rightparts[j] 207 | pivot = j + 1 208 | break 209 | 210 | def left_selector(): 211 | pivot = 0 212 | for i in range(len(leftparts)): 213 | for j in range(pivot, len(rightparts)): 214 | if leftuniques[i] & rightuniques[j]: 215 | yield leftparts[i], rightparts[j] 216 | pivot = j + 1 217 | break 218 | else: 219 | yield leftparts[i], None 220 | 221 | selector = {"left": left_selector, "inner": inner_selector}[how] 222 | 223 | rhs_dtypes = [(k, other._meta.dtypes[k]) for k in other._meta.columns] 224 | 225 | @delayed 226 | def fix_column(lhs): 227 | df = cudf.DataFrame() 228 | for k in lhs.columns: 229 | df[k + lsuffix] = lhs[k] 230 | 231 | for k, dtype in rhs_dtypes: 232 | data = np.zeros(len(lhs), dtype=dtype) 233 | mask_size = cudf.utils.utils.calc_chunk_size( 234 | data.size, cudf.utils.utils.mask_bitsize 235 | ) 236 | mask = np.zeros(mask_size, dtype=cudf.utils.utils.mask_dtype) 237 | sr = cudf.Series.from_masked_array( 238 | data=data, mask=mask, null_count=data.size 239 | ) 240 | 241 | df[k + rsuffix] = sr.set_index(df.index) 242 | 243 | return df 244 | 245 | joinedparts = [ 246 | (part_join(lhs, rhs, how=how) if rhs is not None else fix_column(lhs)) 247 | for lhs, rhs in selector() 248 | ] 249 | 250 | meta = self._meta.join(other._meta, how=how, lsuffix=lsuffix, rsuffix=rsuffix) 251 | return from_delayed(joinedparts, meta=meta) 252 | 253 | def _align_divisions(self): 254 | """Align so that the values do not split across partitions 255 | """ 256 | parts = self.to_delayed() 257 | uniques = self._get_unique_indices(parts=parts) 258 | originals = list(map(frozenset, uniques)) 259 | 260 | changed = True 261 | while changed: 262 | changed = False 263 | for i in range(len(uniques))[:-1]: 264 | intersect = uniques[i] & uniques[i + 1] 265 | if intersect: 266 | smaller = min(uniques[i], uniques[i + 1], key=len) 267 | bigger = max(uniques[i], uniques[i + 1], key=len) 268 | smaller |= intersect 269 | bigger -= intersect 270 | changed = True 271 | 272 | # Fix empty partitions 273 | uniques = list(filter(bool, uniques)) 274 | 275 | return self._align_to_indices(uniques, originals=originals, parts=parts) 276 | 277 | def _get_unique_indices(self, parts=None): 278 | if parts is None: 279 | parts = self.to_delayed() 280 | 281 | @delayed 282 | def unique(x): 283 | return set(x.index.as_column().unique().to_array()) 284 | 285 | parts = self.to_delayed() 286 | return compute(*map(unique, parts)) 287 | 288 | def _align_to_indices(self, uniques, originals=None, parts=None): 289 | uniques = list(map(set, uniques)) 290 | 291 | if parts is None: 292 | parts = self.to_delayed() 293 | 294 | if originals is None: 295 | 
originals = self._get_unique_indices(parts=parts) 296 | allindices = set() 297 | for x in originals: 298 | allindices |= x 299 | for us in uniques: 300 | us &= allindices 301 | uniques = list(filter(bool, uniques)) 302 | 303 | extras = originals[-1] - uniques[-1] 304 | extras = {x for x in extras if x > max(uniques[-1])} 305 | 306 | if extras: 307 | uniques.append(extras) 308 | 309 | remap = OrderedDict() 310 | for idxset in uniques: 311 | remap[tuple(sorted(idxset))] = bins = [] 312 | for i, orig in enumerate(originals): 313 | if idxset & orig: 314 | bins.append(parts[i]) 315 | 316 | @delayed 317 | def take(indices, depends): 318 | first = min(indices) 319 | last = max(indices) 320 | others = [] 321 | for d in depends: 322 | # TODO: this can be replaced with searchsorted 323 | # Normalize to index data in range before selection. 324 | firstindex = d.index[0] 325 | lastindex = d.index[-1] 326 | s = max(first, firstindex) 327 | e = min(last, lastindex) 328 | others.append(d.loc[s:e]) 329 | return cudf.concat(others) 330 | 331 | newparts = [] 332 | for idx, depends in remap.items(): 333 | newparts.append(take(idx, depends)) 334 | 335 | divisions = list(map(min, uniques)) 336 | divisions.append(max(uniques[-1])) 337 | 338 | newdd = from_delayed(newparts, meta=self._meta) 339 | return newdd, uniques 340 | 341 | def _compute_divisions(self): 342 | if self.known_divisions: 343 | return self 344 | 345 | @delayed 346 | def first_index(df): 347 | return df.index[0] 348 | 349 | @delayed 350 | def last_index(df): 351 | return df.index[-1] 352 | 353 | parts = self.to_delayed() 354 | divs = [first_index(p) for p in parts] + [last_index(parts[-1])] 355 | divisions = compute(*divs) 356 | return type(self)(self.dask, self._name, self._meta, divisions) 357 | 358 | def set_index(self, index, drop=True, sorted=False): 359 | """Set new index. 360 | 361 | Parameters 362 | ---------- 363 | index : str or Series 364 | If a ``str`` is provided, it is used as the name of the 365 | column to be made into the index. 366 | If a ``Series`` is provided, it is used as the new index 367 | drop : bool 368 | Whether the first original index column is dropped. 369 | sorted : bool 370 | Whether the new index column is already sorted. 371 | """ 372 | if not drop: 373 | raise NotImplementedError("drop=False not supported yet") 374 | 375 | if isinstance(index, str): 376 | tmpdf = self.sort_values(index) 377 | return tmpdf._set_column_as_sorted_index(index, drop=drop) 378 | elif isinstance(index, Series): 379 | indexname = "__dask_cudf.index" 380 | df = self.assign(**{indexname: index}) 381 | return df.set_index(indexname, drop=drop, sorted=sorted) 382 | else: 383 | raise TypeError("cannot set_index from {}".format(type(index))) 384 | 385 | def _set_column_as_sorted_index(self, colname, drop): 386 | def select_index(df, col): 387 | return df.set_index(col) 388 | 389 | return self.map_partitions( 390 | select_index, col=colname, meta=self._meta.set_index(colname) 391 | ) 392 | 393 | def _argsort(self, col, sorted=False): 394 | """ 395 | Returns 396 | ------- 397 | shufidx : Series 398 | Positional indices to be used with .take() to 399 | put the dataframe in order w.r.t ``col``. 
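        A rough sketch of the intended use (``df`` is hypothetical):

        >>> shufidx = df._argsort("x")    # doctest: +SKIP
        >>> reordered = df.take(shufidx)  # doctest: +SKIP

        which is the pattern used by ``_set_index_raw`` and
        ``_shuffle_sort_values`` below.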
400 | """ 401 | # Get subset with just the index and positional value 402 | subset = self[col].to_dask_dataframe() 403 | subset = subset.reset_index(drop=False) 404 | ordered = subset.set_index(0, sorted=sorted) 405 | shufidx = from_dask_dataframe(ordered)["index"] 406 | return shufidx 407 | 408 | def _set_index_raw(self, indexname, drop, sorted): 409 | shufidx = self._argsort(indexname, sorted=sorted) 410 | # Shuffle the GPU data 411 | shuffled = self.take(shufidx, npartitions=self.npartitions) 412 | out = shuffled.map_partitions(lambda df: df.set_index(indexname)) 413 | return out 414 | 415 | def reset_index(self, force=False, drop=False): 416 | """Reset index to range based 417 | """ 418 | if force: 419 | dfs = self.to_delayed() 420 | sizes = np.asarray(compute(*map(delayed(len), dfs))) 421 | prefixes = np.zeros_like(sizes) 422 | prefixes[1:] = np.cumsum(sizes[:-1]) 423 | 424 | @delayed 425 | def fix_index(df, startpos): 426 | stoppos = startpos + len(df) 427 | return df.set_index( 428 | cudf.dataframe.RangeIndex(start=startpos, stop=stoppos) 429 | ) 430 | 431 | outdfs = [fix_index(df, startpos) for df, startpos in zip(dfs, prefixes)] 432 | return from_delayed(outdfs, meta=self._meta.reset_index(drop=True)) 433 | else: 434 | return self.map_partitions(M.reset_index, drop=drop) 435 | 436 | def sort_values(self, by, ignore_index=False): 437 | """Sort by the given column 438 | 439 | Parameter 440 | --------- 441 | by : str 442 | """ 443 | parts = self.to_delayed() 444 | sorted_parts = batcher_sortnet.sort_delayed_frame(parts, by) 445 | return from_delayed(sorted_parts, meta=self._meta).reset_index( 446 | force=not ignore_index 447 | ) 448 | 449 | def sort_values_binned(self, by): 450 | """Sorty by the given column and ensure that the same key 451 | doesn't spread across multiple partitions. 
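        Illustrative example (``ddf`` and the keys below are hypothetical):
        if the sorted partitions hold keys ``{1, 2}`` and ``{2, 3}``, the rows
        with key ``2`` from the second partition are joined into the first,
        so every key ends up in exactly one output partition.

        >>> binned = ddf.sort_values_binned("key")  # doctest: +SKIP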
452 | """ 453 | # Get sorted partitions 454 | parts = self.sort_values(by=by).to_delayed() 455 | 456 | # Get unique keys in each partition 457 | @delayed 458 | def get_unique(p): 459 | return set(p[by].unique()) 460 | 461 | uniques = list(compute(*map(get_unique, parts))) 462 | 463 | joiner = {} 464 | for i in range(len(uniques)): 465 | joiner[i] = to_join = {} 466 | for j in range(i + 1, len(uniques)): 467 | intersect = uniques[i] & uniques[j] 468 | # If the keys intersect 469 | if intersect: 470 | # Remove keys 471 | uniques[j] -= intersect 472 | to_join[j] = frozenset(intersect) 473 | else: 474 | break 475 | 476 | @delayed 477 | def join(df, other, keys): 478 | others = [other.query("{by}==@k".format(by=by)) for k in sorted(keys)] 479 | return cudf.concat([df] + others) 480 | 481 | @delayed 482 | def drop(df, keep_keys): 483 | locvars = locals() 484 | for i, k in enumerate(keep_keys): 485 | locvars["k{}".format(i)] = k 486 | 487 | conds = ["{by}==@k{i}".format(by=by, i=i) for i in range(len(keep_keys))] 488 | expr = " or ".join(conds) 489 | return df.query(expr) 490 | 491 | for i in range(len(parts)): 492 | if uniques[i]: 493 | parts[i] = drop(parts[i], uniques[i]) 494 | for joinee, intersect in joiner[i].items(): 495 | parts[i] = join(parts[i], parts[joinee], intersect) 496 | 497 | results = [p for i, p in enumerate(parts) if uniques[i]] 498 | return from_delayed(results, meta=self._meta).reset_index() 499 | 500 | def _shuffle_sort_values(self, by): 501 | """Slow shuffle based sort by the given column 502 | 503 | Parameter 504 | --------- 505 | by : str 506 | """ 507 | shufidx = self._argsort(by) 508 | return self.take(shufidx) 509 | 510 | @derived_from(pd.DataFrame) 511 | def var(self, axis=None, skipna=True, ddof=1, split_every=False, 512 | dtype=None, out=None): 513 | axis = self._validate_axis(axis) 514 | meta = self._meta_nonempty.var(axis=axis, skipna=skipna) 515 | if axis == 1: 516 | result = map_partitions(M.var, self, meta=meta, 517 | token=self._token_prefix + 'var', 518 | axis=axis, skipna=skipna, ddof=ddof) 519 | return handle_out(out, result) 520 | 521 | else: 522 | num = self._get_numeric_data() 523 | x = 1.0 * num.sum(skipna=skipna, split_every=split_every) 524 | x2 = 1.0 * (num ** 2).sum(skipna=skipna, split_every=split_every) 525 | n = num.count(split_every=split_every) 526 | name = self._token_prefix + 'var' 527 | result = map_partitions(var_aggregate, x2, x, n, 528 | token=name, meta=meta, ddof=ddof) 529 | if isinstance(self, DataFrame): 530 | result.divisions = (min(self.columns), max(self.columns)) 531 | return handle_out(out, result) 532 | 533 | 534 | def sum_of_squares(x): 535 | x = x.astype("f8")._column 536 | outcol = cpp_reduce.apply_reduce("sum_of_squares", x) 537 | return cudf.Series(outcol) 538 | 539 | 540 | def var_aggregate(x2, x, n, ddof): 541 | try: 542 | with warnings.catch_warnings(record=True): 543 | warnings.simplefilter('always') 544 | result = (x2 / n) - (x / n)**2 545 | if ddof != 0: 546 | result = result * n / (n - ddof) 547 | return result 548 | except ZeroDivisionError: 549 | return np.float64(np.nan) 550 | 551 | 552 | def nlargest_agg(x, **kwargs): 553 | return cudf.concat(x).nlargest(**kwargs) 554 | 555 | 556 | def nsmallest_agg(x, **kwargs): 557 | return cudf.concat(x).nsmallest(**kwargs) 558 | 559 | 560 | def unique_k_agg(x, **kwargs): 561 | return cudf.concat(x).unique_k(**kwargs) 562 | 563 | 564 | class Series(_Frame, dd.core.Series): 565 | _partition_type = cudf.Series 566 | 567 | def count(self, split_every=False): 568 | return 
reduction( 569 | self, chunk=M.count, aggregate=np.sum, split_every=split_every, meta="i8" 570 | ) 571 | 572 | def mean(self, split_every=False): 573 | sum = self.sum(split_every=split_every) 574 | n = self.count(split_every=split_every) 575 | return sum / n 576 | 577 | def unique_k(self, k, split_every=None): 578 | return reduction( 579 | self, 580 | chunk=M.unique_k, 581 | aggregate=unique_k_agg, 582 | meta=self._meta, 583 | token="unique-k", 584 | split_every=split_every, 585 | k=k, 586 | ) 587 | 588 | @derived_from(pd.DataFrame) 589 | def var(self, axis=None, skipna=True, ddof=1, split_every=False, dtype=None, out=None): 590 | axis = self._validate_axis(axis) 591 | meta = self._meta_nonempty.var(axis=axis, skipna=skipna) 592 | if axis == 1: 593 | result = map_partitions(M.var, self, meta=meta, 594 | token=self._token_prefix + 'var', 595 | axis=axis, skipna=skipna, ddof=ddof) 596 | return handle_out(out, result) 597 | 598 | else: 599 | num = self._get_numeric_data() 600 | x = 1.0 * num.sum(skipna=skipna, split_every=split_every) 601 | x2 = 1.0 * (num ** 2).sum(skipna=skipna, split_every=split_every) 602 | n = num.count(split_every=split_every) 603 | name = self._token_prefix + 'var' 604 | result = map_partitions(var_aggregate, x2, x, n, 605 | token=name, meta=meta, ddof=ddof) 606 | if isinstance(self, DataFrame): 607 | result.divisions = (min(self.columns), max(self.columns)) 608 | return handle_out(out, result) 609 | 610 | 611 | # ---------------------------------------------------------------------- 612 | # Accessor Methods 613 | # ---------------------------------------------------------------------- 614 | dt = CachedAccessor("dt", DatetimeAccessor) 615 | cat = CachedAccessor("cat", CategoricalAccessor) 616 | 617 | 618 | class Index(Series, dd.core.Index): 619 | _partition_type = cudf.dataframe.index.Index 620 | 621 | 622 | def splits_divisions_sorted_cudf(df, chunksize): 623 | segments = list(df.index.find_segments().to_array()) 624 | segments.append(len(df) - 1) 625 | 626 | splits = [0] 627 | last = current_size = 0 628 | for s in segments: 629 | size = s - last 630 | last = s 631 | current_size += size 632 | if current_size >= chunksize: 633 | splits.append(s) 634 | current_size = 0 635 | # Ensure end is included 636 | if splits[-1] != segments[-1]: 637 | splits.append(segments[-1]) 638 | divisions = tuple(df.index.take(np.array(splits)).values) 639 | splits[-1] += 1 # Offset to extract to end 640 | 641 | return splits, divisions 642 | 643 | 644 | def _extract_meta(x): 645 | """ 646 | Extract internal cache data (``_meta``) from dask_cudf objects 647 | """ 648 | if isinstance(x, (Scalar, _Frame)): 649 | return x._meta 650 | elif isinstance(x, list): 651 | return [_extract_meta(_x) for _x in x] 652 | elif isinstance(x, tuple): 653 | return tuple([_extract_meta(_x) for _x in x]) 654 | elif isinstance(x, dict): 655 | return {k: _extract_meta(v) for k, v in x.items()} 656 | return x 657 | 658 | 659 | def _emulate(func, *args, **kwargs): 660 | """ 661 | Apply a function using args / kwargs. If arguments contain dd.DataFrame / 662 | dd.Series, using internal cache (``_meta``) for calculation 663 | """ 664 | with raise_on_meta_error(funcname(func)): 665 | return func(*_extract_meta(args), **_extract_meta(kwargs)) 666 | 667 | 668 | def align_partitions(args): 669 | """Align partitions between dask_cudf objects. 
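    Frames whose divisions already match are returned unchanged; aligning
    frames with mismatched divisions is not implemented and raises
    ``NotImplementedError``.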
670 | 671 | Note that if all divisions are unknown, but have equal npartitions, then 672 | they will be passed through unchanged.""" 673 | dfs = [df for df in args if isinstance(df, _Frame)] 674 | if not dfs: 675 | return args 676 | 677 | divisions = dfs[0].divisions 678 | if not all(df.divisions == divisions for df in dfs): 679 | raise NotImplementedError("Aligning mismatched partitions") 680 | return args 681 | 682 | 683 | def reduction( 684 | args, 685 | chunk=None, 686 | aggregate=None, 687 | combine=None, 688 | meta=None, 689 | token=None, 690 | chunk_kwargs=None, 691 | aggregate_kwargs=None, 692 | combine_kwargs=None, 693 | split_every=None, 694 | **kwargs 695 | ): 696 | """Generic tree reduction operation. 697 | 698 | Parameters 699 | ---------- 700 | args : 701 | Positional arguments for the `chunk` function. All `dask.dataframe` 702 | objects should be partitioned and indexed equivalently. 703 | chunk : function [block-per-arg] -> block 704 | Function to operate on each block of data 705 | aggregate : function list-of-blocks -> block 706 | Function to operate on the list of results of chunk 707 | combine : function list-of-blocks -> block, optional 708 | Function to operate on intermediate lists of results of chunk 709 | in a tree-reduction. If not provided, defaults to aggregate. 710 | $META 711 | token : str, optional 712 | The name to use for the output keys. 713 | chunk_kwargs : dict, optional 714 | Keywords for the chunk function only. 715 | aggregate_kwargs : dict, optional 716 | Keywords for the aggregate function only. 717 | combine_kwargs : dict, optional 718 | Keywords for the combine function only. 719 | split_every : int, optional 720 | Group partitions into groups of this size while performing a 721 | tree-reduction. If set to False, no tree-reduction will be used, 722 | and all intermediates will be concatenated and passed to ``aggregate``. 723 | Default is 8. 724 | kwargs : 725 | All remaining keywords will be passed to ``chunk``, ``aggregate``, and 726 | ``combine``. 
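    A minimal usage sketch, mirroring how ``Series.count`` above builds on
    this helper (``s`` is a hypothetical dask_cudf Series):

    >>> from dask.utils import M  # doctest: +SKIP
    >>> import numpy as np        # doctest: +SKIP
    >>> n = reduction(s, chunk=M.count, aggregate=np.sum, meta="i8")  # doctest: +SKIP
    >>> n.compute()               # doctest: +SKIP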
727 | """ 728 | if chunk_kwargs is None: 729 | chunk_kwargs = dict() 730 | if aggregate_kwargs is None: 731 | aggregate_kwargs = dict() 732 | chunk_kwargs.update(kwargs) 733 | aggregate_kwargs.update(kwargs) 734 | 735 | if combine is None: 736 | if combine_kwargs: 737 | raise ValueError("`combine_kwargs` provided with no `combine`") 738 | combine = aggregate 739 | combine_kwargs = aggregate_kwargs 740 | else: 741 | if combine_kwargs is None: 742 | combine_kwargs = dict() 743 | combine_kwargs.update(kwargs) 744 | 745 | if not isinstance(args, (tuple, list)): 746 | args = [args] 747 | 748 | npartitions = set(arg.npartitions for arg in args if isinstance(arg, _Frame)) 749 | if len(npartitions) > 1: 750 | raise ValueError("All arguments must have same number of partitions") 751 | npartitions = npartitions.pop() 752 | 753 | if split_every is None: 754 | split_every = 8 755 | elif split_every is False: 756 | split_every = npartitions 757 | elif split_every < 2 or not isinstance(split_every, int): 758 | raise ValueError("split_every must be an integer >= 2") 759 | 760 | token_key = tokenize( 761 | token or (chunk, aggregate), 762 | meta, 763 | args, 764 | chunk_kwargs, 765 | aggregate_kwargs, 766 | combine_kwargs, 767 | split_every, 768 | ) 769 | 770 | # Chunk 771 | a = "{0}-chunk-{1}".format(token or funcname(chunk), token_key) 772 | if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs: 773 | dsk = {(a, 0, i): (chunk, key) for i, key in enumerate(args[0].__dask_keys__())} 774 | else: 775 | dsk = { 776 | (a, 0, i): ( 777 | apply, 778 | chunk, 779 | [(x._name, i) if isinstance(x, _Frame) else x for x in args], 780 | chunk_kwargs, 781 | ) 782 | for i in range(args[0].npartitions) 783 | } 784 | 785 | # Combine 786 | b = "{0}-combine-{1}".format(token or funcname(combine), token_key) 787 | k = npartitions 788 | depth = 0 789 | while k > split_every: 790 | for part_i, inds in enumerate(partition_all(split_every, range(k))): 791 | conc = (list, [(a, depth, i) for i in inds]) 792 | dsk[(b, depth + 1, part_i)] = ( 793 | (apply, combine, [conc], combine_kwargs) 794 | if combine_kwargs 795 | else (combine, conc) 796 | ) 797 | k = part_i + 1 798 | a = b 799 | depth += 1 800 | 801 | # Aggregate 802 | b = "{0}-agg-{1}".format(token or funcname(aggregate), token_key) 803 | conc = (list, [(a, depth, i) for i in range(k)]) 804 | if aggregate_kwargs: 805 | dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs) 806 | else: 807 | dsk[(b, 0)] = (aggregate, conc) 808 | 809 | if meta is None: 810 | meta_chunk = _emulate(apply, chunk, args, chunk_kwargs) 811 | meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs) 812 | meta = dd.core.make_meta(meta) 813 | 814 | for arg in args: 815 | if isinstance(arg, _Frame): 816 | dsk.update(arg.dask) 817 | 818 | return dd.core.new_dd_object(dsk, b, meta, (None, None)) 819 | 820 | 821 | from_cudf = dd.from_pandas 822 | 823 | 824 | def from_dask_dataframe(df): 825 | return df.map_partitions(cudf.from_pandas) 826 | -------------------------------------------------------------------------------- /dask_cudf/io/__init__.py: -------------------------------------------------------------------------------- 1 | from .csv import read_csv 2 | from .orc import read_orc 3 | from .json import read_json 4 | from .parquet import read_parquet 5 | -------------------------------------------------------------------------------- /dask_cudf/io/csv.py: -------------------------------------------------------------------------------- 1 | import os 2 | from glob import 
glob 3 | from warnings import warn 4 | 5 | from dask.base import tokenize 6 | from dask.compatibility import apply 7 | import dask.dataframe as dd 8 | from dask.utils import parse_bytes 9 | from dask.dataframe.io.csv import make_reader 10 | 11 | import cudf 12 | from cudf.bindings.GDFError import GDFError 13 | 14 | 15 | def read_csv(path, chunksize="256 MiB", **kwargs): 16 | if "://" in str(path): 17 | func = make_reader(cudf.read_csv, "read_csv", "CSV") 18 | return func(path, blocksize=chunksize, **kwargs) 19 | else: 20 | return _internal_read_csv(path=path, chunksize=chunksize, **kwargs) 21 | 22 | 23 | def _internal_read_csv(path, chunksize="256 MiB", **kwargs): 24 | if isinstance(chunksize, str): 25 | chunksize = parse_bytes(chunksize) 26 | 27 | filenames = sorted(glob(str(path))) 28 | if not filenames: 29 | msg = f"A file in: {filenames} does not exist." 30 | raise FileNotFoundError(msg) 31 | 32 | name = "read-csv-" + tokenize( 33 | path, tokenize, **kwargs 34 | ) # TODO: get last modified time 35 | 36 | compression = kwargs.get("compression", False) 37 | if compression and chunksize: 38 | # compressed CSVs reading must read the entire file 39 | kwargs.pop("byte_range", None) 40 | warn( 41 | "Warning %s compression does not support breaking apart files\n" 42 | "Please ensure that each individual file can fit in memory and\n" 43 | "use the keyword ``chunksize=None to remove this message``\n" 44 | "Setting ``chunksize=(size of file)``" % compression 45 | ) 46 | chunksize = None 47 | 48 | if chunksize is None: 49 | return read_csv_without_chunksize(path, **kwargs) 50 | 51 | dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV") 52 | meta = dask_reader(filenames[0], **kwargs)._meta 53 | 54 | dsk = {} 55 | i = 0 56 | dtypes = meta.dtypes.values 57 | 58 | for fn in filenames: 59 | size = os.path.getsize(fn) 60 | for start in range(0, size, chunksize): 61 | kwargs2 = kwargs.copy() 62 | kwargs2["byte_range"] = ( 63 | start, 64 | chunksize, 65 | ) # specify which chunk of the file we care about 66 | if start != 0: 67 | kwargs2["names"] = meta.columns # no header in the middle of the file 68 | kwargs2["header"] = None 69 | dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2) 70 | 71 | i += 1 72 | 73 | divisions = [None] * (len(dsk) + 1) 74 | return dd.core.new_dd_object(dsk, name, meta, divisions) 75 | 76 | 77 | def _read_csv(fn, dtypes=None, **kwargs): 78 | try: 79 | cdf = cudf.read_csv(fn, **kwargs) 80 | except GDFError: 81 | # end of file check https://github.com/rapidsai/dask-cudf/issues/103 82 | # this should be removed when CUDF has better dtype/parse_date support 83 | dtypes = dict(zip(kwargs["names"], dtypes)) 84 | df = dd.core.make_meta(dtypes) 85 | cdf = cudf.from_pandas(df) 86 | return cdf 87 | 88 | 89 | def read_csv_without_chunksize(path, **kwargs): 90 | """Read entire CSV with optional compression (gzip/zip) 91 | 92 | Parameters 93 | ---------- 94 | path : str 95 | path to files (support for glob) 96 | """ 97 | filenames = sorted(glob(str(path))) 98 | name = "read-csv-" + tokenize(path, **kwargs) 99 | 100 | meta = cudf.read_csv(filenames[0], **kwargs) 101 | 102 | graph = { 103 | (name, i): (apply, cudf.read_csv, [fn], kwargs) 104 | for i, fn in enumerate(filenames) 105 | } 106 | 107 | divisions = [None] * (len(filenames) + 1) 108 | 109 | return dd.core.new_dd_object(graph, name, meta, divisions) 110 | -------------------------------------------------------------------------------- /dask_cudf/io/json.py: 
-------------------------------------------------------------------------------- 1 | import cudf 2 | import dask 3 | from functools import partial 4 | 5 | 6 | read_json = partial(dask.dataframe.read_json, engine=cudf.read_json) 7 | -------------------------------------------------------------------------------- /dask_cudf/io/orc.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | 3 | from dask.base import tokenize 4 | from dask.compatibility import apply 5 | import dask.dataframe as dd 6 | 7 | import cudf 8 | 9 | 10 | def read_orc(path, **kwargs): 11 | """ Read ORC files into a Dask DataFrame 12 | 13 | This calls the ``cudf.read_orc`` function on many ORC files. 14 | See that function for additional details. 15 | 16 | Examples 17 | -------- 18 | >>> import dask_cudf 19 | >>> df = dask_cudf.read_orc("/path/to/*.orc") # doctest: +SKIP 20 | 21 | See Also 22 | -------- 23 | cudf.read_orc 24 | """ 25 | 26 | filenames = sorted(glob(str(path))) 27 | name = "read-orc-" + tokenize(path, **kwargs) 28 | 29 | meta = cudf.read_orc(filenames[0], **kwargs) 30 | 31 | graph = { 32 | (name, i): (apply, cudf.read_orc, [fn], kwargs) 33 | for i, fn in enumerate(filenames) 34 | } 35 | 36 | divisions = [None] * (len(filenames) + 1) 37 | 38 | return dd.core.new_dd_object(graph, name, meta, divisions) 39 | -------------------------------------------------------------------------------- /dask_cudf/io/parquet.py: -------------------------------------------------------------------------------- 1 | import os 2 | from glob import glob 3 | 4 | from dask.base import tokenize 5 | from dask.compatibility import apply 6 | import dask.dataframe as dd 7 | from dask.utils import natural_sort_key 8 | 9 | import cudf 10 | 11 | 12 | def read_parquet(path, **kwargs): 13 | """ Read parquet files into a Dask DataFrame 14 | 15 | This calls the ``cudf.read_parquet`` function on many parquet files. 16 | See that function for additional details. 
17 | 18 | Examples 19 | -------- 20 | >>> import dask_cudf 21 | >>> df = dask_cudf.read_parquet("/path/to/dataset/") # doctest: +SKIP 22 | 23 | See Also 24 | -------- 25 | cudf.read_parquet 26 | """ 27 | 28 | name = "read-parquet-" + tokenize( 29 | path, 30 | **kwargs 31 | ) 32 | 33 | paths = path 34 | if isinstance(path, str): 35 | paths = sorted(glob(str(path))) 36 | 37 | # Ignore *_metadata files for now 38 | paths = sorted([f for f in paths if not f.endswith('_metadata')], 39 | key=natural_sort_key) 40 | 41 | # Use 0th file to create meta 42 | meta = cudf.read_parquet(paths[0], **kwargs) 43 | graph = { 44 | (name, i): (apply, cudf.read_parquet, [fn], kwargs) 45 | for i, fn in enumerate(paths) 46 | } 47 | divisions = [None] * (len(paths) + 1) 48 | 49 | return dd.core.new_dd_object(graph, name, meta, divisions) 50 | -------------------------------------------------------------------------------- /dask_cudf/io/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/dask-cudf/b566ab60ea69e6e165533b68b1966875528afb06/dask_cudf/io/tests/__init__.py -------------------------------------------------------------------------------- /dask_cudf/io/tests/sample.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/dask-cudf/b566ab60ea69e6e165533b68b1966875528afb06/dask_cudf/io/tests/sample.orc -------------------------------------------------------------------------------- /dask_cudf/io/tests/test_csv.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import dask 4 | import dask_cudf 5 | import dask.dataframe as dd 6 | import pandas as pd 7 | import numpy as np 8 | 9 | import pytest 10 | 11 | 12 | def test_read_csv(tmp_path): 13 | df = dask.datasets.timeseries(dtypes={"x": int, "y": int}, freq="120s").reset_index( 14 | drop=True 15 | ) 16 | 17 | df.to_csv(tmp_path / "data-*.csv", index=False) 18 | 19 | df2 = dask_cudf.read_csv(tmp_path / "data-*.csv") 20 | dd.assert_eq(df, df2) 21 | 22 | # file path test 23 | stmp_path = str(tmp_path / "data-*.csv") 24 | df3 = dask_cudf.read_csv(f"file://{stmp_path}") 25 | dd.assert_eq(df2, df3) 26 | 27 | 28 | def test_raises_FileNotFoundError(): 29 | with pytest.raises(FileNotFoundError): 30 | dask_cudf.read_csv("foo.csv") 31 | 32 | 33 | def test_read_csv_w_bytes(tmp_path): 34 | df = dask.datasets.timeseries(dtypes={"x": int, "y": int}, freq="120s").reset_index( 35 | drop=True 36 | ) 37 | df = pd.DataFrame(dict(x=np.arange(20), y=np.arange(20))) 38 | df.to_csv(tmp_path / "data-*.csv", index=False) 39 | 40 | df2 = dask_cudf.read_csv(tmp_path / "*.csv", chunksize="50 B") 41 | assert df2.npartitions is 3 42 | dd.assert_eq(df2, df, check_index=False) 43 | 44 | 45 | def test_read_csv_compression(tmp_path): 46 | df = pd.DataFrame(dict(x=np.arange(20), y=np.arange(20))) 47 | df.to_csv(tmp_path / "data.csv.gz", index=False, compression="gzip") 48 | 49 | with pytest.warns(UserWarning) as w: 50 | df2 = dask_cudf.read_csv( 51 | tmp_path / "*.csv.gz", chunksize="50 B", compression="gzip" 52 | ) 53 | 54 | assert len(w) == 1 55 | msg = str(w[0].message) 56 | assert "gzip" in msg 57 | 58 | assert df2.npartitions is 1 59 | dd.assert_eq(df2, df, check_index=False) 60 | 61 | with warnings.catch_warnings(record=True) as record: 62 | df2 = dask_cudf.read_csv( 63 | tmp_path / "*.csv.gz", chunksize=None, compression="gzip" 64 | ) 65 | 66 | assert not record 67 | 
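# A small illustrative sketch (not a collected test: no ``test_`` prefix) of the
# byte-range chunking exercised above: ``_internal_read_csv`` splits each file
# with ``range(0, file_size, chunksize)``, so the partition count is
# ``ceil(file_size / chunksize)``.
def _example_chunked_read(tmp_path):
    import os

    df = pd.DataFrame(dict(x=np.arange(20), y=np.arange(20)))
    fn = tmp_path / "example.csv"
    df.to_csv(fn, index=False)

    size = os.path.getsize(fn)
    ddf = dask_cudf.read_csv(fn, chunksize="50 B")
    # ceil division without importing math
    assert ddf.npartitions == (size + 49) // 50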
-------------------------------------------------------------------------------- /dask_cudf/io/tests/test_json.py: -------------------------------------------------------------------------------- 1 | import dask 2 | import dask_cudf 3 | import dask.dataframe as dd 4 | from dask.utils import tmpfile 5 | import pandas as pd 6 | 7 | import pytest 8 | 9 | 10 | def test_read_json(tmp_path): 11 | df1 = dask.datasets.timeseries( 12 | dtypes={"x": int, "y": int}, freq="120s").reset_index(drop=True) 13 | df1.to_json(tmp_path / "data-*.json") 14 | df2 = dask_cudf.read_json(tmp_path / "data-*.json") 15 | dd.assert_eq(df1, df2) 16 | 17 | 18 | @pytest.mark.filterwarnings("ignore:Using CPU") 19 | @pytest.mark.parametrize('orient', ['split', 'index', 'columns', 'values']) 20 | def test_read_json_basic(orient): 21 | df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]}) 22 | with tmpfile('json') as f: 23 | df.to_json(f, orient=orient, lines=False) 24 | actual = dask_cudf.read_json(f, orient=orient, lines=False) 25 | actual_pd = pd.read_json(f, orient=orient, lines=False) 26 | dd.assert_eq(actual, actual_pd) 27 | 28 | 29 | @pytest.mark.filterwarnings("ignore:Using CPU") 30 | @pytest.mark.parametrize('lines', [True, False]) 31 | def test_read_json_lines(lines): 32 | df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]}) 33 | with tmpfile('json') as f: 34 | df.to_json(f, orient='records', lines=lines) 35 | actual = dask_cudf.read_json(f, orient='records', lines=lines) 36 | actual_pd = pd.read_json(f, orient='records', lines=lines) 37 | dd.assert_eq(actual, actual_pd) 38 | -------------------------------------------------------------------------------- /dask_cudf/io/tests/test_orc.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import dask_cudf 4 | import dask.dataframe as dd 5 | import cudf 6 | 7 | import pytest 8 | 9 | # import pyarrow.orc as orc 10 | 11 | cur_dir = os.path.dirname(__file__) 12 | sample_orc = os.path.join(cur_dir, "sample.orc") 13 | 14 | 15 | def test_read_orc_defaults(): 16 | df1 = cudf.read_orc(sample_orc) 17 | df2 = dask_cudf.read_orc(sample_orc) 18 | df2.head().to_pandas() 19 | dd.assert_eq(df1, df2, check_index=False) 20 | 21 | 22 | # engine pyarrow fails 23 | # https://github.com/rapidsai/cudf/issues/1595 24 | @pytest.mark.parametrize("engine", ["cudf"]) 25 | @pytest.mark.parametrize("columns", [["time", "date"], ["time"]]) 26 | def test_read_orc_cols(engine, columns): 27 | df1 = cudf.read_orc(sample_orc, engine=engine, columns=columns) 28 | 29 | df2 = dask_cudf.read_orc(sample_orc, engine=engine, columns=columns) 30 | 31 | dd.assert_eq(df1, df2, check_index=False) 32 | -------------------------------------------------------------------------------- /dask_cudf/io/tests/test_parquet.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import dask_cudf 4 | import pandas as pd 5 | import dask.dataframe as dd 6 | from dask.dataframe.utils import assert_eq 7 | from dask.utils import natural_sort_key 8 | import cudf 9 | 10 | import pytest 11 | 12 | 13 | nrows = 40 14 | npartitions = 15 15 | df = pd.DataFrame({'x': [i * 7 % 5 for i in range(nrows)], # Not sorted 16 | 'y': [i * 2.5 for i in range(nrows)]}) # Sorted 17 | ddf = dd.from_pandas(df, npartitions=npartitions) 18 | 19 | 20 | def test_roundtrip_from_dask(tmpdir): 21 | tmpdir = str(tmpdir) 22 | ddf.to_parquet(tmpdir, engine='pyarrow') 23 | files = sorted([os.path.join(tmpdir, f) 24 | for f in 
os.listdir(tmpdir) 25 | if not f.endswith('_metadata')], 26 | key=natural_sort_key) 27 | 28 | # Read list of parquet files 29 | ddf2 = dask_cudf.read_parquet(files) 30 | assert_eq(ddf, ddf2, check_divisions=False) 31 | 32 | # Specify columns=['x'] 33 | ddf2 = dask_cudf.read_parquet(files, columns=['x']) 34 | assert_eq(ddf[['x']], ddf2, check_divisions=False) 35 | 36 | # Specify columns='y' 37 | ddf2 = dask_cudf.read_parquet(files, columns='y') 38 | assert_eq(ddf[['y']], ddf2, check_divisions=False) 39 | 40 | # Read parquet-dataset directory 41 | # dask_cudf.read_parquet will ignore *_metadata files 42 | ddf2 = dask_cudf.read_parquet(os.path.join(tmpdir, '*')) 43 | assert_eq(ddf, ddf2, check_divisions=False) 44 | 45 | 46 | def test_roundtrip_from_pandas(tmpdir): 47 | fn = str(tmpdir.join('test.parquet')) 48 | df.to_parquet(fn) 49 | ddf2 = dask_cudf.read_parquet(fn) 50 | assert_eq(df, ddf2) 51 | -------------------------------------------------------------------------------- /dask_cudf/io/tests/test_s3.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from contextlib import contextmanager 4 | 5 | 6 | import dask_cudf 7 | 8 | s3fs = pytest.importorskip("s3fs") 9 | boto3 = pytest.importorskip("boto3") 10 | moto = pytest.importorskip("moto") 11 | httpretty = pytest.importorskip("httpretty") 12 | 13 | from dask.bytes.s3 import DaskS3FileSystem 14 | 15 | 16 | @contextmanager 17 | def ensure_safe_environment_variables(): 18 | """ 19 | Get a context manager to safely set environment variables 20 | All changes will be undone on close, hence environment variables set 21 | within this contextmanager will neither persist nor change global state. 22 | """ 23 | saved_environ = dict(os.environ) 24 | try: 25 | yield 26 | finally: 27 | os.environ.clear() 28 | os.environ.update(saved_environ) 29 | 30 | 31 | @contextmanager 32 | def s3_context(bucket, files): 33 | with ensure_safe_environment_variables(): 34 | # temporary workaround as moto fails for botocore >= 1.11 otherwise, 35 | # see https://github.com/spulec/moto/issues/1924 & 1952 36 | os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") 37 | os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") 38 | 39 | with moto.mock_s3(): 40 | client = boto3.client("s3") 41 | client.create_bucket(Bucket=bucket, ACL="public-read-write") 42 | for f, data in files.items(): 43 | client.put_object(Bucket=bucket, Key=f, Body=data) 44 | 45 | yield DaskS3FileSystem(anon=True) 46 | 47 | for f, data in files.items(): 48 | try: 49 | client.delete_object(Bucket=bucket, Key=f, Body=data) 50 | except Exception: 51 | pass 52 | finally: 53 | httpretty.HTTPretty.disable() 54 | httpretty.HTTPretty.reset() 55 | 56 | 57 | def test_read_csv(): 58 | with s3_context("csv", {"a.csv": b"a,b\n1,2\n3,4\n"}) as s3: 59 | df = dask_cudf.read_csv( 60 | "s3://csv/*.csv", chunksize="50 B", storage_options={"s3": s3} 61 | ) 62 | assert df.a.sum().compute() == 4 63 | -------------------------------------------------------------------------------- /dask_cudf/join_impl.py: -------------------------------------------------------------------------------- 1 | from dask import delayed 2 | import dask.dataframe as dd 3 | 4 | import cudf 5 | 6 | 7 | @delayed 8 | def local_shuffle(frame, num_new_parts, key_columns): 9 | """Regroup the frame based on the key column(s) 10 | """ 11 | partitions = frame.partition_by_hash(columns=key_columns, nparts=num_new_parts) 12 | return dict(enumerate(partitions)) 13 | 14 | 15 | @delayed 16 
| def get_subgroup(groups, i): 17 | out = groups.get(i) 18 | if out is None: 19 | return () 20 | return out 21 | 22 | 23 | def group_frame(frame_partitions, num_new_parts, key_columns): 24 | """Group frame to prepare for the join 25 | """ 26 | return [ 27 | local_shuffle(part, num_new_parts, key_columns) for part in frame_partitions 28 | ] 29 | 30 | 31 | def fanout_subgroups(grouped_parts, num_new_parts): 32 | return [ 33 | [get_subgroup(part, j) for part in grouped_parts] for j in range(num_new_parts) 34 | ] 35 | 36 | 37 | def join_frames(left, right, on, how, lsuffix, rsuffix): 38 | """Join two frames on 1 or more columns. 39 | 40 | Parameters 41 | ---------- 42 | left, right : dask_cudf.DataFrame 43 | on : tuple[str] 44 | key column(s) 45 | how : str 46 | Join method 47 | lsuffix, rsuffix : str 48 | """ 49 | 50 | if on: 51 | on = [on] if isinstance(on, str) else list(on) 52 | 53 | empty_frame = left._meta.merge( 54 | right._meta, on=on, how=how, suffixes=(lsuffix, rsuffix) 55 | ) 56 | 57 | def merge(left, right): 58 | return left.merge(right, on=on, how=how, suffixes=(lsuffix, rsuffix)) 59 | 60 | left_val_names = [k for k in left.columns if k not in on] 61 | right_val_names = [k for k in right.columns if k not in on] 62 | same_names = set(left_val_names) & set(right_val_names) 63 | if same_names and not (lsuffix or rsuffix): 64 | raise ValueError( 65 | "there are overlapping columns but " "lsuffix and rsuffix are not defined" 66 | ) 67 | 68 | dtypes = {k: left[k].dtype for k in left.columns} 69 | dtypes.update({k: right[k].dtype for k in right.columns}) 70 | 71 | left_parts = left.to_delayed() 72 | right_parts = right.to_delayed() 73 | 74 | # Add column w/ hash(v) % nparts 75 | nparts = max(len(left_parts), len(right_parts)) 76 | 77 | left_hashed = group_frame(left_parts, nparts, on) 78 | right_hashed = group_frame(right_parts, nparts, on) 79 | 80 | # Fanout each partition into nparts subgroups 81 | left_subgroups = fanout_subgroups(left_hashed, nparts) 82 | right_subgroups = fanout_subgroups(right_hashed, nparts) 83 | 84 | assert len(left_subgroups) == len(right_subgroups) 85 | 86 | # Concat 87 | left_cats = [delayed(cudf.concat, pure=True)(it) for it in left_subgroups] 88 | right_cats = [delayed(cudf.concat, pure=True)(it) for it in right_subgroups] 89 | 90 | # Combine 91 | merged = [ 92 | delayed(merge, pure=True)(left_cats[i], right_cats[i]) for i in range(nparts) 93 | ] 94 | 95 | return dd.from_delayed(merged, prefix="join_result", meta=empty_frame) 96 | -------------------------------------------------------------------------------- /dask_cudf/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/dask-cudf/b566ab60ea69e6e165533b68b1966875528afb06/dask_cudf/tests/__init__.py -------------------------------------------------------------------------------- /dask_cudf/tests/test_accessor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from pandas.util.testing import assert_series_equal 5 | 6 | import dask_cudf as dgd 7 | from cudf.dataframe import Series 8 | 9 | ############################################################################# 10 | # Datetime Accessor # 11 | ############################################################################# 12 | 13 | 14 | def data_dt_1(): 15 | return pd.date_range("20010101", "20020215", freq="400h") 16 | 17 | 18 | def data_dt_2(): 19 | return 
np.random.randn(100) 20 | 21 | 22 | dt_fields = ["year", "month", "day", "hour", "minute", "second"] 23 | 24 | 25 | @pytest.mark.parametrize("data", [data_dt_2()]) 26 | @pytest.mark.xfail(raises=AttributeError) 27 | def test_datetime_accessor_initialization(data): 28 | pdsr = pd.Series(data.copy()) 29 | sr = Series(pdsr) 30 | dsr = dgd.from_cudf(sr, npartitions=5) 31 | dsr.dt 32 | 33 | 34 | @pytest.mark.parametrize("data", [data_dt_1()]) 35 | def test_series(data): 36 | pdsr = pd.Series(data.copy()) 37 | sr = Series(pdsr) 38 | dsr = dgd.from_cudf(sr, npartitions=5) 39 | 40 | np.testing.assert_equal(np.array(pdsr), np.array(dsr.compute())) 41 | 42 | 43 | @pytest.mark.parametrize("data", [data_dt_1()]) 44 | @pytest.mark.parametrize("field", dt_fields) 45 | def test_dt_series(data, field): 46 | pdsr = pd.Series(data.copy()) 47 | sr = Series(pdsr) 48 | dsr = dgd.from_cudf(sr, npartitions=5) 49 | base = getattr(pdsr.dt, field) 50 | test = getattr(dsr.dt, field).compute().to_pandas().astype("int64") 51 | assert_series_equal(base, test) 52 | 53 | 54 | ############################################################################# 55 | # Categorical Accessor # 56 | ############################################################################# 57 | 58 | 59 | def data_cat_1(): 60 | cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) 61 | return cat 62 | 63 | 64 | def data_cat_2(): 65 | return pd.Series([1, 2, 3]) 66 | 67 | 68 | def data_cat_3(): 69 | cat1 = pd.Categorical( 70 | ["a", "a", "b", "c", "a"], categories=["a", "b", "c"], ordered=True 71 | ) 72 | cat2 = pd.Categorical( 73 | ["a", "b", "a", "c", "b"], categories=["a", "b", "c"], ordered=True 74 | ) 75 | return cat1, cat2 76 | 77 | 78 | @pytest.mark.parametrize("data", [data_cat_2()]) 79 | @pytest.mark.xfail(raises=AttributeError) 80 | def test_categorical_accessor_initialization(data): 81 | sr = Series(data.copy()) 82 | dsr = dgd.from_cudf(sr, npartitions=5) 83 | dsr.cat 84 | 85 | 86 | @pytest.mark.xfail(reason="") 87 | @pytest.mark.parametrize("data", [data_cat_1()]) 88 | def test_categorical_basic(data): 89 | cat = data.copy() 90 | pdsr = pd.Series(cat) 91 | sr = Series(cat) 92 | dsr = dgd.from_cudf(sr, npartitions=2) 93 | result = dsr.compute() 94 | np.testing.assert_array_equal(cat.codes, result.to_array()) 95 | assert dsr.dtype == pdsr.dtype 96 | 97 | # Test attributes 98 | assert pdsr.cat.ordered == dsr.cat.ordered 99 | # TODO: Investigate dsr.cat.categories: It raises 100 | # ValueError: Expected iterable of tuples of (name, dtype), 101 | # got ('a', 'b', 'c') 102 | # assert(tuple(pdsr.cat.categories) == tuple(dsr.cat.categories)) 103 | 104 | np.testing.assert_array_equal(pdsr.cat.codes.data, result.to_array()) 105 | np.testing.assert_array_equal(pdsr.cat.codes.dtype, dsr.cat.codes.dtype) 106 | 107 | string = str(result) 108 | expect_str = """ 109 | 0 a 110 | 1 a 111 | 2 b 112 | 3 c 113 | 4 a 114 | """ 115 | assert all(x == y for x, y in zip(string.split(), expect_str.split())) 116 | 117 | 118 | @pytest.mark.xfail(reason="") 119 | @pytest.mark.parametrize("data", [data_cat_1()]) 120 | def test_categorical_compare_unordered(data): 121 | cat = data.copy() 122 | pdsr = pd.Series(cat) 123 | sr = Series(cat) 124 | dsr = dgd.from_cudf(sr, npartitions=2) 125 | 126 | # Test equality 127 | out = dsr == dsr 128 | assert out.dtype == np.bool_ 129 | assert np.all(out.compute()) 130 | assert np.all(pdsr == pdsr) 131 | 132 | # Test inequality 133 | out = dsr != dsr 134 | assert not np.any(out.compute()) 135 | assert not 
np.any(pdsr != pdsr) 136 | 137 | assert not dsr.cat.ordered 138 | assert not pdsr.cat.ordered 139 | 140 | with pytest.raises((TypeError, ValueError)) as raises: 141 | pdsr < pdsr 142 | 143 | raises.match("Unordered Categoricals can only compare equality or not") 144 | 145 | with pytest.raises((TypeError, ValueError)) as raises: 146 | dsr < dsr 147 | 148 | raises.match("Unordered Categoricals can only compare equality or not") 149 | 150 | 151 | @pytest.mark.parametrize("data", [data_cat_3()]) 152 | def test_categorical_compare_ordered(data): 153 | cat1 = data[0] 154 | cat2 = data[1] 155 | pdsr1 = pd.Series(cat1) 156 | pdsr2 = pd.Series(cat2) 157 | sr1 = Series(cat1) 158 | sr2 = Series(cat2) 159 | dsr1 = dgd.from_cudf(sr1, npartitions=2) 160 | dsr2 = dgd.from_cudf(sr2, npartitions=2) 161 | 162 | # Test equality 163 | out = dsr1 == dsr1 164 | assert out.dtype == np.bool_ 165 | assert np.all(out.compute().to_array()) 166 | assert np.all(pdsr1 == pdsr1) 167 | 168 | # Test inequality 169 | out = dsr1 != dsr1 170 | assert not np.any(out.compute().to_array()) 171 | assert not np.any(pdsr1 != pdsr1) 172 | 173 | assert dsr1.cat.ordered 174 | assert pdsr1.cat.ordered 175 | 176 | # Test ordered operators 177 | np.testing.assert_array_equal(pdsr1 < pdsr2, (dsr1 < dsr2).compute()) 178 | np.testing.assert_array_equal(pdsr1 > pdsr2, (dsr1 > dsr2).compute()) 179 | -------------------------------------------------------------------------------- /dask_cudf/tests/test_batcher_sortnet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | import cudf 5 | from dask_cudf import batcher_sortnet 6 | 7 | 8 | @pytest.mark.parametrize("n", list(range(1, 40))) 9 | def test_padding(n): 10 | data = list(range(n)) 11 | padded, valid = batcher_sortnet._pad_data_to_length(data) 12 | assert len(data) == valid 13 | assert batcher_sortnet.is_power_of_2(len(padded)) 14 | assert valid > len(padded) / 2 15 | assert all(x is not None for x in padded[:valid]) 16 | assert all(x is None for x in padded[valid:]) 17 | 18 | 19 | @pytest.mark.parametrize("seed", [43, 120]) 20 | @pytest.mark.parametrize("nelem", [2, 10, 100]) 21 | def test_compare_frame(seed, nelem): 22 | np.random.seed(seed) 23 | max_part_size = nelem 24 | # Make LHS 25 | lhs = cudf.DataFrame() 26 | lhs["a"] = lhs_a = np.random.random(nelem) 27 | lhs["b"] = lhs_b = np.random.random(nelem) 28 | 29 | # Make RHS 30 | rhs = cudf.DataFrame() 31 | rhs["a"] = rhs_a = np.random.random(nelem) 32 | rhs["b"] = rhs_b = np.random.random(nelem) 33 | 34 | # Sort by column "a" 35 | got_a = batcher_sortnet._compare_frame(lhs, rhs, max_part_size, by="a") 36 | # Check 37 | expect_a = np.hstack([lhs_a, rhs_a]) 38 | expect_a.sort() 39 | np.testing.assert_array_equal(got_a[0].a.to_array(), expect_a[:nelem]) 40 | np.testing.assert_array_equal(got_a[1].a.to_array(), expect_a[nelem:]) 41 | 42 | # Sort by column "b" 43 | got_b = batcher_sortnet._compare_frame(lhs, rhs, max_part_size, by="b") 44 | # Check 45 | expect_b = np.hstack([lhs_b, rhs_b]) 46 | expect_b.sort() 47 | np.testing.assert_array_equal(got_b[0].b.to_array(), expect_b[:nelem]) 48 | np.testing.assert_array_equal(got_b[1].b.to_array(), expect_b[nelem:]) 49 | 50 | 51 | def test_compare_frame_with_none(): 52 | df = cudf.DataFrame() 53 | max_part_size = 1 54 | df["a"] = [0] 55 | res = batcher_sortnet._compare_frame(df, None, max_part_size, by="a") 56 | assert res[0] is not None, res[1] is None 57 | res = batcher_sortnet._compare_frame(None, df, max_part_size, 
by="a") 58 | assert res[0] is not None, res[1] is None 59 | res = batcher_sortnet._compare_frame(None, None, max_part_size, by="a") 60 | assert res == (None, None) 61 | -------------------------------------------------------------------------------- /dask_cudf/tests/test_binops.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | 7 | import cudf 8 | import dask.dataframe as dd 9 | 10 | 11 | def _make_empty_frame(npartitions=2): 12 | df = pd.DataFrame({"x": [], "y": []}) 13 | gdf = cudf.DataFrame.from_pandas(df) 14 | dgf = dd.from_pandas(gdf, npartitions=npartitions) 15 | return dgf 16 | 17 | 18 | def _make_random_frame(nelem, npartitions=2): 19 | df = pd.DataFrame( 20 | {"x": np.random.random(size=nelem), "y": np.random.random(size=nelem)} 21 | ) 22 | gdf = cudf.DataFrame.from_pandas(df) 23 | dgf = dd.from_pandas(gdf, npartitions=npartitions) 24 | return df, dgf 25 | 26 | 27 | def _make_random_frame_float(nelem, npartitions=2): 28 | df = pd.DataFrame( 29 | { 30 | "x": np.random.randint(0, 5, size=nelem), 31 | "y": np.random.normal(size=nelem) + 1, 32 | } 33 | ) 34 | gdf = cudf.from_pandas(df) 35 | dgf = dd.from_pandas(gdf, npartitions=npartitions) 36 | return df, dgf 37 | 38 | 39 | _binops = [ 40 | operator.add, 41 | operator.sub, 42 | operator.mul, 43 | operator.truediv, 44 | operator.floordiv, 45 | operator.eq, 46 | operator.ne, 47 | operator.gt, 48 | operator.ge, 49 | operator.lt, 50 | operator.le, 51 | ] 52 | 53 | 54 | @pytest.mark.parametrize("binop", _binops) 55 | def test_series_binops_integer(binop): 56 | np.random.seed(0) 57 | size = 1000 58 | lhs_df, lhs_gdf = _make_random_frame(size) 59 | rhs_df, rhs_gdf = _make_random_frame(size) 60 | got = binop(lhs_gdf.x, rhs_gdf.y) 61 | exp = binop(lhs_df.x, rhs_df.y) 62 | dd.assert_eq(got, exp) 63 | 64 | 65 | @pytest.mark.parametrize("binop", _binops) 66 | def test_series_binops_float(binop): 67 | np.random.seed(0) 68 | size = 1000 69 | lhs_df, lhs_gdf = _make_random_frame_float(size) 70 | rhs_df, rhs_gdf = _make_random_frame_float(size) 71 | got = binop(lhs_gdf.x, rhs_gdf.y) 72 | exp = binop(lhs_df.x, rhs_df.y) 73 | dd.assert_eq(got, exp) 74 | -------------------------------------------------------------------------------- /dask_cudf/tests/test_core.py: -------------------------------------------------------------------------------- 1 | import dask 2 | import dask.dataframe as dd 3 | import numpy as np 4 | import pandas as pd 5 | import pandas.util.testing as tm 6 | import pytest 7 | 8 | import cudf 9 | import dask_cudf as dgd 10 | 11 | 12 | def test_from_cudf(): 13 | np.random.seed(0) 14 | 15 | df = pd.DataFrame( 16 | {"x": np.random.randint(0, 5, size=10000), "y": np.random.normal(size=10000)} 17 | ) 18 | 19 | gdf = cudf.DataFrame.from_pandas(df) 20 | 21 | # Test simple around to/from dask 22 | ingested = dd.from_pandas(gdf, npartitions=2) 23 | dd.assert_eq(ingested, df) 24 | 25 | # Test conversion to dask.dataframe 26 | ddf = ingested.to_dask_dataframe() 27 | dd.assert_eq(ddf, df) 28 | 29 | 30 | def test_from_cudf_with_generic_idx(): 31 | 32 | cdf = cudf.DataFrame( 33 | [ 34 | ("a", list(range(20))), 35 | ("b", list(reversed(range(20)))), 36 | ("c", list(range(20))), 37 | ] 38 | ) 39 | 40 | ddf = dgd.from_cudf(cdf, npartitions=2) 41 | 42 | assert isinstance(ddf.index.compute(), cudf.dataframe.index.GenericIndex) 43 | dd.assert_eq(ddf.loc[1:2, ["a"]], cdf.loc[1:2, ["a"]]) 44 | 45 | 46 | def _fragmented_gdf(df, 
nsplit): 47 | n = len(df) 48 | 49 | # Split dataframe in *nsplit* 50 | subdivsize = n // nsplit 51 | starts = [i * subdivsize for i in range(nsplit)] 52 | ends = starts[1:] + [None] 53 | frags = [df[s:e] for s, e in zip(starts, ends)] 54 | return frags 55 | 56 | 57 | def test_query(): 58 | np.random.seed(0) 59 | 60 | df = pd.DataFrame( 61 | {"x": np.random.randint(0, 5, size=10), "y": np.random.normal(size=10)} 62 | ) 63 | gdf = cudf.DataFrame.from_pandas(df) 64 | expr = "x > 2" 65 | 66 | dd.assert_eq(gdf.query(expr), df.query(expr)) 67 | 68 | queried = dd.from_pandas(gdf, npartitions=2).query(expr) 69 | 70 | got = queried 71 | expect = gdf.query(expr) 72 | 73 | dd.assert_eq(got, expect) 74 | 75 | 76 | def test_query_local_dict(): 77 | np.random.seed(0) 78 | df = pd.DataFrame( 79 | {"x": np.random.randint(0, 5, size=10), "y": np.random.normal(size=10)} 80 | ) 81 | gdf = cudf.DataFrame.from_pandas(df) 82 | ddf = dgd.from_cudf(gdf, npartitions=2) 83 | 84 | val = 2 85 | 86 | gdf_queried = gdf.query("x > @val") 87 | ddf_queried = ddf.query("x > @val", local_dict={"val": val}) 88 | 89 | dd.assert_eq(gdf_queried, ddf_queried) 90 | 91 | 92 | def test_head(): 93 | np.random.seed(0) 94 | df = pd.DataFrame( 95 | {"x": np.random.randint(0, 5, size=100), "y": np.random.normal(size=100)} 96 | ) 97 | gdf = cudf.DataFrame.from_pandas(df) 98 | dgf = dd.from_pandas(gdf, npartitions=2) 99 | 100 | dd.assert_eq(dgf.head(), df.head()) 101 | 102 | 103 | def test_from_dask_dataframe(): 104 | np.random.seed(0) 105 | df = pd.DataFrame( 106 | {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} 107 | ) 108 | ddf = dd.from_pandas(df, npartitions=2) 109 | dgdf = ddf.map_partitions(cudf.from_pandas) 110 | got = dgdf.compute().to_pandas() 111 | expect = df 112 | 113 | dd.assert_eq(got, expect) 114 | 115 | 116 | @pytest.mark.parametrize("nelem", [10, 200, 1333]) 117 | def test_set_index(nelem): 118 | with dask.config.set(scheduler="single-threaded"): 119 | np.random.seed(0) 120 | # Use unique index range as the sort may not be stable-ordering 121 | x = np.arange(nelem) 122 | np.random.shuffle(x) 123 | df = pd.DataFrame({"x": x, "y": np.random.randint(0, nelem, size=nelem)}) 124 | ddf = dd.from_pandas(df, npartitions=2) 125 | dgdf = ddf.map_partitions(cudf.from_pandas) 126 | 127 | expect = ddf.set_index("x") 128 | got = dgdf.set_index("x") 129 | 130 | dd.assert_eq(expect, got, check_index=False, check_divisions=False) 131 | 132 | 133 | def assert_frame_equal_by_index_group(expect, got): 134 | assert sorted(expect.columns) == sorted(got.columns) 135 | assert sorted(set(got.index)) == sorted(set(expect.index)) 136 | # Note the set_index sort is not stable, 137 | unique_values = sorted(set(got.index)) 138 | for iv in unique_values: 139 | sr_expect = expect.loc[[iv]] 140 | sr_got = got.loc[[iv]] 141 | 142 | for k in expect.columns: 143 | # Sort each column before we compare them 144 | sorted_expect = sr_expect.sort_values(k)[k] 145 | sorted_got = sr_got.sort_values(k)[k] 146 | np.testing.assert_array_equal(sorted_expect, sorted_got) 147 | 148 | 149 | @pytest.mark.parametrize("nelem", [10, 200, 1333]) 150 | def test_set_index_2(nelem): 151 | with dask.config.set(scheduler="single-threaded"): 152 | np.random.seed(0) 153 | df = pd.DataFrame( 154 | { 155 | "x": 100 + np.random.randint(0, nelem // 2, size=nelem), 156 | "y": np.random.normal(size=nelem), 157 | } 158 | ) 159 | expect = df.set_index("x").sort_index() 160 | 161 | dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=4) 162 | res = 
dgf.set_index("x") # sort by default 163 | got = res.compute().to_pandas() 164 | 165 | assert_frame_equal_by_index_group(expect, got) 166 | 167 | 168 | def test_set_index_w_series(): 169 | with dask.config.set(scheduler="single-threaded"): 170 | nelem = 20 171 | np.random.seed(0) 172 | df = pd.DataFrame( 173 | { 174 | "x": 100 + np.random.randint(0, nelem // 2, size=nelem), 175 | "y": np.random.normal(size=nelem), 176 | } 177 | ) 178 | expect = df.set_index(df.x).sort_index() 179 | 180 | dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=4) 181 | res = dgf.set_index(dgf.x) # sort by default 182 | got = res.compute().to_pandas() 183 | 184 | expect.index.name = None 185 | dd.assert_eq(expect, got) 186 | 187 | 188 | def test_assign(): 189 | np.random.seed(0) 190 | df = pd.DataFrame( 191 | {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} 192 | ) 193 | 194 | dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) 195 | pdcol = pd.Series(np.arange(20) + 1000) 196 | newcol = dd.from_pandas(cudf.Series(pdcol), npartitions=dgf.npartitions) 197 | out = dgf.assign(z=newcol) 198 | 199 | got = out 200 | dd.assert_eq(got.loc[:, ["x", "y"]], df) 201 | np.testing.assert_array_equal(got["z"], pdcol) 202 | 203 | 204 | @pytest.mark.parametrize("data_type", ["int8", "int16", "int32", "int64"]) 205 | def test_setitem_scalar_integer(data_type): 206 | np.random.seed(0) 207 | scalar = np.random.randint(0, 100, dtype=data_type) 208 | df = pd.DataFrame( 209 | {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} 210 | ) 211 | dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) 212 | 213 | df["z"] = scalar 214 | dgf["z"] = scalar 215 | 216 | got = dgf.compute().to_pandas() 217 | np.testing.assert_array_equal(got["z"], df["z"]) 218 | 219 | 220 | @pytest.mark.parametrize("data_type", ["float32", "float64"]) 221 | def test_setitem_scalar_float(data_type): 222 | np.random.seed(0) 223 | scalar = np.random.randn(1).astype(data_type)[0] 224 | df = pd.DataFrame( 225 | {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} 226 | ) 227 | dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) 228 | 229 | df["z"] = scalar 230 | dgf["z"] = scalar 231 | 232 | got = dgf.compute().to_pandas() 233 | np.testing.assert_array_equal(got["z"], df["z"]) 234 | 235 | 236 | def test_setitem_scalar_datetime(): 237 | np.random.seed(0) 238 | scalar = np.int64(np.random.randint(0, 100)).astype("datetime64[ms]") 239 | df = pd.DataFrame( 240 | {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} 241 | ) 242 | dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) 243 | 244 | df["z"] = scalar 245 | dgf["z"] = scalar 246 | 247 | got = dgf.compute().to_pandas() 248 | np.testing.assert_array_equal(got["z"], df["z"]) 249 | 250 | 251 | @pytest.mark.parametrize( 252 | "func", 253 | [ 254 | lambda: tm.makeDataFrame().reset_index(), 255 | tm.makeDataFrame, 256 | tm.makeMixedDataFrame, 257 | tm.makeObjectSeries, 258 | tm.makeTimeSeries, 259 | ], 260 | ) 261 | def test_repr(func): 262 | pdf = func() 263 | try: 264 | gdf = cudf.from_pandas(pdf) 265 | except Exception: 266 | raise pytest.xfail() 267 | # gddf = dd.from_pandas(gdf, npartitions=3, sort=False) # TODO 268 | gddf = dd.from_pandas(gdf, npartitions=3, sort=False) 269 | 270 | assert repr(gddf) 271 | if hasattr(pdf, "_repr_html_"): 272 | assert gddf._repr_html_() 273 | 274 | 275 | @pytest.mark.skip(reason="datetime indexes not fully supported in cudf") 276 | 
@pytest.mark.parametrize("start", ["1d", "5d", "1w", "12h"]) 277 | @pytest.mark.parametrize("stop", ["1d", "3d", "8h"]) 278 | def test_repartition_timeseries(start, stop): 279 | # This test is currently absurdly slow. It should not be unskipped without 280 | # slimming it down. 281 | pdf = dask.datasets.timeseries( 282 | "2000-01-01", 283 | "2000-01-31", 284 | freq="1s", 285 | partition_freq=start, 286 | dtypes={"x": int, "y": float}, 287 | ) 288 | gdf = pdf.map_partitions(cudf.DataFrame.from_pandas) 289 | 290 | a = pdf.repartition(freq=stop) 291 | b = gdf.repartition(freq=stop) 292 | assert a.divisions == b.divisions 293 | 294 | dd.utils.assert_eq(a, b) 295 | 296 | 297 | @pytest.mark.parametrize("start", [1, 2, 5]) 298 | @pytest.mark.parametrize("stop", [1, 3, 7]) 299 | def test_repartition_simple_divisions(start, stop): 300 | pdf = pd.DataFrame({"x": range(100)}) 301 | 302 | pdf = dd.from_pandas(pdf, npartitions=start) 303 | gdf = pdf.map_partitions(cudf.DataFrame.from_pandas) 304 | 305 | a = pdf.repartition(npartitions=stop) 306 | b = gdf.repartition(npartitions=stop) 307 | assert a.divisions == b.divisions 308 | 309 | dd.utils.assert_eq(a, b) 310 | 311 | 312 | @pytest.fixture 313 | def pdf(): 314 | return pd.DataFrame( 315 | {"x": [1, 2, 3, 4, 5, 6], "y": [11.0, 12.0, 13.0, 14.0, 15.0, 16.0]} 316 | ) 317 | 318 | 319 | @pytest.fixture 320 | def gdf(pdf): 321 | return cudf.from_pandas(pdf) 322 | 323 | 324 | @pytest.fixture 325 | def ddf(pdf): 326 | return dd.from_pandas(pdf, npartitions=3) 327 | 328 | 329 | @pytest.fixture 330 | def gddf(gdf): 331 | return dd.from_pandas(gdf, npartitions=3) 332 | 333 | 334 | @pytest.mark.parametrize( 335 | "func", 336 | [ 337 | lambda df: df + 1, 338 | lambda df: df.index, 339 | lambda df: df.x.sum(), 340 | lambda df: df.x.astype(float), 341 | lambda df: df.assign(z=df.x.astype("int")), 342 | ], 343 | ) 344 | def test_unary_ops(func, gdf, gddf): 345 | p = func(gdf) 346 | g = func(gddf) 347 | 348 | # Fixed in https://github.com/dask/dask/pull/4657 349 | if isinstance(p, cudf.Index): 350 | from packaging import version 351 | if version.parse(dask.__version__) < version.parse("1.1.6"): 352 | pytest.skip("dask.dataframe assert_eq index check hardcoded to " 353 | "pandas prior to 1.1.6 release") 354 | 355 | dd.assert_eq(p, g, check_names=False) 356 | 357 | 358 | @pytest.mark.parametrize("series", [True, False]) 359 | def test_concat(gdf, gddf, series): 360 | if series: 361 | gdf = gdf.x 362 | gddf = gddf.x 363 | a = cudf.concat([gdf, gdf + 1, gdf + 2]).sort_values("x").reset_index(drop=True) 364 | b = ( 365 | dd.concat([gddf, gddf + 1, gddf + 2], interleave_partitions=True) 366 | .compute() 367 | .sort_values("x") 368 | .reset_index(drop=True) 369 | ) 370 | dd.assert_eq(a, b) 371 | 372 | 373 | def test_boolean_index(gdf, gddf): 374 | 375 | gdf2 = gdf[gdf.x > 2] 376 | gddf2 = gddf[gddf.x > 2] 377 | 378 | dd.assert_eq(gdf2, gddf2) 379 | -------------------------------------------------------------------------------- /dask_cudf/tests/test_delayed_io.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test IO with dask.delayed API 3 | """ 4 | import numpy as np 5 | import pytest 6 | from dask.delayed import delayed 7 | from pandas.util.testing import assert_frame_equal 8 | 9 | import cudf as gd 10 | import dask_cudf as dgd 11 | 12 | 13 | @delayed 14 | def load_data(nelem, ident): 15 | df = gd.DataFrame() 16 | df["x"] = np.arange(nelem) 17 | df["ident"] = np.asarray([ident] * nelem) 18 | return df 19 | 20 | 21 | 
@delayed 22 | def get_combined_column(df): 23 | return df.x * df.ident 24 | 25 | 26 | def test_dataframe_from_delayed(): 27 | delays = [load_data(10 * i, i) for i in range(1, 3)] 28 | out = dgd.from_delayed(delays) 29 | res = out.compute() 30 | assert isinstance(res, gd.DataFrame) 31 | 32 | expected = gd.concat([d.compute() for d in delays]) 33 | assert_frame_equal(res.to_pandas(), expected.to_pandas()) 34 | 35 | 36 | def test_series_from_delayed(): 37 | delays = [get_combined_column(load_data(10 * i, i)) for i in range(1, 3)] 38 | out = dgd.from_delayed(delays) 39 | res = out.compute() 40 | assert isinstance(res, gd.Series) 41 | 42 | expected = gd.concat([d.compute() for d in delays]) 43 | np.testing.assert_array_equal(res.to_pandas(), expected.to_pandas()) 44 | 45 | 46 | def test_dataframe_to_delayed(): 47 | nelem = 100 48 | 49 | df = gd.DataFrame() 50 | df["x"] = np.arange(nelem) 51 | df["y"] = np.random.randint(nelem, size=nelem) 52 | 53 | ddf = dgd.from_cudf(df, npartitions=5) 54 | 55 | delays = ddf.to_delayed() 56 | 57 | assert len(delays) == 5 58 | 59 | # Concat the delayed partitions 60 | got = gd.concat([d.compute() for d in delays]) 61 | assert_frame_equal(got.to_pandas(), df.to_pandas()) 62 | 63 | # Check individual partitions 64 | divs = ddf.divisions 65 | assert len(divs) == len(delays) + 1 66 | 67 | for i, part in enumerate(delays): 68 | s = divs[i] 69 | # The last divisions in the last index 70 | e = None if i + 1 == len(delays) else divs[i + 1] 71 | expect = df[s:e].to_pandas() 72 | got = part.compute().to_pandas() 73 | assert_frame_equal(got, expect) 74 | 75 | 76 | def test_series_to_delayed(): 77 | nelem = 100 78 | 79 | sr = gd.Series(np.random.randint(nelem, size=nelem)) 80 | 81 | dsr = dgd.from_cudf(sr, npartitions=5) 82 | 83 | delays = dsr.to_delayed() 84 | 85 | assert len(delays) == 5 86 | 87 | # Concat the delayed partitions 88 | got = gd.concat([d.compute() for d in delays]) 89 | assert isinstance(got, gd.Series) 90 | np.testing.assert_array_equal(got.to_pandas(), sr.to_pandas()) 91 | 92 | # Check individual partitions 93 | divs = dsr.divisions 94 | assert len(divs) == len(delays) + 1 95 | 96 | for i, part in enumerate(delays): 97 | s = divs[i] 98 | # The last divisions in the last index 99 | e = None if i + 1 == len(delays) else divs[i + 1] 100 | expect = sr[s:e].to_pandas() 101 | got = part.compute().to_pandas() 102 | np.testing.assert_array_equal(got, expect) 103 | 104 | 105 | def test_mixing_series_frame_error(): 106 | nelem = 20 107 | 108 | df = gd.DataFrame() 109 | df["x"] = np.arange(nelem) 110 | df["y"] = np.random.randint(nelem, size=nelem) 111 | 112 | ddf = dgd.from_cudf(df, npartitions=5) 113 | 114 | delay_frame = ddf.to_delayed() 115 | delay_series = ddf.x.to_delayed() 116 | combined = dgd.from_delayed(delay_frame + delay_series) 117 | 118 | with pytest.raises(ValueError) as raises: 119 | combined.compute() 120 | 121 | raises.match(r"^Metadata mismatch found in `from_delayed`.") 122 | 123 | 124 | def test_frame_extra_columns_error(): 125 | nelem = 20 126 | 127 | df = gd.DataFrame() 128 | df["x"] = np.arange(nelem) 129 | df["y"] = np.random.randint(nelem, size=nelem) 130 | ddf1 = dgd.from_cudf(df, npartitions=5) 131 | 132 | df["z"] = np.arange(nelem) 133 | ddf2 = dgd.from_cudf(df, npartitions=5) 134 | 135 | combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed()) 136 | 137 | with pytest.raises(ValueError) as raises: 138 | combined.compute() 139 | 140 | raises.match(r"^Metadata mismatch found in `from_delayed`.") 141 | raises.match(r"z") 142 | 
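# Illustrative sketch (not part of the upstream test suite): the metadata
# mismatch exercised above goes away when every delayed partition carries the
# same columns and dtypes. Uses only names already imported in this module;
# the leading underscore keeps pytest from collecting it as a test.
def _example_matching_metadata(nelem=20):
    df1 = gd.DataFrame()
    df1["x"] = np.arange(nelem)
    df1["y"] = np.random.randint(nelem, size=nelem)

    # Same schema as df1, so the inferred metadata agrees across partitions.
    df2 = gd.DataFrame()
    df2["x"] = np.arange(nelem)
    df2["y"] = np.random.randint(nelem, size=nelem)

    ddf1 = dgd.from_cudf(df1, npartitions=5)
    ddf2 = dgd.from_cudf(df2, npartitions=5)

    combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed())
    return combined.compute()  # no metadata mismatch expected here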
143 | 144 | @pytest.mark.xfail(reason="") 145 | def test_frame_dtype_error(): 146 | nelem = 20 147 | 148 | df1 = gd.DataFrame() 149 | df1["bad"] = np.arange(nelem) 150 | df1["bad"] = np.arange(nelem, dtype=np.float64) 151 | 152 | df2 = gd.DataFrame() 153 | df2["bad"] = np.arange(nelem) 154 | df2["bad"] = np.arange(nelem, dtype=np.float32) 155 | 156 | ddf1 = dgd.from_cudf(df1, npartitions=5) 157 | ddf2 = dgd.from_cudf(df2, npartitions=5) 158 | 159 | combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed()) 160 | 161 | with pytest.raises(ValueError) as raises: 162 | combined.compute() 163 | 164 | raises.match(r"same type") 165 | -------------------------------------------------------------------------------- /dask_cudf/tests/test_distributed.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import cudf 4 | import dask 5 | from dask.distributed import Client 6 | import dask.dataframe as dd 7 | from distributed.utils_test import loop # noqa: F401 8 | 9 | dask_cuda = pytest.importorskip("dask_cuda") 10 | 11 | 12 | @pytest.mark.parametrize("delayed", [True, False]) # noqa: F811 13 | def test_basic(loop, delayed): # noqa: F811 14 | with dask_cuda.LocalCUDACluster(loop=loop) as cluster: 15 | with Client(cluster): 16 | pdf = dask.datasets.timeseries(dtypes={"x": int}).reset_index() 17 | gdf = pdf.map_partitions(cudf.DataFrame.from_pandas) 18 | if delayed: 19 | gdf = dd.from_delayed(gdf.to_delayed()) 20 | dd.assert_eq(pdf.head(), gdf.head()) 21 | -------------------------------------------------------------------------------- /dask_cudf/tests/test_groupby.py: -------------------------------------------------------------------------------- 1 | import dask.dataframe as dd 2 | import dask_cudf 3 | import pandas as pd 4 | import cudf 5 | import numpy as np 6 | 7 | import pytest 8 | 9 | 10 | @pytest.mark.parametrize( 11 | 'agg', 12 | [ 13 | 'sum', 14 | 'mean', 15 | 'count', 16 | 'min', 17 | 'max' 18 | ] 19 | ) 20 | def test_groupby_basic_aggs(agg): 21 | pdf = pd.DataFrame( 22 | {"x": np.random.randint(0, 5, size=10000), "y": np.random.normal(size=10000)} 23 | ) 24 | 25 | gdf = cudf.DataFrame.from_pandas(pdf) 26 | 27 | ddf = dask_cudf.from_cudf(gdf, npartitions=5) 28 | 29 | a = getattr(gdf.groupby("x"), agg)().to_pandas() 30 | b = getattr(ddf.groupby("x"), agg)().compute().to_pandas() 31 | 32 | a.index.name = None 33 | a.name = None 34 | b.index.name = None 35 | b.name = None 36 | 37 | if agg == "count": 38 | a["y"] = a["y"].astype(np.int64) 39 | 40 | dd.assert_eq(a, b) 41 | 42 | 43 | @pytest.mark.parametrize( 44 | "func", 45 | [ 46 | lambda df: df.groupby("x").agg({"y": "max"}), 47 | pytest.param( 48 | lambda df: df.groupby("x").y.agg(["sum", "max"]), 49 | marks=pytest.mark.skip 50 | ) 51 | ], 52 | ) 53 | def test_groupby_agg(func): 54 | pdf = pd.DataFrame( 55 | {"x": np.random.randint(0, 5, size=10000), "y": np.random.normal(size=10000)} 56 | ) 57 | 58 | gdf = cudf.DataFrame.from_pandas(pdf) 59 | 60 | ddf = dask_cudf.from_cudf(gdf, npartitions=5) 61 | 62 | a = func(gdf).to_pandas() 63 | b = func(ddf).compute().to_pandas() 64 | 65 | a.index.name = None 66 | a.name = None 67 | b.index.name = None 68 | b.name = None 69 | 70 | dd.assert_eq(a, b) 71 | 72 | 73 | @pytest.mark.xfail(reason="cudf issues") 74 | @pytest.mark.parametrize( 75 | "func", [lambda df: df.groupby("x").std(), lambda df: df.groupby("x").y.std()] 76 | ) 77 | def test_groupby_std(func): 78 | pdf = pd.DataFrame( 79 | {"x": np.random.randint(0, 5, size=10000), "y": 
np.random.normal(size=10000)} 80 | ) 81 | 82 | gdf = cudf.DataFrame.from_pandas(pdf) 83 | 84 | ddf = dask_cudf.from_cudf(gdf, npartitions=5) 85 | 86 | a = func(gdf.to_pandas()) 87 | b = func(ddf).compute().to_pandas() 88 | 89 | a.index.name = None 90 | a.name = None 91 | b.index.name = None 92 | 93 | dd.assert_eq(a, b) 94 | 95 | 96 | # reason gotattr in cudf 97 | @pytest.mark.parametrize( 98 | "func", 99 | [ 100 | pytest.param( 101 | lambda df: df.groupby(["a", "b"]).x.sum(), 102 | marks=pytest.mark.xfail 103 | ), 104 | pytest.param( 105 | lambda df: df.groupby(["a", "b"]).sum(), marks=pytest.mark.xfail 106 | ), 107 | pytest.param( 108 | lambda df: df.groupby(["a", "b"]).agg({'x', "sum"}), marks=pytest.mark.xfail 109 | ) 110 | ], 111 | ) 112 | def test_groupby_multi_column(func): 113 | pdf = pd.DataFrame( 114 | { 115 | "a": np.random.randint(0, 20, size=1000), 116 | "b": np.random.randint(0, 5, size=1000), 117 | "x": np.random.normal(size=1000), 118 | } 119 | ) 120 | 121 | gdf = cudf.DataFrame.from_pandas(pdf) 122 | 123 | ddf = dask_cudf.from_cudf(gdf, npartitions=5) 124 | 125 | a = func(gdf).to_pandas() 126 | b = func(ddf).compute().to_pandas() 127 | 128 | dd.assert_eq(a, b) 129 | -------------------------------------------------------------------------------- /dask_cudf/tests/test_join.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import pytest 6 | 7 | import cudf 8 | import dask_cudf as dgd 9 | import dask.dataframe as dd 10 | 11 | param_nrows = [5, 10, 50, 100] 12 | 13 | 14 | @pytest.mark.parametrize("left_nrows", param_nrows) 15 | @pytest.mark.parametrize("right_nrows", param_nrows) 16 | @pytest.mark.parametrize("left_nkeys", [4, 5]) 17 | @pytest.mark.parametrize("right_nkeys", [4, 5]) 18 | def test_join_inner(left_nrows, right_nrows, left_nkeys, right_nkeys): 19 | chunksize = 50 20 | 21 | np.random.seed(0) 22 | 23 | # cuDF 24 | left = cudf.DataFrame( 25 | { 26 | "x": np.random.randint(0, left_nkeys, size=left_nrows), 27 | "a": np.arange(left_nrows), 28 | }.items() 29 | ) 30 | right = cudf.DataFrame( 31 | { 32 | "x": np.random.randint(0, right_nkeys, size=right_nrows), 33 | "a": 1000 * np.arange(right_nrows), 34 | }.items() 35 | ) 36 | 37 | expect = left.set_index("x").join( 38 | right.set_index("x"), how="inner", sort=True, lsuffix="l", rsuffix="r" 39 | ) 40 | expect = expect.to_pandas() 41 | 42 | # dask_cudf 43 | left = dgd.from_cudf(left, chunksize=chunksize) 44 | right = dgd.from_cudf(right, chunksize=chunksize) 45 | 46 | joined = left.set_index("x").join( 47 | right.set_index("x"), how="inner", lsuffix="l", rsuffix="r" 48 | ) 49 | got = joined.compute().to_pandas() 50 | 51 | # Check index 52 | np.testing.assert_array_equal(expect.index.values, got.index.values) 53 | 54 | # Check rows in each groups 55 | expect_rows = {} 56 | got_rows = {} 57 | 58 | def gather(df, grows): 59 | grows[df["index"].values[0]] = (set(df.al), set(df.ar)) 60 | 61 | expect.reset_index().groupby("index").apply(partial(gather, grows=expect_rows)) 62 | 63 | expect.reset_index().groupby("index").apply(partial(gather, grows=got_rows)) 64 | 65 | assert got_rows == expect_rows 66 | 67 | 68 | @pytest.mark.parametrize("left_nrows", param_nrows) 69 | @pytest.mark.parametrize("right_nrows", param_nrows) 70 | @pytest.mark.parametrize("left_nkeys", [4, 5]) 71 | @pytest.mark.parametrize("right_nkeys", [4, 5]) 72 | @pytest.mark.parametrize("how", ["left", "right"]) 73 | def 
test_join_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how): 74 | chunksize = 50 75 | 76 | np.random.seed(0) 77 | 78 | # cuDF 79 | left = cudf.DataFrame( 80 | { 81 | "x": np.random.randint(0, left_nkeys, size=left_nrows), 82 | "a": np.arange(left_nrows, dtype=np.float64), 83 | }.items() 84 | ) 85 | right = cudf.DataFrame( 86 | { 87 | "x": np.random.randint(0, right_nkeys, size=right_nrows), 88 | "a": 1000 * np.arange(right_nrows, dtype=np.float64), 89 | }.items() 90 | ) 91 | 92 | expect = left.set_index("x").join( 93 | right.set_index("x"), how=how, sort=True, lsuffix="l", rsuffix="r" 94 | ) 95 | expect = expect.to_pandas() 96 | 97 | # dask_cudf 98 | left = dgd.from_cudf(left, chunksize=chunksize) 99 | right = dgd.from_cudf(right, chunksize=chunksize) 100 | 101 | joined = left.set_index("x").join( 102 | right.set_index("x"), how=how, lsuffix="l", rsuffix="r" 103 | ) 104 | got = joined.compute().to_pandas() 105 | 106 | # Check index 107 | np.testing.assert_array_equal(expect.index.values, got.index.values) 108 | 109 | # Check rows in each groups 110 | expect_rows = {} 111 | got_rows = {} 112 | 113 | def gather(df, grows): 114 | cola = np.sort(np.asarray(df.al)) 115 | colb = np.sort(np.asarray(df.ar)) 116 | 117 | grows[df["index"].values[0]] = (cola, colb) 118 | 119 | expect.reset_index().groupby("index").apply(partial(gather, grows=expect_rows)) 120 | 121 | expect.reset_index().groupby("index").apply(partial(gather, grows=got_rows)) 122 | 123 | for k in expect_rows: 124 | np.testing.assert_array_equal(expect_rows[k][0], got_rows[k][0]) 125 | np.testing.assert_array_equal(expect_rows[k][1], got_rows[k][1]) 126 | 127 | 128 | @pytest.mark.parametrize("left_nrows", param_nrows) 129 | @pytest.mark.parametrize("right_nrows", param_nrows) 130 | @pytest.mark.parametrize("left_nkeys", [4, 5]) 131 | @pytest.mark.parametrize("right_nkeys", [4, 5]) 132 | def test_merge_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how="left"): 133 | chunksize = 3 134 | 135 | np.random.seed(0) 136 | 137 | # cuDF 138 | left = cudf.DataFrame( 139 | { 140 | "x": np.random.randint(0, left_nkeys, size=left_nrows), 141 | "y": np.random.randint(0, left_nkeys, size=left_nrows), 142 | "a": np.arange(left_nrows, dtype=np.float64), 143 | }.items() 144 | ) 145 | right = cudf.DataFrame( 146 | { 147 | "x": np.random.randint(0, right_nkeys, size=right_nrows), 148 | "y": np.random.randint(0, right_nkeys, size=right_nrows), 149 | "a": 1000 * np.arange(right_nrows, dtype=np.float64), 150 | }.items() 151 | ) 152 | 153 | expect = left.merge(right, on=("x", "y"), how=how) 154 | 155 | def normalize(df): 156 | return ( 157 | df.to_pandas().sort_values(["x", "y", "a_x", "a_y"]).reset_index(drop=True) 158 | ) 159 | 160 | # dask_cudf 161 | left = dgd.from_cudf(left, chunksize=chunksize) 162 | right = dgd.from_cudf(right, chunksize=chunksize) 163 | 164 | result = left.merge(right, on=("x", "y"), how=how).compute( 165 | scheduler="single-threaded" 166 | ) 167 | 168 | dd.assert_eq(normalize(expect), normalize(result)) 169 | 170 | 171 | @pytest.mark.parametrize("left_nrows", [2, 5]) 172 | @pytest.mark.parametrize("right_nrows", [5, 10]) 173 | @pytest.mark.parametrize("left_nkeys", [4]) 174 | @pytest.mark.parametrize("right_nkeys", [4]) 175 | def test_merge_1col_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how="left"): 176 | chunksize = 3 177 | 178 | np.random.seed(0) 179 | 180 | # cuDF 181 | left = cudf.DataFrame( 182 | { 183 | "x": np.random.randint(0, left_nkeys, size=left_nrows), 184 | "a": np.arange(left_nrows, 
dtype=np.float64), 185 | }.items() 186 | ) 187 | right = cudf.DataFrame( 188 | { 189 | "x": np.random.randint(0, right_nkeys, size=right_nrows), 190 | "a": 1000 * np.arange(right_nrows, dtype=np.float64), 191 | }.items() 192 | ) 193 | 194 | expect = left.merge(right, on=["x"], how=how) 195 | expect = expect.to_pandas().sort_values(["x", "a_x", "a_y"]).reset_index(drop=True) 196 | 197 | # dask_cudf 198 | left = dgd.from_cudf(left, chunksize=chunksize) 199 | right = dgd.from_cudf(right, chunksize=chunksize) 200 | 201 | joined = left.merge(right, on=["x"], how=how) 202 | 203 | got = joined.compute().to_pandas() 204 | 205 | got = got.sort_values(["x", "a_x", "a_y"]).reset_index(drop=True) 206 | 207 | dd.assert_eq(expect, got) 208 | 209 | 210 | @pytest.mark.parametrize("how", ["inner", "left"]) 211 | def test_indexed_join(how): 212 | p_left = pd.DataFrame({"x": np.arange(10)}, index=np.arange(10) * 2) 213 | p_right = pd.DataFrame({"y": 1}, index=np.arange(15)) 214 | 215 | g_left = cudf.from_pandas(p_left) 216 | g_right = cudf.from_pandas(p_right) 217 | 218 | dg_left = dd.from_pandas(g_left, npartitions=4) 219 | dg_right = dd.from_pandas(g_right, npartitions=5) 220 | 221 | d = g_left.merge(g_right, left_index=True, right_index=True, how=how) 222 | dg = dg_left.merge(dg_right, left_index=True, right_index=True, how=how) 223 | 224 | # occasionally order is not correct (possibly due to hashing in the merge) 225 | d = d.sort_values('x') # index is preserved 226 | dg = dg.sort_values('x') # index is reset -- sort_values will slow test down 227 | 228 | dd.assert_eq(d, dg, check_index=False) 229 | 230 | 231 | @pytest.mark.parametrize("how", ["left", "inner"]) 232 | def test_how(how): 233 | left = cudf.DataFrame({"x": [1, 2, 3, 4, None], "y": [1.0, 2.0, 3.0, 4.0, 0.0]}) 234 | right = cudf.DataFrame({"x": [2, 3, None, 2], "y": [20, 30, 0, 20]}) 235 | 236 | dleft = dd.from_pandas(left, npartitions=2) 237 | dright = dd.from_pandas(right, npartitions=3) 238 | 239 | expected = left.merge(right, how=how, on="x") 240 | result = dleft.merge(dright, how=how, on="x") 241 | 242 | dd.assert_eq( 243 | result.compute().to_pandas().sort_values("x"), 244 | expected.to_pandas().sort_values("x"), 245 | check_index=False, 246 | ) 247 | 248 | 249 | @pytest.mark.parametrize("daskify", [True, False]) 250 | def test_single_dataframe_merge(daskify): 251 | right = cudf.DataFrame({"x": [1, 2, 1, 2], "y": [1, 2, 3, 4]}) 252 | left = cudf.DataFrame({"x": np.arange(100) % 10, "z": np.arange(100)}) 253 | 254 | dleft = dd.from_pandas(left, npartitions=10) 255 | 256 | if daskify: 257 | dright = dd.from_pandas(right, npartitions=1) 258 | else: 259 | dright = right 260 | 261 | expected = left.merge(right, how="inner") 262 | result = dd.merge(dleft, dright, how="inner") 263 | assert len(result.dask) < 25 264 | 265 | dd.assert_eq( 266 | result.compute().to_pandas().sort_values(["z", "y"]), 267 | expected.to_pandas().sort_values(["z", "y"]), 268 | check_index=False, 269 | ) 270 | 271 | 272 | @pytest.mark.parametrize("how", ["inner", "left"]) 273 | @pytest.mark.parametrize("on", ["id_1", ["id_1"], ["id_1", "id_2"]]) 274 | def test_on(how, on): 275 | left = cudf.DataFrame({"id_1": [1, 2, 3, 4, 5], "id_2": [1.0, 2.0, 3.0, 4.0, 0.0]}) 276 | right = cudf.DataFrame({"id_1": [2, 3, None, 2], "id_2": [2.0, 3.0, 4.0, 20]}) 277 | 278 | dleft = dd.from_pandas(left, npartitions=2) 279 | dright = dd.from_pandas(right, npartitions=3) 280 | 281 | expected = left.merge(right, how=how, on=on) 282 | result = dleft.merge(dright, how=how, on=on) 283 | 284 | 
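# Partition-wise merges do not guarantee row order, so compare the frames
# after sorting on the join key(s) and ignore the index below.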
dd.assert_eq( 285 | result.compute().to_pandas().sort_values(on), 286 | expected.to_pandas().sort_values(on), 287 | check_index=False, 288 | ) 289 | 290 | 291 | def test_single_partition(): 292 | left = cudf.DataFrame({"x": range(200), "y": range(200)}) 293 | right = cudf.DataFrame({"x": range(100), "z": range(100)}) 294 | 295 | dleft = dd.from_pandas(left, npartitions=1) 296 | dright = dd.from_pandas(right, npartitions=10) 297 | 298 | m = dleft.merge(dright, how="inner") 299 | assert len(m.dask) < len(dleft.dask) + len(dright.dask) * 3 300 | 301 | dleft = dd.from_pandas(left, npartitions=5) 302 | m2 = dleft.merge(right, how="inner") 303 | assert len(m2.dask) < len(dleft.dask) * 3 304 | assert len(m2) == 100 305 | -------------------------------------------------------------------------------- /dask_cudf/tests/test_reductions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from dask.dataframe.utils import assert_eq 5 | 6 | import cudf as gd 7 | import dask_cudf as dgd 8 | 9 | 10 | def _make_random_frame(nelem, npartitions=2): 11 | df = pd.DataFrame( 12 | { 13 | "x": np.random.randint(0, 5, size=nelem), 14 | "y": np.random.normal(size=nelem) + 1, 15 | } 16 | ) 17 | gdf = gd.DataFrame.from_pandas(df) 18 | dgf = dgd.from_cudf(gdf, npartitions=npartitions) 19 | return df, dgf 20 | 21 | 22 | _reducers = ["sum", "count", "mean", "var", "std", "min", "max"] 23 | 24 | 25 | def _get_reduce_fn(name): 26 | def wrapped(series): 27 | fn = getattr(series, name) 28 | return fn() 29 | 30 | return wrapped 31 | 32 | 33 | @pytest.mark.parametrize("reducer", _reducers) 34 | def test_series_reduce(reducer): 35 | reducer = _get_reduce_fn(reducer) 36 | np.random.seed(0) 37 | size = 10 38 | df, gdf = _make_random_frame(size) 39 | 40 | got = reducer(gdf.x) 41 | exp = reducer(df.x) 42 | assert_eq(got, exp) 43 | -------------------------------------------------------------------------------- /dask_cudf/tests/test_sort.py: -------------------------------------------------------------------------------- 1 | import dask 2 | import numpy as np 3 | import pandas as pd 4 | import pytest 5 | 6 | import cudf 7 | import dask.dataframe as dd 8 | 9 | 10 | @pytest.mark.parametrize("by", ["a", "b"]) 11 | @pytest.mark.parametrize("nelem", [10, 100, 1000]) 12 | @pytest.mark.parametrize("nparts", [1, 2, 5, 10]) 13 | def test_sort_values(nelem, nparts, by): 14 | df = cudf.DataFrame() 15 | df["a"] = np.ascontiguousarray(np.arange(nelem)[::-1]) 16 | df["b"] = np.arange(100, nelem + 100) 17 | ddf = dd.from_pandas(df, npartitions=nparts) 18 | 19 | with dask.config.set(scheduler="single-threaded"): 20 | got = ddf.sort_values(by=by).compute().to_pandas() 21 | expect = df.sort_values(by=by).to_pandas().reset_index(drop=True) 22 | pd.util.testing.assert_frame_equal(got, expect) 23 | 24 | 25 | def test_sort_values_binned(): 26 | np.random.seed(43) 27 | nelem = 100 28 | nparts = 5 29 | by = "a" 30 | df = cudf.DataFrame() 31 | df["a"] = np.random.randint(1, 5, nelem) 32 | ddf = dd.from_pandas(df, npartitions=nparts) 33 | 34 | parts = ddf.sort_values_binned(by=by).to_delayed() 35 | part_uniques = [] 36 | for i, p in enumerate(parts): 37 | part = dask.compute(p)[0] 38 | part_uniques.append(set(part.a.unique())) 39 | 40 | # Partitions do not have intersecting keys 41 | for i in range(len(part_uniques)): 42 | for j in range(i + 1, len(part_uniques)): 43 | assert not ( 44 | part_uniques[i] & part_uniques[j] 45 | ), "should have empty 
intersection" 46 | 47 | 48 | def test_sort_binned_meta(): 49 | df = cudf.DataFrame([("a", [0, 1, 2, 3, 4]), ("b", [5, 6, 7, 7, 8])]) 50 | ddf = dd.from_pandas(df, npartitions=2).persist() 51 | 52 | ddf.sort_values_binned(by="b") 53 | -------------------------------------------------------------------------------- /gpuci_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/lib 4 | CC=/usr/bin/gcc 5 | CXX=/usr/bin/g++ 6 | DASKCUDF_REPO=https://github.com/rapidsai/dask_cudf 7 | NUMBA_VERSION=0.40.0 8 | NUMPY_VERSION=1.14.5 9 | PANDAS_VERSION=0.20.3 10 | PYTHON_VERSION=3.5 11 | PYARROW_VERSION=0.10.0 12 | 13 | function logger() { 14 | echo -e "\n>>>> $@\n" 15 | } 16 | 17 | logger "Check environment..." 18 | env 19 | 20 | logger "Check GPU usage..." 21 | nvidia-smi 22 | 23 | logger "Create conda env..." 24 | rm -rf /home/jenkins/.conda/envs/daskcudf 25 | conda create -n daskcudf python=${PYTHON_VERSION} 26 | conda install -n daskcudf -y -c rapidsai -c numba -c conda-forge -c defaults \ 27 | numba=${NUMBA_VERSION} \ 28 | numpy=${NUMPY_VERSION} \ 29 | pandas=${PANDAS_VERSION} \ 30 | pyarrow=${PYARROW_VERSION} \ 31 | pytest \ 32 | dask \ 33 | cudf 34 | 35 | 36 | logger "Activate conda env..." 37 | source activate daskcudf 38 | 39 | logger "Check versions..." 40 | python --version 41 | gcc --version 42 | g++ --version 43 | conda list 44 | 45 | logger "Clone dask_cudf..." 46 | rm -rf $WORKSPACE/daskcudf 47 | git clone --recurse-submodules ${DASKCUDF_REPO} $WORKSPACE/daskcudf 48 | 49 | 50 | logger "Build dask_cudf..." 51 | cd $WORKSPACE 52 | python setup.py install 53 | 54 | logger "Check GPU usage..." 55 | nvidia-smi 56 | 57 | logger "Test dask_cudf..." 
58 | py.test --cache-clear --junitxml=junit.xml --ignore=daskcudf -v 59 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cudf>=0.7* 2 | dask>=1.2.2 3 | distributed>=1.26 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [versioneer] 2 | VCS = git 3 | style = pep440 4 | versionfile_source = dask_cudf/_version.py 5 | versionfile_build = dask_cudf/_version.py 6 | tag_prefix = 7 | parentdir_prefix = dask_cudf- 8 | 9 | [flake8] 10 | exclude = docs, __init__.py 11 | max-line-length = 88 12 | ignore = 13 | # Assigning lambda expression 14 | E731 15 | # Ambiguous variable names 16 | E741 17 | # line break before binary operator 18 | W503 19 | # whitespace before : 20 | E203 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from codecs import open 3 | 4 | from setuptools import find_packages, setup 5 | 6 | # Get the long description from the README file 7 | with open(os.path.join(os.path.dirname(__file__), "README.md")) as f: 8 | long_description = f.read() 9 | 10 | version = os.environ.get("GIT_DESCRIBE_TAG", "0.0.0.dev0").lstrip("v") 11 | setup( 12 | name="dask-cudf", 13 | version=version, 14 | description="Utilities for Dask and cuDF interactions", 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | url="https://github.com/rapidsai/dask-cudf", 18 | author="NVIDIA Corporation", 19 | license="Apache 2.0", 20 | classifiers=[ 21 | "Intended Audience :: Developers", 22 | "Topic :: Database", 23 | "Topic :: Scientific/Engineering", 24 | "License :: OSI Approved :: Apache Software License", 25 | "Programming Language :: Python :: 3.6", 26 | "Programming Language :: Python :: 3.7", 27 | ], 28 | packages=find_packages(exclude=["docs", "tests", "tests.*", "docs.*"]), 29 | install_requires=open("requirements.txt").read().strip().split("\n"), 30 | ) 31 | --------------------------------------------------------------------------------