├── code
│   ├── .gitkeep
│   ├── ibis.png
│   ├── cudf-pandas-demo.ipynb
│   ├── cupy-interop.ipynb
│   └── Introduction_to_Strings.ipynb
├── data
│   ├── .gitkeep
│   └── scratch
│       └── .gitkeep
├── models
│   └── .gitkeep
├── requirements.txt
├── apt.txt
├── .gitattributes
├── variables.env
├── .project
│   ├── configpacks
│   └── spec.yaml
├── postBuild.bash
├── preBuild.bash
├── .gitignore
├── README.md
└── LICENSE.txt

/code/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/models/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data/scratch/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyterlab>3.0
2 | plotly
3 |
--------------------------------------------------------------------------------
/apt.txt:
--------------------------------------------------------------------------------
1 | # apt packages to install should be listed one per line
2 |
--------------------------------------------------------------------------------
/code/ibis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/workbench-example-rapids-cudf/HEAD/code/ibis.png
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | models/** filter=lfs diff=lfs merge=lfs -text
2 | data/** filter=lfs diff=lfs merge=lfs -text
3 |
--------------------------------------------------------------------------------
/variables.env:
--------------------------------------------------------------------------------
1 | # Set environment variables in the format KEY=VALUE, 1 per line
2 | # This file will be sourced inside the project container when started.
3 | # NOTE: If you change this file while the project is running, you must restart the project container for changes to take effect.
4 |
5 |
--------------------------------------------------------------------------------
/.project/configpacks:
--------------------------------------------------------------------------------
1 | *defaults.ContainerUser
2 | *bash.PreBuild
3 | *cuda.CUDA
4 | *defaults.EnvVars
5 | *defaults.Readme
6 | *defaults.Entrypoint
7 | *apt.PackageManager
8 | *bash.PreLanguage
9 | *python.PipPackageManager
10 | *bash.PostBuild
11 | *jupyterlab.JupyterLab
12 | *vs_code.VSCode
--------------------------------------------------------------------------------
/postBuild.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This file contains bash commands that will be executed at the end of the container build process,
3 | # after all system packages and programming language-specific packages have been installed.
4 | #
5 | # Note: This file may be removed if you don't need to use it
6 |
7 |
--------------------------------------------------------------------------------
/preBuild.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This file contains bash commands that will be executed at the beginning of the container build process,
3 | # before any system packages or programming language-specific packages have been installed.
4 | #
5 | # Note: This file may be removed if you don't need to use it
6 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore generated or temporary files managed by the Workbench
2 | .project/*
3 | !.project/spec.yaml
4 | !.project/configpacks
5 |
6 | # General ignores
7 | .DS_Store
8 | __pycache__/
9 | *.py[cod]
10 | *$py.class
11 | .ipynb_checkpoints
12 |
13 | # Workbench Project Layout
14 | data/scratch/*
15 | !data/scratch/.gitkeep
16 |
17 | # Rapids ignores
18 | *.html
19 | /.ipynb_checkpoints/
20 | *~
21 | /rows.csv
22 | /cufile.log
23 | *.pyc
24 |
25 | # Byte-compiled / optimized / DLL files
26 |
27 | # Temp directories, notebooks created by jupyterlab
28 | .Trash-*/
29 | .jupyter/
30 | .local/
31 |
32 | # Python distribution / packaging
33 | .Python
34 | build/
35 | develop-eggs/
36 | dist/
37 | downloads/
38 | eggs/
39 | .eggs/
40 | lib/
41 | lib64/
42 | parts/
43 | sdist/
44 | var/
45 | wheels/
46 | share/python-wheels/
47 | *.egg-info/
48 | .installed.cfg
49 | *.egg
50 | MANIFEST
51 |
52 | # Unit test / coverage reports
53 | htmlcov/
54 | .tox/
55 | .nox/
56 | .coverage
57 | .coverage.*
58 | .cache
59 | nosetests.xml
60 | coverage.xml
61 | *.cover
62 | *.py,cover
63 | .hypothesis/
64 | .pytest_cache/
65 | cover/
--------------------------------------------------------------------------------
/.project/spec.yaml:
--------------------------------------------------------------------------------
1 | specVersion: v2
2 | specMinorVersion: 2
3 | meta:
4 |   name: rapids-cudf
5 |   image: project-rapids-cudf
6 |   description: Accelerate Data Science workflows with RAPIDS cuDF and cuDF.pandas
7 |   labels: []
8 |   createdOn: "2023-09-19T22:48:48Z"
9 |   defaultBranch: main
10 | layout:
11 | - path: code/
12 |   type: code
13 |   storage: git
14 | - path: models/
15 |   type: models
16 |   storage: gitlfs
17 | - path: data/
18 |   type: data
19 |   storage: gitlfs
20 | - path: data/scratch/
21 |   type: data
22 |   storage: gitignore
23 | environment:
24 |   base:
25 |     registry: nvcr.io
26 |     image: nvidia/rapidsai/notebooks:25.08-cuda12.9-py3.13
27 |     build_timestamp: ""
28 |     name: RAPIDS with CUDA 12.9.1
29 |     supported_architectures: []
30 |     cuda_version: 12.9.1
31 |     description: RAPIDS with CUDA 12.9.1
32 |     entrypoint_script: /home/rapids/entrypoint.sh
33 |     labels:
34 |     - cuda12.9.1
35 |     apps:
36 |     - name: jupyterlab
37 |       type: jupyterlab
38 |       class: webapp
39 |       start_command: jupyter lab --allow-root --port 8888 --ip 0.0.0.0 --no-browser --NotebookApp.base_url=\$PROXY_PREFIX --NotebookApp.default_url=/lab --NotebookApp.allow_origin='*'
40 |       health_check_command: '[ \$(echo url=\$(jupyter lab list | head -n 2 | tail -n 1 | cut -f1 -d'' '' | grep -v ''Currently'' | sed "s@/?@/lab?@g") | curl -o /dev/null -s -w ''%{http_code}'' --config -) == ''200'' ]'
41 |       stop_command: jupyter lab stop 8888
42 |       user_msg: ""
43 |       logfile_path: ""
44 |       timeout_seconds: 60
45 |       icon_url: ""
46 |       webapp_options:
47 |         autolaunch: true
48 |         port: "8888"
49 |         proxy:
50 |           trim_prefix: false
51 |         url_command: jupyter lab list | head -n 2 | tail -n 1 | cut -f1 -d' ' | grep -v 'Currently'
52 |     programming_languages:
53 |     - python3
54 |     icon_url: ""
55 |     image_version: 25.08.01
56 |     os: linux
57 |     os_distro: ubuntu
58 |     os_distro_release: "22.04"
59 |     schema_version: v2
60 |     user_info:
61 |       uid: "1001"
62 |       gid: "1000"
63 |       username: rapids
64 |     package_managers:
65 |     - name: apt
66 |       binary_path: /usr/bin/apt
67 |       installed_packages:
68 |       - ""
69 |     - name: conda3
70 |       binary_path: /opt/conda/bin/conda
71 |       installed_packages:
72 |       - rapids
73 |       - cudf
74 |       - cuml
75 |       - cugraph
76 |       - rmm
77 |       - pylibraft
78 |       - cuspatial
79 |       - cuxfilter
80 |       - cucim
81 |       - xgboost
82 |       - dask-sql
83 |       - jupyterlab
84 |     - name: pip
85 |       binary_path: /opt/conda/bin/pip
86 |       installed_packages:
87 |       - jupyterlab-nvdashboard
88 |     package_manager_environment:
89 |       name: conda
90 |       target: /opt/conda
91 |   compose_file_path: ""
92 | execution:
93 |   apps:
94 |   - name: jupyterlab
95 |     type: jupyterlab
96 |     class: webapp
97 |     start_command: jupyter lab --allow-root --port 8888 --ip 0.0.0.0 --no-browser --NotebookApp.base_url=\$PROXY_PREFIX --NotebookApp.default_url=/lab --notebook-dir=/project/
98 |     health_check_command: '[ \$(echo url=\$(jupyter lab list 2>&1 | head -n 2 | tail -n 1 | cut -f1 -d'''' '''' | grep -v ''''Currently'''' | sed ''''s@/?@/lab?@g'''') | curl -o /dev/null -s -w ''''%{http_code}'''' --config -) == ''''200'''' ]'
99 |     stop_command: jupyter lab stop 8888
100 |     user_msg: ""
101 |     logfile_path: ""
102 |     timeout_seconds: 60
103 |     icon_url: ""
104 |     webapp_options:
105 |       autolaunch: true
106 |       port: "8888"
107 |       proxy:
108 |         trim_prefix: false
109 |       url_command: jupyter lab list 2>&1 | head -n 2 | tail -n 1 | cut -f1 -d' ' | grep -v 'Currently'
110 |   resources:
111 |     gpu:
112 |       requested: 1
113 |     sharedMemoryMB: 1024
114 |   secrets: []
115 |   mounts:
116 |   - type: project
117 |     target: /project/
118 |     description: Project directory
119 |     options: rw
120 |   - type: volume
121 |     target: /data/tensorboard/logs/
122 |     description: Tensorboard Log Files
123 |     options: volumeName=tensorboard-logs-volume
124 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NVIDIA AI Workbench: Introduction
2 | This is an [NVIDIA AI Workbench](https://developer.nvidia.com/blog/develop-and-deploy-scalable-generative-ai-models-seamlessly-with-nvidia-ai-workbench/) example project that provides a short introduction to the cuDF library, a Python GPU-accelerated DataFrame library (built on the Apache Arrow columnar memory format) for loading, joining, aggregating, filtering, and otherwise manipulating data. cuDF also provides a pandas-like API that will be familiar to data engineers and data scientists, so they can use it to easily accelerate their workflows without going into the details of CUDA programming. Users who have [installed AI Workbench](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/) can get up and running with this project in minutes.
3 |
4 | Have questions? Please direct any issues, fixes, suggestions, and discussion on this project to the DevZone Members Only Forum thread [here](https://forums.developer.nvidia.com/t/support-workbench-example-project-rapids-cudf/278372/1).
5 |
6 | ## Project Description
7 | Included in this project are eight tutorial notebooks.
The first five are relatively easy to run; for the last three (*), users with low GPU RAM (< 16GB) may need to push the project to heavier hardware to run all of the performance benchmarks. Good news: Workbench makes this easy!
8 |
9 | * [cudf-pandas-demo](./code/cudf-pandas-demo.ipynb): This notebook demonstrates the acceleration that `cudf.pandas` provides over vanilla Pandas. The example runs through loading some data with Pandas and collecting some performance numbers, then runs the same code again with the `cudf.pandas` plugin to show the speedup that is possible with NVIDIA hardware.
10 |
11 | * [10min](./code/10min.ipynb): This is a short introduction to cuDF and Dask-cuDF, geared mainly towards new users.
12 |
13 | _cuDF_ is a Python GPU DataFrame library (built on the Apache Arrow columnar memory format) for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame-style API modeled on pandas.
14 |
15 | _Dask_ is a flexible library for parallel computing in Python that makes scaling out your workflow smooth and simple. On the CPU, Dask uses Pandas to execute operations in parallel on DataFrame partitions.
16 |
17 | _Dask-cuDF_ extends Dask where necessary to allow its DataFrame partitions to be processed using cuDF GPU DataFrames instead of Pandas DataFrames. For instance, when you call `dask_cudf.read_csv(...)`, your cluster's GPUs do the work of parsing the CSV file(s) by calling `cudf.read_csv()`.
18 |
19 | Which library should I use? If your workflow is fast enough on a single GPU, or your data comfortably fits in memory on a single GPU, you would want to use cuDF. If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask-cuDF. A minimal sketch of both entry points follows this list.
20 |
21 | * [cupy-interop](./code/cupy-interop.ipynb): This notebook provides introductory examples of how you can use cuDF and CuPy together to take advantage of CuPy array functionality (such as advanced linear algebra operations).
22 |
23 | * [missing-data](./code/missing-data.ipynb): This notebook discusses missing (also referred to as NA) values in cuDF. cuDF supports missing values in all dtypes. These missing values are represented by `<NA>` and are also referred to as "null values".
24 |
25 | * [Introduction_to_Strings](./code/Introduction_to_Strings.ipynb): This notebook shows how to manipulate strings with cuDF DataFrames.
26 |
27 | * [Introduction_to_Exploratory_Data_Analysis_using_cuDF](./code/Introduction_to_Exploratory_Data_Analysis_using_cuDF.ipynb) (*): This notebook shows how to perform basic EDA with cuDF DataFrames.
28 |
29 | * [Introduction_to_Time_Series_Data_Analysis_using_cuDF](./code/Introduction_to_Time_Series_Data_Analysis_using_cuDF.ipynb) (*): This notebook shows how to do EDA on time-series DataFrames with cuDF.
30 |
31 | * [performance-comparisons](./code/performance-comparisons.ipynb) (*): This notebook compares the performance of cuDF and pandas on identical data sizes. It primarily showcases the speedup factors users can achieve when similar pandas APIs are run on GPUs using cuDF. This notebook is written to measure performance on NVIDIA GPUs with _large_ memory. Performance results may vary by data size, as well as by the CPU and GPU used.
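To make the cuDF vs. Dask-cuDF distinction concrete, here is a minimal sketch of both entry points. It assumes a working RAPIDS environment with `cudf` and `dask_cudf` installed; the file name `data.csv` and the `key`/`value` columns are hypothetical and used only for illustration:

```python
import cudf
import dask_cudf

# Single GPU: cuDF parses the file directly into GPU memory.
gdf = cudf.read_csv("data.csv")            # hypothetical file
print(gdf.groupby("key")["value"].mean())  # hypothetical columns

# Multiple GPUs / larger-than-memory data: Dask-cuDF splits the file
# into partitions, each backed by a cuDF DataFrame.
ddf = dask_cudf.read_csv("data.csv", blocksize="256 MiB")
result = ddf.groupby("key")["value"].mean()  # lazy; builds a task graph
print(result.compute())                      # executes on the available GPUs
```

If the single-GPU path already handles your data comfortably, Dask-cuDF's scheduling adds overhead without benefit, which is why plain cuDF is the recommendation in that case.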
32 |
33 | ---
34 | **Important Considerations:**
35 | * The notebook titled ```performance-comparisons.ipynb``` may take a long time to execute on laptop and/or workstation hardware, because it runs benchmarks and DataFrame operations on massive datasets using both Pandas and cuDF. Feel free to adjust the ```num_rows``` variable as needed.
36 |
37 | * If working locally on a laptop or workstation, also consider pushing this project to heavier hardware (the original notebook authors used 2x H100 GPUs) to run this notebook. Good news: NVIDIA AI Workbench makes this push easy!
38 |
39 | ---
40 |
41 | ## System Requirements:
42 | * Operating System: Ubuntu 22.04
43 | * CPU requirements: None, tested with Intel® Xeon® Gold 6240R CPU @ 2.40GHz
44 | * GPU requirements: Any NVIDIA training GPU, tested with NVIDIA A100-40GB
45 | * NVIDIA driver requirements: Latest driver version
46 | * Storage requirements: 40GB
47 |
48 | ## Quickstart
49 | The notebooks in this project were adapted from the RAPIDS cuDF GitHub repository, which can be found [here](https://github.com/rapidsai/cudf/tree/branch-23.12/notebooks).
50 |
51 | If you have NVIDIA AI Workbench already installed, you can use this Project in AI Workbench on your choice of machine by:
52 | 1. Forking this Project to your own GitHub namespace and copying the clone link
53 |
54 | ```https://github.com/[your_namespace]/workbench-example-rapids-cudf.git```
55 |
56 | 2. Opening a shell and activating the Context you want to clone into by
57 |
58 | ```
59 | $ nvwb list contexts
60 |
61 | $ nvwb activate
62 | ```
63 |
64 | 3. Cloning this Project onto your desired machine by running
65 |
66 | ```
67 | $ nvwb clone project
68 | ```
69 |
70 | 4. Opening the Project by
71 |
72 | ```
73 | $ nvwb list projects
74 |
75 | $ nvwb open
76 | ```
77 |
78 | 5. Starting JupyterLab by
79 |
80 | ```
81 | $ nvwb start jupyterlab
82 | ```
83 |
84 | 6. Navigate to the code directory of the project. Then, open the notebooks provided and begin working through them at your own pace. Happy coding!
85 |
86 | ---
87 | **Tip:** Use ```nvwb help``` to see a full list of commands.
88 |
89 | ---
90 |
91 | ## Tested On
92 | This project has been tested with an NVIDIA A100-40GB GPU and an Intel(R) Xeon(R) Gold 6240R CPU (2.40GHz) on the following version of NVIDIA AI Workbench: ```nvwb 0.2.66 (internal; linux; amd64; go1.18.10; Tue Sep 12 18:50:21 UTC 2023)```
93 |
94 | ## License
95 | This NVIDIA AI Workbench example project is released under the [Apache 2.0 License](https://github.com/NVIDIA/workbench-example-rapids-cudf/blob/main/LICENSE.txt).
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity.
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2023 NVIDIA Corporation 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /code/cudf-pandas-demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "kcF9ZWvjSybR" 7 | }, 8 | "source": [ 9 | "# 10 Minutes to RAPIDS cuDF's pandas accelerator mode (cudf.pandas)\n", 10 | "\n", 11 | "cuDF is a Python GPU DataFrame library (built on the Apache Arrow columnar memory format) for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API in the style of pandas.\n", 12 | "\n", 13 | "cuDF now provides a pandas accelerator mode (`cudf.pandas`), allowing you to bring accelerated computing to your pandas workflows without requiring any code change.\n", 14 | "\n", 15 | "This notebook is a short introduction to `cudf.pandas`." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "SH_h6ci1Sx0u" 22 | }, 23 | "source": [ 24 | "# ⚠️ Verify your setup\n", 25 | "\n", 26 | "First, we'll verify that you are running with an NVIDIA GPU." 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "colab": { 34 | "base_uri": "https://localhost:8080/" 35 | }, 36 | "id": "Y2vPCtXcCvUR", 37 | "outputId": "fb93a4bc-9ef1-4333-a81c-48d7b5e8ceb9", 38 | "tags": [] 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "!nvidia-smi # this should display information about available GPUs" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "id": "zhPt4Xj8THgo", 50 | "tags": [] 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "import cudf # this should work without any errors" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "id": "4zGUeWvcTbDs" 61 | }, 62 | "source": [ 63 | "# Download the data\n", 64 | "\n", 65 | "The data we'll be working with is the [Parking Violations Issued - Fiscal Year 2022](https://data.cityofnewyork.us/City-Government/Parking-Violations-Issued-Fiscal-Year-2022/7mxj-7a6y) dataset from NYC Open Data.\n", 66 | "\n", 67 | "We're downloading a copy of this dataset from an s3 bucket hosted by NVIDIA to provide faster download speeds. We'll start by downloading the data. This should take about 30 seconds.\n", 68 | "\n", 69 | "## Data License and Terms\n", 70 | "As this dataset originates from the NYC Open Data Portal, it's governed by their license and terms of use.\n", 71 | "\n", 72 | "### Are there restrictions on how I can use Open Data?\n", 73 | "\n", 74 | "> Open Data belongs to all New Yorkers. There are no restrictions on the use of Open Data. Refer to Terms of Use for more information.\n", 75 | "\n", 76 | "### [Terms of Use](https://opendata.cityofnewyork.us/overview/#termsofuse)\n", 77 | "\n", 78 | "> By accessing datasets and feeds available through NYC Open Data, the user agrees to all of the Terms of Use of NYC.gov as well as the Privacy Policy for NYC.gov. 
The user also agrees to any additional terms of use defined by the agencies, bureaus, and offices providing data. Public data sets made available on NYC Open Data are provided for informational purposes. The City does not warranty the completeness, accuracy, content, or fitness for any particular purpose or use of any public data set made available on NYC Open Data, nor are any such warranties to be implied or inferred with respect to the public data sets furnished therein.\n", 79 | "\n", 80 | "> The City is not liable for any deficiencies in the completeness, accuracy, content, or fitness for any particular purpose or use of any public data set, or application utilizing such data set, provided by any third party.\n", 81 | "\n", 82 | "> Submitting City Agencies are the authoritative source of data available on NYC Open Data. These entities are responsible for data quality and retain version control of data sets and feeds accessed on the Site. Data may be updated, corrected, or refreshed at any time." 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "colab": { 90 | "base_uri": "https://localhost:8080/" 91 | }, 92 | "id": "5EoQqNwsTqeP", 93 | "outputId": "b5f9b7f5-b1e4-4a40-d30e-30f59ff106d9", 94 | "tags": [] 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "!wget https://data.rapids.ai/datasets/nyc_parking/nyc_parking_violations_2022.parquet -O /tmp/nyc_parking_violations_2022.parquet" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": { 104 | "id": "hAvNFbYKWwti" 105 | }, 106 | "source": [ 107 | "# Analysis using Standard Pandas\n", 108 | "\n", 109 | "First, let's use Pandas to read in some columns of the dataset:" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "id": "SLRleX9xWxqX", 117 | "tags": [] 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "import pandas as pd" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": { 128 | "colab": { 129 | "base_uri": "https://localhost:8080/", 130 | "height": 363 131 | }, 132 | "id": "OLatEi7rW0la", 133 | "outputId": "22cfa4c5-58ee-4514-ecb1-f984f241a465", 134 | "tags": [] 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "# read 5 columns data:\n", 139 | "df = pd.read_parquet(\n", 140 | " \"/tmp/nyc_parking_violations_2022.parquet\",\n", 141 | " columns=[\"Registration State\", \"Violation Description\", \"Vehicle Body Type\", \"Issue Date\", \"Summons Number\"]\n", 142 | ")\n", 143 | "\n", 144 | "# view a random sample of 10 rows:\n", 145 | "df.sample(10)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": { 151 | "id": "m7qXNJU9W53D" 152 | }, 153 | "source": [ 154 | "Next, we'll try to answer a few questions using the data." 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": { 160 | "id": "VmkFv9ZUW37g" 161 | }, 162 | "source": [ 163 | "## Which parking violation is most commonly committed by vehicles from various U.S states?\n", 164 | "\n", 165 | "Each record in our dataset contains the state of registration of the offending vehicle, and the type of parking offence. Let's say we want to get the most common type of offence for vehicles registered in different states. 
We can do this in Pandas using a combination of [value_counts](https://pandas.pydata.org/docs/reference/api/pandas.Series.value_counts.html) and [GroupBy.head](https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.head.html):" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "colab": { 173 | "base_uri": "https://localhost:8080/", 174 | "height": 423 175 | }, 176 | "id": "bHXq-s_ZXOQN", 177 | "outputId": "a0ca97bc-0c91-4f89-931b-bb0377e1c1c8" 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "(df[[\"Registration State\", \"Violation Description\"]] # get only these two columns\n", 182 | " .value_counts() # get the count of offences per state and per type of offence\n", 183 | " .groupby(\"Registration State\") # group by state\n", 184 | " .head(1) # get the first row in each group (the type of offence with the largest count)\n", 185 | " .sort_index() # sort by state name\n", 186 | " .reset_index()\n", 187 | ")" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": { 193 | "id": "8lXF4v4SXRf3" 194 | }, 195 | "source": [ 196 | "The code above uses [method chaining](https://tomaugspurger.net/posts/method-chaining/) to combine a series of operations into a single statement. You might find it useful to break the code up into multiple statements and inspect each of the intermediate results!" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": { 202 | "id": "H7_9EmGyXUJd" 203 | }, 204 | "source": [ 205 | "## Which vehicle body types are most frequently involved in parking violations?\n", 206 | "\n", 207 | "We can also investigate which vehicle body types most commonly appear in parking violations" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "colab": { 215 | "base_uri": "https://localhost:8080/", 216 | "height": 455 217 | }, 218 | "id": "d7Ax-u4TXZtp", 219 | "outputId": "0feeca28-ca17-4818-8ecb-d62ea5e1c1db" 220 | }, 221 | "outputs": [], 222 | "source": [ 223 | "(df\n", 224 | " .groupby([\"Vehicle Body Type\"])\n", 225 | " .agg({\"Summons Number\": \"count\"})\n", 226 | " .rename(columns={\"Summons Number\": \"Count\"})\n", 227 | " .sort_values([\"Count\"], ascending=False)\n", 228 | ")" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": { 234 | "id": "VjFfQLZHXehM" 235 | }, 236 | "source": [ 237 | "## How do parking violations vary across days of the week?" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "colab": { 245 | "base_uri": "https://localhost:8080/" 246 | }, 247 | "id": "s5_y9m_AXhIw", 248 | "outputId": "11d16e26-4a5d-4fbb-c777-7d50a23895ae" 249 | }, 250 | "outputs": [], 251 | "source": [ 252 | "weekday_names = {\n", 253 | " 0: \"Monday\",\n", 254 | " 1: \"Tuesday\",\n", 255 | " 2: \"Wednesday\",\n", 256 | " 3: \"Thursday\",\n", 257 | " 4: \"Friday\",\n", 258 | " 5: \"Saturday\",\n", 259 | " 6: \"Sunday\",\n", 260 | "}\n", 261 | "\n", 262 | "df[\"Issue Date\"] = df[\"Issue Date\"].astype(\"datetime64[ms]\")\n", 263 | "df[\"issue_weekday\"] = df[\"Issue Date\"].dt.weekday.map(weekday_names)\n", 264 | "\n", 265 | "df.groupby([\"issue_weekday\"])[\"Summons Number\"].count().sort_values()" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": { 271 | "id": "LDeYr6xkXiDc" 272 | }, 273 | "source": [ 274 | "It looks like there are fewer violations on weekends, which makes sense! 
During the week, more people are driving in New York City." 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": { 280 | "id": "JKBQcT64XlMr" 281 | }, 282 | "source": [ 283 | "## Let's time it!\n", 284 | "\n", 285 | "Loading and processing this data took a little time. Let's measure how long these pipelines take in Pandas:" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": { 292 | "colab": { 293 | "base_uri": "https://localhost:8080/", 294 | "height": 458 295 | }, 296 | "id": "mDpQhus-Xnfs", 297 | "outputId": "e9af1194-a0f8-48d4-a1b3-42dba63f3110" 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "%%time\n", 302 | "\n", 303 | "df = pd.read_parquet(\n", 304 | " \"/tmp/nyc_parking_violations_2022.parquet\",\n", 305 | " columns=[\"Registration State\", \"Violation Description\", \"Vehicle Body Type\", \"Issue Date\", \"Summons Number\"]\n", 306 | ")\n", 307 | "\n", 308 | "(df[[\"Registration State\", \"Violation Description\"]]\n", 309 | " .value_counts()\n", 310 | " .groupby(\"Registration State\")\n", 311 | " .head(1)\n", 312 | " .sort_index()\n", 313 | " .reset_index()\n", 314 | ")" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": { 321 | "colab": { 322 | "base_uri": "https://localhost:8080/", 323 | "height": 490 324 | }, 325 | "id": "9Gw5TWH2Xqgv", 326 | "outputId": "eabf2e77-6fef-4751-d682-7c5c51ebc86e" 327 | }, 328 | "outputs": [], 329 | "source": [ 330 | "%%time\n", 331 | "\n", 332 | "(df\n", 333 | " .groupby([\"Vehicle Body Type\"])\n", 334 | " .agg({\"Summons Number\": \"count\"})\n", 335 | " .rename(columns={\"Summons Number\": \"Count\"})\n", 336 | " .sort_values([\"Count\"], ascending=False)\n", 337 | ")" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": { 344 | "colab": { 345 | "base_uri": "https://localhost:8080/" 346 | }, 347 | "id": "BovQgNrpXr2l", 348 | "outputId": "01b2c4fc-8c8c-4947-92a2-a3e2f0a7cf14" 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "%%time\n", 353 | "\n", 354 | "weekday_names = {\n", 355 | " 0: \"Monday\",\n", 356 | " 1: \"Tuesday\",\n", 357 | " 2: \"Wednesday\",\n", 358 | " 3: \"Thursday\",\n", 359 | " 4: \"Friday\",\n", 360 | " 5: \"Saturday\",\n", 361 | " 6: \"Sunday\",\n", 362 | "}\n", 363 | "\n", 364 | "df[\"Issue Date\"] = df[\"Issue Date\"].astype(\"datetime64[ms]\")\n", 365 | "df[\"issue_weekday\"] = df[\"Issue Date\"].dt.weekday.map(weekday_names)\n", 366 | "\n", 367 | "df.groupby([\"issue_weekday\"])[\"Summons Number\"].count().sort_values()" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": { 373 | "id": "VgAWS0yXXtGj" 374 | }, 375 | "source": [ 376 | "# Using cudf.pandas\n", 377 | "\n", 378 | "Now, let's re-run the Pandas code above with the `cudf.pandas` extension loaded.\n", 379 | "\n", 380 | "Typically, you should load the `cudf.pandas` extension as the first step in your notebook, before importing any modules. Here, we explicitly restart the kernel to simulate that behavior." 
381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": { 387 | "colab": { 388 | "base_uri": "https://localhost:8080/" 389 | }, 390 | "id": "hW5rUr2tXzUW", 391 | "outputId": "5ad04ad3-9e5a-4609-8e0d-fe19f02c32d0" 392 | }, 393 | "outputs": [], 394 | "source": [ 395 | "get_ipython().kernel.do_shutdown(restart=True)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": { 402 | "id": "NjvPsTlGZrW7" 403 | }, 404 | "outputs": [], 405 | "source": [ 406 | "%load_ext cudf.pandas" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": { 413 | "colab": { 414 | "base_uri": "https://localhost:8080/" 415 | }, 416 | "id": "XL_u4l5gZJte", 417 | "outputId": "bf96ea78-1baa-4542-e4d7-c5ac85e035f4" 418 | }, 419 | "outputs": [], 420 | "source": [ 421 | "%%time\n", 422 | "\n", 423 | "import pandas as pd\n", 424 | "\n", 425 | "df = pd.read_parquet(\n", 426 | " \"/tmp/nyc_parking_violations_2022.parquet\",\n", 427 | " columns=[\"Registration State\", \"Violation Description\", \"Vehicle Body Type\", \"Issue Date\", \"Summons Number\"]\n", 428 | ")\n", 429 | "\n", 430 | "(df[[\"Registration State\", \"Violation Description\"]]\n", 431 | " .value_counts()\n", 432 | " .groupby(\"Registration State\")\n", 433 | " .head(1)\n", 434 | " .sort_index()\n", 435 | " .reset_index()\n", 436 | ")" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": { 443 | "colab": { 444 | "base_uri": "https://localhost:8080/", 445 | "height": 490 446 | }, 447 | "id": "BLWa8ed6d-pD", 448 | "outputId": "a717d797-b6d4-4baf-dce3-c281e52d1576" 449 | }, 450 | "outputs": [], 451 | "source": [ 452 | "%%time\n", 453 | "\n", 454 | "(df\n", 455 | " .groupby([\"Vehicle Body Type\"])\n", 456 | " .agg({\"Summons Number\": \"count\"})\n", 457 | " .rename(columns={\"Summons Number\": \"Count\"})\n", 458 | " .sort_values([\"Count\"], ascending=False)\n", 459 | ")" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": { 466 | "colab": { 467 | "base_uri": "https://localhost:8080/" 468 | }, 469 | "id": "X6ASy4mPd_-c", 470 | "outputId": "d45b6616-a0f8-48ba-e86a-6ad087c4af4c" 471 | }, 472 | "outputs": [], 473 | "source": [ 474 | "%%time\n", 475 | "\n", 476 | "weekday_names = {\n", 477 | " 0: \"Monday\",\n", 478 | " 1: \"Tuesday\",\n", 479 | " 2: \"Wednesday\",\n", 480 | " 3: \"Thursday\",\n", 481 | " 4: \"Friday\",\n", 482 | " 5: \"Saturday\",\n", 483 | " 6: \"Sunday\",\n", 484 | "}\n", 485 | "\n", 486 | "df[\"Issue Date\"] = df[\"Issue Date\"].astype(\"datetime64[ms]\")\n", 487 | "df[\"issue_weekday\"] = df[\"Issue Date\"].dt.weekday.map(weekday_names)\n", 488 | "\n", 489 | "df.groupby([\"issue_weekday\"])[\"Summons Number\"].count().sort_values()" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": { 495 | "id": "FMUrf6iMeBdM" 496 | }, 497 | "source": [ 498 | "Much faster! Operations that took 5-20 seconds can now potentially finish in just milliseconds without changing any code." 499 | ] 500 | }, 501 | { 502 | "cell_type": "markdown", 503 | "metadata": { 504 | "id": "00m6gUxqeGzk" 505 | }, 506 | "source": [ 507 | "# Understanding Performance\n", 508 | "\n", 509 | "`cudf.pandas` provides profiling utilities to help you better understand performance. 
With these tools, you can identify which parts of your code ran on the GPU and which parts ran on the CPU.\n",
510 | "\n",
511 | "They're accessible in the `cudf.pandas` namespace since the `cudf.pandas` extension was loaded above with `load_ext cudf.pandas`.\n",
512 | "\n",
513 | "#### Colab Note\n",
514 | "If you're running in Colab, the first time you use the profiler it may take 10+ seconds due to Colab's debugger interacting with the built-in Python function [sys.settrace](https://docs.python.org/3/library/sys.html#sys.settrace) that we use for profiling. For demo purposes, this isn't an issue. Just run the cell again.\n",
515 | "\n",
516 | "## Profiling Functionality\n",
517 | "\n",
518 | "We can generate a per-function profile:"
519 | ]
520 | },
521 | {
522 | "cell_type": "code",
523 | "execution_count": null,
524 | "metadata": {},
525 | "outputs": [],
526 | "source": [
527 | "len(df)"
528 | ]
529 | },
530 | {
531 | "cell_type": "code",
532 | "execution_count": null,
533 | "metadata": {
534 | "colab": {
535 | "base_uri": "https://localhost:8080/",
536 | "height": 334
537 | },
538 | "id": "RFm22OWbeHF2",
539 | "outputId": "eed4240d-01e1-4007-aced-28c29fe172a5"
540 | },
541 | "outputs": [],
542 | "source": [
543 | "%%cudf.pandas.profile\n",
544 | "\n",
545 | "small_df = pd.DataFrame({'a': [0, 1, 2], 'b': [\"x\", \"y\", \"z\"]})\n",
546 | "small_df = pd.concat([small_df, small_df])\n",
547 | "\n",
548 | "axis = 0\n",
549 | "for i in range(0, 2):\n",
550 | "    small_df.min(axis=axis)\n",
551 | "    axis = 1\n",
552 | "\n",
553 | "counts = small_df.groupby(\"a\").b.count()"
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": null,
559 | "metadata": {
560 | "colab": {
561 | "base_uri": "https://localhost:8080/",
562 | "height": 448
563 | },
564 | "id": "Syb-_vZweN2H",
565 | "outputId": "96e1ea1c-f3d7-4792-abcb-9e90c1dcdd1c"
566 | },
567 | "outputs": [],
568 | "source": [
569 | "%%cudf.pandas.line_profile\n",
570 | "\n",
571 | "small_df = pd.DataFrame({'a': [0, 1, 2], 'b': [\"x\", \"y\", \"z\"]})\n",
572 | "small_df = pd.concat([small_df, small_df])\n",
573 | "\n",
574 | "axis = 0\n",
575 | "for i in range(0, 2):\n",
576 | "    small_df.min(axis=axis)\n",
577 | "    axis = 1\n",
578 | "\n",
579 | "counts = small_df.groupby(\"a\").b.count()"
580 | ]
581 | },
582 | {
583 | "cell_type": "markdown",
584 | "metadata": {
585 | "id": "VCZ6BxwBpfjL"
586 | },
587 | "source": [
588 | "## Behind the scenes: What's going on here?\n",
589 | "\n",
590 | "When you load `cudf.pandas`, Pandas types like `Series` and `DataFrame` are replaced by proxy objects that dispatch operations to cuDF when possible.
We can verify that `cudf.pandas` is active by looking at our `pd` variable:"
591 | ]
592 | },
593 | {
594 | "cell_type": "code",
595 | "execution_count": null,
596 | "metadata": {
597 | "colab": {
598 | "base_uri": "https://localhost:8080/"
599 | },
600 | "id": "jogk5UrgeTkS",
601 | "outputId": "c4b9b4cd-894b-4379-fde2-fee17138bd36"
602 | },
603 | "outputs": [],
604 | "source": [
605 | "pd"
606 | ]
607 | },
608 | {
609 | "cell_type": "markdown",
610 | "metadata": {
611 | "id": "vxh70rpDph3I"
612 | },
613 | "source": [
614 | "As a result, all pandas functions, methods, and created objects are proxies:"
615 | ]
616 | },
617 | {
618 | "cell_type": "code",
619 | "execution_count": null,
620 | "metadata": {
621 | "colab": {
622 | "base_uri": "https://localhost:8080/"
623 | },
624 | "id": "RYTCGl7spgjs",
625 | "outputId": "ef9a2113-1ea4-4104-c28e-5c97286fb72a"
626 | },
627 | "outputs": [],
628 | "source": [
629 | "type(pd.read_csv)"
630 | ]
631 | },
632 | {
633 | "cell_type": "markdown",
634 | "metadata": {
635 | "id": "9-NvKu7XplmO"
636 | },
637 | "source": [
638 | "Operations supported by cuDF will be **very** fast:"
639 | ]
640 | },
641 | {
642 | "cell_type": "code",
643 | "execution_count": null,
644 | "metadata": {
645 | "colab": {
646 | "base_uri": "https://localhost:8080/"
647 | },
648 | "id": "MFvLJo4upnUG",
649 | "outputId": "3cc21f0c-798f-4589-c8e3-23e52fd052ae"
650 | },
651 | "outputs": [],
652 | "source": [
653 | "%%time\n",
654 | "df.count(axis=0)"
655 | ]
656 | },
657 | {
658 | "cell_type": "markdown",
659 | "metadata": {
660 | "id": "Np6VP-wSpomO"
661 | },
662 | "source": [
663 | "Operations not supported by cuDF will be slower, as they fall back to using Pandas (copying data between the CPU and GPU under the hood as needed). For example, cuDF does not currently support the `axis=` parameter to the `count` method. So this operation will run on the CPU and be noticeably slower than the previous one."
664 | ]
665 | },
666 | {
667 | "cell_type": "code",
668 | "execution_count": null,
669 | "metadata": {
670 | "colab": {
671 | "base_uri": "https://localhost:8080/"
672 | },
673 | "id": "mThydJIYpuha",
674 | "outputId": "77a843f4-0ead-4b61-a2b3-23952a2dd35d"
675 | },
676 | "outputs": [],
677 | "source": [
678 | "%%time\n",
679 | "df.count(axis=1) # This will use pandas, because cuDF doesn't support axis=1 for the .count() method"
680 | ]
681 | },
682 | {
683 | "cell_type": "markdown",
684 | "metadata": {
685 | "id": "tbDVvkP2pyra"
686 | },
687 | "source": [
688 | "But the story doesn't end here. We often need to mix our own code with third-party libraries that other people have written. Many of these libraries accept pandas objects as inputs."
689 | ]
690 | },
691 | {
692 | "cell_type": "markdown",
693 | "metadata": {
694 | "id": "3yK3a-mIp0vr"
695 | },
696 | "source": [
697 | "# Using third-party libraries with cudf.pandas\n",
698 | "\n",
699 | "You can pass Pandas objects to third-party libraries when using `cudf.pandas`, just like you would when using regular Pandas.\n",
700 | "\n",
701 | "Below, we show an example of using [plotly-express](https://plotly.com/python/plotly-express/) to visualize the data we've been processing:"
702 | ]
703 | },
704 | {
705 | "cell_type": "markdown",
706 | "metadata": {
707 | "id": "H0QwPQcAp2RV"
708 | },
709 | "source": [
710 | "## Which states have more pickup trucks relative to other vehicles?"
711 | ]
712 | },
713 | {
714 | "cell_type": "code",
715 | "execution_count": null,
716 | "metadata": {
717 | "colab": {
718 | "base_uri": "https://localhost:8080/",
719 | "height": 542
720 | },
721 | "id": "Ecs213eEqCd9",
722 | "outputId": "5c798902-301c-4aaf-dcf2-18ffd863befd"
723 | },
724 | "outputs": [],
725 | "source": [
726 | "import plotly.express as px\n",
727 | "\n",
728 | "df = df.rename(columns={\n",
729 | "    \"Registration State\": \"reg_state\",\n",
730 | "    \"Vehicle Body Type\": \"vehicle_type\",\n",
731 | "})\n",
732 | "\n",
733 | "# vehicle counts per state:\n",
734 | "counts = df.groupby(\"reg_state\").size().sort_index()\n",
735 | "# vehicles with type \"PICK\" (Pickup Truck)\n",
736 | "pickup_counts = df.where(df[\"vehicle_type\"] == \"PICK\").groupby(\"reg_state\").size()\n",
737 | "# percentage of pickup trucks by state:\n",
738 | "pickup_frac = ((pickup_counts / counts) * 100).rename(\"% Pickup Trucks\")\n",
739 | "del pickup_frac[\"MB\"] # (Manitoba is a huge outlier!)\n",
740 | "\n",
741 | "# plot the results:\n",
742 | "pickup_frac = pickup_frac.reset_index()\n",
743 | "px.choropleth(pickup_frac, locations=\"reg_state\", color=\"% Pickup Trucks\", locationmode=\"USA-states\", scope=\"usa\")"
744 | ]
745 | },
746 | {
747 | "cell_type": "markdown",
748 | "metadata": {
749 | "id": "9bgMrWs5qDG_"
750 | },
751 | "source": [
752 | "## Beyond just passing data: **Accelerating** third-party code\n",
753 | "\n",
754 | "Being able to pass these proxy objects to libraries like Plotly is great, but the benefits don't end there.\n",
755 | "\n",
756 | "When you enable `cudf.pandas`, pandas operations running **inside the third-party library's functions** will also benefit from GPU acceleration where possible!\n",
757 | "\n",
758 | "Below, you can see an image illustrating how `cudf.pandas` can accelerate the pandas backend in Ibis, a library that provides a unified DataFrame API to various backends. We ran this example on a system with an NVIDIA H100 GPU and an Intel Xeon Platinum 8480CL CPU.\n",
759 | "\n",
760 | "\n",
761 | "By loading the `cudf.pandas` extension, pandas operations within Ibis can use the GPU with zero code change. It just works."
762 | ]
763 | },
764 | {
765 | "cell_type": "markdown",
766 | "metadata": {
767 | "id": "8JW2CQL6qEv3"
768 | },
769 | "source": [
770 | "![ibis](https://drive.google.com/uc?id=1uOJq2JtbgVb7tb8qw8a2gG3JRBo72t_H)"
771 | ]
772 | },
773 | {
774 | "cell_type": "markdown",
775 | "metadata": {
776 | "id": "pyVNtGUhtFs5"
777 | },
778 | "source": [
779 | "# Conclusion\n",
780 | "\n",
781 | "With `cudf.pandas`, you can keep using pandas as your primary dataframe library. When things start to get a little slow, just load the `cudf.pandas` extension and run your existing code on a GPU!\n",
782 | "\n",
783 | "To learn more, we encourage you to visit [rapids.ai/cudf-pandas](https://rapids.ai/cudf-pandas)."
784 | ]
785 | },
786 | {
787 | "cell_type": "code",
788 | "execution_count": null,
789 | "metadata": {
790 | "id": "XjELOIf3xykH"
791 | },
792 | "outputs": [],
793 | "source": []
794 | }
795 | ],
796 | "metadata": {
797 | "accelerator": "GPU",
798 | "colab": {
799 | "collapsed_sections": [
800 | "VmkFv9ZUW37g",
801 | "H7_9EmGyXUJd",
802 | "VjFfQLZHXehM",
803 | "JKBQcT64XlMr"
804 | ],
805 | "gpuType": "T4",
806 | "provenance": []
807 | },
808 | "kernelspec": {
809 | "display_name": "Python 3 (ipykernel)",
810 | "language": "python",
811 | "name": "python3"
812 | },
813 | "language_info": {
814 | "codemirror_mode": {
815 | "name": "ipython",
816 | "version": 3
817 | },
818 | "file_extension": ".py",
819 | "mimetype": "text/x-python",
820 | "name": "python",
821 | "nbconvert_exporter": "python",
822 | "pygments_lexer": "ipython3",
823 | "version": "3.10.13"
824 | }
825 | },
826 | "nbformat": 4,
827 | "nbformat_minor": 4
828 | }
829 |
--------------------------------------------------------------------------------
/code/cupy-interop.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "8e5e6878",
6 | "metadata": {},
7 | "source": [
8 | "# Interoperability between cuDF and CuPy\n",
9 | "\n",
10 | "This notebook provides introductory examples of how you can use cuDF and CuPy together to take advantage of CuPy array functionality (such as advanced linear algebra operations)."
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "id": "8b2d45c3",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import timeit\n",
21 | "\n",
22 | "import cupy as cp\n",
23 | "from packaging import version\n",
24 | "\n",
25 | "import cudf\n",
26 | "\n",
27 | "if version.parse(cp.__version__) >= version.parse(\"10.0.0\"):\n",
28 | "    cupy_from_dlpack = cp.from_dlpack\n",
29 | "else:\n",
30 | "    cupy_from_dlpack = cp.fromDlpack"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "id": "e7e64b1a",
36 | "metadata": {},
37 | "source": [
38 | "### Converting a cuDF DataFrame to a CuPy Array\n",
39 | "\n",
40 | "If we want to convert a cuDF DataFrame to a CuPy ndarray, there are multiple ways to do it:\n",
41 | "\n",
42 | "1. We can use the [dlpack](https://github.com/dmlc/dlpack) interface.\n",
43 | "\n",
44 | "2. We can also use `DataFrame.values`.\n",
45 | "\n",
46 | "3. We can also convert via the [CUDA array interface](https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html) by using cuDF's `to_cupy` functionality."
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 2,
52 | "id": "45c482ab",
53 | "metadata": {},
54 | "outputs": [
55 | {
56 | "name": "stdout",
57 | "output_type": "stream",
58 | "text": [
59 | "123 µs ± 658 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n",
60 | "379 µs ± 3.04 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n",
61 | "386 µs ± 5.01 µs per loop (mean ± std. dev.
of 7 runs, 1000 loops each)\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "nelem = 10000\n", 67 | "df = cudf.DataFrame(\n", 68 | " {\n", 69 | " \"a\": range(nelem),\n", 70 | " \"b\": range(500, nelem + 500),\n", 71 | " \"c\": range(1000, nelem + 1000),\n", 72 | " }\n", 73 | ")\n", 74 | "\n", 75 | "%timeit arr_cupy = cupy_from_dlpack(df.to_dlpack())\n", 76 | "%timeit arr_cupy = df.values\n", 77 | "%timeit arr_cupy = df.to_cupy()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 3, 83 | "id": "a565effc", 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "array([[ 0, 500, 1000],\n", 90 | " [ 1, 501, 1001],\n", 91 | " [ 2, 502, 1002],\n", 92 | " ...,\n", 93 | " [ 9997, 10497, 10997],\n", 94 | " [ 9998, 10498, 10998],\n", 95 | " [ 9999, 10499, 10999]])" 96 | ] 97 | }, 98 | "execution_count": 3, 99 | "metadata": {}, 100 | "output_type": "execute_result" 101 | } 102 | ], 103 | "source": [ 104 | "arr_cupy = cupy_from_dlpack(df.to_dlpack())\n", 105 | "arr_cupy" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "id": "0759ab29", 111 | "metadata": {}, 112 | "source": [ 113 | "### Converting a cuDF Series to a CuPy Array" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "4f35ffbd", 119 | "metadata": {}, 120 | "source": [ 121 | "There are also multiple ways to convert a cuDF Series to a CuPy array:\n", 122 | "\n", 123 | "1. We can pass the Series to `cupy.asarray` as cuDF Series exposes [`__cuda_array_interface__`](https://docs-cupy.chainer.org/en/stable/reference/interoperability.html).\n", 124 | "2. We can leverage the dlpack interface `to_dlpack()`. \n", 125 | "3. We can also use `Series.values`" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 4, 131 | "id": "8f97f304", 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "name": "stdout", 136 | "output_type": "stream", 137 | "text": [ 138 | "40.2 µs ± 107 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n", 139 | "124 µs ± 918 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n", 140 | "105 µs ± 318 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" 141 | ] 142 | } 143 | ], 144 | "source": [ 145 | "col = \"a\"\n", 146 | "\n", 147 | "%timeit cola_cupy = cp.asarray(df[col])\n", 148 | "%timeit cola_cupy = cupy_from_dlpack(df[col].to_dlpack())\n", 149 | "%timeit cola_cupy = df[col].values" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 5, 155 | "id": "f96d5676", 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "array([ 0, 1, 2, ..., 9997, 9998, 9999])" 162 | ] 163 | }, 164 | "execution_count": 5, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "cola_cupy = cp.asarray(df[col])\n", 171 | "cola_cupy" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "id": "c36e5b88", 177 | "metadata": {}, 178 | "source": [ 179 | "From here, we can proceed with normal CuPy workflows, such as reshaping the array, getting the diagonal, or calculating the norm." 
180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 6, 185 | "id": "2a7ae43f", 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "data": { 190 | "text/plain": [ 191 | "array([[ 0, 1, 2, ..., 197, 198, 199],\n", 192 | " [ 200, 201, 202, ..., 397, 398, 399],\n", 193 | " [ 400, 401, 402, ..., 597, 598, 599],\n", 194 | " ...,\n", 195 | " [9400, 9401, 9402, ..., 9597, 9598, 9599],\n", 196 | " [9600, 9601, 9602, ..., 9797, 9798, 9799],\n", 197 | " [9800, 9801, 9802, ..., 9997, 9998, 9999]])" 198 | ] 199 | }, 200 | "execution_count": 6, 201 | "metadata": {}, 202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [ 206 | "reshaped_arr = cola_cupy.reshape(50, 200)\n", 207 | "reshaped_arr" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 7, 213 | "id": "b442a30c", 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "data": { 218 | "text/plain": [ 219 | "array([ 0, 201, 402, 603, 804, 1005, 1206, 1407, 1608, 1809, 2010,\n", 220 | " 2211, 2412, 2613, 2814, 3015, 3216, 3417, 3618, 3819, 4020, 4221,\n", 221 | " 4422, 4623, 4824, 5025, 5226, 5427, 5628, 5829, 6030, 6231, 6432,\n", 222 | " 6633, 6834, 7035, 7236, 7437, 7638, 7839, 8040, 8241, 8442, 8643,\n", 223 | " 8844, 9045, 9246, 9447, 9648, 9849])" 224 | ] 225 | }, 226 | "execution_count": 7, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "reshaped_arr.diagonal()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 8, 238 | "id": "be7f4d32", 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "data": { 243 | "text/plain": [ 244 | "array(577306.967739)" 245 | ] 246 | }, 247 | "execution_count": 8, 248 | "metadata": {}, 249 | "output_type": "execute_result" 250 | } 251 | ], 252 | "source": [ 253 | "cp.linalg.norm(reshaped_arr)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "id": "b353bded", 259 | "metadata": {}, 260 | "source": [ 261 | "### Converting a CuPy Array to a cuDF DataFrame\n", 262 | "\n", 263 | "We can also convert a CuPy ndarray to a cuDF DataFrame. Like before, there are multiple ways to do it:\n", 264 | "\n", 265 | "1. **Easiest;** We can directly use the `DataFrame` constructor.\n", 266 | "\n", 267 | "2. We can use CUDA array interface with the `DataFrame` constructor.\n", 268 | "\n", 269 | "3. We can also use the [dlpack](https://github.com/dmlc/dlpack) interface.\n", 270 | "\n", 271 | "For the latter two cases, we'll need to make sure that our CuPy array is Fortran contiguous in memory (if it's not already). We can either transpose the array or simply coerce it to be Fortran contiguous beforehand." 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 9, 277 | "id": "8887b253", 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "name": "stdout", 282 | "output_type": "stream", 283 | "text": [ 284 | "16.7 ms ± 102 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" 285 | ] 286 | } 287 | ], 288 | "source": [ 289 | "%timeit reshaped_df = cudf.DataFrame(reshaped_arr)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 10, 295 | "id": "08ec4ffa", 296 | "metadata": {}, 297 | "outputs": [ 298 | { 299 | "data": { 300 | "text/html": [ 301 | "
\n", 302 | "\n", 315 | "\n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | "
0123456789...190191192193194195196197198199
00123456789...190191192193194195196197198199
1200201202203204205206207208209...390391392393394395396397398399
2400401402403404405406407408409...590591592593594595596597598599
3600601602603604605606607608609...790791792793794795796797798799
4800801802803804805806807808809...990991992993994995996997998999
\n", 465 | "

5 rows × 200 columns

\n", 466 | "
" 467 | ], 468 | "text/plain": [ 469 | " 0 1 2 3 4 5 6 7 8 9 ... 190 191 192 193 \\\n", 470 | "0 0 1 2 3 4 5 6 7 8 9 ... 190 191 192 193 \n", 471 | "1 200 201 202 203 204 205 206 207 208 209 ... 390 391 392 393 \n", 472 | "2 400 401 402 403 404 405 406 407 408 409 ... 590 591 592 593 \n", 473 | "3 600 601 602 603 604 605 606 607 608 609 ... 790 791 792 793 \n", 474 | "4 800 801 802 803 804 805 806 807 808 809 ... 990 991 992 993 \n", 475 | "\n", 476 | " 194 195 196 197 198 199 \n", 477 | "0 194 195 196 197 198 199 \n", 478 | "1 394 395 396 397 398 399 \n", 479 | "2 594 595 596 597 598 599 \n", 480 | "3 794 795 796 797 798 799 \n", 481 | "4 994 995 996 997 998 999 \n", 482 | "\n", 483 | "[5 rows x 200 columns]" 484 | ] 485 | }, 486 | "execution_count": 10, 487 | "metadata": {}, 488 | "output_type": "execute_result" 489 | } 490 | ], 491 | "source": [ 492 | "reshaped_df = cudf.DataFrame(reshaped_arr)\n", 493 | "reshaped_df.head()" 494 | ] 495 | }, 496 | { 497 | "cell_type": "markdown", 498 | "id": "6804d291", 499 | "metadata": {}, 500 | "source": [ 501 | "We can check whether our array is Fortran contiguous by using cupy.isfortran or looking at the [flags](https://docs-cupy.chainer.org/en/stable/reference/generated/cupy.ndarray.html#cupy.ndarray.flags) of the array." 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 11, 507 | "id": "65b8bd0d", 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "data": { 512 | "text/plain": [ 513 | "False" 514 | ] 515 | }, 516 | "execution_count": 11, 517 | "metadata": {}, 518 | "output_type": "execute_result" 519 | } 520 | ], 521 | "source": [ 522 | "cp.isfortran(reshaped_arr)" 523 | ] 524 | }, 525 | { 526 | "cell_type": "markdown", 527 | "id": "151982ad", 528 | "metadata": {}, 529 | "source": [ 530 | "In this case, we'll need to convert it before going to a cuDF DataFrame. In the next two cells, we create the DataFrame by leveraging dlpack and the CUDA array interface, respectively." 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": 12, 536 | "id": "27b2f563", 537 | "metadata": {}, 538 | "outputs": [ 539 | { 540 | "name": "stdout", 541 | "output_type": "stream", 542 | "text": [ 543 | "6.26 ms ± 30.8 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 544 | ] 545 | } 546 | ], 547 | "source": [ 548 | "%%timeit\n", 549 | "\n", 550 | "fortran_arr = cp.asfortranarray(reshaped_arr)\n", 551 | "reshaped_df = cudf.DataFrame(fortran_arr)" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 13, 557 | "id": "0a0cc290", 558 | "metadata": {}, 559 | "outputs": [ 560 | { 561 | "name": "stdout", 562 | "output_type": "stream", 563 | "text": [ 564 | "4.65 ms ± 82.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" 565 | ] 566 | } 567 | ], 568 | "source": [ 569 | "%%timeit\n", 570 | "\n", 571 | "fortran_arr = cp.asfortranarray(reshaped_arr)\n", 572 | "reshaped_df = cudf.from_dlpack(fortran_arr.toDlpack())" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 14, 578 | "id": "0d2c5beb", 579 | "metadata": {}, 580 | "outputs": [ 581 | { 582 | "data": { 583 | "text/html": [ 584 | "
\n", 585 | "\n", 598 | "\n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | "
0123456789...190191192193194195196197198199
00123456789...190191192193194195196197198199
1200201202203204205206207208209...390391392393394395396397398399
2400401402403404405406407408409...590591592593594595596597598599
3600601602603604605606607608609...790791792793794795796797798799
4800801802803804805806807808809...990991992993994995996997998999
\n", 748 | "

5 rows × 200 columns

\n", 749 | "
" 750 | ], 751 | "text/plain": [ 752 | " 0 1 2 3 4 5 6 7 8 9 ... 190 191 192 193 \\\n", 753 | "0 0 1 2 3 4 5 6 7 8 9 ... 190 191 192 193 \n", 754 | "1 200 201 202 203 204 205 206 207 208 209 ... 390 391 392 393 \n", 755 | "2 400 401 402 403 404 405 406 407 408 409 ... 590 591 592 593 \n", 756 | "3 600 601 602 603 604 605 606 607 608 609 ... 790 791 792 793 \n", 757 | "4 800 801 802 803 804 805 806 807 808 809 ... 990 991 992 993 \n", 758 | "\n", 759 | " 194 195 196 197 198 199 \n", 760 | "0 194 195 196 197 198 199 \n", 761 | "1 394 395 396 397 398 399 \n", 762 | "2 594 595 596 597 598 599 \n", 763 | "3 794 795 796 797 798 799 \n", 764 | "4 994 995 996 997 998 999 \n", 765 | "\n", 766 | "[5 rows x 200 columns]" 767 | ] 768 | }, 769 | "execution_count": 14, 770 | "metadata": {}, 771 | "output_type": "execute_result" 772 | } 773 | ], 774 | "source": [ 775 | "fortran_arr = cp.asfortranarray(reshaped_arr)\n", 776 | "reshaped_df = cudf.DataFrame(fortran_arr)\n", 777 | "reshaped_df.head()" 778 | ] 779 | }, 780 | { 781 | "cell_type": "markdown", 782 | "id": "395e2bba", 783 | "metadata": {}, 784 | "source": [ 785 | "### Converting a CuPy Array to a cuDF Series\n", 786 | "\n", 787 | "To convert an array to a Series, we can directly pass the array to the `Series` constructor." 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": 15, 793 | "id": "d8518208", 794 | "metadata": {}, 795 | "outputs": [ 796 | { 797 | "data": { 798 | "text/plain": [ 799 | "0 0\n", 800 | "1 201\n", 801 | "2 402\n", 802 | "3 603\n", 803 | "4 804\n", 804 | "dtype: int64" 805 | ] 806 | }, 807 | "execution_count": 15, 808 | "metadata": {}, 809 | "output_type": "execute_result" 810 | } 811 | ], 812 | "source": [ 813 | "cudf.Series(reshaped_arr.diagonal()).head()" 814 | ] 815 | }, 816 | { 817 | "cell_type": "markdown", 818 | "id": "7e159619", 819 | "metadata": {}, 820 | "source": [ 821 | "### Interweaving CuDF and CuPy for Smooth PyData Workflows\n", 822 | "\n", 823 | "RAPIDS libraries and the entire GPU PyData ecosystem are developing quickly, but sometimes a one library may not have the functionality you need. One example of this might be taking the row-wise sum (or mean) of a Pandas DataFrame. cuDF's support for row-wise operations isn't mature, so you'd need to either transpose the DataFrame or write a UDF and explicitly calculate the sum across each row. Transposing could lead to hundreds of thousands of columns (which cuDF wouldn't perform well with) depending on your data's shape, and writing a UDF can be time intensive.\n", 824 | "\n", 825 | "By leveraging the interoperability of the GPU PyData ecosystem, this operation becomes very easy. Let's take the row-wise sum of our previously reshaped cuDF DataFrame." 826 | ] 827 | }, 828 | { 829 | "cell_type": "code", 830 | "execution_count": 16, 831 | "id": "2bb8ed81", 832 | "metadata": {}, 833 | "outputs": [ 834 | { 835 | "data": { 836 | "text/html": [ 837 | "
\n", 838 | "\n", 851 | "\n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | "
0123456789...190191192193194195196197198199
00123456789...190191192193194195196197198199
1200201202203204205206207208209...390391392393394395396397398399
2400401402403404405406407408409...590591592593594595596597598599
3600601602603604605606607608609...790791792793794795796797798799
4800801802803804805806807808809...990991992993994995996997998999
\n", 1001 | "

5 rows × 200 columns

\n", 1002 | "
" 1003 | ], 1004 | "text/plain": [ 1005 | " 0 1 2 3 4 5 6 7 8 9 ... 190 191 192 193 \\\n", 1006 | "0 0 1 2 3 4 5 6 7 8 9 ... 190 191 192 193 \n", 1007 | "1 200 201 202 203 204 205 206 207 208 209 ... 390 391 392 393 \n", 1008 | "2 400 401 402 403 404 405 406 407 408 409 ... 590 591 592 593 \n", 1009 | "3 600 601 602 603 604 605 606 607 608 609 ... 790 791 792 793 \n", 1010 | "4 800 801 802 803 804 805 806 807 808 809 ... 990 991 992 993 \n", 1011 | "\n", 1012 | " 194 195 196 197 198 199 \n", 1013 | "0 194 195 196 197 198 199 \n", 1014 | "1 394 395 396 397 398 399 \n", 1015 | "2 594 595 596 597 598 599 \n", 1016 | "3 794 795 796 797 798 799 \n", 1017 | "4 994 995 996 997 998 999 \n", 1018 | "\n", 1019 | "[5 rows x 200 columns]" 1020 | ] 1021 | }, 1022 | "execution_count": 16, 1023 | "metadata": {}, 1024 | "output_type": "execute_result" 1025 | } 1026 | ], 1027 | "source": [ 1028 | "reshaped_df.head()" 1029 | ] 1030 | }, 1031 | { 1032 | "cell_type": "markdown", 1033 | "id": "2f3d4e78", 1034 | "metadata": {}, 1035 | "source": [ 1036 | "We can just transform it into a CuPy array and use the `axis` argument of `sum`." 1037 | ] 1038 | }, 1039 | { 1040 | "cell_type": "code", 1041 | "execution_count": 17, 1042 | "id": "2dde030d", 1043 | "metadata": {}, 1044 | "outputs": [ 1045 | { 1046 | "data": { 1047 | "text/plain": [ 1048 | "array([ 19900, 59900, 99900, 139900, 179900, 219900, 259900,\n", 1049 | " 299900, 339900, 379900, 419900, 459900, 499900, 539900,\n", 1050 | " 579900, 619900, 659900, 699900, 739900, 779900, 819900,\n", 1051 | " 859900, 899900, 939900, 979900, 1019900, 1059900, 1099900,\n", 1052 | " 1139900, 1179900, 1219900, 1259900, 1299900, 1339900, 1379900,\n", 1053 | " 1419900, 1459900, 1499900, 1539900, 1579900, 1619900, 1659900,\n", 1054 | " 1699900, 1739900, 1779900, 1819900, 1859900, 1899900, 1939900,\n", 1055 | " 1979900])" 1056 | ] 1057 | }, 1058 | "execution_count": 17, 1059 | "metadata": {}, 1060 | "output_type": "execute_result" 1061 | } 1062 | ], 1063 | "source": [ 1064 | "new_arr = cupy_from_dlpack(reshaped_df.to_dlpack())\n", 1065 | "new_arr.sum(axis=1)" 1066 | ] 1067 | }, 1068 | { 1069 | "cell_type": "markdown", 1070 | "id": "4450dcc3", 1071 | "metadata": {}, 1072 | "source": [ 1073 | "With just that single line, we're able to seamlessly move between data structures in this ecosystem, giving us enormous flexibility without sacrificing speed." 1074 | ] 1075 | }, 1076 | { 1077 | "cell_type": "markdown", 1078 | "id": "61bfb868", 1079 | "metadata": {}, 1080 | "source": [ 1081 | "### Converting a cuDF DataFrame to a CuPy Sparse Matrix\n", 1082 | "\n", 1083 | "We can also convert a DataFrame or Series to a CuPy sparse matrix. We might want to do this if downstream processes expect CuPy sparse matrices as an input.\n", 1084 | "\n", 1085 | "The sparse matrix data structure is defined by three dense arrays. We'll define a small helper function for cleanliness." 
1086 | ] 1087 | }, 1088 | { 1089 | "cell_type": "code", 1090 | "execution_count": 18, 1091 | "id": "e531fd15", 1092 | "metadata": {}, 1093 | "outputs": [], 1094 | "source": [ 1095 | "def cudf_to_cupy_sparse_matrix(data, sparseformat=\"column\"):\n", 1096 | " \"\"\"Converts a cuDF object to a CuPy Sparse Column matrix.\"\"\"\n", 1097 | " if sparseformat not in (\n", 1098 | " \"row\",\n", 1099 | " \"column\",\n", 1100 | " ):\n", 1101 | " raise ValueError(\"Let's focus on column and row formats for now.\")\n", 1102 | "\n", 1103 | " _sparse_constructor = cp.sparse.csc_matrix\n", 1104 | " if sparseformat == \"row\":\n", 1105 | " _sparse_constructor = cp.sparse.csr_matrix\n", 1106 | "\n", 1107 | " return _sparse_constructor(cupy_from_dlpack(data.to_dlpack()))" 1108 | ] 1109 | }, 1110 | { 1111 | "cell_type": "markdown", 1112 | "id": "3f5e6ade", 1113 | "metadata": {}, 1114 | "source": [ 1115 | "We can define a sparsely populated DataFrame to illustrate this conversion to either sparse matrix format." 1116 | ] 1117 | }, 1118 | { 1119 | "cell_type": "code", 1120 | "execution_count": 19, 1121 | "id": "58c7e074", 1122 | "metadata": {}, 1123 | "outputs": [], 1124 | "source": [ 1125 | "df = cudf.DataFrame()\n", 1126 | "nelem = 10000\n", 1127 | "nonzero = 1000\n", 1128 | "for i in range(20):\n", 1129 | " arr = cp.random.normal(5, 5, nelem)\n", 1130 | " arr[cp.random.choice(arr.shape[0], nelem - nonzero, replace=False)] = 0\n", 1131 | " df[\"a\" + str(i)] = arr" 1132 | ] 1133 | }, 1134 | { 1135 | "cell_type": "code", 1136 | "execution_count": 20, 1137 | "id": "9265228d", 1138 | "metadata": {}, 1139 | "outputs": [ 1140 | { 1141 | "data": { 1142 | "text/html": [ 1143 | "
\n", 1144 | "\n", 1157 | "\n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | "
a0a1a2a3a4a5a6a7a8a9a10a11a12a13a14a15a16a17a18a19
00.00.00.00.00.00.0000000.00.00.00.00.00.0000000.00.00.00.0000000.00.00.00.0
10.00.00.00.00.00.0000000.00.00.00.00.00.0000000.00.00.00.0000000.00.00.00.0
20.00.00.00.00.00.0000000.00.00.00.00.00.0000000.00.00.00.0000000.00.00.00.0
30.00.00.00.00.00.0000000.00.00.00.00.010.1903420.00.00.03.3909560.00.00.00.0
40.00.00.00.00.01.9140440.00.00.00.00.00.0000000.00.00.00.0000000.00.00.00.0
\n", 1301 | "
" 1302 | ], 1303 | "text/plain": [ 1304 | " a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 \\\n", 1305 | "0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 \n", 1306 | "1 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 \n", 1307 | "2 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 \n", 1308 | "3 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 10.190342 0.0 \n", 1309 | "4 0.0 0.0 0.0 0.0 0.0 1.914044 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 \n", 1310 | "\n", 1311 | " a13 a14 a15 a16 a17 a18 a19 \n", 1312 | "0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 \n", 1313 | "1 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 \n", 1314 | "2 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 \n", 1315 | "3 0.0 0.0 3.390956 0.0 0.0 0.0 0.0 \n", 1316 | "4 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 " 1317 | ] 1318 | }, 1319 | "execution_count": 20, 1320 | "metadata": {}, 1321 | "output_type": "execute_result" 1322 | } 1323 | ], 1324 | "source": [ 1325 | "df.head()" 1326 | ] 1327 | }, 1328 | { 1329 | "cell_type": "code", 1330 | "execution_count": 21, 1331 | "id": "5ba1a551", 1332 | "metadata": {}, 1333 | "outputs": [ 1334 | { 1335 | "name": "stdout", 1336 | "output_type": "stream", 1337 | "text": [ 1338 | " (897, 0)\t10.472083065532264\n", 1339 | " (1665, 0)\t-3.9887061769298446\n", 1340 | " (1155, 0)\t7.370990696856217\n", 1341 | " (772, 0)\t0.299235212766345\n", 1342 | " (1286, 0)\t5.721357813626815\n", 1343 | " (775, 0)\t7.428655280999701\n", 1344 | " (903, 0)\t11.540080662967277\n", 1345 | " (1287, 0)\t-0.011231698453708638\n", 1346 | " (265, 0)\t-4.878065816206471\n", 1347 | " (1161, 0)\t1.3966861068924148\n", 1348 | " (266, 0)\t1.0549982663300899\n", 1349 | " (1291, 0)\t0.5910012568901113\n", 1350 | " (909, 0)\t5.731776333301544\n", 1351 | " (1549, 0)\t3.6972508613199615\n", 1352 | " (144, 0)\t-4.970091038596944\n", 1353 | " (1424, 0)\t1.3033828282534228\n", 1354 | " (1297, 0)\t8.258941912132176\n", 1355 | " (914, 0)\t4.616224036044147\n", 1356 | " (21, 0)\t6.25336788325076\n", 1357 | " (534, 0)\t8.419889621961996\n", 1358 | " (918, 0)\t5.691837083015048\n", 1359 | " (1046, 0)\t8.104031527567054\n", 1360 | " (1686, 0)\t3.000304365412834\n", 1361 | " (535, 0)\t3.1746290131636425\n", 1362 | " (665, 0)\t0.7317762727252719\n", 1363 | " :\t:\n", 1364 | " (9200, 19)\t5.538603016004742\n", 1365 | " (9713, 19)\t6.404913440477216\n", 1366 | " (9202, 19)\t10.133358471330899\n", 1367 | " (8567, 19)\t-1.6576792573911858\n", 1368 | " (9847, 19)\t10.284007122371538\n", 1369 | " (8440, 19)\t8.605460481669013\n", 1370 | " (9336, 19)\t7.398549223780951\n", 1371 | " (9720, 19)\t4.720142296850481\n", 1372 | " (8441, 19)\t7.17687459848627\n", 1373 | " (9209, 19)\t1.7813006515085739\n", 1374 | " (9337, 19)\t7.4672893771361455\n", 1375 | " (8570, 19)\t12.837330165297741\n", 1376 | " (9210, 19)\t9.70564905788214\n", 1377 | " (9083, 19)\t0.4805280345257057\n", 1378 | " (9339, 19)\t4.740715090704008\n", 1379 | " (9468, 19)\t2.9916780999709296\n", 1380 | " (9596, 19)\t10.314674648882447\n", 1381 | " (9724, 19)\t7.854766410475708\n", 1382 | " (9852, 19)\t1.5899611807514598\n", 1383 | " (9087, 19)\t11.67209323271626\n", 1384 | " (9953, 19)\t11.444522857416047\n", 1385 | " (9954, 19)\t4.672717133639532\n", 1386 | " (9959, 19)\t6.54204170098849\n", 1387 | " (9976, 19)\t10.600419849454331\n", 1388 | " (9978, 19)\t6.839645924414838\n" 1389 | ] 1390 | } 1391 | ], 1392 | "source": [ 1393 | "sparse_data = cudf_to_cupy_sparse_matrix(df)\n", 1394 | "print(sparse_data)" 1395 | ] 1396 | }, 1397 | { 1398 | "cell_type": "markdown", 1399 | "id": "e8e58cd5", 1400 | 
"metadata": {}, 1401 | "source": [ 1402 | "From here, we could continue our workflow with a CuPy sparse matrix.\n", 1403 | "\n", 1404 | "For a full list of the functionality built into these libraries, we encourage you to check out the API docs for [cuDF](https://docs.rapids.ai/api/cudf/nightly/) and [CuPy](https://docs-cupy.chainer.org/en/stable/index.html)." 1405 | ] 1406 | } 1407 | ], 1408 | "metadata": { 1409 | "kernelspec": { 1410 | "display_name": "Python 3 (ipykernel)", 1411 | "language": "python", 1412 | "name": "python3" 1413 | }, 1414 | "language_info": { 1415 | "codemirror_mode": { 1416 | "name": "ipython", 1417 | "version": 3 1418 | }, 1419 | "file_extension": ".py", 1420 | "mimetype": "text/x-python", 1421 | "name": "python", 1422 | "nbconvert_exporter": "python", 1423 | "pygments_lexer": "ipython3", 1424 | "version": "3.10.11" 1425 | } 1426 | }, 1427 | "nbformat": 4, 1428 | "nbformat_minor": 5 1429 | } 1430 | -------------------------------------------------------------------------------- /code/Introduction_to_Strings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Intro into Strings \n", 8 | "\n", 9 | "**Authorship**
\n", 10 | "Original Author: Nicholas Davis
" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## Working with text data
\n", 18 | "\n", 19 | "Enterprise analytics workflows commonly require processing large-scale text data. To address this need, the RAPIDS CUDA DataFrame library (cuDF) and RAPIDS CUDA Machine Learning library (cuML) now include string processing capabilities. cuDF has a fully-featured string and regular expression processing engine. With a pandas-like API, cuDF string analytics can provide data scientists with up to 90x performance improvement with minimal changes to their code.
\n", 20 | "\n", 21 | "This notebook serves as an intro to string capabilities with cuDF. Each string functionality will have a pandas example and it's cuDF equivalent.
\n", 22 | "\n", 23 | "For any additional information please reference:
\n", 24 | "[cuDF Documentation](https://docs.rapids.ai/api/cudf/stable/api.html#strings)

\n", 25 | "[GPU-Accelerated String Processing with RAPIDS Video](https://www.nvidia.com/en-us/on-demand/session/gtcfall20-a21131/)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "\n", 33 | "Before we begin, let's check out our hardware setup by running the nvidia-smi command." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 1, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "Mon Mar 11 16:25:31 2024 \n", 46 | "+-----------------------------------------------------------------------------+\n", 47 | "| NVIDIA-SMI 525.147.05 Driver Version: 525.147.05 CUDA Version: 12.0 |\n", 48 | "|-------------------------------+----------------------+----------------------+\n", 49 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 50 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 51 | "| | | MIG M. |\n", 52 | "|===============================+======================+======================|\n", 53 | "| 0 Quadro GV100 Off | 00000000:15:00.0 Off | Off |\n", 54 | "| 29% 40C P2 26W / 250W | 12750MiB / 32768MiB | 0% Default |\n", 55 | "| | | N/A |\n", 56 | "+-------------------------------+----------------------+----------------------+\n", 57 | "| 1 Quadro GV100 Off | 00000000:2D:00.0 Off | Off |\n", 58 | "| 33% 46C P2 29W / 250W | 3497MiB / 32768MiB | 0% Default |\n", 59 | "| | | N/A |\n", 60 | "+-------------------------------+----------------------+----------------------+\n", 61 | " \n", 62 | "+-----------------------------------------------------------------------------+\n", 63 | "| Processes: |\n", 64 | "| GPU GI CI PID Type Process name GPU Memory |\n", 65 | "| ID ID Usage |\n", 66 | "|=============================================================================|\n", 67 | "+-----------------------------------------------------------------------------+\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "!nvidia-smi" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "### Text data types" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "There are two ways to store text data in pandas and cudf:\n", 87 | "\n", 88 | "1. object -dtype NumPy array.\n", 89 | "\n", 90 | "1. StringDtype extension type.\n", 91 | "\n", 92 | "We recommend using StringDtype to store text data.\n", 93 | "\n", 94 | "Prior to pandas 1.0, object dtype was the only option. This was unfortunate for many reasons:\n", 95 | "\n", 96 | "1. You can accidentally store a mixture of strings and non-strings in an object dtype array. It’s better to have a dedicated dtype.\n", 97 | "\n", 98 | "1. object dtype breaks dtype-specific operations like `DataFrame.select_dtypes()`. There isn’t a clear way to select just text while excluding non-text but still object-dtype columns.\n", 99 | "\n", 100 | "1. When reading code, the contents of an object dtype array is less clear than 'string'." 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "Currently, the performance of object dtype arrays of strings and arrays.StringArray are about the same. 
We expect future enhancements to significantly increase the performance and lower the memory overhead of StringArray.\n", 108 | "\n" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 2, 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "Pandas Version: 1.5.3\n", 121 | "CuDF Version: 24.02.02\n" 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | "import pandas as pd; print('Pandas Version:', pd.__version__)\n", 127 | "import numpy as np\n", 128 | "import cupy as cp\n", 129 | "import cudf; print('CuDF Version:', cudf.__version__)\n", 130 | "import warnings\n", 131 | "warnings.filterwarnings('ignore')\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "\n", 139 | "For backwards-compatibility, object dtype remains the default type we infer a list of strings to." 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 3, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "text/plain": [ 150 | "0 a\n", 151 | "1 b\n", 152 | "2 c\n", 153 | "dtype: object" 154 | ] 155 | }, 156 | "execution_count": 3, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "# Pandas\n", 163 | "\n", 164 | "pd.Series([\"a\", \"b\", \"c\"])" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 4, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "0 a\n", 176 | "1 b\n", 177 | "2 c\n", 178 | "dtype: object" 179 | ] 180 | }, 181 | "execution_count": 4, 182 | "metadata": {}, 183 | "output_type": "execute_result" 184 | } 185 | ], 186 | "source": [ 187 | "# cuDF\n", 188 | "\n", 189 | "cudf.Series([\"a\", \"b\", \"c\"])\n" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "\n", 197 | "To explicitly request string dtype, specify the dtype." 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 5, 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "data": { 207 | "text/plain": [ 208 | "0 a\n", 209 | "1 b\n", 210 | "2 c\n", 211 | "dtype: string" 212 | ] 213 | }, 214 | "execution_count": 5, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "pd.Series([\"a\", \"b\", \"c\"], dtype=\"string\")" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 6, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/plain": [ 231 | "0 a\n", 232 | "1 b\n", 233 | "2 c\n", 234 | "dtype: object" 235 | ] 236 | }, 237 | "execution_count": 6, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "cudf.Series([\"a\", \"b\", \"c\"], dtype=\"str\")" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "\n", 251 | "Or astype after the Series or DataFrame is created." 
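Before the astype examples below, a hedged aside picking up the `select_dtypes()` point from the list above (this sketch is not in the original notebook): a dedicated string dtype makes dtype-based selection unambiguous, since genuinely mixed object columns no longer get swept up with text.

```python
import pandas as pd

df = pd.DataFrame({
    "text": pd.array(["a", "b"], dtype="string"),  # dedicated string dtype
    "mixed": ["c", 1],                             # object dtype, mixed contents
    "nums": [1.0, 2.0],
})
print(df.select_dtypes(include="string").columns.tolist())  # ['text']
print(df.select_dtypes(include="object").columns.tolist())  # ['mixed']
```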
252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 7, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "name": "stdout", 261 | "output_type": "stream", 262 | "text": [ 263 | "Original: \n", 264 | "0 a\n", 265 | "1 b\n", 266 | "2 c\n", 267 | "dtype: string\n", 268 | "\n", 269 | "# of 'n': \n", 270 | "0 0\n", 271 | "1 0\n", 272 | "2 0\n", 273 | "dtype: int64\n" 274 | ] 275 | } 276 | ], 277 | "source": [ 278 | "pandasSeries = pd.Series([\"a\", \"b\", \"c\"])\n", 279 | "print('Original: ')\n", 280 | "print(pandasSeries.astype(\"string\"))\n", 281 | "\n", 282 | "print(\"\\n# of 'n': \")\n", 283 | "print(pandasSeries.str.count('n'))" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 8, 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "name": "stdout", 293 | "output_type": "stream", 294 | "text": [ 295 | "Original: \n", 296 | "0 a\n", 297 | "1 b\n", 298 | "2 c\n", 299 | "dtype: object\n", 300 | "\n", 301 | "# of 'n': \n", 302 | "0 0\n", 303 | "1 0\n", 304 | "2 0\n", 305 | "dtype: int32\n" 306 | ] 307 | } 308 | ], 309 | "source": [ 310 | "cudfSeries = cudf.Series([\"a\", \"b\", \"c\"])\n", 311 | "print('Original: ')\n", 312 | "print(cudfSeries.astype(\"string\"))\n", 313 | "\n", 314 | "print(\"\\n# of 'n': \")\n", 315 | "print(cudfSeries.str.count('n'))" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "\n", 323 | "You can also use StringDtype/\"string\" as the dtype on non-string data and it will be converted to string dtype:" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 9, 329 | "metadata": {}, 330 | "outputs": [ 331 | { 332 | "name": "stdout", 333 | "output_type": "stream", 334 | "text": [ 335 | "0 a\n", 336 | "1 2\n", 337 | "2 \n", 338 | "dtype: string\n" 339 | ] 340 | }, 341 | { 342 | "data": { 343 | "text/plain": [ 344 | "str" 345 | ] 346 | }, 347 | "execution_count": 9, 348 | "metadata": {}, 349 | "output_type": "execute_result" 350 | } 351 | ], 352 | "source": [ 353 | "pandasSeries = pd.Series([\"a\", 2, np.nan], dtype=\"string\")\n", 354 | "print(pandasSeries)\n", 355 | "type(pandasSeries[1])" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 10, 361 | "metadata": {}, 362 | "outputs": [ 363 | { 364 | "name": "stdout", 365 | "output_type": "stream", 366 | "text": [ 367 | "0 a\n", 368 | "1 2\n", 369 | "2 \n", 370 | "dtype: object\n" 371 | ] 372 | }, 373 | { 374 | "data": { 375 | "text/plain": [ 376 | "str" 377 | ] 378 | }, 379 | "execution_count": 10, 380 | "metadata": {}, 381 | "output_type": "execute_result" 382 | } 383 | ], 384 | "source": [ 385 | "cudfSeries = cudf.Series([\"a\", 2, np.nan], dtype=\"str\")\n", 386 | "print(cudfSeries)\n", 387 | "type(cudfSeries[1])" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "\n", 395 | "or convert from existing pandas data:" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 11, 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "name": "stdout", 405 | "output_type": "stream", 406 | "text": [ 407 | "0 1\n", 408 | "1 2\n", 409 | "2 \n", 410 | "dtype: string\n" 411 | ] 412 | }, 413 | { 414 | "data": { 415 | "text/plain": [ 416 | "str" 417 | ] 418 | }, 419 | "execution_count": 11, 420 | "metadata": {}, 421 | "output_type": "execute_result" 422 | } 423 | ], 424 | "source": [ 425 | "pandasSeries = pd.Series([1, 2, np.nan], dtype=\"Int64\")\n", 426 | "\n", 427 | "pandasSeries2 = 
pandasSeries.astype(\"string\")\n", 428 | "print(pandasSeries2)\n", 429 | "type(pandasSeries2[0])" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 12, 435 | "metadata": {}, 436 | "outputs": [ 437 | { 438 | "name": "stdout", 439 | "output_type": "stream", 440 | "text": [ 441 | "0 1\n", 442 | "1 2\n", 443 | "2 \n", 444 | "dtype: object\n" 445 | ] 446 | }, 447 | { 448 | "data": { 449 | "text/plain": [ 450 | "str" 451 | ] 452 | }, 453 | "execution_count": 12, 454 | "metadata": {}, 455 | "output_type": "execute_result" 456 | } 457 | ], 458 | "source": [ 459 | "cudfSeries1 = cudf.Series([1, 2, np.nan], dtype=\"int64\")\n", 460 | "\n", 461 | "cudfSeries2 = cudfSeries1.astype(\"string\")\n", 462 | "print(cudfSeries2)\n", 463 | "type(cudfSeries2[0])" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "\n", 471 | "## Behavior differences\n", 472 | "\n", 473 | "These are places where the behavior of StringDtype objects differ from object dtype." 474 | ] 475 | }, 476 | { 477 | "cell_type": "markdown", 478 | "metadata": {}, 479 | "source": [ 480 | "For `StringDtype`, string accessor methods that return numeric output will always return a nullable integer dtype, rather than either int or float dtype, depending on the presence of `NA` values. Methods returning boolean output will return a nullable boolean dtype." 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": 13, 486 | "metadata": {}, 487 | "outputs": [ 488 | { 489 | "name": "stdout", 490 | "output_type": "stream", 491 | "text": [ 492 | "Original: \n", 493 | "0 a\n", 494 | "1 \n", 495 | "2 b\n", 496 | "dtype: string\n", 497 | "# of 'a': \n", 498 | "0 1\n", 499 | "1 \n", 500 | "2 0\n", 501 | "dtype: Int64\n", 502 | "\n", 503 | "# of 'a' after dropping n/a: \n", 504 | "0 1\n", 505 | "2 0\n", 506 | "dtype: Int64\n", 507 | "\n", 508 | "Check if numeric: \n", 509 | "0 False\n", 510 | "1 \n", 511 | "2 False\n", 512 | "dtype: boolean\n" 513 | ] 514 | } 515 | ], 516 | "source": [ 517 | "pandasSeries = pd.Series([\"a\", None, \"b\"], dtype=\"string\")\n", 518 | "print('Original: ')\n", 519 | "print(pandasSeries)\n", 520 | "print(\"# of 'a': \")\n", 521 | "print(pandasSeries.str.count(\"a\"))\n", 522 | "print(\"\\n# of 'a' after dropping n/a: \")\n", 523 | "print(pandasSeries.dropna().str.count(\"a\"))\n", 524 | "print(\"\\nCheck if numeric: \")\n", 525 | "print(pandasSeries.str.isnumeric())\n" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 14, 531 | "metadata": {}, 532 | "outputs": [ 533 | { 534 | "name": "stdout", 535 | "output_type": "stream", 536 | "text": [ 537 | "Original: \n", 538 | "0 a\n", 539 | "1 \n", 540 | "2 b\n", 541 | "dtype: object\n", 542 | "# of 'a': \n", 543 | "0 1\n", 544 | "1 \n", 545 | "2 0\n", 546 | "dtype: int32\n", 547 | "\n", 548 | "# of 'a' after dropping n/a: \n", 549 | "0 1\n", 550 | "2 0\n", 551 | "dtype: int32\n", 552 | "\n", 553 | "Check if numeric: \n", 554 | "0 False\n", 555 | "1 \n", 556 | "2 False\n", 557 | "dtype: bool\n" 558 | ] 559 | } 560 | ], 561 | "source": [ 562 | "cudfSeries = cudf.Series([\"a\", None, \"b\"], dtype=\"str\")\n", 563 | "print('Original: ')\n", 564 | "print(cudfSeries)\n", 565 | "print(\"# of 'a': \")\n", 566 | "print(cudfSeries.str.count(\"a\"))\n", 567 | "print(\"\\n# of 'a' after dropping n/a: \")\n", 568 | "print(cudfSeries.dropna().str.count(\"a\"))\n", 569 | "print(\"\\nCheck if numeric: \")\n", 570 | "print(cudfSeries.str.isnumeric())" 571 | ] 572 | }, 573 | { 
574 | "cell_type": "markdown", 575 | "metadata": {}, 576 | "source": [ 577 | "\n", 578 | "Both outputs are Int64 dtype. Compare that with object-dtype." 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": 15, 584 | "metadata": {}, 585 | "outputs": [ 586 | { 587 | "name": "stdout", 588 | "output_type": "stream", 589 | "text": [ 590 | "# of 'a': \n", 591 | "0 1.0\n", 592 | "1 NaN\n", 593 | "2 0.0\n", 594 | "dtype: float64\n", 595 | "\n", 596 | "# of 'a' after dropping n/a: \n" 597 | ] 598 | }, 599 | { 600 | "data": { 601 | "text/plain": [ 602 | "0 1\n", 603 | "2 0\n", 604 | "dtype: int64" 605 | ] 606 | }, 607 | "execution_count": 15, 608 | "metadata": {}, 609 | "output_type": "execute_result" 610 | } 611 | ], 612 | "source": [ 613 | "pandasSeries2 = pd.Series([\"a\", None, \"b\"], dtype=\"object\")\n", 614 | "print(\"# of 'a': \")\n", 615 | "print(pandasSeries2.str.count(\"a\"))\n", 616 | "print(\"\\n# of 'a' after dropping n/a: \")\n", 617 | "pandasSeries2.dropna().str.count(\"a\")" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": 16, 623 | "metadata": {}, 624 | "outputs": [ 625 | { 626 | "name": "stdout", 627 | "output_type": "stream", 628 | "text": [ 629 | "# of 'a': \n", 630 | "0 1\n", 631 | "1 \n", 632 | "2 0\n", 633 | "dtype: int32\n", 634 | "\n", 635 | "# of 'a' after dropping n/a: \n" 636 | ] 637 | }, 638 | { 639 | "data": { 640 | "text/plain": [ 641 | "0 1\n", 642 | "2 0\n", 643 | "dtype: int32" 644 | ] 645 | }, 646 | "execution_count": 16, 647 | "metadata": {}, 648 | "output_type": "execute_result" 649 | } 650 | ], 651 | "source": [ 652 | "cudfSeries2 = cudf.Series([\"a\", None, \"b\"], dtype=\"object\")\n", 653 | "print(\"# of 'a': \")\n", 654 | "print(cudfSeries2.str.count(\"a\"))\n", 655 | "print(\"\\n# of 'a' after dropping n/a: \")\n", 656 | "cudfSeries2.dropna().str.count(\"a\")" 657 | ] 658 | }, 659 | { 660 | "cell_type": "markdown", 661 | "metadata": {}, 662 | "source": [ 663 | "\n", 664 | "When NA values are present, the output dtype is float64. Similarly for methods returning boolean values." 
665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": 17, 670 | "metadata": {}, 671 | "outputs": [ 672 | { 673 | "name": "stdout", 674 | "output_type": "stream", 675 | "text": [ 676 | "Check if digit: \n", 677 | "0 False\n", 678 | "1 \n", 679 | "2 False\n", 680 | "dtype: boolean\n", 681 | "\n", 682 | "Match against 'a': \n" 683 | ] 684 | }, 685 | { 686 | "data": { 687 | "text/plain": [ 688 | "0 True\n", 689 | "1 \n", 690 | "2 False\n", 691 | "dtype: boolean" 692 | ] 693 | }, 694 | "execution_count": 17, 695 | "metadata": {}, 696 | "output_type": "execute_result" 697 | } 698 | ], 699 | "source": [ 700 | "print(\"Check if digit: \")\n", 701 | "print(pandasSeries.str.isdigit())\n", 702 | "print(\"\\nMatch against 'a': \")\n", 703 | "pandasSeries.str.match(\"a\")" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 18, 709 | "metadata": {}, 710 | "outputs": [ 711 | { 712 | "name": "stdout", 713 | "output_type": "stream", 714 | "text": [ 715 | "Check if digit: \n", 716 | "0 False\n", 717 | "1 \n", 718 | "2 False\n", 719 | "dtype: bool\n", 720 | "\n", 721 | "Match against 'a': \n" 722 | ] 723 | }, 724 | { 725 | "data": { 726 | "text/plain": [ 727 | "0 True\n", 728 | "1 \n", 729 | "2 False\n", 730 | "dtype: bool" 731 | ] 732 | }, 733 | "execution_count": 18, 734 | "metadata": {}, 735 | "output_type": "execute_result" 736 | } 737 | ], 738 | "source": [ 739 | "print(\"Check if digit: \")\n", 740 | "print(cudfSeries.str.isdigit())\n", 741 | "print(\"\\nMatch against 'a': \")\n", 742 | "cudfSeries.str.match(\"a\")" 743 | ] 744 | }, 745 | { 746 | "cell_type": "markdown", 747 | "metadata": {}, 748 | "source": [ 749 | "
\n", 750 | "\n", 751 | "Some string methods, like `Series.str.decode()` are not available on StringArray because StringArray only holds strings, not bytes." 752 | ] 753 | }, 754 | { 755 | "cell_type": "markdown", 756 | "metadata": {}, 757 | "source": [ 758 | "In comparison operations, `arrays.StringArray` and Series backed by a `StringArray` will return an object with `BooleanDtype`, rather than a bool dtype object. Missing values in a `StringArray` will propagate in comparison operations, rather than always comparing unequal like `numpy.nan`." 759 | ] 760 | }, 761 | { 762 | "cell_type": "markdown", 763 | "metadata": {}, 764 | "source": [ 765 | "Everything else that follows in the rest of this document applies equally to string and object dtype." 766 | ] 767 | }, 768 | { 769 | "cell_type": "markdown", 770 | "metadata": {}, 771 | "source": [ 772 | "\n", 773 | "## String methods" 774 | ] 775 | }, 776 | { 777 | "cell_type": "markdown", 778 | "metadata": {}, 779 | "source": [ 780 | "Series and Index are equipped with a set of string processing methods that make it easy to operate on each element of the array. Perhaps most importantly, these methods exclude missing/NA values automatically. These are accessed via the str attribute and generally have names matching the equivalent (scalar) built-in string methods:" 781 | ] 782 | }, 783 | { 784 | "cell_type": "code", 785 | "execution_count": 19, 786 | "metadata": {}, 787 | "outputs": [ 788 | { 789 | "name": "stdout", 790 | "output_type": "stream", 791 | "text": [ 792 | "Original: \n", 793 | "0 A\n", 794 | "1 B\n", 795 | "2 C\n", 796 | "3 Aaba\n", 797 | "4 Baca\n", 798 | "5 \n", 799 | "6 CABA\n", 800 | "7 dog\n", 801 | "8 cat\n", 802 | "dtype: string\n", 803 | "\n", 804 | "Lowered: \n", 805 | "0 a\n", 806 | "1 b\n", 807 | "2 c\n", 808 | "3 aaba\n", 809 | "4 baca\n", 810 | "5 \n", 811 | "6 caba\n", 812 | "7 dog\n", 813 | "8 cat\n", 814 | "dtype: string\n", 815 | "\n", 816 | "Check if Lowered: \n", 817 | "0 False\n", 818 | "1 False\n", 819 | "2 False\n", 820 | "3 False\n", 821 | "4 False\n", 822 | "5 \n", 823 | "6 False\n", 824 | "7 True\n", 825 | "8 True\n", 826 | "dtype: boolean\n", 827 | "\n", 828 | "Uppercase: \n", 829 | "0 A\n", 830 | "1 B\n", 831 | "2 C\n", 832 | "3 AABA\n", 833 | "4 BACA\n", 834 | "5 \n", 835 | "6 CABA\n", 836 | "7 DOG\n", 837 | "8 CAT\n", 838 | "dtype: string\n", 839 | "\n", 840 | "Check if Uppercase: \n", 841 | "0 True\n", 842 | "1 True\n", 843 | "2 True\n", 844 | "3 False\n", 845 | "4 False\n", 846 | "5 \n", 847 | "6 True\n", 848 | "7 False\n", 849 | "8 False\n", 850 | "dtype: boolean\n", 851 | "\n", 852 | "Determine Length: \n" 853 | ] 854 | }, 855 | { 856 | "data": { 857 | "text/plain": [ 858 | "0 1\n", 859 | "1 1\n", 860 | "2 1\n", 861 | "3 4\n", 862 | "4 4\n", 863 | "5 \n", 864 | "6 4\n", 865 | "7 3\n", 866 | "8 3\n", 867 | "dtype: Int64" 868 | ] 869 | }, 870 | "execution_count": 19, 871 | "metadata": {}, 872 | "output_type": "execute_result" 873 | } 874 | ], 875 | "source": [ 876 | "pandasSeries = pd.Series(\n", 877 | " ....: [\"A\", \"B\", \"C\", \"Aaba\", \"Baca\", np.nan, \"CABA\", \"dog\", \"cat\"], dtype=\"string\"\n", 878 | " ....: )\n", 879 | " ....: \n", 880 | "print('Original: ')\n", 881 | "print(pandasSeries)\n", 882 | "print('\\nLowered: ')\n", 883 | "print(pandasSeries.str.lower())\n", 884 | "print('\\nCheck if Lowered: ')\n", 885 | "print(pandasSeries.str.islower())\n", 886 | "print('\\nUppercase: ')\n", 887 | "print(pandasSeries.str.upper())\n", 888 | "print('\\nCheck if Uppercase: ')\n", 889 | 
"print(pandasSeries.str.isupper())\n", 890 | "print('\\nDetermine Length: ')\n", 891 | "pandasSeries.str.len()\n", 892 | "\n" 893 | ] 894 | }, 895 | { 896 | "cell_type": "code", 897 | "execution_count": 20, 898 | "metadata": {}, 899 | "outputs": [ 900 | { 901 | "name": "stdout", 902 | "output_type": "stream", 903 | "text": [ 904 | "Original: \n", 905 | "0 A\n", 906 | "1 B\n", 907 | "2 C\n", 908 | "3 Aaba\n", 909 | "4 Baca\n", 910 | "5 \n", 911 | "6 CABA\n", 912 | "7 dog\n", 913 | "8 cat\n", 914 | "dtype: object\n", 915 | "\n", 916 | "Lowered: \n", 917 | "0 a\n", 918 | "1 b\n", 919 | "2 c\n", 920 | "3 aaba\n", 921 | "4 baca\n", 922 | "5 \n", 923 | "6 caba\n", 924 | "7 dog\n", 925 | "8 cat\n", 926 | "dtype: object\n", 927 | "\n", 928 | "Check if Lowered: \n", 929 | "0 False\n", 930 | "1 False\n", 931 | "2 False\n", 932 | "3 False\n", 933 | "4 False\n", 934 | "5 \n", 935 | "6 False\n", 936 | "7 True\n", 937 | "8 True\n", 938 | "dtype: bool\n", 939 | "\n", 940 | "Uppercase: \n", 941 | "0 A\n", 942 | "1 B\n", 943 | "2 C\n", 944 | "3 AABA\n", 945 | "4 BACA\n", 946 | "5 \n", 947 | "6 CABA\n", 948 | "7 DOG\n", 949 | "8 CAT\n", 950 | "dtype: object\n", 951 | "\n", 952 | "Check if Uppercase: \n", 953 | "0 True\n", 954 | "1 True\n", 955 | "2 True\n", 956 | "3 False\n", 957 | "4 False\n", 958 | "5 \n", 959 | "6 True\n", 960 | "7 False\n", 961 | "8 False\n", 962 | "dtype: bool\n", 963 | "\n", 964 | "Determine Length: \n" 965 | ] 966 | }, 967 | { 968 | "data": { 969 | "text/plain": [ 970 | "0 1\n", 971 | "1 1\n", 972 | "2 1\n", 973 | "3 4\n", 974 | "4 4\n", 975 | "5 \n", 976 | "6 4\n", 977 | "7 3\n", 978 | "8 3\n", 979 | "dtype: int32" 980 | ] 981 | }, 982 | "execution_count": 20, 983 | "metadata": {}, 984 | "output_type": "execute_result" 985 | } 986 | ], 987 | "source": [ 988 | "cudfSeries = cudf.Series(\n", 989 | " ....: [\"A\", \"B\", \"C\", \"Aaba\", \"Baca\", np.nan, \"CABA\", \"dog\", \"cat\"], dtype=\"str\"\n", 990 | " ....: )\n", 991 | " ....: \n", 992 | "\n", 993 | "print('Original: ')\n", 994 | "print(cudfSeries)\n", 995 | "print('\\nLowered: ')\n", 996 | "print(cudfSeries.str.lower())\n", 997 | "print('\\nCheck if Lowered: ')\n", 998 | "print(cudfSeries.str.islower())\n", 999 | "print('\\nUppercase: ')\n", 1000 | "print(cudfSeries.str.upper())\n", 1001 | "print('\\nCheck if Uppercase: ')\n", 1002 | "print(cudfSeries.str.isupper())\n", 1003 | "print('\\nDetermine Length: ')\n", 1004 | "cudfSeries.str.len()\n" 1005 | ] 1006 | }, 1007 | { 1008 | "cell_type": "code", 1009 | "execution_count": 21, 1010 | "metadata": {}, 1011 | "outputs": [ 1012 | { 1013 | "name": "stdout", 1014 | "output_type": "stream", 1015 | "text": [ 1016 | "Right Strip: \n", 1017 | "Index([' jack', 'jill', ' jesse', 'frank'], dtype='object')\n", 1018 | "\n", 1019 | "Left Strip: \n" 1020 | ] 1021 | }, 1022 | { 1023 | "data": { 1024 | "text/plain": [ 1025 | "Index(['jack', 'jill ', 'jesse ', 'frank'], dtype='object')" 1026 | ] 1027 | }, 1028 | "execution_count": 21, 1029 | "metadata": {}, 1030 | "output_type": "execute_result" 1031 | } 1032 | ], 1033 | "source": [ 1034 | "pandasIdx = pd.Index([\" jack\", \"jill \", \" jesse \", \"frank\"])\n", 1035 | "\n", 1036 | "pandasIdx.str.strip()\n", 1037 | "\n", 1038 | "print('Right Strip: ')\n", 1039 | "print(pandasIdx.str.rstrip())\n", 1040 | "\n", 1041 | "print('\\nLeft Strip: ')\n", 1042 | "pandasIdx.str.lstrip()\n" 1043 | ] 1044 | }, 1045 | { 1046 | "cell_type": "code", 1047 | "execution_count": 22, 1048 | "metadata": {}, 1049 | "outputs": [ 1050 | { 1051 | "name": "stdout", 1052 | 
"output_type": "stream", 1053 | "text": [ 1054 | "Right Strip: \n", 1055 | "StringIndex([' jack' 'jill' ' jesse' 'frank'], dtype='object')\n", 1056 | "\n", 1057 | "Left Strip: \n" 1058 | ] 1059 | }, 1060 | { 1061 | "data": { 1062 | "text/plain": [ 1063 | "StringIndex(['jack' 'jill ' 'jesse ' 'frank'], dtype='object')" 1064 | ] 1065 | }, 1066 | "execution_count": 22, 1067 | "metadata": {}, 1068 | "output_type": "execute_result" 1069 | } 1070 | ], 1071 | "source": [ 1072 | "cudfIdx = cudf.Index([\" jack\", \"jill \", \" jesse \", \"frank\"])\n", 1073 | "\n", 1074 | "cudfIdx.str.strip()\n", 1075 | "\n", 1076 | "print('Right Strip: ')\n", 1077 | "print(cudfIdx.str.rstrip())\n", 1078 | "\n", 1079 | "print('\\nLeft Strip: ')\n", 1080 | "cudfIdx.str.lstrip()\n" 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "markdown", 1085 | "metadata": {}, 1086 | "source": [ 1087 | "\n", 1088 | "The string methods on Index are especially useful for cleaning up or transforming DataFrame columns. For instance, you may have columns with leading or trailing whitespace:" 1089 | ] 1090 | }, 1091 | { 1092 | "cell_type": "code", 1093 | "execution_count": 23, 1094 | "metadata": {}, 1095 | "outputs": [ 1096 | { 1097 | "data": { 1098 | "text/html": [ 1099 | "
\n", 1100 | "\n", 1113 | "\n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | "
Column AColumn B
01.766351-0.006574
1-0.034232-1.638306
2-0.836389-1.506215
\n", 1139 | "
" 1140 | ], 1141 | "text/plain": [ 1142 | " Column A Column B \n", 1143 | "0 1.766351 -0.006574\n", 1144 | "1 -0.034232 -1.638306\n", 1145 | "2 -0.836389 -1.506215" 1146 | ] 1147 | }, 1148 | "execution_count": 23, 1149 | "metadata": {}, 1150 | "output_type": "execute_result" 1151 | } 1152 | ], 1153 | "source": [ 1154 | "pandasDataFrame = pd.DataFrame(np.random.randn(3, 2), columns=[\" Column A \", \" Column B \"], index=range(3))\n", 1155 | " \n", 1156 | "pandasDataFrame" 1157 | ] 1158 | }, 1159 | { 1160 | "cell_type": "code", 1161 | "execution_count": 24, 1162 | "metadata": {}, 1163 | "outputs": [ 1164 | { 1165 | "data": { 1166 | "text/html": [ 1167 | "
\n", 1168 | "\n", 1181 | "\n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | "
Column AColumn B
00.234977-0.617927
1-1.8240230.061936
2-0.0661820.006777
\n", 1207 | "
" 1208 | ], 1209 | "text/plain": [ 1210 | " Column A Column B \n", 1211 | "0 0.234977 -0.617927\n", 1212 | "1 -1.824023 0.061936\n", 1213 | "2 -0.066182 0.006777" 1214 | ] 1215 | }, 1216 | "execution_count": 24, 1217 | "metadata": {}, 1218 | "output_type": "execute_result" 1219 | } 1220 | ], 1221 | "source": [ 1222 | "cudfDataFrame = cudf.DataFrame(np.random.randn(3, 2), columns=[\" Column A \", \" Column B \"], index=range(3))\n", 1223 | " \n", 1224 | "cudfDataFrame" 1225 | ] 1226 | }, 1227 | { 1228 | "cell_type": "markdown", 1229 | "metadata": {}, 1230 | "source": [ 1231 | "\n", 1232 | "Since df.columns is an Index object, we can use the .str accessor." 1233 | ] 1234 | }, 1235 | { 1236 | "cell_type": "code", 1237 | "execution_count": 25, 1238 | "metadata": {}, 1239 | "outputs": [ 1240 | { 1241 | "name": "stdout", 1242 | "output_type": "stream", 1243 | "text": [ 1244 | "Stripped: \n", 1245 | "Index(['Column A', 'Column B'], dtype='object')\n", 1246 | "\n", 1247 | "Lowered: \n" 1248 | ] 1249 | }, 1250 | { 1251 | "data": { 1252 | "text/plain": [ 1253 | "Index([' column a ', ' column b '], dtype='object')" 1254 | ] 1255 | }, 1256 | "execution_count": 25, 1257 | "metadata": {}, 1258 | "output_type": "execute_result" 1259 | } 1260 | ], 1261 | "source": [ 1262 | "print(\"Stripped: \")\n", 1263 | "print(pandasDataFrame.columns.str.strip())\n", 1264 | "print(\"\\nLowered: \")\n", 1265 | "pandasDataFrame.columns.str.lower()" 1266 | ] 1267 | }, 1268 | { 1269 | "cell_type": "code", 1270 | "execution_count": 26, 1271 | "metadata": {}, 1272 | "outputs": [ 1273 | { 1274 | "name": "stdout", 1275 | "output_type": "stream", 1276 | "text": [ 1277 | "Stripped: \n", 1278 | "Index(['Column A', 'Column B'], dtype='object')\n", 1279 | "\n", 1280 | "Lowered: \n" 1281 | ] 1282 | }, 1283 | { 1284 | "data": { 1285 | "text/plain": [ 1286 | "Index([' column a ', ' column b '], dtype='object')" 1287 | ] 1288 | }, 1289 | "execution_count": 26, 1290 | "metadata": {}, 1291 | "output_type": "execute_result" 1292 | } 1293 | ], 1294 | "source": [ 1295 | "print(\"Stripped: \")\n", 1296 | "print(cudfDataFrame.columns.str.strip())\n", 1297 | "print(\"\\nLowered: \")\n", 1298 | "cudfDataFrame.columns.str.lower()" 1299 | ] 1300 | }, 1301 | { 1302 | "cell_type": "markdown", 1303 | "metadata": {}, 1304 | "source": [ 1305 | "\n", 1306 | "These string methods can then be used to clean up the columns as needed. Here we are removing leading and trailing whitespaces, lower casing all names, and replacing any remaining whitespaces with underscores:" 1307 | ] 1308 | }, 1309 | { 1310 | "cell_type": "code", 1311 | "execution_count": 27, 1312 | "metadata": {}, 1313 | "outputs": [ 1314 | { 1315 | "data": { 1316 | "text/html": [ 1317 | "
\n", 1318 | "\n", 1331 | "\n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | "
column_acolumn_b
01.766351-0.006574
1-0.034232-1.638306
2-0.836389-1.506215
\n", 1357 | "
" 1358 | ], 1359 | "text/plain": [ 1360 | " column_a column_b\n", 1361 | "0 1.766351 -0.006574\n", 1362 | "1 -0.034232 -1.638306\n", 1363 | "2 -0.836389 -1.506215" 1364 | ] 1365 | }, 1366 | "execution_count": 27, 1367 | "metadata": {}, 1368 | "output_type": "execute_result" 1369 | } 1370 | ], 1371 | "source": [ 1372 | "pandasDataFrame.columns = pandasDataFrame.columns.str.strip().str.lower().str.replace(\" \", \"_\")\n", 1373 | "pandasDataFrame" 1374 | ] 1375 | }, 1376 | { 1377 | "cell_type": "code", 1378 | "execution_count": 28, 1379 | "metadata": {}, 1380 | "outputs": [ 1381 | { 1382 | "data": { 1383 | "text/html": [ 1384 | "
\n", 1385 | "\n", 1398 | "\n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | "
column_acolumn_b
00.234977-0.617927
1-1.8240230.061936
2-0.0661820.006777
\n", 1424 | "
" 1425 | ], 1426 | "text/plain": [ 1427 | " column_a column_b\n", 1428 | "0 0.234977 -0.617927\n", 1429 | "1 -1.824023 0.061936\n", 1430 | "2 -0.066182 0.006777" 1431 | ] 1432 | }, 1433 | "execution_count": 28, 1434 | "metadata": {}, 1435 | "output_type": "execute_result" 1436 | } 1437 | ], 1438 | "source": [ 1439 | "cudfDataFrame.columns = cudfDataFrame.columns.str.strip().str.lower().str.replace(\" \", \"_\")\n", 1440 | "cudfDataFrame" 1441 | ] 1442 | }, 1443 | { 1444 | "cell_type": "markdown", 1445 | "metadata": {}, 1446 | "source": [ 1447 | "## Splitting and replacing strings" 1448 | ] 1449 | }, 1450 | { 1451 | "cell_type": "markdown", 1452 | "metadata": {}, 1453 | "source": [ 1454 | "Methods like split return a Series of lists:" 1455 | ] 1456 | }, 1457 | { 1458 | "cell_type": "code", 1459 | "execution_count": 29, 1460 | "metadata": {}, 1461 | "outputs": [ 1462 | { 1463 | "data": { 1464 | "text/plain": [ 1465 | "0 [a, b, c]\n", 1466 | "1 [c, d, e]\n", 1467 | "2 \n", 1468 | "3 [f, g, h]\n", 1469 | "dtype: object" 1470 | ] 1471 | }, 1472 | "execution_count": 29, 1473 | "metadata": {}, 1474 | "output_type": "execute_result" 1475 | } 1476 | ], 1477 | "source": [ 1478 | "pandasSeries3 = pd.Series([\"a_b_c\", \"c_d_e\", np.nan, \"f_g_h\"], dtype=\"string\")\n", 1479 | "pandasSeries3.str.split(\"_\")" 1480 | ] 1481 | }, 1482 | { 1483 | "cell_type": "code", 1484 | "execution_count": 30, 1485 | "metadata": {}, 1486 | "outputs": [ 1487 | { 1488 | "data": { 1489 | "text/plain": [ 1490 | "0 [a, b, c]\n", 1491 | "1 [c, d, e]\n", 1492 | "2 None\n", 1493 | "3 [f, g, h]\n", 1494 | "dtype: list" 1495 | ] 1496 | }, 1497 | "execution_count": 30, 1498 | "metadata": {}, 1499 | "output_type": "execute_result" 1500 | } 1501 | ], 1502 | "source": [ 1503 | "cudfSeries3 = cudf.Series([\"a_b_c\", \"c_d_e\", np.nan, \"f_g_h\"], dtype=\"str\")\n", 1504 | "cudfSeries3.str.split(\"_\")" 1505 | ] 1506 | }, 1507 | { 1508 | "cell_type": "markdown", 1509 | "metadata": {}, 1510 | "source": [ 1511 | "\n", 1512 | "It is easy to expand this to return a DataFrame using expand." 1513 | ] 1514 | }, 1515 | { 1516 | "cell_type": "code", 1517 | "execution_count": 31, 1518 | "metadata": {}, 1519 | "outputs": [ 1520 | { 1521 | "data": { 1522 | "text/html": [ 1523 | "
\n", 1524 | "\n", 1537 | "\n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | "
012
0abc
1cde
2<NA><NA><NA>
3fgh
\n", 1573 | "
" 1574 | ], 1575 | "text/plain": [ 1576 | " 0 1 2\n", 1577 | "0 a b c\n", 1578 | "1 c d e\n", 1579 | "2 \n", 1580 | "3 f g h" 1581 | ] 1582 | }, 1583 | "execution_count": 31, 1584 | "metadata": {}, 1585 | "output_type": "execute_result" 1586 | } 1587 | ], 1588 | "source": [ 1589 | "pandasSeries3.str.split(\"_\", expand=True)" 1590 | ] 1591 | }, 1592 | { 1593 | "cell_type": "code", 1594 | "execution_count": 32, 1595 | "metadata": {}, 1596 | "outputs": [ 1597 | { 1598 | "data": { 1599 | "text/html": [ 1600 | "
\n", 1601 | "\n", 1614 | "\n", 1615 | " \n", 1616 | " \n", 1617 | " \n", 1618 | " \n", 1619 | " \n", 1620 | " \n", 1621 | " \n", 1622 | " \n", 1623 | " \n", 1624 | " \n", 1625 | " \n", 1626 | " \n", 1627 | " \n", 1628 | " \n", 1629 | " \n", 1630 | " \n", 1631 | " \n", 1632 | " \n", 1633 | " \n", 1634 | " \n", 1635 | " \n", 1636 | " \n", 1637 | " \n", 1638 | " \n", 1639 | " \n", 1640 | " \n", 1641 | " \n", 1642 | " \n", 1643 | " \n", 1644 | " \n", 1645 | " \n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | "
012
0abc
1cde
2<NA><NA><NA>
3fgh
\n", 1650 | "
" 1651 | ], 1652 | "text/plain": [ 1653 | " 0 1 2\n", 1654 | "0 a b c\n", 1655 | "1 c d e\n", 1656 | "2 \n", 1657 | "3 f g h" 1658 | ] 1659 | }, 1660 | "execution_count": 32, 1661 | "metadata": {}, 1662 | "output_type": "execute_result" 1663 | } 1664 | ], 1665 | "source": [ 1666 | "cudfSeries3.str.split(\"_\", expand=True)" 1667 | ] 1668 | }, 1669 | { 1670 | "cell_type": "markdown", 1671 | "metadata": {}, 1672 | "source": [ 1673 | "\n", 1674 | "When original Series has StringDtype, the output columns will all be StringDtype as well." 1675 | ] 1676 | }, 1677 | { 1678 | "cell_type": "markdown", 1679 | "metadata": {}, 1680 | "source": [ 1681 | "It is also possible to limit the number of splits:" 1682 | ] 1683 | }, 1684 | { 1685 | "cell_type": "code", 1686 | "execution_count": 33, 1687 | "metadata": {}, 1688 | "outputs": [ 1689 | { 1690 | "data": { 1691 | "text/html": [ 1692 | "
\n", 1693 | "\n", 1706 | "\n", 1707 | " \n", 1708 | " \n", 1709 | " \n", 1710 | " \n", 1711 | " \n", 1712 | " \n", 1713 | " \n", 1714 | " \n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | " \n", 1723 | " \n", 1724 | " \n", 1725 | " \n", 1726 | " \n", 1727 | " \n", 1728 | " \n", 1729 | " \n", 1730 | " \n", 1731 | " \n", 1732 | " \n", 1733 | " \n", 1734 | " \n", 1735 | " \n", 1736 | "
01
0ab_c
1cd_e
2<NA><NA>
3fg_h
\n", 1737 | "
" 1738 | ], 1739 | "text/plain": [ 1740 | " 0 1\n", 1741 | "0 a b_c\n", 1742 | "1 c d_e\n", 1743 | "2 \n", 1744 | "3 f g_h" 1745 | ] 1746 | }, 1747 | "execution_count": 33, 1748 | "metadata": {}, 1749 | "output_type": "execute_result" 1750 | } 1751 | ], 1752 | "source": [ 1753 | "pandasSeries3.str.split(\"_\", expand=True, n=1)" 1754 | ] 1755 | }, 1756 | { 1757 | "cell_type": "code", 1758 | "execution_count": 34, 1759 | "metadata": {}, 1760 | "outputs": [ 1761 | { 1762 | "data": { 1763 | "text/html": [ 1764 | "
\n", 1765 | "\n", 1778 | "\n", 1779 | " \n", 1780 | " \n", 1781 | " \n", 1782 | " \n", 1783 | " \n", 1784 | " \n", 1785 | " \n", 1786 | " \n", 1787 | " \n", 1788 | " \n", 1789 | " \n", 1790 | " \n", 1791 | " \n", 1792 | " \n", 1793 | " \n", 1794 | " \n", 1795 | " \n", 1796 | " \n", 1797 | " \n", 1798 | " \n", 1799 | " \n", 1800 | " \n", 1801 | " \n", 1802 | " \n", 1803 | " \n", 1804 | " \n", 1805 | " \n", 1806 | " \n", 1807 | " \n", 1808 | "
01
0ab_c
1cd_e
2<NA><NA>
3fg_h
\n", 1809 | "
" 1810 | ], 1811 | "text/plain": [ 1812 | " 0 1\n", 1813 | "0 a b_c\n", 1814 | "1 c d_e\n", 1815 | "2 \n", 1816 | "3 f g_h" 1817 | ] 1818 | }, 1819 | "execution_count": 34, 1820 | "metadata": {}, 1821 | "output_type": "execute_result" 1822 | } 1823 | ], 1824 | "source": [ 1825 | "cudfSeries3.str.split(\"_\", expand=True, n=1)" 1826 | ] 1827 | }, 1828 | { 1829 | "cell_type": "markdown", 1830 | "metadata": {}, 1831 | "source": [ 1832 | "\n", 1833 | "rsplit is similar to split except it works in the reverse direction, i.e., from the end of the string to the beginning of the string:" 1834 | ] 1835 | }, 1836 | { 1837 | "cell_type": "code", 1838 | "execution_count": 35, 1839 | "metadata": {}, 1840 | "outputs": [ 1841 | { 1842 | "data": { 1843 | "text/html": [ 1844 | "
\n", 1845 | "\n", 1858 | "\n", 1859 | " \n", 1860 | " \n", 1861 | " \n", 1862 | " \n", 1863 | " \n", 1864 | " \n", 1865 | " \n", 1866 | " \n", 1867 | " \n", 1868 | " \n", 1869 | " \n", 1870 | " \n", 1871 | " \n", 1872 | " \n", 1873 | " \n", 1874 | " \n", 1875 | " \n", 1876 | " \n", 1877 | " \n", 1878 | " \n", 1879 | " \n", 1880 | " \n", 1881 | " \n", 1882 | " \n", 1883 | " \n", 1884 | " \n", 1885 | " \n", 1886 | " \n", 1887 | " \n", 1888 | "
01
0a_bc
1c_de
2<NA><NA>
3f_gh
\n", 1889 | "
" 1890 | ], 1891 | "text/plain": [ 1892 | " 0 1\n", 1893 | "0 a_b c\n", 1894 | "1 c_d e\n", 1895 | "2 \n", 1896 | "3 f_g h" 1897 | ] 1898 | }, 1899 | "execution_count": 35, 1900 | "metadata": {}, 1901 | "output_type": "execute_result" 1902 | } 1903 | ], 1904 | "source": [ 1905 | "pandasSeries3.str.rsplit(\"_\", expand=True, n=1)" 1906 | ] 1907 | }, 1908 | { 1909 | "cell_type": "code", 1910 | "execution_count": 36, 1911 | "metadata": {}, 1912 | "outputs": [ 1913 | { 1914 | "data": { 1915 | "text/html": [ 1916 | "
\n", 1917 | "\n", 1930 | "\n", 1931 | " \n", 1932 | " \n", 1933 | " \n", 1934 | " \n", 1935 | " \n", 1936 | " \n", 1937 | " \n", 1938 | " \n", 1939 | " \n", 1940 | " \n", 1941 | " \n", 1942 | " \n", 1943 | " \n", 1944 | " \n", 1945 | " \n", 1946 | " \n", 1947 | " \n", 1948 | " \n", 1949 | " \n", 1950 | " \n", 1951 | " \n", 1952 | " \n", 1953 | " \n", 1954 | " \n", 1955 | " \n", 1956 | " \n", 1957 | " \n", 1958 | " \n", 1959 | " \n", 1960 | "
01
0a_bc
1c_de
2<NA><NA>
3f_gh
\n", 1961 | "
" 1962 | ], 1963 | "text/plain": [ 1964 | " 0 1\n", 1965 | "0 a_b c\n", 1966 | "1 c_d e\n", 1967 | "2 \n", 1968 | "3 f_g h" 1969 | ] 1970 | }, 1971 | "execution_count": 36, 1972 | "metadata": {}, 1973 | "output_type": "execute_result" 1974 | } 1975 | ], 1976 | "source": [ 1977 | "cudfSeries3.str.rsplit(\"_\", expand=True, n=1)" 1978 | ] 1979 | }, 1980 | { 1981 | "cell_type": "markdown", 1982 | "metadata": {}, 1983 | "source": [ 1984 | "## The replace method\n" 1985 | ] 1986 | }, 1987 | { 1988 | "cell_type": "markdown", 1989 | "metadata": {}, 1990 | "source": [ 1991 | "replace optionally uses regular expressions:" 1992 | ] 1993 | }, 1994 | { 1995 | "cell_type": "code", 1996 | "execution_count": 37, 1997 | "metadata": {}, 1998 | "outputs": [ 1999 | { 2000 | "name": "stdout", 2001 | "output_type": "stream", 2002 | "text": [ 2003 | "Original: \n", 2004 | "0 A\n", 2005 | "1 B\n", 2006 | "2 C\n", 2007 | "3 Aaba\n", 2008 | "4 Baca\n", 2009 | "5 \n", 2010 | "6 \n", 2011 | "7 CABA\n", 2012 | "8 dog\n", 2013 | "9 cat\n", 2014 | "dtype: string\n", 2015 | "\n", 2016 | "Replaced: \n" 2017 | ] 2018 | }, 2019 | { 2020 | "data": { 2021 | "text/plain": [ 2022 | "0 A\n", 2023 | "1 B\n", 2024 | "2 C\n", 2025 | "3 XX-XX ba\n", 2026 | "4 XX-XX ca\n", 2027 | "5 \n", 2028 | "6 \n", 2029 | "7 CABA\n", 2030 | "8 XX-XX \n", 2031 | "9 XX-XX t\n", 2032 | "dtype: string" 2033 | ] 2034 | }, 2035 | "execution_count": 37, 2036 | "metadata": {}, 2037 | "output_type": "execute_result" 2038 | } 2039 | ], 2040 | "source": [ 2041 | "pandasSeries4 = pd.Series(\n", 2042 | " ....: [\"A\", \"B\", \"C\", \"Aaba\", \"Baca\", \"\", np.nan, \"CABA\", \"dog\", \"cat\"],\n", 2043 | " ....: dtype=\"string\",\n", 2044 | " ....: )\n", 2045 | " ....: \n", 2046 | "print('Original: ')\n", 2047 | "print(pandasSeries4) \n", 2048 | "print('\\nReplaced: ')\n", 2049 | "pandasSeries4.str.replace(\"^.a|dog\", \"XX-XX \", regex=True)" 2050 | ] 2051 | }, 2052 | { 2053 | "cell_type": "code", 2054 | "execution_count": 38, 2055 | "metadata": {}, 2056 | "outputs": [ 2057 | { 2058 | "name": "stdout", 2059 | "output_type": "stream", 2060 | "text": [ 2061 | "Original: \n", 2062 | "0 A\n", 2063 | "1 B\n", 2064 | "2 C\n", 2065 | "3 Aaba\n", 2066 | "4 Baca\n", 2067 | "5 \n", 2068 | "6 \n", 2069 | "7 CABA\n", 2070 | "8 dog\n", 2071 | "9 cat\n", 2072 | "dtype: object\n", 2073 | "\n", 2074 | "Replaced: \n" 2075 | ] 2076 | }, 2077 | { 2078 | "data": { 2079 | "text/plain": [ 2080 | "0 A\n", 2081 | "1 B\n", 2082 | "2 C\n", 2083 | "3 XX-XX ba\n", 2084 | "4 XX-XX ca\n", 2085 | "5 \n", 2086 | "6 \n", 2087 | "7 CABA\n", 2088 | "8 XX-XX \n", 2089 | "9 XX-XX t\n", 2090 | "dtype: object" 2091 | ] 2092 | }, 2093 | "execution_count": 38, 2094 | "metadata": {}, 2095 | "output_type": "execute_result" 2096 | } 2097 | ], 2098 | "source": [ 2099 | "cudfSeries4 = cudf.Series(\n", 2100 | " ....: [\"A\", \"B\", \"C\", \"Aaba\", \"Baca\", \"\", np.nan, \"CABA\", \"dog\", \"cat\"],\n", 2101 | " ....: dtype=\"str\",\n", 2102 | " ....: )\n", 2103 | " ....: \n", 2104 | "print('Original: ')\n", 2105 | "print(cudfSeries4) \n", 2106 | "print('\\nReplaced: ')\n", 2107 | "cudfSeries4.str.replace(\"^.a|dog\", \"XX-XX \", regex=True)" 2108 | ] 2109 | }, 2110 | { 2111 | "cell_type": "markdown", 2112 | "metadata": {}, 2113 | "source": [ 2114 | "\n", 2115 | "If you want literal replacement of a string (equivalent to str.replace()), you can set the optional regex parameter to False, rather than escaping each character. 
In this case both pat and repl must be strings:" 2116 | ] 2117 | }, 2118 | { 2119 | "cell_type": "code", 2120 | "execution_count": 39, 2121 | "metadata": {}, 2122 | "outputs": [ 2123 | { 2124 | "name": "stdout", 2125 | "output_type": "stream", 2126 | "text": [ 2127 | "0 12\n", 2128 | "1 -10\n", 2129 | "2 $10,000\n", 2130 | "dtype: string\n", 2131 | "\n", 2132 | "Are these equivalent? \n", 2133 | "\n" 2134 | ] 2135 | }, 2136 | { 2137 | "data": { 2138 | "text/plain": [ 2139 | "0 12\n", 2140 | "1 -10\n", 2141 | "2 $10,000\n", 2142 | "dtype: string" 2143 | ] 2144 | }, 2145 | "execution_count": 39, 2146 | "metadata": {}, 2147 | "output_type": "execute_result" 2148 | } 2149 | ], 2150 | "source": [ 2151 | "pandasdollars = pd.Series([\"12\", \"-$10\", \"$10,000\"], dtype=\"string\")\n", 2152 | "\n", 2153 | "# These lines are equivalent\n", 2154 | "print(pandasdollars.str.replace(r\"-\\$\", \"-\", regex=True))\n", 2155 | "print(\"\\nAre these equivalent? \\n\")\n", 2156 | "pandasdollars.str.replace(\"-$\", \"-\", regex=False)" 2157 | ] 2158 | }, 2159 | { 2160 | "cell_type": "code", 2161 | "execution_count": 40, 2162 | "metadata": {}, 2163 | "outputs": [ 2164 | { 2165 | "name": "stdout", 2166 | "output_type": "stream", 2167 | "text": [ 2168 | "0 12\n", 2169 | "1 -10\n", 2170 | "2 $10,000\n", 2171 | "dtype: object\n", 2172 | "\n", 2173 | "Are these equivalent? \n", 2174 | "\n" 2175 | ] 2176 | }, 2177 | { 2178 | "data": { 2179 | "text/plain": [ 2180 | "0 12\n", 2181 | "1 -10\n", 2182 | "2 $10,000\n", 2183 | "dtype: object" 2184 | ] 2185 | }, 2186 | "execution_count": 40, 2187 | "metadata": {}, 2188 | "output_type": "execute_result" 2189 | } 2190 | ], 2191 | "source": [ 2192 | "cudfDollars = cudf.Series([\"12\", \"-$10\", \"$10,000\"], dtype=\"str\")\n", 2193 | "\n", 2194 | "# These lines are equivalent\n", 2195 | "print(cudfDollars.str.replace(r\"-\\$\", \"-\", regex=True))\n", 2196 | "print(\"\\nAre these equivalent? \\n\")\n", 2197 | "cudfDollars.str.replace(\"-$\", \"-\", regex=False)" 2198 | ] 2199 | }, 2200 | { 2201 | "cell_type": "markdown", 2202 | "metadata": {}, 2203 | "source": [ 2204 | "## Concatenation\n" 2205 | ] 2206 | }, 2207 | { 2208 | "cell_type": "markdown", 2209 | "metadata": {}, 2210 | "source": [ 2211 | "There are several ways to concatenate a Series or Index, either with itself or others, all based on cat(), resp. Index.str.cat." 
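,
"\n",
"\n",
"The cells below demonstrate the Series form. As a minimal sketch (reusing the `pd` import from earlier in this notebook), the Index form behaves the same way:\n",
"\n",
"```python\n",
"# Index.str.cat joins the elements of an Index into one string\n",
"idx = pd.Index([\"a\", \"b\", \"c\"])\n",
"idx.str.cat(sep=\"-\")  # 'a-b-c'\n",
"```"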
2212 | ] 2213 | }, 2214 | { 2215 | "cell_type": "markdown", 2216 | "metadata": {}, 2217 | "source": [ 2218 | "### Concatenating a single Series into a string" 2219 | ] 2220 | }, 2221 | { 2222 | "cell_type": "markdown", 2223 | "metadata": {}, 2224 | "source": [ 2225 | "The content of a Series (or Index) can be concatenated:" 2226 | ] 2227 | }, 2228 | { 2229 | "cell_type": "code", 2230 | "execution_count": 41, 2231 | "metadata": {}, 2232 | "outputs": [ 2233 | { 2234 | "data": { 2235 | "text/plain": [ 2236 | "'a,b,c,d'" 2237 | ] 2238 | }, 2239 | "execution_count": 41, 2240 | "metadata": {}, 2241 | "output_type": "execute_result" 2242 | } 2243 | ], 2244 | "source": [ 2245 | "pandasSeries = pd.Series([\"a\", \"b\", \"c\", \"d\"], dtype=\"string\")\n", 2246 | "\n", 2247 | "pandasSeries.str.cat(sep=\",\")" 2248 | ] 2249 | }, 2250 | { 2251 | "cell_type": "code", 2252 | "execution_count": 42, 2253 | "metadata": {}, 2254 | "outputs": [ 2255 | { 2256 | "data": { 2257 | "text/plain": [ 2258 | "'a,b,c,d'" 2259 | ] 2260 | }, 2261 | "execution_count": 42, 2262 | "metadata": {}, 2263 | "output_type": "execute_result" 2264 | } 2265 | ], 2266 | "source": [ 2267 | "cudfSeries = cudf.Series([\"a\", \"b\", \"c\", \"d\"], dtype=\"str\")\n", 2268 | "\n", 2269 | "cudfSeries.str.cat(sep=\",\")" 2270 | ] 2271 | }, 2272 | { 2273 | "cell_type": "markdown", 2274 | "metadata": {}, 2275 | "source": [ 2276 | "\n", 2277 | "If not specified, the keyword sep for the separator defaults to the empty string, sep='':" 2278 | ] 2279 | }, 2280 | { 2281 | "cell_type": "code", 2282 | "execution_count": 43, 2283 | "metadata": {}, 2284 | "outputs": [ 2285 | { 2286 | "data": { 2287 | "text/plain": [ 2288 | "'abcd'" 2289 | ] 2290 | }, 2291 | "execution_count": 43, 2292 | "metadata": {}, 2293 | "output_type": "execute_result" 2294 | } 2295 | ], 2296 | "source": [ 2297 | "pandasSeries.str.cat()" 2298 | ] 2299 | }, 2300 | { 2301 | "cell_type": "code", 2302 | "execution_count": 44, 2303 | "metadata": {}, 2304 | "outputs": [ 2305 | { 2306 | "data": { 2307 | "text/plain": [ 2308 | "'abcd'" 2309 | ] 2310 | }, 2311 | "execution_count": 44, 2312 | "metadata": {}, 2313 | "output_type": "execute_result" 2314 | } 2315 | ], 2316 | "source": [ 2317 | "cudfSeries.str.cat()" 2318 | ] 2319 | }, 2320 | { 2321 | "cell_type": "markdown", 2322 | "metadata": {}, 2323 | "source": [ 2324 | "\n", 2325 | "By default, missing values are ignored. 
Using na_rep, they can be given a representation:" 2326 | ] 2327 | }, 2328 | { 2329 | "cell_type": "code", 2330 | "execution_count": 45, 2331 | "metadata": {}, 2332 | "outputs": [ 2333 | { 2334 | "name": "stdout", 2335 | "output_type": "stream", 2336 | "text": [ 2337 | "Separated by ,: \n", 2338 | "a,b,d\n", 2339 | "\n", 2340 | "Separated by , & -: \n" 2341 | ] 2342 | }, 2343 | { 2344 | "data": { 2345 | "text/plain": [ 2346 | "'a,b,-,d'" 2347 | ] 2348 | }, 2349 | "execution_count": 45, 2350 | "metadata": {}, 2351 | "output_type": "execute_result" 2352 | } 2353 | ], 2354 | "source": [ 2355 | "pandasSeriesB = pd.Series([\"a\", \"b\", np.nan, \"d\"], dtype=\"string\")\n", 2356 | "print('Separated by ,: ')\n", 2357 | "print(pandasSeriesB.str.cat(sep=\",\"))\n", 2358 | "print('\\nSeparated by , & -: ')\n", 2359 | "pandasSeriesB.str.cat(sep=\",\", na_rep=\"-\")" 2360 | ] 2361 | }, 2362 | { 2363 | "cell_type": "code", 2364 | "execution_count": 46, 2365 | "metadata": {}, 2366 | "outputs": [ 2367 | { 2368 | "name": "stdout", 2369 | "output_type": "stream", 2370 | "text": [ 2371 | "Separated by ,: \n", 2372 | "a,b,d\n", 2373 | "\n", 2374 | "Separated by , & -: \n" 2375 | ] 2376 | }, 2377 | { 2378 | "data": { 2379 | "text/plain": [ 2380 | "'a,b,-,d'" 2381 | ] 2382 | }, 2383 | "execution_count": 46, 2384 | "metadata": {}, 2385 | "output_type": "execute_result" 2386 | } 2387 | ], 2388 | "source": [ 2389 | "cudfSeriesB = cudf.Series([\"a\", \"b\", np.nan, \"d\"], dtype=\"str\")\n", 2390 | "print('Separated by ,: ')\n", 2391 | "print(cudfSeriesB.str.cat(sep=\",\"))\n", 2392 | "print('\\nSeparated by , & -: ')\n", 2393 | "cudfSeriesB.str.cat(sep=\",\", na_rep=\"-\")" 2394 | ] 2395 | }, 2396 | { 2397 | "cell_type": "markdown", 2398 | "metadata": {}, 2399 | "source": [ 2400 | "## Concatenating a Series and something list-like into a Series" 2401 | ] 2402 | }, 2403 | { 2404 | "cell_type": "markdown", 2405 | "metadata": {}, 2406 | "source": [ 2407 | "The first argument to cat() can be a list-like object, provided that it matches the length of the calling Series (or Index)."
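,
"\n",
"\n",
"If the lengths do not match, pandas refuses to broadcast and raises an error instead; a hedged sketch (the exact message can vary across pandas versions):\n",
"\n",
"```python\n",
"try:\n",
"    pandasSeries.str.cat([\"A\", \"B\"])  # 2 elements vs. a Series of length 4\n",
"except ValueError as err:\n",
"    print(err)\n",
"```"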
2408 | ] 2409 | }, 2410 | { 2411 | "cell_type": "code", 2412 | "execution_count": 47, 2413 | "metadata": {}, 2414 | "outputs": [ 2415 | { 2416 | "data": { 2417 | "text/plain": [ 2418 | "0 aA\n", 2419 | "1 bB\n", 2420 | "2 cC\n", 2421 | "3 dD\n", 2422 | "dtype: string" 2423 | ] 2424 | }, 2425 | "execution_count": 47, 2426 | "metadata": {}, 2427 | "output_type": "execute_result" 2428 | } 2429 | ], 2430 | "source": [ 2431 | "pandasSeries.str.cat([\"A\", \"B\", \"C\", \"D\"])" 2432 | ] 2433 | }, 2434 | { 2435 | "cell_type": "code", 2436 | "execution_count": 48, 2437 | "metadata": {}, 2438 | "outputs": [ 2439 | { 2440 | "data": { 2441 | "text/plain": [ 2442 | "0 aA\n", 2443 | "1 bB\n", 2444 | "2 cC\n", 2445 | "3 dD\n", 2446 | "dtype: object" 2447 | ] 2448 | }, 2449 | "execution_count": 48, 2450 | "metadata": {}, 2451 | "output_type": "execute_result" 2452 | } 2453 | ], 2454 | "source": [ 2455 | "cudfSeries.str.cat([\"A\", \"B\", \"C\", \"D\"])" 2456 | ] 2457 | }, 2458 | { 2459 | "cell_type": "markdown", 2460 | "metadata": {}, 2461 | "source": [ 2462 | "Missing values on either side will result in missing values in the result as well, unless `na_rep` is specified:" 2463 | ] 2464 | }, 2465 | { 2466 | "cell_type": "code", 2467 | "execution_count": 49, 2468 | "metadata": {}, 2469 | "outputs": [ 2470 | { 2471 | "name": "stdout", 2472 | "output_type": "stream", 2473 | "text": [ 2474 | "Original: \n", 2475 | "0 aa\n", 2476 | "1 bb\n", 2477 | "2 \n", 2478 | "3 dd\n", 2479 | "dtype: string\n", 2480 | "\n", 2481 | "na_rep is specified\n" 2482 | ] 2483 | }, 2484 | { 2485 | "data": { 2486 | "text/plain": [ 2487 | "0 aa\n", 2488 | "1 bb\n", 2489 | "2 c-\n", 2490 | "3 dd\n", 2491 | "dtype: string" 2492 | ] 2493 | }, 2494 | "execution_count": 49, 2495 | "metadata": {}, 2496 | "output_type": "execute_result" 2497 | } 2498 | ], 2499 | "source": [ 2500 | "print('Original: ')\n", 2501 | "print(pandasSeries.str.cat(pandasSeriesB))\n", 2502 | "print('\\nna_rep is specified')\n", 2503 | "pandasSeries.str.cat(pandasSeriesB, na_rep=\"-\")" 2504 | ] 2505 | }, 2506 | { 2507 | "cell_type": "code", 2508 | "execution_count": 50, 2509 | "metadata": {}, 2510 | "outputs": [ 2511 | { 2512 | "name": "stdout", 2513 | "output_type": "stream", 2514 | "text": [ 2515 | "Original: \n", 2516 | "0 aa\n", 2517 | "1 bb\n", 2518 | "2 \n", 2519 | "3 dd\n", 2520 | "dtype: object\n", 2521 | "\n", 2522 | "na_rep is specified\n" 2523 | ] 2524 | }, 2525 | { 2526 | "data": { 2527 | "text/plain": [ 2528 | "0 aa\n", 2529 | "1 bb\n", 2530 | "2 c-\n", 2531 | "3 dd\n", 2532 | "dtype: object" 2533 | ] 2534 | }, 2535 | "execution_count": 50, 2536 | "metadata": {}, 2537 | "output_type": "execute_result" 2538 | } 2539 | ], 2540 | "source": [ 2541 | "print('Original: ')\n", 2542 | "print(cudfSeries.str.cat(cudfSeriesB))\n", 2543 | "print('\\nna_rep is specified')\n", 2544 | "cudfSeries.str.cat(cudfSeriesB, na_rep=\"-\")" 2545 | ] 2546 | }, 2547 | { 2548 | "cell_type": "markdown", 2549 | "metadata": {}, 2550 | "source": [ 2551 | "\n", 2552 | "## Concatenating a Series and something array-like into a Series" 2553 | ] 2554 | }, 2555 | { 2556 | "cell_type": "markdown", 2557 | "metadata": {}, 2558 | "source": [ 2559 | "The parameter `others` can also be two-dimensional. In this case, the number of rows must match the length of the calling Series (or Index)."
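,
"\n",
"\n",
"The cuDF cell below builds the same result by chaining `.str.cat` one column at a time. As a hedged sketch of why that is equivalent (shown on the pandas side), passing the whole DataFrame at once matches the chained, column-by-column spelling:\n",
"\n",
"```python\n",
"frame = pd.concat([pandasSeriesB, pandasSeries], axis=1)  # columns 0 and 1\n",
"chained = pandasSeries.str.cat(frame[0], na_rep=\"-\").str.cat(frame[1], na_rep=\"-\")\n",
"chained.equals(pandasSeries.str.cat(frame, na_rep=\"-\"))  # expected: True\n",
"```"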
2560 | ] 2561 | }, 2562 | { 2563 | "cell_type": "code", 2564 | "execution_count": 51, 2565 | "metadata": {}, 2566 | "outputs": [ 2567 | { 2568 | "name": "stdout", 2569 | "output_type": "stream", 2570 | "text": [ 2571 | "Original: \n", 2572 | "0 a\n", 2573 | "1 b\n", 2574 | "2 c\n", 2575 | "3 d\n", 2576 | "dtype: string\n", 2577 | "\n", 2578 | "Concatenating a Series and something array-like\n", 2579 | " 0 1\n", 2580 | "0 a a\n", 2581 | "1 b b\n", 2582 | "2 c\n", 2583 | "3 d d\n" 2584 | ] 2585 | }, 2586 | { 2587 | "data": { 2588 | "text/plain": [ 2589 | "0 aaa\n", 2590 | "1 bbb\n", 2591 | "2 c-c\n", 2592 | "3 ddd\n", 2593 | "dtype: string" 2594 | ] 2595 | }, 2596 | "execution_count": 51, 2597 | "metadata": {}, 2598 | "output_type": "execute_result" 2599 | } 2600 | ], 2601 | "source": [ 2602 | "pandasArray = pd.concat([pandasSeriesB, pandasSeries], axis=1)\n", 2603 | "print('Original: ')\n", 2604 | "print(pandasSeries)\n", 2605 | "print('\\nConcatenating a Series and something array-like')\n", 2606 | "print(pandasArray)\n", 2607 | "pandasSeries.str.cat(pandasArray, na_rep=\"-\")\n" 2608 | ] 2609 | }, 2610 | { 2611 | "cell_type": "code", 2612 | "execution_count": 52, 2613 | "metadata": {}, 2614 | "outputs": [ 2615 | { 2616 | "name": "stdout", 2617 | "output_type": "stream", 2618 | "text": [ 2619 | "Original: \n", 2620 | "0 a\n", 2621 | "1 b\n", 2622 | "2 c\n", 2623 | "3 d\n", 2624 | "dtype: object\n", 2625 | "\n", 2626 | "Concatenating a Series and something array-like\n", 2627 | " 0 1\n", 2628 | "0 a a\n", 2629 | "1 b b\n", 2630 | "2 c\n", 2631 | "3 d d\n" 2632 | ] 2633 | }, 2634 | { 2635 | "data": { 2636 | "text/plain": [ 2637 | "0 aaa\n", 2638 | "1 bbb\n", 2639 | "2 c-c\n", 2640 | "3 ddd\n", 2641 | "Name: 1, dtype: object" 2642 | ] 2643 | }, 2644 | "execution_count": 52, 2645 | "metadata": {}, 2646 | "output_type": "execute_result" 2647 | } 2648 | ], 2649 | "source": [ 2650 | "cudfArray = cudf.concat([cudfSeriesB, cudfSeries], axis=1)\n", 2651 | "print('Original: ')\n", 2652 | "print(cudfSeries)\n", 2653 | "print('\\nConcatenating a Series and something array-like')\n", 2654 | "print(cudfArray)\n", 2655 | "cudfArray[1].str.cat(cudfArray[0], na_rep=\"-\").str.cat(cudfSeries, na_rep=\"-\")" 2656 | ] 2657 | }, 2658 | { 2659 | "cell_type": "markdown", 2660 | "metadata": {}, 2661 | "source": [ 2662 | "\n", 2663 | "## Indexing with .str" 2664 | ] 2665 | }, 2666 | { 2667 | "cell_type": "markdown", 2668 | "metadata": {}, 2669 | "source": [ 2670 | "You can use `[]` notation to directly index by position locations. If you index past the end of the string, the result will be a NaN." 
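,
"\n",
"\n",
"`.str` also accepts slices; a minimal sketch (using `str.slice`, which both libraries provide):\n",
"\n",
"```python\n",
"s = pd.Series([\"Aaba\", \"dog\"], dtype=\"string\")\n",
"s.str[0:2]         # 'Aa', 'do'\n",
"s.str.slice(0, 2)  # same result\n",
"```"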
2671 | ] 2672 | }, 2673 | { 2674 | "cell_type": "code", 2675 | "execution_count": 53, 2676 | "metadata": {}, 2677 | "outputs": [ 2678 | { 2679 | "name": "stdout", 2680 | "output_type": "stream", 2681 | "text": [ 2682 | "Indexed at position 0: \n", 2683 | "0 A\n", 2684 | "1 B\n", 2685 | "2 C\n", 2686 | "3 A\n", 2687 | "4 B\n", 2688 | "5 \n", 2689 | "6 C\n", 2690 | "7 d\n", 2691 | "8 c\n", 2692 | "dtype: string\n", 2693 | "\n", 2694 | "Indexed at position 1: \n" 2695 | ] 2696 | }, 2697 | { 2698 | "data": { 2699 | "text/plain": [ 2700 | "0 \n", 2701 | "1 \n", 2702 | "2 \n", 2703 | "3 a\n", 2704 | "4 a\n", 2705 | "5 \n", 2706 | "6 A\n", 2707 | "7 o\n", 2708 | "8 a\n", 2709 | "dtype: string" 2710 | ] 2711 | }, 2712 | "execution_count": 53, 2713 | "metadata": {}, 2714 | "output_type": "execute_result" 2715 | } 2716 | ], 2717 | "source": [ 2718 | "pandasSeries = pd.Series([\"A\", \"B\", \"C\", \"Aaba\", \"Baca\", np.nan, \"CABA\", \"dog\", \"cat\"], dtype=\"string\")\n", 2719 | " \n", 2720 | "print('Indexed at position 0: ')\n", 2721 | "print(pandasSeries.str[0])\n", 2722 | "print('\\nIndexed at position 1: ')\n", 2723 | "pandasSeries.str[1]" 2724 | ] 2725 | }, 2726 | { 2727 | "cell_type": "code", 2728 | "execution_count": 54, 2729 | "metadata": {}, 2730 | "outputs": [ 2731 | { 2732 | "name": "stdout", 2733 | "output_type": "stream", 2734 | "text": [ 2735 | "Indexed at position 0: \n", 2736 | "0 A\n", 2737 | "1 B\n", 2738 | "2 C\n", 2739 | "3 A\n", 2740 | "4 B\n", 2741 | "5 \n", 2742 | "6 C\n", 2743 | "7 d\n", 2744 | "8 c\n", 2745 | "dtype: object\n", 2746 | "\n", 2747 | "Indexed at position 1: \n" 2748 | ] 2749 | }, 2750 | { 2751 | "data": { 2752 | "text/plain": [ 2753 | "0 \n", 2754 | "1 \n", 2755 | "2 \n", 2756 | "3 a\n", 2757 | "4 a\n", 2758 | "5 \n", 2759 | "6 A\n", 2760 | "7 o\n", 2761 | "8 a\n", 2762 | "dtype: object" 2763 | ] 2764 | }, 2765 | "execution_count": 54, 2766 | "metadata": {}, 2767 | "output_type": "execute_result" 2768 | } 2769 | ], 2770 | "source": [ 2771 | "cudfSeries = cudf.Series([\"A\", \"B\", \"C\", \"Aaba\", \"Baca\", np.nan, \"CABA\", \"dog\", \"cat\"], dtype=\"str\")\n", 2772 | " \n", 2773 | "print('Indexed at position 0: ')\n", 2774 | "print(cudfSeries.str[0])\n", 2775 | "print('\\nIndexed at position 1: ')\n", 2776 | "cudfSeries.str[1]" 2777 | ] 2778 | }, 2779 | { 2780 | "cell_type": "markdown", 2781 | "metadata": {}, 2782 | "source": [ 2783 | "\n", 2784 | "## Extracting substrings" 2785 | ] 2786 | }, 2787 | { 2788 | "cell_type": "markdown", 2789 | "metadata": {}, 2790 | "source": [ 2791 | "Extract first match in each subject (`extract`)." 
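,
"\n",
"\n",
"Each capture group becomes one output column. As a hedged sketch on the pandas side (cuDF's handling of named groups may differ), named groups turn into column names:\n",
"\n",
"```python\n",
"pd.Series([\"a1\", \"b2\", \"c3\"], dtype=\"string\").str.extract(r\"(?P<letter>[ab])(?P<digit>\\\\d)\")\n",
"#   letter digit\n",
"# 0      a     1\n",
"# 1      b     2\n",
"# 2   <NA>  <NA>\n",
"```"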
2792 | ] 2793 | }, 2794 | { 2795 | "cell_type": "code", 2796 | "execution_count": 55, 2797 | "metadata": {}, 2798 | "outputs": [ 2799 | { 2800 | "name": "stdout", 2801 | "output_type": "stream", 2802 | "text": [ 2803 | " 0 1\n", 2804 | "0 a 1\n", 2805 | "1 b 2\n", 2806 | "2 \n" 2807 | ] 2808 | } 2809 | ], 2810 | "source": [ 2811 | "pdSeries = pd.Series([\"a1\", \"b2\", \"c3\"],dtype=\"string\",).str.extract(r\"([ab])(\\d)\", )\n", 2812 | "print(pdSeries)" 2813 | ] 2814 | }, 2815 | { 2816 | "cell_type": "code", 2817 | "execution_count": 56, 2818 | "metadata": {}, 2819 | "outputs": [ 2820 | { 2821 | "name": "stdout", 2822 | "output_type": "stream", 2823 | "text": [ 2824 | " 0 1\n", 2825 | "0 a 1\n", 2826 | "1 b 2\n", 2827 | "2 \n" 2828 | ] 2829 | } 2830 | ], 2831 | "source": [ 2832 | "cudfSeries = cudf.Series(['a1', 'b2', 'c3']).str.extract(r'([ab])(\\d)')\n", 2833 | "print(cudfSeries) " 2834 | ] 2835 | }, 2836 | { 2837 | "cell_type": "markdown", 2838 | "metadata": {}, 2839 | "source": [ 2840 | "\n", 2841 | "Extracting a regular expression with one group returns a DataFrame with one column if `expand=True`." 2842 | ] 2843 | }, 2844 | { 2845 | "cell_type": "code", 2846 | "execution_count": 57, 2847 | "metadata": {}, 2848 | "outputs": [ 2849 | { 2850 | "data": { 2851 | "text/html": [ 2852 | "
\n", 2853 | "\n", 2866 | "\n", 2867 | " \n", 2868 | " \n", 2869 | " \n", 2870 | " \n", 2871 | " \n", 2872 | " \n", 2873 | " \n", 2874 | " \n", 2875 | " \n", 2876 | " \n", 2877 | " \n", 2878 | " \n", 2879 | " \n", 2880 | " \n", 2881 | " \n", 2882 | " \n", 2883 | " \n", 2884 | " \n", 2885 | " \n", 2886 | " \n", 2887 | "
0
01
12
2<NA>
\n", 2888 | "
" 2889 | ], 2890 | "text/plain": [ 2891 | " 0\n", 2892 | "0 1\n", 2893 | "1 2\n", 2894 | "2 " 2895 | ] 2896 | }, 2897 | "execution_count": 57, 2898 | "metadata": {}, 2899 | "output_type": "execute_result" 2900 | } 2901 | ], 2902 | "source": [ 2903 | "pd.Series([\"a1\", \"b2\", \"c3\"], dtype=\"string\").str.extract(r\"[ab](\\d)\", expand=True)" 2904 | ] 2905 | }, 2906 | { 2907 | "cell_type": "code", 2908 | "execution_count": 58, 2909 | "metadata": {}, 2910 | "outputs": [ 2911 | { 2912 | "data": { 2913 | "text/html": [ 2914 | "
\n", 2915 | "\n", 2928 | "\n", 2929 | " \n", 2930 | " \n", 2931 | " \n", 2932 | " \n", 2933 | " \n", 2934 | " \n", 2935 | " \n", 2936 | " \n", 2937 | " \n", 2938 | " \n", 2939 | " \n", 2940 | " \n", 2941 | " \n", 2942 | " \n", 2943 | " \n", 2944 | " \n", 2945 | " \n", 2946 | " \n", 2947 | " \n", 2948 | " \n", 2949 | "
0
01
12
2<NA>
\n", 2950 | "
" 2951 | ], 2952 | "text/plain": [ 2953 | " 0\n", 2954 | "0 1\n", 2955 | "1 2\n", 2956 | "2 " 2957 | ] 2958 | }, 2959 | "execution_count": 58, 2960 | "metadata": {}, 2961 | "output_type": "execute_result" 2962 | } 2963 | ], 2964 | "source": [ 2965 | "cudf.Series([\"a1\", \"b2\", \"c3\"], dtype=\"str\").str.extract(r\"[ab](\\d)\", expand=True)" 2966 | ] 2967 | }, 2968 | { 2969 | "cell_type": "markdown", 2970 | "metadata": {}, 2971 | "source": [ 2972 | "It returns a Series if `expand=False`." 2973 | ] 2974 | }, 2975 | { 2976 | "cell_type": "code", 2977 | "execution_count": 59, 2978 | "metadata": {}, 2979 | "outputs": [ 2980 | { 2981 | "data": { 2982 | "text/plain": [ 2983 | "0 1\n", 2984 | "1 2\n", 2985 | "2 \n", 2986 | "dtype: string" 2987 | ] 2988 | }, 2989 | "execution_count": 59, 2990 | "metadata": {}, 2991 | "output_type": "execute_result" 2992 | } 2993 | ], 2994 | "source": [ 2995 | "pd.Series([\"a1\", \"b2\", \"c3\"], dtype=\"string\").str.extract(r\"[ab](\\d)\", expand=False)" 2996 | ] 2997 | }, 2998 | { 2999 | "cell_type": "code", 3000 | "execution_count": 60, 3001 | "metadata": {}, 3002 | "outputs": [ 3003 | { 3004 | "data": { 3005 | "text/plain": [ 3006 | "0 1\n", 3007 | "1 2\n", 3008 | "2 \n", 3009 | "dtype: object" 3010 | ] 3011 | }, 3012 | "execution_count": 60, 3013 | "metadata": {}, 3014 | "output_type": "execute_result" 3015 | } 3016 | ], 3017 | "source": [ 3018 | "cudf.Series([\"a1\", \"b2\", \"c3\"], dtype=\"str\").str.extract(r\"[ab](\\d)\", expand=False)" 3019 | ] 3020 | }, 3021 | { 3022 | "cell_type": "markdown", 3023 | "metadata": {}, 3024 | "source": [ 3025 | "\n", 3026 | "When each subject string in the Series has exactly one match." 3027 | ] 3028 | }, 3029 | { 3030 | "cell_type": "code", 3031 | "execution_count": 61, 3032 | "metadata": {}, 3033 | "outputs": [ 3034 | { 3035 | "name": "stdout", 3036 | "output_type": "stream", 3037 | "text": [ 3038 | "0 a3\n", 3039 | "1 b3\n", 3040 | "2 c2\n", 3041 | "dtype: string\n" 3042 | ] 3043 | } 3044 | ], 3045 | "source": [ 3046 | "pandasSeries = pd.Series([\"a3\", \"b3\", \"c2\"], dtype=\"string\")\n", 3047 | "print(pandasSeries)" 3048 | ] 3049 | }, 3050 | { 3051 | "cell_type": "code", 3052 | "execution_count": 62, 3053 | "metadata": {}, 3054 | "outputs": [ 3055 | { 3056 | "name": "stdout", 3057 | "output_type": "stream", 3058 | "text": [ 3059 | "0 a3\n", 3060 | "1 b3\n", 3061 | "2 c2\n", 3062 | "dtype: object\n" 3063 | ] 3064 | } 3065 | ], 3066 | "source": [ 3067 | "cudfSeries = cudf.Series([\"a3\", \"b3\", \"c2\"], dtype=\"str\")\n", 3068 | "print(cudfSeries)" 3069 | ] 3070 | }, 3071 | { 3072 | "cell_type": "markdown", 3073 | "metadata": {}, 3074 | "source": [ 3075 | "\n", 3076 | "## Testing for strings that match or contain a pattern" 3077 | ] 3078 | }, 3079 | { 3080 | "cell_type": "markdown", 3081 | "metadata": {}, 3082 | "source": [ 3083 | "You can check whether elements contain a pattern:" 3084 | ] 3085 | }, 3086 | { 3087 | "cell_type": "code", 3088 | "execution_count": 63, 3089 | "metadata": {}, 3090 | "outputs": [ 3091 | { 3092 | "data": { 3093 | "text/plain": [ 3094 | "0 False\n", 3095 | "1 False\n", 3096 | "2 True\n", 3097 | "3 True\n", 3098 | "4 True\n", 3099 | "5 True\n", 3100 | "dtype: bool" 3101 | ] 3102 | }, 3103 | "execution_count": 63, 3104 | "metadata": {}, 3105 | "output_type": "execute_result" 3106 | } 3107 | ], 3108 | "source": [ 3109 | "pattern = r\"[0-9][a-z]\"\n", 3110 | "\n", 3111 | "pd.Series([\"1\", \"2\", \"3a\", \"3b\", \"03c\", \"4dx\"],dtype=\"str\",\n", 3112 | " ).str.contains(pattern)\n", 3113 | 
" " 3114 | ] 3115 | }, 3116 | { 3117 | "cell_type": "code", 3118 | "execution_count": 64, 3119 | "metadata": {}, 3120 | "outputs": [ 3121 | { 3122 | "data": { 3123 | "text/plain": [ 3124 | "0 False\n", 3125 | "1 False\n", 3126 | "2 True\n", 3127 | "3 True\n", 3128 | "4 True\n", 3129 | "5 True\n", 3130 | "dtype: bool" 3131 | ] 3132 | }, 3133 | "execution_count": 64, 3134 | "metadata": {}, 3135 | "output_type": "execute_result" 3136 | } 3137 | ], 3138 | "source": [ 3139 | "pattern = r\"[0-9][a-z]\"\n", 3140 | "\n", 3141 | "cudf.Series([\"1\", \"2\", \"3a\", \"3b\", \"03c\", \"4dx\"],dtype=\"str\",\n", 3142 | " ).str.contains(pattern)\n", 3143 | " " 3144 | ] 3145 | }, 3146 | { 3147 | "cell_type": "markdown", 3148 | "metadata": {}, 3149 | "source": [ 3150 | "\n", 3151 | "Or whether elements match a pattern:" 3152 | ] 3153 | }, 3154 | { 3155 | "cell_type": "code", 3156 | "execution_count": 65, 3157 | "metadata": {}, 3158 | "outputs": [ 3159 | { 3160 | "data": { 3161 | "text/plain": [ 3162 | "0 False\n", 3163 | "1 False\n", 3164 | "2 True\n", 3165 | "3 True\n", 3166 | "4 False\n", 3167 | "5 True\n", 3168 | "dtype: boolean" 3169 | ] 3170 | }, 3171 | "execution_count": 65, 3172 | "metadata": {}, 3173 | "output_type": "execute_result" 3174 | } 3175 | ], 3176 | "source": [ 3177 | "pd.Series([\"1\", \"2\", \"3a\", \"3b\", \"03c\", \"4dx\"],dtype=\"string\",\n", 3178 | " ).str.match(pattern)\n", 3179 | " " 3180 | ] 3181 | }, 3182 | { 3183 | "cell_type": "code", 3184 | "execution_count": 66, 3185 | "metadata": {}, 3186 | "outputs": [ 3187 | { 3188 | "data": { 3189 | "text/plain": [ 3190 | "0 False\n", 3191 | "1 False\n", 3192 | "2 True\n", 3193 | "3 True\n", 3194 | "4 False\n", 3195 | "5 True\n", 3196 | "dtype: bool" 3197 | ] 3198 | }, 3199 | "execution_count": 66, 3200 | "metadata": {}, 3201 | "output_type": "execute_result" 3202 | } 3203 | ], 3204 | "source": [ 3205 | "cudf.Series([\"1\", \"2\", \"3a\", \"3b\", \"03c\", \"4dx\"],dtype=\"str\",\n", 3206 | " ).str.match(pattern) " 3207 | ] 3208 | }, 3209 | { 3210 | "cell_type": "markdown", 3211 | "metadata": {}, 3212 | "source": [ 3213 | "\n", 3214 | "New in version 1.1.0." 
3215 | ] 3216 | }, 3217 | { 3218 | "cell_type": "code", 3219 | "execution_count": 67, 3220 | "metadata": {}, 3221 | "outputs": [ 3222 | { 3223 | "data": { 3224 | "text/plain": [ 3225 | "0 False\n", 3226 | "1 False\n", 3227 | "2 True\n", 3228 | "3 True\n", 3229 | "4 False\n", 3230 | "5 False\n", 3231 | "dtype: boolean" 3232 | ] 3233 | }, 3234 | "execution_count": 67, 3235 | "metadata": {}, 3236 | "output_type": "execute_result" 3237 | } 3238 | ], 3239 | "source": [ 3240 | "pd.Series([\"1\", \"2\", \"3a\", \"3b\", \"03c\", \"4dx\"],dtype=\"string\",\n", 3241 | " ).str.fullmatch(pattern)\n", 3242 | " " 3243 | ] 3244 | }, 3245 | { 3246 | "cell_type": "code", 3247 | "execution_count": 68, 3248 | "metadata": {}, 3249 | "outputs": [ 3250 | { 3251 | "data": { 3252 | "text/plain": [ 3253 | "0 False\n", 3254 | "1 False\n", 3255 | "2 True\n", 3256 | "3 True\n", 3257 | "4 False\n", 3258 | "5 True\n", 3259 | "dtype: bool" 3260 | ] 3261 | }, 3262 | "execution_count": 68, 3263 | "metadata": {}, 3264 | "output_type": "execute_result" 3265 | } 3266 | ], 3267 | "source": [ 3268 | "cudf.Series([\"1\", \"2\", \"3a\", \"3b\", \"03c\", \"4dx\"],dtype=\"str\",\n", 3269 | " ).str.match(pattern)" 3270 | ] 3271 | }, 3272 | { 3273 | "cell_type": "markdown", 3274 | "metadata": {}, 3275 | "source": [ 3276 | "In pandas, methods like `match`, `fullmatch`, `contains`, `startswith`, and `endswith` take an extra `na` argument so missing values can be considered `True` or `False`:" 3277 | ] 3278 | }, 3279 | { 3280 | "cell_type": "code", 3281 | "execution_count": 69, 3282 | "metadata": {}, 3283 | "outputs": [ 3284 | { 3285 | "name": "stdout", 3286 | "output_type": "stream", 3287 | "text": [ 3288 | "Strings that contain 'A':\n", 3289 | "0 True\n", 3290 | "1 False\n", 3291 | "2 False\n", 3292 | "3 True\n", 3293 | "4 False\n", 3294 | "5 False\n", 3295 | "6 True\n", 3296 | "7 False\n", 3297 | "8 False\n", 3298 | "dtype: boolean\n", 3299 | "\n", 3300 | "Strings that have swapped case:\n", 3301 | "0 a\n", 3302 | "1 b\n", 3303 | "2 c\n", 3304 | "3 aABA\n", 3305 | "4 bACA\n", 3306 | "5 \n", 3307 | "6 caba\n", 3308 | "7 DOG\n", 3309 | "8 CAT\n", 3310 | "dtype: string\n", 3311 | "\n", 3312 | "Strings that start with 'b':\n", 3313 | "0 False\n", 3314 | "1 False\n", 3315 | "2 False\n", 3316 | "3 False\n", 3317 | "4 False\n", 3318 | "5 \n", 3319 | "6 False\n", 3320 | "7 False\n", 3321 | "8 False\n", 3322 | "dtype: boolean\n", 3323 | "\n", 3324 | "Strings that end with 'a':\n", 3325 | "0 False\n", 3326 | "1 False\n", 3327 | "2 False\n", 3328 | "3 True\n", 3329 | "4 True\n", 3330 | "5 \n", 3331 | "6 False\n", 3332 | "7 False\n", 3333 | "8 False\n", 3334 | "dtype: boolean\n" 3335 | ] 3336 | } 3337 | ], 3338 | "source": [ 3339 | "pandasSeries5 = pd.Series([\"A\", \"B\", \"C\", \"Aaba\", \"Baca\", np.nan, \"CABA\", \"dog\", \"cat\"], dtype=\"string\") \n", 3340 | "print(\"Strings that contain 'A':\")\n", 3341 | "print(pandasSeries5.str.contains(\"A\", na=False))\n", 3342 | "print(\"\\nStrings that have swapped case:\")\n", 3343 | "print(pandasSeries5.str.swapcase())\n", 3344 | "print(\"\\nStrings that start with 'b':\")\n", 3345 | "print(pandasSeries5.str.startswith('b'))\n", 3346 | "print(\"\\nStrings that end with 'a':\")\n", 3347 | "print(pandasSeries5.str.endswith('a'))" 3348 | ] 3349 | }, 3350 | { 3351 | "cell_type": "markdown", 3352 | "metadata": {}, 3353 | "source": [ 3354 | "cuDF's versions of these methods do not take the extra `na` argument, so missing values simply propagate through the result as `<NA>` (see the next cell)."
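,
"\n",
"\n",
"If you need the pandas `na=False` behaviour, one hedged workaround is to fill the nulls after the fact:\n",
"\n",
"```python\n",
"masked = cudf.Series([\"A\", None, \"cat\"], dtype=\"str\").str.contains(\"A\")\n",
"masked.fillna(False)  # 0 True, 1 False, 2 False\n",
"```"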
3355 | ] 3356 | }, 3357 | { 3358 | "cell_type": "code", 3359 | "execution_count": 70, 3360 | "metadata": {}, 3361 | "outputs": [ 3362 | { 3363 | "name": "stdout", 3364 | "output_type": "stream", 3365 | "text": [ 3366 | "Strings that contain 'A':\n", 3367 | "0 True\n", 3368 | "1 False\n", 3369 | "2 False\n", 3370 | "3 True\n", 3371 | "4 False\n", 3372 | "5 \n", 3373 | "6 True\n", 3374 | "7 False\n", 3375 | "8 False\n", 3376 | "dtype: bool\n", 3377 | "\n", 3378 | "Strings that have swapped case:\n", 3379 | "0 a\n", 3380 | "1 b\n", 3381 | "2 c\n", 3382 | "3 aABA\n", 3383 | "4 bACA\n", 3384 | "5 \n", 3385 | "6 caba\n", 3386 | "7 DOG\n", 3387 | "8 CAT\n", 3388 | "dtype: object\n", 3389 | "\n", 3390 | "Strings that start with 'b':\n", 3391 | "0 False\n", 3392 | "1 False\n", 3393 | "2 False\n", 3394 | "3 False\n", 3395 | "4 False\n", 3396 | "5 \n", 3397 | "6 False\n", 3398 | "7 False\n", 3399 | "8 False\n", 3400 | "dtype: bool\n", 3401 | "\n", 3402 | "Strings that end with 'a':\n", 3403 | "0 False\n", 3404 | "1 False\n", 3405 | "2 False\n", 3406 | "3 True\n", 3407 | "4 True\n", 3408 | "5 \n", 3409 | "6 False\n", 3410 | "7 False\n", 3411 | "8 False\n", 3412 | "dtype: bool\n" 3413 | ] 3414 | } 3415 | ], 3416 | "source": [ 3417 | "cudfSeries5 = cudf.Series([\"A\", \"B\", \"C\", \"Aaba\", \"Baca\", np.nan, \"CABA\", \"dog\", \"cat\"], dtype=\"str\") \n", 3418 | "print(\"Strings that contain 'A':\")\n", 3419 | "print(cudfSeries5.str.contains(\"A\"))\n", 3420 | "print(\"\\nStrings that have swapped case:\")\n", 3421 | "print(cudfSeries5.str.swapcase())\n", 3422 | "print(\"\\nStrings that start with 'b':\")\n", 3423 | "print(cudfSeries5.str.startswith('b'))\n", 3424 | "print(\"\\nStrings that end with 'a':\")\n", 3425 | "print(cudfSeries5.str.endswith('a'))" 3426 | ] 3427 | }, 3428 | { 3429 | "cell_type": "markdown", 3430 | "metadata": {}, 3431 | "source": [] 3432 | } 3433 | ], 3434 | "metadata": { 3435 | "kernelspec": { 3436 | "display_name": "Python 3 (ipykernel)", 3437 | "language": "python", 3438 | "name": "python3" 3439 | }, 3440 | "language_info": { 3441 | "codemirror_mode": { 3442 | "name": "ipython", 3443 | "version": 3 3444 | }, 3445 | "file_extension": ".py", 3446 | "mimetype": "text/x-python", 3447 | "name": "python", 3448 | "nbconvert_exporter": "python", 3449 | "pygments_lexer": "ipython3", 3450 | "version": "3.10.13" 3451 | } 3452 | }, 3453 | "nbformat": 4, 3454 | "nbformat_minor": 4 3455 | } 3456 | --------------------------------------------------------------------------------