├── code
    ├── .gitkeep
    ├── ibis.png
    ├── cudf-pandas-demo.ipynb
    ├── cupy-interop.ipynb
    └── Introduction_to_Strings.ipynb
├── data
    ├── .gitkeep
    └── scratch
    │   └── .gitkeep
├── models
    └── .gitkeep
├── requirements.txt
├── apt.txt
├── .gitattributes
├── variables.env
├── .project
    ├── configpacks
    └── spec.yaml
├── postBuild.bash
├── preBuild.bash
├── .gitignore
├── README.md
└── LICENSE.txt


/code/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/models/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/scratch/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyterlab>3.0
2 | plotly
3 | 


--------------------------------------------------------------------------------
/apt.txt:
--------------------------------------------------------------------------------
1 | # apt packages to install should be listed one per line
2 | 


--------------------------------------------------------------------------------
/code/ibis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/workbench-example-rapids-cudf/HEAD/code/ibis.png


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | models/** filter=lfs diff=lfs merge=lfs -text
2 | data/** filter=lfs diff=lfs merge=lfs -text
3 | 


--------------------------------------------------------------------------------
/variables.env:
--------------------------------------------------------------------------------
1 | # Set environment variables in the format KEY=VALUE, 1 per line
2 | # This file will be sourced inside the project container when started.
3 | # NOTE: If you change this file while the project is running, you must restart the project container for changes to take effect.
4 | 
5 | 


--------------------------------------------------------------------------------
/.project/configpacks:
--------------------------------------------------------------------------------
 1 | *defaults.ContainerUser
 2 | *bash.PreBuild
 3 | *cuda.CUDA
 4 | *defaults.EnvVars
 5 | *defaults.Readme
 6 | *defaults.Entrypoint
 7 | *apt.PackageManager
 8 | *bash.PreLanguage
 9 | *python.PipPackageManager
10 | *bash.PostBuild
11 | *jupyterlab.JupyterLab
12 | *vs_code.VSCode


--------------------------------------------------------------------------------
/postBuild.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This file contains bash commands that will be executed at the end of the container build process,
3 | # after all system packages and programming language specific package have been installed.
4 | #
5 | # Note: This file may be removed if you don't need to use it
6 | 
7 | 


--------------------------------------------------------------------------------
/preBuild.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This file contains bash commands that will be executed at the beginning of the container build process,
3 | # before any system packages or programming language specific package have been installed.
4 | #
5 | # Note: This file may be removed if you don't need to use it
6 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Ignore generated or temporary files managed by the Workbench
 2 | .project/*
 3 | !.project/spec.yaml
 4 | !.project/configpacks
 5 | 
 6 | # General ignores
 7 | .DS_Store
 8 | __pycache__/
 9 | *.py[cod]
10 | *$py.class
11 | .ipynb_checkpoints
12 | 
13 | # Workbench Project Layout
14 | data/scratch/*
15 | !data/scratch/.gitkeep
16 | 
17 | # Rapids ignores
18 | *.html
19 | /.ipynb_checkpoints/
20 | *~
21 | /rows.csv
22 | /cufile.log
23 | *.pyc
24 | 
25 | # Byte-compiled / optimized / DLL files
26 | 
27 | # Temp directories, notebooks created by jupyterlab
28 | .Trash-*/
29 | .jupyter/
30 | .local/
31 | 
32 | # Python distribution / packaging
33 | .Python
34 | build/
35 | develop-eggs/
36 | dist/
37 | downloads/
38 | eggs/
39 | .eggs/
40 | lib/
41 | lib64/
42 | parts/
43 | sdist/
44 | var/
45 | wheels/
46 | share/python-wheels/
47 | *.egg-info/
48 | .installed.cfg
49 | *.egg
50 | MANIFEST
51 | 
52 | # Unit test / coverage reports
53 | htmlcov/
54 | .tox/
55 | .nox/
56 | .coverage
57 | .coverage.*
58 | .cache
59 | nosetests.xml
60 | coverage.xml
61 | *.cover
62 | *.py,cover
63 | .hypothesis/
64 | .pytest_cache/
65 | cover/


--------------------------------------------------------------------------------
/.project/spec.yaml:
--------------------------------------------------------------------------------
  1 | specVersion: v2
  2 | specMinorVersion: 2
  3 | meta:
  4 |     name: rapids-cudf
  5 |     image: project-rapids-cudf
  6 |     description: Accelerate Data Science workflows with RAPIDS cuDF and cuDF.pandas
  7 |     labels: []
  8 |     createdOn: "2023-09-19T22:48:48Z"
  9 |     defaultBranch: main
 10 | layout:
 11 |     - path: code/
 12 |       type: code
 13 |       storage: git
 14 |     - path: models/
 15 |       type: models
 16 |       storage: gitlfs
 17 |     - path: data/
 18 |       type: data
 19 |       storage: gitlfs
 20 |     - path: data/scratch/
 21 |       type: data
 22 |       storage: gitignore
 23 | environment:
 24 |     base:
 25 |         registry: nvcr.io
 26 |         image: nvidia/rapidsai/notebooks:25.08-cuda12.9-py3.13
 27 |         build_timestamp: ""
 28 |         name: RAPIDS with CUDA 12.9.1
 29 |         supported_architectures: []
 30 |         cuda_version: 12.9.1
 31 |         description: RAPIDS with CUDA 12.9.1
 32 |         entrypoint_script: /home/rapids/entrypoint.sh
 33 |         labels:
 34 |             - cuda12.9.1
 35 |         apps:
 36 |             - name: jupyterlab
 37 |               type: jupyterlab
 38 |               class: webapp
 39 |               start_command: jupyter lab --allow-root --port 8888 --ip 0.0.0.0 --no-browser --NotebookApp.base_url=\$PROXY_PREFIX --NotebookApp.default_url=/lab --NotebookApp.allow_origin='*'
 40 |               health_check_command: '[ \$(echo url=\$(jupyter lab list | head -n 2 | tail -n 1 | cut -f1 -d'' '' | grep -v ''Currently'' | sed "s@/?@/lab?@g") | curl -o /dev/null -s -w ''%{http_code}'' --config -) == ''200'' ]'
 41 |               stop_command: jupyter lab stop 8888
 42 |               user_msg: ""
 43 |               logfile_path: ""
 44 |               timeout_seconds: 60
 45 |               icon_url: ""
 46 |               webapp_options:
 47 |                 autolaunch: true
 48 |                 port: "8888"
 49 |                 proxy:
 50 |                     trim_prefix: false
 51 |                 url_command: jupyter lab list | head -n 2 | tail -n 1 | cut -f1 -d' ' | grep -v 'Currently'
 52 |         programming_languages:
 53 |             - python3
 54 |         icon_url: ""
 55 |         image_version: 25.08.01
 56 |         os: linux
 57 |         os_distro: ubuntu
 58 |         os_distro_release: "22.04"
 59 |         schema_version: v2
 60 |         user_info:
 61 |             uid: "1001"
 62 |             gid: "1000"
 63 |             username: rapids
 64 |         package_managers:
 65 |             - name: apt
 66 |               binary_path: /usr/bin/apt
 67 |               installed_packages:
 68 |                 - ""
 69 |             - name: conda3
 70 |               binary_path: /opt/conda/bin/conda
 71 |               installed_packages:
 72 |                 - rapids
 73 |                 - cudf
 74 |                 - cuml
 75 |                 - cugraph
 76 |                 - rmm
 77 |                 - pylibraft
 78 |                 - cuspatial
 79 |                 - cuxfilter
 80 |                 - cucim
 81 |                 - xgboost
 82 |                 - dask-sql
 83 |                 - jupyterlab
 84 |             - name: pip
 85 |               binary_path: /opt/conda/bin/pip
 86 |               installed_packages:
 87 |                 - jupyterlab-nvdashboard
 88 |         package_manager_environment:
 89 |             name: conda
 90 |             target: /opt/conda
 91 |     compose_file_path: ""
 92 | execution:
 93 |     apps:
 94 |         - name: jupyterlab
 95 |           type: jupyterlab
 96 |           class: webapp
 97 |           start_command: jupyter lab --allow-root --port 8888 --ip 0.0.0.0 --no-browser --NotebookApp.base_url=\$PROXY_PREFIX --NotebookApp.default_url=/lab --notebook-dir=/project/
 98 |           health_check_command: '[ \$(echo url=\$(jupyter lab list 2>&1 | head -n 2 | tail -n 1 | cut -f1 -d'''' '''' | grep -v ''''Currently'''' | sed ''''s@/?@/lab?@g'''') | curl -o /dev/null -s -w ''''%{http_code}'''' --config -) == ''''200'''' ]'
 99 |           stop_command: jupyter lab stop 8888
100 |           user_msg: ""
101 |           logfile_path: ""
102 |           timeout_seconds: 60
103 |           icon_url: ""
104 |           webapp_options:
105 |             autolaunch: true
106 |             port: "8888"
107 |             proxy:
108 |                 trim_prefix: false
109 |             url_command: jupyter lab list 2>&1 | head -n 2 | tail -n 1 | cut -f1 -d' ' | grep -v 'Currently'
110 |     resources:
111 |         gpu:
112 |             requested: 1
113 |         sharedMemoryMB: 1024
114 |     secrets: []
115 |     mounts:
116 |         - type: project
117 |           target: /project/
118 |           description: Project directory
119 |           options: rw
120 |         - type: volume
121 |           target: /data/tensorboard/logs/
122 |           description: Tensorboard Log Files
123 |           options: volumeName=tensorboard-logs-volume
124 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # NVIDIA AI Workbench: Introduction
 2 | This is an [NVIDIA AI Workbench](https://developer.nvidia.com/blog/develop-and-deploy-scalable-generative-ai-models-seamlessly-with-nvidia-ai-workbench/) example project that provides a short introduction of the cuDF library, a Python GPU-accelerated DataFrame library (built on the Apache Arrow columnar memory format) for loading, joining, aggregating, filtering, and otherwise manipulating data. cuDF also provides a pandas-like API that will be familiar to data engineers & data scientists, so they can use it to easily accelerate their workflows without going into the details of CUDA programming. Users who have [installed AI Workbench](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/workbench/) can get up and running with this project in minutes. 
 3 | 
 4 | Have questions? Please direct any issues, fixes, suggestions, and discussion on this project to the DevZone Members Only Forum thread [here](https://forums.developer.nvidia.com/t/support-workbench-example-project-rapids-cudf/278372/1). 
 5 | 
 6 | ## Project Description
 7 | Included in this project are eight tutorial notebooks. The first five are relatively easy to run; the last three (*) may require a low GPU RAM user ( < 16GB) to push the project to heavier hardware to run all of the performance benchmarks. Good news: Workbench makes this easy!  
 8 | 
 9 | * [cudf-pandas-demo](./code/cudf-pandas-demo.ipynb): This notebook demonstrates the acceleration that `cudf.pandas` gives over vanilla Pandas. The example runs through loading some data with Pandas and getting some performance numbers, then running the same code again with the `cudf.pandas` plugin to show the speedup that is possible with NVIDIA hardware.
10 | 
11 | * [10min](./code/10min.ipynb): This is a short introduction to cuDF and Dask-cuDF, geared mainly towards new users.
12 | 
13 |    _cuDF_ is a Python GPU DataFrame library (built on the Apache Arrow columnar memory format) for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API in the style of pandas.
14 | 
15 |    _Dask_ is a flexible library for parallel computing in Python that makes scaling out your workflow smooth and simple. On the CPU, Dask uses Pandas to execute operations in parallel on DataFrame partitions.
16 | 
17 |    _Dask-cuDF_ extends Dask where necessary to allow its DataFrame partitions to be processed using cuDF GPU DataFrames instead of Pandas DataFrames. For instance, when you call dask_cudf.read_csv(...), your cluster’s GPUs do the work of parsing the CSV file(s) by calling cudf.read_csv().
18 | 
19 |    Which libraries do I use? If your workflow is fast enough on a single GPU or your data comfortably fits in memory on a single GPU, you would want to use cuDF. If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask-cuDF. 
20 |   
21 | * [cupy-interop](./code/cupy-interop.ipynb): This notebook provides introductory examples of how you can use cuDF and CuPy together to take advantage of CuPy array functionality (such as advanced linear algebra operations). 
22 |   
23 | * [missing-data](./code/missing-data.ipynb): In this section, we will discuss missing (also referred to as NA) values in cudf. cudf supports having missing values in all dtypes. These missing values are represented by <NA>. These values are also referenced as “null values”.
24 | 
25 | * [Introduction_to_Strings](./code/Introduction_to_Strings.ipynb): This notebook shows how to manipulate strings with cuDF DataFrames.
26 | 
27 | * [Introduction_to_Exploratory_Data_Analysis_using_cuDF](./code/Introduction_to_Exploratory_Data_Analysis_using_cuDF.ipynb): This notebook shows how to perform basic EDA with cuDF DataFrames
28 | 
29 | * [Introduction_to_Time_Series_Data_Analysis_using_cuDF](./code/Introduction_to_Time_Series_Data_Analysis_using_cuDF.ipynb): This notebook shows how to do EDA on time-series DataFrame with cuDF
30 | 
31 | * [performance-comparisons](./code/performance-comparisons.ipynb) (*): This notebook compares the performance of cuDF and pandas. The comparisons performed are on identical data sizes. This notebook primarily showcases the factor of speedups users can have when the similar pandas APIs are run on GPUs using cudf. This notebook is written to measure performance on NVIDIA GPUs with _large_ memory. Performance results may vary by data size, as well as the CPU and GPU used.
32 | 
33 | ---
34 | **Important Considerations:**
35 | * The notebook titled ```performance-comparisons.ipynb``` may take a long time to execute on laptop and/or workstation hardware. This is because we are running benchmarks and conducting dataframe operations on massive datasets using both Pandas and cuDF. Feel free to adjust the ```num_rows``` variable as needed. 
36 | 
37 | * If working locally on a laptop or workstation, also consider pushing this project to heavier hardware (original notebook authors used 2x H100 GPUs) to run this notebook. Good news: NVIDIA AI Workbench makes this push easy! 
38 | 
39 | ---
40 | 
41 | ## System Requirements:
42 | * Operating System: Ubuntu 22.04
43 | * CPU requirements: None, tested with Intel&reg; Xeon&reg; Gold 6240R CPU @ 2.40GHz
44 | * GPU requirements: Any NVIDIA training GPU, tested with NVIDIA A100-40GB
45 | * NVIDIA driver requirements: Latest driver version
46 | * Storage requirements: 40GB
47 | 
48 | # Quickstart
49 | The notebook(s) in this project were adapted from the RAPIDS cuDF Github repository, which can be found [here](https://github.com/rapidsai/cudf/tree/branch-23.12/notebooks).
50 | 
51 | If you have NVIDIA AI Workbench already installed, you can use this Project in AI Workbench on your choice of machine by:
52 | 1. Forking this Project to your own GitHub namespace and copying the clone link
53 | 
54 |    ```https://github.com/[your_namespace]/<project_name>.git```
55 |    
56 | 2. Opening a shell and activating the Context you want to clone into by
57 | 
58 |    ```
59 |    $ nvwb list contexts
60 |    
61 |    $ nvwb activate <desired_context>
62 |    ```
63 |    
64 | 3. Cloning this Project onto your desired machine by running
65 | 
66 |    ```
67 |    $ nvwb clone project <your_project_url>
68 |    ```
69 |    
70 | 4. Opening the Project by
71 | 
72 |    ```
73 |    $ nvwb list projects
74 |    
75 |    $ nvwb open <project_name>
76 |    ```
77 |    
78 | 5. Starting JupyterLab by
79 | 
80 |    ```
81 |    $ nvwb start jupyterlab
82 |    ```
83 | 
84 | 6. Navigate to the code directory of the project. Then, open the notebooks provided and begin working through them at your own pace. Happy coding!
85 | 
86 | ---
87 | **Tip:** Use ```nvwb help``` to see a full list of commands. 
88 | 
89 | ---
90 | 
91 | ## Tested On
92 | This notebook has been tested with an NVIDIA A100-40gb GPU and an Intel(R) Xeon(R) Gold 6240R CPU (2.40GHz) on the following version of NVIDIA AI Workbench: ```nvwb 0.2.66 (internal; linux; amd64; go1.18.10; Tue Sep 12 18:50:21 UTC 2023)```
93 | 
94 | ## License
95 | This NVIDIA AI Workbench example project is under the [Apache 2.0 License](https://github.com/NVIDIA/rapids-cudf/blob/main/LICENSE.txt)
96 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright 2023 NVIDIA Corporation
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/code/cudf-pandas-demo.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "id": "kcF9ZWvjSybR"
  7 |    },
  8 |    "source": [
  9 |     "# 10 Minutes to RAPIDS cuDF's pandas accelerator mode (cudf.pandas)\n",
 10 |     "\n",
 11 |     "cuDF is a Python GPU DataFrame library (built on the Apache Arrow columnar memory format) for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API in the style of pandas.\n",
 12 |     "\n",
 13 |     "cuDF now provides a pandas accelerator mode (`cudf.pandas`), allowing you to bring accelerated computing to your pandas workflows without requiring any code change.\n",
 14 |     "\n",
 15 |     "This notebook is a short introduction to `cudf.pandas`."
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "markdown",
 20 |    "metadata": {
 21 |     "id": "SH_h6ci1Sx0u"
 22 |    },
 23 |    "source": [
 24 |     "# ⚠️ Verify your setup\n",
 25 |     "\n",
 26 |     "First, we'll verify that you are running with an NVIDIA GPU."
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": null,
 32 |    "metadata": {
 33 |     "colab": {
 34 |      "base_uri": "https://localhost:8080/"
 35 |     },
 36 |     "id": "Y2vPCtXcCvUR",
 37 |     "outputId": "fb93a4bc-9ef1-4333-a81c-48d7b5e8ceb9",
 38 |     "tags": []
 39 |    },
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "!nvidia-smi  # this should display information about available GPUs"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {
 49 |     "id": "zhPt4Xj8THgo",
 50 |     "tags": []
 51 |    },
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "import cudf  # this should work without any errors"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "markdown",
 59 |    "metadata": {
 60 |     "id": "4zGUeWvcTbDs"
 61 |    },
 62 |    "source": [
 63 |     "# Download the data\n",
 64 |     "\n",
 65 |     "The data we'll be working with is the [Parking Violations Issued - Fiscal Year 2022](https://data.cityofnewyork.us/City-Government/Parking-Violations-Issued-Fiscal-Year-2022/7mxj-7a6y) dataset from NYC Open Data.\n",
 66 |     "\n",
 67 |     "We're downloading a copy of this dataset from an s3 bucket hosted by NVIDIA to provide faster download speeds. We'll start by downloading the data. This should take about 30 seconds.\n",
 68 |     "\n",
 69 |     "## Data License and Terms\n",
 70 |     "As this dataset originates from the NYC Open Data Portal, it's governed by their license and terms of use.\n",
 71 |     "\n",
 72 |     "### Are there restrictions on how I can use Open Data?\n",
 73 |     "\n",
 74 |     "> Open Data belongs to all New Yorkers. There are no restrictions on the use of Open Data. Refer to Terms of Use for more information.\n",
 75 |     "\n",
 76 |     "### [Terms of Use](https://opendata.cityofnewyork.us/overview/#termsofuse)\n",
 77 |     "\n",
 78 |     "> By accessing datasets and feeds available through NYC Open Data, the user agrees to all of the Terms of Use of NYC.gov as well as the Privacy Policy for NYC.gov. The user also agrees to any additional terms of use defined by the agencies, bureaus, and offices providing data. Public data sets made available on NYC Open Data are provided for informational purposes. The City does not warranty the completeness, accuracy, content, or fitness for any particular purpose or use of any public data set made available on NYC Open Data, nor are any such warranties to be implied or inferred with respect to the public data sets furnished therein.\n",
 79 |     "\n",
 80 |     "> The City is not liable for any deficiencies in the completeness, accuracy, content, or fitness for any particular purpose or use of any public data set, or application utilizing such data set, provided by any third party.\n",
 81 |     "\n",
 82 |     "> Submitting City Agencies are the authoritative source of data available on NYC Open Data. These entities are responsible for data quality and retain version control of data sets and feeds accessed on the Site. Data may be updated, corrected, or refreshed at any time."
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": null,
 88 |    "metadata": {
 89 |     "colab": {
 90 |      "base_uri": "https://localhost:8080/"
 91 |     },
 92 |     "id": "5EoQqNwsTqeP",
 93 |     "outputId": "b5f9b7f5-b1e4-4a40-d30e-30f59ff106d9",
 94 |     "tags": []
 95 |    },
 96 |    "outputs": [],
 97 |    "source": [
 98 |     "!wget https://data.rapids.ai/datasets/nyc_parking/nyc_parking_violations_2022.parquet -O /tmp/nyc_parking_violations_2022.parquet"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "markdown",
103 |    "metadata": {
104 |     "id": "hAvNFbYKWwti"
105 |    },
106 |    "source": [
107 |     "# Analysis using Standard Pandas\n",
108 |     "\n",
109 |     "First, let's use Pandas to read in some columns of the dataset:"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": null,
115 |    "metadata": {
116 |     "id": "SLRleX9xWxqX",
117 |     "tags": []
118 |    },
119 |    "outputs": [],
120 |    "source": [
121 |     "import pandas as pd"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "metadata": {
128 |     "colab": {
129 |      "base_uri": "https://localhost:8080/",
130 |      "height": 363
131 |     },
132 |     "id": "OLatEi7rW0la",
133 |     "outputId": "22cfa4c5-58ee-4514-ecb1-f984f241a465",
134 |     "tags": []
135 |    },
136 |    "outputs": [],
137 |    "source": [
138 |     "# read 5 columns data:\n",
139 |     "df = pd.read_parquet(\n",
140 |     "    \"/tmp/nyc_parking_violations_2022.parquet\",\n",
141 |     "    columns=[\"Registration State\", \"Violation Description\", \"Vehicle Body Type\", \"Issue Date\", \"Summons Number\"]\n",
142 |     ")\n",
143 |     "\n",
144 |     "# view a random sample of 10 rows:\n",
145 |     "df.sample(10)"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "markdown",
150 |    "metadata": {
151 |     "id": "m7qXNJU9W53D"
152 |    },
153 |    "source": [
154 |     "Next, we'll try to answer a few questions using the data."
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "markdown",
159 |    "metadata": {
160 |     "id": "VmkFv9ZUW37g"
161 |    },
162 |    "source": [
163 |     "## Which parking violation is most commonly committed by vehicles from various U.S states?\n",
164 |     "\n",
165 |     "Each record in our dataset contains the state of registration of the offending vehicle, and the type of parking offence. Let's say we want to get the most common type of offence for vehicles registered in different states. We can do this in Pandas using a combination of [value_counts](https://pandas.pydata.org/docs/reference/api/pandas.Series.value_counts.html) and [GroupBy.head](https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.head.html):"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": null,
171 |    "metadata": {
172 |     "colab": {
173 |      "base_uri": "https://localhost:8080/",
174 |      "height": 423
175 |     },
176 |     "id": "bHXq-s_ZXOQN",
177 |     "outputId": "a0ca97bc-0c91-4f89-931b-bb0377e1c1c8"
178 |    },
179 |    "outputs": [],
180 |    "source": [
181 |     "(df[[\"Registration State\", \"Violation Description\"]]  # get only these two columns\n",
182 |     " .value_counts()  # get the count of offences per state and per type of offence\n",
183 |     " .groupby(\"Registration State\")  # group by state\n",
184 |     " .head(1)  # get the first row in each group (the type of offence with the largest count)\n",
185 |     " .sort_index()  # sort by state name\n",
186 |     " .reset_index()\n",
187 |     ")"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "markdown",
192 |    "metadata": {
193 |     "id": "8lXF4v4SXRf3"
194 |    },
195 |    "source": [
196 |     "The code above uses [method chaining](https://tomaugspurger.net/posts/method-chaining/) to combine a series of operations into a single statement. You might find it useful to break the code up into multiple statements and inspect each of the intermediate results!"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "markdown",
201 |    "metadata": {
202 |     "id": "H7_9EmGyXUJd"
203 |    },
204 |    "source": [
205 |     "## Which vehicle body types are most frequently involved in parking violations?\n",
206 |     "\n",
207 |     "We can also investigate which vehicle body types most commonly appear in parking violations"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": null,
213 |    "metadata": {
214 |     "colab": {
215 |      "base_uri": "https://localhost:8080/",
216 |      "height": 455
217 |     },
218 |     "id": "d7Ax-u4TXZtp",
219 |     "outputId": "0feeca28-ca17-4818-8ecb-d62ea5e1c1db"
220 |    },
221 |    "outputs": [],
222 |    "source": [
223 |     "(df\n",
224 |     " .groupby([\"Vehicle Body Type\"])\n",
225 |     " .agg({\"Summons Number\": \"count\"})\n",
226 |     " .rename(columns={\"Summons Number\": \"Count\"})\n",
227 |     " .sort_values([\"Count\"], ascending=False)\n",
228 |     ")"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "markdown",
233 |    "metadata": {
234 |     "id": "VjFfQLZHXehM"
235 |    },
236 |    "source": [
237 |     "## How do parking violations vary across days of the week?"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": null,
243 |    "metadata": {
244 |     "colab": {
245 |      "base_uri": "https://localhost:8080/"
246 |     },
247 |     "id": "s5_y9m_AXhIw",
248 |     "outputId": "11d16e26-4a5d-4fbb-c777-7d50a23895ae"
249 |    },
250 |    "outputs": [],
251 |    "source": [
252 |     "weekday_names = {\n",
253 |     "    0: \"Monday\",\n",
254 |     "    1: \"Tuesday\",\n",
255 |     "    2: \"Wednesday\",\n",
256 |     "    3: \"Thursday\",\n",
257 |     "    4: \"Friday\",\n",
258 |     "    5: \"Saturday\",\n",
259 |     "    6: \"Sunday\",\n",
260 |     "}\n",
261 |     "\n",
262 |     "df[\"Issue Date\"] = df[\"Issue Date\"].astype(\"datetime64[ms]\")\n",
263 |     "df[\"issue_weekday\"] = df[\"Issue Date\"].dt.weekday.map(weekday_names)\n",
264 |     "\n",
265 |     "df.groupby([\"issue_weekday\"])[\"Summons Number\"].count().sort_values()"
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "markdown",
270 |    "metadata": {
271 |     "id": "LDeYr6xkXiDc"
272 |    },
273 |    "source": [
274 |     "It looks like there are fewer violations on weekends, which makes sense! During the week, more people are driving in New York City."
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "markdown",
279 |    "metadata": {
280 |     "id": "JKBQcT64XlMr"
281 |    },
282 |    "source": [
283 |     "## Let's time it!\n",
284 |     "\n",
285 |     "Loading and processing this data took a little time. Let's measure how long these pipelines take in Pandas:"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": null,
291 |    "metadata": {
292 |     "colab": {
293 |      "base_uri": "https://localhost:8080/",
294 |      "height": 458
295 |     },
296 |     "id": "mDpQhus-Xnfs",
297 |     "outputId": "e9af1194-a0f8-48d4-a1b3-42dba63f3110"
298 |    },
299 |    "outputs": [],
300 |    "source": [
301 |     "%%time\n",
302 |     "\n",
303 |     "df = pd.read_parquet(\n",
304 |     "    \"/tmp/nyc_parking_violations_2022.parquet\",\n",
305 |     "    columns=[\"Registration State\", \"Violation Description\", \"Vehicle Body Type\", \"Issue Date\", \"Summons Number\"]\n",
306 |     ")\n",
307 |     "\n",
308 |     "(df[[\"Registration State\", \"Violation Description\"]]\n",
309 |     " .value_counts()\n",
310 |     " .groupby(\"Registration State\")\n",
311 |     " .head(1)\n",
312 |     " .sort_index()\n",
313 |     " .reset_index()\n",
314 |     ")"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "code",
319 |    "execution_count": null,
320 |    "metadata": {
321 |     "colab": {
322 |      "base_uri": "https://localhost:8080/",
323 |      "height": 490
324 |     },
325 |     "id": "9Gw5TWH2Xqgv",
326 |     "outputId": "eabf2e77-6fef-4751-d682-7c5c51ebc86e"
327 |    },
328 |    "outputs": [],
329 |    "source": [
330 |     "%%time\n",
331 |     "\n",
332 |     "(df\n",
333 |     " .groupby([\"Vehicle Body Type\"])\n",
334 |     " .agg({\"Summons Number\": \"count\"})\n",
335 |     " .rename(columns={\"Summons Number\": \"Count\"})\n",
336 |     " .sort_values([\"Count\"], ascending=False)\n",
337 |     ")"
338 |    ]
339 |   },
340 |   {
341 |    "cell_type": "code",
342 |    "execution_count": null,
343 |    "metadata": {
344 |     "colab": {
345 |      "base_uri": "https://localhost:8080/"
346 |     },
347 |     "id": "BovQgNrpXr2l",
348 |     "outputId": "01b2c4fc-8c8c-4947-92a2-a3e2f0a7cf14"
349 |    },
350 |    "outputs": [],
351 |    "source": [
352 |     "%%time\n",
353 |     "\n",
354 |     "weekday_names = {\n",
355 |     "    0: \"Monday\",\n",
356 |     "    1: \"Tuesday\",\n",
357 |     "    2: \"Wednesday\",\n",
358 |     "    3: \"Thursday\",\n",
359 |     "    4: \"Friday\",\n",
360 |     "    5: \"Saturday\",\n",
361 |     "    6: \"Sunday\",\n",
362 |     "}\n",
363 |     "\n",
364 |     "df[\"Issue Date\"] = df[\"Issue Date\"].astype(\"datetime64[ms]\")\n",
365 |     "df[\"issue_weekday\"] = df[\"Issue Date\"].dt.weekday.map(weekday_names)\n",
366 |     "\n",
367 |     "df.groupby([\"issue_weekday\"])[\"Summons Number\"].count().sort_values()"
368 |    ]
369 |   },
370 |   {
371 |    "cell_type": "markdown",
372 |    "metadata": {
373 |     "id": "VgAWS0yXXtGj"
374 |    },
375 |    "source": [
376 |     "# Using cudf.pandas\n",
377 |     "\n",
378 |     "Now, let's re-run the Pandas code above with the `cudf.pandas` extension loaded.\n",
379 |     "\n",
380 |     "Typically, you should load the `cudf.pandas` extension as the first step in your notebook, before importing any modules. Here, we explicitly restart the kernel to simulate that behavior."
381 |    ]
382 |   },
383 |   {
384 |    "cell_type": "code",
385 |    "execution_count": null,
386 |    "metadata": {
387 |     "colab": {
388 |      "base_uri": "https://localhost:8080/"
389 |     },
390 |     "id": "hW5rUr2tXzUW",
391 |     "outputId": "5ad04ad3-9e5a-4609-8e0d-fe19f02c32d0"
392 |    },
393 |    "outputs": [],
394 |    "source": [
395 |     "get_ipython().kernel.do_shutdown(restart=True)"
396 |    ]
397 |   },
398 |   {
399 |    "cell_type": "code",
400 |    "execution_count": null,
401 |    "metadata": {
402 |     "id": "NjvPsTlGZrW7"
403 |    },
404 |    "outputs": [],
405 |    "source": [
406 |     "%load_ext cudf.pandas"
407 |    ]
408 |   },
409 |   {
410 |    "cell_type": "code",
411 |    "execution_count": null,
412 |    "metadata": {
413 |     "colab": {
414 |      "base_uri": "https://localhost:8080/"
415 |     },
416 |     "id": "XL_u4l5gZJte",
417 |     "outputId": "bf96ea78-1baa-4542-e4d7-c5ac85e035f4"
418 |    },
419 |    "outputs": [],
420 |    "source": [
421 |     "%%time\n",
422 |     "\n",
423 |     "import pandas as pd\n",
424 |     "\n",
425 |     "df = pd.read_parquet(\n",
426 |     "    \"/tmp/nyc_parking_violations_2022.parquet\",\n",
427 |     "    columns=[\"Registration State\", \"Violation Description\", \"Vehicle Body Type\", \"Issue Date\", \"Summons Number\"]\n",
428 |     ")\n",
429 |     "\n",
430 |     "(df[[\"Registration State\", \"Violation Description\"]]\n",
431 |     " .value_counts()\n",
432 |     " .groupby(\"Registration State\")\n",
433 |     " .head(1)\n",
434 |     " .sort_index()\n",
435 |     " .reset_index()\n",
436 |     ")"
437 |    ]
438 |   },
439 |   {
440 |    "cell_type": "code",
441 |    "execution_count": null,
442 |    "metadata": {
443 |     "colab": {
444 |      "base_uri": "https://localhost:8080/",
445 |      "height": 490
446 |     },
447 |     "id": "BLWa8ed6d-pD",
448 |     "outputId": "a717d797-b6d4-4baf-dce3-c281e52d1576"
449 |    },
450 |    "outputs": [],
451 |    "source": [
452 |     "%%time\n",
453 |     "\n",
454 |     "(df\n",
455 |     " .groupby([\"Vehicle Body Type\"])\n",
456 |     " .agg({\"Summons Number\": \"count\"})\n",
457 |     " .rename(columns={\"Summons Number\": \"Count\"})\n",
458 |     " .sort_values([\"Count\"], ascending=False)\n",
459 |     ")"
460 |    ]
461 |   },
462 |   {
463 |    "cell_type": "code",
464 |    "execution_count": null,
465 |    "metadata": {
466 |     "colab": {
467 |      "base_uri": "https://localhost:8080/"
468 |     },
469 |     "id": "X6ASy4mPd_-c",
470 |     "outputId": "d45b6616-a0f8-48ba-e86a-6ad087c4af4c"
471 |    },
472 |    "outputs": [],
473 |    "source": [
474 |     "%%time\n",
475 |     "\n",
476 |     "weekday_names = {\n",
477 |     "    0: \"Monday\",\n",
478 |     "    1: \"Tuesday\",\n",
479 |     "    2: \"Wednesday\",\n",
480 |     "    3: \"Thursday\",\n",
481 |     "    4: \"Friday\",\n",
482 |     "    5: \"Saturday\",\n",
483 |     "    6: \"Sunday\",\n",
484 |     "}\n",
485 |     "\n",
486 |     "df[\"Issue Date\"] = df[\"Issue Date\"].astype(\"datetime64[ms]\")\n",
487 |     "df[\"issue_weekday\"] = df[\"Issue Date\"].dt.weekday.map(weekday_names)\n",
488 |     "\n",
489 |     "df.groupby([\"issue_weekday\"])[\"Summons Number\"].count().sort_values()"
490 |    ]
491 |   },
492 |   {
493 |    "cell_type": "markdown",
494 |    "metadata": {
495 |     "id": "FMUrf6iMeBdM"
496 |    },
497 |    "source": [
498 |     "Much faster! Operations that took 5-20 seconds can now potentially finish in just milliseconds without changing any code."
499 |    ]
500 |   },
501 |   {
502 |    "cell_type": "markdown",
503 |    "metadata": {
504 |     "id": "00m6gUxqeGzk"
505 |    },
506 |    "source": [
507 |     "# Understanding Performance\n",
508 |     "\n",
509 |     "`cudf.pandas` provides profiling utilities to help you better understand performance. With these tools, you can identify which parts of your code ran on the GPU and which parts ran on the CPU.\n",
510 |     "\n",
511 |     "They're accessible in the `cudf.pandas` namespace since the `cudf.pandas` extension was loaded above with `load_ext cudf.pandas`.\n",
512 |     "\n",
513 |     "#### Colab Note\n",
514 |     "If you're running in Colab, the first time you run use the profiler it may take 10+ seconds due to Colab's debugger interacting with the built-in Python function [sys.settrace](https://docs.python.org/3/library/sys.html#sys.settrace) that we use for profiling. For demo purposes, this isn't an issue. Just run the cell again.\n",
515 |     "\n",
516 |     "## Profiling Functionality\n",
517 |     "\n",
518 |     "We can generate a per-function profile:"
519 |    ]
520 |   },
521 |   {
522 |    "cell_type": "code",
523 |    "execution_count": null,
524 |    "metadata": {},
525 |    "outputs": [],
526 |    "source": [
527 |     "len(df)"
528 |    ]
529 |   },
530 |   {
531 |    "cell_type": "code",
532 |    "execution_count": null,
533 |    "metadata": {
534 |     "colab": {
535 |      "base_uri": "https://localhost:8080/",
536 |      "height": 334
537 |     },
538 |     "id": "RFm22OWbeHF2",
539 |     "outputId": "eed4240d-01e1-4007-aced-28c29fe172a5"
540 |    },
541 |    "outputs": [],
542 |    "source": [
543 |     "%%cudf.pandas.profile\n",
544 |     "\n",
545 |     "small_df = pd.DataFrame({'a': [0, 1, 2], 'b': [\"x\", \"y\", \"z\"]})\n",
546 |     "small_df = pd.concat([small_df, small_df])\n",
547 |     "\n",
548 |     "axis = 0\n",
549 |     "for i in range(0, 2):\n",
550 |     "    small_df.min(axis=axis)\n",
551 |     "    axis = 1\n",
552 |     "\n",
553 |     "counts = small_df.groupby(\"a\").b.count()"
554 |    ]
555 |   },
556 |   {
557 |    "cell_type": "code",
558 |    "execution_count": null,
559 |    "metadata": {
560 |     "colab": {
561 |      "base_uri": "https://localhost:8080/",
562 |      "height": 448
563 |     },
564 |     "id": "Syb-_vZweN2H",
565 |     "outputId": "96e1ea1c-f3d7-4792-abcb-9e90c1dcdd1c"
566 |    },
567 |    "outputs": [],
568 |    "source": [
569 |     "%%cudf.pandas.line_profile\n",
570 |     "\n",
571 |     "small_df = pd.DataFrame({'a': [0, 1, 2], 'b': [\"x\", \"y\", \"z\"]})\n",
572 |     "small_df = pd.concat([small_df, small_df])\n",
573 |     "\n",
574 |     "axis = 0\n",
575 |     "for i in range(0, 2):\n",
576 |     "    small_df.min(axis=axis)\n",
577 |     "    axis = 1\n",
578 |     "\n",
579 |     "counts = small_df.groupby(\"a\").b.count()"
580 |    ]
581 |   },
582 |   {
583 |    "cell_type": "markdown",
584 |    "metadata": {
585 |     "id": "VCZ6BxwBpfjL"
586 |    },
587 |    "source": [
588 |     "## Behind the scenes: What's going on here?\n",
589 |     "\n",
590 |     "When you load `cudf.pandas`, Pandas types like `Series` and `DataFrame` are replaced by proxy objects that dispatch operations to cuDF when possible. We can verify that `cudf.pandas` is active by looking at our `pd` variable:"
591 |    ]
592 |   },
593 |   {
594 |    "cell_type": "code",
595 |    "execution_count": null,
596 |    "metadata": {
597 |     "colab": {
598 |      "base_uri": "https://localhost:8080/"
599 |     },
600 |     "id": "jogk5UrgeTkS",
601 |     "outputId": "c4b9b4cd-894b-4379-fde2-fee17138bd36"
602 |    },
603 |    "outputs": [],
604 |    "source": [
605 |     "pd"
606 |    ]
607 |   },
608 |   {
609 |    "cell_type": "markdown",
610 |    "metadata": {
611 |     "id": "vxh70rpDph3I"
612 |    },
613 |    "source": [
614 |     "As a result, all pandas functions, methods, and created objects are proxies:"
615 |    ]
616 |   },
617 |   {
618 |    "cell_type": "code",
619 |    "execution_count": null,
620 |    "metadata": {
621 |     "colab": {
622 |      "base_uri": "https://localhost:8080/"
623 |     },
624 |     "id": "RYTCGl7spgjs",
625 |     "outputId": "ef9a2113-1ea4-4104-c28e-5c97286fb72a"
626 |    },
627 |    "outputs": [],
628 |    "source": [
629 |     "type(pd.read_csv)"
630 |    ]
631 |   },
632 |   {
633 |    "cell_type": "markdown",
634 |    "metadata": {
635 |     "id": "9-NvKu7XplmO"
636 |    },
637 |    "source": [
638 |     "Operations supported by cuDF will be **very** fast:"
639 |    ]
640 |   },
641 |   {
642 |    "cell_type": "code",
643 |    "execution_count": null,
644 |    "metadata": {
645 |     "colab": {
646 |      "base_uri": "https://localhost:8080/"
647 |     },
648 |     "id": "MFvLJo4upnUG",
649 |     "outputId": "3cc21f0c-798f-4589-c8e3-23e52fd052ae"
650 |    },
651 |    "outputs": [],
652 |    "source": [
653 |     "%%time\n",
654 |     "df.count(axis=0)"
655 |    ]
656 |   },
657 |   {
658 |    "cell_type": "markdown",
659 |    "metadata": {
660 |     "id": "Np6VP-wSpomO"
661 |    },
662 |    "source": [
663 |     "Operations not supported by cuDF will be slower, as they fall back to using Pandas (copying data between the CPU and GPU under the hood as needed). For example, cuDF does not currently support the `axis=` parameter to the `count` method. So this operation will run on the CPU and be noticeably slower than the previous one."
664 |    ]
665 |   },
666 |   {
667 |    "cell_type": "code",
668 |    "execution_count": null,
669 |    "metadata": {
670 |     "colab": {
671 |      "base_uri": "https://localhost:8080/"
672 |     },
673 |     "id": "mThydJIYpuha",
674 |     "outputId": "77a843f4-0ead-4b61-a2b3-23952a2dd35d"
675 |    },
676 |    "outputs": [],
677 |    "source": [
678 |     "%%time\n",
679 |     "df.count(axis=1) # This will use pandas, because cuDF doesn't support axis=1 for the .count() method"
680 |    ]
681 |   },
682 |   {
683 |    "cell_type": "markdown",
684 |    "metadata": {
685 |     "id": "tbDVvkP2pyra"
686 |    },
687 |    "source": [
688 |     "But the story doesn't end here. We often need to mix our own code with third-party libraries that other people have written. Many of these libraries accept pandas objects as inputs."
689 |    ]
690 |   },
691 |   {
692 |    "cell_type": "markdown",
693 |    "metadata": {
694 |     "id": "3yK3a-mIp0vr"
695 |    },
696 |    "source": [
697 |     "# Using third-party libraries with cudf.pandas\n",
698 |     "\n",
699 |     "You can pass Pandas objects to third-party libraries when using `cudf.pandas`, just like you would when using regular Pandas.\n",
700 |     "\n",
701 |     "Below, we show an example of using [plotly-express](https://plotly.com/python/plotly-express/) to visualize the data we've been processing:"
702 |    ]
703 |   },
704 |   {
705 |    "cell_type": "markdown",
706 |    "metadata": {
707 |     "id": "H0QwPQcAp2RV"
708 |    },
709 |    "source": [
710 |     "## Visualizing which states have more pickup trucks relative to other vehicles?"
711 |    ]
712 |   },
713 |   {
714 |    "cell_type": "code",
715 |    "execution_count": null,
716 |    "metadata": {
717 |     "colab": {
718 |      "base_uri": "https://localhost:8080/",
719 |      "height": 542
720 |     },
721 |     "id": "Ecs213eEqCd9",
722 |     "outputId": "5c798902-301c-4aaf-dcf2-18ffd863befd"
723 |    },
724 |    "outputs": [],
725 |    "source": [
726 |     "import plotly.express as px\n",
727 |     "\n",
728 |     "df = df.rename(columns={\n",
729 |     "    \"Registration State\": \"reg_state\",\n",
730 |     "    \"Vehicle Body Type\": \"vehicle_type\",\n",
731 |     "})\n",
732 |     "\n",
733 |     "# vehicle counts per state:\n",
734 |     "counts = df.groupby(\"reg_state\").size().sort_index()\n",
735 |     "# vehicles with type \"PICK\" (Pickup Truck)\n",
736 |     "pickup_counts = df.where(df[\"vehicle_type\"] == \"PICK\").groupby(\"reg_state\").size()\n",
737 |     "# percentage of pickup trucks by state:\n",
738 |     "pickup_frac = ((pickup_counts / counts) * 100).rename(\"% Pickup Trucks\")\n",
739 |     "del pickup_frac[\"MB\"]  # (Manitoba is a huge outlier!)\n",
740 |     "\n",
741 |     "# plot the results:\n",
742 |     "pickup_frac = pickup_frac.reset_index()\n",
743 |     "px.choropleth(pickup_frac, locations=\"reg_state\", color=\"% Pickup Trucks\", locationmode=\"USA-states\", scope=\"usa\")"
744 |    ]
745 |   },
746 |   {
747 |    "cell_type": "markdown",
748 |    "metadata": {
749 |     "id": "9bgMrWs5qDG_"
750 |    },
751 |    "source": [
752 |     "## Beyond just passing data: **Accelerating** third-party code\n",
753 |     "\n",
754 |     "Being able to pass these proxy objects to libraries like Plotly is great, but the benefits don't end there.\n",
755 |     "\n",
756 |     "When you enable `cudf.pandas`, pandas operations running **inside the third-party library's functions** will also benefit from GPU acceleration where possible!\n",
757 |     "\n",
758 |     "Below, you can see an image illustrating how `cudf.pandas` can accelerate the pandas backend in Ibis, a library that provides a unified DataFrame API to various backends. We ran this example on a system with an NVIDIA H100 GPU and an Intel Xeon Platinum 8480CL CPU.\n",
759 |     "\n",
760 |     "\n",
761 |     "By loading the `cudf.pandas` extension, pandas operations within Ibis can use the GPU with zero code change. It just works."
762 |    ]
763 |   },
764 |   {
765 |    "cell_type": "markdown",
766 |    "metadata": {
767 |     "id": "8JW2CQL6qEv3"
768 |    },
769 |    "source": [
770 |     "![ibis](https://drive.google.com/uc?id=1uOJq2JtbgVb7tb8qw8a2gG3JRBo72t_H)"
771 |    ]
772 |   },
773 |   {
774 |    "cell_type": "markdown",
775 |    "metadata": {
776 |     "id": "pyVNtGUhtFs5"
777 |    },
778 |    "source": [
779 |     "# Conclusion\n",
780 |     "\n",
781 |     "With `cudf.pandas`, you can keep using pandas as your primary dataframe library. When things start to get a little slow, just load the `cudf.pandas` and run your existing code on a GPU!\n",
782 |     "\n",
783 |     "To learn more, we encourage you to visit [rapids.ai/cudf-pandas](https://rapids.ai/cudf-pandas)."
784 |    ]
785 |   },
786 |   {
787 |    "cell_type": "code",
788 |    "execution_count": null,
789 |    "metadata": {
790 |     "id": "XjELOIf3xykH"
791 |    },
792 |    "outputs": [],
793 |    "source": []
794 |   }
795 |  ],
796 |  "metadata": {
797 |   "accelerator": "GPU",
798 |   "colab": {
799 |    "collapsed_sections": [
800 |     "VmkFv9ZUW37g",
801 |     "H7_9EmGyXUJd",
802 |     "VjFfQLZHXehM",
803 |     "JKBQcT64XlMr"
804 |    ],
805 |    "gpuType": "T4",
806 |    "provenance": []
807 |   },
808 |   "kernelspec": {
809 |    "display_name": "Python 3 (ipykernel)",
810 |    "language": "python",
811 |    "name": "python3"
812 |   },
813 |   "language_info": {
814 |    "codemirror_mode": {
815 |     "name": "ipython",
816 |     "version": 3
817 |    },
818 |    "file_extension": ".py",
819 |    "mimetype": "text/x-python",
820 |    "name": "python",
821 |    "nbconvert_exporter": "python",
822 |    "pygments_lexer": "ipython3",
823 |    "version": "3.10.13"
824 |   }
825 |  },
826 |  "nbformat": 4,
827 |  "nbformat_minor": 4
828 | }
829 | 


--------------------------------------------------------------------------------
/code/cupy-interop.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "id": "8e5e6878",
   6 |    "metadata": {},
   7 |    "source": [
   8 |     "# Interoperability between cuDF and CuPy\n",
   9 |     "\n",
  10 |     "This notebook provides introductory examples of how you can use cuDF and CuPy together to take advantage of CuPy array functionality (such as advanced linear algebra operations)."
  11 |    ]
  12 |   },
  13 |   {
  14 |    "cell_type": "code",
  15 |    "execution_count": 1,
  16 |    "id": "8b2d45c3",
  17 |    "metadata": {},
  18 |    "outputs": [],
  19 |    "source": [
  20 |     "import timeit\n",
  21 |     "\n",
  22 |     "import cupy as cp\n",
  23 |     "from packaging import version\n",
  24 |     "\n",
  25 |     "import cudf\n",
  26 |     "\n",
  27 |     "if version.parse(cp.__version__) >= version.parse(\"10.0.0\"):\n",
  28 |     "    cupy_from_dlpack = cp.from_dlpack\n",
  29 |     "else:\n",
  30 |     "    cupy_from_dlpack = cp.fromDlpack"
  31 |    ]
  32 |   },
  33 |   {
  34 |    "cell_type": "markdown",
  35 |    "id": "e7e64b1a",
  36 |    "metadata": {},
  37 |    "source": [
  38 |     "### Converting a cuDF DataFrame to a CuPy Array\n",
  39 |     "\n",
  40 |     "If we want to convert a cuDF DataFrame to a CuPy ndarray, There are multiple ways to do it:\n",
  41 |     "\n",
  42 |     "1. We can use the [dlpack](https://github.com/dmlc/dlpack) interface.\n",
  43 |     "\n",
  44 |     "2. We can also use `DataFrame.values`.\n",
  45 |     "\n",
  46 |     "3. We can also convert via the [CUDA array interface](https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html) by using cuDF's `to_cupy` functionality."
  47 |    ]
  48 |   },
  49 |   {
  50 |    "cell_type": "code",
  51 |    "execution_count": 2,
  52 |    "id": "45c482ab",
  53 |    "metadata": {},
  54 |    "outputs": [
  55 |     {
  56 |      "name": "stdout",
  57 |      "output_type": "stream",
  58 |      "text": [
  59 |       "123 µs ± 658 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n",
  60 |       "379 µs ± 3.04 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n",
  61 |       "386 µs ± 5.01 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
  62 |      ]
  63 |     }
  64 |    ],
  65 |    "source": [
  66 |     "nelem = 10000\n",
  67 |     "df = cudf.DataFrame(\n",
  68 |     "    {\n",
  69 |     "        \"a\": range(nelem),\n",
  70 |     "        \"b\": range(500, nelem + 500),\n",
  71 |     "        \"c\": range(1000, nelem + 1000),\n",
  72 |     "    }\n",
  73 |     ")\n",
  74 |     "\n",
  75 |     "%timeit arr_cupy = cupy_from_dlpack(df.to_dlpack())\n",
  76 |     "%timeit arr_cupy = df.values\n",
  77 |     "%timeit arr_cupy = df.to_cupy()"
  78 |    ]
  79 |   },
  80 |   {
  81 |    "cell_type": "code",
  82 |    "execution_count": 3,
  83 |    "id": "a565effc",
  84 |    "metadata": {},
  85 |    "outputs": [
  86 |     {
  87 |      "data": {
  88 |       "text/plain": [
  89 |        "array([[    0,   500,  1000],\n",
  90 |        "       [    1,   501,  1001],\n",
  91 |        "       [    2,   502,  1002],\n",
  92 |        "       ...,\n",
  93 |        "       [ 9997, 10497, 10997],\n",
  94 |        "       [ 9998, 10498, 10998],\n",
  95 |        "       [ 9999, 10499, 10999]])"
  96 |       ]
  97 |      },
  98 |      "execution_count": 3,
  99 |      "metadata": {},
 100 |      "output_type": "execute_result"
 101 |     }
 102 |    ],
 103 |    "source": [
 104 |     "arr_cupy = cupy_from_dlpack(df.to_dlpack())\n",
 105 |     "arr_cupy"
 106 |    ]
 107 |   },
 108 |   {
 109 |    "cell_type": "markdown",
 110 |    "id": "0759ab29",
 111 |    "metadata": {},
 112 |    "source": [
 113 |     "### Converting a cuDF Series to a CuPy Array"
 114 |    ]
 115 |   },
 116 |   {
 117 |    "cell_type": "markdown",
 118 |    "id": "4f35ffbd",
 119 |    "metadata": {},
 120 |    "source": [
 121 |     "There are also multiple ways to convert a cuDF Series to a CuPy array:\n",
 122 |     "\n",
 123 |     "1. We can pass the Series to `cupy.asarray` as cuDF Series exposes [`__cuda_array_interface__`](https://docs-cupy.chainer.org/en/stable/reference/interoperability.html).\n",
 124 |     "2. We can leverage the dlpack interface `to_dlpack()`. \n",
 125 |     "3. We can also use `Series.values`"
 126 |    ]
 127 |   },
 128 |   {
 129 |    "cell_type": "code",
 130 |    "execution_count": 4,
 131 |    "id": "8f97f304",
 132 |    "metadata": {},
 133 |    "outputs": [
 134 |     {
 135 |      "name": "stdout",
 136 |      "output_type": "stream",
 137 |      "text": [
 138 |       "40.2 µs ± 107 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n",
 139 |       "124 µs ± 918 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n",
 140 |       "105 µs ± 318 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
 141 |      ]
 142 |     }
 143 |    ],
 144 |    "source": [
 145 |     "col = \"a\"\n",
 146 |     "\n",
 147 |     "%timeit cola_cupy = cp.asarray(df[col])\n",
 148 |     "%timeit cola_cupy = cupy_from_dlpack(df[col].to_dlpack())\n",
 149 |     "%timeit cola_cupy = df[col].values"
 150 |    ]
 151 |   },
 152 |   {
 153 |    "cell_type": "code",
 154 |    "execution_count": 5,
 155 |    "id": "f96d5676",
 156 |    "metadata": {},
 157 |    "outputs": [
 158 |     {
 159 |      "data": {
 160 |       "text/plain": [
 161 |        "array([   0,    1,    2, ..., 9997, 9998, 9999])"
 162 |       ]
 163 |      },
 164 |      "execution_count": 5,
 165 |      "metadata": {},
 166 |      "output_type": "execute_result"
 167 |     }
 168 |    ],
 169 |    "source": [
 170 |     "cola_cupy = cp.asarray(df[col])\n",
 171 |     "cola_cupy"
 172 |    ]
 173 |   },
 174 |   {
 175 |    "cell_type": "markdown",
 176 |    "id": "c36e5b88",
 177 |    "metadata": {},
 178 |    "source": [
 179 |     "From here, we can proceed with normal CuPy workflows, such as reshaping the array, getting the diagonal, or calculating the norm."
 180 |    ]
 181 |   },
 182 |   {
 183 |    "cell_type": "code",
 184 |    "execution_count": 6,
 185 |    "id": "2a7ae43f",
 186 |    "metadata": {},
 187 |    "outputs": [
 188 |     {
 189 |      "data": {
 190 |       "text/plain": [
 191 |        "array([[   0,    1,    2, ...,  197,  198,  199],\n",
 192 |        "       [ 200,  201,  202, ...,  397,  398,  399],\n",
 193 |        "       [ 400,  401,  402, ...,  597,  598,  599],\n",
 194 |        "       ...,\n",
 195 |        "       [9400, 9401, 9402, ..., 9597, 9598, 9599],\n",
 196 |        "       [9600, 9601, 9602, ..., 9797, 9798, 9799],\n",
 197 |        "       [9800, 9801, 9802, ..., 9997, 9998, 9999]])"
 198 |       ]
 199 |      },
 200 |      "execution_count": 6,
 201 |      "metadata": {},
 202 |      "output_type": "execute_result"
 203 |     }
 204 |    ],
 205 |    "source": [
 206 |     "reshaped_arr = cola_cupy.reshape(50, 200)\n",
 207 |     "reshaped_arr"
 208 |    ]
 209 |   },
 210 |   {
 211 |    "cell_type": "code",
 212 |    "execution_count": 7,
 213 |    "id": "b442a30c",
 214 |    "metadata": {},
 215 |    "outputs": [
 216 |     {
 217 |      "data": {
 218 |       "text/plain": [
 219 |        "array([   0,  201,  402,  603,  804, 1005, 1206, 1407, 1608, 1809, 2010,\n",
 220 |        "       2211, 2412, 2613, 2814, 3015, 3216, 3417, 3618, 3819, 4020, 4221,\n",
 221 |        "       4422, 4623, 4824, 5025, 5226, 5427, 5628, 5829, 6030, 6231, 6432,\n",
 222 |        "       6633, 6834, 7035, 7236, 7437, 7638, 7839, 8040, 8241, 8442, 8643,\n",
 223 |        "       8844, 9045, 9246, 9447, 9648, 9849])"
 224 |       ]
 225 |      },
 226 |      "execution_count": 7,
 227 |      "metadata": {},
 228 |      "output_type": "execute_result"
 229 |     }
 230 |    ],
 231 |    "source": [
 232 |     "reshaped_arr.diagonal()"
 233 |    ]
 234 |   },
 235 |   {
 236 |    "cell_type": "code",
 237 |    "execution_count": 8,
 238 |    "id": "be7f4d32",
 239 |    "metadata": {},
 240 |    "outputs": [
 241 |     {
 242 |      "data": {
 243 |       "text/plain": [
 244 |        "array(577306.967739)"
 245 |       ]
 246 |      },
 247 |      "execution_count": 8,
 248 |      "metadata": {},
 249 |      "output_type": "execute_result"
 250 |     }
 251 |    ],
 252 |    "source": [
 253 |     "cp.linalg.norm(reshaped_arr)"
 254 |    ]
 255 |   },
 256 |   {
 257 |    "cell_type": "markdown",
 258 |    "id": "b353bded",
 259 |    "metadata": {},
 260 |    "source": [
 261 |     "### Converting a CuPy Array to a cuDF DataFrame\n",
 262 |     "\n",
 263 |     "We can also convert a CuPy ndarray to a cuDF DataFrame. Like before, there are multiple ways to do it:\n",
 264 |     "\n",
 265 |     "1. **Easiest;** We can directly use the `DataFrame` constructor.\n",
 266 |     "\n",
 267 |     "2. We can use CUDA array interface with the `DataFrame` constructor.\n",
 268 |     "\n",
 269 |     "3. We can also use the [dlpack](https://github.com/dmlc/dlpack) interface.\n",
 270 |     "\n",
 271 |     "For the latter two cases, we'll need to make sure that our CuPy array is Fortran contiguous in memory (if it's not already). We can either transpose the array or simply coerce it to be Fortran contiguous beforehand."
 272 |    ]
 273 |   },
 274 |   {
 275 |    "cell_type": "code",
 276 |    "execution_count": 9,
 277 |    "id": "8887b253",
 278 |    "metadata": {},
 279 |    "outputs": [
 280 |     {
 281 |      "name": "stdout",
 282 |      "output_type": "stream",
 283 |      "text": [
 284 |       "16.7 ms ± 102 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
 285 |      ]
 286 |     }
 287 |    ],
 288 |    "source": [
 289 |     "%timeit reshaped_df = cudf.DataFrame(reshaped_arr)"
 290 |    ]
 291 |   },
 292 |   {
 293 |    "cell_type": "code",
 294 |    "execution_count": 10,
 295 |    "id": "08ec4ffa",
 296 |    "metadata": {},
 297 |    "outputs": [
 298 |     {
 299 |      "data": {
 300 |       "text/html": [
 301 |        "<div>\n",
 302 |        "<style scoped>\n",
 303 |        "    .dataframe tbody tr th:only-of-type {\n",
 304 |        "        vertical-align: middle;\n",
 305 |        "    }\n",
 306 |        "\n",
 307 |        "    .dataframe tbody tr th {\n",
 308 |        "        vertical-align: top;\n",
 309 |        "    }\n",
 310 |        "\n",
 311 |        "    .dataframe thead th {\n",
 312 |        "        text-align: right;\n",
 313 |        "    }\n",
 314 |        "</style>\n",
 315 |        "<table border=\"1\" class=\"dataframe\">\n",
 316 |        "  <thead>\n",
 317 |        "    <tr style=\"text-align: right;\">\n",
 318 |        "      <th></th>\n",
 319 |        "      <th>0</th>\n",
 320 |        "      <th>1</th>\n",
 321 |        "      <th>2</th>\n",
 322 |        "      <th>3</th>\n",
 323 |        "      <th>4</th>\n",
 324 |        "      <th>5</th>\n",
 325 |        "      <th>6</th>\n",
 326 |        "      <th>7</th>\n",
 327 |        "      <th>8</th>\n",
 328 |        "      <th>9</th>\n",
 329 |        "      <th>...</th>\n",
 330 |        "      <th>190</th>\n",
 331 |        "      <th>191</th>\n",
 332 |        "      <th>192</th>\n",
 333 |        "      <th>193</th>\n",
 334 |        "      <th>194</th>\n",
 335 |        "      <th>195</th>\n",
 336 |        "      <th>196</th>\n",
 337 |        "      <th>197</th>\n",
 338 |        "      <th>198</th>\n",
 339 |        "      <th>199</th>\n",
 340 |        "    </tr>\n",
 341 |        "  </thead>\n",
 342 |        "  <tbody>\n",
 343 |        "    <tr>\n",
 344 |        "      <th>0</th>\n",
 345 |        "      <td>0</td>\n",
 346 |        "      <td>1</td>\n",
 347 |        "      <td>2</td>\n",
 348 |        "      <td>3</td>\n",
 349 |        "      <td>4</td>\n",
 350 |        "      <td>5</td>\n",
 351 |        "      <td>6</td>\n",
 352 |        "      <td>7</td>\n",
 353 |        "      <td>8</td>\n",
 354 |        "      <td>9</td>\n",
 355 |        "      <td>...</td>\n",
 356 |        "      <td>190</td>\n",
 357 |        "      <td>191</td>\n",
 358 |        "      <td>192</td>\n",
 359 |        "      <td>193</td>\n",
 360 |        "      <td>194</td>\n",
 361 |        "      <td>195</td>\n",
 362 |        "      <td>196</td>\n",
 363 |        "      <td>197</td>\n",
 364 |        "      <td>198</td>\n",
 365 |        "      <td>199</td>\n",
 366 |        "    </tr>\n",
 367 |        "    <tr>\n",
 368 |        "      <th>1</th>\n",
 369 |        "      <td>200</td>\n",
 370 |        "      <td>201</td>\n",
 371 |        "      <td>202</td>\n",
 372 |        "      <td>203</td>\n",
 373 |        "      <td>204</td>\n",
 374 |        "      <td>205</td>\n",
 375 |        "      <td>206</td>\n",
 376 |        "      <td>207</td>\n",
 377 |        "      <td>208</td>\n",
 378 |        "      <td>209</td>\n",
 379 |        "      <td>...</td>\n",
 380 |        "      <td>390</td>\n",
 381 |        "      <td>391</td>\n",
 382 |        "      <td>392</td>\n",
 383 |        "      <td>393</td>\n",
 384 |        "      <td>394</td>\n",
 385 |        "      <td>395</td>\n",
 386 |        "      <td>396</td>\n",
 387 |        "      <td>397</td>\n",
 388 |        "      <td>398</td>\n",
 389 |        "      <td>399</td>\n",
 390 |        "    </tr>\n",
 391 |        "    <tr>\n",
 392 |        "      <th>2</th>\n",
 393 |        "      <td>400</td>\n",
 394 |        "      <td>401</td>\n",
 395 |        "      <td>402</td>\n",
 396 |        "      <td>403</td>\n",
 397 |        "      <td>404</td>\n",
 398 |        "      <td>405</td>\n",
 399 |        "      <td>406</td>\n",
 400 |        "      <td>407</td>\n",
 401 |        "      <td>408</td>\n",
 402 |        "      <td>409</td>\n",
 403 |        "      <td>...</td>\n",
 404 |        "      <td>590</td>\n",
 405 |        "      <td>591</td>\n",
 406 |        "      <td>592</td>\n",
 407 |        "      <td>593</td>\n",
 408 |        "      <td>594</td>\n",
 409 |        "      <td>595</td>\n",
 410 |        "      <td>596</td>\n",
 411 |        "      <td>597</td>\n",
 412 |        "      <td>598</td>\n",
 413 |        "      <td>599</td>\n",
 414 |        "    </tr>\n",
 415 |        "    <tr>\n",
 416 |        "      <th>3</th>\n",
 417 |        "      <td>600</td>\n",
 418 |        "      <td>601</td>\n",
 419 |        "      <td>602</td>\n",
 420 |        "      <td>603</td>\n",
 421 |        "      <td>604</td>\n",
 422 |        "      <td>605</td>\n",
 423 |        "      <td>606</td>\n",
 424 |        "      <td>607</td>\n",
 425 |        "      <td>608</td>\n",
 426 |        "      <td>609</td>\n",
 427 |        "      <td>...</td>\n",
 428 |        "      <td>790</td>\n",
 429 |        "      <td>791</td>\n",
 430 |        "      <td>792</td>\n",
 431 |        "      <td>793</td>\n",
 432 |        "      <td>794</td>\n",
 433 |        "      <td>795</td>\n",
 434 |        "      <td>796</td>\n",
 435 |        "      <td>797</td>\n",
 436 |        "      <td>798</td>\n",
 437 |        "      <td>799</td>\n",
 438 |        "    </tr>\n",
 439 |        "    <tr>\n",
 440 |        "      <th>4</th>\n",
 441 |        "      <td>800</td>\n",
 442 |        "      <td>801</td>\n",
 443 |        "      <td>802</td>\n",
 444 |        "      <td>803</td>\n",
 445 |        "      <td>804</td>\n",
 446 |        "      <td>805</td>\n",
 447 |        "      <td>806</td>\n",
 448 |        "      <td>807</td>\n",
 449 |        "      <td>808</td>\n",
 450 |        "      <td>809</td>\n",
 451 |        "      <td>...</td>\n",
 452 |        "      <td>990</td>\n",
 453 |        "      <td>991</td>\n",
 454 |        "      <td>992</td>\n",
 455 |        "      <td>993</td>\n",
 456 |        "      <td>994</td>\n",
 457 |        "      <td>995</td>\n",
 458 |        "      <td>996</td>\n",
 459 |        "      <td>997</td>\n",
 460 |        "      <td>998</td>\n",
 461 |        "      <td>999</td>\n",
 462 |        "    </tr>\n",
 463 |        "  </tbody>\n",
 464 |        "</table>\n",
 465 |        "<p>5 rows × 200 columns</p>\n",
 466 |        "</div>"
 467 |       ],
 468 |       "text/plain": [
 469 |        "   0    1    2    3    4    5    6    7    8    9    ...  190  191  192  193  \\\n",
 470 |        "0    0    1    2    3    4    5    6    7    8    9  ...  190  191  192  193   \n",
 471 |        "1  200  201  202  203  204  205  206  207  208  209  ...  390  391  392  393   \n",
 472 |        "2  400  401  402  403  404  405  406  407  408  409  ...  590  591  592  593   \n",
 473 |        "3  600  601  602  603  604  605  606  607  608  609  ...  790  791  792  793   \n",
 474 |        "4  800  801  802  803  804  805  806  807  808  809  ...  990  991  992  993   \n",
 475 |        "\n",
 476 |        "   194  195  196  197  198  199  \n",
 477 |        "0  194  195  196  197  198  199  \n",
 478 |        "1  394  395  396  397  398  399  \n",
 479 |        "2  594  595  596  597  598  599  \n",
 480 |        "3  794  795  796  797  798  799  \n",
 481 |        "4  994  995  996  997  998  999  \n",
 482 |        "\n",
 483 |        "[5 rows x 200 columns]"
 484 |       ]
 485 |      },
 486 |      "execution_count": 10,
 487 |      "metadata": {},
 488 |      "output_type": "execute_result"
 489 |     }
 490 |    ],
 491 |    "source": [
 492 |     "reshaped_df = cudf.DataFrame(reshaped_arr)\n",
 493 |     "reshaped_df.head()"
 494 |    ]
 495 |   },
 496 |   {
 497 |    "cell_type": "markdown",
 498 |    "id": "6804d291",
 499 |    "metadata": {},
 500 |    "source": [
 501 |     "We can check whether our array is Fortran contiguous by using cupy.isfortran or looking at the [flags](https://docs-cupy.chainer.org/en/stable/reference/generated/cupy.ndarray.html#cupy.ndarray.flags) of the array."
 502 |    ]
 503 |   },
 504 |   {
 505 |    "cell_type": "code",
 506 |    "execution_count": 11,
 507 |    "id": "65b8bd0d",
 508 |    "metadata": {},
 509 |    "outputs": [
 510 |     {
 511 |      "data": {
 512 |       "text/plain": [
 513 |        "False"
 514 |       ]
 515 |      },
 516 |      "execution_count": 11,
 517 |      "metadata": {},
 518 |      "output_type": "execute_result"
 519 |     }
 520 |    ],
 521 |    "source": [
 522 |     "cp.isfortran(reshaped_arr)"
 523 |    ]
 524 |   },
 525 |   {
 526 |    "cell_type": "markdown",
 527 |    "id": "151982ad",
 528 |    "metadata": {},
 529 |    "source": [
 530 |     "In this case, we'll need to convert it before going to a cuDF DataFrame. In the next two cells, we create the DataFrame by leveraging dlpack and the CUDA array interface, respectively."
 531 |    ]
 532 |   },
 533 |   {
 534 |    "cell_type": "code",
 535 |    "execution_count": 12,
 536 |    "id": "27b2f563",
 537 |    "metadata": {},
 538 |    "outputs": [
 539 |     {
 540 |      "name": "stdout",
 541 |      "output_type": "stream",
 542 |      "text": [
 543 |       "6.26 ms ± 30.8 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
 544 |      ]
 545 |     }
 546 |    ],
 547 |    "source": [
 548 |     "%%timeit\n",
 549 |     "\n",
 550 |     "fortran_arr = cp.asfortranarray(reshaped_arr)\n",
 551 |     "reshaped_df = cudf.DataFrame(fortran_arr)"
 552 |    ]
 553 |   },
 554 |   {
 555 |    "cell_type": "code",
 556 |    "execution_count": 13,
 557 |    "id": "0a0cc290",
 558 |    "metadata": {},
 559 |    "outputs": [
 560 |     {
 561 |      "name": "stdout",
 562 |      "output_type": "stream",
 563 |      "text": [
 564 |       "4.65 ms ± 82.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
 565 |      ]
 566 |     }
 567 |    ],
 568 |    "source": [
 569 |     "%%timeit\n",
 570 |     "\n",
 571 |     "fortran_arr = cp.asfortranarray(reshaped_arr)\n",
 572 |     "reshaped_df = cudf.from_dlpack(fortran_arr.toDlpack())"
 573 |    ]
 574 |   },
 575 |   {
 576 |    "cell_type": "code",
 577 |    "execution_count": 14,
 578 |    "id": "0d2c5beb",
 579 |    "metadata": {},
 580 |    "outputs": [
 581 |     {
 582 |      "data": {
 583 |       "text/html": [
 584 |        "<div>\n",
 585 |        "<style scoped>\n",
 586 |        "    .dataframe tbody tr th:only-of-type {\n",
 587 |        "        vertical-align: middle;\n",
 588 |        "    }\n",
 589 |        "\n",
 590 |        "    .dataframe tbody tr th {\n",
 591 |        "        vertical-align: top;\n",
 592 |        "    }\n",
 593 |        "\n",
 594 |        "    .dataframe thead th {\n",
 595 |        "        text-align: right;\n",
 596 |        "    }\n",
 597 |        "</style>\n",
 598 |        "<table border=\"1\" class=\"dataframe\">\n",
 599 |        "  <thead>\n",
 600 |        "    <tr style=\"text-align: right;\">\n",
 601 |        "      <th></th>\n",
 602 |        "      <th>0</th>\n",
 603 |        "      <th>1</th>\n",
 604 |        "      <th>2</th>\n",
 605 |        "      <th>3</th>\n",
 606 |        "      <th>4</th>\n",
 607 |        "      <th>5</th>\n",
 608 |        "      <th>6</th>\n",
 609 |        "      <th>7</th>\n",
 610 |        "      <th>8</th>\n",
 611 |        "      <th>9</th>\n",
 612 |        "      <th>...</th>\n",
 613 |        "      <th>190</th>\n",
 614 |        "      <th>191</th>\n",
 615 |        "      <th>192</th>\n",
 616 |        "      <th>193</th>\n",
 617 |        "      <th>194</th>\n",
 618 |        "      <th>195</th>\n",
 619 |        "      <th>196</th>\n",
 620 |        "      <th>197</th>\n",
 621 |        "      <th>198</th>\n",
 622 |        "      <th>199</th>\n",
 623 |        "    </tr>\n",
 624 |        "  </thead>\n",
 625 |        "  <tbody>\n",
 626 |        "    <tr>\n",
 627 |        "      <th>0</th>\n",
 628 |        "      <td>0</td>\n",
 629 |        "      <td>1</td>\n",
 630 |        "      <td>2</td>\n",
 631 |        "      <td>3</td>\n",
 632 |        "      <td>4</td>\n",
 633 |        "      <td>5</td>\n",
 634 |        "      <td>6</td>\n",
 635 |        "      <td>7</td>\n",
 636 |        "      <td>8</td>\n",
 637 |        "      <td>9</td>\n",
 638 |        "      <td>...</td>\n",
 639 |        "      <td>190</td>\n",
 640 |        "      <td>191</td>\n",
 641 |        "      <td>192</td>\n",
 642 |        "      <td>193</td>\n",
 643 |        "      <td>194</td>\n",
 644 |        "      <td>195</td>\n",
 645 |        "      <td>196</td>\n",
 646 |        "      <td>197</td>\n",
 647 |        "      <td>198</td>\n",
 648 |        "      <td>199</td>\n",
 649 |        "    </tr>\n",
 650 |        "    <tr>\n",
 651 |        "      <th>1</th>\n",
 652 |        "      <td>200</td>\n",
 653 |        "      <td>201</td>\n",
 654 |        "      <td>202</td>\n",
 655 |        "      <td>203</td>\n",
 656 |        "      <td>204</td>\n",
 657 |        "      <td>205</td>\n",
 658 |        "      <td>206</td>\n",
 659 |        "      <td>207</td>\n",
 660 |        "      <td>208</td>\n",
 661 |        "      <td>209</td>\n",
 662 |        "      <td>...</td>\n",
 663 |        "      <td>390</td>\n",
 664 |        "      <td>391</td>\n",
 665 |        "      <td>392</td>\n",
 666 |        "      <td>393</td>\n",
 667 |        "      <td>394</td>\n",
 668 |        "      <td>395</td>\n",
 669 |        "      <td>396</td>\n",
 670 |        "      <td>397</td>\n",
 671 |        "      <td>398</td>\n",
 672 |        "      <td>399</td>\n",
 673 |        "    </tr>\n",
 674 |        "    <tr>\n",
 675 |        "      <th>2</th>\n",
 676 |        "      <td>400</td>\n",
 677 |        "      <td>401</td>\n",
 678 |        "      <td>402</td>\n",
 679 |        "      <td>403</td>\n",
 680 |        "      <td>404</td>\n",
 681 |        "      <td>405</td>\n",
 682 |        "      <td>406</td>\n",
 683 |        "      <td>407</td>\n",
 684 |        "      <td>408</td>\n",
 685 |        "      <td>409</td>\n",
 686 |        "      <td>...</td>\n",
 687 |        "      <td>590</td>\n",
 688 |        "      <td>591</td>\n",
 689 |        "      <td>592</td>\n",
 690 |        "      <td>593</td>\n",
 691 |        "      <td>594</td>\n",
 692 |        "      <td>595</td>\n",
 693 |        "      <td>596</td>\n",
 694 |        "      <td>597</td>\n",
 695 |        "      <td>598</td>\n",
 696 |        "      <td>599</td>\n",
 697 |        "    </tr>\n",
 698 |        "    <tr>\n",
 699 |        "      <th>3</th>\n",
 700 |        "      <td>600</td>\n",
 701 |        "      <td>601</td>\n",
 702 |        "      <td>602</td>\n",
 703 |        "      <td>603</td>\n",
 704 |        "      <td>604</td>\n",
 705 |        "      <td>605</td>\n",
 706 |        "      <td>606</td>\n",
 707 |        "      <td>607</td>\n",
 708 |        "      <td>608</td>\n",
 709 |        "      <td>609</td>\n",
 710 |        "      <td>...</td>\n",
 711 |        "      <td>790</td>\n",
 712 |        "      <td>791</td>\n",
 713 |        "      <td>792</td>\n",
 714 |        "      <td>793</td>\n",
 715 |        "      <td>794</td>\n",
 716 |        "      <td>795</td>\n",
 717 |        "      <td>796</td>\n",
 718 |        "      <td>797</td>\n",
 719 |        "      <td>798</td>\n",
 720 |        "      <td>799</td>\n",
 721 |        "    </tr>\n",
 722 |        "    <tr>\n",
 723 |        "      <th>4</th>\n",
 724 |        "      <td>800</td>\n",
 725 |        "      <td>801</td>\n",
 726 |        "      <td>802</td>\n",
 727 |        "      <td>803</td>\n",
 728 |        "      <td>804</td>\n",
 729 |        "      <td>805</td>\n",
 730 |        "      <td>806</td>\n",
 731 |        "      <td>807</td>\n",
 732 |        "      <td>808</td>\n",
 733 |        "      <td>809</td>\n",
 734 |        "      <td>...</td>\n",
 735 |        "      <td>990</td>\n",
 736 |        "      <td>991</td>\n",
 737 |        "      <td>992</td>\n",
 738 |        "      <td>993</td>\n",
 739 |        "      <td>994</td>\n",
 740 |        "      <td>995</td>\n",
 741 |        "      <td>996</td>\n",
 742 |        "      <td>997</td>\n",
 743 |        "      <td>998</td>\n",
 744 |        "      <td>999</td>\n",
 745 |        "    </tr>\n",
 746 |        "  </tbody>\n",
 747 |        "</table>\n",
 748 |        "<p>5 rows × 200 columns</p>\n",
 749 |        "</div>"
 750 |       ],
 751 |       "text/plain": [
 752 |        "   0    1    2    3    4    5    6    7    8    9    ...  190  191  192  193  \\\n",
 753 |        "0    0    1    2    3    4    5    6    7    8    9  ...  190  191  192  193   \n",
 754 |        "1  200  201  202  203  204  205  206  207  208  209  ...  390  391  392  393   \n",
 755 |        "2  400  401  402  403  404  405  406  407  408  409  ...  590  591  592  593   \n",
 756 |        "3  600  601  602  603  604  605  606  607  608  609  ...  790  791  792  793   \n",
 757 |        "4  800  801  802  803  804  805  806  807  808  809  ...  990  991  992  993   \n",
 758 |        "\n",
 759 |        "   194  195  196  197  198  199  \n",
 760 |        "0  194  195  196  197  198  199  \n",
 761 |        "1  394  395  396  397  398  399  \n",
 762 |        "2  594  595  596  597  598  599  \n",
 763 |        "3  794  795  796  797  798  799  \n",
 764 |        "4  994  995  996  997  998  999  \n",
 765 |        "\n",
 766 |        "[5 rows x 200 columns]"
 767 |       ]
 768 |      },
 769 |      "execution_count": 14,
 770 |      "metadata": {},
 771 |      "output_type": "execute_result"
 772 |     }
 773 |    ],
 774 |    "source": [
 775 |     "fortran_arr = cp.asfortranarray(reshaped_arr)\n",
 776 |     "reshaped_df = cudf.DataFrame(fortran_arr)\n",
 777 |     "reshaped_df.head()"
 778 |    ]
 779 |   },
 780 |   {
 781 |    "cell_type": "markdown",
 782 |    "id": "395e2bba",
 783 |    "metadata": {},
 784 |    "source": [
 785 |     "### Converting a CuPy Array to a cuDF Series\n",
 786 |     "\n",
 787 |     "To convert an array to a Series, we can directly pass the array to the `Series` constructor."
 788 |    ]
 789 |   },
 790 |   {
 791 |    "cell_type": "code",
 792 |    "execution_count": 15,
 793 |    "id": "d8518208",
 794 |    "metadata": {},
 795 |    "outputs": [
 796 |     {
 797 |      "data": {
 798 |       "text/plain": [
 799 |        "0      0\n",
 800 |        "1    201\n",
 801 |        "2    402\n",
 802 |        "3    603\n",
 803 |        "4    804\n",
 804 |        "dtype: int64"
 805 |       ]
 806 |      },
 807 |      "execution_count": 15,
 808 |      "metadata": {},
 809 |      "output_type": "execute_result"
 810 |     }
 811 |    ],
 812 |    "source": [
 813 |     "cudf.Series(reshaped_arr.diagonal()).head()"
 814 |    ]
 815 |   },
 816 |   {
 817 |    "cell_type": "markdown",
 818 |    "id": "7e159619",
 819 |    "metadata": {},
 820 |    "source": [
 821 |     "### Interweaving CuDF and CuPy for Smooth PyData Workflows\n",
 822 |     "\n",
 823 |     "RAPIDS libraries and the entire GPU PyData ecosystem are developing quickly, but sometimes a one library may not have the functionality you need. One example of this might be taking the row-wise sum (or mean) of a Pandas DataFrame. cuDF's support for row-wise operations isn't mature, so you'd need to either transpose the DataFrame or write a UDF and explicitly calculate the sum across each row. Transposing could lead to hundreds of thousands of columns (which cuDF wouldn't perform well with) depending on your data's shape, and writing a UDF can be time intensive.\n",
 824 |     "\n",
 825 |     "By leveraging the interoperability of the GPU PyData ecosystem, this operation becomes very easy. Let's take the row-wise sum of our previously reshaped cuDF DataFrame."
 826 |    ]
 827 |   },
 828 |   {
 829 |    "cell_type": "code",
 830 |    "execution_count": 16,
 831 |    "id": "2bb8ed81",
 832 |    "metadata": {},
 833 |    "outputs": [
 834 |     {
 835 |      "data": {
 836 |       "text/html": [
 837 |        "<div>\n",
 838 |        "<style scoped>\n",
 839 |        "    .dataframe tbody tr th:only-of-type {\n",
 840 |        "        vertical-align: middle;\n",
 841 |        "    }\n",
 842 |        "\n",
 843 |        "    .dataframe tbody tr th {\n",
 844 |        "        vertical-align: top;\n",
 845 |        "    }\n",
 846 |        "\n",
 847 |        "    .dataframe thead th {\n",
 848 |        "        text-align: right;\n",
 849 |        "    }\n",
 850 |        "</style>\n",
 851 |        "<table border=\"1\" class=\"dataframe\">\n",
 852 |        "  <thead>\n",
 853 |        "    <tr style=\"text-align: right;\">\n",
 854 |        "      <th></th>\n",
 855 |        "      <th>0</th>\n",
 856 |        "      <th>1</th>\n",
 857 |        "      <th>2</th>\n",
 858 |        "      <th>3</th>\n",
 859 |        "      <th>4</th>\n",
 860 |        "      <th>5</th>\n",
 861 |        "      <th>6</th>\n",
 862 |        "      <th>7</th>\n",
 863 |        "      <th>8</th>\n",
 864 |        "      <th>9</th>\n",
 865 |        "      <th>...</th>\n",
 866 |        "      <th>190</th>\n",
 867 |        "      <th>191</th>\n",
 868 |        "      <th>192</th>\n",
 869 |        "      <th>193</th>\n",
 870 |        "      <th>194</th>\n",
 871 |        "      <th>195</th>\n",
 872 |        "      <th>196</th>\n",
 873 |        "      <th>197</th>\n",
 874 |        "      <th>198</th>\n",
 875 |        "      <th>199</th>\n",
 876 |        "    </tr>\n",
 877 |        "  </thead>\n",
 878 |        "  <tbody>\n",
 879 |        "    <tr>\n",
 880 |        "      <th>0</th>\n",
 881 |        "      <td>0</td>\n",
 882 |        "      <td>1</td>\n",
 883 |        "      <td>2</td>\n",
 884 |        "      <td>3</td>\n",
 885 |        "      <td>4</td>\n",
 886 |        "      <td>5</td>\n",
 887 |        "      <td>6</td>\n",
 888 |        "      <td>7</td>\n",
 889 |        "      <td>8</td>\n",
 890 |        "      <td>9</td>\n",
 891 |        "      <td>...</td>\n",
 892 |        "      <td>190</td>\n",
 893 |        "      <td>191</td>\n",
 894 |        "      <td>192</td>\n",
 895 |        "      <td>193</td>\n",
 896 |        "      <td>194</td>\n",
 897 |        "      <td>195</td>\n",
 898 |        "      <td>196</td>\n",
 899 |        "      <td>197</td>\n",
 900 |        "      <td>198</td>\n",
 901 |        "      <td>199</td>\n",
 902 |        "    </tr>\n",
 903 |        "    <tr>\n",
 904 |        "      <th>1</th>\n",
 905 |        "      <td>200</td>\n",
 906 |        "      <td>201</td>\n",
 907 |        "      <td>202</td>\n",
 908 |        "      <td>203</td>\n",
 909 |        "      <td>204</td>\n",
 910 |        "      <td>205</td>\n",
 911 |        "      <td>206</td>\n",
 912 |        "      <td>207</td>\n",
 913 |        "      <td>208</td>\n",
 914 |        "      <td>209</td>\n",
 915 |        "      <td>...</td>\n",
 916 |        "      <td>390</td>\n",
 917 |        "      <td>391</td>\n",
 918 |        "      <td>392</td>\n",
 919 |        "      <td>393</td>\n",
 920 |        "      <td>394</td>\n",
 921 |        "      <td>395</td>\n",
 922 |        "      <td>396</td>\n",
 923 |        "      <td>397</td>\n",
 924 |        "      <td>398</td>\n",
 925 |        "      <td>399</td>\n",
 926 |        "    </tr>\n",
 927 |        "    <tr>\n",
 928 |        "      <th>2</th>\n",
 929 |        "      <td>400</td>\n",
 930 |        "      <td>401</td>\n",
 931 |        "      <td>402</td>\n",
 932 |        "      <td>403</td>\n",
 933 |        "      <td>404</td>\n",
 934 |        "      <td>405</td>\n",
 935 |        "      <td>406</td>\n",
 936 |        "      <td>407</td>\n",
 937 |        "      <td>408</td>\n",
 938 |        "      <td>409</td>\n",
 939 |        "      <td>...</td>\n",
 940 |        "      <td>590</td>\n",
 941 |        "      <td>591</td>\n",
 942 |        "      <td>592</td>\n",
 943 |        "      <td>593</td>\n",
 944 |        "      <td>594</td>\n",
 945 |        "      <td>595</td>\n",
 946 |        "      <td>596</td>\n",
 947 |        "      <td>597</td>\n",
 948 |        "      <td>598</td>\n",
 949 |        "      <td>599</td>\n",
 950 |        "    </tr>\n",
 951 |        "    <tr>\n",
 952 |        "      <th>3</th>\n",
 953 |        "      <td>600</td>\n",
 954 |        "      <td>601</td>\n",
 955 |        "      <td>602</td>\n",
 956 |        "      <td>603</td>\n",
 957 |        "      <td>604</td>\n",
 958 |        "      <td>605</td>\n",
 959 |        "      <td>606</td>\n",
 960 |        "      <td>607</td>\n",
 961 |        "      <td>608</td>\n",
 962 |        "      <td>609</td>\n",
 963 |        "      <td>...</td>\n",
 964 |        "      <td>790</td>\n",
 965 |        "      <td>791</td>\n",
 966 |        "      <td>792</td>\n",
 967 |        "      <td>793</td>\n",
 968 |        "      <td>794</td>\n",
 969 |        "      <td>795</td>\n",
 970 |        "      <td>796</td>\n",
 971 |        "      <td>797</td>\n",
 972 |        "      <td>798</td>\n",
 973 |        "      <td>799</td>\n",
 974 |        "    </tr>\n",
 975 |        "    <tr>\n",
 976 |        "      <th>4</th>\n",
 977 |        "      <td>800</td>\n",
 978 |        "      <td>801</td>\n",
 979 |        "      <td>802</td>\n",
 980 |        "      <td>803</td>\n",
 981 |        "      <td>804</td>\n",
 982 |        "      <td>805</td>\n",
 983 |        "      <td>806</td>\n",
 984 |        "      <td>807</td>\n",
 985 |        "      <td>808</td>\n",
 986 |        "      <td>809</td>\n",
 987 |        "      <td>...</td>\n",
 988 |        "      <td>990</td>\n",
 989 |        "      <td>991</td>\n",
 990 |        "      <td>992</td>\n",
 991 |        "      <td>993</td>\n",
 992 |        "      <td>994</td>\n",
 993 |        "      <td>995</td>\n",
 994 |        "      <td>996</td>\n",
 995 |        "      <td>997</td>\n",
 996 |        "      <td>998</td>\n",
 997 |        "      <td>999</td>\n",
 998 |        "    </tr>\n",
 999 |        "  </tbody>\n",
1000 |        "</table>\n",
1001 |        "<p>5 rows × 200 columns</p>\n",
1002 |        "</div>"
1003 |       ],
1004 |       "text/plain": [
1005 |        "   0    1    2    3    4    5    6    7    8    9    ...  190  191  192  193  \\\n",
1006 |        "0    0    1    2    3    4    5    6    7    8    9  ...  190  191  192  193   \n",
1007 |        "1  200  201  202  203  204  205  206  207  208  209  ...  390  391  392  393   \n",
1008 |        "2  400  401  402  403  404  405  406  407  408  409  ...  590  591  592  593   \n",
1009 |        "3  600  601  602  603  604  605  606  607  608  609  ...  790  791  792  793   \n",
1010 |        "4  800  801  802  803  804  805  806  807  808  809  ...  990  991  992  993   \n",
1011 |        "\n",
1012 |        "   194  195  196  197  198  199  \n",
1013 |        "0  194  195  196  197  198  199  \n",
1014 |        "1  394  395  396  397  398  399  \n",
1015 |        "2  594  595  596  597  598  599  \n",
1016 |        "3  794  795  796  797  798  799  \n",
1017 |        "4  994  995  996  997  998  999  \n",
1018 |        "\n",
1019 |        "[5 rows x 200 columns]"
1020 |       ]
1021 |      },
1022 |      "execution_count": 16,
1023 |      "metadata": {},
1024 |      "output_type": "execute_result"
1025 |     }
1026 |    ],
1027 |    "source": [
1028 |     "reshaped_df.head()"
1029 |    ]
1030 |   },
1031 |   {
1032 |    "cell_type": "markdown",
1033 |    "id": "2f3d4e78",
1034 |    "metadata": {},
1035 |    "source": [
1036 |     "We can just transform it into a CuPy array and use the `axis` argument of `sum`."
1037 |    ]
1038 |   },
1039 |   {
1040 |    "cell_type": "code",
1041 |    "execution_count": 17,
1042 |    "id": "2dde030d",
1043 |    "metadata": {},
1044 |    "outputs": [
1045 |     {
1046 |      "data": {
1047 |       "text/plain": [
1048 |        "array([  19900,   59900,   99900,  139900,  179900,  219900,  259900,\n",
1049 |        "        299900,  339900,  379900,  419900,  459900,  499900,  539900,\n",
1050 |        "        579900,  619900,  659900,  699900,  739900,  779900,  819900,\n",
1051 |        "        859900,  899900,  939900,  979900, 1019900, 1059900, 1099900,\n",
1052 |        "       1139900, 1179900, 1219900, 1259900, 1299900, 1339900, 1379900,\n",
1053 |        "       1419900, 1459900, 1499900, 1539900, 1579900, 1619900, 1659900,\n",
1054 |        "       1699900, 1739900, 1779900, 1819900, 1859900, 1899900, 1939900,\n",
1055 |        "       1979900])"
1056 |       ]
1057 |      },
1058 |      "execution_count": 17,
1059 |      "metadata": {},
1060 |      "output_type": "execute_result"
1061 |     }
1062 |    ],
1063 |    "source": [
1064 |     "new_arr = cupy_from_dlpack(reshaped_df.to_dlpack())\n",
1065 |     "new_arr.sum(axis=1)"
1066 |    ]
1067 |   },
1068 |   {
1069 |    "cell_type": "markdown",
1070 |    "id": "4450dcc3",
1071 |    "metadata": {},
1072 |    "source": [
1073 |     "With just that single line, we're able to seamlessly move between data structures in this ecosystem, giving us enormous flexibility without sacrificing speed."
1074 |    ]
1075 |   },
1076 |   {
1077 |    "cell_type": "markdown",
1078 |    "id": "61bfb868",
1079 |    "metadata": {},
1080 |    "source": [
1081 |     "### Converting a cuDF DataFrame to a CuPy Sparse Matrix\n",
1082 |     "\n",
1083 |     "We can also convert a DataFrame or Series to a CuPy sparse matrix. We might want to do this if downstream processes expect CuPy sparse matrices as an input.\n",
1084 |     "\n",
1085 |     "The sparse matrix data structure is defined by three dense arrays. We'll define a small helper function for cleanliness."
1086 |    ]
1087 |   },
1088 |   {
1089 |    "cell_type": "code",
1090 |    "execution_count": 18,
1091 |    "id": "e531fd15",
1092 |    "metadata": {},
1093 |    "outputs": [],
1094 |    "source": [
1095 |     "def cudf_to_cupy_sparse_matrix(data, sparseformat=\"column\"):\n",
1096 |     "    \"\"\"Converts a cuDF object to a CuPy Sparse Column matrix.\"\"\"\n",
1097 |     "    if sparseformat not in (\n",
1098 |     "        \"row\",\n",
1099 |     "        \"column\",\n",
1100 |     "    ):\n",
1101 |     "        raise ValueError(\"Let's focus on column and row formats for now.\")\n",
1102 |     "\n",
1103 |     "    _sparse_constructor = cp.sparse.csc_matrix\n",
1104 |     "    if sparseformat == \"row\":\n",
1105 |     "        _sparse_constructor = cp.sparse.csr_matrix\n",
1106 |     "\n",
1107 |     "    return _sparse_constructor(cupy_from_dlpack(data.to_dlpack()))"
1108 |    ]
1109 |   },
1110 |   {
1111 |    "cell_type": "markdown",
1112 |    "id": "3f5e6ade",
1113 |    "metadata": {},
1114 |    "source": [
1115 |     "We can define a sparsely populated DataFrame to illustrate this conversion to either sparse matrix format."
1116 |    ]
1117 |   },
1118 |   {
1119 |    "cell_type": "code",
1120 |    "execution_count": 19,
1121 |    "id": "58c7e074",
1122 |    "metadata": {},
1123 |    "outputs": [],
1124 |    "source": [
1125 |     "df = cudf.DataFrame()\n",
1126 |     "nelem = 10000\n",
1127 |     "nonzero = 1000\n",
1128 |     "for i in range(20):\n",
1129 |     "    arr = cp.random.normal(5, 5, nelem)\n",
1130 |     "    arr[cp.random.choice(arr.shape[0], nelem - nonzero, replace=False)] = 0\n",
1131 |     "    df[\"a\" + str(i)] = arr"
1132 |    ]
1133 |   },
1134 |   {
1135 |    "cell_type": "code",
1136 |    "execution_count": 20,
1137 |    "id": "9265228d",
1138 |    "metadata": {},
1139 |    "outputs": [
1140 |     {
1141 |      "data": {
1142 |       "text/html": [
1143 |        "<div>\n",
1144 |        "<style scoped>\n",
1145 |        "    .dataframe tbody tr th:only-of-type {\n",
1146 |        "        vertical-align: middle;\n",
1147 |        "    }\n",
1148 |        "\n",
1149 |        "    .dataframe tbody tr th {\n",
1150 |        "        vertical-align: top;\n",
1151 |        "    }\n",
1152 |        "\n",
1153 |        "    .dataframe thead th {\n",
1154 |        "        text-align: right;\n",
1155 |        "    }\n",
1156 |        "</style>\n",
1157 |        "<table border=\"1\" class=\"dataframe\">\n",
1158 |        "  <thead>\n",
1159 |        "    <tr style=\"text-align: right;\">\n",
1160 |        "      <th></th>\n",
1161 |        "      <th>a0</th>\n",
1162 |        "      <th>a1</th>\n",
1163 |        "      <th>a2</th>\n",
1164 |        "      <th>a3</th>\n",
1165 |        "      <th>a4</th>\n",
1166 |        "      <th>a5</th>\n",
1167 |        "      <th>a6</th>\n",
1168 |        "      <th>a7</th>\n",
1169 |        "      <th>a8</th>\n",
1170 |        "      <th>a9</th>\n",
1171 |        "      <th>a10</th>\n",
1172 |        "      <th>a11</th>\n",
1173 |        "      <th>a12</th>\n",
1174 |        "      <th>a13</th>\n",
1175 |        "      <th>a14</th>\n",
1176 |        "      <th>a15</th>\n",
1177 |        "      <th>a16</th>\n",
1178 |        "      <th>a17</th>\n",
1179 |        "      <th>a18</th>\n",
1180 |        "      <th>a19</th>\n",
1181 |        "    </tr>\n",
1182 |        "  </thead>\n",
1183 |        "  <tbody>\n",
1184 |        "    <tr>\n",
1185 |        "      <th>0</th>\n",
1186 |        "      <td>0.0</td>\n",
1187 |        "      <td>0.0</td>\n",
1188 |        "      <td>0.0</td>\n",
1189 |        "      <td>0.0</td>\n",
1190 |        "      <td>0.0</td>\n",
1191 |        "      <td>0.000000</td>\n",
1192 |        "      <td>0.0</td>\n",
1193 |        "      <td>0.0</td>\n",
1194 |        "      <td>0.0</td>\n",
1195 |        "      <td>0.0</td>\n",
1196 |        "      <td>0.0</td>\n",
1197 |        "      <td>0.000000</td>\n",
1198 |        "      <td>0.0</td>\n",
1199 |        "      <td>0.0</td>\n",
1200 |        "      <td>0.0</td>\n",
1201 |        "      <td>0.000000</td>\n",
1202 |        "      <td>0.0</td>\n",
1203 |        "      <td>0.0</td>\n",
1204 |        "      <td>0.0</td>\n",
1205 |        "      <td>0.0</td>\n",
1206 |        "    </tr>\n",
1207 |        "    <tr>\n",
1208 |        "      <th>1</th>\n",
1209 |        "      <td>0.0</td>\n",
1210 |        "      <td>0.0</td>\n",
1211 |        "      <td>0.0</td>\n",
1212 |        "      <td>0.0</td>\n",
1213 |        "      <td>0.0</td>\n",
1214 |        "      <td>0.000000</td>\n",
1215 |        "      <td>0.0</td>\n",
1216 |        "      <td>0.0</td>\n",
1217 |        "      <td>0.0</td>\n",
1218 |        "      <td>0.0</td>\n",
1219 |        "      <td>0.0</td>\n",
1220 |        "      <td>0.000000</td>\n",
1221 |        "      <td>0.0</td>\n",
1222 |        "      <td>0.0</td>\n",
1223 |        "      <td>0.0</td>\n",
1224 |        "      <td>0.000000</td>\n",
1225 |        "      <td>0.0</td>\n",
1226 |        "      <td>0.0</td>\n",
1227 |        "      <td>0.0</td>\n",
1228 |        "      <td>0.0</td>\n",
1229 |        "    </tr>\n",
1230 |        "    <tr>\n",
1231 |        "      <th>2</th>\n",
1232 |        "      <td>0.0</td>\n",
1233 |        "      <td>0.0</td>\n",
1234 |        "      <td>0.0</td>\n",
1235 |        "      <td>0.0</td>\n",
1236 |        "      <td>0.0</td>\n",
1237 |        "      <td>0.000000</td>\n",
1238 |        "      <td>0.0</td>\n",
1239 |        "      <td>0.0</td>\n",
1240 |        "      <td>0.0</td>\n",
1241 |        "      <td>0.0</td>\n",
1242 |        "      <td>0.0</td>\n",
1243 |        "      <td>0.000000</td>\n",
1244 |        "      <td>0.0</td>\n",
1245 |        "      <td>0.0</td>\n",
1246 |        "      <td>0.0</td>\n",
1247 |        "      <td>0.000000</td>\n",
1248 |        "      <td>0.0</td>\n",
1249 |        "      <td>0.0</td>\n",
1250 |        "      <td>0.0</td>\n",
1251 |        "      <td>0.0</td>\n",
1252 |        "    </tr>\n",
1253 |        "    <tr>\n",
1254 |        "      <th>3</th>\n",
1255 |        "      <td>0.0</td>\n",
1256 |        "      <td>0.0</td>\n",
1257 |        "      <td>0.0</td>\n",
1258 |        "      <td>0.0</td>\n",
1259 |        "      <td>0.0</td>\n",
1260 |        "      <td>0.000000</td>\n",
1261 |        "      <td>0.0</td>\n",
1262 |        "      <td>0.0</td>\n",
1263 |        "      <td>0.0</td>\n",
1264 |        "      <td>0.0</td>\n",
1265 |        "      <td>0.0</td>\n",
1266 |        "      <td>10.190342</td>\n",
1267 |        "      <td>0.0</td>\n",
1268 |        "      <td>0.0</td>\n",
1269 |        "      <td>0.0</td>\n",
1270 |        "      <td>3.390956</td>\n",
1271 |        "      <td>0.0</td>\n",
1272 |        "      <td>0.0</td>\n",
1273 |        "      <td>0.0</td>\n",
1274 |        "      <td>0.0</td>\n",
1275 |        "    </tr>\n",
1276 |        "    <tr>\n",
1277 |        "      <th>4</th>\n",
1278 |        "      <td>0.0</td>\n",
1279 |        "      <td>0.0</td>\n",
1280 |        "      <td>0.0</td>\n",
1281 |        "      <td>0.0</td>\n",
1282 |        "      <td>0.0</td>\n",
1283 |        "      <td>1.914044</td>\n",
1284 |        "      <td>0.0</td>\n",
1285 |        "      <td>0.0</td>\n",
1286 |        "      <td>0.0</td>\n",
1287 |        "      <td>0.0</td>\n",
1288 |        "      <td>0.0</td>\n",
1289 |        "      <td>0.000000</td>\n",
1290 |        "      <td>0.0</td>\n",
1291 |        "      <td>0.0</td>\n",
1292 |        "      <td>0.0</td>\n",
1293 |        "      <td>0.000000</td>\n",
1294 |        "      <td>0.0</td>\n",
1295 |        "      <td>0.0</td>\n",
1296 |        "      <td>0.0</td>\n",
1297 |        "      <td>0.0</td>\n",
1298 |        "    </tr>\n",
1299 |        "  </tbody>\n",
1300 |        "</table>\n",
1301 |        "</div>"
1302 |       ],
1303 |       "text/plain": [
1304 |        "    a0   a1   a2   a3   a4        a5   a6   a7   a8   a9  a10        a11  a12  \\\n",
1305 |        "0  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0   0.000000  0.0   \n",
1306 |        "1  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0   0.000000  0.0   \n",
1307 |        "2  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0   0.000000  0.0   \n",
1308 |        "3  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  10.190342  0.0   \n",
1309 |        "4  0.0  0.0  0.0  0.0  0.0  1.914044  0.0  0.0  0.0  0.0  0.0   0.000000  0.0   \n",
1310 |        "\n",
1311 |        "   a13  a14       a15  a16  a17  a18  a19  \n",
1312 |        "0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  \n",
1313 |        "1  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  \n",
1314 |        "2  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  \n",
1315 |        "3  0.0  0.0  3.390956  0.0  0.0  0.0  0.0  \n",
1316 |        "4  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  "
1317 |       ]
1318 |      },
1319 |      "execution_count": 20,
1320 |      "metadata": {},
1321 |      "output_type": "execute_result"
1322 |     }
1323 |    ],
1324 |    "source": [
1325 |     "df.head()"
1326 |    ]
1327 |   },
1328 |   {
1329 |    "cell_type": "code",
1330 |    "execution_count": 21,
1331 |    "id": "5ba1a551",
1332 |    "metadata": {},
1333 |    "outputs": [
1334 |     {
1335 |      "name": "stdout",
1336 |      "output_type": "stream",
1337 |      "text": [
1338 |       "  (897, 0)\t10.472083065532264\n",
1339 |       "  (1665, 0)\t-3.9887061769298446\n",
1340 |       "  (1155, 0)\t7.370990696856217\n",
1341 |       "  (772, 0)\t0.299235212766345\n",
1342 |       "  (1286, 0)\t5.721357813626815\n",
1343 |       "  (775, 0)\t7.428655280999701\n",
1344 |       "  (903, 0)\t11.540080662967277\n",
1345 |       "  (1287, 0)\t-0.011231698453708638\n",
1346 |       "  (265, 0)\t-4.878065816206471\n",
1347 |       "  (1161, 0)\t1.3966861068924148\n",
1348 |       "  (266, 0)\t1.0549982663300899\n",
1349 |       "  (1291, 0)\t0.5910012568901113\n",
1350 |       "  (909, 0)\t5.731776333301544\n",
1351 |       "  (1549, 0)\t3.6972508613199615\n",
1352 |       "  (144, 0)\t-4.970091038596944\n",
1353 |       "  (1424, 0)\t1.3033828282534228\n",
1354 |       "  (1297, 0)\t8.258941912132176\n",
1355 |       "  (914, 0)\t4.616224036044147\n",
1356 |       "  (21, 0)\t6.25336788325076\n",
1357 |       "  (534, 0)\t8.419889621961996\n",
1358 |       "  (918, 0)\t5.691837083015048\n",
1359 |       "  (1046, 0)\t8.104031527567054\n",
1360 |       "  (1686, 0)\t3.000304365412834\n",
1361 |       "  (535, 0)\t3.1746290131636425\n",
1362 |       "  (665, 0)\t0.7317762727252719\n",
1363 |       "  :\t:\n",
1364 |       "  (9200, 19)\t5.538603016004742\n",
1365 |       "  (9713, 19)\t6.404913440477216\n",
1366 |       "  (9202, 19)\t10.133358471330899\n",
1367 |       "  (8567, 19)\t-1.6576792573911858\n",
1368 |       "  (9847, 19)\t10.284007122371538\n",
1369 |       "  (8440, 19)\t8.605460481669013\n",
1370 |       "  (9336, 19)\t7.398549223780951\n",
1371 |       "  (9720, 19)\t4.720142296850481\n",
1372 |       "  (8441, 19)\t7.17687459848627\n",
1373 |       "  (9209, 19)\t1.7813006515085739\n",
1374 |       "  (9337, 19)\t7.4672893771361455\n",
1375 |       "  (8570, 19)\t12.837330165297741\n",
1376 |       "  (9210, 19)\t9.70564905788214\n",
1377 |       "  (9083, 19)\t0.4805280345257057\n",
1378 |       "  (9339, 19)\t4.740715090704008\n",
1379 |       "  (9468, 19)\t2.9916780999709296\n",
1380 |       "  (9596, 19)\t10.314674648882447\n",
1381 |       "  (9724, 19)\t7.854766410475708\n",
1382 |       "  (9852, 19)\t1.5899611807514598\n",
1383 |       "  (9087, 19)\t11.67209323271626\n",
1384 |       "  (9953, 19)\t11.444522857416047\n",
1385 |       "  (9954, 19)\t4.672717133639532\n",
1386 |       "  (9959, 19)\t6.54204170098849\n",
1387 |       "  (9976, 19)\t10.600419849454331\n",
1388 |       "  (9978, 19)\t6.839645924414838\n"
1389 |      ]
1390 |     }
1391 |    ],
1392 |    "source": [
1393 |     "sparse_data = cudf_to_cupy_sparse_matrix(df)\n",
1394 |     "print(sparse_data)"
1395 |    ]
1396 |   },
1397 |   {
1398 |    "cell_type": "markdown",
1399 |    "id": "e8e58cd5",
1400 |    "metadata": {},
1401 |    "source": [
1402 |     "From here, we could continue our workflow with a CuPy sparse matrix.\n",
1403 |     "\n",
1404 |     "For a full list of the functionality built into these libraries, we encourage you to check out the API docs for [cuDF](https://docs.rapids.ai/api/cudf/nightly/) and [CuPy](https://docs-cupy.chainer.org/en/stable/index.html)."
1405 |    ]
1406 |   }
1407 |  ],
1408 |  "metadata": {
1409 |   "kernelspec": {
1410 |    "display_name": "Python 3 (ipykernel)",
1411 |    "language": "python",
1412 |    "name": "python3"
1413 |   },
1414 |   "language_info": {
1415 |    "codemirror_mode": {
1416 |     "name": "ipython",
1417 |     "version": 3
1418 |    },
1419 |    "file_extension": ".py",
1420 |    "mimetype": "text/x-python",
1421 |    "name": "python",
1422 |    "nbconvert_exporter": "python",
1423 |    "pygments_lexer": "ipython3",
1424 |    "version": "3.10.11"
1425 |   }
1426 |  },
1427 |  "nbformat": 4,
1428 |  "nbformat_minor": 5
1429 | }
1430 | 


--------------------------------------------------------------------------------
/code/Introduction_to_Strings.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "# Intro into Strings \n",
   8 |     "\n",
   9 |     "**Authorship**<br />\n",
  10 |     "Original Author: Nicholas Davis<br />"
  11 |    ]
  12 |   },
  13 |   {
  14 |    "cell_type": "markdown",
  15 |    "metadata": {},
  16 |    "source": [
  17 |     "## Working with text data <br />\n",
  18 |     "\n",
  19 |     "Enterprise analytics workflows commonly require processing large-scale text data. To address this need, the RAPIDS CUDA DataFrame library (cuDF) and RAPIDS CUDA Machine Learning library (cuML) now include string processing capabilities. cuDF has a fully-featured string and regular expression processing engine. With a pandas-like API, cuDF string analytics can provide data scientists with up to 90x performance improvement with minimal changes to their code.<br />\n",
  20 |     "\n",
  21 |     "This notebook serves as an intro to string capabilities with cuDF. Each string functionality will have a pandas example and it's cuDF equivalent.<br />\n",
  22 |     "\n",
  23 |     "For any additional information please reference:<br />\n",
  24 |     "[cuDF Documentation](https://docs.rapids.ai/api/cudf/stable/api.html#strings)<br /><br />\n",
  25 |     "[GPU-Accelerated String Processing with RAPIDS Video](https://www.nvidia.com/en-us/on-demand/session/gtcfall20-a21131/)"
  26 |    ]
  27 |   },
  28 |   {
  29 |    "cell_type": "markdown",
  30 |    "metadata": {},
  31 |    "source": [
  32 |     "\n",
  33 |     "Before we begin, let's check out our hardware setup by running the nvidia-smi command."
  34 |    ]
  35 |   },
  36 |   {
  37 |    "cell_type": "code",
  38 |    "execution_count": 1,
  39 |    "metadata": {},
  40 |    "outputs": [
  41 |     {
  42 |      "name": "stdout",
  43 |      "output_type": "stream",
  44 |      "text": [
  45 |       "Mon Mar 11 16:25:31 2024       \n",
  46 |       "+-----------------------------------------------------------------------------+\n",
  47 |       "| NVIDIA-SMI 525.147.05   Driver Version: 525.147.05   CUDA Version: 12.0     |\n",
  48 |       "|-------------------------------+----------------------+----------------------+\n",
  49 |       "| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
  50 |       "| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\n",
  51 |       "|                               |                      |               MIG M. |\n",
  52 |       "|===============================+======================+======================|\n",
  53 |       "|   0  Quadro GV100        Off  | 00000000:15:00.0 Off |                  Off |\n",
  54 |       "| 29%   40C    P2    26W / 250W |  12750MiB / 32768MiB |      0%      Default |\n",
  55 |       "|                               |                      |                  N/A |\n",
  56 |       "+-------------------------------+----------------------+----------------------+\n",
  57 |       "|   1  Quadro GV100        Off  | 00000000:2D:00.0 Off |                  Off |\n",
  58 |       "| 33%   46C    P2    29W / 250W |   3497MiB / 32768MiB |      0%      Default |\n",
  59 |       "|                               |                      |                  N/A |\n",
  60 |       "+-------------------------------+----------------------+----------------------+\n",
  61 |       "                                                                               \n",
  62 |       "+-----------------------------------------------------------------------------+\n",
  63 |       "| Processes:                                                                  |\n",
  64 |       "|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |\n",
  65 |       "|        ID   ID                                                   Usage      |\n",
  66 |       "|=============================================================================|\n",
  67 |       "+-----------------------------------------------------------------------------+\n"
  68 |      ]
  69 |     }
  70 |    ],
  71 |    "source": [
  72 |     "!nvidia-smi"
  73 |    ]
  74 |   },
  75 |   {
  76 |    "cell_type": "markdown",
  77 |    "metadata": {},
  78 |    "source": [
  79 |     "### Text data types"
  80 |    ]
  81 |   },
  82 |   {
  83 |    "cell_type": "markdown",
  84 |    "metadata": {},
  85 |    "source": [
  86 |     "There are two ways to store text data in pandas and cudf:\n",
  87 |     "\n",
  88 |     "1. object -dtype NumPy array.\n",
  89 |     "\n",
  90 |     "1. StringDtype extension type.\n",
  91 |     "\n",
  92 |     "We recommend using StringDtype to store text data.\n",
  93 |     "\n",
  94 |     "Prior to pandas 1.0, object dtype was the only option. This was unfortunate for many reasons:\n",
  95 |     "\n",
  96 |     "1. You can accidentally store a mixture of strings and non-strings in an object dtype array. It’s better to have a dedicated dtype.\n",
  97 |     "\n",
  98 |     "1. object dtype breaks dtype-specific operations like `DataFrame.select_dtypes()`. There isn’t a clear way to select just text while excluding non-text but still object-dtype columns.\n",
  99 |     "\n",
 100 |     "1. When reading code, the contents of an object dtype array is less clear than 'string'."
 101 |    ]
 102 |   },
 103 |   {
 104 |    "cell_type": "markdown",
 105 |    "metadata": {},
 106 |    "source": [
 107 |     "Currently, the performance of object dtype arrays of strings and arrays.StringArray are about the same. We expect future enhancements to significantly increase the performance and lower the memory overhead of StringArray.\n",
 108 |     "\n"
 109 |    ]
 110 |   },
 111 |   {
 112 |    "cell_type": "code",
 113 |    "execution_count": 2,
 114 |    "metadata": {},
 115 |    "outputs": [
 116 |     {
 117 |      "name": "stdout",
 118 |      "output_type": "stream",
 119 |      "text": [
 120 |       "Pandas Version: 1.5.3\n",
 121 |       "CuDF Version: 24.02.02\n"
 122 |      ]
 123 |     }
 124 |    ],
 125 |    "source": [
 126 |     "import pandas as pd; print('Pandas Version:', pd.__version__)\n",
 127 |     "import numpy as np\n",
 128 |     "import cupy as cp\n",
 129 |     "import cudf; print('CuDF Version:', cudf.__version__)\n",
 130 |     "import warnings\n",
 131 |     "warnings.filterwarnings('ignore')\n"
 132 |    ]
 133 |   },
 134 |   {
 135 |    "cell_type": "markdown",
 136 |    "metadata": {},
 137 |    "source": [
 138 |     "\n",
 139 |     "For backwards-compatibility, object dtype remains the default type we infer a list of strings to."
 140 |    ]
 141 |   },
 142 |   {
 143 |    "cell_type": "code",
 144 |    "execution_count": 3,
 145 |    "metadata": {},
 146 |    "outputs": [
 147 |     {
 148 |      "data": {
 149 |       "text/plain": [
 150 |        "0    a\n",
 151 |        "1    b\n",
 152 |        "2    c\n",
 153 |        "dtype: object"
 154 |       ]
 155 |      },
 156 |      "execution_count": 3,
 157 |      "metadata": {},
 158 |      "output_type": "execute_result"
 159 |     }
 160 |    ],
 161 |    "source": [
 162 |     "# Pandas\n",
 163 |     "\n",
 164 |     "pd.Series([\"a\", \"b\", \"c\"])"
 165 |    ]
 166 |   },
 167 |   {
 168 |    "cell_type": "code",
 169 |    "execution_count": 4,
 170 |    "metadata": {},
 171 |    "outputs": [
 172 |     {
 173 |      "data": {
 174 |       "text/plain": [
 175 |        "0    a\n",
 176 |        "1    b\n",
 177 |        "2    c\n",
 178 |        "dtype: object"
 179 |       ]
 180 |      },
 181 |      "execution_count": 4,
 182 |      "metadata": {},
 183 |      "output_type": "execute_result"
 184 |     }
 185 |    ],
 186 |    "source": [
 187 |     "# cuDF\n",
 188 |     "\n",
 189 |     "cudf.Series([\"a\", \"b\", \"c\"])\n"
 190 |    ]
 191 |   },
 192 |   {
 193 |    "cell_type": "markdown",
 194 |    "metadata": {},
 195 |    "source": [
 196 |     "\n",
 197 |     "To explicitly request string dtype, specify the dtype."
 198 |    ]
 199 |   },
 200 |   {
 201 |    "cell_type": "code",
 202 |    "execution_count": 5,
 203 |    "metadata": {},
 204 |    "outputs": [
 205 |     {
 206 |      "data": {
 207 |       "text/plain": [
 208 |        "0    a\n",
 209 |        "1    b\n",
 210 |        "2    c\n",
 211 |        "dtype: string"
 212 |       ]
 213 |      },
 214 |      "execution_count": 5,
 215 |      "metadata": {},
 216 |      "output_type": "execute_result"
 217 |     }
 218 |    ],
 219 |    "source": [
 220 |     "pd.Series([\"a\", \"b\", \"c\"], dtype=\"string\")"
 221 |    ]
 222 |   },
 223 |   {
 224 |    "cell_type": "code",
 225 |    "execution_count": 6,
 226 |    "metadata": {},
 227 |    "outputs": [
 228 |     {
 229 |      "data": {
 230 |       "text/plain": [
 231 |        "0    a\n",
 232 |        "1    b\n",
 233 |        "2    c\n",
 234 |        "dtype: object"
 235 |       ]
 236 |      },
 237 |      "execution_count": 6,
 238 |      "metadata": {},
 239 |      "output_type": "execute_result"
 240 |     }
 241 |    ],
 242 |    "source": [
 243 |     "cudf.Series([\"a\", \"b\", \"c\"], dtype=\"str\")"
 244 |    ]
 245 |   },
 246 |   {
 247 |    "cell_type": "markdown",
 248 |    "metadata": {},
 249 |    "source": [
 250 |     "\n",
 251 |     "Or astype after the Series or DataFrame is created."
 252 |    ]
 253 |   },
 254 |   {
 255 |    "cell_type": "code",
 256 |    "execution_count": 7,
 257 |    "metadata": {},
 258 |    "outputs": [
 259 |     {
 260 |      "name": "stdout",
 261 |      "output_type": "stream",
 262 |      "text": [
 263 |       "Original: \n",
 264 |       "0    a\n",
 265 |       "1    b\n",
 266 |       "2    c\n",
 267 |       "dtype: string\n",
 268 |       "\n",
 269 |       "# of 'n': \n",
 270 |       "0    0\n",
 271 |       "1    0\n",
 272 |       "2    0\n",
 273 |       "dtype: int64\n"
 274 |      ]
 275 |     }
 276 |    ],
 277 |    "source": [
 278 |     "pandasSeries = pd.Series([\"a\", \"b\", \"c\"])\n",
 279 |     "print('Original: ')\n",
 280 |     "print(pandasSeries.astype(\"string\"))\n",
 281 |     "\n",
 282 |     "print(\"\\n# of 'n': \")\n",
 283 |     "print(pandasSeries.str.count('n'))"
 284 |    ]
 285 |   },
 286 |   {
 287 |    "cell_type": "code",
 288 |    "execution_count": 8,
 289 |    "metadata": {},
 290 |    "outputs": [
 291 |     {
 292 |      "name": "stdout",
 293 |      "output_type": "stream",
 294 |      "text": [
 295 |       "Original: \n",
 296 |       "0    a\n",
 297 |       "1    b\n",
 298 |       "2    c\n",
 299 |       "dtype: object\n",
 300 |       "\n",
 301 |       "# of 'n': \n",
 302 |       "0    0\n",
 303 |       "1    0\n",
 304 |       "2    0\n",
 305 |       "dtype: int32\n"
 306 |      ]
 307 |     }
 308 |    ],
 309 |    "source": [
 310 |     "cudfSeries = cudf.Series([\"a\", \"b\", \"c\"])\n",
 311 |     "print('Original: ')\n",
 312 |     "print(cudfSeries.astype(\"string\"))\n",
 313 |     "\n",
 314 |     "print(\"\\n# of 'n': \")\n",
 315 |     "print(cudfSeries.str.count('n'))"
 316 |    ]
 317 |   },
 318 |   {
 319 |    "cell_type": "markdown",
 320 |    "metadata": {},
 321 |    "source": [
 322 |     "\n",
 323 |     "You can also use StringDtype/\"string\" as the dtype on non-string data and it will be converted to string dtype:"
 324 |    ]
 325 |   },
 326 |   {
 327 |    "cell_type": "code",
 328 |    "execution_count": 9,
 329 |    "metadata": {},
 330 |    "outputs": [
 331 |     {
 332 |      "name": "stdout",
 333 |      "output_type": "stream",
 334 |      "text": [
 335 |       "0       a\n",
 336 |       "1       2\n",
 337 |       "2    <NA>\n",
 338 |       "dtype: string\n"
 339 |      ]
 340 |     },
 341 |     {
 342 |      "data": {
 343 |       "text/plain": [
 344 |        "str"
 345 |       ]
 346 |      },
 347 |      "execution_count": 9,
 348 |      "metadata": {},
 349 |      "output_type": "execute_result"
 350 |     }
 351 |    ],
 352 |    "source": [
 353 |     "pandasSeries = pd.Series([\"a\", 2, np.nan], dtype=\"string\")\n",
 354 |     "print(pandasSeries)\n",
 355 |     "type(pandasSeries[1])"
 356 |    ]
 357 |   },
 358 |   {
 359 |    "cell_type": "code",
 360 |    "execution_count": 10,
 361 |    "metadata": {},
 362 |    "outputs": [
 363 |     {
 364 |      "name": "stdout",
 365 |      "output_type": "stream",
 366 |      "text": [
 367 |       "0       a\n",
 368 |       "1       2\n",
 369 |       "2    <NA>\n",
 370 |       "dtype: object\n"
 371 |      ]
 372 |     },
 373 |     {
 374 |      "data": {
 375 |       "text/plain": [
 376 |        "str"
 377 |       ]
 378 |      },
 379 |      "execution_count": 10,
 380 |      "metadata": {},
 381 |      "output_type": "execute_result"
 382 |     }
 383 |    ],
 384 |    "source": [
 385 |     "cudfSeries = cudf.Series([\"a\", 2, np.nan], dtype=\"str\")\n",
 386 |     "print(cudfSeries)\n",
 387 |     "type(cudfSeries[1])"
 388 |    ]
 389 |   },
 390 |   {
 391 |    "cell_type": "markdown",
 392 |    "metadata": {},
 393 |    "source": [
 394 |     "\n",
 395 |     "or convert from existing pandas data:"
 396 |    ]
 397 |   },
 398 |   {
 399 |    "cell_type": "code",
 400 |    "execution_count": 11,
 401 |    "metadata": {},
 402 |    "outputs": [
 403 |     {
 404 |      "name": "stdout",
 405 |      "output_type": "stream",
 406 |      "text": [
 407 |       "0       1\n",
 408 |       "1       2\n",
 409 |       "2    <NA>\n",
 410 |       "dtype: string\n"
 411 |      ]
 412 |     },
 413 |     {
 414 |      "data": {
 415 |       "text/plain": [
 416 |        "str"
 417 |       ]
 418 |      },
 419 |      "execution_count": 11,
 420 |      "metadata": {},
 421 |      "output_type": "execute_result"
 422 |     }
 423 |    ],
 424 |    "source": [
 425 |     "pandasSeries = pd.Series([1, 2, np.nan], dtype=\"Int64\")\n",
 426 |     "\n",
 427 |     "pandasSeries2 = pandasSeries.astype(\"string\")\n",
 428 |     "print(pandasSeries2)\n",
 429 |     "type(pandasSeries2[0])"
 430 |    ]
 431 |   },
 432 |   {
 433 |    "cell_type": "code",
 434 |    "execution_count": 12,
 435 |    "metadata": {},
 436 |    "outputs": [
 437 |     {
 438 |      "name": "stdout",
 439 |      "output_type": "stream",
 440 |      "text": [
 441 |       "0       1\n",
 442 |       "1       2\n",
 443 |       "2    <NA>\n",
 444 |       "dtype: object\n"
 445 |      ]
 446 |     },
 447 |     {
 448 |      "data": {
 449 |       "text/plain": [
 450 |        "str"
 451 |       ]
 452 |      },
 453 |      "execution_count": 12,
 454 |      "metadata": {},
 455 |      "output_type": "execute_result"
 456 |     }
 457 |    ],
 458 |    "source": [
 459 |     "cudfSeries1 = cudf.Series([1, 2, np.nan], dtype=\"int64\")\n",
 460 |     "\n",
 461 |     "cudfSeries2 = cudfSeries1.astype(\"string\")\n",
 462 |     "print(cudfSeries2)\n",
 463 |     "type(cudfSeries2[0])"
 464 |    ]
 465 |   },
 466 |   {
 467 |    "cell_type": "markdown",
 468 |    "metadata": {},
 469 |    "source": [
 470 |     "\n",
 471 |     "## Behavior differences\n",
 472 |     "\n",
 473 |     "These are places where the behavior of StringDtype objects differ from object dtype."
 474 |    ]
 475 |   },
 476 |   {
 477 |    "cell_type": "markdown",
 478 |    "metadata": {},
 479 |    "source": [
 480 |     "For `StringDtype`, string accessor methods that return numeric output will always return a nullable integer dtype, rather than either int or float dtype, depending on the presence of `NA` values. Methods returning boolean output will return a nullable boolean dtype."
 481 |    ]
 482 |   },
 483 |   {
 484 |    "cell_type": "code",
 485 |    "execution_count": 13,
 486 |    "metadata": {},
 487 |    "outputs": [
 488 |     {
 489 |      "name": "stdout",
 490 |      "output_type": "stream",
 491 |      "text": [
 492 |       "Original: \n",
 493 |       "0       a\n",
 494 |       "1    <NA>\n",
 495 |       "2       b\n",
 496 |       "dtype: string\n",
 497 |       "# of 'a': \n",
 498 |       "0       1\n",
 499 |       "1    <NA>\n",
 500 |       "2       0\n",
 501 |       "dtype: Int64\n",
 502 |       "\n",
 503 |       "# of 'a' after dropping n/a: \n",
 504 |       "0    1\n",
 505 |       "2    0\n",
 506 |       "dtype: Int64\n",
 507 |       "\n",
 508 |       "Check if numeric: \n",
 509 |       "0    False\n",
 510 |       "1     <NA>\n",
 511 |       "2    False\n",
 512 |       "dtype: boolean\n"
 513 |      ]
 514 |     }
 515 |    ],
 516 |    "source": [
 517 |     "pandasSeries = pd.Series([\"a\", None, \"b\"], dtype=\"string\")\n",
 518 |     "print('Original: ')\n",
 519 |     "print(pandasSeries)\n",
 520 |     "print(\"# of 'a': \")\n",
 521 |     "print(pandasSeries.str.count(\"a\"))\n",
 522 |     "print(\"\\n# of 'a' after dropping n/a: \")\n",
 523 |     "print(pandasSeries.dropna().str.count(\"a\"))\n",
 524 |     "print(\"\\nCheck if numeric: \")\n",
 525 |     "print(pandasSeries.str.isnumeric())\n"
 526 |    ]
 527 |   },
 528 |   {
 529 |    "cell_type": "code",
 530 |    "execution_count": 14,
 531 |    "metadata": {},
 532 |    "outputs": [
 533 |     {
 534 |      "name": "stdout",
 535 |      "output_type": "stream",
 536 |      "text": [
 537 |       "Original: \n",
 538 |       "0       a\n",
 539 |       "1    <NA>\n",
 540 |       "2       b\n",
 541 |       "dtype: object\n",
 542 |       "# of 'a': \n",
 543 |       "0       1\n",
 544 |       "1    <NA>\n",
 545 |       "2       0\n",
 546 |       "dtype: int32\n",
 547 |       "\n",
 548 |       "# of 'a' after dropping n/a: \n",
 549 |       "0    1\n",
 550 |       "2    0\n",
 551 |       "dtype: int32\n",
 552 |       "\n",
 553 |       "Check if numeric: \n",
 554 |       "0    False\n",
 555 |       "1     <NA>\n",
 556 |       "2    False\n",
 557 |       "dtype: bool\n"
 558 |      ]
 559 |     }
 560 |    ],
 561 |    "source": [
 562 |     "cudfSeries = cudf.Series([\"a\", None, \"b\"], dtype=\"str\")\n",
 563 |     "print('Original: ')\n",
 564 |     "print(cudfSeries)\n",
 565 |     "print(\"# of 'a': \")\n",
 566 |     "print(cudfSeries.str.count(\"a\"))\n",
 567 |     "print(\"\\n# of 'a' after dropping n/a: \")\n",
 568 |     "print(cudfSeries.dropna().str.count(\"a\"))\n",
 569 |     "print(\"\\nCheck if numeric: \")\n",
 570 |     "print(cudfSeries.str.isnumeric())"
 571 |    ]
 572 |   },
 573 |   {
 574 |    "cell_type": "markdown",
 575 |    "metadata": {},
 576 |    "source": [
 577 |     "\n",
 578 |     "Both outputs are Int64 dtype. Compare that with object-dtype."
 579 |    ]
 580 |   },
 581 |   {
 582 |    "cell_type": "code",
 583 |    "execution_count": 15,
 584 |    "metadata": {},
 585 |    "outputs": [
 586 |     {
 587 |      "name": "stdout",
 588 |      "output_type": "stream",
 589 |      "text": [
 590 |       "# of 'a': \n",
 591 |       "0    1.0\n",
 592 |       "1    NaN\n",
 593 |       "2    0.0\n",
 594 |       "dtype: float64\n",
 595 |       "\n",
 596 |       "# of 'a' after dropping n/a: \n"
 597 |      ]
 598 |     },
 599 |     {
 600 |      "data": {
 601 |       "text/plain": [
 602 |        "0    1\n",
 603 |        "2    0\n",
 604 |        "dtype: int64"
 605 |       ]
 606 |      },
 607 |      "execution_count": 15,
 608 |      "metadata": {},
 609 |      "output_type": "execute_result"
 610 |     }
 611 |    ],
 612 |    "source": [
 613 |     "pandasSeries2 = pd.Series([\"a\", None, \"b\"], dtype=\"object\")\n",
 614 |     "print(\"# of 'a': \")\n",
 615 |     "print(pandasSeries2.str.count(\"a\"))\n",
 616 |     "print(\"\\n# of 'a' after dropping n/a: \")\n",
 617 |     "pandasSeries2.dropna().str.count(\"a\")"
 618 |    ]
 619 |   },
 620 |   {
 621 |    "cell_type": "code",
 622 |    "execution_count": 16,
 623 |    "metadata": {},
 624 |    "outputs": [
 625 |     {
 626 |      "name": "stdout",
 627 |      "output_type": "stream",
 628 |      "text": [
 629 |       "# of 'a': \n",
 630 |       "0       1\n",
 631 |       "1    <NA>\n",
 632 |       "2       0\n",
 633 |       "dtype: int32\n",
 634 |       "\n",
 635 |       "# of 'a' after dropping n/a: \n"
 636 |      ]
 637 |     },
 638 |     {
 639 |      "data": {
 640 |       "text/plain": [
 641 |        "0    1\n",
 642 |        "2    0\n",
 643 |        "dtype: int32"
 644 |       ]
 645 |      },
 646 |      "execution_count": 16,
 647 |      "metadata": {},
 648 |      "output_type": "execute_result"
 649 |     }
 650 |    ],
 651 |    "source": [
 652 |     "cudfSeries2 = cudf.Series([\"a\", None, \"b\"], dtype=\"object\")\n",
 653 |     "print(\"# of 'a': \")\n",
 654 |     "print(cudfSeries2.str.count(\"a\"))\n",
 655 |     "print(\"\\n# of 'a' after dropping n/a: \")\n",
 656 |     "cudfSeries2.dropna().str.count(\"a\")"
 657 |    ]
 658 |   },
 659 |   {
 660 |    "cell_type": "markdown",
 661 |    "metadata": {},
 662 |    "source": [
 663 |     "\n",
 664 |     "When NA values are present, the output dtype is float64. Similarly for methods returning boolean values."
 665 |    ]
 666 |   },
 667 |   {
 668 |    "cell_type": "code",
 669 |    "execution_count": 17,
 670 |    "metadata": {},
 671 |    "outputs": [
 672 |     {
 673 |      "name": "stdout",
 674 |      "output_type": "stream",
 675 |      "text": [
 676 |       "Check if digit: \n",
 677 |       "0    False\n",
 678 |       "1     <NA>\n",
 679 |       "2    False\n",
 680 |       "dtype: boolean\n",
 681 |       "\n",
 682 |       "Match against 'a': \n"
 683 |      ]
 684 |     },
 685 |     {
 686 |      "data": {
 687 |       "text/plain": [
 688 |        "0     True\n",
 689 |        "1     <NA>\n",
 690 |        "2    False\n",
 691 |        "dtype: boolean"
 692 |       ]
 693 |      },
 694 |      "execution_count": 17,
 695 |      "metadata": {},
 696 |      "output_type": "execute_result"
 697 |     }
 698 |    ],
 699 |    "source": [
 700 |     "print(\"Check if digit: \")\n",
 701 |     "print(pandasSeries.str.isdigit())\n",
 702 |     "print(\"\\nMatch against 'a': \")\n",
 703 |     "pandasSeries.str.match(\"a\")"
 704 |    ]
 705 |   },
 706 |   {
 707 |    "cell_type": "code",
 708 |    "execution_count": 18,
 709 |    "metadata": {},
 710 |    "outputs": [
 711 |     {
 712 |      "name": "stdout",
 713 |      "output_type": "stream",
 714 |      "text": [
 715 |       "Check if digit: \n",
 716 |       "0    False\n",
 717 |       "1     <NA>\n",
 718 |       "2    False\n",
 719 |       "dtype: bool\n",
 720 |       "\n",
 721 |       "Match against 'a': \n"
 722 |      ]
 723 |     },
 724 |     {
 725 |      "data": {
 726 |       "text/plain": [
 727 |        "0     True\n",
 728 |        "1     <NA>\n",
 729 |        "2    False\n",
 730 |        "dtype: bool"
 731 |       ]
 732 |      },
 733 |      "execution_count": 18,
 734 |      "metadata": {},
 735 |      "output_type": "execute_result"
 736 |     }
 737 |    ],
 738 |    "source": [
 739 |     "print(\"Check if digit: \")\n",
 740 |     "print(cudfSeries.str.isdigit())\n",
 741 |     "print(\"\\nMatch against 'a': \")\n",
 742 |     "cudfSeries.str.match(\"a\")"
 743 |    ]
 744 |   },
 745 |   {
 746 |    "cell_type": "markdown",
 747 |    "metadata": {},
 748 |    "source": [
 749 |     "<br />\n",
 750 |     "\n",
 751 |     "Some string methods, like `Series.str.decode()` are not available on StringArray because StringArray only holds strings, not bytes."
 752 |    ]
 753 |   },
 754 |   {
 755 |    "cell_type": "markdown",
 756 |    "metadata": {},
 757 |    "source": [
 758 |     "In comparison operations, `arrays.StringArray` and Series backed by a `StringArray` will return an object with `BooleanDtype`, rather than a bool dtype object. Missing values in a `StringArray` will propagate in comparison operations, rather than always comparing unequal like `numpy.nan`."
 759 |    ]
 760 |   },
 761 |   {
 762 |    "cell_type": "markdown",
 763 |    "metadata": {},
 764 |    "source": [
 765 |     "Everything else that follows in the rest of this document applies equally to string and object dtype."
 766 |    ]
 767 |   },
 768 |   {
 769 |    "cell_type": "markdown",
 770 |    "metadata": {},
 771 |    "source": [
 772 |     "\n",
 773 |     "## String methods"
 774 |    ]
 775 |   },
 776 |   {
 777 |    "cell_type": "markdown",
 778 |    "metadata": {},
 779 |    "source": [
 780 |     "Series and Index are equipped with a set of string processing methods that make it easy to operate on each element of the array. Perhaps most importantly, these methods exclude missing/NA values automatically. These are accessed via the str attribute and generally have names matching the equivalent (scalar) built-in string methods:"
 781 |    ]
 782 |   },
 783 |   {
 784 |    "cell_type": "code",
 785 |    "execution_count": 19,
 786 |    "metadata": {},
 787 |    "outputs": [
 788 |     {
 789 |      "name": "stdout",
 790 |      "output_type": "stream",
 791 |      "text": [
 792 |       "Original: \n",
 793 |       "0       A\n",
 794 |       "1       B\n",
 795 |       "2       C\n",
 796 |       "3    Aaba\n",
 797 |       "4    Baca\n",
 798 |       "5    <NA>\n",
 799 |       "6    CABA\n",
 800 |       "7     dog\n",
 801 |       "8     cat\n",
 802 |       "dtype: string\n",
 803 |       "\n",
 804 |       "Lowered: \n",
 805 |       "0       a\n",
 806 |       "1       b\n",
 807 |       "2       c\n",
 808 |       "3    aaba\n",
 809 |       "4    baca\n",
 810 |       "5    <NA>\n",
 811 |       "6    caba\n",
 812 |       "7     dog\n",
 813 |       "8     cat\n",
 814 |       "dtype: string\n",
 815 |       "\n",
 816 |       "Check if Lowered: \n",
 817 |       "0    False\n",
 818 |       "1    False\n",
 819 |       "2    False\n",
 820 |       "3    False\n",
 821 |       "4    False\n",
 822 |       "5     <NA>\n",
 823 |       "6    False\n",
 824 |       "7     True\n",
 825 |       "8     True\n",
 826 |       "dtype: boolean\n",
 827 |       "\n",
 828 |       "Uppercase: \n",
 829 |       "0       A\n",
 830 |       "1       B\n",
 831 |       "2       C\n",
 832 |       "3    AABA\n",
 833 |       "4    BACA\n",
 834 |       "5    <NA>\n",
 835 |       "6    CABA\n",
 836 |       "7     DOG\n",
 837 |       "8     CAT\n",
 838 |       "dtype: string\n",
 839 |       "\n",
 840 |       "Check if Uppercase: \n",
 841 |       "0     True\n",
 842 |       "1     True\n",
 843 |       "2     True\n",
 844 |       "3    False\n",
 845 |       "4    False\n",
 846 |       "5     <NA>\n",
 847 |       "6     True\n",
 848 |       "7    False\n",
 849 |       "8    False\n",
 850 |       "dtype: boolean\n",
 851 |       "\n",
 852 |       "Determine Length: \n"
 853 |      ]
 854 |     },
 855 |     {
 856 |      "data": {
 857 |       "text/plain": [
 858 |        "0       1\n",
 859 |        "1       1\n",
 860 |        "2       1\n",
 861 |        "3       4\n",
 862 |        "4       4\n",
 863 |        "5    <NA>\n",
 864 |        "6       4\n",
 865 |        "7       3\n",
 866 |        "8       3\n",
 867 |        "dtype: Int64"
 868 |       ]
 869 |      },
 870 |      "execution_count": 19,
 871 |      "metadata": {},
 872 |      "output_type": "execute_result"
 873 |     }
 874 |    ],
 875 |    "source": [
 876 |     "pandasSeries = pd.Series(\n",
 877 |     "  ....:     [\"A\", \"B\", \"C\", \"Aaba\", \"Baca\", np.nan, \"CABA\", \"dog\", \"cat\"], dtype=\"string\"\n",
 878 |     "  ....: )\n",
 879 |     "  ....: \n",
 880 |     "print('Original: ')\n",
 881 |     "print(pandasSeries)\n",
 882 |     "print('\\nLowered: ')\n",
 883 |     "print(pandasSeries.str.lower())\n",
 884 |     "print('\\nCheck if Lowered: ')\n",
 885 |     "print(pandasSeries.str.islower())\n",
 886 |     "print('\\nUppercase: ')\n",
 887 |     "print(pandasSeries.str.upper())\n",
 888 |     "print('\\nCheck if Uppercase: ')\n",
 889 |     "print(pandasSeries.str.isupper())\n",
 890 |     "print('\\nDetermine Length: ')\n",
 891 |     "pandasSeries.str.len()\n",
 892 |     "\n"
 893 |    ]
 894 |   },
 895 |   {
 896 |    "cell_type": "code",
 897 |    "execution_count": 20,
 898 |    "metadata": {},
 899 |    "outputs": [
 900 |     {
 901 |      "name": "stdout",
 902 |      "output_type": "stream",
 903 |      "text": [
 904 |       "Original: \n",
 905 |       "0       A\n",
 906 |       "1       B\n",
 907 |       "2       C\n",
 908 |       "3    Aaba\n",
 909 |       "4    Baca\n",
 910 |       "5    <NA>\n",
 911 |       "6    CABA\n",
 912 |       "7     dog\n",
 913 |       "8     cat\n",
 914 |       "dtype: object\n",
 915 |       "\n",
 916 |       "Lowered: \n",
 917 |       "0       a\n",
 918 |       "1       b\n",
 919 |       "2       c\n",
 920 |       "3    aaba\n",
 921 |       "4    baca\n",
 922 |       "5    <NA>\n",
 923 |       "6    caba\n",
 924 |       "7     dog\n",
 925 |       "8     cat\n",
 926 |       "dtype: object\n",
 927 |       "\n",
 928 |       "Check if Lowered: \n",
 929 |       "0    False\n",
 930 |       "1    False\n",
 931 |       "2    False\n",
 932 |       "3    False\n",
 933 |       "4    False\n",
 934 |       "5     <NA>\n",
 935 |       "6    False\n",
 936 |       "7     True\n",
 937 |       "8     True\n",
 938 |       "dtype: bool\n",
 939 |       "\n",
 940 |       "Uppercase: \n",
 941 |       "0       A\n",
 942 |       "1       B\n",
 943 |       "2       C\n",
 944 |       "3    AABA\n",
 945 |       "4    BACA\n",
 946 |       "5    <NA>\n",
 947 |       "6    CABA\n",
 948 |       "7     DOG\n",
 949 |       "8     CAT\n",
 950 |       "dtype: object\n",
 951 |       "\n",
 952 |       "Check if Uppercase: \n",
 953 |       "0     True\n",
 954 |       "1     True\n",
 955 |       "2     True\n",
 956 |       "3    False\n",
 957 |       "4    False\n",
 958 |       "5     <NA>\n",
 959 |       "6     True\n",
 960 |       "7    False\n",
 961 |       "8    False\n",
 962 |       "dtype: bool\n",
 963 |       "\n",
 964 |       "Determine Length: \n"
 965 |      ]
 966 |     },
 967 |     {
 968 |      "data": {
 969 |       "text/plain": [
 970 |        "0       1\n",
 971 |        "1       1\n",
 972 |        "2       1\n",
 973 |        "3       4\n",
 974 |        "4       4\n",
 975 |        "5    <NA>\n",
 976 |        "6       4\n",
 977 |        "7       3\n",
 978 |        "8       3\n",
 979 |        "dtype: int32"
 980 |       ]
 981 |      },
 982 |      "execution_count": 20,
 983 |      "metadata": {},
 984 |      "output_type": "execute_result"
 985 |     }
 986 |    ],
 987 |    "source": [
 988 |     "cudfSeries = cudf.Series(\n",
 989 |     "  ....:     [\"A\", \"B\", \"C\", \"Aaba\", \"Baca\", np.nan, \"CABA\", \"dog\", \"cat\"], dtype=\"str\"\n",
 990 |     "  ....: )\n",
 991 |     "  ....: \n",
 992 |     "\n",
 993 |     "print('Original: ')\n",
 994 |     "print(cudfSeries)\n",
 995 |     "print('\\nLowered: ')\n",
 996 |     "print(cudfSeries.str.lower())\n",
 997 |     "print('\\nCheck if Lowered: ')\n",
 998 |     "print(cudfSeries.str.islower())\n",
 999 |     "print('\\nUppercase: ')\n",
1000 |     "print(cudfSeries.str.upper())\n",
1001 |     "print('\\nCheck if Uppercase: ')\n",
1002 |     "print(cudfSeries.str.isupper())\n",
1003 |     "print('\\nDetermine Length: ')\n",
1004 |     "cudfSeries.str.len()\n"
1005 |    ]
1006 |   },
1007 |   {
1008 |    "cell_type": "code",
1009 |    "execution_count": 21,
1010 |    "metadata": {},
1011 |    "outputs": [
1012 |     {
1013 |      "name": "stdout",
1014 |      "output_type": "stream",
1015 |      "text": [
1016 |       "Right Strip: \n",
1017 |       "Index([' jack', 'jill', ' jesse', 'frank'], dtype='object')\n",
1018 |       "\n",
1019 |       "Left Strip: \n"
1020 |      ]
1021 |     },
1022 |     {
1023 |      "data": {
1024 |       "text/plain": [
1025 |        "Index(['jack', 'jill ', 'jesse ', 'frank'], dtype='object')"
1026 |       ]
1027 |      },
1028 |      "execution_count": 21,
1029 |      "metadata": {},
1030 |      "output_type": "execute_result"
1031 |     }
1032 |    ],
1033 |    "source": [
1034 |     "pandasIdx = pd.Index([\" jack\", \"jill \", \" jesse \", \"frank\"])\n",
1035 |     "\n",
1036 |     "pandasIdx.str.strip()\n",
1037 |     "\n",
1038 |     "print('Right Strip: ')\n",
1039 |     "print(pandasIdx.str.rstrip())\n",
1040 |     "\n",
1041 |     "print('\\nLeft Strip: ')\n",
1042 |     "pandasIdx.str.lstrip()\n"
1043 |    ]
1044 |   },
1045 |   {
1046 |    "cell_type": "code",
1047 |    "execution_count": 22,
1048 |    "metadata": {},
1049 |    "outputs": [
1050 |     {
1051 |      "name": "stdout",
1052 |      "output_type": "stream",
1053 |      "text": [
1054 |       "Right Strip: \n",
1055 |       "StringIndex([' jack' 'jill' ' jesse' 'frank'], dtype='object')\n",
1056 |       "\n",
1057 |       "Left Strip: \n"
1058 |      ]
1059 |     },
1060 |     {
1061 |      "data": {
1062 |       "text/plain": [
1063 |        "StringIndex(['jack' 'jill ' 'jesse ' 'frank'], dtype='object')"
1064 |       ]
1065 |      },
1066 |      "execution_count": 22,
1067 |      "metadata": {},
1068 |      "output_type": "execute_result"
1069 |     }
1070 |    ],
1071 |    "source": [
1072 |     "cudfIdx = cudf.Index([\" jack\", \"jill \", \" jesse \", \"frank\"])\n",
1073 |     "\n",
1074 |     "cudfIdx.str.strip()\n",
1075 |     "\n",
1076 |     "print('Right Strip: ')\n",
1077 |     "print(cudfIdx.str.rstrip())\n",
1078 |     "\n",
1079 |     "print('\\nLeft Strip: ')\n",
1080 |     "cudfIdx.str.lstrip()\n"
1081 |    ]
1082 |   },
1083 |   {
1084 |    "cell_type": "markdown",
1085 |    "metadata": {},
1086 |    "source": [
1087 |     "\n",
1088 |     "The string methods on Index are especially useful for cleaning up or transforming DataFrame columns. For instance, you may have columns with leading or trailing whitespace:"
1089 |    ]
1090 |   },
1091 |   {
1092 |    "cell_type": "code",
1093 |    "execution_count": 23,
1094 |    "metadata": {},
1095 |    "outputs": [
1096 |     {
1097 |      "data": {
1098 |       "text/html": [
1099 |        "<div>\n",
1100 |        "<style scoped>\n",
1101 |        "    .dataframe tbody tr th:only-of-type {\n",
1102 |        "        vertical-align: middle;\n",
1103 |        "    }\n",
1104 |        "\n",
1105 |        "    .dataframe tbody tr th {\n",
1106 |        "        vertical-align: top;\n",
1107 |        "    }\n",
1108 |        "\n",
1109 |        "    .dataframe thead th {\n",
1110 |        "        text-align: right;\n",
1111 |        "    }\n",
1112 |        "</style>\n",
1113 |        "<table border=\"1\" class=\"dataframe\">\n",
1114 |        "  <thead>\n",
1115 |        "    <tr style=\"text-align: right;\">\n",
1116 |        "      <th></th>\n",
1117 |        "      <th>Column A</th>\n",
1118 |        "      <th>Column B</th>\n",
1119 |        "    </tr>\n",
1120 |        "  </thead>\n",
1121 |        "  <tbody>\n",
1122 |        "    <tr>\n",
1123 |        "      <th>0</th>\n",
1124 |        "      <td>1.766351</td>\n",
1125 |        "      <td>-0.006574</td>\n",
1126 |        "    </tr>\n",
1127 |        "    <tr>\n",
1128 |        "      <th>1</th>\n",
1129 |        "      <td>-0.034232</td>\n",
1130 |        "      <td>-1.638306</td>\n",
1131 |        "    </tr>\n",
1132 |        "    <tr>\n",
1133 |        "      <th>2</th>\n",
1134 |        "      <td>-0.836389</td>\n",
1135 |        "      <td>-1.506215</td>\n",
1136 |        "    </tr>\n",
1137 |        "  </tbody>\n",
1138 |        "</table>\n",
1139 |        "</div>"
1140 |       ],
1141 |       "text/plain": [
1142 |        "    Column A    Column B \n",
1143 |        "0    1.766351   -0.006574\n",
1144 |        "1   -0.034232   -1.638306\n",
1145 |        "2   -0.836389   -1.506215"
1146 |       ]
1147 |      },
1148 |      "execution_count": 23,
1149 |      "metadata": {},
1150 |      "output_type": "execute_result"
1151 |     }
1152 |    ],
1153 |    "source": [
1154 |     "pandasDataFrame = pd.DataFrame(np.random.randn(3, 2), columns=[\" Column A \", \" Column B \"], index=range(3))\n",
1155 |     "   \n",
1156 |     "pandasDataFrame"
1157 |    ]
1158 |   },
1159 |   {
1160 |    "cell_type": "code",
1161 |    "execution_count": 24,
1162 |    "metadata": {},
1163 |    "outputs": [
1164 |     {
1165 |      "data": {
1166 |       "text/html": [
1167 |        "<div>\n",
1168 |        "<style scoped>\n",
1169 |        "    .dataframe tbody tr th:only-of-type {\n",
1170 |        "        vertical-align: middle;\n",
1171 |        "    }\n",
1172 |        "\n",
1173 |        "    .dataframe tbody tr th {\n",
1174 |        "        vertical-align: top;\n",
1175 |        "    }\n",
1176 |        "\n",
1177 |        "    .dataframe thead th {\n",
1178 |        "        text-align: right;\n",
1179 |        "    }\n",
1180 |        "</style>\n",
1181 |        "<table border=\"1\" class=\"dataframe\">\n",
1182 |        "  <thead>\n",
1183 |        "    <tr style=\"text-align: right;\">\n",
1184 |        "      <th></th>\n",
1185 |        "      <th>Column A</th>\n",
1186 |        "      <th>Column B</th>\n",
1187 |        "    </tr>\n",
1188 |        "  </thead>\n",
1189 |        "  <tbody>\n",
1190 |        "    <tr>\n",
1191 |        "      <th>0</th>\n",
1192 |        "      <td>0.234977</td>\n",
1193 |        "      <td>-0.617927</td>\n",
1194 |        "    </tr>\n",
1195 |        "    <tr>\n",
1196 |        "      <th>1</th>\n",
1197 |        "      <td>-1.824023</td>\n",
1198 |        "      <td>0.061936</td>\n",
1199 |        "    </tr>\n",
1200 |        "    <tr>\n",
1201 |        "      <th>2</th>\n",
1202 |        "      <td>-0.066182</td>\n",
1203 |        "      <td>0.006777</td>\n",
1204 |        "    </tr>\n",
1205 |        "  </tbody>\n",
1206 |        "</table>\n",
1207 |        "</div>"
1208 |       ],
1209 |       "text/plain": [
1210 |        "    Column A    Column B \n",
1211 |        "0    0.234977   -0.617927\n",
1212 |        "1   -1.824023    0.061936\n",
1213 |        "2   -0.066182    0.006777"
1214 |       ]
1215 |      },
1216 |      "execution_count": 24,
1217 |      "metadata": {},
1218 |      "output_type": "execute_result"
1219 |     }
1220 |    ],
1221 |    "source": [
1222 |     "cudfDataFrame = cudf.DataFrame(np.random.randn(3, 2), columns=[\" Column A \", \" Column B \"], index=range(3))\n",
1223 |     "   \n",
1224 |     "cudfDataFrame"
1225 |    ]
1226 |   },
1227 |   {
1228 |    "cell_type": "markdown",
1229 |    "metadata": {},
1230 |    "source": [
1231 |     "\n",
1232 |     "Since df.columns is an Index object, we can use the .str accessor."
1233 |    ]
1234 |   },
1235 |   {
1236 |    "cell_type": "code",
1237 |    "execution_count": 25,
1238 |    "metadata": {},
1239 |    "outputs": [
1240 |     {
1241 |      "name": "stdout",
1242 |      "output_type": "stream",
1243 |      "text": [
1244 |       "Stripped: \n",
1245 |       "Index(['Column A', 'Column B'], dtype='object')\n",
1246 |       "\n",
1247 |       "Lowered: \n"
1248 |      ]
1249 |     },
1250 |     {
1251 |      "data": {
1252 |       "text/plain": [
1253 |        "Index([' column a ', ' column b '], dtype='object')"
1254 |       ]
1255 |      },
1256 |      "execution_count": 25,
1257 |      "metadata": {},
1258 |      "output_type": "execute_result"
1259 |     }
1260 |    ],
1261 |    "source": [
1262 |     "print(\"Stripped: \")\n",
1263 |     "print(pandasDataFrame.columns.str.strip())\n",
1264 |     "print(\"\\nLowered: \")\n",
1265 |     "pandasDataFrame.columns.str.lower()"
1266 |    ]
1267 |   },
1268 |   {
1269 |    "cell_type": "code",
1270 |    "execution_count": 26,
1271 |    "metadata": {},
1272 |    "outputs": [
1273 |     {
1274 |      "name": "stdout",
1275 |      "output_type": "stream",
1276 |      "text": [
1277 |       "Stripped: \n",
1278 |       "Index(['Column A', 'Column B'], dtype='object')\n",
1279 |       "\n",
1280 |       "Lowered: \n"
1281 |      ]
1282 |     },
1283 |     {
1284 |      "data": {
1285 |       "text/plain": [
1286 |        "Index([' column a ', ' column b '], dtype='object')"
1287 |       ]
1288 |      },
1289 |      "execution_count": 26,
1290 |      "metadata": {},
1291 |      "output_type": "execute_result"
1292 |     }
1293 |    ],
1294 |    "source": [
1295 |     "print(\"Stripped: \")\n",
1296 |     "print(cudfDataFrame.columns.str.strip())\n",
1297 |     "print(\"\\nLowered: \")\n",
1298 |     "cudfDataFrame.columns.str.lower()"
1299 |    ]
1300 |   },
1301 |   {
1302 |    "cell_type": "markdown",
1303 |    "metadata": {},
1304 |    "source": [
1305 |     "\n",
1306 |     "These string methods can then be used to clean up the columns as needed. Here we are removing leading and trailing whitespaces, lower casing all names, and replacing any remaining whitespaces with underscores:"
1307 |    ]
1308 |   },
1309 |   {
1310 |    "cell_type": "code",
1311 |    "execution_count": 27,
1312 |    "metadata": {},
1313 |    "outputs": [
1314 |     {
1315 |      "data": {
1316 |       "text/html": [
1317 |        "<div>\n",
1318 |        "<style scoped>\n",
1319 |        "    .dataframe tbody tr th:only-of-type {\n",
1320 |        "        vertical-align: middle;\n",
1321 |        "    }\n",
1322 |        "\n",
1323 |        "    .dataframe tbody tr th {\n",
1324 |        "        vertical-align: top;\n",
1325 |        "    }\n",
1326 |        "\n",
1327 |        "    .dataframe thead th {\n",
1328 |        "        text-align: right;\n",
1329 |        "    }\n",
1330 |        "</style>\n",
1331 |        "<table border=\"1\" class=\"dataframe\">\n",
1332 |        "  <thead>\n",
1333 |        "    <tr style=\"text-align: right;\">\n",
1334 |        "      <th></th>\n",
1335 |        "      <th>column_a</th>\n",
1336 |        "      <th>column_b</th>\n",
1337 |        "    </tr>\n",
1338 |        "  </thead>\n",
1339 |        "  <tbody>\n",
1340 |        "    <tr>\n",
1341 |        "      <th>0</th>\n",
1342 |        "      <td>1.766351</td>\n",
1343 |        "      <td>-0.006574</td>\n",
1344 |        "    </tr>\n",
1345 |        "    <tr>\n",
1346 |        "      <th>1</th>\n",
1347 |        "      <td>-0.034232</td>\n",
1348 |        "      <td>-1.638306</td>\n",
1349 |        "    </tr>\n",
1350 |        "    <tr>\n",
1351 |        "      <th>2</th>\n",
1352 |        "      <td>-0.836389</td>\n",
1353 |        "      <td>-1.506215</td>\n",
1354 |        "    </tr>\n",
1355 |        "  </tbody>\n",
1356 |        "</table>\n",
1357 |        "</div>"
1358 |       ],
1359 |       "text/plain": [
1360 |        "   column_a  column_b\n",
1361 |        "0  1.766351 -0.006574\n",
1362 |        "1 -0.034232 -1.638306\n",
1363 |        "2 -0.836389 -1.506215"
1364 |       ]
1365 |      },
1366 |      "execution_count": 27,
1367 |      "metadata": {},
1368 |      "output_type": "execute_result"
1369 |     }
1370 |    ],
1371 |    "source": [
1372 |     "pandasDataFrame.columns = pandasDataFrame.columns.str.strip().str.lower().str.replace(\" \", \"_\")\n",
1373 |     "pandasDataFrame"
1374 |    ]
1375 |   },
1376 |   {
1377 |    "cell_type": "code",
1378 |    "execution_count": 28,
1379 |    "metadata": {},
1380 |    "outputs": [
1381 |     {
1382 |      "data": {
1383 |       "text/html": [
1384 |        "<div>\n",
1385 |        "<style scoped>\n",
1386 |        "    .dataframe tbody tr th:only-of-type {\n",
1387 |        "        vertical-align: middle;\n",
1388 |        "    }\n",
1389 |        "\n",
1390 |        "    .dataframe tbody tr th {\n",
1391 |        "        vertical-align: top;\n",
1392 |        "    }\n",
1393 |        "\n",
1394 |        "    .dataframe thead th {\n",
1395 |        "        text-align: right;\n",
1396 |        "    }\n",
1397 |        "</style>\n",
1398 |        "<table border=\"1\" class=\"dataframe\">\n",
1399 |        "  <thead>\n",
1400 |        "    <tr style=\"text-align: right;\">\n",
1401 |        "      <th></th>\n",
1402 |        "      <th>column_a</th>\n",
1403 |        "      <th>column_b</th>\n",
1404 |        "    </tr>\n",
1405 |        "  </thead>\n",
1406 |        "  <tbody>\n",
1407 |        "    <tr>\n",
1408 |        "      <th>0</th>\n",
1409 |        "      <td>0.234977</td>\n",
1410 |        "      <td>-0.617927</td>\n",
1411 |        "    </tr>\n",
1412 |        "    <tr>\n",
1413 |        "      <th>1</th>\n",
1414 |        "      <td>-1.824023</td>\n",
1415 |        "      <td>0.061936</td>\n",
1416 |        "    </tr>\n",
1417 |        "    <tr>\n",
1418 |        "      <th>2</th>\n",
1419 |        "      <td>-0.066182</td>\n",
1420 |        "      <td>0.006777</td>\n",
1421 |        "    </tr>\n",
1422 |        "  </tbody>\n",
1423 |        "</table>\n",
1424 |        "</div>"
1425 |       ],
1426 |       "text/plain": [
1427 |        "   column_a  column_b\n",
1428 |        "0  0.234977 -0.617927\n",
1429 |        "1 -1.824023  0.061936\n",
1430 |        "2 -0.066182  0.006777"
1431 |       ]
1432 |      },
1433 |      "execution_count": 28,
1434 |      "metadata": {},
1435 |      "output_type": "execute_result"
1436 |     }
1437 |    ],
1438 |    "source": [
1439 |     "cudfDataFrame.columns = cudfDataFrame.columns.str.strip().str.lower().str.replace(\" \", \"_\")\n",
1440 |     "cudfDataFrame"
1441 |    ]
1442 |   },
1443 |   {
1444 |    "cell_type": "markdown",
1445 |    "metadata": {},
1446 |    "source": [
1447 |     "## Splitting and replacing strings"
1448 |    ]
1449 |   },
1450 |   {
1451 |    "cell_type": "markdown",
1452 |    "metadata": {},
1453 |    "source": [
1454 |     "Methods like split return a Series of lists:"
1455 |    ]
1456 |   },
1457 |   {
1458 |    "cell_type": "code",
1459 |    "execution_count": 29,
1460 |    "metadata": {},
1461 |    "outputs": [
1462 |     {
1463 |      "data": {
1464 |       "text/plain": [
1465 |        "0    [a, b, c]\n",
1466 |        "1    [c, d, e]\n",
1467 |        "2         <NA>\n",
1468 |        "3    [f, g, h]\n",
1469 |        "dtype: object"
1470 |       ]
1471 |      },
1472 |      "execution_count": 29,
1473 |      "metadata": {},
1474 |      "output_type": "execute_result"
1475 |     }
1476 |    ],
1477 |    "source": [
1478 |     "pandasSeries3 = pd.Series([\"a_b_c\", \"c_d_e\", np.nan, \"f_g_h\"], dtype=\"string\")\n",
1479 |     "pandasSeries3.str.split(\"_\")"
1480 |    ]
1481 |   },
1482 |   {
1483 |    "cell_type": "code",
1484 |    "execution_count": 30,
1485 |    "metadata": {},
1486 |    "outputs": [
1487 |     {
1488 |      "data": {
1489 |       "text/plain": [
1490 |        "0    [a, b, c]\n",
1491 |        "1    [c, d, e]\n",
1492 |        "2         None\n",
1493 |        "3    [f, g, h]\n",
1494 |        "dtype: list"
1495 |       ]
1496 |      },
1497 |      "execution_count": 30,
1498 |      "metadata": {},
1499 |      "output_type": "execute_result"
1500 |     }
1501 |    ],
1502 |    "source": [
1503 |     "cudfSeries3 = cudf.Series([\"a_b_c\", \"c_d_e\", np.nan, \"f_g_h\"], dtype=\"str\")\n",
1504 |     "cudfSeries3.str.split(\"_\")"
1505 |    ]
1506 |   },
1507 |   {
1508 |    "cell_type": "markdown",
1509 |    "metadata": {},
1510 |    "source": [
1511 |     "\n",
1512 |     "It is easy to expand this to return a DataFrame using expand."
1513 |    ]
1514 |   },
1515 |   {
1516 |    "cell_type": "code",
1517 |    "execution_count": 31,
1518 |    "metadata": {},
1519 |    "outputs": [
1520 |     {
1521 |      "data": {
1522 |       "text/html": [
1523 |        "<div>\n",
1524 |        "<style scoped>\n",
1525 |        "    .dataframe tbody tr th:only-of-type {\n",
1526 |        "        vertical-align: middle;\n",
1527 |        "    }\n",
1528 |        "\n",
1529 |        "    .dataframe tbody tr th {\n",
1530 |        "        vertical-align: top;\n",
1531 |        "    }\n",
1532 |        "\n",
1533 |        "    .dataframe thead th {\n",
1534 |        "        text-align: right;\n",
1535 |        "    }\n",
1536 |        "</style>\n",
1537 |        "<table border=\"1\" class=\"dataframe\">\n",
1538 |        "  <thead>\n",
1539 |        "    <tr style=\"text-align: right;\">\n",
1540 |        "      <th></th>\n",
1541 |        "      <th>0</th>\n",
1542 |        "      <th>1</th>\n",
1543 |        "      <th>2</th>\n",
1544 |        "    </tr>\n",
1545 |        "  </thead>\n",
1546 |        "  <tbody>\n",
1547 |        "    <tr>\n",
1548 |        "      <th>0</th>\n",
1549 |        "      <td>a</td>\n",
1550 |        "      <td>b</td>\n",
1551 |        "      <td>c</td>\n",
1552 |        "    </tr>\n",
1553 |        "    <tr>\n",
1554 |        "      <th>1</th>\n",
1555 |        "      <td>c</td>\n",
1556 |        "      <td>d</td>\n",
1557 |        "      <td>e</td>\n",
1558 |        "    </tr>\n",
1559 |        "    <tr>\n",
1560 |        "      <th>2</th>\n",
1561 |        "      <td>&lt;NA&gt;</td>\n",
1562 |        "      <td>&lt;NA&gt;</td>\n",
1563 |        "      <td>&lt;NA&gt;</td>\n",
1564 |        "    </tr>\n",
1565 |        "    <tr>\n",
1566 |        "      <th>3</th>\n",
1567 |        "      <td>f</td>\n",
1568 |        "      <td>g</td>\n",
1569 |        "      <td>h</td>\n",
1570 |        "    </tr>\n",
1571 |        "  </tbody>\n",
1572 |        "</table>\n",
1573 |        "</div>"
1574 |       ],
1575 |       "text/plain": [
1576 |        "      0     1     2\n",
1577 |        "0     a     b     c\n",
1578 |        "1     c     d     e\n",
1579 |        "2  <NA>  <NA>  <NA>\n",
1580 |        "3     f     g     h"
1581 |       ]
1582 |      },
1583 |      "execution_count": 31,
1584 |      "metadata": {},
1585 |      "output_type": "execute_result"
1586 |     }
1587 |    ],
1588 |    "source": [
1589 |     "pandasSeries3.str.split(\"_\", expand=True)"
1590 |    ]
1591 |   },
1592 |   {
1593 |    "cell_type": "code",
1594 |    "execution_count": 32,
1595 |    "metadata": {},
1596 |    "outputs": [
1597 |     {
1598 |      "data": {
1599 |       "text/html": [
1600 |        "<div>\n",
1601 |        "<style scoped>\n",
1602 |        "    .dataframe tbody tr th:only-of-type {\n",
1603 |        "        vertical-align: middle;\n",
1604 |        "    }\n",
1605 |        "\n",
1606 |        "    .dataframe tbody tr th {\n",
1607 |        "        vertical-align: top;\n",
1608 |        "    }\n",
1609 |        "\n",
1610 |        "    .dataframe thead th {\n",
1611 |        "        text-align: right;\n",
1612 |        "    }\n",
1613 |        "</style>\n",
1614 |        "<table border=\"1\" class=\"dataframe\">\n",
1615 |        "  <thead>\n",
1616 |        "    <tr style=\"text-align: right;\">\n",
1617 |        "      <th></th>\n",
1618 |        "      <th>0</th>\n",
1619 |        "      <th>1</th>\n",
1620 |        "      <th>2</th>\n",
1621 |        "    </tr>\n",
1622 |        "  </thead>\n",
1623 |        "  <tbody>\n",
1624 |        "    <tr>\n",
1625 |        "      <th>0</th>\n",
1626 |        "      <td>a</td>\n",
1627 |        "      <td>b</td>\n",
1628 |        "      <td>c</td>\n",
1629 |        "    </tr>\n",
1630 |        "    <tr>\n",
1631 |        "      <th>1</th>\n",
1632 |        "      <td>c</td>\n",
1633 |        "      <td>d</td>\n",
1634 |        "      <td>e</td>\n",
1635 |        "    </tr>\n",
1636 |        "    <tr>\n",
1637 |        "      <th>2</th>\n",
1638 |        "      <td>&lt;NA&gt;</td>\n",
1639 |        "      <td>&lt;NA&gt;</td>\n",
1640 |        "      <td>&lt;NA&gt;</td>\n",
1641 |        "    </tr>\n",
1642 |        "    <tr>\n",
1643 |        "      <th>3</th>\n",
1644 |        "      <td>f</td>\n",
1645 |        "      <td>g</td>\n",
1646 |        "      <td>h</td>\n",
1647 |        "    </tr>\n",
1648 |        "  </tbody>\n",
1649 |        "</table>\n",
1650 |        "</div>"
1651 |       ],
1652 |       "text/plain": [
1653 |        "      0     1     2\n",
1654 |        "0     a     b     c\n",
1655 |        "1     c     d     e\n",
1656 |        "2  <NA>  <NA>  <NA>\n",
1657 |        "3     f     g     h"
1658 |       ]
1659 |      },
1660 |      "execution_count": 32,
1661 |      "metadata": {},
1662 |      "output_type": "execute_result"
1663 |     }
1664 |    ],
1665 |    "source": [
1666 |     "cudfSeries3.str.split(\"_\", expand=True)"
1667 |    ]
1668 |   },
1669 |   {
1670 |    "cell_type": "markdown",
1671 |    "metadata": {},
1672 |    "source": [
1673 |     "\n",
1674 |     "When original Series has StringDtype, the output columns will all be StringDtype as well."
1675 |    ]
1676 |   },
1677 |   {
1678 |    "cell_type": "markdown",
1679 |    "metadata": {},
1680 |    "source": [
1681 |     "It is also possible to limit the number of splits:"
1682 |    ]
1683 |   },
1684 |   {
1685 |    "cell_type": "code",
1686 |    "execution_count": 33,
1687 |    "metadata": {},
1688 |    "outputs": [
1689 |     {
1690 |      "data": {
1691 |       "text/html": [
1692 |        "<div>\n",
1693 |        "<style scoped>\n",
1694 |        "    .dataframe tbody tr th:only-of-type {\n",
1695 |        "        vertical-align: middle;\n",
1696 |        "    }\n",
1697 |        "\n",
1698 |        "    .dataframe tbody tr th {\n",
1699 |        "        vertical-align: top;\n",
1700 |        "    }\n",
1701 |        "\n",
1702 |        "    .dataframe thead th {\n",
1703 |        "        text-align: right;\n",
1704 |        "    }\n",
1705 |        "</style>\n",
1706 |        "<table border=\"1\" class=\"dataframe\">\n",
1707 |        "  <thead>\n",
1708 |        "    <tr style=\"text-align: right;\">\n",
1709 |        "      <th></th>\n",
1710 |        "      <th>0</th>\n",
1711 |        "      <th>1</th>\n",
1712 |        "    </tr>\n",
1713 |        "  </thead>\n",
1714 |        "  <tbody>\n",
1715 |        "    <tr>\n",
1716 |        "      <th>0</th>\n",
1717 |        "      <td>a</td>\n",
1718 |        "      <td>b_c</td>\n",
1719 |        "    </tr>\n",
1720 |        "    <tr>\n",
1721 |        "      <th>1</th>\n",
1722 |        "      <td>c</td>\n",
1723 |        "      <td>d_e</td>\n",
1724 |        "    </tr>\n",
1725 |        "    <tr>\n",
1726 |        "      <th>2</th>\n",
1727 |        "      <td>&lt;NA&gt;</td>\n",
1728 |        "      <td>&lt;NA&gt;</td>\n",
1729 |        "    </tr>\n",
1730 |        "    <tr>\n",
1731 |        "      <th>3</th>\n",
1732 |        "      <td>f</td>\n",
1733 |        "      <td>g_h</td>\n",
1734 |        "    </tr>\n",
1735 |        "  </tbody>\n",
1736 |        "</table>\n",
1737 |        "</div>"
1738 |       ],
1739 |       "text/plain": [
1740 |        "      0     1\n",
1741 |        "0     a   b_c\n",
1742 |        "1     c   d_e\n",
1743 |        "2  <NA>  <NA>\n",
1744 |        "3     f   g_h"
1745 |       ]
1746 |      },
1747 |      "execution_count": 33,
1748 |      "metadata": {},
1749 |      "output_type": "execute_result"
1750 |     }
1751 |    ],
1752 |    "source": [
1753 |     "pandasSeries3.str.split(\"_\", expand=True, n=1)"
1754 |    ]
1755 |   },
1756 |   {
1757 |    "cell_type": "code",
1758 |    "execution_count": 34,
1759 |    "metadata": {},
1760 |    "outputs": [
1761 |     {
1762 |      "data": {
1763 |       "text/html": [
1764 |        "<div>\n",
1765 |        "<style scoped>\n",
1766 |        "    .dataframe tbody tr th:only-of-type {\n",
1767 |        "        vertical-align: middle;\n",
1768 |        "    }\n",
1769 |        "\n",
1770 |        "    .dataframe tbody tr th {\n",
1771 |        "        vertical-align: top;\n",
1772 |        "    }\n",
1773 |        "\n",
1774 |        "    .dataframe thead th {\n",
1775 |        "        text-align: right;\n",
1776 |        "    }\n",
1777 |        "</style>\n",
1778 |        "<table border=\"1\" class=\"dataframe\">\n",
1779 |        "  <thead>\n",
1780 |        "    <tr style=\"text-align: right;\">\n",
1781 |        "      <th></th>\n",
1782 |        "      <th>0</th>\n",
1783 |        "      <th>1</th>\n",
1784 |        "    </tr>\n",
1785 |        "  </thead>\n",
1786 |        "  <tbody>\n",
1787 |        "    <tr>\n",
1788 |        "      <th>0</th>\n",
1789 |        "      <td>a</td>\n",
1790 |        "      <td>b_c</td>\n",
1791 |        "    </tr>\n",
1792 |        "    <tr>\n",
1793 |        "      <th>1</th>\n",
1794 |        "      <td>c</td>\n",
1795 |        "      <td>d_e</td>\n",
1796 |        "    </tr>\n",
1797 |        "    <tr>\n",
1798 |        "      <th>2</th>\n",
1799 |        "      <td>&lt;NA&gt;</td>\n",
1800 |        "      <td>&lt;NA&gt;</td>\n",
1801 |        "    </tr>\n",
1802 |        "    <tr>\n",
1803 |        "      <th>3</th>\n",
1804 |        "      <td>f</td>\n",
1805 |        "      <td>g_h</td>\n",
1806 |        "    </tr>\n",
1807 |        "  </tbody>\n",
1808 |        "</table>\n",
1809 |        "</div>"
1810 |       ],
1811 |       "text/plain": [
1812 |        "      0     1\n",
1813 |        "0     a   b_c\n",
1814 |        "1     c   d_e\n",
1815 |        "2  <NA>  <NA>\n",
1816 |        "3     f   g_h"
1817 |       ]
1818 |      },
1819 |      "execution_count": 34,
1820 |      "metadata": {},
1821 |      "output_type": "execute_result"
1822 |     }
1823 |    ],
1824 |    "source": [
1825 |     "cudfSeries3.str.split(\"_\", expand=True, n=1)"
1826 |    ]
1827 |   },
1828 |   {
1829 |    "cell_type": "markdown",
1830 |    "metadata": {},
1831 |    "source": [
1832 |     "\n",
1833 |     "rsplit is similar to split except it works in the reverse direction, i.e., from the end of the string to the beginning of the string:"
1834 |    ]
1835 |   },
1836 |   {
1837 |    "cell_type": "code",
1838 |    "execution_count": 35,
1839 |    "metadata": {},
1840 |    "outputs": [
1841 |     {
1842 |      "data": {
1843 |       "text/html": [
1844 |        "<div>\n",
1845 |        "<style scoped>\n",
1846 |        "    .dataframe tbody tr th:only-of-type {\n",
1847 |        "        vertical-align: middle;\n",
1848 |        "    }\n",
1849 |        "\n",
1850 |        "    .dataframe tbody tr th {\n",
1851 |        "        vertical-align: top;\n",
1852 |        "    }\n",
1853 |        "\n",
1854 |        "    .dataframe thead th {\n",
1855 |        "        text-align: right;\n",
1856 |        "    }\n",
1857 |        "</style>\n",
1858 |        "<table border=\"1\" class=\"dataframe\">\n",
1859 |        "  <thead>\n",
1860 |        "    <tr style=\"text-align: right;\">\n",
1861 |        "      <th></th>\n",
1862 |        "      <th>0</th>\n",
1863 |        "      <th>1</th>\n",
1864 |        "    </tr>\n",
1865 |        "  </thead>\n",
1866 |        "  <tbody>\n",
1867 |        "    <tr>\n",
1868 |        "      <th>0</th>\n",
1869 |        "      <td>a_b</td>\n",
1870 |        "      <td>c</td>\n",
1871 |        "    </tr>\n",
1872 |        "    <tr>\n",
1873 |        "      <th>1</th>\n",
1874 |        "      <td>c_d</td>\n",
1875 |        "      <td>e</td>\n",
1876 |        "    </tr>\n",
1877 |        "    <tr>\n",
1878 |        "      <th>2</th>\n",
1879 |        "      <td>&lt;NA&gt;</td>\n",
1880 |        "      <td>&lt;NA&gt;</td>\n",
1881 |        "    </tr>\n",
1882 |        "    <tr>\n",
1883 |        "      <th>3</th>\n",
1884 |        "      <td>f_g</td>\n",
1885 |        "      <td>h</td>\n",
1886 |        "    </tr>\n",
1887 |        "  </tbody>\n",
1888 |        "</table>\n",
1889 |        "</div>"
1890 |       ],
1891 |       "text/plain": [
1892 |        "      0     1\n",
1893 |        "0   a_b     c\n",
1894 |        "1   c_d     e\n",
1895 |        "2  <NA>  <NA>\n",
1896 |        "3   f_g     h"
1897 |       ]
1898 |      },
1899 |      "execution_count": 35,
1900 |      "metadata": {},
1901 |      "output_type": "execute_result"
1902 |     }
1903 |    ],
1904 |    "source": [
1905 |     "pandasSeries3.str.rsplit(\"_\", expand=True, n=1)"
1906 |    ]
1907 |   },
1908 |   {
1909 |    "cell_type": "code",
1910 |    "execution_count": 36,
1911 |    "metadata": {},
1912 |    "outputs": [
1913 |     {
1914 |      "data": {
1915 |       "text/html": [
1916 |        "<div>\n",
1917 |        "<style scoped>\n",
1918 |        "    .dataframe tbody tr th:only-of-type {\n",
1919 |        "        vertical-align: middle;\n",
1920 |        "    }\n",
1921 |        "\n",
1922 |        "    .dataframe tbody tr th {\n",
1923 |        "        vertical-align: top;\n",
1924 |        "    }\n",
1925 |        "\n",
1926 |        "    .dataframe thead th {\n",
1927 |        "        text-align: right;\n",
1928 |        "    }\n",
1929 |        "</style>\n",
1930 |        "<table border=\"1\" class=\"dataframe\">\n",
1931 |        "  <thead>\n",
1932 |        "    <tr style=\"text-align: right;\">\n",
1933 |        "      <th></th>\n",
1934 |        "      <th>0</th>\n",
1935 |        "      <th>1</th>\n",
1936 |        "    </tr>\n",
1937 |        "  </thead>\n",
1938 |        "  <tbody>\n",
1939 |        "    <tr>\n",
1940 |        "      <th>0</th>\n",
1941 |        "      <td>a_b</td>\n",
1942 |        "      <td>c</td>\n",
1943 |        "    </tr>\n",
1944 |        "    <tr>\n",
1945 |        "      <th>1</th>\n",
1946 |        "      <td>c_d</td>\n",
1947 |        "      <td>e</td>\n",
1948 |        "    </tr>\n",
1949 |        "    <tr>\n",
1950 |        "      <th>2</th>\n",
1951 |        "      <td>&lt;NA&gt;</td>\n",
1952 |        "      <td>&lt;NA&gt;</td>\n",
1953 |        "    </tr>\n",
1954 |        "    <tr>\n",
1955 |        "      <th>3</th>\n",
1956 |        "      <td>f_g</td>\n",
1957 |        "      <td>h</td>\n",
1958 |        "    </tr>\n",
1959 |        "  </tbody>\n",
1960 |        "</table>\n",
1961 |        "</div>"
1962 |       ],
1963 |       "text/plain": [
1964 |        "      0     1\n",
1965 |        "0   a_b     c\n",
1966 |        "1   c_d     e\n",
1967 |        "2  <NA>  <NA>\n",
1968 |        "3   f_g     h"
1969 |       ]
1970 |      },
1971 |      "execution_count": 36,
1972 |      "metadata": {},
1973 |      "output_type": "execute_result"
1974 |     }
1975 |    ],
1976 |    "source": [
1977 |     "cudfSeries3.str.rsplit(\"_\", expand=True, n=1)"
1978 |    ]
1979 |   },
1980 |   {
1981 |    "cell_type": "markdown",
1982 |    "metadata": {},
1983 |    "source": [
1984 |     "## The replace method\n"
1985 |    ]
1986 |   },
1987 |   {
1988 |    "cell_type": "markdown",
1989 |    "metadata": {},
1990 |    "source": [
1991 |     "replace optionally uses regular expressions:"
1992 |    ]
1993 |   },
1994 |   {
1995 |    "cell_type": "code",
1996 |    "execution_count": 37,
1997 |    "metadata": {},
1998 |    "outputs": [
1999 |     {
2000 |      "name": "stdout",
2001 |      "output_type": "stream",
2002 |      "text": [
2003 |       "Original: \n",
2004 |       "0       A\n",
2005 |       "1       B\n",
2006 |       "2       C\n",
2007 |       "3    Aaba\n",
2008 |       "4    Baca\n",
2009 |       "5        \n",
2010 |       "6    <NA>\n",
2011 |       "7    CABA\n",
2012 |       "8     dog\n",
2013 |       "9     cat\n",
2014 |       "dtype: string\n",
2015 |       "\n",
2016 |       "Replaced: \n"
2017 |      ]
2018 |     },
2019 |     {
2020 |      "data": {
2021 |       "text/plain": [
2022 |        "0           A\n",
2023 |        "1           B\n",
2024 |        "2           C\n",
2025 |        "3    XX-XX ba\n",
2026 |        "4    XX-XX ca\n",
2027 |        "5            \n",
2028 |        "6        <NA>\n",
2029 |        "7        CABA\n",
2030 |        "8      XX-XX \n",
2031 |        "9     XX-XX t\n",
2032 |        "dtype: string"
2033 |       ]
2034 |      },
2035 |      "execution_count": 37,
2036 |      "metadata": {},
2037 |      "output_type": "execute_result"
2038 |     }
2039 |    ],
2040 |    "source": [
2041 |     "pandasSeries4 = pd.Series(\n",
2042 |     "   ....:     [\"A\", \"B\", \"C\", \"Aaba\", \"Baca\", \"\", np.nan, \"CABA\", \"dog\", \"cat\"],\n",
2043 |     "   ....:     dtype=\"string\",\n",
2044 |     "   ....: )\n",
2045 |     "   ....: \n",
2046 |     "print('Original: ')\n",
2047 |     "print(pandasSeries4) \n",
2048 |     "print('\\nReplaced: ')\n",
2049 |     "pandasSeries4.str.replace(\"^.a|dog\", \"XX-XX \", regex=True)"
2050 |    ]
2051 |   },
2052 |   {
2053 |    "cell_type": "code",
2054 |    "execution_count": 38,
2055 |    "metadata": {},
2056 |    "outputs": [
2057 |     {
2058 |      "name": "stdout",
2059 |      "output_type": "stream",
2060 |      "text": [
2061 |       "Original: \n",
2062 |       "0       A\n",
2063 |       "1       B\n",
2064 |       "2       C\n",
2065 |       "3    Aaba\n",
2066 |       "4    Baca\n",
2067 |       "5        \n",
2068 |       "6    <NA>\n",
2069 |       "7    CABA\n",
2070 |       "8     dog\n",
2071 |       "9     cat\n",
2072 |       "dtype: object\n",
2073 |       "\n",
2074 |       "Replaced: \n"
2075 |      ]
2076 |     },
2077 |     {
2078 |      "data": {
2079 |       "text/plain": [
2080 |        "0           A\n",
2081 |        "1           B\n",
2082 |        "2           C\n",
2083 |        "3    XX-XX ba\n",
2084 |        "4    XX-XX ca\n",
2085 |        "5            \n",
2086 |        "6        <NA>\n",
2087 |        "7        CABA\n",
2088 |        "8      XX-XX \n",
2089 |        "9     XX-XX t\n",
2090 |        "dtype: object"
2091 |       ]
2092 |      },
2093 |      "execution_count": 38,
2094 |      "metadata": {},
2095 |      "output_type": "execute_result"
2096 |     }
2097 |    ],
2098 |    "source": [
2099 |     "cudfSeries4 = cudf.Series(\n",
2100 |     "   ....:     [\"A\", \"B\", \"C\", \"Aaba\", \"Baca\", \"\", np.nan, \"CABA\", \"dog\", \"cat\"],\n",
2101 |     "   ....:     dtype=\"str\",\n",
2102 |     "   ....: )\n",
2103 |     "   ....: \n",
2104 |     "print('Original: ')\n",
2105 |     "print(cudfSeries4) \n",
2106 |     "print('\\nReplaced: ')\n",
2107 |     "cudfSeries4.str.replace(\"^.a|dog\", \"XX-XX \",  regex=True)"
2108 |    ]
2109 |   },
2110 |   {
2111 |    "cell_type": "markdown",
2112 |    "metadata": {},
2113 |    "source": [
2114 |     "\n",
2115 |     "If you want literal replacement of a string (equivalent to str.replace()), you can set the optional regex parameter to False, rather than escaping each character. In this case both pat and repl must be strings:"
2116 |    ]
2117 |   },
2118 |   {
2119 |    "cell_type": "code",
2120 |    "execution_count": 39,
2121 |    "metadata": {},
2122 |    "outputs": [
2123 |     {
2124 |      "name": "stdout",
2125 |      "output_type": "stream",
2126 |      "text": [
2127 |       "0         12\n",
2128 |       "1        -10\n",
2129 |       "2    $10,000\n",
2130 |       "dtype: string\n",
2131 |       "\n",
2132 |       "Are these equivalent? \n",
2133 |       "\n"
2134 |      ]
2135 |     },
2136 |     {
2137 |      "data": {
2138 |       "text/plain": [
2139 |        "0         12\n",
2140 |        "1        -10\n",
2141 |        "2    $10,000\n",
2142 |        "dtype: string"
2143 |       ]
2144 |      },
2145 |      "execution_count": 39,
2146 |      "metadata": {},
2147 |      "output_type": "execute_result"
2148 |     }
2149 |    ],
2150 |    "source": [
2151 |     "pandasdollars = pd.Series([\"12\", \"-$10\", \"$10,000\"], dtype=\"string\")\n",
2152 |     "\n",
2153 |     "# These lines are equivalent\n",
2154 |     "print(pandasdollars.str.replace(r\"-\\$\", \"-\", regex=True))\n",
2155 |     "print(\"\\nAre these equivalent? \\n\")\n",
2156 |     "pandasdollars.str.replace(\"-$\", \"-\", regex=False)"
2157 |    ]
2158 |   },
2159 |   {
2160 |    "cell_type": "code",
2161 |    "execution_count": 40,
2162 |    "metadata": {},
2163 |    "outputs": [
2164 |     {
2165 |      "name": "stdout",
2166 |      "output_type": "stream",
2167 |      "text": [
2168 |       "0         12\n",
2169 |       "1        -10\n",
2170 |       "2    $10,000\n",
2171 |       "dtype: object\n",
2172 |       "\n",
2173 |       "Are these equivalent? \n",
2174 |       "\n"
2175 |      ]
2176 |     },
2177 |     {
2178 |      "data": {
2179 |       "text/plain": [
2180 |        "0         12\n",
2181 |        "1        -10\n",
2182 |        "2    $10,000\n",
2183 |        "dtype: object"
2184 |       ]
2185 |      },
2186 |      "execution_count": 40,
2187 |      "metadata": {},
2188 |      "output_type": "execute_result"
2189 |     }
2190 |    ],
2191 |    "source": [
2192 |     "cudfDollars = cudf.Series([\"12\", \"-$10\", \"$10,000\"], dtype=\"str\")\n",
2193 |     "\n",
2194 |     "# These lines are equivalent\n",
2195 |     "print(cudfDollars.str.replace(r\"-\\$\", \"-\", regex=True))\n",
2196 |     "print(\"\\nAre these equivalent? \\n\")\n",
2197 |     "cudfDollars.str.replace(\"-$\", \"-\", regex=False)"
2198 |    ]
2199 |   },
2200 |   {
2201 |    "cell_type": "markdown",
2202 |    "metadata": {},
2203 |    "source": [
2204 |     "## Concatenation\n"
2205 |    ]
2206 |   },
2207 |   {
2208 |    "cell_type": "markdown",
2209 |    "metadata": {},
2210 |    "source": [
2211 |     "There are several ways to concatenate a Series or Index, either with itself or others, all based on cat(), resp. Index.str.cat."
2212 |    ]
2213 |   },
2214 |   {
2215 |    "cell_type": "markdown",
2216 |    "metadata": {},
2217 |    "source": [
2218 |     "### Concatenating a single Series into a string"
2219 |    ]
2220 |   },
2221 |   {
2222 |    "cell_type": "markdown",
2223 |    "metadata": {},
2224 |    "source": [
2225 |     "The content of a Series (or Index) can be concatenated:"
2226 |    ]
2227 |   },
2228 |   {
2229 |    "cell_type": "code",
2230 |    "execution_count": 41,
2231 |    "metadata": {},
2232 |    "outputs": [
2233 |     {
2234 |      "data": {
2235 |       "text/plain": [
2236 |        "'a,b,c,d'"
2237 |       ]
2238 |      },
2239 |      "execution_count": 41,
2240 |      "metadata": {},
2241 |      "output_type": "execute_result"
2242 |     }
2243 |    ],
2244 |    "source": [
2245 |     "pandasSeries = pd.Series([\"a\", \"b\", \"c\", \"d\"], dtype=\"string\")\n",
2246 |     "\n",
2247 |     "pandasSeries.str.cat(sep=\",\")"
2248 |    ]
2249 |   },
2250 |   {
2251 |    "cell_type": "code",
2252 |    "execution_count": 42,
2253 |    "metadata": {},
2254 |    "outputs": [
2255 |     {
2256 |      "data": {
2257 |       "text/plain": [
2258 |        "'a,b,c,d'"
2259 |       ]
2260 |      },
2261 |      "execution_count": 42,
2262 |      "metadata": {},
2263 |      "output_type": "execute_result"
2264 |     }
2265 |    ],
2266 |    "source": [
2267 |     "cudfSeries = cudf.Series([\"a\", \"b\", \"c\", \"d\"], dtype=\"str\")\n",
2268 |     "\n",
2269 |     "cudfSeries.str.cat(sep=\",\")"
2270 |    ]
2271 |   },
2272 |   {
2273 |    "cell_type": "markdown",
2274 |    "metadata": {},
2275 |    "source": [
2276 |     "\n",
2277 |     "If not specified, the keyword sep for the separator defaults to the empty string, sep='':"
2278 |    ]
2279 |   },
2280 |   {
2281 |    "cell_type": "code",
2282 |    "execution_count": 43,
2283 |    "metadata": {},
2284 |    "outputs": [
2285 |     {
2286 |      "data": {
2287 |       "text/plain": [
2288 |        "'abcd'"
2289 |       ]
2290 |      },
2291 |      "execution_count": 43,
2292 |      "metadata": {},
2293 |      "output_type": "execute_result"
2294 |     }
2295 |    ],
2296 |    "source": [
2297 |     "pandasSeries.str.cat()"
2298 |    ]
2299 |   },
2300 |   {
2301 |    "cell_type": "code",
2302 |    "execution_count": 44,
2303 |    "metadata": {},
2304 |    "outputs": [
2305 |     {
2306 |      "data": {
2307 |       "text/plain": [
2308 |        "'abcd'"
2309 |       ]
2310 |      },
2311 |      "execution_count": 44,
2312 |      "metadata": {},
2313 |      "output_type": "execute_result"
2314 |     }
2315 |    ],
2316 |    "source": [
2317 |     "cudfSeries.str.cat()"
2318 |    ]
2319 |   },
2320 |   {
2321 |    "cell_type": "markdown",
2322 |    "metadata": {},
2323 |    "source": [
2324 |     "\n",
2325 |     "By default, missing values are ignored. Using na_rep, they can be given a representation:"
2326 |    ]
2327 |   },
2328 |   {
2329 |    "cell_type": "code",
2330 |    "execution_count": 45,
2331 |    "metadata": {},
2332 |    "outputs": [
2333 |     {
2334 |      "name": "stdout",
2335 |      "output_type": "stream",
2336 |      "text": [
2337 |       "Seperated by ,: \n",
2338 |       "a,b,d\n",
2339 |       "\n",
2340 |       "Seperated by , & -: \n"
2341 |      ]
2342 |     },
2343 |     {
2344 |      "data": {
2345 |       "text/plain": [
2346 |        "'a,b,-,d'"
2347 |       ]
2348 |      },
2349 |      "execution_count": 45,
2350 |      "metadata": {},
2351 |      "output_type": "execute_result"
2352 |     }
2353 |    ],
2354 |    "source": [
2355 |     "pandasSeriesB = pd.Series([\"a\", \"b\", np.nan, \"d\"], dtype=\"string\")\n",
2356 |     "print('Seperated by ,: ')\n",
2357 |     "print(pandasSeriesB.str.cat(sep=\",\"))\n",
2358 |     "print('\\nSeperated by , & -: ')\n",
2359 |     "pandasSeriesB.str.cat(sep=\",\", na_rep=\"-\")"
2360 |    ]
2361 |   },
2362 |   {
2363 |    "cell_type": "code",
2364 |    "execution_count": 46,
2365 |    "metadata": {},
2366 |    "outputs": [
2367 |     {
2368 |      "name": "stdout",
2369 |      "output_type": "stream",
2370 |      "text": [
2371 |       "Seperated by ,: \n",
2372 |       "a,b,d\n",
2373 |       "\n",
2374 |       "Seperated by , & -: \n"
2375 |      ]
2376 |     },
2377 |     {
2378 |      "data": {
2379 |       "text/plain": [
2380 |        "'a,b,-,d'"
2381 |       ]
2382 |      },
2383 |      "execution_count": 46,
2384 |      "metadata": {},
2385 |      "output_type": "execute_result"
2386 |     }
2387 |    ],
2388 |    "source": [
2389 |     "cudfSeriesB = cudf.Series([\"a\", \"b\", np.nan, \"d\"], dtype=\"str\")\n",
2390 |     "print('Seperated by ,: ')\n",
2391 |     "print(cudfSeriesB.str.cat(sep=\",\"))\n",
2392 |     "print('\\nSeperated by , & -: ')\n",
2393 |     "cudfSeriesB.str.cat(sep=\",\", na_rep=\"-\")"
2394 |    ]
2395 |   },
2396 |   {
2397 |    "cell_type": "markdown",
2398 |    "metadata": {},
2399 |    "source": [
2400 |     "## Concatenating a Series and something list-like into a Series"
2401 |    ]
2402 |   },
2403 |   {
2404 |    "cell_type": "markdown",
2405 |    "metadata": {},
2406 |    "source": [
2407 |     "The first argument to cat() can be a list-like object, provided that it matches the length of the calling Series (or Index)."
2408 |    ]
2409 |   },
2410 |   {
2411 |    "cell_type": "code",
2412 |    "execution_count": 47,
2413 |    "metadata": {},
2414 |    "outputs": [
2415 |     {
2416 |      "data": {
2417 |       "text/plain": [
2418 |        "0    aA\n",
2419 |        "1    bB\n",
2420 |        "2    cC\n",
2421 |        "3    dD\n",
2422 |        "dtype: string"
2423 |       ]
2424 |      },
2425 |      "execution_count": 47,
2426 |      "metadata": {},
2427 |      "output_type": "execute_result"
2428 |     }
2429 |    ],
2430 |    "source": [
2431 |     "pandasSeries.str.cat([\"A\", \"B\", \"C\", \"D\"])"
2432 |    ]
2433 |   },
2434 |   {
2435 |    "cell_type": "code",
2436 |    "execution_count": 48,
2437 |    "metadata": {},
2438 |    "outputs": [
2439 |     {
2440 |      "data": {
2441 |       "text/plain": [
2442 |        "0    aA\n",
2443 |        "1    bB\n",
2444 |        "2    cC\n",
2445 |        "3    dD\n",
2446 |        "dtype: object"
2447 |       ]
2448 |      },
2449 |      "execution_count": 48,
2450 |      "metadata": {},
2451 |      "output_type": "execute_result"
2452 |     }
2453 |    ],
2454 |    "source": [
2455 |     "cudfSeries.str.cat([\"A\", \"B\", \"C\", \"D\"])"
2456 |    ]
2457 |   },
2458 |   {
2459 |    "cell_type": "markdown",
2460 |    "metadata": {},
2461 |    "source": [
2462 |     "Missing values on either side will result in missing values in the result as well, unless na_rep is specified:"
2463 |    ]
2464 |   },
2465 |   {
2466 |    "cell_type": "code",
2467 |    "execution_count": 49,
2468 |    "metadata": {},
2469 |    "outputs": [
2470 |     {
2471 |      "name": "stdout",
2472 |      "output_type": "stream",
2473 |      "text": [
2474 |       "Original: \n",
2475 |       "0      aa\n",
2476 |       "1      bb\n",
2477 |       "2    <NA>\n",
2478 |       "3      dd\n",
2479 |       "dtype: string\n",
2480 |       "\n",
2481 |       "na_rep is specified\n"
2482 |      ]
2483 |     },
2484 |     {
2485 |      "data": {
2486 |       "text/plain": [
2487 |        "0    aa\n",
2488 |        "1    bb\n",
2489 |        "2    c-\n",
2490 |        "3    dd\n",
2491 |        "dtype: string"
2492 |       ]
2493 |      },
2494 |      "execution_count": 49,
2495 |      "metadata": {},
2496 |      "output_type": "execute_result"
2497 |     }
2498 |    ],
2499 |    "source": [
2500 |     "print('Original: ')\n",
2501 |     "print(pandasSeries.str.cat(pandasSeriesB))\n",
2502 |     "print('\\nna_rep is specified')\n",
2503 |     "pandasSeries.str.cat(pandasSeriesB, na_rep=\"-\")"
2504 |    ]
2505 |   },
2506 |   {
2507 |    "cell_type": "code",
2508 |    "execution_count": 50,
2509 |    "metadata": {},
2510 |    "outputs": [
2511 |     {
2512 |      "name": "stdout",
2513 |      "output_type": "stream",
2514 |      "text": [
2515 |       "Original: \n",
2516 |       "0      aa\n",
2517 |       "1      bb\n",
2518 |       "2    <NA>\n",
2519 |       "3      dd\n",
2520 |       "dtype: object\n",
2521 |       "\n",
2522 |       "na_rep is specified\n"
2523 |      ]
2524 |     },
2525 |     {
2526 |      "data": {
2527 |       "text/plain": [
2528 |        "0    aa\n",
2529 |        "1    bb\n",
2530 |        "2    c-\n",
2531 |        "3    dd\n",
2532 |        "dtype: object"
2533 |       ]
2534 |      },
2535 |      "execution_count": 50,
2536 |      "metadata": {},
2537 |      "output_type": "execute_result"
2538 |     }
2539 |    ],
2540 |    "source": [
2541 |     "print('Original: ')\n",
2542 |     "print(cudfSeries.str.cat(cudfSeriesB))\n",
2543 |     "print('\\nna_rep is specified')\n",
2544 |     "cudfSeries.str.cat(cudfSeriesB, na_rep=\"-\")"
2545 |    ]
2546 |   },
2547 |   {
2548 |    "cell_type": "markdown",
2549 |    "metadata": {},
2550 |    "source": [
2551 |     "\n",
2552 |     "## Concatenating a Series and something array-like into a Series"
2553 |    ]
2554 |   },
2555 |   {
2556 |    "cell_type": "markdown",
2557 |    "metadata": {},
2558 |    "source": [
2559 |     "The parameter others can also be two-dimensional. In this case, the number or rows must match the lengths of the calling Series (or Index)."
2560 |    ]
2561 |   },
2562 |   {
2563 |    "cell_type": "code",
2564 |    "execution_count": 51,
2565 |    "metadata": {},
2566 |    "outputs": [
2567 |     {
2568 |      "name": "stdout",
2569 |      "output_type": "stream",
2570 |      "text": [
2571 |       "Original: \n",
2572 |       "0    a\n",
2573 |       "1    b\n",
2574 |       "2    c\n",
2575 |       "3    d\n",
2576 |       "dtype: string\n",
2577 |       "\n",
2578 |       "Concatenating a Series and something array-like\n",
2579 |       "      0  1\n",
2580 |       "0     a  a\n",
2581 |       "1     b  b\n",
2582 |       "2  <NA>  c\n",
2583 |       "3     d  d\n"
2584 |      ]
2585 |     },
2586 |     {
2587 |      "data": {
2588 |       "text/plain": [
2589 |        "0    aaa\n",
2590 |        "1    bbb\n",
2591 |        "2    c-c\n",
2592 |        "3    ddd\n",
2593 |        "dtype: string"
2594 |       ]
2595 |      },
2596 |      "execution_count": 51,
2597 |      "metadata": {},
2598 |      "output_type": "execute_result"
2599 |     }
2600 |    ],
2601 |    "source": [
2602 |     "pandasArray = pd.concat([pandasSeriesB, pandasSeries], axis=1)\n",
2603 |     "print('Original: ')\n",
2604 |     "print(pandasSeries)\n",
2605 |     "print('\\nConcatenating a Series and something array-like')\n",
2606 |     "print(pandasArray)\n",
2607 |     "pandasSeries.str.cat(pandasArray, na_rep=\"-\")\n"
2608 |    ]
2609 |   },
2610 |   {
2611 |    "cell_type": "code",
2612 |    "execution_count": 52,
2613 |    "metadata": {},
2614 |    "outputs": [
2615 |     {
2616 |      "name": "stdout",
2617 |      "output_type": "stream",
2618 |      "text": [
2619 |       "Original: \n",
2620 |       "0    a\n",
2621 |       "1    b\n",
2622 |       "2    c\n",
2623 |       "3    d\n",
2624 |       "dtype: object\n",
2625 |       "\n",
2626 |       "Concatenating a Series and something array-like\n",
2627 |       "      0  1\n",
2628 |       "0     a  a\n",
2629 |       "1     b  b\n",
2630 |       "2  <NA>  c\n",
2631 |       "3     d  d\n"
2632 |      ]
2633 |     },
2634 |     {
2635 |      "data": {
2636 |       "text/plain": [
2637 |        "0    aaa\n",
2638 |        "1    bbb\n",
2639 |        "2    c-c\n",
2640 |        "3    ddd\n",
2641 |        "Name: 1, dtype: object"
2642 |       ]
2643 |      },
2644 |      "execution_count": 52,
2645 |      "metadata": {},
2646 |      "output_type": "execute_result"
2647 |     }
2648 |    ],
2649 |    "source": [
2650 |     "cudfArray = cudf.concat([cudfSeriesB, cudfSeries], axis=1)\n",
2651 |     "print('Original: ')\n",
2652 |     "print(cudfSeries)\n",
2653 |     "print('\\nConcatenating a Series and something array-like')\n",
2654 |     "print(cudfArray)\n",
2655 |     "cudfArray[1].str.cat(cudfArray[0], na_rep=\"-\").str.cat(cudfSeries, na_rep=\"-\")"
2656 |    ]
2657 |   },
2658 |   {
2659 |    "cell_type": "markdown",
2660 |    "metadata": {},
2661 |    "source": [
2662 |     "\n",
2663 |     "## Indexing with .str"
2664 |    ]
2665 |   },
2666 |   {
2667 |    "cell_type": "markdown",
2668 |    "metadata": {},
2669 |    "source": [
2670 |     "You can use `[]` notation to directly index by position locations. If you index past the end of the string, the result will be a NaN."
2671 |    ]
2672 |   },
2673 |   {
2674 |    "cell_type": "code",
2675 |    "execution_count": 53,
2676 |    "metadata": {},
2677 |    "outputs": [
2678 |     {
2679 |      "name": "stdout",
2680 |      "output_type": "stream",
2681 |      "text": [
2682 |       "Indexed at position 0: \n",
2683 |       "0       A\n",
2684 |       "1       B\n",
2685 |       "2       C\n",
2686 |       "3       A\n",
2687 |       "4       B\n",
2688 |       "5    <NA>\n",
2689 |       "6       C\n",
2690 |       "7       d\n",
2691 |       "8       c\n",
2692 |       "dtype: string\n",
2693 |       "\n",
2694 |       "Indexed at position 1: \n"
2695 |      ]
2696 |     },
2697 |     {
2698 |      "data": {
2699 |       "text/plain": [
2700 |        "0    <NA>\n",
2701 |        "1    <NA>\n",
2702 |        "2    <NA>\n",
2703 |        "3       a\n",
2704 |        "4       a\n",
2705 |        "5    <NA>\n",
2706 |        "6       A\n",
2707 |        "7       o\n",
2708 |        "8       a\n",
2709 |        "dtype: string"
2710 |       ]
2711 |      },
2712 |      "execution_count": 53,
2713 |      "metadata": {},
2714 |      "output_type": "execute_result"
2715 |     }
2716 |    ],
2717 |    "source": [
2718 |     "pandasSeries = pd.Series([\"A\", \"B\", \"C\", \"Aaba\", \"Baca\", np.nan, \"CABA\", \"dog\", \"cat\"], dtype=\"string\")\n",
2719 |     "   \n",
2720 |     "print('Indexed at position 0: ')\n",
2721 |     "print(pandasSeries.str[0])\n",
2722 |     "print('\\nIndexed at position 1: ')\n",
2723 |     "pandasSeries.str[1]"
2724 |    ]
2725 |   },
2726 |   {
2727 |    "cell_type": "code",
2728 |    "execution_count": 54,
2729 |    "metadata": {},
2730 |    "outputs": [
2731 |     {
2732 |      "name": "stdout",
2733 |      "output_type": "stream",
2734 |      "text": [
2735 |       "Indexed at position 0: \n",
2736 |       "0       A\n",
2737 |       "1       B\n",
2738 |       "2       C\n",
2739 |       "3       A\n",
2740 |       "4       B\n",
2741 |       "5    <NA>\n",
2742 |       "6       C\n",
2743 |       "7       d\n",
2744 |       "8       c\n",
2745 |       "dtype: object\n",
2746 |       "\n",
2747 |       "Indexed at position 1: \n"
2748 |      ]
2749 |     },
2750 |     {
2751 |      "data": {
2752 |       "text/plain": [
2753 |        "0        \n",
2754 |        "1        \n",
2755 |        "2        \n",
2756 |        "3       a\n",
2757 |        "4       a\n",
2758 |        "5    <NA>\n",
2759 |        "6       A\n",
2760 |        "7       o\n",
2761 |        "8       a\n",
2762 |        "dtype: object"
2763 |       ]
2764 |      },
2765 |      "execution_count": 54,
2766 |      "metadata": {},
2767 |      "output_type": "execute_result"
2768 |     }
2769 |    ],
2770 |    "source": [
2771 |     "cudfSeries = cudf.Series([\"A\", \"B\", \"C\", \"Aaba\", \"Baca\", np.nan, \"CABA\", \"dog\", \"cat\"], dtype=\"str\")\n",
2772 |     "   \n",
2773 |     "print('Indexed at position 0: ')\n",
2774 |     "print(cudfSeries.str[0])\n",
2775 |     "print('\\nIndexed at position 1: ')\n",
2776 |     "cudfSeries.str[1]"
2777 |    ]
2778 |   },
2779 |   {
2780 |    "cell_type": "markdown",
2781 |    "metadata": {},
2782 |    "source": [
2783 |     "\n",
2784 |     "## Extracting substrings"
2785 |    ]
2786 |   },
2787 |   {
2788 |    "cell_type": "markdown",
2789 |    "metadata": {},
2790 |    "source": [
2791 |     "Extract first match in each subject (`extract`)."
2792 |    ]
2793 |   },
2794 |   {
2795 |    "cell_type": "code",
2796 |    "execution_count": 55,
2797 |    "metadata": {},
2798 |    "outputs": [
2799 |     {
2800 |      "name": "stdout",
2801 |      "output_type": "stream",
2802 |      "text": [
2803 |       "      0     1\n",
2804 |       "0     a     1\n",
2805 |       "1     b     2\n",
2806 |       "2  <NA>  <NA>\n"
2807 |      ]
2808 |     }
2809 |    ],
2810 |    "source": [
2811 |     "pdSeries = pd.Series([\"a1\", \"b2\", \"c3\"],dtype=\"string\",).str.extract(r\"([ab])(\\d)\", )\n",
2812 |     "print(pdSeries)"
2813 |    ]
2814 |   },
2815 |   {
2816 |    "cell_type": "code",
2817 |    "execution_count": 56,
2818 |    "metadata": {},
2819 |    "outputs": [
2820 |     {
2821 |      "name": "stdout",
2822 |      "output_type": "stream",
2823 |      "text": [
2824 |       "      0     1\n",
2825 |       "0     a     1\n",
2826 |       "1     b     2\n",
2827 |       "2  <NA>  <NA>\n"
2828 |      ]
2829 |     }
2830 |    ],
2831 |    "source": [
2832 |     "cudfSeries = cudf.Series(['a1', 'b2', 'c3']).str.extract(r'([ab])(\\d)')\n",
2833 |     "print(cudfSeries)    "
2834 |    ]
2835 |   },
2836 |   {
2837 |    "cell_type": "markdown",
2838 |    "metadata": {},
2839 |    "source": [
2840 |     "\n",
2841 |     "Extracting a regular expression with one group returns a DataFrame with one column if `expand=True`."
2842 |    ]
2843 |   },
2844 |   {
2845 |    "cell_type": "code",
2846 |    "execution_count": 57,
2847 |    "metadata": {},
2848 |    "outputs": [
2849 |     {
2850 |      "data": {
2851 |       "text/html": [
2852 |        "<div>\n",
2853 |        "<style scoped>\n",
2854 |        "    .dataframe tbody tr th:only-of-type {\n",
2855 |        "        vertical-align: middle;\n",
2856 |        "    }\n",
2857 |        "\n",
2858 |        "    .dataframe tbody tr th {\n",
2859 |        "        vertical-align: top;\n",
2860 |        "    }\n",
2861 |        "\n",
2862 |        "    .dataframe thead th {\n",
2863 |        "        text-align: right;\n",
2864 |        "    }\n",
2865 |        "</style>\n",
2866 |        "<table border=\"1\" class=\"dataframe\">\n",
2867 |        "  <thead>\n",
2868 |        "    <tr style=\"text-align: right;\">\n",
2869 |        "      <th></th>\n",
2870 |        "      <th>0</th>\n",
2871 |        "    </tr>\n",
2872 |        "  </thead>\n",
2873 |        "  <tbody>\n",
2874 |        "    <tr>\n",
2875 |        "      <th>0</th>\n",
2876 |        "      <td>1</td>\n",
2877 |        "    </tr>\n",
2878 |        "    <tr>\n",
2879 |        "      <th>1</th>\n",
2880 |        "      <td>2</td>\n",
2881 |        "    </tr>\n",
2882 |        "    <tr>\n",
2883 |        "      <th>2</th>\n",
2884 |        "      <td>&lt;NA&gt;</td>\n",
2885 |        "    </tr>\n",
2886 |        "  </tbody>\n",
2887 |        "</table>\n",
2888 |        "</div>"
2889 |       ],
2890 |       "text/plain": [
2891 |        "      0\n",
2892 |        "0     1\n",
2893 |        "1     2\n",
2894 |        "2  <NA>"
2895 |       ]
2896 |      },
2897 |      "execution_count": 57,
2898 |      "metadata": {},
2899 |      "output_type": "execute_result"
2900 |     }
2901 |    ],
2902 |    "source": [
2903 |     "pd.Series([\"a1\", \"b2\", \"c3\"], dtype=\"string\").str.extract(r\"[ab](\\d)\", expand=True)"
2904 |    ]
2905 |   },
2906 |   {
2907 |    "cell_type": "code",
2908 |    "execution_count": 58,
2909 |    "metadata": {},
2910 |    "outputs": [
2911 |     {
2912 |      "data": {
2913 |       "text/html": [
2914 |        "<div>\n",
2915 |        "<style scoped>\n",
2916 |        "    .dataframe tbody tr th:only-of-type {\n",
2917 |        "        vertical-align: middle;\n",
2918 |        "    }\n",
2919 |        "\n",
2920 |        "    .dataframe tbody tr th {\n",
2921 |        "        vertical-align: top;\n",
2922 |        "    }\n",
2923 |        "\n",
2924 |        "    .dataframe thead th {\n",
2925 |        "        text-align: right;\n",
2926 |        "    }\n",
2927 |        "</style>\n",
2928 |        "<table border=\"1\" class=\"dataframe\">\n",
2929 |        "  <thead>\n",
2930 |        "    <tr style=\"text-align: right;\">\n",
2931 |        "      <th></th>\n",
2932 |        "      <th>0</th>\n",
2933 |        "    </tr>\n",
2934 |        "  </thead>\n",
2935 |        "  <tbody>\n",
2936 |        "    <tr>\n",
2937 |        "      <th>0</th>\n",
2938 |        "      <td>1</td>\n",
2939 |        "    </tr>\n",
2940 |        "    <tr>\n",
2941 |        "      <th>1</th>\n",
2942 |        "      <td>2</td>\n",
2943 |        "    </tr>\n",
2944 |        "    <tr>\n",
2945 |        "      <th>2</th>\n",
2946 |        "      <td>&lt;NA&gt;</td>\n",
2947 |        "    </tr>\n",
2948 |        "  </tbody>\n",
2949 |        "</table>\n",
2950 |        "</div>"
2951 |       ],
2952 |       "text/plain": [
2953 |        "      0\n",
2954 |        "0     1\n",
2955 |        "1     2\n",
2956 |        "2  <NA>"
2957 |       ]
2958 |      },
2959 |      "execution_count": 58,
2960 |      "metadata": {},
2961 |      "output_type": "execute_result"
2962 |     }
2963 |    ],
2964 |    "source": [
2965 |     "cudf.Series([\"a1\", \"b2\", \"c3\"], dtype=\"str\").str.extract(r\"[ab](\\d)\", expand=True)"
2966 |    ]
2967 |   },
2968 |   {
2969 |    "cell_type": "markdown",
2970 |    "metadata": {},
2971 |    "source": [
2972 |     "It returns a Series if `expand=False`."
2973 |    ]
2974 |   },
2975 |   {
2976 |    "cell_type": "code",
2977 |    "execution_count": 59,
2978 |    "metadata": {},
2979 |    "outputs": [
2980 |     {
2981 |      "data": {
2982 |       "text/plain": [
2983 |        "0       1\n",
2984 |        "1       2\n",
2985 |        "2    <NA>\n",
2986 |        "dtype: string"
2987 |       ]
2988 |      },
2989 |      "execution_count": 59,
2990 |      "metadata": {},
2991 |      "output_type": "execute_result"
2992 |     }
2993 |    ],
2994 |    "source": [
2995 |     "pd.Series([\"a1\", \"b2\", \"c3\"], dtype=\"string\").str.extract(r\"[ab](\\d)\", expand=False)"
2996 |    ]
2997 |   },
2998 |   {
2999 |    "cell_type": "code",
3000 |    "execution_count": 60,
3001 |    "metadata": {},
3002 |    "outputs": [
3003 |     {
3004 |      "data": {
3005 |       "text/plain": [
3006 |        "0       1\n",
3007 |        "1       2\n",
3008 |        "2    <NA>\n",
3009 |        "dtype: object"
3010 |       ]
3011 |      },
3012 |      "execution_count": 60,
3013 |      "metadata": {},
3014 |      "output_type": "execute_result"
3015 |     }
3016 |    ],
3017 |    "source": [
3018 |     "cudf.Series([\"a1\", \"b2\", \"c3\"], dtype=\"str\").str.extract(r\"[ab](\\d)\", expand=False)"
3019 |    ]
3020 |   },
3021 |   {
3022 |    "cell_type": "markdown",
3023 |    "metadata": {},
3024 |    "source": [
3025 |     "\n",
3026 |     "When each subject string in the Series has exactly one match."
3027 |    ]
3028 |   },
3029 |   {
3030 |    "cell_type": "code",
3031 |    "execution_count": 61,
3032 |    "metadata": {},
3033 |    "outputs": [
3034 |     {
3035 |      "name": "stdout",
3036 |      "output_type": "stream",
3037 |      "text": [
3038 |       "0    a3\n",
3039 |       "1    b3\n",
3040 |       "2    c2\n",
3041 |       "dtype: string\n"
3042 |      ]
3043 |     }
3044 |    ],
3045 |    "source": [
3046 |     "pandasSeries = pd.Series([\"a3\", \"b3\", \"c2\"], dtype=\"string\")\n",
3047 |     "print(pandasSeries)"
3048 |    ]
3049 |   },
3050 |   {
3051 |    "cell_type": "code",
3052 |    "execution_count": 62,
3053 |    "metadata": {},
3054 |    "outputs": [
3055 |     {
3056 |      "name": "stdout",
3057 |      "output_type": "stream",
3058 |      "text": [
3059 |       "0    a3\n",
3060 |       "1    b3\n",
3061 |       "2    c2\n",
3062 |       "dtype: object\n"
3063 |      ]
3064 |     }
3065 |    ],
3066 |    "source": [
3067 |     "cudfSeries = cudf.Series([\"a3\", \"b3\", \"c2\"], dtype=\"str\")\n",
3068 |     "print(cudfSeries)"
3069 |    ]
3070 |   },
3071 |   {
3072 |    "cell_type": "markdown",
3073 |    "metadata": {},
3074 |    "source": [
3075 |     "\n",
3076 |     "## Testing for strings that match or contain a pattern"
3077 |    ]
3078 |   },
3079 |   {
3080 |    "cell_type": "markdown",
3081 |    "metadata": {},
3082 |    "source": [
3083 |     "You can check whether elements contain a pattern:"
3084 |    ]
3085 |   },
3086 |   {
3087 |    "cell_type": "code",
3088 |    "execution_count": 63,
3089 |    "metadata": {},
3090 |    "outputs": [
3091 |     {
3092 |      "data": {
3093 |       "text/plain": [
3094 |        "0    False\n",
3095 |        "1    False\n",
3096 |        "2     True\n",
3097 |        "3     True\n",
3098 |        "4     True\n",
3099 |        "5     True\n",
3100 |        "dtype: bool"
3101 |       ]
3102 |      },
3103 |      "execution_count": 63,
3104 |      "metadata": {},
3105 |      "output_type": "execute_result"
3106 |     }
3107 |    ],
3108 |    "source": [
3109 |     "pattern = r\"[0-9][a-z]\"\n",
3110 |     "\n",
3111 |     "pd.Series([\"1\", \"2\", \"3a\", \"3b\", \"03c\", \"4dx\"],dtype=\"str\",\n",
3112 |     "         ).str.contains(pattern)\n",
3113 |     "   "
3114 |    ]
3115 |   },
3116 |   {
3117 |    "cell_type": "code",
3118 |    "execution_count": 64,
3119 |    "metadata": {},
3120 |    "outputs": [
3121 |     {
3122 |      "data": {
3123 |       "text/plain": [
3124 |        "0    False\n",
3125 |        "1    False\n",
3126 |        "2     True\n",
3127 |        "3     True\n",
3128 |        "4     True\n",
3129 |        "5     True\n",
3130 |        "dtype: bool"
3131 |       ]
3132 |      },
3133 |      "execution_count": 64,
3134 |      "metadata": {},
3135 |      "output_type": "execute_result"
3136 |     }
3137 |    ],
3138 |    "source": [
3139 |     "pattern = r\"[0-9][a-z]\"\n",
3140 |     "\n",
3141 |     "cudf.Series([\"1\", \"2\", \"3a\", \"3b\", \"03c\", \"4dx\"],dtype=\"str\",\n",
3142 |     "         ).str.contains(pattern)\n",
3143 |     "   "
3144 |    ]
3145 |   },
3146 |   {
3147 |    "cell_type": "markdown",
3148 |    "metadata": {},
3149 |    "source": [
3150 |     "\n",
3151 |     "Or whether elements match a pattern:"
3152 |    ]
3153 |   },
3154 |   {
3155 |    "cell_type": "code",
3156 |    "execution_count": 65,
3157 |    "metadata": {},
3158 |    "outputs": [
3159 |     {
3160 |      "data": {
3161 |       "text/plain": [
3162 |        "0    False\n",
3163 |        "1    False\n",
3164 |        "2     True\n",
3165 |        "3     True\n",
3166 |        "4    False\n",
3167 |        "5     True\n",
3168 |        "dtype: boolean"
3169 |       ]
3170 |      },
3171 |      "execution_count": 65,
3172 |      "metadata": {},
3173 |      "output_type": "execute_result"
3174 |     }
3175 |    ],
3176 |    "source": [
3177 |     "pd.Series([\"1\", \"2\", \"3a\", \"3b\", \"03c\", \"4dx\"],dtype=\"string\",\n",
3178 |     "         ).str.match(pattern)\n",
3179 |     "   "
3180 |    ]
3181 |   },
3182 |   {
3183 |    "cell_type": "code",
3184 |    "execution_count": 66,
3185 |    "metadata": {},
3186 |    "outputs": [
3187 |     {
3188 |      "data": {
3189 |       "text/plain": [
3190 |        "0    False\n",
3191 |        "1    False\n",
3192 |        "2     True\n",
3193 |        "3     True\n",
3194 |        "4    False\n",
3195 |        "5     True\n",
3196 |        "dtype: bool"
3197 |       ]
3198 |      },
3199 |      "execution_count": 66,
3200 |      "metadata": {},
3201 |      "output_type": "execute_result"
3202 |     }
3203 |    ],
3204 |    "source": [
3205 |     "cudf.Series([\"1\", \"2\", \"3a\", \"3b\", \"03c\", \"4dx\"],dtype=\"str\",\n",
3206 |     "         ).str.match(pattern) "
3207 |    ]
3208 |   },
3209 |   {
3210 |    "cell_type": "markdown",
3211 |    "metadata": {},
3212 |    "source": [
3213 |     "\n",
3214 |     "New in version 1.1.0."
3215 |    ]
3216 |   },
3217 |   {
3218 |    "cell_type": "code",
3219 |    "execution_count": 67,
3220 |    "metadata": {},
3221 |    "outputs": [
3222 |     {
3223 |      "data": {
3224 |       "text/plain": [
3225 |        "0    False\n",
3226 |        "1    False\n",
3227 |        "2     True\n",
3228 |        "3     True\n",
3229 |        "4    False\n",
3230 |        "5    False\n",
3231 |        "dtype: boolean"
3232 |       ]
3233 |      },
3234 |      "execution_count": 67,
3235 |      "metadata": {},
3236 |      "output_type": "execute_result"
3237 |     }
3238 |    ],
3239 |    "source": [
3240 |     "pd.Series([\"1\", \"2\", \"3a\", \"3b\", \"03c\", \"4dx\"],dtype=\"string\",\n",
3241 |     "         ).str.fullmatch(pattern)\n",
3242 |     "    "
3243 |    ]
3244 |   },
3245 |   {
3246 |    "cell_type": "code",
3247 |    "execution_count": 68,
3248 |    "metadata": {},
3249 |    "outputs": [
3250 |     {
3251 |      "data": {
3252 |       "text/plain": [
3253 |        "0    False\n",
3254 |        "1    False\n",
3255 |        "2     True\n",
3256 |        "3     True\n",
3257 |        "4    False\n",
3258 |        "5     True\n",
3259 |        "dtype: bool"
3260 |       ]
3261 |      },
3262 |      "execution_count": 68,
3263 |      "metadata": {},
3264 |      "output_type": "execute_result"
3265 |     }
3266 |    ],
3267 |    "source": [
3268 |     "cudf.Series([\"1\", \"2\", \"3a\", \"3b\", \"03c\", \"4dx\"],dtype=\"str\",\n",
3269 |     "         ).str.match(pattern)"
3270 |    ]
3271 |   },
3272 |   {
3273 |    "cell_type": "markdown",
3274 |    "metadata": {},
3275 |    "source": [
3276 |     "In pandas, methods like `match`, `fullmatch`, `contains`, `startswith`, and `endswith` take an extra `na` argument so missing values can be considered `True` or `False`:"
3277 |    ]
3278 |   },
3279 |   {
3280 |    "cell_type": "code",
3281 |    "execution_count": 69,
3282 |    "metadata": {},
3283 |    "outputs": [
3284 |     {
3285 |      "name": "stdout",
3286 |      "output_type": "stream",
3287 |      "text": [
3288 |       "Strings that contain 'A':\n",
3289 |       "0     True\n",
3290 |       "1    False\n",
3291 |       "2    False\n",
3292 |       "3     True\n",
3293 |       "4    False\n",
3294 |       "5    False\n",
3295 |       "6     True\n",
3296 |       "7    False\n",
3297 |       "8    False\n",
3298 |       "dtype: boolean\n",
3299 |       "\n",
3300 |       "Strings that have swapped case:\n",
3301 |       "0       a\n",
3302 |       "1       b\n",
3303 |       "2       c\n",
3304 |       "3    aABA\n",
3305 |       "4    bACA\n",
3306 |       "5    <NA>\n",
3307 |       "6    caba\n",
3308 |       "7     DOG\n",
3309 |       "8     CAT\n",
3310 |       "dtype: string\n",
3311 |       "\n",
3312 |       "Strings that start with 'b':\n",
3313 |       "0    False\n",
3314 |       "1    False\n",
3315 |       "2    False\n",
3316 |       "3    False\n",
3317 |       "4    False\n",
3318 |       "5     <NA>\n",
3319 |       "6    False\n",
3320 |       "7    False\n",
3321 |       "8    False\n",
3322 |       "dtype: boolean\n",
3323 |       "\n",
3324 |       "Strings that ends with 'a':\n",
3325 |       "0    False\n",
3326 |       "1    False\n",
3327 |       "2    False\n",
3328 |       "3     True\n",
3329 |       "4     True\n",
3330 |       "5     <NA>\n",
3331 |       "6    False\n",
3332 |       "7    False\n",
3333 |       "8    False\n",
3334 |       "dtype: boolean\n"
3335 |      ]
3336 |     }
3337 |    ],
3338 |    "source": [
3339 |     "pandasSeries5 = pd.Series([\"A\", \"B\", \"C\", \"Aaba\", \"Baca\", np.nan, \"CABA\", \"dog\", \"cat\"], dtype=\"string\")   \n",
3340 |     "print(\"Strings that contain 'A':\")\n",
3341 |     "print(pandasSeries5.str.contains(\"A\", na=False))\n",
3342 |     "print(\"\\nStrings that have swapped case:\")\n",
3343 |     "print(pandasSeries5.str.swapcase())\n",
3344 |     "print(\"\\nStrings that start with 'b':\")\n",
3345 |     "print(pandasSeries5.str.startswith ('b'))\n",
3346 |     "print((\"\\nStrings that ends with 'a':\"))\n",
3347 |     "print(pandasSeries5.str.endswith ('a'))"
3348 |    ]
3349 |   },
3350 |   {
3351 |    "cell_type": "markdown",
3352 |    "metadata": {},
3353 |    "source": [
3354 |     "cuDF does not have this extra parameter, so it will return `na`."
3355 |    ]
3356 |   },
3357 |   {
3358 |    "cell_type": "code",
3359 |    "execution_count": 70,
3360 |    "metadata": {},
3361 |    "outputs": [
3362 |     {
3363 |      "name": "stdout",
3364 |      "output_type": "stream",
3365 |      "text": [
3366 |       "Strings that contain 'A':\n",
3367 |       "0     True\n",
3368 |       "1    False\n",
3369 |       "2    False\n",
3370 |       "3     True\n",
3371 |       "4    False\n",
3372 |       "5     <NA>\n",
3373 |       "6     True\n",
3374 |       "7    False\n",
3375 |       "8    False\n",
3376 |       "dtype: bool\n",
3377 |       "\n",
3378 |       "Strings that have swapped case:\n",
3379 |       "0       a\n",
3380 |       "1       b\n",
3381 |       "2       c\n",
3382 |       "3    aABA\n",
3383 |       "4    bACA\n",
3384 |       "5    <NA>\n",
3385 |       "6    caba\n",
3386 |       "7     DOG\n",
3387 |       "8     CAT\n",
3388 |       "dtype: object\n",
3389 |       "\n",
3390 |       "Strings that start with 'b':\n",
3391 |       "0    False\n",
3392 |       "1    False\n",
3393 |       "2    False\n",
3394 |       "3    False\n",
3395 |       "4    False\n",
3396 |       "5     <NA>\n",
3397 |       "6    False\n",
3398 |       "7    False\n",
3399 |       "8    False\n",
3400 |       "dtype: bool\n",
3401 |       "\n",
3402 |       "Strings that ends with 'a':\n",
3403 |       "0    False\n",
3404 |       "1    False\n",
3405 |       "2    False\n",
3406 |       "3     True\n",
3407 |       "4     True\n",
3408 |       "5     <NA>\n",
3409 |       "6    False\n",
3410 |       "7    False\n",
3411 |       "8    False\n",
3412 |       "dtype: bool\n"
3413 |      ]
3414 |     }
3415 |    ],
3416 |    "source": [
3417 |     "cudfSeries5 = cudf.Series([\"A\", \"B\", \"C\", \"Aaba\", \"Baca\", np.nan, \"CABA\", \"dog\", \"cat\"], dtype=\"str\")   \n",
3418 |     "print(\"Strings that contain 'A':\")\n",
3419 |     "print(cudfSeries5.str.contains(\"A\"))\n",
3420 |     "print(\"\\nStrings that have swapped case:\")\n",
3421 |     "print(cudfSeries5.str.swapcase())\n",
3422 |     "print(\"\\nStrings that start with 'b':\")\n",
3423 |     "print(cudfSeries5.str.startswith ('b'))\n",
3424 |     "print((\"\\nStrings that ends with 'a':\"))\n",
3425 |     "print(cudfSeries5.str.endswith ('a'))"
3426 |    ]
3427 |   },
3428 |   {
3429 |    "cell_type": "markdown",
3430 |    "metadata": {},
3431 |    "source": []
3432 |   }
3433 |  ],
3434 |  "metadata": {
3435 |   "kernelspec": {
3436 |    "display_name": "Python 3 (ipykernel)",
3437 |    "language": "python",
3438 |    "name": "python3"
3439 |   },
3440 |   "language_info": {
3441 |    "codemirror_mode": {
3442 |     "name": "ipython",
3443 |     "version": 3
3444 |    },
3445 |    "file_extension": ".py",
3446 |    "mimetype": "text/x-python",
3447 |    "name": "python",
3448 |    "nbconvert_exporter": "python",
3449 |    "pygments_lexer": "ipython3",
3450 |    "version": "3.10.13"
3451 |   }
3452 |  },
3453 |  "nbformat": 4,
3454 |  "nbformat_minor": 4
3455 | }
3456 | 


--------------------------------------------------------------------------------