├── .github ├── FUNDING.yml └── workflows │ └── deploying-to-github-pages.yml ├── .gitignore ├── LICENSE ├── README.md ├── docs ├── MonitorThread.md ├── Py4JServer.md ├── PythonAccumulatorV2.md ├── PythonBroadcast.md ├── PythonForeachWriter.md ├── PythonFunction.md ├── PythonGatewayServer.md ├── PythonRDD.md ├── PythonRunner.md ├── PythonUtils.md ├── PythonWorkerFactory.md ├── RDD.md ├── Setup.md ├── SimplePythonFunction.md ├── SocketAuthHelper.md ├── SocketAuthServer.md ├── SocketFuncServer.md ├── SparkConf.md ├── SparkContext.md ├── SparkEnv.md ├── arrow-optimization │ ├── .pages │ └── index.md ├── building-from-sources.md ├── configuration-properties │ ├── .pages │ ├── index.md │ ├── spark.md │ ├── spark.pyspark.md │ ├── spark.python.md │ └── spark.sql.execution.md ├── connect │ ├── .pages │ └── index.md ├── demo │ ├── .pages │ ├── executing-pyspark-applications-using-spark-submit.md │ ├── index.md │ └── running-pyspark-application-on-minikube.md ├── environment-variables.md ├── features │ └── index.md ├── images │ ├── PythonRunner.png │ ├── PythonWorkerFactory.png │ └── SparkContext.png ├── index.md ├── logging.md ├── ml │ ├── .pages │ ├── Distributor.md │ └── index.md ├── pandas-on-spark │ ├── .pages │ └── index.md ├── pandas-udafs │ ├── .pages │ └── index.md ├── pandas-udfs │ ├── .pages │ └── index.md ├── pyspark │ ├── daemon.md │ ├── index.md │ ├── java_gateway.md │ ├── pandas │ │ ├── DataFrame.md │ │ ├── InternalFrame.md │ │ ├── generic │ │ │ ├── Frame.md │ │ │ └── index.md │ │ └── index.md │ ├── rdd.md │ ├── shell.md │ ├── sql │ │ ├── .pages │ │ ├── SparkSession.Builder.md │ │ ├── SparkSession.md │ │ ├── UserDefinedFunction.md │ │ ├── dataframe.md │ │ ├── functions.md │ │ ├── group.md │ │ ├── index.md │ │ ├── pandas │ │ │ ├── PandasUDFType.md │ │ │ ├── functions.md │ │ │ └── index.md │ │ ├── session.md │ │ └── udf.md │ └── worker.md ├── python-api.md ├── pytorch-distributed │ ├── .pages │ ├── TorchDistributor.md │ ├── index.md │ └── torch_run_process_wrapper.md ├── runners │ ├── .pages │ ├── ArrowPythonRunner.md │ ├── BasePythonRunner.md │ ├── BasicPythonArrowOutput.md │ ├── PythonArrowOutput.md │ ├── PythonRunner.md │ ├── PythonUDFRunner.md │ └── ReaderIterator.md ├── scala-api.md ├── sql │ ├── .pages │ ├── AggregateInPandasExec.md │ ├── ArrowEvalPython.md │ ├── ArrowEvalPythonExec.md │ ├── BaseEvalPython.md │ ├── DataFrame.md │ ├── EvalPythonExec.md │ ├── FlatMapGroupsInPandas.md │ ├── FlatMapGroupsInPandasExec.md │ ├── GroupedData.md │ ├── Observation.md │ ├── PandasCogroupedOps.md │ ├── PandasConversionMixin.md │ ├── PandasGroupUtils.md │ ├── PandasGroupedOpsMixin.md │ ├── PandasMapOpsMixin.md │ ├── PythonEvalType.md │ ├── PythonSQLMetrics.md │ ├── PythonUDF.md │ ├── RelationalGroupedDataset.md │ ├── SQLContext.md │ ├── SparkConversionMixin.md │ ├── UDFRegistration.md │ ├── UserDefinedPythonFunction.md │ └── index.md ├── tags.md └── udts │ ├── .pages │ └── index.md ├── graffles ├── PythonRunner.graffle ├── PythonWorkerFactory.graffle └── SparkContext.graffle ├── mkdocs.yml └── requirements.txt /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: jaceklaskowski 2 | ko_fi: jaceklaskowski 3 | custom: "https://paypal.me/JacekLaskowski" 4 | -------------------------------------------------------------------------------- /.github/workflows/deploying-to-github-pages.yml: -------------------------------------------------------------------------------- 1 | # Based on 
https://github.com/squidfunk/mkdocs-material/blob/master/.github/workflows/ci.yml 2 | 3 | name: Deploying to GitHub Pages 4 | on: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | deploy: 11 | if: github.event.pull_request.head.repo.fork == false 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v3 15 | with: 16 | fetch-depth: 0 17 | - uses: actions/setup-python@v3 18 | with: 19 | python-version: 3.x 20 | - name: Install dependencies 21 | env: 22 | GH_TOKEN: ${{ secrets.GH_TOKEN }} 23 | run: | 24 | pip install -r requirements.txt 25 | - name: Build documentation 26 | # env: 27 | # GOOGLE_ANALYTICS_KEY: ${{ secrets.GOOGLE_ANALYTICS_KEY }} 28 | run: | 29 | mkdocs gh-deploy --force 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | site/ 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # The Internals of PySpark Online Book 2 | 3 | [![GitHub Pages](https://github.com/japila-books/pyspark-internals/actions/workflows/deploying-to-github-pages.yml/badge.svg)](https://github.com/japila-books/pyspark-internals/actions) 4 | 5 | The project contains the sources of [The Internals of PySpark](https://books.japila.pl/pyspark-internals) online book. 
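The book is built with MkDocs (see `mkdocs.yml` and `requirements.txt`). A typical local preview workflow is sketched below (assuming Python 3 with `pip` on `PATH`; the exact plugin set comes from `requirements.txt`):

```text
pip install -r requirements.txt
mkdocs serve
```

`mkdocs serve` starts a local development server with live reload (by default at http://127.0.0.1:8000), while the CI workflow uses `mkdocs gh-deploy` to publish to GitHub Pages.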
6 | -------------------------------------------------------------------------------- /docs/MonitorThread.md: -------------------------------------------------------------------------------- 1 | # MonitorThread 2 | 3 | `MonitorThread` is...FIXME 4 | -------------------------------------------------------------------------------- /docs/Py4JServer.md: -------------------------------------------------------------------------------- 1 | # Py4JServer 2 | 3 | `Py4JServer` is a gateway server between Python and Java Virtual Machine (JVM) using [Py4J]({{ py4j.doc }}). 4 | 5 | `Py4JServer` is a wrapper for a [py4j Server](#server). 6 | 7 | ## Creating Instance 8 | 9 | `Py4JServer` takes the following to be created: 10 | 11 | * `SparkConf` ([Spark Core]({{ book.spark_core }}/SparkConf)) 12 | 13 | `Py4JServer` is created when: 14 | 15 | * [PythonGatewayServer](PythonGatewayServer.md) command-line application is started 16 | * [PythonRunner](PythonRunner.md) command-line application is started 17 | 18 | ## py4j Server 19 | 20 | `Py4JServer` creates a `ClientServer` ([py4j]({{ py4j.javadoc }}/py4j/ClientServer.html)) or `GatewayServer` ([py4j]({{ py4j.javadoc }}/py4j/GatewayServer.html)) based on [PYSPARK_PIN_THREAD](environment-variables.md#PYSPARK_PIN_THREAD) environment variable. 21 | 22 | ## Connection Secret 23 | 24 | ```scala 25 | secret: String 26 | ``` 27 | 28 | `Py4JServer` creates a connection secret for a secure communication. 29 | 30 | ## start 31 | 32 | ```scala 33 | start(): Unit 34 | ``` 35 | 36 | `start` requests the [py4j Server](#server) to start. 37 | 38 | ## getListeningPort 39 | 40 | ```scala 41 | getListeningPort: Int 42 | ``` 43 | 44 | `getListeningPort` requests the [py4j Server](#server) for the listening port. 45 | -------------------------------------------------------------------------------- /docs/PythonAccumulatorV2.md: -------------------------------------------------------------------------------- 1 | # PythonAccumulatorV2 2 | 3 | `PythonAccumulatorV2` is...FIXME -------------------------------------------------------------------------------- /docs/PythonBroadcast.md: -------------------------------------------------------------------------------- 1 | # PythonBroadcast 2 | 3 | `PythonBroadcast` is...FIXME -------------------------------------------------------------------------------- /docs/PythonForeachWriter.md: -------------------------------------------------------------------------------- 1 | # PythonForeachWriter 2 | 3 | `PythonForeachWriter` is...FIXME 4 | -------------------------------------------------------------------------------- /docs/PythonFunction.md: -------------------------------------------------------------------------------- 1 | --- 2 | tags: 3 | - Scala 4 | --- 5 | 6 | # PythonFunction 7 | 8 | `PythonFunction` is an [abstraction](#contract) of the [metadata](#implementations) of a [Python function](sql/PythonUDF.md#func) of a [PythonUDF](sql/PythonUDF.md). 9 | 10 | `PythonFunction` is executed in a [BasePythonRunner](runners/BasePythonRunner.md). 11 | 12 | `PythonFunction` is used to create the following: 13 | 14 | * [PythonRDD](PythonRDD.md#func) 15 | * [PythonRunner](PythonRunner.md#apply) 16 | * [PythonForeachWriter](PythonForeachWriter.md#func) 17 | * [UserDefinedPythonFunction](sql/UserDefinedPythonFunction.md#func) 18 | 19 | !!! note "ChainedPythonFunctions" 20 | `ChainedPythonFunctions` is a collection of chained `PythonFunction`s. 
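From the Python side, a `PythonFunction` carries what a plain Python callable becomes once it is pickled and handed over to the JVM. As a quick illustration (a sketch that only shows the user-facing API, not the wrapping itself), registering and using a Python UDF is one of the code paths that ends up creating a `PythonFunction`:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

spark = SparkSession.builder.appName("python-function-demo").getOrCreate()

# The pickled body of plus_one (together with its environment, includes,
# Python executable and version) travels to the JVM as the function's metadata.
@udf("int")
def plus_one(x):
    return x + 1

spark.range(3).select(plus_one("id")).show()
```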
21 | 22 | ## Contract (Subset) 23 | 24 | ### accumulator 25 | 26 | ```scala 27 | accumulator: PythonAccumulatorV2 28 | ``` 29 | 30 | [PythonAccumulatorV2](PythonAccumulatorV2.md) 31 | 32 | Used when: 33 | 34 | * `BasePythonRunner` is [created](runners/BasePythonRunner.md#accumulator) 35 | 36 | ### broadcastVars { #broadcastVars } 37 | 38 | ```scala 39 | broadcastVars: JList[Broadcast[PythonBroadcast]] 40 | ``` 41 | 42 | A collection of broadcast variables ([Spark Core]({{ book.spark_core }}/broadcast-variables/Broadcast)) with a [PythonBroadcast](PythonBroadcast.md) 43 | 44 | Used when: 45 | 46 | * `WriterThread` is created 47 | 48 | ### command 49 | 50 | ```scala 51 | command: Seq[Byte] 52 | ``` 53 | 54 | Used when: 55 | 56 | * `PythonRunner` is requested to [newWriterThread](PythonRunner.md#newWriterThread) 57 | * `UDFRegistration` is requested to [register a Python UDF](sql/UDFRegistration.md#registerPython) (for logging purposes only) 58 | * `PythonUDFRunner` is requested to [writeUDFs](runners/PythonUDFRunner.md#writeUDFs) 59 | 60 | ## Implementations 61 | 62 | * [SimplePythonFunction](SimplePythonFunction.md) 63 | -------------------------------------------------------------------------------- /docs/PythonGatewayServer.md: -------------------------------------------------------------------------------- 1 | # PythonGatewayServer 2 | 3 | `PythonGatewayServer` is a [command-line application](#main) (_process_) that starts a [Py4JServer](Py4JServer.md) on an ephemeral port. 4 | 5 | `PythonGatewayServer` is the Python runner for `pyspark` shell script ([Spark Core]({{ book.spark_core }}/tools/SparkSubmit#PYSPARK_SHELL)). 6 | 7 | ## main 8 | 9 | `main` creates a [Py4JServer](Py4JServer.md) and requests it to [start](Py4JServer.md#start). 10 | 11 | `main` requests the `Py4JServer` for the [listening port](Py4JServer.md#getListeningPort) (_boundPort_) and prints out the following DEBUG message to the logs: 12 | 13 | ```text 14 | Started PythonGatewayServer on port [boundPort] 15 | ``` 16 | 17 | 18 | `main` uses [_PYSPARK_DRIVER_CONN_INFO_PATH](#_PYSPARK_DRIVER_CONN_INFO_PATH) environment variable for the path of a connection info file (for the associated python process) with the listening port and the [secret](Py4JServer.md#secret). 19 | 20 | `main` pauses (_blocks_) until the Python driver finishes (by reading from the system input that blocks until input data is available, the end of the stream is detected, or an exception is thrown). 21 | 22 | In the end, once the Python driver finishes, `main` prints out the following DEBUG message to the logs: 23 | 24 | ```text 25 | Exiting due to broken pipe from Python driver 26 | ``` 27 | 28 | `main` prints out the following ERROR message to the logs and exists when the [listening port](Py4JServer.md#getListeningPort) is `-1`: 29 | 30 | ```text 31 | [server] failed to bind; exiting 32 | ``` 33 | 34 | ## _PYSPARK_DRIVER_CONN_INFO_PATH 35 | 36 | `PythonGatewayServer` uses `_PYSPARK_DRIVER_CONN_INFO_PATH` environment variable for the [path of a connection info file](#main-_PYSPARK_DRIVER_CONN_INFO_PATH) for communication between this and the Python processes. 37 | 38 | `_PYSPARK_DRIVER_CONN_INFO_PATH` is configured when [java_gateway.py](pyspark/java_gateway.md) module is requested to [launch_gateway](pyspark/java_gateway.md#launch_gateway). 39 | 40 | ## Logging 41 | 42 | Enable `ALL` logging level for `org.apache.spark.api.python.PythonGatewayServer` logger to see what happens inside. 
43 | 44 | Add the following line to `conf/log4j2.properties`: 45 | 46 | ```text 47 | logger.PythonGatewayServer.name = org.apache.spark.api.python.PythonGatewayServer 48 | logger.PythonGatewayServer.level = all 49 | ``` 50 | 51 | Refer to [Logging](logging.md). 52 | -------------------------------------------------------------------------------- /docs/PythonRDD.md: -------------------------------------------------------------------------------- 1 | # PythonRDD 2 | 3 | `PythonRDD` is an `RDD` (`RDD[Array[Byte]]`) that uses [PythonRunner](runners/PythonRunner.md) (to [compute a partition](#compute)). 4 | 5 | ## Creating Instance 6 | 7 | `PythonRDD` takes the following to be created: 8 | 9 | * Parent `RDD` 10 | * [PythonFunction](PythonFunction.md) 11 | * `preservePartitoning` flag 12 | * `isFromBarrier` flag (default: `false`) 13 | 14 | `PythonRDD` is created when...FIXME 15 | 16 | ## runJob 17 | 18 | ```scala 19 | runJob( 20 | sc: SparkContext, 21 | rdd: JavaRDD[Array[Byte]], 22 | partitions: JArrayList[Int]): Array[Any] 23 | ``` 24 | 25 | `runJob`...FIXME 26 | 27 | ## collectAndServe 28 | 29 | ```scala 30 | collectAndServe[T]( 31 | rdd: RDD[T]): Array[Any] 32 | ``` 33 | 34 | `collectAndServe`...FIXME 35 | 36 | ## collectAndServeWithJobGroup 37 | 38 | ```scala 39 | collectAndServeWithJobGroup[T]( 40 | rdd: RDD[T], 41 | groupId: String, 42 | description: String, 43 | interruptOnCancel: Boolean): Array[Any] 44 | ``` 45 | 46 | `collectAndServeWithJobGroup`...FIXME 47 | 48 | ## serveIterator Utility 49 | 50 | ```scala 51 | serveIterator( 52 | items: Iterator[_], 53 | threadName: String): Array[Any] 54 | ``` 55 | 56 | `serveIterator` [serveToStream](#serveToStream) with a writer function that...FIXME 57 | 58 | `serveIterator` is used when: 59 | 60 | * `PythonRDD` utility is used to [runJob](#runJob), [collectAndServe](#collectAndServe) and [collectAndServeWithJobGroup](#collectAndServeWithJobGroup) 61 | * `Dataset` is requested to `collectToPython`, `tailToPython`, `getRowsToPython` 62 | 63 | ## serveToStream Utility 64 | 65 | ```scala 66 | serveToStream( 67 | threadName: String)( 68 | writeFunc: OutputStream => Unit): Array[Any] 69 | ``` 70 | 71 | `serveToStream` [serveToStream](SocketAuthServer.md#serveToStream) with the [authHelper](#authHelper) and the input arguments. 72 | 73 | `serveToStream` is used when: 74 | 75 | * `PythonRDD` utility is used to [serveIterator](#serveIterator) 76 | * `Dataset` is requested to `collectAsArrowToPython` 77 | 78 | ## SocketAuthHelper 79 | 80 | `PythonRDD` uses a [SocketAuthHelper](SocketAuthHelper.md). 81 | -------------------------------------------------------------------------------- /docs/PythonRunner.md: -------------------------------------------------------------------------------- 1 | # PythonRunner 2 | 3 | `PythonRunner` is a [command-line application](#main) to launch a separate process to run a Python application (alongside the JVM process of `PythonRunner` with Apache Spark services). 4 | 5 |
6 | ![PythonRunner and Python Process](images/PythonRunner.png) 7 |
8 | 9 | `PythonRunner` can be launched using `spark-submit` shell script ([Spark Core]({{ book.spark_core }}/tools/spark-submit/)). 10 | 11 | `PythonRunner` executes the [Python executable](#pythonExec) (with the PySpark application and arguments) as a subprocess that is expected to connect back to the JVM to access Spark services. 12 | 13 | ??? note "Uh-oh, there are two PythonRunners 🙄" 14 | This page is about `org.apache.spark.deploy.PythonRunner` while there is another [PythonRunner](runners/PythonRunner.md). 15 | 16 | ## Arguments 17 | 18 | `PythonRunner` accepts the following command-line arguments (in that order): 19 | 20 | 1. Main python file (`pythonFile`) 21 | 1. Extra python files (`pyFiles`) 22 | 1. PySpark application arguments, if any 23 | 24 | ## Python Executable { #pythonExec } 25 | 26 | `PythonRunner` determines the Python executable to launch a PySpark application with based on the following (in the order of precedence): 27 | 28 | 1. [spark.pyspark.driver.python](configuration-properties/index.md#spark.pyspark.driver.python) configuration property 29 | 1. [spark.pyspark.python](configuration-properties/index.md#spark.pyspark.python) configuration property 30 | 1. [PYSPARK_DRIVER_PYTHON](environment-variables.md#PYSPARK_DRIVER_PYTHON) environment variable 31 | 1. [PYSPARK_PYTHON](environment-variables.md#PYSPARK_PYTHON) environment variable 32 | 1. `python3` 33 | 34 | ## Environment Variables 35 | 36 | `PythonRunner` defines the following environment variables to configure the PySpark application's execution environment. 37 | 38 | Environment Variable | Value 39 | ---------------------|--------- 40 | `PYTHONPATH` | Comma-separated list of local paths with formatted `pyFiles` and [sparkPythonPath](PythonUtils.md#sparkPythonPath), followed by the existing `PYTHONPATH` 41 | `PYTHONUNBUFFERED` | `YES` 42 | [PYSPARK_GATEWAY_PORT](environment-variables.md#PYSPARK_GATEWAY_PORT) | The [listening port](Py4JServer.md#getListeningPort) of the started `Py4JServer` 43 | [PYSPARK_GATEWAY_SECRET](environment-variables.md#PYSPARK_GATEWAY_SECRET) | The [secret](Py4JServer.md#secret) of the started `Py4JServer` 44 | `PYSPARK_PYTHON` | [spark.pyspark.python](configuration-properties/index.md#spark.pyspark.python) if defined 45 | `PYTHONHASHSEED` | `PYTHONHASHSEED` env var if defined 46 | `OMP_NUM_THREADS` | `spark.driver.cores` (unless defined for Spark on k8s, YARN and Mesos) 47 | `SPARK_REMOTE` | `spark.remote` if defined 48 | 49 | ## Launching Application { #main } 50 | 51 | ```scala 52 | main( 53 | args: Array[String]): Unit 54 | ``` 55 | 56 | `main` takes the [arguments](#arguments) (from the given `args`). 57 | 58 | `main` determines the [Python executable](#pythonExec) to launch the PySpark application (based on configuration properties and environment variables). 59 | 60 | `main` creates a [Py4JServer](Py4JServer.md) that is immediately [started](Py4JServer.md#start) (on a daemon **py4j-gateway-init** thread). `main` waits until the `Py4JServer` has started. 61 | 62 | `main` starts a Python process using the [Python executable](#pythonExec) and the [environment variables](#environment-variables). 63 | 64 | `main` pauses itself and waits for the Python process to finish. Once it happens, `main` requests the `Py4JServer` to [shutdown](Py4JServer.md#shutdown). 
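A quick way to see this execution environment from the Python side is to submit a tiny script that prints the variables listed above (a sketch; the script name is made up, and `PYSPARK_GATEWAY_SECRET` is only checked for presence since it is sensitive):

```python
# check_env.py (hypothetical) -- run with: spark-submit check_env.py
import os

for name in ("PYSPARK_GATEWAY_PORT", "PYSPARK_GATEWAY_SECRET",
             "PYTHONPATH", "PYTHONUNBUFFERED"):
    if name == "PYSPARK_GATEWAY_SECRET":
        print(f"{name} is set: {name in os.environ}")
    else:
        print(f"{name}={os.environ.get(name)}")
```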
65 | 66 | ## Demo 67 | 68 | [Demo: Executing PySpark Applications Using spark-submit](demo/executing-pyspark-applications-using-spark-submit.md) 69 | -------------------------------------------------------------------------------- /docs/PythonUtils.md: -------------------------------------------------------------------------------- 1 | # PythonUtils 2 | 3 | ## Broadcast Threshold { #getBroadcastThreshold } 4 | 5 | ```scala 6 | getBroadcastThreshold( 7 | sc: JavaSparkContext): Long 8 | ``` 9 | 10 | `getBroadcastThreshold` is the value of [spark.broadcast.UDFCompressionThreshold](configuration-properties/spark.md#spark.broadcast.UDFCompressionThreshold) configuration property. 11 | 12 | !!! note "py4j" 13 | `getBroadcastThreshold` is a Scala method that is used by [pyspark.rdd](pyspark/rdd.md#_prepare_for_python_RDD) Python module via [py4j](SparkContext.md#_jvm) bridge. 14 | 15 | --- 16 | 17 | `getBroadcastThreshold` is used when: 18 | 19 | * `pyspark.rdd` is requested to [_prepare_for_python_RDD](pyspark/rdd.md#_prepare_for_python_RDD) 20 | -------------------------------------------------------------------------------- /docs/PythonWorkerFactory.md: -------------------------------------------------------------------------------- 1 | # PythonWorkerFactory 2 | 3 | `PythonWorkerFactory` is a factory of [Python workers](#create) to execute [PythonFunction](PythonFunction.md)s. 4 | 5 | ![PythonWorkerFactory](images/PythonWorkerFactory.png) 6 | 7 | !!! note 8 | There could be many `PythonWorkerFactory`s on a single executor (one for every pair of the [pythonExec](#pythonExec) and the [envVars](#envVars)). 9 | 10 | ## Creating Instance 11 | 12 | `PythonWorkerFactory` takes the following to be created: 13 | 14 | * [Python Executable](#pythonExec) 15 | * Environment Variables 16 | 17 | `PythonWorkerFactory` is created when: 18 | 19 | * `SparkEnv` is requested to [createPythonWorker](SparkEnv.md#createPythonWorker) (for `BasePythonRunner` to [compute a partition](runners/BasePythonRunner.md#compute)). 20 | 21 | ### Python Executable { #pythonExec } 22 | 23 | `PythonWorkerFactory` is given a Python executable (`pythonExec`) when [created](#creating-instance). 24 | 25 | The Python executable is the [pythonExec](PythonFunction.md#pythonExec) of the first [PythonFunction](PythonFunction.md) (of all the Python UDFs to execute by [BasePythonRunner](runners/BasePythonRunner.md)). 26 | 27 | !!! note 28 | It is assumed that all [PythonFunction](PythonFunction.md)s (of a [BasePythonRunner](runners/BasePythonRunner.md)) should have the same Python executable, version and env vars. That is why it is safe to use the first `PythonFunction`. 29 | 30 | ## useDaemon { #useDaemon } 31 | 32 | `PythonWorkerFactory` initializes `useDaemon` internal flag when [created](#creating-instance). 
33 | 34 | `useDaemon` is enabled when the following all hold: 35 | 36 | * [spark.python.use.daemon](configuration-properties/index.md#spark.python.use.daemon) is enabled 37 | * The operating system is not MS Windows (based on `os.name` JVM property) as it works on UNIX-based systems only (because it uses signals for child management) 38 | 39 | `useDaemon` flag is used when `PythonWorkerFactory` is requested for the following: 40 | 41 | * [create](#create) 42 | * [stopDaemon](#stopDaemon) 43 | * [stopWorker](#stopWorker) 44 | * [releaseWorker](#releaseWorker) 45 | 46 | ## Daemon Process { #daemon } 47 | 48 | ```scala 49 | daemon: Process = null 50 | ``` 51 | 52 | `daemon` is a `Process` ([Java]({{ java.api }}/java/lang/Process.html)) to control [Python worker processes](#daemonWorkers). 53 | 54 | `daemon` is uninitialized (`null`) right after `PythonWorkerFactory` is [created](#creating-instance) and right after [stopDaemon](#stopDaemon). 55 | 56 | `daemon` is initialized and immediately started when [startDaemon](#startDaemon) (and listens at [daemonPort](#daemonPort)). 57 | 58 | `daemon` is alive until [stopDaemon](#stopDaemon). 59 | 60 | Any communication with the `daemon` happens through [daemonPort](#daemonPort). 61 | 62 | ### Port { #daemonPort } 63 | 64 | ```scala 65 | daemonPort: Int = 0 66 | ``` 67 | 68 | `daemonPort` is the communication channel (port) of the [daemon](#daemon) Python process (that is known only after [startDaemon](#startDaemon)). 69 | 70 | `daemonPort` (alongside the [daemonHost](#daemonHost)) is used to open a socket stream and launch [workers](#daemonWorkers). 71 | 72 | ### Python Workers { #daemonWorkers } 73 | 74 | ```scala 75 | daemonWorkers: mutable.WeakHashMap[Socket, Int] 76 | ``` 77 | 78 | `PythonWorkerFactory` creates `daemonWorkers` internal registry of socket streams and the worker's PID when [created](#creating-instance). 79 | 80 | A new pair is added in [createSocket](#createSocket) (when [createThroughDaemon](#createThroughDaemon)). 81 | 82 | `daemonWorkers` is used when: 83 | 84 | * [create](#create) (with [useDaemon](#useDaemon) flag enabled and non-empty [idleWorkers](#idleWorkers)) 85 | * [stopWorker](#stopWorker) 86 | 87 | ## Python Modules 88 | 89 | ### Daemon { #daemonModule } 90 | 91 | `PythonWorkerFactory` initializes `daemonModule` internal property for the **Python Daemon Module** when [created](#creating-instance). 92 | 93 | `daemonModule` is the value of [spark.python.daemon.module](configuration-properties/index.md#spark.python.daemon.module) configuration property. 94 | 95 | The Python Daemon Module is used when `PythonWorkerFactory` is requested to [create and start a daemon module](#startDaemon). 96 | 97 | ### Worker { #workerModule } 98 | 99 | `PythonWorkerFactory` uses [spark.python.worker.module](configuration-properties/index.md#PYTHON_WORKER_MODULE) configuration property to specify the **Python Worker Module**. 100 | 101 | The Python Worker Module is used when `PythonWorkerFactory` is requested to [create and start a worker](#createSimpleWorker). 102 | 103 | ## Creating Python Worker { #create } 104 | 105 | ```scala 106 | create(): (Socket, Option[Int]) 107 | ``` 108 | 109 | `create` branches off based on the [useDaemon](#useDaemon) flag: 110 | 111 | * When enabled, `create` firstly checks the [idleWorkers](#idleWorkers) queue and returns one if available. 
Otherwise, `create` [createThroughDaemon](#createThroughDaemon) 112 | * When disabled, `create` [createSimpleWorker](#createSimpleWorker) 113 | 114 | --- 115 | 116 | `create` is used when: 117 | 118 | * `SparkEnv` is requested to [createPythonWorker](SparkEnv.md#createPythonWorker) 119 | 120 | ### Creating Daemon Worker { #createThroughDaemon } 121 | 122 | ```scala 123 | createThroughDaemon(): (Socket, Option[Int]) 124 | ``` 125 | 126 | `createThroughDaemon` [startDaemon](#startDaemon) followed by [createSocket](#createSocket). 127 | 128 | In case of a `SocketException`, `createThroughDaemon` prints out the following WARN message to the logs: 129 | 130 | ```text 131 | Failed to open socket to Python daemon: [exception] 132 | Assuming that daemon unexpectedly quit, attempting to restart 133 | ``` 134 | 135 | And then, `createThroughDaemon` [stopDaemon](#stopDaemon), [startDaemon](#startDaemon) and [createSocket](#createSocket). 136 | 137 | #### createSocket { #createSocket } 138 | 139 | ```scala 140 | createSocket(): (Socket, Option[Int]) 141 | ``` 142 | 143 | `createSocket` creates a new stream socket and connects it to the [daemonPort](#daemonPort) at the [daemonHost](#daemonHost). 144 | 145 | `createSocket` reads the PID (of the python worker behind the stream socket) and requests the [authHelper](#authHelper) to `authToServer`. 146 | 147 | In the end, `createSocket` returns the socket and the PID (after registering them in the [daemonWorkers](#daemonWorkers) registry). 148 | 149 | ### Starting Python Daemon Process { #startDaemon } 150 | 151 | ```scala 152 | startDaemon(): Unit 153 | ``` 154 | 155 | !!! note "Does nothing with `daemon` initialized" 156 | `startDaemon` does nothing when [daemon](#daemon) is initialized (non-`null`) that indicates that the daemon is already up and running. 157 | 158 | `startDaemon` creates the command (using the given [pythonExec](#pythonExec) and the [daemon module](#daemonModule)): 159 | 160 | ```text 161 | [pythonExec] -m [daemonModule] 162 | ``` 163 | 164 | `startDaemon` adds the given [envVars](#envVars) and the following (extra) environment variables to the environment of future python processes: 165 | 166 | Environment Variable | Value 167 | ---------------------|------ 168 | `PYTHONPATH` | [pythonPath](#pythonPath) 169 | `PYTHON_WORKER_FACTORY_SECRET` | [authHelper](#authHelper) 170 | `SPARK_PREFER_IPV6` | `True` if the underlying JVM prefer IPv6 addresses (based on `java.net.preferIPv6Addresses` JVM property) 171 | `PYTHONUNBUFFERED` | `YES` 172 | 173 | `startDaemon` starts a new process (that is known as the [daemon](#daemon)). 174 | 175 | `startDaemon` connects to the python process to read the [daemonPort](#daemonPort). 176 | 177 | In the end, `startDaemon` [redirectStreamsToStderr](#redirectStreamsToStderr). 178 | 179 | ## Creating Simple Non-Daemon Worker 180 | 181 | ```scala 182 | createSimpleWorker(): Socket 183 | ``` 184 | 185 | `createSimpleWorker`...FIXME 186 | 187 | `createSimpleWorker` is used when `PythonWorkerFactory` is requested to [create a Python worker](#create) (with [useDaemon](#useDaemon) flag disabled). 188 | 189 | ## Logging 190 | 191 | Enable `ALL` logging level for `org.apache.spark.api.python.PythonWorkerFactory` logger to see what happens inside. 192 | 193 | Add the following line to `conf/log4j2.properties`: 194 | 195 | ```text 196 | logger.PythonWorkerFactory.name = org.apache.spark.api.python.PythonWorkerFactory 197 | logger.PythonWorkerFactory.level = all 198 | ``` 199 | 200 | Refer to [Logging](logging.md). 
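## Demo

The worker-factory-related configuration properties can be set explicitly when starting a PySpark application. The following is a minimal sketch (the values shown are simply the documented defaults) that triggers Python workers with an RDD action:

```python
from pyspark.sql import SparkSession

spark = (SparkSession.builder
    .appName("python-worker-factory-demo")
    .config("spark.python.use.daemon", "true")
    .config("spark.python.daemon.module", "pyspark.daemon")
    .config("spark.python.worker.module", "pyspark.worker")
    .getOrCreate())

# Any Python-side RDD transformation makes executors request Python workers
# from their PythonWorkerFactory.
print(spark.sparkContext.parallelize(range(10)).map(lambda x: x * x).sum())
```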
201 | -------------------------------------------------------------------------------- /docs/RDD.md: -------------------------------------------------------------------------------- 1 | # RDD 2 | -------------------------------------------------------------------------------- /docs/Setup.md: -------------------------------------------------------------------------------- 1 | # PySpark Setup 2 | 3 | ## Install IPython 4 | 5 | Follow the steps as described in the [official documentation](https://ipython.readthedocs.io/en/stable/install/install.html) of IPython. 6 | 7 | ```text 8 | pip install ipython 9 | ``` 10 | 11 | ## Start PySpark 12 | 13 | ```bash 14 | export PYSPARK_DRIVER_PYTHON=ipython 15 | ``` 16 | 17 | For Java 11, use `-Dio.netty.tryReflectionSetAccessible=true` (see [Downloading](http://spark.apache.org/docs/latest/index.html#downloading) in the official documentation of Apache Spark). 18 | 19 | ```bash 20 | ./bin/pyspark --driver-java-options=-Dio.netty.tryReflectionSetAccessible=true 21 | ``` 22 | 23 | ```text 24 | Python 3.9.1 (default, Feb 3 2021, 07:38:02) 25 | Type 'copyright', 'credits' or 'license' for more information 26 | IPython 7.20.0 -- An enhanced Interactive Python. Type '?' for help. 27 | Setting default log level to "WARN". 28 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 29 | Welcome to 30 | ____ __ 31 | / __/__ ___ _____/ /__ 32 | _\ \/ _ \/ _ `/ __/ '_/ 33 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1 34 | /_/ 35 | 36 | Using Python version 3.9.1 (default, Feb 3 2021 07:38:02) 37 | Spark context Web UI available at http://192.168.68.101:4040 38 | Spark context available as 'sc' (master = local[*], app id = local-1613571272142). 39 | SparkSession available as 'spark'. 40 | 41 | In [1]: 42 | ``` 43 | 44 | ```text 45 | In [1]: spark.version 46 | Out[1]: '3.1.1' 47 | ``` 48 | -------------------------------------------------------------------------------- /docs/SimplePythonFunction.md: -------------------------------------------------------------------------------- 1 | # SimplePythonFunction 2 | 3 | `SimplePythonFunction` is a [PythonFunction](PythonFunction.md). 4 | 5 | ## Creating Instance 6 | 7 | `SimplePythonFunction` takes the following to be created: 8 | 9 | * Command (byte array) 10 | * Environment Variables 11 | * Python Includes 12 | * [Python Executable](#pythonExec) 13 | * Python Version 14 | * `Broadcast`s of [PythonBroadcast](PythonBroadcast.md)s 15 | * [PythonAccumulatorV2](PythonAccumulatorV2.md) 16 | 17 | `SimplePythonFunction` is created when: 18 | 19 | * `SparkConnectPlanner` is requested to `transformPythonFunction` 20 | * `pyspark.rdd` (Python module) is requested to [_wrap_function](pyspark/rdd.md#_wrap_function) 21 | * `pyspark.sql.udf` (Python module) is requested to [_wrap_function](pyspark/sql/udf.md#_wrap_function) 22 | 23 | ### Python Executable { #pythonExec } 24 | 25 | `SimplePythonFunction` is given the **Python Executable** when [created](#creating-instance). 26 | 27 | The Python Executable is controlled by [PYSPARK_PYTHON](environment-variables.md#PYSPARK_PYTHON) environment variable (in PySpark) or [PYSPARK_DRIVER_PYTHON](environment-variables.md#PYSPARK_DRIVER_PYTHON) (in [PySpark Connect](connect/index.md)). 
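The effect is easy to observe from a PySpark session (a sketch that assumes the `pythonExec` and `pythonVer` attributes of the Python-side `SparkContext`, which is where the chosen executable is recorded before it is passed on to `SimplePythonFunction`):

```python
from pyspark.sql import SparkSession

# PYSPARK_PYTHON (or PYSPARK_DRIVER_PYTHON) is usually exported before
# launching pyspark / spark-submit.
spark = SparkSession.builder.appName("python-exec-demo").getOrCreate()
sc = spark.sparkContext

print(sc.pythonExec)  # e.g. python3, unless PYSPARK_PYTHON overrides it
print(sc.pythonVer)   # e.g. 3.10
```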
28 | -------------------------------------------------------------------------------- /docs/SocketAuthHelper.md: -------------------------------------------------------------------------------- 1 | # SocketAuthHelper 2 | 3 | `SocketAuthHelper` is...FIXME 4 | -------------------------------------------------------------------------------- /docs/SocketAuthServer.md: -------------------------------------------------------------------------------- 1 | # SocketAuthServer 2 | 3 | ## serveToStream Utility 4 | 5 | ```scala 6 | serveToStream( 7 | threadName: String, 8 | authHelper: SocketAuthHelper)(writeFunc: OutputStream => Unit): Array[Any] 9 | ``` 10 | 11 | `serveToStream`...FIXME 12 | 13 | `serveToStream` is used when: 14 | 15 | * FIXME 16 | -------------------------------------------------------------------------------- /docs/SocketFuncServer.md: -------------------------------------------------------------------------------- 1 | # SocketFuncServer 2 | 3 | `SocketFuncServer` is...FIXME 4 | -------------------------------------------------------------------------------- /docs/SparkConf.md: -------------------------------------------------------------------------------- 1 | # SparkConf 2 | 3 | `SparkConf` is a Python class. 4 | 5 | ## Creating Instance 6 | 7 | `SparkConf` takes the following to be created: 8 | 9 | * `loadDefaults` flag (default: `True`) 10 | * `JVMView` ([py4j]({{ py4j.doc }}/py4j_java_gateway.html#jvmview)) 11 | * JConf (default: `None`) 12 | 13 | While being created, `SparkConf` uses the [JVMView](SparkContext.md#_jvm) (of the [SparkContext](SparkContext.md)) unless the `_jconf` and `_jvm` are given. 14 | 15 | ## Demo 16 | 17 | ```python 18 | from pyspark import SparkConf 19 | ``` 20 | -------------------------------------------------------------------------------- /docs/SparkContext.md: -------------------------------------------------------------------------------- 1 | # SparkContext 2 | 3 | ![SparkContext Initialization](images/SparkContext.png) 4 | 5 | ## Creating Instance 6 | 7 | `SparkContext` takes the following to be created: 8 | 9 | * Master URL (default: `None`) 10 | * Application Name (default: `None`) 11 | * Spark Home (default: `None`) 12 | * Py Files (default: `None`) 13 | * Environment (default: `None`) 14 | * Batch Size (default: `0`) 15 | * `PickleSerializer` 16 | * `SparkConf` (default: `None`) 17 | * Gateway (default: `None`) 18 | * Corresponding `SparkContext` on JVM (default: `None`) 19 | * `BasicProfiler` 20 | 21 | While being created, `SparkContext` [_ensure_initialized](#_ensure_initialized) (with the [gateway](#gateway) and the [conf](#conf)) followed by [_do_init](#_do_init). 22 | 23 | ## Demo 24 | 25 | ```python 26 | from pyspark import SparkContext 27 | ``` 28 | 29 | ## JavaGateway 30 | 31 | `SparkContext` defines `_gateway` property for a `JavaGateway` that is given or launched when [_ensure_initialized](#_ensure_initialized). 32 | 33 | ## JVMView 34 | 35 | `SparkContext` defines `_jvm` property for a `JVMView` ([py4j]({{ py4j.doc }}/py4j_java_gateway.html#jvmview)) to access to the Java Virtual Machine of the [JavaGateway](#_gateway). 36 | 37 | ## _ensure_initialized 38 | 39 | ```python 40 | _ensure_initialized( 41 | cls, instance=None, gateway=None, conf=None) 42 | ``` 43 | 44 | `_ensure_initialized` is a `@classmethod`. 45 | 46 | `_ensure_initialized` takes the given [gateway](#gateway) or [launch_gateway](pyspark/java_gateway.md#launch_gateway). 
47 | 48 | `_ensure_initialized`...FIXME 49 | 50 | `_ensure_initialized` is used when: 51 | 52 | * `SparkContext` is [created](#creating-instance) and `setSystemProperty` 53 | * [shell.py](pyspark/shell.md) is launched 54 | 55 | ## _do_init 56 | 57 | ```python 58 | _do_init( 59 | self, master, appName, sparkHome, 60 | pyFiles, environment, batchSize, serializer, 61 | conf, jsc, profiler_cls) 62 | ``` 63 | 64 | `_do_init`...FIXME 65 | -------------------------------------------------------------------------------- /docs/SparkEnv.md: -------------------------------------------------------------------------------- 1 | # SparkEnv 2 | 3 | !!! note "Learn More" 4 | This is a stub for [pythonWorkers](#pythonWorkers) et al. 5 | Learn more in [The Internals of Apache Spark]({{ book.spark_core }}/SparkEnv/). 6 | 7 | ## pythonWorkers Registry { #pythonWorkers } 8 | 9 | ```scala 10 | pythonWorkers: Map[(String, Map[String, String]), PythonWorkerFactory] 11 | ``` 12 | 13 | `SparkEnv` creates an empty collection of [PythonWorkerFactory](PythonWorkerFactory.md)s (by their `pythonExec` and the `envVars`) when created. 14 | 15 | A new `PythonWorkerFactory` is created in [createPythonWorker](#createPythonWorker) when there was no `PythonWorkerFactory` for a `pythonExec` and a `envVars` pair. 16 | 17 | All `PythonWorkerFactory`s are requested to [stop](PythonWorkerFactory.md#stop) when `SparkEnv` is requested to `stop`. 18 | 19 | `pythonWorkers` is used in [destroyPythonWorker](#destroyPythonWorker) and [releasePythonWorker](#releasePythonWorker). 20 | 21 | ## Looking Up or Creating Python Worker Process { #createPythonWorker } 22 | 23 | ```scala 24 | createPythonWorker( 25 | pythonExec: String, 26 | envVars: Map[String, String]): (java.net.Socket, Option[Int]) 27 | ``` 28 | 29 | `createPythonWorker` looks up a [PythonWorkerFactory](PythonWorkerFactory.md) (in [pythonWorkers](#pythonWorkers)) for the given `pythonExec` and the `envVars` pair. Unless found, `createPythonWorker` registers a new `PythonWorkerFactory`. 30 | 31 | In the end, `createPythonWorker` requests the `PythonWorkerFactory` to [create a Python worker process](PythonWorkerFactory.md#create). 32 | 33 | --- 34 | 35 | ``createPythonWorker`` is used when: 36 | 37 | * `BasePythonRunner` is requested to [compute a partition](runners/BasePythonRunner.md#compute) 38 | -------------------------------------------------------------------------------- /docs/arrow-optimization/.pages: -------------------------------------------------------------------------------- 1 | title: Arrow Optimization 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/arrow-optimization/index.md: -------------------------------------------------------------------------------- 1 | # Arrow Optimization 2 | 3 | **Arrow Optimization** is an optimization that uses [Apache Arrow]({{ arrow.home }}) for columnar data transfers in the following: 4 | 5 | * [pyspark.sql.DataFrame.toPandas](../sql/PandasConversionMixin.md#toPandas) 6 | * [pyspark.sql.SparkSession.createDataFrame](../sql/SparkConversionMixin.md#createDataFrame) (when called with a Pandas `DataFrame` or a NumPy `ndarray`) 7 | 8 | The following data types are unsupported: `ArrayType` of `TimestampType`. 
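The optimization is guarded by [spark.sql.execution.arrow.pyspark.enabled](../configuration-properties/spark.sql.execution.md#spark.sql.execution.arrow.pyspark.enabled). A minimal sketch (assuming `pandas` and `pyarrow` are installed alongside PySpark):

```python
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("arrow-demo").getOrCreate()

# Arrow-based columnar transfers are disabled by default.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

pdf = pd.DataFrame({"id": range(1000), "v": [float(i) for i in range(1000)]})

# Both directions of the pandas boundary can use Arrow when enabled
# (with a fallback to the non-Arrow path for unsupported types).
df = spark.createDataFrame(pdf)
out = df.where("id % 2 = 0").toPandas()
print(out.head())
```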
9 | -------------------------------------------------------------------------------- /docs/building-from-sources.md: -------------------------------------------------------------------------------- 1 | # Building from Sources 2 | 3 | ```text 4 | $ java -version 5 | openjdk version "11.0.10" 2021-01-19 6 | OpenJDK Runtime Environment AdoptOpenJDK (build 11.0.10+9) 7 | OpenJDK 64-Bit Server VM AdoptOpenJDK (build 11.0.10+9, mixed mode) 8 | ``` 9 | 10 | ```text 11 | ./build/mvn \ 12 | -Pyarn,kubernetes,hive,hive-thriftserver,scala-2.12 \ 13 | -DskipTests \ 14 | clean install 15 | ``` 16 | 17 | ## Building PySpark-Related Operators 18 | 19 | ```text 20 | ./build/mvn -DskipTests -pl :spark-sql_2.12 clean install 21 | ``` 22 | 23 | ```text 24 | cp sql/core/target/spark-sql_2.12-3.1.1.jar assembly/target/scala-2.12/jars/ 25 | ``` 26 | -------------------------------------------------------------------------------- /docs/configuration-properties/.pages: -------------------------------------------------------------------------------- 1 | title: Configuration Properties 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/configuration-properties/index.md: -------------------------------------------------------------------------------- 1 | # Configuration Properties 2 | -------------------------------------------------------------------------------- /docs/configuration-properties/spark.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: spark 3 | --- 4 | 5 | # spark Configuration Properties 6 | 7 | ## broadcast.UDFCompressionThreshold { #spark.broadcast.UDFCompressionThreshold } 8 | 9 | **spark.broadcast.UDFCompressionThreshold** 10 | 11 | The threshold at which user-defined functions (UDFs) and Python RDD commands are compressed by broadcast (in bytes) 12 | 13 | Default: `1L * 1024 * 1024` (1MB) 14 | 15 | Used when: 16 | 17 | * `PythonUtils` is requested to [getBroadcastThreshold](../PythonUtils.md#getBroadcastThreshold) 18 | -------------------------------------------------------------------------------- /docs/configuration-properties/spark.pyspark.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: spark.pyspark 3 | --- 4 | 5 | # spark.pyspark Configuration Properties 6 | 7 | ## driver.python { #spark.pyspark.driver.python } 8 | 9 | **spark.pyspark.driver.python** 10 | 11 | Default: (undefined) 12 | 13 | ## python { #spark.pyspark.python } 14 | 15 | **spark.pyspark.python** 16 | 17 | Default: (undefined) 18 | -------------------------------------------------------------------------------- /docs/configuration-properties/spark.python.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: spark.python 3 | --- 4 | 5 | # spark.python Configuration Properties 6 | 7 | ## daemon.module { #spark.python.daemon.module } 8 | 9 | **spark.python.daemon.module** 10 | 11 | The Python module to run the daemon to execute Python workers 12 | 13 | Default: [pyspark.daemon](../pyspark/daemon.md) 14 | 15 | Used when: 16 | 17 | * `PythonWorkerFactory` is [created](../PythonWorkerFactory.md#daemonModule) 18 | 19 | ## use.daemon { #spark.python.use.daemon } 20 | 21 | **spark.python.use.daemon** 22 | 23 | Because forking processes from Java is expensive, PySpark prefers launching a single Python daemon ([spark.python.daemon.module](#spark.python.daemon.module)) to fork new workers for tasks. 
24 | This daemon currently only works on UNIX-based systems now because it uses signals for child management, so we can also fall back to launching workers ([spark.python.worker.module](#spark.python.worker.module)) directly. 25 | 26 | Default: `true` (unless PySpark runs on Windows) 27 | 28 | Used when: 29 | 30 | * `PythonWorkerFactory` is [created](../PythonWorkerFactory.md#useDaemon) 31 | 32 | ## worker.module { #spark.python.worker.module } 33 | 34 | **spark.python.worker.module** 35 | 36 | The Python module to run a Python worker 37 | 38 | Default: [pyspark.worker](../pyspark/worker.md) 39 | 40 | Used when: 41 | 42 | * `PythonWorkerFactory` is [created](../PythonWorkerFactory.md#workerModule) 43 | -------------------------------------------------------------------------------- /docs/configuration-properties/spark.sql.execution.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: spark.sql.execution 3 | --- 4 | 5 | # spark.sql.execution Configuration Properties 6 | 7 | ## arrow.maxRecordsPerBatch { #spark.sql.execution.arrow.maxRecordsPerBatch } 8 | 9 | **spark.sql.execution.arrow.maxRecordsPerBatch** 10 | 11 | When using Apache Arrow, the maximum number of records that can be written to a single `ArrowRecordBatch` in memory. 12 | 13 | If zero or negative there is no limit. 14 | 15 | Default: `10000` 16 | 17 | Used when: 18 | 19 | * `ApplyInPandasWithStatePythonRunner` is requested for `workerConf` 20 | * `ArrowEvalPythonExec` is [created](../sql/ArrowEvalPythonExec.md#batchSize) 21 | * `Dataset` is requested to `toArrowBatchRdd` 22 | * `MapInBatchExec` is created 23 | * `SparkConnectPlanner` is requested to `handleSqlCommand` 24 | * `SparkConnectStreamHandler` is requested to `processAsArrowBatches` 25 | 26 | ## arrow.pyspark.enabled { #spark.sql.execution.arrow.pyspark.enabled } 27 | 28 | **spark.sql.execution.arrow.pyspark.enabled** 29 | 30 | Enables [Arrow Optimization](../arrow-optimization/index.md) 31 | 32 | Default: `false` 33 | 34 | ## pandas.udf.buffer.size { #spark.sql.execution.pandas.udf.buffer.size } 35 | 36 | **spark.sql.execution.pandas.udf.buffer.size** 37 | 38 | `spark.buffer.size` for Pandas UDF executions 39 | 40 | Note that Pandas execution requires more than 4 bytes. 41 | Lowering this value could make small Pandas UDF batch iterated and pipelined; however, it might degrade performance. 42 | See SPARK-27870. 43 | 44 | Default: `spark.buffer.size` ([Spark Core]({{ book.spark_core }}/configuration-properties/#spark.buffer.size)) 45 | 46 | Used when: 47 | 48 | * `ApplyInPandasWithStatePythonRunner` and [ArrowPythonRunner](../runners/ArrowPythonRunner.md#bufferSize) are created (and initialize [bufferSize](../runners/BasePythonRunner.md#bufferSize)) 49 | 50 | ## pyspark.udf.simplifiedTraceback.enabled { #spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled } 51 | 52 | **spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled** 53 | 54 | Controls the traceback from Python UDFs. When enabled (`true`), traceback is simplified and hides the Python worker, (de)serialization, etc. from PySpark in tracebacks, and only shows the exception messages from UDFs. 
55 | 56 | Works only with CPython 3.7+ 57 | 58 | Default: `true` 59 | 60 | Used when: 61 | 62 | * `ApplyInPandasWithStatePythonRunner`, [ArrowPythonRunner](../runners/ArrowPythonRunner.md#simplifiedTraceback), `CoGroupedArrowPythonRunner`, [PythonUDFRunner](../runners/PythonUDFRunner.md#simplifiedTraceback) are created (and initialize [simplifiedTraceback](../runners/BasePythonRunner.md#simplifiedTraceback) flag) 63 | -------------------------------------------------------------------------------- /docs/connect/.pages: -------------------------------------------------------------------------------- 1 | title: Spark Connect 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/connect/index.md: -------------------------------------------------------------------------------- 1 | # Spark Connect 2 | 3 | PySpark supports remote connection to Spark clusters using Spark Connect ([Spark SQL]({{ book.spark_sql }}/connect)). 4 | 5 | ```console 6 | $ ./bin/pyspark --help 7 | Usage: ./bin/pyspark [options] 8 | 9 | Options: 10 | Spark Connect only: 11 | --remote CONNECT_URL URL to connect to the server for Spark Connect, e.g., 12 | sc://host:port. --master and --deploy-mode cannot be set 13 | together with this option. This option is experimental, and 14 | might change between minor releases. 15 | ... 16 | ``` 17 | 18 | Spark Connect for Python requires the following Python libraries: 19 | 20 | Module | Version 21 | -------|-------- 22 | [pandas](https://pandas.pydata.org/) | 1.0.5 23 | [pyarrow](https://arrow.apache.org/docs/python/index.html) | 1.0.0 24 | [grpc](https://grpc.io/docs/languages/python/) | 1.48.1 25 | 26 | ```console 27 | // switching to an conda environment with the libraries 28 | $ conda activate pyspark 29 | 30 | $ ./bin/pyspark --remote sc://localhost 31 | Python 3.10.10 (main, Mar 21 2023, 13:41:39) [Clang 14.0.6 ] on darwin 32 | Type "help", "copyright", "credits" or "license" for more information. 33 | Welcome to 34 | ____ __ 35 | / __/__ ___ _____/ /__ 36 | _\ \/ _ \/ _ `/ __/ '_/ 37 | /__ / .__/\_,_/_/ /_/\_\ version 3.4.0 38 | /_/ 39 | 40 | Using Python version 3.10.10 (main, Mar 21 2023 13:41:39) 41 | Client connected to the Spark Connect server at localhost 42 | SparkSession available as 'spark'. 43 | 44 | >>> spark.client 45 | 46 | ``` 47 | 48 | ## is_remote { #is_remote } 49 | 50 | ```py 51 | # from pyspark.sql.utils import is_remote 52 | is_remote() -> bool 53 | ``` 54 | 55 | `is_remote` is `True` when `SPARK_REMOTE` environment variable is defined (in `os.environ`). 56 | -------------------------------------------------------------------------------- /docs/demo/.pages: -------------------------------------------------------------------------------- 1 | title: Demos 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/demo/executing-pyspark-applications-using-spark-submit.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - navigation 4 | --- 5 | 6 | # Demo: Executing PySpark Applications Using spark-submit 7 | 8 | PySpark applications are executed using `spark-submit` ([Spark Core]({{ book.spark_core }}/tools/spark-submit)) command-line application. 
9 | 10 | ```text 11 | spark-submit 1.py extra args 12 | ``` 13 | 14 | For a PySpark application, `spark-submit` uses [PythonRunner](../PythonRunner.md) and launches an extra python process: 15 | 16 | ```text 17 | ps -o pid,ppid,command | grep python | grep -v grep 18 | ``` 19 | 20 | ```text 21 | org.apache.spark.deploy.SparkSubmit 1.py extra args 22 | ``` 23 | 24 | ```text 25 | Python /usr/local/bin/ipython 1.py extra args 26 | ``` 27 | 28 | ## SPARK_PRINT_LAUNCH_COMMAND Environment Variable 29 | 30 | Use `SPARK_PRINT_LAUNCH_COMMAND` environment variable to have the complete Spark command printed out to the standard output (cf. [spark-submit shell script]({{ book.spark_core }}/tools/spark-submit/#spark_print_launch_command)). 31 | 32 | ```text 33 | SPARK_PRINT_LAUNCH_COMMAND=1 spark-submit 1.py extra args 34 | ``` 35 | 36 | ## verbose Option 37 | 38 | Use `--verbose` option for verbose debugging output. 39 | 40 | ```text 41 | Parsed arguments: 42 | ... 43 | pyFiles null 44 | ... 45 | primaryResource file:/Users/jacek/dev/sandbox/python-sandbox/1.py 46 | name 1.py 47 | childArgs [extra args] 48 | ... 49 | Main class: 50 | org.apache.spark.deploy.PythonRunner 51 | Arguments: 52 | file:/Users/jacek/dev/sandbox/python-sandbox/1.py 53 | null 54 | extra 55 | args 56 | Spark config: 57 | (spark.app.name,1.py) 58 | (spark.master,local[*]) 59 | (spark.submit.pyFiles,) 60 | (spark.submit.deployMode,client) 61 | ``` 62 | -------------------------------------------------------------------------------- /docs/demo/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - toc 4 | - navigation 5 | --- 6 | 7 | # Demos 8 | 9 | 1. [Executing PySpark Applications Using spark-submit](executing-pyspark-applications-using-spark-submit.md) 10 | 1. [Running PySpark Application on minikube](running-pyspark-application-on-minikube.md) 11 | -------------------------------------------------------------------------------- /docs/demo/running-pyspark-application-on-minikube.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - navigation 4 | --- 5 | 6 | # Demo: Running PySpark Application on minikube 7 | 8 | This demo shows how to run a PySpark application on Kubernetes (using minikube). 9 | 10 | !!! tip 11 | This is a follow-up demo to [Demo: Running Spark Application on minikube]({{ book.spark_k8s }}/demo/running-spark-application-on-minikube/) in the [The Internals of Spark on Kubernetes]({{ book.spark_k8s }}). 12 | -------------------------------------------------------------------------------- /docs/environment-variables.md: -------------------------------------------------------------------------------- 1 | # Environment Variables 2 | 3 | PySpark uses environment variables to configure execution environment. 
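As an illustration only (not from the PySpark sources), such variables can also be set from Python via `os.environ`, as long as this happens before the JVM gateway is launched; `PYSPARK_PYTHON` and `PYSPARK_PIN_THREAD` are described below.

```py
import os

# Both variables are described below; they must be set before
# SparkContext/SparkSession launches the Py4J gateway (JVM) process.
os.environ["PYSPARK_PYTHON"] = "python3"   # Python executable of Python workers
os.environ["PYSPARK_PIN_THREAD"] = "true"  # pinned thread mode (Py4J ClientServer)

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("env-demo").getOrCreate()
print(spark.sparkContext.pythonExec)  # expected to reflect PYSPARK_PYTHON
spark.stop()
```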
4 | 5 | ## PYSPARK_DRIVER_PYTHON { #PYSPARK_DRIVER_PYTHON } 6 | 7 | The Python Executable in [PySpark Connect](connect/index.md) unless [PYSPARK_PYTHON](#PYSPARK_PYTHON) is defined 8 | 9 | Default: `python3` 10 | 11 | ## PYSPARK_GATEWAY_PORT { #PYSPARK_GATEWAY_PORT } 12 | 13 | ## PYSPARK_GATEWAY_SECRET { #PYSPARK_GATEWAY_SECRET } 14 | 15 | ## PYSPARK_PIN_THREAD { #PYSPARK_PIN_THREAD } 16 | 17 | Enables **pinned thread mode** to synchronize PVM threads with JVM threads based on Py4J's [ClientServer]({{ py4j.javadoc }}/py4j/ClientServer.html) (`true`) or [GatewayServer]({{ py4j.javadoc }}/py4j/GatewayServer.html) (`false`) 18 | 19 | Default: `false` 20 | 21 | Used when: 22 | 23 | * [launch_gateway](pyspark/java_gateway.md) is executed 24 | * [Py4JServer](Py4JServer.md) is created (and initializes the [server](Py4JServer.md#server)) 25 | 26 | ## PYSPARK_PYTHON { #PYSPARK_PYTHON } 27 | 28 | The Python Executable 29 | 30 | Default: `python3` 31 | -------------------------------------------------------------------------------- /docs/features/index.md: -------------------------------------------------------------------------------- 1 | # PySpark — Python on Apache Spark 2 | 3 | **PySpark** is the Python API (_frontend_) of Apache Spark. 4 | 5 | ## How It Works 6 | 7 | When a Python script is executed using `spark-submit` shell script ([Spark Core]({{ book.spark_core }}/tools/spark-submit/)), [PythonRunner](../PythonRunner.md) is started (and `--verbose` option can show it as `Main class`). 8 | 9 | ``` shell 10 | $ ./bin/spark-submit --version hello_pyspark.py 11 | Using properties file: null 12 | Parsed arguments: 13 | master local[*] 14 | ... 15 | primaryResource file:/Users/jacek/dev/oss/spark/hello_pyspark.py 16 | name hello_pyspark.py 17 | ... 18 | Main class: 19 | org.apache.spark.deploy.PythonRunner 20 | Arguments: 21 | file:/Users/jacek/dev/oss/spark/hello_pyspark.py 22 | null 23 | Spark config: 24 | (spark.app.name,hello_pyspark.py) 25 | (spark.app.submitTime,1684188276759) 26 | (spark.master,local[*]) 27 | (spark.submit.deployMode,client) 28 | (spark.submit.pyFiles,) 29 | ... 30 | ``` 31 | 32 | `spark-submit` execution above could be translated to the following: 33 | 34 | ```text 35 | ./bin/spark-class org.apache.spark.deploy.PythonRunner hello_pyspark.py "" 36 | ``` 37 | 38 | `PythonRunner` then launches a [Py4JServer](../Py4JServer.md) (on a `py4j-gateway-init` daemon thread) and waits until it is started. 39 | 40 | Finally, `PythonRunner` launches a Python process (to run the Python script) and waits until the process finishes (successfully or not). 41 | 42 | ```shell 43 | $ ps -o pid,command | grep python3 | grep -v grep 44 | 12607 python3 /Users/jacek/dev/oss/spark/hello_pyspark.py 45 | ``` 46 | 47 | ??? note "lsof for open files and TCP inter-process connections" 48 | Use `lsof` command to have a look at the open files and connections. 49 | 50 | ```shell 51 | sudo lsof -p [pid of the python process] 52 | ``` 53 | 54 | ## Python 3.8 and Later 55 | 56 | The minimum version of Python is **3.8**. 57 | 58 | ??? note "Python 3.7 Deprecated" 59 | Python 3.7 support is deprecated in Spark 3.4. 60 | 61 | ## shell.py 62 | 63 | `pyspark` shell defines [PYTHONSTARTUP]({{ python.docs }}/using/cmdline.html#envvar-PYTHONSTARTUP) environment variable to execute [shell.py](../pyspark/shell.md) before the first prompt is displayed in Python interactive mode. 
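As a quick illustrative sketch (the exact names are listed in [shell.py](../pyspark/shell.md)), an interactive `pyspark` session starts with the entry points already bound:

```py
# Inside the pyspark shell; these names are pre-defined by shell.py:
#   spark (SparkSession), sc (SparkContext), sql (SparkSession.sql)
sc.version             # version of the underlying SparkContext
spark.range(3).show()  # the SparkSession is ready to use
sql("SELECT 1 AS id").show()
```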
64 | 65 | ## Py4J 66 | 67 | [java_gateway](../pyspark/java_gateway.md) uses [Py4J - A Bridge between Python and Java]({{ py4j.doc }}): 68 | 69 | > Py4J enables Python programs running in a Python interpreter to dynamically access Java objects in a Java Virtual Machine. Methods are called as if the Java objects resided in the Python interpreter and Java collections can be accessed through standard Python collection methods. Py4J also enables Java programs to call back Python objects. 70 | 71 | ## pyspark.sql Package 72 | 73 | `pyspark.sql` is a Python package for Spark SQL. 74 | 75 | ```python 76 | from pyspark.sql import * 77 | ``` 78 | 79 | !!! tip 80 | Learn more about [Modules and Packages](https://docs.python.org/3/tutorial/modules.html) in Python in [The Python Tutorial](https://docs.python.org/3/tutorial/index.html). 81 | 82 | ### \_\_init\__.py 83 | 84 | The `__init__.py` files are required to make Python treat directories containing the file as packages. 85 | 86 | Per [6.4.1. Importing * From a Package](https://docs.python.org/3/tutorial/modules.html#importing-from-a-package): 87 | 88 | > The import statement uses the following convention: if a package's `__init__.py` code defines a list named `__all__`, it is taken to be the list of module names that should be imported when `from package import *` is encountered. 89 | 90 | Per [Public and Internal Interfaces](https://www.python.org/dev/peps/pep-0008/#public-and-internal-interfaces) in [PEP 8 -- Style Guide for Python Code](https://www.python.org/dev/peps/pep-0008/): 91 | 92 | > To better support introspection, modules should explicitly declare the names in their public API using the `__all__` attribute. 93 | 94 | From `python/pyspark/sql/__init__.py`: 95 | 96 | ```python 97 | __all__ = [ 98 | 'SparkSession', 'SQLContext', 'HiveContext', 'UDFRegistration', 99 | 'DataFrame', 'GroupedData', 'Column', 'Catalog', 'Row', 100 | 'DataFrameNaFunctions', 'DataFrameStatFunctions', 'Window', 'WindowSpec', 101 | 'DataFrameReader', 'DataFrameWriter', 'PandasCogroupedOps' 102 | ] 103 | ``` 104 | 105 | ## pandas 106 | 107 | The minimum version of [Pandas](https://pandas.pydata.org/) is `0.23.2` (and [PandasConversionMixin](../sql/PandasConversionMixin.md) asserts that). 108 | 109 | ```python 110 | import pandas as pd 111 | ``` 112 | 113 | ## pyarrow 114 | 115 | The minimum version of [PyArrow](https://pypi.org/project/pyarrow/) is `1.0.0` (and [PandasConversionMixin](../sql/PandasConversionMixin.md) asserts that). 116 | 117 | ```python 118 | import pyarrow 119 | ``` 120 | 121 | ## Python Mixins 122 | 123 | From [8.7. 
Class definitions](https://docs.python.org/3/reference/compound_stmts.html#class-definitions): 124 | 125 | > classdef ::= [decorators] "class" classname [inheritance] ":" suite 126 | > 127 | > The inheritance list usually gives a list of base classes 128 | 129 | PySpark uses mixins: 130 | 131 | * [PandasConversionMixin](../sql/PandasConversionMixin.md) 132 | * [PandasMapOpsMixin](../sql/PandasMapOpsMixin.md) 133 | * [SparkConversionMixin](../sql/SparkConversionMixin.md) 134 | -------------------------------------------------------------------------------- /docs/images/PythonRunner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/japila-books/pyspark-internals/824e54fbf19ddb39ba9ec1a50ca65d7629470cf6/docs/images/PythonRunner.png -------------------------------------------------------------------------------- /docs/images/PythonWorkerFactory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/japila-books/pyspark-internals/824e54fbf19ddb39ba9ec1a50ca65d7629470cf6/docs/images/PythonWorkerFactory.png -------------------------------------------------------------------------------- /docs/images/SparkContext.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/japila-books/pyspark-internals/824e54fbf19ddb39ba9ec1a50ca65d7629470cf6/docs/images/SparkContext.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: PySpark 3 | icon: fontawesome/brands/python 4 | hide: 5 | - toc 6 | - navigation 7 | --- 8 | 9 | # The Internals of {{ book.title }} (Apache Spark {{ spark.version }}) 10 | 11 | Welcome to **The Internals of {{ book.title }}** online book! 🤙 12 | 13 | I'm [Jacek Laskowski](https://pl.linkedin.com/in/jaceklaskowski), a Freelance Data(bricks) Engineer specializing in 14 | [Apache Spark](https://books.japila.pl/apache-spark-internals/) (incl. [Spark SQL](https://books.japila.pl/spark-sql-internals/) and [Spark Structured Streaming](https://books.japila.pl/spark-structured-streaming-internals/)), 15 | [Delta Lake](https://books.japila.pl/delta-lake-internals/), 16 | [Databricks](https://www.databricks.com/), 17 | and [Apache Kafka](https://books.japila.pl/kafka-internals/) (incl. [Kafka Streams](https://books.japila.pl/kafka-streams-internals/)) with brief forays into a wider data engineering space (mostly during [Warsaw Data Engineering](https://www.meetup.com/Warsaw-Data-Engineering/) meetups). 18 | 19 | I'm very excited to have you here and hope you will enjoy exploring the internals of {{ book.title }} as much as I have. 20 | 21 | !!! quote "Flannery O'Connor" 22 | I write to discover what I know. 23 | 24 | !!! note ""The Internals Of" series" 25 | I'm also writing other online books in the "The Internals Of" series. Please visit ["The Internals Of" Online Books](https://books.japila.pl) home page. 26 | 27 | Expect text and code snippets from a variety of public sources. Attribution follows. 
28 | 29 | Now, let's take a deep dive into [{{ book.title }}](features/index.md) 🔥 30 | 31 | --- 32 | 33 | Last update: {{ git.date.strftime('%Y-%m-%d') }} 34 | -------------------------------------------------------------------------------- /docs/logging.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - toc 4 | --- 5 | 6 | # Logging 7 | 8 | PySpark uses the same logging infrastructure as [Apache Spark]({{ book.spark_core }}/spark-logging/). 9 | -------------------------------------------------------------------------------- /docs/ml/.pages: -------------------------------------------------------------------------------- 1 | title: MLlib 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/ml/Distributor.md: -------------------------------------------------------------------------------- 1 | # Distributor 2 | 3 | `Distributor` is the parent (_abstract_) class of [TorchDistributor](../pytorch-distributed/TorchDistributor.md). 4 | 5 | ## Creating Instance 6 | 7 | `Distributor` takes the following to be created: 8 | 9 | * Number of processes (default: `1`) 10 | * `local_mode` flag (default: `True`) 11 | * `use_gpu` flag (default: `True`) 12 | 13 | !!! note "Abstract Class" 14 | `Distributor` is not supposed to be created directly. 15 | 16 | ## _get_num_tasks { #_get_num_tasks } 17 | 18 | ```py 19 | _get_num_tasks( 20 | self) -> int 21 | ``` 22 | 23 | `_get_num_tasks`...FIXME 24 | 25 | ## get_gpus_owned { #get_gpus_owned } 26 | 27 | ```py 28 | get_gpus_owned( 29 | context: Union[SparkContext, BarrierTaskContext]) -> List[str] 30 | ``` 31 | 32 | `get_gpus_owned`...FIXME 33 | -------------------------------------------------------------------------------- /docs/ml/index.md: -------------------------------------------------------------------------------- 1 | # PySpark MLlib 2 | 3 | **PySpark MLlib** is a Python module to work with Spark MLlib for `DataFrame`-based machine learning pipelines. 4 | 5 | ```py 6 | from pyspark.ml import * 7 | ``` 8 | -------------------------------------------------------------------------------- /docs/pandas-on-spark/.pages: -------------------------------------------------------------------------------- 1 | title: pandas API on Spark 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/pandas-on-spark/index.md: -------------------------------------------------------------------------------- 1 | # pandas API on Spark 2 | 3 | **pandas API on Spark** ([pyspark.pandas](../pyspark/pandas/index.md) package) has been added to PySpark to execute [pandas]({{ pandas.home }}) code on Spark clusters with no changes (except the import). 4 | 5 | There are two related PySpark packages with pandas support: 6 | 7 | * [pyspark.pandas](../pyspark/pandas/index.md) 8 | * [pyspark.sql.pandas](../pyspark/sql/pandas/index.md) 9 | 10 | !!! note "Spark Structured Streaming" 11 | pandas API on Spark does not support Spark Structured Streaming (_streaming queries_).
12 | 13 | ## Modules 14 | 15 | pandas API on Spark requires that the following modules to be installed: 16 | 17 | Module | Version 18 | -------|-------- 19 | [pandas]({{ pandas.home }}) | 1.0.5 20 | [PyArrow]({{ arrow.docs }}/python/index.html) | 1.0.0 21 | 22 | ## PYARROW_IGNORE_TIMEZONE { #PYARROW_IGNORE_TIMEZONE } 23 | 24 | For PyArrow 2.0.0 and above, pandas API on Spark requires `PYARROW_IGNORE_TIMEZONE` environment variable to be set to `1` (on the driver and executors). 25 | 26 | ## PYSPARK_PANDAS_USAGE_LOGGER { #PYSPARK_PANDAS_USAGE_LOGGER } 27 | 28 | pandas API on Spark uses `PYSPARK_PANDAS_USAGE_LOGGER` (formerly `KOALAS_USAGE_LOGGER`) environment variable for a usage logger. 29 | 30 | ## Demo 31 | 32 | ```py 33 | # The following would be required if we used pandas 34 | # import pandas as pd 35 | 36 | # but we don't need it anymore 😊 37 | 38 | # The only change is supposed to be this extra `pyspark` prefix 39 | # in the name of the package 40 | 41 | import pyspark.pandas as pd 42 | ``` 43 | 44 | === "Python" 45 | 46 | ```py 47 | pd.read_csv("people.csv") 48 | ``` 49 | 50 | ```text 51 | id name 52 | 0 0 zero 53 | 1 1 one 54 | 2 2 two 55 | ``` 56 | -------------------------------------------------------------------------------- /docs/pandas-udafs/.pages: -------------------------------------------------------------------------------- 1 | title: pandas UDAFs 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/pandas-udafs/index.md: -------------------------------------------------------------------------------- 1 | # pandas User-Defined Aggregate Functions 2 | 3 | **pandas User-Defined Aggregate Functions** (_pandas UDAFs_) are [PythonUDFs](../pandas-udfs/index.md) (with optional [PandasUDFType.GROUPED_AGG](../pyspark/sql/pandas/PandasUDFType.md#GROUPED_AGG) function type) to used as aggregation functions in [GroupedData.agg](../sql/GroupedData.md#agg) operator. 4 | 5 | pandas UDAFs are also known as **Group Aggregate pandas UDFs**. 6 | 7 | ## Limitations 8 | 9 | 1. There is no partial aggregation with group aggregate UDFs (i.e., a full shuffle is required). 10 | 1. All the data of a group will be loaded into memory, so there is a potential OOM risk if data is skewed and certain groups are too large to fit in memory 11 | 1. Group aggregate pandas UDFs and built-in aggregation functions cannot be mixed in a single [GroupedData.agg](../sql/GroupedData.md#agg) operator. Otherwise, the following `AnalysisException` is thrown: 12 | 13 | ```text 14 | [INVALID_PANDAS_UDF_PLACEMENT] The group aggregate pandas UDF `my_udaf` cannot be invoked together with as other, non-pandas aggregate functions. 
15 | ``` 16 | 17 | ## Demo 18 | 19 | ```py 20 | import pandas as pd 21 | from pyspark.sql.functions import pandas_udf 22 | ``` 23 | 24 | ```py 25 | @pandas_udf(returnType = "long") 26 | def my_count(s: pd.Series) -> 'long': 27 | return pd.Series(s.count()) 28 | ``` 29 | 30 | ```py 31 | from pyspark.sql.functions import abs 32 | nums = spark.range(5) # FIXME More meaningful dataset 33 | grouped_nums = (nums 34 | .withColumn("gid", abs((nums.id * 100) % 2)) 35 | .groupBy("gid")) 36 | count_by_gid_agg = my_count("gid").alias("count") 37 | counts_by_gid = grouped_nums.agg(count_by_gid_agg) 38 | ``` 39 | 40 | ```py 41 | counts_by_gid.show() 42 | ``` 43 | -------------------------------------------------------------------------------- /docs/pandas-udfs/.pages: -------------------------------------------------------------------------------- 1 | title: pandas UDFs 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/pandas-udfs/index.md: -------------------------------------------------------------------------------- 1 | # pandas User-Defined Functions 2 | 3 | **pandas User-Defined Functions** (_Vectorized User-Defined Functions_ or _pandas UDFs_) are user-defined functions that are executed using Apache Arrow to transfer data and pandas to work with the data, which allows for vectorized operations. 4 | 5 | Pandas UDFs are defined using [@pandas_udf](#pandas_udf) decorator. 6 | 7 | A Pandas UDF behaves as a regular PySpark function API in general. 8 | 9 | As of Spark 3.0.0 ([SPARK-28264](https://issues.apache.org/jira/browse/SPARK-28264)), using [Python type hints](https://www.python.org/dev/peps/pep-0484) in pandas UDF is encouraged (instead of specifying pandas UDF type via [functionType](#functionType) argument). 10 | 11 | The return type (type hint) of a user-defined function should be as follows: 12 | 13 | * `pandas.Series` ([pandas]({{ pandas.api }}/pandas.Series.html)) in most cases 14 | * `pandas.DataFrame` ([pandas]({{ pandas.api }}/pandas.DataFrame.html)) for `struct` input or output 15 | 16 | ## @pandas_udf Decorator { #pandas_udf } 17 | 18 | ```py 19 | pandas_udf( 20 | f=None, 21 | returnType=None, 22 | functionType=None) 23 | ``` 24 | 25 | [pandas_udf](../pyspark/sql/pandas/functions.md#pandas_udf) function is used a decorator (using `@pandas_udf` annotation). 26 | 27 | ??? note "Python Decorators" 28 | Learn more in [PEP 318 – Decorators for Functions and Methods]({{ python.peps }}/pep-0318/). 29 | 30 | `pandas_udf` belongs to [pyspark.sql.functions](../pyspark/sql/functions.md) module. 31 | 32 | ```py 33 | from pyspark.sql.functions import pandas_udf 34 | ``` 35 | 36 | ### functionType { #functionType } 37 | 38 | `functionType` can be one of [PandasUDFType](../pyspark/sql/pandas/PandasUDFType.md)s (but is currently discouraged in favour of type hints). 39 | 40 | ```py 41 | @pandas_udf(returnType = "long", functionType = PandasUDFType.GROUPED_AGG) 42 | def my_udaf(names: pd.Series) -> 'long': 43 | return pd.Series(names.count()) 44 | ``` 45 | 46 | `functionType` is also known as `evalType`. 47 | 48 | [SQL_SCALAR_PANDAS_UDF](../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_UDF) is the default scalar UDF type. 49 | 50 | ### returnType { #returnType } 51 | 52 | `@pandas_udf` decorator can optionally specify a return type (as the first positional argument or using `returnType`). 53 | 54 | A return type can be one of the names of `pyspark.sql.types.DataType` instances or the `DataType` themselves. 
55 | 56 | ```py 57 | @pandas_udf(dataType) 58 | @pandas_udf(returnType=dataType) 59 | ``` 60 | 61 | ## pandas UDAFs 62 | 63 | [pandas User-Defined Aggregate Functions](../pandas-udafs/index.md). 64 | 65 | ## Demo 66 | 67 | ```py 68 | import pandas as pd 69 | from pyspark.sql.functions import pandas_udf 70 | ``` 71 | 72 | ```py 73 | @pandas_udf("string") 74 | def to_upper(s: pd.Series) -> pd.Series: 75 | return s.str.upper() 76 | ``` 77 | 78 | ```py 79 | @pandas_udf("string") 80 | def my_concat(names: pd.Series, ages: pd.Series) -> pd.Series: 81 | return pd.Series([f"{n} is {a} years old" for (n, a) in zip(names, ages)]) 82 | ``` 83 | 84 | ```py 85 | pandas_df = pd.DataFrame({ 86 | 'name': ['jacek', 'agata', 'iweta', 'patryk', 'maksym'], 87 | 'age': [50, 49, 29, 26, 11] 88 | }) 89 | df = spark.createDataFrame(pandas_df) 90 | ``` 91 | 92 | ```text 93 | >>> df.show() 94 | +------+---+ 95 | | name|age| 96 | +------+---+ 97 | | jacek| 50| 98 | | agata| 49| 99 | | iweta| 29| 100 | |patryk| 26| 101 | |maksym| 11| 102 | +------+---+ 103 | ``` 104 | 105 | ```text 106 | >>> df.printSchema() 107 | root 108 | |-- name: string (nullable = true) 109 | |-- age: long (nullable = true) 110 | ``` 111 | 112 | ```py 113 | (df 114 | .select(to_upper(df.name).alias("upper_name")) 115 | .show()) 116 | ``` 117 | 118 | ```text 119 | +----------+ 120 | |upper_name| 121 | +----------+ 122 | | JACEK| 123 | | AGATA| 124 | | IWETA| 125 | | PATRYK| 126 | | MAKSYM| 127 | +----------+ 128 | ``` 129 | 130 | ```py 131 | df.select(my_concat(df.name, df.age)).show(truncate = False) 132 | ``` 133 | 134 | ```text 135 | +----------------------+ 136 | |my_concat(name, age) | 137 | +----------------------+ 138 | |jacek is 50 years old | 139 | |agata is 49 years old | 140 | |iweta is 29 years old | 141 | |patryk is 26 years old| 142 | |maksym is 11 years old| 143 | +----------------------+ 144 | ``` 145 | -------------------------------------------------------------------------------- /docs/pyspark/daemon.md: -------------------------------------------------------------------------------- 1 | # daemon.py 2 | 3 | `daemon.py` is a Python module in [pyspark](index.md) package. 4 | 5 | ```py 6 | from pyspark import daemon 7 | ``` 8 | 9 | ## Entry Point 10 | 11 | ??? note "Top-Level Code Environment" 12 | If the module is executed in the top-level code environment (e.g., `python -m`), its `__name__` is set to the string `__main__`. 13 | 14 | Sometimes "top-level code" is called an _entry point_ to the application. 15 | 16 | Learn more in the [\_\_main__ — Top-level code environment]({{ python.docs }}/library/__main__.html). 17 | 18 | When executed in the top-level code environment, `daemon.py` calls [manager](#manager) function. 19 | 20 | ## manager { #manager } 21 | 22 | ```py 23 | manager() 24 | ``` 25 | 26 | `manager` runs until it is stopped (e.g., `CTRL-C`). 27 | 28 | `manager` creates a new process group (`os.setpgid(0, 0)`). 29 | 30 | `manager` creates a listening socket on the loopback interface (possibly using IPv6 based on `SPARK_PREFER_IPV6` environment variable). 31 | 32 | `manager` reads `SPARK_REUSE_WORKER` environment variable (`reuse`). 33 | 34 | `manager` launches a [worker process](#worker) (in a child process using `os.fork()`). 35 | 36 | ### Launching Worker Process { #worker } 37 | 38 | ```py 39 | worker( 40 | sock: socket, 41 | authenticated: Bool) -> Optional[int] 42 | ``` 43 | 44 | !!! note 45 | `worker` is called by a worker process after the`os.fork()`. 46 | 47 | `worker` [runs a worker](worker.md#main). 
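The fork-per-worker pattern described above can be sketched in plain Python. This is a simplified illustration only, not the actual `daemon.py` code: authentication, `SPARK_REUSE_WORKER` handling and signal-based child management are omitted, and `run_worker` is a hypothetical stand-in for [main](worker.md#main) of `worker.py`.

```py
import os
import socket

def run_worker(conn: socket.socket) -> None:
    ...  # placeholder; the real worker loop lives in pyspark/worker.py

def manager_sketch() -> None:
    # New process group so the JVM can signal the daemon and all its workers at once
    os.setpgid(0, 0)

    # Listening socket on the loopback interface; the chosen port is reported back to the JVM
    listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    listener.bind(("127.0.0.1", 0))
    listener.listen(128)

    while True:
        conn, _ = listener.accept()
        if os.fork() == 0:
            # Child process: serve exactly one worker connection, then exit
            listener.close()
            run_worker(conn)
            os._exit(0)
        # Parent (the daemon): close its copy of the connection and keep accepting
        conn.close()
```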
48 | -------------------------------------------------------------------------------- /docs/pyspark/index.md: -------------------------------------------------------------------------------- 1 | # pyspark Package 2 | 3 | ```py 4 | import pyspark 5 | ``` 6 | 7 | ## \_\_all__ 8 | 9 | ??? note "import *" 10 | The `import` statement uses the following convention: if a package’s `__init__.py` code defines a list named `__all__`, it is taken to be the list of module names that should be imported when `from package import *` is encountered. 11 | 12 | Learn more in [6.4.1. Importing * From a Package]({{ python.docs }}/tutorial/modules.html#importing-from-a-package). 13 | 14 | * `SparkConf` 15 | * `SparkContext` 16 | * `SparkFiles` 17 | * `RDD` 18 | * `StorageLevel` 19 | * `Broadcast` 20 | * `Accumulator` 21 | * `AccumulatorParam` 22 | * `MarshalSerializer` 23 | * `CPickleSerializer` 24 | * `StatusTracker` 25 | * `SparkJobInfo` 26 | * `SparkStageInfo` 27 | * `Profiler` 28 | * `BasicProfiler` 29 | * `TaskContext` 30 | * `RDDBarrier` 31 | * `BarrierTaskContext` 32 | * `BarrierTaskInfo` 33 | * `InheritableThread` 34 | * `inheritable_thread_target` 35 | * `__version__` 36 | -------------------------------------------------------------------------------- /docs/pyspark/java_gateway.md: -------------------------------------------------------------------------------- 1 | # java_gateway.py 2 | 3 | `java_gateway` is a Python module that allows [launching a gateway process](#launch_gateway) to establish communication channel to [Py4JServer](../Py4JServer.md). 4 | 5 | ## launch_gateway 6 | 7 | ```python 8 | launch_gateway( 9 | conf=None, 10 | popen_kwargs=None) 11 | ``` 12 | 13 | `launch_gateway` reads [PYSPARK_GATEWAY_PORT](../environment-variables.md#PYSPARK_GATEWAY_PORT) and [PYSPARK_GATEWAY_SECRET](../environment-variables.md#PYSPARK_GATEWAY_SECRET) environment variables if defined and assumes that the child Java gateway process has already been started (e.g. [PythonGatewayServer](../PythonGatewayServer.md)). 14 | 15 | 16 | 17 | Otherwise, `launch_gateway` builds the command to start `spark-submit`: 18 | 19 | 1. Finds `SPARK_HOME` with `./bin/spark-submit` 20 | 1. Appends all the configuration properties (from the input `conf`) using `--conf` 21 | 1. Appends `PYSPARK_SUBMIT_ARGS` environment variable if defined or assumes `pyspark-shell` 22 | 23 | `launch_gateway` sets up `_PYSPARK_DRIVER_CONN_INFO_PATH` environment variable to point at an unique temporary file. 24 | 25 | `launch_gateway` configures a pipe to stdin for the corresponding Java gateway process to use to monitor the Python process. 26 | 27 | `launch_gateway` starts `bin/spark-submit` command and waits for a connection info file to be created at `_PYSPARK_DRIVER_CONN_INFO_PATH`. `launch_gateway` reads the port and the secret from the file once available. 28 | 29 | `launch_gateway` connects to the gateway using py4j's `ClientServer` or `JavaGateway` based on [PYSPARK_PIN_THREAD](../environment-variables.md#PYSPARK_PIN_THREAD) environment variable. 
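The connection step could look roughly as follows. This is a sketch only, assuming the port and secret have already been read from the connection info file (the real code wires up more parameters, e.g. auto-conversion and connection daemonization, for both modes).

```py
import os
from py4j.clientserver import ClientServer, JavaParameters, PythonParameters
from py4j.java_gateway import JavaGateway, GatewayParameters

def connect_sketch(gateway_port: int, gateway_secret: str):
    """Sketch only: connect to an already-running Java gateway."""
    if os.environ.get("PYSPARK_PIN_THREAD", "false").lower() == "true":
        # Pinned thread mode: one Python thread maps to one JVM thread
        return ClientServer(
            java_parameters=JavaParameters(
                port=gateway_port, auth_token=gateway_secret, auto_convert=True),
            python_parameters=PythonParameters(port=0, eager_load=False))
    # Classic mode: a single GatewayServer connection
    return JavaGateway(
        gateway_parameters=GatewayParameters(
            port=gateway_port, auth_token=gateway_secret, auto_convert=True))
```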
30 | 31 | `launch_gateway` imports Spark packages and classes (using py4j): 32 | 33 | * `org.apache.spark.SparkConf` 34 | * `org.apache.spark.api.java.*` 35 | * `org.apache.spark.api.python.*` 36 | * `org.apache.spark.ml.python.*` 37 | * `org.apache.spark.mllib.api.python.*` 38 | * `org.apache.spark.resource.*` 39 | * `org.apache.spark.sql.*` 40 | * `org.apache.spark.sql.api.python.*` 41 | * `org.apache.spark.sql.hive.*` 42 | * `scala.Tuple2` 43 | 44 | `launch_gateway` is used when: 45 | 46 | * `SparkContext` is requested to [_ensure_initialized](../SparkContext.md#_ensure_initialized) 47 | -------------------------------------------------------------------------------- /docs/pyspark/pandas/DataFrame.md: -------------------------------------------------------------------------------- 1 | # DataFrame 2 | 3 | `DataFrame` is a [Frame](generic/Frame.md) with an [InternalFrame](InternalFrame.md). 4 | 5 | `DataFrame` is a `Generic[T]` ([Python]({{ python.api }}/library/typing.html#user-defined-generic-types)). 6 | 7 | ## Creating Instance 8 | 9 | `DataFrame` takes the following to be created: 10 | 11 | * data (optional) 12 | * index (optional) 13 | * columns (optional) 14 | * dtype (optional) 15 | * copy (optional) 16 | 17 | ### _internal_frame { #_internal_frame } 18 | 19 | `DataFrame` is given or creates an [InternalFrame](InternalFrame.md) when [created](#creating-instance). 20 | 21 | ```py 22 | object.__setattr__(self, "_internal_frame", internal) 23 | ``` 24 | 25 | ## InternalFrame { #_internal } 26 | 27 | ??? note "Frame" 28 | 29 | ```py 30 | @property 31 | def _internal( 32 | self) -> InternalFrame 33 | ``` 34 | 35 | `_internal` is part of the [Frame](generic/Frame.md#_internal) abstraction. 36 | 37 | `_internal` returns the [_internal_frame](#_internal_frame) (that is expected to be of type [InternalFrame](InternalFrame.md)). 38 | -------------------------------------------------------------------------------- /docs/pyspark/pandas/InternalFrame.md: -------------------------------------------------------------------------------- 1 | # InternalFrame 2 | 3 | `InternalFrame` is the underlying managed Spark DataFrame of [pyspark.pandas.DataFrame](DataFrame.md#_internal). 4 | 5 | ## Creating Instance 6 | 7 | `InternalFrame` takes the following to be created: 8 | 9 | * [Spark DataFrame](#spark_frame) 10 | * `index_spark_columns` (optional) 11 | * `index_names` (optional) 12 | * `index_fields` (optional) 13 | * `column_labels` (optional) 14 | * `data_spark_columns` (optional) 15 | * `data_fields` (optional) 16 | * `column_label_names` (optional) 17 | 18 | ### Spark DataFrame { #spark_frame } 19 | 20 | `InternalFrame` is given a Spark [DataFrame](../../sql/DataFrame.md) when [created](#creating-instance). 21 | 22 | ## Managed Spark DataFrame { #_sdf } 23 | 24 | `_sdf` is the underlying managed Spark DataFrame. 25 | 26 | `_sdf` is the [Spark DataFrame](#spark_frame) with [attach_default_index](#attach_default_index) and [\_\_natural_order__](#NATURAL_ORDER_COLUMN_NAME) columns selected. 27 | 28 | ## Default Index Column Name { #SPARK_DEFAULT_INDEX_NAME } 29 | 30 | `InternalFrame` uses the following as the name of the default index column: 31 | 32 | ```text 33 | __index_level_0__ 34 | ``` 35 | 36 | ## Index Column Pattern { #SPARK_INDEX_NAME_PATTERN } 37 | 38 | `InternalFrame` defines a regular pattern to match the index columns. 39 | 40 | ```text 41 | __index_level_[0-9]+__ 42 | ``` 43 | 44 | It is invalid to name columns in the [Spark DataFrame](#spark_frame) to match the index column pattern. 
45 | Index columns must not be in the columns of the Spark DataFrame. 46 | 47 | ## to_internal_spark_frame { #to_internal_spark_frame } 48 | 49 | ```py 50 | @lazy_property 51 | def to_internal_spark_frame( 52 | self) -> SparkDataFrame 53 | ``` 54 | 55 | `to_internal_spark_frame` returns the [spark_frame](#spark_frame) with the [index_spark_columns](#index_spark_columns) followed by the [data_spark_columns](#data_spark_columns). 56 | 57 | ## spark_frame { #spark_frame } 58 | 59 | ```py 60 | from pyspark.sql import DataFrame as SparkDataFrame 61 | 62 | @property 63 | def spark_frame( 64 | self) -> SparkDataFrame 65 | ``` 66 | 67 | `spark_frame` returns the underlying [managed Spark DataFrame](#_sdf). 68 | 69 | ## Demo 70 | 71 | ```py 72 | from pyspark import pandas as ps 73 | 74 | psdf = ps.DataFrame({ 75 | 'A': [1, 2, 3, 4], 76 | 'B': [5, 6, 7, 8], 77 | 'C': [9, 10, 11, 12], 78 | 'D': [13, 14, 15, 16], 79 | 'E': [17, 18, 19, 20]}, columns = ['A', 'B', 'C', 'D', 'E']) 80 | 81 | psdf._internal 82 | # 83 | 84 | psdf._internal.spark_frame 85 | # DataFrame[__index_level_0__: bigint, A: bigint, B: bigint, C: bigint, D: bigint, E: bigint, __natural_order__: bigint] 86 | 87 | psdf._internal.spark_frame.show() 88 | # +-----------------+---+---+---+---+---+-----------------+ 89 | # |__index_level_0__| A| B| C| D| E|__natural_order__| 90 | # +-----------------+---+---+---+---+---+-----------------+ 91 | # | 0| 1| 5| 9| 13| 17| 17179869184| 92 | # | 1| 2| 6| 10| 14| 18| 42949672960| 93 | # | 2| 3| 7| 11| 15| 19| 68719476736| 94 | # | 3| 4| 8| 12| 16| 20| 94489280512| 95 | # +-----------------+---+---+---+---+---+-----------------+ 96 | 97 | psdf._internal.to_internal_spark_frame.show() 98 | # +-----------------+---+---+---+---+---+ 99 | # |__index_level_0__| A| B| C| D| E| 100 | # +-----------------+---+---+---+---+---+ 101 | # | 0| 1| 5| 9| 13| 17| 102 | # | 1| 2| 6| 10| 14| 18| 103 | # | 2| 3| 7| 11| 15| 19| 104 | # | 3| 4| 8| 12| 16| 20| 105 | # +-----------------+---+---+---+---+---+ 106 | ``` 107 | -------------------------------------------------------------------------------- /docs/pyspark/pandas/generic/Frame.md: -------------------------------------------------------------------------------- 1 | # Frame 2 | 3 | `Frame` is an [abstraction](#contract) of [frames](#implementations) that behave like [pandas.DataFrame]({{ pandas.api }}/pandas.DataFrame.html) and [pandas.Series]({{ pandas.api }}/pandas.Series.html). 
4 | 5 | ```py 6 | class Frame(object, metaclass=ABCMeta) 7 | ``` 8 | 9 | ## Contract 10 | 11 | ### \_\_getitem\_\_ { #__getitem } 12 | 13 | ```py 14 | @abstractmethod 15 | def __getitem__( 16 | self, 17 | key: Any) -> Any 18 | ``` 19 | 20 | ```py 21 | class hello(): 22 | def __getitem__(self, key): 23 | print(f"__getitem__({key})") 24 | 25 | h = hello() 26 | 27 | >>> h[4] 28 | __getitem__(4) 29 | ``` 30 | 31 | ### _internal { #_internal } 32 | 33 | ```py 34 | @property 35 | @abstractmethod 36 | def _internal( 37 | self) -> InternalFrame 38 | ``` 39 | 40 | ## Implementations 41 | 42 | * [DataFrame](../DataFrame.md) 43 | * `Series` 44 | -------------------------------------------------------------------------------- /docs/pyspark/pandas/generic/index.md: -------------------------------------------------------------------------------- 1 | # pyspark.pandas.generic Package 2 | 3 | `pyspark.pandas.generic` package is...FIXME 4 | -------------------------------------------------------------------------------- /docs/pyspark/pandas/index.md: -------------------------------------------------------------------------------- 1 | # pyspark.pandas Package 2 | 3 | When imported (that triggers `__init__.py`), `pyspark.pandas` does _monkey-patching_ of `pandas.DataFrame` and `pandas.Series` classes (using [\_\_class_getitem__]({{ python.docs }}/reference/datamodel.html#emulating-generic-types) dunder method). 4 | 5 | Pandas | PySpark 6 | -------|-------- 7 | [pandas.DataFrame]({{ pandas.api }}/pandas.DataFrame.html) | `pyspark.pandas.frame.DataFrame` 8 | [pandas.Series]({{ pandas.api }}/pandas.Series.html) | `pyspark.pandas.series.Series` 9 | -------------------------------------------------------------------------------- /docs/pyspark/rdd.md: -------------------------------------------------------------------------------- 1 | # rdd.py 2 | 3 | `rdd` module (in `pyspark` package) defines [RDD](../RDD.md). 4 | 5 | ```py 6 | from pyspark.rdd import * 7 | ``` 8 | 9 | ## \_\_all__ 10 | 11 | ??? note "import *" 12 | The `import` statement uses the following convention: if a package’s `__init__.py` code defines a list named `__all__`, it is taken to be the list of module names that should be imported when `from package import *` is encountered. 13 | 14 | Learn more in [6.4.1. Importing * From a Package]({{ python.docs }}/tutorial/modules.html#importing-from-a-package). 15 | 16 | * [RDD](../RDD.md) 17 | 18 | ## _prepare_for_python_RDD { #_prepare_for_python_RDD } 19 | 20 | ```py 21 | _prepare_for_python_RDD( 22 | sc: "SparkContext", 23 | command: Any) -> Tuple[bytes, Any, Any, Any] 24 | ``` 25 | 26 | `_prepare_for_python_RDD` creates a `CloudPickleSerializer` to `dumps` the given `command` pair (that creates a `pickled_command`). 27 | 28 | If the size of the `pickled_command` is above the [broadcast threshold](../PythonUtils.md#getBroadcastThreshold), `_prepare_for_python_RDD` creates a broadcast variable for `pickled_command` that is in turn `dumps` using the `CloudPickleSerializer` (that overrides the `pickled_command`). 
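A simplified sketch of that step follows. It is illustrative only: the broadcast threshold value is an assumption (the real one comes from [getBroadcastThreshold](../PythonUtils.md#getBroadcastThreshold)) and the bookkeeping of broadcast variables is reduced to a plain list.

```py
from pyspark.serializers import CloudPickleSerializer

def prepare_command_sketch(sc, command, broadcast_threshold=1 << 20):  # threshold is an assumption
    ser = CloudPickleSerializer()
    pickled_command = ser.dumps(command)
    broadcast_vars = []
    if len(pickled_command) > broadcast_threshold:
        # Ship the big pickled command as a broadcast variable and
        # send only the (small) pickled broadcast handle to the JVM
        bvar = sc.broadcast(pickled_command)
        pickled_command = ser.dumps(bvar)
        broadcast_vars.append(bvar)
    return pickled_command, broadcast_vars
```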
29 | 30 | In the end, `_prepare_for_python_RDD` returns the following: 31 | 32 | * `pickled_command` 33 | * `broadcast_vars` 34 | * [environment](../SparkContext.md#environment) 35 | * [_python_includes](../SparkContext.md#_python_includes) 36 | 37 | --- 38 | 39 | `_prepare_for_python_RDD` is used when: 40 | 41 | * `pyspark.rdd` is requested to [_wrap_function](#_wrap_function) 42 | * `pyspark.sql.udf` is requested to [_wrap_function](sql/udf.md#_wrap_function) 43 | -------------------------------------------------------------------------------- /docs/pyspark/shell.md: -------------------------------------------------------------------------------- 1 | # shell.py 2 | 3 | `shell.py` script is the interactive shell of PySpark. 4 | 5 | `shell.py` defines the following variables: 6 | 7 | * `sc` being [pyspark.SparkContext](../SparkContext.md) 8 | * `spark` being [pyspark.sql.session.SparkSession](../pyspark/sql/SparkSession.md) 9 | * `sql` being [SparkSession.sql](../pyspark/sql/SparkSession.md#sql) 10 | * `sqlContext` and `sqlCtx` for compatibility 11 | -------------------------------------------------------------------------------- /docs/pyspark/sql/.pages: -------------------------------------------------------------------------------- 1 | title: pyspark.sql 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/pyspark/sql/SparkSession.Builder.md: -------------------------------------------------------------------------------- 1 | # SparkSession.Builder 2 | 3 | ## Creating Instance 4 | 5 | `Builder` takes no arguments to be created. 6 | 7 | `Builder` is created when: 8 | 9 | * `SparkSession` is requested for [one](SparkSession.md#builder) 10 | 11 | ## getOrCreate { #getOrCreate } 12 | 13 | ```py 14 | getOrCreate( 15 | self) -> "SparkSession" 16 | ``` 17 | 18 | With `SPARK_REMOTE` environment variable or `spark.remote` configuration property defined, `getOrCreate`...FIXME 19 | 20 | `getOrCreate` [_instantiatedSession](SparkSession.md#_instantiatedSession). 21 | 22 | Unless `SparkSession` is already created, `getOrCreate` creates [one](SparkSession.md). 23 | -------------------------------------------------------------------------------- /docs/pyspark/sql/SparkSession.md: -------------------------------------------------------------------------------- 1 | --- 2 | tags: 3 | - Python 4 | --- 5 | 6 | # SparkSession 7 | 8 | `SparkSession` is a Python class in [pyspark.sql.session](session.md) module. 9 | 10 | ```py 11 | from pyspark.sql.session import SparkSession 12 | ``` 13 | 14 | ## SparkConversionMixin { #SparkConversionMixin } 15 | 16 | `SparkSession` uses [SparkConversionMixin](../../sql/SparkConversionMixin.md) (for pandas to Spark conversion). 17 | 18 | ## Creating Instance 19 | 20 | `SparkSession` takes the following to be created: 21 | 22 | * [SparkContext](../../SparkContext.md) 23 | * `SparkSession` (`Optional[JavaObject]`) 24 | * Options 25 | 26 | While being created, `SparkSession` gets access to [_jsc](#_jsc) and [_jvm](#_jvm) using the given [SparkContext](#_sc). 27 | 28 | !!! note 29 | It is expected that [_jvm](../../SparkContext.md#_jvm) is defined (or an exception is thrown). 30 | 31 | Unless the given [SparkSession](#jsparkSession) is defined, `SparkSession` gets one from the [_jvm](../../SparkContext.md#_jvm). 32 | 33 | `SparkSession` [_monkey_patch_RDD](#_monkey_patch_RDD). 34 | 35 | `SparkSession` [install_exception_handler](#install_exception_handler). 
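For reference, user code normally goes through the [Builder](SparkSession.Builder.md) rather than this constructor. A standard usage example (assuming a local Spark installation):

```py
from pyspark.sql import SparkSession

spark = (SparkSession.builder
    .master("local[*]")
    .appName("builder-demo")
    .getOrCreate())

spark.range(3).show()
spark.stop()
```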
36 | 37 | --- 38 | 39 | `SparkSession` is created when: 40 | 41 | * `SparkSession.Builder` is requested to [get or create one](SparkSession.Builder.md#getOrCreate) 42 | * `SparkSession` is requested to [get an active SparkSession](#getActiveSession) 43 | 44 | ### Java SparkContext { #_jsc } 45 | 46 | ```py 47 | _jsc: JavaObject 48 | ``` 49 | 50 | `_jsc` is a Java `SparkContext` ([Spark Core]({{ book.spark_core }}/SparkContext)) that is created through Py4J. 51 | 52 | ??? note "JavaObject" 53 | `JavaObject` ([Py4J]({{ py4j.docs }}/py4j_java_gateway.html#javaobject)) represents a Java object from which you can call methods or access fields. 54 | 55 | `_jsc` is initialized when `SparkSession` is [created](#creating-instance) to be the [_jsc](../../SparkContext.md#_jsc) of the given [SparkContext](#_sc). 56 | 57 | `_jsc` is used (among the other internal uses) when: 58 | 59 | * `SCCallSiteSync` is requested to `__enter__` and `__exit__` 60 | 61 | ### py4j JVMView { #_jvm } 62 | 63 | ```py 64 | _jvm: ClassVar[Optional[JVMView]] 65 | ``` 66 | 67 | ??? note "JVMView" 68 | `JVMView` ([Py4J]({{ py4j.docs }}/py4j_java_gateway.html#jvmview)) that allows access to the Java Virtual Machine of a `JavaGateway`. 69 | 70 | `JVMView` can be used to reference static members (fields and methods) and to call constructors. 71 | 72 | From [py4j.JVMView]({{ py4j.javadoc }}/py4j/JVMView.html) javadoc: 73 | 74 | > A JVM view keeps track of imports and import searches. A Python client can have multiple JVM views (e.g., one for each module) so that imports in one view do not conflict with imports from other views. 75 | 76 | `_jvm` is initialized when `SparkSession` is [created](#creating-instance) to be the [_jvm](../../SparkContext.md#_jvm) of the given [SparkContext](#_sc). 77 | 78 | `_jvm` must be defined when `SparkSession` is [created](#creating-instance) or an `AssertionError` is thrown. 79 | 80 | `_jvm` is "cleared" (_stopped_) in [stop](#stop). 81 | 82 | `_jvm` is used (among the other internal uses) when: 83 | 84 | * `ChannelBuilder` is requested to `default_port` 85 | * `InternalFrame` is requested to `attach_distributed_column` 86 | * `DataFrameReader` is requested to `csv` and `json` 87 | * `pyspark.pandas.spark.functions.py` module is requested to `_call_udf` and `_make_arguments` 88 | * `SparkConversionMixin` is requested to [_create_from_pandas_with_arrow](../../sql/SparkConversionMixin.md#_create_from_pandas_with_arrow) 89 | * `SparkSession` is requested to [_create_dataframe](#_create_dataframe) 90 | 91 | ```text 92 | >>> type(spark) 93 | 94 | 95 | >>> type(spark._jvm) 96 | 97 | ``` 98 | 99 | ## Creating Builder { #builder } 100 | 101 | ```py 102 | @classproperty 103 | builder( 104 | cls) -> Builder 105 | ``` 106 | 107 | ??? note "`@classproperty` Decorator" 108 | `builder` is a `@classproperty` that is PySpark-specific to mimic how [@classmethod]({{ python.docs }}/library/functions.html#classmethod) and [@property]({{ python.docs }}/library/functions.html#property) should work together. 109 | 110 | `builder` creates a new [SparkSession.Builder](SparkSession.Builder.md). 111 | 112 | ## \_\_enter__ 113 | 114 | ```py 115 | __enter__( 116 | self) -> "SparkSession" 117 | ``` 118 | 119 | ??? note "Special Method" 120 | Enables `with SparkSession.builder.(...).getOrCreate() as session:` syntax. 121 | 122 | Learn more: 123 | 124 | 1. [PEP 343 – The "with" Statement]({{ python.peps }}/pep-0343/) 125 | 1. [3.3.9. 
With Statement Context Managers]({{ python.docs }}/reference/datamodel.html#with-statement-context-managers) 126 | 1. [Context Managers and Python's with Statement]({{ python.realpython }}/python-with-statement/) 127 | 128 | `__enter__` returns `self`. 129 | 130 | ## \_\_exit__ 131 | 132 | ```py 133 | __exit__( 134 | self, 135 | exc_type: Optional[Type[BaseException]], 136 | exc_val: Optional[BaseException], 137 | exc_tb: Optional[TracebackType], 138 | ) -> None 139 | ``` 140 | 141 | ??? note "Special Method" 142 | Enables `with SparkSession.builder.(...).getOrCreate() as session:` syntax. 143 | 144 | Learn more: 145 | 146 | 1. [PEP 343 – The "with" Statement]({{ python.peps }}/pep-0343/) 147 | 1. [3.3.9. With Statement Context Managers]({{ python.docs }}/reference/datamodel.html#with-statement-context-managers) 148 | 1. [Context Managers and Python's with Statement]({{ python.realpython }}/python-with-statement/) 149 | 150 | `__exit__` [stop](#stop) this `SparkSession` (which is exactly what `__exit__` is supposed to do with resource manager once they're out of scope and resources should be released). 151 | 152 | ## _create_shell_session { #_create_shell_session } 153 | 154 | ```py 155 | @staticmethod 156 | _create_shell_session() -> "SparkSession" 157 | ``` 158 | 159 | ??? note "`@staticmethod`" 160 | Learn more in [Python Documentation]({{ python.docs }}/library/functions.html#staticmethod). 161 | 162 | `_create_shell_session`...FIXME 163 | 164 | --- 165 | 166 | `_create_shell_session` is used when: 167 | 168 | * [pyspark/shell.py](../shell.md) module is imported 169 | 170 | ## Executing SQL Statement { #sql } 171 | 172 | ```py 173 | sql( 174 | self, 175 | sqlQuery: str, 176 | args: Optional[Dict[str, Any]] = None, 177 | **kwargs: Any) -> DataFrame 178 | ``` 179 | 180 | `sql` creates a [DataFrame](../../sql/DataFrame.md) with the `sqlQuery` query executed. 181 | 182 | `sql` uses `SQLStringFormatter` to `format` the given `sqlQuery` with the `kwargs`, if defined. 183 | -------------------------------------------------------------------------------- /docs/pyspark/sql/UserDefinedFunction.md: -------------------------------------------------------------------------------- 1 | # UserDefinedFunction 2 | 3 | `UserDefinedFunction` is a Python class in [pyspark.sql.udf](udf.md) module. 4 | 5 | ```py 6 | from pyspark.sql.udf import UserDefinedFunction 7 | ``` 8 | 9 | ## Creating Instance 10 | 11 | `UserDefinedFunction` takes the following to be created: 12 | 13 | * Function (`Callable`) 14 | * Return Type (default: `StringType`) 15 | * Name (default: `None`) 16 | * Eval Type (default: [SQL_BATCHED_UDF](../../sql/PythonEvalType.md#SQL_BATCHED_UDF)) 17 | * `deterministic` flag (default: `True`) 18 | 19 | `UserDefinedFunction` is created when: 20 | 21 | * [_create_udf](udf.md#_create_udf) (from `pyspark.sql.udf` module) is executed 22 | 23 | ### _judf_placeholder { #_judf_placeholder } 24 | 25 | `UserDefinedFunction` initializes `_judf_placeholder` to be `None` when [created](#creating-instance). 26 | 27 | `_judf_placeholder` is [_create_judf](#_create_judf) of the [func](#func) when `UserDefinedFunction` is requested to [_judf](#_judf). 28 | 29 | `_judf_placeholder` is available as [_judf](#_judf). 30 | 31 | `_judf_placeholder` can be reset (`None`) when `UserDefinedFunction` is requested to [asNondeterministic](#asNondeterministic). 32 | 33 | ## \_\_call__ 34 | 35 | ```py 36 | __call__( 37 | self, 38 | *cols: "ColumnOrName") -> Column 39 | ``` 40 | 41 | ??? 
note "Emulating callable objects" 42 | Instances of arbitrary classes can be made callable by defining a `__call__()` method in their class. 43 | 44 | `__call__` is called when an instance is "called" as a function. 45 | 46 | Learn more in [3.3.6. Emulating callable objects]({{ python.docs }}/reference/datamodel.html?#object.__call__). 47 | 48 | With `profiler_collector` enabled, `__call__`...FIXME 49 | 50 | Otherwise, `__call__` assigns the [_judf](#_judf) as the [judf](#judf) and creates a [PythonUDF](../../sql/PythonUDF.md). 51 | 52 | In the end, `__call__` creates a `Column` with the `PythonUDF`. 53 | 54 | ## _judf { #_judf } 55 | 56 | ```py 57 | @property 58 | _judf( 59 | self) -> JavaObject 60 | ``` 61 | 62 | `_judf` [_create_judf](#_create_judf) for the [func](#func) unless the [_judf_placeholder](#_judf_placeholder) has already been initialized. 63 | 64 | In the end, `_judf` returns the [_judf_placeholder](#_judf_placeholder). 65 | 66 | --- 67 | 68 | `_judf` is used when: 69 | 70 | * `UserDefinedFunction` is requested to [\_\_call__](#__call__) 71 | * `UDFRegistration` is requested to [register](../../sql/UDFRegistration.md#register) 72 | 73 | ## Creating Java UserDefinedPythonFunction { #_create_judf } 74 | 75 | ```py 76 | _create_judf( 77 | self, 78 | func: Callable[..., Any]) -> JavaObject 79 | ``` 80 | 81 | `_create_judf` uses the [_jvm](../../SparkContext.md#_jvm) bridge to create a [UserDefinedPythonFunction](../../sql/UserDefinedPythonFunction.md) with the following: 82 | 83 | * [_name](#_name) 84 | * [SimplePythonFunction](udf.md#_wrap_function) (with a pickled version) of the given `func` and the [returnType](#returnType) 85 | * The [returnType](#returnType) (parsed from JSON format to Java) 86 | * [evalType](#evalType) 87 | * [deterministic](#deterministic) 88 | 89 | --- 90 | 91 | `_create_judf` is used when: 92 | 93 | * `UserDefinedFunction` is requested to [\_\_call__](#__call__) and [_judf](#_judf) 94 | -------------------------------------------------------------------------------- /docs/pyspark/sql/dataframe.md: -------------------------------------------------------------------------------- 1 | # dataframe.py 2 | 3 | `dataframe` module (in `pyspark.sql` package) defines [DataFrame et al.](#__all__) 4 | 5 | ```py 6 | from pyspark.sql.dataframe import * 7 | ``` 8 | 9 | ## \_\_all__ 10 | 11 | ??? note "import *" 12 | The `import` statement uses the following convention: if a package’s `__init__.py` code defines a list named `__all__`, it is taken to be the list of module names that should be imported when `from package import *` is encountered. 13 | 14 | Learn more in [6.4.1. Importing * From a Package]({{ python.docs }}/tutorial/modules.html#importing-from-a-package). 15 | 16 | * [DataFrame](../../sql/DataFrame.md) 17 | * `DataFrameNaFunctions` 18 | * `DataFrameStatFunctions` 19 | -------------------------------------------------------------------------------- /docs/pyspark/sql/functions.md: -------------------------------------------------------------------------------- 1 | # functions.py 2 | 3 | `functions.py` module belongs to `pyspark.sql` package. 
4 | 5 | ```py 6 | from pyspark.sql.functions import udf 7 | ``` 8 | 9 | ## udf 10 | 11 | ```py 12 | udf( 13 | f: Optional[Union[Callable[..., Any], "DataTypeOrString"]] = None, 14 | returnType: "DataTypeOrString" = StringType(), 15 | ) -> Union["UserDefinedFunctionLike", Callable[[Callable[..., Any]], "UserDefinedFunctionLike"]] 16 | ``` 17 | 18 | `udf` [_create_py_udf](udf.md#_create_py_udf) with [SQL_BATCHED_UDF](../../sql/PythonEvalType.md#SQL_BATCHED_UDF) eval type. 19 | -------------------------------------------------------------------------------- /docs/pyspark/sql/group.md: -------------------------------------------------------------------------------- 1 | # group.py 2 | 3 | `group` module (in `pyspark.sql` package) defines [GroupedData](../../sql/GroupedData.md). 4 | 5 | ```py 6 | from pyspark.sql.group import * 7 | ``` 8 | 9 | ## \_\_all__ 10 | 11 | ??? note "import *" 12 | The `import` statement uses the following convention: if a package’s `__init__.py` code defines a list named `__all__`, it is taken to be the list of module names that should be imported when `from package import *` is encountered. 13 | 14 | Learn more in [6.4.1. Importing * From a Package]({{ python.docs }}/tutorial/modules.html#importing-from-a-package). 15 | 16 | * [GroupedData](../../sql/GroupedData.md) 17 | -------------------------------------------------------------------------------- /docs/pyspark/sql/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: pyspark.sql 3 | --- 4 | 5 | # pyspark.sql Package 6 | 7 | ```py 8 | import pyspark.sql 9 | ``` 10 | 11 | ## \_\_all__ 12 | 13 | ??? note "import *" 14 | The `import` statement uses the following convention: if a package’s `__init__.py` code defines a list named `__all__`, it is taken to be the list of module names that should be imported when `from package import *` is encountered. 15 | 16 | Learn more in [6.4.1. Importing * From a Package]({{ python.docs }}/tutorial/modules.html#importing-from-a-package). 17 | 18 | * [SparkSession](SparkSession.md) 19 | * `SQLContext` 20 | * `HiveContext` 21 | * [UDFRegistration](../../sql/UDFRegistration.md) 22 | * `DataFrame` 23 | * [GroupedData](../../sql/GroupedData.md) 24 | * `Column` 25 | * `Catalog` 26 | * [Observation](../../sql/Observation.md) 27 | * `Row` 28 | * `DataFrameNaFunctions` 29 | * `DataFrameStatFunctions` 30 | * `Window` 31 | * `WindowSpec` 32 | * `DataFrameReader` 33 | * `DataFrameWriter` 34 | * `DataFrameWriterV2` 35 | * `PandasCogroupedOps` 36 | -------------------------------------------------------------------------------- /docs/pyspark/sql/pandas/PandasUDFType.md: -------------------------------------------------------------------------------- 1 | # PandasUDFType 2 | 3 | !!! warning "Deprecation Notice" 4 | As of [PySpark 3.0.0](https://issues.apache.org/jira/browse/SPARK-28264), `PandasUDFType` is deprecated in favour of Python type hints. 5 | 6 | `PandasUDFType` is the `functionType` of [pandas_udf](../../../pandas-udfs/index.md#pandas_udf) for Python methods to be used as [pandas UDFs](../../../pandas-udfs/index.md) (with the types matching [PythonEvalType](../../../sql/PythonEvalType.md) on the JVM/Scala side). 
7 | 8 | PandasUDFType | PythonEvalType 9 | --------------|--------------- 10 | `GROUPED_AGG` | [SQL_GROUPED_AGG_PANDAS_UDF](../../../sql/PythonEvalType.md#SQL_GROUPED_AGG_PANDAS_UDF) 11 | `GROUPED_MAP` | [SQL_GROUPED_MAP_PANDAS_UDF](../../../sql/PythonEvalType.md#SQL_GROUPED_MAP_PANDAS_UDF) 12 | `SCALAR` | [SQL_SCALAR_PANDAS_UDF](../../../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_UDF) 13 | `SCALAR_ITER` | [SQL_SCALAR_PANDAS_ITER_UDF](../../../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_ITER_UDF) 14 | -------------------------------------------------------------------------------- /docs/pyspark/sql/pandas/functions.md: -------------------------------------------------------------------------------- 1 | # functions.py 2 | 3 | `functions.py` defines [pandas_udf](#pandas_udf) for [pandas user-defined function](../../../pandas-udfs/index.md). 4 | 5 | `functions.py` is part of `pyspark.sql.pandas` package. 6 | 7 | ```python 8 | from pyspark.sql.functions import pandas_udf 9 | ``` 10 | 11 | ## pandas_udf { #pandas_udf } 12 | 13 | ```python 14 | pandas_udf( 15 | f=None, 16 | returnType=None, 17 | functionType=None) 18 | ``` 19 | 20 | `pandas_udf` creates a [pandas user-defined function](../../../pandas-udfs/index.md). 21 | 22 | `pandas_udf` [_create_pandas_udf](#_create_pandas_udf) (possibly creating a partial function with `functools.partial` ([Python]({{ python.docs }}/library/functools.html#functools.partial)) when used as a [decorator](#pandas_udf_decorator)). 23 | 24 | ### Decorator { #pandas_udf_decorator } 25 | 26 | `pandas_udf` can and usually is used as a Python decorator with two positional arguments for the return and function types. 27 | 28 | ```py 29 | @pandas_udf(returnType, functionType) 30 | ``` 31 | 32 | ### returnType { #pandas_udf_returnType } 33 | 34 | `returnType` can be one of the following: 35 | 36 | * `pyspark.sql.types.DataType` 37 | * A DDL-formatted type string 38 | 39 | ### functionType { #pandas_udf_functionType } 40 | 41 | `functionType` must be one the values from `PandasUDFType`: 42 | 43 | * [SQL_SCALAR_PANDAS_UDF](../../../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_UDF) 44 | * [SQL_SCALAR_PANDAS_ITER_UDF](../../../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_ITER_UDF) 45 | * [SQL_GROUPED_MAP_PANDAS_UDF](../../../sql/PythonEvalType.md#SQL_GROUPED_MAP_PANDAS_UDF) 46 | * [SQL_GROUPED_AGG_PANDAS_UDF](../../../sql/PythonEvalType.md#SQL_GROUPED_AGG_PANDAS_UDF) 47 | * [SQL_MAP_PANDAS_ITER_UDF](../../../sql/PythonEvalType.md#SQL_MAP_PANDAS_ITER_UDF) 48 | * [SQL_COGROUPED_MAP_PANDAS_UDF](../../../sql/PythonEvalType.md#SQL_COGROUPED_MAP_PANDAS_UDF) 49 | 50 | ### _create_pandas_udf { #_create_pandas_udf } 51 | 52 | ```py 53 | _create_pandas_udf( 54 | f, 55 | returnType, 56 | evalType) 57 | ``` 58 | 59 | `_create_pandas_udf`...FIXME 60 | -------------------------------------------------------------------------------- /docs/pyspark/sql/pandas/index.md: -------------------------------------------------------------------------------- 1 | # pyspark.sql.pandas Package 2 | 3 | `pyspark.sql.pandas` package is...FIXME 4 | -------------------------------------------------------------------------------- /docs/pyspark/sql/session.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: session.py 3 | --- 4 | 5 | # pyspark.sql.session Module 6 | 7 | `session` module (in `pyspark.sql` package) defines [SparkSession](SparkSession.md). 8 | 9 | ```py 10 | from pyspark.sql.session import * 11 | ``` 12 | 13 | ## \_\_all__ 14 | 15 | ??? 
note "import *" 16 | The `import` statement uses the following convention: if a package’s `__init__.py` code defines a list named `__all__`, it is taken to be the list of module names that should be imported when `from package import *` is encountered. 17 | 18 | Learn more in [6.4.1. Importing * From a Package]({{ python.docs }}/tutorial/modules.html#importing-from-a-package). 19 | 20 | * [SparkSession](SparkSession.md) 21 | -------------------------------------------------------------------------------- /docs/pyspark/sql/udf.md: -------------------------------------------------------------------------------- 1 | # udf.py 2 | 3 | `udf` module (in `pyspark.sql` package) defines [UDFRegistration](../../sql/UDFRegistration.md). 4 | 5 | ```py 6 | from pyspark.sql.udf import * 7 | ``` 8 | 9 | ## \_\_all__ 10 | 11 | ??? note "import *" 12 | The `import` statement uses the following convention: if a package’s `__init__.py` code defines a list named `__all__`, it is taken to be the list of module names that should be imported when `from package import *` is encountered. 13 | 14 | Learn more in [6.4.1. Importing * From a Package]({{ python.docs }}/tutorial/modules.html#importing-from-a-package). 15 | 16 | * [UDFRegistration](../../sql/UDFRegistration.md) 17 | 18 | ## _create_udf { #_create_udf } 19 | 20 | ```py 21 | _create_udf( 22 | f: Callable[..., Any], 23 | returnType: "DataTypeOrString", 24 | evalType: int, 25 | name: Optional[str] = None, 26 | deterministic: bool = True) -> "UserDefinedFunctionLike" 27 | ``` 28 | 29 | `_create_udf` creates a [UserDefinedFunction](UserDefinedFunction.md) (with the name of the object to be the name of function `f`). 30 | 31 | --- 32 | 33 | `_create_udf` is used when: 34 | 35 | * `UDFRegistration` is requested to [register](../../sql/UDFRegistration.md#register) 36 | * [udf](functions.md#udf) is used (and [_create_py_udf](#_create_py_udf) is executed) 37 | * [pandas_udf](pandas/functions.md#pandas_udf) (from `pyspark.sql.pandas`) is executed 38 | 39 | ## _create_py_udf { #_create_py_udf } 40 | 41 | ```py 42 | _create_py_udf( 43 | f: Callable[..., Any], 44 | returnType: "DataTypeOrString", 45 | evalType: int, 46 | ) -> "UserDefinedFunctionLike" 47 | ``` 48 | 49 | `_create_py_udf`...FIXME 50 | 51 | --- 52 | 53 | `_create_py_udf` is used when: 54 | 55 | * [udf](functions.md#udf) is executed 56 | 57 | ## Creating SimplePythonFunction for (Pickled) Python Function { #_wrap_function } 58 | 59 | ```py 60 | _wrap_function( 61 | sc: SparkContext, 62 | func: Callable[..., Any], 63 | returnType: "DataTypeOrString") -> JavaObject 64 | ``` 65 | 66 | `_wrap_function` creates a `command` tuple with the given `func` and `returnType`. 
67 | 68 | `_wrap_function` [_prepare_for_python_RDD](../rdd.md#_prepare_for_python_RDD) for the `command` tuple that builds the input for a [SimplePythonFunction](../../SimplePythonFunction.md): 69 | 70 | * `pickled_command` byte array 71 | * `env` 72 | * `includes` 73 | * `broadcast_vars` 74 | 75 | In the end, `_wrap_function` creates a [SimplePythonFunction](../../SimplePythonFunction.md) with the above and the following from the given [SparkContext](../../SparkContext.md): 76 | 77 | * [pythonExec](../../SparkContext.md#pythonExec) 78 | * [pythonVer](../../SparkContext.md#pythonVer) 79 | * [_javaAccumulator](../../SparkContext.md#_javaAccumulator) 80 | 81 | --- 82 | 83 | `_wrap_function` is used when: 84 | 85 | * `UserDefinedFunction` is requested to [_create_judf](UserDefinedFunction.md#_create_judf) 86 | -------------------------------------------------------------------------------- /docs/pyspark/worker.md: -------------------------------------------------------------------------------- 1 | # worker.py 2 | 3 | `worker.py` is a Python module in [pyspark](index.md) package. 4 | 5 | ```py 6 | from pyspark import worker 7 | ``` 8 | 9 | ## Entry Point 10 | 11 | ??? note "Top-Level Code Environment" 12 | If the module is executed in the top-level code environment (and not initialized from an import statement), its `__name__` is set to the string `__main__`. 13 | 14 | Sometimes "top-level code" is called an _entry point_ to the application. 15 | 16 | Learn more in the [\_\_main__ — Top-level code environment]({{ python.docs }}/library/__main__.html). 17 | 18 | When executed in the top-level code environment (e.g., `python3 -m`), `worker.py` reads the following environment variables: 19 | 20 | Environment Variable | Description 21 | ---------------------|------------ 22 | `PYTHON_WORKER_FACTORY_PORT` | Port the JVM listens to 23 | `PYTHON_WORKER_FACTORY_SECRET` | Authorization Secret 24 | 25 | `worker.py` [local_connect_and_auth](#local_connect_and_auth) (that gives a `sock_file`). 26 | 27 | `worker.py` [write_int](#write_int) with the PID of the Python process to the `sock_file`. 28 | 29 | In the end, `worker.py` [main](#main) (with the `sock_file` and `sock_file` for the input and output files). 30 | 31 | ## main { #main } 32 | 33 | ```py 34 | main( 35 | infile, 36 | outfile) 37 | ``` 38 | 39 | `main` reads `PYTHON_FAULTHANDLER_DIR` environment variable. 40 | 41 | `main` does a lot of initializations. 42 | 43 | ??? note "FIXME Review the initializations" 44 | 45 | `main` [read_udfs](#read_udfs) that gives the following: 46 | 47 | * `func` 48 | * `profiler` 49 | * `deserializer` 50 | * `serializer` 51 | 52 | requests the `deserializer` to `load_stream` from the given `infile` and executes `func` (with the `split_index` and the deserialized stream). 53 | 54 | `main` does a lot of post-processings. 55 | 56 | ??? 
note "FIXME Review the post-processings" 57 | 58 | ## read_udfs { #read_udfs } 59 | 60 | ```py 61 | read_udfs( 62 | pickleSer, 63 | infile, 64 | eval_type) 65 | ``` 66 | 67 | `read_udfs`...FIXME 68 | 69 | ### read_single_udf { #read_single_udf } 70 | 71 | ```py 72 | read_single_udf( 73 | pickleSer, 74 | infile, 75 | eval_type, 76 | runner_conf, 77 | udf_index) 78 | ``` 79 | 80 | `read_single_udf`...FIXME 81 | -------------------------------------------------------------------------------- /docs/python-api.md: -------------------------------------------------------------------------------- 1 | # Python API 2 | 3 | [TAGS] 4 | -------------------------------------------------------------------------------- /docs/pytorch-distributed/.pages: -------------------------------------------------------------------------------- 1 | title: Distributed Training using PyTorch 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/pytorch-distributed/TorchDistributor.md: -------------------------------------------------------------------------------- 1 | # TorchDistributor 2 | 3 | `TorchDistributor` is a [Distributor](../ml/Distributor.md) to run PyTorch's [torch.distributed.run]({{ pytorch.github }}/blob/main/torch/distributed/run.py) module on Apache Spark clusters. 4 | 5 | `TorchDistributor` is a PySpark translation of [torchrun]({{ pytorch.docs }}/elastic/run.html) (from [Torch Distributed Elastic]({{ pytorch.docs }}/distributed.elastic.html)). 6 | 7 | ## Demo 8 | 9 | ```py 10 | from pyspark.ml.torch.distributor import TorchDistributor 11 | 12 | distributor = TorchDistributor( 13 | num_processes=1, 14 | local_mode=False, 15 | use_gpu=False) 16 | ``` 17 | 18 | ```py 19 | # Use a path to a training script 20 | # and variable-length kwargs 21 | distributor.run( 22 | "train.py", 23 | "--learning-rate=1e-3", 24 | "--batch-size=64", 25 | "--my-key=my-value") 26 | 27 | # Started local training with 1 processes 28 | # NOTE: Redirects are currently not supported in Windows or MacOs. 29 | # Finished local training with 1 processes 30 | ``` 31 | 32 | ```py 33 | # Use a Callable (function) 34 | # The number of positional arguments is the number of kwargs 35 | def train(a, b, c): 36 | print(f"Got a={a}, b={b}, c={c}") 37 | return 'success' 38 | 39 | distributor.run( 40 | train, 41 | "--learning-rate=1e-3", 42 | "--batch-size=64", 43 | "--my-key=my-value") 44 | 45 | # Started distributed training with 1 executor proceses 46 | # NOTE: Redirects are currently not supported in Windows or MacOs. (0 + 1) / 1] 47 | # NOTE: Redirects are currently not supported in Windows or MacOs. 48 | # Got a=--learning-rate=1e-3, b=--batch-size=64, c=--my-key=my-value 49 | # Got a=--learning-rate=1e-3, b=--batch-size=64, c=--my-key=my-value 50 | # Finished distributed training with 1 executor proceses 51 | # 'success' 52 | ``` 53 | 54 | ## Running Distributed Training { #run } 55 | 56 | ```py 57 | run( 58 | self, 59 | train_object: Union[Callable, str], 60 | *args: Any) -> Optional[Any] 61 | ``` 62 | 63 | `run` determines what to run (e.g., a function or a script based on the given `train_object`). 64 | 65 | * With a function, `run` uses [_run_training_on_pytorch_function](#_run_training_on_pytorch_function) 66 | * With a script, `run` uses [_run_training_on_pytorch_file](#_run_training_on_pytorch_file) 67 | 68 | In the end, `run` runs a local or distributed training. 
69 | 70 | * In [local mode](../ml/Distributor.md#local_mode), `run` [runs local training](#_run_local_training) 71 | * In non-[local mode](../ml/Distributor.md#local_mode), `run` [runs distributed training](#_run_distributed_training) 72 | 73 | ### Local Training { #_run_local_training } 74 | 75 | ```py 76 | _run_local_training( 77 | self, 78 | framework_wrapper_fn: Callable, 79 | train_object: Union[Callable, str], 80 | *args: Any, 81 | ) -> Optional[Any] 82 | ``` 83 | 84 | `_run_local_training` looks up `CUDA_VISIBLE_DEVICES` among the environment variables. 85 | 86 | With [use_gpu](../ml/Distributor.md#use_gpu), `_run_local_training`...FIXME 87 | 88 | `_run_local_training` prints out the following INFO message to the logs: 89 | 90 | ```text 91 | Started local training with [num_processes] processes 92 | ``` 93 | 94 | `_run_local_training` executes the given `framework_wrapper_fn` function (with the [input_params](#input_params), the given `train_object` and the `args`). 95 | 96 | In the end, `_run_local_training` prints out the following INFO message to the logs: 97 | 98 | ```text 99 | Finished local training with [num_processes] processes 100 | ``` 101 | 102 | ### Distributed Training { #_run_distributed_training } 103 | 104 | ```py 105 | _run_distributed_training( 106 | self, 107 | framework_wrapper_fn: Callable, 108 | train_object: Union[Callable, str], 109 | *args: Any, 110 | ) -> Optional[Any] 111 | ``` 112 | 113 | `_run_distributed_training`...FIXME 114 | 115 | ### _run_training_on_pytorch_function { #_run_training_on_pytorch_function } 116 | 117 | ```py 118 | _run_training_on_pytorch_function( 119 | input_params: Dict[str, Any], 120 | train_fn: Callable, 121 | *args: Any 122 | ) -> Any 123 | ``` 124 | 125 | `_run_training_on_pytorch_function` [prepares train and output files](#_setup_files). 126 | 127 | `_run_training_on_pytorch_function`...FIXME 128 | 129 | ### Setting Up Files { #_setup_files } 130 | 131 | ```py 132 | # @contextmanager 133 | _setup_files( 134 | train_fn: Callable, 135 | *args: Any 136 | ) -> Generator[Tuple[str, str], None, None] 137 | ``` 138 | 139 | `_setup_files` gives the paths of a TorchRun train file and `output.pickle` output file. 140 | 141 | --- 142 | 143 | `_setup_files` [creates a save directory](#_create_save_dir). 144 | 145 | `_setup_files` [saves train_fn function](#_save_pickled_function) to the save directory (that gives a `pickle_file_path`). 146 | 147 | `_setup_files` uses the save directory and `output.pickle` name for the output file path. 148 | 149 | `_setup_files` [creates a torchrun_train_file](#_create_torchrun_train_file) with the following: 150 | 151 | * [Save directory](#_create_save_dir) 152 | * `pickle_file_path` 153 | * `output.pickle` output file path 154 | 155 | In the end, `_setup_files` yields (_gives_) the `torchrun_train_file` and the `output.pickle` output file path. 
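For intuition, here is a minimal sketch of the pattern `_setup_files` follows, not the actual PySpark implementation (the helper name `setup_files_sketch` and the file names are made up): pickle the function with its arguments, point at a torchrun-compatible train file and an `output.pickle` path in the same directory, and yield both so the caller can clean up afterwards.

```py
import os
import shutil
import tempfile
from contextlib import contextmanager
from typing import Any, Callable, Generator, Tuple

import cloudpickle


@contextmanager
def setup_files_sketch(
    train_fn: Callable, *args: Any
) -> Generator[Tuple[str, str], None, None]:
    save_dir = tempfile.mkdtemp()  # stands in for _create_save_dir
    pickle_file_path = os.path.join(save_dir, "fn.pickle")
    with open(pickle_file_path, "wb") as f:
        # what the generated train file later loads and executes
        cloudpickle.dump((train_fn, args), f)
    output_file_path = os.path.join(save_dir, "output.pickle")
    torchrun_train_file = os.path.join(save_dir, "train.py")  # see the next section
    try:
        yield torchrun_train_file, output_file_path
    finally:
        shutil.rmtree(save_dir, ignore_errors=True)
```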
156 | 157 | ### Creating TorchRun Train File { #_create_torchrun_train_file } 158 | 159 | ```py 160 | _create_torchrun_train_file( 161 | save_dir_path: str, 162 | pickle_file_path: str, 163 | output_file_path: str 164 | ) -> str 165 | ``` 166 | 167 | `_create_torchrun_train_file` creates `train.py` in the given `save_dir_path` with the following content (based on the given `pickle_file_path` and the `output_file_path`): 168 | 169 | ```py 170 | import cloudpickle 171 | import os 172 | 173 | if __name__ == "__main__": 174 | with open("[pickle_file_path]", "rb") as f: 175 | train_fn, args = cloudpickle.load(f) 176 | output = train_fn(*args) 177 | with open("[output_file_path]", "wb") as f: 178 | cloudpickle.dump(output, f) 179 | ``` 180 | 181 | ## _run_training_on_pytorch_file { #_run_training_on_pytorch_file } 182 | 183 | ```py 184 | _run_training_on_pytorch_file( 185 | input_params: Dict[str, Any], 186 | train_path: str, 187 | *args: Any 188 | ) -> None 189 | ``` 190 | 191 | `_run_training_on_pytorch_file` looks up the `log_streaming_client` in the given `input_params` (or assumes `None`). 192 | 193 | !!! note "FIXME What's log_streaming_client?" 194 | 195 | `_run_training_on_pytorch_file` [creates torchrun command](#_create_torchrun_command). 196 | 197 | `_run_training_on_pytorch_file` [executes the command](#_execute_command). 198 | 199 | ### _create_torchrun_command { #_create_torchrun_command } 200 | 201 | ```py 202 | _create_torchrun_command( 203 | input_params: Dict[str, Any], 204 | path_to_train_file: str, 205 | *args: Any 206 | ) -> List[str] 207 | ``` 208 | 209 | `_create_torchrun_command` takes the value of the following parameters (from the given `input_params`): 210 | 211 | * `local_mode` 212 | * `num_processes` 213 | 214 | `_create_torchrun_command` determines the `torchrun_args` and `processes_per_node` based on `local_mode`. 215 | 216 | local_mode | torchrun_args | processes_per_node 217 | -------------|-----------------|--------------------- 218 | `True` |
`--standalone`<br>`--nnodes=1` | `num_processes` (from the given `input_params`)
219 | `False` | `--nnodes=[num_processes]`<br>`--node_rank=[node_rank]`<br>`--rdzv_endpoint=[MASTER_ADDR]:[MASTER_PORT]`<br>`--rdzv_id=0`
| 1 220 | 221 | In the end, `_create_torchrun_command` returns a Python command to execute [torch_run_process_wrapper](torch_run_process_wrapper.md) module (`python -m`) with the following positional arguments: 222 | 223 | * `torchrun_args` 224 | * `--nproc_per_node=[processes_per_node]` 225 | * The given `path_to_train_file` 226 | * The given `args` 227 | -------------------------------------------------------------------------------- /docs/pytorch-distributed/index.md: -------------------------------------------------------------------------------- 1 | # Distributed Training using PyTorch 2 | 3 | PySpark 3.4.0 introduces [TorchDistributor](TorchDistributor.md) for distributed training on Apache Spark clusters using [PyTorch Distributed]({{ pytorch.tutorials }}/beginner/dist_overview.html). 4 | 5 | ## Learn More 6 | 7 | 1. [Distributed training with TorchDistributor](https://docs.databricks.com/machine-learning/train-model/distributed-training/spark-pytorch-distributor.html) 8 | -------------------------------------------------------------------------------- /docs/pytorch-distributed/torch_run_process_wrapper.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: torch_run_process_wrapper 3 | --- 4 | 5 | # torch_run_process_wrapper Module 6 | 7 | `torch_run_process_wrapper` is used as the [torchrun command](TorchDistributor.md#_create_torchrun_command) in [TorchDistributor](TorchDistributor.md). 8 | 9 | `torch_run_process_wrapper` executes `torch.distributed.run` module (using `python -m`). `torch_run_process_wrapper` monitors the child process and prints out the output to the standard output. 10 | -------------------------------------------------------------------------------- /docs/runners/.pages: -------------------------------------------------------------------------------- 1 | title: Python Runners 2 | nav: 3 | - ... 4 | -------------------------------------------------------------------------------- /docs/runners/ArrowPythonRunner.md: -------------------------------------------------------------------------------- 1 | # ArrowPythonRunner 2 | 3 | `ArrowPythonRunner` is a [BasePythonRunner](BasePythonRunner.md) with `Iterator[InternalRow]` input and `ColumnarBatch` (vectorized) output. 4 | 5 | `ArrowPythonRunner` supports `BasicPythonArrowInput` and [BasicPythonArrowOutput](BasicPythonArrowOutput.md). 6 | 7 | ## Creating Instance 8 | 9 | `ArrowPythonRunner` takes the following to be created: 10 | 11 | * `ChainedPythonFunctions`es 12 | * Eval Type 13 | * Argument Offsets 14 | * `Schema` ([Spark SQL]({{ book.spark_sql }}/types/StructType)) 15 | * TimeZone ID 16 | * Worker Configuration 17 | * Performance Metrics 18 | 19 | `ArrowPythonRunner` is created when the following physical operators ([Spark SQL]({{ book.spark_sql }}/physical-operators/)) are executed: 20 | 21 | * [AggregateInPandasExec](../sql/AggregateInPandasExec.md) 22 | * [ArrowEvalPythonExec](../sql/ArrowEvalPythonExec.md) 23 | * `FlatMapGroupsInPandasExec` 24 | * `MapInPandasExec` 25 | * `WindowInPandasExec` 26 | 27 | ## bufferSize { #bufferSize } 28 | 29 | ??? note "BasePythonRunner" 30 | 31 | ```scala 32 | bufferSize: Int 33 | ``` 34 | 35 | `bufferSize` is part of the [BasePythonRunner](BasePythonRunner.md#bufferSize) abstraction. 36 | 37 | `bufferSize` is the value of [spark.sql.execution.pandas.udf.buffer.size](../configuration-properties/index.md#spark.sql.execution.pandas.udf.buffer.size) configuration property. 
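Being a SQL configuration property, it can be tuned on the active session before a pandas UDF query runs. A hedged example (`spark` is an assumed active `SparkSession`; the value is illustrative only, in bytes):

```py
# Buffer size (bytes) for the data stream between the JVM and the Python worker
# used by pandas UDFs; when unset, it falls back to spark.buffer.size.
spark.conf.set("spark.sql.execution.pandas.udf.buffer.size", 2 * 1024 * 1024)
```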
38 | 39 | ## simplifiedTraceback { #simplifiedTraceback } 40 | 41 | ??? note "BasePythonRunner" 42 | 43 | ```scala 44 | simplifiedTraceback: Boolean 45 | ``` 46 | 47 | `simplifiedTraceback` is part of the [BasePythonRunner](BasePythonRunner.md#simplifiedTraceback) abstraction. 48 | 49 | `simplifiedTraceback` is the value of [spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled](../configuration-properties/index.md#spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled) configuration property. 50 | -------------------------------------------------------------------------------- /docs/runners/BasePythonRunner.md: -------------------------------------------------------------------------------- 1 | # BasePythonRunner 2 | 3 | `BasePythonRunner` is an [abstraction](#contract) of [Python Runners](#implementations). 4 | 5 | `BasePythonRunner` is executed as part of Spark tasks (that run on executors). 6 | 7 | ??? note "Scala Definition" 8 | 9 | `BasePythonRunner` is a type constructor in Scala (_generic class_ in Java) with the following definition: 10 | 11 | ```scala 12 | abstract class BasePythonRunner[IN, OUT](...) { 13 | // ... 14 | } 15 | ``` 16 | 17 | `BasePythonRunner` uses `IN` and `OUT` as the name of the types for the input and output values. 18 | 19 | ## Contract 20 | 21 | ### newReaderIterator { #newReaderIterator } 22 | 23 | ```scala 24 | newReaderIterator( 25 | stream: DataInputStream, 26 | writerThread: WriterThread, 27 | startTime: Long, 28 | env: SparkEnv, 29 | worker: Socket, 30 | pid: Option[Int], 31 | releasedOrClosed: AtomicBoolean, 32 | context: TaskContext): Iterator[OUT] 33 | ``` 34 | 35 | See: 36 | 37 | * [PythonRunner](PythonRunner.md#newReaderIterator) 38 | * [PythonUDFRunner](PythonUDFRunner.md#newReaderIterator) 39 | 40 | Used when: 41 | 42 | * `BasePythonRunner` is requested to [compute](#compute) 43 | 44 | ### newWriterThread { #newWriterThread } 45 | 46 | ```scala 47 | newWriterThread( 48 | env: SparkEnv, 49 | worker: Socket, 50 | inputIterator: Iterator[IN], 51 | partitionIndex: Int, 52 | context: TaskContext): WriterThread 53 | ``` 54 | 55 | See: 56 | 57 | * [PythonRunner](PythonRunner.md#newWriterThread) 58 | * [PythonUDFRunner](PythonUDFRunner.md#newWriterThread) 59 | 60 | Used when: 61 | 62 | * `BasePythonRunner` is requested to [compute](#compute) 63 | 64 | ## Implementations 65 | 66 | * `ApplyInPandasWithStatePythonRunner` 67 | * [ArrowPythonRunner](ArrowPythonRunner.md) 68 | * `CoGroupedArrowPythonRunner` 69 | * [PythonRunner](PythonRunner.md) 70 | * [PythonUDFRunner](PythonUDFRunner.md) 71 | 72 | ## Creating Instance 73 | 74 | `BasePythonRunner` takes the following to be created: 75 | 76 | * `ChainedPythonFunctions` 77 | * Eval Type 78 | * Argument Offsets 79 | 80 | `BasePythonRunner` requires that the number of [ChainedPythonFunctions](#funcs) and [Argument Offsets](#argOffsets) are the same. 81 | 82 | !!! note "Abstract Class" 83 | `BasePythonRunner` is an abstract class and cannot be created directly. It is created indirectly for the [concrete BasePythonRunners](#implementations). 84 | 85 | ### accumulator { #accumulator } 86 | 87 | ```scala 88 | accumulator: PythonAccumulatorV2 89 | ``` 90 | 91 | `BasePythonRunner` initializes a registry of a [PythonAccumulatorV2](../PythonAccumulatorV2.md) when [created](#creating-instance) to be the [accumulator](../PythonFunction.md#accumulator) of the head [PythonFunction](../PythonFunction.md) among the given [ChainedPythonFunctions](#funcs). 
92 | 93 | The `PythonAccumulatorV2` is used when `ReaderIterator` is requested to [handleEndOfDataSection](ReaderIterator.md#handleEndOfDataSection) (to update metrics). 94 | 95 | ## Computing Result { #compute } 96 | 97 | ```scala 98 | compute( 99 | inputIterator: Iterator[IN], 100 | partitionIndex: Int, 101 | context: TaskContext): Iterator[OUT] 102 | ``` 103 | 104 | !!! note "Runs on Executors" 105 | `compute` runs on Spark executors. 106 | 107 | `compute` uses the given `TaskContext` to look up the following local properties (if they were specified via `ResourceProfile`): 108 | 109 | * `resource.executor.cores` 110 | * `resource.pyspark.memory` 111 | 112 | `compute` requests the `DiskBlockManager` for the local directories and creates a comma-separated list of them (`localdir`). 113 | 114 | Unless `spark.executorEnv.OMP_NUM_THREADS` is explicitly specified (in the [SparkConf](#conf)), `compute` sets `OMP_NUM_THREADS` (in the [envVars](#envVars)) to be the value of`resource.executor.cores` (if defined). 115 | 116 | `compute` sets the following in the [envVars](#envVars): 117 | 118 | * `SPARK_LOCAL_DIRS` as the local directories of the local `DiskBlockManager` (`localdir`) 119 | 120 | `compute` can optionally define environment variables: 121 | 122 | * `SPARK_REUSE_WORKER` as `1` when `spark.python.worker.reuse` configuration property is enabled 123 | * `SPARK_SIMPLIFIED_TRACEBACK` as `1` when [simplifiedTraceback](#simplifiedTraceback) is enabled 124 | * _others_ 125 | 126 | `compute` requests `SparkEnv` to [createPythonWorker](../SparkEnv.md#createPythonWorker) (for the [pythonExec](#pythonExec) and the [envVars](#envVars)). 127 | 128 | `compute` [creates a new WriterThread](#newWriterThread) (to feed the worker process input from the given `inputIterator`) and starts it. 129 | 130 | `compute` creates and starts a `WriterMonitorThread`. 131 | 132 | `compute` creates a `MonitorThread`. 133 | 134 | `compute` creates a buffered `DataInputStream` to read from the worker (socket) output. `compute` uses the [bufferSize](#bufferSize). 135 | 136 | In the end, `compute` [creates a new ReaderIterator](#newReaderIterator) to read lines from the Python worker's stdout (from the buffered `DataInputStream`). 137 | 138 | --- 139 | 140 | `compute` is used when: 141 | 142 | * `PythonRDD` is requested to [compute a partition](../PythonRDD.md#compute) 143 | * [AggregateInPandasExec](../sql/AggregateInPandasExec.md), [ArrowEvalPythonExec](../sql/ArrowEvalPythonExec.md), `BatchEvalPythonExec`, `FlatMapCoGroupsInPandasExec`, `FlatMapGroupsInPandasExec` `MapInPandasExec`, `WindowInPandasExec` physical operators are executed 144 | * `PandasGroupUtils` is requested to `executePython` 145 | * `PythonForeachWriter` is requested for the [outputIterator](../PythonForeachWriter.md#outputIterator) 146 | -------------------------------------------------------------------------------- /docs/runners/BasicPythonArrowOutput.md: -------------------------------------------------------------------------------- 1 | # BasicPythonArrowOutput 2 | 3 | `BasicPythonArrowOutput` is a marker extension of the [PythonArrowOutput](PythonArrowOutput.md) abstraction for [vectorized outputs](#implementations) of [BasePythonRunner](BasePythonRunner.md)s that produce `ColumnarBatch`es ([Spark SQL]({{ book.spark_sql }}/vectorized-query-execution/ColumnarBatch)). 
4 | 5 | ## Implementations 6 | 7 | * [ArrowPythonRunner](ArrowPythonRunner.md) 8 | * `CoGroupedArrowPythonRunner` 9 | 10 | ## Deserializing ColumnarBatch { #deserializeColumnarBatch } 11 | 12 | ??? note "PythonArrowOutput" 13 | 14 | ```scala 15 | deserializeColumnarBatch( 16 | batch: ColumnarBatch, 17 | schema: StructType): ColumnarBatch 18 | ``` 19 | 20 | `deserializeColumnarBatch` is part of the [PythonArrowOutput](PythonArrowOutput.md#deserializeColumnarBatch) abstraction. 21 | 22 | `deserializeColumnarBatch` returns the given `ColumnarBatch` unchanged. 23 | -------------------------------------------------------------------------------- /docs/runners/PythonArrowOutput.md: -------------------------------------------------------------------------------- 1 | # PythonArrowOutput 2 | 3 | `PythonArrowOutput` is an [extension](#contract) of the [BasePythonRunner](BasePythonRunner.md) abstraction for [vectorized (ColumnarBatch) runners](#implementations). 4 | 5 | ??? note "Scala Definition" 6 | 7 | ```scala 8 | trait PythonArrowOutput[OUT <: AnyRef] { 9 | self: BasePythonRunner[_, OUT] => 10 | // ... 11 | } 12 | ``` 13 | 14 | ## Contract 15 | 16 | ### Deserializing ColumnarBatch { #deserializeColumnarBatch } 17 | 18 | ```scala 19 | deserializeColumnarBatch( 20 | batch: ColumnarBatch, 21 | schema: StructType): OUT 22 | ``` 23 | 24 | See: 25 | 26 | * [BasicPythonArrowOutput](BasicPythonArrowOutput.md#deserializeColumnarBatch) 27 | 28 | Used when: 29 | 30 | * `PythonArrowOutput` is requested to [newReaderIterator](#newReaderIterator) (after a batch is loaded) 31 | 32 | ### Performance Metrics { #pythonMetrics } 33 | 34 | ```scala 35 | pythonMetrics: Map[String, SQLMetric] 36 | ``` 37 | 38 | `SQLMetric`s ([Spark SQL]({{ book.spark_sql }}/SQLMetric)): 39 | 40 | * `pythonNumRowsReceived` 41 | * `pythonDataReceived` 42 | 43 | Used when: 44 | 45 | * `PythonArrowOutput` is requested to [newReaderIterator](#newReaderIterator) (after a batch is loaded) 46 | 47 | ## Implementations 48 | 49 | * `ApplyInPandasWithStatePythonRunner` 50 | * [BasicPythonArrowOutput](BasicPythonArrowOutput.md) 51 | -------------------------------------------------------------------------------- /docs/runners/PythonRunner.md: -------------------------------------------------------------------------------- 1 | # PythonRunner 2 | 3 | `PythonRunner` is a concrete [BasePythonRunner](BasePythonRunner.md). 4 | 5 | ## Creating Instance 6 | 7 | `PythonRunner` takes the following to be created: 8 | 9 | * `ChainedPythonFunctions`es 10 | 11 | `PythonRunner` is created (indirectly using [apply](#apply) factory method) when: 12 | 13 | * `PythonRDD` is requested to [compute a partition](../PythonRDD.md#compute) 14 | * `PythonForeachWriter` is requested for a [PythonRunner](../PythonForeachWriter.md#pythonRunner) 15 | 16 | ## Creating PythonRunner 17 | 18 | ```scala 19 | apply( 20 | func: PythonFunction): PythonRunner 21 | ``` 22 | 23 | `apply` simply creates a [PythonRunner](PythonRunner.md) for the [PythonFunction](../PythonFunction.md). 
24 | 25 | --- 26 | 27 | `apply` is used when: 28 | 29 | * `PythonRDD` is requested to [compute a partition](../PythonRDD.md#compute) 30 | * `PythonForeachWriter` is requested for a [PythonRunner](../PythonForeachWriter.md#pythonRunner) 31 | -------------------------------------------------------------------------------- /docs/runners/PythonUDFRunner.md: -------------------------------------------------------------------------------- 1 | # PythonUDFRunner 2 | 3 | `PythonUDFRunner` is...FIXME 4 | -------------------------------------------------------------------------------- /docs/runners/ReaderIterator.md: -------------------------------------------------------------------------------- 1 | # ReaderIterator 2 | 3 | `ReaderIterator` is an [extension](#contract) of the `Iterator` ([Scala]({{ scala.api }}/scala/collection/Iterator.html)) abstraction for [iterators](#implementations) to [read](#read) `OUT` values. 4 | 5 | ```scala 6 | abstract class ReaderIterator(...) 7 | extends Iterator[OUT] 8 | ``` 9 | 10 | ## Contract 11 | 12 | ### Reading Value { #read } 13 | 14 | ```scala 15 | read(): OUT 16 | ``` 17 | 18 | See: 19 | 20 | * [PythonArrowOutput](PythonArrowOutput.md#newReaderIterator) 21 | * [PythonRunner](PythonRunner.md#newReaderIterator) 22 | * [PythonUDFRunner](PythonUDFRunner.md#newReaderIterator) 23 | 24 | Used when: 25 | 26 | * `ReaderIterator` is requested to [hasNext](#hasNext) 27 | 28 | ## Implementations 29 | 30 | * [PythonArrowOutput](PythonArrowOutput.md#newReaderIterator) 31 | * [PythonRunner](PythonRunner.md#newReaderIterator) 32 | * [PythonUDFRunner](PythonUDFRunner.md#newReaderIterator) 33 | 34 | ## handleEndOfDataSection { #handleEndOfDataSection } 35 | 36 | ```scala 37 | handleEndOfDataSection(): Unit 38 | ``` 39 | 40 | `handleEndOfDataSection`...FIXME 41 | 42 | --- 43 | 44 | `handleEndOfDataSection` is used when: 45 | 46 | * `PythonRunner` is requested to [newReaderIterator](PythonRunner.md#newReaderIterator) 47 | * `PythonArrowOutput` is requested to [newReaderIterator](PythonArrowOutput.md#newReaderIterator) 48 | * `PythonUDFRunner` is requested to [newReaderIterator](PythonUDFRunner.md#newReaderIterator) 49 | -------------------------------------------------------------------------------- /docs/scala-api.md: -------------------------------------------------------------------------------- 1 | # Scala API 2 | 3 | [TAGS] 4 | -------------------------------------------------------------------------------- /docs/sql/.pages: -------------------------------------------------------------------------------- 1 | title: SQL 2 | nav: 3 | - index.md 4 | - Physical Operators: 5 | - ... | *Exec.md 6 | - PythonSQLMetrics.md 7 | - ... 8 | -------------------------------------------------------------------------------- /docs/sql/AggregateInPandasExec.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: AggregateInPandasExec 3 | --- 4 | 5 | # AggregateInPandasExec Physical Operator 6 | 7 | `AggregateInPandasExec` is a unary physical operator ([Spark SQL]({{ book.spark_sql }}/physical-operators/UnaryExecNode)) that executes [pandas UDAFs](#udfExpressions) using [ArrowPythonRunner](../runners/ArrowPythonRunner.md) (one per partition). 
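A quick way to see the operator is to aggregate with a grouped-aggregate pandas UDF and inspect the physical plan. A hedged example (`spark` is an assumed active `SparkSession`; the data and column names are made up); `explain()` should show an `AggregateInPandas` node:

```py
import pandas as pd
from pyspark.sql.functions import pandas_udf

# Grouped-aggregate pandas UDF (SQL_GROUPED_AGG_PANDAS_UDF eval type)
@pandas_udf("double")
def mean_udf(v: pd.Series) -> float:
    return v.mean()

df = spark.createDataFrame(
    [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v"))
df.groupBy("id").agg(mean_udf("v").alias("mean_v")).explain()
```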
8 | 9 | ## Creating Instance 10 | 11 | `AggregateInPandasExec` takes the following to be created: 12 | 13 | * Grouping Expressions ([Spark SQL]({{ book.spark_sql }}/expressions/Expression)) (`Seq[NamedExpression]`) 14 | * pandas UDAFs ([PythonUDF](PythonUDF.md)s with [SQL_GROUPED_AGG_PANDAS_UDF](PythonEvalType.md#SQL_GROUPED_AGG_PANDAS_UDF)) 15 | * Result Named Expressions ([Spark SQL]({{ book.spark_sql }}/expressions/NamedExpression)) (`Seq[NamedExpression]`) 16 | * Child Physical Operator ([Spark SQL]({{ book.spark_sql }}/physical-operators/SparkPlan)) 17 | 18 | `AggregateInPandasExec` is created when `Aggregation` execution planning strategy ([Spark SQL]({{ book.spark_sql }}/execution-planning-strategies/Aggregation)) is executed for `Aggregate` logical operators ([Spark SQL]({{ book.spark_sql }}/logical-operators/Aggregate)) with [PythonUDF](PythonUDF.md) aggregate expressions only. 19 | 20 | ## Executing Operator { #doExecute } 21 | 22 | ??? note "SparkPlan" 23 | 24 | ```scala 25 | doExecute(): RDD[InternalRow] 26 | ``` 27 | 28 | `doExecute` is part of the `SparkPlan` ([Spark SQL]({{ book.spark_sql }}/physical-operators/SparkPlan#doExecute)) abstraction. 29 | 30 | `doExecute` uses [ArrowPythonRunner](../runners/ArrowPythonRunner.md) (one per partition) to execute [PythonUDFs](#udfExpressions). 31 | -------------------------------------------------------------------------------- /docs/sql/ArrowEvalPython.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: ArrowEvalPython 3 | --- 4 | 5 | # ArrowEvalPython Logical Operator 6 | 7 | `ArrowEvalPython` is a [BaseEvalPython](BaseEvalPython.md) unary logical operator that evaluates [scalar PythonUDF](PythonUDF.md#isScalarPythonUDF)s with [Apache Arrow]({{ arrow.home }}). 8 | 9 | `ArrowEvalPython` is planned as [ArrowEvalPythonExec](ArrowEvalPythonExec.md) physical operator. 10 | 11 | ## Creating Instance 12 | 13 | `ArrowEvalPython` takes the following to be created: 14 | 15 | * [Scalar PythonUDF](PythonUDF.md#isScalarPythonUDF)s 16 | * Result `Attribute`s ([Spark SQL]({{ book.spark_sql }}/expressions/Attribute)) 17 | * Child `LogicalPlan` ([Spark SQL]({{ book.spark_sql }}/logical-operators/LogicalPlan)) 18 | * [Eval Type](#evalType) 19 | 20 | `ArrowEvalPython` is created when: 21 | 22 | * `ExtractPythonUDFs` logical optimization is executed (and requested to extract [scalar PythonUDF](PythonUDF.md#isScalarPythonUDF)s from a logical query plan) 23 | 24 | ### evalType { #evalType } 25 | 26 | ```scala 27 | evalType: Int 28 | ``` 29 | 30 | `ArrowEvalPython` is given an `evalType` when [created](#creating-instance) that can only be one of the following: 31 | 32 | * [SQL_SCALAR_PANDAS_UDF](../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_UDF) 33 | * [SQL_SCALAR_PANDAS_ITER_UDF](../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_ITER_UDF) 34 | -------------------------------------------------------------------------------- /docs/sql/ArrowEvalPythonExec.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: ArrowEvalPythonExec 3 | --- 4 | 5 | # ArrowEvalPythonExec Physical Operator 6 | 7 | `ArrowEvalPythonExec` is an [EvalPythonExec](EvalPythonExec.md) physical operator to [evaluate scalar PythonUDFs](#evaluate) using [ArrowPythonRunner](../runners/ArrowPythonRunner.md). 8 | 9 | `ArrowEvalPythonExec` represents [ArrowEvalPython](ArrowEvalPython.md) logical operator at execution time. 
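A scalar pandas UDF is enough to get this operator into a query plan. A hedged example (`spark` is an assumed active `SparkSession`); `explain()` should show an `ArrowEvalPython` node:

```py
import pandas as pd
from pyspark.sql.functions import pandas_udf

# Scalar pandas UDF (SQL_SCALAR_PANDAS_UDF eval type)
@pandas_udf("string")
def to_upper(s: pd.Series) -> pd.Series:
    return s.str.upper()

df = spark.createDataFrame([("john",), ("jane",)], ("name",))
df.select(to_upper("name").alias("name_upper")).explain()
```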
10 | 11 | ## Creating Instance 12 | 13 | `ArrowEvalPythonExec` takes the following to be created: 14 | 15 | * [Scalar PythonUDF](PythonUDF.md#isScalarPythonUDF)s 16 | * Result `Attribute`s ([Spark SQL]({{ book.spark_sql }}/expressions/Attribute)) 17 | * Child `SparkPlan` ([Spark SQL]({{ book.spark_sql }}/physical-operators/SparkPlan)) 18 | * Eval Type 19 | 20 | `ArrowEvalPythonExec` is created when: 21 | 22 | * `PythonEvals` physical execution strategy is executed (and plans [ArrowEvalPython](ArrowEvalPython.md) logical operators) 23 | 24 | ## Performance Metrics 25 | 26 | `ArrowEvalPythonExec` is a [PythonSQLMetrics](PythonSQLMetrics.md). 27 | 28 | ## Maximum Records per Batch { #batchSize } 29 | 30 | `batchSize` is the value of [spark.sql.execution.arrow.maxRecordsPerBatch](../configuration-properties/index.md#spark.sql.execution.arrow.maxRecordsPerBatch) configuration property. 31 | 32 | `batchSize` is used while [evaluating PythonUDFs](#evaluate). 33 | 34 | ## Evaluating PythonUDFs { #evaluate } 35 | 36 | ??? note "EvalPythonExec" 37 | 38 | ```scala 39 | evaluate( 40 | funcs: Seq[ChainedPythonFunctions], 41 | argOffsets: Array[Array[Int]], 42 | iter: Iterator[InternalRow], 43 | schema: StructType, 44 | context: TaskContext): Iterator[InternalRow] 45 | ``` 46 | 47 | `evaluate` is part of the [EvalPythonExec](EvalPythonExec.md#evaluate) abstraction. 48 | 49 | `evaluate` creates an [ArrowPythonRunner](../runners/ArrowPythonRunner.md) to [compute partitions](../runners/BasePythonRunner.md#compute). 50 | 51 | In the end, `evaluate` converts `ColumnarBatch`es into `InternalRow`s. 52 | -------------------------------------------------------------------------------- /docs/sql/BaseEvalPython.md: -------------------------------------------------------------------------------- 1 | # BaseEvalPython 2 | 3 | `BaseEvalPython` is...FIXME 4 | -------------------------------------------------------------------------------- /docs/sql/DataFrame.md: -------------------------------------------------------------------------------- 1 | # DataFrame 2 | 3 | `DataFrame` is a Python class with [PandasMapOpsMixin](PandasMapOpsMixin.md) and [PandasConversionMixin](PandasConversionMixin.md) mixins. 4 | 5 | `DataFrame` is defined in [pyspark.sql.dataframe](../pyspark/sql/dataframe.md) module. 6 | 7 | ```py 8 | from pyspark.sql.dataframe import DataFrame 9 | ``` 10 | 11 | ## Creating Instance 12 | 13 | `DataFrame` takes the following to be created: 14 | 15 | * jdf 16 | * [SQLContext](SQLContext.md) 17 | 18 | ## groupBy 19 | 20 | ```scala 21 | groupBy(self, *cols) 22 | ``` 23 | 24 | `groupBy` requests the [_jdf](#jdf) to `groupBy` and creates a [GroupedData](GroupedData.md) with it. 25 | 26 | ## observe { #observe } 27 | 28 | ```py 29 | observe( 30 | self, 31 | observation: Union["Observation", str], 32 | *exprs: Column, 33 | ) -> "DataFrame" 34 | ``` 35 | 36 | `observe` accepts an [Observation](Observation.md) or a name as the `observation`: 37 | 38 | * For an [Observation](Observation.md), `observe` requests it to [_on](Observation.md#_on) (with this `DataFrame` and the `exprs` columns). 39 | 40 | * For a name, `observe` creates a new `DataFrame` after requesting [_jdf](#_jdf) to `observe` (with the name). 41 | 42 | ### Demo { #observe-demo } 43 | 44 | !!! note "QueryExecutionListener" 45 | You should install `QueryExecutionListener` ([Spark SQL]({{ book.spark_sql }}/QueryExecutionListener)) to intercept `QueryExecution` on a successful query execution (to access `observedMetrics`). 
46 | 47 | ```py 48 | import pandas as pd 49 | 50 | pandas_df = pd.DataFrame({ 51 | 'name': ['jacek', 'agata', 'iweta', 'patryk', 'maksym'], 52 | 'age': [50, 49, 29, 26, 11] 53 | }) 54 | df = spark.createDataFrame(pandas_df) 55 | ``` 56 | 57 | ```py 58 | from pyspark.sql.functions import * 59 | row_count_metric = count(lit(1)).alias("count") 60 | observed_df = df.observe("observe_demo", row_count_metric) 61 | ``` 62 | 63 | ```py 64 | observed_df.count() 65 | ``` 66 | -------------------------------------------------------------------------------- /docs/sql/EvalPythonExec.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: EvalPythonExec 3 | --- 4 | 5 | # EvalPythonExec Unary Physical Operators 6 | 7 | `EvalPythonExec` is an [extension](#contract) of the `UnaryExecNode` ([Spark SQL]({{ book.spark_sql }}/physical-operators/UnaryExecNode)) abstraction for [unary physical operators](#implementations) that [evaluate PythonUDFs](#evaluate) (when [executed](#doExecute)). 8 | 9 | ## Contract 10 | 11 | ### Evaluating PythonUDFs { #evaluate } 12 | 13 | ```scala 14 | evaluate( 15 | funcs: Seq[ChainedPythonFunctions], 16 | argOffsets: Array[Array[Int]], 17 | iter: Iterator[InternalRow], 18 | schema: StructType, 19 | context: TaskContext): Iterator[InternalRow] 20 | ``` 21 | 22 | See: 23 | 24 | * [ArrowEvalPythonExec](ArrowEvalPythonExec.md#evaluate) 25 | 26 | Used when: 27 | 28 | * `EvalPythonExec` physical operator is requested to [doExecute](#doExecute) 29 | 30 | ### Result Attributes { #resultAttrs } 31 | 32 | ```scala 33 | resultAttrs: Seq[Attribute] 34 | ``` 35 | 36 | Result `Attribute`s ([Spark SQL]({{ book.spark_sql }}/expressions/Attribute)) 37 | 38 | See: 39 | 40 | * [ArrowEvalPythonExec](ArrowEvalPythonExec.md#resultAttrs) 41 | 42 | Used when: 43 | 44 | * `EvalPythonExec` physical operator is requested for the [output](#output) and [producedAttributes](#producedAttributes) 45 | 46 | ### Python UDFs { #udfs } 47 | 48 | ```scala 49 | udfs: Seq[PythonUDF] 50 | ``` 51 | 52 | [PythonUDF](PythonUDF.md)s to [evaluate](#evaluate) 53 | 54 | See: 55 | 56 | * [ArrowEvalPythonExec](ArrowEvalPythonExec.md#udfs) 57 | 58 | Used when: 59 | 60 | * `EvalPythonExec` physical operator is requested to [doExecute](#doExecute) 61 | 62 | ## Implementations 63 | 64 | * [ArrowEvalPythonExec](ArrowEvalPythonExec.md) 65 | * `BatchEvalPythonExec` 66 | 67 | ## Executing Physical Operator { #doExecute } 68 | 69 | ??? note "SparkPlan" 70 | 71 | ```scala 72 | doExecute(): RDD[InternalRow] 73 | ``` 74 | 75 | `doExecute` is part of the `SparkPlan` ([Spark SQL]({{ book.spark_sql }}/physical-operators/SparkPlan#doExecute)) abstraction. 76 | 77 | The gist of `doExecute` is to [evaluate Python UDFs](#evaluate) (for every `InternalRow`) with some pre- and post-processing. 78 | 79 | --- 80 | 81 | `doExecute` requests the child physical operator to `execute` (to produce an input `RDD[InternalRow]`). 82 | 83 | !!! note 84 | `EvalPythonExec`s are `UnaryExecNode`s ([Spark SQL]({{ book.spark_sql }}/physical-operators/UnaryExecNode)). 85 | 86 | `doExecute` uses `RDD.mapPartitions` operator to execute a function over partitions of `InternalRow`s. 87 | 88 | For every partition, `doExecute` creates a `MutableProjection` for the inputs (and the child's output) and requests it to `initialize`. 89 | 90 | `doExecute` [evaluates Python UDFs](#evaluate) (for every `InternalRow`). 
91 | 92 | In the end, `doExecute` creates an `UnsafeProjection` for the [output](#output) to "map over" the rows (from evaluating Python UDFs). 93 | -------------------------------------------------------------------------------- /docs/sql/FlatMapGroupsInPandas.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: FlatMapGroupsInPandas 3 | --- 4 | 5 | # FlatMapGroupsInPandas Logical Operator 6 | 7 | `FlatMapGroupsInPandas` is a unary logical operator ([Spark SQL]({{ book.spark_sql }}/logical-operators/LogicalPlan/#UnaryNode)). 8 | 9 | `FlatMapGroupsInPandas` is planned as a [FlatMapGroupsInPandasExec](FlatMapGroupsInPandasExec.md) physical operator. 10 | 11 | ## Creating Instance 12 | 13 | `FlatMapGroupsInPandas` takes the following to be created: 14 | 15 | * Grouping Attributes ([Spark SQL]({{ book.spark_sql }}/expressions/Attribute)) 16 | * Function Expression ([Spark SQL]({{ book.spark_sql }}/expressions/Expression)) 17 | * Output Attributes ([Spark SQL]({{ book.spark_sql }}/expressions/Attribute)) 18 | * Child Logical Operator ([Spark SQL]({{ book.spark_sql }}/logical-operators/LogicalPlan)) 19 | 20 | `FlatMapGroupsInPandas` is created when: 21 | 22 | * `RelationalGroupedDataset` is requested to [flatMapGroupsInPandas](RelationalGroupedDataset.md#flatMapGroupsInPandas) (with a [PythonUDF](PythonUDF.md)) 23 | -------------------------------------------------------------------------------- /docs/sql/FlatMapGroupsInPandasExec.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: FlatMapGroupsInPandasExec 3 | --- 4 | 5 | # FlatMapGroupsInPandasExec Physical Operator 6 | 7 | `FlatMapGroupsInPandasExec` is a unary physical operator ([Spark SQL]({{ book.spark_sql }}/physical-operators/UnaryExecNode)) to execute a [PythonUDF](#func) using [ArrowPythonRunner](../runners/ArrowPythonRunner.md) (in [SQL_GROUPED_MAP_PANDAS_UDF](PythonEvalType.md#SQL_GROUPED_MAP_PANDAS_UDF) eval mode). 8 | 9 | `FlatMapGroupsInPandasExec` represents a [FlatMapGroupsInPandas](FlatMapGroupsInPandas.md) logical operator at execution time. 10 | 11 | ## Creating Instance 12 | 13 | `FlatMapGroupsInPandasExec` takes the following to be created: 14 | 15 | * Grouping Attributes ([Spark SQL]({{ book.spark_sql }}/expressions/Attribute)) 16 | * Function Expression ([Spark SQL]({{ book.spark_sql }}/expressions/Expression)) 17 | * Output Attributes ([Spark SQL]({{ book.spark_sql }}/expressions/Attribute)) 18 | * Child Physical Operator ([Spark SQL]({{ book.spark_sql }}/physical-operators/SparkPlan)) 19 | 20 | `FlatMapGroupsInPandasExec` is created when: 21 | 22 | * `BasicOperators` ([Spark SQL]({{ book.spark_sql }}/execution-planning-strategies/BasicOperators/)) execution planning strategy is executed (on a logical query plan with [FlatMapGroupsInPandas](FlatMapGroupsInPandas.md) logical operators) 23 | 24 | ## Performance Metrics 25 | 26 | `FlatMapGroupsInPandasExec` is a [PythonSQLMetrics](PythonSQLMetrics.md). 27 | 28 | ## Executing Operator { #doExecute } 29 | 30 | ??? note "SparkPlan" 31 | 32 | ```scala 33 | doExecute(): RDD[InternalRow] 34 | ``` 35 | 36 | `doExecute` is part of the `SparkPlan` ([Spark SQL]({{ book.spark_sql }}/physical-operators/SparkPlan#doExecute)) abstraction. 37 | 38 | `doExecute` requests the [child physical operator](#child) to `execute` (and produce a `RDD[InternalRow]`). 
39 | 40 | For every non-empty partition (using `RDD.mapPartitionsInternal`), `doExecute` creates an [ArrowPythonRunner](../runners/ArrowPythonRunner.md) (with [SQL_GROUPED_MAP_PANDAS_UDF](PythonEvalType.md#SQL_GROUPED_MAP_PANDAS_UDF) eval type) and [executePython](PandasGroupUtils.md#executePython). 41 | -------------------------------------------------------------------------------- /docs/sql/GroupedData.md: -------------------------------------------------------------------------------- 1 | # GroupedData 2 | 3 | `GroupedData` is created for the following high-level operators: 4 | 5 | * [DataFrame.cube](DataFrame.md#cube) 6 | * [DataFrame.groupBy](DataFrame.md#groupBy) 7 | * [DataFrame.rollup](DataFrame.md#rollup) 8 | * [GroupedData.pivot](#pivot) 9 | 10 | `GroupedData` is then used to execute aggregate functions (over groups of rows) using [agg](#agg) operator: 11 | 12 | * Built-In Aggregation Functions 13 | * [pandas UDAFs](../pandas-udafs/index.md) 14 | 15 | `GroupedData` is a Python class with [PandasGroupedOpsMixin](PandasGroupedOpsMixin.md) mixin. 16 | 17 | `GroupedData` is defined in [pyspark.sql.group](../pyspark/sql/group.md) module. 18 | 19 | ```py 20 | from pyspark.sql.group import GroupedData 21 | ``` 22 | 23 | ## Creating Instance 24 | 25 | `GroupedData` takes the following to be created: 26 | 27 | * [RelationalGroupedDataset](RelationalGroupedDataset.md) 28 | * [DataFrame](DataFrame.md) 29 | 30 | ## agg 31 | 32 | ```py 33 | agg( 34 | self, 35 | *exprs: Union[Column, Dict[str, str]]) -> DataFrame 36 | ``` 37 | 38 | !!! note 39 | Built-in aggregation functions and [pandas UDAFs](../pandas-udafs/index.md) cannot be used together in a single `agg`. 40 | 41 | `agg` accepts a collection of `Column` expressions or a single `Dict[str, str]` object. 42 | 43 | `agg` requests the [RelationalGroupedDataset](#_jgd) to `agg` ([Spark SQL]({{ book.spark_sql }}/RelationalGroupedDataset/#agg)). 44 | 45 | In the end, `agg` creates a [DataFrame](DataFrame.md) with the `agg` result. 46 | -------------------------------------------------------------------------------- /docs/sql/Observation.md: -------------------------------------------------------------------------------- 1 | # Observation 2 | 3 | `Observation` is a Python class to observe (named) metrics on a [DataFrame](DataFrame.md). 4 | 5 | ```py 6 | from pyspark.sql.observation import Observation 7 | ``` 8 | 9 | ??? note "pyspark.sql" 10 | `Observation` is imported using `*` import from `pyspark.sql` as well as `pyspark.sql.observation` (as is included in `__all__` of the modules). 11 | 12 | ```py 13 | from pyspark.sql import * 14 | ``` 15 | 16 | ## Creating Instance 17 | 18 | `Observation` takes the following to be created: 19 | 20 | * Name (optional) 21 | 22 | ## _jo { #_jo } 23 | 24 | ```py 25 | _jo: Optional[JavaObject] 26 | ``` 27 | 28 | ## get { #get } 29 | 30 | ```py 31 | get( 32 | self) -> Dict[str, Any] 33 | ``` 34 | 35 | `get` requests the [_jo](#_jo) to `getAsJava` and converts the py4j `JavaMap` to a Python dict. 
36 | 37 | ## Demo 38 | 39 | ```py 40 | from pyspark.sql.observation import Observation 41 | 42 | observation = Observation("demo") 43 | ``` 44 | 45 | ```py 46 | import pandas as pd 47 | 48 | pandas_df = pd.DataFrame({ 49 | 'name': ['jacek', 'agata', 'iweta', 'patryk', 'maksym'], 50 | 'age': [50, 49, 29, 26, 11] 51 | }) 52 | df = spark.createDataFrame(pandas_df) 53 | ``` 54 | 55 | ```py 56 | from pyspark.sql.functions import * 57 | row_count_metric = count(lit(1)).alias("count") 58 | observed_df = df.observe(observation, row_count_metric) 59 | ``` 60 | 61 | ```py 62 | observed_df.count() 63 | ``` 64 | 65 | === "Python" 66 | 67 | ```py 68 | observation.get() 69 | ``` 70 | 71 | ```text 72 | {'count': 5} 73 | ``` 74 | -------------------------------------------------------------------------------- /docs/sql/PandasCogroupedOps.md: -------------------------------------------------------------------------------- 1 | # PandasCogroupedOps 2 | 3 | `PandasCogroupedOps` is a logical grouping created by [GroupedData.cogroup](GroupedData.md#cogroup) over two [GroupedData](GroupedData.md)s. 4 | 5 | ```py 6 | from pyspark.sql.pandas.group_ops import PandasCogroupedOps 7 | ``` 8 | 9 | `PandasCogroupedOps` is included in `__all__` of `pyspark.sql` module (via `__init__.py`). 10 | 11 | ## Creating Instance 12 | 13 | `PandasCogroupedOps` takes the following to be created: 14 | 15 | * [GroupedData](GroupedData.md) 16 | * [GroupedData](GroupedData.md) 17 | 18 | `PandasCogroupedOps` is created when: 19 | 20 | * `PandasGroupedOpsMixin` is requested to [cogroup](PandasGroupedOpsMixin.md#cogroup) 21 | 22 | ## applyInPandas { #applyInPandas } 23 | 24 | ```py 25 | applyInPandas( 26 | self, 27 | func: "PandasCogroupedMapFunction", # (1)! 28 | schema: Union[StructType, str] 29 | ) -> DataFrame 30 | ``` 31 | 32 | 1. 33 | ```py 34 | from pandas.core.frame import DataFrame as PandasDataFrame 35 | DataFrameLike = PandasDataFrame 36 | PandasCogroupedMapFunction = Union[ 37 | # func: (pandas.DataFrame, pandas.DataFrame) -> pandas.DataFrame 38 | Callable[[DataFrameLike, DataFrameLike], DataFrameLike], 39 | # func: (groupKey(s), pandas.DataFrame, pandas.DataFrame) -> pandas.DataFrame 40 | Callable[[Any, DataFrameLike, DataFrameLike], DataFrameLike], 41 | ] 42 | ``` 43 | 44 | `applyInPandas` creates a [DataFrame](DataFrame.md) with the result of [flatMapCoGroupsInPandas](RelationalGroupedDataset.md#flatMapCoGroupsInPandas) with a [pandas user defined function](../pyspark/sql/pandas/functions.md#pandas_udf) of `SQL_COGROUPED_MAP_PANDAS_UDF` type. 45 | 46 | --- 47 | 48 | `applyInPandas` [creates a pandas user defined function](../pyspark/sql/pandas/functions.md#pandas_udf) for the given `func` and the return type by the given `schema`. The pandas UDF is of `SQL_COGROUPED_MAP_PANDAS_UDF` type. 49 | 50 | `applyInPandas` applies the pandas UDF on all the columns of the two [GroupedData](#creating-instance)s (that creates a `Column` expression). 51 | 52 | `applyInPandas` requests the [GroupedData](#gd1) for the associated [RelationalGroupedDataset](GroupedData.md#jgd) that is in turn requested to [flatMapCoGroupsInPandas](RelationalGroupedDataset.md#flatMapCoGroupsInPandas). 
53 | 54 | ### Example { #applyInPandas-example } 55 | 56 | ```py 57 | df1 = spark.createDataFrame( 58 | data = [ 59 | (20000101, 1, 1.0), 60 | (20000101, 2, 2.0), 61 | (20000102, 1, 3.0), 62 | (20000102, 2, 4.0)], 63 | schema = ("time", "id", "v1")) 64 | df2 = spark.createDataFrame( 65 | data = [ 66 | (20000101, 1, "x"), 67 | (20000101, 2, "y")], 68 | schema = ("time", "id", "v2")) 69 | ``` 70 | 71 | ```py 72 | import pandas as pd 73 | def asof_join(k, l, r): 74 | if k == (1,): 75 | return pd.merge_asof(l, r, on="time", by="id") 76 | else: 77 | return pd.DataFrame(columns=['time', 'id', 'v1', 'v2']) 78 | ``` 79 | 80 | ```py 81 | gd1 = df1.groupby("id") 82 | gd2 = df2.groupby("id") 83 | ``` 84 | 85 | ```py 86 | gd1 87 | .cogroup(gd2) 88 | .applyInPandas( 89 | asof_join, 90 | "time int, id int, v1 double, v2 string") 91 | .show() 92 | ``` 93 | -------------------------------------------------------------------------------- /docs/sql/PandasConversionMixin.md: -------------------------------------------------------------------------------- 1 | # PandasConversionMixin 2 | 3 | `PandasConversionMixin` is a Python mixin of [DataFrame](DataFrame.md) to [convert to Pandas](#toPandas) ([pandas.DataFrame]({{ pandas.api }}/pandas.DataFrame.html)). 4 | 5 | ## toPandas { #toPandas } 6 | 7 | ```python 8 | toPandas(self) 9 | ``` 10 | 11 | `toPandas` can only be used with [DataFrame](DataFrame.md). 12 | 13 | With [Arrow optimization](../configuration-properties/index.md#arrowPySparkEnabled) enabled, `toPandas` [to_arrow_schema](#to_arrow_schema). 14 | 15 | !!! note "pyarrow" 16 | Arrow Optimization uses `pyarrow` module. 17 | 18 | `toPandas` renames the columns to be of `col_[index]` format and [_collect_as_arrow](#_collect_as_arrow) (with `split_batches` based on `arrowPySparkSelfDestructEnabled` configuration property). 19 | 20 | `toPandas` creates a `pyarrow.Table` (from the `RecordBatch`es) and converts the table to a pandas-compatible NumPy array or `DataFrame`. `toPandas` renames the columns back to the initial column names. 21 | 22 | !!! note 23 | Column order is assumed. 24 | 25 | With [Arrow optimization](../configuration-properties/index.md#arrowPySparkEnabled) disabled, `toPandas` collects the records (`DataFrame.collect`) and creates a `pandas.DataFrame` (with some type _munging_). 26 | -------------------------------------------------------------------------------- /docs/sql/PandasGroupUtils.md: -------------------------------------------------------------------------------- 1 | # PandasGroupUtils 2 | 3 | `PandasGroupUtils` utility is used by the following physical operators when executed: 4 | 5 | * `FlatMapCoGroupsInPandasExec` 6 | * [FlatMapGroupsInPandasExec](FlatMapGroupsInPandasExec.md#doExecute) 7 | 8 | ## executePython { #executePython } 9 | 10 | ```scala 11 | executePython[T]( 12 | data: Iterator[T], 13 | output: Seq[Attribute], 14 | runner: BasePythonRunner[T, ColumnarBatch]): Iterator[InternalRow] 15 | ``` 16 | 17 | `executePython` requests the given [BasePythonRunner](../runners/BasePythonRunner.md) to [compute](../runners/BasePythonRunner.md#compute) the (partition) `data` (with the current task's `TaskContext` and the partition ID). 
18 | 19 | `executePython`...FIXME 20 | 21 | --- 22 | 23 | `executePython` is used when: 24 | 25 | * `FlatMapCoGroupsInPandasExec` and [FlatMapGroupsInPandasExec](FlatMapGroupsInPandasExec.md#doExecute) physical operators are executed 26 | 27 | ## groupAndProject { #groupAndProject } 28 | 29 | ```scala 30 | groupAndProject( 31 | input: Iterator[InternalRow], 32 | groupingAttributes: Seq[Attribute], 33 | inputSchema: Seq[Attribute], 34 | dedupSchema: Seq[Attribute]): Iterator[(InternalRow, Iterator[InternalRow])] 35 | ``` 36 | 37 | `groupAndProject` creates a `GroupedIterator` for the `input` iterator (of `InternalRow`s), the `groupingAttributes` and the `inputSchema`. 38 | 39 | `groupAndProject`...FIXME 40 | 41 | --- 42 | 43 | `groupAndProject` is used when: 44 | 45 | * `FlatMapCoGroupsInPandasExec` and [FlatMapGroupsInPandasExec](FlatMapGroupsInPandasExec.md#doExecute) physical operators are executed 46 | -------------------------------------------------------------------------------- /docs/sql/PandasGroupedOpsMixin.md: -------------------------------------------------------------------------------- 1 | # PandasGroupedOpsMixin 2 | 3 | `PandasGroupedOpsMixin` is a Python mixin for [GroupedData](GroupedData.md) class. 4 | 5 | ## applyInPandas { #applyInPandas } 6 | 7 | ```py 8 | applyInPandas( 9 | self, 10 | func: "PandasGroupedMapFunction", # (1)! 11 | schema: Union[StructType, str] 12 | ) -> DataFrame 13 | ``` 14 | 15 | 1. 16 | ```py 17 | from pandas.core.frame import DataFrame as PandasDataFrame 18 | DataFrameLike = PandasDataFrame 19 | PandasGroupedMapFunction = Union[ 20 | # func: pandas.DataFrame -> pandas.DataFrame 21 | Callable[[DataFrameLike], DataFrameLike], 22 | # func: (groupKey(s), pandas.DataFrame) -> pandas.DataFrame 23 | Callable[[Any, DataFrameLike], DataFrameLike], 24 | ] 25 | ``` 26 | 27 | `applyInPandas` creates a [pandas_udf](../pyspark/sql/pandas/functions.md#pandas_udf) with the following: 28 | 29 | pandas_udf | Value 30 | -----------|------ 31 | `f` | The given `func` 32 | `returnType` | The given `schema` 33 | `functionType` | [PandasUDFType.GROUPED_MAP](../pyspark/sql/pandas/PandasUDFType.md#GROUPED_MAP) 34 | 35 | `applyInPandas` creates a `Column` wtih the `pandas_udf` applied to all the columns of the [DataFrame](GroupedData.md#_df) of this [GroupedData](GroupedData.md). 36 | 37 | `applyInPandas` requests the [RelationalGroupedDataset](#_jgd) to [flatMapGroupsInPandas](RelationalGroupedDataset.md#flatMapGroupsInPandas) with the underlying Catalyst expression of the `Column` with the `pandas_udf`. 38 | 39 | In the end, `applyInPandas` creates a [DataFrame](DataFrame.md) with the result. 40 | 41 | ## cogroup { #cogroup } 42 | 43 | ```py 44 | cogroup( 45 | self, 46 | other: "GroupedData") -> "PandasCogroupedOps" 47 | ``` 48 | 49 | `cogroup` creates a [PandasCogroupedOps](PandasCogroupedOps.md) for this and the other [GroupedData](GroupedData.md)s. 50 | -------------------------------------------------------------------------------- /docs/sql/PandasMapOpsMixin.md: -------------------------------------------------------------------------------- 1 | # PandasMapOpsMixin 2 | 3 | `PandasMapOpsMixin` is a Python mixin for [DataFrame](DataFrame.md) class. 4 | -------------------------------------------------------------------------------- /docs/sql/PythonEvalType.md: -------------------------------------------------------------------------------- 1 | # PythonEvalType 2 | 3 | `PythonEvalType` are the types of commands that will be sent to the Python worker for execution. 
4 | 5 | Name | Value | PandasUDFType 6 | -----|-------|-------------- 7 | [SQL_GROUPED_AGG_PANDAS_UDF](#SQL_GROUPED_AGG_PANDAS_UDF) | 202 | [GROUPED_AGG](../pyspark/sql/pandas/PandasUDFType.md#GROUPED_AGG) 8 | [SQL_GROUPED_MAP_PANDAS_UDF](#SQL_GROUPED_MAP_PANDAS_UDF) | 201 | [GROUPED_MAP](../pyspark/sql/pandas/PandasUDFType.md#GROUPED_MAP) 9 | [SQL_SCALAR_PANDAS_UDF](#SQL_SCALAR_PANDAS_UDF) | 200 | [SCALAR](../pyspark/sql/pandas/PandasUDFType.md#SCALAR) 10 | [SQL_SCALAR_PANDAS_ITER_UDF](#SQL_SCALAR_PANDAS_ITER_UDF) | 204 | [SCALAR_ITER](../pyspark/sql/pandas/PandasUDFType.md#SCALAR_ITER) 11 | 12 | `PythonEvalType` is defined in `org.apache.spark.api.python` Scala package with the same values defined on Python side in the [PythonEvalType](../pyspark/sql/pandas/PandasUDFType.md) Python class (in `pyspark/rdd.py` package). 13 | 14 | ## SQL_GROUPED_AGG_PANDAS_UDF { #SQL_GROUPED_AGG_PANDAS_UDF } 15 | 16 | `SQL_GROUPED_AGG_PANDAS_UDF` is a UDF marker of **Grouped Aggregate Pandas UDFs** (_pandas User-Defined Aggregate Functions_, _pandas UDAFs_). 17 | 18 | `SQL_GROUPED_AGG_PANDAS_UDF` is executed using [AggregateInPandasExec](AggregateInPandasExec.md) physical operator (using [ArrowPythonRunner](../runners/ArrowPythonRunner.md)). 19 | 20 | Limitations of Pandas UDAFs: 21 | 22 | * [Return type](../pyspark/sql/UserDefinedFunction.md#returnType) cannot be `StructType` 23 | * Not supported in the `PIVOT` clause 24 | * Not supported in streaming aggregation 25 | 26 | `SQL_GROUPED_AGG_PANDAS_UDF` is used (on Python side) when: 27 | 28 | * `pyspark/worker.py` is requested to [read_single_udf](../pyspark/worker.md#read_single_udf) and [read_udfs](../pyspark/worker.md#read_udfs) 29 | * `pyspark/sql/pandas/functions.py` is requested to `_create_pandas_udf` and `pandas_udf` 30 | 31 | `SQL_GROUPED_AGG_PANDAS_UDF` is used (on Scala side) when: 32 | 33 | * `PythonUDF` is requested for [isGroupedAggPandasUDF](PythonUDF.md#isGroupedAggPandasUDF) 34 | 35 | ## SQL_SCALAR_PANDAS_UDF { #SQL_SCALAR_PANDAS_UDF } 36 | 37 | `SQL_SCALAR_PANDAS_UDF` is among [SCALAR_TYPES](PythonUDF.md#SCALAR_TYPES) of [PythonUDF](PythonUDF.md). 38 | 39 | `SQL_SCALAR_PANDAS_UDF` (with [SQL_SCALAR_PANDAS_ITER_UDF](#SQL_SCALAR_PANDAS_ITER_UDF)) are evaluated using [ArrowEvalPython](ArrowEvalPython.md). 40 | 41 | `SQL_SCALAR_PANDAS_UDF` is used (on Python side) when: 42 | 43 | * `pyspark/worker.py` is requested to [read_single_udf](../pyspark/worker.md#read_single_udf) and [read_udfs](../pyspark/worker.md#read_udfs) 44 | * `pyspark/sql/pandas/functions.py` is requested to `_create_pandas_udf` and `pandas_udf` 45 | 46 | ## SQL_SCALAR_PANDAS_ITER_UDF { #SQL_SCALAR_PANDAS_ITER_UDF } 47 | 48 | ## User-Defined Functions 49 | 50 | [UDFRegistration](UDFRegistration.md#register) allows user-defined functions to be one of the following `PythonEvalType`s: 51 | 52 | * [SQL_BATCHED_UDF](#SQL_BATCHED_UDF) 53 | * [SQL_SCALAR_PANDAS_UDF](#SQL_SCALAR_PANDAS_UDF) 54 | * [SQL_SCALAR_PANDAS_ITER_UDF](#SQL_SCALAR_PANDAS_ITER_UDF) 55 | * [SQL_GROUPED_AGG_PANDAS_UDF](#SQL_GROUPED_AGG_PANDAS_UDF) 56 | -------------------------------------------------------------------------------- /docs/sql/PythonSQLMetrics.md: -------------------------------------------------------------------------------- 1 | # PythonSQLMetrics 2 | 3 | `PythonSQLMetrics` is a collection of [SQL metrics](#performance-metrics) of the [physical operators](#implementations) in PySpark. 
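For example, a query with a scalar pandas UDF as below (a sketch; any such UDF will do) is planned with [ArrowEvalPythonExec](ArrowEvalPythonExec.md), so the metrics listed below (e.g. _data sent to Python workers_) show up for that operator in the SQL tab of the web UI:

```py
import pandas as pd
from pyspark.sql.functions import pandas_udf

@pandas_udf("double")
def times_two(s: pd.Series) -> pd.Series:
    return s * 2.0

# Check the SQL tab of the web UI for the ArrowEvalPython node and its metrics
spark.range(10).withColumn("doubled", times_two("id")).collect()
```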
4 | 5 | ## Performance Metrics 6 | 7 | ### data returned from Python workers { #pythonDataReceived } 8 | 9 | ### data sent to Python workers { #pythonDataSent } 10 | 11 | ### number of output rows { #pythonNumRowsReceived } 12 | 13 | ## Implementations 14 | 15 | * [AggregateInPandasExec](AggregateInPandasExec.md) 16 | * [ArrowEvalPythonExec](ArrowEvalPythonExec.md) 17 | * `BatchEvalPythonExec` 18 | * `FlatMapCoGroupsInPandasExec` 19 | * [FlatMapGroupsInPandasExec](FlatMapGroupsInPandasExec.md) 20 | * `MapInBatchExec` 21 | * `StateStoreWriter` 22 | * `WindowInPandasExec` 23 | -------------------------------------------------------------------------------- /docs/sql/PythonUDF.md: -------------------------------------------------------------------------------- 1 | # PythonUDF 2 | 3 | `PythonUDF` is a Catalyst expression ([Spark SQL]({{ book.spark_sql }}/expressions/Expression)). 4 | 5 | ## Creating Instance 6 | 7 | `PythonUDF` takes the following to be created: 8 | 9 | * Name 10 | * [PythonFunction](../PythonFunction.md) 11 | * `DataType` ([Spark SQL]({{ book.spark_sql }}/DataType)) 12 | * Children Catalyst Expressions ([Spark SQL]({{ book.spark_sql }}/expressions/Expression)) 13 | * Python Eval Type 14 | * `udfDeterministic` flag 15 | * Result ID (`ExprId`) 16 | 17 | `PythonUDF` is created when: 18 | 19 | * `UserDefinedPythonFunction` is requested to [builder](UserDefinedPythonFunction.md#builder) 20 | 21 | ## Unevaluable 22 | 23 | `PythonUDF` is an `Unevaluable` expression ([Spark SQL]({{ book.spark_sql }}/expressions/Unevaluable)). 24 | 25 | ## NonSQLExpression 26 | 27 | `PythonUDF` is a `NonSQLExpression` expression ([Spark SQL]({{ book.spark_sql }}/expressions/NonSQLExpression)). 28 | 29 | ## UserDefinedExpression 30 | 31 | `PythonUDF` is a `UserDefinedExpression` expression ([Spark SQL]({{ book.spark_sql }}/expressions/UserDefinedExpression)). 32 | 33 | ## isScalarPythonUDF { #isScalarPythonUDF } 34 | 35 | ```scala 36 | isScalarPythonUDF( 37 | e: Expression): Boolean 38 | ``` 39 | 40 | `isScalarPythonUDF` holds `true` when the following all hold `true`: 41 | 42 | * The given `Expression` ([Spark SQL]({{ book.spark_sql }}/expressions/Expression)) is a [PythonUDF](PythonUDF.md) 43 | * The [evalType](#evalType) is [scalar](#SCALAR_TYPES) 44 | 45 | --- 46 | 47 | `isScalarPythonUDF` is used when: 48 | 49 | * `ExtractPythonUDFFromJoinCondition` is requested to `hasUnevaluablePythonUDF` 50 | * `ExtractPythonUDFFromAggregate` is requested to `hasPythonUdfOverAggregate` 51 | * `ExtractGroupingPythonUDFFromAggregate` is requested to `hasScalarPythonUDF` 52 | * `ExtractPythonUDFs` is requested to `hasScalarPythonUDF`, `collectEvaluableUDFs`, `extract` 53 | 54 | ## Scalar PythonUDF Types { #SCALAR_TYPES } 55 | 56 | `PythonUDF` is [scalar](#isScalarPythonUDF) for the following eval types: 57 | 58 | * [SQL_BATCHED_UDF](../sql/PythonEvalType.md#SQL_BATCHED_UDF) 59 | * [SQL_SCALAR_PANDAS_UDF](../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_UDF) 60 | * [SQL_SCALAR_PANDAS_ITER_UDF](../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_ITER_UDF) 61 | 62 | ## isGroupedAggPandasUDF { #isGroupedAggPandasUDF } 63 | 64 | ```scala 65 | isGroupedAggPandasUDF( 66 | e: Expression): Boolean 67 | ``` 68 | 69 | `isGroupedAggPandasUDF` is `true` when the given `Expression` is a [PythonUDF](PythonUDF.md) with [SQL_GROUPED_AGG_PANDAS_UDF](PythonEvalType.md#SQL_GROUPED_AGG_PANDAS_UDF) eval type. Otherwise, `isGroupedAggPandasUDF` is `false`. 
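For reference, a grouped aggregate pandas UDF as below (a minimal sketch with illustrative names) becomes a `PythonUDF` with the [SQL_GROUPED_AGG_PANDAS_UDF](PythonEvalType.md#SQL_GROUPED_AGG_PANDAS_UDF) eval type, and so `isGroupedAggPandasUDF` holds `true` for it:

```py
import pandas as pd
from pyspark.sql.functions import pandas_udf

@pandas_udf("double")
def mean_udf(v: pd.Series) -> float:
    # a grouped aggregate (Series -> scalar) pandas UDF
    return v.mean()

df = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0)], ("id", "v"))
df.groupBy("id").agg(mean_udf("v")).show()
```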
70 | -------------------------------------------------------------------------------- /docs/sql/RelationalGroupedDataset.md: -------------------------------------------------------------------------------- 1 | # RelationalGroupedDataset 2 | 3 | `RelationalGroupedDataset` is a result of executing high-level grouping operators. 4 | 5 | !!! note "This is a stub" 6 | This page is a stub to describe PySpark-related methods only. Learn more about [RelationalGroupedDataset]({{ book.spark_sql }}/RelationalGroupedDataset/) in [The Internals of Spark SQL]({{ book.spark_sql }}). 7 | 8 | ## flatMapCoGroupsInPandas { #flatMapCoGroupsInPandas } 9 | 10 | ```scala 11 | flatMapCoGroupsInPandas( 12 | r: RelationalGroupedDataset, 13 | expr: PythonUDF): DataFrame 14 | ``` 15 | 16 | `flatMapCoGroupsInPandas`...FIXME 17 | 18 | --- 19 | 20 | `flatMapCoGroupsInPandas` is used when: 21 | 22 | * `PandasCogroupedOps` is requested to [applyInPandas](PandasCogroupedOps.md#applyInPandas) 23 | 24 | ## flatMapGroupsInPandas { #flatMapGroupsInPandas } 25 | 26 | ```scala 27 | flatMapGroupsInPandas( 28 | expr: PythonUDF): DataFrame 29 | ``` 30 | 31 | `flatMapGroupsInPandas` creates a `DataFrame` with a [FlatMapGroupsInPandas](FlatMapGroupsInPandas.md) logical operator (to execute the given [PythonUDF](PythonUDF.md)). 32 | 33 | --- 34 | 35 | `flatMapGroupsInPandas` asserts that the input [PythonUDF](PythonUDF.md) is a grouped map udf (the [eval type](PythonUDF.md#evalType) is [SQL_GROUPED_MAP_PANDAS_UDF](PythonEvalType.md#SQL_GROUPED_MAP_PANDAS_UDF)). 36 | 37 | `flatMapGroupsInPandas` asserts that the [return type](PythonUDF.md#dataType) of the input [PythonUDF](PythonUDF.md) is `StructType`. 38 | 39 | --- 40 | 41 | `flatMapGroupsInPandas` is used when: 42 | 43 | * `PandasGroupedOpsMixin` is requested to [applyInPandas](PandasGroupedOpsMixin.md#applyInPandas) 44 | -------------------------------------------------------------------------------- /docs/sql/SQLContext.md: -------------------------------------------------------------------------------- 1 | # SQLContext 2 | 3 | `SQLContext` is...FIXME 4 | -------------------------------------------------------------------------------- /docs/sql/SparkConversionMixin.md: -------------------------------------------------------------------------------- 1 | # SparkConversionMixin 2 | 3 | `SparkConversionMixin` is a Python mixin for [SparkSession](../pyspark/sql/SparkSession.md) class. 4 | -------------------------------------------------------------------------------- /docs/sql/UDFRegistration.md: -------------------------------------------------------------------------------- 1 | # UDFRegistration 2 | 3 | `UDFRegistration` is a Python class in [pyspark.sql.udf](../pyspark/sql/udf.md) module. 4 | 5 | ## Registering Python UDF { #register } 6 | 7 | ```python 8 | register( 9 | self, 10 | name: str, 11 | f: Union[Callable[..., Any], "UserDefinedFunctionLike"], 12 | returnType: Optional[Union[pyspark.sql.types.DataType, str]] = None, 13 | ) -> "UserDefinedFunctionLike" 14 | ``` 15 | 16 | `register` registers a Python function (incl. lambda function) or a user-defined function as a SQL function (under the given `name`). 17 | 18 | Function `f` | Description 19 | -------------|------------ 20 | A Python function |
• Includes lambda (_unnamed_) functions<br>• `Callable[..., Any]`<br>• The return type is `StringType` when not specified<br>• Always `PythonEvalType.SQL_BATCHED_UDF` 21 | `pyspark.sql.functions.udf` | • _row-at-a-time_<br>• `UserDefinedFunctionLike` 22 | `pyspark.sql.functions.pandas_udf` | • _vectorized_<br>• `UserDefinedFunctionLike`
23 | 24 | The `evalType` of a user-defined function can be one of the following: 25 | 26 | * [SQL_BATCHED_UDF](../sql/PythonEvalType.md#SQL_BATCHED_UDF) 27 | * [SQL_SCALAR_PANDAS_UDF](../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_UDF) 28 | * [SQL_SCALAR_PANDAS_ITER_UDF](../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_ITER_UDF) 29 | * [SQL_GROUPED_AGG_PANDAS_UDF](../sql/PythonEvalType.md#SQL_GROUPED_AGG_PANDAS_UDF) 30 | 31 | --- 32 | 33 | `register` [creates a user-defined function](#_create_udf) (`_create_udf`) and requests the `_jsparkSession` for the `UDFRegistration` ([Spark SQL]({{ book.spark_sql }}/user-defined-functions/UDFRegistration/)) to `registerPython` ([Spark SQL]({{ book.spark_sql }}/user-defined-functions/UDFRegistration/#registerPython)). 34 | 35 | ```python 36 | from pyspark.sql.functions import call_udf, col 37 | from pyspark.sql.types import IntegerType, StringType 38 | 39 | rows = [(1, "a"), (2, "b"), (3, "c")] 40 | columns = ["id", "name"] 41 | df = spark.createDataFrame(rows, columns) 42 | 43 | spark.udf.register("intX2", lambda i: i * 2, IntegerType()) 44 | df.select(call_udf("intX2", "id")).show() 45 | ``` 46 | -------------------------------------------------------------------------------- /docs/sql/UserDefinedPythonFunction.md: -------------------------------------------------------------------------------- 1 | # UserDefinedPythonFunction 2 | 3 | ## Creating Instance 4 | 5 | `UserDefinedPythonFunction` takes the following to be created: 6 | 7 | * Name 8 | * `PythonFunction` 9 | * `DataType` ([Spark SQL]({{ book.spark_sql }}/DataType)) 10 | * Python Eval Type 11 | * `udfDeterministic` flag 12 | 13 | `UserDefinedPythonFunction` is created when: 14 | 15 | * `SparkConnectPlanner` ([Spark Connect](../connect/index.md)) is requested to `handleRegisterPythonUDF` 16 | * `UserDefinedFunction` ([pyspark/sql/udf.py](../pyspark/sql/udf.md)) is requested to [_create_judf](../pyspark/sql/UserDefinedFunction.md#_create_judf) 17 | 18 | ## Creating PythonUDF 19 | 20 | ```scala 21 | builder( 22 |   e: Seq[Expression]): Expression 23 | ``` 24 | 25 | `builder` creates a [PythonUDF](PythonUDF.md) (for all the [arguments](#creating-instance) and the given children expressions). 26 | 27 | --- 28 | 29 | `builder` is used when: 30 | 31 | * `UDFRegistration` is requested to register a Python UDF ([Spark SQL]({{ book.spark_sql }}/UDFRegistration#registerPython)) 32 | * `UserDefinedPythonFunction` is requested to [apply](#apply) 33 | 34 | ## Applying PythonUDF 35 | 36 | ```scala 37 | apply( 38 |   exprs: Column*): Column 39 | ``` 40 | 41 | `apply` [creates a PythonUDF](#builder) (for the input `Column` ([Spark SQL]({{ book.spark_sql }}/Column)) expressions) and wraps it up into a `Column`. 42 | 43 | --- 44 | 45 | `apply` is used when: 46 | 47 | * `UDFRegistration` is requested to register a Python UDF ([Spark SQL]({{ book.spark_sql }}/UDFRegistration#registerPython)) 48 | * `UserDefinedPythonFunction` is requested to [apply](#apply) 49 | -------------------------------------------------------------------------------- /docs/sql/index.md: -------------------------------------------------------------------------------- 1 | # PySpark SQL 2 | 3 | **PySpark SQL** is a Python module to work with [Spark SQL]({{ book.spark_sql }}).
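Before going over the classes that PySpark SQL exposes, a minimal end-to-end sketch (the session name and data are illustrative only):

```py
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("pyspark-sql-demo").getOrCreate()

df = spark.createDataFrame([(1, "a"), (2, "b")], ("id", "name"))
df.createOrReplaceTempView("demo")

spark.sql("SELECT id, upper(name) AS name FROM demo").show()
```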
4 | 5 | ```py 6 | from pyspark.sql import * 7 | ``` 8 | 9 | The above `*` import imports the following classes: 10 | 11 | * `SparkSession` 12 | * `SQLContext` 13 | * `HiveContext` 14 | * `UDFRegistration` 15 | * `DataFrame` 16 | * `GroupedData` 17 | * `Column` 18 | * `Catalog` 19 | * [Observation](Observation.md) 20 | * `Row` 21 | * `DataFrameNaFunctions` 22 | * `DataFrameStatFunctions` 23 | * `Window` 24 | * `WindowSpec` 25 | * `DataFrameReader` 26 | * `DataFrameWriter` 27 | * `DataFrameWriterV2` 28 | * `PandasCogroupedOps` 29 | -------------------------------------------------------------------------------- /docs/tags.md: -------------------------------------------------------------------------------- 1 | # APIs 2 | 3 | [TAGS] 4 | -------------------------------------------------------------------------------- /docs/udts/.pages: -------------------------------------------------------------------------------- 1 | title: User-Defined Table Functions (UDTFs) 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/udts/index.md: -------------------------------------------------------------------------------- 1 | # User-Defined Table Functions (UDTFs) 2 | 3 | **User-Defined Table Functions (UDTFs)** are user-defined functions that...FIXME 4 | 5 | ```py 6 | from pyspark.sql.functions import udtf 7 | from pyspark.sql import Row 8 | 9 | @udtf(returnType="a: int") 10 | class TestUDTF: 11 |     def eval(self, row: Row): 12 |         if row[0] > 5: 13 |             yield (row[0],) 14 | 15 |     def terminate(self): 16 |         """ 17 |         This method is optional, but 18 |         there's a bug in 3.5.4 that makes terminate required 19 |         https://issues.apache.org/jira/browse/SPARK-50674 20 |         """ 21 |         pass 22 | ``` 23 | 24 | ```py 25 | spark.udtf.register("test_udtf", TestUDTF) 26 | ``` 27 | 28 | ```py 29 | spark.sql("SELECT * FROM test_udtf(TABLE(SELECT * FROM range(0, 8)) PARTITION BY id)").show() 30 | ``` 31 | -------------------------------------------------------------------------------- /graffles/PythonRunner.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/japila-books/pyspark-internals/824e54fbf19ddb39ba9ec1a50ca65d7629470cf6/graffles/PythonRunner.graffle -------------------------------------------------------------------------------- /graffles/PythonWorkerFactory.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/japila-books/pyspark-internals/824e54fbf19ddb39ba9ec1a50ca65d7629470cf6/graffles/PythonWorkerFactory.graffle -------------------------------------------------------------------------------- /graffles/SparkContext.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/japila-books/pyspark-internals/824e54fbf19ddb39ba9ec1a50ca65d7629470cf6/graffles/SparkContext.graffle -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: The Internals of PySpark 2 | site_url: https://books.japila.pl/pyspark-internals/ 3 | site_author: Jacek Laskowski 4 | site_description: Demystifying inner-workings of PySpark 5 | 6 | repo_name: pyspark-internals 7 | repo_url: https://github.com/japila-books/pyspark-internals 8 | edit_uri: edit/main/docs/ 9 | 10 | copyright: Copyright © 2024 Jacek Laskowski 11 | 12 | theme: 13 | name: material 14 | language: en 15 | icon:
16 | logo: material/book-open-page-variant 17 | repo: fontawesome/brands/github 18 | tag: 19 | python: fontawesome/brands/python 20 | scala: simple/scala 21 | features: 22 | # https://squidfunk.github.io/mkdocs-material/reference/code-blocks/#adding-annotations 23 | - content.code.annotate 24 | # https://squidfunk.github.io/mkdocs-material/upgrade/#contentcodecopy 25 | - content.code.copy 26 | - content.tooltips 27 | - navigation.indexes 28 | - navigation.instant 29 | # https://squidfunk.github.io/mkdocs-material/setup/setting-up-navigation/#navigation-path 30 | - navigation.path 31 | - navigation.tabs 32 | - navigation.tabs.sticky 33 | - navigation.top 34 | - navigation.tracking 35 | - search.highlight 36 | - search.share 37 | - search.suggest 38 | palette: 39 | - scheme: default 40 | primary: indigo 41 | accent: indigo 42 | toggle: 43 | icon: material/toggle-switch-off-outline 44 | name: Switch to dark mode 45 | - scheme: slate 46 | primary: blue 47 | accent: blue 48 | toggle: 49 | icon: material/toggle-switch 50 | name: Switch to light mode 51 | 52 | markdown_extensions: 53 | - admonition 54 | - attr_list 55 | - footnotes 56 | - md_in_html 57 | - toc: 58 | permalink: true 59 | - pymdownx.arithmatex 60 | - pymdownx.betterem: 61 | smart_enable: all 62 | - pymdownx.caret 63 | - pymdownx.critic 64 | - pymdownx.details 65 | - pymdownx.emoji 66 | - pymdownx.inlinehilite 67 | - pymdownx.magiclink 68 | - pymdownx.mark 69 | - pymdownx.smartsymbols 70 | - pymdownx.superfences 71 | - pymdownx.tasklist: 72 | custom_checkbox: true 73 | - pymdownx.tabbed: 74 | alternate_style: true 75 | - pymdownx.tilde 76 | 77 | plugins: 78 | - search 79 | - minify: 80 | minify_html: true 81 | - awesome-pages 82 | - macros 83 | # https://squidfunk.github.io/mkdocs-material/reference/#built-in-meta-plugin 84 | - meta 85 | # https://squidfunk.github.io/mkdocs-material/setup/setting-up-tags/ 86 | - tags: 87 | # enabled: !ENV [CI, false] 88 | tags_file: tags.md 89 | tags_extra_files: 90 | python-api.md: 91 | - python 92 | scala-api.md: 93 | - scala 94 | # https://squidfunk.github.io/mkdocs-material/reference/#built-in-typeset-plugin 95 | - typeset 96 | 97 | extra: 98 | arrow: 99 | docs: https://arrow.apache.org/docs 100 | home: https://arrow.apache.org/ 101 | book: 102 | title: PySpark 103 | spark_core: https://books.japila.pl/apache-spark-internals 104 | spark_sql: https://books.japila.pl/spark-sql-internals 105 | spark_k8s: https://jaceklaskowski.github.io/spark-kubernetes-book 106 | java: 107 | api: https://docs.oracle.com/en/java/javase/17/docs/api/java.base 108 | pandas: 109 | version: 2.2.0 110 | api: https://pandas.pydata.org/docs/reference/api 111 | home: https://pandas.pydata.org/ 112 | pyarrow: 113 | version: 17.0.0 114 | py4j: 115 | version: 0.10.9 116 | doc: https://www.py4j.org 117 | docs: https://www.py4j.org 118 | javadoc: https://www.py4j.org/_static/javadoc 119 | python: 120 | version: 3.11 121 | docs: https://docs.python.org/3 122 | api: https://docs.python.org/3 123 | peps: https://peps.python.org 124 | realpython: https://realpython.com 125 | pytorch: 126 | docs: https://pytorch.org/docs/stable 127 | github: https://github.com/pytorch/pytorch 128 | tutorials: https://pytorch.org/tutorials 129 | scala: 130 | api: https://www.scala-lang.org/api/2.13.8 131 | social: 132 | - icon: fontawesome/brands/github 133 | link: https://github.com/jaceklaskowski 134 | - icon: fontawesome/brands/twitter 135 | link: https://twitter.com/jaceklaskowski 136 | - icon: fontawesome/brands/linkedin 137 | link: 
https://linkedin.com/in/jaceklaskowski 138 | - icon: fontawesome/brands/medium 139 | link: https://jaceklaskowski.medium.com 140 | - icon: fontawesome/brands/mastodon 141 | link: https://fosstodon.org/@jaceklaskowski 142 | spark: 143 | version: 3.5.4 144 | github: https://github.com/apache/spark/tree/v3.5.4 145 | jira: https://issues.apache.org/jira/browse 146 | # https://squidfunk.github.io/mkdocs-material/setup/setting-up-tags/#tag-icons-and-identifiers 147 | tags: 148 | Python: python 149 | Scala: scala 150 | 151 | nav: 152 | - index.md 153 | - Features: 154 | - features/index.md 155 | - ... | arrow-optimization/**.md 156 | - ... | configuration-properties/**.md 157 | - environment-variables.md 158 | - ... | pytorch-distributed/**.md 159 | - ... | pandas-on-spark/**.md 160 | - ... | pandas-udafs/**.md 161 | - ... | pandas-udfs/**.md 162 | - PySpark API: 163 | - tags.md 164 | - python-api.md 165 | - scala-api.md 166 | - ... | udts/**.md 167 | - ... | connect/**.md 168 | - ... | ml/**.md 169 | - ... | sql/**.md 170 | - Internals: 171 | - Setup: Setup.md 172 | - Building from Sources: building-from-sources.md 173 | - PythonRunner: PythonRunner.md 174 | - PythonGatewayServer: PythonGatewayServer.md 175 | - Py4JServer: Py4JServer.md 176 | - SparkConf: SparkConf.md 177 | - SparkContext: SparkContext.md 178 | - PythonWorkerFactory: PythonWorkerFactory.md 179 | - MonitorThread: MonitorThread.md 180 | - PythonFunction: PythonFunction.md 181 | - PythonRDD: PythonRDD.md 182 | - PythonForeachWriter: PythonForeachWriter.md 183 | - PythonAccumulatorV2: PythonAccumulatorV2.md 184 | - PythonBroadcast: PythonBroadcast.md 185 | - PythonUtils: PythonUtils.md 186 | - RDD: RDD.md 187 | - SimplePythonFunction: SimplePythonFunction.md 188 | - SocketAuthServer: SocketAuthServer.md 189 | - SocketFuncServer: SocketFuncServer.md 190 | - SocketAuthHelper: SocketAuthHelper.md 191 | - SparkEnv: SparkEnv.md 192 | - logging.md 193 | - Modules: 194 | - pyspark: 195 | - pyspark/index.md 196 | - daemon.py: pyspark/daemon.md 197 | - java_gateway.py: pyspark/java_gateway.md 198 | - rdd.py: pyspark/rdd.md 199 | - shell.py: pyspark/shell.md 200 | - worker.py: pyspark/worker.md 201 | - pyspark.pandas: 202 | - pyspark/pandas/index.md 203 | - pyspark/pandas/DataFrame.md 204 | - pyspark/pandas/InternalFrame.md 205 | - pyspark.pandas.generic: 206 | - pyspark/pandas/generic/index.md 207 | - pyspark/pandas/generic/Frame.md 208 | - pyspark.sql: 209 | - ... | flat | pyspark/sql/**.md 210 | - pyspark.sql.pandas: 211 | - pyspark/sql/pandas/index.md 212 | - functions.py: pyspark/sql/pandas/functions.md 213 | - pyspark/sql/pandas/PandasUDFType.md 214 | - ... | runners/**.md 215 | - ... | demo/**.md 216 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://${GH_TOKEN}@github.com/squidfunk/mkdocs-material-insiders.git 2 | mkdocs-minify-plugin>=0.3.0 3 | mkdocs-git-revision-date-localized-plugin>=0.8 4 | mkdocs-git-revision-date-plugin>=0.3.1 5 | mkdocs-awesome-pages-plugin>=2.5.0 6 | mkdocs-redirects>=1.0.1 7 | mkdocs-macros-plugin>=0.5.0 8 | --------------------------------------------------------------------------------