├── .github ├── FUNDING.yml └── workflows │ └── deploying-to-github-pages.yml ├── .gitignore ├── LICENSE ├── README.md ├── docs ├── MonitorThread.md ├── Py4JServer.md ├── PythonAccumulatorV2.md ├── PythonBroadcast.md ├── PythonForeachWriter.md ├── PythonFunction.md ├── PythonGatewayServer.md ├── PythonRDD.md ├── PythonRunner.md ├── PythonUtils.md ├── PythonWorkerFactory.md ├── RDD.md ├── Setup.md ├── SimplePythonFunction.md ├── SocketAuthHelper.md ├── SocketAuthServer.md ├── SocketFuncServer.md ├── SparkConf.md ├── SparkContext.md ├── SparkEnv.md ├── arrow-optimization │ ├── .pages │ └── index.md ├── building-from-sources.md ├── configuration-properties │ ├── .pages │ ├── index.md │ ├── spark.md │ ├── spark.pyspark.md │ ├── spark.python.md │ └── spark.sql.execution.md ├── connect │ ├── .pages │ └── index.md ├── demo │ ├── .pages │ ├── executing-pyspark-applications-using-spark-submit.md │ ├── index.md │ └── running-pyspark-application-on-minikube.md ├── environment-variables.md ├── features │ └── index.md ├── images │ ├── PythonRunner.png │ ├── PythonWorkerFactory.png │ └── SparkContext.png ├── index.md ├── logging.md ├── ml │ ├── .pages │ ├── Distributor.md │ └── index.md ├── pandas-on-spark │ ├── .pages │ └── index.md ├── pandas-udafs │ ├── .pages │ └── index.md ├── pandas-udfs │ ├── .pages │ └── index.md ├── pyspark │ ├── daemon.md │ ├── index.md │ ├── java_gateway.md │ ├── pandas │ │ ├── DataFrame.md │ │ ├── InternalFrame.md │ │ ├── generic │ │ │ ├── Frame.md │ │ │ └── index.md │ │ └── index.md │ ├── rdd.md │ ├── shell.md │ ├── sql │ │ ├── .pages │ │ ├── SparkSession.Builder.md │ │ ├── SparkSession.md │ │ ├── UserDefinedFunction.md │ │ ├── dataframe.md │ │ ├── functions.md │ │ ├── group.md │ │ ├── index.md │ │ ├── pandas │ │ │ ├── PandasUDFType.md │ │ │ ├── functions.md │ │ │ └── index.md │ │ ├── session.md │ │ └── udf.md │ └── worker.md ├── python-api.md ├── pytorch-distributed │ ├── .pages │ ├── TorchDistributor.md │ ├── index.md │ └── torch_run_process_wrapper.md ├── runners │ ├── .pages │ ├── ArrowPythonRunner.md │ ├── BasePythonRunner.md │ ├── BasicPythonArrowOutput.md │ ├── PythonArrowOutput.md │ ├── PythonRunner.md │ ├── PythonUDFRunner.md │ └── ReaderIterator.md ├── scala-api.md ├── sql │ ├── .pages │ ├── AggregateInPandasExec.md │ ├── ArrowEvalPython.md │ ├── ArrowEvalPythonExec.md │ ├── BaseEvalPython.md │ ├── DataFrame.md │ ├── EvalPythonExec.md │ ├── FlatMapGroupsInPandas.md │ ├── FlatMapGroupsInPandasExec.md │ ├── GroupedData.md │ ├── Observation.md │ ├── PandasCogroupedOps.md │ ├── PandasConversionMixin.md │ ├── PandasGroupUtils.md │ ├── PandasGroupedOpsMixin.md │ ├── PandasMapOpsMixin.md │ ├── PythonEvalType.md │ ├── PythonSQLMetrics.md │ ├── PythonUDF.md │ ├── RelationalGroupedDataset.md │ ├── SQLContext.md │ ├── SparkConversionMixin.md │ ├── UDFRegistration.md │ ├── UserDefinedPythonFunction.md │ └── index.md ├── tags.md └── udts │ ├── .pages │ └── index.md ├── graffles ├── PythonRunner.graffle ├── PythonWorkerFactory.graffle └── SparkContext.graffle ├── mkdocs.yml └── requirements.txt /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: jaceklaskowski 2 | ko_fi: jaceklaskowski 3 | custom: "https://paypal.me/JacekLaskowski" 4 | -------------------------------------------------------------------------------- /.github/workflows/deploying-to-github-pages.yml: -------------------------------------------------------------------------------- 1 | # Based on 
https://github.com/squidfunk/mkdocs-material/blob/master/.github/workflows/ci.yml 2 | 3 | name: Deploying to GitHub Pages 4 | on: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | deploy: 11 | if: github.event.pull_request.head.repo.fork == false 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v3 15 | with: 16 | fetch-depth: 0 17 | - uses: actions/setup-python@v3 18 | with: 19 | python-version: 3.x 20 | - name: Install dependencies 21 | env: 22 | GH_TOKEN: ${{ secrets.GH_TOKEN }} 23 | run: | 24 | pip install -r requirements.txt 25 | - name: Build documentation 26 | # env: 27 | # GOOGLE_ANALYTICS_KEY: ${{ secrets.GOOGLE_ANALYTICS_KEY }} 28 | run: | 29 | mkdocs gh-deploy --force 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | site/ 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # The Internals of PySpark Online Book 2 | 3 | [![GitHub Pages](https://github.com/japila-books/pyspark-internals/actions/workflows/deploying-to-github-pages.yml/badge.svg)](https://github.com/japila-books/pyspark-internals/actions) 4 | 5 | The project contains the sources of [The Internals of PySpark](https://books.japila.pl/pyspark-internals) online book. 
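The book is built with MkDocs (see `mkdocs.yml` and `requirements.txt`). A typical local preview workflow is sketched below (assuming Python 3 with `pip` on `PATH`; the exact plugin set comes from `requirements.txt`):

```text
pip install -r requirements.txt
mkdocs serve
```

`mkdocs serve` starts a local development server with live reload (by default at http://127.0.0.1:8000), while the CI workflow uses `mkdocs gh-deploy` to publish to GitHub Pages.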
6 | -------------------------------------------------------------------------------- /docs/MonitorThread.md: -------------------------------------------------------------------------------- 1 | # MonitorThread 2 | 3 | `MonitorThread` is...FIXME 4 | -------------------------------------------------------------------------------- /docs/Py4JServer.md: -------------------------------------------------------------------------------- 1 | # Py4JServer 2 | 3 | `Py4JServer` is a gateway server between Python and Java Virtual Machine (JVM) using [Py4J]({{ py4j.doc }}). 4 | 5 | `Py4JServer` is a wrapper for a [py4j Server](#server). 6 | 7 | ## Creating Instance 8 | 9 | `Py4JServer` takes the following to be created: 10 | 11 | * `SparkConf` ([Spark Core]({{ book.spark_core }}/SparkConf)) 12 | 13 | `Py4JServer` is created when: 14 | 15 | * [PythonGatewayServer](PythonGatewayServer.md) command-line application is started 16 | * [PythonRunner](PythonRunner.md) command-line application is started 17 | 18 | ## py4j Server 19 | 20 | `Py4JServer` creates a `ClientServer` ([py4j]({{ py4j.javadoc }}/py4j/ClientServer.html)) or `GatewayServer` ([py4j]({{ py4j.javadoc }}/py4j/GatewayServer.html)) based on [PYSPARK_PIN_THREAD](environment-variables.md#PYSPARK_PIN_THREAD) environment variable. 21 | 22 | ## Connection Secret 23 | 24 | ```scala 25 | secret: String 26 | ``` 27 | 28 | `Py4JServer` creates a connection secret for a secure communication. 29 | 30 | ## start 31 | 32 | ```scala 33 | start(): Unit 34 | ``` 35 | 36 | `start` requests the [py4j Server](#server) to start. 37 | 38 | ## getListeningPort 39 | 40 | ```scala 41 | getListeningPort: Int 42 | ``` 43 | 44 | `getListeningPort` requests the [py4j Server](#server) for the listening port. 45 | -------------------------------------------------------------------------------- /docs/PythonAccumulatorV2.md: -------------------------------------------------------------------------------- 1 | # PythonAccumulatorV2 2 | 3 | `PythonAccumulatorV2` is...FIXME -------------------------------------------------------------------------------- /docs/PythonBroadcast.md: -------------------------------------------------------------------------------- 1 | # PythonBroadcast 2 | 3 | `PythonBroadcast` is...FIXME -------------------------------------------------------------------------------- /docs/PythonForeachWriter.md: -------------------------------------------------------------------------------- 1 | # PythonForeachWriter 2 | 3 | `PythonForeachWriter` is...FIXME 4 | -------------------------------------------------------------------------------- /docs/PythonFunction.md: -------------------------------------------------------------------------------- 1 | --- 2 | tags: 3 | - Scala 4 | --- 5 | 6 | # PythonFunction 7 | 8 | `PythonFunction` is an [abstraction](#contract) of the [metadata](#implementations) of a [Python function](sql/PythonUDF.md#func) of a [PythonUDF](sql/PythonUDF.md). 9 | 10 | `PythonFunction` is executed in a [BasePythonRunner](runners/BasePythonRunner.md). 11 | 12 | `PythonFunction` is used to create the following: 13 | 14 | * [PythonRDD](PythonRDD.md#func) 15 | * [PythonRunner](PythonRunner.md#apply) 16 | * [PythonForeachWriter](PythonForeachWriter.md#func) 17 | * [UserDefinedPythonFunction](sql/UserDefinedPythonFunction.md#func) 18 | 19 | !!! note "ChainedPythonFunctions" 20 | `ChainedPythonFunctions` is a collection of chained `PythonFunction`s. 
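From the Python side, a `PythonFunction` carries what a plain Python callable becomes once it is pickled and handed over to the JVM. As a quick illustration (a sketch that only shows the user-facing API, not the wrapping itself), registering and using a Python UDF is one of the code paths that ends up creating a `PythonFunction`:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

spark = SparkSession.builder.appName("python-function-demo").getOrCreate()

# The pickled body of plus_one (together with its environment, includes,
# Python executable and version) travels to the JVM as the function's metadata.
@udf("int")
def plus_one(x):
    return x + 1

spark.range(3).select(plus_one("id")).show()
```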
21 | 22 | ## Contract (Subset) 23 | 24 | ### accumulator 25 | 26 | ```scala 27 | accumulator: PythonAccumulatorV2 28 | ``` 29 | 30 | [PythonAccumulatorV2](PythonAccumulatorV2.md) 31 | 32 | Used when: 33 | 34 | * `BasePythonRunner` is [created](runners/BasePythonRunner.md#accumulator) 35 | 36 | ### broadcastVars { #broadcastVars } 37 | 38 | ```scala 39 | broadcastVars: JList[Broadcast[PythonBroadcast]] 40 | ``` 41 | 42 | A collection of broadcast variables ([Spark Core]({{ book.spark_core }}/broadcast-variables/Broadcast)) with a [PythonBroadcast](PythonBroadcast.md) 43 | 44 | Used when: 45 | 46 | * `WriterThread` is created 47 | 48 | ### command 49 | 50 | ```scala 51 | command: Seq[Byte] 52 | ``` 53 | 54 | Used when: 55 | 56 | * `PythonRunner` is requested to [newWriterThread](PythonRunner.md#newWriterThread) 57 | * `UDFRegistration` is requested to [register a Python UDF](sql/UDFRegistration.md#registerPython) (for logging purposes only) 58 | * `PythonUDFRunner` is requested to [writeUDFs](runners/PythonUDFRunner.md#writeUDFs) 59 | 60 | ## Implementations 61 | 62 | * [SimplePythonFunction](SimplePythonFunction.md) 63 | -------------------------------------------------------------------------------- /docs/PythonGatewayServer.md: -------------------------------------------------------------------------------- 1 | # PythonGatewayServer 2 | 3 | `PythonGatewayServer` is a [command-line application](#main) (_process_) that starts a [Py4JServer](Py4JServer.md) on an ephemeral port. 4 | 5 | `PythonGatewayServer` is the Python runner for `pyspark` shell script ([Spark Core]({{ book.spark_core }}/tools/SparkSubmit#PYSPARK_SHELL)). 6 | 7 | ## main 8 | 9 | `main` creates a [Py4JServer](Py4JServer.md) and requests it to [start](Py4JServer.md#start). 10 | 11 | `main` requests the `Py4JServer` for the [listening port](Py4JServer.md#getListeningPort) (_boundPort_) and prints out the following DEBUG message to the logs: 12 | 13 | ```text 14 | Started PythonGatewayServer on port [boundPort] 15 | ``` 16 | 17 | 18 | `main` uses [_PYSPARK_DRIVER_CONN_INFO_PATH](#_PYSPARK_DRIVER_CONN_INFO_PATH) environment variable for the path of a connection info file (for the associated python process) with the listening port and the [secret](Py4JServer.md#secret). 19 | 20 | `main` pauses (_blocks_) until the Python driver finishes (by reading from the system input that blocks until input data is available, the end of the stream is detected, or an exception is thrown). 21 | 22 | In the end, once the Python driver finishes, `main` prints out the following DEBUG message to the logs: 23 | 24 | ```text 25 | Exiting due to broken pipe from Python driver 26 | ``` 27 | 28 | `main` prints out the following ERROR message to the logs and exists when the [listening port](Py4JServer.md#getListeningPort) is `-1`: 29 | 30 | ```text 31 | [server] failed to bind; exiting 32 | ``` 33 | 34 | ## _PYSPARK_DRIVER_CONN_INFO_PATH 35 | 36 | `PythonGatewayServer` uses `_PYSPARK_DRIVER_CONN_INFO_PATH` environment variable for the [path of a connection info file](#main-_PYSPARK_DRIVER_CONN_INFO_PATH) for communication between this and the Python processes. 37 | 38 | `_PYSPARK_DRIVER_CONN_INFO_PATH` is configured when [java_gateway.py](pyspark/java_gateway.md) module is requested to [launch_gateway](pyspark/java_gateway.md#launch_gateway). 39 | 40 | ## Logging 41 | 42 | Enable `ALL` logging level for `org.apache.spark.api.python.PythonGatewayServer` logger to see what happens inside. 
43 | 44 | Add the following line to `conf/log4j2.properties`: 45 | 46 | ```text 47 | logger.PythonGatewayServer.name = org.apache.spark.api.python.PythonGatewayServer 48 | logger.PythonGatewayServer.level = all 49 | ``` 50 | 51 | Refer to [Logging](logging.md). 52 | -------------------------------------------------------------------------------- /docs/PythonRDD.md: -------------------------------------------------------------------------------- 1 | # PythonRDD 2 | 3 | `PythonRDD` is an `RDD` (`RDD[Array[Byte]]`) that uses [PythonRunner](runners/PythonRunner.md) (to [compute a partition](#compute)). 4 | 5 | ## Creating Instance 6 | 7 | `PythonRDD` takes the following to be created: 8 | 9 | * Parent `RDD` 10 | * [PythonFunction](PythonFunction.md) 11 | * `preservePartitoning` flag 12 | * `isFromBarrier` flag (default: `false`) 13 | 14 | `PythonRDD` is created when...FIXME 15 | 16 | ## runJob 17 | 18 | ```scala 19 | runJob( 20 | sc: SparkContext, 21 | rdd: JavaRDD[Array[Byte]], 22 | partitions: JArrayList[Int]): Array[Any] 23 | ``` 24 | 25 | `runJob`...FIXME 26 | 27 | ## collectAndServe 28 | 29 | ```scala 30 | collectAndServe[T]( 31 | rdd: RDD[T]): Array[Any] 32 | ``` 33 | 34 | `collectAndServe`...FIXME 35 | 36 | ## collectAndServeWithJobGroup 37 | 38 | ```scala 39 | collectAndServeWithJobGroup[T]( 40 | rdd: RDD[T], 41 | groupId: String, 42 | description: String, 43 | interruptOnCancel: Boolean): Array[Any] 44 | ``` 45 | 46 | `collectAndServeWithJobGroup`...FIXME 47 | 48 | ## serveIterator Utility 49 | 50 | ```scala 51 | serveIterator( 52 | items: Iterator[_], 53 | threadName: String): Array[Any] 54 | ``` 55 | 56 | `serveIterator` [serveToStream](#serveToStream) with a writer function that...FIXME 57 | 58 | `serveIterator` is used when: 59 | 60 | * `PythonRDD` utility is used to [runJob](#runJob), [collectAndServe](#collectAndServe) and [collectAndServeWithJobGroup](#collectAndServeWithJobGroup) 61 | * `Dataset` is requested to `collectToPython`, `tailToPython`, `getRowsToPython` 62 | 63 | ## serveToStream Utility 64 | 65 | ```scala 66 | serveToStream( 67 | threadName: String)( 68 | writeFunc: OutputStream => Unit): Array[Any] 69 | ``` 70 | 71 | `serveToStream` [serveToStream](SocketAuthServer.md#serveToStream) with the [authHelper](#authHelper) and the input arguments. 72 | 73 | `serveToStream` is used when: 74 | 75 | * `PythonRDD` utility is used to [serveIterator](#serveIterator) 76 | * `Dataset` is requested to `collectAsArrowToPython` 77 | 78 | ## SocketAuthHelper 79 | 80 | `PythonRDD` uses a [SocketAuthHelper](SocketAuthHelper.md). 81 | -------------------------------------------------------------------------------- /docs/PythonRunner.md: -------------------------------------------------------------------------------- 1 | # PythonRunner 2 | 3 | `PythonRunner` is a [command-line application](#main) to launch a separate process to run a Python application (alongside the JVM process of `PythonRunner` with Apache Spark services). 4 | 5 |
6 | ![PythonRunner and Python Process](images/PythonRunner.png) 7 |
8 | 9 | `PythonRunner` can be launched using `spark-submit` shell script ([Spark Core]({{ book.spark_core }}/tools/spark-submit/)). 10 | 11 | `PythonRunner` executes the [Python executable](#pythonExec) (with the PySpark application and arguments) as a subprocess that is expected to connect back to the JVM to access Spark services. 12 | 13 | ??? note "Uh-oh, there are two PythonRunners 🙄" 14 | This page is about `org.apache.spark.deploy.PythonRunner` while there is another [PythonRunner](runners/PythonRunner.md). 15 | 16 | ## Arguments 17 | 18 | `PythonRunner` accepts the following command-line arguments (in that order): 19 | 20 | 1. Main python file (`pythonFile`) 21 | 1. Extra python files (`pyFiles`) 22 | 1. PySpark application arguments, if any 23 | 24 | ## Python Executable { #pythonExec } 25 | 26 | `PythonRunner` determines the Python executable to launch a PySpark application with based on the following (in the order of precedence): 27 | 28 | 1. [spark.pyspark.driver.python](configuration-properties/index.md#spark.pyspark.driver.python) configuration property 29 | 1. [spark.pyspark.python](configuration-properties/index.md#spark.pyspark.python) configuration property 30 | 1. [PYSPARK_DRIVER_PYTHON](environment-variables.md#PYSPARK_DRIVER_PYTHON) environment variable 31 | 1. [PYSPARK_PYTHON](environment-variables.md#PYSPARK_PYTHON) environment variable 32 | 1. `python3` 33 | 34 | ## Environment Variables 35 | 36 | `PythonRunner` defines the following environment variables to configure the PySpark application's execution environment. 37 | 38 | Environment Variable | Value 39 | ---------------------|--------- 40 | `PYTHONPATH` | Comma-separated list of local paths with formatted `pyFiles` and [sparkPythonPath](PythonUtils.md#sparkPythonPath), followed by the existing `PYTHONPATH` 41 | `PYTHONUNBUFFERED` | `YES` 42 | [PYSPARK_GATEWAY_PORT](environment-variables.md#PYSPARK_GATEWAY_PORT) | The [listening port](Py4JServer.md#getListeningPort) of the started `Py4JServer` 43 | [PYSPARK_GATEWAY_SECRET](environment-variables.md#PYSPARK_GATEWAY_SECRET) | The [secret](Py4JServer.md#secret) of the started `Py4JServer` 44 | `PYSPARK_PYTHON` | [spark.pyspark.python](configuration-properties/index.md#spark.pyspark.python) if defined 45 | `PYTHONHASHSEED` | `PYTHONHASHSEED` env var if defined 46 | `OMP_NUM_THREADS` | `spark.driver.cores` (unless defined for Spark on k8s, YARN and Mesos) 47 | `SPARK_REMOTE` | `spark.remote` if defined 48 | 49 | ## Launching Application { #main } 50 | 51 | ```scala 52 | main( 53 | args: Array[String]): Unit 54 | ``` 55 | 56 | `main` takes the [arguments](#arguments) (from the given `args`). 57 | 58 | `main` determines the [Python executable](#pythonExec) to launch the PySpark application (based on configuration properties and environment variables). 59 | 60 | `main` creates a [Py4JServer](Py4JServer.md) that is immediately [started](Py4JServer.md#start) (on a daemon **py4j-gateway-init** thread). `main` waits until the `Py4JServer` has started. 61 | 62 | `main` starts a Python process using the [Python executable](#pythonExec) and the [environment variables](#environment-variables). 63 | 64 | `main` pauses itself and waits for the Python process to finish. Once it happens, `main` requests the `Py4JServer` to [shutdown](Py4JServer.md#shutdown). 
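A quick way to see this execution environment from the Python side is to submit a tiny script that prints the variables listed above (a sketch; the script name is made up, and `PYSPARK_GATEWAY_SECRET` is only checked for presence since it is sensitive):

```python
# check_env.py (hypothetical) -- run with: spark-submit check_env.py
import os

for name in ("PYSPARK_GATEWAY_PORT", "PYSPARK_GATEWAY_SECRET",
             "PYTHONPATH", "PYTHONUNBUFFERED"):
    if name == "PYSPARK_GATEWAY_SECRET":
        print(f"{name} is set: {name in os.environ}")
    else:
        print(f"{name}={os.environ.get(name)}")
```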
65 | 66 | ## Demo 67 | 68 | [Demo: Executing PySpark Applications Using spark-submit](demo/executing-pyspark-applications-using-spark-submit.md) 69 | -------------------------------------------------------------------------------- /docs/PythonUtils.md: -------------------------------------------------------------------------------- 1 | # PythonUtils 2 | 3 | ## Broadcast Threshold { #getBroadcastThreshold } 4 | 5 | ```scala 6 | getBroadcastThreshold( 7 | sc: JavaSparkContext): Long 8 | ``` 9 | 10 | `getBroadcastThreshold` is the value of [spark.broadcast.UDFCompressionThreshold](configuration-properties/spark.md#spark.broadcast.UDFCompressionThreshold) configuration property. 11 | 12 | !!! note "py4j" 13 | `getBroadcastThreshold` is a Scala method that is used by [pyspark.rdd](pyspark/rdd.md#_prepare_for_python_RDD) Python module via [py4j](SparkContext.md#_jvm) bridge. 14 | 15 | --- 16 | 17 | `getBroadcastThreshold` is used when: 18 | 19 | * `pyspark.rdd` is requested to [_prepare_for_python_RDD](pyspark/rdd.md#_prepare_for_python_RDD) 20 | -------------------------------------------------------------------------------- /docs/PythonWorkerFactory.md: -------------------------------------------------------------------------------- 1 | # PythonWorkerFactory 2 | 3 | `PythonWorkerFactory` is a factory of [Python workers](#create) to execute [PythonFunction](PythonFunction.md)s. 4 | 5 | ![PythonWorkerFactory](images/PythonWorkerFactory.png) 6 | 7 | !!! note 8 | There could be many `PythonWorkerFactory`s on a single executor (one for every pair of the [pythonExec](#pythonExec) and the [envVars](#envVars)). 9 | 10 | ## Creating Instance 11 | 12 | `PythonWorkerFactory` takes the following to be created: 13 | 14 | * [Python Executable](#pythonExec) 15 | * Environment Variables 16 | 17 | `PythonWorkerFactory` is created when: 18 | 19 | * `SparkEnv` is requested to [createPythonWorker](SparkEnv.md#createPythonWorker) (for `BasePythonRunner` to [compute a partition](runners/BasePythonRunner.md#compute)). 20 | 21 | ### Python Executable { #pythonExec } 22 | 23 | `PythonWorkerFactory` is given a Python executable (`pythonExec`) when [created](#creating-instance). 24 | 25 | The Python executable is the [pythonExec](PythonFunction.md#pythonExec) of the first [PythonFunction](PythonFunction.md) (of all the Python UDFs to execute by [BasePythonRunner](runners/BasePythonRunner.md)). 26 | 27 | !!! note 28 | It is assumed that all [PythonFunction](PythonFunction.md)s (of a [BasePythonRunner](runners/BasePythonRunner.md)) should have the same Python executable, version and env vars. That is why it is safe to use the first `PythonFunction`. 29 | 30 | ## useDaemon { #useDaemon } 31 | 32 | `PythonWorkerFactory` initializes `useDaemon` internal flag when [created](#creating-instance). 
33 | 34 | `useDaemon` is enabled when the following all hold: 35 | 36 | * [spark.python.use.daemon](configuration-properties/index.md#spark.python.use.daemon) is enabled 37 | * The operating system is not MS Windows (based on `os.name` JVM property) as it works on UNIX-based systems only (because it uses signals for child management) 38 | 39 | `useDaemon` flag is used when `PythonWorkerFactory` is requested for the following: 40 | 41 | * [create](#create) 42 | * [stopDaemon](#stopDaemon) 43 | * [stopWorker](#stopWorker) 44 | * [releaseWorker](#releaseWorker) 45 | 46 | ## Daemon Process { #daemon } 47 | 48 | ```scala 49 | daemon: Process = null 50 | ``` 51 | 52 | `daemon` is a `Process` ([Java]({{ java.api }}/java/lang/Process.html)) to control [Python worker processes](#daemonWorkers). 53 | 54 | `daemon` is uninitialized (`null`) right after `PythonWorkerFactory` is [created](#creating-instance) and right after [stopDaemon](#stopDaemon). 55 | 56 | `daemon` is initialized and immediately started when [startDaemon](#startDaemon) (and listens at [daemonPort](#daemonPort)). 57 | 58 | `daemon` is alive until [stopDaemon](#stopDaemon). 59 | 60 | Any communication with the `daemon` happens through [daemonPort](#daemonPort). 61 | 62 | ### Port { #daemonPort } 63 | 64 | ```scala 65 | daemonPort: Int = 0 66 | ``` 67 | 68 | `daemonPort` is the communication channel (port) of the [daemon](#daemon) Python process (that is known only after [startDaemon](#startDaemon)). 69 | 70 | `daemonPort` (alongside the [daemonHost](#daemonHost)) is used to open a socket stream and launch [workers](#daemonWorkers). 71 | 72 | ### Python Workers { #daemonWorkers } 73 | 74 | ```scala 75 | daemonWorkers: mutable.WeakHashMap[Socket, Int] 76 | ``` 77 | 78 | `PythonWorkerFactory` creates `daemonWorkers` internal registry of socket streams and the worker's PID when [created](#creating-instance). 79 | 80 | A new pair is added in [createSocket](#createSocket) (when [createThroughDaemon](#createThroughDaemon)). 81 | 82 | `daemonWorkers` is used when: 83 | 84 | * [create](#create) (with [useDaemon](#useDaemon) flag enabled and non-empty [idleWorkers](#idleWorkers)) 85 | * [stopWorker](#stopWorker) 86 | 87 | ## Python Modules 88 | 89 | ### Daemon { #daemonModule } 90 | 91 | `PythonWorkerFactory` initializes `daemonModule` internal property for the **Python Daemon Module** when [created](#creating-instance). 92 | 93 | `daemonModule` is the value of [spark.python.daemon.module](configuration-properties/index.md#spark.python.daemon.module) configuration property. 94 | 95 | The Python Daemon Module is used when `PythonWorkerFactory` is requested to [create and start a daemon module](#startDaemon). 96 | 97 | ### Worker { #workerModule } 98 | 99 | `PythonWorkerFactory` uses [spark.python.worker.module](configuration-properties/index.md#PYTHON_WORKER_MODULE) configuration property to specify the **Python Worker Module**. 100 | 101 | The Python Worker Module is used when `PythonWorkerFactory` is requested to [create and start a worker](#createSimpleWorker). 102 | 103 | ## Creating Python Worker { #create } 104 | 105 | ```scala 106 | create(): (Socket, Option[Int]) 107 | ``` 108 | 109 | `create` branches off based on the [useDaemon](#useDaemon) flag: 110 | 111 | * When enabled, `create` firstly checks the [idleWorkers](#idleWorkers) queue and returns one if available. 
Otherwise, `create` [createThroughDaemon](#createThroughDaemon) 112 | * When disabled, `create` [createSimpleWorker](#createSimpleWorker) 113 | 114 | --- 115 | 116 | `create` is used when: 117 | 118 | * `SparkEnv` is requested to [createPythonWorker](SparkEnv.md#createPythonWorker) 119 | 120 | ### Creating Daemon Worker { #createThroughDaemon } 121 | 122 | ```scala 123 | createThroughDaemon(): (Socket, Option[Int]) 124 | ``` 125 | 126 | `createThroughDaemon` [startDaemon](#startDaemon) followed by [createSocket](#createSocket). 127 | 128 | In case of a `SocketException`, `createThroughDaemon` prints out the following WARN message to the logs: 129 | 130 | ```text 131 | Failed to open socket to Python daemon: [exception] 132 | Assuming that daemon unexpectedly quit, attempting to restart 133 | ``` 134 | 135 | And then, `createThroughDaemon` [stopDaemon](#stopDaemon), [startDaemon](#startDaemon) and [createSocket](#createSocket). 136 | 137 | #### createSocket { #createSocket } 138 | 139 | ```scala 140 | createSocket(): (Socket, Option[Int]) 141 | ``` 142 | 143 | `createSocket` creates a new stream socket and connects it to the [daemonPort](#daemonPort) at the [daemonHost](#daemonHost). 144 | 145 | `createSocket` reads the PID (of the python worker behind the stream socket) and requests the [authHelper](#authHelper) to `authToServer`. 146 | 147 | In the end, `createSocket` returns the socket and the PID (after registering them in the [daemonWorkers](#daemonWorkers) registry). 148 | 149 | ### Starting Python Daemon Process { #startDaemon } 150 | 151 | ```scala 152 | startDaemon(): Unit 153 | ``` 154 | 155 | !!! note "Does nothing with `daemon` initialized" 156 | `startDaemon` does nothing when [daemon](#daemon) is initialized (non-`null`) that indicates that the daemon is already up and running. 157 | 158 | `startDaemon` creates the command (using the given [pythonExec](#pythonExec) and the [daemon module](#daemonModule)): 159 | 160 | ```text 161 | [pythonExec] -m [daemonModule] 162 | ``` 163 | 164 | `startDaemon` adds the given [envVars](#envVars) and the following (extra) environment variables to the environment of future python processes: 165 | 166 | Environment Variable | Value 167 | ---------------------|------ 168 | `PYTHONPATH` | [pythonPath](#pythonPath) 169 | `PYTHON_WORKER_FACTORY_SECRET` | [authHelper](#authHelper) 170 | `SPARK_PREFER_IPV6` | `True` if the underlying JVM prefer IPv6 addresses (based on `java.net.preferIPv6Addresses` JVM property) 171 | `PYTHONUNBUFFERED` | `YES` 172 | 173 | `startDaemon` starts a new process (that is known as the [daemon](#daemon)). 174 | 175 | `startDaemon` connects to the python process to read the [daemonPort](#daemonPort). 176 | 177 | In the end, `startDaemon` [redirectStreamsToStderr](#redirectStreamsToStderr). 178 | 179 | ## Creating Simple Non-Daemon Worker 180 | 181 | ```scala 182 | createSimpleWorker(): Socket 183 | ``` 184 | 185 | `createSimpleWorker`...FIXME 186 | 187 | `createSimpleWorker` is used when `PythonWorkerFactory` is requested to [create a Python worker](#create) (with [useDaemon](#useDaemon) flag disabled). 188 | 189 | ## Logging 190 | 191 | Enable `ALL` logging level for `org.apache.spark.api.python.PythonWorkerFactory` logger to see what happens inside. 192 | 193 | Add the following line to `conf/log4j2.properties`: 194 | 195 | ```text 196 | logger.PythonWorkerFactory.name = org.apache.spark.api.python.PythonWorkerFactory 197 | logger.PythonWorkerFactory.level = all 198 | ``` 199 | 200 | Refer to [Logging](logging.md). 
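## Demo

The worker-factory-related configuration properties can be set explicitly when starting a PySpark application. The following is a minimal sketch (the values shown are simply the documented defaults) that triggers Python workers with an RDD action:

```python
from pyspark.sql import SparkSession

spark = (SparkSession.builder
    .appName("python-worker-factory-demo")
    .config("spark.python.use.daemon", "true")
    .config("spark.python.daemon.module", "pyspark.daemon")
    .config("spark.python.worker.module", "pyspark.worker")
    .getOrCreate())

# Any Python-side RDD transformation makes executors request Python workers
# from their PythonWorkerFactory.
print(spark.sparkContext.parallelize(range(10)).map(lambda x: x * x).sum())
```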
201 | -------------------------------------------------------------------------------- /docs/RDD.md: -------------------------------------------------------------------------------- 1 | # RDD 2 | -------------------------------------------------------------------------------- /docs/Setup.md: -------------------------------------------------------------------------------- 1 | # PySpark Setup 2 | 3 | ## Install IPython 4 | 5 | Follow the steps as described in the [official documentation](https://ipython.readthedocs.io/en/stable/install/install.html) of IPython. 6 | 7 | ```text 8 | pip install ipython 9 | ``` 10 | 11 | ## Start PySpark 12 | 13 | ```bash 14 | export PYSPARK_DRIVER_PYTHON=ipython 15 | ``` 16 | 17 | For Java 11, use `-Dio.netty.tryReflectionSetAccessible=true` (see [Downloading](http://spark.apache.org/docs/latest/index.html#downloading) in the official documentation of Apache Spark). 18 | 19 | ```bash 20 | ./bin/pyspark --driver-java-options=-Dio.netty.tryReflectionSetAccessible=true 21 | ``` 22 | 23 | ```text 24 | Python 3.9.1 (default, Feb 3 2021, 07:38:02) 25 | Type 'copyright', 'credits' or 'license' for more information 26 | IPython 7.20.0 -- An enhanced Interactive Python. Type '?' for help. 27 | Setting default log level to "WARN". 28 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 29 | Welcome to 30 | ____ __ 31 | / __/__ ___ _____/ /__ 32 | _\ \/ _ \/ _ `/ __/ '_/ 33 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1 34 | /_/ 35 | 36 | Using Python version 3.9.1 (default, Feb 3 2021 07:38:02) 37 | Spark context Web UI available at http://192.168.68.101:4040 38 | Spark context available as 'sc' (master = local[*], app id = local-1613571272142). 39 | SparkSession available as 'spark'. 40 | 41 | In [1]: 42 | ``` 43 | 44 | ```text 45 | In [1]: spark.version 46 | Out[1]: '3.1.1' 47 | ``` 48 | -------------------------------------------------------------------------------- /docs/SimplePythonFunction.md: -------------------------------------------------------------------------------- 1 | # SimplePythonFunction 2 | 3 | `SimplePythonFunction` is a [PythonFunction](PythonFunction.md). 4 | 5 | ## Creating Instance 6 | 7 | `SimplePythonFunction` takes the following to be created: 8 | 9 | * Command (byte array) 10 | * Environment Variables 11 | * Python Includes 12 | * [Python Executable](#pythonExec) 13 | * Python Version 14 | * `Broadcast`s of [PythonBroadcast](PythonBroadcast.md)s 15 | * [PythonAccumulatorV2](PythonAccumulatorV2.md) 16 | 17 | `SimplePythonFunction` is created when: 18 | 19 | * `SparkConnectPlanner` is requested to `transformPythonFunction` 20 | * `pyspark.rdd` (Python module) is requested to [_wrap_function](pyspark/rdd.md#_wrap_function) 21 | * `pyspark.sql.udf` (Python module) is requested to [_wrap_function](pyspark/sql/udf.md#_wrap_function) 22 | 23 | ### Python Executable { #pythonExec } 24 | 25 | `SimplePythonFunction` is given the **Python Executable** when [created](#creating-instance). 26 | 27 | The Python Executable is controlled by [PYSPARK_PYTHON](environment-variables.md#PYSPARK_PYTHON) environment variable (in PySpark) or [PYSPARK_DRIVER_PYTHON](environment-variables.md#PYSPARK_DRIVER_PYTHON) (in [PySpark Connect](connect/index.md)). 
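The effect is easy to observe from a PySpark session (a sketch that assumes the `pythonExec` and `pythonVer` attributes of the Python-side `SparkContext`, which is where the chosen executable is recorded before it is passed on to `SimplePythonFunction`):

```python
from pyspark.sql import SparkSession

# PYSPARK_PYTHON (or PYSPARK_DRIVER_PYTHON) is usually exported before
# launching pyspark / spark-submit.
spark = SparkSession.builder.appName("python-exec-demo").getOrCreate()
sc = spark.sparkContext

print(sc.pythonExec)  # e.g. python3, unless PYSPARK_PYTHON overrides it
print(sc.pythonVer)   # e.g. 3.10
```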
28 | -------------------------------------------------------------------------------- /docs/SocketAuthHelper.md: -------------------------------------------------------------------------------- 1 | # SocketAuthHelper 2 | 3 | `SocketAuthHelper` is...FIXME 4 | -------------------------------------------------------------------------------- /docs/SocketAuthServer.md: -------------------------------------------------------------------------------- 1 | # SocketAuthServer 2 | 3 | ## serveToStream Utility 4 | 5 | ```scala 6 | serveToStream( 7 | threadName: String, 8 | authHelper: SocketAuthHelper)(writeFunc: OutputStream => Unit): Array[Any] 9 | ``` 10 | 11 | `serveToStream`...FIXME 12 | 13 | `serveToStream` is used when: 14 | 15 | * FIXME 16 | -------------------------------------------------------------------------------- /docs/SocketFuncServer.md: -------------------------------------------------------------------------------- 1 | # SocketFuncServer 2 | 3 | `SocketFuncServer` is...FIXME 4 | -------------------------------------------------------------------------------- /docs/SparkConf.md: -------------------------------------------------------------------------------- 1 | # SparkConf 2 | 3 | `SparkConf` is a Python class. 4 | 5 | ## Creating Instance 6 | 7 | `SparkConf` takes the following to be created: 8 | 9 | * `loadDefaults` flag (default: `True`) 10 | * `JVMView` ([py4j]({{ py4j.doc }}/py4j_java_gateway.html#jvmview)) 11 | * JConf (default: `None`) 12 | 13 | While being created, `SparkConf` uses the [JVMView](SparkContext.md#_jvm) (of the [SparkContext](SparkContext.md)) unless the `_jconf` and `_jvm` are given. 14 | 15 | ## Demo 16 | 17 | ```python 18 | from pyspark import SparkConf 19 | ``` 20 | -------------------------------------------------------------------------------- /docs/SparkContext.md: -------------------------------------------------------------------------------- 1 | # SparkContext 2 | 3 | ![SparkContext Initialization](images/SparkContext.png) 4 | 5 | ## Creating Instance 6 | 7 | `SparkContext` takes the following to be created: 8 | 9 | * Master URL (default: `None`) 10 | * Application Name (default: `None`) 11 | * Spark Home (default: `None`) 12 | * Py Files (default: `None`) 13 | * Environment (default: `None`) 14 | * Batch Size (default: `0`) 15 | * `PickleSerializer` 16 | * `SparkConf` (default: `None`) 17 | * Gateway (default: `None`) 18 | * Corresponding `SparkContext` on JVM (default: `None`) 19 | * `BasicProfiler` 20 | 21 | While being created, `SparkContext` [_ensure_initialized](#_ensure_initialized) (with the [gateway](#gateway) and the [conf](#conf)) followed by [_do_init](#_do_init). 22 | 23 | ## Demo 24 | 25 | ```python 26 | from pyspark import SparkContext 27 | ``` 28 | 29 | ## JavaGateway 30 | 31 | `SparkContext` defines `_gateway` property for a `JavaGateway` that is given or launched when [_ensure_initialized](#_ensure_initialized). 32 | 33 | ## JVMView 34 | 35 | `SparkContext` defines `_jvm` property for a `JVMView` ([py4j]({{ py4j.doc }}/py4j_java_gateway.html#jvmview)) to access to the Java Virtual Machine of the [JavaGateway](#_gateway). 36 | 37 | ## _ensure_initialized 38 | 39 | ```python 40 | _ensure_initialized( 41 | cls, instance=None, gateway=None, conf=None) 42 | ``` 43 | 44 | `_ensure_initialized` is a `@classmethod`. 45 | 46 | `_ensure_initialized` takes the given [gateway](#gateway) or [launch_gateway](pyspark/java_gateway.md#launch_gateway). 
47 | 48 | `_ensure_initialized`...FIXME 49 | 50 | `_ensure_initialized` is used when: 51 | 52 | * `SparkContext` is [created](#creating-instance) and `setSystemProperty` 53 | * [shell.py](pyspark/shell.md) is launched 54 | 55 | ## _do_init 56 | 57 | ```python 58 | _do_init( 59 | self, master, appName, sparkHome, 60 | pyFiles, environment, batchSize, serializer, 61 | conf, jsc, profiler_cls) 62 | ``` 63 | 64 | `_do_init`...FIXME 65 | -------------------------------------------------------------------------------- /docs/SparkEnv.md: -------------------------------------------------------------------------------- 1 | # SparkEnv 2 | 3 | !!! note "Learn More" 4 | This is a stub for [pythonWorkers](#pythonWorkers) et al. 5 | Learn more in [The Internals of Apache Spark]({{ book.spark_core }}/SparkEnv/). 6 | 7 | ## pythonWorkers Registry { #pythonWorkers } 8 | 9 | ```scala 10 | pythonWorkers: Map[(String, Map[String, String]), PythonWorkerFactory] 11 | ``` 12 | 13 | `SparkEnv` creates an empty collection of [PythonWorkerFactory](PythonWorkerFactory.md)s (by their `pythonExec` and the `envVars`) when created. 14 | 15 | A new `PythonWorkerFactory` is created in [createPythonWorker](#createPythonWorker) when there was no `PythonWorkerFactory` for a `pythonExec` and a `envVars` pair. 16 | 17 | All `PythonWorkerFactory`s are requested to [stop](PythonWorkerFactory.md#stop) when `SparkEnv` is requested to `stop`. 18 | 19 | `pythonWorkers` is used in [destroyPythonWorker](#destroyPythonWorker) and [releasePythonWorker](#releasePythonWorker). 20 | 21 | ## Looking Up or Creating Python Worker Process { #createPythonWorker } 22 | 23 | ```scala 24 | createPythonWorker( 25 | pythonExec: String, 26 | envVars: Map[String, String]): (java.net.Socket, Option[Int]) 27 | ``` 28 | 29 | `createPythonWorker` looks up a [PythonWorkerFactory](PythonWorkerFactory.md) (in [pythonWorkers](#pythonWorkers)) for the given `pythonExec` and the `envVars` pair. Unless found, `createPythonWorker` registers a new `PythonWorkerFactory`. 30 | 31 | In the end, `createPythonWorker` requests the `PythonWorkerFactory` to [create a Python worker process](PythonWorkerFactory.md#create). 32 | 33 | --- 34 | 35 | ``createPythonWorker`` is used when: 36 | 37 | * `BasePythonRunner` is requested to [compute a partition](runners/BasePythonRunner.md#compute) 38 | -------------------------------------------------------------------------------- /docs/arrow-optimization/.pages: -------------------------------------------------------------------------------- 1 | title: Arrow Optimization 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/arrow-optimization/index.md: -------------------------------------------------------------------------------- 1 | # Arrow Optimization 2 | 3 | **Arrow Optimization** is an optimization that uses [Apache Arrow]({{ arrow.home }}) for columnar data transfers in the following: 4 | 5 | * [pyspark.sql.DataFrame.toPandas](../sql/PandasConversionMixin.md#toPandas) 6 | * [pyspark.sql.SparkSession.createDataFrame](../sql/SparkConversionMixin.md#createDataFrame) (when called with a Pandas `DataFrame` or a NumPy `ndarray`) 7 | 8 | The following data types are unsupported: `ArrayType` of `TimestampType`. 
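The optimization is guarded by [spark.sql.execution.arrow.pyspark.enabled](../configuration-properties/spark.sql.execution.md#spark.sql.execution.arrow.pyspark.enabled). A minimal sketch (assuming `pandas` and `pyarrow` are installed alongside PySpark):

```python
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("arrow-demo").getOrCreate()

# Arrow-based columnar transfers are disabled by default.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

pdf = pd.DataFrame({"id": range(1000), "v": [float(i) for i in range(1000)]})

# Both directions of the pandas boundary can use Arrow when enabled
# (with a fallback to the non-Arrow path for unsupported types).
df = spark.createDataFrame(pdf)
out = df.where("id % 2 = 0").toPandas()
print(out.head())
```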
9 | -------------------------------------------------------------------------------- /docs/building-from-sources.md: -------------------------------------------------------------------------------- 1 | # Building from Sources 2 | 3 | ```text 4 | $ java -version 5 | openjdk version "11.0.10" 2021-01-19 6 | OpenJDK Runtime Environment AdoptOpenJDK (build 11.0.10+9) 7 | OpenJDK 64-Bit Server VM AdoptOpenJDK (build 11.0.10+9, mixed mode) 8 | ``` 9 | 10 | ```text 11 | ./build/mvn \ 12 | -Pyarn,kubernetes,hive,hive-thriftserver,scala-2.12 \ 13 | -DskipTests \ 14 | clean install 15 | ``` 16 | 17 | ## Building PySpark-Related Operators 18 | 19 | ```text 20 | ./build/mvn -DskipTests -pl :spark-sql_2.12 clean install 21 | ``` 22 | 23 | ```text 24 | cp sql/core/target/spark-sql_2.12-3.1.1.jar assembly/target/scala-2.12/jars/ 25 | ``` 26 | -------------------------------------------------------------------------------- /docs/configuration-properties/.pages: -------------------------------------------------------------------------------- 1 | title: Configuration Properties 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/configuration-properties/index.md: -------------------------------------------------------------------------------- 1 | # Configuration Properties 2 | -------------------------------------------------------------------------------- /docs/configuration-properties/spark.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: spark 3 | --- 4 | 5 | # spark Configuration Properties 6 | 7 | ## broadcast.UDFCompressionThreshold { #spark.broadcast.UDFCompressionThreshold } 8 | 9 | **spark.broadcast.UDFCompressionThreshold** 10 | 11 | The threshold at which user-defined functions (UDFs) and Python RDD commands are compressed by broadcast (in bytes) 12 | 13 | Default: `1L * 1024 * 1024` (1MB) 14 | 15 | Used when: 16 | 17 | * `PythonUtils` is requested to [getBroadcastThreshold](../PythonUtils.md#getBroadcastThreshold) 18 | -------------------------------------------------------------------------------- /docs/configuration-properties/spark.pyspark.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: spark.pyspark 3 | --- 4 | 5 | # spark.pyspark Configuration Properties 6 | 7 | ## driver.python { #spark.pyspark.driver.python } 8 | 9 | **spark.pyspark.driver.python** 10 | 11 | Default: (undefined) 12 | 13 | ## python { #spark.pyspark.python } 14 | 15 | **spark.pyspark.python** 16 | 17 | Default: (undefined) 18 | -------------------------------------------------------------------------------- /docs/configuration-properties/spark.python.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: spark.python 3 | --- 4 | 5 | # spark.python Configuration Properties 6 | 7 | ## daemon.module { #spark.python.daemon.module } 8 | 9 | **spark.python.daemon.module** 10 | 11 | The Python module to run the daemon to execute Python workers 12 | 13 | Default: [pyspark.daemon](../pyspark/daemon.md) 14 | 15 | Used when: 16 | 17 | * `PythonWorkerFactory` is [created](../PythonWorkerFactory.md#daemonModule) 18 | 19 | ## use.daemon { #spark.python.use.daemon } 20 | 21 | **spark.python.use.daemon** 22 | 23 | Because forking processes from Java is expensive, PySpark prefers launching a single Python daemon ([spark.python.daemon.module](#spark.python.daemon.module)) to fork new workers for tasks. 
24 | This daemon currently only works on UNIX-based systems now because it uses signals for child management, so we can also fall back to launching workers ([spark.python.worker.module](#spark.python.worker.module)) directly. 25 | 26 | Default: `true` (unless PySpark runs on Windows) 27 | 28 | Used when: 29 | 30 | * `PythonWorkerFactory` is [created](../PythonWorkerFactory.md#useDaemon) 31 | 32 | ## worker.module { #spark.python.worker.module } 33 | 34 | **spark.python.worker.module** 35 | 36 | The Python module to run a Python worker 37 | 38 | Default: [pyspark.worker](../pyspark/worker.md) 39 | 40 | Used when: 41 | 42 | * `PythonWorkerFactory` is [created](../PythonWorkerFactory.md#workerModule) 43 | -------------------------------------------------------------------------------- /docs/configuration-properties/spark.sql.execution.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: spark.sql.execution 3 | --- 4 | 5 | # spark.sql.execution Configuration Properties 6 | 7 | ## arrow.maxRecordsPerBatch { #spark.sql.execution.arrow.maxRecordsPerBatch } 8 | 9 | **spark.sql.execution.arrow.maxRecordsPerBatch** 10 | 11 | When using Apache Arrow, the maximum number of records that can be written to a single `ArrowRecordBatch` in memory. 12 | 13 | If zero or negative there is no limit. 14 | 15 | Default: `10000` 16 | 17 | Used when: 18 | 19 | * `ApplyInPandasWithStatePythonRunner` is requested for `workerConf` 20 | * `ArrowEvalPythonExec` is [created](../sql/ArrowEvalPythonExec.md#batchSize) 21 | * `Dataset` is requested to `toArrowBatchRdd` 22 | * `MapInBatchExec` is created 23 | * `SparkConnectPlanner` is requested to `handleSqlCommand` 24 | * `SparkConnectStreamHandler` is requested to `processAsArrowBatches` 25 | 26 | ## arrow.pyspark.enabled { #spark.sql.execution.arrow.pyspark.enabled } 27 | 28 | **spark.sql.execution.arrow.pyspark.enabled** 29 | 30 | Enables [Arrow Optimization](../arrow-optimization/index.md) 31 | 32 | Default: `false` 33 | 34 | ## pandas.udf.buffer.size { #spark.sql.execution.pandas.udf.buffer.size } 35 | 36 | **spark.sql.execution.pandas.udf.buffer.size** 37 | 38 | `spark.buffer.size` for Pandas UDF executions 39 | 40 | Note that Pandas execution requires more than 4 bytes. 41 | Lowering this value could make small Pandas UDF batch iterated and pipelined; however, it might degrade performance. 42 | See SPARK-27870. 43 | 44 | Default: `spark.buffer.size` ([Spark Core]({{ book.spark_core }}/configuration-properties/#spark.buffer.size)) 45 | 46 | Used when: 47 | 48 | * `ApplyInPandasWithStatePythonRunner` and [ArrowPythonRunner](../runners/ArrowPythonRunner.md#bufferSize) are created (and initialize [bufferSize](../runners/BasePythonRunner.md#bufferSize)) 49 | 50 | ## pyspark.udf.simplifiedTraceback.enabled { #spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled } 51 | 52 | **spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled** 53 | 54 | Controls the traceback from Python UDFs. When enabled (`true`), traceback is simplified and hides the Python worker, (de)serialization, etc. from PySpark in tracebacks, and only shows the exception messages from UDFs. 
55 | 56 | Works only with CPython 3.7+ 57 | 58 | Default: `true` 59 | 60 | Used when: 61 | 62 | * `ApplyInPandasWithStatePythonRunner`, [ArrowPythonRunner](../runners/ArrowPythonRunner.md#simplifiedTraceback), `CoGroupedArrowPythonRunner`, [PythonUDFRunner](../runners/PythonUDFRunner.md#simplifiedTraceback) are created (and initialize [simplifiedTraceback](../runners/BasePythonRunner.md#simplifiedTraceback) flag) 63 | -------------------------------------------------------------------------------- /docs/connect/.pages: -------------------------------------------------------------------------------- 1 | title: Spark Connect 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/connect/index.md: -------------------------------------------------------------------------------- 1 | # Spark Connect 2 | 3 | PySpark supports remote connection to Spark clusters using Spark Connect ([Spark SQL]({{ book.spark_sql }}/connect)). 4 | 5 | ```console 6 | $ ./bin/pyspark --help 7 | Usage: ./bin/pyspark [options] 8 | 9 | Options: 10 | Spark Connect only: 11 | --remote CONNECT_URL URL to connect to the server for Spark Connect, e.g., 12 | sc://host:port. --master and --deploy-mode cannot be set 13 | together with this option. This option is experimental, and 14 | might change between minor releases. 15 | ... 16 | ``` 17 | 18 | Spark Connect for Python requires the following Python libraries: 19 | 20 | Module | Version 21 | -------|-------- 22 | [pandas](https://pandas.pydata.org/) | 1.0.5 23 | [pyarrow](https://arrow.apache.org/docs/python/index.html) | 1.0.0 24 | [grpc](https://grpc.io/docs/languages/python/) | 1.48.1 25 | 26 | ```console 27 | // switching to an conda environment with the libraries 28 | $ conda activate pyspark 29 | 30 | $ ./bin/pyspark --remote sc://localhost 31 | Python 3.10.10 (main, Mar 21 2023, 13:41:39) [Clang 14.0.6 ] on darwin 32 | Type "help", "copyright", "credits" or "license" for more information. 33 | Welcome to 34 | ____ __ 35 | / __/__ ___ _____/ /__ 36 | _\ \/ _ \/ _ `/ __/ '_/ 37 | /__ / .__/\_,_/_/ /_/\_\ version 3.4.0 38 | /_/ 39 | 40 | Using Python version 3.10.10 (main, Mar 21 2023 13:41:39) 41 | Client connected to the Spark Connect server at localhost 42 | SparkSession available as 'spark'. 43 | 44 | >>> spark.client 45 | 46 | ``` 47 | 48 | ## is_remote { #is_remote } 49 | 50 | ```py 51 | # from pyspark.sql.utils import is_remote 52 | is_remote() -> bool 53 | ``` 54 | 55 | `is_remote` is `True` when `SPARK_REMOTE` environment variable is defined (in `os.environ`). 56 | -------------------------------------------------------------------------------- /docs/demo/.pages: -------------------------------------------------------------------------------- 1 | title: Demos 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/demo/executing-pyspark-applications-using-spark-submit.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - navigation 4 | --- 5 | 6 | # Demo: Executing PySpark Applications Using spark-submit 7 | 8 | PySpark applications are executed using `spark-submit` ([Spark Core]({{ book.spark_core }}/tools/spark-submit)) command-line application. 
9 | 10 | ```text 11 | spark-submit 1.py extra args 12 | ``` 13 | 14 | For a PySpark application, `spark-submit` uses [PythonRunner](../PythonRunner.md) and launches an extra python process: 15 | 16 | ```text 17 | ps -o pid,ppid,command | grep python | grep -v grep 18 | ``` 19 | 20 | ```text 21 | org.apache.spark.deploy.SparkSubmit 1.py extra args 22 | ``` 23 | 24 | ```text 25 | Python /usr/local/bin/ipython 1.py extra args 26 | ``` 27 | 28 | ## SPARK_PRINT_LAUNCH_COMMAND Environment Variable 29 | 30 | Use `SPARK_PRINT_LAUNCH_COMMAND` environment variable to have the complete Spark command printed out to the standard output (cf. [spark-submit shell script]({{ book.spark_core }}/tools/spark-submit/#spark_print_launch_command)). 31 | 32 | ```text 33 | SPARK_PRINT_LAUNCH_COMMAND=1 spark-submit 1.py extra args 34 | ``` 35 | 36 | ## verbose Option 37 | 38 | Use `--verbose` option for verbose debugging output. 39 | 40 | ```text 41 | Parsed arguments: 42 | ... 43 | pyFiles null 44 | ... 45 | primaryResource file:/Users/jacek/dev/sandbox/python-sandbox/1.py 46 | name 1.py 47 | childArgs [extra args] 48 | ... 49 | Main class: 50 | org.apache.spark.deploy.PythonRunner 51 | Arguments: 52 | file:/Users/jacek/dev/sandbox/python-sandbox/1.py 53 | null 54 | extra 55 | args 56 | Spark config: 57 | (spark.app.name,1.py) 58 | (spark.master,local[*]) 59 | (spark.submit.pyFiles,) 60 | (spark.submit.deployMode,client) 61 | ``` 62 | -------------------------------------------------------------------------------- /docs/demo/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - toc 4 | - navigation 5 | --- 6 | 7 | # Demos 8 | 9 | 1. [Executing PySpark Applications Using spark-submit](executing-pyspark-applications-using-spark-submit.md) 10 | 1. [Running PySpark Application on minikube](running-pyspark-application-on-minikube.md) 11 | -------------------------------------------------------------------------------- /docs/demo/running-pyspark-application-on-minikube.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - navigation 4 | --- 5 | 6 | # Demo: Running PySpark Application on minikube 7 | 8 | This demo shows how to run a PySpark application on Kubernetes (using minikube). 9 | 10 | !!! tip 11 | This is a follow-up demo to [Demo: Running Spark Application on minikube]({{ book.spark_k8s }}/demo/running-spark-application-on-minikube/) in the [The Internals of Spark on Kubernetes]({{ book.spark_k8s }}). 12 | -------------------------------------------------------------------------------- /docs/environment-variables.md: -------------------------------------------------------------------------------- 1 | # Environment Variables 2 | 3 | PySpark uses environment variables to configure execution environment. 
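As an illustration only (not from the PySpark sources), such variables can also be set from Python via `os.environ`, as long as this happens before the JVM gateway is launched; `PYSPARK_PYTHON` and `PYSPARK_PIN_THREAD` are described below.

```py
import os

# Both variables are described below; they must be set before
# SparkContext/SparkSession launches the Py4J gateway (JVM) process.
os.environ["PYSPARK_PYTHON"] = "python3"   # Python executable of Python workers
os.environ["PYSPARK_PIN_THREAD"] = "true"  # pinned thread mode (Py4J ClientServer)

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("env-demo").getOrCreate()
print(spark.sparkContext.pythonExec)  # expected to reflect PYSPARK_PYTHON
spark.stop()
```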
4 | 5 | ## PYSPARK_DRIVER_PYTHON { #PYSPARK_DRIVER_PYTHON } 6 | 7 | The Python Executable in [PySpark Connect](connect/index.md) unless [PYSPARK_PYTHON](#PYSPARK_PYTHON) is defined 8 | 9 | Default: `python3` 10 | 11 | ## PYSPARK_GATEWAY_PORT { #PYSPARK_GATEWAY_PORT } 12 | 13 | ## PYSPARK_GATEWAY_SECRET { #PYSPARK_GATEWAY_SECRET } 14 | 15 | ## PYSPARK_PIN_THREAD { #PYSPARK_PIN_THREAD } 16 | 17 | Enables **pinned thread mode** to synchronize PVM threads with JVM threads based on Py4J's [ClientServer]({{ py4j.javadoc }}/py4j/ClientServer.html) (`true`) or [GatewayServer]({{ py4j.javadoc }}/py4j/GatewayServer.html) (`false`) 18 | 19 | Default: `false` 20 | 21 | Used when: 22 | 23 | * [launch_gateway](pyspark/java_gateway.md) is executed 24 | * [Py4JServer](Py4JServer.md) is created (and initializes the [server](Py4JServer.md#server)) 25 | 26 | ## PYSPARK_PYTHON { #PYSPARK_PYTHON } 27 | 28 | The Python Executable 29 | 30 | Default: `python3` 31 | -------------------------------------------------------------------------------- /docs/features/index.md: -------------------------------------------------------------------------------- 1 | # PySpark — Python on Apache Spark 2 | 3 | **PySpark** is the Python API (_frontend_) of Apache Spark. 4 | 5 | ## How It Works 6 | 7 | When a Python script is executed using `spark-submit` shell script ([Spark Core]({{ book.spark_core }}/tools/spark-submit/)), [PythonRunner](../PythonRunner.md) is started (and `--verbose` option can show it as `Main class`). 8 | 9 | ``` shell 10 | $ ./bin/spark-submit --version hello_pyspark.py 11 | Using properties file: null 12 | Parsed arguments: 13 | master local[*] 14 | ... 15 | primaryResource file:/Users/jacek/dev/oss/spark/hello_pyspark.py 16 | name hello_pyspark.py 17 | ... 18 | Main class: 19 | org.apache.spark.deploy.PythonRunner 20 | Arguments: 21 | file:/Users/jacek/dev/oss/spark/hello_pyspark.py 22 | null 23 | Spark config: 24 | (spark.app.name,hello_pyspark.py) 25 | (spark.app.submitTime,1684188276759) 26 | (spark.master,local[*]) 27 | (spark.submit.deployMode,client) 28 | (spark.submit.pyFiles,) 29 | ... 30 | ``` 31 | 32 | `spark-submit` execution above could be translated to the following: 33 | 34 | ```text 35 | ./bin/spark-class org.apache.spark.deploy.PythonRunner hello_pyspark.py "" 36 | ``` 37 | 38 | `PythonRunner` then launches a [Py4JServer](../Py4JServer.md) (on a `py4j-gateway-init` daemon thread) and waits until it is started. 39 | 40 | Finally, `PythonRunner` launches a Python process (to run the Python script) and waits until the process finishes (successfully or not). 41 | 42 | ```shell 43 | $ ps -o pid,command | grep python3 | grep -v grep 44 | 12607 python3 /Users/jacek/dev/oss/spark/hello_pyspark.py 45 | ``` 46 | 47 | ??? note "lsof for open files and TCP inter-process connections" 48 | Use `lsof` command to have a look at the open files and connections. 49 | 50 | ```shell 51 | sudo lsof -p [pid of the python process] 52 | ``` 53 | 54 | ## Python 3.8 and Later 55 | 56 | The minimum version of Python is **3.8**. 57 | 58 | ??? note "Python 3.7 Deprecated" 59 | Python 3.7 support is deprecated in Spark 3.4. 60 | 61 | ## shell.py 62 | 63 | `pyspark` shell defines [PYTHONSTARTUP]({{ python.docs }}/using/cmdline.html#envvar-PYTHONSTARTUP) environment variable to execute [shell.py](../pyspark/shell.md) before the first prompt is displayed in Python interactive mode. 
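As a quick illustrative sketch (the exact names are listed in [shell.py](../pyspark/shell.md)), an interactive `pyspark` session starts with the entry points already bound:

```py
# Inside the pyspark shell; these names are pre-defined by shell.py:
#   spark (SparkSession), sc (SparkContext), sql (SparkSession.sql)
sc.version             # version of the underlying SparkContext
spark.range(3).show()  # the SparkSession is ready to use
sql("SELECT 1 AS id").show()
```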
64 | 65 | ## Py4J 66 | 67 | [java_gateway](../pyspark/java_gateway.md) uses [Py4J - A Bridge between Python and Java]({{ py4j.doc }}): 68 | 69 | > Py4J enables Python programs running in a Python interpreter to dynamically access Java objects in a Java Virtual Machine. Methods are called as if the Java objects resided in the Python interpreter and Java collections can be accessed through standard Python collection methods. Py4J also enables Java programs to call back Python objects. 70 | 71 | ## pyspark.sql Package 72 | 73 | `pyspark.sql` is a Python package for Spark SQL. 74 | 75 | ```python 76 | from pyspark.sql import * 77 | ``` 78 | 79 | !!! tip 80 | Learn more about [Modules and Packages](https://docs.python.org/3/tutorial/modules.html) in Python in [The Python Tutorial](https://docs.python.org/3/tutorial/index.html). 81 | 82 | ### \_\_init\__.py 83 | 84 | The `__init__.py` files are required to make Python treat directories containing the file as packages. 85 | 86 | Per [6.4.1. Importing * From a Package](https://docs.python.org/3/tutorial/modules.html#importing-from-a-package): 87 | 88 | > The import statement uses the following convention: if a package's `__init__.py` code defines a list named `__all__`, it is taken to be the list of module names that should be imported when `from package import *` is encountered. 89 | 90 | Per [Public and Internal Interfaces](https://www.python.org/dev/peps/pep-0008/#public-and-internal-interfaces) in [PEP 8 -- Style Guide for Python Code](https://www.python.org/dev/peps/pep-0008/): 91 | 92 | > To better support introspection, modules should explicitly declare the names in their public API using the `__all__` attribute. 93 | 94 | From `python/pyspark/sql/__init__.py`: 95 | 96 | ```python 97 | __all__ = [ 98 | 'SparkSession', 'SQLContext', 'HiveContext', 'UDFRegistration', 99 | 'DataFrame', 'GroupedData', 'Column', 'Catalog', 'Row', 100 | 'DataFrameNaFunctions', 'DataFrameStatFunctions', 'Window', 'WindowSpec', 101 | 'DataFrameReader', 'DataFrameWriter', 'PandasCogroupedOps' 102 | ] 103 | ``` 104 | 105 | ## pandas 106 | 107 | The minimum version of [Pandas](https://pandas.pydata.org/) is `0.23.2` (and [PandasConversionMixin](../sql/PandasConversionMixin.md) asserts that). 108 | 109 | ```python 110 | import pandas as pd 111 | ``` 112 | 113 | ## pyarrow 114 | 115 | The minimum version of [PyArrow](https://pypi.org/project/pyarrow/) is `1.0.0` (and [PandasConversionMixin](../sql/PandasConversionMixin.md) asserts that). 116 | 117 | ```python 118 | import pyarrow 119 | ``` 120 | 121 | ## Python Mixins 122 | 123 | From [8.7. 
Class definitions](https://docs.python.org/3/reference/compound_stmts.html#class-definitions): 124 | 125 | > classdef ::= [decorators] "class" classname [inheritance] ":" suite 126 | > 127 | > The inheritance list usually gives a list of base classes 128 | 129 | PySpark uses mixins: 130 | 131 | * [PandasConversionMixin](../sql/PandasConversionMixin.md) 132 | * [PandasMapOpsMixin](../sql/PandasMapOpsMixin.md) 133 | * [SparkConversionMixin](../sql/SparkConversionMixin.md) 134 | -------------------------------------------------------------------------------- /docs/images/PythonRunner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/japila-books/pyspark-internals/824e54fbf19ddb39ba9ec1a50ca65d7629470cf6/docs/images/PythonRunner.png -------------------------------------------------------------------------------- /docs/images/PythonWorkerFactory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/japila-books/pyspark-internals/824e54fbf19ddb39ba9ec1a50ca65d7629470cf6/docs/images/PythonWorkerFactory.png -------------------------------------------------------------------------------- /docs/images/SparkContext.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/japila-books/pyspark-internals/824e54fbf19ddb39ba9ec1a50ca65d7629470cf6/docs/images/SparkContext.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: PySpark 3 | icon: fontawesome/brands/python 4 | hide: 5 | - toc 6 | - navigation 7 | --- 8 | 9 | # The Internals of {{ book.title }} (Apache Spark {{ spark.version }}) 10 | 11 | Welcome to **The Internals of {{ book.title }}** online book! 🤙 12 | 13 | I'm [Jacek Laskowski](https://pl.linkedin.com/in/jaceklaskowski), a Freelance Data(bricks) Engineer specializing in 14 | [Apache Spark](https://books.japila.pl/apache-spark-internals/) (incl. [Spark SQL](https://books.japila.pl/spark-sql-internals/) and [Spark Structured Streaming](https://books.japila.pl/spark-structured-streaming-internals/)), 15 | [Delta Lake](https://books.japila.pl/delta-lake-internals/), 16 | [Databricks](https://www.databricks.com/), 17 | and [Apache Kafka](https://books.japila.pl/kafka-internals/) (incl. [Kafka Streams](https://books.japila.pl/kafka-streams-internals/)) with brief forays into a wider data engineering space (mostly during [Warsaw Data Engineering](https://www.meetup.com/Warsaw-Data-Engineering/) meetups). 18 | 19 | I'm very excited to have you here and hope you will enjoy exploring the internals of {{ book.title }} as much as I have. 20 | 21 | !!! quote "Flannery O'Connor" 22 | I write to discover what I know. 23 | 24 | !!! note ""The Internals Of" series" 25 | I'm also writing other online books in the "The Internals Of" series. Please visit ["The Internals Of" Online Books](https://books.japila.pl) home page. 26 | 27 | Expect text and code snippets from a variety of public sources. Attribution follows. 
28 | 29 | Now, let's take a deep dive into [{{ book.title }}](features/index.md) 🔥 30 | 31 | --- 32 | 33 | Last update: {{ git.date.strftime('%Y-%m-%d') }} 34 | -------------------------------------------------------------------------------- /docs/logging.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - toc 4 | --- 5 | 6 | # Logging 7 | 8 | PySpark uses the same logging infrastructure as [Apache Spark]({{ book.spark_core }}/spark-logging/). 9 | -------------------------------------------------------------------------------- /docs/ml/.pages: -------------------------------------------------------------------------------- 1 | title: MLlib 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/ml/Distributor.md: -------------------------------------------------------------------------------- 1 | # Distributor 2 | 3 | `Distributor` is the parent (_abstract_) class of [TorchDistributor](../pytorch-distributed/TorchDistributor.md). 4 | 5 | ## Creating Instance 6 | 7 | `Distributor` takes the following to be created: 8 | 9 | * Number of processes (default: `1`) 10 | * `local_mode` flag (default: `True`) 11 | * `use_gpu` flag (default: `True`) 12 | 13 | !!! note "Abstract Class" 14 | `Distributor` is not supposed to be created directly. 15 | 16 | ## _get_num_tasks { #_get_num_tasks } 17 | 18 | ```py 19 | _get_num_tasks( 20 | self) -> int 21 | ``` 22 | 23 | `_get_num_tasks`...FIXME 24 | 25 | ## get_gpus_owned { #get_gpus_owned } 26 | 27 | ```py 28 | get_gpus_owned( 29 | context: Union[SparkContext, BarrierTaskContext]) -> List[str] 30 | ``` 31 | 32 | `get_gpus_owned`...FIXME 33 | -------------------------------------------------------------------------------- /docs/ml/index.md: -------------------------------------------------------------------------------- 1 | # PySpark MLlib 2 | 3 | **PySpark MLlib** is a Python module to work with Spark MLlib for `DataFrame`-based machine learning pipelines. 4 | 5 | ```py 6 | from pyspark.ml import * 7 | ``` 8 | -------------------------------------------------------------------------------- /docs/pandas-on-spark/.pages: -------------------------------------------------------------------------------- 1 | title: pandas API on Spark 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/pandas-on-spark/index.md: -------------------------------------------------------------------------------- 1 | # pandas API on Spark 2 | 3 | **pandas API on Spark** ([pyspark.pandas](../pyspark/pandas/index.md) package) has been added to PySpark to execute [pandas]({{ pandas.home }}) code on Spark clusters with no changes (except the import). 4 | 5 | There are two related PySpark packages with pandas support: 6 | 7 | * [pyspark.pandas](../pyspark/pandas/index.md) 8 | * [pyspark.sql.pandas](../pyspark/sql/pandas/index.md) 9 | 10 | !!! note "Spark Structured Streaming" 11 | pandas API on Spark does not support Spark Structured Streaming (_streaming queries_).
12 | 13 | ## Modules 14 | 15 | pandas API on Spark requires that the following modules to be installed: 16 | 17 | Module | Version 18 | -------|-------- 19 | [pandas]({{ pandas.home }}) | 1.0.5 20 | [PyArrow]({{ arrow.docs }}/python/index.html) | 1.0.0 21 | 22 | ## PYARROW_IGNORE_TIMEZONE { #PYARROW_IGNORE_TIMEZONE } 23 | 24 | For PyArrow 2.0.0 and above, pandas API on Spark requires `PYARROW_IGNORE_TIMEZONE` environment variable to be set to `1` (on the driver and executors). 25 | 26 | ## PYSPARK_PANDAS_USAGE_LOGGER { #PYSPARK_PANDAS_USAGE_LOGGER } 27 | 28 | pandas API on Spark uses `PYSPARK_PANDAS_USAGE_LOGGER` (formerly `KOALAS_USAGE_LOGGER`) environment variable for a usage logger. 29 | 30 | ## Demo 31 | 32 | ```py 33 | # The following would be required if we used pandas 34 | # import pandas as pd 35 | 36 | # but we don't need it anymore 😊 37 | 38 | # The only change is supposed to be this extra `pyspark` prefix 39 | # in the name of the package 40 | 41 | import pyspark.pandas as pd 42 | ``` 43 | 44 | === "Python" 45 | 46 | ```py 47 | pd.read_csv("people.csv") 48 | ``` 49 | 50 | ```text 51 | id name 52 | 0 0 zero 53 | 1 1 one 54 | 2 2 two 55 | ``` 56 | -------------------------------------------------------------------------------- /docs/pandas-udafs/.pages: -------------------------------------------------------------------------------- 1 | title: pandas UDAFs 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/pandas-udafs/index.md: -------------------------------------------------------------------------------- 1 | # pandas User-Defined Aggregate Functions 2 | 3 | **pandas User-Defined Aggregate Functions** (_pandas UDAFs_) are [PythonUDFs](../pandas-udfs/index.md) (with optional [PandasUDFType.GROUPED_AGG](../pyspark/sql/pandas/PandasUDFType.md#GROUPED_AGG) function type) to used as aggregation functions in [GroupedData.agg](../sql/GroupedData.md#agg) operator. 4 | 5 | pandas UDAFs are also known as **Group Aggregate pandas UDFs**. 6 | 7 | ## Limitations 8 | 9 | 1. There is no partial aggregation with group aggregate UDFs (i.e., a full shuffle is required). 10 | 1. All the data of a group will be loaded into memory, so there is a potential OOM risk if data is skewed and certain groups are too large to fit in memory 11 | 1. Group aggregate pandas UDFs and built-in aggregation functions cannot be mixed in a single [GroupedData.agg](../sql/GroupedData.md#agg) operator. Otherwise, the following `AnalysisException` is thrown: 12 | 13 | ```text 14 | [INVALID_PANDAS_UDF_PLACEMENT] The group aggregate pandas UDF `my_udaf` cannot be invoked together with as other, non-pandas aggregate functions. 
15 | ``` 16 | 17 | ## Demo 18 | 19 | ```py 20 | import pandas as pd 21 | from pyspark.sql.functions import pandas_udf 22 | ``` 23 | 24 | ```py 25 | @pandas_udf(returnType = "long") 26 | def my_count(s: pd.Series) -> 'long': 27 | return pd.Series(s.count()) 28 | ``` 29 | 30 | ```py 31 | from pyspark.sql.functions import abs 32 | nums = spark.range(5) # FIXME More meaningful dataset 33 | grouped_nums = (nums 34 | .withColumn("gid", abs((nums.id * 100) % 2)) 35 | .groupBy("gid")) 36 | count_by_gid_agg = my_count("gid").alias("count") 37 | counts_by_gid = grouped_nums.agg(count_by_gid_agg) 38 | ``` 39 | 40 | ```py 41 | counts_by_gid.show() 42 | ``` 43 | -------------------------------------------------------------------------------- /docs/pandas-udfs/.pages: -------------------------------------------------------------------------------- 1 | title: pandas UDFs 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/pandas-udfs/index.md: -------------------------------------------------------------------------------- 1 | # pandas User-Defined Functions 2 | 3 | **pandas User-Defined Functions** (_Vectorized User-Defined Functions_ or _pandas UDFs_) are user-defined functions that are executed using Apache Arrow to transfer data and pandas to work with the data, which allows for vectorized operations. 4 | 5 | Pandas UDFs are defined using [@pandas_udf](#pandas_udf) decorator. 6 | 7 | A Pandas UDF behaves as a regular PySpark function API in general. 8 | 9 | As of Spark 3.0.0 ([SPARK-28264](https://issues.apache.org/jira/browse/SPARK-28264)), using [Python type hints](https://www.python.org/dev/peps/pep-0484) in pandas UDF is encouraged (instead of specifying pandas UDF type via [functionType](#functionType) argument). 10 | 11 | The return type (type hint) of a user-defined function should be as follows: 12 | 13 | * `pandas.Series` ([pandas]({{ pandas.api }}/pandas.Series.html)) in most cases 14 | * `pandas.DataFrame` ([pandas]({{ pandas.api }}/pandas.DataFrame.html)) for `struct` input or output 15 | 16 | ## @pandas_udf Decorator { #pandas_udf } 17 | 18 | ```py 19 | pandas_udf( 20 | f=None, 21 | returnType=None, 22 | functionType=None) 23 | ``` 24 | 25 | [pandas_udf](../pyspark/sql/pandas/functions.md#pandas_udf) function is used a decorator (using `@pandas_udf` annotation). 26 | 27 | ??? note "Python Decorators" 28 | Learn more in [PEP 318 – Decorators for Functions and Methods]({{ python.peps }}/pep-0318/). 29 | 30 | `pandas_udf` belongs to [pyspark.sql.functions](../pyspark/sql/functions.md) module. 31 | 32 | ```py 33 | from pyspark.sql.functions import pandas_udf 34 | ``` 35 | 36 | ### functionType { #functionType } 37 | 38 | `functionType` can be one of [PandasUDFType](../pyspark/sql/pandas/PandasUDFType.md)s (but is currently discouraged in favour of type hints). 39 | 40 | ```py 41 | @pandas_udf(returnType = "long", functionType = PandasUDFType.GROUPED_AGG) 42 | def my_udaf(names: pd.Series) -> 'long': 43 | return pd.Series(names.count()) 44 | ``` 45 | 46 | `functionType` is also known as `evalType`. 47 | 48 | [SQL_SCALAR_PANDAS_UDF](../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_UDF) is the default scalar UDF type. 49 | 50 | ### returnType { #returnType } 51 | 52 | `@pandas_udf` decorator can optionally specify a return type (as the first positional argument or using `returnType`). 53 | 54 | A return type can be one of the names of `pyspark.sql.types.DataType` instances or the `DataType` themselves. 
55 | 56 | ```py 57 | @pandas_udf(dataType) 58 | @pandas_udf(returnType=dataType) 59 | ``` 60 | 61 | ## pandas UDAFs 62 | 63 | [pandas User-Defined Aggregate Functions](../pandas-udafs/index.md). 64 | 65 | ## Demo 66 | 67 | ```py 68 | import pandas as pd 69 | from pyspark.sql.functions import pandas_udf 70 | ``` 71 | 72 | ```py 73 | @pandas_udf("string") 74 | def to_upper(s: pd.Series) -> pd.Series: 75 | return s.str.upper() 76 | ``` 77 | 78 | ```py 79 | @pandas_udf("string") 80 | def my_concat(names: pd.Series, ages: pd.Series) -> pd.Series: 81 | return pd.Series([f"{n} is {a} years old" for (n, a) in zip(names, ages)]) 82 | ``` 83 | 84 | ```py 85 | pandas_df = pd.DataFrame({ 86 | 'name': ['jacek', 'agata', 'iweta', 'patryk', 'maksym'], 87 | 'age': [50, 49, 29, 26, 11] 88 | }) 89 | df = spark.createDataFrame(pandas_df) 90 | ``` 91 | 92 | ```text 93 | >>> df.show() 94 | +------+---+ 95 | | name|age| 96 | +------+---+ 97 | | jacek| 50| 98 | | agata| 49| 99 | | iweta| 29| 100 | |patryk| 26| 101 | |maksym| 11| 102 | +------+---+ 103 | ``` 104 | 105 | ```text 106 | >>> df.printSchema() 107 | root 108 | |-- name: string (nullable = true) 109 | |-- age: long (nullable = true) 110 | ``` 111 | 112 | ```py 113 | (df 114 | .select(to_upper(df.name).alias("upper_name")) 115 | .show()) 116 | ``` 117 | 118 | ```text 119 | +----------+ 120 | |upper_name| 121 | +----------+ 122 | | JACEK| 123 | | AGATA| 124 | | IWETA| 125 | | PATRYK| 126 | | MAKSYM| 127 | +----------+ 128 | ``` 129 | 130 | ```py 131 | df.select(my_concat(df.name, df.age)).show(truncate = False) 132 | ``` 133 | 134 | ```text 135 | +----------------------+ 136 | |my_concat(name, age) | 137 | +----------------------+ 138 | |jacek is 50 years old | 139 | |agata is 49 years old | 140 | |iweta is 29 years old | 141 | |patryk is 26 years old| 142 | |maksym is 11 years old| 143 | +----------------------+ 144 | ``` 145 | -------------------------------------------------------------------------------- /docs/pyspark/daemon.md: -------------------------------------------------------------------------------- 1 | # daemon.py 2 | 3 | `daemon.py` is a Python module in [pyspark](index.md) package. 4 | 5 | ```py 6 | from pyspark import daemon 7 | ``` 8 | 9 | ## Entry Point 10 | 11 | ??? note "Top-Level Code Environment" 12 | If the module is executed in the top-level code environment (e.g., `python -m`), its `__name__` is set to the string `__main__`. 13 | 14 | Sometimes "top-level code" is called an _entry point_ to the application. 15 | 16 | Learn more in the [\_\_main__ — Top-level code environment]({{ python.docs }}/library/__main__.html). 17 | 18 | When executed in the top-level code environment, `daemon.py` calls [manager](#manager) function. 19 | 20 | ## manager { #manager } 21 | 22 | ```py 23 | manager() 24 | ``` 25 | 26 | `manager` runs until it is stopped (e.g., `CTRL-C`). 27 | 28 | `manager` creates a new process group (`os.setpgid(0, 0)`). 29 | 30 | `manager` creates a listening socket on the loopback interface (possibly using IPv6 based on `SPARK_PREFER_IPV6` environment variable). 31 | 32 | `manager` reads `SPARK_REUSE_WORKER` environment variable (`reuse`). 33 | 34 | `manager` launches a [worker process](#worker) (in a child process using `os.fork()`). 35 | 36 | ### Launching Worker Process { #worker } 37 | 38 | ```py 39 | worker( 40 | sock: socket, 41 | authenticated: Bool) -> Optional[int] 42 | ``` 43 | 44 | !!! note 45 | `worker` is called by a worker process after the`os.fork()`. 46 | 47 | `worker` [runs a worker](worker.md#main). 
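The fork-per-worker pattern described above can be sketched in plain Python. This is a simplified illustration only, not the actual `daemon.py` code: authentication, `SPARK_REUSE_WORKER` handling and signal-based child management are omitted, and `run_worker` is a hypothetical stand-in for [main](worker.md#main) of `worker.py`.

```py
import os
import socket

def run_worker(conn: socket.socket) -> None:
    ...  # placeholder; the real worker loop lives in pyspark/worker.py

def manager_sketch() -> None:
    # New process group so the JVM can signal the daemon and all its workers at once
    os.setpgid(0, 0)

    # Listening socket on the loopback interface; the chosen port is reported back to the JVM
    listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    listener.bind(("127.0.0.1", 0))
    listener.listen(128)

    while True:
        conn, _ = listener.accept()
        if os.fork() == 0:
            # Child process: serve exactly one worker connection, then exit
            listener.close()
            run_worker(conn)
            os._exit(0)
        # Parent (the daemon): close its copy of the connection and keep accepting
        conn.close()
```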
48 | -------------------------------------------------------------------------------- /docs/pyspark/index.md: -------------------------------------------------------------------------------- 1 | # pyspark Package 2 | 3 | ```py 4 | import pyspark 5 | ``` 6 | 7 | ## \_\_all__ 8 | 9 | ??? note "import *" 10 | The `import` statement uses the following convention: if a package’s `__init__.py` code defines a list named `__all__`, it is taken to be the list of module names that should be imported when `from package import *` is encountered. 11 | 12 | Learn more in [6.4.1. Importing * From a Package]({{ python.docs }}/tutorial/modules.html#importing-from-a-package). 13 | 14 | * `SparkConf` 15 | * `SparkContext` 16 | * `SparkFiles` 17 | * `RDD` 18 | * `StorageLevel` 19 | * `Broadcast` 20 | * `Accumulator` 21 | * `AccumulatorParam` 22 | * `MarshalSerializer` 23 | * `CPickleSerializer` 24 | * `StatusTracker` 25 | * `SparkJobInfo` 26 | * `SparkStageInfo` 27 | * `Profiler` 28 | * `BasicProfiler` 29 | * `TaskContext` 30 | * `RDDBarrier` 31 | * `BarrierTaskContext` 32 | * `BarrierTaskInfo` 33 | * `InheritableThread` 34 | * `inheritable_thread_target` 35 | * `__version__` 36 | -------------------------------------------------------------------------------- /docs/pyspark/java_gateway.md: -------------------------------------------------------------------------------- 1 | # java_gateway.py 2 | 3 | `java_gateway` is a Python module that allows [launching a gateway process](#launch_gateway) to establish communication channel to [Py4JServer](../Py4JServer.md). 4 | 5 | ## launch_gateway 6 | 7 | ```python 8 | launch_gateway( 9 | conf=None, 10 | popen_kwargs=None) 11 | ``` 12 | 13 | `launch_gateway` reads [PYSPARK_GATEWAY_PORT](../environment-variables.md#PYSPARK_GATEWAY_PORT) and [PYSPARK_GATEWAY_SECRET](../environment-variables.md#PYSPARK_GATEWAY_SECRET) environment variables if defined and assumes that the child Java gateway process has already been started (e.g. [PythonGatewayServer](../PythonGatewayServer.md)). 14 | 15 | 16 | 17 | Otherwise, `launch_gateway` builds the command to start `spark-submit`: 18 | 19 | 1. Finds `SPARK_HOME` with `./bin/spark-submit` 20 | 1. Appends all the configuration properties (from the input `conf`) using `--conf` 21 | 1. Appends `PYSPARK_SUBMIT_ARGS` environment variable if defined or assumes `pyspark-shell` 22 | 23 | `launch_gateway` sets up `_PYSPARK_DRIVER_CONN_INFO_PATH` environment variable to point at an unique temporary file. 24 | 25 | `launch_gateway` configures a pipe to stdin for the corresponding Java gateway process to use to monitor the Python process. 26 | 27 | `launch_gateway` starts `bin/spark-submit` command and waits for a connection info file to be created at `_PYSPARK_DRIVER_CONN_INFO_PATH`. `launch_gateway` reads the port and the secret from the file once available. 28 | 29 | `launch_gateway` connects to the gateway using py4j's `ClientServer` or `JavaGateway` based on [PYSPARK_PIN_THREAD](../environment-variables.md#PYSPARK_PIN_THREAD) environment variable. 
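The connection step could look roughly as follows. This is a sketch only, assuming the port and secret have already been read from the connection info file (the real code wires up more parameters, e.g. auto-conversion and connection daemonization, for both modes).

```py
import os
from py4j.clientserver import ClientServer, JavaParameters, PythonParameters
from py4j.java_gateway import JavaGateway, GatewayParameters

def connect_sketch(gateway_port: int, gateway_secret: str):
    """Sketch only: connect to an already-running Java gateway."""
    if os.environ.get("PYSPARK_PIN_THREAD", "false").lower() == "true":
        # Pinned thread mode: one Python thread maps to one JVM thread
        return ClientServer(
            java_parameters=JavaParameters(
                port=gateway_port, auth_token=gateway_secret, auto_convert=True),
            python_parameters=PythonParameters(port=0, eager_load=False))
    # Classic mode: a single GatewayServer connection
    return JavaGateway(
        gateway_parameters=GatewayParameters(
            port=gateway_port, auth_token=gateway_secret, auto_convert=True))
```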
30 | 31 | `launch_gateway` imports Spark packages and classes (using py4j): 32 | 33 | * `org.apache.spark.SparkConf` 34 | * `org.apache.spark.api.java.*` 35 | * `org.apache.spark.api.python.*` 36 | * `org.apache.spark.ml.python.*` 37 | * `org.apache.spark.mllib.api.python.*` 38 | * `org.apache.spark.resource.*` 39 | * `org.apache.spark.sql.*` 40 | * `org.apache.spark.sql.api.python.*` 41 | * `org.apache.spark.sql.hive.*` 42 | * `scala.Tuple2` 43 | 44 | `launch_gateway` is used when: 45 | 46 | * `SparkContext` is requested to [_ensure_initialized](../SparkContext.md#_ensure_initialized) 47 | -------------------------------------------------------------------------------- /docs/pyspark/pandas/DataFrame.md: -------------------------------------------------------------------------------- 1 | # DataFrame 2 | 3 | `DataFrame` is a [Frame](generic/Frame.md) with an [InternalFrame](InternalFrame.md). 4 | 5 | `DataFrame` is a `Generic[T]` ([Python]({{ python.api }}/library/typing.html#user-defined-generic-types)). 6 | 7 | ## Creating Instance 8 | 9 | `DataFrame` takes the following to be created: 10 | 11 | * data (optional) 12 | * index (optional) 13 | * columns (optional) 14 | * dtype (optional) 15 | * copy (optional) 16 | 17 | ### _internal_frame { #_internal_frame } 18 | 19 | `DataFrame` is given or creates an [InternalFrame](InternalFrame.md) when [created](#creating-instance). 20 | 21 | ```py 22 | object.__setattr__(self, "_internal_frame", internal) 23 | ``` 24 | 25 | ## InternalFrame { #_internal } 26 | 27 | ??? note "Frame" 28 | 29 | ```py 30 | @property 31 | def _internal( 32 | self) -> InternalFrame 33 | ``` 34 | 35 | `_internal` is part of the [Frame](generic/Frame.md#_internal) abstraction. 36 | 37 | `_internal` returns the [_internal_frame](#_internal_frame) (that is expected to be of type [InternalFrame](InternalFrame.md)). 38 | -------------------------------------------------------------------------------- /docs/pyspark/pandas/InternalFrame.md: -------------------------------------------------------------------------------- 1 | # InternalFrame 2 | 3 | `InternalFrame` is the underlying managed Spark DataFrame of [pyspark.pandas.DataFrame](DataFrame.md#_internal). 4 | 5 | ## Creating Instance 6 | 7 | `InternalFrame` takes the following to be created: 8 | 9 | * [Spark DataFrame](#spark_frame) 10 | * `index_spark_columns` (optional) 11 | * `index_names` (optional) 12 | * `index_fields` (optional) 13 | * `column_labels` (optional) 14 | * `data_spark_columns` (optional) 15 | * `data_fields` (optional) 16 | * `column_label_names` (optional) 17 | 18 | ### Spark DataFrame { #spark_frame } 19 | 20 | `InternalFrame` is given a Spark [DataFrame](../../sql/DataFrame.md) when [created](#creating-instance). 21 | 22 | ## Managed Spark DataFrame { #_sdf } 23 | 24 | `_sdf` is the underlying managed Spark DataFrame. 25 | 26 | `_sdf` is the [Spark DataFrame](#spark_frame) with [attach_default_index](#attach_default_index) and [\_\_natural_order__](#NATURAL_ORDER_COLUMN_NAME) columns selected. 27 | 28 | ## Default Index Column Name { #SPARK_DEFAULT_INDEX_NAME } 29 | 30 | `InternalFrame` uses the following as the name of the default index column: 31 | 32 | ```text 33 | __index_level_0__ 34 | ``` 35 | 36 | ## Index Column Pattern { #SPARK_INDEX_NAME_PATTERN } 37 | 38 | `InternalFrame` defines a regular pattern to match the index columns. 39 | 40 | ```text 41 | __index_level_[0-9]+__ 42 | ``` 43 | 44 | It is invalid to name columns in the [Spark DataFrame](#spark_frame) to match the index column pattern. 
45 | Index columns must not be in the columns of the Spark DataFrame. 46 | 47 | ## to_internal_spark_frame { #to_internal_spark_frame } 48 | 49 | ```py 50 | @lazy_property 51 | def to_internal_spark_frame( 52 | self) -> SparkDataFrame 53 | ``` 54 | 55 | `to_internal_spark_frame` returns the [spark_frame](#spark_frame) with the [index_spark_columns](#index_spark_columns) followed by the [data_spark_columns](#data_spark_columns). 56 | 57 | ## spark_frame { #spark_frame } 58 | 59 | ```py 60 | from pyspark.sql import DataFrame as SparkDataFrame 61 | 62 | @property 63 | def spark_frame( 64 | self) -> SparkDataFrame 65 | ``` 66 | 67 | `spark_frame` returns the underlying [managed Spark DataFrame](#_sdf). 68 | 69 | ## Demo 70 | 71 | ```py 72 | from pyspark import pandas as ps 73 | 74 | psdf = ps.DataFrame({ 75 | 'A': [1, 2, 3, 4], 76 | 'B': [5, 6, 7, 8], 77 | 'C': [9, 10, 11, 12], 78 | 'D': [13, 14, 15, 16], 79 | 'E': [17, 18, 19, 20]}, columns = ['A', 'B', 'C', 'D', 'E']) 80 | 81 | psdf._internal 82 | # 83 | 84 | psdf._internal.spark_frame 85 | # DataFrame[__index_level_0__: bigint, A: bigint, B: bigint, C: bigint, D: bigint, E: bigint, __natural_order__: bigint] 86 | 87 | psdf._internal.spark_frame.show() 88 | # +-----------------+---+---+---+---+---+-----------------+ 89 | # |__index_level_0__| A| B| C| D| E|__natural_order__| 90 | # +-----------------+---+---+---+---+---+-----------------+ 91 | # | 0| 1| 5| 9| 13| 17| 17179869184| 92 | # | 1| 2| 6| 10| 14| 18| 42949672960| 93 | # | 2| 3| 7| 11| 15| 19| 68719476736| 94 | # | 3| 4| 8| 12| 16| 20| 94489280512| 95 | # +-----------------+---+---+---+---+---+-----------------+ 96 | 97 | psdf._internal.to_internal_spark_frame.show() 98 | # +-----------------+---+---+---+---+---+ 99 | # |__index_level_0__| A| B| C| D| E| 100 | # +-----------------+---+---+---+---+---+ 101 | # | 0| 1| 5| 9| 13| 17| 102 | # | 1| 2| 6| 10| 14| 18| 103 | # | 2| 3| 7| 11| 15| 19| 104 | # | 3| 4| 8| 12| 16| 20| 105 | # +-----------------+---+---+---+---+---+ 106 | ``` 107 | -------------------------------------------------------------------------------- /docs/pyspark/pandas/generic/Frame.md: -------------------------------------------------------------------------------- 1 | # Frame 2 | 3 | `Frame` is an [abstraction](#contract) of [frames](#implementations) that behave like [pandas.DataFrame]({{ pandas.api }}/pandas.DataFrame.html) and [pandas.Series]({{ pandas.api }}/pandas.Series.html). 
4 | 5 | ```py 6 | class Frame(object, metaclass=ABCMeta) 7 | ``` 8 | 9 | ## Contract 10 | 11 | ### \_\_getitem\_\_ { #__getitem } 12 | 13 | ```py 14 | @abstractmethod 15 | def __getitem__( 16 | self, 17 | key: Any) -> Any 18 | ``` 19 | 20 | ```py 21 | class hello(): 22 | def __getitem__(self, key): 23 | print(f"__getitem__({key})") 24 | 25 | h = hello() 26 | 27 | >>> h[4] 28 | __getitem__(4) 29 | ``` 30 | 31 | ### _internal { #_internal } 32 | 33 | ```py 34 | @property 35 | @abstractmethod 36 | def _internal( 37 | self) -> InternalFrame 38 | ``` 39 | 40 | ## Implementations 41 | 42 | * [DataFrame](../DataFrame.md) 43 | * `Series` 44 | -------------------------------------------------------------------------------- /docs/pyspark/pandas/generic/index.md: -------------------------------------------------------------------------------- 1 | # pyspark.pandas.generic Package 2 | 3 | `pyspark.pandas.generic` package is...FIXME 4 | -------------------------------------------------------------------------------- /docs/pyspark/pandas/index.md: -------------------------------------------------------------------------------- 1 | # pyspark.pandas Package 2 | 3 | When imported (that triggers `__init__.py`), `pyspark.pandas` does _monkey-patching_ of `pandas.DataFrame` and `pandas.Series` classes (using [\_\_class_getitem__]({{ python.docs }}/reference/datamodel.html#emulating-generic-types) dunder method). 4 | 5 | Pandas | PySpark 6 | -------|-------- 7 | [pandas.DataFrame]({{ pandas.api }}/pandas.DataFrame.html) | `pyspark.pandas.frame.DataFrame` 8 | [pandas.Series]({{ pandas.api }}/pandas.Series.html) | `pyspark.pandas.series.Series` 9 | -------------------------------------------------------------------------------- /docs/pyspark/rdd.md: -------------------------------------------------------------------------------- 1 | # rdd.py 2 | 3 | `rdd` module (in `pyspark` package) defines [RDD](../RDD.md). 4 | 5 | ```py 6 | from pyspark.rdd import * 7 | ``` 8 | 9 | ## \_\_all__ 10 | 11 | ??? note "import *" 12 | The `import` statement uses the following convention: if a package’s `__init__.py` code defines a list named `__all__`, it is taken to be the list of module names that should be imported when `from package import *` is encountered. 13 | 14 | Learn more in [6.4.1. Importing * From a Package]({{ python.docs }}/tutorial/modules.html#importing-from-a-package). 15 | 16 | * [RDD](../RDD.md) 17 | 18 | ## _prepare_for_python_RDD { #_prepare_for_python_RDD } 19 | 20 | ```py 21 | _prepare_for_python_RDD( 22 | sc: "SparkContext", 23 | command: Any) -> Tuple[bytes, Any, Any, Any] 24 | ``` 25 | 26 | `_prepare_for_python_RDD` creates a `CloudPickleSerializer` to `dumps` the given `command` pair (that creates a `pickled_command`). 27 | 28 | If the size of the `pickled_command` is above the [broadcast threshold](../PythonUtils.md#getBroadcastThreshold), `_prepare_for_python_RDD` creates a broadcast variable for `pickled_command` that is in turn `dumps` using the `CloudPickleSerializer` (that overrides the `pickled_command`). 
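A simplified sketch of that step follows. It is illustrative only: the broadcast threshold value is an assumption (the real one comes from [getBroadcastThreshold](../PythonUtils.md#getBroadcastThreshold)) and the bookkeeping of broadcast variables is reduced to a plain list.

```py
from pyspark.serializers import CloudPickleSerializer

def prepare_command_sketch(sc, command, broadcast_threshold=1 << 20):  # threshold is an assumption
    ser = CloudPickleSerializer()
    pickled_command = ser.dumps(command)
    broadcast_vars = []
    if len(pickled_command) > broadcast_threshold:
        # Ship the big pickled command as a broadcast variable and
        # send only the (small) pickled broadcast handle to the JVM
        bvar = sc.broadcast(pickled_command)
        pickled_command = ser.dumps(bvar)
        broadcast_vars.append(bvar)
    return pickled_command, broadcast_vars
```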
29 | 30 | In the end, `_prepare_for_python_RDD` returns the following: 31 | 32 | * `pickled_command` 33 | * `broadcast_vars` 34 | * [environment](../SparkContext.md#environment) 35 | * [_python_includes](../SparkContext.md#_python_includes) 36 | 37 | --- 38 | 39 | `_prepare_for_python_RDD` is used when: 40 | 41 | * `pyspark.rdd` is requested to [_wrap_function](#_wrap_function) 42 | * `pyspark.sql.udf` is requested to [_wrap_function](sql/udf.md#_wrap_function) 43 | -------------------------------------------------------------------------------- /docs/pyspark/shell.md: -------------------------------------------------------------------------------- 1 | # shell.py 2 | 3 | `shell.py` script is the interactive shell of PySpark. 4 | 5 | `shell.py` defines the following variables: 6 | 7 | * `sc` being [pyspark.SparkContext](../SparkContext.md) 8 | * `spark` being [pyspark.sql.session.SparkSession](../pyspark/sql/SparkSession.md) 9 | * `sql` being [SparkSession.sql](../pyspark/sql/SparkSession.md#sql) 10 | * `sqlContext` and `sqlCtx` for compatibility 11 | -------------------------------------------------------------------------------- /docs/pyspark/sql/.pages: -------------------------------------------------------------------------------- 1 | title: pyspark.sql 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/pyspark/sql/SparkSession.Builder.md: -------------------------------------------------------------------------------- 1 | # SparkSession.Builder 2 | 3 | ## Creating Instance 4 | 5 | `Builder` takes no arguments to be created. 6 | 7 | `Builder` is created when: 8 | 9 | * `SparkSession` is requested for [one](SparkSession.md#builder) 10 | 11 | ## getOrCreate { #getOrCreate } 12 | 13 | ```py 14 | getOrCreate( 15 | self) -> "SparkSession" 16 | ``` 17 | 18 | With `SPARK_REMOTE` environment variable or `spark.remote` configuration property defined, `getOrCreate`...FIXME 19 | 20 | `getOrCreate` [_instantiatedSession](SparkSession.md#_instantiatedSession). 21 | 22 | Unless `SparkSession` is already created, `getOrCreate` creates [one](SparkSession.md). 23 | -------------------------------------------------------------------------------- /docs/pyspark/sql/SparkSession.md: -------------------------------------------------------------------------------- 1 | --- 2 | tags: 3 | - Python 4 | --- 5 | 6 | # SparkSession 7 | 8 | `SparkSession` is a Python class in [pyspark.sql.session](session.md) module. 9 | 10 | ```py 11 | from pyspark.sql.session import SparkSession 12 | ``` 13 | 14 | ## SparkConversionMixin { #SparkConversionMixin } 15 | 16 | `SparkSession` uses [SparkConversionMixin](../../sql/SparkConversionMixin.md) (for pandas to Spark conversion). 17 | 18 | ## Creating Instance 19 | 20 | `SparkSession` takes the following to be created: 21 | 22 | * [SparkContext](../../SparkContext.md) 23 | * `SparkSession` (`Optional[JavaObject]`) 24 | * Options 25 | 26 | While being created, `SparkSession` gets access to [_jsc](#_jsc) and [_jvm](#_jvm) using the given [SparkContext](#_sc). 27 | 28 | !!! note 29 | It is expected that [_jvm](../../SparkContext.md#_jvm) is defined (or an exception is thrown). 30 | 31 | Unless the given [SparkSession](#jsparkSession) is defined, `SparkSession` gets one from the [_jvm](../../SparkContext.md#_jvm). 32 | 33 | `SparkSession` [_monkey_patch_RDD](#_monkey_patch_RDD). 34 | 35 | `SparkSession` [install_exception_handler](#install_exception_handler). 
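For reference, user code normally goes through the [Builder](SparkSession.Builder.md) rather than this constructor. A standard usage example (assuming a local Spark installation):

```py
from pyspark.sql import SparkSession

spark = (SparkSession.builder
    .master("local[*]")
    .appName("builder-demo")
    .getOrCreate())

spark.range(3).show()
spark.stop()
```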
36 | 37 | --- 38 | 39 | `SparkSession` is created when: 40 | 41 | * `SparkSession.Builder` is requested to [get or create one](SparkSession.Builder.md#getOrCreate) 42 | * `SparkSession` is requested to [get an active SparkSession](#getActiveSession) 43 | 44 | ### Java SparkContext { #_jsc } 45 | 46 | ```py 47 | _jsc: JavaObject 48 | ``` 49 | 50 | `_jsc` is a Java `SparkContext` ([Spark Core]({{ book.spark_core }}/SparkContext)) that is created through Py4J. 51 | 52 | ??? note "JavaObject" 53 | `JavaObject` ([Py4J]({{ py4j.docs }}/py4j_java_gateway.html#javaobject)) represents a Java object from which you can call methods or access fields. 54 | 55 | `_jsc` is initialized when `SparkSession` is [created](#creating-instance) to be the [_jsc](../../SparkContext.md#_jsc) of the given [SparkContext](#_sc). 56 | 57 | `_jsc` is used (among the other internal uses) when: 58 | 59 | * `SCCallSiteSync` is requested to `__enter__` and `__exit__` 60 | 61 | ### py4j JVMView { #_jvm } 62 | 63 | ```py 64 | _jvm: ClassVar[Optional[JVMView]] 65 | ``` 66 | 67 | ??? note "JVMView" 68 | `JVMView` ([Py4J]({{ py4j.docs }}/py4j_java_gateway.html#jvmview)) that allows access to the Java Virtual Machine of a `JavaGateway`. 69 | 70 | `JVMView` can be used to reference static members (fields and methods) and to call constructors. 71 | 72 | From [py4j.JVMView]({{ py4j.javadoc }}/py4j/JVMView.html) javadoc: 73 | 74 | > A JVM view keeps track of imports and import searches. A Python client can have multiple JVM views (e.g., one for each module) so that imports in one view do not conflict with imports from other views. 75 | 76 | `_jvm` is initialized when `SparkSession` is [created](#creating-instance) to be the [_jvm](../../SparkContext.md#_jvm) of the given [SparkContext](#_sc). 77 | 78 | `_jvm` must be defined when `SparkSession` is [created](#creating-instance) or an `AssertionError` is thrown. 79 | 80 | `_jvm` is "cleared" (_stopped_) in [stop](#stop). 81 | 82 | `_jvm` is used (among the other internal uses) when: 83 | 84 | * `ChannelBuilder` is requested to `default_port` 85 | * `InternalFrame` is requested to `attach_distributed_column` 86 | * `DataFrameReader` is requested to `csv` and `json` 87 | * `pyspark.pandas.spark.functions.py` module is requested to `_call_udf` and `_make_arguments` 88 | * `SparkConversionMixin` is requested to [_create_from_pandas_with_arrow](../../sql/SparkConversionMixin.md#_create_from_pandas_with_arrow) 89 | * `SparkSession` is requested to [_create_dataframe](#_create_dataframe) 90 | 91 | ```text 92 | >>> type(spark) 93 | 94 | 95 | >>> type(spark._jvm) 96 | 97 | ``` 98 | 99 | ## Creating Builder { #builder } 100 | 101 | ```py 102 | @classproperty 103 | builder( 104 | cls) -> Builder 105 | ``` 106 | 107 | ??? note "`@classproperty` Decorator" 108 | `builder` is a `@classproperty` that is PySpark-specific to mimic how [@classmethod]({{ python.docs }}/library/functions.html#classmethod) and [@property]({{ python.docs }}/library/functions.html#property) should work together. 109 | 110 | `builder` creates a new [SparkSession.Builder](SparkSession.Builder.md). 111 | 112 | ## \_\_enter__ 113 | 114 | ```py 115 | __enter__( 116 | self) -> "SparkSession" 117 | ``` 118 | 119 | ??? note "Special Method" 120 | Enables `with SparkSession.builder.(...).getOrCreate() as session:` syntax. 121 | 122 | Learn more: 123 | 124 | 1. [PEP 343 – The "with" Statement]({{ python.peps }}/pep-0343/) 125 | 1. [3.3.9. 
With Statement Context Managers]({{ python.docs }}/reference/datamodel.html#with-statement-context-managers) 126 | 1. [Context Managers and Python's with Statement]({{ python.realpython }}/python-with-statement/) 127 | 128 | `__enter__` returns `self`. 129 | 130 | ## \_\_exit__ 131 | 132 | ```py 133 | __exit__( 134 | self, 135 | exc_type: Optional[Type[BaseException]], 136 | exc_val: Optional[BaseException], 137 | exc_tb: Optional[TracebackType], 138 | ) -> None 139 | ``` 140 | 141 | ??? note "Special Method" 142 | Enables `with SparkSession.builder.(...).getOrCreate() as session:` syntax. 143 | 144 | Learn more: 145 | 146 | 1. [PEP 343 – The "with" Statement]({{ python.peps }}/pep-0343/) 147 | 1. [3.3.9. With Statement Context Managers]({{ python.docs }}/reference/datamodel.html#with-statement-context-managers) 148 | 1. [Context Managers and Python's with Statement]({{ python.realpython }}/python-with-statement/) 149 | 150 | `__exit__` [stop](#stop) this `SparkSession` (which is exactly what `__exit__` is supposed to do with resource manager once they're out of scope and resources should be released). 151 | 152 | ## _create_shell_session { #_create_shell_session } 153 | 154 | ```py 155 | @staticmethod 156 | _create_shell_session() -> "SparkSession" 157 | ``` 158 | 159 | ??? note "`@staticmethod`" 160 | Learn more in [Python Documentation]({{ python.docs }}/library/functions.html#staticmethod). 161 | 162 | `_create_shell_session`...FIXME 163 | 164 | --- 165 | 166 | `_create_shell_session` is used when: 167 | 168 | * [pyspark/shell.py](../shell.md) module is imported 169 | 170 | ## Executing SQL Statement { #sql } 171 | 172 | ```py 173 | sql( 174 | self, 175 | sqlQuery: str, 176 | args: Optional[Dict[str, Any]] = None, 177 | **kwargs: Any) -> DataFrame 178 | ``` 179 | 180 | `sql` creates a [DataFrame](../../sql/DataFrame.md) with the `sqlQuery` query executed. 181 | 182 | `sql` uses `SQLStringFormatter` to `format` the given `sqlQuery` with the `kwargs`, if defined. 183 | -------------------------------------------------------------------------------- /docs/pyspark/sql/UserDefinedFunction.md: -------------------------------------------------------------------------------- 1 | # UserDefinedFunction 2 | 3 | `UserDefinedFunction` is a Python class in [pyspark.sql.udf](udf.md) module. 4 | 5 | ```py 6 | from pyspark.sql.udf import UserDefinedFunction 7 | ``` 8 | 9 | ## Creating Instance 10 | 11 | `UserDefinedFunction` takes the following to be created: 12 | 13 | * Function (`Callable`) 14 | * Return Type (default: `StringType`) 15 | * Name (default: `None`) 16 | * Eval Type (default: [SQL_BATCHED_UDF](../../sql/PythonEvalType.md#SQL_BATCHED_UDF)) 17 | * `deterministic` flag (default: `True`) 18 | 19 | `UserDefinedFunction` is created when: 20 | 21 | * [_create_udf](udf.md#_create_udf) (from `pyspark.sql.udf` module) is executed 22 | 23 | ### _judf_placeholder { #_judf_placeholder } 24 | 25 | `UserDefinedFunction` initializes `_judf_placeholder` to be `None` when [created](#creating-instance). 26 | 27 | `_judf_placeholder` is [_create_judf](#_create_judf) of the [func](#func) when `UserDefinedFunction` is requested to [_judf](#_judf). 28 | 29 | `_judf_placeholder` is available as [_judf](#_judf). 30 | 31 | `_judf_placeholder` can be reset (`None`) when `UserDefinedFunction` is requested to [asNondeterministic](#asNondeterministic). 32 | 33 | ## \_\_call__ 34 | 35 | ```py 36 | __call__( 37 | self, 38 | *cols: "ColumnOrName") -> Column 39 | ``` 40 | 41 | ??? 
note "Emulating callable objects" 42 | Instances of arbitrary classes can be made callable by defining a `__call__()` method in their class. 43 | 44 | `__call__` is called when an instance is "called" as a function. 45 | 46 | Learn more in [3.3.6. Emulating callable objects]({{ python.docs }}/reference/datamodel.html?#object.__call__). 47 | 48 | With `profiler_collector` enabled, `__call__`...FIXME 49 | 50 | Otherwise, `__call__` assigns the [_judf](#_judf) as the [judf](#judf) and creates a [PythonUDF](../../sql/PythonUDF.md). 51 | 52 | In the end, `__call__` creates a `Column` with the `PythonUDF`. 53 | 54 | ## _judf { #_judf } 55 | 56 | ```py 57 | @property 58 | _judf( 59 | self) -> JavaObject 60 | ``` 61 | 62 | `_judf` [_create_judf](#_create_judf) for the [func](#func) unless the [_judf_placeholder](#_judf_placeholder) has already been initialized. 63 | 64 | In the end, `_judf` returns the [_judf_placeholder](#_judf_placeholder). 65 | 66 | --- 67 | 68 | `_judf` is used when: 69 | 70 | * `UserDefinedFunction` is requested to [\_\_call__](#__call__) 71 | * `UDFRegistration` is requested to [register](../../sql/UDFRegistration.md#register) 72 | 73 | ## Creating Java UserDefinedPythonFunction { #_create_judf } 74 | 75 | ```py 76 | _create_judf( 77 | self, 78 | func: Callable[..., Any]) -> JavaObject 79 | ``` 80 | 81 | `_create_judf` uses the [_jvm](../../SparkContext.md#_jvm) bridge to create a [UserDefinedPythonFunction](../../sql/UserDefinedPythonFunction.md) with the following: 82 | 83 | * [_name](#_name) 84 | * [SimplePythonFunction](udf.md#_wrap_function) (with a pickled version) of the given `func` and the [returnType](#returnType) 85 | * The [returnType](#returnType) (parsed from JSON format to Java) 86 | * [evalType](#evalType) 87 | * [deterministic](#deterministic) 88 | 89 | --- 90 | 91 | `_create_judf` is used when: 92 | 93 | * `UserDefinedFunction` is requested to [\_\_call__](#__call__) and [_judf](#_judf) 94 | -------------------------------------------------------------------------------- /docs/pyspark/sql/dataframe.md: -------------------------------------------------------------------------------- 1 | # dataframe.py 2 | 3 | `dataframe` module (in `pyspark.sql` package) defines [DataFrame et al.](#__all__) 4 | 5 | ```py 6 | from pyspark.sql.dataframe import * 7 | ``` 8 | 9 | ## \_\_all__ 10 | 11 | ??? note "import *" 12 | The `import` statement uses the following convention: if a package’s `__init__.py` code defines a list named `__all__`, it is taken to be the list of module names that should be imported when `from package import *` is encountered. 13 | 14 | Learn more in [6.4.1. Importing * From a Package]({{ python.docs }}/tutorial/modules.html#importing-from-a-package). 15 | 16 | * [DataFrame](../../sql/DataFrame.md) 17 | * `DataFrameNaFunctions` 18 | * `DataFrameStatFunctions` 19 | -------------------------------------------------------------------------------- /docs/pyspark/sql/functions.md: -------------------------------------------------------------------------------- 1 | # functions.py 2 | 3 | `functions.py` module belongs to `pyspark.sql` package. 
4 | 5 | ```py 6 | from pyspark.sql.functions import udf 7 | ``` 8 | 9 | ## udf 10 | 11 | ```py 12 | udf( 13 | f: Optional[Union[Callable[..., Any], "DataTypeOrString"]] = None, 14 | returnType: "DataTypeOrString" = StringType(), 15 | ) -> Union["UserDefinedFunctionLike", Callable[[Callable[..., Any]], "UserDefinedFunctionLike"]] 16 | ``` 17 | 18 | `udf` [_create_py_udf](udf.md#_create_py_udf) with [SQL_BATCHED_UDF](../../sql/PythonEvalType.md#SQL_BATCHED_UDF) eval type. 19 | -------------------------------------------------------------------------------- /docs/pyspark/sql/group.md: -------------------------------------------------------------------------------- 1 | # group.py 2 | 3 | `group` module (in `pyspark.sql` package) defines [GroupedData](../../sql/GroupedData.md). 4 | 5 | ```py 6 | from pyspark.sql.group import * 7 | ``` 8 | 9 | ## \_\_all__ 10 | 11 | ??? note "import *" 12 | The `import` statement uses the following convention: if a package’s `__init__.py` code defines a list named `__all__`, it is taken to be the list of module names that should be imported when `from package import *` is encountered. 13 | 14 | Learn more in [6.4.1. Importing * From a Package]({{ python.docs }}/tutorial/modules.html#importing-from-a-package). 15 | 16 | * [GroupedData](../../sql/GroupedData.md) 17 | -------------------------------------------------------------------------------- /docs/pyspark/sql/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: pyspark.sql 3 | --- 4 | 5 | # pyspark.sql Package 6 | 7 | ```py 8 | import pyspark.sql 9 | ``` 10 | 11 | ## \_\_all__ 12 | 13 | ??? note "import *" 14 | The `import` statement uses the following convention: if a package’s `__init__.py` code defines a list named `__all__`, it is taken to be the list of module names that should be imported when `from package import *` is encountered. 15 | 16 | Learn more in [6.4.1. Importing * From a Package]({{ python.docs }}/tutorial/modules.html#importing-from-a-package). 17 | 18 | * [SparkSession](SparkSession.md) 19 | * `SQLContext` 20 | * `HiveContext` 21 | * [UDFRegistration](../../sql/UDFRegistration.md) 22 | * `DataFrame` 23 | * [GroupedData](../../sql/GroupedData.md) 24 | * `Column` 25 | * `Catalog` 26 | * [Observation](../../sql/Observation.md) 27 | * `Row` 28 | * `DataFrameNaFunctions` 29 | * `DataFrameStatFunctions` 30 | * `Window` 31 | * `WindowSpec` 32 | * `DataFrameReader` 33 | * `DataFrameWriter` 34 | * `DataFrameWriterV2` 35 | * `PandasCogroupedOps` 36 | -------------------------------------------------------------------------------- /docs/pyspark/sql/pandas/PandasUDFType.md: -------------------------------------------------------------------------------- 1 | # PandasUDFType 2 | 3 | !!! warning "Deprecation Notice" 4 | As of [PySpark 3.0.0](https://issues.apache.org/jira/browse/SPARK-28264), `PandasUDFType` is deprecated in favour of Python type hints. 5 | 6 | `PandasUDFType` is the `functionType` of [pandas_udf](../../../pandas-udfs/index.md#pandas_udf) for Python methods to be used as [pandas UDFs](../../../pandas-udfs/index.md) (with the types matching [PythonEvalType](../../../sql/PythonEvalType.md) on the JVM/Scala side). 
7 | 8 | PandasUDFType | PythonEvalType 9 | --------------|--------------- 10 | `GROUPED_AGG` | [SQL_GROUPED_AGG_PANDAS_UDF](../../../sql/PythonEvalType.md#SQL_GROUPED_AGG_PANDAS_UDF) 11 | `GROUPED_MAP` | [SQL_GROUPED_MAP_PANDAS_UDF](../../../sql/PythonEvalType.md#SQL_GROUPED_MAP_PANDAS_UDF) 12 | `SCALAR` | [SQL_SCALAR_PANDAS_UDF](../../../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_UDF) 13 | `SCALAR_ITER` | [SQL_SCALAR_PANDAS_ITER_UDF](../../../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_ITER_UDF) 14 | -------------------------------------------------------------------------------- /docs/pyspark/sql/pandas/functions.md: -------------------------------------------------------------------------------- 1 | # functions.py 2 | 3 | `functions.py` defines [pandas_udf](#pandas_udf) for [pandas user-defined function](../../../pandas-udfs/index.md). 4 | 5 | `functions.py` is part of `pyspark.sql.pandas` package. 6 | 7 | ```python 8 | from pyspark.sql.functions import pandas_udf 9 | ``` 10 | 11 | ## pandas_udf { #pandas_udf } 12 | 13 | ```python 14 | pandas_udf( 15 | f=None, 16 | returnType=None, 17 | functionType=None) 18 | ``` 19 | 20 | `pandas_udf` creates a [pandas user-defined function](../../../pandas-udfs/index.md). 21 | 22 | `pandas_udf` [_create_pandas_udf](#_create_pandas_udf) (possibly creating a partial function with `functools.partial` ([Python]({{ python.docs }}/library/functools.html#functools.partial)) when used as a [decorator](#pandas_udf_decorator)). 23 | 24 | ### Decorator { #pandas_udf_decorator } 25 | 26 | `pandas_udf` can and usually is used as a Python decorator with two positional arguments for the return and function types. 27 | 28 | ```py 29 | @pandas_udf(returnType, functionType) 30 | ``` 31 | 32 | ### returnType { #pandas_udf_returnType } 33 | 34 | `returnType` can be one of the following: 35 | 36 | * `pyspark.sql.types.DataType` 37 | * A DDL-formatted type string 38 | 39 | ### functionType { #pandas_udf_functionType } 40 | 41 | `functionType` must be one the values from `PandasUDFType`: 42 | 43 | * [SQL_SCALAR_PANDAS_UDF](../../../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_UDF) 44 | * [SQL_SCALAR_PANDAS_ITER_UDF](../../../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_ITER_UDF) 45 | * [SQL_GROUPED_MAP_PANDAS_UDF](../../../sql/PythonEvalType.md#SQL_GROUPED_MAP_PANDAS_UDF) 46 | * [SQL_GROUPED_AGG_PANDAS_UDF](../../../sql/PythonEvalType.md#SQL_GROUPED_AGG_PANDAS_UDF) 47 | * [SQL_MAP_PANDAS_ITER_UDF](../../../sql/PythonEvalType.md#SQL_MAP_PANDAS_ITER_UDF) 48 | * [SQL_COGROUPED_MAP_PANDAS_UDF](../../../sql/PythonEvalType.md#SQL_COGROUPED_MAP_PANDAS_UDF) 49 | 50 | ### _create_pandas_udf { #_create_pandas_udf } 51 | 52 | ```py 53 | _create_pandas_udf( 54 | f, 55 | returnType, 56 | evalType) 57 | ``` 58 | 59 | `_create_pandas_udf`...FIXME 60 | -------------------------------------------------------------------------------- /docs/pyspark/sql/pandas/index.md: -------------------------------------------------------------------------------- 1 | # pyspark.sql.pandas Package 2 | 3 | `pyspark.sql.pandas` package is...FIXME 4 | -------------------------------------------------------------------------------- /docs/pyspark/sql/session.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: session.py 3 | --- 4 | 5 | # pyspark.sql.session Module 6 | 7 | `session` module (in `pyspark.sql` package) defines [SparkSession](SparkSession.md). 8 | 9 | ```py 10 | from pyspark.sql.session import * 11 | ``` 12 | 13 | ## \_\_all__ 14 | 15 | ??? 
note "import *" 16 | The `import` statement uses the following convention: if a package’s `__init__.py` code defines a list named `__all__`, it is taken to be the list of module names that should be imported when `from package import *` is encountered. 17 | 18 | Learn more in [6.4.1. Importing * From a Package]({{ python.docs }}/tutorial/modules.html#importing-from-a-package). 19 | 20 | * [SparkSession](SparkSession.md) 21 | -------------------------------------------------------------------------------- /docs/pyspark/sql/udf.md: -------------------------------------------------------------------------------- 1 | # udf.py 2 | 3 | `udf` module (in `pyspark.sql` package) defines [UDFRegistration](../../sql/UDFRegistration.md). 4 | 5 | ```py 6 | from pyspark.sql.udf import * 7 | ``` 8 | 9 | ## \_\_all__ 10 | 11 | ??? note "import *" 12 | The `import` statement uses the following convention: if a package’s `__init__.py` code defines a list named `__all__`, it is taken to be the list of module names that should be imported when `from package import *` is encountered. 13 | 14 | Learn more in [6.4.1. Importing * From a Package]({{ python.docs }}/tutorial/modules.html#importing-from-a-package). 15 | 16 | * [UDFRegistration](../../sql/UDFRegistration.md) 17 | 18 | ## _create_udf { #_create_udf } 19 | 20 | ```py 21 | _create_udf( 22 | f: Callable[..., Any], 23 | returnType: "DataTypeOrString", 24 | evalType: int, 25 | name: Optional[str] = None, 26 | deterministic: bool = True) -> "UserDefinedFunctionLike" 27 | ``` 28 | 29 | `_create_udf` creates a [UserDefinedFunction](UserDefinedFunction.md) (with the name of the object to be the name of function `f`). 30 | 31 | --- 32 | 33 | `_create_udf` is used when: 34 | 35 | * `UDFRegistration` is requested to [register](../../sql/UDFRegistration.md#register) 36 | * [udf](functions.md#udf) is used (and [_create_py_udf](#_create_py_udf) is executed) 37 | * [pandas_udf](pandas/functions.md#pandas_udf) (from `pyspark.sql.pandas`) is executed 38 | 39 | ## _create_py_udf { #_create_py_udf } 40 | 41 | ```py 42 | _create_py_udf( 43 | f: Callable[..., Any], 44 | returnType: "DataTypeOrString", 45 | evalType: int, 46 | ) -> "UserDefinedFunctionLike" 47 | ``` 48 | 49 | `_create_py_udf`...FIXME 50 | 51 | --- 52 | 53 | `_create_py_udf` is used when: 54 | 55 | * [udf](functions.md#udf) is executed 56 | 57 | ## Creating SimplePythonFunction for (Pickled) Python Function { #_wrap_function } 58 | 59 | ```py 60 | _wrap_function( 61 | sc: SparkContext, 62 | func: Callable[..., Any], 63 | returnType: "DataTypeOrString") -> JavaObject 64 | ``` 65 | 66 | `_wrap_function` creates a `command` tuple with the given `func` and `returnType`. 
67 | 68 | `_wrap_function` [_prepare_for_python_RDD](../rdd.md#_prepare_for_python_RDD) for the `command` tuple that builds the input for a [SimplePythonFunction](../../SimplePythonFunction.md): 69 | 70 | * `pickled_command` byte array 71 | * `env` 72 | * `includes` 73 | * `broadcast_vars` 74 | 75 | In the end, `_wrap_function` creates a [SimplePythonFunction](../../SimplePythonFunction.md) with the above and the following from the given [SparkContext](../../SparkContext.md): 76 | 77 | * [pythonExec](../../SparkContext.md#pythonExec) 78 | * [pythonVer](../../SparkContext.md#pythonVer) 79 | * [_javaAccumulator](../../SparkContext.md#_javaAccumulator) 80 | 81 | --- 82 | 83 | `_wrap_function` is used when: 84 | 85 | * `UserDefinedFunction` is requested to [_create_judf](UserDefinedFunction.md#_create_judf) 86 | -------------------------------------------------------------------------------- /docs/pyspark/worker.md: -------------------------------------------------------------------------------- 1 | # worker.py 2 | 3 | `worker.py` is a Python module in [pyspark](index.md) package. 4 | 5 | ```py 6 | from pyspark import worker 7 | ``` 8 | 9 | ## Entry Point 10 | 11 | ??? note "Top-Level Code Environment" 12 | If the module is executed in the top-level code environment (and not initialized from an import statement), its `__name__` is set to the string `__main__`. 13 | 14 | Sometimes "top-level code" is called an _entry point_ to the application. 15 | 16 | Learn more in the [\_\_main__ — Top-level code environment]({{ python.docs }}/library/__main__.html). 17 | 18 | When executed in the top-level code environment (e.g., `python3 -m`), `worker.py` reads the following environment variables: 19 | 20 | Environment Variable | Description 21 | ---------------------|------------ 22 | `PYTHON_WORKER_FACTORY_PORT` | Port the JVM listens to 23 | `PYTHON_WORKER_FACTORY_SECRET` | Authorization Secret 24 | 25 | `worker.py` [local_connect_and_auth](#local_connect_and_auth) (that gives a `sock_file`). 26 | 27 | `worker.py` [write_int](#write_int) with the PID of the Python process to the `sock_file`. 28 | 29 | In the end, `worker.py` [main](#main) (with the `sock_file` and `sock_file` for the input and output files). 30 | 31 | ## main { #main } 32 | 33 | ```py 34 | main( 35 | infile, 36 | outfile) 37 | ``` 38 | 39 | `main` reads `PYTHON_FAULTHANDLER_DIR` environment variable. 40 | 41 | `main` does a lot of initializations. 42 | 43 | ??? note "FIXME Review the initializations" 44 | 45 | `main` [read_udfs](#read_udfs) that gives the following: 46 | 47 | * `func` 48 | * `profiler` 49 | * `deserializer` 50 | * `serializer` 51 | 52 | requests the `deserializer` to `load_stream` from the given `infile` and executes `func` (with the `split_index` and the deserialized stream). 53 | 54 | `main` does a lot of post-processings. 55 | 56 | ??? 
note "FIXME Review the post-processings" 57 | 58 | ## read_udfs { #read_udfs } 59 | 60 | ```py 61 | read_udfs( 62 | pickleSer, 63 | infile, 64 | eval_type) 65 | ``` 66 | 67 | `read_udfs`...FIXME 68 | 69 | ### read_single_udf { #read_single_udf } 70 | 71 | ```py 72 | read_single_udf( 73 | pickleSer, 74 | infile, 75 | eval_type, 76 | runner_conf, 77 | udf_index) 78 | ``` 79 | 80 | `read_single_udf`...FIXME 81 | -------------------------------------------------------------------------------- /docs/python-api.md: -------------------------------------------------------------------------------- 1 | # Python API 2 | 3 | [TAGS] 4 | -------------------------------------------------------------------------------- /docs/pytorch-distributed/.pages: -------------------------------------------------------------------------------- 1 | title: Distributed Training using PyTorch 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/pytorch-distributed/TorchDistributor.md: -------------------------------------------------------------------------------- 1 | # TorchDistributor 2 | 3 | `TorchDistributor` is a [Distributor](../ml/Distributor.md) to run PyTorch's [torch.distributed.run]({{ pytorch.github }}/blob/main/torch/distributed/run.py) module on Apache Spark clusters. 4 | 5 | `TorchDistributor` is a PySpark translation of [torchrun]({{ pytorch.docs }}/elastic/run.html) (from [Torch Distributed Elastic]({{ pytorch.docs }}/distributed.elastic.html)). 6 | 7 | ## Demo 8 | 9 | ```py 10 | from pyspark.ml.torch.distributor import TorchDistributor 11 | 12 | distributor = TorchDistributor( 13 | num_processes=1, 14 | local_mode=False, 15 | use_gpu=False) 16 | ``` 17 | 18 | ```py 19 | # Use a path to a training script 20 | # and variable-length kwargs 21 | distributor.run( 22 | "train.py", 23 | "--learning-rate=1e-3", 24 | "--batch-size=64", 25 | "--my-key=my-value") 26 | 27 | # Started local training with 1 processes 28 | # NOTE: Redirects are currently not supported in Windows or MacOs. 29 | # Finished local training with 1 processes 30 | ``` 31 | 32 | ```py 33 | # Use a Callable (function) 34 | # The number of positional arguments is the number of kwargs 35 | def train(a, b, c): 36 | print(f"Got a={a}, b={b}, c={c}") 37 | return 'success' 38 | 39 | distributor.run( 40 | train, 41 | "--learning-rate=1e-3", 42 | "--batch-size=64", 43 | "--my-key=my-value") 44 | 45 | # Started distributed training with 1 executor proceses 46 | # NOTE: Redirects are currently not supported in Windows or MacOs. (0 + 1) / 1] 47 | # NOTE: Redirects are currently not supported in Windows or MacOs. 48 | # Got a=--learning-rate=1e-3, b=--batch-size=64, c=--my-key=my-value 49 | # Got a=--learning-rate=1e-3, b=--batch-size=64, c=--my-key=my-value 50 | # Finished distributed training with 1 executor proceses 51 | # 'success' 52 | ``` 53 | 54 | ## Running Distributed Training { #run } 55 | 56 | ```py 57 | run( 58 | self, 59 | train_object: Union[Callable, str], 60 | *args: Any) -> Optional[Any] 61 | ``` 62 | 63 | `run` determines what to run (e.g., a function or a script based on the given `train_object`). 64 | 65 | * With a function, `run` uses [_run_training_on_pytorch_function](#_run_training_on_pytorch_function) 66 | * With a script, `run` uses [_run_training_on_pytorch_file](#_run_training_on_pytorch_file) 67 | 68 | In the end, `run` runs a local or distributed training. 
69 | 70 | * In [local mode](../ml/Distributor.md#local_mode), `run` [runs local training](#_run_local_training) 71 | * In non-[local mode](../ml/Distributor.md#local_mode), `run` [runs distributed training](#_run_distributed_training) 72 | 73 | ### Local Training { #_run_local_training } 74 | 75 | ```py 76 | _run_local_training( 77 | self, 78 | framework_wrapper_fn: Callable, 79 | train_object: Union[Callable, str], 80 | *args: Any, 81 | ) -> Optional[Any] 82 | ``` 83 | 84 | `_run_local_training` looks up `CUDA_VISIBLE_DEVICES` among the environment variables. 85 | 86 | With [use_gpu](../ml/Distributor.md#use_gpu), `_run_local_training`...FIXME 87 | 88 | `_run_local_training` prints out the following INFO message to the logs: 89 | 90 | ```text 91 | Started local training with [num_processes] processes 92 | ``` 93 | 94 | `_run_local_training` executes the given `framework_wrapper_fn` function (with the [input_params](#input_params), the given `train_object` and the `args`). 95 | 96 | In the end, `_run_local_training` prints out the following INFO message to the logs: 97 | 98 | ```text 99 | Finished local training with [num_processes] processes 100 | ``` 101 | 102 | ### Distributed Training { #_run_distributed_training } 103 | 104 | ```py 105 | _run_distributed_training( 106 | self, 107 | framework_wrapper_fn: Callable, 108 | train_object: Union[Callable, str], 109 | *args: Any, 110 | ) -> Optional[Any] 111 | ``` 112 | 113 | `_run_distributed_training`...FIXME 114 | 115 | ### _run_training_on_pytorch_function { #_run_training_on_pytorch_function } 116 | 117 | ```py 118 | _run_training_on_pytorch_function( 119 | input_params: Dict[str, Any], 120 | train_fn: Callable, 121 | *args: Any 122 | ) -> Any 123 | ``` 124 | 125 | `_run_training_on_pytorch_function` [prepares train and output files](#_setup_files). 126 | 127 | `_run_training_on_pytorch_function`...FIXME 128 | 129 | ### Setting Up Files { #_setup_files } 130 | 131 | ```py 132 | # @contextmanager 133 | _setup_files( 134 | train_fn: Callable, 135 | *args: Any 136 | ) -> Generator[Tuple[str, str], None, None] 137 | ``` 138 | 139 | `_setup_files` gives the paths of a TorchRun train file and `output.pickle` output file. 140 | 141 | --- 142 | 143 | `_setup_files` [creates a save directory](#_create_save_dir). 144 | 145 | `_setup_files` [saves train_fn function](#_save_pickled_function) to the save directory (that gives a `pickle_file_path`). 146 | 147 | `_setup_files` uses the save directory and `output.pickle` name for the output file path. 148 | 149 | `_setup_files` [creates a torchrun_train_file](#_create_torchrun_train_file) with the following: 150 | 151 | * [Save directory](#_create_save_dir) 152 | * `pickle_file_path` 153 | * `output.pickle` output file path 154 | 155 | In the end, `_setup_files` yields (_gives_) the `torchrun_train_file` and the `output.pickle` output file path. 
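For intuition, here is a minimal sketch of the pattern `_setup_files` follows, not the actual PySpark implementation (the helper name `setup_files_sketch` and the file names are made up): pickle the function with its arguments, point at a torchrun-compatible train file and an `output.pickle` path in the same directory, and yield both so the caller can clean up afterwards.

```py
import os
import shutil
import tempfile
from contextlib import contextmanager
from typing import Any, Callable, Generator, Tuple

import cloudpickle


@contextmanager
def setup_files_sketch(
    train_fn: Callable, *args: Any
) -> Generator[Tuple[str, str], None, None]:
    save_dir = tempfile.mkdtemp()  # stands in for _create_save_dir
    pickle_file_path = os.path.join(save_dir, "fn.pickle")
    with open(pickle_file_path, "wb") as f:
        # what the generated train file later loads and executes
        cloudpickle.dump((train_fn, args), f)
    output_file_path = os.path.join(save_dir, "output.pickle")
    torchrun_train_file = os.path.join(save_dir, "train.py")  # see the next section
    try:
        yield torchrun_train_file, output_file_path
    finally:
        shutil.rmtree(save_dir, ignore_errors=True)
```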
156 | 157 | ### Creating TorchRun Train File { #_create_torchrun_train_file } 158 | 159 | ```py 160 | _create_torchrun_train_file( 161 | save_dir_path: str, 162 | pickle_file_path: str, 163 | output_file_path: str 164 | ) -> str 165 | ``` 166 | 167 | `_create_torchrun_train_file` creates `train.py` in the given `save_dir_path` with the following content (based on the given `pickle_file_path` and the `output_file_path`): 168 | 169 | ```py 170 | import cloudpickle 171 | import os 172 | 173 | if __name__ == "__main__": 174 | with open("[pickle_file_path]", "rb") as f: 175 | train_fn, args = cloudpickle.load(f) 176 | output = train_fn(*args) 177 | with open("[output_file_path]", "wb") as f: 178 | cloudpickle.dump(output, f) 179 | ``` 180 | 181 | ## _run_training_on_pytorch_file { #_run_training_on_pytorch_file } 182 | 183 | ```py 184 | _run_training_on_pytorch_file( 185 | input_params: Dict[str, Any], 186 | train_path: str, 187 | *args: Any 188 | ) -> None 189 | ``` 190 | 191 | `_run_training_on_pytorch_file` looks up the `log_streaming_client` in the given `input_params` (or assumes `None`). 192 | 193 | !!! note "FIXME What's log_streaming_client?" 194 | 195 | `_run_training_on_pytorch_file` [creates torchrun command](#_create_torchrun_command). 196 | 197 | `_run_training_on_pytorch_file` [executes the command](#_execute_command). 198 | 199 | ### _create_torchrun_command { #_create_torchrun_command } 200 | 201 | ```py 202 | _create_torchrun_command( 203 | input_params: Dict[str, Any], 204 | path_to_train_file: str, 205 | *args: Any 206 | ) -> List[str] 207 | ``` 208 | 209 | `_create_torchrun_command` takes the value of the following parameters (from the given `input_params`): 210 | 211 | * `local_mode` 212 | * `num_processes` 213 | 214 | `_create_torchrun_command` determines the `torchrun_args` and `processes_per_node` based on `local_mode`. 215 | 216 | local_mode | torchrun_args | processes_per_node 217 | -------------|-----------------|--------------------- 218 | `True` |
`--standalone`<br>`--nnodes=1` | `num_processes` (from the given `input_params`)
219 | `False` | `--nnodes=[num_processes]`<br>`--node_rank=[node_rank]`<br>`--rdzv_endpoint=[MASTER_ADDR]:[MASTER_PORT]`<br>`--rdzv_id=0`
| 1 220 | 221 | In the end, `_create_torchrun_command` returns a Python command to execute [torch_run_process_wrapper](torch_run_process_wrapper.md) module (`python -m`) with the following positional arguments: 222 | 223 | * `torchrun_args` 224 | * `--nproc_per_node=[processes_per_node]` 225 | * The given `path_to_train_file` 226 | * The given `args` 227 | -------------------------------------------------------------------------------- /docs/pytorch-distributed/index.md: -------------------------------------------------------------------------------- 1 | # Distributed Training using PyTorch 2 | 3 | PySpark 3.4.0 introduces [TorchDistributor](TorchDistributor.md) for distributed training on Apache Spark clusters using [PyTorch Distributed]({{ pytorch.tutorials }}/beginner/dist_overview.html). 4 | 5 | ## Learn More 6 | 7 | 1. [Distributed training with TorchDistributor](https://docs.databricks.com/machine-learning/train-model/distributed-training/spark-pytorch-distributor.html) 8 | -------------------------------------------------------------------------------- /docs/pytorch-distributed/torch_run_process_wrapper.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: torch_run_process_wrapper 3 | --- 4 | 5 | # torch_run_process_wrapper Module 6 | 7 | `torch_run_process_wrapper` is used as the [torchrun command](TorchDistributor.md#_create_torchrun_command) in [TorchDistributor](TorchDistributor.md). 8 | 9 | `torch_run_process_wrapper` executes `torch.distributed.run` module (using `python -m`). `torch_run_process_wrapper` monitors the child process and prints out the output to the standard output. 10 | -------------------------------------------------------------------------------- /docs/runners/.pages: -------------------------------------------------------------------------------- 1 | title: Python Runners 2 | nav: 3 | - ... 4 | -------------------------------------------------------------------------------- /docs/runners/ArrowPythonRunner.md: -------------------------------------------------------------------------------- 1 | # ArrowPythonRunner 2 | 3 | `ArrowPythonRunner` is a [BasePythonRunner](BasePythonRunner.md) with `Iterator[InternalRow]` input and `ColumnarBatch` (vectorized) output. 4 | 5 | `ArrowPythonRunner` supports `BasicPythonArrowInput` and [BasicPythonArrowOutput](BasicPythonArrowOutput.md). 6 | 7 | ## Creating Instance 8 | 9 | `ArrowPythonRunner` takes the following to be created: 10 | 11 | * `ChainedPythonFunctions`es 12 | * Eval Type 13 | * Argument Offsets 14 | * `Schema` ([Spark SQL]({{ book.spark_sql }}/types/StructType)) 15 | * TimeZone ID 16 | * Worker Configuration 17 | * Performance Metrics 18 | 19 | `ArrowPythonRunner` is created when the following physical operators ([Spark SQL]({{ book.spark_sql }}/physical-operators/)) are executed: 20 | 21 | * [AggregateInPandasExec](../sql/AggregateInPandasExec.md) 22 | * [ArrowEvalPythonExec](../sql/ArrowEvalPythonExec.md) 23 | * `FlatMapGroupsInPandasExec` 24 | * `MapInPandasExec` 25 | * `WindowInPandasExec` 26 | 27 | ## bufferSize { #bufferSize } 28 | 29 | ??? note "BasePythonRunner" 30 | 31 | ```scala 32 | bufferSize: Int 33 | ``` 34 | 35 | `bufferSize` is part of the [BasePythonRunner](BasePythonRunner.md#bufferSize) abstraction. 36 | 37 | `bufferSize` is the value of [spark.sql.execution.pandas.udf.buffer.size](../configuration-properties/index.md#spark.sql.execution.pandas.udf.buffer.size) configuration property. 
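Being a SQL configuration property, it can be tuned on the active session before a pandas UDF query runs. A hedged example (`spark` is an assumed active `SparkSession`; the value is illustrative only, in bytes):

```py
# Buffer size (bytes) for the data stream between the JVM and the Python worker
# used by pandas UDFs; when unset, it falls back to spark.buffer.size.
spark.conf.set("spark.sql.execution.pandas.udf.buffer.size", 2 * 1024 * 1024)
```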
38 | 39 | ## simplifiedTraceback { #simplifiedTraceback } 40 | 41 | ??? note "BasePythonRunner" 42 | 43 | ```scala 44 | simplifiedTraceback: Boolean 45 | ``` 46 | 47 | `simplifiedTraceback` is part of the [BasePythonRunner](BasePythonRunner.md#simplifiedTraceback) abstraction. 48 | 49 | `simplifiedTraceback` is the value of [spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled](../configuration-properties/index.md#spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled) configuration property. 50 | -------------------------------------------------------------------------------- /docs/runners/BasePythonRunner.md: -------------------------------------------------------------------------------- 1 | # BasePythonRunner 2 | 3 | `BasePythonRunner` is an [abstraction](#contract) of [Python Runners](#implementations). 4 | 5 | `BasePythonRunner` is executed as part of Spark tasks (that run on executors). 6 | 7 | ??? note "Scala Definition" 8 | 9 | `BasePythonRunner` is a type constructor in Scala (_generic class_ in Java) with the following definition: 10 | 11 | ```scala 12 | abstract class BasePythonRunner[IN, OUT](...) { 13 | // ... 14 | } 15 | ``` 16 | 17 | `BasePythonRunner` uses `IN` and `OUT` as the name of the types for the input and output values. 18 | 19 | ## Contract 20 | 21 | ### newReaderIterator { #newReaderIterator } 22 | 23 | ```scala 24 | newReaderIterator( 25 | stream: DataInputStream, 26 | writerThread: WriterThread, 27 | startTime: Long, 28 | env: SparkEnv, 29 | worker: Socket, 30 | pid: Option[Int], 31 | releasedOrClosed: AtomicBoolean, 32 | context: TaskContext): Iterator[OUT] 33 | ``` 34 | 35 | See: 36 | 37 | * [PythonRunner](PythonRunner.md#newReaderIterator) 38 | * [PythonUDFRunner](PythonUDFRunner.md#newReaderIterator) 39 | 40 | Used when: 41 | 42 | * `BasePythonRunner` is requested to [compute](#compute) 43 | 44 | ### newWriterThread { #newWriterThread } 45 | 46 | ```scala 47 | newWriterThread( 48 | env: SparkEnv, 49 | worker: Socket, 50 | inputIterator: Iterator[IN], 51 | partitionIndex: Int, 52 | context: TaskContext): WriterThread 53 | ``` 54 | 55 | See: 56 | 57 | * [PythonRunner](PythonRunner.md#newWriterThread) 58 | * [PythonUDFRunner](PythonUDFRunner.md#newWriterThread) 59 | 60 | Used when: 61 | 62 | * `BasePythonRunner` is requested to [compute](#compute) 63 | 64 | ## Implementations 65 | 66 | * `ApplyInPandasWithStatePythonRunner` 67 | * [ArrowPythonRunner](ArrowPythonRunner.md) 68 | * `CoGroupedArrowPythonRunner` 69 | * [PythonRunner](PythonRunner.md) 70 | * [PythonUDFRunner](PythonUDFRunner.md) 71 | 72 | ## Creating Instance 73 | 74 | `BasePythonRunner` takes the following to be created: 75 | 76 | * `ChainedPythonFunctions` 77 | * Eval Type 78 | * Argument Offsets 79 | 80 | `BasePythonRunner` requires that the number of [ChainedPythonFunctions](#funcs) and [Argument Offsets](#argOffsets) are the same. 81 | 82 | !!! note "Abstract Class" 83 | `BasePythonRunner` is an abstract class and cannot be created directly. It is created indirectly for the [concrete BasePythonRunners](#implementations). 84 | 85 | ### accumulator { #accumulator } 86 | 87 | ```scala 88 | accumulator: PythonAccumulatorV2 89 | ``` 90 | 91 | `BasePythonRunner` initializes a registry of a [PythonAccumulatorV2](../PythonAccumulatorV2.md) when [created](#creating-instance) to be the [accumulator](../PythonFunction.md#accumulator) of the head [PythonFunction](../PythonFunction.md) among the given [ChainedPythonFunctions](#funcs). 
92 | 93 | The `PythonAccumulatorV2` is used when `ReaderIterator` is requested to [handleEndOfDataSection](ReaderIterator.md#handleEndOfDataSection) (to update metrics). 94 | 95 | ## Computing Result { #compute } 96 | 97 | ```scala 98 | compute( 99 | inputIterator: Iterator[IN], 100 | partitionIndex: Int, 101 | context: TaskContext): Iterator[OUT] 102 | ``` 103 | 104 | !!! note "Runs on Executors" 105 | `compute` runs on Spark executors. 106 | 107 | `compute` uses the given `TaskContext` to look up the following local properties (if they were specified via `ResourceProfile`): 108 | 109 | * `resource.executor.cores` 110 | * `resource.pyspark.memory` 111 | 112 | `compute` requests the `DiskBlockManager` for the local directories and creates a comma-separated list of them (`localdir`). 113 | 114 | Unless `spark.executorEnv.OMP_NUM_THREADS` is explicitly specified (in the [SparkConf](#conf)), `compute` sets `OMP_NUM_THREADS` (in the [envVars](#envVars)) to be the value of`resource.executor.cores` (if defined). 115 | 116 | `compute` sets the following in the [envVars](#envVars): 117 | 118 | * `SPARK_LOCAL_DIRS` as the local directories of the local `DiskBlockManager` (`localdir`) 119 | 120 | `compute` can optionally define environment variables: 121 | 122 | * `SPARK_REUSE_WORKER` as `1` when `spark.python.worker.reuse` configuration property is enabled 123 | * `SPARK_SIMPLIFIED_TRACEBACK` as `1` when [simplifiedTraceback](#simplifiedTraceback) is enabled 124 | * _others_ 125 | 126 | `compute` requests `SparkEnv` to [createPythonWorker](../SparkEnv.md#createPythonWorker) (for the [pythonExec](#pythonExec) and the [envVars](#envVars)). 127 | 128 | `compute` [creates a new WriterThread](#newWriterThread) (to feed the worker process input from the given `inputIterator`) and starts it. 129 | 130 | `compute` creates and starts a `WriterMonitorThread`. 131 | 132 | `compute` creates a `MonitorThread`. 133 | 134 | `compute` creates a buffered `DataInputStream` to read from the worker (socket) output. `compute` uses the [bufferSize](#bufferSize). 135 | 136 | In the end, `compute` [creates a new ReaderIterator](#newReaderIterator) to read lines from the Python worker's stdout (from the buffered `DataInputStream`). 137 | 138 | --- 139 | 140 | `compute` is used when: 141 | 142 | * `PythonRDD` is requested to [compute a partition](../PythonRDD.md#compute) 143 | * [AggregateInPandasExec](../sql/AggregateInPandasExec.md), [ArrowEvalPythonExec](../sql/ArrowEvalPythonExec.md), `BatchEvalPythonExec`, `FlatMapCoGroupsInPandasExec`, `FlatMapGroupsInPandasExec` `MapInPandasExec`, `WindowInPandasExec` physical operators are executed 144 | * `PandasGroupUtils` is requested to `executePython` 145 | * `PythonForeachWriter` is requested for the [outputIterator](../PythonForeachWriter.md#outputIterator) 146 | -------------------------------------------------------------------------------- /docs/runners/BasicPythonArrowOutput.md: -------------------------------------------------------------------------------- 1 | # BasicPythonArrowOutput 2 | 3 | `BasicPythonArrowOutput` is a marker extension of the [PythonArrowOutput](PythonArrowOutput.md) abstraction for [vectorized outputs](#implementations) of [BasePythonRunner](BasePythonRunner.md)s that produce `ColumnarBatch`es ([Spark SQL]({{ book.spark_sql }}/vectorized-query-execution/ColumnarBatch)). 
4 | 5 | ## Implementations 6 | 7 | * [ArrowPythonRunner](ArrowPythonRunner.md) 8 | * `CoGroupedArrowPythonRunner` 9 | 10 | ## Deserializing ColumnarBatch { #deserializeColumnarBatch } 11 | 12 | ??? note "PythonArrowOutput" 13 | 14 | ```scala 15 | deserializeColumnarBatch( 16 | batch: ColumnarBatch, 17 | schema: StructType): ColumnarBatch 18 | ``` 19 | 20 | `deserializeColumnarBatch` is part of the [PythonArrowOutput](PythonArrowOutput.md#deserializeColumnarBatch) abstraction. 21 | 22 | `deserializeColumnarBatch` returns the given `ColumnarBatch` unchanged. 23 | -------------------------------------------------------------------------------- /docs/runners/PythonArrowOutput.md: -------------------------------------------------------------------------------- 1 | # PythonArrowOutput 2 | 3 | `PythonArrowOutput` is an [extension](#contract) of the [BasePythonRunner](BasePythonRunner.md) abstraction for [vectorized (ColumnarBatch) runners](#implementations). 4 | 5 | ??? note "Scala Definition" 6 | 7 | ```scala 8 | trait PythonArrowOutput[OUT <: AnyRef] { 9 | self: BasePythonRunner[_, OUT] => 10 | // ... 11 | } 12 | ``` 13 | 14 | ## Contract 15 | 16 | ### Deserializing ColumnarBatch { #deserializeColumnarBatch } 17 | 18 | ```scala 19 | deserializeColumnarBatch( 20 | batch: ColumnarBatch, 21 | schema: StructType): OUT 22 | ``` 23 | 24 | See: 25 | 26 | * [BasicPythonArrowOutput](BasicPythonArrowOutput.md#deserializeColumnarBatch) 27 | 28 | Used when: 29 | 30 | * `PythonArrowOutput` is requested to [newReaderIterator](#newReaderIterator) (after a batch is loaded) 31 | 32 | ### Performance Metrics { #pythonMetrics } 33 | 34 | ```scala 35 | pythonMetrics: Map[String, SQLMetric] 36 | ``` 37 | 38 | `SQLMetric`s ([Spark SQL]({{ book.spark_sql }}/SQLMetric)): 39 | 40 | * `pythonNumRowsReceived` 41 | * `pythonDataReceived` 42 | 43 | Used when: 44 | 45 | * `PythonArrowOutput` is requested to [newReaderIterator](#newReaderIterator) (after a batch is loaded) 46 | 47 | ## Implementations 48 | 49 | * `ApplyInPandasWithStatePythonRunner` 50 | * [BasicPythonArrowOutput](BasicPythonArrowOutput.md) 51 | -------------------------------------------------------------------------------- /docs/runners/PythonRunner.md: -------------------------------------------------------------------------------- 1 | # PythonRunner 2 | 3 | `PythonRunner` is a concrete [BasePythonRunner](BasePythonRunner.md). 4 | 5 | ## Creating Instance 6 | 7 | `PythonRunner` takes the following to be created: 8 | 9 | * `ChainedPythonFunctions`es 10 | 11 | `PythonRunner` is created (indirectly using [apply](#apply) factory method) when: 12 | 13 | * `PythonRDD` is requested to [compute a partition](../PythonRDD.md#compute) 14 | * `PythonForeachWriter` is requested for a [PythonRunner](../PythonForeachWriter.md#pythonRunner) 15 | 16 | ## Creating PythonRunner 17 | 18 | ```scala 19 | apply( 20 | func: PythonFunction): PythonRunner 21 | ``` 22 | 23 | `apply` simply creates a [PythonRunner](PythonRunner.md) for the [PythonFunction](../PythonFunction.md). 
24 | 25 | --- 26 | 27 | `apply` is used when: 28 | 29 | * `PythonRDD` is requested to [compute a partition](../PythonRDD.md#compute) 30 | * `PythonForeachWriter` is requested for a [PythonRunner](../PythonForeachWriter.md#pythonRunner) 31 | -------------------------------------------------------------------------------- /docs/runners/PythonUDFRunner.md: -------------------------------------------------------------------------------- 1 | # PythonUDFRunner 2 | 3 | `PythonUDFRunner` is...FIXME 4 | -------------------------------------------------------------------------------- /docs/runners/ReaderIterator.md: -------------------------------------------------------------------------------- 1 | # ReaderIterator 2 | 3 | `ReaderIterator` is an [extension](#contract) of the `Iterator` ([Scala]({{ scala.api }}/scala/collection/Iterator.html)) abstraction for [iterators](#implementations) to [read](#read) `OUT` values. 4 | 5 | ```scala 6 | abstract class ReaderIterator(...) 7 | extends Iterator[OUT] 8 | ``` 9 | 10 | ## Contract 11 | 12 | ### Reading Value { #read } 13 | 14 | ```scala 15 | read(): OUT 16 | ``` 17 | 18 | See: 19 | 20 | * [PythonArrowOutput](PythonArrowOutput.md#newReaderIterator) 21 | * [PythonRunner](PythonRunner.md#newReaderIterator) 22 | * [PythonUDFRunner](PythonUDFRunner.md#newReaderIterator) 23 | 24 | Used when: 25 | 26 | * `ReaderIterator` is requested to [hasNext](#hasNext) 27 | 28 | ## Implementations 29 | 30 | * [PythonArrowOutput](PythonArrowOutput.md#newReaderIterator) 31 | * [PythonRunner](PythonRunner.md#newReaderIterator) 32 | * [PythonUDFRunner](PythonUDFRunner.md#newReaderIterator) 33 | 34 | ## handleEndOfDataSection { #handleEndOfDataSection } 35 | 36 | ```scala 37 | handleEndOfDataSection(): Unit 38 | ``` 39 | 40 | `handleEndOfDataSection`...FIXME 41 | 42 | --- 43 | 44 | `handleEndOfDataSection` is used when: 45 | 46 | * `PythonRunner` is requested to [newReaderIterator](PythonRunner.md#newReaderIterator) 47 | * `PythonArrowOutput` is requested to [newReaderIterator](PythonArrowOutput.md#newReaderIterator) 48 | * `PythonUDFRunner` is requested to [newReaderIterator](PythonUDFRunner.md#newReaderIterator) 49 | -------------------------------------------------------------------------------- /docs/scala-api.md: -------------------------------------------------------------------------------- 1 | # Scala API 2 | 3 | [TAGS] 4 | -------------------------------------------------------------------------------- /docs/sql/.pages: -------------------------------------------------------------------------------- 1 | title: SQL 2 | nav: 3 | - index.md 4 | - Physical Operators: 5 | - ... | *Exec.md 6 | - PythonSQLMetrics.md 7 | - ... 8 | -------------------------------------------------------------------------------- /docs/sql/AggregateInPandasExec.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: AggregateInPandasExec 3 | --- 4 | 5 | # AggregateInPandasExec Physical Operator 6 | 7 | `AggregateInPandasExec` is a unary physical operator ([Spark SQL]({{ book.spark_sql }}/physical-operators/UnaryExecNode)) that executes [pandas UDAFs](#udfExpressions) using [ArrowPythonRunner](../runners/ArrowPythonRunner.md) (one per partition). 
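A quick way to see the operator is to aggregate with a grouped-aggregate pandas UDF and inspect the physical plan. A hedged example (`spark` is an assumed active `SparkSession`; the data and column names are made up); `explain()` should show an `AggregateInPandas` node:

```py
import pandas as pd
from pyspark.sql.functions import pandas_udf

# Grouped-aggregate pandas UDF (SQL_GROUPED_AGG_PANDAS_UDF eval type)
@pandas_udf("double")
def mean_udf(v: pd.Series) -> float:
    return v.mean()

df = spark.createDataFrame(
    [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v"))
df.groupBy("id").agg(mean_udf("v").alias("mean_v")).explain()
```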
8 | 9 | ## Creating Instance 10 | 11 | `AggregateInPandasExec` takes the following to be created: 12 | 13 | * Grouping Expressions ([Spark SQL]({{ book.spark_sql }}/expressions/Expression)) (`Seq[NamedExpression]`) 14 | * pandas UDAFs ([PythonUDF](PythonUDF.md)s with [SQL_GROUPED_AGG_PANDAS_UDF](PythonEvalType.md#SQL_GROUPED_AGG_PANDAS_UDF)) 15 | * Result Named Expressions ([Spark SQL]({{ book.spark_sql }}/expressions/NamedExpression)) (`Seq[NamedExpression]`) 16 | * Child Physical Operator ([Spark SQL]({{ book.spark_sql }}/physical-operators/SparkPlan)) 17 | 18 | `AggregateInPandasExec` is created when `Aggregation` execution planning strategy ([Spark SQL]({{ book.spark_sql }}/execution-planning-strategies/Aggregation)) is executed for `Aggregate` logical operators ([Spark SQL]({{ book.spark_sql }}/logical-operators/Aggregate)) with [PythonUDF](PythonUDF.md) aggregate expressions only. 19 | 20 | ## Executing Operator { #doExecute } 21 | 22 | ??? note "SparkPlan" 23 | 24 | ```scala 25 | doExecute(): RDD[InternalRow] 26 | ``` 27 | 28 | `doExecute` is part of the `SparkPlan` ([Spark SQL]({{ book.spark_sql }}/physical-operators/SparkPlan#doExecute)) abstraction. 29 | 30 | `doExecute` uses [ArrowPythonRunner](../runners/ArrowPythonRunner.md) (one per partition) to execute [PythonUDFs](#udfExpressions). 31 | -------------------------------------------------------------------------------- /docs/sql/ArrowEvalPython.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: ArrowEvalPython 3 | --- 4 | 5 | # ArrowEvalPython Logical Operator 6 | 7 | `ArrowEvalPython` is a [BaseEvalPython](BaseEvalPython.md) unary logical operator that evaluates [scalar PythonUDF](PythonUDF.md#isScalarPythonUDF)s with [Apache Arrow]({{ arrow.home }}). 8 | 9 | `ArrowEvalPython` is planned as [ArrowEvalPythonExec](ArrowEvalPythonExec.md) physical operator. 10 | 11 | ## Creating Instance 12 | 13 | `ArrowEvalPython` takes the following to be created: 14 | 15 | * [Scalar PythonUDF](PythonUDF.md#isScalarPythonUDF)s 16 | * Result `Attribute`s ([Spark SQL]({{ book.spark_sql }}/expressions/Attribute)) 17 | * Child `LogicalPlan` ([Spark SQL]({{ book.spark_sql }}/logical-operators/LogicalPlan)) 18 | * [Eval Type](#evalType) 19 | 20 | `ArrowEvalPython` is created when: 21 | 22 | * `ExtractPythonUDFs` logical optimization is executed (and requested to extract [scalar PythonUDF](PythonUDF.md#isScalarPythonUDF)s from a logical query plan) 23 | 24 | ### evalType { #evalType } 25 | 26 | ```scala 27 | evalType: Int 28 | ``` 29 | 30 | `ArrowEvalPython` is given an `evalType` when [created](#creating-instance) that can only be one of the following: 31 | 32 | * [SQL_SCALAR_PANDAS_UDF](../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_UDF) 33 | * [SQL_SCALAR_PANDAS_ITER_UDF](../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_ITER_UDF) 34 | -------------------------------------------------------------------------------- /docs/sql/ArrowEvalPythonExec.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: ArrowEvalPythonExec 3 | --- 4 | 5 | # ArrowEvalPythonExec Physical Operator 6 | 7 | `ArrowEvalPythonExec` is an [EvalPythonExec](EvalPythonExec.md) physical operator to [evaluate scalar PythonUDFs](#evaluate) using [ArrowPythonRunner](../runners/ArrowPythonRunner.md). 8 | 9 | `ArrowEvalPythonExec` represents [ArrowEvalPython](ArrowEvalPython.md) logical operator at execution time. 
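A scalar pandas UDF is enough to get this operator into a query plan. A hedged example (`spark` is an assumed active `SparkSession`); `explain()` should show an `ArrowEvalPython` node:

```py
import pandas as pd
from pyspark.sql.functions import pandas_udf

# Scalar pandas UDF (SQL_SCALAR_PANDAS_UDF eval type)
@pandas_udf("string")
def to_upper(s: pd.Series) -> pd.Series:
    return s.str.upper()

df = spark.createDataFrame([("john",), ("jane",)], ("name",))
df.select(to_upper("name").alias("name_upper")).explain()
```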
10 | 11 | ## Creating Instance 12 | 13 | `ArrowEvalPythonExec` takes the following to be created: 14 | 15 | * [Scalar PythonUDF](PythonUDF.md#isScalarPythonUDF)s 16 | * Result `Attribute`s ([Spark SQL]({{ book.spark_sql }}/expressions/Attribute)) 17 | * Child `SparkPlan` ([Spark SQL]({{ book.spark_sql }}/physical-operators/SparkPlan)) 18 | * Eval Type 19 | 20 | `ArrowEvalPythonExec` is created when: 21 | 22 | * `PythonEvals` physical execution strategy is executed (and plans [ArrowEvalPython](ArrowEvalPython.md) logical operators) 23 | 24 | ## Performance Metrics 25 | 26 | `ArrowEvalPythonExec` is a [PythonSQLMetrics](PythonSQLMetrics.md). 27 | 28 | ## Maximum Records per Batch { #batchSize } 29 | 30 | `batchSize` is the value of [spark.sql.execution.arrow.maxRecordsPerBatch](../configuration-properties/index.md#spark.sql.execution.arrow.maxRecordsPerBatch) configuration property. 31 | 32 | `batchSize` is used while [evaluating PythonUDFs](#evaluate). 33 | 34 | ## Evaluating PythonUDFs { #evaluate } 35 | 36 | ??? note "EvalPythonExec" 37 | 38 | ```scala 39 | evaluate( 40 | funcs: Seq[ChainedPythonFunctions], 41 | argOffsets: Array[Array[Int]], 42 | iter: Iterator[InternalRow], 43 | schema: StructType, 44 | context: TaskContext): Iterator[InternalRow] 45 | ``` 46 | 47 | `evaluate` is part of the [EvalPythonExec](EvalPythonExec.md#evaluate) abstraction. 48 | 49 | `evaluate` creates an [ArrowPythonRunner](../runners/ArrowPythonRunner.md) to [compute partitions](../runners/BasePythonRunner.md#compute). 50 | 51 | In the end, `evaluate` converts `ColumnarBatch`es into `InternalRow`s. 52 | -------------------------------------------------------------------------------- /docs/sql/BaseEvalPython.md: -------------------------------------------------------------------------------- 1 | # BaseEvalPython 2 | 3 | `BaseEvalPython` is...FIXME 4 | -------------------------------------------------------------------------------- /docs/sql/DataFrame.md: -------------------------------------------------------------------------------- 1 | # DataFrame 2 | 3 | `DataFrame` is a Python class with [PandasMapOpsMixin](PandasMapOpsMixin.md) and [PandasConversionMixin](PandasConversionMixin.md) mixins. 4 | 5 | `DataFrame` is defined in [pyspark.sql.dataframe](../pyspark/sql/dataframe.md) module. 6 | 7 | ```py 8 | from pyspark.sql.dataframe import DataFrame 9 | ``` 10 | 11 | ## Creating Instance 12 | 13 | `DataFrame` takes the following to be created: 14 | 15 | * jdf 16 | * [SQLContext](SQLContext.md) 17 | 18 | ## groupBy 19 | 20 | ```scala 21 | groupBy(self, *cols) 22 | ``` 23 | 24 | `groupBy` requests the [_jdf](#jdf) to `groupBy` and creates a [GroupedData](GroupedData.md) with it. 25 | 26 | ## observe { #observe } 27 | 28 | ```py 29 | observe( 30 | self, 31 | observation: Union["Observation", str], 32 | *exprs: Column, 33 | ) -> "DataFrame" 34 | ``` 35 | 36 | `observe` accepts an [Observation](Observation.md) or a name as the `observation`: 37 | 38 | * For an [Observation](Observation.md), `observe` requests it to [_on](Observation.md#_on) (with this `DataFrame` and the `exprs` columns). 39 | 40 | * For a name, `observe` creates a new `DataFrame` after requesting [_jdf](#_jdf) to `observe` (with the name). 41 | 42 | ### Demo { #observe-demo } 43 | 44 | !!! note "QueryExecutionListener" 45 | You should install `QueryExecutionListener` ([Spark SQL]({{ book.spark_sql }}/QueryExecutionListener)) to intercept `QueryExecution` on a successful query execution (to access `observedMetrics`). 
46 | 47 | ```py 48 | import pandas as pd 49 | 50 | pandas_df = pd.DataFrame({ 51 | 'name': ['jacek', 'agata', 'iweta', 'patryk', 'maksym'], 52 | 'age': [50, 49, 29, 26, 11] 53 | }) 54 | df = spark.createDataFrame(pandas_df) 55 | ``` 56 | 57 | ```py 58 | from pyspark.sql.functions import * 59 | row_count_metric = count(lit(1)).alias("count") 60 | observed_df = df.observe("observe_demo", row_count_metric) 61 | ``` 62 | 63 | ```py 64 | observed_df.count() 65 | ``` 66 | -------------------------------------------------------------------------------- /docs/sql/EvalPythonExec.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: EvalPythonExec 3 | --- 4 | 5 | # EvalPythonExec Unary Physical Operators 6 | 7 | `EvalPythonExec` is an [extension](#contract) of the `UnaryExecNode` ([Spark SQL]({{ book.spark_sql }}/physical-operators/UnaryExecNode)) abstraction for [unary physical operators](#implementations) that [evaluate PythonUDFs](#evaluate) (when [executed](#doExecute)). 8 | 9 | ## Contract 10 | 11 | ### Evaluating PythonUDFs { #evaluate } 12 | 13 | ```scala 14 | evaluate( 15 | funcs: Seq[ChainedPythonFunctions], 16 | argOffsets: Array[Array[Int]], 17 | iter: Iterator[InternalRow], 18 | schema: StructType, 19 | context: TaskContext): Iterator[InternalRow] 20 | ``` 21 | 22 | See: 23 | 24 | * [ArrowEvalPythonExec](ArrowEvalPythonExec.md#evaluate) 25 | 26 | Used when: 27 | 28 | * `EvalPythonExec` physical operator is requested to [doExecute](#doExecute) 29 | 30 | ### Result Attributes { #resultAttrs } 31 | 32 | ```scala 33 | resultAttrs: Seq[Attribute] 34 | ``` 35 | 36 | Result `Attribute`s ([Spark SQL]({{ book.spark_sql }}/expressions/Attribute)) 37 | 38 | See: 39 | 40 | * [ArrowEvalPythonExec](ArrowEvalPythonExec.md#resultAttrs) 41 | 42 | Used when: 43 | 44 | * `EvalPythonExec` physical operator is requested for the [output](#output) and [producedAttributes](#producedAttributes) 45 | 46 | ### Python UDFs { #udfs } 47 | 48 | ```scala 49 | udfs: Seq[PythonUDF] 50 | ``` 51 | 52 | [PythonUDF](PythonUDF.md)s to [evaluate](#evaluate) 53 | 54 | See: 55 | 56 | * [ArrowEvalPythonExec](ArrowEvalPythonExec.md#udfs) 57 | 58 | Used when: 59 | 60 | * `EvalPythonExec` physical operator is requested to [doExecute](#doExecute) 61 | 62 | ## Implementations 63 | 64 | * [ArrowEvalPythonExec](ArrowEvalPythonExec.md) 65 | * `BatchEvalPythonExec` 66 | 67 | ## Executing Physical Operator { #doExecute } 68 | 69 | ??? note "SparkPlan" 70 | 71 | ```scala 72 | doExecute(): RDD[InternalRow] 73 | ``` 74 | 75 | `doExecute` is part of the `SparkPlan` ([Spark SQL]({{ book.spark_sql }}/physical-operators/SparkPlan#doExecute)) abstraction. 76 | 77 | The gist of `doExecute` is to [evaluate Python UDFs](#evaluate) (for every `InternalRow`) with some pre- and post-processing. 78 | 79 | --- 80 | 81 | `doExecute` requests the child physical operator to `execute` (to produce an input `RDD[InternalRow]`). 82 | 83 | !!! note 84 | `EvalPythonExec`s are `UnaryExecNode`s ([Spark SQL]({{ book.spark_sql }}/physical-operators/UnaryExecNode)). 85 | 86 | `doExecute` uses `RDD.mapPartitions` operator to execute a function over partitions of `InternalRow`s. 87 | 88 | For every partition, `doExecute` creates a `MutableProjection` for the inputs (and the child's output) and requests it to `initialize`. 89 | 90 | `doExecute` [evaluates Python UDFs](#evaluate) (for every `InternalRow`). 
91 | 92 | In the end, `doExecute` creates an `UnsafeProjection` for the [output](#output) to "map over" the rows (from evaluating Python UDFs). 93 | -------------------------------------------------------------------------------- /docs/sql/FlatMapGroupsInPandas.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: FlatMapGroupsInPandas 3 | --- 4 | 5 | # FlatMapGroupsInPandas Logical Operator 6 | 7 | `FlatMapGroupsInPandas` is a unary logical operator ([Spark SQL]({{ book.spark_sql }}/logical-operators/LogicalPlan/#UnaryNode)). 8 | 9 | `FlatMapGroupsInPandas` is planned as a [FlatMapGroupsInPandasExec](FlatMapGroupsInPandasExec.md) physical operator. 10 | 11 | ## Creating Instance 12 | 13 | `FlatMapGroupsInPandas` takes the following to be created: 14 | 15 | * Grouping Attributes ([Spark SQL]({{ book.spark_sql }}/expressions/Attribute)) 16 | * Function Expression ([Spark SQL]({{ book.spark_sql }}/expressions/Expression)) 17 | * Output Attributes ([Spark SQL]({{ book.spark_sql }}/expressions/Attribute)) 18 | * Child Logical Operator ([Spark SQL]({{ book.spark_sql }}/logical-operators/LogicalPlan)) 19 | 20 | `FlatMapGroupsInPandas` is created when: 21 | 22 | * `RelationalGroupedDataset` is requested to [flatMapGroupsInPandas](RelationalGroupedDataset.md#flatMapGroupsInPandas) (with a [PythonUDF](PythonUDF.md)) 23 | -------------------------------------------------------------------------------- /docs/sql/FlatMapGroupsInPandasExec.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: FlatMapGroupsInPandasExec 3 | --- 4 | 5 | # FlatMapGroupsInPandasExec Physical Operator 6 | 7 | `FlatMapGroupsInPandasExec` is a unary physical operator ([Spark SQL]({{ book.spark_sql }}/physical-operators/UnaryExecNode)) to execute a [PythonUDF](#func) using [ArrowPythonRunner](../runners/ArrowPythonRunner.md) (in [SQL_GROUPED_MAP_PANDAS_UDF](PythonEvalType.md#SQL_GROUPED_MAP_PANDAS_UDF) eval mode). 8 | 9 | `FlatMapGroupsInPandasExec` represents a [FlatMapGroupsInPandas](FlatMapGroupsInPandas.md) logical operator at execution time. 10 | 11 | ## Creating Instance 12 | 13 | `FlatMapGroupsInPandasExec` takes the following to be created: 14 | 15 | * Grouping Attributes ([Spark SQL]({{ book.spark_sql }}/expressions/Attribute)) 16 | * Function Expression ([Spark SQL]({{ book.spark_sql }}/expressions/Expression)) 17 | * Output Attributes ([Spark SQL]({{ book.spark_sql }}/expressions/Attribute)) 18 | * Child Physical Operator ([Spark SQL]({{ book.spark_sql }}/physical-operators/SparkPlan)) 19 | 20 | `FlatMapGroupsInPandasExec` is created when: 21 | 22 | * `BasicOperators` ([Spark SQL]({{ book.spark_sql }}/execution-planning-strategies/BasicOperators/)) execution planning strategy is executed (on a logical query plan with [FlatMapGroupsInPandas](FlatMapGroupsInPandas.md) logical operators) 23 | 24 | ## Performance Metrics 25 | 26 | `FlatMapGroupsInPandasExec` is a [PythonSQLMetrics](PythonSQLMetrics.md). 27 | 28 | ## Executing Operator { #doExecute } 29 | 30 | ??? note "SparkPlan" 31 | 32 | ```scala 33 | doExecute(): RDD[InternalRow] 34 | ``` 35 | 36 | `doExecute` is part of the `SparkPlan` ([Spark SQL]({{ book.spark_sql }}/physical-operators/SparkPlan#doExecute)) abstraction. 37 | 38 | `doExecute` requests the [child physical operator](#child) to `execute` (and produce a `RDD[InternalRow]`). 
39 | 40 | For every non-empty partition (using `RDD.mapPartitionsInternal`), `doExecute` creates an [ArrowPythonRunner](../runners/ArrowPythonRunner.md) (with [SQL_GROUPED_MAP_PANDAS_UDF](PythonEvalType.md#SQL_GROUPED_MAP_PANDAS_UDF) eval type) and [executePython](PandasGroupUtils.md#executePython). 41 | -------------------------------------------------------------------------------- /docs/sql/GroupedData.md: -------------------------------------------------------------------------------- 1 | # GroupedData 2 | 3 | `GroupedData` is created for the following high-level operators: 4 | 5 | * [DataFrame.cube](DataFrame.md#cube) 6 | * [DataFrame.groupBy](DataFrame.md#groupBy) 7 | * [DataFrame.rollup](DataFrame.md#rollup) 8 | * [GroupedData.pivot](#pivot) 9 | 10 | `GroupedData` is then used to execute aggregate functions (over groups of rows) using [agg](#agg) operator: 11 | 12 | * Built-In Aggregation Functions 13 | * [pandas UDAFs](../pandas-udafs/index.md) 14 | 15 | `GroupedData` is a Python class with [PandasGroupedOpsMixin](PandasGroupedOpsMixin.md) mixin. 16 | 17 | `GroupedData` is defined in [pyspark.sql.group](../pyspark/sql/group.md) module. 18 | 19 | ```py 20 | from pyspark.sql.group import GroupedData 21 | ``` 22 | 23 | ## Creating Instance 24 | 25 | `GroupedData` takes the following to be created: 26 | 27 | * [RelationalGroupedDataset](RelationalGroupedDataset.md) 28 | * [DataFrame](DataFrame.md) 29 | 30 | ## agg 31 | 32 | ```py 33 | agg( 34 | self, 35 | *exprs: Union[Column, Dict[str, str]]) -> DataFrame 36 | ``` 37 | 38 | !!! note 39 | Built-in aggregation functions and [pandas UDAFs](../pandas-udafs/index.md) cannot be used together in a single `agg`. 40 | 41 | `agg` accepts a collection of `Column` expressions or a single `Dict[str, str]` object. 42 | 43 | `agg` requests the [RelationalGroupedDataset](#_jgd) to `agg` ([Spark SQL]({{ book.spark_sql }}/RelationalGroupedDataset/#agg)). 44 | 45 | In the end, `agg` creates a [DataFrame](DataFrame.md) with the `agg` result. 46 | -------------------------------------------------------------------------------- /docs/sql/Observation.md: -------------------------------------------------------------------------------- 1 | # Observation 2 | 3 | `Observation` is a Python class to observe (named) metrics on a [DataFrame](DataFrame.md). 4 | 5 | ```py 6 | from pyspark.sql.observation import Observation 7 | ``` 8 | 9 | ??? note "pyspark.sql" 10 | `Observation` is imported using `*` import from `pyspark.sql` as well as `pyspark.sql.observation` (as is included in `__all__` of the modules). 11 | 12 | ```py 13 | from pyspark.sql import * 14 | ``` 15 | 16 | ## Creating Instance 17 | 18 | `Observation` takes the following to be created: 19 | 20 | * Name (optional) 21 | 22 | ## _jo { #_jo } 23 | 24 | ```py 25 | _jo: Optional[JavaObject] 26 | ``` 27 | 28 | ## get { #get } 29 | 30 | ```py 31 | get( 32 | self) -> Dict[str, Any] 33 | ``` 34 | 35 | `get` requests the [_jo](#_jo) to `getAsJava` and converts the py4j `JavaMap` to a Python dict. 
36 | 37 | ## Demo 38 | 39 | ```py 40 | from pyspark.sql.observation import Observation 41 | 42 | observation = Observation("demo") 43 | ``` 44 | 45 | ```py 46 | import pandas as pd 47 | 48 | pandas_df = pd.DataFrame({ 49 | 'name': ['jacek', 'agata', 'iweta', 'patryk', 'maksym'], 50 | 'age': [50, 49, 29, 26, 11] 51 | }) 52 | df = spark.createDataFrame(pandas_df) 53 | ``` 54 | 55 | ```py 56 | from pyspark.sql.functions import * 57 | row_count_metric = count(lit(1)).alias("count") 58 | observed_df = df.observe(observation, row_count_metric) 59 | ``` 60 | 61 | ```py 62 | observed_df.count() 63 | ``` 64 | 65 | === "Python" 66 | 67 | ```py 68 | observation.get() 69 | ``` 70 | 71 | ```text 72 | {'count': 5} 73 | ``` 74 | -------------------------------------------------------------------------------- /docs/sql/PandasCogroupedOps.md: -------------------------------------------------------------------------------- 1 | # PandasCogroupedOps 2 | 3 | `PandasCogroupedOps` is a logical grouping created by [GroupedData.cogroup](GroupedData.md#cogroup) over two [GroupedData](GroupedData.md)s. 4 | 5 | ```py 6 | from pyspark.sql.pandas.group_ops import PandasCogroupedOps 7 | ``` 8 | 9 | `PandasCogroupedOps` is included in `__all__` of `pyspark.sql` module (via `__init__.py`). 10 | 11 | ## Creating Instance 12 | 13 | `PandasCogroupedOps` takes the following to be created: 14 | 15 | * [GroupedData](GroupedData.md) 16 | * [GroupedData](GroupedData.md) 17 | 18 | `PandasCogroupedOps` is created when: 19 | 20 | * `PandasGroupedOpsMixin` is requested to [cogroup](PandasGroupedOpsMixin.md#cogroup) 21 | 22 | ## applyInPandas { #applyInPandas } 23 | 24 | ```py 25 | applyInPandas( 26 | self, 27 | func: "PandasCogroupedMapFunction", # (1)! 28 | schema: Union[StructType, str] 29 | ) -> DataFrame 30 | ``` 31 | 32 | 1. 33 | ```py 34 | from pandas.core.frame import DataFrame as PandasDataFrame 35 | DataFrameLike = PandasDataFrame 36 | PandasCogroupedMapFunction = Union[ 37 | # func: (pandas.DataFrame, pandas.DataFrame) -> pandas.DataFrame 38 | Callable[[DataFrameLike, DataFrameLike], DataFrameLike], 39 | # func: (groupKey(s), pandas.DataFrame, pandas.DataFrame) -> pandas.DataFrame 40 | Callable[[Any, DataFrameLike, DataFrameLike], DataFrameLike], 41 | ] 42 | ``` 43 | 44 | `applyInPandas` creates a [DataFrame](DataFrame.md) with the result of [flatMapCoGroupsInPandas](RelationalGroupedDataset.md#flatMapCoGroupsInPandas) with a [pandas user defined function](../pyspark/sql/pandas/functions.md#pandas_udf) of `SQL_COGROUPED_MAP_PANDAS_UDF` type. 45 | 46 | --- 47 | 48 | `applyInPandas` [creates a pandas user defined function](../pyspark/sql/pandas/functions.md#pandas_udf) for the given `func` and the return type by the given `schema`. The pandas UDF is of `SQL_COGROUPED_MAP_PANDAS_UDF` type. 49 | 50 | `applyInPandas` applies the pandas UDF on all the columns of the two [GroupedData](#creating-instance)s (that creates a `Column` expression). 51 | 52 | `applyInPandas` requests the [GroupedData](#gd1) for the associated [RelationalGroupedDataset](GroupedData.md#jgd) that is in turn requested to [flatMapCoGroupsInPandas](RelationalGroupedDataset.md#flatMapCoGroupsInPandas). 
53 | 54 | ### Example { #applyInPandas-example } 55 | 56 | ```py 57 | df1 = spark.createDataFrame( 58 | data = [ 59 | (20000101, 1, 1.0), 60 | (20000101, 2, 2.0), 61 | (20000102, 1, 3.0), 62 | (20000102, 2, 4.0)], 63 | schema = ("time", "id", "v1")) 64 | df2 = spark.createDataFrame( 65 | data = [ 66 | (20000101, 1, "x"), 67 | (20000101, 2, "y")], 68 | schema = ("time", "id", "v2")) 69 | ``` 70 | 71 | ```py 72 | import pandas as pd 73 | def asof_join(k, l, r): 74 | if k == (1,): 75 | return pd.merge_asof(l, r, on="time", by="id") 76 | else: 77 | return pd.DataFrame(columns=['time', 'id', 'v1', 'v2']) 78 | ``` 79 | 80 | ```py 81 | gd1 = df1.groupby("id") 82 | gd2 = df2.groupby("id") 83 | ``` 84 | 85 | ```py 86 | gd1 87 | .cogroup(gd2) 88 | .applyInPandas( 89 | asof_join, 90 | "time int, id int, v1 double, v2 string") 91 | .show() 92 | ``` 93 | -------------------------------------------------------------------------------- /docs/sql/PandasConversionMixin.md: -------------------------------------------------------------------------------- 1 | # PandasConversionMixin 2 | 3 | `PandasConversionMixin` is a Python mixin of [DataFrame](DataFrame.md) to [convert to Pandas](#toPandas) ([pandas.DataFrame]({{ pandas.api }}/pandas.DataFrame.html)). 4 | 5 | ## toPandas { #toPandas } 6 | 7 | ```python 8 | toPandas(self) 9 | ``` 10 | 11 | `toPandas` can only be used with [DataFrame](DataFrame.md). 12 | 13 | With [Arrow optimization](../configuration-properties/index.md#arrowPySparkEnabled) enabled, `toPandas` [to_arrow_schema](#to_arrow_schema). 14 | 15 | !!! note "pyarrow" 16 | Arrow Optimization uses `pyarrow` module. 17 | 18 | `toPandas` renames the columns to be of `col_[index]` format and [_collect_as_arrow](#_collect_as_arrow) (with `split_batches` based on `arrowPySparkSelfDestructEnabled` configuration property). 19 | 20 | `toPandas` creates a `pyarrow.Table` (from the `RecordBatch`es) and converts the table to a pandas-compatible NumPy array or `DataFrame`. `toPandas` renames the columns back to the initial column names. 21 | 22 | !!! note 23 | Column order is assumed. 24 | 25 | With [Arrow optimization](../configuration-properties/index.md#arrowPySparkEnabled) disabled, `toPandas` collects the records (`DataFrame.collect`) and creates a `pandas.DataFrame` (with some type _munging_). 26 | -------------------------------------------------------------------------------- /docs/sql/PandasGroupUtils.md: -------------------------------------------------------------------------------- 1 | # PandasGroupUtils 2 | 3 | `PandasGroupUtils` utility is used by the following physical operators when executed: 4 | 5 | * `FlatMapCoGroupsInPandasExec` 6 | * [FlatMapGroupsInPandasExec](FlatMapGroupsInPandasExec.md#doExecute) 7 | 8 | ## executePython { #executePython } 9 | 10 | ```scala 11 | executePython[T]( 12 | data: Iterator[T], 13 | output: Seq[Attribute], 14 | runner: BasePythonRunner[T, ColumnarBatch]): Iterator[InternalRow] 15 | ``` 16 | 17 | `executePython` requests the given [BasePythonRunner](../runners/BasePythonRunner.md) to [compute](../runners/BasePythonRunner.md#compute) the (partition) `data` (with the current task's `TaskContext` and the partition ID). 
18 | 19 | `executePython`...FIXME 20 | 21 | --- 22 | 23 | `executePython` is used when: 24 | 25 | * `FlatMapCoGroupsInPandasExec` and [FlatMapGroupsInPandasExec](FlatMapGroupsInPandasExec.md#doExecute) physical operators are executed 26 | 27 | ## groupAndProject { #groupAndProject } 28 | 29 | ```scala 30 | groupAndProject( 31 | input: Iterator[InternalRow], 32 | groupingAttributes: Seq[Attribute], 33 | inputSchema: Seq[Attribute], 34 | dedupSchema: Seq[Attribute]): Iterator[(InternalRow, Iterator[InternalRow])] 35 | ``` 36 | 37 | `groupAndProject` creates a `GroupedIterator` for the `input` iterator (of `InternalRow`s), the `groupingAttributes` and the `inputSchema`. 38 | 39 | `groupAndProject`...FIXME 40 | 41 | --- 42 | 43 | `groupAndProject` is used when: 44 | 45 | * `FlatMapCoGroupsInPandasExec` and [FlatMapGroupsInPandasExec](FlatMapGroupsInPandasExec.md#doExecute) physical operators are executed 46 | -------------------------------------------------------------------------------- /docs/sql/PandasGroupedOpsMixin.md: -------------------------------------------------------------------------------- 1 | # PandasGroupedOpsMixin 2 | 3 | `PandasGroupedOpsMixin` is a Python mixin for [GroupedData](GroupedData.md) class. 4 | 5 | ## applyInPandas { #applyInPandas } 6 | 7 | ```py 8 | applyInPandas( 9 | self, 10 | func: "PandasGroupedMapFunction", # (1)! 11 | schema: Union[StructType, str] 12 | ) -> DataFrame 13 | ``` 14 | 15 | 1. 16 | ```py 17 | from pandas.core.frame import DataFrame as PandasDataFrame 18 | DataFrameLike = PandasDataFrame 19 | PandasGroupedMapFunction = Union[ 20 | # func: pandas.DataFrame -> pandas.DataFrame 21 | Callable[[DataFrameLike], DataFrameLike], 22 | # func: (groupKey(s), pandas.DataFrame) -> pandas.DataFrame 23 | Callable[[Any, DataFrameLike], DataFrameLike], 24 | ] 25 | ``` 26 | 27 | `applyInPandas` creates a [pandas_udf](../pyspark/sql/pandas/functions.md#pandas_udf) with the following: 28 | 29 | pandas_udf | Value 30 | -----------|------ 31 | `f` | The given `func` 32 | `returnType` | The given `schema` 33 | `functionType` | [PandasUDFType.GROUPED_MAP](../pyspark/sql/pandas/PandasUDFType.md#GROUPED_MAP) 34 | 35 | `applyInPandas` creates a `Column` wtih the `pandas_udf` applied to all the columns of the [DataFrame](GroupedData.md#_df) of this [GroupedData](GroupedData.md). 36 | 37 | `applyInPandas` requests the [RelationalGroupedDataset](#_jgd) to [flatMapGroupsInPandas](RelationalGroupedDataset.md#flatMapGroupsInPandas) with the underlying Catalyst expression of the `Column` with the `pandas_udf`. 38 | 39 | In the end, `applyInPandas` creates a [DataFrame](DataFrame.md) with the result. 40 | 41 | ## cogroup { #cogroup } 42 | 43 | ```py 44 | cogroup( 45 | self, 46 | other: "GroupedData") -> "PandasCogroupedOps" 47 | ``` 48 | 49 | `cogroup` creates a [PandasCogroupedOps](PandasCogroupedOps.md) for this and the other [GroupedData](GroupedData.md)s. 50 | -------------------------------------------------------------------------------- /docs/sql/PandasMapOpsMixin.md: -------------------------------------------------------------------------------- 1 | # PandasMapOpsMixin 2 | 3 | `PandasMapOpsMixin` is a Python mixin for [DataFrame](DataFrame.md) class. 4 | -------------------------------------------------------------------------------- /docs/sql/PythonEvalType.md: -------------------------------------------------------------------------------- 1 | # PythonEvalType 2 | 3 | `PythonEvalType` are the types of commands that will be sent to the Python worker for execution. 
4 | 5 | Name | Value | PandasUDFType 6 | -----|-------|-------------- 7 | [SQL_GROUPED_AGG_PANDAS_UDF](#SQL_GROUPED_AGG_PANDAS_UDF) | 202 | [GROUPED_AGG](../pyspark/sql/pandas/PandasUDFType.md#GROUPED_AGG) 8 | [SQL_GROUPED_MAP_PANDAS_UDF](#SQL_GROUPED_MAP_PANDAS_UDF) | 201 | [GROUPED_MAP](../pyspark/sql/pandas/PandasUDFType.md#GROUPED_MAP) 9 | [SQL_SCALAR_PANDAS_UDF](#SQL_SCALAR_PANDAS_UDF) | 200 | [SCALAR](../pyspark/sql/pandas/PandasUDFType.md#SCALAR) 10 | [SQL_SCALAR_PANDAS_ITER_UDF](#SQL_SCALAR_PANDAS_ITER_UDF) | 204 | [SCALAR_ITER](../pyspark/sql/pandas/PandasUDFType.md#SCALAR_ITER) 11 | 12 | `PythonEvalType` is defined in `org.apache.spark.api.python` Scala package with the same values defined on Python side in the [PythonEvalType](../pyspark/sql/pandas/PandasUDFType.md) Python class (in `pyspark/rdd.py` package). 13 | 14 | ## SQL_GROUPED_AGG_PANDAS_UDF { #SQL_GROUPED_AGG_PANDAS_UDF } 15 | 16 | `SQL_GROUPED_AGG_PANDAS_UDF` is a UDF marker of **Grouped Aggregate Pandas UDFs** (_pandas User-Defined Aggregate Functions_, _pandas UDAFs_). 17 | 18 | `SQL_GROUPED_AGG_PANDAS_UDF` is executed using [AggregateInPandasExec](AggregateInPandasExec.md) physical operator (using [ArrowPythonRunner](../runners/ArrowPythonRunner.md)). 19 | 20 | Limitations of Pandas UDAFs: 21 | 22 | * [Return type](../pyspark/sql/UserDefinedFunction.md#returnType) cannot be `StructType` 23 | * Not supported in the `PIVOT` clause 24 | * Not supported in streaming aggregation 25 | 26 | `SQL_GROUPED_AGG_PANDAS_UDF` is used (on Python side) when: 27 | 28 | * `pyspark/worker.py` is requested to [read_single_udf](../pyspark/worker.md#read_single_udf) and [read_udfs](../pyspark/worker.md#read_udfs) 29 | * `pyspark/sql/pandas/functions.py` is requested to `_create_pandas_udf` and `pandas_udf` 30 | 31 | `SQL_GROUPED_AGG_PANDAS_UDF` is used (on Scala side) when: 32 | 33 | * `PythonUDF` is requested for [isGroupedAggPandasUDF](PythonUDF.md#isGroupedAggPandasUDF) 34 | 35 | ## SQL_SCALAR_PANDAS_UDF { #SQL_SCALAR_PANDAS_UDF } 36 | 37 | `SQL_SCALAR_PANDAS_UDF` is among [SCALAR_TYPES](PythonUDF.md#SCALAR_TYPES) of [PythonUDF](PythonUDF.md). 38 | 39 | `SQL_SCALAR_PANDAS_UDF` (with [SQL_SCALAR_PANDAS_ITER_UDF](#SQL_SCALAR_PANDAS_ITER_UDF)) are evaluated using [ArrowEvalPython](ArrowEvalPython.md). 40 | 41 | `SQL_SCALAR_PANDAS_UDF` is used (on Python side) when: 42 | 43 | * `pyspark/worker.py` is requested to [read_single_udf](../pyspark/worker.md#read_single_udf) and [read_udfs](../pyspark/worker.md#read_udfs) 44 | * `pyspark/sql/pandas/functions.py` is requested to `_create_pandas_udf` and `pandas_udf` 45 | 46 | ## SQL_SCALAR_PANDAS_ITER_UDF { #SQL_SCALAR_PANDAS_ITER_UDF } 47 | 48 | ## User-Defined Functions 49 | 50 | [UDFRegistration](UDFRegistration.md#register) allows user-defined functions to be one of the following `PythonEvalType`s: 51 | 52 | * [SQL_BATCHED_UDF](#SQL_BATCHED_UDF) 53 | * [SQL_SCALAR_PANDAS_UDF](#SQL_SCALAR_PANDAS_UDF) 54 | * [SQL_SCALAR_PANDAS_ITER_UDF](#SQL_SCALAR_PANDAS_ITER_UDF) 55 | * [SQL_GROUPED_AGG_PANDAS_UDF](#SQL_GROUPED_AGG_PANDAS_UDF) 56 | -------------------------------------------------------------------------------- /docs/sql/PythonSQLMetrics.md: -------------------------------------------------------------------------------- 1 | # PythonSQLMetrics 2 | 3 | `PythonSQLMetrics` is a collection of [SQL metrics](#performance-metrics) of the [physical operators](#implementations) in PySpark. 
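For example, a query with a scalar pandas UDF as below (a sketch; any such UDF will do) is planned with [ArrowEvalPythonExec](ArrowEvalPythonExec.md), so the metrics listed below (e.g. _data sent to Python workers_) show up for that operator in the SQL tab of the web UI:

```py
import pandas as pd
from pyspark.sql.functions import pandas_udf

@pandas_udf("double")
def times_two(s: pd.Series) -> pd.Series:
    return s * 2.0

# Check the SQL tab of the web UI for the ArrowEvalPython node and its metrics
spark.range(10).withColumn("doubled", times_two("id")).collect()
```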
4 | 5 | ## Performance Metrics 6 | 7 | ### data returned from Python workers { #pythonDataReceived } 8 | 9 | ### data sent to Python workers { #pythonDataSent } 10 | 11 | ### number of output rows { #pythonNumRowsReceived } 12 | 13 | ## Implementations 14 | 15 | * [AggregateInPandasExec](AggregateInPandasExec.md) 16 | * [ArrowEvalPythonExec](ArrowEvalPythonExec.md) 17 | * `BatchEvalPythonExec` 18 | * `FlatMapCoGroupsInPandasExec` 19 | * [FlatMapGroupsInPandasExec](FlatMapGroupsInPandasExec.md) 20 | * `MapInBatchExec` 21 | * `StateStoreWriter` 22 | * `WindowInPandasExec` 23 | -------------------------------------------------------------------------------- /docs/sql/PythonUDF.md: -------------------------------------------------------------------------------- 1 | # PythonUDF 2 | 3 | `PythonUDF` is a Catalyst expression ([Spark SQL]({{ book.spark_sql }}/expressions/Expression)). 4 | 5 | ## Creating Instance 6 | 7 | `PythonUDF` takes the following to be created: 8 | 9 | * Name 10 | * [PythonFunction](../PythonFunction.md) 11 | * `DataType` ([Spark SQL]({{ book.spark_sql }}/DataType)) 12 | * Children Catalyst Expressions ([Spark SQL]({{ book.spark_sql }}/expressions/Expression)) 13 | * Python Eval Type 14 | * `udfDeterministic` flag 15 | * Result ID (`ExprId`) 16 | 17 | `PythonUDF` is created when: 18 | 19 | * `UserDefinedPythonFunction` is requested to [builder](UserDefinedPythonFunction.md#builder) 20 | 21 | ## Unevaluable 22 | 23 | `PythonUDF` is an `Unevaluable` expression ([Spark SQL]({{ book.spark_sql }}/expressions/Unevaluable)). 24 | 25 | ## NonSQLExpression 26 | 27 | `PythonUDF` is a `NonSQLExpression` expression ([Spark SQL]({{ book.spark_sql }}/expressions/NonSQLExpression)). 28 | 29 | ## UserDefinedExpression 30 | 31 | `PythonUDF` is a `UserDefinedExpression` expression ([Spark SQL]({{ book.spark_sql }}/expressions/UserDefinedExpression)). 32 | 33 | ## isScalarPythonUDF { #isScalarPythonUDF } 34 | 35 | ```scala 36 | isScalarPythonUDF( 37 | e: Expression): Boolean 38 | ``` 39 | 40 | `isScalarPythonUDF` holds `true` when the following all hold `true`: 41 | 42 | * The given `Expression` ([Spark SQL]({{ book.spark_sql }}/expressions/Expression)) is a [PythonUDF](PythonUDF.md) 43 | * The [evalType](#evalType) is [scalar](#SCALAR_TYPES) 44 | 45 | --- 46 | 47 | `isScalarPythonUDF` is used when: 48 | 49 | * `ExtractPythonUDFFromJoinCondition` is requested to `hasUnevaluablePythonUDF` 50 | * `ExtractPythonUDFFromAggregate` is requested to `hasPythonUdfOverAggregate` 51 | * `ExtractGroupingPythonUDFFromAggregate` is requested to `hasScalarPythonUDF` 52 | * `ExtractPythonUDFs` is requested to `hasScalarPythonUDF`, `collectEvaluableUDFs`, `extract` 53 | 54 | ## Scalar PythonUDF Types { #SCALAR_TYPES } 55 | 56 | `PythonUDF` is [scalar](#isScalarPythonUDF) for the following eval types: 57 | 58 | * [SQL_BATCHED_UDF](../sql/PythonEvalType.md#SQL_BATCHED_UDF) 59 | * [SQL_SCALAR_PANDAS_UDF](../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_UDF) 60 | * [SQL_SCALAR_PANDAS_ITER_UDF](../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_ITER_UDF) 61 | 62 | ## isGroupedAggPandasUDF { #isGroupedAggPandasUDF } 63 | 64 | ```scala 65 | isGroupedAggPandasUDF( 66 | e: Expression): Boolean 67 | ``` 68 | 69 | `isGroupedAggPandasUDF` is `true` when the given `Expression` is a [PythonUDF](PythonUDF.md) with [SQL_GROUPED_AGG_PANDAS_UDF](PythonEvalType.md#SQL_GROUPED_AGG_PANDAS_UDF) eval type. Otherwise, `isGroupedAggPandasUDF` is `false`. 
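For reference, a grouped aggregate pandas UDF as below (a minimal sketch with illustrative names) becomes a `PythonUDF` with the [SQL_GROUPED_AGG_PANDAS_UDF](PythonEvalType.md#SQL_GROUPED_AGG_PANDAS_UDF) eval type, and so `isGroupedAggPandasUDF` holds `true` for it:

```py
import pandas as pd
from pyspark.sql.functions import pandas_udf

@pandas_udf("double")
def mean_udf(v: pd.Series) -> float:
    # a grouped aggregate (Series -> scalar) pandas UDF
    return v.mean()

df = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0)], ("id", "v"))
df.groupBy("id").agg(mean_udf("v")).show()
```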
70 | -------------------------------------------------------------------------------- /docs/sql/RelationalGroupedDataset.md: -------------------------------------------------------------------------------- 1 | # RelationalGroupedDataset 2 | 3 | `RelationalGroupedDataset` is a result of executing high-level grouping operators. 4 | 5 | !!! note "This is a stub" 6 | This page is a stub to describe PySpark-related methods only. Learn more about [RelationalGroupedDataset]({{ book.spark_sql }}/RelationalGroupedDataset/) in [The Internals of Spark SQL]({{ book.spark_sql }}). 7 | 8 | ## flatMapCoGroupsInPandas { #flatMapCoGroupsInPandas } 9 | 10 | ```scala 11 | flatMapCoGroupsInPandas( 12 | r: RelationalGroupedDataset, 13 | expr: PythonUDF): DataFrame 14 | ``` 15 | 16 | `flatMapCoGroupsInPandas`...FIXME 17 | 18 | --- 19 | 20 | `flatMapCoGroupsInPandas` is used when: 21 | 22 | * `PandasCogroupedOps` is requested to [applyInPandas](PandasCogroupedOps.md#applyInPandas) 23 | 24 | ## flatMapGroupsInPandas { #flatMapGroupsInPandas } 25 | 26 | ```scala 27 | flatMapGroupsInPandas( 28 | expr: PythonUDF): DataFrame 29 | ``` 30 | 31 | `flatMapGroupsInPandas` creates a `DataFrame` with a [FlatMapGroupsInPandas](FlatMapGroupsInPandas.md) logical operator (to execute the given [PythonUDF](PythonUDF.md)). 32 | 33 | --- 34 | 35 | `flatMapGroupsInPandas` asserts that the input [PythonUDF](PythonUDF.md) is a grouped map udf (the [eval type](PythonUDF.md#evalType) is [SQL_GROUPED_MAP_PANDAS_UDF](PythonEvalType.md#SQL_GROUPED_MAP_PANDAS_UDF)). 36 | 37 | `flatMapGroupsInPandas` asserts that the [return type](PythonUDF.md#dataType) of the input [PythonUDF](PythonUDF.md) is `StructType`. 38 | 39 | --- 40 | 41 | `flatMapGroupsInPandas` is used when: 42 | 43 | * `PandasGroupedOpsMixin` is requested to [applyInPandas](PandasGroupedOpsMixin.md#applyInPandas) 44 | -------------------------------------------------------------------------------- /docs/sql/SQLContext.md: -------------------------------------------------------------------------------- 1 | # SQLContext 2 | 3 | `SQLContext` is...FIXME 4 | -------------------------------------------------------------------------------- /docs/sql/SparkConversionMixin.md: -------------------------------------------------------------------------------- 1 | # SparkConversionMixin 2 | 3 | `SparkConversionMixin` is a Python mixin for [SparkSession](../pyspark/sql/SparkSession.md) class. 4 | -------------------------------------------------------------------------------- /docs/sql/UDFRegistration.md: -------------------------------------------------------------------------------- 1 | # UDFRegistration 2 | 3 | `UDFRegistration` is a Python class in [pyspark.sql.udf](../pyspark/sql/udf.md) module. 4 | 5 | ## Registering Python UDF { #register } 6 | 7 | ```python 8 | register( 9 | self, 10 | name: str, 11 | f: Union[Callable[..., Any], "UserDefinedFunctionLike"], 12 | returnType: Optional[Union[pyspark.sql.types.DataType, str]] = None, 13 | ) -> "UserDefinedFunctionLike" 14 | ``` 15 | 16 | `register` registers a Python function (incl. lambda function) or a user-defined function as a SQL function (under the given `name`). 17 | 18 | Function `f` | Description 19 | -------------|------------ 20 | A Python function |
• Includes lambda (_unnamed_) functions<br>• `Callable[..., Any]`<br>• The return type is `StringType` when not specified<br>• Always `PythonEvalType.SQL_BATCHED_UDF` 21 | `pyspark.sql.functions.udf` | • _row-at-a-time_<br>• `UserDefinedFunctionLike` 22 | `pyspark.sql.functions.pandas_udf` | • _vectorized_<br>• `UserDefinedFunctionLike`
23 | 24 | The `evalType` of a user-defined function can be one of the following: 25 | 26 | * [SQL_BATCHED_UDF](../sql/PythonEvalType.md#SQL_BATCHED_UDF) 27 | * [SQL_SCALAR_PANDAS_UDF](../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_UDF) 28 | * [SQL_SCALAR_PANDAS_ITER_UDF](../sql/PythonEvalType.md#SQL_SCALAR_PANDAS_ITER_UDF) 29 | * [SQL_GROUPED_AGG_PANDAS_UDF](../sql/PythonEvalType.md#SQL_GROUPED_AGG_PANDAS_UDF) 30 | 31 | --- 32 | 33 | `register` [creates a user-defined function](#_create_udf) (`_create_udf`) and requests the `_jsparkSession` for the `UDFRegistration` ([Spark SQL]({{ book.spark_sql }}/user-defined-functions/UDFRegistration/)) to `registerPython` ([Spark SQL]({{ book.spark_sql }}/user-defined-functions/UDFRegistration/#registerPython)). 34 | 35 | ```python 36 | from pyspark.sql.functions import call_udf, col 37 | from pyspark.sql.types import IntegerType, StringType 38 | 39 | rows = [(1, "a"), (2, "b"), (3, "c")] 40 | columns = ["id", "name"] 41 | df = spark.createDataFrame(rows, columns) 42 | 43 | spark.udf.register("intX2", lambda i: i * 2, IntegerType()) 44 | df.select(call_udf("intX2", "id")).show() 45 | ``` 46 | -------------------------------------------------------------------------------- /docs/sql/UserDefinedPythonFunction.md: -------------------------------------------------------------------------------- 1 | # UserDefinedPythonFunction 2 | 3 | ## Creating Instance 4 | 5 | `UserDefinedPythonFunction` takes the following to be created: 6 | 7 | * Name 8 | * `PythonFunction` 9 | * `DataType` ([Spark SQL]({{ book.spark_sql }}/DataType)) 10 | * Python Eval Type 11 | * `udfDeterministic` flag 12 | 13 | `UserDefinedPythonFunction` is created when: 14 | 15 | * `SparkConnectPlanner` ([Spark Connect](../connect/index.md)) is requested to `handleRegisterPythonUDF` 16 | * `UserDefinedFunction` ([pyspark/sql/udf.py](../pyspark/sql/udf.md)) is requested to [_create_judf](../pyspark/sql/UserDefinedFunction.md#_create_judf) 17 | 18 | ## Creating PythonUDF 19 | 20 | ```scala 21 | builder( 22 |   e: Seq[Expression]): Expression 23 | ``` 24 | 25 | `builder` creates a [PythonUDF](PythonUDF.md) (for all the [arguments](#creating-instance) and the given children expressions). 26 | 27 | --- 28 | 29 | `builder` is used when: 30 | 31 | * `UDFRegistration` is requested to register a Python UDF ([Spark SQL]({{ book.spark_sql }}/UDFRegistration#registerPython)) 32 | * `UserDefinedPythonFunction` is requested to [apply](#apply) 33 | 34 | ## Applying PythonUDF 35 | 36 | ```scala 37 | apply( 38 |   exprs: Column*): Column 39 | ``` 40 | 41 | `apply` [creates a PythonUDF](#builder) (for the input `Column` ([Spark SQL]({{ book.spark_sql }}/Column)) expressions) and wraps it up into a `Column`. 42 | 43 | --- 44 | 45 | `apply` is used when: 46 | 47 | * `UDFRegistration` is requested to register a Python UDF ([Spark SQL]({{ book.spark_sql }}/UDFRegistration#registerPython)) 48 | * `UserDefinedPythonFunction` is requested to [apply](#apply) 49 | -------------------------------------------------------------------------------- /docs/sql/index.md: -------------------------------------------------------------------------------- 1 | # PySpark SQL 2 | 3 | **PySpark SQL** is a Python module to work with [Spark SQL]({{ book.spark_sql }}).
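Before going over the classes that PySpark SQL exposes, a minimal end-to-end sketch (the session name and data are illustrative only):

```py
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("pyspark-sql-demo").getOrCreate()

df = spark.createDataFrame([(1, "a"), (2, "b")], ("id", "name"))
df.createOrReplaceTempView("demo")

spark.sql("SELECT id, upper(name) AS name FROM demo").show()
```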
4 | 5 | ```py 6 | from pyspark.sql import * 7 | ``` 8 | 9 | The above `*` import imports the following classes: 10 | 11 | * `SparkSession` 12 | * `SQLContext` 13 | * `HiveContext` 14 | * `UDFRegistration` 15 | * `DataFrame` 16 | * `GroupedData` 17 | * `Column` 18 | * `Catalog` 19 | * [Observation](Observation.md) 20 | * `Row` 21 | * `DataFrameNaFunctions` 22 | * `DataFrameStatFunctions` 23 | * `Window` 24 | * `WindowSpec` 25 | * `DataFrameReader` 26 | * `DataFrameWriter` 27 | * `DataFrameWriterV2` 28 | * `PandasCogroupedOps` 29 | -------------------------------------------------------------------------------- /docs/tags.md: -------------------------------------------------------------------------------- 1 | # APIs 2 | 3 | [TAGS] 4 | -------------------------------------------------------------------------------- /docs/udts/.pages: -------------------------------------------------------------------------------- 1 | title: User-Defined Table Functions (UDTFs) 2 | nav: 3 | - index.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/udts/index.md: -------------------------------------------------------------------------------- 1 | # User-Defined Table Functions (UDTFs) 2 | 3 | **User-Defined Table Functions (UDTFs)** are user-defined functions that...FIXME 4 | 5 | ```py 6 | from pyspark.sql.functions import udtf 7 | from pyspark.sql import Row 8 | 9 | @udtf(returnType="a: int") 10 | class TestUDTF: 11 |     def eval(self, row: Row): 12 |         if row[0] > 5: 13 |             yield (row[0],) 14 | 15 |     def terminate(self): 16 |         """ 17 |         This method is optional, but 18 |         there's a bug in 3.5.4 that makes terminate required 19 |         https://issues.apache.org/jira/browse/SPARK-50674 20 |         """ 21 |         pass 22 | ``` 23 | 24 | ```py 25 | spark.udtf.register("test_udtf", TestUDTF) 26 | ``` 27 | 28 | ```py 29 | spark.sql("SELECT * FROM test_udtf(TABLE(SELECT * FROM range(0, 8)) PARTITION BY id)").show() 30 | ``` 31 | -------------------------------------------------------------------------------- /graffles/PythonRunner.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/japila-books/pyspark-internals/824e54fbf19ddb39ba9ec1a50ca65d7629470cf6/graffles/PythonRunner.graffle -------------------------------------------------------------------------------- /graffles/PythonWorkerFactory.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/japila-books/pyspark-internals/824e54fbf19ddb39ba9ec1a50ca65d7629470cf6/graffles/PythonWorkerFactory.graffle -------------------------------------------------------------------------------- /graffles/SparkContext.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/japila-books/pyspark-internals/824e54fbf19ddb39ba9ec1a50ca65d7629470cf6/graffles/SparkContext.graffle -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: The Internals of PySpark 2 | site_url: https://books.japila.pl/pyspark-internals/ 3 | site_author: Jacek Laskowski 4 | site_description: Demystifying inner-workings of PySpark 5 | 6 | repo_name: pyspark-internals 7 | repo_url: https://github.com/japila-books/pyspark-internals 8 | edit_uri: edit/main/docs/ 9 | 10 | copyright: Copyright © 2024 Jacek Laskowski 11 | 12 | theme: 13 | name: material 14 | language: en 15 | icon:
16 | logo: material/book-open-page-variant 17 | repo: fontawesome/brands/github 18 | tag: 19 | python: fontawesome/brands/python 20 | scala: simple/scala 21 | features: 22 | # https://squidfunk.github.io/mkdocs-material/reference/code-blocks/#adding-annotations 23 | - content.code.annotate 24 | # https://squidfunk.github.io/mkdocs-material/upgrade/#contentcodecopy 25 | - content.code.copy 26 | - content.tooltips 27 | - navigation.indexes 28 | - navigation.instant 29 | # https://squidfunk.github.io/mkdocs-material/setup/setting-up-navigation/#navigation-path 30 | - navigation.path 31 | - navigation.tabs 32 | - navigation.tabs.sticky 33 | - navigation.top 34 | - navigation.tracking 35 | - search.highlight 36 | - search.share 37 | - search.suggest 38 | palette: 39 | - scheme: default 40 | primary: indigo 41 | accent: indigo 42 | toggle: 43 | icon: material/toggle-switch-off-outline 44 | name: Switch to dark mode 45 | - scheme: slate 46 | primary: blue 47 | accent: blue 48 | toggle: 49 | icon: material/toggle-switch 50 | name: Switch to light mode 51 | 52 | markdown_extensions: 53 | - admonition 54 | - attr_list 55 | - footnotes 56 | - md_in_html 57 | - toc: 58 | permalink: true 59 | - pymdownx.arithmatex 60 | - pymdownx.betterem: 61 | smart_enable: all 62 | - pymdownx.caret 63 | - pymdownx.critic 64 | - pymdownx.details 65 | - pymdownx.emoji 66 | - pymdownx.inlinehilite 67 | - pymdownx.magiclink 68 | - pymdownx.mark 69 | - pymdownx.smartsymbols 70 | - pymdownx.superfences 71 | - pymdownx.tasklist: 72 | custom_checkbox: true 73 | - pymdownx.tabbed: 74 | alternate_style: true 75 | - pymdownx.tilde 76 | 77 | plugins: 78 | - search 79 | - minify: 80 | minify_html: true 81 | - awesome-pages 82 | - macros 83 | # https://squidfunk.github.io/mkdocs-material/reference/#built-in-meta-plugin 84 | - meta 85 | # https://squidfunk.github.io/mkdocs-material/setup/setting-up-tags/ 86 | - tags: 87 | # enabled: !ENV [CI, false] 88 | tags_file: tags.md 89 | tags_extra_files: 90 | python-api.md: 91 | - python 92 | scala-api.md: 93 | - scala 94 | # https://squidfunk.github.io/mkdocs-material/reference/#built-in-typeset-plugin 95 | - typeset 96 | 97 | extra: 98 | arrow: 99 | docs: https://arrow.apache.org/docs 100 | home: https://arrow.apache.org/ 101 | book: 102 | title: PySpark 103 | spark_core: https://books.japila.pl/apache-spark-internals 104 | spark_sql: https://books.japila.pl/spark-sql-internals 105 | spark_k8s: https://jaceklaskowski.github.io/spark-kubernetes-book 106 | java: 107 | api: https://docs.oracle.com/en/java/javase/17/docs/api/java.base 108 | pandas: 109 | version: 2.2.0 110 | api: https://pandas.pydata.org/docs/reference/api 111 | home: https://pandas.pydata.org/ 112 | pyarrow: 113 | version: 17.0.0 114 | py4j: 115 | version: 0.10.9 116 | doc: https://www.py4j.org 117 | docs: https://www.py4j.org 118 | javadoc: https://www.py4j.org/_static/javadoc 119 | python: 120 | version: 3.11 121 | docs: https://docs.python.org/3 122 | api: https://docs.python.org/3 123 | peps: https://peps.python.org 124 | realpython: https://realpython.com 125 | pytorch: 126 | docs: https://pytorch.org/docs/stable 127 | github: https://github.com/pytorch/pytorch 128 | tutorials: https://pytorch.org/tutorials 129 | scala: 130 | api: https://www.scala-lang.org/api/2.13.8 131 | social: 132 | - icon: fontawesome/brands/github 133 | link: https://github.com/jaceklaskowski 134 | - icon: fontawesome/brands/twitter 135 | link: https://twitter.com/jaceklaskowski 136 | - icon: fontawesome/brands/linkedin 137 | link: 
https://linkedin.com/in/jaceklaskowski 138 | - icon: fontawesome/brands/medium 139 | link: https://jaceklaskowski.medium.com 140 | - icon: fontawesome/brands/mastodon 141 | link: https://fosstodon.org/@jaceklaskowski 142 | spark: 143 | version: 3.5.4 144 | github: https://github.com/apache/spark/tree/v3.5.4 145 | jira: https://issues.apache.org/jira/browse 146 | # https://squidfunk.github.io/mkdocs-material/setup/setting-up-tags/#tag-icons-and-identifiers 147 | tags: 148 | Python: python 149 | Scala: scala 150 | 151 | nav: 152 | - index.md 153 | - Features: 154 | - features/index.md 155 | - ... | arrow-optimization/**.md 156 | - ... | configuration-properties/**.md 157 | - environment-variables.md 158 | - ... | pytorch-distributed/**.md 159 | - ... | pandas-on-spark/**.md 160 | - ... | pandas-udafs/**.md 161 | - ... | pandas-udfs/**.md 162 | - PySpark API: 163 | - tags.md 164 | - python-api.md 165 | - scala-api.md 166 | - ... | udts/**.md 167 | - ... | connect/**.md 168 | - ... | ml/**.md 169 | - ... | sql/**.md 170 | - Internals: 171 | - Setup: Setup.md 172 | - Building from Sources: building-from-sources.md 173 | - PythonRunner: PythonRunner.md 174 | - PythonGatewayServer: PythonGatewayServer.md 175 | - Py4JServer: Py4JServer.md 176 | - SparkConf: SparkConf.md 177 | - SparkContext: SparkContext.md 178 | - PythonWorkerFactory: PythonWorkerFactory.md 179 | - MonitorThread: MonitorThread.md 180 | - PythonFunction: PythonFunction.md 181 | - PythonRDD: PythonRDD.md 182 | - PythonForeachWriter: PythonForeachWriter.md 183 | - PythonAccumulatorV2: PythonAccumulatorV2.md 184 | - PythonBroadcast: PythonBroadcast.md 185 | - PythonUtils: PythonUtils.md 186 | - RDD: RDD.md 187 | - SimplePythonFunction: SimplePythonFunction.md 188 | - SocketAuthServer: SocketAuthServer.md 189 | - SocketFuncServer: SocketFuncServer.md 190 | - SocketAuthHelper: SocketAuthHelper.md 191 | - SparkEnv: SparkEnv.md 192 | - logging.md 193 | - Modules: 194 | - pyspark: 195 | - pyspark/index.md 196 | - daemon.py: pyspark/daemon.md 197 | - java_gateway.py: pyspark/java_gateway.md 198 | - rdd.py: pyspark/rdd.md 199 | - shell.py: pyspark/shell.md 200 | - worker.py: pyspark/worker.md 201 | - pyspark.pandas: 202 | - pyspark/pandas/index.md 203 | - pyspark/pandas/DataFrame.md 204 | - pyspark/pandas/InternalFrame.md 205 | - pyspark.pandas.generic: 206 | - pyspark/pandas/generic/index.md 207 | - pyspark/pandas/generic/Frame.md 208 | - pyspark.sql: 209 | - ... | flat | pyspark/sql/**.md 210 | - pyspark.sql.pandas: 211 | - pyspark/sql/pandas/index.md 212 | - functions.py: pyspark/sql/pandas/functions.md 213 | - pyspark/sql/pandas/PandasUDFType.md 214 | - ... | runners/**.md 215 | - ... | demo/**.md 216 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://${GH_TOKEN}@github.com/squidfunk/mkdocs-material-insiders.git 2 | mkdocs-minify-plugin>=0.3.0 3 | mkdocs-git-revision-date-localized-plugin>=0.8 4 | mkdocs-git-revision-date-plugin>=0.3.1 5 | mkdocs-awesome-pages-plugin>=2.5.0 6 | mkdocs-redirects>=1.0.1 7 | mkdocs-macros-plugin>=0.5.0 8 | --------------------------------------------------------------------------------