├── README.md
├── build.sh
├── mkdir.sh
├── put.sh
├── python
│   ├── main.py
│   ├── my_udf
│   │   ├── __init__.py
│   │   └── functions.py
│   └── setup.py
├── run.sh
├── run_submit.json
├── run_submit.sh
├── scala
│   ├── build.sbt
│   └── src
│       └── main
│           └── scala
│               └── org
│                   └── andre
│                       └── udf
│                           └── Functions.scala
├── setup.env
└── spark-submit.sh

/README.md:
--------------------------------------------------------------------------------
# Spark Python Scala UDF

Demonstrates calling a Spark Scala UDF from Python, with the Python code packaged as an egg and the Scala code as a JAR.
* Using spark-submit.
* Using the Databricks REST API endpoint [jobs/runs/submit](https://docs.databricks.com/api/latest/jobs.html#runs-submit).

Prerequisites:
* Spark 2.4.2
* Python 2.7 or above
* Scala 2.11.8
* curl
* Databricks CLI (used by [put.sh](put.sh))

## Code

### [main.py](python/main.py)
```
from pyspark.sql import SparkSession, SQLContext

spark = SparkSession.builder.appName("PythonScalaUDF").getOrCreate()
spark.range(1, 4).createOrReplaceTempView("test")

def call_python_udf_sql():
    from my_udf.functions import square
    print("\nCalling Python UDF with SQL")
    spark.udf.register("squareWithPython", square)
    spark.sql("select id, squareWithPython(id) as square_sql from test").show()

def call_python_udf_df():
    print("Calling Python UDF with DataFrame")
    from pyspark.sql.functions import udf
    from my_udf.functions import square
    square_udf = udf(square)
    # Or, more type-safe:
    # from pyspark.sql.types import LongType
    # square_udf = udf(square, LongType())
    df = spark.table("test")
    df.select("id", square_udf("id").alias("square_df")).show()

def call_scala_udf_sql():
    print("Calling Scala UDF with SQL")
    sqlContext = SQLContext(spark.sparkContext)
    # Cross the Py4J gateway into the JVM to register the Scala UDF under a SQL name.
    spark._jvm.com.databricks.solutions.udf.Functions.registerFunc(sqlContext._jsqlContext, "cube")
    spark.sql("select id, cube(id) as cube_sql_scala from test").show()

if __name__ == "__main__":
    call_python_udf_sql()
    call_python_udf_df()
    call_scala_udf_sql()
```

### [functions.py](python/my_udf/functions.py)
```
def square(s):
    return s * s
```

### [Functions.scala](scala/src/main/scala/org/andre/udf/Functions.scala)
```
package com.databricks.solutions.udf

import org.apache.spark.sql.SQLContext

object Functions {
  def cube(n: Int) = n * n * n

  // Register cube under the given SQL name so it is callable from SQL (and from Python).
  def registerFunc(sqlContext: SQLContext, name: String): Unit = {
    sqlContext.udf.register(name, cube _)
  }
}
```
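Once `registerFunc` binds the Scala function to a SQL name, the UDF is reachable from the DataFrame API as well as from SQL strings. A minimal sketch, not part of the sample code, assuming the `SparkSession` and registration from `main.py` above:
```
from pyspark.sql.functions import expr

# "cube" was registered on the JVM side by registerFunc,
# so expr() can reference it like any built-in SQL function.
spark.table("test").select("id", expr("cube(id)").alias("cube_df_scala")).show()
```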
## Build

[build.sh](build.sh)
```
cd python
python setup.py bdist_egg
cd ../scala
sbt clean package
```

## Run

### Run with spark-submit
[spark-submit.sh](spark-submit.sh)
```
JAR=scala/target/scala-2.11/spark-python-scala-udf_2.11-0.0.1-SNAPSHOT.jar
EGG=python/dist/spark_python_scala_udf-0.0.1-py2.7.egg
spark-submit --master local[2] --jars $JAR --py-files $EGG python/main.py
```

### Run with Databricks REST API endpoint jobs/runs/submit

Steps:
* Set your API URL and token in [setup.env](setup.env).
* Create the sample DBFS job path /tmp/jobs/python-scala-udf-job with [mkdir.sh](mkdir.sh).
* Upload the JAR, egg and main.py to the above DBFS path with [put.sh](put.sh).
* Tweak the job request file [run_submit.json](run_submit.json) (cluster spec, library paths) as needed.
* Submit the job with [run_submit.sh](run_submit.sh); a sketch for polling the resulting run follows below.
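`jobs/runs/submit` responds with a `run_id`, which can be polled with [jobs/runs/get](https://docs.databricks.com/api/latest/jobs.html#runs-get). A sketch of that follow-up (it assumes the `jq` CLI, which is not among the prerequisites):
```
. ./setup.env
# Submit the run and capture the run_id from the JSON response.
run_id=$(curl -s -X POST -H "Authorization: Bearer $TOKEN" -d @run_submit.json $API_URL/jobs/runs/submit | jq .run_id)
# Poll the run's life-cycle state.
curl -s -H "Authorization: Bearer $TOKEN" "$API_URL/jobs/runs/get?run_id=$run_id" | jq .state
```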
### Run output
```
Calling Python UDF with SQL
+---+----------+
| id|square_sql|
+---+----------+
|  1|         1|
|  2|         4|
|  3|         9|
+---+----------+

Calling Python UDF with DataFrame
+---+---------+
| id|square_df|
+---+---------+
|  1|        1|
|  2|        4|
|  3|        9|
+---+---------+

Calling Scala UDF with SQL
+---+--------------+
| id|cube_sql_scala|
+---+--------------+
|  1|             1|
|  2|             8|
|  3|            27|
+---+--------------+
```
--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
cd python
python setup.py bdist_egg
cd ../scala
sbt clean package
--------------------------------------------------------------------------------
/mkdir.sh:
--------------------------------------------------------------------------------
. ./setup.env
path=/tmp/jobs/python-scala-udf-job
curl -X POST -H "Authorization: Bearer $TOKEN" -F path=$path $API_URL/dbfs/mkdirs
--------------------------------------------------------------------------------
/put.sh:
--------------------------------------------------------------------------------
# Upload the .jar, .egg and .py files to DBFS.

dst_dir=dbfs:/tmp/jobs/python-scala-udf-job

put() {
  src=$1 ; dst=$2
  echo "====="
  echo "SRC: $src" ; echo "DST: $dst"
  databricks fs cp $src $dst --overwrite
}
put_all() {
  file=spark_python_scala_udf-0.0.1-py2.7.egg ; put python/dist/$file $dst_dir/$file
  file=spark-python-scala-udf_2.11-0.0.1-SNAPSHOT.jar ; put scala/target/scala-2.11/$file $dst_dir/$file
  file=main.py ; put python/$file $dst_dir/$file
}

put_all
--------------------------------------------------------------------------------
/python/main.py:
--------------------------------------------------------------------------------
from pyspark.sql import SparkSession, SQLContext

spark = SparkSession.builder.appName("PythonScalaUDF").getOrCreate()
spark.range(1, 4).createOrReplaceTempView("test")

def call_python_udf_sql():
    from my_udf.functions import square
    print("\nCalling Python UDF with SQL")
    spark.udf.register("squareWithPython", square)
    spark.sql("select id, squareWithPython(id) as square_sql from test").show()

def call_python_udf_df():
    print("Calling Python UDF with DataFrame")
    from pyspark.sql.functions import udf
    from my_udf.functions import square
    square_udf = udf(square)
    # Or, more type-safe:
    # from pyspark.sql.types import LongType
    # square_udf = udf(square, LongType())
    df = spark.table("test")
    df.select("id", square_udf("id").alias("square_df")).show()

def call_scala_udf_sql():
    print("Calling Scala UDF with SQL")
    sqlContext = SQLContext(spark.sparkContext)
    # Cross the Py4J gateway into the JVM to register the Scala UDF under a SQL name.
    spark._jvm.com.databricks.solutions.udf.Functions.registerFunc(sqlContext._jsqlContext, "cube")
    spark.sql("select id, cube(id) as cube_sql_scala from test").show()

if __name__ == "__main__":
    call_python_udf_sql()
    call_python_udf_df()
    call_scala_udf_sql()
--------------------------------------------------------------------------------
/python/my_udf/__init__.py:
--------------------------------------------------------------------------------
(empty file)
--------------------------------------------------------------------------------
/python/my_udf/functions.py:
--------------------------------------------------------------------------------
def square(s):
    return s * s
--------------------------------------------------------------------------------
/python/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages

setup(name='spark-python-scala-udf',
      version='0.0.1',
      description='Python and Scala UDF sample app',
      author='andre',
      packages=find_packages(),
      zip_safe=False)
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
JAR=scala/target/scala-2.11/spark-python-scala-udf_2.11-0.0.1-SNAPSHOT.jar
EGG=python/dist/spark_python_scala_udf-0.0.1-py2.7.egg
spark-submit --master local[2] --jars $JAR --py-files $EGG python/main.py
--------------------------------------------------------------------------------
/run_submit.json:
--------------------------------------------------------------------------------
{
  "run_name": "python-scala-udf-job",
  "new_cluster": {
    "spark_version": "5.3.x-scala2.11",
    "node_type_id": "i3.xlarge",
    "num_workers": 1
  },
  "libraries": [
    { "egg": "dbfs:/tmp/jobs/python-scala-udf-job/spark_python_scala_udf-0.0.1-py2.7.egg" },
    { "jar": "dbfs:/tmp/jobs/python-scala-udf-job/spark-python-scala-udf_2.11-0.0.1-SNAPSHOT.jar" }
  ],
  "timeout_seconds": 3600,
  "spark_python_task": {
    "python_file": "dbfs:/tmp/jobs/python-scala-udf-job/main.py"
  }
}
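Note: the same request file should also work with the databricks CLI already used in put.sh — an untested sketch: `databricks runs submit --json-file run_submit.json`.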
--------------------------------------------------------------------------------
/run_submit.sh:
--------------------------------------------------------------------------------
# Run the Databricks job.

. ./setup.env
curl -X POST -H "Authorization: Bearer $TOKEN" -d @run_submit.json $API_URL/jobs/runs/submit
--------------------------------------------------------------------------------
/scala/build.sbt:
--------------------------------------------------------------------------------
name := "spark-python-scala-udf"

version := "0.0.1-SNAPSHOT"
scalaVersion := "2.11.8"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql" % "2.4.2" % "provided"
)
--------------------------------------------------------------------------------
/scala/src/main/scala/org/andre/udf/Functions.scala:
--------------------------------------------------------------------------------
// Note: the file lives under org/andre/udf but declares the package below;
// Scala does not require the directory layout to match the package name.
package com.databricks.solutions.udf

import org.apache.spark.sql.SQLContext

object Functions {
  def cube(n: Int) = n * n * n

  // Register cube under the given SQL name so it is callable from SQL (and from Python).
  def registerFunc(sqlContext: SQLContext, name: String): Unit = {
    sqlContext.udf.register(name, cube _)
  }
}
--------------------------------------------------------------------------------
/setup.env:
--------------------------------------------------------------------------------
TOKEN=SET_MY_TOKEN
API_URL=https://my_databricks_shard.com/api/2.0
--------------------------------------------------------------------------------
/spark-submit.sh:
--------------------------------------------------------------------------------
JAR=scala/target/scala-2.11/spark-python-scala-udf_2.11-0.0.1-SNAPSHOT.jar
EGG=python/dist/spark_python_scala_udf-0.0.1-py2.7.egg
spark-submit --master local[2] --jars $JAR --py-files $EGG python/main.py
--------------------------------------------------------------------------------
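Note: the egg filename in run.sh and spark-submit.sh assumes a Python 2.7 build; under another interpreter setuptools changes the suffix (e.g. -py3.7.egg). A defensive variant, an assumption not in the original scripts: `EGG=$(ls python/dist/spark_python_scala_udf-0.0.1-*.egg)`.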