├── README.md
├── build.sh
├── mkdir.sh
├── put.sh
├── python
│   ├── main.py
│   ├── my_udf
│   │   ├── __init__.py
│   │   └── functions.py
│   └── setup.py
├── run.sh
├── run_submit.json
├── run_submit.sh
├── scala
│   ├── build.sbt
│   └── src
│       └── main
│           └── scala
│               └── org
│                   └── andre
│                       └── udf
│                           └── Functions.scala
├── setup.env
└── spark-submit.sh

/README.md:
--------------------------------------------------------------------------------
# Spark Python Scala UDF

Demonstrates calling a Spark Scala UDF from Python, with the Python code packaged as an egg and the Scala code as a JAR.
* Using spark-submit.
* Using the Databricks REST API endpoint [jobs/runs/submit](https://docs.databricks.com/api/latest/jobs.html#runs-submit).

Prerequisites:
* Spark 2.4.2
* Python 2.7 or above
* Scala 2.11.8
* curl
* Databricks CLI (used by [put.sh](put.sh))

## Code

### [main.py](python/main.py)
```
from pyspark.sql import SparkSession, SQLContext

spark = SparkSession.builder.appName("PythonScalaUDF").getOrCreate()
spark.range(1, 4).createOrReplaceTempView("test")

def call_python_udf_sql():
    from my_udf.functions import square
    print("\nCalling Python UDF with SQL")
    spark.udf.register("squareWithPython", square)
    spark.sql("select id, squareWithPython(id) as square_sql from test").show()

def call_python_udf_df():
    print("Calling Python UDF with DataFrame")
    from pyspark.sql.functions import udf
    from my_udf.functions import square
    square_udf = udf(square)
    # Or, more type-safe:
    # from pyspark.sql.types import LongType
    # square_udf = udf(square, LongType())
    df = spark.table("test")
    df.select("id", square_udf("id").alias("square_df")).show()

def call_scala_udf_sql():
    print("Calling Scala UDF with SQL")
    sqlContext = SQLContext(spark.sparkContext)
    # Cross the Py4J gateway into the JVM to register the Scala UDF under a SQL name.
    spark._jvm.com.databricks.solutions.udf.Functions.registerFunc(sqlContext._jsqlContext, "cube")
    spark.sql("select id, cube(id) as cube_sql_scala from test").show()

if __name__ == "__main__":
    call_python_udf_sql()
    call_python_udf_df()
    call_scala_udf_sql()
```

### [functions.py](python/my_udf/functions.py)
```
def square(s):
    return s * s
```

### [Functions.scala](scala/src/main/scala/org/andre/udf/Functions.scala)
```
package com.databricks.solutions.udf

import org.apache.spark.sql.SQLContext

object Functions {
  def cube(n: Int) = n * n * n

  // Register cube under the given SQL name so it is callable from SQL (and from Python).
  def registerFunc(sqlContext: SQLContext, name: String): Unit = {
    sqlContext.udf.register(name, cube _)
  }
}
```
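Once `registerFunc` binds the Scala function to a SQL name, the UDF is reachable from the DataFrame API as well as from SQL strings. A minimal sketch, not part of the sample code, assuming the `SparkSession` and registration from `main.py` above:
```
from pyspark.sql.functions import expr

# "cube" was registered on the JVM side by registerFunc,
# so expr() can reference it like any built-in SQL function.
spark.table("test").select("id", expr("cube(id)").alias("cube_df_scala")).show()
```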
## Build

[build.sh](build.sh)
```
cd python
python setup.py bdist_egg
cd ../scala
sbt clean package
```

## Run

### Run with spark-submit
[spark-submit.sh](spark-submit.sh)
```
JAR=scala/target/scala-2.11/spark-python-scala-udf_2.11-0.0.1-SNAPSHOT.jar
EGG=python/dist/spark_python_scala_udf-0.0.1-py2.7.egg
spark-submit --master local[2] --jars $JAR --py-files $EGG python/main.py
```

### Run with Databricks REST API endpoint jobs/runs/submit

Steps:
* Set your API URL and token in [setup.env](setup.env).
* Create the sample DBFS job path /tmp/jobs/python-scala-udf-job with [mkdir.sh](mkdir.sh).
* Upload the JAR, egg and main.py to the above DBFS path with [put.sh](put.sh).
* Tweak the job request file [run_submit.json](run_submit.json) (cluster spec, library paths) as needed.
* Submit the job with [run_submit.sh](run_submit.sh); a sketch for polling the resulting run follows below.
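`jobs/runs/submit` responds with a `run_id`, which can be polled with [jobs/runs/get](https://docs.databricks.com/api/latest/jobs.html#runs-get). A sketch of that follow-up (it assumes the `jq` CLI, which is not among the prerequisites):
```
. ./setup.env
# Submit the run and capture the run_id from the JSON response.
run_id=$(curl -s -X POST -H "Authorization: Bearer $TOKEN" -d @run_submit.json $API_URL/jobs/runs/submit | jq .run_id)
# Poll the run's life-cycle state.
curl -s -H "Authorization: Bearer $TOKEN" "$API_URL/jobs/runs/get?run_id=$run_id" | jq .state
```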
### Run output
```
Calling Python UDF with SQL
+---+----------+
| id|square_sql|
+---+----------+
|  1|         1|
|  2|         4|
|  3|         9|
+---+----------+

Calling Python UDF with DataFrame
+---+---------+
| id|square_df|
+---+---------+
|  1|        1|
|  2|        4|
|  3|        9|
+---+---------+

Calling Scala UDF with SQL
+---+--------------+
| id|cube_sql_scala|
+---+--------------+
|  1|             1|
|  2|             8|
|  3|            27|
+---+--------------+
```
--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
cd python
python setup.py bdist_egg
cd ../scala
sbt clean package
--------------------------------------------------------------------------------
/mkdir.sh:
--------------------------------------------------------------------------------
. ./setup.env
path=/tmp/jobs/python-scala-udf-job
curl -X POST -H "Authorization: Bearer $TOKEN" -F path=$path $API_URL/dbfs/mkdirs
--------------------------------------------------------------------------------
/put.sh:
--------------------------------------------------------------------------------
# Upload the .jar, .egg and .py files to DBFS.

dst_dir=dbfs:/tmp/jobs/python-scala-udf-job

put() {
  src=$1 ; dst=$2
  echo "====="
  echo "SRC: $src" ; echo "DST: $dst"
  databricks fs cp $src $dst --overwrite
}
put_all() {
  file=spark_python_scala_udf-0.0.1-py2.7.egg ; put python/dist/$file $dst_dir/$file
  file=spark-python-scala-udf_2.11-0.0.1-SNAPSHOT.jar ; put scala/target/scala-2.11/$file $dst_dir/$file
  file=main.py ; put python/$file $dst_dir/$file
}

put_all
--------------------------------------------------------------------------------
/python/main.py:
--------------------------------------------------------------------------------
from pyspark.sql import SparkSession, SQLContext

spark = SparkSession.builder.appName("PythonScalaUDF").getOrCreate()
spark.range(1, 4).createOrReplaceTempView("test")

def call_python_udf_sql():
    from my_udf.functions import square
    print("\nCalling Python UDF with SQL")
    spark.udf.register("squareWithPython", square)
    spark.sql("select id, squareWithPython(id) as square_sql from test").show()

def call_python_udf_df():
    print("Calling Python UDF with DataFrame")
    from pyspark.sql.functions import udf
    from my_udf.functions import square
    square_udf = udf(square)
    # Or, more type-safe:
    # from pyspark.sql.types import LongType
    # square_udf = udf(square, LongType())
    df = spark.table("test")
    df.select("id", square_udf("id").alias("square_df")).show()

def call_scala_udf_sql():
    print("Calling Scala UDF with SQL")
    sqlContext = SQLContext(spark.sparkContext)
    # Cross the Py4J gateway into the JVM to register the Scala UDF under a SQL name.
    spark._jvm.com.databricks.solutions.udf.Functions.registerFunc(sqlContext._jsqlContext, "cube")
    spark.sql("select id, cube(id) as cube_sql_scala from test").show()

if __name__ == "__main__":
    call_python_udf_sql()
    call_python_udf_df()
    call_scala_udf_sql()
--------------------------------------------------------------------------------
/python/my_udf/__init__.py:
--------------------------------------------------------------------------------
(empty file)
--------------------------------------------------------------------------------
/python/my_udf/functions.py:
--------------------------------------------------------------------------------
def square(s):
    return s * s
--------------------------------------------------------------------------------
/python/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages

setup(name='spark-python-scala-udf',
      version='0.0.1',
      description='Python and Scala UDF sample app',
      author='andre',
      packages=find_packages(),
      zip_safe=False)
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
JAR=scala/target/scala-2.11/spark-python-scala-udf_2.11-0.0.1-SNAPSHOT.jar
EGG=python/dist/spark_python_scala_udf-0.0.1-py2.7.egg
spark-submit --master local[2] --jars $JAR --py-files $EGG python/main.py
--------------------------------------------------------------------------------
/run_submit.json:
--------------------------------------------------------------------------------
{
  "run_name": "python-scala-udf-job",
  "new_cluster": {
    "spark_version": "5.3.x-scala2.11",
    "node_type_id": "i3.xlarge",
    "num_workers": 1
  },
  "libraries": [
    { "egg": "dbfs:/tmp/jobs/python-scala-udf-job/spark_python_scala_udf-0.0.1-py2.7.egg" },
    { "jar": "dbfs:/tmp/jobs/python-scala-udf-job/spark-python-scala-udf_2.11-0.0.1-SNAPSHOT.jar" }
  ],
  "timeout_seconds": 3600,
  "spark_python_task": {
    "python_file": "dbfs:/tmp/jobs/python-scala-udf-job/main.py"
  }
}
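Note: the same request file should also work with the databricks CLI already used in put.sh — an untested sketch: `databricks runs submit --json-file run_submit.json`.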
--------------------------------------------------------------------------------
/run_submit.sh:
--------------------------------------------------------------------------------
# Run the Databricks job.

. ./setup.env
curl -X POST -H "Authorization: Bearer $TOKEN" -d @run_submit.json $API_URL/jobs/runs/submit
--------------------------------------------------------------------------------
/scala/build.sbt:
--------------------------------------------------------------------------------
name := "spark-python-scala-udf"

version := "0.0.1-SNAPSHOT"
scalaVersion := "2.11.8"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql" % "2.4.2" % "provided"
)
--------------------------------------------------------------------------------
/scala/src/main/scala/org/andre/udf/Functions.scala:
--------------------------------------------------------------------------------
// Note: the file lives under org/andre/udf but declares the package below;
// Scala does not require the directory layout to match the package name.
package com.databricks.solutions.udf

import org.apache.spark.sql.SQLContext

object Functions {
  def cube(n: Int) = n * n * n

  // Register cube under the given SQL name so it is callable from SQL (and from Python).
  def registerFunc(sqlContext: SQLContext, name: String): Unit = {
    sqlContext.udf.register(name, cube _)
  }
}
--------------------------------------------------------------------------------
/setup.env:
--------------------------------------------------------------------------------
TOKEN=SET_MY_TOKEN
API_URL=https://my_databricks_shard.com/api/2.0
--------------------------------------------------------------------------------
/spark-submit.sh:
--------------------------------------------------------------------------------
JAR=scala/target/scala-2.11/spark-python-scala-udf_2.11-0.0.1-SNAPSHOT.jar
EGG=python/dist/spark_python_scala_udf-0.0.1-py2.7.egg
spark-submit --master local[2] --jars $JAR --py-files $EGG python/main.py
--------------------------------------------------------------------------------
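Note: the egg filename in run.sh and spark-submit.sh assumes a Python 2.7 build; under another interpreter setuptools changes the suffix (e.g. -py3.7.egg). A defensive variant, an assumption not in the original scripts: `EGG=$(ls python/dist/spark_python_scala_udf-0.0.1-*.egg)`.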