├── dataframes ├── .gitignore ├── dataframe_from_mongodb.md ├── dataframe_from_csv.ipynb ├── .ipynb_checkpoints │ ├── dataframe-from-csv-checkpoint.ipynb │ ├── sort_string_using_sorted_array-checkpoint.ipynb │ └── dataframe_from_nothing-checkpoint.ipynb ├── dataframe_from_nothing.ipynb ├── not_exits_and_exists_equivalent-checkpoint.ipynb └── dataframe_from_json.ipynb ├── .idea ├── .gitignore ├── misc.xml ├── vcs.xml ├── inspectionProfiles │ ├── profiles_settings.xml │ └── Project_Default.xml ├── modules.xml └── spark-ref.iml ├── .gitignore ├── README.md ├── SECURITY.md ├── random-examples ├── sort_string_using_sorted_array.ipynb └── word_counter.ipynb ├── rdd └── rdd.ipynb └── sql └── not_exits_and_exists_equivalent.ipynb /dataframes/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | ../*.swp 3 | .idea/ 4 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | .idea/ 3 | .bash_history 4 | .cache/ 5 | .ipython/ 6 | .jupyter/ 7 | .local/ 8 | .python_history 9 | 10 | .virtual_documents/ 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spark-ref 2 | pySpark references for developers 3 | 4 | This is a repository for pySpark that developers could use day-by-day. 5 | 6 | The idea is organize useful examples per themes 7 | 8 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/spark-ref.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | Use this section to tell people about which versions of your project are 6 | currently being supported with security updates. 7 | 8 | | Version | Supported | 9 | | ------- | ------------------ | 10 | | 5.1.x | :white_check_mark: | 11 | | 5.0.x | :x: | 12 | | 4.0.x | :white_check_mark: | 13 | | < 4.0 | :x: | 14 | 15 | ## Reporting a Vulnerability 16 | 17 | Use this section to tell people how to report a vulnerability. 
18 | 19 | Tell them where to go, how often they can expect to get an update on a 20 | reported vulnerability, what to expect if the vulnerability is accepted or 21 | declined, etc. 22 | -------------------------------------------------------------------------------- /dataframes/dataframe_from_mongodb.md: -------------------------------------------------------------------------------- 1 | 2 | ## Importing 3 | ```python 4 | import pyspark 5 | import pyspark.sql.functions as F 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql.types import StructType,StructField,StringType,IntegerType 8 | ``` 9 | ## Connection directly on SparkSession 10 | ```python 11 | spark = SparkSession \ 12 | .builder \ 13 | .appName("tgt-santander-ingestion" 14 | ).config("spark.jars","jars/mongo-spark-connector_2.11-2.4.1.jar,jars/mongo-java-driver-3.11.0-rc0.jar,scala-library-2.11.12.jar" 15 | ).config("spark.mongodb.input.uri", "mongodb://127.0.0.1/test.fake" 16 | ).config("spark.mongodb.output.uri", "mongodb://127.0.0.1/test.fake" 17 | ).config( 18 | "spark.hadoop.hive.metastore.warehouse.dir" 19 | ,"/home/hduser/Projects/job-test/metastore_db" 20 | ).config("spark.sql.warehouse.dir","/user/hive/warehouse" 21 | ).enableHiveSupport( 22 | ).getOrCreate() 23 | 24 | mongoDF = spark.read.format("mongo").load() 25 | ``` 26 | 27 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 34 | -------------------------------------------------------------------------------- /dataframes/dataframe_from_csv.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pyspark\n", 10 | "from pyspark.sql import SparkSession\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 6, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# Initializing Spark session\n", 20 | "spark = SparkSession.builder.appName('basic').getOrCreate()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 7, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "+---+-----+---+\n", 33 | "| id| name|age|\n", 34 | "+---+-----+---+\n", 35 | "| 1|André| 41|\n", 36 | "| 2| João| 28|\n", 37 | "| 3|Maria| 29|\n", 38 | "+---+-----+---+\n", 39 | "\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "# Reading from HDFS\n", 45 | "spark.read.csv('/users/hduser/spark-ref/dataframes/examples/example-001.csv', \n", 46 | " sep=',',\n", 47 | " encoding='utf-8',\n", 48 | " header=True\n", 49 | " ).show()\n", 50 | "\n", 51 | "# If you want to load a local file, add the prefix 'file://' to path!" 
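A small complement to the CSV-reading cell above: the same read with an explicit schema, so `id` and `age` come back typed instead of as plain strings. This is only a sketch — the `file://` path below is a hypothetical local file, and `spark` is the session created earlier in the notebook.

```python
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('name', StringType(), True),
    StructField('age', IntegerType(), True)
])

# The 'file://' prefix reads from the local filesystem instead of HDFS
spark.read.csv('file:///tmp/example-001.csv',  # hypothetical path, adjust as needed
               schema=schema,
               sep=',',
               encoding='utf-8',
               header=True).show()
```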
52 | ] 53 | } 54 | ], 55 | "metadata": { 56 | "kernelspec": { 57 | "display_name": "Python 3 (ipykernel)", 58 | "language": "python", 59 | "name": "python3" 60 | }, 61 | "language_info": { 62 | "codemirror_mode": { 63 | "name": "ipython", 64 | "version": 3 65 | }, 66 | "file_extension": ".py", 67 | "mimetype": "text/x-python", 68 | "name": "python", 69 | "nbconvert_exporter": "python", 70 | "pygments_lexer": "ipython3", 71 | "version": "3.8.5" 72 | } 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 2 76 | } 77 | -------------------------------------------------------------------------------- /dataframes/.ipynb_checkpoints/dataframe-from-csv-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pyspark\n", 10 | "from pyspark.sql import SparkSession\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# Initializing Spark session\n", 20 | "spark = SparkSession.builder.appName('basic').getOrCreate()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "ename": "NameError", 30 | "evalue": "name 'spark' is not defined", 31 | "output_type": "error", 32 | "traceback": [ 33 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 34 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 35 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Reading from HDFS\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m spark.read.csv('/users/hduser/spark-ref/dataframes/examples/example-001.csv', \n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0msep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m','\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'utf-8'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 36 | "\u001b[0;31mNameError\u001b[0m: name 'spark' is not defined" 37 | ] 38 | } 39 | ], 40 | "source": [ 41 | "# Reading from HDFS\n", 42 | "spark.read.csv('/users/hduser/spark-ref/dataframes/examples/example-001.csv', \n", 43 | " sep=',',\n", 44 | " encoding='utf-8',\n", 45 | " header=True\n", 46 | " ).show()\n", 47 | "\n", 48 | "# If you want to load a local file, add the prefix 'file://' to path!" 
49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [] 57 | } 58 | ], 59 | "metadata": { 60 | "kernelspec": { 61 | "display_name": "Python 3", 62 | "language": "python", 63 | "name": "python3" 64 | }, 65 | "language_info": { 66 | "codemirror_mode": { 67 | "name": "ipython", 68 | "version": 3 69 | }, 70 | "file_extension": ".py", 71 | "mimetype": "text/x-python", 72 | "name": "python", 73 | "nbconvert_exporter": "python", 74 | "pygments_lexer": "ipython3", 75 | "version": "3.6.9" 76 | } 77 | }, 78 | "nbformat": 4, 79 | "nbformat_minor": 2 80 | } 81 | -------------------------------------------------------------------------------- /random-examples/sort_string_using_sorted_array.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pyspark\n", 10 | "import pyspark.sql.functions as F\n", 11 | "from pyspark.sql import SparkSession\n", 12 | "from pyspark.sql.types import StructType, StructField, StringType" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 5, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "spark = SparkSession.builder.appName('strings_lists').enableHiveSupport().getOrCreate()" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 15, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "+-------+\n", 34 | "| target|\n", 35 | "+-------+\n", 36 | "|XADOWPQ|\n", 37 | "+-------+\n", 38 | "\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "# Creating a basic schema\n", 44 | "schema = StructType([\n", 45 | " StructField('target', StringType(), True)\n", 46 | "])\n", 47 | "data = [['XADOWPQ']]\n", 48 | "df = spark.createDataFrame(data, schema)\n", 49 | "dfo = df\n", 50 | "df.show()" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 14, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "+--------------------+\n", 63 | "| target|\n", 64 | "+--------------------+\n", 65 | "|[X, A, D, O, W, P...|\n", 66 | "+--------------------+\n", 67 | "\n", 68 | "+--------------------+\n", 69 | "| target|\n", 70 | "+--------------------+\n", 71 | "|[, A, D, O, P, Q,...|\n", 72 | "+--------------------+\n", 73 | "\n", 74 | "+-------+\n", 75 | "| target|\n", 76 | "+-------+\n", 77 | "|ADOPQWX|\n", 78 | "+-------+\n", 79 | "\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "# First, Transform target to array. 
Let's using another field\n", 85 | "df = df.withColumn('target', F.split('target',''))\n", 86 | "df.show()\n", 87 | "\n", 88 | "# Second, sort the array\n", 89 | "df = df.withColumn('target', F.array_sort('target'))\n", 90 | "df.show()\n", 91 | "\n", 92 | "# Third, convert to string again\n", 93 | "df = df.withColumn('target', F.concat_ws('', 'target'))\n", 94 | "df.show()" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": "Python 3 (ipykernel)", 108 | "language": "python", 109 | "name": "python3" 110 | }, 111 | "language_info": { 112 | "codemirror_mode": { 113 | "name": "ipython", 114 | "version": 3 115 | }, 116 | "file_extension": ".py", 117 | "mimetype": "text/x-python", 118 | "name": "python", 119 | "nbconvert_exporter": "python", 120 | "pygments_lexer": "ipython3", 121 | "version": "3.10.12" 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 4 126 | } 127 | -------------------------------------------------------------------------------- /dataframes/.ipynb_checkpoints/sort_string_using_sorted_array-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pyspark\n", 10 | "import pyspark.sql.functions as F\n", 11 | "from pyspark.sql import SparkSession\n", 12 | "from pyspark.sql.types import StructType, StructField, StringType" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 5, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "spark = SparkSession.builder.appName('strings_lists').enableHiveSupport().getOrCreate()" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 15, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "+-------+\n", 34 | "| target|\n", 35 | "+-------+\n", 36 | "|XADOWPQ|\n", 37 | "+-------+\n", 38 | "\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "# Creating a basic schema\n", 44 | "schema = StructType([\n", 45 | " StructField('target', StringType(), True)\n", 46 | "])\n", 47 | "data = [['XADOWPQ']]\n", 48 | "df = spark.createDataFrame(data, schema)\n", 49 | "dfo = df\n", 50 | "df.show()" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 14, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "+--------------------+\n", 63 | "| target|\n", 64 | "+--------------------+\n", 65 | "|[X, A, D, O, W, P...|\n", 66 | "+--------------------+\n", 67 | "\n", 68 | "+--------------------+\n", 69 | "| target|\n", 70 | "+--------------------+\n", 71 | "|[, A, D, O, P, Q,...|\n", 72 | "+--------------------+\n", 73 | "\n", 74 | "+-------+\n", 75 | "| target|\n", 76 | "+-------+\n", 77 | "|ADOPQWX|\n", 78 | "+-------+\n", 79 | "\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "# First, Transform target to array. 
Let's using another field\n", 85 | "df = df.withColumn('target', F.split('target',''))\n", 86 | "df.show()\n", 87 | "\n", 88 | "# Second, sort the array\n", 89 | "df = df.withColumn('target', F.array_sort('target'))\n", 90 | "df.show()\n", 91 | "\n", 92 | "# Third, convert to string again\n", 93 | "df = df.withColumn('target', F.concat_ws('', 'target'))\n", 94 | "df.show()" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": "Python 3", 108 | "language": "python", 109 | "name": "python3" 110 | }, 111 | "language_info": { 112 | "codemirror_mode": { 113 | "name": "ipython", 114 | "version": 3 115 | }, 116 | "file_extension": ".py", 117 | "mimetype": "text/x-python", 118 | "name": "python", 119 | "nbconvert_exporter": "python", 120 | "pygments_lexer": "ipython3", 121 | "version": "3.8.5" 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 4 126 | } 127 | -------------------------------------------------------------------------------- /rdd/rdd.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 8, 6 | "id": "88896fe3-993c-4e73-8f32-0dd22ef38ea5", 7 | "metadata": { 8 | "slideshow": { 9 | "slide_type": "" 10 | }, 11 | "tags": [] 12 | }, 13 | "outputs": [], 14 | "source": [ 15 | "import pyspark\n", 16 | "from pyspark import SparkContext\n", 17 | "from operator import add" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 9, 23 | "id": "e44ca0de-3148-40f2-9f3d-f0665a9fca94", 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/html": [ 29 | "\n", 30 | "
\n", 31 | "

SparkContext

\n", 32 | "\n", 33 | "

Spark UI

\n", 34 | "\n", 35 | "
\n", 36 | "
Version
\n", 37 | "
v3.2.4
\n", 38 | "
Master
\n", 39 | "
local[*]
\n", 40 | "
AppName
\n", 41 | "
pyspark-shell
\n", 42 | "
\n", 43 | "
\n", 44 | " " 45 | ], 46 | "text/plain": [ 47 | "" 48 | ] 49 | }, 50 | "execution_count": 9, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "sc = SparkContext.getOrCreate()\n", 57 | "sc" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 10, 63 | "id": "40f807e5-6ace-4c5c-aaf2-820426e73645", 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "[1, 2, 3, 4, 5]\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "collect_rdd = sc.parallelize([1,2,3,4,5])\n", 76 | "print(collect_rdd.collect())" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 11, 82 | "id": "8bf68939-cba5-4e3e-a868-a8d0ae82b5a4", 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "10\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "# Counting\n", 95 | "count_rdd = sc.parallelize([1,2,3,4,5,6,7,8,9,0])\n", 96 | "print(count_rdd.count())" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 12, 102 | "id": "9199423b-c9e4-4ade-90e3-506260cfceb7", 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "45\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "# reduce - immediate operations\n", 115 | "data_rdd = sc.parallelize([1,2,3,4,5,6,7,8,9,0])\n", 116 | "result_rdd = data_rdd.reduce(lambda x,y: x + y)\n", 117 | "print(result_rdd)\n" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 13, 123 | "id": "6bb70ade-fe11-4cd2-a092-4529af35f61f", 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "[('awesome', 1), ('Spark', 1), ('really', 2), ('is', 1)]\n" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "# word counter\n", 136 | "s = \"Spark is really really awesome!\"\n", 137 | "rdd = sc.parallelize([s])\n", 138 | "counts = rdd.flatMap(lambda line: line.split(\" \")) \\\n", 139 | " .map(lambda line: line.replace(\"!\",\"\")) \\\n", 140 | " .map(lambda word: (word, 1)) \\\n", 141 | " .reduceByKey(lambda x, y: x + y) \\\n", 142 | " .collect()\n", 143 | "print(str(counts))" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 7, 149 | "id": "c064273f-1229-4d81-a37e-8a9674c55eda", 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/plain": [ 155 | "[('Spark', 1), ('really', 2), ('awesome!', 1), ('is', 1)]" 156 | ] 157 | }, 158 | "execution_count": 7, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "s = \"Spark is really really awesome!\"\n", 165 | "sc.parallelize(s.split(\" \"))\\\n", 166 | ".map(lambda x:(x, 1))\\\n", 167 | ".reduceByKey(add).collect()" 168 | ] 169 | } 170 | ], 171 | "metadata": { 172 | "kernelspec": { 173 | "display_name": "Python 3 (ipykernel)", 174 | "language": "python", 175 | "name": "python3" 176 | }, 177 | "language_info": { 178 | "codemirror_mode": { 179 | "name": "ipython", 180 | "version": 3 181 | }, 182 | "file_extension": ".py", 183 | "mimetype": "text/x-python", 184 | "name": "python", 185 | "nbconvert_exporter": "python", 186 | "pygments_lexer": "ipython3", 187 | "version": "3.10.12" 188 | } 189 | }, 190 | "nbformat": 4, 191 | "nbformat_minor": 5 192 | } 193 | -------------------------------------------------------------------------------- /random-examples/word_counter.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 10, 6 | "id": "b4be2271-1e6e-40f2-a2a2-e3dcbf68dbb8", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pyspark\n", 11 | "import pyspark.sql.functions as F\n", 12 | "from pyspark.sql import SparkSession\n", 13 | "from pyspark.sql.types import StructType, StructField, StringType, IntegerType, MapType, ArrayType\n", 14 | "from operator import add" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 11, 20 | "id": "3b83f99a-7763-487e-b4e3-645d9fe930a6", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "spark = SparkSession.builder.master('local').appName('word_counter').getOrCreate()" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "b23bf8a9-e8e4-49cd-89fc-3ffd80d35fbd", 30 | "metadata": {}, 31 | "source": [ 32 | "### Dataframe way\n", 33 | "\n", 34 | " 1. Create the dataframe with one column and call it 'word'\n", 35 | " 2. Use F.split and get one column with all words separated in a list\n", 36 | " 3. Use F.explode to transform each item in the list to a row\n", 37 | " 4. Group by 'word' and aggregate using 'count' function\n", 38 | "\n", 39 | " " 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 12, 45 | "id": "2d1bde9c-05eb-4f94-a0f9-ce7cc421b085", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# Input\n", 50 | "s = [\"Spark is totally totally awesome!\"]" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 13, 56 | "id": "de353bf4-1301-458e-a6be-212d4ae73d9d", 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "+-------+-----------+\n", 64 | "| word|count(word)|\n", 65 | "+-------+-----------+\n", 66 | "|totally| 2|\n", 67 | "| is| 1|\n", 68 | "| Spark| 1|\n", 69 | "|awesome| 1|\n", 70 | "+-------+-----------+\n", 71 | "\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "words_df = spark.createDataFrame([s], ['word'])\\\n", 77 | " .withColumn('word', F.explode(F.split(F.col('word'), ' ')))\\\n", 78 | " .groupBy('word').agg(F.count('word'))\\\n", 79 | " .withColumn('word', F.regexp_replace(F.col('word'), r\"^(.*)[\\!@#\\$%&*\\(\\)_\\-\\+\\=]+(.*)$\", \"$1$2\"))\\\n", 80 | " .show()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "155fdb25-4711-4095-ac7f-b9c84099bcb6", 86 | "metadata": {}, 87 | "source": [ 88 | "### Dataframe + SQL\n", 89 | "\n", 90 | "1. Create a dataframe and a view from it\n", 91 | "2. Split and explode exactly as before\n", 92 | "3. Create a temporary view\n", 93 | "4. 
Count and group using SQL\n" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 14, 99 | "id": "7ec2679c-1afd-47ac-99a8-70aa61baf43a", 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "# Create a data frame and a view\n", 104 | "s = [\"Spark is really really awesome!\"]\n", 105 | "lines_df = spark.createDataFrame([s], ['word'])\\\n", 106 | " .withColumn('word', F.explode(F.split(F.col('word'), ' ')))\n", 107 | "lines_df.createOrReplaceTempView('lines')" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 15, 113 | "id": "9a292827-710b-4231-9eda-3434cac14ec5", 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "+--------+-----------+\n", 121 | "| word|count(word)|\n", 122 | "+--------+-----------+\n", 123 | "| is| 1|\n", 124 | "| really| 2|\n", 125 | "| Spark| 1|\n", 126 | "|awesome!| 1|\n", 127 | "+--------+-----------+\n", 128 | "\n" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "# Select data from the view simply using Spark SQL\n", 134 | "spark.sql(\"\"\"select word, count(word) from lines group by word\"\"\").show()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "cc5800a1-42bb-4bf9-9696-2dd830315319", 140 | "metadata": {}, 141 | "source": [ 142 | "### RDD way" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 16, 148 | "id": "bc918308-4d4a-436c-90ed-0483f0cc5b56", 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "[('Spark', 1), ('is', 1), ('really', 2), ('awesome!', 1)]" 155 | ] 156 | }, 157 | "execution_count": 16, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "# If you like map/reduce crap, go ahead!\n", 164 | "s = \"Spark is really really awesome!\"\n", 165 | "spark.sparkContext\\\n", 166 | " .parallelize(s.split()).map(lambda x:(x, 1))\\\n", 167 | " .reduceByKey(add).collect()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "id": "e82e9b12-7fbd-42d1-9492-be5cdef9dfdf", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [] 177 | } 178 | ], 179 | "metadata": { 180 | "kernelspec": { 181 | "display_name": "Python 3 (ipykernel)", 182 | "language": "python", 183 | "name": "python3" 184 | }, 185 | "language_info": { 186 | "codemirror_mode": { 187 | "name": "ipython", 188 | "version": 3 189 | }, 190 | "file_extension": ".py", 191 | "mimetype": "text/x-python", 192 | "name": "python", 193 | "nbconvert_exporter": "python", 194 | "pygments_lexer": "ipython3", 195 | "version": "3.10.12" 196 | } 197 | }, 198 | "nbformat": 4, 199 | "nbformat_minor": 5 200 | } 201 | -------------------------------------------------------------------------------- /dataframes/dataframe_from_nothing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Dataframes from \"nothing\"" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pyspark\n", 17 | "from pyspark.sql import SparkSession\n", 18 | "from pyspark.sql.types import StructType, StructField, StringType" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "spark = SparkSession.builder.appName('basic').getOrCreate()" 28 | 
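One more way to build a DataFrame "from nothing" that the cells below do not cover — from `Row` objects, where column names are taken from the `Row` fields. A minimal sketch using the same illustrative values as the rest of the notebook:

```python
from pyspark.sql import Row

rows = [Row(colA="This", colB="is only", colC="a test!"),
        Row(colA="And this", colB="is", colC="too")]

# Column names come from the Row field names, no separate schema needed
spark.createDataFrame(rows).show()
```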
] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Creating a empty dataframe" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "### Parallelize way" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "+--------+-------+-------+\n", 54 | "| _1| _2| _3|\n", 55 | "+--------+-------+-------+\n", 56 | "| This|is only|a test!|\n", 57 | "|And this| is| too|\n", 58 | "+--------+-------+-------+\n", 59 | "\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "# Define your data by a set of data. Each data is a Row!\n", 65 | "data = [(\"This\",\"is only\", \"a test!\"),(\"And this\",\"is\",\"too\")]\n", 66 | "\n", 67 | "# Paralellize data\n", 68 | "spark.sparkContext.parallelize(data).toDF().show()\n" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | "+--------+-------+-------+\n", 81 | "| colA| colB| colC|\n", 82 | "+--------+-------+-------+\n", 83 | "| This|is only|a test!|\n", 84 | "|And this| is| too|\n", 85 | "+--------+-------+-------+\n", 86 | "\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "# Defining column names\n", 92 | "columns = [\"colA\", \"colB\", \"colC\"]\n", 93 | "spark.sparkContext.parallelize(data).toDF(columns).show()" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "### createDataFrame way" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 5, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "++\n", 113 | "||\n", 114 | "++\n", 115 | "++\n", 116 | "\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "# First it's required unleast a empty schema\n", 122 | "schema = StructType([])\n", 123 | "\n", 124 | "# Now, an empty and useless dataframe\n", 125 | "spark.createDataFrame(spark.sparkContext.emptyRDD(),schema).show()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 11, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "name": "stdout", 135 | "output_type": "stream", 136 | "text": [ 137 | "+--------+-------+-------+\n", 138 | "| colA| colB| colC|\n", 139 | "+--------+-------+-------+\n", 140 | "| This|is only|a test!|\n", 141 | "|And this| is| too|\n", 142 | "+--------+-------+-------+\n", 143 | "\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "# Schema for previous data\n", 149 | "schema = StructType([\n", 150 | " StructField(\"colA\",StringType(),False),\n", 151 | " StructField(\"colB\",StringType(),False),\n", 152 | " StructField(\"colC\",StringType(),False)\n", 153 | " ])\n", 154 | "# Creating DataFrame\n", 155 | "spark.createDataFrame(data,schema).show()\n" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 12, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "name": "stdout", 165 | "output_type": "stream", 166 | "text": [ 167 | "+--------+-------+------+\n", 168 | "| colA| colB| colC|\n", 169 | "+--------+-------+------+\n", 170 | "| This|is only|a test|\n", 171 | "|And this| is| too|\n", 172 | "+--------+-------+------+\n", 173 | "\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "# Minimalist\n", 179 | "df = spark.createDataFrame([[\"This\",\"is only\",\"a test\"],[\"And 
this\",\"is\",\"too\"]],schema)\n", 180 | "df.show()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "## Dataframes from CSV" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 13, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "+--------+-------+------+\n", 200 | "| colA| colB| colC|\n", 201 | "+--------+-------+------+\n", 202 | "| This|is only|a test|\n", 203 | "|And this| is| too|\n", 204 | "+--------+-------+------+\n", 205 | "\n" 206 | ] 207 | } 208 | ], 209 | "source": [ 210 | "# Let's create a file first from the dataframe stored on 'df' var!\n", 211 | "file='/users/hduser/spark-ref/dataframes/examples/example-001.csv'\n", 212 | "df.write.csv(file, sep=',', header=True)\n", 213 | "\n", 214 | "\n", 215 | "# Reading from CSV\n", 216 | "spark.read.csv(file, \n", 217 | " sep=',',\n", 218 | " encoding='utf-8',\n", 219 | " header=True).show()" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [] 228 | } 229 | ], 230 | "metadata": { 231 | "kernelspec": { 232 | "display_name": "Python 3 (ipykernel)", 233 | "language": "python", 234 | "name": "python3" 235 | }, 236 | "language_info": { 237 | "codemirror_mode": { 238 | "name": "ipython", 239 | "version": 3 240 | }, 241 | "file_extension": ".py", 242 | "mimetype": "text/x-python", 243 | "name": "python", 244 | "nbconvert_exporter": "python", 245 | "pygments_lexer": "ipython3", 246 | "version": "3.8.5" 247 | } 248 | }, 249 | "nbformat": 4, 250 | "nbformat_minor": 2 251 | } 252 | -------------------------------------------------------------------------------- /dataframes/.ipynb_checkpoints/dataframe_from_nothing-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Dataframes from \"nothing\"" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pyspark\n", 17 | "from pyspark.sql import SparkSession\n", 18 | "from pyspark.sql.types import StructType, StructField, StringType" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "spark = SparkSession.builder.appName('basic').getOrCreate()" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Creating a empty dataframe" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "### Parallelize way" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "+--------+-------+-------+\n", 54 | "| _1| _2| _3|\n", 55 | "+--------+-------+-------+\n", 56 | "| This|is only|a test!|\n", 57 | "|And this| is| too|\n", 58 | "+--------+-------+-------+\n", 59 | "\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "# Define your data by a set of data. 
Each data is a Row!\n", 65 | "data = [(\"This\",\"is only\", \"a test!\"),(\"And this\",\"is\",\"too\")]\n", 66 | "\n", 67 | "# Paralellize data\n", 68 | "spark.sparkContext.parallelize(data).toDF().show()\n" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | "+--------+-------+-------+\n", 81 | "| colA| colB| colC|\n", 82 | "+--------+-------+-------+\n", 83 | "| This|is only|a test!|\n", 84 | "|And this| is| too|\n", 85 | "+--------+-------+-------+\n", 86 | "\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "# Defining column names\n", 92 | "columns = [\"colA\", \"colB\", \"colC\"]\n", 93 | "spark.sparkContext.parallelize(data).toDF(columns).show()" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "### createDataFrame way" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 5, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "++\n", 113 | "||\n", 114 | "++\n", 115 | "++\n", 116 | "\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "# First it's required unleast a empty schema\n", 122 | "schema = StructType([])\n", 123 | "\n", 124 | "# Now, an empty and useless dataframe\n", 125 | "spark.createDataFrame(spark.sparkContext.emptyRDD(),schema).show()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 11, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "name": "stdout", 135 | "output_type": "stream", 136 | "text": [ 137 | "+--------+-------+-------+\n", 138 | "| colA| colB| colC|\n", 139 | "+--------+-------+-------+\n", 140 | "| This|is only|a test!|\n", 141 | "|And this| is| too|\n", 142 | "+--------+-------+-------+\n", 143 | "\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "# Schema for previous data\n", 149 | "schema = StructType([\n", 150 | " StructField(\"colA\",StringType(),False),\n", 151 | " StructField(\"colB\",StringType(),False),\n", 152 | " StructField(\"colC\",StringType(),False)\n", 153 | " ])\n", 154 | "# Creating DataFrame\n", 155 | "spark.createDataFrame(data,schema).show()\n" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 12, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "name": "stdout", 165 | "output_type": "stream", 166 | "text": [ 167 | "+--------+-------+------+\n", 168 | "| colA| colB| colC|\n", 169 | "+--------+-------+------+\n", 170 | "| This|is only|a test|\n", 171 | "|And this| is| too|\n", 172 | "+--------+-------+------+\n", 173 | "\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "# Minimalist\n", 179 | "df = spark.createDataFrame([[\"This\",\"is only\",\"a test\"],[\"And this\",\"is\",\"too\"]],schema)\n", 180 | "df.show()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "## Dataframes from CSV" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 13, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "+--------+-------+------+\n", 200 | "| colA| colB| colC|\n", 201 | "+--------+-------+------+\n", 202 | "| This|is only|a test|\n", 203 | "|And this| is| too|\n", 204 | "+--------+-------+------+\n", 205 | "\n" 206 | ] 207 | } 208 | ], 209 | "source": [ 210 | "# Let's create a file first from the dataframe stored on 'df' var!\n", 211 | 
"file='/users/hduser/spark-ref/dataframes/examples/example-001.csv'\n", 212 | "df.write.csv(file, sep=',', header=True)\n", 213 | "\n", 214 | "\n", 215 | "# Reading from CSV\n", 216 | "spark.read.csv(file, \n", 217 | " sep=',',\n", 218 | " encoding='utf-8',\n", 219 | " header=True).show()" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [] 228 | } 229 | ], 230 | "metadata": { 231 | "kernelspec": { 232 | "display_name": "Python 3 (ipykernel)", 233 | "language": "python", 234 | "name": "python3" 235 | }, 236 | "language_info": { 237 | "codemirror_mode": { 238 | "name": "ipython", 239 | "version": 3 240 | }, 241 | "file_extension": ".py", 242 | "mimetype": "text/x-python", 243 | "name": "python", 244 | "nbconvert_exporter": "python", 245 | "pygments_lexer": "ipython3", 246 | "version": "3.8.5" 247 | } 248 | }, 249 | "nbformat": 4, 250 | "nbformat_minor": 2 251 | } 252 | -------------------------------------------------------------------------------- /dataframes/not_exits_and_exists_equivalent-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## \"NOT EXISTS\" AND \"EXISTS\" equivalent operations on dataframes\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pyspark.sql.functions as F\n", 17 | "from pyspark.sql.types import *\n", 18 | "from pyspark.sql import SparkSession" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "spark = SparkSession.builder.appName('test_dataframes').enableHiveSupport().getOrCreate()" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Generating data" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 35, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "+---+-----+\n", 47 | "| id| name|\n", 48 | "+---+-----+\n", 49 | "| 1|Andre|\n", 50 | "| 2| Rose|\n", 51 | "+---+-----+\n", 52 | "\n", 53 | "+---+------+\n", 54 | "| id| name|\n", 55 | "+---+------+\n", 56 | "| 1| Andre|\n", 57 | "| 2| Rose|\n", 58 | "| 3|Daniel|\n", 59 | "+---+------+\n", 60 | "\n", 61 | "+---+------+\n", 62 | "| id| name|\n", 63 | "+---+------+\n", 64 | "| 1| Andre|\n", 65 | "| 2| Rose|\n", 66 | "| 3|Daniel|\n", 67 | "| 3|Daniel|\n", 68 | "| 4| Anita|\n", 69 | "+---+------+\n", 70 | "\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "data1 = [(1,\"Andre\"),(2,\"Rose\")]\n", 76 | "data2 = [(1,\"Andre\"),(2,\"Rose\"),(3,\"Daniel\")]\n", 77 | "data3 = [(1,\"Andre\"),(2,\"Rose\"),(3,\"Daniel\"), (3,\"Daniel\"), (4,\"Anita\")]\n", 78 | "schema = StructType([\n", 79 | " StructField(\"id\",StringType(),True),\n", 80 | " StructField(\"name\",StringType(),True),\n", 81 | "])\n", 82 | "\n", 83 | "df1 = spark.createDataFrame(data1,schema)\n", 84 | "df2 = spark.createDataFrame(data2,schema)\n", 85 | "df3 = spark.createDataFrame(data3,schema)\n", 86 | "df1.show()\n", 87 | "df2.show()\n", 88 | "df3.show()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "## NOT EXISTS EQUIVALENT" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "### Method 1 - subtract" 103 | ] 104 
| }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 36, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "name": "stdout", 112 | "output_type": "stream", 113 | "text": [ 114 | "+---+------+\n", 115 | "| id| name|\n", 116 | "+---+------+\n", 117 | "| 3|Daniel|\n", 118 | "+---+------+\n", 119 | "\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "# All that exists in df2 but doesn't dexists in df1\n", 125 | "dfr = df2.subtract(df1)\n", 126 | "dfr.show()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 47, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "name": "stdout", 136 | "output_type": "stream", 137 | "text": [ 138 | "+---+------+\n", 139 | "| id| name|\n", 140 | "+---+------+\n", 141 | "| 3|Daniel|\n", 142 | "| 4| Anita|\n", 143 | "+---+------+\n", 144 | "\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "# All that exists in df3 but doesn't dexists in df1\n", 150 | "dfr2 = df3.subtract(df1)\n", 151 | "dfr2.show()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "### Method 2 - left_anti. This is the 'classical' way to have something equivalent to 'NOT EXISTS'" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 37, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "+---+------+\n", 171 | "| id| name|\n", 172 | "+---+------+\n", 173 | "| 3|Daniel|\n", 174 | "+---+------+\n", 175 | "\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "# All that exists in df2 but doesn't dexists in df1\n", 181 | "dfr = df2.join(df1,'id','left_anti')\n", 182 | "dfr.show()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 54, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "+---+------+\n", 195 | "| id| name|\n", 196 | "+---+------+\n", 197 | "| 3|Daniel|\n", 198 | "| 4| Anita|\n", 199 | "+---+------+\n", 200 | "\n" 201 | ] 202 | } 203 | ], 204 | "source": [ 205 | "# All that exists in df3 but doesn't dexists in df1\n", 206 | "dfr2 = df3.join(df1,'id','left_anti').dropDuplicates()\n", 207 | "dfr2.show()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "### Method 3 - exceptAll - same thing as 'subtract', apearently!" 
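A note on the "Method 3" heading above: `subtract` and `exceptAll` only look interchangeable here because the results are de-duplicated — `subtract` behaves like SQL `EXCEPT DISTINCT`, while `exceptAll` behaves like `EXCEPT ALL` and preserves duplicates. A short sketch using the dataframes already defined in this notebook (`df3` holds `(3, "Daniel")` twice):

```python
# Daniel appears once: subtract removes duplicate rows from the result
df3.subtract(df1).show()

# Daniel appears twice: exceptAll keeps the duplicated row
df3.exceptAll(df1).show()
```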
215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 51, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "name": "stdout", 224 | "output_type": "stream", 225 | "text": [ 226 | "+---+------+\n", 227 | "| id| name|\n", 228 | "+---+------+\n", 229 | "| 3|Daniel|\n", 230 | "+---+------+\n", 231 | "\n" 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "# All that exists in df2 but doesn't dexists in df1\n", 237 | "dfr = df2.exceptAll(df1)\n", 238 | "dfr.show()" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 55, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "name": "stdout", 248 | "output_type": "stream", 249 | "text": [ 250 | "+---+------+\n", 251 | "| id| name|\n", 252 | "+---+------+\n", 253 | "| 3|Daniel|\n", 254 | "| 4| Anita|\n", 255 | "+---+------+\n", 256 | "\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "# All that exists in df3 but doesn't dexists in df1\n", 262 | "dfr2 = df3.exceptAll(df1).dropDuplicates()\n", 263 | "dfr2.show()" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "## EXISTS EQUIVALENT" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "### Method 1 - left_semi" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 40, 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "name": "stdout", 287 | "output_type": "stream", 288 | "text": [ 289 | "+---+-----+\n", 290 | "| id| name|\n", 291 | "+---+-----+\n", 292 | "| 1|Andre|\n", 293 | "| 2| Rose|\n", 294 | "+---+-----+\n", 295 | "\n" 296 | ] 297 | } 298 | ], 299 | "source": [ 300 | "# Only that exists in df2 and df1\n", 301 | "dfr = df2.join(df1,'id','left_semi')\n", 302 | "dfr.show()" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 56, 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "name": "stdout", 312 | "output_type": "stream", 313 | "text": [ 314 | "+---+-----+\n", 315 | "| id| name|\n", 316 | "+---+-----+\n", 317 | "| 1|Andre|\n", 318 | "| 2| Rose|\n", 319 | "+---+-----+\n", 320 | "\n" 321 | ] 322 | } 323 | ], 324 | "source": [ 325 | "# Only that exists in df3 and df1\n", 326 | "dfr = df3.join(df1,'id','left_semi')\n", 327 | "dfr.show()" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [] 336 | } 337 | ], 338 | "metadata": { 339 | "kernelspec": { 340 | "display_name": "Python 3", 341 | "language": "python", 342 | "name": "python3" 343 | }, 344 | "language_info": { 345 | "codemirror_mode": { 346 | "name": "ipython", 347 | "version": 3 348 | }, 349 | "file_extension": ".py", 350 | "mimetype": "text/x-python", 351 | "name": "python", 352 | "nbconvert_exporter": "python", 353 | "pygments_lexer": "ipython3", 354 | "version": "3.8.5" 355 | } 356 | }, 357 | "nbformat": 4, 358 | "nbformat_minor": 4 359 | } 360 | -------------------------------------------------------------------------------- /sql/not_exits_and_exists_equivalent.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Spark SQL: \"NOT EXISTS\" AND \"EXISTS\" equivalent operations on dataframes\n", 8 | "\n" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 28, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import findspark\n", 18 | "findspark.init()\n", 19 | "import 
pyspark.sql.functions as F\n", 20 | "from pyspark.sql.types import *\n", 21 | "from pyspark.sql import SparkSession" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 29, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "spark = SparkSession.builder.appName('test_dataframes').enableHiveSupport().getOrCreate()" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Generating data" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 30, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "+---+-----+\n", 50 | "| id| name|\n", 51 | "+---+-----+\n", 52 | "| 1|Andre|\n", 53 | "| 2| Rose|\n", 54 | "+---+-----+\n", 55 | "\n", 56 | "+---+------+\n", 57 | "| id| name|\n", 58 | "+---+------+\n", 59 | "| 1| Andre|\n", 60 | "| 2| Rose|\n", 61 | "| 3|Daniel|\n", 62 | "+---+------+\n", 63 | "\n", 64 | "+---+------+\n", 65 | "| id| name|\n", 66 | "+---+------+\n", 67 | "| 1| Andre|\n", 68 | "| 2| Rose|\n", 69 | "| 3|Daniel|\n", 70 | "| 3|Daniel|\n", 71 | "| 4| Anita|\n", 72 | "+---+------+\n", 73 | "\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "data1 = [(1,\"Andre\"),(2,\"Rose\")]\n", 79 | "data2 = [(1,\"Andre\"),(2,\"Rose\"),(3,\"Daniel\")]\n", 80 | "data3 = [(1,\"Andre\"),(2,\"Rose\"),(3,\"Daniel\"), (3,\"Daniel\"), (4,\"Anita\")]\n", 81 | "schema = StructType([\n", 82 | " StructField(\"id\",StringType(),True),\n", 83 | " StructField(\"name\",StringType(),True),\n", 84 | "])\n", 85 | "\n", 86 | "df1 = spark.createDataFrame(data1,schema)\n", 87 | "df2 = spark.createDataFrame(data2,schema)\n", 88 | "df3 = spark.createDataFrame(data3,schema)\n", 89 | "df1.show()\n", 90 | "df2.show()\n", 91 | "df3.show()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "## NOT EXISTS EQUIVALENT" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "### Method 1 - subtract" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 31, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "+---+------+\n", 118 | "| id| name|\n", 119 | "+---+------+\n", 120 | "| 3|Daniel|\n", 121 | "+---+------+\n", 122 | "\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "# All that exists in df2 but doesn't dexists in df1\n", 128 | "dfr = df2.subtract(df1)\n", 129 | "dfr.show()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 32, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "name": "stderr", 139 | "output_type": "stream", 140 | "text": [ 141 | "\r", 142 | "[Stage 338:=================================================> (187 + 8) / 200]\r", 143 | "\r", 144 | " \r" 145 | ] 146 | }, 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "+---+------+\n", 152 | "| id| name|\n", 153 | "+---+------+\n", 154 | "| 3|Daniel|\n", 155 | "| 4| Anita|\n", 156 | "+---+------+\n", 157 | "\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "# All that exists in df3 but doesn't dexists in df1\n", 163 | "dfr2 = df3.subtract(df1)\n", 164 | "dfr2.show()" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "### Method 2 - left_anti. 
This is the 'classical' way to have something equivalent to 'NOT EXISTS'" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 33, 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "+---+------+\n", 184 | "| id| name|\n", 185 | "+---+------+\n", 186 | "| 3|Daniel|\n", 187 | "+---+------+\n", 188 | "\n" 189 | ] 190 | } 191 | ], 192 | "source": [ 193 | "# All that exists in df2 but doesn't dexists in df1\n", 194 | "dfr = df2.join(df1,'id','left_anti')\n", 195 | "dfr.show()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 34, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | "+---+------+\n", 208 | "| id| name|\n", 209 | "+---+------+\n", 210 | "| 3|Daniel|\n", 211 | "| 4| Anita|\n", 212 | "+---+------+\n", 213 | "\n" 214 | ] 215 | } 216 | ], 217 | "source": [ 218 | "# All that exists in df3 but doesn't dexists in df1\n", 219 | "dfr2 = df3.join(df1,'id','left_anti').dropDuplicates()\n", 220 | "dfr2.show()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "### Method 3 - exceptAll - same thing as 'subtract', apearently!" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 35, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "name": "stdout", 237 | "output_type": "stream", 238 | "text": [ 239 | "+---+------+\n", 240 | "| id| name|\n", 241 | "+---+------+\n", 242 | "| 3|Daniel|\n", 243 | "+---+------+\n", 244 | "\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "# All that exists in df2 but doesn't dexists in df1\n", 250 | "dfr = df2.exceptAll(df1)\n", 251 | "dfr.show()" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 36, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "name": "stdout", 261 | "output_type": "stream", 262 | "text": [ 263 | "+---+------+\n", 264 | "| id| name|\n", 265 | "+---+------+\n", 266 | "| 3|Daniel|\n", 267 | "| 4| Anita|\n", 268 | "+---+------+\n", 269 | "\n" 270 | ] 271 | } 272 | ], 273 | "source": [ 274 | "# All that exists in df3 but doesn't dexists in df1\n", 275 | "dfr2 = df3.exceptAll(df1).dropDuplicates()\n", 276 | "dfr2.show()" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "## EXISTS EQUIVALENT" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "### Method 1 - left_semi" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 15, 296 | "metadata": {}, 297 | "outputs": [ 298 | { 299 | "name": "stdout", 300 | "output_type": "stream", 301 | "text": [ 302 | "+---+-----+\n", 303 | "| id| name|\n", 304 | "+---+-----+\n", 305 | "| 1|Andre|\n", 306 | "| 2| Rose|\n", 307 | "+---+-----+\n", 308 | "\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "# Only that exists in df2 and df1\n", 314 | "dfr = df2.join(df1,'id','left_semi')\n", 315 | "dfr.show()" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 16, 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "name": "stdout", 325 | "output_type": "stream", 326 | "text": [ 327 | "+---+-----+\n", 328 | "| id| name|\n", 329 | "+---+-----+\n", 330 | "| 1|Andre|\n", 331 | "| 2| Rose|\n", 332 | "+---+-----+\n", 333 | "\n" 334 | ] 335 | } 336 | ], 337 | "source": [ 338 | "# Only that exists in df3 and df1\n", 339 | "dfr = 
df3.join(df1,'id','left_semi')\n", 340 | "dfr.show()" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [] 349 | } 350 | ], 351 | "metadata": { 352 | "kernelspec": { 353 | "display_name": "Python 3 (ipykernel)", 354 | "language": "python", 355 | "name": "python3" 356 | }, 357 | "language_info": { 358 | "codemirror_mode": { 359 | "name": "ipython", 360 | "version": 3 361 | }, 362 | "file_extension": ".py", 363 | "mimetype": "text/x-python", 364 | "name": "python", 365 | "nbconvert_exporter": "python", 366 | "pygments_lexer": "ipython3", 367 | "version": "3.10.12" 368 | } 369 | }, 370 | "nbformat": 4, 371 | "nbformat_minor": 4 372 | } 373 | -------------------------------------------------------------------------------- /dataframes/dataframe_from_json.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "6524d53e-b3d6-41f2-9d1e-c39f98d52e1d", 6 | "metadata": {}, 7 | "source": [ 8 | "# Dataframe from JSON\n" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 19, 14 | "id": "460e7835-3700-4d4a-a0aa-212e268c34c2", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pyspark\n", 19 | "import pyspark.sql.functions as F\n", 20 | "import pandas as pd\n", 21 | "import re\n", 22 | "from pyspark import SparkConf, SparkContext\n", 23 | "from pyspark.sql import SparkSession, Row\n", 24 | "from pyspark.sql.types import StructType, StructField, StringType, IntegerType, MapType, ArrayType\n", 25 | "from pyspark.sql.functions import udf\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 20, 31 | "id": "2e33eed2-c8f7-46f8-9943-cde7d53bc32a", 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "# Getting session from Spark\n", 36 | "spark = SparkSession.builder \\\n", 37 | ".appName('test').master('local[*]') \\\n", 38 | ".config(\"spark.cores.max\", \"2\") \\\n", 39 | ".config(\"spark.executor.memory\", \"2g\") \\\n", 40 | ".config(\"spark.sql.execution.arrow.pyspark.enabled\", \"true\") \\\n", 41 | ".config(\"spark.shuffle.service.enabled\", \"false\") \\\n", 42 | ".config(\"spark.dynamicAllocation.enabled\", \"true\") \\\n", 43 | ".getOrCreate()\n", 44 | "\n", 45 | "\n" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "id": "f927ff89-14fd-4527-837c-29c8d3e2892e", 51 | "metadata": {}, 52 | "source": [ 53 | "### A proper JSON format in file dataframes/data/test.js\n", 54 | "\n", 55 | "```json\n", 56 | "[\n", 57 | " {\n", 58 | " \"name\": \"Andre\",\n", 59 | " \"id\": 1,\n", 60 | " \"doc_list\":[{\"docid\":\"DOC001\", \"name\":\"bla001.txt\"}, {\"docid\":\"DOC002\", \"name\":\"bla002.txt\"}],\n", 61 | " },\n", 62 | "\n", 63 | " {\n", 64 | " \"name\": \"Noé\",\n", 65 | " \"id\": 2,\n", 66 | " \"doc_list\":[{\"docid\":\"DOC003\", \"name\":\"bla003.txt\"}, {\"docid\":\"DOC004\", \"name\":\"bla004.txt\"}],\n", 67 | " }\n", 68 | "]\n", 69 | "\n", 70 | "\n", 71 | "```\n", 72 | "\n" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "fcfe3e08-812a-49ec-9679-0716780b12f5", 78 | "metadata": {}, 79 | "source": [ 80 | "### Example problem\n", 81 | "\n", 82 | "Based on JSON structure in the file 'test.js'(shown above), extract the doc file names associated to the people's names as following: \n", 83 | "\n", 84 | "```text\n", 85 | "+-----+----------------------+\n", 86 | "|name |doc_names |\n", 87 | "+-----+----------------------+\n", 88 | 
"|Andre|bla001.txt, bla002.txt|\n", 89 | "|Noé |bla003.txt, bla004.txt|\n", 90 | "+-----+----------------------+\n", 91 | "```\n", 92 | "\n", 93 | "Note that, 'docid' is not desirable.\n", 94 | "\n" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 4, 100 | "id": "c287da29-db31-4a48-832c-c161072a2a46", 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "+-----+------------------------------------------------------------------------------+\n", 108 | "|name |doc_list |\n", 109 | "+-----+------------------------------------------------------------------------------+\n", 110 | "|Andre|[{docid -> DOC001, name -> bla001.txt}, {docid -> DOC002, name -> bla002.txt}]|\n", 111 | "|Noé |[{docid -> DOC003, name -> bla003.txt}, {docid -> DOC004, name -> bla004.txt}]|\n", 112 | "+-----+------------------------------------------------------------------------------+\n", 113 | "\n" 114 | ] 115 | }, 116 | { 117 | "name": "stderr", 118 | "output_type": "stream", 119 | "text": [ 120 | " " 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "# Defining schema\n", 126 | "schema = StructType([\n", 127 | " StructField(\"name\", StringType(), True),\n", 128 | " StructField(\"doc_list\", ArrayType(MapType(StringType(),StringType(),True),True), True),\n", 129 | "])\n", 130 | "# Reading JSON file using Dataframe API setting 'multiline' option as true\n", 131 | "sdf = spark.read.option(\"multiline\", \"true\").json('data/test.json', schema=schema)\n", 132 | "sdf.show(truncate=False)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "11cc02e6-f597-4aff-bcb2-588fb94d2426", 138 | "metadata": {}, 139 | "source": [ 140 | "### Transformations steps\n", 141 | "\n", 142 | "1. Transform the list in 'doc_list' column into various rows using the `explode` pyspark function;\n", 143 | "2. For each of those rows, extract 'name' from the data structure using `getItem` function and drop the original column 'doc_ex';\n", 144 | "3. Transform doc names into a list againd group rows by 'name' column and using the `collect_list` as the aggregate function;\n", 145 | "4. 
Transform the list into a string separated by ',' using `concat_ws`\n" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 5, 151 | "id": "bf90af12-0f3c-4823-a576-359f66f2dc5d", 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "Step 1\n", 159 | "+-----+------------------------------------------------------------------------------+-------------------------------------+\n", 160 | "|name |doc_list |doc_ex |\n", 161 | "+-----+------------------------------------------------------------------------------+-------------------------------------+\n", 162 | "|Andre|[{docid -> DOC001, name -> bla001.txt}, {docid -> DOC002, name -> bla002.txt}]|{docid -> DOC001, name -> bla001.txt}|\n", 163 | "|Andre|[{docid -> DOC001, name -> bla001.txt}, {docid -> DOC002, name -> bla002.txt}]|{docid -> DOC002, name -> bla002.txt}|\n", 164 | "|Noé |[{docid -> DOC003, name -> bla003.txt}, {docid -> DOC004, name -> bla004.txt}]|{docid -> DOC003, name -> bla003.txt}|\n", 165 | "|Noé |[{docid -> DOC003, name -> bla003.txt}, {docid -> DOC004, name -> bla004.txt}]|{docid -> DOC004, name -> bla004.txt}|\n", 166 | "+-----+------------------------------------------------------------------------------+-------------------------------------+\n", 167 | "\n", 168 | "Step 2\n", 169 | "+-----+------------------------------------------------------------------------------+----------+\n", 170 | "|name |doc_list |doc_name |\n", 171 | "+-----+------------------------------------------------------------------------------+----------+\n", 172 | "|Andre|[{docid -> DOC001, name -> bla001.txt}, {docid -> DOC002, name -> bla002.txt}]|bla001.txt|\n", 173 | "|Andre|[{docid -> DOC001, name -> bla001.txt}, {docid -> DOC002, name -> bla002.txt}]|bla002.txt|\n", 174 | "|Noé |[{docid -> DOC003, name -> bla003.txt}, {docid -> DOC004, name -> bla004.txt}]|bla003.txt|\n", 175 | "|Noé |[{docid -> DOC003, name -> bla003.txt}, {docid -> DOC004, name -> bla004.txt}]|bla004.txt|\n", 176 | "+-----+------------------------------------------------------------------------------+----------+\n", 177 | "\n", 178 | "Step 3\n", 179 | "+-----+------------------------+\n", 180 | "|name |doc_list |\n", 181 | "+-----+------------------------+\n", 182 | "|Noé |[bla003.txt, bla004.txt]|\n", 183 | "|Andre|[bla001.txt, bla002.txt]|\n", 184 | "+-----+------------------------+\n", 185 | "\n", 186 | "Step 4\n", 187 | "+-----+---------------------+\n", 188 | "|name |doc_names |\n", 189 | "+-----+---------------------+\n", 190 | "|Noé |bla003.txt,bla004.txt|\n", 191 | "|Andre|bla001.txt,bla002.txt|\n", 192 | "+-----+---------------------+\n", 193 | "\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "# Step 1\n", 199 | "adf = sdf.withColumn(\"doc_ex\", F.explode(\"doc_list\"))\n", 200 | "print(\"Step 1\")\n", 201 | "adf.show(truncate=False)\n", 202 | "# Step 2 - Extracting the value of interest. In this case, the names of documents.\n", 203 | "adf = adf.withColumn(\"doc_name\", adf.doc_ex.getItem(\"name\")).drop(\"doc_ex\")\n", 204 | "print(\"Step 2\")\n", 205 | "adf.show(truncate=False)\n", 206 | "# Step 3 - Time to revert the 'explode' effect. 
For this, let's group rows by name and use 'collect_list' as the aggregate function\n", 207 | "ndf = adf.groupBy(\"name\").agg(F.collect_list(\"doc_name\").alias('doc_list'))\n", 208 | "print(\"Step 3\")\n", 209 | "ndf.show(truncate=False)\n", 210 | "# Step 4 - Transforming the list into a single string separated by the ',' character.\n", 211 | "ndf = ndf.withColumn(\"doc_names\", F.concat_ws(\",\", \"doc_list\")).drop(\"doc_list\")\n", 212 | "print(\"Step 4\")\n", 213 | "ndf.show(truncate=False)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "id": "f6ba2ea4-eda9-4f53-924e-5f71e11f0899", 219 | "metadata": {}, 220 | "source": [ 221 | "### UDF (not recommended)\n", 222 | "User Defined Functions (UDFs) can be a way to parse information out of a column. In this case, the docs inside the JSON file arrive as a list of objects, which pySpark conveniently converts into Python data structures that look simpler to deal with. However, a UDF is not recommended in this particular scenario, because built-in Spark functions support the same operation and offer better optimization than a UDF. Besides, Python performance is not particularly good, especially if you have nested loops." 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 6, 228 | "id": "5a87b5f0-95cc-4d05-b742-a155f196a2aa", 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "@udf\n", 233 | "def extract_doc(data_list: list) -> str:\n", 234 | " n = list()\n", 235 | " for li in data_list:\n", 236 | " n += [v for k,v in li.items() if k == 'name']\n", 237 | "\n", 238 | " return ','.join(n)\n" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 7, 244 | "id": "57fee3c9-fede-4649-a07e-9935bb82fd72", 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "+-----+---------------------+\n", 252 | "|name |doc_names |\n", 253 | "+-----+---------------------+\n", 254 | "|Andre|bla001.txt,bla002.txt|\n", 255 | "|Noé |bla003.txt,bla004.txt|\n", 256 | "+-----+---------------------+\n", 257 | "\n" 258 | ] 259 | }, 260 | { 261 | "name": "stderr", 262 | "output_type": "stream", 263 | "text": [ 264 | " " 265 | ] 266 | } 267 | ], 268 | "source": [ 269 | "# Running the UDF 'extract_doc' and storing the result in a new column called 'doc_names'\n", 270 | "dfu = sdf.withColumn('doc_names', extract_doc(F.col('doc_list'))).select('name','doc_names')\n", 271 | "# Showing the result\n", 272 | "dfu.show(truncate=False)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "id": "56069cfc-2d38-4ef9-848f-d7116afc7d3e", 278 | "metadata": {}, 279 | "source": [ 280 | "### \"UDF\" using RDD (less recommended)\n", 281 | "This is the old way to handle UDF-like logic. The reasons not to do it are the same as above, and here it is even worse because you are dealing directly with the RDD and will not get any optimization for doing so. If you don't know how to optimize RDD operations by yourself, don't use it."
282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 8, 287 | "id": "db3daafa-8e25-44d4-bb9a-4170e10c054b", 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "def extract_doc_rdd(row):\n", 292 | " d = row.asDict()\n", 293 | " n = list()\n", 294 | " if 'doc_list' in d:\n", 295 | " for li in d['doc_list']:\n", 296 | " n += [v for k,v in li.items() if k == 'name']\n", 297 | "\n", 298 | " d['doc_names'] = ','.join(n)\n", 299 | "\n", 300 | " return Row(**d)\n", 301 | " \n" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 9, 307 | "id": "8aba685e-ab0c-4b05-ab63-a2d9a97ee41f", 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "name": "stderr", 312 | "output_type": "stream", 313 | "text": [ 314 | " " 315 | ] 316 | }, 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "+-----+---------------------+\n", 322 | "|name |doc_names |\n", 323 | "+-----+---------------------+\n", 324 | "|Andre|bla001.txt,bla002.txt|\n", 325 | "|Noé |bla003.txt,bla004.txt|\n", 326 | "+-----+---------------------+\n", 327 | "\n" 328 | ] 329 | } 330 | ], 331 | "source": [ 332 | "# Executing 'extract_doc_rdd' using the map method of the rdd object\n", 333 | "rdd = sdf.rdd.map(extract_doc_rdd)\n", 334 | "# Converting back into a dataframe object\n", 335 | "edf = rdd.toDF().select('name','doc_names')\n", 336 | "# Showing the result\n", 337 | "edf.show(truncate=False)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "id": "4d11b47f-4fc2-43a6-9433-712f7207deff", 343 | "metadata": {}, 344 | "source": [ 345 | "### Reading complex JSON using a dynamic schema\n", 346 | "\n", 347 | "For cases where the JSON structure keeps varying all the time. Example in dataframes/data/json-varying.csv:\n", 348 | "\n", 349 | "```csv\n", 350 | "id,json_string\n", 351 | "1,'{\"name\": \"John Doe\", \"age\": 30}'\n", 352 | "2,'{\"city\": \"New York\", \"country\": \"USA\", \"zipcode\": \"10001\"}'\n", 353 | "3,'{\"product\": \"Laptop\", \"brand\": \"Dell\", \"specs\": {\"RAM\": \"16GB\", \"Storage\": \"512GB SSD\"}}'\n", 354 | "\n", 355 | "```\n", 356 | "\n", 357 | "Typically this kind of data comes from a column\n", 358 | "\n" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 16, 364 | "id": "7c1dc01a-2f41-4ffe-9ba8-c172048f9fe3", 365 | "metadata": {}, 366 | "outputs": [ 367 | { 368 | "name": "stdout", 369 | "output_type": "stream", 370 | "text": [ 371 | "+---+----------------------------------------------------------------------------------------+\n", 372 | "|id |json_string |\n", 373 | "+---+----------------------------------------------------------------------------------------+\n", 374 | "|1 |{\"name\": \"John Doe\", \"age\": 30} |\n", 375 | "|2 |{\"city\": \"New York\", \"country\": \"USA\", \"zipcode\": \"10001\"} |\n", 376 | "|3 |{\"product\": \"Laptop\", \"brand\": \"Dell\", \"specs\": {\"RAM\": \"16GB\", \"Storage\": \"512GB SSD\"}}|\n", 377 | "+---+----------------------------------------------------------------------------------------+\n", 378 | "\n" 379 | ] 380 | } 381 | ], 382 | "source": [ 383 | "# Or from a file.\n", 384 | "df = spark.read.csv('data/json-varying.csv', header=True, quote=\"'\")\n", 385 | "\n", 386 | "# If you're reading from a file, remove the quote char from the string. 
Otherwise, the parser will not be able to return an object\n", 387 | "df = df.withColumn('json_string', F.regexp_replace('json_string', r\"\\'\", \"\"))\n", 388 | "df.show(truncate=False)" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 14, 394 | "id": "0f2bf586-4804-4732-a14a-19c6b5f7fa83", 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "name": "stdout", 399 | "output_type": "stream", 400 | "text": [ 401 | "+---+----------------------------------------------------------------------------------------+---------------------------------------------------------------+\n", 402 | "|id |json_string |parsed |\n", 403 | "+---+----------------------------------------------------------------------------------------+---------------------------------------------------------------+\n", 404 | "|1 |{\"name\": \"John Doe\", \"age\": 30} |{30, null, null, null, John Doe, null, null, null} |\n", 405 | "|2 |{\"city\": \"New York\", \"country\": \"USA\", \"zipcode\": \"10001\"} |{null, null, New York, USA, null, null, null, 10001} |\n", 406 | "|3 |{\"product\": \"Laptop\", \"brand\": \"Dell\", \"specs\": {\"RAM\": \"16GB\", \"Storage\": \"512GB SSD\"}}|{null, Dell, null, null, null, Laptop, {16GB, 512GB SSD}, null}|\n", 407 | "+---+----------------------------------------------------------------------------------------+---------------------------------------------------------------+\n", 408 | "\n" 409 | ] 410 | } 411 | ], 412 | "source": [ 413 | "dynamic_schema = spark.read.json(df.rdd.map(lambda row: row['json_string'])).schema # Inferring one schema that covers every JSON string in the column\n", 414 | "jdf = df.withColumn(\"parsed\", F.from_json('json_string', dynamic_schema))\n", 415 | "jdf.show(truncate=False)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "id": "56e661c1-1d5e-4371-9f76-d8f9d39a73c4", 421 | "metadata": {}, 422 | "source": [ 423 | "# JSON from string" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 27, 429 | "id": "3dab31a6-256d-4d3a-846c-086dfc0b58fb", 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "json_str = \"\"\"[\n", 434 | " {\n", 435 | " \"name\":\"Andre\",\n", 436 | " \"doc_list\":[{\"docid\":\"DOC001\", \"name\":\"bla001.txt\"}, {\"docid\":\"DOC002\", \"name\":\"bla002.txt\"}]\n", 437 | " },\n", 438 | " {\n", 439 | " \"name\": \"Noe\",\n", 440 | " \"doc_list\":[{\"docid\":\"DOC002\", \"name\":\"bla002.txt\"}, {\"docid\":\"DOC003\", \"name\":\"bla003.txt\"}]\n", 441 | " }\n", 442 | "]\"\"\"\n" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 28, 448 | "id": "e9c6ba84-4ddc-4e06-b283-195833e33e72", 449 | "metadata": {}, 450 | "outputs": [ 451 | { 452 | "name": "stdout", 453 | "output_type": "stream", 454 | "text": [ 455 | "+-----+------------------------------------------------------------------------------+\n", 456 | "|name |doc_list |\n", 457 | "+-----+------------------------------------------------------------------------------+\n", 458 | "|Andre|[{docid -> DOC001, name -> bla001.txt}, {docid -> DOC002, name -> bla002.txt}]|\n", 459 | "|Noe |[{docid -> DOC002, name -> bla002.txt}, {docid -> DOC003, name -> bla003.txt}]|\n", 460 | "+-----+------------------------------------------------------------------------------+\n", 461 | "\n" 462 | ] 463 | } 464 | ], 465 | "source": [ 466 | "json_str=re.sub(r\"\\n\",\"\",json_str)\n", 467 | "sc = spark.sparkContext\n", 468 | "schema = StructType([\n", 469 | " StructField(\"name\", StringType(), True),\n", 470 | " StructField(\"doc_list\", ArrayType(MapType(StringType(), 
StringType())), True)\n", 471 | "])\n", 472 | "df = spark.read.json(sc.parallelize([json_str]), schema)\n", 473 | "df.show(truncate=False)" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 29, 479 | "id": "3e1a9042-65c5-4aea-a1ff-cf58b629ecd0", 480 | "metadata": {}, 481 | "outputs": [ 482 | { 483 | "name": "stdout", 484 | "output_type": "stream", 485 | "text": [ 486 | "+-----+---------------------+\n", 487 | "|name |doc_names |\n", 488 | "+-----+---------------------+\n", 489 | "|Noe |bla002.txt,bla003.txt|\n", 490 | "|Andre|bla001.txt,bla002.txt|\n", 491 | "+-----+---------------------+\n", 492 | "\n" 493 | ] 494 | } 495 | ], 496 | "source": [ 497 | "dfe = df.withColumn(\"item\", F.explode(\"doc_list\"))\n", 498 | "dfe = dfe.withColumn(\"doc\", dfe.item.getItem('name')).drop(\"item\") \\\n", 499 | " .groupBy('name').agg(F.collect_list(\"doc\").alias(\"doclist\")) \\\n", 500 | " .withColumn('doc_names', F.concat_ws(',','doclist')).drop(\"doclist\")\n", 501 | "dfe.show(truncate=False)" 502 | ] 503 | } 504 | ], 505 | "metadata": { 506 | "kernelspec": { 507 | "display_name": "Python 3 (ipykernel)", 508 | "language": "python", 509 | "name": "python3" 510 | }, 511 | "language_info": { 512 | "codemirror_mode": { 513 | "name": "ipython", 514 | "version": 3 515 | }, 516 | "file_extension": ".py", 517 | "mimetype": "text/x-python", 518 | "name": "python", 519 | "nbconvert_exporter": "python", 520 | "pygments_lexer": "ipython3", 521 | "version": "3.10.12" 522 | } 523 | }, 524 | "nbformat": 4, 525 | "nbformat_minor": 5 526 | } 527 | --------------------------------------------------------------------------------