├── dataframes
│   ├── .gitignore
│   ├── dataframe_from_mongodb.md
│   ├── dataframe_from_csv.ipynb
│   ├── .ipynb_checkpoints
│   │   ├── dataframe-from-csv-checkpoint.ipynb
│   │   ├── sort_string_using_sorted_array-checkpoint.ipynb
│   │   └── dataframe_from_nothing-checkpoint.ipynb
│   ├── dataframe_from_nothing.ipynb
│   ├── not_exits_and_exists_equivalent-checkpoint.ipynb
│   └── dataframe_from_json.ipynb
├── .idea
│   ├── .gitignore
│   ├── misc.xml
│   ├── vcs.xml
│   ├── inspectionProfiles
│   │   ├── profiles_settings.xml
│   │   └── Project_Default.xml
│   ├── modules.xml
│   └── spark-ref.iml
├── .gitignore
├── README.md
├── SECURITY.md
├── random-examples
│   ├── sort_string_using_sorted_array.ipynb
│   └── word_counter.ipynb
├── rdd
│   └── rdd.ipynb
└── sql
    └── not_exits_and_exists_equivalent.ipynb
/dataframes/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | ../*.swp
3 | .idea/
4 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/
2 | .idea/
3 | .bash_history
4 | .cache/
5 | .ipython/
6 | .jupyter/
7 | .local/
8 | .python_history
9 |
10 | .virtual_documents/
11 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # spark-ref
2 | pySpark references for developers
3 |
4 | This is a repository of pySpark references that developers can use day to day.
5 |
6 | The idea is to organize useful examples by theme.
7 |
8 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/spark-ref.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 |
3 | ## Supported Versions
4 |
5 | Use this section to tell people about which versions of your project are
6 | currently being supported with security updates.
7 |
8 | | Version | Supported |
9 | | ------- | ------------------ |
10 | | 5.1.x | :white_check_mark: |
11 | | 5.0.x | :x: |
12 | | 4.0.x | :white_check_mark: |
13 | | < 4.0 | :x: |
14 |
15 | ## Reporting a Vulnerability
16 |
17 | Use this section to tell people how to report a vulnerability.
18 |
19 | Tell them where to go, how often they can expect to get an update on a
20 | reported vulnerability, what to expect if the vulnerability is accepted or
21 | declined, etc.
22 |
--------------------------------------------------------------------------------
/dataframes/dataframe_from_mongodb.md:
--------------------------------------------------------------------------------
1 |
2 | ## Importing
3 | ```python
4 | import pyspark
5 | import pyspark.sql.functions as F
6 | from pyspark.sql import SparkSession
7 | from pyspark.sql.types import StructType,StructField,StringType,IntegerType
8 | ```
9 | ## Connection directly on SparkSession
10 | ```python
11 | spark = SparkSession \
12 |     .builder \
13 |     .appName("tgt-santander-ingestion") \
14 |     .config("spark.jars",
15 |             "jars/mongo-spark-connector_2.11-2.4.1.jar,"
16 |             "jars/mongo-java-driver-3.11.0-rc0.jar,"
17 |             "scala-library-2.11.12.jar") \
18 |     .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/test.fake") \
19 |     .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/test.fake") \
20 |     .config("spark.hadoop.hive.metastore.warehouse.dir", "/home/hduser/Projects/job-test/metastore_db") \
21 |     .config("spark.sql.warehouse.dir", "/user/hive/warehouse") \
22 |     .enableHiveSupport().getOrCreate()
23 |
24 | mongoDF = spark.read.format("mongo").load()
25 | ```
26 |
27 |
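28 | ## Reading and writing a specific collection
29 | 
30 | A minimal sketch, assuming the same connector and SparkSession configured above: the
31 | `database` and `collection` options let a job target a collection other than the one
32 | embedded in `spark.mongodb.input.uri` / `spark.mongodb.output.uri` (`test.fake` here).
33 | The variable `people_df` and the target collection `fake_copy` are only illustrations.
34 | 
35 | ```python
36 | # Read one collection into a DataFrame (overrides the input URI's database.collection)
37 | people_df = spark.read.format("mongo") \
38 |     .option("database", "test") \
39 |     .option("collection", "fake") \
40 |     .load()
41 | 
42 | # Append the rows to another (hypothetical) collection
43 | people_df.write.format("mongo") \
44 |     .mode("append") \
45 |     .option("database", "test") \
46 |     .option("collection", "fake_copy") \
47 |     .save()
48 | ```
49 | 
50 | If managing local jars is inconvenient, the same connector can instead be pulled from
51 | Maven with `.config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.11:2.4.1")`.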
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/dataframes/dataframe_from_csv.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 5,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pyspark\n",
10 | "from pyspark.sql import SparkSession\n"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 6,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "# Initializing Spark session\n",
20 | "spark = SparkSession.builder.appName('basic').getOrCreate()"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 7,
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "name": "stdout",
30 | "output_type": "stream",
31 | "text": [
32 | "+---+-----+---+\n",
33 | "| id| name|age|\n",
34 | "+---+-----+---+\n",
35 | "| 1|André| 41|\n",
36 | "| 2| João| 28|\n",
37 | "| 3|Maria| 29|\n",
38 | "+---+-----+---+\n",
39 | "\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "# Reading from HDFS\n",
45 | "spark.read.csv('/users/hduser/spark-ref/dataframes/examples/example-001.csv', \n",
46 | " sep=',',\n",
47 | " encoding='utf-8',\n",
48 | " header=True\n",
49 | " ).show()\n",
50 | "\n",
51 | "# If you want to load a local file, add the prefix 'file://' to path!"
52 | ]
53 | }
54 | ],
55 | "metadata": {
56 | "kernelspec": {
57 | "display_name": "Python 3 (ipykernel)",
58 | "language": "python",
59 | "name": "python3"
60 | },
61 | "language_info": {
62 | "codemirror_mode": {
63 | "name": "ipython",
64 | "version": 3
65 | },
66 | "file_extension": ".py",
67 | "mimetype": "text/x-python",
68 | "name": "python",
69 | "nbconvert_exporter": "python",
70 | "pygments_lexer": "ipython3",
71 | "version": "3.8.5"
72 | }
73 | },
74 | "nbformat": 4,
75 | "nbformat_minor": 2
76 | }
77 |
--------------------------------------------------------------------------------
/dataframes/.ipynb_checkpoints/dataframe-from-csv-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pyspark\n",
10 | "from pyspark.sql import SparkSession\n"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "# Initializing Spark session\n",
20 | "spark = SparkSession.builder.appName('basic').getOrCreate()"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "ename": "NameError",
30 | "evalue": "name 'spark' is not defined",
31 | "output_type": "error",
32 | "traceback": [
33 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
34 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
35 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Reading from HDFS\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m spark.read.csv('/users/hduser/spark-ref/dataframes/examples/example-001.csv', \n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0msep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m','\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'utf-8'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
36 | "\u001b[0;31mNameError\u001b[0m: name 'spark' is not defined"
37 | ]
38 | }
39 | ],
40 | "source": [
41 | "# Reading from HDFS\n",
42 | "spark.read.csv('/users/hduser/spark-ref/dataframes/examples/example-001.csv', \n",
43 | " sep=',',\n",
44 | " encoding='utf-8',\n",
45 | " header=True\n",
46 | " ).show()\n",
47 | "\n",
48 | "# If you want to load a local file, add the prefix 'file://' to path!"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": []
57 | }
58 | ],
59 | "metadata": {
60 | "kernelspec": {
61 | "display_name": "Python 3",
62 | "language": "python",
63 | "name": "python3"
64 | },
65 | "language_info": {
66 | "codemirror_mode": {
67 | "name": "ipython",
68 | "version": 3
69 | },
70 | "file_extension": ".py",
71 | "mimetype": "text/x-python",
72 | "name": "python",
73 | "nbconvert_exporter": "python",
74 | "pygments_lexer": "ipython3",
75 | "version": "3.6.9"
76 | }
77 | },
78 | "nbformat": 4,
79 | "nbformat_minor": 2
80 | }
81 |
--------------------------------------------------------------------------------
/random-examples/sort_string_using_sorted_array.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 4,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pyspark\n",
10 | "import pyspark.sql.functions as F\n",
11 | "from pyspark.sql import SparkSession\n",
12 | "from pyspark.sql.types import StructType, StructField, StringType"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 5,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "spark = SparkSession.builder.appName('strings_lists').enableHiveSupport().getOrCreate()"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 15,
27 | "metadata": {},
28 | "outputs": [
29 | {
30 | "name": "stdout",
31 | "output_type": "stream",
32 | "text": [
33 | "+-------+\n",
34 | "| target|\n",
35 | "+-------+\n",
36 | "|XADOWPQ|\n",
37 | "+-------+\n",
38 | "\n"
39 | ]
40 | }
41 | ],
42 | "source": [
43 | "# Creating a basic schema\n",
44 | "schema = StructType([\n",
45 | " StructField('target', StringType(), True)\n",
46 | "])\n",
47 | "data = [['XADOWPQ']]\n",
48 | "df = spark.createDataFrame(data, schema)\n",
49 | "dfo = df\n",
50 | "df.show()"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 14,
56 | "metadata": {},
57 | "outputs": [
58 | {
59 | "name": "stdout",
60 | "output_type": "stream",
61 | "text": [
62 | "+--------------------+\n",
63 | "| target|\n",
64 | "+--------------------+\n",
65 | "|[X, A, D, O, W, P...|\n",
66 | "+--------------------+\n",
67 | "\n",
68 | "+--------------------+\n",
69 | "| target|\n",
70 | "+--------------------+\n",
71 | "|[, A, D, O, P, Q,...|\n",
72 | "+--------------------+\n",
73 | "\n",
74 | "+-------+\n",
75 | "| target|\n",
76 | "+-------+\n",
77 | "|ADOPQWX|\n",
78 | "+-------+\n",
79 | "\n"
80 | ]
81 | }
82 | ],
83 | "source": [
84 |     "# First, transform 'target' into an array of characters\n",
85 | "df = df.withColumn('target', F.split('target',''))\n",
86 | "df.show()\n",
87 | "\n",
88 | "# Second, sort the array\n",
89 | "df = df.withColumn('target', F.array_sort('target'))\n",
90 | "df.show()\n",
91 | "\n",
92 | "# Third, convert to string again\n",
93 | "df = df.withColumn('target', F.concat_ws('', 'target'))\n",
94 | "df.show()"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": []
103 | }
104 | ],
105 | "metadata": {
106 | "kernelspec": {
107 | "display_name": "Python 3 (ipykernel)",
108 | "language": "python",
109 | "name": "python3"
110 | },
111 | "language_info": {
112 | "codemirror_mode": {
113 | "name": "ipython",
114 | "version": 3
115 | },
116 | "file_extension": ".py",
117 | "mimetype": "text/x-python",
118 | "name": "python",
119 | "nbconvert_exporter": "python",
120 | "pygments_lexer": "ipython3",
121 | "version": "3.10.12"
122 | }
123 | },
124 | "nbformat": 4,
125 | "nbformat_minor": 4
126 | }
127 |
--------------------------------------------------------------------------------
/dataframes/.ipynb_checkpoints/sort_string_using_sorted_array-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 4,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pyspark\n",
10 | "import pyspark.sql.functions as F\n",
11 | "from pyspark.sql import SparkSession\n",
12 | "from pyspark.sql.types import StructType, StructField, StringType"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 5,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "spark = SparkSession.builder.appName('strings_lists').enableHiveSupport().getOrCreate()"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 15,
27 | "metadata": {},
28 | "outputs": [
29 | {
30 | "name": "stdout",
31 | "output_type": "stream",
32 | "text": [
33 | "+-------+\n",
34 | "| target|\n",
35 | "+-------+\n",
36 | "|XADOWPQ|\n",
37 | "+-------+\n",
38 | "\n"
39 | ]
40 | }
41 | ],
42 | "source": [
43 | "# Creating a basic schema\n",
44 | "schema = StructType([\n",
45 | " StructField('target', StringType(), True)\n",
46 | "])\n",
47 | "data = [['XADOWPQ']]\n",
48 | "df = spark.createDataFrame(data, schema)\n",
49 | "dfo = df\n",
50 | "df.show()"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 14,
56 | "metadata": {},
57 | "outputs": [
58 | {
59 | "name": "stdout",
60 | "output_type": "stream",
61 | "text": [
62 | "+--------------------+\n",
63 | "| target|\n",
64 | "+--------------------+\n",
65 | "|[X, A, D, O, W, P...|\n",
66 | "+--------------------+\n",
67 | "\n",
68 | "+--------------------+\n",
69 | "| target|\n",
70 | "+--------------------+\n",
71 | "|[, A, D, O, P, Q,...|\n",
72 | "+--------------------+\n",
73 | "\n",
74 | "+-------+\n",
75 | "| target|\n",
76 | "+-------+\n",
77 | "|ADOPQWX|\n",
78 | "+-------+\n",
79 | "\n"
80 | ]
81 | }
82 | ],
83 | "source": [
84 |     "# First, transform 'target' into an array of characters\n",
85 | "df = df.withColumn('target', F.split('target',''))\n",
86 | "df.show()\n",
87 | "\n",
88 | "# Second, sort the array\n",
89 | "df = df.withColumn('target', F.array_sort('target'))\n",
90 | "df.show()\n",
91 | "\n",
92 | "# Third, convert to string again\n",
93 | "df = df.withColumn('target', F.concat_ws('', 'target'))\n",
94 | "df.show()"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": []
103 | }
104 | ],
105 | "metadata": {
106 | "kernelspec": {
107 | "display_name": "Python 3",
108 | "language": "python",
109 | "name": "python3"
110 | },
111 | "language_info": {
112 | "codemirror_mode": {
113 | "name": "ipython",
114 | "version": 3
115 | },
116 | "file_extension": ".py",
117 | "mimetype": "text/x-python",
118 | "name": "python",
119 | "nbconvert_exporter": "python",
120 | "pygments_lexer": "ipython3",
121 | "version": "3.8.5"
122 | }
123 | },
124 | "nbformat": 4,
125 | "nbformat_minor": 4
126 | }
127 |
--------------------------------------------------------------------------------
/rdd/rdd.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 8,
6 | "id": "88896fe3-993c-4e73-8f32-0dd22ef38ea5",
7 | "metadata": {
8 | "slideshow": {
9 | "slide_type": ""
10 | },
11 | "tags": []
12 | },
13 | "outputs": [],
14 | "source": [
15 | "import pyspark\n",
16 | "from pyspark import SparkContext\n",
17 | "from operator import add"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 9,
23 | "id": "e44ca0de-3148-40f2-9f3d-f0665a9fca94",
24 | "metadata": {},
25 | "outputs": [
26 | {
27 | "data": {
28 | "text/html": [
29 |         "SparkContext\n",
30 |         "\n",
31 |         "Spark UI\n",
32 |         "\n",
33 |         "Version: v3.2.4\n",
34 |         "Master: local[*]\n",
35 |         "AppName: pyspark-shell\n"
45 |        ],
46 |        "text/plain": [
47 |         "<SparkContext master=local[*] appName=pyspark-shell>"
48 | ]
49 | },
50 | "execution_count": 9,
51 | "metadata": {},
52 | "output_type": "execute_result"
53 | }
54 | ],
55 | "source": [
56 | "sc = SparkContext.getOrCreate()\n",
57 | "sc"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 10,
63 | "id": "40f807e5-6ace-4c5c-aaf2-820426e73645",
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "name": "stdout",
68 | "output_type": "stream",
69 | "text": [
70 | "[1, 2, 3, 4, 5]\n"
71 | ]
72 | }
73 | ],
74 | "source": [
75 | "collect_rdd = sc.parallelize([1,2,3,4,5])\n",
76 | "print(collect_rdd.collect())"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 11,
82 | "id": "8bf68939-cba5-4e3e-a868-a8d0ae82b5a4",
83 | "metadata": {},
84 | "outputs": [
85 | {
86 | "name": "stdout",
87 | "output_type": "stream",
88 | "text": [
89 | "10\n"
90 | ]
91 | }
92 | ],
93 | "source": [
94 | "# Counting\n",
95 | "count_rdd = sc.parallelize([1,2,3,4,5,6,7,8,9,0])\n",
96 | "print(count_rdd.count())"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 12,
102 | "id": "9199423b-c9e4-4ade-90e3-506260cfceb7",
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "name": "stdout",
107 | "output_type": "stream",
108 | "text": [
109 | "45\n"
110 | ]
111 | }
112 | ],
113 | "source": [
114 |     "# reduce - an action, so it executes immediately\n",
115 | "data_rdd = sc.parallelize([1,2,3,4,5,6,7,8,9,0])\n",
116 | "result_rdd = data_rdd.reduce(lambda x,y: x + y)\n",
117 | "print(result_rdd)\n"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 13,
123 | "id": "6bb70ade-fe11-4cd2-a092-4529af35f61f",
124 | "metadata": {},
125 | "outputs": [
126 | {
127 | "name": "stdout",
128 | "output_type": "stream",
129 | "text": [
130 | "[('awesome', 1), ('Spark', 1), ('really', 2), ('is', 1)]\n"
131 | ]
132 | }
133 | ],
134 | "source": [
135 | "# word counter\n",
136 | "s = \"Spark is really really awesome!\"\n",
137 | "rdd = sc.parallelize([s])\n",
138 | "counts = rdd.flatMap(lambda line: line.split(\" \")) \\\n",
139 | " .map(lambda line: line.replace(\"!\",\"\")) \\\n",
140 | " .map(lambda word: (word, 1)) \\\n",
141 | " .reduceByKey(lambda x, y: x + y) \\\n",
142 | " .collect()\n",
143 | "print(str(counts))"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 7,
149 | "id": "c064273f-1229-4d81-a37e-8a9674c55eda",
150 | "metadata": {},
151 | "outputs": [
152 | {
153 | "data": {
154 | "text/plain": [
155 | "[('Spark', 1), ('really', 2), ('awesome!', 1), ('is', 1)]"
156 | ]
157 | },
158 | "execution_count": 7,
159 | "metadata": {},
160 | "output_type": "execute_result"
161 | }
162 | ],
163 | "source": [
164 | "s = \"Spark is really really awesome!\"\n",
165 | "sc.parallelize(s.split(\" \"))\\\n",
166 | ".map(lambda x:(x, 1))\\\n",
167 | ".reduceByKey(add).collect()"
168 | ]
169 | }
170 | ],
171 | "metadata": {
172 | "kernelspec": {
173 | "display_name": "Python 3 (ipykernel)",
174 | "language": "python",
175 | "name": "python3"
176 | },
177 | "language_info": {
178 | "codemirror_mode": {
179 | "name": "ipython",
180 | "version": 3
181 | },
182 | "file_extension": ".py",
183 | "mimetype": "text/x-python",
184 | "name": "python",
185 | "nbconvert_exporter": "python",
186 | "pygments_lexer": "ipython3",
187 | "version": "3.10.12"
188 | }
189 | },
190 | "nbformat": 4,
191 | "nbformat_minor": 5
192 | }
193 |
--------------------------------------------------------------------------------
/random-examples/word_counter.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 10,
6 | "id": "b4be2271-1e6e-40f2-a2a2-e3dcbf68dbb8",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import pyspark\n",
11 | "import pyspark.sql.functions as F\n",
12 | "from pyspark.sql import SparkSession\n",
13 | "from pyspark.sql.types import StructType, StructField, StringType, IntegerType, MapType, ArrayType\n",
14 | "from operator import add"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 11,
20 | "id": "3b83f99a-7763-487e-b4e3-645d9fe930a6",
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "spark = SparkSession.builder.master('local').appName('word_counter').getOrCreate()"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "id": "b23bf8a9-e8e4-49cd-89fc-3ffd80d35fbd",
30 | "metadata": {},
31 | "source": [
32 | "### Dataframe way\n",
33 | "\n",
34 | " 1. Create the dataframe with one column and call it 'word'\n",
35 | " 2. Use F.split and get one column with all words separated in a list\n",
36 | " 3. Use F.explode to transform each item in the list to a row\n",
37 | " 4. Group by 'word' and aggregate using 'count' function\n",
38 | "\n",
39 | " "
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 12,
45 | "id": "2d1bde9c-05eb-4f94-a0f9-ce7cc421b085",
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "# Input\n",
50 | "s = [\"Spark is totally totally awesome!\"]"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 13,
56 | "id": "de353bf4-1301-458e-a6be-212d4ae73d9d",
57 | "metadata": {},
58 | "outputs": [
59 | {
60 | "name": "stdout",
61 | "output_type": "stream",
62 | "text": [
63 | "+-------+-----------+\n",
64 | "| word|count(word)|\n",
65 | "+-------+-----------+\n",
66 | "|totally| 2|\n",
67 | "| is| 1|\n",
68 | "| Spark| 1|\n",
69 | "|awesome| 1|\n",
70 | "+-------+-----------+\n",
71 | "\n"
72 | ]
73 | }
74 | ],
75 | "source": [
76 | "words_df = spark.createDataFrame([s], ['word'])\\\n",
77 | " .withColumn('word', F.explode(F.split(F.col('word'), ' ')))\\\n",
78 | " .groupBy('word').agg(F.count('word'))\\\n",
79 | " .withColumn('word', F.regexp_replace(F.col('word'), r\"^(.*)[\\!@#\\$%&*\\(\\)_\\-\\+\\=]+(.*)$\", \"$1$2\"))\\\n",
80 | " .show()"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "id": "155fdb25-4711-4095-ac7f-b9c84099bcb6",
86 | "metadata": {},
87 | "source": [
88 | "### Dataframe + SQL\n",
89 | "\n",
90 |     "1. Create a dataframe with one 'word' column\n",
91 | "2. Split and explode exactly as before\n",
92 | "3. Create a temporary view\n",
93 | "4. Count and group using SQL\n"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 14,
99 | "id": "7ec2679c-1afd-47ac-99a8-70aa61baf43a",
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "# Create a data frame and a view\n",
104 | "s = [\"Spark is really really awesome!\"]\n",
105 | "lines_df = spark.createDataFrame([s], ['word'])\\\n",
106 | " .withColumn('word', F.explode(F.split(F.col('word'), ' ')))\n",
107 | "lines_df.createOrReplaceTempView('lines')"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 15,
113 | "id": "9a292827-710b-4231-9eda-3434cac14ec5",
114 | "metadata": {},
115 | "outputs": [
116 | {
117 | "name": "stdout",
118 | "output_type": "stream",
119 | "text": [
120 | "+--------+-----------+\n",
121 | "| word|count(word)|\n",
122 | "+--------+-----------+\n",
123 | "| is| 1|\n",
124 | "| really| 2|\n",
125 | "| Spark| 1|\n",
126 | "|awesome!| 1|\n",
127 | "+--------+-----------+\n",
128 | "\n"
129 | ]
130 | }
131 | ],
132 | "source": [
133 | "# Select data from the view simply using Spark SQL\n",
134 | "spark.sql(\"\"\"select word, count(word) from lines group by word\"\"\").show()"
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "id": "cc5800a1-42bb-4bf9-9696-2dd830315319",
140 | "metadata": {},
141 | "source": [
142 | "### RDD way"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 16,
148 | "id": "bc918308-4d4a-436c-90ed-0483f0cc5b56",
149 | "metadata": {},
150 | "outputs": [
151 | {
152 | "data": {
153 | "text/plain": [
154 | "[('Spark', 1), ('is', 1), ('really', 2), ('awesome!', 1)]"
155 | ]
156 | },
157 | "execution_count": 16,
158 | "metadata": {},
159 | "output_type": "execute_result"
160 | }
161 | ],
162 | "source": [
163 | "# If you like map/reduce crap, go ahead!\n",
164 | "s = \"Spark is really really awesome!\"\n",
165 | "spark.sparkContext\\\n",
166 | " .parallelize(s.split()).map(lambda x:(x, 1))\\\n",
167 | " .reduceByKey(add).collect()"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "id": "e82e9b12-7fbd-42d1-9492-be5cdef9dfdf",
174 | "metadata": {},
175 | "outputs": [],
176 | "source": []
177 | }
178 | ],
179 | "metadata": {
180 | "kernelspec": {
181 | "display_name": "Python 3 (ipykernel)",
182 | "language": "python",
183 | "name": "python3"
184 | },
185 | "language_info": {
186 | "codemirror_mode": {
187 | "name": "ipython",
188 | "version": 3
189 | },
190 | "file_extension": ".py",
191 | "mimetype": "text/x-python",
192 | "name": "python",
193 | "nbconvert_exporter": "python",
194 | "pygments_lexer": "ipython3",
195 | "version": "3.10.12"
196 | }
197 | },
198 | "nbformat": 4,
199 | "nbformat_minor": 5
200 | }
201 |
--------------------------------------------------------------------------------
/dataframes/dataframe_from_nothing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Dataframes from \"nothing\""
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pyspark\n",
17 | "from pyspark.sql import SparkSession\n",
18 | "from pyspark.sql.types import StructType, StructField, StringType"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 2,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "spark = SparkSession.builder.appName('basic').getOrCreate()"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 |     "## Creating an empty dataframe"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "### Parallelize way"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 3,
47 | "metadata": {},
48 | "outputs": [
49 | {
50 | "name": "stdout",
51 | "output_type": "stream",
52 | "text": [
53 | "+--------+-------+-------+\n",
54 | "| _1| _2| _3|\n",
55 | "+--------+-------+-------+\n",
56 | "| This|is only|a test!|\n",
57 | "|And this| is| too|\n",
58 | "+--------+-------+-------+\n",
59 | "\n"
60 | ]
61 | }
62 | ],
63 | "source": [
64 |     "# Define your data as a list of tuples. Each tuple becomes a Row!\n",
65 | "data = [(\"This\",\"is only\", \"a test!\"),(\"And this\",\"is\",\"too\")]\n",
66 | "\n",
67 |     "# Parallelize data\n",
68 | "spark.sparkContext.parallelize(data).toDF().show()\n"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 4,
74 | "metadata": {},
75 | "outputs": [
76 | {
77 | "name": "stdout",
78 | "output_type": "stream",
79 | "text": [
80 | "+--------+-------+-------+\n",
81 | "| colA| colB| colC|\n",
82 | "+--------+-------+-------+\n",
83 | "| This|is only|a test!|\n",
84 | "|And this| is| too|\n",
85 | "+--------+-------+-------+\n",
86 | "\n"
87 | ]
88 | }
89 | ],
90 | "source": [
91 | "# Defining column names\n",
92 | "columns = [\"colA\", \"colB\", \"colC\"]\n",
93 | "spark.sparkContext.parallelize(data).toDF(columns).show()"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "### createDataFrame way"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 5,
106 | "metadata": {},
107 | "outputs": [
108 | {
109 | "name": "stdout",
110 | "output_type": "stream",
111 | "text": [
112 | "++\n",
113 | "||\n",
114 | "++\n",
115 | "++\n",
116 | "\n"
117 | ]
118 | }
119 | ],
120 | "source": [
121 |     "# First, at least an empty schema is required\n",
122 | "schema = StructType([])\n",
123 | "\n",
124 | "# Now, an empty and useless dataframe\n",
125 | "spark.createDataFrame(spark.sparkContext.emptyRDD(),schema).show()"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 11,
131 | "metadata": {},
132 | "outputs": [
133 | {
134 | "name": "stdout",
135 | "output_type": "stream",
136 | "text": [
137 | "+--------+-------+-------+\n",
138 | "| colA| colB| colC|\n",
139 | "+--------+-------+-------+\n",
140 | "| This|is only|a test!|\n",
141 | "|And this| is| too|\n",
142 | "+--------+-------+-------+\n",
143 | "\n"
144 | ]
145 | }
146 | ],
147 | "source": [
148 | "# Schema for previous data\n",
149 | "schema = StructType([\n",
150 | " StructField(\"colA\",StringType(),False),\n",
151 | " StructField(\"colB\",StringType(),False),\n",
152 | " StructField(\"colC\",StringType(),False)\n",
153 | " ])\n",
154 | "# Creating DataFrame\n",
155 | "spark.createDataFrame(data,schema).show()\n"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 12,
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "name": "stdout",
165 | "output_type": "stream",
166 | "text": [
167 | "+--------+-------+------+\n",
168 | "| colA| colB| colC|\n",
169 | "+--------+-------+------+\n",
170 | "| This|is only|a test|\n",
171 | "|And this| is| too|\n",
172 | "+--------+-------+------+\n",
173 | "\n"
174 | ]
175 | }
176 | ],
177 | "source": [
178 | "# Minimalist\n",
179 | "df = spark.createDataFrame([[\"This\",\"is only\",\"a test\"],[\"And this\",\"is\",\"too\"]],schema)\n",
180 | "df.show()"
181 | ]
182 | },
183 | {
184 | "cell_type": "markdown",
185 | "metadata": {},
186 | "source": [
187 | "## Dataframes from CSV"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 13,
193 | "metadata": {},
194 | "outputs": [
195 | {
196 | "name": "stdout",
197 | "output_type": "stream",
198 | "text": [
199 | "+--------+-------+------+\n",
200 | "| colA| colB| colC|\n",
201 | "+--------+-------+------+\n",
202 | "| This|is only|a test|\n",
203 | "|And this| is| too|\n",
204 | "+--------+-------+------+\n",
205 | "\n"
206 | ]
207 | }
208 | ],
209 | "source": [
210 |     "# Let's first create a file from the dataframe stored in the 'df' variable!\n",
211 | "file='/users/hduser/spark-ref/dataframes/examples/example-001.csv'\n",
212 | "df.write.csv(file, sep=',', header=True)\n",
213 | "\n",
214 | "\n",
215 | "# Reading from CSV\n",
216 | "spark.read.csv(file, \n",
217 | " sep=',',\n",
218 | " encoding='utf-8',\n",
219 | " header=True).show()"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "metadata": {},
226 | "outputs": [],
227 | "source": []
228 | }
229 | ],
230 | "metadata": {
231 | "kernelspec": {
232 | "display_name": "Python 3 (ipykernel)",
233 | "language": "python",
234 | "name": "python3"
235 | },
236 | "language_info": {
237 | "codemirror_mode": {
238 | "name": "ipython",
239 | "version": 3
240 | },
241 | "file_extension": ".py",
242 | "mimetype": "text/x-python",
243 | "name": "python",
244 | "nbconvert_exporter": "python",
245 | "pygments_lexer": "ipython3",
246 | "version": "3.8.5"
247 | }
248 | },
249 | "nbformat": 4,
250 | "nbformat_minor": 2
251 | }
252 |
--------------------------------------------------------------------------------
/dataframes/.ipynb_checkpoints/dataframe_from_nothing-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Dataframes from \"nothing\""
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pyspark\n",
17 | "from pyspark.sql import SparkSession\n",
18 | "from pyspark.sql.types import StructType, StructField, StringType"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 2,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "spark = SparkSession.builder.appName('basic').getOrCreate()"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 |     "## Creating an empty dataframe"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "### Parallelize way"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 3,
47 | "metadata": {},
48 | "outputs": [
49 | {
50 | "name": "stdout",
51 | "output_type": "stream",
52 | "text": [
53 | "+--------+-------+-------+\n",
54 | "| _1| _2| _3|\n",
55 | "+--------+-------+-------+\n",
56 | "| This|is only|a test!|\n",
57 | "|And this| is| too|\n",
58 | "+--------+-------+-------+\n",
59 | "\n"
60 | ]
61 | }
62 | ],
63 | "source": [
64 |     "# Define your data as a list of tuples. Each tuple becomes a Row!\n",
65 | "data = [(\"This\",\"is only\", \"a test!\"),(\"And this\",\"is\",\"too\")]\n",
66 | "\n",
67 |     "# Parallelize data\n",
68 | "spark.sparkContext.parallelize(data).toDF().show()\n"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 4,
74 | "metadata": {},
75 | "outputs": [
76 | {
77 | "name": "stdout",
78 | "output_type": "stream",
79 | "text": [
80 | "+--------+-------+-------+\n",
81 | "| colA| colB| colC|\n",
82 | "+--------+-------+-------+\n",
83 | "| This|is only|a test!|\n",
84 | "|And this| is| too|\n",
85 | "+--------+-------+-------+\n",
86 | "\n"
87 | ]
88 | }
89 | ],
90 | "source": [
91 | "# Defining column names\n",
92 | "columns = [\"colA\", \"colB\", \"colC\"]\n",
93 | "spark.sparkContext.parallelize(data).toDF(columns).show()"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "### createDataFrame way"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 5,
106 | "metadata": {},
107 | "outputs": [
108 | {
109 | "name": "stdout",
110 | "output_type": "stream",
111 | "text": [
112 | "++\n",
113 | "||\n",
114 | "++\n",
115 | "++\n",
116 | "\n"
117 | ]
118 | }
119 | ],
120 | "source": [
121 |     "# First, at least an empty schema is required\n",
122 | "schema = StructType([])\n",
123 | "\n",
124 | "# Now, an empty and useless dataframe\n",
125 | "spark.createDataFrame(spark.sparkContext.emptyRDD(),schema).show()"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 11,
131 | "metadata": {},
132 | "outputs": [
133 | {
134 | "name": "stdout",
135 | "output_type": "stream",
136 | "text": [
137 | "+--------+-------+-------+\n",
138 | "| colA| colB| colC|\n",
139 | "+--------+-------+-------+\n",
140 | "| This|is only|a test!|\n",
141 | "|And this| is| too|\n",
142 | "+--------+-------+-------+\n",
143 | "\n"
144 | ]
145 | }
146 | ],
147 | "source": [
148 | "# Schema for previous data\n",
149 | "schema = StructType([\n",
150 | " StructField(\"colA\",StringType(),False),\n",
151 | " StructField(\"colB\",StringType(),False),\n",
152 | " StructField(\"colC\",StringType(),False)\n",
153 | " ])\n",
154 | "# Creating DataFrame\n",
155 | "spark.createDataFrame(data,schema).show()\n"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 12,
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "name": "stdout",
165 | "output_type": "stream",
166 | "text": [
167 | "+--------+-------+------+\n",
168 | "| colA| colB| colC|\n",
169 | "+--------+-------+------+\n",
170 | "| This|is only|a test|\n",
171 | "|And this| is| too|\n",
172 | "+--------+-------+------+\n",
173 | "\n"
174 | ]
175 | }
176 | ],
177 | "source": [
178 | "# Minimalist\n",
179 | "df = spark.createDataFrame([[\"This\",\"is only\",\"a test\"],[\"And this\",\"is\",\"too\"]],schema)\n",
180 | "df.show()"
181 | ]
182 | },
183 | {
184 | "cell_type": "markdown",
185 | "metadata": {},
186 | "source": [
187 | "## Dataframes from CSV"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 13,
193 | "metadata": {},
194 | "outputs": [
195 | {
196 | "name": "stdout",
197 | "output_type": "stream",
198 | "text": [
199 | "+--------+-------+------+\n",
200 | "| colA| colB| colC|\n",
201 | "+--------+-------+------+\n",
202 | "| This|is only|a test|\n",
203 | "|And this| is| too|\n",
204 | "+--------+-------+------+\n",
205 | "\n"
206 | ]
207 | }
208 | ],
209 | "source": [
210 |     "# Let's first create a file from the dataframe stored in the 'df' variable!\n",
211 | "file='/users/hduser/spark-ref/dataframes/examples/example-001.csv'\n",
212 | "df.write.csv(file, sep=',', header=True)\n",
213 | "\n",
214 | "\n",
215 | "# Reading from CSV\n",
216 | "spark.read.csv(file, \n",
217 | " sep=',',\n",
218 | " encoding='utf-8',\n",
219 | " header=True).show()"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "metadata": {},
226 | "outputs": [],
227 | "source": []
228 | }
229 | ],
230 | "metadata": {
231 | "kernelspec": {
232 | "display_name": "Python 3 (ipykernel)",
233 | "language": "python",
234 | "name": "python3"
235 | },
236 | "language_info": {
237 | "codemirror_mode": {
238 | "name": "ipython",
239 | "version": 3
240 | },
241 | "file_extension": ".py",
242 | "mimetype": "text/x-python",
243 | "name": "python",
244 | "nbconvert_exporter": "python",
245 | "pygments_lexer": "ipython3",
246 | "version": "3.8.5"
247 | }
248 | },
249 | "nbformat": 4,
250 | "nbformat_minor": 2
251 | }
252 |
--------------------------------------------------------------------------------
/dataframes/not_exits_and_exists_equivalent-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## \"NOT EXISTS\" AND \"EXISTS\" equivalent operations on dataframes\n"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pyspark.sql.functions as F\n",
17 | "from pyspark.sql.types import *\n",
18 | "from pyspark.sql import SparkSession"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 2,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "spark = SparkSession.builder.appName('test_dataframes').enableHiveSupport().getOrCreate()"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "## Generating data"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 35,
40 | "metadata": {},
41 | "outputs": [
42 | {
43 | "name": "stdout",
44 | "output_type": "stream",
45 | "text": [
46 | "+---+-----+\n",
47 | "| id| name|\n",
48 | "+---+-----+\n",
49 | "| 1|Andre|\n",
50 | "| 2| Rose|\n",
51 | "+---+-----+\n",
52 | "\n",
53 | "+---+------+\n",
54 | "| id| name|\n",
55 | "+---+------+\n",
56 | "| 1| Andre|\n",
57 | "| 2| Rose|\n",
58 | "| 3|Daniel|\n",
59 | "+---+------+\n",
60 | "\n",
61 | "+---+------+\n",
62 | "| id| name|\n",
63 | "+---+------+\n",
64 | "| 1| Andre|\n",
65 | "| 2| Rose|\n",
66 | "| 3|Daniel|\n",
67 | "| 3|Daniel|\n",
68 | "| 4| Anita|\n",
69 | "+---+------+\n",
70 | "\n"
71 | ]
72 | }
73 | ],
74 | "source": [
75 | "data1 = [(1,\"Andre\"),(2,\"Rose\")]\n",
76 | "data2 = [(1,\"Andre\"),(2,\"Rose\"),(3,\"Daniel\")]\n",
77 | "data3 = [(1,\"Andre\"),(2,\"Rose\"),(3,\"Daniel\"), (3,\"Daniel\"), (4,\"Anita\")]\n",
78 | "schema = StructType([\n",
79 | " StructField(\"id\",StringType(),True),\n",
80 | " StructField(\"name\",StringType(),True),\n",
81 | "])\n",
82 | "\n",
83 | "df1 = spark.createDataFrame(data1,schema)\n",
84 | "df2 = spark.createDataFrame(data2,schema)\n",
85 | "df3 = spark.createDataFrame(data3,schema)\n",
86 | "df1.show()\n",
87 | "df2.show()\n",
88 | "df3.show()"
89 | ]
90 | },
91 | {
92 | "cell_type": "markdown",
93 | "metadata": {},
94 | "source": [
95 | "## NOT EXISTS EQUIVALENT"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "### Method 1 - subtract"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 36,
108 | "metadata": {},
109 | "outputs": [
110 | {
111 | "name": "stdout",
112 | "output_type": "stream",
113 | "text": [
114 | "+---+------+\n",
115 | "| id| name|\n",
116 | "+---+------+\n",
117 | "| 3|Daniel|\n",
118 | "+---+------+\n",
119 | "\n"
120 | ]
121 | }
122 | ],
123 | "source": [
124 |     "# All that exists in df2 but doesn't exist in df1\n",
125 | "dfr = df2.subtract(df1)\n",
126 | "dfr.show()"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 47,
132 | "metadata": {},
133 | "outputs": [
134 | {
135 | "name": "stdout",
136 | "output_type": "stream",
137 | "text": [
138 | "+---+------+\n",
139 | "| id| name|\n",
140 | "+---+------+\n",
141 | "| 3|Daniel|\n",
142 | "| 4| Anita|\n",
143 | "+---+------+\n",
144 | "\n"
145 | ]
146 | }
147 | ],
148 | "source": [
149 |     "# All that exists in df3 but doesn't exist in df1\n",
150 | "dfr2 = df3.subtract(df1)\n",
151 | "dfr2.show()"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "### Method 2 - left_anti. This is the 'classical' way to have something equivalent to 'NOT EXISTS'"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 37,
164 | "metadata": {},
165 | "outputs": [
166 | {
167 | "name": "stdout",
168 | "output_type": "stream",
169 | "text": [
170 | "+---+------+\n",
171 | "| id| name|\n",
172 | "+---+------+\n",
173 | "| 3|Daniel|\n",
174 | "+---+------+\n",
175 | "\n"
176 | ]
177 | }
178 | ],
179 | "source": [
180 |     "# All that exists in df2 but doesn't exist in df1\n",
181 | "dfr = df2.join(df1,'id','left_anti')\n",
182 | "dfr.show()"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 54,
188 | "metadata": {},
189 | "outputs": [
190 | {
191 | "name": "stdout",
192 | "output_type": "stream",
193 | "text": [
194 | "+---+------+\n",
195 | "| id| name|\n",
196 | "+---+------+\n",
197 | "| 3|Daniel|\n",
198 | "| 4| Anita|\n",
199 | "+---+------+\n",
200 | "\n"
201 | ]
202 | }
203 | ],
204 | "source": [
205 |     "# All that exists in df3 but doesn't exist in df1\n",
206 | "dfr2 = df3.join(df1,'id','left_anti').dropDuplicates()\n",
207 | "dfr2.show()"
208 | ]
209 | },
210 | {
211 | "cell_type": "markdown",
212 | "metadata": {},
213 | "source": [
214 |     "### Method 3 - exceptAll - same thing as 'subtract', apparently!"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 51,
220 | "metadata": {},
221 | "outputs": [
222 | {
223 | "name": "stdout",
224 | "output_type": "stream",
225 | "text": [
226 | "+---+------+\n",
227 | "| id| name|\n",
228 | "+---+------+\n",
229 | "| 3|Daniel|\n",
230 | "+---+------+\n",
231 | "\n"
232 | ]
233 | }
234 | ],
235 | "source": [
236 |     "# All that exists in df2 but doesn't exist in df1\n",
237 | "dfr = df2.exceptAll(df1)\n",
238 | "dfr.show()"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 55,
244 | "metadata": {},
245 | "outputs": [
246 | {
247 | "name": "stdout",
248 | "output_type": "stream",
249 | "text": [
250 | "+---+------+\n",
251 | "| id| name|\n",
252 | "+---+------+\n",
253 | "| 3|Daniel|\n",
254 | "| 4| Anita|\n",
255 | "+---+------+\n",
256 | "\n"
257 | ]
258 | }
259 | ],
260 | "source": [
261 |     "# All that exists in df3 but doesn't exist in df1\n",
262 | "dfr2 = df3.exceptAll(df1).dropDuplicates()\n",
263 | "dfr2.show()"
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "metadata": {},
269 | "source": [
270 | "## EXISTS EQUIVALENT"
271 | ]
272 | },
273 | {
274 | "cell_type": "markdown",
275 | "metadata": {},
276 | "source": [
277 | "### Method 1 - left_semi"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": 40,
283 | "metadata": {},
284 | "outputs": [
285 | {
286 | "name": "stdout",
287 | "output_type": "stream",
288 | "text": [
289 | "+---+-----+\n",
290 | "| id| name|\n",
291 | "+---+-----+\n",
292 | "| 1|Andre|\n",
293 | "| 2| Rose|\n",
294 | "+---+-----+\n",
295 | "\n"
296 | ]
297 | }
298 | ],
299 | "source": [
300 | "# Only that exists in df2 and df1\n",
301 | "dfr = df2.join(df1,'id','left_semi')\n",
302 | "dfr.show()"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 56,
308 | "metadata": {},
309 | "outputs": [
310 | {
311 | "name": "stdout",
312 | "output_type": "stream",
313 | "text": [
314 | "+---+-----+\n",
315 | "| id| name|\n",
316 | "+---+-----+\n",
317 | "| 1|Andre|\n",
318 | "| 2| Rose|\n",
319 | "+---+-----+\n",
320 | "\n"
321 | ]
322 | }
323 | ],
324 | "source": [
325 | "# Only that exists in df3 and df1\n",
326 | "dfr = df3.join(df1,'id','left_semi')\n",
327 | "dfr.show()"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {},
334 | "outputs": [],
335 | "source": []
336 | }
337 | ],
338 | "metadata": {
339 | "kernelspec": {
340 | "display_name": "Python 3",
341 | "language": "python",
342 | "name": "python3"
343 | },
344 | "language_info": {
345 | "codemirror_mode": {
346 | "name": "ipython",
347 | "version": 3
348 | },
349 | "file_extension": ".py",
350 | "mimetype": "text/x-python",
351 | "name": "python",
352 | "nbconvert_exporter": "python",
353 | "pygments_lexer": "ipython3",
354 | "version": "3.8.5"
355 | }
356 | },
357 | "nbformat": 4,
358 | "nbformat_minor": 4
359 | }
360 |
--------------------------------------------------------------------------------
/sql/not_exits_and_exists_equivalent.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Spark SQL: \"NOT EXISTS\" AND \"EXISTS\" equivalent operations on dataframes\n",
8 | "\n"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 28,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "import findspark\n",
18 | "findspark.init()\n",
19 | "import pyspark.sql.functions as F\n",
20 | "from pyspark.sql.types import *\n",
21 | "from pyspark.sql import SparkSession"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 29,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "spark = SparkSession.builder.appName('test_dataframes').enableHiveSupport().getOrCreate()"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "## Generating data"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 30,
43 | "metadata": {},
44 | "outputs": [
45 | {
46 | "name": "stdout",
47 | "output_type": "stream",
48 | "text": [
49 | "+---+-----+\n",
50 | "| id| name|\n",
51 | "+---+-----+\n",
52 | "| 1|Andre|\n",
53 | "| 2| Rose|\n",
54 | "+---+-----+\n",
55 | "\n",
56 | "+---+------+\n",
57 | "| id| name|\n",
58 | "+---+------+\n",
59 | "| 1| Andre|\n",
60 | "| 2| Rose|\n",
61 | "| 3|Daniel|\n",
62 | "+---+------+\n",
63 | "\n",
64 | "+---+------+\n",
65 | "| id| name|\n",
66 | "+---+------+\n",
67 | "| 1| Andre|\n",
68 | "| 2| Rose|\n",
69 | "| 3|Daniel|\n",
70 | "| 3|Daniel|\n",
71 | "| 4| Anita|\n",
72 | "+---+------+\n",
73 | "\n"
74 | ]
75 | }
76 | ],
77 | "source": [
78 | "data1 = [(1,\"Andre\"),(2,\"Rose\")]\n",
79 | "data2 = [(1,\"Andre\"),(2,\"Rose\"),(3,\"Daniel\")]\n",
80 | "data3 = [(1,\"Andre\"),(2,\"Rose\"),(3,\"Daniel\"), (3,\"Daniel\"), (4,\"Anita\")]\n",
81 | "schema = StructType([\n",
82 | " StructField(\"id\",StringType(),True),\n",
83 | " StructField(\"name\",StringType(),True),\n",
84 | "])\n",
85 | "\n",
86 | "df1 = spark.createDataFrame(data1,schema)\n",
87 | "df2 = spark.createDataFrame(data2,schema)\n",
88 | "df3 = spark.createDataFrame(data3,schema)\n",
89 | "df1.show()\n",
90 | "df2.show()\n",
91 | "df3.show()"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "## NOT EXISTS EQUIVALENT"
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "metadata": {},
104 | "source": [
105 | "### Method 1 - subtract"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 31,
111 | "metadata": {},
112 | "outputs": [
113 | {
114 | "name": "stdout",
115 | "output_type": "stream",
116 | "text": [
117 | "+---+------+\n",
118 | "| id| name|\n",
119 | "+---+------+\n",
120 | "| 3|Daniel|\n",
121 | "+---+------+\n",
122 | "\n"
123 | ]
124 | }
125 | ],
126 | "source": [
127 |     "# All that exists in df2 but doesn't exist in df1\n",
128 | "dfr = df2.subtract(df1)\n",
129 | "dfr.show()"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 32,
135 | "metadata": {},
136 | "outputs": [
137 | {
138 | "name": "stderr",
139 | "output_type": "stream",
140 | "text": [
141 | "\r",
142 | "[Stage 338:=================================================> (187 + 8) / 200]\r",
143 | "\r",
144 | " \r"
145 | ]
146 | },
147 | {
148 | "name": "stdout",
149 | "output_type": "stream",
150 | "text": [
151 | "+---+------+\n",
152 | "| id| name|\n",
153 | "+---+------+\n",
154 | "| 3|Daniel|\n",
155 | "| 4| Anita|\n",
156 | "+---+------+\n",
157 | "\n"
158 | ]
159 | }
160 | ],
161 | "source": [
162 |     "# All that exists in df3 but doesn't exist in df1\n",
163 | "dfr2 = df3.subtract(df1)\n",
164 | "dfr2.show()"
165 | ]
166 | },
167 | {
168 | "cell_type": "markdown",
169 | "metadata": {},
170 | "source": [
171 | "### Method 2 - left_anti. This is the 'classical' way to have something equivalent to 'NOT EXISTS'"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": 33,
177 | "metadata": {},
178 | "outputs": [
179 | {
180 | "name": "stdout",
181 | "output_type": "stream",
182 | "text": [
183 | "+---+------+\n",
184 | "| id| name|\n",
185 | "+---+------+\n",
186 | "| 3|Daniel|\n",
187 | "+---+------+\n",
188 | "\n"
189 | ]
190 | }
191 | ],
192 | "source": [
193 |     "# All that exists in df2 but doesn't exist in df1\n",
194 | "dfr = df2.join(df1,'id','left_anti')\n",
195 | "dfr.show()"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 34,
201 | "metadata": {},
202 | "outputs": [
203 | {
204 | "name": "stdout",
205 | "output_type": "stream",
206 | "text": [
207 | "+---+------+\n",
208 | "| id| name|\n",
209 | "+---+------+\n",
210 | "| 3|Daniel|\n",
211 | "| 4| Anita|\n",
212 | "+---+------+\n",
213 | "\n"
214 | ]
215 | }
216 | ],
217 | "source": [
218 |     "# All that exists in df3 but doesn't exist in df1\n",
219 | "dfr2 = df3.join(df1,'id','left_anti').dropDuplicates()\n",
220 | "dfr2.show()"
221 | ]
222 | },
223 | {
224 | "cell_type": "markdown",
225 | "metadata": {},
226 | "source": [
227 |     "### Method 3 - exceptAll - same thing as 'subtract', apparently!"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": 35,
233 | "metadata": {},
234 | "outputs": [
235 | {
236 | "name": "stdout",
237 | "output_type": "stream",
238 | "text": [
239 | "+---+------+\n",
240 | "| id| name|\n",
241 | "+---+------+\n",
242 | "| 3|Daniel|\n",
243 | "+---+------+\n",
244 | "\n"
245 | ]
246 | }
247 | ],
248 | "source": [
249 |     "# All that exists in df2 but doesn't exist in df1\n",
250 | "dfr = df2.exceptAll(df1)\n",
251 | "dfr.show()"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": 36,
257 | "metadata": {},
258 | "outputs": [
259 | {
260 | "name": "stdout",
261 | "output_type": "stream",
262 | "text": [
263 | "+---+------+\n",
264 | "| id| name|\n",
265 | "+---+------+\n",
266 | "| 3|Daniel|\n",
267 | "| 4| Anita|\n",
268 | "+---+------+\n",
269 | "\n"
270 | ]
271 | }
272 | ],
273 | "source": [
274 |     "# All that exists in df3 but doesn't exist in df1\n",
275 | "dfr2 = df3.exceptAll(df1).dropDuplicates()\n",
276 | "dfr2.show()"
277 | ]
278 | },
279 | {
280 | "cell_type": "markdown",
281 | "metadata": {},
282 | "source": [
283 | "## EXISTS EQUIVALENT"
284 | ]
285 | },
286 | {
287 | "cell_type": "markdown",
288 | "metadata": {},
289 | "source": [
290 | "### Method 1 - left_semi"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 15,
296 | "metadata": {},
297 | "outputs": [
298 | {
299 | "name": "stdout",
300 | "output_type": "stream",
301 | "text": [
302 | "+---+-----+\n",
303 | "| id| name|\n",
304 | "+---+-----+\n",
305 | "| 1|Andre|\n",
306 | "| 2| Rose|\n",
307 | "+---+-----+\n",
308 | "\n"
309 | ]
310 | }
311 | ],
312 | "source": [
313 | "# Only that exists in df2 and df1\n",
314 | "dfr = df2.join(df1,'id','left_semi')\n",
315 | "dfr.show()"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 16,
321 | "metadata": {},
322 | "outputs": [
323 | {
324 | "name": "stdout",
325 | "output_type": "stream",
326 | "text": [
327 | "+---+-----+\n",
328 | "| id| name|\n",
329 | "+---+-----+\n",
330 | "| 1|Andre|\n",
331 | "| 2| Rose|\n",
332 | "+---+-----+\n",
333 | "\n"
334 | ]
335 | }
336 | ],
337 | "source": [
338 | "# Only that exists in df3 and df1\n",
339 | "dfr = df3.join(df1,'id','left_semi')\n",
340 | "dfr.show()"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": null,
346 | "metadata": {},
347 | "outputs": [],
348 | "source": []
349 | }
350 | ],
351 | "metadata": {
352 | "kernelspec": {
353 | "display_name": "Python 3 (ipykernel)",
354 | "language": "python",
355 | "name": "python3"
356 | },
357 | "language_info": {
358 | "codemirror_mode": {
359 | "name": "ipython",
360 | "version": 3
361 | },
362 | "file_extension": ".py",
363 | "mimetype": "text/x-python",
364 | "name": "python",
365 | "nbconvert_exporter": "python",
366 | "pygments_lexer": "ipython3",
367 | "version": "3.10.12"
368 | }
369 | },
370 | "nbformat": 4,
371 | "nbformat_minor": 4
372 | }
373 |
--------------------------------------------------------------------------------
/dataframes/dataframe_from_json.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "6524d53e-b3d6-41f2-9d1e-c39f98d52e1d",
6 | "metadata": {},
7 | "source": [
8 | "# Dataframe from JSON\n"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 19,
14 | "id": "460e7835-3700-4d4a-a0aa-212e268c34c2",
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import pyspark\n",
19 | "import pyspark.sql.functions as F\n",
20 | "import pandas as pd\n",
21 | "import re\n",
22 | "from pyspark import SparkConf, SparkContext\n",
23 | "from pyspark.sql import SparkSession, Row\n",
24 | "from pyspark.sql.types import StructType, StructField, StringType, IntegerType, MapType, ArrayType\n",
25 | "from pyspark.sql.functions import udf\n"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 20,
31 | "id": "2e33eed2-c8f7-46f8-9943-cde7d53bc32a",
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "# Getting session from Spark\n",
36 | "spark = SparkSession.builder \\\n",
37 | ".appName('test').master('local[*]') \\\n",
38 | ".config(\"spark.cores.max\", \"2\") \\\n",
39 | ".config(\"spark.executor.memory\", \"2g\") \\\n",
40 | ".config(\"spark.sql.execution.arrow.pyspark.enabled\", \"true\") \\\n",
41 | ".config(\"spark.shuffle.service.enabled\", \"false\") \\\n",
42 | ".config(\"spark.dynamicAllocation.enabled\", \"true\") \\\n",
43 | ".getOrCreate()\n",
44 | "\n",
45 | "\n"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "id": "f927ff89-14fd-4527-837c-29c8d3e2892e",
51 | "metadata": {},
52 | "source": [
53 |     "### A proper JSON document in the file dataframes/data/test.json\n",
54 | "\n",
55 | "```json\n",
56 | "[\n",
57 | " {\n",
58 | " \"name\": \"Andre\",\n",
59 | " \"id\": 1,\n",
60 |     "    \"doc_list\":[{\"docid\":\"DOC001\", \"name\":\"bla001.txt\"}, {\"docid\":\"DOC002\", \"name\":\"bla002.txt\"}]\n",
61 | " },\n",
62 | "\n",
63 | " {\n",
64 | " \"name\": \"Noé\",\n",
65 | " \"id\": 2,\n",
66 |     "    \"doc_list\":[{\"docid\":\"DOC003\", \"name\":\"bla003.txt\"}, {\"docid\":\"DOC004\", \"name\":\"bla004.txt\"}]\n",
67 | " }\n",
68 | "]\n",
69 | "\n",
70 | "\n",
71 | "```\n",
72 | "\n"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "id": "fcfe3e08-812a-49ec-9679-0716780b12f5",
78 | "metadata": {},
79 | "source": [
80 | "### Example problem\n",
81 | "\n",
82 | "Based on JSON structure in the file 'test.js'(shown above), extract the doc file names associated to the people's names as following: \n",
83 | "\n",
84 | "```text\n",
85 | "+-----+----------------------+\n",
86 | "|name |doc_names |\n",
87 | "+-----+----------------------+\n",
88 | "|Andre|bla001.txt, bla002.txt|\n",
89 | "|Noé |bla003.txt, bla004.txt|\n",
90 | "+-----+----------------------+\n",
91 | "```\n",
92 | "\n",
93 | "Note that, 'docid' is not desirable.\n",
94 | "\n"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 4,
100 | "id": "c287da29-db31-4a48-832c-c161072a2a46",
101 | "metadata": {},
102 | "outputs": [
103 | {
104 | "name": "stdout",
105 | "output_type": "stream",
106 | "text": [
107 | "+-----+------------------------------------------------------------------------------+\n",
108 | "|name |doc_list |\n",
109 | "+-----+------------------------------------------------------------------------------+\n",
110 | "|Andre|[{docid -> DOC001, name -> bla001.txt}, {docid -> DOC002, name -> bla002.txt}]|\n",
111 | "|Noé |[{docid -> DOC003, name -> bla003.txt}, {docid -> DOC004, name -> bla004.txt}]|\n",
112 | "+-----+------------------------------------------------------------------------------+\n",
113 | "\n"
114 | ]
115 | },
116 | {
117 | "name": "stderr",
118 | "output_type": "stream",
119 | "text": [
120 | " "
121 | ]
122 | }
123 | ],
124 | "source": [
125 | "# Defining schema\n",
126 | "schema = StructType([\n",
127 | " StructField(\"name\", StringType(), True),\n",
128 | " StructField(\"doc_list\", ArrayType(MapType(StringType(),StringType(),True),True), True),\n",
129 | "])\n",
130 | "# Reading JSON file using Dataframe API setting 'multiline' option as true\n",
131 | "sdf = spark.read.option(\"multiline\", \"true\").json('data/test.json', schema=schema)\n",
132 | "sdf.show(truncate=False)"
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "id": "11cc02e6-f597-4aff-bcb2-588fb94d2426",
138 | "metadata": {},
139 | "source": [
140 | "### Transformations steps\n",
141 | "\n",
142 | "1. Transform the list in 'doc_list' column into various rows using the `explode` pyspark function;\n",
143 | "2. For each of those rows, extract 'name' from the data structure using `getItem` function and drop the original column 'doc_ex';\n",
144 | "3. Transform doc names into a list againd group rows by 'name' column and using the `collect_list` as the aggregate function;\n",
145 | "4. Transform the list into a string separated by ',' using `concat_ws`\n"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 5,
151 | "id": "bf90af12-0f3c-4823-a576-359f66f2dc5d",
152 | "metadata": {},
153 | "outputs": [
154 | {
155 | "name": "stdout",
156 | "output_type": "stream",
157 | "text": [
158 | "Step 1\n",
159 | "+-----+------------------------------------------------------------------------------+-------------------------------------+\n",
160 | "|name |doc_list |doc_ex |\n",
161 | "+-----+------------------------------------------------------------------------------+-------------------------------------+\n",
162 | "|Andre|[{docid -> DOC001, name -> bla001.txt}, {docid -> DOC002, name -> bla002.txt}]|{docid -> DOC001, name -> bla001.txt}|\n",
163 | "|Andre|[{docid -> DOC001, name -> bla001.txt}, {docid -> DOC002, name -> bla002.txt}]|{docid -> DOC002, name -> bla002.txt}|\n",
164 | "|Noé |[{docid -> DOC003, name -> bla003.txt}, {docid -> DOC004, name -> bla004.txt}]|{docid -> DOC003, name -> bla003.txt}|\n",
165 | "|Noé |[{docid -> DOC003, name -> bla003.txt}, {docid -> DOC004, name -> bla004.txt}]|{docid -> DOC004, name -> bla004.txt}|\n",
166 | "+-----+------------------------------------------------------------------------------+-------------------------------------+\n",
167 | "\n",
168 | "Step 2\n",
169 | "+-----+------------------------------------------------------------------------------+----------+\n",
170 | "|name |doc_list |doc_name |\n",
171 | "+-----+------------------------------------------------------------------------------+----------+\n",
172 | "|Andre|[{docid -> DOC001, name -> bla001.txt}, {docid -> DOC002, name -> bla002.txt}]|bla001.txt|\n",
173 | "|Andre|[{docid -> DOC001, name -> bla001.txt}, {docid -> DOC002, name -> bla002.txt}]|bla002.txt|\n",
174 | "|Noé |[{docid -> DOC003, name -> bla003.txt}, {docid -> DOC004, name -> bla004.txt}]|bla003.txt|\n",
175 | "|Noé |[{docid -> DOC003, name -> bla003.txt}, {docid -> DOC004, name -> bla004.txt}]|bla004.txt|\n",
176 | "+-----+------------------------------------------------------------------------------+----------+\n",
177 | "\n",
178 | "Step 3\n",
179 | "+-----+------------------------+\n",
180 | "|name |doc_list |\n",
181 | "+-----+------------------------+\n",
182 | "|Noé |[bla003.txt, bla004.txt]|\n",
183 | "|Andre|[bla001.txt, bla002.txt]|\n",
184 | "+-----+------------------------+\n",
185 | "\n",
186 | "Step 4\n",
187 | "+-----+---------------------+\n",
188 | "|name |doc_names |\n",
189 | "+-----+---------------------+\n",
190 | "|Noé |bla003.txt,bla004.txt|\n",
191 | "|Andre|bla001.txt,bla002.txt|\n",
192 | "+-----+---------------------+\n",
193 | "\n"
194 | ]
195 | }
196 | ],
197 | "source": [
198 | "# Step 1\n",
199 | "adf = sdf.withColumn(\"doc_ex\", F.explode(\"doc_list\"))\n",
200 | "print(\"Step 1\")\n",
201 | "adf.show(truncate=False)\n",
202 | "# Step 2 - Extracting the value of interest. In this case, the names of documents.\n",
203 | "adf = adf.withColumn(\"doc_name\", adf.doc_ex.getItem(\"name\")).drop(\"doc_ex\")\n",
204 | "print(\"Step 2\")\n",
205 | "adf.show(truncate=False)\n",
206 | "# Step 3 - Time to revert the 'explode' effect. For this, let's group rows by name and use 'collect_list' as aggregate function\n",
207 | "ndf = adf.groupBy(\"name\").agg(F.collect_list(\"doc_name\").alias('doc_list'))\n",
208 | "print(\"Step 3\")\n",
209 | "ndf.show(truncate=False)\n",
210 | "# Step 4 - Transforming this list into a string list separated by ',' character.\n",
211 | "ndf = ndf.withColumn(\"doc_names\", F.concat_ws(\",\", \"doc_list\")).drop(\"doc_list\")\n",
212 | "print(\"Step 4\")\n",
213 | "ndf.show(truncate=False)"
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "id": "f6ba2ea4-eda9-4f53-924e-5f71e11f0899",
219 | "metadata": {},
220 | "source": [
221 | "### UDF(Not recommended)\n",
222 | "User Defined Functions(UDF) can be a way to parse information from a column. In this case, the docs inside the JSON file is available in a list of objects which is parsed by pySpark and convenient converted into Python data structure objects which looks like more simpler to deal with. However, in this particularly scenario is not recommended because is possible to use spark functions which supports this operation offering a better optimization than UDF. Besides, in Python performance is not particularly good specially if you have neasted loops."
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 6,
228 | "id": "5a87b5f0-95cc-4d05-b742-a155f196a2aa",
229 | "metadata": {},
230 | "outputs": [],
231 | "source": [
232 | "@udf\n",
233 | "def extract_doc(data_list: list) -> str:\n",
234 | " n = list()\n",
235 | " for li in data_list:\n",
236 | " n += [v for k,v in li.items() if k == 'name']\n",
237 | "\n",
238 | " return ','.join(n)\n"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 7,
244 | "id": "57fee3c9-fede-4649-a07e-9935bb82fd72",
245 | "metadata": {},
246 | "outputs": [
247 | {
248 | "name": "stdout",
249 | "output_type": "stream",
250 | "text": [
251 | "+-----+---------------------+\n",
252 | "|name |doc_names |\n",
253 | "+-----+---------------------+\n",
254 | "|Andre|bla001.txt,bla002.txt|\n",
255 | "|Noé |bla003.txt,bla004.txt|\n",
256 | "+-----+---------------------+\n",
257 | "\n"
258 | ]
259 | },
260 | {
261 | "name": "stderr",
262 | "output_type": "stream",
263 | "text": [
264 | " "
265 | ]
266 | }
267 | ],
268 | "source": [
269 | "# Running the UDF called 'extract_doc_udf' and storing into a new column called 'udf_res'\n",
270 | "dfu = sdf.withColumn('doc_names', extract_doc(F.col('doc_list'))).select('name','doc_names')\n",
271 | "# Showing the result\n",
272 | "dfu.show(truncate=False)"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "id": "56069cfc-2d38-4ef9-848f-d7116afc7d3e",
278 | "metadata": {},
279 | "source": [
280 | "### \"UDF\" using RDD(less recommended)\n",
281 | "This is the old ways to handle UDFs. The reasons to not to do it is the same as the previous ways. And here is worse because you dealing directly with RDD and you will not have any optimization for doing that. If you don't know how to optimize RDD operations by yourself, don't use it."
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": 8,
287 | "id": "db3daafa-8e25-44d4-bb9a-4170e10c054b",
288 | "metadata": {},
289 | "outputs": [],
290 | "source": [
291 | "def extract_doc_rdd(row):\n",
292 | " d = row.asDict()\n",
293 | " n = list()\n",
294 | " if 'doc_list' in d:\n",
295 | " for li in d['doc_list']:\n",
296 | " n += [v for k,v in li.items() if k == 'name']\n",
297 | "\n",
298 | " d['doc_names'] = ','.join(n)\n",
299 | "\n",
300 | " return Row(**d)\n",
301 | " \n"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": 9,
307 | "id": "8aba685e-ab0c-4b05-ab63-a2d9a97ee41f",
308 | "metadata": {},
309 | "outputs": [
310 | {
311 | "name": "stderr",
312 | "output_type": "stream",
313 | "text": [
314 | " "
315 | ]
316 | },
317 | {
318 | "name": "stdout",
319 | "output_type": "stream",
320 | "text": [
321 | "+-----+---------------------+\n",
322 | "|name |doc_names |\n",
323 | "+-----+---------------------+\n",
324 | "|Andre|bla001.txt,bla002.txt|\n",
325 | "|Noé |bla003.txt,bla004.txt|\n",
326 | "+-----+---------------------+\n",
327 | "\n"
328 | ]
329 | }
330 | ],
331 | "source": [
332 | "# Executing 'extract_doc_rdd' using map method from rdd object\n",
333 | "rdd = sdf.rdd.map(extract_doc_rdd)\n",
334 | "# Converting into a dataframe object\n",
335 | "edf = rdd.toDF().select('name','doc_names')\n",
336 | "# Showing the result\n",
337 | "edf.show(truncate=False)"
338 | ]
339 | },
340 | {
341 | "cell_type": "markdown",
342 | "id": "4d11b47f-4fc2-43a6-9433-712f7207deff",
343 | "metadata": {},
344 | "source": [
345 | "### Reading complex JSON using dynamic schema\n",
346 | "\n",
347 | "For the cases where JSON contains varying all the time. Example on dataframe/data/json-varying.csv:\n",
348 | "\n",
349 | "```json\n",
350 | "id,json_string\n",
351 | "1,'{\"name\": \"John Doe\", \"age\": 30}'\n",
352 | "2,'{\"city\": \"New York\", \"country\": \"USA\", \"zipcode\": \"10001\"}'\n",
353 | "3,'{\"product\": \"Laptop\", \"brand\": \"Dell\", \"specs\": {\"RAM\": \"16GB\", \"Storage\": \"512GB SSD\"}}'\n",
354 | "\n",
355 | "```\n",
356 | "\n",
357 | "Tipically this kind of data came from a column\n",
358 | "\n"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": 16,
364 | "id": "7c1dc01a-2f41-4ffe-9ba8-c172048f9fe3",
365 | "metadata": {},
366 | "outputs": [
367 | {
368 | "name": "stdout",
369 | "output_type": "stream",
370 | "text": [
371 | "+---+----------------------------------------------------------------------------------------+\n",
372 | "|id |json_string |\n",
373 | "+---+----------------------------------------------------------------------------------------+\n",
374 | "|1 |{\"name\": \"John Doe\", \"age\": 30} |\n",
375 | "|2 |{\"city\": \"New York\", \"country\": \"USA\", \"zipcode\": \"10001\"} |\n",
376 | "|3 |{\"product\": \"Laptop\", \"brand\": \"Dell\", \"specs\": {\"RAM\": \"16GB\", \"Storage\": \"512GB SSD\"}}|\n",
377 | "+---+----------------------------------------------------------------------------------------+\n",
378 | "\n"
379 | ]
380 | }
381 | ],
382 | "source": [
383 | "# Or from file.\n",
384 | "df = spark.read.csv('data/json-varying.csv', header=True, quote=\"'\")\n",
385 | "\n",
386 | "# If you're reading from a file, remove the quote char from string. Otherwise, the parser will not be able to return a object\n",
387 | "df = df.withColumn('json_string', F.regexp_replace('json_string', r\"\\'\", \"\"))\n",
388 | "df.show(truncate=False)"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": 14,
394 | "id": "0f2bf586-4804-4732-a14a-19c6b5f7fa83",
395 | "metadata": {},
396 | "outputs": [
397 | {
398 | "name": "stdout",
399 | "output_type": "stream",
400 | "text": [
401 | "+---+----------------------------------------------------------------------------------------+---------------------------------------------------------------+\n",
402 | "|id |json_string |parsed |\n",
403 | "+---+----------------------------------------------------------------------------------------+---------------------------------------------------------------+\n",
404 | "|1 |{\"name\": \"John Doe\", \"age\": 30} |{30, null, null, null, John Doe, null, null, null} |\n",
405 | "|2 |{\"city\": \"New York\", \"country\": \"USA\", \"zipcode\": \"10001\"} |{null, null, New York, USA, null, null, null, 10001} |\n",
406 | "|3 |{\"product\": \"Laptop\", \"brand\": \"Dell\", \"specs\": {\"RAM\": \"16GB\", \"Storage\": \"512GB SSD\"}}|{null, Dell, null, null, null, Laptop, {16GB, 512GB SSD}, null}|\n",
407 | "+---+----------------------------------------------------------------------------------------+---------------------------------------------------------------+\n",
408 | "\n"
409 | ]
410 | }
411 | ],
412 | "source": [
413 | "dynamic_schema = spark.read.json(df.rdd.map(lambda row: row['json_string'])).schema\n",
414 | "jdf = df.withColumn(\"parsed\", F.from_json('json_string', dynamic_schema))\n",
415 | "jdf.show(truncate=False)"
416 | ]
417 | },
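418 | {
419 | "cell_type": "markdown",
420 | "id": "7f3d2a1c-8e5b-4f6a-9c0d-1b2e3f4a5d6c",
421 | "metadata": {},
422 | "source": [
423 | "Once parsed, the nested fields can be addressed with dot notation; keys that are absent from a given record simply come back as null. A minimal sketch, using field names from the sample data above:\n",
424 | "\n",
425 | "```python\n",
426 | "# Pick a few of the dynamically inferred fields; rows that lack a key show null\n",
427 | "jdf.select('id', 'parsed.name', 'parsed.city', 'parsed.product', 'parsed.specs.RAM').show(truncate=False)\n",
428 | "```"
429 | ]
430 | },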
418 | {
419 | "cell_type": "markdown",
420 | "id": "56e661c1-1d5e-4371-9f76-d8f9d39a73c4",
421 | "metadata": {},
422 | "source": [
423 | "# JSON from string"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": 27,
429 | "id": "3dab31a6-256d-4d3a-846c-086dfc0b58fb",
430 | "metadata": {},
431 | "outputs": [],
432 | "source": [
433 | "json_str = \"\"\"[\n",
434 | " {\n",
435 | " \"name\":\"Andre\",\n",
436 | " \"doc_list\":[{\"docid\":\"DOC001\", \"name\":\"bla001.txt\"}, {\"docid\":\"DOC002\", \"name\":\"bla002.txt\"}]\n",
437 | " },\n",
438 | " {\n",
439 | " \"name\": \"Noe\",\n",
440 | " \"doc_list\":[{\"docid\":\"DOC002\", \"name\":\"bla002.txt\"}, {\"docid\":\"DOC003\", \"name\":\"bla003.txt\"}]\n",
441 | " }\n",
442 | "]\"\"\"\n"
443 | ]
444 | },
445 | {
446 | "cell_type": "code",
447 | "execution_count": 28,
448 | "id": "e9c6ba84-4ddc-4e06-b283-195833e33e72",
449 | "metadata": {},
450 | "outputs": [
451 | {
452 | "name": "stdout",
453 | "output_type": "stream",
454 | "text": [
455 | "+-----+------------------------------------------------------------------------------+\n",
456 | "|name |doc_list |\n",
457 | "+-----+------------------------------------------------------------------------------+\n",
458 | "|Andre|[{docid -> DOC001, name -> bla001.txt}, {docid -> DOC002, name -> bla002.txt}]|\n",
459 | "|Noe |[{docid -> DOC002, name -> bla002.txt}, {docid -> DOC003, name -> bla003.txt}]|\n",
460 | "+-----+------------------------------------------------------------------------------+\n",
461 | "\n"
462 | ]
463 | }
464 | ],
465 | "source": [
466 | "json_str=re.sub(r\"\\n\",\"\",json_str)\n",
467 | "sc = spark.sparkContext\n",
468 | "schema = StructType([\n",
469 | " StructField(\"name\", StringType(), True),\n",
470 | " StructField(\"doc_list\", ArrayType(MapType(StringType(), StringType())), True)\n",
471 | "])\n",
472 | "df = spark.read.json(sc.parallelize([json_str]), schema)\n",
473 | "df.show(truncate=False)"
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": 29,
479 | "id": "3e1a9042-65c5-4aea-a1ff-cf58b629ecd0",
480 | "metadata": {},
481 | "outputs": [
482 | {
483 | "name": "stdout",
484 | "output_type": "stream",
485 | "text": [
486 | "+-----+---------------------+\n",
487 | "|name |doc_names |\n",
488 | "+-----+---------------------+\n",
489 | "|Noe |bla002.txt,bla003.txt|\n",
490 | "|Andre|bla001.txt,bla002.txt|\n",
491 | "+-----+---------------------+\n",
492 | "\n"
493 | ]
494 | }
495 | ],
496 | "source": [
497 | "dfe = df.withColumn(\"item\", F.explode(\"doc_list\"))\n",
498 | "dfe = dfe.withColumn(\"doc\", dfe.item.getItem('name')).drop(\"item\") \\\n",
499 | " .groupBy('name').agg(F.collect_list(\"doc\").alias(\"doclist\")) \\\n",
500 | " .withColumn('doc_names', F.concat_ws(',','doclist')).drop(\"doclist\")\n",
501 | "dfe.show(truncate=False)"
502 | ]
503 | }
504 | ],
505 | "metadata": {
506 | "kernelspec": {
507 | "display_name": "Python 3 (ipykernel)",
508 | "language": "python",
509 | "name": "python3"
510 | },
511 | "language_info": {
512 | "codemirror_mode": {
513 | "name": "ipython",
514 | "version": 3
515 | },
516 | "file_extension": ".py",
517 | "mimetype": "text/x-python",
518 | "name": "python",
519 | "nbconvert_exporter": "python",
520 | "pygments_lexer": "ipython3",
521 | "version": "3.10.12"
522 | }
523 | },
524 | "nbformat": 4,
525 | "nbformat_minor": 5
526 | }
527 |
--------------------------------------------------------------------------------