├── dataframes
    ├── .gitignore
    ├── dataframe_from_mongodb.md
    ├── dataframe_from_csv.ipynb
    ├── .ipynb_checkpoints
    │   ├── dataframe-from-csv-checkpoint.ipynb
    │   ├── sort_string_using_sorted_array-checkpoint.ipynb
    │   └── dataframe_from_nothing-checkpoint.ipynb
    ├── dataframe_from_nothing.ipynb
    ├── not_exits_and_exists_equivalent-checkpoint.ipynb
    └── dataframe_from_json.ipynb
├── .idea
    ├── .gitignore
    ├── misc.xml
    ├── vcs.xml
    ├── inspectionProfiles
    │   ├── profiles_settings.xml
    │   └── Project_Default.xml
    ├── modules.xml
    └── spark-ref.iml
├── .gitignore
├── README.md
├── SECURITY.md
├── random-examples
    ├── sort_string_using_sorted_array.ipynb
    └── word_counter.ipynb
├── rdd
    └── rdd.ipynb
└── sql
    └── not_exits_and_exists_equivalent.ipynb


/dataframes/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | ../*.swp
3 | .idea/
4 | 


--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .ipynb_checkpoints/
 2 | .idea/
 3 | .bash_history
 4 | .cache/
 5 | .ipython/
 6 | .jupyter/
 7 | .local/
 8 | .python_history
 9 | 
10 | .virtual_documents/
11 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # spark-ref
2 | pySpark references for developers
3 | 
4 | This is a repository for pySpark that developers could use day-by-day.
5 | 
6 | The idea is organize useful examples per themes 
7 | 
8 | 


--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
4 | </project>


--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 | <component name="InspectionProjectProfileManager">
2 |   <settings>
3 |     <option name="USE_PROJECT_PROFILE" value="false" />
4 |     <version value="1.0" />
5 |   </settings>
6 | </component>


--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/spark-ref.iml" filepath="$PROJECT_DIR$/.idea/spark-ref.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/.idea/spark-ref.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$" />
 5 |     <orderEntry type="inheritedJdk" />
 6 |     <orderEntry type="sourceFolder" forTests="false" />
 7 |   </component>
 8 |   <component name="TestRunnerService">
 9 |     <option name="PROJECT_TEST_RUNNER" value="pytest" />
10 |   </component>
11 | </module>


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | # Security Policy
 2 | 
 3 | ## Supported Versions
 4 | 
 5 | Use this section to tell people about which versions of your project are
 6 | currently being supported with security updates.
 7 | 
 8 | | Version | Supported          |
 9 | | ------- | ------------------ |
10 | | 5.1.x   | :white_check_mark: |
11 | | 5.0.x   | :x:                |
12 | | 4.0.x   | :white_check_mark: |
13 | | < 4.0   | :x:                |
14 | 
15 | ## Reporting a Vulnerability
16 | 
17 | Use this section to tell people how to report a vulnerability.
18 | 
19 | Tell them where to go, how often they can expect to get an update on a
20 | reported vulnerability, what to expect if the vulnerability is accepted or
21 | declined, etc.
22 | 


--------------------------------------------------------------------------------
/dataframes/dataframe_from_mongodb.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ## Importing 
 3 | ```python
 4 | import pyspark
 5 | import pyspark.sql.functions as F
 6 | from pyspark.sql import SparkSession
 7 | from pyspark.sql.types import StructType,StructField,StringType,IntegerType
 8 | ```
 9 | ## Connection directly on SparkSession
10 | ```python
11 | spark = SparkSession \
12 |     .builder \
13 |     .appName("tgt-santander-ingestion"
14 |     ).config("spark.jars","jars/mongo-spark-connector_2.11-2.4.1.jar,jars/mongo-java-driver-3.11.0-rc0.jar,scala-library-2.11.12.jar"
15 |     ).config("spark.mongodb.input.uri", "mongodb://127.0.0.1/test.fake"
16 |     ).config("spark.mongodb.output.uri", "mongodb://127.0.0.1/test.fake"
17 |     ).config(
18 |         "spark.hadoop.hive.metastore.warehouse.dir"
19 |         ,"/home/hduser/Projects/job-test/metastore_db"
20 |     ).config("spark.sql.warehouse.dir","/user/hive/warehouse"
21 |     ).enableHiveSupport(
22 |     ).getOrCreate()
23 | 
24 | mongoDF = spark.read.format("mongo").load()
25 | ```
26 | 
27 | 


--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
 1 | <component name="InspectionProjectProfileManager">
 2 |   <profile version="1.0">
 3 |     <option name="myName" value="Project Default" />
 4 |     <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
 5 |       <option name="ignoredPackages">
 6 |         <value>
 7 |           <list size="2">
 8 |             <item index="0" class="java.lang.String" itemvalue="pyspark" />
 9 |             <item index="1" class="java.lang.String" itemvalue="paramiko" />
10 |           </list>
11 |         </value>
12 |       </option>
13 |     </inspection_tool>
14 |     <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
15 |       <option name="ignoredErrors">
16 |         <list>
17 |           <option value="N812" />
18 |         </list>
19 |       </option>
20 |     </inspection_tool>
21 |     <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
22 |       <option name="ignoredIdentifiers">
23 |         <list>
24 |           <option value="pyspark" />
25 |         </list>
26 |       </option>
27 |     </inspection_tool>
28 |     <inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
29 |       <option name="processCode" value="true" />
30 |       <option name="processLiterals" value="true" />
31 |       <option name="processComments" value="true" />
32 |     </inspection_tool>
33 |   </profile>
34 | </component>


--------------------------------------------------------------------------------
/dataframes/dataframe_from_csv.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 5,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": [
 9 |     "import pyspark\n",
10 |     "from pyspark.sql import SparkSession\n"
11 |    ]
12 |   },
13 |   {
14 |    "cell_type": "code",
15 |    "execution_count": 6,
16 |    "metadata": {},
17 |    "outputs": [],
18 |    "source": [
19 |     "# Initializing Spark session\n",
20 |     "spark = SparkSession.builder.appName('basic').getOrCreate()"
21 |    ]
22 |   },
23 |   {
24 |    "cell_type": "code",
25 |    "execution_count": 7,
26 |    "metadata": {},
27 |    "outputs": [
28 |     {
29 |      "name": "stdout",
30 |      "output_type": "stream",
31 |      "text": [
32 |       "+---+-----+---+\n",
33 |       "| id| name|age|\n",
34 |       "+---+-----+---+\n",
35 |       "|  1|André| 41|\n",
36 |       "|  2| João| 28|\n",
37 |       "|  3|Maria| 29|\n",
38 |       "+---+-----+---+\n",
39 |       "\n"
40 |      ]
41 |     }
42 |    ],
43 |    "source": [
44 |     "# Reading from HDFS\n",
45 |     "spark.read.csv('/users/hduser/spark-ref/dataframes/examples/example-001.csv', \n",
46 |     "               sep=',',\n",
47 |     "               encoding='utf-8',\n",
48 |     "               header=True\n",
49 |     "              ).show()\n",
50 |     "\n",
51 |     "# If you want to load a local file, add the prefix 'file://' to path!"
52 |    ]
53 |   }
54 |  ],
55 |  "metadata": {
56 |   "kernelspec": {
57 |    "display_name": "Python 3 (ipykernel)",
58 |    "language": "python",
59 |    "name": "python3"
60 |   },
61 |   "language_info": {
62 |    "codemirror_mode": {
63 |     "name": "ipython",
64 |     "version": 3
65 |    },
66 |    "file_extension": ".py",
67 |    "mimetype": "text/x-python",
68 |    "name": "python",
69 |    "nbconvert_exporter": "python",
70 |    "pygments_lexer": "ipython3",
71 |    "version": "3.8.5"
72 |   }
73 |  },
74 |  "nbformat": 4,
75 |  "nbformat_minor": 2
76 | }
77 | 


--------------------------------------------------------------------------------
/dataframes/.ipynb_checkpoints/dataframe-from-csv-checkpoint.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 3,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": [
 9 |     "import pyspark\n",
10 |     "from pyspark.sql import SparkSession\n"
11 |    ]
12 |   },
13 |   {
14 |    "cell_type": "code",
15 |    "execution_count": null,
16 |    "metadata": {},
17 |    "outputs": [],
18 |    "source": [
19 |     "# Initializing Spark session\n",
20 |     "spark = SparkSession.builder.appName('basic').getOrCreate()"
21 |    ]
22 |   },
23 |   {
24 |    "cell_type": "code",
25 |    "execution_count": 2,
26 |    "metadata": {},
27 |    "outputs": [
28 |     {
29 |      "ename": "NameError",
30 |      "evalue": "name 'spark' is not defined",
31 |      "output_type": "error",
32 |      "traceback": [
33 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
34 |       "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
35 |       "\u001b[0;32m<ipython-input-2-01be4f7809e7>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# Reading from HDFS\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m spark.read.csv('/users/hduser/spark-ref/dataframes/examples/example-001.csv', \n\u001b[0m\u001b[1;32m      3\u001b[0m                \u001b[0msep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m','\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m                \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'utf-8'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m                \u001b[0mheader\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
36 |       "\u001b[0;31mNameError\u001b[0m: name 'spark' is not defined"
37 |      ]
38 |     }
39 |    ],
40 |    "source": [
41 |     "# Reading from HDFS\n",
42 |     "spark.read.csv('/users/hduser/spark-ref/dataframes/examples/example-001.csv', \n",
43 |     "               sep=',',\n",
44 |     "               encoding='utf-8',\n",
45 |     "               header=True\n",
46 |     "              ).show()\n",
47 |     "\n",
48 |     "# If you want to load a local file, add the prefix 'file://' to path!"
49 |    ]
50 |   },
51 |   {
52 |    "cell_type": "code",
53 |    "execution_count": null,
54 |    "metadata": {},
55 |    "outputs": [],
56 |    "source": []
57 |   }
58 |  ],
59 |  "metadata": {
60 |   "kernelspec": {
61 |    "display_name": "Python 3",
62 |    "language": "python",
63 |    "name": "python3"
64 |   },
65 |   "language_info": {
66 |    "codemirror_mode": {
67 |     "name": "ipython",
68 |     "version": 3
69 |    },
70 |    "file_extension": ".py",
71 |    "mimetype": "text/x-python",
72 |    "name": "python",
73 |    "nbconvert_exporter": "python",
74 |    "pygments_lexer": "ipython3",
75 |    "version": "3.6.9"
76 |   }
77 |  },
78 |  "nbformat": 4,
79 |  "nbformat_minor": 2
80 | }
81 | 


--------------------------------------------------------------------------------
/random-examples/sort_string_using_sorted_array.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 4,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pyspark\n",
 10 |     "import pyspark.sql.functions as F\n",
 11 |     "from pyspark.sql import SparkSession\n",
 12 |     "from pyspark.sql.types import StructType, StructField, StringType"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 5,
 18 |    "metadata": {},
 19 |    "outputs": [],
 20 |    "source": [
 21 |     "spark = SparkSession.builder.appName('strings_lists').enableHiveSupport().getOrCreate()"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 15,
 27 |    "metadata": {},
 28 |    "outputs": [
 29 |     {
 30 |      "name": "stdout",
 31 |      "output_type": "stream",
 32 |      "text": [
 33 |       "+-------+\n",
 34 |       "| target|\n",
 35 |       "+-------+\n",
 36 |       "|XADOWPQ|\n",
 37 |       "+-------+\n",
 38 |       "\n"
 39 |      ]
 40 |     }
 41 |    ],
 42 |    "source": [
 43 |     "# Creating a basic schema\n",
 44 |     "schema = StructType([\n",
 45 |     "    StructField('target', StringType(), True)\n",
 46 |     "])\n",
 47 |     "data = [['XADOWPQ']]\n",
 48 |     "df = spark.createDataFrame(data, schema)\n",
 49 |     "dfo = df\n",
 50 |     "df.show()"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 14,
 56 |    "metadata": {},
 57 |    "outputs": [
 58 |     {
 59 |      "name": "stdout",
 60 |      "output_type": "stream",
 61 |      "text": [
 62 |       "+--------------------+\n",
 63 |       "|              target|\n",
 64 |       "+--------------------+\n",
 65 |       "|[X, A, D, O, W, P...|\n",
 66 |       "+--------------------+\n",
 67 |       "\n",
 68 |       "+--------------------+\n",
 69 |       "|              target|\n",
 70 |       "+--------------------+\n",
 71 |       "|[, A, D, O, P, Q,...|\n",
 72 |       "+--------------------+\n",
 73 |       "\n",
 74 |       "+-------+\n",
 75 |       "| target|\n",
 76 |       "+-------+\n",
 77 |       "|ADOPQWX|\n",
 78 |       "+-------+\n",
 79 |       "\n"
 80 |      ]
 81 |     }
 82 |    ],
 83 |    "source": [
 84 |     "# First, Transform target to array. Let's using another field\n",
 85 |     "df = df.withColumn('target', F.split('target',''))\n",
 86 |     "df.show()\n",
 87 |     "\n",
 88 |     "# Second, sort the array\n",
 89 |     "df = df.withColumn('target', F.array_sort('target'))\n",
 90 |     "df.show()\n",
 91 |     "\n",
 92 |     "# Third, convert to string again\n",
 93 |     "df = df.withColumn('target', F.concat_ws('', 'target'))\n",
 94 |     "df.show()"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": null,
100 |    "metadata": {},
101 |    "outputs": [],
102 |    "source": []
103 |   }
104 |  ],
105 |  "metadata": {
106 |   "kernelspec": {
107 |    "display_name": "Python 3 (ipykernel)",
108 |    "language": "python",
109 |    "name": "python3"
110 |   },
111 |   "language_info": {
112 |    "codemirror_mode": {
113 |     "name": "ipython",
114 |     "version": 3
115 |    },
116 |    "file_extension": ".py",
117 |    "mimetype": "text/x-python",
118 |    "name": "python",
119 |    "nbconvert_exporter": "python",
120 |    "pygments_lexer": "ipython3",
121 |    "version": "3.10.12"
122 |   }
123 |  },
124 |  "nbformat": 4,
125 |  "nbformat_minor": 4
126 | }
127 | 


--------------------------------------------------------------------------------
/dataframes/.ipynb_checkpoints/sort_string_using_sorted_array-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 4,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pyspark\n",
 10 |     "import pyspark.sql.functions as F\n",
 11 |     "from pyspark.sql import SparkSession\n",
 12 |     "from pyspark.sql.types import StructType, StructField, StringType"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 5,
 18 |    "metadata": {},
 19 |    "outputs": [],
 20 |    "source": [
 21 |     "spark = SparkSession.builder.appName('strings_lists').enableHiveSupport().getOrCreate()"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 15,
 27 |    "metadata": {},
 28 |    "outputs": [
 29 |     {
 30 |      "name": "stdout",
 31 |      "output_type": "stream",
 32 |      "text": [
 33 |       "+-------+\n",
 34 |       "| target|\n",
 35 |       "+-------+\n",
 36 |       "|XADOWPQ|\n",
 37 |       "+-------+\n",
 38 |       "\n"
 39 |      ]
 40 |     }
 41 |    ],
 42 |    "source": [
 43 |     "# Creating a basic schema\n",
 44 |     "schema = StructType([\n",
 45 |     "    StructField('target', StringType(), True)\n",
 46 |     "])\n",
 47 |     "data = [['XADOWPQ']]\n",
 48 |     "df = spark.createDataFrame(data, schema)\n",
 49 |     "dfo = df\n",
 50 |     "df.show()"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 14,
 56 |    "metadata": {},
 57 |    "outputs": [
 58 |     {
 59 |      "name": "stdout",
 60 |      "output_type": "stream",
 61 |      "text": [
 62 |       "+--------------------+\n",
 63 |       "|              target|\n",
 64 |       "+--------------------+\n",
 65 |       "|[X, A, D, O, W, P...|\n",
 66 |       "+--------------------+\n",
 67 |       "\n",
 68 |       "+--------------------+\n",
 69 |       "|              target|\n",
 70 |       "+--------------------+\n",
 71 |       "|[, A, D, O, P, Q,...|\n",
 72 |       "+--------------------+\n",
 73 |       "\n",
 74 |       "+-------+\n",
 75 |       "| target|\n",
 76 |       "+-------+\n",
 77 |       "|ADOPQWX|\n",
 78 |       "+-------+\n",
 79 |       "\n"
 80 |      ]
 81 |     }
 82 |    ],
 83 |    "source": [
 84 |     "# First, Transform target to array. Let's using another field\n",
 85 |     "df = df.withColumn('target', F.split('target',''))\n",
 86 |     "df.show()\n",
 87 |     "\n",
 88 |     "# Second, sort the array\n",
 89 |     "df = df.withColumn('target', F.array_sort('target'))\n",
 90 |     "df.show()\n",
 91 |     "\n",
 92 |     "# Third, convert to string again\n",
 93 |     "df = df.withColumn('target', F.concat_ws('', 'target'))\n",
 94 |     "df.show()"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": null,
100 |    "metadata": {},
101 |    "outputs": [],
102 |    "source": []
103 |   }
104 |  ],
105 |  "metadata": {
106 |   "kernelspec": {
107 |    "display_name": "Python 3",
108 |    "language": "python",
109 |    "name": "python3"
110 |   },
111 |   "language_info": {
112 |    "codemirror_mode": {
113 |     "name": "ipython",
114 |     "version": 3
115 |    },
116 |    "file_extension": ".py",
117 |    "mimetype": "text/x-python",
118 |    "name": "python",
119 |    "nbconvert_exporter": "python",
120 |    "pygments_lexer": "ipython3",
121 |    "version": "3.8.5"
122 |   }
123 |  },
124 |  "nbformat": 4,
125 |  "nbformat_minor": 4
126 | }
127 | 


--------------------------------------------------------------------------------
/rdd/rdd.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 8,
  6 |    "id": "88896fe3-993c-4e73-8f32-0dd22ef38ea5",
  7 |    "metadata": {
  8 |     "slideshow": {
  9 |      "slide_type": ""
 10 |     },
 11 |     "tags": []
 12 |    },
 13 |    "outputs": [],
 14 |    "source": [
 15 |     "import pyspark\n",
 16 |     "from pyspark import SparkContext\n",
 17 |     "from operator import add"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": 9,
 23 |    "id": "e44ca0de-3148-40f2-9f3d-f0665a9fca94",
 24 |    "metadata": {},
 25 |    "outputs": [
 26 |     {
 27 |      "data": {
 28 |       "text/html": [
 29 |        "\n",
 30 |        "        <div>\n",
 31 |        "            <p><b>SparkContext</b></p>\n",
 32 |        "\n",
 33 |        "            <p><a href=\"http://5780f2db4564:4044\">Spark UI</a></p>\n",
 34 |        "\n",
 35 |        "            <dl>\n",
 36 |        "              <dt>Version</dt>\n",
 37 |        "                <dd><code>v3.2.4</code></dd>\n",
 38 |        "              <dt>Master</dt>\n",
 39 |        "                <dd><code>local[*]</code></dd>\n",
 40 |        "              <dt>AppName</dt>\n",
 41 |        "                <dd><code>pyspark-shell</code></dd>\n",
 42 |        "            </dl>\n",
 43 |        "        </div>\n",
 44 |        "        "
 45 |       ],
 46 |       "text/plain": [
 47 |        "<SparkContext master=local[*] appName=pyspark-shell>"
 48 |       ]
 49 |      },
 50 |      "execution_count": 9,
 51 |      "metadata": {},
 52 |      "output_type": "execute_result"
 53 |     }
 54 |    ],
 55 |    "source": [
 56 |     "sc = SparkContext.getOrCreate()\n",
 57 |     "sc"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": 10,
 63 |    "id": "40f807e5-6ace-4c5c-aaf2-820426e73645",
 64 |    "metadata": {},
 65 |    "outputs": [
 66 |     {
 67 |      "name": "stdout",
 68 |      "output_type": "stream",
 69 |      "text": [
 70 |       "[1, 2, 3, 4, 5]\n"
 71 |      ]
 72 |     }
 73 |    ],
 74 |    "source": [
 75 |     "collect_rdd = sc.parallelize([1,2,3,4,5])\n",
 76 |     "print(collect_rdd.collect())"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 11,
 82 |    "id": "8bf68939-cba5-4e3e-a868-a8d0ae82b5a4",
 83 |    "metadata": {},
 84 |    "outputs": [
 85 |     {
 86 |      "name": "stdout",
 87 |      "output_type": "stream",
 88 |      "text": [
 89 |       "10\n"
 90 |      ]
 91 |     }
 92 |    ],
 93 |    "source": [
 94 |     "# Counting\n",
 95 |     "count_rdd = sc.parallelize([1,2,3,4,5,6,7,8,9,0])\n",
 96 |     "print(count_rdd.count())"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": 12,
102 |    "id": "9199423b-c9e4-4ade-90e3-506260cfceb7",
103 |    "metadata": {},
104 |    "outputs": [
105 |     {
106 |      "name": "stdout",
107 |      "output_type": "stream",
108 |      "text": [
109 |       "45\n"
110 |      ]
111 |     }
112 |    ],
113 |    "source": [
114 |     "# reduce - immediate operations\n",
115 |     "data_rdd = sc.parallelize([1,2,3,4,5,6,7,8,9,0])\n",
116 |     "result_rdd = data_rdd.reduce(lambda x,y: x + y)\n",
117 |     "print(result_rdd)\n"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": 13,
123 |    "id": "6bb70ade-fe11-4cd2-a092-4529af35f61f",
124 |    "metadata": {},
125 |    "outputs": [
126 |     {
127 |      "name": "stdout",
128 |      "output_type": "stream",
129 |      "text": [
130 |       "[('awesome', 1), ('Spark', 1), ('really', 2), ('is', 1)]\n"
131 |      ]
132 |     }
133 |    ],
134 |    "source": [
135 |     "# word counter\n",
136 |     "s = \"Spark is really really awesome!\"\n",
137 |     "rdd = sc.parallelize([s])\n",
138 |     "counts = rdd.flatMap(lambda line: line.split(\" \")) \\\n",
139 |     "                            .map(lambda line: line.replace(\"!\",\"\")) \\\n",
140 |     "                            .map(lambda word: (word, 1)) \\\n",
141 |     "                            .reduceByKey(lambda x, y: x + y) \\\n",
142 |     "                            .collect()\n",
143 |     "print(str(counts))"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "code",
148 |    "execution_count": 7,
149 |    "id": "c064273f-1229-4d81-a37e-8a9674c55eda",
150 |    "metadata": {},
151 |    "outputs": [
152 |     {
153 |      "data": {
154 |       "text/plain": [
155 |        "[('Spark', 1), ('really', 2), ('awesome!', 1), ('is', 1)]"
156 |       ]
157 |      },
158 |      "execution_count": 7,
159 |      "metadata": {},
160 |      "output_type": "execute_result"
161 |     }
162 |    ],
163 |    "source": [
164 |     "s = \"Spark is really really awesome!\"\n",
165 |     "sc.parallelize(s.split(\" \"))\\\n",
166 |     ".map(lambda x:(x, 1))\\\n",
167 |     ".reduceByKey(add).collect()"
168 |    ]
169 |   }
170 |  ],
171 |  "metadata": {
172 |   "kernelspec": {
173 |    "display_name": "Python 3 (ipykernel)",
174 |    "language": "python",
175 |    "name": "python3"
176 |   },
177 |   "language_info": {
178 |    "codemirror_mode": {
179 |     "name": "ipython",
180 |     "version": 3
181 |    },
182 |    "file_extension": ".py",
183 |    "mimetype": "text/x-python",
184 |    "name": "python",
185 |    "nbconvert_exporter": "python",
186 |    "pygments_lexer": "ipython3",
187 |    "version": "3.10.12"
188 |   }
189 |  },
190 |  "nbformat": 4,
191 |  "nbformat_minor": 5
192 | }
193 | 


--------------------------------------------------------------------------------
/random-examples/word_counter.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 10,
  6 |    "id": "b4be2271-1e6e-40f2-a2a2-e3dcbf68dbb8",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "import pyspark\n",
 11 |     "import pyspark.sql.functions as F\n",
 12 |     "from pyspark.sql import SparkSession\n",
 13 |     "from pyspark.sql.types import StructType, StructField, StringType, IntegerType, MapType, ArrayType\n",
 14 |     "from operator import add"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 11,
 20 |    "id": "3b83f99a-7763-487e-b4e3-645d9fe930a6",
 21 |    "metadata": {},
 22 |    "outputs": [],
 23 |    "source": [
 24 |     "spark = SparkSession.builder.master('local').appName('word_counter').getOrCreate()"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "markdown",
 29 |    "id": "b23bf8a9-e8e4-49cd-89fc-3ffd80d35fbd",
 30 |    "metadata": {},
 31 |    "source": [
 32 |     "### Dataframe way\n",
 33 |     "\n",
 34 |     " 1. Create the dataframe with one column and call it 'word'\n",
 35 |     " 2. Use F.split and get one column with all words separated in a list\n",
 36 |     " 3. Use F.explode to transform each item in the list to a row\n",
 37 |     " 4. Group by 'word' and aggregate using 'count' function\n",
 38 |     "\n",
 39 |     "    "
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": 12,
 45 |    "id": "2d1bde9c-05eb-4f94-a0f9-ce7cc421b085",
 46 |    "metadata": {},
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "# Input\n",
 50 |     "s = [\"Spark is totally totally awesome!\"]"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 13,
 56 |    "id": "de353bf4-1301-458e-a6be-212d4ae73d9d",
 57 |    "metadata": {},
 58 |    "outputs": [
 59 |     {
 60 |      "name": "stdout",
 61 |      "output_type": "stream",
 62 |      "text": [
 63 |       "+-------+-----------+\n",
 64 |       "|   word|count(word)|\n",
 65 |       "+-------+-----------+\n",
 66 |       "|totally|          2|\n",
 67 |       "|     is|          1|\n",
 68 |       "|  Spark|          1|\n",
 69 |       "|awesome|          1|\n",
 70 |       "+-------+-----------+\n",
 71 |       "\n"
 72 |      ]
 73 |     }
 74 |    ],
 75 |    "source": [
 76 |     "words_df = spark.createDataFrame([s], ['word'])\\\n",
 77 |     "            .withColumn('word', F.explode(F.split(F.col('word'), ' ')))\\\n",
 78 |     "            .groupBy('word').agg(F.count('word'))\\\n",
 79 |     "            .withColumn('word', F.regexp_replace(F.col('word'), r\"^(.*)[\\!@#\\$%&*\\(\\)_\\-\\+\\=]+(.*)$\", \"$1$2\"))\\\n",
 80 |     "            .show()"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "markdown",
 85 |    "id": "155fdb25-4711-4095-ac7f-b9c84099bcb6",
 86 |    "metadata": {},
 87 |    "source": [
 88 |     "### Dataframe + SQL\n",
 89 |     "\n",
 90 |     "1. Create a dataframe and a view from it\n",
 91 |     "2. Split and explode exactly as before\n",
 92 |     "3. Create a temporary view\n",
 93 |     "4. Count and group using SQL\n"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "code",
 98 |    "execution_count": 14,
 99 |    "id": "7ec2679c-1afd-47ac-99a8-70aa61baf43a",
100 |    "metadata": {},
101 |    "outputs": [],
102 |    "source": [
103 |     "# Create a data frame and a view\n",
104 |     "s = [\"Spark is really really awesome!\"]\n",
105 |     "lines_df = spark.createDataFrame([s], ['word'])\\\n",
106 |     "            .withColumn('word', F.explode(F.split(F.col('word'), ' ')))\n",
107 |     "lines_df.createOrReplaceTempView('lines')"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": 15,
113 |    "id": "9a292827-710b-4231-9eda-3434cac14ec5",
114 |    "metadata": {},
115 |    "outputs": [
116 |     {
117 |      "name": "stdout",
118 |      "output_type": "stream",
119 |      "text": [
120 |       "+--------+-----------+\n",
121 |       "|    word|count(word)|\n",
122 |       "+--------+-----------+\n",
123 |       "|      is|          1|\n",
124 |       "|  really|          2|\n",
125 |       "|   Spark|          1|\n",
126 |       "|awesome!|          1|\n",
127 |       "+--------+-----------+\n",
128 |       "\n"
129 |      ]
130 |     }
131 |    ],
132 |    "source": [
133 |     "# Select data from the view simply using Spark SQL\n",
134 |     "spark.sql(\"\"\"select word, count(word) from lines group by word\"\"\").show()"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "markdown",
139 |    "id": "cc5800a1-42bb-4bf9-9696-2dd830315319",
140 |    "metadata": {},
141 |    "source": [
142 |     "### RDD way"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "code",
147 |    "execution_count": 16,
148 |    "id": "bc918308-4d4a-436c-90ed-0483f0cc5b56",
149 |    "metadata": {},
150 |    "outputs": [
151 |     {
152 |      "data": {
153 |       "text/plain": [
154 |        "[('Spark', 1), ('is', 1), ('really', 2), ('awesome!', 1)]"
155 |       ]
156 |      },
157 |      "execution_count": 16,
158 |      "metadata": {},
159 |      "output_type": "execute_result"
160 |     }
161 |    ],
162 |    "source": [
163 |     "# If you like map/reduce crap, go ahead!\n",
164 |     "s = \"Spark is really really awesome!\"\n",
165 |     "spark.sparkContext\\\n",
166 |     " .parallelize(s.split()).map(lambda x:(x, 1))\\\n",
167 |     " .reduceByKey(add).collect()"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": null,
173 |    "id": "e82e9b12-7fbd-42d1-9492-be5cdef9dfdf",
174 |    "metadata": {},
175 |    "outputs": [],
176 |    "source": []
177 |   }
178 |  ],
179 |  "metadata": {
180 |   "kernelspec": {
181 |    "display_name": "Python 3 (ipykernel)",
182 |    "language": "python",
183 |    "name": "python3"
184 |   },
185 |   "language_info": {
186 |    "codemirror_mode": {
187 |     "name": "ipython",
188 |     "version": 3
189 |    },
190 |    "file_extension": ".py",
191 |    "mimetype": "text/x-python",
192 |    "name": "python",
193 |    "nbconvert_exporter": "python",
194 |    "pygments_lexer": "ipython3",
195 |    "version": "3.10.12"
196 |   }
197 |  },
198 |  "nbformat": 4,
199 |  "nbformat_minor": 5
200 | }
201 | 


--------------------------------------------------------------------------------
/dataframes/dataframe_from_nothing.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Dataframes from \"nothing\""
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "import pyspark\n",
 17 |     "from pyspark.sql import SparkSession\n",
 18 |     "from pyspark.sql.types import StructType, StructField, StringType"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 2,
 24 |    "metadata": {},
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "spark = SparkSession.builder.appName('basic').getOrCreate()"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "## Creating a empty dataframe"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "### Parallelize way"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 3,
 47 |    "metadata": {},
 48 |    "outputs": [
 49 |     {
 50 |      "name": "stdout",
 51 |      "output_type": "stream",
 52 |      "text": [
 53 |       "+--------+-------+-------+\n",
 54 |       "|      _1|     _2|     _3|\n",
 55 |       "+--------+-------+-------+\n",
 56 |       "|    This|is only|a test!|\n",
 57 |       "|And this|     is|    too|\n",
 58 |       "+--------+-------+-------+\n",
 59 |       "\n"
 60 |      ]
 61 |     }
 62 |    ],
 63 |    "source": [
 64 |     "# Define your data by a set of data. Each data is a Row!\n",
 65 |     "data = [(\"This\",\"is only\", \"a test!\"),(\"And this\",\"is\",\"too\")]\n",
 66 |     "\n",
 67 |     "# Paralellize data\n",
 68 |     "spark.sparkContext.parallelize(data).toDF().show()\n"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": 4,
 74 |    "metadata": {},
 75 |    "outputs": [
 76 |     {
 77 |      "name": "stdout",
 78 |      "output_type": "stream",
 79 |      "text": [
 80 |       "+--------+-------+-------+\n",
 81 |       "|    colA|   colB|   colC|\n",
 82 |       "+--------+-------+-------+\n",
 83 |       "|    This|is only|a test!|\n",
 84 |       "|And this|     is|    too|\n",
 85 |       "+--------+-------+-------+\n",
 86 |       "\n"
 87 |      ]
 88 |     }
 89 |    ],
 90 |    "source": [
 91 |     "# Defining column names\n",
 92 |     "columns = [\"colA\", \"colB\", \"colC\"]\n",
 93 |     "spark.sparkContext.parallelize(data).toDF(columns).show()"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "markdown",
 98 |    "metadata": {},
 99 |    "source": [
100 |     "### createDataFrame way"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": 5,
106 |    "metadata": {},
107 |    "outputs": [
108 |     {
109 |      "name": "stdout",
110 |      "output_type": "stream",
111 |      "text": [
112 |       "++\n",
113 |       "||\n",
114 |       "++\n",
115 |       "++\n",
116 |       "\n"
117 |      ]
118 |     }
119 |    ],
120 |    "source": [
121 |     "# First it's required unleast a empty schema\n",
122 |     "schema = StructType([])\n",
123 |     "\n",
124 |     "# Now, an empty and useless dataframe\n",
125 |     "spark.createDataFrame(spark.sparkContext.emptyRDD(),schema).show()"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": 11,
131 |    "metadata": {},
132 |    "outputs": [
133 |     {
134 |      "name": "stdout",
135 |      "output_type": "stream",
136 |      "text": [
137 |       "+--------+-------+-------+\n",
138 |       "|    colA|   colB|   colC|\n",
139 |       "+--------+-------+-------+\n",
140 |       "|    This|is only|a test!|\n",
141 |       "|And this|     is|    too|\n",
142 |       "+--------+-------+-------+\n",
143 |       "\n"
144 |      ]
145 |     }
146 |    ],
147 |    "source": [
148 |     "# Schema for previous data\n",
149 |     "schema = StructType([\n",
150 |     "            StructField(\"colA\",StringType(),False),\n",
151 |     "            StructField(\"colB\",StringType(),False),\n",
152 |     "            StructField(\"colC\",StringType(),False)\n",
153 |     "        ])\n",
154 |     "# Creating DataFrame\n",
155 |     "spark.createDataFrame(data,schema).show()\n"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": 12,
161 |    "metadata": {},
162 |    "outputs": [
163 |     {
164 |      "name": "stdout",
165 |      "output_type": "stream",
166 |      "text": [
167 |       "+--------+-------+------+\n",
168 |       "|    colA|   colB|  colC|\n",
169 |       "+--------+-------+------+\n",
170 |       "|    This|is only|a test|\n",
171 |       "|And this|     is|   too|\n",
172 |       "+--------+-------+------+\n",
173 |       "\n"
174 |      ]
175 |     }
176 |    ],
177 |    "source": [
178 |     "# Minimalist\n",
179 |     "df = spark.createDataFrame([[\"This\",\"is only\",\"a test\"],[\"And this\",\"is\",\"too\"]],schema)\n",
180 |     "df.show()"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "markdown",
185 |    "metadata": {},
186 |    "source": [
187 |     "## Dataframes from CSV"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": 13,
193 |    "metadata": {},
194 |    "outputs": [
195 |     {
196 |      "name": "stdout",
197 |      "output_type": "stream",
198 |      "text": [
199 |       "+--------+-------+------+\n",
200 |       "|    colA|   colB|  colC|\n",
201 |       "+--------+-------+------+\n",
202 |       "|    This|is only|a test|\n",
203 |       "|And this|     is|   too|\n",
204 |       "+--------+-------+------+\n",
205 |       "\n"
206 |      ]
207 |     }
208 |    ],
209 |    "source": [
210 |     "# Let's create a file first from the dataframe stored on 'df' var!\n",
211 |     "file='/users/hduser/spark-ref/dataframes/examples/example-001.csv'\n",
212 |     "df.write.csv(file, sep=',', header=True)\n",
213 |     "\n",
214 |     "\n",
215 |     "# Reading from CSV\n",
216 |     "spark.read.csv(file, \n",
217 |     "               sep=',',\n",
218 |     "               encoding='utf-8',\n",
219 |     "               header=True).show()"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": null,
225 |    "metadata": {},
226 |    "outputs": [],
227 |    "source": []
228 |   }
229 |  ],
230 |  "metadata": {
231 |   "kernelspec": {
232 |    "display_name": "Python 3 (ipykernel)",
233 |    "language": "python",
234 |    "name": "python3"
235 |   },
236 |   "language_info": {
237 |    "codemirror_mode": {
238 |     "name": "ipython",
239 |     "version": 3
240 |    },
241 |    "file_extension": ".py",
242 |    "mimetype": "text/x-python",
243 |    "name": "python",
244 |    "nbconvert_exporter": "python",
245 |    "pygments_lexer": "ipython3",
246 |    "version": "3.8.5"
247 |   }
248 |  },
249 |  "nbformat": 4,
250 |  "nbformat_minor": 2
251 | }
252 | 


--------------------------------------------------------------------------------
/dataframes/.ipynb_checkpoints/dataframe_from_nothing-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Dataframes from \"nothing\""
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "import pyspark\n",
 17 |     "from pyspark.sql import SparkSession\n",
 18 |     "from pyspark.sql.types import StructType, StructField, StringType"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 2,
 24 |    "metadata": {},
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "spark = SparkSession.builder.appName('basic').getOrCreate()"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "## Creating a empty dataframe"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "### Parallelize way"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 3,
 47 |    "metadata": {},
 48 |    "outputs": [
 49 |     {
 50 |      "name": "stdout",
 51 |      "output_type": "stream",
 52 |      "text": [
 53 |       "+--------+-------+-------+\n",
 54 |       "|      _1|     _2|     _3|\n",
 55 |       "+--------+-------+-------+\n",
 56 |       "|    This|is only|a test!|\n",
 57 |       "|And this|     is|    too|\n",
 58 |       "+--------+-------+-------+\n",
 59 |       "\n"
 60 |      ]
 61 |     }
 62 |    ],
 63 |    "source": [
 64 |     "# Define your data by a set of data. Each data is a Row!\n",
 65 |     "data = [(\"This\",\"is only\", \"a test!\"),(\"And this\",\"is\",\"too\")]\n",
 66 |     "\n",
 67 |     "# Paralellize data\n",
 68 |     "spark.sparkContext.parallelize(data).toDF().show()\n"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": 4,
 74 |    "metadata": {},
 75 |    "outputs": [
 76 |     {
 77 |      "name": "stdout",
 78 |      "output_type": "stream",
 79 |      "text": [
 80 |       "+--------+-------+-------+\n",
 81 |       "|    colA|   colB|   colC|\n",
 82 |       "+--------+-------+-------+\n",
 83 |       "|    This|is only|a test!|\n",
 84 |       "|And this|     is|    too|\n",
 85 |       "+--------+-------+-------+\n",
 86 |       "\n"
 87 |      ]
 88 |     }
 89 |    ],
 90 |    "source": [
 91 |     "# Defining column names\n",
 92 |     "columns = [\"colA\", \"colB\", \"colC\"]\n",
 93 |     "spark.sparkContext.parallelize(data).toDF(columns).show()"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "markdown",
 98 |    "metadata": {},
 99 |    "source": [
100 |     "### createDataFrame way"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": 5,
106 |    "metadata": {},
107 |    "outputs": [
108 |     {
109 |      "name": "stdout",
110 |      "output_type": "stream",
111 |      "text": [
112 |       "++\n",
113 |       "||\n",
114 |       "++\n",
115 |       "++\n",
116 |       "\n"
117 |      ]
118 |     }
119 |    ],
120 |    "source": [
121 |     "# First it's required unleast a empty schema\n",
122 |     "schema = StructType([])\n",
123 |     "\n",
124 |     "# Now, an empty and useless dataframe\n",
125 |     "spark.createDataFrame(spark.sparkContext.emptyRDD(),schema).show()"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": 11,
131 |    "metadata": {},
132 |    "outputs": [
133 |     {
134 |      "name": "stdout",
135 |      "output_type": "stream",
136 |      "text": [
137 |       "+--------+-------+-------+\n",
138 |       "|    colA|   colB|   colC|\n",
139 |       "+--------+-------+-------+\n",
140 |       "|    This|is only|a test!|\n",
141 |       "|And this|     is|    too|\n",
142 |       "+--------+-------+-------+\n",
143 |       "\n"
144 |      ]
145 |     }
146 |    ],
147 |    "source": [
148 |     "# Schema for previous data\n",
149 |     "schema = StructType([\n",
150 |     "            StructField(\"colA\",StringType(),False),\n",
151 |     "            StructField(\"colB\",StringType(),False),\n",
152 |     "            StructField(\"colC\",StringType(),False)\n",
153 |     "        ])\n",
154 |     "# Creating DataFrame\n",
155 |     "spark.createDataFrame(data,schema).show()\n"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": 12,
161 |    "metadata": {},
162 |    "outputs": [
163 |     {
164 |      "name": "stdout",
165 |      "output_type": "stream",
166 |      "text": [
167 |       "+--------+-------+------+\n",
168 |       "|    colA|   colB|  colC|\n",
169 |       "+--------+-------+------+\n",
170 |       "|    This|is only|a test|\n",
171 |       "|And this|     is|   too|\n",
172 |       "+--------+-------+------+\n",
173 |       "\n"
174 |      ]
175 |     }
176 |    ],
177 |    "source": [
178 |     "# Minimalist\n",
179 |     "df = spark.createDataFrame([[\"This\",\"is only\",\"a test\"],[\"And this\",\"is\",\"too\"]],schema)\n",
180 |     "df.show()"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "markdown",
185 |    "metadata": {},
186 |    "source": [
187 |     "## Dataframes from CSV"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": 13,
193 |    "metadata": {},
194 |    "outputs": [
195 |     {
196 |      "name": "stdout",
197 |      "output_type": "stream",
198 |      "text": [
199 |       "+--------+-------+------+\n",
200 |       "|    colA|   colB|  colC|\n",
201 |       "+--------+-------+------+\n",
202 |       "|    This|is only|a test|\n",
203 |       "|And this|     is|   too|\n",
204 |       "+--------+-------+------+\n",
205 |       "\n"
206 |      ]
207 |     }
208 |    ],
209 |    "source": [
210 |     "# Let's create a file first from the dataframe stored on 'df' var!\n",
211 |     "file='/users/hduser/spark-ref/dataframes/examples/example-001.csv'\n",
212 |     "df.write.csv(file, sep=',', header=True)\n",
213 |     "\n",
214 |     "\n",
215 |     "# Reading from CSV\n",
216 |     "spark.read.csv(file, \n",
217 |     "               sep=',',\n",
218 |     "               encoding='utf-8',\n",
219 |     "               header=True).show()"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": null,
225 |    "metadata": {},
226 |    "outputs": [],
227 |    "source": []
228 |   }
229 |  ],
230 |  "metadata": {
231 |   "kernelspec": {
232 |    "display_name": "Python 3 (ipykernel)",
233 |    "language": "python",
234 |    "name": "python3"
235 |   },
236 |   "language_info": {
237 |    "codemirror_mode": {
238 |     "name": "ipython",
239 |     "version": 3
240 |    },
241 |    "file_extension": ".py",
242 |    "mimetype": "text/x-python",
243 |    "name": "python",
244 |    "nbconvert_exporter": "python",
245 |    "pygments_lexer": "ipython3",
246 |    "version": "3.8.5"
247 |   }
248 |  },
249 |  "nbformat": 4,
250 |  "nbformat_minor": 2
251 | }
252 | 


--------------------------------------------------------------------------------
/dataframes/not_exits_and_exists_equivalent-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## \"NOT EXISTS\" AND \"EXISTS\" equivalent operations on dataframes\n"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "import pyspark.sql.functions as F\n",
 17 |     "from pyspark.sql.types import *\n",
 18 |     "from pyspark.sql import SparkSession"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 2,
 24 |    "metadata": {},
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "spark = SparkSession.builder.appName('test_dataframes').enableHiveSupport().getOrCreate()"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "## Generating data"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": 35,
 40 |    "metadata": {},
 41 |    "outputs": [
 42 |     {
 43 |      "name": "stdout",
 44 |      "output_type": "stream",
 45 |      "text": [
 46 |       "+---+-----+\n",
 47 |       "| id| name|\n",
 48 |       "+---+-----+\n",
 49 |       "|  1|Andre|\n",
 50 |       "|  2| Rose|\n",
 51 |       "+---+-----+\n",
 52 |       "\n",
 53 |       "+---+------+\n",
 54 |       "| id|  name|\n",
 55 |       "+---+------+\n",
 56 |       "|  1| Andre|\n",
 57 |       "|  2|  Rose|\n",
 58 |       "|  3|Daniel|\n",
 59 |       "+---+------+\n",
 60 |       "\n",
 61 |       "+---+------+\n",
 62 |       "| id|  name|\n",
 63 |       "+---+------+\n",
 64 |       "|  1| Andre|\n",
 65 |       "|  2|  Rose|\n",
 66 |       "|  3|Daniel|\n",
 67 |       "|  3|Daniel|\n",
 68 |       "|  4| Anita|\n",
 69 |       "+---+------+\n",
 70 |       "\n"
 71 |      ]
 72 |     }
 73 |    ],
 74 |    "source": [
 75 |     "data1 = [(1,\"Andre\"),(2,\"Rose\")]\n",
 76 |     "data2 = [(1,\"Andre\"),(2,\"Rose\"),(3,\"Daniel\")]\n",
 77 |     "data3 = [(1,\"Andre\"),(2,\"Rose\"),(3,\"Daniel\"), (3,\"Daniel\"), (4,\"Anita\")]\n",
 78 |     "schema = StructType([\n",
 79 |     "    StructField(\"id\",StringType(),True),\n",
 80 |     "    StructField(\"name\",StringType(),True),\n",
 81 |     "])\n",
 82 |     "\n",
 83 |     "df1 = spark.createDataFrame(data1,schema)\n",
 84 |     "df2 = spark.createDataFrame(data2,schema)\n",
 85 |     "df3 = spark.createDataFrame(data3,schema)\n",
 86 |     "df1.show()\n",
 87 |     "df2.show()\n",
 88 |     "df3.show()"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "markdown",
 93 |    "metadata": {},
 94 |    "source": [
 95 |     "## NOT EXISTS EQUIVALENT"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "markdown",
100 |    "metadata": {},
101 |    "source": [
102 |     "### Method 1 - subtract"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": 36,
108 |    "metadata": {},
109 |    "outputs": [
110 |     {
111 |      "name": "stdout",
112 |      "output_type": "stream",
113 |      "text": [
114 |       "+---+------+\n",
115 |       "| id|  name|\n",
116 |       "+---+------+\n",
117 |       "|  3|Daniel|\n",
118 |       "+---+------+\n",
119 |       "\n"
120 |      ]
121 |     }
122 |    ],
123 |    "source": [
124 |     "# All that exists in df2 but doesn't dexists in df1\n",
125 |     "dfr = df2.subtract(df1)\n",
126 |     "dfr.show()"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": 47,
132 |    "metadata": {},
133 |    "outputs": [
134 |     {
135 |      "name": "stdout",
136 |      "output_type": "stream",
137 |      "text": [
138 |       "+---+------+\n",
139 |       "| id|  name|\n",
140 |       "+---+------+\n",
141 |       "|  3|Daniel|\n",
142 |       "|  4| Anita|\n",
143 |       "+---+------+\n",
144 |       "\n"
145 |      ]
146 |     }
147 |    ],
148 |    "source": [
149 |     "# All that exists in df3 but doesn't dexists in df1\n",
150 |     "dfr2 = df3.subtract(df1)\n",
151 |     "dfr2.show()"
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "markdown",
156 |    "metadata": {},
157 |    "source": [
158 |     "### Method 2 - left_anti. This is the 'classical' way to have something equivalent to 'NOT EXISTS'"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "code",
163 |    "execution_count": 37,
164 |    "metadata": {},
165 |    "outputs": [
166 |     {
167 |      "name": "stdout",
168 |      "output_type": "stream",
169 |      "text": [
170 |       "+---+------+\n",
171 |       "| id|  name|\n",
172 |       "+---+------+\n",
173 |       "|  3|Daniel|\n",
174 |       "+---+------+\n",
175 |       "\n"
176 |      ]
177 |     }
178 |    ],
179 |    "source": [
180 |     "# All that exists in df2 but doesn't dexists in df1\n",
181 |     "dfr = df2.join(df1,'id','left_anti')\n",
182 |     "dfr.show()"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": 54,
188 |    "metadata": {},
189 |    "outputs": [
190 |     {
191 |      "name": "stdout",
192 |      "output_type": "stream",
193 |      "text": [
194 |       "+---+------+\n",
195 |       "| id|  name|\n",
196 |       "+---+------+\n",
197 |       "|  3|Daniel|\n",
198 |       "|  4| Anita|\n",
199 |       "+---+------+\n",
200 |       "\n"
201 |      ]
202 |     }
203 |    ],
204 |    "source": [
205 |     "# All that exists in df3 but doesn't dexists in df1\n",
206 |     "dfr2 = df3.join(df1,'id','left_anti').dropDuplicates()\n",
207 |     "dfr2.show()"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "markdown",
212 |    "metadata": {},
213 |    "source": [
214 |     "### Method 3 - exceptAll - same thing as 'subtract', apearently!"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": 51,
220 |    "metadata": {},
221 |    "outputs": [
222 |     {
223 |      "name": "stdout",
224 |      "output_type": "stream",
225 |      "text": [
226 |       "+---+------+\n",
227 |       "| id|  name|\n",
228 |       "+---+------+\n",
229 |       "|  3|Daniel|\n",
230 |       "+---+------+\n",
231 |       "\n"
232 |      ]
233 |     }
234 |    ],
235 |    "source": [
236 |     "# All that exists in df2 but doesn't dexists in df1\n",
237 |     "dfr = df2.exceptAll(df1)\n",
238 |     "dfr.show()"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "code",
243 |    "execution_count": 55,
244 |    "metadata": {},
245 |    "outputs": [
246 |     {
247 |      "name": "stdout",
248 |      "output_type": "stream",
249 |      "text": [
250 |       "+---+------+\n",
251 |       "| id|  name|\n",
252 |       "+---+------+\n",
253 |       "|  3|Daniel|\n",
254 |       "|  4| Anita|\n",
255 |       "+---+------+\n",
256 |       "\n"
257 |      ]
258 |     }
259 |    ],
260 |    "source": [
261 |     "# All that exists in df3 but doesn't dexists in df1\n",
262 |     "dfr2 = df3.exceptAll(df1).dropDuplicates()\n",
263 |     "dfr2.show()"
264 |    ]
265 |   },
266 |   {
267 |    "cell_type": "markdown",
268 |    "metadata": {},
269 |    "source": [
270 |     "## EXISTS EQUIVALENT"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "markdown",
275 |    "metadata": {},
276 |    "source": [
277 |     "### Method 1 - left_semi"
278 |    ]
279 |   },
280 |   {
281 |    "cell_type": "code",
282 |    "execution_count": 40,
283 |    "metadata": {},
284 |    "outputs": [
285 |     {
286 |      "name": "stdout",
287 |      "output_type": "stream",
288 |      "text": [
289 |       "+---+-----+\n",
290 |       "| id| name|\n",
291 |       "+---+-----+\n",
292 |       "|  1|Andre|\n",
293 |       "|  2| Rose|\n",
294 |       "+---+-----+\n",
295 |       "\n"
296 |      ]
297 |     }
298 |    ],
299 |    "source": [
300 |     "# Only that exists in df2 and df1\n",
301 |     "dfr = df2.join(df1,'id','left_semi')\n",
302 |     "dfr.show()"
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "code",
307 |    "execution_count": 56,
308 |    "metadata": {},
309 |    "outputs": [
310 |     {
311 |      "name": "stdout",
312 |      "output_type": "stream",
313 |      "text": [
314 |       "+---+-----+\n",
315 |       "| id| name|\n",
316 |       "+---+-----+\n",
317 |       "|  1|Andre|\n",
318 |       "|  2| Rose|\n",
319 |       "+---+-----+\n",
320 |       "\n"
321 |      ]
322 |     }
323 |    ],
324 |    "source": [
325 |     "# Only that exists in df3 and df1\n",
326 |     "dfr = df3.join(df1,'id','left_semi')\n",
327 |     "dfr.show()"
328 |    ]
329 |   },
330 |   {
331 |    "cell_type": "code",
332 |    "execution_count": null,
333 |    "metadata": {},
334 |    "outputs": [],
335 |    "source": []
336 |   }
337 |  ],
338 |  "metadata": {
339 |   "kernelspec": {
340 |    "display_name": "Python 3",
341 |    "language": "python",
342 |    "name": "python3"
343 |   },
344 |   "language_info": {
345 |    "codemirror_mode": {
346 |     "name": "ipython",
347 |     "version": 3
348 |    },
349 |    "file_extension": ".py",
350 |    "mimetype": "text/x-python",
351 |    "name": "python",
352 |    "nbconvert_exporter": "python",
353 |    "pygments_lexer": "ipython3",
354 |    "version": "3.8.5"
355 |   }
356 |  },
357 |  "nbformat": 4,
358 |  "nbformat_minor": 4
359 | }
360 | 


--------------------------------------------------------------------------------
/sql/not_exits_and_exists_equivalent.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Spark SQL: \"NOT EXISTS\" AND \"EXISTS\" equivalent operations on dataframes\n",
  8 |     "\n"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": 28,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "import findspark\n",
 18 |     "findspark.init()\n",
 19 |     "import pyspark.sql.functions as F\n",
 20 |     "from pyspark.sql.types import *\n",
 21 |     "from pyspark.sql import SparkSession"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 29,
 27 |    "metadata": {},
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "spark = SparkSession.builder.appName('test_dataframes').enableHiveSupport().getOrCreate()"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "markdown",
 35 |    "metadata": {},
 36 |    "source": [
 37 |     "## Generating data"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 30,
 43 |    "metadata": {},
 44 |    "outputs": [
 45 |     {
 46 |      "name": "stdout",
 47 |      "output_type": "stream",
 48 |      "text": [
 49 |       "+---+-----+\n",
 50 |       "| id| name|\n",
 51 |       "+---+-----+\n",
 52 |       "|  1|Andre|\n",
 53 |       "|  2| Rose|\n",
 54 |       "+---+-----+\n",
 55 |       "\n",
 56 |       "+---+------+\n",
 57 |       "| id|  name|\n",
 58 |       "+---+------+\n",
 59 |       "|  1| Andre|\n",
 60 |       "|  2|  Rose|\n",
 61 |       "|  3|Daniel|\n",
 62 |       "+---+------+\n",
 63 |       "\n",
 64 |       "+---+------+\n",
 65 |       "| id|  name|\n",
 66 |       "+---+------+\n",
 67 |       "|  1| Andre|\n",
 68 |       "|  2|  Rose|\n",
 69 |       "|  3|Daniel|\n",
 70 |       "|  3|Daniel|\n",
 71 |       "|  4| Anita|\n",
 72 |       "+---+------+\n",
 73 |       "\n"
 74 |      ]
 75 |     }
 76 |    ],
 77 |    "source": [
 78 |     "data1 = [(1,\"Andre\"),(2,\"Rose\")]\n",
 79 |     "data2 = [(1,\"Andre\"),(2,\"Rose\"),(3,\"Daniel\")]\n",
 80 |     "data3 = [(1,\"Andre\"),(2,\"Rose\"),(3,\"Daniel\"), (3,\"Daniel\"), (4,\"Anita\")]\n",
 81 |     "schema = StructType([\n",
 82 |     "    StructField(\"id\",StringType(),True),\n",
 83 |     "    StructField(\"name\",StringType(),True),\n",
 84 |     "])\n",
 85 |     "\n",
 86 |     "df1 = spark.createDataFrame(data1,schema)\n",
 87 |     "df2 = spark.createDataFrame(data2,schema)\n",
 88 |     "df3 = spark.createDataFrame(data3,schema)\n",
 89 |     "df1.show()\n",
 90 |     "df2.show()\n",
 91 |     "df3.show()"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "markdown",
 96 |    "metadata": {},
 97 |    "source": [
 98 |     "## NOT EXISTS EQUIVALENT"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "markdown",
103 |    "metadata": {},
104 |    "source": [
105 |     "### Method 1 - subtract"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": 31,
111 |    "metadata": {},
112 |    "outputs": [
113 |     {
114 |      "name": "stdout",
115 |      "output_type": "stream",
116 |      "text": [
117 |       "+---+------+\n",
118 |       "| id|  name|\n",
119 |       "+---+------+\n",
120 |       "|  3|Daniel|\n",
121 |       "+---+------+\n",
122 |       "\n"
123 |      ]
124 |     }
125 |    ],
126 |    "source": [
127 |     "# All that exists in df2 but doesn't dexists in df1\n",
128 |     "dfr = df2.subtract(df1)\n",
129 |     "dfr.show()"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": 32,
135 |    "metadata": {},
136 |    "outputs": [
137 |     {
138 |      "name": "stderr",
139 |      "output_type": "stream",
140 |      "text": [
141 |       "\r",
142 |       "[Stage 338:=================================================>   (187 + 8) / 200]\r",
143 |       "\r",
144 |       "                                                                                \r"
145 |      ]
146 |     },
147 |     {
148 |      "name": "stdout",
149 |      "output_type": "stream",
150 |      "text": [
151 |       "+---+------+\n",
152 |       "| id|  name|\n",
153 |       "+---+------+\n",
154 |       "|  3|Daniel|\n",
155 |       "|  4| Anita|\n",
156 |       "+---+------+\n",
157 |       "\n"
158 |      ]
159 |     }
160 |    ],
161 |    "source": [
162 |     "# All that exists in df3 but doesn't dexists in df1\n",
163 |     "dfr2 = df3.subtract(df1)\n",
164 |     "dfr2.show()"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "markdown",
169 |    "metadata": {},
170 |    "source": [
171 |     "### Method 2 - left_anti. This is the 'classical' way to have something equivalent to 'NOT EXISTS'"
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "code",
176 |    "execution_count": 33,
177 |    "metadata": {},
178 |    "outputs": [
179 |     {
180 |      "name": "stdout",
181 |      "output_type": "stream",
182 |      "text": [
183 |       "+---+------+\n",
184 |       "| id|  name|\n",
185 |       "+---+------+\n",
186 |       "|  3|Daniel|\n",
187 |       "+---+------+\n",
188 |       "\n"
189 |      ]
190 |     }
191 |    ],
192 |    "source": [
193 |     "# All that exists in df2 but doesn't dexists in df1\n",
194 |     "dfr = df2.join(df1,'id','left_anti')\n",
195 |     "dfr.show()"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "code",
200 |    "execution_count": 34,
201 |    "metadata": {},
202 |    "outputs": [
203 |     {
204 |      "name": "stdout",
205 |      "output_type": "stream",
206 |      "text": [
207 |       "+---+------+\n",
208 |       "| id|  name|\n",
209 |       "+---+------+\n",
210 |       "|  3|Daniel|\n",
211 |       "|  4| Anita|\n",
212 |       "+---+------+\n",
213 |       "\n"
214 |      ]
215 |     }
216 |    ],
217 |    "source": [
218 |     "# All that exists in df3 but doesn't dexists in df1\n",
219 |     "dfr2 = df3.join(df1,'id','left_anti').dropDuplicates()\n",
220 |     "dfr2.show()"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "markdown",
225 |    "metadata": {},
226 |    "source": [
227 |     "### Method 3 - exceptAll - same thing as 'subtract', apearently!"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "code",
232 |    "execution_count": 35,
233 |    "metadata": {},
234 |    "outputs": [
235 |     {
236 |      "name": "stdout",
237 |      "output_type": "stream",
238 |      "text": [
239 |       "+---+------+\n",
240 |       "| id|  name|\n",
241 |       "+---+------+\n",
242 |       "|  3|Daniel|\n",
243 |       "+---+------+\n",
244 |       "\n"
245 |      ]
246 |     }
247 |    ],
248 |    "source": [
249 |     "# All that exists in df2 but doesn't dexists in df1\n",
250 |     "dfr = df2.exceptAll(df1)\n",
251 |     "dfr.show()"
252 |    ]
253 |   },
254 |   {
255 |    "cell_type": "code",
256 |    "execution_count": 36,
257 |    "metadata": {},
258 |    "outputs": [
259 |     {
260 |      "name": "stdout",
261 |      "output_type": "stream",
262 |      "text": [
263 |       "+---+------+\n",
264 |       "| id|  name|\n",
265 |       "+---+------+\n",
266 |       "|  3|Daniel|\n",
267 |       "|  4| Anita|\n",
268 |       "+---+------+\n",
269 |       "\n"
270 |      ]
271 |     }
272 |    ],
273 |    "source": [
274 |     "# All that exists in df3 but doesn't dexists in df1\n",
275 |     "dfr2 = df3.exceptAll(df1).dropDuplicates()\n",
276 |     "dfr2.show()"
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "markdown",
281 |    "metadata": {},
282 |    "source": [
283 |     "## EXISTS EQUIVALENT"
284 |    ]
285 |   },
286 |   {
287 |    "cell_type": "markdown",
288 |    "metadata": {},
289 |    "source": [
290 |     "### Method 1 - left_semi"
291 |    ]
292 |   },
293 |   {
294 |    "cell_type": "code",
295 |    "execution_count": 15,
296 |    "metadata": {},
297 |    "outputs": [
298 |     {
299 |      "name": "stdout",
300 |      "output_type": "stream",
301 |      "text": [
302 |       "+---+-----+\n",
303 |       "| id| name|\n",
304 |       "+---+-----+\n",
305 |       "|  1|Andre|\n",
306 |       "|  2| Rose|\n",
307 |       "+---+-----+\n",
308 |       "\n"
309 |      ]
310 |     }
311 |    ],
312 |    "source": [
313 |     "# Only that exists in df2 and df1\n",
314 |     "dfr = df2.join(df1,'id','left_semi')\n",
315 |     "dfr.show()"
316 |    ]
317 |   },
318 |   {
319 |    "cell_type": "code",
320 |    "execution_count": 16,
321 |    "metadata": {},
322 |    "outputs": [
323 |     {
324 |      "name": "stdout",
325 |      "output_type": "stream",
326 |      "text": [
327 |       "+---+-----+\n",
328 |       "| id| name|\n",
329 |       "+---+-----+\n",
330 |       "|  1|Andre|\n",
331 |       "|  2| Rose|\n",
332 |       "+---+-----+\n",
333 |       "\n"
334 |      ]
335 |     }
336 |    ],
337 |    "source": [
338 |     "# Only that exists in df3 and df1\n",
339 |     "dfr = df3.join(df1,'id','left_semi')\n",
340 |     "dfr.show()"
341 |    ]
342 |   },
343 |   {
344 |    "cell_type": "code",
345 |    "execution_count": null,
346 |    "metadata": {},
347 |    "outputs": [],
348 |    "source": []
349 |   }
350 |  ],
351 |  "metadata": {
352 |   "kernelspec": {
353 |    "display_name": "Python 3 (ipykernel)",
354 |    "language": "python",
355 |    "name": "python3"
356 |   },
357 |   "language_info": {
358 |    "codemirror_mode": {
359 |     "name": "ipython",
360 |     "version": 3
361 |    },
362 |    "file_extension": ".py",
363 |    "mimetype": "text/x-python",
364 |    "name": "python",
365 |    "nbconvert_exporter": "python",
366 |    "pygments_lexer": "ipython3",
367 |    "version": "3.10.12"
368 |   }
369 |  },
370 |  "nbformat": 4,
371 |  "nbformat_minor": 4
372 | }
373 | 


--------------------------------------------------------------------------------
/dataframes/dataframe_from_json.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "6524d53e-b3d6-41f2-9d1e-c39f98d52e1d",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Dataframe from JSON\n"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": 19,
 14 |    "id": "460e7835-3700-4d4a-a0aa-212e268c34c2",
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import pyspark\n",
 19 |     "import pyspark.sql.functions as F\n",
 20 |     "import pandas as pd\n",
 21 |     "import re\n",
 22 |     "from pyspark import SparkConf, SparkContext\n",
 23 |     "from pyspark.sql import SparkSession, Row\n",
 24 |     "from pyspark.sql.types import StructType, StructField, StringType, IntegerType, MapType, ArrayType\n",
 25 |     "from pyspark.sql.functions import udf\n"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 20,
 31 |    "id": "2e33eed2-c8f7-46f8-9943-cde7d53bc32a",
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "# Getting session from Spark\n",
 36 |     "spark = SparkSession.builder \\\n",
 37 |     ".appName('test').master('local[*]') \\\n",
 38 |     ".config(\"spark.cores.max\", \"2\") \\\n",
 39 |     ".config(\"spark.executor.memory\", \"2g\") \\\n",
 40 |     ".config(\"spark.sql.execution.arrow.pyspark.enabled\", \"true\") \\\n",
 41 |     ".config(\"spark.shuffle.service.enabled\", \"false\") \\\n",
 42 |     ".config(\"spark.dynamicAllocation.enabled\", \"true\") \\\n",
 43 |     ".getOrCreate()\n",
 44 |     "\n",
 45 |     "\n"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "markdown",
 50 |    "id": "f927ff89-14fd-4527-837c-29c8d3e2892e",
 51 |    "metadata": {},
 52 |    "source": [
 53 |     "### A proper JSON format in file dataframes/data/test.js\n",
 54 |     "\n",
 55 |     "```json\n",
 56 |     "[\n",
 57 |     "    {\n",
 58 |     "        \"name\": \"Andre\",\n",
 59 |     "        \"id\": 1,\n",
 60 |     "        \"doc_list\":[{\"docid\":\"DOC001\", \"name\":\"bla001.txt\"}, {\"docid\":\"DOC002\", \"name\":\"bla002.txt\"}],\n",
 61 |     "    },\n",
 62 |     "\n",
 63 |     "    {\n",
 64 |     "        \"name\": \"Noé\",\n",
 65 |     "        \"id\": 2,\n",
 66 |     "        \"doc_list\":[{\"docid\":\"DOC003\", \"name\":\"bla003.txt\"}, {\"docid\":\"DOC004\", \"name\":\"bla004.txt\"}],\n",
 67 |     "    }\n",
 68 |     "]\n",
 69 |     "\n",
 70 |     "\n",
 71 |     "```\n",
 72 |     "\n"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "markdown",
 77 |    "id": "fcfe3e08-812a-49ec-9679-0716780b12f5",
 78 |    "metadata": {},
 79 |    "source": [
 80 |     "### Example problem\n",
 81 |     "\n",
 82 |     "Based on JSON structure in the file 'test.js'(shown above), extract the doc file names associated to the people's names as following: \n",
 83 |     "\n",
 84 |     "```text\n",
 85 |     "+-----+----------------------+\n",
 86 |     "|name |doc_names             |\n",
 87 |     "+-----+----------------------+\n",
 88 |     "|Andre|bla001.txt, bla002.txt|\n",
 89 |     "|Noé  |bla003.txt, bla004.txt|\n",
 90 |     "+-----+----------------------+\n",
 91 |     "```\n",
 92 |     "\n",
 93 |     "Note that, 'docid' is not desirable.\n",
 94 |     "\n"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": 4,
100 |    "id": "c287da29-db31-4a48-832c-c161072a2a46",
101 |    "metadata": {},
102 |    "outputs": [
103 |     {
104 |      "name": "stdout",
105 |      "output_type": "stream",
106 |      "text": [
107 |       "+-----+------------------------------------------------------------------------------+\n",
108 |       "|name |doc_list                                                                      |\n",
109 |       "+-----+------------------------------------------------------------------------------+\n",
110 |       "|Andre|[{docid -> DOC001, name -> bla001.txt}, {docid -> DOC002, name -> bla002.txt}]|\n",
111 |       "|Noé  |[{docid -> DOC003, name -> bla003.txt}, {docid -> DOC004, name -> bla004.txt}]|\n",
112 |       "+-----+------------------------------------------------------------------------------+\n",
113 |       "\n"
114 |      ]
115 |     },
116 |     {
117 |      "name": "stderr",
118 |      "output_type": "stream",
119 |      "text": [
120 |       "                                                                                "
121 |      ]
122 |     }
123 |    ],
124 |    "source": [
125 |     "# Defining schema\n",
126 |     "schema = StructType([\n",
127 |     "    StructField(\"name\", StringType(), True),\n",
128 |     "    StructField(\"doc_list\", ArrayType(MapType(StringType(),StringType(),True),True), True),\n",
129 |     "])\n",
130 |     "# Reading JSON file using Dataframe API setting 'multiline' option as true\n",
131 |     "sdf = spark.read.option(\"multiline\", \"true\").json('data/test.json', schema=schema)\n",
132 |     "sdf.show(truncate=False)"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "markdown",
137 |    "id": "11cc02e6-f597-4aff-bcb2-588fb94d2426",
138 |    "metadata": {},
139 |    "source": [
140 |     "### Transformations steps\n",
141 |     "\n",
142 |     "1. Transform the list in 'doc_list' column into various rows using the `explode` pyspark function;\n",
143 |     "2. For each of those rows, extract 'name' from the data structure using `getItem` function and drop the original column 'doc_ex';\n",
144 |     "3. Transform doc names into a list againd group rows by 'name' column and using the `collect_list` as the aggregate function;\n",
145 |     "4. Transform the list into a string separated by ',' using `concat_ws`\n"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "code",
150 |    "execution_count": 5,
151 |    "id": "bf90af12-0f3c-4823-a576-359f66f2dc5d",
152 |    "metadata": {},
153 |    "outputs": [
154 |     {
155 |      "name": "stdout",
156 |      "output_type": "stream",
157 |      "text": [
158 |       "Step 1\n",
159 |       "+-----+------------------------------------------------------------------------------+-------------------------------------+\n",
160 |       "|name |doc_list                                                                      |doc_ex                               |\n",
161 |       "+-----+------------------------------------------------------------------------------+-------------------------------------+\n",
162 |       "|Andre|[{docid -> DOC001, name -> bla001.txt}, {docid -> DOC002, name -> bla002.txt}]|{docid -> DOC001, name -> bla001.txt}|\n",
163 |       "|Andre|[{docid -> DOC001, name -> bla001.txt}, {docid -> DOC002, name -> bla002.txt}]|{docid -> DOC002, name -> bla002.txt}|\n",
164 |       "|Noé  |[{docid -> DOC003, name -> bla003.txt}, {docid -> DOC004, name -> bla004.txt}]|{docid -> DOC003, name -> bla003.txt}|\n",
165 |       "|Noé  |[{docid -> DOC003, name -> bla003.txt}, {docid -> DOC004, name -> bla004.txt}]|{docid -> DOC004, name -> bla004.txt}|\n",
166 |       "+-----+------------------------------------------------------------------------------+-------------------------------------+\n",
167 |       "\n",
168 |       "Step 2\n",
169 |       "+-----+------------------------------------------------------------------------------+----------+\n",
170 |       "|name |doc_list                                                                      |doc_name  |\n",
171 |       "+-----+------------------------------------------------------------------------------+----------+\n",
172 |       "|Andre|[{docid -> DOC001, name -> bla001.txt}, {docid -> DOC002, name -> bla002.txt}]|bla001.txt|\n",
173 |       "|Andre|[{docid -> DOC001, name -> bla001.txt}, {docid -> DOC002, name -> bla002.txt}]|bla002.txt|\n",
174 |       "|Noé  |[{docid -> DOC003, name -> bla003.txt}, {docid -> DOC004, name -> bla004.txt}]|bla003.txt|\n",
175 |       "|Noé  |[{docid -> DOC003, name -> bla003.txt}, {docid -> DOC004, name -> bla004.txt}]|bla004.txt|\n",
176 |       "+-----+------------------------------------------------------------------------------+----------+\n",
177 |       "\n",
178 |       "Step 3\n",
179 |       "+-----+------------------------+\n",
180 |       "|name |doc_list                |\n",
181 |       "+-----+------------------------+\n",
182 |       "|Noé  |[bla003.txt, bla004.txt]|\n",
183 |       "|Andre|[bla001.txt, bla002.txt]|\n",
184 |       "+-----+------------------------+\n",
185 |       "\n",
186 |       "Step 4\n",
187 |       "+-----+---------------------+\n",
188 |       "|name |doc_names            |\n",
189 |       "+-----+---------------------+\n",
190 |       "|Noé  |bla003.txt,bla004.txt|\n",
191 |       "|Andre|bla001.txt,bla002.txt|\n",
192 |       "+-----+---------------------+\n",
193 |       "\n"
194 |      ]
195 |     }
196 |    ],
197 |    "source": [
198 |     "# Step 1\n",
199 |     "adf = sdf.withColumn(\"doc_ex\", F.explode(\"doc_list\"))\n",
200 |     "print(\"Step 1\")\n",
201 |     "adf.show(truncate=False)\n",
202 |     "# Step 2 - Extracting the value of interest. In this case, the names of documents.\n",
203 |     "adf = adf.withColumn(\"doc_name\", adf.doc_ex.getItem(\"name\")).drop(\"doc_ex\")\n",
204 |     "print(\"Step 2\")\n",
205 |     "adf.show(truncate=False)\n",
206 |     "# Step 3 - Time to revert the 'explode' effect. For this, let's group rows by name and use 'collect_list' as aggregate function\n",
207 |     "ndf = adf.groupBy(\"name\").agg(F.collect_list(\"doc_name\").alias('doc_list'))\n",
208 |     "print(\"Step 3\")\n",
209 |     "ndf.show(truncate=False)\n",
210 |     "# Step 4 - Transforming this list into a string list separated by ',' character.\n",
211 |     "ndf = ndf.withColumn(\"doc_names\", F.concat_ws(\",\", \"doc_list\")).drop(\"doc_list\")\n",
212 |     "print(\"Step 4\")\n",
213 |     "ndf.show(truncate=False)"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "markdown",
218 |    "id": "f6ba2ea4-eda9-4f53-924e-5f71e11f0899",
219 |    "metadata": {},
220 |    "source": [
221 |     "### UDF(Not recommended)\n",
222 |     "User Defined Functions(UDF) can be a way to parse information from a column. In this case, the docs inside the JSON file is available in a list of objects which is parsed by pySpark and convenient converted into Python data structure objects which looks like more simpler to deal with. However, in this particularly scenario is not recommended because is possible to use spark functions which supports this operation offering a better optimization than UDF. Besides, in Python performance is not particularly good specially if you have neasted loops."
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "code",
227 |    "execution_count": 6,
228 |    "id": "5a87b5f0-95cc-4d05-b742-a155f196a2aa",
229 |    "metadata": {},
230 |    "outputs": [],
231 |    "source": [
232 |     "@udf\n",
233 |     "def extract_doc(data_list: list) -> str:\n",
234 |     "    n = list()\n",
235 |     "    for li in data_list:\n",
236 |     "        n += [v for k,v in li.items() if k == 'name']\n",
237 |     "\n",
238 |     "    return ','.join(n)\n"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "code",
243 |    "execution_count": 7,
244 |    "id": "57fee3c9-fede-4649-a07e-9935bb82fd72",
245 |    "metadata": {},
246 |    "outputs": [
247 |     {
248 |      "name": "stdout",
249 |      "output_type": "stream",
250 |      "text": [
251 |       "+-----+---------------------+\n",
252 |       "|name |doc_names            |\n",
253 |       "+-----+---------------------+\n",
254 |       "|Andre|bla001.txt,bla002.txt|\n",
255 |       "|Noé  |bla003.txt,bla004.txt|\n",
256 |       "+-----+---------------------+\n",
257 |       "\n"
258 |      ]
259 |     },
260 |     {
261 |      "name": "stderr",
262 |      "output_type": "stream",
263 |      "text": [
264 |       "                                                                                "
265 |      ]
266 |     }
267 |    ],
268 |    "source": [
269 |     "# Running the UDF called 'extract_doc_udf' and storing into a new column called 'udf_res'\n",
270 |     "dfu = sdf.withColumn('doc_names', extract_doc(F.col('doc_list'))).select('name','doc_names')\n",
271 |     "# Showing the result\n",
272 |     "dfu.show(truncate=False)"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "markdown",
277 |    "id": "56069cfc-2d38-4ef9-848f-d7116afc7d3e",
278 |    "metadata": {},
279 |    "source": [
280 |     "### \"UDF\" using RDD(less recommended)\n",
281 |     "This is the old ways to handle UDFs. The reasons to not to do it is the same as the previous ways. And here is worse because you dealing directly with RDD and you will not have any optimization for doing that. If you don't know how to optimize RDD operations by yourself, don't use it."
282 |    ]
283 |   },
284 |   {
285 |    "cell_type": "code",
286 |    "execution_count": 8,
287 |    "id": "db3daafa-8e25-44d4-bb9a-4170e10c054b",
288 |    "metadata": {},
289 |    "outputs": [],
290 |    "source": [
291 |     "def extract_doc_rdd(row):\n",
292 |     "    d = row.asDict()\n",
293 |     "    n = list()\n",
294 |     "    if 'doc_list' in d:\n",
295 |     "        for li in d['doc_list']:\n",
296 |     "            n += [v for k,v in li.items() if k == 'name']\n",
297 |     "\n",
298 |     "        d['doc_names'] = ','.join(n)\n",
299 |     "\n",
300 |     "    return Row(**d)\n",
301 |     "    \n"
302 |    ]
303 |   },
304 |   {
305 |    "cell_type": "code",
306 |    "execution_count": 9,
307 |    "id": "8aba685e-ab0c-4b05-ab63-a2d9a97ee41f",
308 |    "metadata": {},
309 |    "outputs": [
310 |     {
311 |      "name": "stderr",
312 |      "output_type": "stream",
313 |      "text": [
314 |       "                                                                                "
315 |      ]
316 |     },
317 |     {
318 |      "name": "stdout",
319 |      "output_type": "stream",
320 |      "text": [
321 |       "+-----+---------------------+\n",
322 |       "|name |doc_names            |\n",
323 |       "+-----+---------------------+\n",
324 |       "|Andre|bla001.txt,bla002.txt|\n",
325 |       "|Noé  |bla003.txt,bla004.txt|\n",
326 |       "+-----+---------------------+\n",
327 |       "\n"
328 |      ]
329 |     }
330 |    ],
331 |    "source": [
332 |     "# Executing 'extract_doc_rdd' using map method from rdd object\n",
333 |     "rdd = sdf.rdd.map(extract_doc_rdd)\n",
334 |     "# Converting into a dataframe object\n",
335 |     "edf = rdd.toDF().select('name','doc_names')\n",
336 |     "# Showing the result\n",
337 |     "edf.show(truncate=False)"
338 |    ]
339 |   },
340 |   {
341 |    "cell_type": "markdown",
342 |    "id": "4d11b47f-4fc2-43a6-9433-712f7207deff",
343 |    "metadata": {},
344 |    "source": [
345 |     "### Reading complex JSON using dynamic schema\n",
346 |     "\n",
347 |     "For the cases where JSON contains varying all the time. Example on dataframe/data/json-varying.csv:\n",
348 |     "\n",
349 |     "```json\n",
350 |     "id,json_string\n",
351 |     "1,'{\"name\": \"John Doe\", \"age\": 30}'\n",
352 |     "2,'{\"city\": \"New York\", \"country\": \"USA\", \"zipcode\": \"10001\"}'\n",
353 |     "3,'{\"product\": \"Laptop\", \"brand\": \"Dell\", \"specs\": {\"RAM\": \"16GB\", \"Storage\": \"512GB SSD\"}}'\n",
354 |     "\n",
355 |     "```\n",
356 |     "\n",
357 |     "Tipically this kind of data came from a column\n",
358 |     "\n"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "code",
363 |    "execution_count": 16,
364 |    "id": "7c1dc01a-2f41-4ffe-9ba8-c172048f9fe3",
365 |    "metadata": {},
366 |    "outputs": [
367 |     {
368 |      "name": "stdout",
369 |      "output_type": "stream",
370 |      "text": [
371 |       "+---+----------------------------------------------------------------------------------------+\n",
372 |       "|id |json_string                                                                             |\n",
373 |       "+---+----------------------------------------------------------------------------------------+\n",
374 |       "|1  |{\"name\": \"John Doe\", \"age\": 30}                                                         |\n",
375 |       "|2  |{\"city\": \"New York\", \"country\": \"USA\", \"zipcode\": \"10001\"}                              |\n",
376 |       "|3  |{\"product\": \"Laptop\", \"brand\": \"Dell\", \"specs\": {\"RAM\": \"16GB\", \"Storage\": \"512GB SSD\"}}|\n",
377 |       "+---+----------------------------------------------------------------------------------------+\n",
378 |       "\n"
379 |      ]
380 |     }
381 |    ],
382 |    "source": [
383 |     "# Or from file.\n",
384 |     "df = spark.read.csv('data/json-varying.csv', header=True, quote=\"'\")\n",
385 |     "\n",
386 |     "# If you're reading from a file, remove the quote char from string. Otherwise, the parser will not be able to return a object\n",
387 |     "df = df.withColumn('json_string', F.regexp_replace('json_string', r\"\\'\", \"\"))\n",
388 |     "df.show(truncate=False)"
389 |    ]
390 |   },
391 |   {
392 |    "cell_type": "code",
393 |    "execution_count": 14,
394 |    "id": "0f2bf586-4804-4732-a14a-19c6b5f7fa83",
395 |    "metadata": {},
396 |    "outputs": [
397 |     {
398 |      "name": "stdout",
399 |      "output_type": "stream",
400 |      "text": [
401 |       "+---+----------------------------------------------------------------------------------------+---------------------------------------------------------------+\n",
402 |       "|id |json_string                                                                             |parsed                                                         |\n",
403 |       "+---+----------------------------------------------------------------------------------------+---------------------------------------------------------------+\n",
404 |       "|1  |{\"name\": \"John Doe\", \"age\": 30}                                                         |{30, null, null, null, John Doe, null, null, null}             |\n",
405 |       "|2  |{\"city\": \"New York\", \"country\": \"USA\", \"zipcode\": \"10001\"}                              |{null, null, New York, USA, null, null, null, 10001}           |\n",
406 |       "|3  |{\"product\": \"Laptop\", \"brand\": \"Dell\", \"specs\": {\"RAM\": \"16GB\", \"Storage\": \"512GB SSD\"}}|{null, Dell, null, null, null, Laptop, {16GB, 512GB SSD}, null}|\n",
407 |       "+---+----------------------------------------------------------------------------------------+---------------------------------------------------------------+\n",
408 |       "\n"
409 |      ]
410 |     }
411 |    ],
412 |    "source": [
413 |     "dynamic_schema = spark.read.json(df.rdd.map(lambda row: row['json_string'])).schema\n",
414 |     "jdf = df.withColumn(\"parsed\", F.from_json('json_string', dynamic_schema))\n",
415 |     "jdf.show(truncate=False)"
416 |    ]
417 |   },
418 |   {
419 |    "cell_type": "markdown",
420 |    "id": "56e661c1-1d5e-4371-9f76-d8f9d39a73c4",
421 |    "metadata": {},
422 |    "source": [
423 |     "# JSON from string"
424 |    ]
425 |   },
426 |   {
427 |    "cell_type": "code",
428 |    "execution_count": 27,
429 |    "id": "3dab31a6-256d-4d3a-846c-086dfc0b58fb",
430 |    "metadata": {},
431 |    "outputs": [],
432 |    "source": [
433 |     "json_str = \"\"\"[\n",
434 |     "    {\n",
435 |     "        \"name\":\"Andre\",\n",
436 |     "        \"doc_list\":[{\"docid\":\"DOC001\", \"name\":\"bla001.txt\"}, {\"docid\":\"DOC002\", \"name\":\"bla002.txt\"}]\n",
437 |     "    },\n",
438 |     "    {\n",
439 |     "        \"name\": \"Noe\",\n",
440 |     "        \"doc_list\":[{\"docid\":\"DOC002\", \"name\":\"bla002.txt\"}, {\"docid\":\"DOC003\", \"name\":\"bla003.txt\"}]\n",
441 |     "    }\n",
442 |     "]\"\"\"\n"
443 |    ]
444 |   },
445 |   {
446 |    "cell_type": "code",
447 |    "execution_count": 28,
448 |    "id": "e9c6ba84-4ddc-4e06-b283-195833e33e72",
449 |    "metadata": {},
450 |    "outputs": [
451 |     {
452 |      "name": "stdout",
453 |      "output_type": "stream",
454 |      "text": [
455 |       "+-----+------------------------------------------------------------------------------+\n",
456 |       "|name |doc_list                                                                      |\n",
457 |       "+-----+------------------------------------------------------------------------------+\n",
458 |       "|Andre|[{docid -> DOC001, name -> bla001.txt}, {docid -> DOC002, name -> bla002.txt}]|\n",
459 |       "|Noe  |[{docid -> DOC002, name -> bla002.txt}, {docid -> DOC003, name -> bla003.txt}]|\n",
460 |       "+-----+------------------------------------------------------------------------------+\n",
461 |       "\n"
462 |      ]
463 |     }
464 |    ],
465 |    "source": [
466 |     "json_str=re.sub(r\"\\n\",\"\",json_str)\n",
467 |     "sc = spark.sparkContext\n",
468 |     "schema = StructType([\n",
469 |     "    StructField(\"name\", StringType(), True),\n",
470 |     "    StructField(\"doc_list\", ArrayType(MapType(StringType(), StringType())), True)\n",
471 |     "])\n",
472 |     "df = spark.read.json(sc.parallelize([json_str]), schema)\n",
473 |     "df.show(truncate=False)"
474 |    ]
475 |   },
476 |   {
477 |    "cell_type": "code",
478 |    "execution_count": 29,
479 |    "id": "3e1a9042-65c5-4aea-a1ff-cf58b629ecd0",
480 |    "metadata": {},
481 |    "outputs": [
482 |     {
483 |      "name": "stdout",
484 |      "output_type": "stream",
485 |      "text": [
486 |       "+-----+---------------------+\n",
487 |       "|name |doc_names            |\n",
488 |       "+-----+---------------------+\n",
489 |       "|Noe  |bla002.txt,bla003.txt|\n",
490 |       "|Andre|bla001.txt,bla002.txt|\n",
491 |       "+-----+---------------------+\n",
492 |       "\n"
493 |      ]
494 |     }
495 |    ],
496 |    "source": [
497 |     "dfe = df.withColumn(\"item\", F.explode(\"doc_list\"))\n",
498 |     "dfe = dfe.withColumn(\"doc\", dfe.item.getItem('name')).drop(\"item\") \\\n",
499 |     "        .groupBy('name').agg(F.collect_list(\"doc\").alias(\"doclist\")) \\\n",
500 |     "        .withColumn('doc_names', F.concat_ws(',','doclist')).drop(\"doclist\")\n",
501 |     "dfe.show(truncate=False)"
502 |    ]
503 |   }
504 |  ],
505 |  "metadata": {
506 |   "kernelspec": {
507 |    "display_name": "Python 3 (ipykernel)",
508 |    "language": "python",
509 |    "name": "python3"
510 |   },
511 |   "language_info": {
512 |    "codemirror_mode": {
513 |     "name": "ipython",
514 |     "version": 3
515 |    },
516 |    "file_extension": ".py",
517 |    "mimetype": "text/x-python",
518 |    "name": "python",
519 |    "nbconvert_exporter": "python",
520 |    "pygments_lexer": "ipython3",
521 |    "version": "3.10.12"
522 |   }
523 |  },
524 |  "nbformat": 4,
525 |  "nbformat_minor": 5
526 | }
527 | 


--------------------------------------------------------------------------------