├── README.md ├── LICENSE └── Edit_Distance_Spark.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Edit-Distance-Spark 2 | Calculating Edit Distance with PySpark 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Ahmad Asadi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Edit_Distance_Spark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "# Step1: Install Librarys" 21 | ], 22 | "metadata": { 23 | "id": "Q1Wl1RbS0cqf" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": { 30 | "id": "7bfdMBkMz-6j", 31 | "colab": { 32 | "base_uri": "https://localhost:8080/" 33 | }, 34 | "outputId": "ceb23ef4-af79-4a41-bbc2-fb1521efbd0a" 35 | }, 36 | "outputs": [ 37 | { 38 | "output_type": "stream", 39 | "name": "stdout", 40 | "text": [ 41 | "Requirement already satisfied: pyspark in /usr/local/lib/python3.10/dist-packages (3.5.1)\n", 42 | "Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from pyspark) (0.10.9.7)\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "!pip install pyspark" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "source": [ 53 | "!pip install wikipedia" 54 | ], 55 | "metadata": { 56 | "colab": { 57 | "base_uri": "https://localhost:8080/" 58 | }, 59 | "id": "Jssq_bipeftx", 60 | "outputId": "9851b131-0312-4810-aa8f-4f158ecb69a0" 61 | }, 62 | "execution_count": 2, 63 | "outputs": [ 64 | { 65 | "output_type": "stream", 66 | "name": "stdout", 67 | "text": [ 68 | "Requirement already satisfied: wikipedia in /usr/local/lib/python3.10/dist-packages (1.4.0)\n", 69 | "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from wikipedia) (4.12.3)\n", 70 | "Requirement already satisfied: requests<3.0.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from wikipedia) (2.31.0)\n", 71 | "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.0.0->wikipedia) (3.3.2)\n", 72 | "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.0.0->wikipedia) (3.7)\n", 73 | "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.0.0->wikipedia) (2.0.7)\n", 74 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.0.0->wikipedia) (2024.2.2)\n", 75 | "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4->wikipedia) (2.5)\n" 76 | ] 77 | } 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "source": [ 83 | "# Step2: Import Librarys" 84 | ], 85 | "metadata": { 86 | "id": "XKpaOtw7INZt" 87 | } 88 | }, 89 | { 90 | "cell_type": "code", 91 | "source": [ 92 | "import pyspark\n", 93 | "import wikipedia\n", 94 | "import numpy as np\n", 95 | "import re\n", 96 | "from pyspark.sql import SparkSession\n", 97 | "from pyspark.sql.window import Window\n", 98 | "from pyspark.sql.functions import col, row_number\n", 99 | "# import logging\n", 100 | "# logging.basicConfig(level=logging.DEBUG)\n" 101 | ], 102 | "metadata": { 103 | "id": "xP75d6QJIQhW" 104 | }, 105 | "execution_count": 9, 106 | "outputs": [] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "source": [ 111 | "# Step3: Download Dataset" 112 | ], 113 | "metadata": { 114 | "id": "xY_a66k1eS-w" 115 | } 116 | }, 117 | { 118 | "cell_type": "code", 119 | "source": [ 120 | "dataset = wikipedia.page('Python (programming language)').content\n", 121 | "dataset2 = [\"to be or not to be this is the problem\"]" 122 | ], 123 | "metadata": { 124 | "id": "JMuZD8KteWum" 125 | }, 126 | "execution_count": 11, 127 | "outputs": [] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "source": [ 132 | "# Step4: Preprocessing Dataset" 133 | ], 134 | "metadata": { 135 | "id": "MdOjKnXier6H" 136 | } 137 | }, 138 | { 139 | "cell_type": "code", 140 | "source": [ 141 | "dataset = re.sub(r'[\\n\\t]', ' ', dataset)\n", 142 | "dataset = re.sub(r'\\s+', ' ', dataset.strip())\n", 143 | "\n", 144 | "dataset = dataset.split()\n", 145 | "dataset = list(set(dataset))" 146 | ], 147 | "metadata": { 148 | "id": "DbP3IhT4evc-" 149 | }, 150 | "execution_count": 12, 151 | "outputs": [] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "source": [ 156 | "# Step5: Config Spark" 157 | ], 158 | "metadata": { 159 | "id": "7nBdHHqTbu1x" 160 | } 161 | }, 162 | { 163 | "cell_type": "code", 164 | "source": [ 165 | "# sc = pyspark.SparkContext('local[*]')\n", 166 | "spark = SparkSession.builder.appName(\"EditDistanceApp\").getOrCreate()" 167 | ], 168 | "metadata": { 169 | "id": "Q9b5_Qt3byig" 170 | }, 171 | "execution_count": 6, 172 | "outputs": [] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "source": [ 177 | "# Step6: Edit Distance method" 178 | ], 179 | "metadata": { 180 | "id": "kLgX6VJQIQ6O" 181 | } 182 | }, 183 | { 184 | "cell_type": "code", 185 | "source": [ 186 | "def edit_distance(word1, word2):\n", 187 | " num_rows = len(word1)\n", 188 | " num_cols = len(word2)\n", 189 | "\n", 190 | " matrix = np.empty((num_rows + 2, num_cols + 2), dtype=object)\n", 191 | "\n", 192 | " for i in range(num_rows + 2):\n", 193 | " for j in range(num_cols + 2):\n", 194 | " if i == 0 and j == 0:\n", 195 | " matrix[i, j] = '#'\n", 196 | " elif i == 0 and j == 1:\n", 197 | " matrix[i, j] = '_'\n", 198 | " elif i == 1 and j == 0:\n", 199 | " matrix[i, j] = '_'\n", 200 | " elif i == 0:\n", 201 | " matrix[i, j] = word2[j - 2] # row word\n", 202 | " elif j == 0:\n", 203 | " matrix[i, j] = word1[i - 2] # col word\n", 204 | " elif i == 1:\n", 205 | " matrix[i, j] = str(j - 1)\n", 206 | " elif i != 1 and j == 1:\n", 207 | " matrix[i, j] = str(i - 1)\n", 208 | " else:\n", 209 | " first = (int(matrix[i-1, j])+1)\n", 210 | " second = (int(matrix[i, j-1])+1)\n", 211 | " if matrix[i, 0] == matrix[0, j]:\n", 212 | " third = (int(matrix[i-1, j-1]))\n", 213 | " else :\n", 214 | " third = (int(matrix[i-1, j-1])+2)\n", 215 | " matrix[i, j] = str(min(first, second, third))\n", 216 | " return matrix[num_rows + 1, num_cols + 1]\n", 217 | "\n", 218 | "print(edit_distance(\"INTENTION\", \"EXECUTION\"))\n" 219 | ], 220 | "metadata": { 221 | "colab": { 222 | "base_uri": "https://localhost:8080/" 223 | }, 224 | "id": "FYCs_NQEIYAH", 225 | "outputId": "0a966118-9909-4e70-a27c-4478e7a2644c" 226 | }, 227 | "execution_count": 7, 228 | "outputs": [ 229 | { 230 | "output_type": "stream", 231 | "name": "stdout", 232 | "text": [ 233 | "8\n" 234 | ] 235 | } 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "source": [ 241 | "# Step7: Calculate Edit distance for all words" 242 | ], 243 | "metadata": { 244 | "id": "fmZYnay1d_bu" 245 | } 246 | }, 247 | { 248 | "cell_type": "code", 249 | "source": [ 250 | "dataset2 = [\"Ahmad\", \"Asghar\", \"Akbar\", \"Ehsan\", \"Mohsen\"]\n", 251 | "\n", 252 | "rdd = spark.sparkContext.parallelize(dataset)\n", 253 | "\n", 254 | "word_pairs = rdd.cartesian(rdd).filter(lambda x: x[0]!= x[1])\\\n", 255 | " .map(lambda x: (x[0], x[1], edit_distance(x[0], x[1])))\n", 256 | "\n", 257 | "df = word_pairs.toDF([\"Word1\", \"Word2\", \"EditDistance\"])\n", 258 | "\n", 259 | "# df = df.toPandas()\n", 260 | "\n", 261 | "window_spec = Window.partitionBy(\"Word1\").orderBy(\"EditDistance\")\n", 262 | "\n", 263 | "df_with_row_num = df.withColumn(\"row_num\", row_number().over(window_spec))\n", 264 | "\n", 265 | "result_df = df_with_row_num.filter(col(\"row_num\") == 1).drop(\"row_num\")\n", 266 | "\n", 267 | "result_df.show()" 268 | ], 269 | "metadata": { 270 | "colab": { 271 | "base_uri": "https://localhost:8080/" 272 | }, 273 | "id": "1fEdjLzIvpQR", 274 | "outputId": "1a6d33f2-8bfe-47a2-830f-6ad95bb7d202" 275 | }, 276 | "execution_count": 14, 277 | "outputs": [ 278 | { 279 | "output_type": "stream", 280 | "name": "stdout", 281 | "text": [ 282 | "+-----------------+----------+------------+\n", 283 | "| Word1| Word2|EditDistance|\n", 284 | "+-----------------+----------+------------+\n", 285 | "| \"2\"| \"spam={0}| 10|\n", 286 | "| \"2.7.18+\"| ?| 10|\n", 287 | "| \"22\".| state| 10|\n", 288 | "| \"@-quoting\"| ten| 10|\n", 289 | "| \"AI| AI| 1|\n", 290 | "|\"BDFL-emeritus\").| emit| 13|\n", 291 | "| \"Hello,| largely| 10|\n", 292 | "| \"I| I| 1|\n", 293 | "| \"PyAIML| 'u'| 10|\n", 294 | "| \"Python| Python| 1|\n", 295 | "| \"Pythonic\"| of| 10|\n", 296 | "| \"Pythonistas\".| CPython,| 10|\n", 297 | "| \"The| The| 1|\n", 298 | "| \"To| \"spam={0}| 10|\n", 299 | "| \"adding\"| e.g.| 10|\n", 300 | "| \"as| as| 1|\n", 301 | "| \"backported|backported| 1|\n", 302 | "| \"bar\".| blocks| 10|\n", 303 | "| \"batteries| Server| 10|\n", 304 | "| \"benevolent| violate| 10|\n", 305 | "+-----------------+----------+------------+\n", 306 | "only showing top 20 rows\n", 307 | "\n" 308 | ] 309 | } 310 | ] 311 | } 312 | ] 313 | } --------------------------------------------------------------------------------