├── README.md └── Tweet-Semantic-Meaning.ipynb /README.md: -------------------------------------------------------------------------------- 1 | #### Semantic Meaning of Transformers in Data Science 2 | - Proper labeling and assigning semantic meaning to tweets using transformers in Data Science. 3 | - Representation on the 2D grid. 4 | -------------------------------------------------------------------------------- /Tweet-Semantic-Meaning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Zadacha3_181225.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": { 22 | "colab": { 23 | "base_uri": "https://localhost:8080/" 24 | }, 25 | "id": "sQoyuIuJMpWo", 26 | "outputId": "16ecf838-ff02-43ab-906e-085f911f7d47" 27 | }, 28 | "outputs": [ 29 | { 30 | "output_type": "stream", 31 | "name": "stdout", 32 | "text": [ 33 | "Mounted at /content/drive\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "# Mount Google Drive inside the Colab runtime.\n", 39 | "from google.colab import drive\n", 40 | "drive.mount('/content/drive')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "source": [ 46 | "import pandas as pd\n", 47 | "# The CSV is read from the runtime's /content directory, not from the mounted Drive folder.\n", 48 | "df = pd.read_csv(filepath_or_buffer='/content/train_3.csv')" 49 | ], 50 | "metadata": { 51 | "id": "0O1yfskiMze9" 52 | }, 53 | "execution_count": 2, 54 | "outputs": [] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "source": [ 59 | "df.head(n=5)" 60 | ], 61 | "metadata": { 62 | "colab": { 63 | "base_uri": "https://localhost:8080/", 64 | "height": 206 65 | }, 66 | "id": "2EukyL2sM5HV", 67 | "outputId": "98fd7c7e-2c69-46bb-fa7e-55858c72c3dc" 68 | }, 69 | "execution_count": 3, 70 | "outputs": [ 71 | { 72 | "output_type": "execute_result", 73 | "data": { 74 | 
"text/html": [ 75 | "\n", 76 | "
\n", 77 | "
\n", 78 | "
\n", 79 | "\n", 92 | "\n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | "
TweetSentiment
0TRENDING: New Yorkers encounter empty supermar...Extremely Negative
1When I couldn't find hand sanitizer at Fred Me...Positive
2Find out how you can protect yourself and love...Extremely Positive
3#Panic buying hits #NewYork City as anxious sh...Negative
4#toiletpaper #dunnypaper #coronavirus #coronav...Neutral
\n", 128 | "
\n", 129 | " \n", 139 | " \n", 140 | " \n", 177 | "\n", 178 | " \n", 202 | "
\n", 203 | "
\n", 204 | " " 205 | ], 206 | "text/plain": [ 207 | " Tweet Sentiment\n", 208 | "0 TRENDING: New Yorkers encounter empty supermar... Extremely Negative\n", 209 | "1 When I couldn't find hand sanitizer at Fred Me... Positive\n", 210 | "2 Find out how you can protect yourself and love... Extremely Positive\n", 211 | "3 #Panic buying hits #NewYork City as anxious sh... Negative\n", 212 | "4 #toiletpaper #dunnypaper #coronavirus #coronav... Neutral" 213 | ] 214 | }, 215 | "metadata": {}, 216 | "execution_count": 3 217 | } 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "source": [ 223 | "from sklearn.preprocessing import LabelEncoder\n", 224 | "encoder = LabelEncoder()\n", 225 | "df['Sentiment'] = encoder.fit_transform(df['Sentiment'])" 226 | ], 227 | "metadata": { 228 | "id": "KVcwpi0TM_nf" 229 | }, 230 | "execution_count": 5, 231 | "outputs": [] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "source": [ 236 | "%pip install transformers==4.16.1" 237 | ], 238 | "metadata": { 239 | "colab": { 240 | "base_uri": "https://localhost:8080/" 241 | }, 242 | "id": "VRKOjwIJOIYj", 243 | "outputId": "e1338110-87b8-4967-fbc9-34236cef4a2c" 244 | }, 245 | "execution_count": 6, 246 | "outputs": [ 247 | { 248 | "output_type": "stream", 249 | "name": "stdout", 250 | "text": [ 251 | "Collecting transformers\n", 252 | " Downloading transformers-4.16.1-py3-none-any.whl (3.5 MB)\n", 253 | "\u001b[K |████████████████████████████████| 3.5 MB 5.2 MB/s \n", 254 | "\u001b[?25hCollecting huggingface-hub<1.0,>=0.1.0\n", 255 | " Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)\n", 256 | "\u001b[K |████████████████████████████████| 67 kB 4.0 MB/s \n", 257 | "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.62.3)\n", 258 | "Collecting sacremoses\n", 259 | " Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)\n", 260 | "\u001b[K |████████████████████████████████| 895 kB 53.6 MB/s \n", 261 | 
"\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.4.2)\n", 262 | "Collecting pyyaml>=5.1\n", 263 | " Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n", 264 | "\u001b[K |████████████████████████████████| 596 kB 55.1 MB/s \n", 265 | "\u001b[?25hRequirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers) (21.3)\n", 266 | "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers) (4.10.1)\n", 267 | "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n", 268 | "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)\n", 269 | "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n", 270 | "Collecting tokenizers!=0.11.3,>=0.10.1\n", 271 | " Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)\n", 272 | "\u001b[K |████████████████████████████████| 6.8 MB 36.5 MB/s \n", 273 | "\u001b[?25hRequirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0,>=0.1.0->transformers) (3.10.0.2)\n", 274 | "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers) (3.0.7)\n", 275 | "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers) (3.7.0)\n", 276 | "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n", 277 | "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from 
requests->transformers) (2.10)\n", 278 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2021.10.8)\n", 279 | "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n", 280 | "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.1.0)\n", 281 | "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)\n", 282 | "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n", 283 | "Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers\n", 284 | " Attempting uninstall: pyyaml\n", 285 | " Found existing installation: PyYAML 3.13\n", 286 | " Uninstalling PyYAML-3.13:\n", 287 | " Successfully uninstalled PyYAML-3.13\n", 288 | "Successfully installed huggingface-hub-0.4.0 pyyaml-6.0 sacremoses-0.0.47 tokenizers-0.11.4 transformers-4.16.1\n" 289 | ] 290 | } 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "source": [ 296 | "from absl import logging\n", 297 | "import tensorflow as tf\n", 298 | "import tensorflow_hub as hub\n", 299 | "import matplotlib.pyplot as plt\n", 300 | "import numpy as np\n", 301 | "import os\n", 302 | "import pandas as pd\n", 303 | "import re\n", 304 | "import seaborn as sns\n", 305 | "\n", 306 | "module_url = \"https://tfhub.dev/google/universal-sentence-encoder/4\"\n", 307 | "model = hub.load(module_url)\n", 308 | "print(\"module %s loaded\" % module_url)\n", 309 | "\n", 310 | "def embed(input):\n", 311 | "    \"\"\"Return the output of the loaded TF Hub model called on `input`.\"\"\"\n", 312 | "    return model(input)" 313 | ], 314 | "metadata": { 315 | "colab": { 316 | "base_uri": "https://localhost:8080/" 317 | }, 318 | "id": "X7k7YbfgOWlh", 319 | "outputId": "8cc787f3-d6c9-4ccd-cf04-0749b7d93b7b" 320 | }, 321 | "execution_count": 7, 322 | "outputs": 
[ 323 | { 324 | "output_type": "stream", 325 | "name": "stdout", 326 | "text": [ 327 | "module https://tfhub.dev/google/universal-sentence-encoder/4 loaded\n" 328 | ] 329 | } 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "source": [ 335 | "# Reduce logging output.\n", 336 | "logging.set_verbosity(logging.ERROR)\n", 337 | "# Embed every tweet in a single model call.\n", 338 | "message_embeddings = embed(df.Tweet.values)" 339 | ], 340 | "metadata": { 341 | "id": "DdJj1Ww9OgRi" 342 | }, 343 | "execution_count": 9, 344 | "outputs": [] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "source": [ 349 | "message_embeddings" 350 | ], 351 | "metadata": { 352 | "colab": { 353 | "base_uri": "https://localhost:8080/" 354 | }, 355 | "id": "faO2SlyZOm1O", 356 | "outputId": "833d5ae8-9c44-4817-ffcc-090385cdd48f" 357 | }, 358 | "execution_count": 11, 359 | "outputs": [ 360 | { 361 | "output_type": "execute_result", 362 | "data": { 363 | "text/plain": [ 364 | "" 378 | ] 379 | }, 380 | "metadata": {}, 381 | "execution_count": 11 382 | } 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "source": [ 388 | "from sklearn.cluster import KMeans\n", 389 | "# Seeded: k-means starts from random centroids, so unseeded runs can assign different labels.\n", 390 | "km = KMeans(n_clusters=3, random_state=42)\n", 391 | "km.fit(message_embeddings)\n", 392 | "clusters = km.labels_.tolist()" 393 | ], 394 | "metadata": { 395 | "id": "AGP2xK12Oxhq" 396 | }, 397 | "execution_count": 12, 398 | "outputs": [] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "source": [ 403 | "from sklearn.decomposition import PCA\n", 404 | "import numpy as np\n", 405 | "\n", 406 | "data = message_embeddings\n", 407 | "\n", 408 | "# Keep the first 3 principal components; the seed only matters if a randomized SVD solver is selected.\n", 409 | "pca = PCA(n_components=3, random_state=42)\n", 410 | "# Project the embeddings onto those components.\n", 411 | "df_new = pca.fit_transform(data)" 412 | ], 413 | "metadata": { 414 | "id": "m4YiZ-pcO7kY" 415 | }, 416 | "execution_count": 13, 417 | "outputs": [] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "source": [ 422 | "# Cluster the PCA projection into 5 groups; seeded so cluster ids (and plot colours) are repeatable.\n", 423 | "kmeans = KMeans(n_clusters=5, random_state=42)\n", 424 | "\n", 425 | "# Cluster id assigned to each row of df_new.\n", 426 
| "label = kmeans.fit_predict(df_new)\n", 427 | "\n", 428 | "# Distinct cluster ids, used to draw one scatter series per cluster.\n", 429 | "u_labels = np.unique(label)\n", 430 | "\n", 431 | "labels = kmeans.labels_" 432 | ], 433 | "metadata": { 434 | "id": "dvwQ38M5PBm4" 435 | }, 436 | "execution_count": 14, 437 | "outputs": [] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "source": [ 442 | "import matplotlib.pyplot as plt\n", 443 | "import seaborn as sns\n", 444 | "%matplotlib inline\n", 445 | "for i in u_labels:  # one scatter series per cluster; only components 0 and 1 of df_new are drawn\n", 446 | "    plt.scatter(df_new[label == i, 0], df_new[label == i, 1], label=i)\n", 447 | "plt.gca().set(title='K-means clusters on the first two PCA components', xlabel='PC 1', ylabel='PC 2')\n", 448 | "plt.legend(title='cluster')\n", 449 | "plt.show()" 450 | ], 451 | "metadata": { 452 | "colab": { 453 | "base_uri": "https://localhost:8080/", 454 | "height": 265 455 | }, 456 | "id": "qRG5DnVCPIKl", 457 | "outputId": "62cf8e81-397e-4b76-fc06-2b83f52e4bca" 458 | }, 459 | "execution_count": 15, 460 | "outputs": [ 461 | { 462 | "output_type": "display_data", 463 | "data": { 464 | "image/png": "\n", 465 | "text/plain": [ 466 | "
" 467 | ] 468 | }, 469 | "metadata": { 470 | "needs_background": "light" 471 | } 472 | } 473 | ] 474 | } 475 | ] 476 | } --------------------------------------------------------------------------------