├── Projects ├── project.py └── credit-card-fraud.ipynb ├── .gitattributes ├── README.md ├── word2vec ├── Word2Vec Part 6 - Using Pretrained models.py ├── Word2Vec Part 1 - Importing the dataset.py ├── Word2Vec Part 2 - Preparing the dataset.py ├── Word2Vec Part 3 - Training the Word2Vec model.py ├── Word2Vec Part 4 - Testing our model.py └── Word2Vec Part 5 - Imporving our model.py ├── BagOfWords.ipynb ├── Classification.ipynb ├── Descriptive_Statistics.ipynb ├── BasicBinaryClassification.ipynb ├── TFIDF.ipynb ├── Count-TfIdf.ipynb ├── Working-with-PDF-Text.ipynb ├── Spacy-Basics.ipynb ├── Working-with-Text-Files.ipynb ├── Regular-Expressions.ipynb └── Tokenization.ipynb /Projects/project.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine learning + Natural Language Processing + Deep Learning 2 | 3 | For learning Purposes 4 | -------------------------------------------------------------------------------- /word2vec/Word2Vec Part 6 - Using Pretrained models.py: -------------------------------------------------------------------------------- 1 | # Word2Vec model visualization 2 | 3 | # Install gensim - pip install gensim 4 | from gensim.models import KeyedVectors 5 | 6 | filename = 'GoogleNews-vectors-negative300.bin' 7 | 8 | model = KeyedVectors.load_word2vec_format(filename, binary=True) 9 | 10 | model.wv.most_similar('king') 11 | 12 | model.wv.most_similar(positive=['king','woman'], negative= ['man']) -------------------------------------------------------------------------------- /word2vec/Word2Vec Part 1 - Importing the dataset.py: -------------------------------------------------------------------------------- 1 | # Word2Vec model visualization 2 | 3 | # Install gensim - pip install gensim 4 | import nltk 5 | import urllib 6 | import bs4 as bs 7 | import re 8 | from gensim.models import Word2Vec 9 | 10 | # Gettings the data source 11 | source = urllib.request.urlopen('https://en.wikipedia.org/wiki/Global_warming').read() 12 | 13 | # Parsing the data/ creating BeautifulSoup object 14 | soup = bs.BeautifulSoup(source,'lxml') 15 | 16 | # Fetching the data 17 | text = "" 18 | for paragraph in soup.find_all('p'): 19 | text += paragraph.text 20 | 21 | # Preprocessing the data 22 | text = re.sub(r'\[[0-9]*\]',' ',text) 23 | text = re.sub(r'\s+',' ',text) 24 | text = text.lower() 25 | text = re.sub(r'\W',' ',text) 26 | text = re.sub(r'\d',' ',text) 27 | text = re.sub(r'\s+',' ',text) -------------------------------------------------------------------------------- /word2vec/Word2Vec Part 2 - Preparing the dataset.py: -------------------------------------------------------------------------------- 1 | # Word2Vec model visualization 2 | 3 | # Install gensim - pip install gensim 4 | import nltk 5 | import urllib 6 | import bs4 as bs 7 | import re 8 | from gensim.models import Word2Vec 9 | 10 | # Gettings the data source 11 | source = urllib.request.urlopen('https://en.wikipedia.org/wiki/Global_warming').read() 12 | 13 | # Parsing the data/ creating BeautifulSoup object 14 | soup = bs.BeautifulSoup(source,'lxml') 15 | 
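A side note on the scraping scripts in this folder: they call urllib.request.urlopen after a bare import urllib, which is not guaranteed to expose the request submodule, and they need the lxml parser installed for BeautifulSoup. A minimal sketch of the same fetch-and-clean step with explicit imports (assumes network access and pip install beautifulsoup4 lxml):

import re
import urllib.request
import bs4 as bs

source = urllib.request.urlopen('https://en.wikipedia.org/wiki/Global_warming').read()
soup = bs.BeautifulSoup(source, 'lxml')
text = " ".join(p.text for p in soup.find_all('p'))
text = re.sub(r'\[[0-9]*\]', ' ', text).lower()   # drop citation markers such as [12]
text = re.sub(r'\s+', ' ', text)                  # collapse runs of whitespace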
16 | # Fetching the data 17 | text = "" 18 | for paragraph in soup.find_all('p'): 19 | text += paragraph.text 20 | 21 | # Preprocessing the data 22 | text = re.sub(r'\[[0-9]*\]',' ',text) 23 | text = re.sub(r'\s+',' ',text) 24 | text = text.lower() 25 | text = re.sub(r'\d',' ',text) 26 | text = re.sub(r'\s+',' ',text) 27 | 28 | # Preparing the dataset 29 | sentences = nltk.sent_tokenize(text) 30 | 31 | sentences = [nltk.word_tokenize(sentence) for sentence in sentences] -------------------------------------------------------------------------------- /word2vec/Word2Vec Part 3 - Training the Word2Vec model.py: -------------------------------------------------------------------------------- 1 | # Word2Vec model visualization 2 | 3 | # Install gensim - pip install gensim 4 | import nltk 5 | import urllib 6 | import bs4 as bs 7 | import re 8 | from gensim.models import Word2Vec 9 | 10 | # Gettings the data source 11 | source = urllib.request.urlopen('https://en.wikipedia.org/wiki/Global_warming').read() 12 | 13 | # Parsing the data/ creating BeautifulSoup object 14 | soup = bs.BeautifulSoup(source,'lxml') 15 | 16 | # Fetching the data 17 | text = "" 18 | for paragraph in soup.find_all('p'): 19 | text += paragraph.text 20 | 21 | # Preprocessing the data 22 | text = re.sub(r'\[[0-9]*\]',' ',text) 23 | text = re.sub(r'\s+',' ',text) 24 | text = text.lower() 25 | text = re.sub(r'\d',' ',text) 26 | text = re.sub(r'\s+',' ',text) 27 | 28 | # Preparing the dataset 29 | sentences = nltk.sent_tokenize(text) 30 | 31 | sentences = [nltk.word_tokenize(sentence) for sentence in sentences] 32 | 33 | # Training the Word2Vec model 34 | model = Word2Vec(sentences, min_count=1) 35 | 36 | words = model.wv.vocab -------------------------------------------------------------------------------- /word2vec/Word2Vec Part 4 - Testing our model.py: -------------------------------------------------------------------------------- 1 | # Word2Vec model visualization 2 | 3 | # Install gensim - pip install gensim 4 | import nltk 5 | import urllib 6 | import bs4 as bs 7 | import re 8 | from gensim.models import Word2Vec 9 | 10 | # Gettings the data source 11 | source = urllib.request.urlopen('https://en.wikipedia.org/wiki/Global_warming').read() 12 | 13 | # Parsing the data/ creating BeautifulSoup object 14 | soup = bs.BeautifulSoup(source,'lxml') 15 | 16 | # Fetching the data 17 | text = "" 18 | for paragraph in soup.find_all('p'): 19 | text += paragraph.text 20 | 21 | # Preprocessing the data 22 | text = re.sub(r'\[[0-9]*\]',' ',text) 23 | text = re.sub(r'\s+',' ',text) 24 | text = text.lower() 25 | text = re.sub(r'\d',' ',text) 26 | text = re.sub(r'\s+',' ',text) 27 | 28 | # Preparing the dataset 29 | sentences = nltk.sent_tokenize(text) 30 | 31 | sentences = [nltk.word_tokenize(sentence) for sentence in sentences] 32 | 33 | # Training the Word2Vec model 34 | model = Word2Vec(sentences, min_count=1) 35 | 36 | words = model.wv.vocab 37 | 38 | # Finding Word Vectors 39 | vector = model.wv['global'] 40 | 41 | # Most similar words 42 | similar = model.wv.most_similar('warming') -------------------------------------------------------------------------------- /word2vec/Word2Vec Part 5 - Imporving our model.py: -------------------------------------------------------------------------------- 1 | # Word2Vec model visualization 2 | 3 | # Install gensim - pip install gensim 4 | import nltk 5 | import urllib 6 | import bs4 as bs 7 | import re 8 | from gensim.models import Word2Vec 9 | from nltk.corpus import stopwords 10 | 
nltk.download('stopwords') 11 | # Gettings the data source 12 | source = urllib.request.urlopen('https://en.wikipedia.org/wiki/Global_warming').read() 13 | 14 | # Parsing the data/ creating BeautifulSoup object 15 | soup = bs.BeautifulSoup(source,'lxml') 16 | 17 | # Fetching the data 18 | text = "" 19 | for paragraph in soup.find_all('p'): 20 | text += paragraph.text 21 | 22 | # Preprocessing the data 23 | text = re.sub(r'\[[0-9]*\]',' ',text) 24 | text = re.sub(r'\s+',' ',text) 25 | text = text.lower() 26 | text = re.sub(r'\d',' ',text) 27 | text = re.sub(r'\s+',' ',text) 28 | 29 | # Preparing the dataset 30 | sentences = nltk.sent_tokenize(text) 31 | 32 | sentences = [nltk.word_tokenize(sentence) for sentence in sentences] 33 | 34 | for i in range(len(sentences)): 35 | sentences[i] = [word for word in sentences[i] if word not in stopwords.words('english')] 36 | 37 | 38 | # Training the Word2Vec model 39 | model = Word2Vec(sentences, min_count=1) 40 | 41 | words = model.wv.vocab 42 | 43 | # Finding Word Vectors 44 | vector = model.wv['global'] 45 | 46 | # Most similar words 47 | similar = model.wv.most_similar('global') 48 | -------------------------------------------------------------------------------- /BagOfWords.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "['and this is', 'document is the', 'is the first', 'is the second', 'is the third', 'the first article', 'this document is', 'this is the']\n", 13 | "[[0 0 1 0 0 1 0 1]\n", 14 | " [0 1 0 1 0 0 1 0]\n", 15 | " [1 0 0 0 1 0 0 1]]\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "from sklearn.feature_extraction.text import CountVectorizer\n", 21 | "corpus = [\n", 22 | " 'This is the first article.',\n", 23 | " 'This document is the second.',\n", 24 | " 'And this is the third.'\n", 25 | "]\n", 26 | "\n", 27 | "vectorizer = CountVectorizer(analyzer='word', ngram_range=(3, 3)) # or CountVectorizer()\n", 28 | "X = vectorizer.fit_transform(corpus)\n", 29 | "print(vectorizer.get_feature_names())\n", 30 | "\n", 31 | "print(X.toarray())" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [] 40 | } 41 | ], 42 | "metadata": { 43 | "kernelspec": { 44 | "display_name": "Python 3", 45 | "language": "python", 46 | "name": "python3" 47 | }, 48 | "language_info": { 49 | "codemirror_mode": { 50 | "name": "ipython", 51 | "version": 3 52 | }, 53 | "file_extension": ".py", 54 | "mimetype": "text/x-python", 55 | "name": "python", 56 | "nbconvert_exporter": "python", 57 | "pygments_lexer": "ipython3", 58 | "version": "3.8.5" 59 | } 60 | }, 61 | "nbformat": 4, 62 | "nbformat_minor": 2 63 | } 64 | -------------------------------------------------------------------------------- /Classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "id": "gFHNYBzXQHyI" 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# data acquisition\n", 12 | "df = pd.read_csv(\"https://raw.githubusercontent.com/J0AZZ/artificial-intelligence_studies/master/artificial-intelligence_studies/introducao-ia-UFPB/Prova%201/Stars.csv\")\n", 13 | "df.info()\n", 14 | "print(\"\\n\\n\")\n", 15 | "\n", 16 | "# approaching categorical 
variables\n", 17 | "color_cat = pd.get_dummies(df[\"Color\"])\n", 18 | "spect_cat = pd.get_dummies(df[\"Spectral_Class\"])\n", 19 | "X = df[[\"Temperature\", \"L\", \"R\", \"A_M\"]]\n", 20 | "y = df[\"Type\"]\n", 21 | "X_normalized = (X-X.mean())/X.std()\n", 22 | "x = pd.concat(objs=[color_cat, spect_cat, X_normalized], axis=1)\n", 23 | "\n", 24 | "# splitting training and validation sets\n", 25 | "X_test, X_train = x.iloc[200:], x.iloc[:200] \n", 26 | "y_test, y_train = y.iloc[200:], y.iloc[:200]\n", 27 | "\n", 28 | "# model building with two different metrics and 5 values of k\n", 29 | "for metric in [\"manhattan\", \"euclidean\"]:\n", 30 | " print(\"\\n\\nDistance Metric: \", metric, \"\\n\")\n", 31 | " print(\"--------------------\")\n", 32 | " for k in range(1,6):\n", 33 | " knn = KNeighborsClassifier(n_neighbors=k, metric=metric)\n", 34 | " knn.fit(X_train, y_train)\n", 35 | " print(sk.metrics.classification_report(y_test, knn.predict(X_test)))\n", 36 | " print(\"\\nk=\", k, \" Accuracy: \", knn.score(X_test, y_test)*100, \"%\")\n", 37 | " print(\"Confusion Matrix:\\n\", sk.metrics.confusion_matrix(y_test, knn.predict(X_test)))\n", 38 | " " 39 | ] 40 | } 41 | ], 42 | "metadata": { 43 | "colab": { 44 | "authorship_tag": "ABX9TyP3bYSoLIC2goIv5IoRxour", 45 | "include_colab_link": true, 46 | "name": "StarClassification.ipynb", 47 | "provenance": [] 48 | }, 49 | "kernelspec": { 50 | "display_name": "Python 3", 51 | "language": "python", 52 | "name": "python3" 53 | }, 54 | "language_info": { 55 | "codemirror_mode": { 56 | "name": "ipython", 57 | "version": 3 58 | }, 59 | "file_extension": ".py", 60 | "mimetype": "text/x-python", 61 | "name": "python", 62 | "nbconvert_exporter": "python", 63 | "pygments_lexer": "ipython3", 64 | "version": "3.8.5" 65 | } 66 | }, 67 | "nbformat": 4, 68 | "nbformat_minor": 1 69 | } 70 | -------------------------------------------------------------------------------- /Descriptive_Statistics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "E7c4DXzXUgZ5" 7 | }, 8 | "source": [ 9 | "# Descriptive Statistics" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "id": "4oGEWXUKUlkL" 16 | }, 17 | "source": [ 18 | "## Central Tendency" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "id": "Ub4eAS83TsJB" 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "# arithmetic mean\n", 30 | "def arithmeticMean(observations):\n", 31 | " var = 0\n", 32 | " for o in observations:\n", 33 | " var += o\n", 34 | " return var/len(observations)\n", 35 | "\n", 36 | "# geometric mean\n", 37 | "def geometricMean(observations):\n", 38 | " var = 1\n", 39 | " for o in observations:\n", 40 | " var *= o\n", 41 | " return var ** (1/float(len(observations)))\n", 42 | "\n", 43 | "# harmonic mean\n", 44 | "def harmonicMean(observations):\n", 45 | " var = 1\n", 46 | " for o in observations:\n", 47 | " var *= 1/o\n", 48 | " return var/len(observations)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": { 55 | "id": "-SOo5L8GVoKc" 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "# median\n", 60 | "def median(observations):\n", 61 | " ordered = observations.sort()\n", 62 | " n = len(observations)\n", 63 | " if ((n % 2) == 0):\n", 64 | " i = len(n)/2 \n", 65 | " return (ordered[i-1] + ordered[i])/2\n", 66 | " return ordered[ ((n-1)/2)-1 ]\n", 67 | "\n", 68 | "# 
quartiles\n", 69 | "def quartiles(observations)\n", 70 | " ordered = observations.sort()\n", 71 | " n = len(observations)\n", 72 | " if ((n % 2) == 0):\n", 73 | " return [ordered[(n/4)-1], ordered[(n*0.75)-1]]\n", 74 | " return [ordered[((n-1)/4)-1], ordered[((n-1)*0.75)-1]]\n", 75 | "\n", 76 | "# interquartile range\n", 77 | "def interquartileRange(observations):\n", 78 | " quartiles = quartiles(observations)\n", 79 | " return (quartiles[1] - quartiles[0])\n", 80 | "\n", 81 | "# midhinge\n", 82 | "def midhinge(observations):\n", 83 | " quartiles = quartiles(observations)\n", 84 | " return (quartiles[1] + quartiles[0])/2\n", 85 | "\n", 86 | "# range\n", 87 | "def range(observations):\n", 88 | " obs = observations.sort()\n", 89 | " return (obs[len(obs)-1] - obs[0])" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "id": "VAJ5ZKukVqaG" 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "from collections import Counter\n", 101 | "\n", 102 | "# mode\n", 103 | "def mode(observations):\n", 104 | " occurrences = Counter(observations)\n", 105 | " return occurrences.most_common(1)[0][0]" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": { 111 | "id": "KLfnVb51LB41" 112 | }, 113 | "source": [ 114 | "## Dispersion" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "id": "Rh593IhFVzB6" 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "# standard deviation\n", 126 | "def stdDeviation(observations):\n", 127 | " mean = arithmeticMean(observations)\n", 128 | " var = 0\n", 129 | " for o in observations:\n", 130 | " var += ((o - mean)**2)\n", 131 | " var /= (len(observations)-1)\n", 132 | " return var**(0.5)\n", 133 | "\n", 134 | "# mean absolute deviation\n", 135 | "def meanAbsDeviation(observations):\n", 136 | " mean = arithmeticMean(observations)\n", 137 | " var = 0\n", 138 | " for o in observations:\n", 139 | " var += abs(o - mean)\n", 140 | " return var/len(observations)\n", 141 | "\n", 142 | "# median absolute deviation\n", 143 | "def medianAbsDeviation(observations):\n", 144 | " median = median(observations)\n", 145 | " deviations = list()\n", 146 | " for o in observations:\n", 147 | " deviations.append(abs(o - median))\n", 148 | " return median(deviations)\n", 149 | "\n", 150 | "# variance\n", 151 | "def variance(observations):\n", 152 | " mean = arithmeticMean(observations)\n", 153 | " var = 0\n", 154 | " for o in observations:\n", 155 | " var += (o - mean)**2\n", 156 | " return var/(len(observations)-1)\n", 157 | "\n", 158 | "# coefficient of variation\n", 159 | "def coeffVariation(observations):\n", 160 | " return 100*(stdDeviation(observations)/arithmeticMean(observations)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": { 166 | "id": "Mzc82NaGpPeQ" 167 | }, 168 | "source": [ 169 | "## Shape" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "id": "YM3au_VYphOW" 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "# rth central moments\n", 181 | "def centralMoment(observations, r):\n", 182 | " mean = arithmeticMean(observations)\n", 183 | " var = 0\n", 184 | " for o in observations:\n", 185 | " var += (o - mean)**r \n", 186 | " return var/len(observations)\n", 187 | "\n", 188 | "# skewness\n", 189 | "def skewness(observations):\n", 190 | " return centralMoment(observations, 3)/(centralMoment(2)**(1.75))\n", 191 | " \n", 192 | "# kurtosis\n", 193 | "def 
kurtosis(observations):\n", 194 | " return (centralMoment(observations, 4)/(centralMoment(observations, 2)**2))-3\n", 195 | "\n", 196 | "# outliers detected via Box Plot Limits\n", 197 | "def outliers(observations):\n", 198 | " outliers = list()\n", 199 | " iqr = interquartileChange(observations)\n", 200 | " quartiles = quartiles(observation)\n", 201 | " for o in observations:\n", 202 | " if (o < quartiles[0]-(1.5*iqr) or o > quartiles[1]+(1.5*iqr)):\n", 203 | " outliers.append(o)\n", 204 | " return outliers.sort()" 205 | ] 206 | } 207 | ], 208 | "metadata": { 209 | "colab": { 210 | "authorship_tag": "ABX9TyMRgjmGxlkEda/3PMq45i8O", 211 | "include_colab_link": true, 212 | "name": "Descriptive Statistics.ipynb", 213 | "provenance": [], 214 | "toc_visible": true 215 | }, 216 | "kernelspec": { 217 | "display_name": "Python 3", 218 | "language": "python", 219 | "name": "python3" 220 | }, 221 | "language_info": { 222 | "codemirror_mode": { 223 | "name": "ipython", 224 | "version": 3 225 | }, 226 | "file_extension": ".py", 227 | "mimetype": "text/x-python", 228 | "name": "python", 229 | "nbconvert_exporter": "python", 230 | "pygments_lexer": "ipython3", 231 | "version": "3.8.5" 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 1 236 | } 237 | -------------------------------------------------------------------------------- /BasicBinaryClassification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "aD7ZYYXCtBr6" 7 | }, 8 | "source": [ 9 | "# Student Classification" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "id": "8VG3o1bl0cM_" 16 | }, 17 | "source": [ 18 | "#### Framework and Data" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "colab": { 26 | "base_uri": "https://localhost:8080/", 27 | "height": 351 28 | }, 29 | "id": "6Jo7D19MfAiR", 30 | "outputId": "6849e234-327b-4a0d-de46-f62edf88cc2a" 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "from sklearn.preprocessing import StandardScaler\n", 35 | "from sklearn.linear_model import LogisticRegression\n", 36 | "from sklearn.metrics import accuracy_score\n", 37 | "from sklearn.metrics import classification_report\n", 38 | "from sklearn.externals import joblib\n", 39 | "import os\n", 40 | "import pandas as pd\n", 41 | "import numpy as np\n", 42 | "\n", 43 | "df = pd.read_csv(\"https://raw.githubusercontent.com/dipanjanS/practical-machine-learning-with-python/master/notebooks/Ch01_Machine_Learning_Basics/student_records.csv\")\n", 44 | "df" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "id": "v5mcq08q0pgo" 51 | }, 52 | "source": [ 53 | "#### Preprocessing" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 97, 59 | "metadata": { 60 | "id": "9deR0HhNfj1B" 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "feature_names = [\"OverallGrade\", \"Obedient\", \"ResearchScore\", \"ProjectScore\"]\n", 65 | "\n", 66 | "target = df[\"Recommend\"]\n", 67 | "features = df[feature_names]\n", 68 | "\n", 69 | "numerical_keys = [\"ResearchScore\", \"ProjectScore\"]\n", 70 | "categorical_keys = [\"OverallGrade\", \"Obedient\"]" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 98, 76 | "metadata": { 77 | "colab": { 78 | "base_uri": "https://localhost:8080/" 79 | }, 80 | "id": "_ZQ_jR6blU1Z", 81 | "outputId": "2eeba529-d34d-44bc-80ba-5875990ec63a" 82 | }, 83 | "outputs": [ 84 | { 
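The stderr block captured below is a SettingWithCopyWarning: the preprocessing cell further down assigns scaled columns into features, which is created as a slice of df. Taking an explicit copy avoids the warning; a minimal sketch reusing that cell's names (an illustration, not a change the notebook actually makes):

ss = StandardScaler()
ss.fit(df[numerical_keys])
features = df[feature_names].copy()                        # explicit copy instead of a view of df
features[numerical_keys] = ss.transform(df[numerical_keys])

Separately, from sklearn.externals import joblib was removed in scikit-learn 0.23; on recent versions, import joblib directly.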
85 | "name": "stderr", 86 | "output_type": "stream", 87 | "text": [ 88 | "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: \n", 89 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 90 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 91 | "\n", 92 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 93 | " import sys\n", 94 | "/usr/local/lib/python3.6/dist-packages/pandas/core/indexing.py:1734: SettingWithCopyWarning: \n", 95 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 96 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 97 | "\n", 98 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 99 | " isetter(loc, value[:, i].tolist())\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "# conditional attribution on (target==\"Yes\"): 1 for true, 0 for false\n", 105 | "target = np.where(target == \"Yes\", 1, 0)\n", 106 | "\n", 107 | "# scaling\n", 108 | "ss = StandardScaler()\n", 109 | "ss.fit(df[numerical_keys])\n", 110 | "features[numerical_keys] = ss.transform(df[numerical_keys])\n", 111 | "\n", 112 | "# one-hot encoding\n", 113 | "features = pd.get_dummies(features, columns=categorical_keys)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": { 119 | "id": "n1Uy7IAn002N" 120 | }, 121 | "source": [ 122 | "#### Training" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 99, 128 | "metadata": { 129 | "id": "byqambyAxY77" 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "# validation split\n", 134 | "test_target = target[5:]\n", 135 | "test_features = features[5:]\n", 136 | "\n", 137 | "# training split\n", 138 | "target = target[:5]\n", 139 | "features = features[:5]" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 100, 145 | "metadata": { 146 | "colab": { 147 | "base_uri": "https://localhost:8080/" 148 | }, 149 | "id": "u-azfrhrpPs-", 150 | "outputId": "f86d1c86-b72b-48b4-a496-9d692a5dc4a2" 151 | }, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", 157 | " intercept_scaling=1, l1_ratio=None, max_iter=100,\n", 158 | " multi_class='auto', n_jobs=None, penalty='l2',\n", 159 | " random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n", 160 | " warm_start=False)" 161 | ] 162 | }, 163 | "execution_count": 100, 164 | "metadata": { 165 | "tags": [] 166 | }, 167 | "output_type": "execute_result" 168 | } 169 | ], 170 | "source": [ 171 | "lr = LogisticRegression()\n", 172 | "\n", 173 | "model = lr.fit(features, target)\n", 174 | "\n", 175 | "model" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": { 181 | "id": "8DkrD3Fo06yT" 182 | }, 183 | "source": [ 184 | "#### Evaluation" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 101, 190 | "metadata": { 191 | "colab": { 192 | "base_uri": "https://localhost:8080/" 193 | }, 194 | "id": "JAlbBX_FwHXh", 195 | "outputId": "aade60e7-03ad-470e-f544-cf45556327ce" 196 | }, 197 | "outputs": [ 198 | { 199 | "name": "stdout", 200 | "output_type": "stream", 201 | "text": [ 202 | "Accuracy: 100.0 %\n", 203 | "Classification Stats: \n", 204 | " precision recall f1-score support\n", 205 | "\n", 206 | " 0 1.00 1.00 1.00 2\n", 207 | " 1 1.00 
1.00 1.00 1\n", 208 | "\n", 209 | " accuracy 1.00 3\n", 210 | " macro avg 1.00 1.00 1.00 3\n", 211 | "weighted avg 1.00 1.00 1.00 3\n", 212 | "\n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "predictions = model.predict(test_features)\n", 218 | "print(\"Accuracy: \", float(accuracy_score(test_target, predictions))*100, \"%\")\n", 219 | "print(\"Classification Stats: \\n\", classification_report(test_target, predictions))" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": { 225 | "id": "PWI6RTUO08_W" 226 | }, 227 | "source": [ 228 | "#### Deploy" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 102, 234 | "metadata": { 235 | "colab": { 236 | "base_uri": "https://localhost:8080/" 237 | }, 238 | "id": "N-91j3SetLSJ", 239 | "outputId": "be4e1c32-b5de-4243-f32d-edffe2f5fb7c" 240 | }, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/plain": [ 245 | "['Scaler/scaler.pickle']" 246 | ] 247 | }, 248 | "execution_count": 102, 249 | "metadata": { 250 | "tags": [] 251 | }, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "# to save and load the model and the scaler we need to run it on a local machine\n", 257 | "\n", 258 | "if not os.path.exists('Model'):\n", 259 | " os.mkdir('Model')\n", 260 | "if not os.path.exists('Scaler'):\n", 261 | " os.mkdir('Scaler')\n", 262 | "\n", 263 | "# create the files\n", 264 | "joblib.dump(model, r'Model/model.pickle')\n", 265 | "joblib.dump(ss, r'Scaler/scaler.pickle')\n", 266 | "\n", 267 | "# load the files\n", 268 | "model = joblib.load(r'Model/model.pickle')\n", 269 | "scaler = joblib.load(r'Scaler/scaler.pickle')" 270 | ] 271 | } 272 | ], 273 | "metadata": { 274 | "colab": { 275 | "authorship_tag": "ABX9TyPa82QyNIf7nf4CVJzTruSF", 276 | "include_colab_link": true, 277 | "name": "BasicBinaryClassification.ipynb", 278 | "provenance": [] 279 | }, 280 | "kernelspec": { 281 | "display_name": "Python 3", 282 | "language": "python", 283 | "name": "python3" 284 | }, 285 | "language_info": { 286 | "codemirror_mode": { 287 | "name": "ipython", 288 | "version": 3 289 | }, 290 | "file_extension": ".py", 291 | "mimetype": "text/x-python", 292 | "name": "python", 293 | "nbconvert_exporter": "python", 294 | "pygments_lexer": "ipython3", 295 | "version": "3.8.5" 296 | } 297 | }, 298 | "nbformat": 4, 299 | "nbformat_minor": 1 300 | } 301 | -------------------------------------------------------------------------------- /TFIDF.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "language_info": { 4 | "codemirror_mode": { 5 | "name": "ipython", 6 | "version": 3 7 | }, 8 | "file_extension": ".py", 9 | "mimetype": "text/x-python", 10 | "name": "python", 11 | "nbconvert_exporter": "python", 12 | "pygments_lexer": "ipython3", 13 | "version": "3.8.5-final" 14 | }, 15 | "orig_nbformat": 2, 16 | "kernelspec": { 17 | "name": "python3", 18 | "display_name": "Python 3", 19 | "language": "python" 20 | } 21 | }, 22 | "nbformat": 4, 23 | "nbformat_minor": 2, 24 | "cells": [ 25 | { 26 | "cell_type": "code", 27 | "execution_count": 9, 28 | "source": [ 29 | "import pandas as pd\n", 30 | "import math" 31 | ], 32 | "outputs": [], 33 | "metadata": {} 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 10, 38 | "source": [ 39 | "doc1 = [\"The sky is blue.\", \"The sun is bright today.\"]\n", 40 | "\n", 41 | "doc2 = [\"The sun in the sky is bright.\", \"We can see the shining sun, the bright sun.\"]" 42 | ], 43 | "outputs": [], 44 | 
"metadata": {} 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 11, 49 | "source": [ 50 | "# Create tokens for each document\n", 51 | "doc1_tokens = sum([doc.lower().replace(',', '').replace('.', '').split() for doc in doc1], [])\n", 52 | "doc2_tokens = sum([doc.lower().replace(',', '').replace('.', '').split() for doc in doc2], [])\n", 53 | "\n", 54 | "print(doc1_tokens)\n", 55 | "print('\\n')\n", 56 | "print(doc2_tokens)" 57 | ], 58 | "outputs": [ 59 | { 60 | "output_type": "stream", 61 | "name": "stdout", 62 | "text": [ 63 | "['the', 'sky', 'is', 'blue', 'the', 'sun', 'is', 'bright', 'today']\n", 64 | "\n", 65 | "\n", 66 | "['the', 'sun', 'in', 'the', 'sky', 'is', 'bright', 'we', 'can', 'see', 'the', 'shining', 'sun', 'the', 'bright', 'sun']\n" 67 | ] 68 | } 69 | ], 70 | "metadata": {} 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 12, 75 | "source": [ 76 | "# Remove stop words\n", 77 | "stopwords = ['a', 'the', 'i', 'me', 'is', 'to', 'then', 'what', 'are', 'for', 'my', 'as', 'can', 'and', 'in', 'of', 'am', 'it']\n", 78 | "\n", 79 | "doc1_tokens = set(doc1_tokens) - set(stopwords)\n", 80 | "doc2_tokens = set(doc2_tokens) - set(stopwords)\n", 81 | "\n", 82 | "\n", 83 | "print(doc1_tokens)\n", 84 | "print(doc2_tokens)" 85 | ], 86 | "outputs": [ 87 | { 88 | "output_type": "stream", 89 | "name": "stdout", 90 | "text": [ 91 | "{'sky', 'blue', 'today', 'bright', 'sun'}\n", 92 | "{'sky', 'shining', 'see', 'we', 'bright', 'sun'}\n" 93 | ] 94 | } 95 | ], 96 | "metadata": {} 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 13, 101 | "source": [ 102 | "# Find the unique set of tokens.\n", 103 | "unique_tokens = set(doc1_tokens).union(set(doc2_tokens))\n", 104 | "\n", 105 | "print(unique_tokens)" 106 | ], 107 | "outputs": [ 108 | { 109 | "output_type": "stream", 110 | "name": "stdout", 111 | "text": [ 112 | "{'sky', 'blue', 'today', 'shining', 'see', 'we', 'bright', 'sun'}\n" 113 | ] 114 | } 115 | ], 116 | "metadata": {} 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 14, 121 | "source": [ 122 | "# Maintain a dict to keep track of count of the unique words in the individual doc\n", 123 | "count_doc1 = dict.fromkeys(unique_tokens, 0)\n", 124 | "for token in doc1_tokens:\n", 125 | " count_doc1[token] += 1\n", 126 | " \n", 127 | "count_doc2 = dict.fromkeys(unique_tokens, 0)\n", 128 | "for token in doc2_tokens:\n", 129 | " count_doc2[token] += 1\n", 130 | " \n", 131 | " \n", 132 | "print(count_doc1)\n", 133 | "print('\\n')\n", 134 | "print(count_doc2)\n" 135 | ], 136 | "outputs": [ 137 | { 138 | "output_type": "stream", 139 | "name": "stdout", 140 | "text": [ 141 | "{'sky': 1, 'blue': 1, 'today': 1, 'shining': 0, 'see': 0, 'we': 0, 'bright': 1, 'sun': 1}\n", 142 | "\n", 143 | "\n", 144 | "{'sky': 1, 'blue': 0, 'today': 0, 'shining': 1, 'see': 1, 'we': 1, 'bright': 1, 'sun': 1}\n" 145 | ] 146 | } 147 | ], 148 | "metadata": {} 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 15, 153 | "source": [ 154 | "# Calculate term frequency (TF):\n", 155 | "def calculate_tf(count_doc, doc_tokens):\n", 156 | " tf = dict()\n", 157 | " for token, count in count_doc.items():\n", 158 | " tf[token] = count / float(len(doc_tokens))\n", 159 | " return tf\n", 160 | " \n", 161 | "tf1 = calculate_tf(count_doc=count_doc1, doc_tokens=doc1_tokens)\n", 162 | "tf2 = calculate_tf(count_doc=count_doc2, doc_tokens=doc2_tokens)\n", 163 | "\n", 164 | "print(tf1)\n", 165 | "print('\\n', tf2)" 166 | ], 167 | "outputs": [ 168 | { 169 | 
"output_type": "stream", 170 | "name": "stdout", 171 | "text": [ 172 | "{'sky': 0.2, 'blue': 0.2, 'today': 0.2, 'shining': 0.0, 'see': 0.0, 'we': 0.0, 'bright': 0.2, 'sun': 0.2}\n", 173 | "\n", 174 | " {'sky': 0.16666666666666666, 'blue': 0.0, 'today': 0.0, 'shining': 0.16666666666666666, 'see': 0.16666666666666666, 'we': 0.16666666666666666, 'bright': 0.16666666666666666, 'sun': 0.16666666666666666}\n" 175 | ] 176 | } 177 | ], 178 | "metadata": {} 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 16, 183 | "source": [ 184 | "# Calculate inverse document frequency IDF\n", 185 | "def calculate_idf(doc_counts):\n", 186 | " idf = dict.fromkeys(doc_counts[0].keys(), 0)\n", 187 | " for doc in doc_counts: \n", 188 | " for token, count in doc.items():\n", 189 | " if count!=0:\n", 190 | " idf[token] += 1\n", 191 | "\n", 192 | " \n", 193 | " for token, count in idf.items():\n", 194 | " idf[token] = math.log(len(doc_counts) / float(count))\n", 195 | " \n", 196 | " return idf\n", 197 | "\n", 198 | "idf = calculate_idf([count_doc1, count_doc2])\n", 199 | "print(idf)" 200 | ], 201 | "outputs": [ 202 | { 203 | "output_type": "stream", 204 | "name": "stdout", 205 | "text": [ 206 | "{'sky': 0.0, 'blue': 0.6931471805599453, 'today': 0.6931471805599453, 'shining': 0.6931471805599453, 'see': 0.6931471805599453, 'we': 0.6931471805599453, 'bright': 0.0, 'sun': 0.0}\n" 207 | ] 208 | } 209 | ], 210 | "metadata": {} 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 17, 215 | "source": [ 216 | "# Calculate TF-IDF\n", 217 | "def calculate_tfidf(tf, idf):\n", 218 | " tfidf = dict()\n", 219 | " for token, count in tf.items():\n", 220 | " tfidf[token] = count * idf[token]\n", 221 | " return tfidf" 222 | ], 223 | "outputs": [], 224 | "metadata": {} 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 18, 229 | "source": [ 230 | "\n", 231 | "tfidf1 = calculate_tfidf(tf1, idf)\n", 232 | "tfidf2 = calculate_tfidf(tf2, idf)" 233 | ], 234 | "outputs": [], 235 | "metadata": {} 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 19, 240 | "source": [ 241 | "# Create a dataframe for all the calculated values\n", 242 | "tfidf_df = pd.DataFrame([tfidf1, tfidf2])\n", 243 | "tfidf_df" 244 | ], 245 | "outputs": [ 246 | { 247 | "output_type": "execute_result", 248 | "data": { 249 | "text/plain": [ 250 | " sky blue today shining see we bright sun\n", 251 | "0 0.0 0.138629 0.138629 0.000000 0.000000 0.000000 0.0 0.0\n", 252 | "1 0.0 0.000000 0.000000 0.115525 0.115525 0.115525 0.0 0.0" 253 | ], 254 | "text/html": [ 255 | "
\n", 256 | "\n", 269 | "\n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | "
skybluetodayshiningseewebrightsun
00.00.1386290.1386290.0000000.0000000.0000000.00.0
10.00.0000000.0000000.1155250.1155250.1155250.00.0
\n", 308 | "
" 309 | ] 310 | }, 311 | "metadata": {}, 312 | "execution_count": 19 313 | } 314 | ], 315 | "metadata": {} 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "source": [], 321 | "outputs": [], 322 | "metadata": {} 323 | } 324 | ] 325 | } -------------------------------------------------------------------------------- /Count-TfIdf.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 11, 6 | "metadata": { 7 | "colab": {}, 8 | "colab_type": "code", 9 | "id": "N8gs4LslFVYv" 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "from sklearn.feature_extraction.text import CountVectorizer\n", 14 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 15 | "from pandas import DataFrame" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 12, 21 | "metadata": { 22 | "colab": {}, 23 | "colab_type": "code", 24 | "id": "Km4-r8KsFhUV" 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "def create_document_term_matrix(message_list, vectorizer):\n", 29 | " doc_term_matrix = vectorizer.fit_transform(message_list)\n", 30 | " return DataFrame(doc_term_matrix.toarray(), \n", 31 | " columns=vectorizer.get_feature_names())" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 13, 37 | "metadata": { 38 | "colab": {}, 39 | "colab_type": "code", 40 | "id": "jFTR9b78GDmU" 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "msg_1 = [\"My name is ABCD\",\n", 45 | " \"I want to learn NLP\"]" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 14, 51 | "metadata": { 52 | "colab": {}, 53 | "colab_type": "code", 54 | "id": "VTUZYWjFGXN0" 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "count_vect = CountVectorizer()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 15, 64 | "metadata": { 65 | "colab": { 66 | "base_uri": "https://localhost:8080/", 67 | "height": 110 68 | }, 69 | "colab_type": "code", 70 | "id": "c8tprPLMGbp5", 71 | "outputId": "78158e87-f03e-4241-8705-68b81c690bc7" 72 | }, 73 | "outputs": [ 74 | { 75 | "name": "stderr", 76 | "output_type": "stream", 77 | "text": [ 78 | "/Users/tamanna/opt/anaconda3/envs/tf/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.\n", 79 | " warnings.warn(msg, category=FutureWarning)\n" 80 | ] 81 | }, 82 | { 83 | "data": { 84 | "text/html": [ 85 | "
\n", 86 | "\n", 99 | "\n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | "
abcdislearnmynamenlptowant
011011000
100100111
\n", 138 | "
" 139 | ], 140 | "text/plain": [ 141 | " abcd is learn my name nlp to want\n", 142 | "0 1 1 0 1 1 0 0 0\n", 143 | "1 0 0 1 0 0 1 1 1" 144 | ] 145 | }, 146 | "execution_count": 15, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "create_document_term_matrix(msg_1, count_vect)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": { 158 | "colab_type": "text", 159 | "id": "i9FqaTO-JCD4" 160 | }, 161 | "source": [ 162 | "# TF-IDF" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": { 168 | "colab_type": "text", 169 | "id": "lyrGxv0-JEbl" 170 | }, 171 | "source": [ 172 | "## Term Frequency (tf)\n", 173 | "\n" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": { 179 | "colab_type": "text", 180 | "id": "kZi1FgpoI6m8" 181 | }, 182 | "source": [ 183 | "$$\n", 184 | "t f_{i, j}=\\frac{n_{i, j}}{\\sum_{k} n_{i, j}}\n", 185 | "$$" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": { 191 | "colab_type": "text", 192 | "id": "9SgL3wCwJBXZ" 193 | }, 194 | "source": [ 195 | "## Inverse Data Frequency (idf)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": { 201 | "colab_type": "text", 202 | "id": "-ehyK7mHI9eX" 203 | }, 204 | "source": [ 205 | "$$\n", 206 | "i d f(w)=\\log \\left(\\frac{N}{d f_{t}}\\right)\n", 207 | "$$" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 6, 213 | "metadata": { 214 | "colab": {}, 215 | "colab_type": "code", 216 | "id": "vVUeG-xfGhBk" 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "msg_2 = [\"ABCD is my name\",\n", 221 | " \"ABCD likes NLP\"]" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 7, 227 | "metadata": { 228 | "colab": {}, 229 | "colab_type": "code", 230 | "id": "zRGFGs00PzxF" 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "tfidf_vect = TfidfVectorizer()" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 8, 240 | "metadata": { 241 | "colab": { 242 | "base_uri": "https://localhost:8080/", 243 | "height": 110 244 | }, 245 | "colab_type": "code", 246 | "id": "KIP50r8dP4h5", 247 | "outputId": "79e2fe12-8580-4c47-d415-070b26924b99" 248 | }, 249 | "outputs": [ 250 | { 251 | "name": "stderr", 252 | "output_type": "stream", 253 | "text": [ 254 | "/Users/tamanna/opt/anaconda3/envs/tf/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.\n", 255 | " warnings.warn(msg, category=FutureWarning)\n" 256 | ] 257 | }, 258 | { 259 | "data": { 260 | "text/html": [ 261 | "
\n", 262 | "\n", 275 | "\n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | "
abcdislanguagelikesmynameprogrammingpython
00.3799780.5340460.0000000.0000000.5340460.5340460.0000000.000000
10.3351760.0000000.4710780.4710780.0000000.0000000.4710780.471078
\n", 314 | "
" 315 | ], 316 | "text/plain": [ 317 | " abcd is language likes my name programming \\\n", 318 | "0 0.379978 0.534046 0.000000 0.000000 0.534046 0.534046 0.000000 \n", 319 | "1 0.335176 0.000000 0.471078 0.471078 0.000000 0.000000 0.471078 \n", 320 | "\n", 321 | " python \n", 322 | "0 0.000000 \n", 323 | "1 0.471078 " 324 | ] 325 | }, 326 | "execution_count": 8, 327 | "metadata": {}, 328 | "output_type": "execute_result" 329 | } 330 | ], 331 | "source": [ 332 | "create_document_term_matrix(msg_2, tfidf_vect)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 9, 338 | "metadata": { 339 | "colab": {}, 340 | "colab_type": "code", 341 | "id": "74DpTD4nP8x8" 342 | }, 343 | "outputs": [], 344 | "source": [ 345 | "msg_3 = [\"ABCD ABCD is my name\",\n", 346 | " \"ABCD likes NLP\"]" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 10, 352 | "metadata": { 353 | "colab": { 354 | "base_uri": "https://localhost:8080/", 355 | "height": 110 356 | }, 357 | "colab_type": "code", 358 | "id": "nVcFm_WkUDqR", 359 | "outputId": "2dfbe2cb-007b-4eec-fa58-cb2a7fafda69" 360 | }, 361 | "outputs": [ 362 | { 363 | "name": "stderr", 364 | "output_type": "stream", 365 | "text": [ 366 | "/Users/tamanna/opt/anaconda3/envs/tf/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.\n", 367 | " warnings.warn(msg, category=FutureWarning)\n" 368 | ] 369 | }, 370 | { 371 | "data": { 372 | "text/html": [ 373 | "
\n", 374 | "\n", 387 | "\n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | "
abcdislanguagelikesmynameprogrammingpython
00.6348090.4461010.0000000.0000000.4461010.4461010.0000000.000000
10.3351760.0000000.4710780.4710780.0000000.0000000.4710780.471078
\n", 426 | "
" 427 | ], 428 | "text/plain": [ 429 | " abcd is language likes my name programming \\\n", 430 | "0 0.634809 0.446101 0.000000 0.000000 0.446101 0.446101 0.000000 \n", 431 | "1 0.335176 0.000000 0.471078 0.471078 0.000000 0.000000 0.471078 \n", 432 | "\n", 433 | " python \n", 434 | "0 0.000000 \n", 435 | "1 0.471078 " 436 | ] 437 | }, 438 | "execution_count": 10, 439 | "metadata": {}, 440 | "output_type": "execute_result" 441 | } 442 | ], 443 | "source": [ 444 | "create_document_term_matrix(msg_3, tfidf_vect)" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 0, 450 | "metadata": { 451 | "colab": {}, 452 | "colab_type": "code", 453 | "id": "ge3NcWkAUJxZ" 454 | }, 455 | "outputs": [], 456 | "source": [ 457 | "msg_4 = [\"ABCD ABCD ABCD is my name\",\n", 458 | " \"I like NLP\"]" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 0, 464 | "metadata": { 465 | "colab": { 466 | "base_uri": "https://localhost:8080/", 467 | "height": 110 468 | }, 469 | "colab_type": "code", 470 | "id": "9gMHUqo3UZQx", 471 | "outputId": "6a1e7221-ad92-4ffe-98fb-c378523ef158" 472 | }, 473 | "outputs": [ 474 | { 475 | "data": { 476 | "text/html": [ 477 | "
\n", 478 | "\n", 491 | "\n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | "
bhaveshislanguagelikemynameprogrammingpython
00.8660250.2886750.00.00.2886750.2886750.00.0
10.0000000.0000000.50.50.0000000.0000000.50.5
\n", 530 | "
" 531 | ], 532 | "text/plain": [ 533 | " bhavesh is language like my name programming python\n", 534 | "0 0.866025 0.288675 0.0 0.0 0.288675 0.288675 0.0 0.0\n", 535 | "1 0.000000 0.000000 0.5 0.5 0.000000 0.000000 0.5 0.5" 536 | ] 537 | }, 538 | "execution_count": 46, 539 | "metadata": { 540 | "tags": [] 541 | }, 542 | "output_type": "execute_result" 543 | } 544 | ], 545 | "source": [ 546 | "create_document_term_matrix(msg_4, tfidf_vect)" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": 0, 552 | "metadata": { 553 | "colab": {}, 554 | "colab_type": "code", 555 | "id": "XdzcLOBlUba8" 556 | }, 557 | "outputs": [], 558 | "source": [] 559 | } 560 | ], 561 | "metadata": { 562 | "colab": { 563 | "collapsed_sections": [], 564 | "name": "count_tf_idf.ipynb", 565 | "provenance": [] 566 | }, 567 | "kernelspec": { 568 | "display_name": "Python 3 (ipykernel)", 569 | "language": "python", 570 | "name": "python3" 571 | }, 572 | "language_info": { 573 | "codemirror_mode": { 574 | "name": "ipython", 575 | "version": 3 576 | }, 577 | "file_extension": ".py", 578 | "mimetype": "text/x-python", 579 | "name": "python", 580 | "nbconvert_exporter": "python", 581 | "pygments_lexer": "ipython3", 582 | "version": "3.7.11" 583 | } 584 | }, 585 | "nbformat": 4, 586 | "nbformat_minor": 1 587 | } 588 | -------------------------------------------------------------------------------- /Working-with-PDF-Text.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Working with PDF Files" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "pip install PyPDF2" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 3, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# note the capitalization\n", 24 | "import PyPDF2" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Reading PDFs\n", 32 | "\n", 33 | "First we open a pdf, then create a reader object for it. Notice how we use the binary method of reading , 'rb', instead of just 'r'." 
34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# Notice we read it as a binary with 'rb'\n", 43 | "f = open('US_Declaration.pdf','rb')" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 5, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "pdf_reader = PyPDF2.PdfFileReader(f)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 6, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/plain": [ 63 | "5" 64 | ] 65 | }, 66 | "execution_count": 6, 67 | "metadata": {}, 68 | "output_type": "execute_result" 69 | } 70 | ], 71 | "source": [ 72 | "pdf_reader.numPages" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 7, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "page_one = pdf_reader.getPage(0)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "We can then extract the text:" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 8, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "page_one_text = page_one.extractText()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 9, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "\"Declaration of IndependenceIN CONGRESS, July 4, 1776. The unanimous Declaration of the thirteen united States of America, When in the Course of human events, it becomes necessary for one people to dissolve the\\npolitical bands which have connected them with another, and to assume among the powers of the\\nearth, the separate and equal station to which the Laws of Nature and of Nature's God entitle\\n\\nthem, a decent respect to the opinions of mankind requires that they should declare the causes\\n\\nwhich impel them to the separation. \\nWe hold these truths to be self-evident, that all men are created equal, that they are endowed by\\n\\ntheir Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit\\nof Happiness.ŠThat to secure these rights, Governments are instituted among Men, deriving\\n\\ntheir just powers from the consent of the governed,ŠThat whenever any Form of Government\\nbecomes destructive of these ends, it is the Right of the People to alter or to abolish it, and to\\ninstitute new Government, laying its foundation on such principles and organizing its powers in\\nsuch form, as to them shall seem most likely to effect their Safety and Happiness. Prudence,\\n\\nindeed, will dictate that Governments long established should not be changed for light and\\ntransient causes; and accordingly all experience hath shewn, that mankind are more disposed to\\nsuffer, while evils are sufferable, than to right themselves by abolishing the forms to which they\\n\\nare accustomed. But when a long train of abuses and usurpations, pursuing invariably the same\\nObject evinces a design to reduce them under absolute Despotism, it is their right, it is their duty,\\nto throw off such Government, and to provide new Guards for their future security.ŠSuch has\\nbeen the patient sufferance of these Colonies; and such is now the necessity which constrains\\n\\nthem to alter their former Systems of Government. The history of the present King of Great\\n\\nBritain is a history of repeated injuries and usurpations, all having in direct object the\\nestablishment of an absolute Tyranny over these States. 
To prove this, let Facts be submitted to a\\ncandid world. He has refused his Assent to Laws, the most wholesome and necessary for the\\npublic good.\\nHe has forbidden his Governors to pass Laws of immediate and pressing\\nimportance, unless suspended in their operation till his Assent should be obtained;\\nand when so suspended, he has utterly neglected to attend to them.\\n\\nHe has refused to pass other Laws for the accommodation of large districts of\\npeople, unless those people would relinquish the right of Representation in the\\nLegislature, a right inestimable to them and formidable to tyrants only. \\n\\nHe has called together legislative bodies at places unusual, uncomfortable, and distant\\nfrom the depository of their public Records, for the sole purpose of fatiguing them into\\ncompliance with his measures.\\n\"" 109 | ] 110 | }, 111 | "execution_count": 9, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "page_one_text" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 10, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "f.close()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "## Adding to PDFs\n", 134 | "\n", 135 | "We can not write to PDFs using Python because of the differences between the single string type of Python, and the variety of fonts, placements, and other parameters that a PDF could have.\n", 136 | "\n", 137 | "What we *can* do is copy pages and append pages to the end." 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 11, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "f = open('US_Declaration.pdf','rb')\n", 147 | "pdf_reader = PyPDF2.PdfFileReader(f)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 12, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "first_page = pdf_reader.getPage(0)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 13, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "pdf_writer = PyPDF2.PdfFileWriter()" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 14, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "pdf_writer.addPage(first_page)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 15, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "pdf_output = open(\"Some_New_Doc.pdf\",\"wb\")" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 16, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "pdf_writer.write(pdf_output)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 17, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "pdf_output.close()\n", 202 | "f.close()" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "Now we have copied a page and added it to another new document!" 
210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "___" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "## Simple Example\n", 224 | "\n", 225 | "Let's try to grab all the text from this PDF file:" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 18, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "f = open('US_Declaration.pdf','rb')\n", 235 | "\n", 236 | "# List of every page's text.\n", 237 | "# The index will correspond to the page number.\n", 238 | "pdf_text = [0] # zero is a placehoder to make page 1 = index 1\n", 239 | "\n", 240 | "pdf_reader = PyPDF2.PdfFileReader(f)\n", 241 | "\n", 242 | "for p in range(pdf_reader.numPages):\n", 243 | " \n", 244 | " page = pdf_reader.getPage(p)\n", 245 | " \n", 246 | " pdf_text.append(page.extractText())\n", 247 | "\n", 248 | "f.close()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 19, 254 | "metadata": {}, 255 | "outputs": [ 256 | { 257 | "data": { 258 | "text/plain": [ 259 | "[0,\n", 260 | " \"Declaration of IndependenceIN CONGRESS, July 4, 1776. The unanimous Declaration of the thirteen united States of America, When in the Course of human events, it becomes necessary for one people to dissolve the\\npolitical bands which have connected them with another, and to assume among the powers of the\\nearth, the separate and equal station to which the Laws of Nature and of Nature's God entitle\\n\\nthem, a decent respect to the opinions of mankind requires that they should declare the causes\\n\\nwhich impel them to the separation. \\nWe hold these truths to be self-evident, that all men are created equal, that they are endowed by\\n\\ntheir Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit\\nof Happiness.ŠThat to secure these rights, Governments are instituted among Men, deriving\\n\\ntheir just powers from the consent of the governed,ŠThat whenever any Form of Government\\nbecomes destructive of these ends, it is the Right of the People to alter or to abolish it, and to\\ninstitute new Government, laying its foundation on such principles and organizing its powers in\\nsuch form, as to them shall seem most likely to effect their Safety and Happiness. Prudence,\\n\\nindeed, will dictate that Governments long established should not be changed for light and\\ntransient causes; and accordingly all experience hath shewn, that mankind are more disposed to\\nsuffer, while evils are sufferable, than to right themselves by abolishing the forms to which they\\n\\nare accustomed. But when a long train of abuses and usurpations, pursuing invariably the same\\nObject evinces a design to reduce them under absolute Despotism, it is their right, it is their duty,\\nto throw off such Government, and to provide new Guards for their future security.ŠSuch has\\nbeen the patient sufferance of these Colonies; and such is now the necessity which constrains\\n\\nthem to alter their former Systems of Government. The history of the present King of Great\\n\\nBritain is a history of repeated injuries and usurpations, all having in direct object the\\nestablishment of an absolute Tyranny over these States. To prove this, let Facts be submitted to a\\ncandid world. 
He has refused his Assent to Laws, the most wholesome and necessary for the\\npublic good.\\nHe has forbidden his Governors to pass Laws of immediate and pressing\\nimportance, unless suspended in their operation till his Assent should be obtained;\\nand when so suspended, he has utterly neglected to attend to them.\\n\\nHe has refused to pass other Laws for the accommodation of large districts of\\npeople, unless those people would relinquish the right of Representation in the\\nLegislature, a right inestimable to them and formidable to tyrants only. \\n\\nHe has called together legislative bodies at places unusual, uncomfortable, and distant\\nfrom the depository of their public Records, for the sole purpose of fatiguing them into\\ncompliance with his measures.\\n\",\n", 261 | " 'He has dissolved Representative Houses repeatedly, for opposing with manlyfirmness his invasions on the rights of the people.He has refused for a long time, after such dissolutions, to cause others to beelected; whereby the Legislative powers, incapable of Annihilation, have returnedto the People at large for their exercise; the State remaining in the mean timeexposed to all the dangers of invasion from without, and convulsions within.He has endeavoured to prevent the population of these States; for that purposeobstructing the Laws for Naturalization of Foreigners; refusing to pass others toencourage their migrations hither, and raising the conditions of newAppropriations of Lands.He has obstructed the Administration of Justice, by refusing his Assent to Lawsfor establishing Judiciary powers.He has made Judges dependent on his Will alone, for the tenure of their offices,and the amount and payment of their salaries.He has erected a multitude of New Offices, and sent hither swarms of Officers toharrass our people, and eat out their substance.He has kept among us, in times of peace, Standing Armies without the Consent ofour legislatures.He has affected to render the Military independent of and superior to the Civil power.He has combined with others to subject us to a jurisdiction foreign to ourconstitution, and unacknowledged by our laws; giving his Assent to their Acts ofpretended Legislation:For Quartering large bodies of armed troops among us:For protecting them, by a mock Trial, from punishment for any Murders whichthey should commit on the Inhabitants of these States:For cutting off our Trade with all parts of the world:For imposing Taxes on us without our Consent: For depriving us in many cases,of the benefits of Trial by Jury:For transporting us beyond Seas to be tried for pretended offencesFor abolishing the free System of English Laws in a neighbouring Province,establishing therein an Arbitrary government, and enlarging its Boundaries so as',\n", 262 | " 'to render it at once an example and fit instrument for introducing the sameabsolute rule into these Colonies:For taking away our Charters, abolishing our most valuable Laws, and alteringfundamentally the Forms of our Governments:For suspending our own Legislatures, and declaring themselves invested withpower to legislate for us in all cases whatsoever.He has abdicated Government here, by declaring us out of his Protection andwaging War against us.He has plundered our seas, ravaged our Coasts, burnt our towns, and destroyed thelives of our people.He is at this time transporting large Armies of foreign Mercenaries to compleatthe works of death, desolation and tyranny, already begun with circumstances ofCruelty & perfidy scarcely paralleled in the most 
barbarous ages, and totallyunworthy of the Head of a civilized nation.He has constrained our fellow Citizens taken Captive on the high Seas to bearArms against their Country, to become the executioners of their friends and\\nBrethren, or to fall themselves by their Hands.He has excited domestic insurrections amongst us, and has endeavoured to bringon the inhabitants of our frontiers, the merciless Indian Savages, whose known\\nrule of warfare, is an undistinguished destruction of all ages, sexes and conditions. In every stage of these Oppressions We have Petitioned for Redress in the most humble terms:Our repeated Petitions have been answered only by repeated injury. A Prince whose character isthus marked by every act which may define a Tyrant, is unfit to be the ruler of a free people. Nor have We been wanting in attentions to our Brittish brethren. We have warned them fromtime to time of attempts by their legislature to extend an unwarrantable jurisdiction over us. Wehave reminded them of the circumstances of our emigration and settlement here. We haveappealed to their native justice and magnanimity, and we have conjured them by the ties of ourcommon kindred to disavow these usurpations, which, would inevitably interrupt ourconnections and correspondence. They too have been deaf to the voice of justice and ofconsanguinity. We must, therefore, acquiesce in the necessity, which denounces our Separation,and hold them, as we hold the rest of mankind, Enemies in War, in Peace Friends. We, therefore, the Representatives of the united States of America, in General Congress,Assembled, appealing to the Supreme Judge of the world for the rectitude of our intentions, do,in the Name, and by Authority of the good People of these Colonies, solemnly publish anddeclare, That these United Colonies are, and of Right ought to be Free and Independent States;that they are Absolved from all Allegiance to the British Crown, and that all political connection',\n", 263 | " 'between them and the State of Great Britain, is and ought to be totally dissolved; and that as Free\\n\\nand Independent States, they have full Power to levy War, conclude Peace, contract Alliances,\\nestablish Commerce, and to do all other Acts and Things which Independent States may of right\\n\\ndo. 
And for the support of this Declaration, with a firm reliance on the protection of divine\\nProvidence, we mutually pledge to each other our Lives, our Fortunes and our sacred Honor.\\n\\n[The 56 signatures on the Declaration were arranged in six columns:\\n] [Column 1]\\n Georgia: Button Gwinnett\\n Lyman \\nHall George Walton \\n[Column 2]\\n North Carolina: William Hooper\\n Joseph Hewes\\n John Penn\\n South Carolina: Edward Ru\\ntledge Thomas Heyward, Jr.\\n Thomas Lynch, Jr.\\n Arthur Middleton \\n[Column 3]\\n Massachusetts: John Hancock\\n\\n Maryland: Samuel Chase\\n\\n William Paca\\n\\n Thomas Stone\\n\\n Charles Carroll of Carrollton\\n\\n Virginia: George Wythe\\n\\n Richard Henry Lee\\n\\n Thomas Jefferson\\n\\n Benjamin Harrison\\n\\n Thomas Nelson, Jr.\\n\\n Francis Lightfoot Lee\\n\\n Carter Braxton \\n\\n[Column 4]\\n Pennsylvania: Robert Morris\\n\\n Benjamin Rush\\n Benjamin Fran\\nklin John Morton\\n',\n", 264 | " ' George Clymer\\n James Smith\\n George Taylor\\n James Wilson\\n George Ross\\n Delaware: Caesar Rodney\\n George Read\\n Thomas McKean \\n[Column 5]\\n New York: Wi\\nlliam Floyd Philip Livingston\\n Francis L\\newis Lewis Morris\\n New Jersey: Richard Stockton\\n John Witherspoon\\n Francis Hopkinson\\n John Hart\\n Abraham Clark \\n[Column 6]\\n New Hampshire: Josiah Bartlett\\n William Whipple\\n Massachusetts: Samuel Adams\\n John Adams\\n Robert Treat Paine\\n Elbridge Gerry\\n Rhode Island: Stephen Hopkins\\n William Ellery\\n Connecticut: Roger Sherman\\n Samuel Huntington\\n William Williams\\n Oliver Wolcott\\n New Hampshire: Matthew Thornton\\n ']" 265 | ] 266 | }, 267 | "execution_count": 19, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "pdf_text" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 20, 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "name": "stdout", 283 | "output_type": "stream", 284 | "text": [ 285 | "He has dissolved Representative Houses repeatedly, for opposing with manlyfirmness his invasions on the rights of the people.He has refused for a long time, after such dissolutions, to cause others to beelected; whereby the Legislative powers, incapable of Annihilation, have returnedto the People at large for their exercise; the State remaining in the mean timeexposed to all the dangers of invasion from without, and convulsions within.He has endeavoured to prevent the population of these States; for that purposeobstructing the Laws for Naturalization of Foreigners; refusing to pass others toencourage their migrations hither, and raising the conditions of newAppropriations of Lands.He has obstructed the Administration of Justice, by refusing his Assent to Lawsfor establishing Judiciary powers.He has made Judges dependent on his Will alone, for the tenure of their offices,and the amount and payment of their salaries.He has erected a multitude of New Offices, and sent hither swarms of Officers toharrass our people, and eat out their substance.He has kept among us, in times of peace, Standing Armies without the Consent ofour legislatures.He has affected to render the Military independent of and superior to the Civil power.He has combined with others to subject us to a jurisdiction foreign to ourconstitution, and unacknowledged by our laws; giving his Assent to their Acts ofpretended Legislation:For Quartering large bodies of armed troops among us:For protecting them, by a mock Trial, from punishment for any Murders whichthey should commit on the 
Inhabitants of these States:For cutting off our Trade with all parts of the world:For imposing Taxes on us without our Consent: For depriving us in many cases,of the benefits of Trial by Jury:For transporting us beyond Seas to be tried for pretended offencesFor abolishing the free System of English Laws in a neighbouring Province,establishing therein an Arbitrary government, and enlarging its Boundaries so as\n" 286 | ] 287 | } 288 | ], 289 | "source": [ 290 | "print(pdf_text[2])" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "### Excellent work! \n", 298 | "That is all for PyPDF2 for now, remember that this won't work with every PDF file and is limited in its scope to only the text of PDFs.\n", 299 | "## Next up: Regular Expressions" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [] 308 | } 309 | ], 310 | "metadata": { 311 | "kernelspec": { 312 | "display_name": "Python 3 (ipykernel)", 313 | "language": "python", 314 | "name": "python3" 315 | }, 316 | "language_info": { 317 | "codemirror_mode": { 318 | "name": "ipython", 319 | "version": 3 320 | }, 321 | "file_extension": ".py", 322 | "mimetype": "text/x-python", 323 | "name": "python", 324 | "nbconvert_exporter": "python", 325 | "pygments_lexer": "ipython3", 326 | "version": "3.7.11" 327 | } 328 | }, 329 | "nbformat": 4, 330 | "nbformat_minor": 2 331 | } 332 | -------------------------------------------------------------------------------- /Spacy-Basics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# spaCy Basics" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Installation and Setup\n", 17 | "\n", 18 | "Installation is a two-step process. First, install spaCy using either conda or pip. Next, download the specific model you want, based on language.
For more info visit https://spacy.io/usage/\n", 19 | "\n", 20 | "### 1. From the command line or terminal:\n", 21 | "> `conda install -c conda-forge spacy`\n", 22 | ">
*or*
\n", 23 | "> `pip install -U spacy`\n", 24 | "\n", 25 | "> ### Alternatively you can create a virtual environment:\n", 26 | "> `conda create -n spacyenv python=3 spacy=2`\n", 27 | "\n", 28 | "### 2. Next, also from the command line (you must run this as admin or use sudo):\n", 29 | "\n", 30 | "> `python -m spacy download en`\n", 31 | "\n", 32 | "> ### If successful, you should see a message like:\n", 33 | "\n", 34 | "> **`Linking successful`**
\n", 35 | "> ` C:\\Anaconda3\\envs\\spacyenv\\lib\\site-packages\\en_core_web_sm -->`
\n", 36 | "> ` C:\\Anaconda3\\envs\\spacyenv\\lib\\site-packages\\spacy\\data\\en`
\n", 37 | "> ` `
\n", 38 | "> ` You can now load the model via spacy.load('en')`\n" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "# Working with spaCy in Python\n", 46 | "\n", 47 | "This is a typical set of instructions for importing and working with spaCy. Don't be surprised if this takes awhile - spaCy has a fairly large library to load:" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 5, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "ename": "OSError", 57 | "evalue": "[E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.", 58 | "output_type": "error", 59 | "traceback": [ 60 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 61 | "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", 62 | "\u001b[0;32m/var/folders/14/nc2ygsds1flfyq_lf5n38g600000gn/T/ipykernel_2850/1711385060.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Import spaCy and load the language library\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mspacy\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mnlp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mspacy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'en_core_web_sm'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;31m# Create a Doc object\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 63 | "\u001b[0;32m~/opt/anaconda3/envs/tf/lib/python3.7/site-packages/spacy/__init__.py\u001b[0m in \u001b[0;36mload\u001b[0;34m(name, vocab, disable, exclude, config)\u001b[0m\n\u001b[1;32m 50\u001b[0m \"\"\"\n\u001b[1;32m 51\u001b[0m return util.load_model(\n\u001b[0;32m---> 52\u001b[0;31m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvocab\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvocab\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdisable\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdisable\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexclude\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mexclude\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 53\u001b[0m )\n\u001b[1;32m 54\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 64 | "\u001b[0;32m~/opt/anaconda3/envs/tf/lib/python3.7/site-packages/spacy/util.py\u001b[0m in \u001b[0;36mload_model\u001b[0;34m(name, vocab, disable, exclude, config)\u001b[0m\n\u001b[1;32m 425\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mOLD_MODEL_SHORTCUTS\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 426\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mIOError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mErrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mE941\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfull\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mOLD_MODEL_SHORTCUTS\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m 
\u001b[0;31m# type: ignore[index]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 427\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mIOError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mErrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mE050\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 428\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 429\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 65 | "\u001b[0;31mOSError\u001b[0m: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory." 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "# Import spaCy and load the language library\n", 71 | "import spacy\n", 72 | "nlp = spacy.load('en_core_web_sm')\n", 73 | "\n", 74 | "# Create a Doc object\n", 75 | "doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')\n", 76 | "\n", 77 | "# Print each token separately\n", 78 | "for token in doc:\n", 79 | " print(token.text, token.pos_, token.dep_)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "This doesn't look very user-friendly, but right away we see some interesting things happen:\n", 87 | "1. Tesla is recognized to be a Proper Noun, not just a word at the start of a sentence\n", 88 | "2. U.S. is kept together as one entity (we call this a 'token')\n", 89 | "\n", 90 | "As we dive deeper into spaCy we'll see what each of these abbreviations mean and how they're derived. We'll also see how spaCy can interpret the last three tokens combined `$6 million` as referring to ***money***." 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "___\n", 98 | "# spaCy Objects\n", 99 | "\n", 100 | "After importing the spacy module in the cell above we loaded a **model** and named it `nlp`.
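Note that the [E050] error shown in the output above just means the model package has not been downloaded yet; loading only works after a one-time download. A small sketch of the usual fix, assuming a reasonably recent spaCy install (the programmatic fallback uses spacy.cli.download, and the command-line form is `python -m spacy download en_core_web_sm`):

# Load the small English model, downloading it first if it is missing
import spacy

try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    # One-time download of the model package, then retry the load
    from spacy.cli import download
    download('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')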
Next we created a **Doc** object by applying the model to our text, and named it `doc`.
spaCy also builds a companion **Vocab** object that we'll cover in later sections.
The **Doc** object that holds the processed text is our focus here." 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "___\n", 108 | "# Pipeline\n", 109 | "When we run `nlp`, our text enters a *processing pipeline* that first breaks down the text and then performs a series of operations to tag, parse and describe the data. Image source: https://spacy.io/usage/spacy-101#pipelines" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "We can check to see what components currently live in the pipeline. In later sections we'll learn how to disable components and add new ones as needed." 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 2, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "data": { 133 | "text/plain": [ 134 | "[('tagger', ),\n", 135 | " ('parser', ),\n", 136 | " ('ner', )]" 137 | ] 138 | }, 139 | "execution_count": 2, 140 | "metadata": {}, 141 | "output_type": "execute_result" 142 | } 143 | ], 144 | "source": [ 145 | "nlp.pipeline" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 3, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "['tagger', 'parser', 'ner']" 157 | ] 158 | }, 159 | "execution_count": 3, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "nlp.pipe_names" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "___\n", 173 | "## Tokenization\n", 174 | "The first step in processing text is to split up all the component parts (words & punctuation) into \"tokens\". These tokens are annotated inside the Doc object to contain descriptive information. We'll go into much more detail on tokenization in an upcoming lecture. For now, let's look at another example:" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 4, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "Tesla PROPN nsubj\n", 187 | "is VERB aux\n", 188 | "n't ADV neg\n", 189 | " SPACE \n", 190 | "looking VERB ROOT\n", 191 | "into ADP prep\n", 192 | "startups NOUN pobj\n", 193 | "anymore ADV advmod\n", 194 | ". PUNCT punct\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "doc2 = nlp(u\"Tesla isn't looking into startups anymore.\")\n", 200 | "\n", 201 | "for token in doc2:\n", 202 | " print(token.text, token.pos_, token.dep_)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "Notice how `isn't` has been split into two tokens. spaCy recognizes both the root verb `is` and the negation attached to it. Notice also that both the extended whitespace and the period at the end of the sentence are assigned their own tokens.\n", 210 | "\n", 211 | "It's important to note that even though `doc2` contains processed information about each token, it also retains the original text:" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 5, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "Tesla isn't looking into startups anymore." 
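The repr shown above comes straight from the stored text; the same string is also exposed as an attribute, which is handy when you need the plain Python str back. A tiny sketch using the attributes documented for spaCy Doc objects:

# The processed Doc still carries the verbatim original string
print(doc2.text)          # Tesla isn't looking into startups anymore.
print(type(doc2.text))    # <class 'str'> - a plain Python string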
223 | ] 224 | }, 225 | "execution_count": 5, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "doc2" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 6, 237 | "metadata": {}, 238 | "outputs": [ 239 | { 240 | "data": { 241 | "text/plain": [ 242 | "Tesla" 243 | ] 244 | }, 245 | "execution_count": 6, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "doc2[0]" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 7, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/plain": [ 262 | "spacy.tokens.doc.Doc" 263 | ] 264 | }, 265 | "execution_count": 7, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "type(doc2)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "___\n", 279 | "## Part-of-Speech Tagging (POS)\n", 280 | "The next step after splitting the text up into tokens is to assign parts of speech. In the above example, `Tesla` was recognized to be a ***proper noun***. Here some statistical modeling is required. For example, words that follow \"the\" are typically nouns.\n", 281 | "\n", 282 | "For a full list of POS Tags visit https://spacy.io/api/annotation#pos-tagging" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 8, 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "text/plain": [ 293 | "'PROPN'" 294 | ] 295 | }, 296 | "execution_count": 8, 297 | "metadata": {}, 298 | "output_type": "execute_result" 299 | } 300 | ], 301 | "source": [ 302 | "doc2[0].pos_" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "___\n", 310 | "## Dependencies\n", 311 | "We also looked at the syntactic dependencies assigned to each token. `Tesla` is identified as an `nsubj` or the ***nominal subject*** of the sentence.\n", 312 | "\n", 313 | "For a full list of Syntactic Dependencies visit https://spacy.io/api/annotation#dependency-parsing\n", 314 | "
A good explanation of typed dependencies can be found [here](https://nlp.stanford.edu/software/dependencies_manual.pdf)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 9, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "data": { 324 | "text/plain": [ 325 | "'nsubj'" 326 | ] 327 | }, 328 | "execution_count": 9, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | } 332 | ], 333 | "source": [ 334 | "doc2[0].dep_" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "To see the full name of a tag use `spacy.explain(tag)`" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 10, 347 | "metadata": {}, 348 | "outputs": [ 349 | { 350 | "data": { 351 | "text/plain": [ 352 | "'proper noun'" 353 | ] 354 | }, 355 | "execution_count": 10, 356 | "metadata": {}, 357 | "output_type": "execute_result" 358 | } 359 | ], 360 | "source": [ 361 | "spacy.explain('PROPN')" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 11, 367 | "metadata": {}, 368 | "outputs": [ 369 | { 370 | "data": { 371 | "text/plain": [ 372 | "'nominal subject'" 373 | ] 374 | }, 375 | "execution_count": 11, 376 | "metadata": {}, 377 | "output_type": "execute_result" 378 | } 379 | ], 380 | "source": [ 381 | "spacy.explain('nsubj')" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "___\n", 389 | "## Additional Token Attributes\n", 390 | "We'll see these again in upcoming lectures. For now we just want to illustrate some of the other information that spaCy assigns to tokens:" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": {}, 396 | "source": [ 397 | "|Tag|Description|doc2[0].tag|\n", 398 | "|:------|:------:|:------|\n", 399 | "|`.text`|The original word text|`Tesla`|\n", 400 | "|`.lemma_`|The base form of the word|`tesla`|\n", 401 | "|`.pos_`|The simple part-of-speech tag|`PROPN`/`proper noun`|\n", 402 | "|`.tag_`|The detailed part-of-speech tag|`NNP`/`noun, proper singular`|\n", 403 | "|`.shape_`|The word shape – capitalization, punctuation, digits|`Xxxxx`|\n", 404 | "|`.is_alpha`|Is the token an alpha character?|`True`|\n", 405 | "|`.is_stop`|Is the token part of a stop list, i.e. the most common words of the language?|`False`|" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 12, 411 | "metadata": {}, 412 | "outputs": [ 413 | { 414 | "name": "stdout", 415 | "output_type": "stream", 416 | "text": [ 417 | "looking\n", 418 | "look\n" 419 | ] 420 | } 421 | ], 422 | "source": [ 423 | "# Lemmas (the base form of the word):\n", 424 | "print(doc2[4].text)\n", 425 | "print(doc2[4].lemma_)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 13, 431 | "metadata": {}, 432 | "outputs": [ 433 | { 434 | "name": "stdout", 435 | "output_type": "stream", 436 | "text": [ 437 | "VERB\n", 438 | "VBG / verb, gerund or present participle\n" 439 | ] 440 | } 441 | ], 442 | "source": [ 443 | "# Simple Parts-of-Speech & Detailed Tags:\n", 444 | "print(doc2[4].pos_)\n", 445 | "print(doc2[4].tag_ + ' / ' + spacy.explain(doc2[4].tag_))" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 14, 451 | "metadata": {}, 452 | "outputs": [ 453 | { 454 | "name": "stdout", 455 | "output_type": "stream", 456 | "text": [ 457 | "Tesla: Xxxxx\n", 458 | "U.S. 
: X.X.\n" 459 | ] 460 | } 461 | ], 462 | "source": [ 463 | "# Word Shapes:\n", 464 | "print(doc2[0].text+': '+doc2[0].shape_)\n", 465 | "print(doc[5].text+' : '+doc[5].shape_)" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 15, 471 | "metadata": {}, 472 | "outputs": [ 473 | { 474 | "name": "stdout", 475 | "output_type": "stream", 476 | "text": [ 477 | "True\n", 478 | "False\n" 479 | ] 480 | } 481 | ], 482 | "source": [ 483 | "# Boolean Values:\n", 484 | "print(doc2[0].is_alpha)\n", 485 | "print(doc2[0].is_stop)" 486 | ] 487 | }, 488 | { 489 | "cell_type": "markdown", 490 | "metadata": {}, 491 | "source": [ 492 | "___\n", 493 | "## Spans\n", 494 | "Large Doc objects can be hard to work with at times. A **span** is a slice of Doc object in the form `Doc[start:stop]`." 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 16, 500 | "metadata": {}, 501 | "outputs": [], 502 | "source": [ 503 | "doc3 = nlp(u'Although commmonly attributed to John Lennon from his song \"Beautiful Boy\", \\\n", 504 | "the phrase \"Life is what happens to us while we are making other plans\" was written by \\\n", 505 | "cartoonist Allen Saunders and published in Reader\\'s Digest in 1957, when Lennon was 17.')" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": 17, 511 | "metadata": {}, 512 | "outputs": [ 513 | { 514 | "name": "stdout", 515 | "output_type": "stream", 516 | "text": [ 517 | "\"Life is what happens to us while we are making other plans\"\n" 518 | ] 519 | } 520 | ], 521 | "source": [ 522 | "life_quote = doc3[16:30]\n", 523 | "print(life_quote)" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": 18, 529 | "metadata": {}, 530 | "outputs": [ 531 | { 532 | "data": { 533 | "text/plain": [ 534 | "spacy.tokens.span.Span" 535 | ] 536 | }, 537 | "execution_count": 18, 538 | "metadata": {}, 539 | "output_type": "execute_result" 540 | } 541 | ], 542 | "source": [ 543 | "type(life_quote)" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "In upcoming lectures we'll see how to create Span objects using `Span()`. This will allow us to assign additional information to the Span." 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": {}, 556 | "source": [ 557 | "___\n", 558 | "## Sentences\n", 559 | "Certain tokens inside a Doc object may also receive a \"start of sentence\" tag. While this doesn't immediately build a list of sentences, these tags enable the generation of sentence segments through `Doc.sents`. Later we'll write our own segmentation rules." 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 19, 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [ 568 | "doc4 = nlp(u'This is the first sentence. This is another sentence. 
This is the last sentence.')" 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": 20, 574 | "metadata": {}, 575 | "outputs": [ 576 | { 577 | "name": "stdout", 578 | "output_type": "stream", 579 | "text": [ 580 | "This is the first sentence.\n", 581 | "This is another sentence.\n", 582 | "This is the last sentence.\n" 583 | ] 584 | } 585 | ], 586 | "source": [ 587 | "for sent in doc4.sents:\n", 588 | " print(sent)" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": 21, 594 | "metadata": {}, 595 | "outputs": [ 596 | { 597 | "data": { 598 | "text/plain": [ 599 | "True" 600 | ] 601 | }, 602 | "execution_count": 21, 603 | "metadata": {}, 604 | "output_type": "execute_result" 605 | } 606 | ], 607 | "source": [ 608 | "doc4[6].is_sent_start" 609 | ] 610 | } 611 | ], 612 | "metadata": { 613 | "kernelspec": { 614 | "display_name": "Python 3 (ipykernel)", 615 | "language": "python", 616 | "name": "python3" 617 | }, 618 | "language_info": { 619 | "codemirror_mode": { 620 | "name": "ipython", 621 | "version": 3 622 | }, 623 | "file_extension": ".py", 624 | "mimetype": "text/x-python", 625 | "name": "python", 626 | "nbconvert_exporter": "python", 627 | "pygments_lexer": "ipython3", 628 | "version": "3.7.11" 629 | } 630 | }, 631 | "nbformat": 4, 632 | "nbformat_minor": 2 633 | } 634 | -------------------------------------------------------------------------------- /Working-with-Text-Files.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Working with Text Files\n", 10 | "In this section we'll cover\n", 11 | " * Working with f-strings (formatted string literals) to format printed text\n", 12 | " * Working with Files - opening, reading, writing and appending text files" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "## Formatted String Literals (f-strings)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "Introduced in Python 3.6, f-strings offer several benefits over the older `.format()` string method.
For one, you can bring outside variables immediately into to the string rather than pass them through as keyword arguments:" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | "His name is Fred.\n", 39 | "His name is Fred.\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "name = 'Fred'\n", 45 | "\n", 46 | "# Using the old .format() method:\n", 47 | "print('His name is {var}.'.format(var=name))\n", 48 | "\n", 49 | "# Using f-strings:\n", 50 | "print(f'His name is {name}.')" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "Pass `!r` to get the string representation:" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "His name is 'Fred'\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "print(f'His name is {name!r}')" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "Be careful not to let quotation marks in the replacement fields conflict with the quoting used in the outer string:" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "ename": "SyntaxError", 91 | "evalue": "invalid syntax (3751357111.py, line 3)", 92 | "output_type": "error", 93 | "traceback": [ 94 | "\u001b[0;36m File \u001b[0;32m\"/var/folders/14/nc2ygsds1flfyq_lf5n38g600000gn/T/ipykernel_2237/3751357111.py\"\u001b[0;36m, line \u001b[0;32m3\u001b[0m\n\u001b[0;31m print(f'Address: {d['a']} Main Street')\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | "d = {'a':123,'b':456}\n", 100 | "\n", 101 | "print(f'Address: {d['a']} Main Street')" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "Instead, use different styles of quotation marks:" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 5, 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "Address: 123 Main Street\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "d = {'a':123,'b':456}\n", 126 | "\n", 127 | "print(f\"Address: {d['a']} Main Street\")" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "### Minimum Widths, Alignment and Padding\n", 135 | "You can pass arguments inside a nested set of curly braces to set a minimum width for the field, the alignment and even padding characters." 
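The same field syntax also accepts the usual format-spec options for numbers, so width, alignment and precision can be combined; a quick self-contained snippet (the variable is just for illustration):

value = 3.14159

print(f'{value:10.2f}')     # right-aligned in a 10-character field with 2 decimal places
print(f'{value:{10}.{4}}')  # nested braces work for the width and precision as well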
136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 6, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "Author Topic Pages \n", 148 | "Twain Rafting 601\n", 149 | "Feynman Physics 95\n", 150 | "Hamilton Mythology 144\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "library = [('Author', 'Topic', 'Pages'), ('Twain', 'Rafting', 601), ('Feynman', 'Physics', 95), ('Hamilton', 'Mythology', 144)]\n", 156 | "\n", 157 | "for book in library:\n", 158 | " print(f'{book[0]:{10}} {book[1]:{8}} {book[2]:{7}}')" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "Here the first three lines align, except `Pages` follows a default left-alignment while numbers are right-aligned. Also, the fourth line's page number is pushed to the right as `Mythology` exceeds the minimum field width of `8`. When setting minimum field widths make sure to take the longest item into account.\n", 166 | "\n", 167 | "To set the alignment, use the character `<` for left-align, `^` for center, `>` for right.
\n", 168 | "To set padding, precede the alignment character with the padding character (`-` and `.` are common choices).\n", 169 | "\n", 170 | "Let's make some adjustments:" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 7, 176 | "metadata": {}, 177 | "outputs": [ 178 | { 179 | "name": "stdout", 180 | "output_type": "stream", 181 | "text": [ 182 | "Author Topic ..Pages\n", 183 | "Twain Rafting ....601\n", 184 | "Feynman Physics .....95\n", 185 | "Hamilton Mythology ....144\n" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "for book in library:\n", 191 | " print(f'{book[0]:{10}} {book[1]:{10}} {book[2]:.>{7}}') # here .> was added" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "### Date Formatting" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 8, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "January 27, 2018\n" 211 | ] 212 | } 213 | ], 214 | "source": [ 215 | "from datetime import datetime\n", 216 | "\n", 217 | "today = datetime(year=2018, month=1, day=27)\n", 218 | "\n", 219 | "print(f'{today:%B %d, %Y}')" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "# Files\n", 227 | "\n", 228 | "Python uses file objects to interact with external files on your computer. These file objects can be any sort of file you have on your computer, whether it be an audio file, a text file, emails, Excel documents, etc. Note: You will probably need to install certain libraries or modules to interact with those various file types, but they are easily available. (We will cover downloading modules later on in the course).\n", 229 | "\n", 230 | "Python has a built-in open function that allows us to open and play with basic file types. First we will need a file though. We're going to use some IPython magic to create a text file!\n", 231 | "\n", 232 | "## Creating a File with IPython\n", 233 | "#### This function is specific to jupyter notebooks! Alternatively, quickly create a simple .txt file with Sublime text editor." 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 9, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "name": "stdout", 243 | "output_type": "stream", 244 | "text": [ 245 | "Overwriting test.txt\n" 246 | ] 247 | } 248 | ], 249 | "source": [ 250 | "%%writefile test.txt\n", 251 | "Hello, this is a quick test file.\n", 252 | "This is the second line of the file." 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "## Python Opening a File\n", 260 | "\n", 261 | "### Know Your File's Location\n", 262 | "\n", 263 | "It's easy to get an error on this step:" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 11, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "myfile = open('test.txt')" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "To avoid this error, make sure your .txt file is saved in the same location as your notebook. 
To check your notebook location, use **pwd**:" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 10, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "'C:\\\\Users\\\\Mike\\\\NLP-Bootcamp\\\\00-Python-Text-Basics'" 291 | ] 292 | }, 293 | "execution_count": 10, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "pwd" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "**Alternatively, to grab files from any location on your computer, simply pass in the entire file path. **\n", 307 | "\n", 308 | "For Windows you need to use double \\ so python doesn't treat the second \\ as an escape character, a file path is in the form:\n", 309 | "\n", 310 | " myfile = open(\"C:\\\\Users\\\\YourUserName\\\\Home\\\\Folder\\\\myfile.txt\")\n", 311 | "\n", 312 | "For MacOS and Linux you use slashes in the opposite direction:\n", 313 | "\n", 314 | " myfile = open(\"/Users/YourUserName/Folder/myfile.txt\")" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 11, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "# Open the text.txt file we created earlier\n", 324 | "my_file = open('test.txt')" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 12, 330 | "metadata": {}, 331 | "outputs": [ 332 | { 333 | "data": { 334 | "text/plain": [ 335 | "<_io.TextIOWrapper name='test.txt' mode='r' encoding='cp1252'>" 336 | ] 337 | }, 338 | "execution_count": 12, 339 | "metadata": {}, 340 | "output_type": "execute_result" 341 | } 342 | ], 343 | "source": [ 344 | "my_file" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "`my_file` is now an open file object held in memory. We'll perform some reading and writing exercises, and then we have to close the file to free up memory.\n", 352 | "\n", 353 | "### .read() and .seek()" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 13, 359 | "metadata": {}, 360 | "outputs": [ 361 | { 362 | "data": { 363 | "text/plain": [ 364 | "'Hello, this is a quick test file.\\nThis is the second line of the file.'" 365 | ] 366 | }, 367 | "execution_count": 13, 368 | "metadata": {}, 369 | "output_type": "execute_result" 370 | } 371 | ], 372 | "source": [ 373 | "# We can now read the file\n", 374 | "my_file.read()" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 14, 380 | "metadata": {}, 381 | "outputs": [ 382 | { 383 | "data": { 384 | "text/plain": [ 385 | "''" 386 | ] 387 | }, 388 | "execution_count": 14, 389 | "metadata": {}, 390 | "output_type": "execute_result" 391 | } 392 | ], 393 | "source": [ 394 | "# But what happens if we try to read it again?\n", 395 | "my_file.read()" 396 | ] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "metadata": {}, 401 | "source": [ 402 | "This happens because you can imagine the reading \"cursor\" is at the end of the file after having read it. So there is nothing left to read. 
We can reset the \"cursor\" like this:" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 15, 408 | "metadata": {}, 409 | "outputs": [ 410 | { 411 | "data": { 412 | "text/plain": [ 413 | "0" 414 | ] 415 | }, 416 | "execution_count": 15, 417 | "metadata": {}, 418 | "output_type": "execute_result" 419 | } 420 | ], 421 | "source": [ 422 | "# Seek to the start of file (index 0)\n", 423 | "my_file.seek(0)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 16, 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "data": { 433 | "text/plain": [ 434 | "'Hello, this is a quick test file.\\nThis is the second line of the file.'" 435 | ] 436 | }, 437 | "execution_count": 16, 438 | "metadata": {}, 439 | "output_type": "execute_result" 440 | } 441 | ], 442 | "source": [ 443 | "# Now read again\n", 444 | "my_file.read()" 445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "metadata": {}, 450 | "source": [ 451 | "### .readlines()\n", 452 | "You can read a file line by line using the readlines method. Use caution with large files, since everything will be held in memory. We will learn how to iterate over large files later in the course." 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 17, 458 | "metadata": {}, 459 | "outputs": [ 460 | { 461 | "data": { 462 | "text/plain": [ 463 | "['Hello, this is a quick test file.\\n', 'This is the second line of the file.']" 464 | ] 465 | }, 466 | "execution_count": 17, 467 | "metadata": {}, 468 | "output_type": "execute_result" 469 | } 470 | ], 471 | "source": [ 472 | "# Readlines returns a list of the lines in the file\n", 473 | "my_file.seek(0)\n", 474 | "my_file.readlines()" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": {}, 480 | "source": [ 481 | "When you have finished using a file, it is always good practice to close it." 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": 18, 487 | "metadata": {}, 488 | "outputs": [], 489 | "source": [ 490 | "my_file.close()" 491 | ] 492 | }, 493 | { 494 | "cell_type": "markdown", 495 | "metadata": {}, 496 | "source": [ 497 | "## Writing to a File\n", 498 | "\n", 499 | "By default, the `open()` function will only allow us to read the file. We need to pass the argument `'w'` to write over the file. For example:" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": 19, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "# Add a second argument to the function, 'w' which stands for write.\n", 509 | "# Passing 'w+' lets us read and write to the file\n", 510 | "\n", 511 | "my_file = open('test.txt','w+')" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": {}, 517 | "source": [ 518 | "
**Use caution!**
\n", 519 | "Opening a file with 'w' or 'w+' *truncates the original*, meaning that anything that was in the original file **is deleted**!
" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 20, 525 | "metadata": {}, 526 | "outputs": [ 527 | { 528 | "data": { 529 | "text/plain": [ 530 | "24" 531 | ] 532 | }, 533 | "execution_count": 20, 534 | "metadata": {}, 535 | "output_type": "execute_result" 536 | } 537 | ], 538 | "source": [ 539 | "# Write to the file\n", 540 | "my_file.write('This is a new first line')" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": 21, 546 | "metadata": {}, 547 | "outputs": [ 548 | { 549 | "data": { 550 | "text/plain": [ 551 | "'This is a new first line'" 552 | ] 553 | }, 554 | "execution_count": 21, 555 | "metadata": {}, 556 | "output_type": "execute_result" 557 | } 558 | ], 559 | "source": [ 560 | "# Read the file\n", 561 | "my_file.seek(0)\n", 562 | "my_file.read()" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 22, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [ 571 | "my_file.close() # always do this when you're done with a file" 572 | ] 573 | }, 574 | { 575 | "cell_type": "markdown", 576 | "metadata": {}, 577 | "source": [ 578 | "## Appending to a File\n", 579 | "Passing the argument `'a'` opens the file and puts the pointer at the end, so anything written is appended. Like `'w+'`, `'a+'` lets us read and write to a file. If the file does not exist, one will be created." 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": 23, 585 | "metadata": {}, 586 | "outputs": [ 587 | { 588 | "data": { 589 | "text/plain": [ 590 | "23" 591 | ] 592 | }, 593 | "execution_count": 23, 594 | "metadata": {}, 595 | "output_type": "execute_result" 596 | } 597 | ], 598 | "source": [ 599 | "my_file = open('test.txt','a+')\n", 600 | "my_file.write('\\nThis line is being appended to test.txt')\n", 601 | "my_file.write('\\nAnd another line here.')" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": 24, 607 | "metadata": {}, 608 | "outputs": [ 609 | { 610 | "name": "stdout", 611 | "output_type": "stream", 612 | "text": [ 613 | "This is a new first line\n", 614 | "This line is being appended to test.txt\n", 615 | "And another line here.\n" 616 | ] 617 | } 618 | ], 619 | "source": [ 620 | "my_file.seek(0)\n", 621 | "print(my_file.read())" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": 25, 627 | "metadata": {}, 628 | "outputs": [], 629 | "source": [ 630 | "my_file.close()" 631 | ] 632 | }, 633 | { 634 | "cell_type": "markdown", 635 | "metadata": {}, 636 | "source": [ 637 | "### Appending with `%%writefile`\n", 638 | "Jupyter notebook users can do the same thing using IPython cell magic:" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": 26, 644 | "metadata": {}, 645 | "outputs": [ 646 | { 647 | "name": "stdout", 648 | "output_type": "stream", 649 | "text": [ 650 | "Appending to test.txt\n" 651 | ] 652 | } 653 | ], 654 | "source": [ 655 | "%%writefile -a test.txt\n", 656 | "\n", 657 | "This is more text being appended to test.txt\n", 658 | "And another line here." 
659 | ] 660 | }, 661 | { 662 | "cell_type": "markdown", 663 | "metadata": {}, 664 | "source": [ 665 | "Add a blank space if you want the first line to begin on its own line, as Jupyter won't recognize escape sequences like `\\n`" 666 | ] 667 | }, 668 | { 669 | "cell_type": "markdown", 670 | "metadata": {}, 671 | "source": [ 672 | "## Aliases and Context Managers\n", 673 | "You can assign temporary variable names as aliases, and manage the opening and closing of files automatically using a context manager:" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": 27, 679 | "metadata": {}, 680 | "outputs": [ 681 | { 682 | "name": "stdout", 683 | "output_type": "stream", 684 | "text": [ 685 | "This is a new first line\n", 686 | "\n" 687 | ] 688 | } 689 | ], 690 | "source": [ 691 | "with open('test.txt','r') as txt:\n", 692 | " first_line = txt.readlines()[0]\n", 693 | " \n", 694 | "print(first_line)" 695 | ] 696 | }, 697 | { 698 | "cell_type": "markdown", 699 | "metadata": {}, 700 | "source": [ 701 | "Note that the `with ... as ...:` context manager automatically closed `test.txt` after assigning the first line of text to first_line:" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": 28, 707 | "metadata": {}, 708 | "outputs": [ 709 | { 710 | "ename": "ValueError", 711 | "evalue": "I/O operation on closed file.", 712 | "output_type": "error", 713 | "traceback": [ 714 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 715 | "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", 716 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mtxt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 717 | "\u001b[1;31mValueError\u001b[0m: I/O operation on closed file." 718 | ] 719 | } 720 | ], 721 | "source": [ 722 | "txt.read()" 723 | ] 724 | }, 725 | { 726 | "cell_type": "markdown", 727 | "metadata": {}, 728 | "source": [ 729 | "## Iterating through a File" 730 | ] 731 | }, 732 | { 733 | "cell_type": "code", 734 | "execution_count": 29, 735 | "metadata": {}, 736 | "outputs": [ 737 | { 738 | "name": "stdout", 739 | "output_type": "stream", 740 | "text": [ 741 | "This is a new first line\n", 742 | "This line is being appended to test.txt\n", 743 | "And another line here.\n", 744 | "This is more text being appended to test.txt\n", 745 | "And another line here." 746 | ] 747 | } 748 | ], 749 | "source": [ 750 | "with open('test.txt','r') as txt:\n", 751 | " for line in txt:\n", 752 | " print(line, end='') # the end='' argument removes extra linebreaks" 753 | ] 754 | }, 755 | { 756 | "cell_type": "markdown", 757 | "metadata": {}, 758 | "source": [ 759 | "Great! 
Now you should be familiar with formatted string literals and working with text files.\n", 760 | "## Next up: Working with PDF Text" 761 | ] 762 | } 763 | ], 764 | "metadata": { 765 | "kernelspec": { 766 | "display_name": "Python 3 (ipykernel)", 767 | "language": "python", 768 | "name": "python3" 769 | }, 770 | "language_info": { 771 | "codemirror_mode": { 772 | "name": "ipython", 773 | "version": 3 774 | }, 775 | "file_extension": ".py", 776 | "mimetype": "text/x-python", 777 | "name": "python", 778 | "nbconvert_exporter": "python", 779 | "pygments_lexer": "ipython3", 780 | "version": "3.7.11" 781 | } 782 | }, 783 | "nbformat": 4, 784 | "nbformat_minor": 2 785 | } 786 | -------------------------------------------------------------------------------- /Regular-Expressions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "___\n", 8 | "\n", 9 | " \n", 10 | "___" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# Regular Expressions\n", 18 | "\n", 19 | "Regular Expressions (sometimes called regex for short) allow a user to search for strings using almost any sort of rule they can come up with. For example, finding all capital letters in a string, or finding a phone number in a document. \n", 20 | "\n", 21 | "Regular expressions are notorious for their seemingly strange syntax. This strange syntax is a byproduct of their flexibility. Regular expressions have to be able to filter out any string pattern you can imagine, which is why they have a complex string pattern format.\n", 22 | "\n", 23 | "Regular expressions are handled using Python's built-in **re** library. See [the docs](https://docs.python.org/3/library/re.html) for more information." 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "Let's begin by explaining how to search for basic patterns in a string!" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Searching for Basic Patterns\n", 38 | "\n", 39 | "Let's imagine that we have the following string:" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 1, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "text = \"The agent's phone number is 408-555-1234. Call soon!\"" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "We'll start off by trying to find out if the string \"phone\" is inside the text string. Now we could quickly do this with:" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 2, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "True" 67 | ] 68 | }, 69 | "execution_count": 2, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "'phone' in text" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "But let's show the format for regular expressions, because later on we will be searching for patterns that won't have such a simple solution." 
83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 3, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "import re" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 4, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "pattern = 'phone'" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 5, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "data": { 110 | "text/plain": [ 111 | "<_sre.SRE_Match object; span=(12, 17), match='phone'>" 112 | ] 113 | }, 114 | "execution_count": 5, 115 | "metadata": {}, 116 | "output_type": "execute_result" 117 | } 118 | ], 119 | "source": [ 120 | "re.search(pattern,text)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 6, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "pattern = \"NOT IN TEXT\"" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 7, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "re.search(pattern,text)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "Now we've seen that re.search() will take the pattern, scan the text, and then returns a Match object. If no pattern is found, a None is returned (in Jupyter Notebook this just means that nothing is output below the cell).\n", 146 | "\n", 147 | "Let's take a closer look at this Match object." 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 8, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "pattern = 'phone'" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 9, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "match = re.search(pattern,text)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 10, 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "<_sre.SRE_Match object; span=(12, 17), match='phone'>" 177 | ] 178 | }, 179 | "execution_count": 10, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "match" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "Notice the span, there is also a start and end index information." 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 11, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/plain": [ 203 | "(12, 17)" 204 | ] 205 | }, 206 | "execution_count": 11, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "match.span()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 12, 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "data": { 222 | "text/plain": [ 223 | "12" 224 | ] 225 | }, 226 | "execution_count": 12, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "match.start()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 13, 238 | "metadata": {}, 239 | "outputs": [ 240 | { 241 | "data": { 242 | "text/plain": [ 243 | "17" 244 | ] 245 | }, 246 | "execution_count": 13, 247 | "metadata": {}, 248 | "output_type": "execute_result" 249 | } 250 | ], 251 | "source": [ 252 | "match.end()" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "But what if the pattern occurs more than once?" 
260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 14, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "text = \"my phone is a new phone\"" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 15, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "match = re.search(\"phone\",text)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 16, 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "data": { 287 | "text/plain": [ 288 | "(3, 8)" 289 | ] 290 | }, 291 | "execution_count": 16, 292 | "metadata": {}, 293 | "output_type": "execute_result" 294 | } 295 | ], 296 | "source": [ 297 | "match.span()" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "Notice it only matches the first instance. If we wanted a list of all matches, we can use .findall() method:" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 17, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "matches = re.findall(\"phone\",text)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 18, 319 | "metadata": {}, 320 | "outputs": [ 321 | { 322 | "data": { 323 | "text/plain": [ 324 | "['phone', 'phone']" 325 | ] 326 | }, 327 | "execution_count": 18, 328 | "metadata": {}, 329 | "output_type": "execute_result" 330 | } 331 | ], 332 | "source": [ 333 | "matches" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 19, 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "data": { 343 | "text/plain": [ 344 | "2" 345 | ] 346 | }, 347 | "execution_count": 19, 348 | "metadata": {}, 349 | "output_type": "execute_result" 350 | } 351 | ], 352 | "source": [ 353 | "len(matches)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "To get actual match objects, use the iterator:" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 20, 366 | "metadata": {}, 367 | "outputs": [ 368 | { 369 | "name": "stdout", 370 | "output_type": "stream", 371 | "text": [ 372 | "(3, 8)\n", 373 | "(18, 23)\n" 374 | ] 375 | } 376 | ], 377 | "source": [ 378 | "for match in re.finditer(\"phone\",text):\n", 379 | " print(match.span())" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "metadata": {}, 385 | "source": [ 386 | "If you wanted the actual text that matched, you can use the .group() method." 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 21, 392 | "metadata": {}, 393 | "outputs": [ 394 | { 395 | "data": { 396 | "text/plain": [ 397 | "'phone'" 398 | ] 399 | }, 400 | "execution_count": 21, 401 | "metadata": {}, 402 | "output_type": "execute_result" 403 | } 404 | ], 405 | "source": [ 406 | "match.group()" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "# Patterns\n", 414 | "\n", 415 | "So far we've learned how to search for a basic string. What about more complex examples? Such as trying to find a telephone number in a large string of text? Or an email address?\n", 416 | "\n", 417 | "We could just use search method if we know the exact phone or email, but what if we don't know it? 
We may know the general format, and we can use that along with regular expressions to search the document for strings that match a particular pattern.\n", 418 | "\n", 419 | "This is where the syntax may appear strange at first, but take your time with this; often it's just a matter of looking up the pattern code.\n", 420 | "\n", 421 | "Let's begin!" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": {}, 427 | "source": [ 428 | "## Identifiers for Characters in Patterns\n", 429 | "\n", 430 | "Characters such as a digit or a single string have different codes that represent them. You can use these to build up a pattern string. Notice how these make heavy use of the backwards slash \\ . Because of this when defining a pattern string for regular expression we use the format:\n", 431 | "\n", 432 | " r'mypattern'\n", 433 | " \n", 434 | "placing the r in front of the string allows python to understand that the \\ in the pattern string are not meant to be escape slashes.\n", 435 | "\n", 436 | "Below you can find a table of all the possible identifiers:" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "\n", 444 | "\n", 445 | "\n", 446 | "\n", 447 | "\n", 448 | "\n", 449 | "\n", 450 | "\n", 451 | "\n", 452 | "\n", 453 | "\n", 454 | "\n", 455 | "\n", 456 | "\n", 457 | "\n", 458 | "\n", 459 | "
| Character | Description | Example Pattern Code | Example Match |
|:---------:|:------------|:---------------------|:--------------|
| \\d | A digit | file_\\d\\d | file_25 |
| \\w | Alphanumeric | \\w-\\w\\w\\w | A-b_1 |
| \\s | White space | a\\sb\\sc | a b c |
| \\D | A non digit | \\D\\D\\D | ABC |
| \\W | Non-alphanumeric | \\W\\W\\W\\W\\W | *-+=) |
| \\S | Non-whitespace | \\S\\S\\S\\S | Yoyo |
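For instance, here is a quick sketch of a couple of these identifiers in action (the sample string and the patterns below are made up purely for illustration):

```python
import re

sample = "file_25, backed up at 9 am"     # made-up example text

re.findall(r'file_\d\d', sample)          # ['file_25'] -> \d matches a single digit
re.findall(r'\d\s\w\w', sample)           # ['9 am']    -> digit, whitespace, two alphanumerics
```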
" 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "metadata": {}, 465 | "source": [ 466 | "For example:" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 22, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "text = \"My telephone number is 408-555-1234\"" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 23, 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [ 484 | "phone = re.search(r'\\d\\d\\d-\\d\\d\\d-\\d\\d\\d\\d',text)" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 24, 490 | "metadata": {}, 491 | "outputs": [ 492 | { 493 | "data": { 494 | "text/plain": [ 495 | "'408-555-1234'" 496 | ] 497 | }, 498 | "execution_count": 24, 499 | "metadata": {}, 500 | "output_type": "execute_result" 501 | } 502 | ], 503 | "source": [ 504 | "phone.group()" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "metadata": {}, 510 | "source": [ 511 | "Notice the repetition of \\d. That is a bit of an annoyance, especially if we are looking for very long strings of numbers. Let's explore the possible quantifiers.\n", 512 | "\n", 513 | "## Quantifiers\n", 514 | "\n", 515 | "Now that we know the special character designations, we can use them along with quantifiers to define how many we expect." 516 | ] 517 | }, 518 | { 519 | "cell_type": "markdown", 520 | "metadata": {}, 521 | "source": [ 522 | "\n", 523 | "\n", 524 | "\n", 525 | "\n", 526 | "\n", 527 | "\n", 528 | "\n", 529 | "\n", 530 | "\n", 531 | "\n", 532 | "\n", 533 | "\n", 534 | "\n", 535 | "\n", 536 | "\n", 537 | "\n", 538 | "
| Character | Description | Example Pattern Code | Example Match |
|:---------:|:------------|:---------------------|:--------------|
| + | Occurs one or more times | Version \\w-\\w+ | Version A-b1_1 |
| {3} | Occurs exactly 3 times | \\D{3} | abc |
| {2,4} | Occurs 2 to 4 times | \\d{2,4} | 123 |
| {3,} | Occurs 3 or more | \\w{3,} | anycharacters |
| \\* | Occurs zero or more times | A\\*B\\*C* | AAACC |
| ? | Once or none | plurals? | plural |
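As a quick illustration (again with a made-up sample string), quantifiers let us say how many of an identifier we expect:

```python
import re

sample = "cats, 1 cat, id 4512"       # made-up example text

re.findall(r'\d{3,}', sample)         # ['4512']        -> three or more digits in a row
re.findall(r'cats?', sample)          # ['cats', 'cat'] -> the trailing 's' is optional
```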
" 539 | ] 540 | }, 541 | { 542 | "cell_type": "markdown", 543 | "metadata": {}, 544 | "source": [ 545 | "Let's rewrite our pattern using these quantifiers:" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": 25, 551 | "metadata": {}, 552 | "outputs": [ 553 | { 554 | "data": { 555 | "text/plain": [ 556 | "<_sre.SRE_Match object; span=(23, 35), match='408-555-1234'>" 557 | ] 558 | }, 559 | "execution_count": 25, 560 | "metadata": {}, 561 | "output_type": "execute_result" 562 | } 563 | ], 564 | "source": [ 565 | "re.search(r'\\d{3}-\\d{3}-\\d{4}',text)" 566 | ] 567 | }, 568 | { 569 | "cell_type": "markdown", 570 | "metadata": {}, 571 | "source": [ 572 | "## Groups\n", 573 | "\n", 574 | "What if we wanted to do two tasks, find phone numbers, but also be able to quickly extract their area code (the first three digits). We can use groups for any general task that involves grouping together regular expressions (so that we can later break them down). \n", 575 | "\n", 576 | "Using the phone number example, we can separate groups of regular expressions using parentheses:" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 26, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "phone_pattern = re.compile(r'(\\d{3})-(\\d{3})-(\\d{4})')" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 27, 591 | "metadata": {}, 592 | "outputs": [], 593 | "source": [ 594 | "results = re.search(phone_pattern,text)" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": 28, 600 | "metadata": {}, 601 | "outputs": [ 602 | { 603 | "data": { 604 | "text/plain": [ 605 | "'408-555-1234'" 606 | ] 607 | }, 608 | "execution_count": 28, 609 | "metadata": {}, 610 | "output_type": "execute_result" 611 | } 612 | ], 613 | "source": [ 614 | "# The entire result\n", 615 | "results.group()" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": 29, 621 | "metadata": {}, 622 | "outputs": [ 623 | { 624 | "data": { 625 | "text/plain": [ 626 | "'408'" 627 | ] 628 | }, 629 | "execution_count": 29, 630 | "metadata": {}, 631 | "output_type": "execute_result" 632 | } 633 | ], 634 | "source": [ 635 | "# Can then also call by group position.\n", 636 | "# remember groups were separated by parentheses ()\n", 637 | "# Something to note is that group ordering starts at 1. 
Passing in 0 returns everything\n", 638 | "results.group(1)" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": 30, 644 | "metadata": {}, 645 | "outputs": [ 646 | { 647 | "data": { 648 | "text/plain": [ 649 | "'555'" 650 | ] 651 | }, 652 | "execution_count": 30, 653 | "metadata": {}, 654 | "output_type": "execute_result" 655 | } 656 | ], 657 | "source": [ 658 | "results.group(2)" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 31, 664 | "metadata": {}, 665 | "outputs": [ 666 | { 667 | "data": { 668 | "text/plain": [ 669 | "'1234'" 670 | ] 671 | }, 672 | "execution_count": 31, 673 | "metadata": {}, 674 | "output_type": "execute_result" 675 | } 676 | ], 677 | "source": [ 678 | "results.group(3)" 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": 32, 684 | "metadata": {}, 685 | "outputs": [ 686 | { 687 | "ename": "IndexError", 688 | "evalue": "no such group", 689 | "output_type": "error", 690 | "traceback": [ 691 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 692 | "\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)", 693 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# We only had three groups of parentheses\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mresults\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgroup\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m4\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 694 | "\u001b[1;31mIndexError\u001b[0m: no such group" 695 | ] 696 | } 697 | ], 698 | "source": [ 699 | "# We only had three groups of parentheses\n", 700 | "results.group(4)" 701 | ] 702 | }, 703 | { 704 | "cell_type": "markdown", 705 | "metadata": {}, 706 | "source": [ 707 | "## Additional Regex Syntax\n", 708 | "\n", 709 | "### Or operator |\n", 710 | "\n", 711 | "Use the pipe operator to have an **or** statment. For example" 712 | ] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": 33, 717 | "metadata": {}, 718 | "outputs": [ 719 | { 720 | "data": { 721 | "text/plain": [ 722 | "<_sre.SRE_Match object; span=(5, 8), match='man'>" 723 | ] 724 | }, 725 | "execution_count": 33, 726 | "metadata": {}, 727 | "output_type": "execute_result" 728 | } 729 | ], 730 | "source": [ 731 | "re.search(r\"man|woman\",\"This man was here.\")" 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": 34, 737 | "metadata": {}, 738 | "outputs": [ 739 | { 740 | "data": { 741 | "text/plain": [ 742 | "<_sre.SRE_Match object; span=(5, 10), match='woman'>" 743 | ] 744 | }, 745 | "execution_count": 34, 746 | "metadata": {}, 747 | "output_type": "execute_result" 748 | } 749 | ], 750 | "source": [ 751 | "re.search(r\"man|woman\",\"This woman was here.\")" 752 | ] 753 | }, 754 | { 755 | "cell_type": "markdown", 756 | "metadata": {}, 757 | "source": [ 758 | "### The Wildcard Character\n", 759 | "\n", 760 | "Use a \"wildcard\" as a placement that will match any character placed there. You can use a simple period **.** for this. 
For example:" 761 | ] 762 | }, 763 | { 764 | "cell_type": "code", 765 | "execution_count": 35, 766 | "metadata": {}, 767 | "outputs": [ 768 | { 769 | "data": { 770 | "text/plain": [ 771 | "['cat', 'hat', 'sat']" 772 | ] 773 | }, 774 | "execution_count": 35, 775 | "metadata": {}, 776 | "output_type": "execute_result" 777 | } 778 | ], 779 | "source": [ 780 | "re.findall(r\".at\",\"The cat in the hat sat here.\")" 781 | ] 782 | }, 783 | { 784 | "cell_type": "code", 785 | "execution_count": 36, 786 | "metadata": {}, 787 | "outputs": [ 788 | { 789 | "data": { 790 | "text/plain": [ 791 | "['bat', 'lat']" 792 | ] 793 | }, 794 | "execution_count": 36, 795 | "metadata": {}, 796 | "output_type": "execute_result" 797 | } 798 | ], 799 | "source": [ 800 | "re.findall(r\".at\",\"The bat went splat\")" 801 | ] 802 | }, 803 | { 804 | "cell_type": "markdown", 805 | "metadata": {}, 806 | "source": [ 807 | "Notice how we only matched the first 3 letters, that is because we need a **.** for each wildcard letter. Or use the quantifiers described above to set its own rules." 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": 37, 813 | "metadata": {}, 814 | "outputs": [ 815 | { 816 | "data": { 817 | "text/plain": [ 818 | "['e bat', 'splat']" 819 | ] 820 | }, 821 | "execution_count": 37, 822 | "metadata": {}, 823 | "output_type": "execute_result" 824 | } 825 | ], 826 | "source": [ 827 | "re.findall(r\"...at\",\"The bat went splat\")" 828 | ] 829 | }, 830 | { 831 | "cell_type": "markdown", 832 | "metadata": {}, 833 | "source": [ 834 | "However this still leads the problem to grabbing more beforehand. Really we only want words that end with \"at\"." 835 | ] 836 | }, 837 | { 838 | "cell_type": "code", 839 | "execution_count": 38, 840 | "metadata": {}, 841 | "outputs": [ 842 | { 843 | "data": { 844 | "text/plain": [ 845 | "['bat', 'splat']" 846 | ] 847 | }, 848 | "execution_count": 38, 849 | "metadata": {}, 850 | "output_type": "execute_result" 851 | } 852 | ], 853 | "source": [ 854 | "# One or more non-whitespace that ends with 'at'\n", 855 | "re.findall(r'\\S+at',\"The bat went splat\")" 856 | ] 857 | }, 858 | { 859 | "cell_type": "markdown", 860 | "metadata": {}, 861 | "source": [ 862 | "### Starts With and Ends With\n", 863 | "\n", 864 | "We can use the **^** to signal starts with, and the **$** to signal ends with:" 865 | ] 866 | }, 867 | { 868 | "cell_type": "code", 869 | "execution_count": 39, 870 | "metadata": {}, 871 | "outputs": [ 872 | { 873 | "data": { 874 | "text/plain": [ 875 | "['2']" 876 | ] 877 | }, 878 | "execution_count": 39, 879 | "metadata": {}, 880 | "output_type": "execute_result" 881 | } 882 | ], 883 | "source": [ 884 | "# Ends with a number\n", 885 | "re.findall(r'\\d$','This ends with a number 2')" 886 | ] 887 | }, 888 | { 889 | "cell_type": "code", 890 | "execution_count": 40, 891 | "metadata": {}, 892 | "outputs": [ 893 | { 894 | "data": { 895 | "text/plain": [ 896 | "['1']" 897 | ] 898 | }, 899 | "execution_count": 40, 900 | "metadata": {}, 901 | "output_type": "execute_result" 902 | } 903 | ], 904 | "source": [ 905 | "# Starts with a number\n", 906 | "re.findall(r'^\\d','1 is the loneliest number.')" 907 | ] 908 | }, 909 | { 910 | "cell_type": "markdown", 911 | "metadata": {}, 912 | "source": [ 913 | "Note that this is for the entire string, not individual words!" 
914 | ] 915 | }, 916 | { 917 | "cell_type": "markdown", 918 | "metadata": {}, 919 | "source": [ 920 | "### Exclusion\n", 921 | "\n", 922 | "To exclude characters, we can use the **^** symbol in conjunction with a set of brackets **[]**. Anything inside the brackets is excluded. For example:" 923 | ] 924 | }, 925 | { 926 | "cell_type": "code", 927 | "execution_count": 41, 928 | "metadata": {}, 929 | "outputs": [], 930 | "source": [ 931 | "phrase = \"there are 3 numbers 34 inside 5 this sentence.\"" 932 | ] 933 | }, 934 | { 935 | "cell_type": "code", 936 | "execution_count": 42, 937 | "metadata": {}, 938 | "outputs": [ 939 | { 940 | "data": { 941 | "text/plain": [ 942 | "['t',\n", 943 | " 'h',\n", 944 | " 'e',\n", 945 | " 'r',\n", 946 | " 'e',\n", 947 | " ' ',\n", 948 | " 'a',\n", 949 | " 'r',\n", 950 | " 'e',\n", 951 | " ' ',\n", 952 | " ' ',\n", 953 | " 'n',\n", 954 | " 'u',\n", 955 | " 'm',\n", 956 | " 'b',\n", 957 | " 'e',\n", 958 | " 'r',\n", 959 | " 's',\n", 960 | " ' ',\n", 961 | " ' ',\n", 962 | " 'i',\n", 963 | " 'n',\n", 964 | " 's',\n", 965 | " 'i',\n", 966 | " 'd',\n", 967 | " 'e',\n", 968 | " ' ',\n", 969 | " ' ',\n", 970 | " 't',\n", 971 | " 'h',\n", 972 | " 'i',\n", 973 | " 's',\n", 974 | " ' ',\n", 975 | " 's',\n", 976 | " 'e',\n", 977 | " 'n',\n", 978 | " 't',\n", 979 | " 'e',\n", 980 | " 'n',\n", 981 | " 'c',\n", 982 | " 'e',\n", 983 | " '.']" 984 | ] 985 | }, 986 | "execution_count": 42, 987 | "metadata": {}, 988 | "output_type": "execute_result" 989 | } 990 | ], 991 | "source": [ 992 | "re.findall(r'[^\\d]',phrase)" 993 | ] 994 | }, 995 | { 996 | "cell_type": "markdown", 997 | "metadata": {}, 998 | "source": [ 999 | "To get the words back together, use a + sign " 1000 | ] 1001 | }, 1002 | { 1003 | "cell_type": "code", 1004 | "execution_count": 43, 1005 | "metadata": {}, 1006 | "outputs": [ 1007 | { 1008 | "data": { 1009 | "text/plain": [ 1010 | "['there are ', ' numbers ', ' inside ', ' this sentence.']" 1011 | ] 1012 | }, 1013 | "execution_count": 43, 1014 | "metadata": {}, 1015 | "output_type": "execute_result" 1016 | } 1017 | ], 1018 | "source": [ 1019 | "re.findall(r'[^\\d]+',phrase)" 1020 | ] 1021 | }, 1022 | { 1023 | "cell_type": "markdown", 1024 | "metadata": {}, 1025 | "source": [ 1026 | "We can use this to remove punctuation from a sentence." 1027 | ] 1028 | }, 1029 | { 1030 | "cell_type": "code", 1031 | "execution_count": 44, 1032 | "metadata": {}, 1033 | "outputs": [], 1034 | "source": [ 1035 | "test_phrase = 'This is a string! But it has punctuation. How can we remove it?'" 1036 | ] 1037 | }, 1038 | { 1039 | "cell_type": "code", 1040 | "execution_count": 45, 1041 | "metadata": {}, 1042 | "outputs": [ 1043 | { 1044 | "data": { 1045 | "text/plain": [ 1046 | "['This',\n", 1047 | " 'is',\n", 1048 | " 'a',\n", 1049 | " 'string',\n", 1050 | " 'But',\n", 1051 | " 'it',\n", 1052 | " 'has',\n", 1053 | " 'punctuation',\n", 1054 | " 'How',\n", 1055 | " 'can',\n", 1056 | " 'we',\n", 1057 | " 'remove',\n", 1058 | " 'it']" 1059 | ] 1060 | }, 1061 | "execution_count": 45, 1062 | "metadata": {}, 1063 | "output_type": "execute_result" 1064 | } 1065 | ], 1066 | "source": [ 1067 | "re.findall('[^!.? ]+',test_phrase)" 1068 | ] 1069 | }, 1070 | { 1071 | "cell_type": "code", 1072 | "execution_count": 46, 1073 | "metadata": {}, 1074 | "outputs": [], 1075 | "source": [ 1076 | "clean = ' '.join(re.findall('[^!.? 
]+',test_phrase))" 1077 | ] 1078 | }, 1079 | { 1080 | "cell_type": "code", 1081 | "execution_count": 47, 1082 | "metadata": {}, 1083 | "outputs": [ 1084 | { 1085 | "data": { 1086 | "text/plain": [ 1087 | "'This is a string But it has punctuation How can we remove it'" 1088 | ] 1089 | }, 1090 | "execution_count": 47, 1091 | "metadata": {}, 1092 | "output_type": "execute_result" 1093 | } 1094 | ], 1095 | "source": [ 1096 | "clean" 1097 | ] 1098 | }, 1099 | { 1100 | "cell_type": "markdown", 1101 | "metadata": {}, 1102 | "source": [ 1103 | "## Brackets for Grouping\n", 1104 | "\n", 1105 | "As we showed above we can use brackets to group together options, for example if we wanted to find hyphenated words:" 1106 | ] 1107 | }, 1108 | { 1109 | "cell_type": "code", 1110 | "execution_count": 48, 1111 | "metadata": {}, 1112 | "outputs": [], 1113 | "source": [ 1114 | "text = 'Only find the hypen-words in this sentence. But you do not know how long-ish they are'" 1115 | ] 1116 | }, 1117 | { 1118 | "cell_type": "code", 1119 | "execution_count": 49, 1120 | "metadata": {}, 1121 | "outputs": [ 1122 | { 1123 | "data": { 1124 | "text/plain": [ 1125 | "['hypen-words', 'long-ish']" 1126 | ] 1127 | }, 1128 | "execution_count": 49, 1129 | "metadata": {}, 1130 | "output_type": "execute_result" 1131 | } 1132 | ], 1133 | "source": [ 1134 | "re.findall(r'[\\w]+-[\\w]+',text)" 1135 | ] 1136 | }, 1137 | { 1138 | "cell_type": "markdown", 1139 | "metadata": {}, 1140 | "source": [ 1141 | "## Parentheses for Multiple Options\n", 1142 | "\n", 1143 | "If we have multiple options for matching, we can use parentheses to list out these options. For Example:" 1144 | ] 1145 | }, 1146 | { 1147 | "cell_type": "code", 1148 | "execution_count": 50, 1149 | "metadata": {}, 1150 | "outputs": [], 1151 | "source": [ 1152 | "# Find words that start with cat and end with one of these options: 'fish','nap', or 'claw'\n", 1153 | "text = 'Hello, would you like some catfish?'\n", 1154 | "texttwo = \"Hello, would you like to take a catnap?\"\n", 1155 | "textthree = \"Hello, have you seen this caterpillar?\"" 1156 | ] 1157 | }, 1158 | { 1159 | "cell_type": "code", 1160 | "execution_count": 51, 1161 | "metadata": {}, 1162 | "outputs": [ 1163 | { 1164 | "data": { 1165 | "text/plain": [ 1166 | "<_sre.SRE_Match object; span=(27, 34), match='catfish'>" 1167 | ] 1168 | }, 1169 | "execution_count": 51, 1170 | "metadata": {}, 1171 | "output_type": "execute_result" 1172 | } 1173 | ], 1174 | "source": [ 1175 | "re.search(r'cat(fish|nap|claw)',text)" 1176 | ] 1177 | }, 1178 | { 1179 | "cell_type": "code", 1180 | "execution_count": 52, 1181 | "metadata": {}, 1182 | "outputs": [ 1183 | { 1184 | "data": { 1185 | "text/plain": [ 1186 | "<_sre.SRE_Match object; span=(32, 38), match='catnap'>" 1187 | ] 1188 | }, 1189 | "execution_count": 52, 1190 | "metadata": {}, 1191 | "output_type": "execute_result" 1192 | } 1193 | ], 1194 | "source": [ 1195 | "re.search(r'cat(fish|nap|claw)',texttwo)" 1196 | ] 1197 | }, 1198 | { 1199 | "cell_type": "code", 1200 | "execution_count": 53, 1201 | "metadata": {}, 1202 | "outputs": [], 1203 | "source": [ 1204 | "# None returned\n", 1205 | "re.search(r'cat(fish|nap|claw)',textthree)" 1206 | ] 1207 | }, 1208 | { 1209 | "cell_type": "markdown", 1210 | "metadata": {}, 1211 | "source": [ 1212 | "### Conclusion\n", 1213 | "\n", 1214 | "Excellent work! 
For full information on all possible patterns, check out: https://docs.python.org/3/howto/regex.html" 1215 | ] 1216 | }, 1217 | { 1218 | "cell_type": "markdown", 1219 | "metadata": {}, 1220 | "source": [ 1221 | "## Next up: Python Text Basics Assessment" 1222 | ] 1223 | } 1224 | ], 1225 | "metadata": { 1226 | "kernelspec": { 1227 | "display_name": "Python 3", 1228 | "language": "python", 1229 | "name": "python3" 1230 | }, 1231 | "language_info": { 1232 | "codemirror_mode": { 1233 | "name": "ipython", 1234 | "version": 3 1235 | }, 1236 | "file_extension": ".py", 1237 | "mimetype": "text/x-python", 1238 | "name": "python", 1239 | "nbconvert_exporter": "python", 1240 | "pygments_lexer": "ipython3", 1241 | "version": "3.6.2" 1242 | } 1243 | }, 1244 | "nbformat": 4, 1245 | "nbformat_minor": 2 1246 | } 1247 | -------------------------------------------------------------------------------- /Tokenization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "___\n", 8 | "\n", 9 | " \n", 10 | "___" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# Tokenization\n", 18 | "The first step in creating a `Doc` object is to break down the incoming text into component pieces or \"tokens\"." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# Import spaCy and load the language library\n", 28 | "import spacy\n", 29 | "nlp = spacy.load('en_core_web_sm')" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "\"We're moving to L.A.!\"\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "# Create a string that includes opening and closing quotation marks\n", 47 | "mystring = '\"We\\'re moving to L.A.!\"'\n", 48 | "print(mystring)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "\" | We | 're | moving | to | L.A. | ! | \" | " 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "# Create a Doc object and explore tokens\n", 66 | "doc = nlp(mystring)\n", 67 | "\n", 68 | "for token in doc:\n", 69 | " print(token.text, end=' | ')" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "- **Prefix**:\tCharacter(s) at the beginning ▸ `$ ( “ ¿`\n", 84 | "- **Suffix**:\tCharacter(s) at the end ▸ `km ) , . ! ”`\n", 85 | "- **Infix**:\tCharacter(s) in between ▸ `- -- / ...`\n", 86 | "- **Exception**: Special-case rule to split a string into several tokens or prevent a token from being split when punctuation rules are applied ▸ `St. U.S.`" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Notice that tokens are pieces of the original text. That is, we don't see any conversion to word stems or lemmas (base forms of words) and we haven't seen anything about organizations/places/money etc. Tokens are the basic building blocks of a Doc object - everything that helps us understand the meaning of the text is derived from tokens and their relationship to one another." 
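As a quick sketch of that idea (reusing the `nlp` object loaded above; `doc_demo` is just an illustrative name), the token texts simply reassemble into the original string:

```python
doc_demo = nlp(u"Tokens are just pieces of the original text.")

print([token.text for token in doc_demo])                   # the individual token strings
print(''.join(token.text_with_ws for token in doc_demo))    # reassembles the original text
```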
94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "## Prefixes, Suffixes and Infixes\n", 101 | "spaCy will isolate punctuation that does *not* form an integral part of a word. Quotation marks, commas, and punctuation at the end of a sentence will be assigned their own token. However, punctuation that exists as part of an email address, website or numerical value will be kept as part of the token." 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 4, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "We\n", 114 | "'re\n", 115 | "here\n", 116 | "to\n", 117 | "help\n", 118 | "!\n", 119 | "Send\n", 120 | "snail\n", 121 | "-\n", 122 | "mail\n", 123 | ",\n", 124 | "email\n", 125 | "support@oursite.com\n", 126 | "or\n", 127 | "visit\n", 128 | "us\n", 129 | "at\n", 130 | "http://www.oursite.com\n", 131 | "!\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "doc2 = nlp(u\"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!\")\n", 137 | "\n", 138 | "for t in doc2:\n", 139 | " print(t)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "Note that the exclamation points, comma, and the hyphen in 'snail-mail' are assigned their own tokens, yet both the email address and website are preserved." 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 5, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "A\n", 159 | "5\n", 160 | "km\n", 161 | "NYC\n", 162 | "cab\n", 163 | "ride\n", 164 | "costs\n", 165 | "$\n", 166 | "10.30\n" 167 | ] 168 | } 169 | ], 170 | "source": [ 171 | "doc3 = nlp(u'A 5km NYC cab ride costs $10.30')\n", 172 | "\n", 173 | "for t in doc3:\n", 174 | " print(t)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "Here the distance unit and dollar sign are assigned their own tokens, yet the dollar amount is preserved." 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## Exceptions\n", 189 | "Punctuation that exists as part of a known abbreviation will be kept as part of the token." 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 6, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "name": "stdout", 199 | "output_type": "stream", 200 | "text": [ 201 | "Let\n", 202 | "'s\n", 203 | "visit\n", 204 | "St.\n", 205 | "Louis\n", 206 | "in\n", 207 | "the\n", 208 | "U.S.\n", 209 | "next\n", 210 | "year\n", 211 | ".\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "doc4 = nlp(u\"Let's visit St. Louis in the U.S. next year.\")\n", 217 | "\n", 218 | "for t in doc4:\n", 219 | " print(t)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "Here the abbreviations for \"Saint\" and \"United States\" are both preserved." 
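If needed, the tokenizer's exception list can also be extended by hand. A minimal sketch (assuming the same `nlp` object as above; `'gimme'` is just a made-up example word):

```python
from spacy.symbols import ORTH

# Tell the tokenizer to always split the string 'gimme' into two tokens
nlp.tokenizer.add_special_case(u'gimme', [{ORTH: u'gim'}, {ORTH: u'me'}])

print([t.text for t in nlp(u'gimme that')])   # ['gim', 'me', 'that']
```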
227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "## Counting Tokens\n", 234 | "`Doc` objects have a set number of tokens:" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 7, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/plain": [ 245 | "8" 246 | ] 247 | }, 248 | "execution_count": 7, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "len(doc)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "## Counting Vocab Entries\n", 262 | "`Vocab` objects contain a full library of items!" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 8, 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "data": { 272 | "text/plain": [ 273 | "57852" 274 | ] 275 | }, 276 | "execution_count": 8, 277 | "metadata": {}, 278 | "output_type": "execute_result" 279 | } 280 | ], 281 | "source": [ 282 | "len(doc.vocab)" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "NOTE: This number changes based on the language library loaded at the start, and any new lexemes introduced to the `vocab` when the `Doc` was created." 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "## Tokens can be retrieved by index position and slice\n", 297 | "`Doc` objects can be thought of as lists of `token` objects. As such, individual tokens can be retrieved by index position, and spans of tokens can be retrieved through slicing:" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 9, 303 | "metadata": {}, 304 | "outputs": [ 305 | { 306 | "data": { 307 | "text/plain": [ 308 | "better" 309 | ] 310 | }, 311 | "execution_count": 9, 312 | "metadata": {}, 313 | "output_type": "execute_result" 314 | } 315 | ], 316 | "source": [ 317 | "doc5 = nlp(u'It is better to give than to receive.')\n", 318 | "\n", 319 | "# Retrieve the third token:\n", 320 | "doc5[2]" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 10, 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "data": { 330 | "text/plain": [ 331 | "better to give" 332 | ] 333 | }, 334 | "execution_count": 10, 335 | "metadata": {}, 336 | "output_type": "execute_result" 337 | } 338 | ], 339 | "source": [ 340 | "# Retrieve three tokens from the middle:\n", 341 | "doc5[2:5]" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 11, 347 | "metadata": {}, 348 | "outputs": [ 349 | { 350 | "data": { 351 | "text/plain": [ 352 | "than to receive." 
353 | ] 354 | }, 355 | "execution_count": 11, 356 | "metadata": {}, 357 | "output_type": "execute_result" 358 | } 359 | ], 360 | "source": [ 361 | "# Retrieve the last four tokens:\n", 362 | "doc5[-4:]" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "## Tokens cannot be reassigned\n", 370 | "Although `Doc` objects can be considered lists of tokens, they do *not* support item reassignment:" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 12, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "doc6 = nlp(u'My dinner was horrible.')\n", 380 | "doc7 = nlp(u'Your dinner was delicious.')" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 13, 386 | "metadata": {}, 387 | "outputs": [ 388 | { 389 | "ename": "TypeError", 390 | "evalue": "'spacy.tokens.doc.Doc' object does not support item assignment", 391 | "output_type": "error", 392 | "traceback": [ 393 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 394 | "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", 395 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# Try to change \"My dinner was horrible\" to \"My dinner was delicious\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mdoc6\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdoc7\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 396 | "\u001b[1;31mTypeError\u001b[0m: 'spacy.tokens.doc.Doc' object does not support item assignment" 397 | ] 398 | } 399 | ], 400 | "source": [ 401 | "# Try to change \"My dinner was horrible\" to \"My dinner was delicious\"\n", 402 | "doc6[3] = doc7[3]" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": { 408 | "collapsed": true 409 | }, 410 | "source": [ 411 | "___\n", 412 | "# Named Entities\n", 413 | "Going a step beyond tokens, *named entities* add another layer of context. The language model recognizes that certain words are organizational names while others are locations, and still other combinations relate to money, dates, etc. Named entities are accessible through the `ents` property of a `Doc` object." 
414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 14, 419 | "metadata": {}, 420 | "outputs": [ 421 | { 422 | "name": "stdout", 423 | "output_type": "stream", 424 | "text": [ 425 | "Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | \n", 426 | "----\n", 427 | "Apple - ORG - Companies, agencies, institutions, etc.\n", 428 | "Hong Kong - GPE - Countries, cities, states\n", 429 | "$6 million - MONEY - Monetary values, including unit\n" 430 | ] 431 | } 432 | ], 433 | "source": [ 434 | "doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million')\n", 435 | "\n", 436 | "for token in doc8:\n", 437 | " print(token.text, end=' | ')\n", 438 | "\n", 439 | "print('\\n----')\n", 440 | "\n", 441 | "for ent in doc8.ents:\n", 442 | " print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": {}, 448 | "source": [ 449 | "Note how two tokens combine to form the entity `Hong Kong`, and three tokens combine to form the monetary entity: `$6 million`" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 15, 455 | "metadata": {}, 456 | "outputs": [ 457 | { 458 | "data": { 459 | "text/plain": [ 460 | "3" 461 | ] 462 | }, 463 | "execution_count": 15, 464 | "metadata": {}, 465 | "output_type": "execute_result" 466 | } 467 | ], 468 | "source": [ 469 | "len(doc8.ents)" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "Named Entity Recognition (NER) is an important machine learning tool applied to Natural Language Processing.
We'll do a lot more with it in an upcoming section. For more info on **named entities** visit https://spacy.io/usage/linguistic-features#named-entities" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": {}, 482 | "source": [ 483 | "---\n", 484 | "# Noun Chunks\n", 485 | "Similar to `Doc.ents`, `Doc.noun_chunks` are another object property. *Noun chunks* are \"base noun phrases\" – flat phrases that have a noun as their head. You can think of noun chunks as a noun plus the words describing the noun – for example, in [Sheb Wooley's 1958 song](https://en.wikipedia.org/wiki/The_Purple_People_Eater), a *\"one-eyed, one-horned, flying, purple people-eater\"* would be one long noun chunk." 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 16, 491 | "metadata": {}, 492 | "outputs": [ 493 | { 494 | "name": "stdout", 495 | "output_type": "stream", 496 | "text": [ 497 | "Autonomous cars\n", 498 | "insurance liability\n", 499 | "manufacturers\n" 500 | ] 501 | } 502 | ], 503 | "source": [ 504 | "doc9 = nlp(u\"Autonomous cars shift insurance liability toward manufacturers.\")\n", 505 | "\n", 506 | "for chunk in doc9.noun_chunks:\n", 507 | " print(chunk.text)" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": 17, 513 | "metadata": {}, 514 | "outputs": [ 515 | { 516 | "name": "stdout", 517 | "output_type": "stream", 518 | "text": [ 519 | "Red cars\n", 520 | "higher insurance rates\n" 521 | ] 522 | } 523 | ], 524 | "source": [ 525 | "doc10 = nlp(u\"Red cars do not carry higher insurance rates.\")\n", 526 | "\n", 527 | "for chunk in doc10.noun_chunks:\n", 528 | " print(chunk.text)" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": 18, 534 | "metadata": {}, 535 | "outputs": [ 536 | { 537 | "name": "stdout", 538 | "output_type": "stream", 539 | "text": [ 540 | "He\n", 541 | "a one-eyed, one-horned, flying, purple people-eater\n" 542 | ] 543 | } 544 | ], 545 | "source": [ 546 | "doc11 = nlp(u\"He was a one-eyed, one-horned, flying, purple people-eater.\")\n", 547 | "\n", 548 | "for chunk in doc11.noun_chunks:\n", 549 | " print(chunk.text)" 550 | ] 551 | }, 552 | { 553 | "cell_type": "markdown", 554 | "metadata": {}, 555 | "source": [ 556 | "We'll look at additional noun_chunks components besides `.text` in an upcoming section.
For more info on **noun_chunks** visit https://spacy.io/usage/linguistic-features#noun-chunks" 557 | ] 558 | }, 559 | { 560 | "cell_type": "markdown", 561 | "metadata": {}, 562 | "source": [ 563 | "___\n", 564 | "# Built-in Visualizers\n", 565 | "\n", 566 | "spaCy includes a built-in visualization tool called **displaCy**. displaCy is able to detect whether you're working in a Jupyter notebook, and will return markup that can be rendered in a cell right away. When you export your notebook, the visualizations will be included as HTML.\n", 567 | "\n", 568 | "For more info visit https://spacy.io/usage/visualizers" 569 | ] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "metadata": {}, 574 | "source": [ 575 | "## Visualizing the dependency parse\n", 576 | "Run the cell below to import displacy and display the dependency graphic" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 19, 582 | "metadata": {}, 583 | "outputs": [ 584 | { 585 | "data": { 586 | "text/html": [ 587 | "\n", 588 | "\n", 589 | " Apple\n", 590 | " PROPN\n", 591 | "\n", 592 | "\n", 593 | "\n", 594 | " is\n", 595 | " VERB\n", 596 | "\n", 597 | "\n", 598 | "\n", 599 | " going\n", 600 | " VERB\n", 601 | "\n", 602 | "\n", 603 | "\n", 604 | " to\n", 605 | " PART\n", 606 | "\n", 607 | "\n", 608 | "\n", 609 | " build\n", 610 | " VERB\n", 611 | "\n", 612 | "\n", 613 | "\n", 614 | " a\n", 615 | " DET\n", 616 | "\n", 617 | "\n", 618 | "\n", 619 | " U.K.\n", 620 | " PROPN\n", 621 | "\n", 622 | "\n", 623 | "\n", 624 | " factory\n", 625 | " NOUN\n", 626 | "\n", 627 | "\n", 628 | "\n", 629 | " for\n", 630 | " ADP\n", 631 | "\n", 632 | "\n", 633 | "\n", 634 | " $\n", 635 | " SYM\n", 636 | "\n", 637 | "\n", 638 | "\n", 639 | " 6\n", 640 | " NUM\n", 641 | "\n", 642 | "\n", 643 | "\n", 644 | " million.\n", 645 | " NUM\n", 646 | "\n", 647 | "\n", 648 | "\n", 649 | " \n", 650 | " \n", 651 | " nsubj\n", 652 | " \n", 653 | " \n", 654 | "\n", 655 | "\n", 656 | "\n", 657 | " \n", 658 | " \n", 659 | " aux\n", 660 | " \n", 661 | " \n", 662 | "\n", 663 | "\n", 664 | "\n", 665 | " \n", 666 | " \n", 667 | " aux\n", 668 | " \n", 669 | " \n", 670 | "\n", 671 | "\n", 672 | "\n", 673 | " \n", 674 | " \n", 675 | " xcomp\n", 676 | " \n", 677 | " \n", 678 | "\n", 679 | "\n", 680 | "\n", 681 | " \n", 682 | " \n", 683 | " det\n", 684 | " \n", 685 | " \n", 686 | "\n", 687 | "\n", 688 | "\n", 689 | " \n", 690 | " \n", 691 | " compound\n", 692 | " \n", 693 | " \n", 694 | "\n", 695 | "\n", 696 | "\n", 697 | " \n", 698 | " \n", 699 | " dobj\n", 700 | " \n", 701 | " \n", 702 | "\n", 703 | "\n", 704 | "\n", 705 | " \n", 706 | " \n", 707 | " prep\n", 708 | " \n", 709 | " \n", 710 | "\n", 711 | "\n", 712 | "\n", 713 | " \n", 714 | " \n", 715 | " quantmod\n", 716 | " \n", 717 | " \n", 718 | "\n", 719 | "\n", 720 | "\n", 721 | " \n", 722 | " \n", 723 | " compound\n", 724 | " \n", 725 | " \n", 726 | "\n", 727 | "\n", 728 | "\n", 729 | " \n", 730 | " \n", 731 | " pobj\n", 732 | " \n", 733 | " \n", 734 | "\n", 735 | "" 736 | ], 737 | "text/plain": [ 738 | "" 739 | ] 740 | }, 741 | "metadata": {}, 742 | "output_type": "display_data" 743 | } 744 | ], 745 | "source": [ 746 | "from spacy import displacy\n", 747 | "\n", 748 | "doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')\n", 749 | "displacy.render(doc, style='dep', jupyter=True, options={'distance': 110})" 750 | ] 751 | }, 752 | { 753 | "cell_type": "markdown", 754 | "metadata": {}, 755 | "source": [ 756 | "The optional `'distance'` argument sets the distance between tokens. 
If the distance is made too small, text that appears beneath short arrows may become too compressed to read." 757 | ] 758 | }, 759 | { 760 | "cell_type": "markdown", 761 | "metadata": {}, 762 | "source": [ 763 | "## Visualizing the entity recognizer" 764 | ] 765 | }, 766 | { 767 | "cell_type": "code", 768 | "execution_count": 20, 769 | "metadata": {}, 770 | "outputs": [ 771 | { 772 | "data": { 773 | "text/html": [ 774 | "
Over \n", 775 | "\n", 776 | " the last quarter\n", 777 | " DATE\n", 778 | "\n", 779 | " \n", 780 | "\n", 781 | " Apple\n", 782 | " ORG\n", 783 | "\n", 784 | " sold \n", 785 | "\n", 786 | " nearly 20 thousand\n", 787 | " CARDINAL\n", 788 | "\n", 789 | " \n", 790 | "\n", 791 | " iPods\n", 792 | " PRODUCT\n", 793 | "\n", 794 | " for a profit of \n", 795 | "\n", 796 | " $6 million\n", 797 | " MONEY\n", 798 | "\n", 799 | ".
" 800 | ], 801 | "text/plain": [ 802 | "" 803 | ] 804 | }, 805 | "metadata": {}, 806 | "output_type": "display_data" 807 | } 808 | ], 809 | "source": [ 810 | "doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')\n", 811 | "displacy.render(doc, style='ent', jupyter=True)" 812 | ] 813 | }, 814 | { 815 | "cell_type": "markdown", 816 | "metadata": {}, 817 | "source": [ 818 | "___\n", 819 | "## Creating Visualizations Outside of Jupyter\n", 820 | "If you're using another Python IDE or writing a script, you can choose to have spaCy serve up html separately:" 821 | ] 822 | }, 823 | { 824 | "cell_type": "code", 825 | "execution_count": 21, 826 | "metadata": {}, 827 | "outputs": [ 828 | { 829 | "name": "stdout", 830 | "output_type": "stream", 831 | "text": [ 832 | "\n", 833 | " Serving on port 5000...\n", 834 | " Using the 'dep' visualizer\n", 835 | "\n", 836 | "\n", 837 | " Shutting down server on port 5000.\n", 838 | "\n" 839 | ] 840 | } 841 | ], 842 | "source": [ 843 | "doc = nlp(u'This is a sentence.')\n", 844 | "displacy.serve(doc, style='dep')" 845 | ] 846 | }, 847 | { 848 | "cell_type": "markdown", 849 | "metadata": {}, 850 | "source": [ 851 | "**After running the cell above, click the link below to view the dependency parse**:\n", 852 | "\n", 853 | "http://127.0.0.1:5000\n", 854 | "

\n", 855 | "**To shut down the server and return to jupyter**, interrupt the kernel either through the **Kernel** menu above, by hitting the black square on the toolbar, or by typing the keyboard shortcut `Esc`, `I`, `I`" 856 | ] 857 | }, 858 | { 859 | "cell_type": "markdown", 860 | "metadata": {}, 861 | "source": [ 862 | "Great! Now you should have an understanding of how tokenization divides text up into individual elements, how named entities provide context, and how certain tools help to visualize grammar rules and entity labels.\n", 863 | "## Next up: Stemming" 864 | ] 865 | } 866 | ], 867 | "metadata": { 868 | "kernelspec": { 869 | "display_name": "Python 3", 870 | "language": "python", 871 | "name": "python3" 872 | }, 873 | "language_info": { 874 | "codemirror_mode": { 875 | "name": "ipython", 876 | "version": 3 877 | }, 878 | "file_extension": ".py", 879 | "mimetype": "text/x-python", 880 | "name": "python", 881 | "nbconvert_exporter": "python", 882 | "pygments_lexer": "ipython3", 883 | "version": "3.6.2" 884 | } 885 | }, 886 | "nbformat": 4, 887 | "nbformat_minor": 2 888 | } 889 | -------------------------------------------------------------------------------- /Projects/credit-card-fraud.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Credit Card Fraud Detection" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 73, 13 | "metadata": { 14 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 15 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" 16 | }, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "/kaggle/input/creditcardfraud/creditcard.csv\n" 23 | ] 24 | } 25 | ], 26 | "source": [ 27 | "import numpy as np # linear algebra\n", 28 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", 29 | "from sklearn.model_selection import train_test_split\n", 30 | "from sklearn.linear_model import LogisticRegression\n", 31 | "from sklearn.utils import shuffle\n", 32 | "from sklearn.metrics import accuracy_score, roc_auc_score, f1_score\n", 33 | "\n", 34 | "# Input data files are available in the read-only \"../input/\" directory\n", 35 | "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n", 36 | "\n", 37 | "import os\n", 38 | "for dirname, _, filenames in os.walk('/kaggle/input'):\n", 39 | " for filename in filenames:\n", 40 | " print(os.path.join(dirname, filename))\n", 41 | "\n" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/html": [ 52 | "
\n", 53 | "\n", 66 | "\n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | "
TimeV1V2V3V4V5V6V7V8V9...V21V22V23V24V25V26V27V28AmountClass
00.0-1.359807-0.0727812.5363471.378155-0.3383210.4623880.2395990.0986980.363787...-0.0183070.277838-0.1104740.0669280.128539-0.1891150.133558-0.021053149.620
10.01.1918570.2661510.1664800.4481540.060018-0.082361-0.0788030.085102-0.255425...-0.225775-0.6386720.101288-0.3398460.1671700.125895-0.0089830.0147242.690
21.0-1.358354-1.3401631.7732090.379780-0.5031981.8004990.7914610.247676-1.514654...0.2479980.7716790.909412-0.689281-0.327642-0.139097-0.055353-0.059752378.660
31.0-0.966272-0.1852261.792993-0.863291-0.0103091.2472030.2376090.377436-1.387024...-0.1083000.005274-0.190321-1.1755750.647376-0.2219290.0627230.061458123.500
42.0-1.1582330.8777371.5487180.403034-0.4071930.0959210.592941-0.2705330.817739...-0.0094310.798278-0.1374580.141267-0.2060100.5022920.2194220.21515369.990
\n", 216 | "

5 rows × 31 columns

\n", 217 | "
" 218 | ], 219 | "text/plain": [ 220 | " Time V1 V2 V3 V4 V5 V6 V7 \\\n", 221 | "0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n", 222 | "1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n", 223 | "2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n", 224 | "3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n", 225 | "4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n", 226 | "\n", 227 | " V8 V9 ... V21 V22 V23 V24 V25 \\\n", 228 | "0 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 \n", 229 | "1 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 \n", 230 | "2 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 \n", 231 | "3 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 \n", 232 | "4 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 \n", 233 | "\n", 234 | " V26 V27 V28 Amount Class \n", 235 | "0 -0.189115 0.133558 -0.021053 149.62 0 \n", 236 | "1 0.125895 -0.008983 0.014724 2.69 0 \n", 237 | "2 -0.139097 -0.055353 -0.059752 378.66 0 \n", 238 | "3 -0.221929 0.062723 0.061458 123.50 0 \n", 239 | "4 0.502292 0.219422 0.215153 69.99 0 \n", 240 | "\n", 241 | "[5 rows x 31 columns]" 242 | ] 243 | }, 244 | "execution_count": 2, 245 | "metadata": {}, 246 | "output_type": "execute_result" 247 | } 248 | ], 249 | "source": [ 250 | "# get the dataset\n", 251 | "dataset = pd.read_csv('/kaggle/input/creditcardfraud/creditcard.csv')\n", 252 | "dataset.head()" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 50, 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "name": "stdout", 262 | "output_type": "stream", 263 | "text": [ 264 | " Time V1 V2 V3 V4 \\\n", 265 | "count 284807.000000 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n", 266 | "mean 94813.859575 3.918649e-15 5.682686e-16 -8.761736e-15 2.811118e-15 \n", 267 | "std 47488.145955 1.958696e+00 1.651309e+00 1.516255e+00 1.415869e+00 \n", 268 | "min 0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00 \n", 269 | "25% 54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01 \n", 270 | "50% 84692.000000 1.810880e-02 6.548556e-02 1.798463e-01 -1.984653e-02 \n", 271 | "75% 139320.500000 1.315642e+00 8.037239e-01 1.027196e+00 7.433413e-01 \n", 272 | "max 172792.000000 2.454930e+00 2.205773e+01 9.382558e+00 1.687534e+01 \n", 273 | "\n", 274 | " V5 V6 V7 V8 V9 \\\n", 275 | "count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n", 276 | "mean -1.552103e-15 2.040130e-15 -1.698953e-15 -1.893285e-16 -3.147640e-15 \n", 277 | "std 1.380247e+00 1.332271e+00 1.237094e+00 1.194353e+00 1.098632e+00 \n", 278 | "min -1.137433e+02 -2.616051e+01 -4.355724e+01 -7.321672e+01 -1.343407e+01 \n", 279 | "25% -6.915971e-01 -7.682956e-01 -5.540759e-01 -2.086297e-01 -6.430976e-01 \n", 280 | "50% -5.433583e-02 -2.741871e-01 4.010308e-02 2.235804e-02 -5.142873e-02 \n", 281 | "75% 6.119264e-01 3.985649e-01 5.704361e-01 3.273459e-01 5.971390e-01 \n", 282 | "max 3.480167e+01 7.330163e+01 1.205895e+02 2.000721e+01 1.559499e+01 \n", 283 | "\n", 284 | " ... V21 V22 V23 V24 \\\n", 285 | "count ... 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n", 286 | "mean ... 1.473120e-16 8.042109e-16 5.282512e-16 4.456271e-15 \n", 287 | "std ... 7.345240e-01 7.257016e-01 6.244603e-01 6.056471e-01 \n", 288 | "min ... -3.483038e+01 -1.093314e+01 -4.480774e+01 -2.836627e+00 \n", 289 | "25% ... 
-2.283949e-01 -5.423504e-01 -1.618463e-01 -3.545861e-01 \n", 290 | "50% ... -2.945017e-02 6.781943e-03 -1.119293e-02 4.097606e-02 \n", 291 | "75% ... 1.863772e-01 5.285536e-01 1.476421e-01 4.395266e-01 \n", 292 | "max ... 2.720284e+01 1.050309e+01 2.252841e+01 4.584549e+00 \n", 293 | "\n", 294 | " V25 V26 V27 V28 Amount \\\n", 295 | "count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 284807.000000 \n", 296 | "mean 1.426896e-15 1.701640e-15 -3.662252e-16 -1.217809e-16 88.349619 \n", 297 | "std 5.212781e-01 4.822270e-01 4.036325e-01 3.300833e-01 250.120109 \n", 298 | "min -1.029540e+01 -2.604551e+00 -2.256568e+01 -1.543008e+01 0.000000 \n", 299 | "25% -3.171451e-01 -3.269839e-01 -7.083953e-02 -5.295979e-02 5.600000 \n", 300 | "50% 1.659350e-02 -5.213911e-02 1.342146e-03 1.124383e-02 22.000000 \n", 301 | "75% 3.507156e-01 2.409522e-01 9.104512e-02 7.827995e-02 77.165000 \n", 302 | "max 7.519589e+00 3.517346e+00 3.161220e+01 3.384781e+01 25691.160000 \n", 303 | "\n", 304 | " Class \n", 305 | "count 284807.000000 \n", 306 | "mean 0.001727 \n", 307 | "std 0.041527 \n", 308 | "min 0.000000 \n", 309 | "25% 0.000000 \n", 310 | "50% 0.000000 \n", 311 | "75% 0.000000 \n", 312 | "max 1.000000 \n", 313 | "\n", 314 | "[8 rows x 31 columns]\n" 315 | ] 316 | }, 317 | { 318 | "data": { 319 | "text/plain": [ 320 | "" 321 | ] 322 | }, 323 | "execution_count": 50, 324 | "metadata": {}, 325 | "output_type": "execute_result" 326 | }, 327 | { 328 | "data": { 329 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAD4CAYAAADy46FuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAAUQUlEQVR4nO3cbYyd5Z3f8e+vOEFuHlgDmxG12Zo2bLskabKL66CmrSZFwiR9AZGI6hQFdxfJ25RUWYkXS/KirIKQglSWirSw9S4WD6IBRLI17YZNXdhpulqenIiNeShlGmhwsIKytghOBY3Jvy/ONZtjd3zN8TycYXK+H+nonPO/7+u+r//Ymt/cD+ekqpAk6UT+ympPQJL01mZQSJK6DApJUpdBIUnqMigkSV3rVnsCy+3MM8+szZs3L3r8j3/8Y97xjncs34TWgEnredL6BXueFEvp+Vvf+tYPq+oX51v2cxcUmzdvZt++fYsePzMzw/T09PJNaA2YtJ4nrV+w50mxlJ6T/O8TLfPUkySpy6CQJHUZFJKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqevn7pPZS7X/+6/yz675o7Hv98Uv/eOx71OSRuERhSSpy6CQJHUZFJKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6FgyKJGcn+ZMkzyZ5OsnnWv13knw/yZPt8fGhMZ9PMpvkuSTbhurnJ9nflt2cJK1+apJ7W/2xJJuHxuxI8nx77FjW7iVJC1o3wjpHgaur6ttJ3gV8K8netuymqvrXwysnOQ/YDrwP+GvAf03yy1X1JnArsBN4FPg6cDHwIHAlcLiq3ptkO3AD8E+SnA5cC2wBqu37gao6vLS2JUmjWvCIoqoOVtW32+vXgGeBjZ0hlwD3VNUbVfUCMAtsTXIW8O6qeqSqCrgTuHRozB3t9f3Ahe1oYxuwt6oOtXDYyyBcJEljMsoRxV9qp4R+FXgM+Ajw2SRXAPsYHHUcZhAijw4NO9BqP2mvj6/Tnl8CqKqjSV4FzhiuzzNmeF47GRypMDU1xczMzMm0dYyp9XD1B44uevxiLWXOS3XkyJFV3f+4TVq/YM+TYqV6HjkokrwT+CrwW1X1oyS3AtcxOCV0HXAj8BtA5hlenTqLHPOzQtUuYBfAli1banp6uttLz5fv3sON+08qP5fFi5dPj32fc2ZmZljKz2ytmbR+wZ4nxUr1PNJdT0nexiAk7q6qrwFU1Q+q6s2q+inw+8DWtvoB4Oyh4ZuAl1t90zz1Y8YkWQecBhzqbEuSNCaj3PUU4Dbg2ar63aH6WUOrfQJ4qr1+ANje7mQ6BzgXeLyqDgKvJbmgbfMKYM/QmLk7mi4DHm7XMb4BXJRkQ5INwEWtJkkak1HOsXwE+DSwP8mTrfYF4FNJPsTgVNCLwG8CVNXTSe4DnmFwx9RV7Y4ngM8AtwPrGdzt9GCr3wbclWSWwZHE9ratQ0muA55o632xqg4tplFJ0uIsGBRV9afMf63g650x1wPXz1PfB7x/nvrrwCdPsK3dwO6F5ilJWhl+MluS1GVQSJK6DApJUpdBIUnqMigkSV0GhSSpy6CQJHUZFJKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6DApJUteCQZHk7CR/kuTZJE8n+Vyrn55kb5Ln2/OGoTGfTzKb5Lkk24bq5yfZ35bdnCStfmqSe1v9sSSbh8bsaPt4PsmOZe1ekrSgUY4ojgJXV9WvABcAVyU5D7gGeKiqzgUeau9py7YD7wMuBm5Jckrb1q3ATuDc9ri41a8EDlfVe4GbgBvatk4HrgU+DGwFrh0OJEnSylswKKrqYFV9u71+DXgW2AhcAtzRVrsDuLS9vgS4p6reqKoXgFlga5KzgHdX1SNVVcCdx42Z29b9wIXt
...[base64-encoded PNG output truncated: the cell output is a histogram of the Class column produced by dataset.Class.hist(), showing the heavy class imbalance]...\n", 330 |        "text/plain": [ 331 |         "
" 332 | ] 333 | }, 334 | "metadata": { 335 | "needs_background": "light" 336 | }, 337 | "output_type": "display_data" 338 | } 339 | ], 340 | "source": [ 341 | "# some descriptive statistics\n", 342 | "print(dataset.describe())\n", 343 | "dataset.Class.hist()" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 64, 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "name": "stdout", 353 | "output_type": "stream", 354 | "text": [ 355 | "Number of true frauds: (492, 31)\n", 356 | "Number of false frauds: (284315, 31)\n" 357 | ] 358 | }, 359 | { 360 | "data": { 361 | "text/html": [ 362 | "
\n", 363 | "\n", 376 | "\n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | "
TimeV1V2V3V4V5V6V7V8V9...V21V22V23V24V25V26V27V28AmountClass
202205134220.0-1.532136-0.6354552.037052-0.9493821.696577-1.034161-0.060257-0.0184930.472222...-0.140775-0.673854-0.170881-0.7216770.528398-0.561805-0.170225-0.1405081.000
238845149848.0-0.8152990.9957340.711254-0.9582720.091578-0.3118250.0183350.6469720.031721...-0.149705-0.581679-0.126189-0.644711-0.303184-0.2613620.0004630.0765260.770
178820123796.01.994938-0.795754-1.423802-0.277639-0.526009-1.2675060.126832-0.500672-0.752476...-0.455601-0.8881690.2068430.062659-0.2284640.464678-0.082312-0.045135106.930
261056159844.0-0.4081113.132944-3.0980305.8038930.890609-0.501474-0.4400540.591828-3.267693...0.098482-0.538375-0.217989-1.0426570.3143890.5432440.2338510.11960345.511
10278268357.01.232604-0.5489311.0878730.894082-1.433055-0.356797-0.7174920.003167-0.100397...-0.448671-0.5175680.0128330.6992170.527258-0.3226070.0808050.03542719.591
\n", 526 | "

5 rows × 31 columns

\n", 527 | "
" 528 | ], 529 | "text/plain": [ 530 | " Time V1 V2 V3 V4 V5 V6 \\\n", 531 | "202205 134220.0 -1.532136 -0.635455 2.037052 -0.949382 1.696577 -1.034161 \n", 532 | "238845 149848.0 -0.815299 0.995734 0.711254 -0.958272 0.091578 -0.311825 \n", 533 | "178820 123796.0 1.994938 -0.795754 -1.423802 -0.277639 -0.526009 -1.267506 \n", 534 | "261056 159844.0 -0.408111 3.132944 -3.098030 5.803893 0.890609 -0.501474 \n", 535 | "102782 68357.0 1.232604 -0.548931 1.087873 0.894082 -1.433055 -0.356797 \n", 536 | "\n", 537 | " V7 V8 V9 ... V21 V22 V23 \\\n", 538 | "202205 -0.060257 -0.018493 0.472222 ... -0.140775 -0.673854 -0.170881 \n", 539 | "238845 0.018335 0.646972 0.031721 ... -0.149705 -0.581679 -0.126189 \n", 540 | "178820 0.126832 -0.500672 -0.752476 ... -0.455601 -0.888169 0.206843 \n", 541 | "261056 -0.440054 0.591828 -3.267693 ... 0.098482 -0.538375 -0.217989 \n", 542 | "102782 -0.717492 0.003167 -0.100397 ... -0.448671 -0.517568 0.012833 \n", 543 | "\n", 544 | " V24 V25 V26 V27 V28 Amount Class \n", 545 | "202205 -0.721677 0.528398 -0.561805 -0.170225 -0.140508 1.00 0 \n", 546 | "238845 -0.644711 -0.303184 -0.261362 0.000463 0.076526 0.77 0 \n", 547 | "178820 0.062659 -0.228464 0.464678 -0.082312 -0.045135 106.93 0 \n", 548 | "261056 -1.042657 0.314389 0.543244 0.233851 0.119603 45.51 1 \n", 549 | "102782 0.699217 0.527258 -0.322607 0.080805 0.035427 19.59 1 \n", 550 | "\n", 551 | "[5 rows x 31 columns]" 552 | ] 553 | }, 554 | "execution_count": 64, 555 | "metadata": {}, 556 | "output_type": "execute_result" 557 | } 558 | ], 559 | "source": [ 560 | "# undersampling\n", 561 | "\n", 562 | "# distinguish between the two classes\n", 563 | "false = dataset[dataset.Class == 0]\n", 564 | "true = dataset[dataset.Class == 1]\n", 565 | "print(\"Number of true frauds: \", true.shape)\n", 566 | "print(\"Number of false frauds: \", false.shape)\n", 567 | "\n", 568 | "# we will take a few more than 2 times the number of true obsevations\n", 569 | "balanced_false = false.sample(1000)\n", 570 | "\n", 571 | "# merge the true and false observations\n", 572 | "balanced_dataset = pd.concat([balanced_false, true])\n", 573 | "\n", 574 | "# shuffle the rows\n", 575 | "DATASET = pd.DataFrame(shuffle(balanced_dataset))" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": 65, 581 | "metadata": {}, 582 | "outputs": [ 583 | { 584 | "data": { 585 | "text/html": [ 586 | "
\n", 587 | "\n", 600 | "\n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | "
TimeV1V2V3V4V5V6V7V8V9...V20V21V22V23V24V25V26V27V28Amount
202205134220.0-1.532136-0.6354552.037052-0.9493821.696577-1.034161-0.060257-0.0184930.472222...0.244874-0.140775-0.673854-0.170881-0.7216770.528398-0.561805-0.170225-0.1405081.00
238845149848.0-0.8152990.9957340.711254-0.9582720.091578-0.3118250.0183350.6469720.031721...-0.212782-0.149705-0.581679-0.126189-0.644711-0.303184-0.2613620.0004630.0765260.77
178820123796.01.994938-0.795754-1.423802-0.277639-0.526009-1.2675060.126832-0.500672-0.752476...-0.369585-0.455601-0.8881690.2068430.062659-0.2284640.464678-0.082312-0.045135106.93
261056159844.0-0.4081113.132944-3.0980305.8038930.890609-0.501474-0.4400540.591828-3.267693...0.4995680.098482-0.538375-0.217989-1.0426570.3143890.5432440.2338510.11960345.51
10278268357.01.232604-0.5489311.0878730.894082-1.433055-0.356797-0.7174920.003167-0.100397...-0.576274-0.448671-0.5175680.0128330.6992170.527258-0.3226070.0808050.03542719.59
\n", 750 | "

5 rows × 30 columns

\n", 751 | "
" 752 | ], 753 | "text/plain": [ 754 | " Time V1 V2 V3 V4 V5 V6 \\\n", 755 | "202205 134220.0 -1.532136 -0.635455 2.037052 -0.949382 1.696577 -1.034161 \n", 756 | "238845 149848.0 -0.815299 0.995734 0.711254 -0.958272 0.091578 -0.311825 \n", 757 | "178820 123796.0 1.994938 -0.795754 -1.423802 -0.277639 -0.526009 -1.267506 \n", 758 | "261056 159844.0 -0.408111 3.132944 -3.098030 5.803893 0.890609 -0.501474 \n", 759 | "102782 68357.0 1.232604 -0.548931 1.087873 0.894082 -1.433055 -0.356797 \n", 760 | "\n", 761 | " V7 V8 V9 ... V20 V21 V22 \\\n", 762 | "202205 -0.060257 -0.018493 0.472222 ... 0.244874 -0.140775 -0.673854 \n", 763 | "238845 0.018335 0.646972 0.031721 ... -0.212782 -0.149705 -0.581679 \n", 764 | "178820 0.126832 -0.500672 -0.752476 ... -0.369585 -0.455601 -0.888169 \n", 765 | "261056 -0.440054 0.591828 -3.267693 ... 0.499568 0.098482 -0.538375 \n", 766 | "102782 -0.717492 0.003167 -0.100397 ... -0.576274 -0.448671 -0.517568 \n", 767 | "\n", 768 | " V23 V24 V25 V26 V27 V28 Amount \n", 769 | "202205 -0.170881 -0.721677 0.528398 -0.561805 -0.170225 -0.140508 1.00 \n", 770 | "238845 -0.126189 -0.644711 -0.303184 -0.261362 0.000463 0.076526 0.77 \n", 771 | "178820 0.206843 0.062659 -0.228464 0.464678 -0.082312 -0.045135 106.93 \n", 772 | "261056 -0.217989 -1.042657 0.314389 0.543244 0.233851 0.119603 45.51 \n", 773 | "102782 0.012833 0.699217 0.527258 -0.322607 0.080805 0.035427 19.59 \n", 774 | "\n", 775 | "[5 rows x 30 columns]" 776 | ] 777 | }, 778 | "execution_count": 65, 779 | "metadata": {}, 780 | "output_type": "execute_result" 781 | } 782 | ], 783 | "source": [ 784 | "# split the data to train and validate the model\n", 785 | "y = DATASET.Class\n", 786 | "X = DATASET.drop(columns=[\"Class\"]).copy()\n", 787 | "X_train, X_test, y_train, y_test = train_test_split(X, y)" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": 70, 793 | "metadata": {}, 794 | "outputs": [ 795 | { 796 | "data": { 797 | "text/plain": [ 798 | "LogisticRegression(random_state=17)" 799 | ] 800 | }, 801 | "execution_count": 70, 802 | "metadata": {}, 803 | "output_type": "execute_result" 804 | } 805 | ], 806 | "source": [ 807 | "model = LogisticRegression(random_state=17).fit(X_train, y_train)\n", 808 | "preds = model.predict(X_test)" 809 | ] 810 | }, 811 | { 812 | "cell_type": "code", 813 | "execution_count": 75, 814 | "metadata": {}, 815 | "outputs": [ 816 | { 817 | "name": "stdout", 818 | "output_type": "stream", 819 | "text": [ 820 | "ACCURACY: 0.9490616621983914\n", 821 | "ROC_AUC: 0.9392845528455285\n", 822 | "F1 SCORE: 0.9218106995884775\n" 823 | ] 824 | } 825 | ], 826 | "source": [ 827 | "print(\"ACCURACY: \", accuracy_score(y_test, preds))\n", 828 | "print(\"ROC_AUC: \", roc_auc_score(y_test, preds))\n", 829 | "print(\"F1 SCORE: \", f1_score(y_test, preds))" 830 | ] 831 | }, 832 | { 833 | "cell_type": "markdown", 834 | "metadata": {}, 835 | "source": [ 836 | "As we can see, the model performed well classifying the observations, obtaining great score on some important metrics." 
837 | ] 838 | } 839 | ], 840 | "metadata": { 841 | "kernelspec": { 842 | "display_name": "Python 3", 843 | "language": "python", 844 | "name": "python3" 845 | }, 846 | "language_info": { 847 | "codemirror_mode": { 848 | "name": "ipython", 849 | "version": 3 850 | }, 851 | "file_extension": ".py", 852 | "mimetype": "text/x-python", 853 | "name": "python", 854 | "nbconvert_exporter": "python", 855 | "pygments_lexer": "ipython3", 856 | "version": "3.8.5" 857 | } 858 | }, 859 | "nbformat": 4, 860 | "nbformat_minor": 4 861 | } 862 | --------------------------------------------------------------------------------
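The undersampling cell in credit-card-fraud.ipynb keeps all 492 fraud rows and draws 1000 random non-fraud rows (roughly twice the number of frauds) before shuffling, but it does not fix a random seed, so each run produces a different balanced sample. Below is a minimal stand-alone sketch of the same idea; it assumes `dataset` is the credit-card DataFrame already loaded in the notebook (binary `Class` column, 1 = fraud), and the `random_state` arguments are an added assumption for reproducibility, not part of the original cell.

import pandas as pd
from sklearn.utils import shuffle

def undersample(dataset: pd.DataFrame, n_majority: int = 1000, seed: int = 17) -> pd.DataFrame:
    # Keep every fraud row and a random sample of non-fraud rows.
    fraud = dataset[dataset.Class == 1]        # 492 rows in this dataset
    non_fraud = dataset[dataset.Class == 0]    # 284,315 rows in this dataset
    sampled = non_fraud.sample(n_majority, random_state=seed)
    balanced = pd.concat([sampled, fraud])
    # shuffle() already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
    return shuffle(balanced, random_state=seed)

# Usage, mirroring the notebook's DATASET variable:
# DATASET = undersample(dataset)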
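The split cell calls train_test_split with its defaults, so the roughly 2:1 class ratio of the balanced sample can drift a little between the train and test folds. A hedged variant with an explicit stratified split and the same logistic-regression fit is sketched below; it assumes DATASET is the balanced frame from the undersampling step, and the test_size, stratify, and max_iter settings are assumptions added here rather than values taken from the notebook.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

y = DATASET.Class                      # DATASET: balanced frame from the undersampling step
X = DATASET.drop(columns=["Class"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,      # scikit-learn's default split fraction, written out explicitly
    stratify=y,          # preserve the fraud/non-fraud ratio in both folds
    random_state=17,
)

model = LogisticRegression(random_state=17, max_iter=1000).fit(X_train, y_train)
preds = model.predict(X_test)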
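The evaluation cell passes the hard 0/1 predictions to roc_auc_score, which is valid but measures the AUC of a single decision threshold. A common refinement, sketched below under the same variable names, is to score ROC AUC on the predicted fraud probabilities and to print a per-class report; this is an optional check, not something the original notebook does.

from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_auc_score

probs = model.predict_proba(X_test)[:, 1]   # predicted probability of Class == 1 (fraud)

print("ACCURACY:", accuracy_score(y_test, preds))
print("F1 SCORE:", f1_score(y_test, preds))
print("ROC_AUC (hard labels):", roc_auc_score(y_test, preds))     # what the notebook reports
print("ROC_AUC (probabilities):", roc_auc_score(y_test, probs))
print(classification_report(y_test, preds, target_names=["non-fraud", "fraud"]))

Because the balanced sample still contains about twice as many non-fraud rows as fraud rows, the F1 score and the per-class recall in the report are more informative than raw accuracy.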