├── requirements.txt ├── spam.csv ├── model.joblib ├── README.md ├── app.py ├── youtube_comments.py └── final-project.ipynb /requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit 2 | joblib 3 | sklearn 4 | nltk 5 | pandas -------------------------------------------------------------------------------- /spam.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsmentors/sample-project/HEAD/spam.csv -------------------------------------------------------------------------------- /model.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsmentors/sample-project/HEAD/model.joblib -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # This is a sample project for Spam detection in messages 2 | 3 | ## Directory Structure 4 | 5 | > 1. ### The repository contains the files required to create the spam detector and the app deployment code. 6 | > 2. ### The "final-project.ipynb" file is a notebook which you can open and run using Colab (recommended). 7 | > 3. ### The other files are needed to run the app. 8 | 9 | ## How to run the code 10 | 1. Clone/download the repository 11 | 2. Install the requirements listed in requirements.txt using `pip install -r requirements.txt` 12 | 3. Run the app using `streamlit run app.py` 13 | 14 | ## How to use the YouTube comments spam analyzer 15 | 1. Follow the steps described [here](https://www.thepythoncode.com/article/using-youtube-api-in-python) to get the "credentials.json" file 16 | 2. Add the credentials.json file to the project directory 17 | 3. Run "youtube_comments.py" to create the token. 18 | 4. That's it! Now you can access the YouTube comments from the app. 
19 | 20 | ## Video tutorial 21 | ----------------- 22 | ### We have created a video tutorial to help you understand the overall flow of the project 23 | 1. Part-1 of the tutorial is available [here](https://drive.google.com/file/d/1l04yRyLA4woOLRANHK3hPmZOELgLjgCZ/view?usp=sharing) 24 | 2. Part-2 of the tutorial is available [here](https://drive.google.com/file/d/1egitQWJ-8cD2dd9BYbfXCH9YYPNaz_ue/view?usp=sharing) 25 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import joblib 3 | from youtube_comments import extract_comments 4 | import string 5 | from nltk.corpus import stopwords 6 | import pandas as pd 7 | 8 | 9 | def text_process(mess): 10 | """ 11 | Takes in a string of text, then performs the following: 12 | 1. Remove all punctuation 13 | 2. Remove all stopwords 14 | 3. Returns a list of the cleaned text 15 | """ 16 | STOPWORDS = stopwords.words('english') + ['u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure'] 17 | # Check characters to see if they are in punctuation 18 | nopunc = [char for char in mess if char not in string.punctuation] 19 | 20 | # Join the characters again to form the string. 
21 | nopunc = ''.join(nopunc) 22 | 23 | # Now just remove any stopwords 24 | return ' '.join([word for word in nopunc.split() if word.lower() not in STOPWORDS]) 25 | 26 | @st.cache(allow_output_mutation=True) 27 | def load_model(): 28 | model = joblib.load('model.joblib') 29 | return model 30 | 31 | # Loading the model from file 32 | model = load_model() 33 | 34 | # Function to predict spam and ham, given a text 35 | def predict(txt): 36 | prediction = model.predict([txt]) 37 | if prediction[0] == 1: 38 | return "Spam" 39 | else: 40 | return "Not Spam" 41 | 42 | # USER INTERFACE 43 | # Creating the heading 44 | st.title('Spam Detection App') 45 | 46 | # Retrieving and storing the text from the textbox 47 | txt = st.text_area('Text to analyze', ) 48 | 49 | # predicting the class and writing it on the webpage 50 | st.write('Predictions:', predict(txt)) 51 | 52 | # Getting the YouTube URL 53 | youtube_url = st.text_input('YouTube URL', ) 54 | 55 | # Getting the custom messages dataset 56 | uploaded_file = st.file_uploader("Choose a file") 57 | 58 | # If a custom dataset is uploaded, then create a histogram for that else if 59 | # YouTube URL is provided, extract the comments from there and create the histogram of prediction 60 | 61 | if st.button('Compute'): 62 | if uploaded_file is not None: 63 | df = pd.read_csv(uploaded_file, encoding='latin-1') 64 | result_data = [0, 0] 65 | for each in df['Text']: 66 | pred_data = predict(text_process(each)) 67 | if pred_data == 'Spam': 68 | result_data[0] += 1 69 | else: 70 | result_data[1] += 1 71 | result = pd.DataFrame({'Spam':[result_data[0]], 'Not Spam':[result_data[1]]}, columns=['Spam', 'Not Spam']) 72 | st.bar_chart(data=result) 73 | else: 74 | comments = extract_comments(youtube_url) 75 | processed_data = [] 76 | final_count = [0, 0] 77 | for key, val in comments.items(): 78 | pred_data = predict(text_process(val[0])) 79 | if pred_data == 'Spam': 80 | processed_data.append(1) 81 | final_count[0] += 1 82 | else: 83 | 
processed_data.append(0) 84 | final_count[1] += 1 85 | 86 | chart_data = pd.DataFrame({'Spam': [final_count[0]], 'Not Spam': [final_count[1]]}, columns=['Spam', 'Not Spam']) 87 | st.bar_chart(chart_data) 88 | 89 | else: 90 | pass 91 | 92 | -------------------------------------------------------------------------------- /youtube_comments.py: -------------------------------------------------------------------------------- 1 | 2 | from googleapiclient.discovery import build 3 | from google_auth_oauthlib.flow import InstalledAppFlow 4 | from google.auth.transport.requests import Request 5 | 6 | import urllib.parse as p 7 | import re 8 | import os 9 | import pickle 10 | 11 | SCOPES = ["https://www.googleapis.com/auth/youtube.force-ssl"] 12 | 13 | def youtube_authenticate(): 14 | os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1" 15 | api_service_name = "youtube" 16 | api_version = "v3" 17 | client_secrets_file = "credentials.json" 18 | creds = None 19 | # the file token.pickle stores the user's access and refresh tokens, and is 20 | # created automatically when the authorization flow completes for the first time 21 | if os.path.exists("token.pickle"): 22 | with open("token.pickle", "rb") as token: 23 | creds = pickle.load(token) 24 | # if there are no (valid) credentials availablle, let the user log in. 
25 | if not creds or not creds.valid: 26 | if creds and creds.expired and creds.refresh_token: 27 | creds.refresh(Request()) 28 | else: 29 | flow = InstalledAppFlow.from_client_secrets_file(client_secrets_file, SCOPES) 30 | creds = flow.run_local_server(port=0) 31 | # save the credentials for the next run 32 | with open("token.pickle", "wb") as token: 33 | pickle.dump(creds, token) 34 | 35 | return build(api_service_name, api_version, credentials=creds) 36 | 37 | # authenticate to YouTube API 38 | youtube = youtube_authenticate() 39 | 40 | def get_video_id_by_url(url): 41 | """ 42 | Return the Video ID from the video `url` 43 | """ 44 | # split URL parts 45 | parsed_url = p.urlparse(url) 46 | # get the video ID by parsing the query of the URL 47 | video_id = p.parse_qs(parsed_url.query).get("v") 48 | if video_id: 49 | return video_id[0] 50 | else: 51 | raise Exception(f"Wasn't able to parse video URL: {url}") 52 | 53 | def get_comments(youtube, **kwargs): 54 | return youtube.commentThreads().list( 55 | part="snippet", 56 | **kwargs 57 | ).execute() 58 | 59 | def extract_comments(url): 60 | video_id = get_video_id_by_url(url) 61 | params = { 62 | 'videoId': video_id, 63 | 'maxResults': 100, 64 | 'order': 'relevance', # default is 'time' (newest) 65 | } 66 | n_pages = 5 67 | result = {} 68 | for i in range(n_pages): 69 | # make API call to get all comments from the channel (including posts & videos) 70 | response = get_comments(youtube, **params) 71 | items = response.get("items") 72 | # if items is empty, breakout of the loop 73 | if not items: 74 | break 75 | for item in items: 76 | comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"] 77 | updated_at = item["snippet"]["topLevelComment"]["snippet"]["updatedAt"] 78 | like_count = item["snippet"]["topLevelComment"]["snippet"]["likeCount"] 79 | comment_id = item["snippet"]["topLevelComment"]["id"] 80 | result[comment_id] = [comment, like_count, updated_at] 81 | # print(f"""\ 82 | # Comment: {comment} 
83 | # Likes: {like_count} 84 | # Updated At: {updated_at} 85 | # ==================================\ 86 | # """) 87 | if "nextPageToken" in response: 88 | # if there is a next page 89 | # add next page token to the params we pass to the function 90 | params["pageToken"] = response["nextPageToken"] 91 | else: 92 | # must be end of comments!!!! 93 | break 94 | print("*" * 70) 95 | return result 96 | 97 | # url = "https://www.youtube.com/watch?v=hpwnlr-ZHB0" 98 | # #video_id = get_video_id_by_url("https://www.youtube.com/watch?v=hpwnlr-ZHB0") 99 | # comments = extract_comments(url) 100 | # 101 | # print(comments) 102 | 103 | -------------------------------------------------------------------------------- /final-project.ipynb: -------------------------------------------------------------------------------- 1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"collapsed_sections":[],"authorship_tag":"ABX9TyMVPu4BZW7b7mtoVvGt2gg9"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"fdGBy-aG0_43","executionInfo":{"status":"ok","timestamp":1663748815114,"user_tz":-330,"elapsed":4515,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"8d9343f2-396f-4572-a464-918e016a166a"},"outputs":[{"output_type":"stream","name":"stdout","text":["Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"]}],"source":["# Importing the google drive here\n","from google.colab import drive\n","drive.mount('/content/drive')"]},{"cell_type":"code","source":["# Importing the relevant libraries\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import seaborn as sns\n","from sklearn.feature_extraction.text import CountVectorizer\n","#vect = CountVectorizer()\n","path = 
\"/content/drive/MyDrive/guvi/project-proposal/final-project/spam.csv\""],"metadata":{"id":"QAIUhlJK1-Qy"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Reading the dataset using pandas. We drop the columns which are not required. (refer to Data Engineering module)"],"metadata":{"id":"zcGMe2dLxcp4"}},{"cell_type":"code","source":["# reading file using pandas\n","sms = pd.read_csv(path, encoding='latin-1')\n","sms.dropna(how=\"any\", inplace=True, axis=1)\n","sms.columns = ['label', 'message']\n","sms.head()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"2mDd1i2i2qZO","executionInfo":{"status":"ok","timestamp":1663748815115,"user_tz":-330,"elapsed":20,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"cef760ba-41cc-4996-efa4-81de1065837a"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" label message\n","0 ham Go until jurong point, crazy.. Available only ...\n","1 ham Ok lar... Joking wif u oni...\n","2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n","3 ham U dun say so early hor... U c already then say...\n","4 ham Nah I don't think he goes to usf, he lives aro..."],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
labelmessage
0hamGo until jurong point, crazy.. Available only ...
1hamOk lar... Joking wif u oni...
2spamFree entry in 2 a wkly comp to win FA Cup fina...
3hamU dun say so early hor... U c already then say...
4hamNah I don't think he goes to usf, he lives aro...
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":10}]},{"cell_type":"markdown","source":["# Exploring the Data"],"metadata":{"id":"yok2ksQK21Ae"}},{"cell_type":"code","source":["sms.describe()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":175},"id":"i2-Jomjm2ygo","executionInfo":{"status":"ok","timestamp":1663748815115,"user_tz":-330,"elapsed":19,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"6eb0fc0e-cef9-4cec-cc59-2a6919b31b75"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" label message\n","count 5572 5572\n","unique 2 5169\n","top ham Sorry, I'll call later\n","freq 4825 30"],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
labelmessage
count55725572
unique25169
tophamSorry, I'll call later
freq482530
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":11}]},{"cell_type":"code","source":["sms.groupby('label').describe()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":175},"id":"zi63GGkw26qo","executionInfo":{"status":"ok","timestamp":1663748815115,"user_tz":-330,"elapsed":18,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"1537bfaa-30ed-48df-aa16-cc69722f7261"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" message \n"," count unique top freq\n","label \n","ham 4825 4516 Sorry, I'll call later 30\n","spam 747 653 Please call our customer service representativ... 4"],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
message
countuniquetopfreq
label
ham48254516Sorry, I'll call later30
spam747653Please call our customer service representativ...4
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":12}]},{"cell_type":"markdown","source":["# Label Encoding\n","----------------\n","## While training or predicting the model, it is necessary to transform the data into a machine readable format. We now perform the relevant encoding to make it machine readable. (refer to module-11)"],"metadata":{"id":"-Dp8gGZ93gsY"}},{"cell_type":"code","source":["# convert label to a numerical variable\n","sms['label_num'] = sms.label.map({'ham':0, 'spam':1})\n","sms.head()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"8gQNQ0zi2-Yy","executionInfo":{"status":"ok","timestamp":1663748815116,"user_tz":-330,"elapsed":18,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"4b54c07a-b897-4fc8-80c1-2c382320a9cd"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" label message label_num\n","0 ham Go until jurong point, crazy.. Available only ... 0\n","1 ham Ok lar... Joking wif u oni... 0\n","2 spam Free entry in 2 a wkly comp to win FA Cup fina... 1\n","3 ham U dun say so early hor... U c already then say... 0\n","4 ham Nah I don't think he goes to usf, he lives aro... 0"],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
labelmessagelabel_num
0hamGo until jurong point, crazy.. Available only ...0
1hamOk lar... Joking wif u oni...0
2spamFree entry in 2 a wkly comp to win FA Cup fina...1
3hamU dun say so early hor... U c already then say...0
4hamNah I don't think he goes to usf, he lives aro...0
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":13}]},{"cell_type":"markdown","source":["## Another special task is the feature engineering, where we try to find those features which can help us predict our label."],"metadata":{"id":"ribof00A4ZoD"}},{"cell_type":"code","source":["sms['message_len'] = sms.message.apply(len)\n","sms.head()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"kEHGEC8D3Cdh","executionInfo":{"status":"ok","timestamp":1663748815116,"user_tz":-330,"elapsed":17,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"d0517704-06e9-4f08-a61a-af673413eb48"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" label message label_num \\\n","0 ham Go until jurong point, crazy.. Available only ... 0 \n","1 ham Ok lar... Joking wif u oni... 0 \n","2 spam Free entry in 2 a wkly comp to win FA Cup fina... 1 \n","3 ham U dun say so early hor... U c already then say... 0 \n","4 ham Nah I don't think he goes to usf, he lives aro... 0 \n","\n"," message_len \n","0 111 \n","1 29 \n","2 155 \n","3 49 \n","4 61 "],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
labelmessagelabel_nummessage_len
0hamGo until jurong point, crazy.. Available only ...0111
1hamOk lar... Joking wif u oni...029
2spamFree entry in 2 a wkly comp to win FA Cup fina...1155
3hamU dun say so early hor... U c already then say...049
4hamNah I don't think he goes to usf, he lives aro...061
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":14}]},{"cell_type":"markdown","source":["## We plot the bar graph for the length of messages for each label. (please refer to the visualization module)"],"metadata":{"id":"OJJSKJtxyX-9"}},{"cell_type":"code","source":["plt.figure(figsize=(12, 8))\n","\n","sms[sms.label=='ham'].message_len.plot(bins=35, kind='hist', color='blue', \n"," label='Ham messages', alpha=0.6)\n","sms[sms.label=='spam'].message_len.plot(kind='hist', color='red', \n"," label='Spam messages', alpha=0.6)\n","plt.legend()\n","plt.xlabel(\"Message Length\")"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":514},"id":"ryDmSt2y9DTM","executionInfo":{"status":"ok","timestamp":1663748815116,"user_tz":-330,"elapsed":16,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"97f55f65-c707-4dc8-ab18-7422cfe467be"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["Text(0.5, 0, 'Message Length')"]},"metadata":{},"execution_count":15},{"output_type":"display_data","data":{"text/plain":["
"],"image/png":"iVBORw0KGgoAAAANSUhEUgAAAtoAAAHgCAYAAACb58plAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3de7hWZb3v//dXQSBlISr5MynRFqWcQgOV1DakEh4SLQ94aaE/S9oeMn+7y20t28sKd2orWbnzgKULW7lM81CeVqXllYcdISAKYiSeFpNIERNTU2Dy/f3xDGaPOIUJzHs+zMn7dV3zmmPc4x5jfB/mcz185j3vMUZkJpIkSZLa11aNLkCSJEnqigzakiRJUgEGbUmSJKkAg7YkSZJUgEFbkiRJKsCgLUmSJBXQrdEFlLLTTjvlgAEDGl2GJEmSurBZs2a9lJn9WtvWZYP2gAEDmDlzZqPLkCRJUhcWEc+/2zanjkiSJEkFGLQlSZKkAgzakiRJUgFddo62JElSI61cuZKmpibefPPNRpeidtCzZ0/69+9P9+7d27yPQVuSJKmApqYmevfuzYABA4iIRpejTZCZLFu2jKamJnbfffc27+fUEUmSpALefPNNdtxxR0N2FxAR7Ljjjhv81wmDtiRJUiGG7K5jY36WBm1JkqQuarvttnvb+rRp0zjrrLMaVM2WxznakiRJHWDSpPY93tSp7Xs8tT9HtCVJkrZAd955J/vttx977703hxxyCC+88AIAF154IRMnTuSggw5it91247bbbuO8885j6NChjBs3jpUrV77jWKNHj+bcc89lxIgR7LXXXjzyyCN8+tOfZuDAgVxwwQUt/X784x+z7777Mnz4cCZNmkRzczPNzc2ccsopDBkyhKFDhzJlyhQALr/8cgYNGsSwYcOYMGECADNmzGDUqFHsvffefOxjH2PBggUAvPHGGxx//PEMGjSIY445hv3226/lCeG/+tWvGDVqFPvssw/HHXccr732GgDnn39+y/G/8pWvFPk3dkRbkiSpi/rb3/7G8OHDW9ZffvlljjrqKAAOPPBApk+fTkTwwx/+kEsvvZTvfve7ADz99NPcf//9zJ8/n1GjRnHrrbdy6aWXcswxx3D33Xdz9NFHv+Nc22yzDTNnzuR73/se48ePZ9asWeywww588IMf5Nxzz+XFF1/kpptu4uGHH6Z79+6cccYZ3HDDDQwePJjFixczb948AF555RUALr74Yp599ll69OjR0rbnnnvy4IMP0q1bN+677z6+9rWvceutt3LllVfSt29f5s+fz7x581pe80svvcTkyZO577772Hbbbbnkkku47LLLOPPMM7n99tv5wx/+QES0HL+9FQvaEXEdcCTwYmYOqdpuAj5cddkeeCUzh0fEAOBJYEG1bXpmfrHa56PANKAXcA9wTmZmqbolSZK6il69ejFnzpyW9WnTprWM9DY1NXHCCSewZMkSVqxY8bbb1h122GF0796doUOH0tzczLhx4wAYOnQozz33XKvnWhPghw4dyuDBg9lll10A2GOPPVi0aBEPPfQQs2bNYuTIkUDtl4D3vve9fOpTn+KZZ57h7LPP5ogjjmDs2LEADBs2jJNOOomjjz66JdgvX76ciRMn8tRTTxERLaPrDz30EOeccw4AQ4YMYdiwYQBMnz6d+fPnc8ABBwCwYsUKRo0aRZ8+fejZsyennXYaRx55JEceeeQm/ku3ruTUkWnAuPqGzDwhM4dn5nDgVuC2us1Pr9m2JmRXrgK+AAysvt52TEmSJG24s88+m7POOou5c+cyderUt926rkePHgBstdVWdO/eveWOG1tttRWrVq1q9Xj1+6xZrt8nM5k4cSJz5sxhzpw5LFiwgAsvvJC+ffvy2GOPMXr0aK6++mo+//nPA3D33Xdz5plnMnv2bEaOHMmqVav4+te/zpgxY5g3bx533nnnem+3l5kc
euihLeecP38+1157Ld26dWPGjBkce+yx3HXXXS2/SLS3YkE7Mx8AXm5tW9R+WscDN67rGBGxC/APmTm9GsX+EfDOv1VIkiRpgyxfvpxdd90VgOuvv774+Q4++GBuueUWXnzxRaA2jeX555/npZdeYvXq1XzmM59h8uTJzJ49m9WrV7No0SLGjBnDJZdcwvLly3nttdfeVvO0adNajn3AAQdw8803AzB//nzmzp0LwP7778/DDz/MwoULAXj99df54x//2HKsww8/nClTpvDYY48Vec2NmqN9EPBCZj5V17Z7RDwKvApckJkPArsCTXV9mqo2SZIkbYILL7yQ4447jr59+/KJT3yCZ599tuj5Bg0axOTJkxk7diyrV6+me/fuXHHFFfTq1YtTTz2V1atXA/Dtb3+b5uZmTj75ZJYvX05m8qUvfYntt9+e8847j4kTJzJ58mSOOOKIlmOfccYZTJw4kUGDBrHnnnsyePBg+vTpQ79+/Zg2bRonnngib731FgCTJ0+md+/ejB8/njfffJPM5LLLLivymqPkdOdq7vVda+Zo17VfBSzMzO9W6z2A7TJzWTUn+2fAYOBDwMWZeUjV7yDgf2ZmqxNpIuJ04HSAD3zgAx99/vnni7wuSZKk9XnyySfZa6+9Gl3GFqG5uZmVK1fSs2dPnn76aQ455BAWLFjANtts067nae1nGhGzMnNEa/07fEQ7IroBnwY+uqYtM98C3qqWZ0XE09RC9mKgf93u/au2VmXmNcA1ACNGjPCCSUmSpC3AG2+8wZgxY1i5ciWZyZVXXtnuIXtjNGLqyCHAHzKzZUpIRPQDXs7M5ojYg9pFj89k5ssR8WpE7A/8Hvgc8H8aULMkSZI2U7179265m8rmpNjFkBFxI/A74MMR0RQRp1WbJvDOiyA/DjweEXOAW4AvZuaaCynPAH4ILASeBv6zVM2SJElSeyk2op2ZJ75L+ymttN1K7XZ/rfWfCQxpbVtX1V6PaPXRrJIkSY3jI9glSZKkAgzakiRJUgEGbUmSpC7qoosuYvDgwQwbNozhw4fz+9//vtElbVEa9cAaSZKkLUt7XYS1xnouxvrd737HXXfdxezZs+nRowcvvfQSK1asaN8atE6OaEuSJHVBS5YsYaeddqJHjx4A7LTTTrzvfe8DYMCAAZx33nkMHTqUfffdt+UR5XfeeSf77bcfe++9N4cccggvvPACUHuK5MSJEznooIPYbbfduO2221r2HzduHCtXrnzH+UePHs25557LiBEj2GuvvXjkkUf49Kc/zcCBA7ngggta+v34xz9m3333Zfjw4UyaNInm5maam5s55ZRTGDJkCEOHDmXKlCkAXH755QwaNIhhw4YxYcIEAGbMmMGoUaPYe++9+djHPsaCBQuA2r21jz/+eAYNGsQxxxzDfvvt13ILwF/96leMGjWKffbZh+OOO47XXnsNgPPPP7/l+F/5ylc2+Wdg0JYkSeqCxo4dy6JFi/jQhz7EGWecwW9/+9u3be/Tpw9z587lrLPO4stf/jIABx54INOnT+fRRx9lwoQJXHrppS39n376aX7zm99wxx13cPLJJzNmzBjmzp1Lr169uPvuu1utYZtttmHmzJl88YtfZPz48VxxxRXMmzePadOmsWzZMp588kluuukmHn74YebMmcPWW2/NDTfcwJw5c1i8eDHz5s1j7ty5nHrqqQBcfPHFPProozz++ONcffXVAOy55548+OCDPProo3zzm9/ka1/7GgBXXnklffv2Zf78+XzrW99i1qxZALz00ktMnjyZ++67j9mzZzNixAguu+wyli1bxu23384TTzzB448//rZfBjaWU0ckSZK6oO22245Zs2bx4IMPcv/993PCCSdw8cUXc8oppwBw4okntnw/99xzAWhqauKEE05gyZIlrFixgt13373leIcddhjdu3dn6NChNDc3M27cOACGDh3Kc88912oNRx11VEufwYMHs8suuwCwxx57sGjR
Ih566CFmzZrFyJEjAfjb3/7Ge9/7Xj71qU/xzDPPcPbZZ3PEEUcwduxYAIYNG8ZJJ53E0UcfzdFHHw3A8uXLmThxIk899RQR0TK6/tBDD3HOOecAMGTIEIYNGwbA9OnTmT9/PgcccAAAK1asYNSoUfTp04eePXty2mmnceSRR3LkkUdu4k/AEW1JkqQua+utt2b06NF84xvf4Pvf/z633vr3x5ZExDuWzz77bM466yzmzp3L1KlTefPNN1v6rJmCstVWW9G9e/eWfbbaaitWrVrV6vnr91mzXL9PZjJx4kTmzJnDnDlzWLBgARdeeCF9+/blscceY/To0Vx99dV8/vOfB+Duu+/mzDPPZPbs2YwcOZJVq1bx9a9/nTFjxjBv3jzuvPPOt9Xcmszk0EMPbTnn/Pnzufbaa+nWrRszZszg2GOP5a677mr5RWJTGLQlSZK6oAULFvDUU0+1rM+ZM4fddtutZf2mm25q+T5q1CigNjq86667AnD99dcXr/Hggw/mlltu4cUXXwTg5Zdf5vnnn+ell15i9erVfOYzn2Hy5MnMnj2b1atXs2jRIsaMGcMll1zC8uXLee21195W87Rp01qOfcABB3DzzTcDMH/+fObOnQvA/vvvz8MPP9wyL/3111/nj3/8Y8uxDj/8cKZMmcJjjz22ya/PqSOSJEld0GuvvcbZZ5/NK6+8Qrdu3fjHf/xHrrnmmpbtf/nLXxg2bBg9evTgxhtvBGoXPR533HH07duXT3ziEzz77LNFaxw0aBCTJ09m7NixrF69mu7du3PFFVfQq1cvTj31VFavXg3At7/9bZqbmzn55JNZvnw5mcmXvvQltt9+e8477zwmTpzI5MmTOeKII1qOfcYZZzBx4kQGDRrEnnvuyeDBg+nTpw/9+vVj2rRpnHjiibz11lsATJ48md69ezN+/HjefPNNMpPLLrtsk19fZOYmH2RzNGLEiFxzZWln4yPYJUnq/J588kn22muvRpfRqgEDBjBz5kx22mmnRpdSTHNzMytXrqRnz548/fTTHHLIISxYsIBtttlmo4/Z2s80ImZl5ojW+juiLUmSpC7njTfeYMyYMaxcuZLM5Morr9ykkL0xDNqSJElbmHe7S0hX0rt3bxo9u8GLISVJkqQCDNqSJEmFdNVr4bZEG/OzNGhLkiQV0LNnT5YtW2bY7gIyk2XLltGzZ88N2s852pIkSQX079+fpqYmli5d2uhS1A569uxJ//79N2gfg7YkSVIB3bt3f9sjzLXlceqIJEmSVIBBW5IkSSrAoC1JkiQVYNCWJEmSCjBoS5IkSQUYtCVJkqQCDNqSJElSAQZtSZIkqQCDtiRJklSAQVuSJEkqwKAtSZIkFWDQliRJkgowaEuSJEkFGLQlSZKkAgzakiRJUgEGbUmSJKkAg7YkSZJUgEFbkiRJKsCgLUmSJBVg0JYkSZIKMGhLkiRJBRi0JUmSpAIM2pIkSVIBBm1JkiSpAIO2JEmSVIBBW5IkSSrAoC1JkiQVYNCWJEmSCjBoS5IkSQUYtCVJkqQCDNqSJElSAQZtSZIkqQCDtiRJklRAsaAdEddFxIsRMa+u7cKIWBwRc6qvw+u2fTUiFkbEgoj4ZF37uKptYUScX6peSZIkqT2VHNGeBoxrpX1KZg6vvu4BiIhBwARgcLXPlRGxdURsDVwBHAYMAk6s+kqSJEmbtW6lDpyZD0TEgDZ2Hw/8JDPfAp6NiIXAvtW2hZn5DEBE/KTqO7+dy5UkSZLaVSPmaJ8VEY9XU0v6Vm27Aovq+jRVbe/W3qqIOD0iZkbEzKVLl7Z33ZIkSVKbdXTQvgr4IDAcWAJ8tz0PnpnXZOaIzBzRr1+/9jy0JEmStEGKTR1pTWa+sGY5In4A3FWtLgbeX9e1f9XGOtolSZKkzVaHjmhHxC51q8cAa+5IcgcwISJ6RMTuwEBgBvAIMDAido+IbahdMHlHR9YsSZIkbYxiI9oRcSMwGtgpIpqAfwZGR8Rw
IIHngEkAmflERNxM7SLHVcCZmdlcHecs4JfA1sB1mflEqZolSZKk9lLyriMnttJ87Tr6XwRc1Er7PcA97ViaJEmSVJxPhpQkSZIKMGhLkiRJBRi0JUmSpAIM2pIkSVIBBm1JkiSpAIO2JEmSVIBBW5IkSSrAoC1JkiQVYNCWJEmSCjBoS5IkSQUYtCVJkqQCDNqSJElSAQZtSZIkqQCDtiRJklSAQVuSJEkqwKAtSZIkFWDQliRJkgowaEuSJEkFGLQlSZKkAgzakiRJUgEGbUmSJKkAg7YkSZJUgEFbkiRJKsCgLUmSJBVg0JYkSZIKMGhLkiRJBRi0JUmSpAIM2pIkSVIBBm1JkiSpAIO2JEmSVIBBW5IkSSrAoC1JkiQVYNCWJEmSCjBoS5IkSQUYtCVJkqQCDNqSJElSAQZtSZIkqQCDtiRJklSAQVuSJEkqwKAtSZIkFWDQliRJkgowaEuSJEkFGLQlSZKkAgzakiRJUgEGbUmSJKkAg7YkSZJUgEFbkiRJKsCgLUmSJBVg0JYkSZIKMGhLkiRJBRQL2hFxXUS8GBHz6tq+ExF/iIjHI+L2iNi+ah8QEX+LiDnV19V1+3w0IuZGxMKIuDwiolTNkiRJUnspOaI9DRi3Vtu9wJDMHAb8Efhq3banM3N49fXFuvargC8AA6uvtY8pSZIkbXaKBe3MfAB4ea22X2Xmqmp1OtB/XceIiF2Af8jM6ZmZwI+Ao0vUK0mSJLWnRs7R/n+B/6xb3z0iHo2I30bEQVXbrkBTXZ+mqk2SJEnarHVrxEkj4p+AVcANVdMS4AOZuSwiPgr8LCIGb8RxTwdOB/jABz7QXuVKkiRJG6zDR7Qj4hTgSOCkajoImflWZi6rlmcBTwMfAhbz9ukl/au2VmXmNZk5IjNH9OvXr9ArkCRJktavQ4N2RIwDzgOOysw36tr7RcTW1fIe1C56fCYzlwCvRsT+1d1GPgf8vCNrliRJkjZGsakjEXEjMBrYKSKagH+mdpeRHsC91V36pld3GPk48M2IWAmsBr6YmWsupDyD2h1MelGb010/r1uSJEnaLBUL2pl5YivN175L31uBW99l20xgSDuWJkmSJBXnkyElSZKkAgzakiRJUgEGbUmSJKkAg7YkSZJUgEFbkiRJKsCgLUmSJBVg0JYkSZIKMGhLkiRJBRi0JUmSpAIM2pIkSVIBBm1JkiSpAIO2JEmSVIBBW5IkSSrAoC1JkiQVYNCWJEmSCjBoS5IkSQUYtCVJkqQCDNqSJElSAQZtSZIkqQCDtiRJklSAQVuSJEkqwKAtSZIkFWDQliRJkgowaEuSJEkFGLQlSZKkAgzakiRJUgEGbUmSJKkAg7YkSZJUgEFbkiRJKsCgLUmSJBVg0JYkSZIKMGhLkiRJBRi0JUmSpAIM2pIkSVIBBm1JkiSpAIO2JEmSVIBBW5IkSSrAoC1JkiQVYNCWJEmSCjBoS5IkSQUYtCVJkqQCDNqSJElSAQZtSZIkqQCDtiRJklSAQVuSJEkqwKAtSZIkFWDQliRJkgowaEuSJEkFGLQlSZKkAgzakiRJUgEGbUmSJKmAokE7Iq6LiBcjYl5d2w4RcW9EPFV971u1R0RcHhELI+LxiNinbp+JVf+nImJiyZolSZKk9tCmoB0RQzfy+NOAcWu1nQ/8OjMHAr+u1gEOAwZWX6cDV1Xn3gH4Z2A/YF/gn9eEc0mSJGlz1dYR7SsjYkZEnBERfdp68Mx8AHh5rebxwPXV8vXA0XXtP8qa6cD2EbEL8Eng3sx8OTP/AtzLO8O7JEmStFlpU9DOzIOAk4D3A7Mi4j8i4tCNPOfOmbmkWv4zsHO1vCuwqK5fU9X2bu2SJEnSZqvNc7Qz8yngAuB/Av8NuDwi/hARn97Yk2dmArmx+68tIk6PiJkRMXPp0qXtdVhJkiRpg7V1jvawiJgCPAl8AvhUZu5VLU/ZwHO+UE0Jofr+YtW+mNqI+Rr9q7Z3a3+HzLwmM0dk5oh+
/fptYFmSJElS+2nriPb/AWYDH8nMMzNzNkBm/onaKPeGuANYc+eQicDP69o/V919ZH9geTXF5JfA2IjoW10EObZqkyRJkjZb3drY7wjgb5nZDBARWwE9M/ONzPz3d9spIm4ERgM7RUQTtbuHXAzcHBGnAc8Dx1fd7wEOBxYCbwCnAmTmyxHxLeCRqt83M3PtCywlSZKkzUpbg/Z9wCHAa9X6e4BfAR9b106ZeeK7bDq4lb4JnPkux7kOuK6NtUqSJEkN19apIz0zc03Iplp+T5mSJEmSpM6vrUH79bWe1PhR4G9lSpIkSZI6v7ZOHfky8NOI+BMQwP8DnFCsKkmSJKmTa1PQzsxHImJP4MNV04LMXFmuLEmSJKlza+uINsBIYEC1zz4RQWb+qEhVkiRJUifXpqAdEf8OfBCYAzRXzQkYtCVJkqRWtHVEewQwqLoFnyRJkqT1aOtdR+ZRuwBSkiRJUhu0dUR7J2B+RMwA3lrTmJlHFalKkiRJ6uTaGrQvLFmEJEmS1NW09fZ+v42I3YCBmXlfRLwH2LpsaZIkSVLn1aY52hHxBeAWYGrVtCvws1JFSZIkSZ1dWy+GPBM4AHgVIDOfAt5bqihJkiSps2tr0H4rM1esWYmIbtTuoy1JkiSpFW0N2r+NiK8BvSLiUOCnwJ3lypIkSZI6t7YG7fOBpcBcYBJwD3BBqaIkSZKkzq6tdx1ZDfyg+pIkSZK0Hm0K2hHxLK3Myc7MPdq9IkmSJKkLaOsDa0bULfcEjgN2aP9yJEmSpK6hTXO0M3NZ3dfizPxX4IjCtUmSJEmdVlunjuxTt7oVtRHuto6GS5IkSVuctobl79YtrwKeA45v92okSZKkLqKtdx0ZU7oQSZIkqStp69SR/29d2zPzsvYpR5IkSeoaNuSuIyOBO6r1TwEzgKdKFCVJkiR1dm0N2v2BfTLzrwARcSFwd2aeXKowSZIkqTNr6yPYdwZW1K2vqNokSZIktaKtI9o/AmZExO3V+tHA9WVKkiRJkjq/tt515KKI+E/goKrp1Mx8tFxZkiRJUufW1qkjAO8BXs3M7wFNEbF7oZokSZKkTq+tt/f7Z2p3Hvkw8G9Ad+DHwAHlStOmmjRp048xdeqmH0OSJGlL1NYR7WOAo4DXATLzT0DvUkVJkiRJnV1bg/aKzEwgASJi23IlSZIkSZ1fW4P2zRExFdg+Ir4A3Af8oFxZkiRJUue23jnaERHATcCewKvU5mn/r8y8t3BtkiRJUqe13qCdmRkR92TmUMBwLUmSJLVBW6eOzI6IkUUrkSRJkrqQtj4Zcj/g5Ih4jtqdR4LaYPewUoVJkiRJndk6g3ZEfCAz/wv4ZAfVI0mSJHUJ6xvR/hmwT2Y+HxG3ZuZnOqIoSZIkqbNb3xztqFveo2QhkiRJUleyvqCd77IsSZIkaR3WN3XkIxHxKrWR7V7VMvz9Ysh/KFqdJEmS1EmtM2hn5tYdVYgkSZLUlbT1PtqSJEmSNoBBW5IkSSrAoC1JkiQVYNCWJEmSCjBoS5IkSQUYtCVJkqQCDNqSJElSAR0etCPiwxExp+7r1Yj4ckRcGBGL69oPr9vnqxGxMCIWRMQnO7pmSZIkaUOt78mQ7S4zFwDDASJia2AxcDtwKjAlM/+lvn9EDAImAIOB9wH3RcSHMrO5QwuXJEmSNkCjp44cDDydmc+vo8944CeZ+VZmPgssBPbtkOokSZKkjdTooD0BuLFu/ayIeDwirouIvlXbrsCiuj5NVZskSZK02WpY0I6IbYCjgJ9WTVcBH6Q2rWQJ8N2NOObpETEzImYuXbq03WqVJEmSNlSHz9GucxgwOzNfAFjzHSAifgDcVa0uBt5ft1//qu0dMvMa4BqAESNGZIGa12vSpEacVZIkSZubRk4dOZG6aSMRsUvdtmOAedXyHcCEiOgREbsDA4EZHValJEmStBEaMqIdEdsChwL147+XRsRwIIHn1mzLzCci4mZgPrAKONM7jkiSJGlz15Cg
nZmvAzuu1fbZdfS/CLiodF2SJElSe2n0XUckSZKkLsmgLUmSJBVg0JYkSZIKMGhLkiRJBRi0JUmSpAIM2pIkSVIBBm1JkiSpAIO2JEmSVIBBW5IkSSrAoC1JkiQVYNCWJEmSCjBoS5IkSQUYtCVJkqQCDNqSJElSAQZtSZIkqQCDtiRJklSAQVuSJEkqwKAtSZIkFWDQliRJkgowaEuSJEkFGLQlSZKkAgzakiRJUgEGbUmSJKkAg7YkSZJUgEFbkiRJKsCgLUmSJBVg0JYkSZIKMGhLkiRJBRi0JUmSpAIM2pIkSVIBBm1JkiSpAIO2JEmSVIBBW5IkSSrAoC1JkiQVYNCWJEmSCjBoS5IkSQUYtCVJkqQCDNqSJElSAQZtSZIkqQCDtiRJklSAQVuSJEkqwKAtSZIkFWDQliRJkgowaEuSJEkFGLQlSZKkAgzakiRJUgEGbUmSJKkAg7YkSZJUgEFbkiRJKsCgLUmSJBVg0JYkSZIKaFjQjojnImJuRMyJiJlV2w4RcW9EPFV971u1R0RcHhELI+LxiNinUXVLkiRJbdHoEe0xmTk8M0dU6+cDv87MgcCvq3WAw4CB1dfpwFUdXqkkSZK0ARodtNc2Hri+Wr4eOLqu/UdZMx3YPiJ2aUSBkiRJUls0Mmgn8KuImBURp1dtO2fmkmr5z8DO1fKuwKK6fZuqtreJiNMjYmZEzFy6dGmpuiVJkqT16tbAcx+YmYsj4r3AvRHxh/qNmZkRkRtywMy8BrgGYMSIERu0ryRJktSeGha0M3Nx9f3FiLgd2Bd4ISJ2ycwl1dSQF6vui4H31+3ev2pTYZMmtc9xpk5tn+NIkiR1Fg2ZOhIR20ZE7zXLwFhgHnAHMLHqNhH4ebV8B/C56u4j+wPL66aYSJIkSZudRo1o7wzcHhFraviPzPxFRDwC3BwRpwHPA8dX/e8BDgcWAm8Ap3Z8yZIkSVLbNSRoZ+YzwEdaaV8GHNxKewJndkBpkiRJUrvY3G7vJ0mSJHUJBm1JkiSpAIO2JEmSVIBBW5IkSSrAoC1JkiQVYNCWJEmSCjBoS5IkSQUYtCVJkqQCDNqSJElSAQZtSZIkqQCDtiRJklSAQVuSJEkqwKAtSZIkFWDQliRJku7Tkk8AAA0aSURBVAro1ugCpE0yadLG7Td1avvWIUmStBZHtCVJkqQCDNqSJElSAQZtSZIkqQCDtiRJklSAQVuSJEkqwKAtSZIkFWDQliRJkgowaEuSJEkFGLQlSZKkAgzakiRJUgEGbUmSJKmAbo0uQGqrSZPe2XbSAxt2jI9/vH1qkSRJWh9HtCVJkqQCDNqSJElSAQZtSZIkqQCDtiRJklSAF0OqQ7R2IaMkSVJX5oi2JEmSVIBBW5IkSSrAoC1JkiQVYNCWJEmSCjBoS5IkSQUYtCVJkqQCDNqSJElSAQZtSZIkqQCDtiRJklSAQVuSJEkqwKAtSZIkFWDQliRJkgowaEuSJEkFGLQlSZKkAgzakiRJUgEGbUmSJKkAg7YkSZJUgEFbkiRJKqDDg3ZEvD8i7o+I+RHxREScU7VfGBGLI2JO9XV43T5fjYiFEbEgIj7Z0TVLkiRJG6pbA865CvgfmTk7InoDsyLi3mrblMz8l/rOETEImAAMBt4H3BcRH8rM5g6tWpIkSdoAHT6inZlLMnN2tfxX4Elg13XsMh74SWa+lZnPAguBfctXKkmSJG28hs7RjogBwN7A76umsyLi8Yi4LiL6Vm27Aovqdmti3cFckiRJariGBe2I2A64FfhyZr4KXAV8EBgOLAG+uxHHPD0iZkbEzKVLl7ZrvZIkSdKGaEjQjoju1EL2DZl5G0BmvpCZzZm5GvgBf58eshh4f93u/au2d8jMazJzRGaO6NevX7kXIEmSJK1HI+46EsC1wJOZeVld+y513Y4B5lXLdwATIqJHROwODARmdFS9kiRJ0sZoxF1HDgA+C8yNiDlV29eAEyNiOJDAc8AkgMx8IiJuBuZT
u2PJmd5xRJIkSZu7Dg/amfkQEK1sumcd+1wEXFSsKEmSJKmd+WRISZIkqQCDtiRJklSAQVuSJEkqwKAtSZIkFWDQliRJkgowaEuSJEkFGLQlSZKkAgzakiRJUgEGbUmSJKkAg7YkSZJUgEFbkiRJKsCgLUmSJBVg0JYkSZIKMGhLkiRJBRi0JUmSpAIM2pIkSVIBBm1JkiSpAIO2JEmSVEC3RhcgdaQHHqh9v2HSph1n6tRNr0WSJHVtjmhLkiRJBRi0JUmSpAIM2pIkSVIBBm1JkiSpAIO2JEmSVIB3HdFm4aQHNvE2IJIkSZsZR7QlSZKkAgzakiRJUgEGbUmSJKkAg7YkSZJUgEFbkiRJKsCgLUmSJBVg0JYkSZIKMGhLkiRJBRi0JUmSpAIM2pIkSVIBBm1JkiSpAIO2JEmSVEC3RhcgNcJJD0za6H1v+PjUdqxEkiR1VY5oS5IkSQUYtCVJkqQCDNqSJElSAQZtSZIkqQAvhlS72pSLDCVJkroSR7QlSZKkAgzakiRJUgEGbUmSJKkA52hLG2FSO0xFn+pzbyRJ6tIc0ZYkSZIKMGhLkiRJBTh1pIvzdnuSJEmNYdCW5JxzSZIK6DRBOyLGAd8DtgZ+mJkXN7ikDuXIdNfTHuFWkiRtvjpF0I6IrYErgEOBJuCRiLgjM+c3tjJJa7TXLw6OjEuSuopOEbSBfYGFmfkMQET8BBgPdKqg7ai0tH6b0zSWzamW9tDVXo8kbe46S9DeFVhUt94E7NegWrSF29hfmG74uAllS2S43TL4c5bUmsjMRtewXhFxLDAuMz9frX8W2C8zz1qr3+nA6dXqh4EFHVoo7AS81MHn1ObP94Va4/tCrfF9odb4vti87ZaZ/Vrb0FlGtBcD769b71+1vU1mXgNc01FFrS0iZmbmiEadX5sn3xdqje8Ltcb3hVrj+6Lz6iwPrHkEGBgRu0fENsAE4I4G1yRJkiS9q04xop2ZqyLiLOCX1G7vd11mPtHgsiRJkqR31SmCNkBm3gPc0+g61qNh01a0WfN9odb4vlBrfF+oNb4vOqlOcTGkJEmS1Nl0ljnakiRJUqdi0G4HETEuIhZExMKIOL/R9ajjRMT7I+L+iJgfEU9ExDlV+w4RcW9EPFV971u1R0RcXr1XHo+IfRr7ClRSRGwdEY9GxF3V+u4R8fvq539TdXE3EdGjWl9YbR/QyLpVTkRsHxG3RMQfIuLJiBjl54Ui4tzq/5B5EXFjRPT086JrMGhvorrHwx8GDAJOjIhBja1KHWgV8D8ycxCwP3Bm9fM/H/h1Zg4Efl2tQ+19MrD6Oh24quNLVgc6B3iybv0SYEpm/iPwF+C0qv004C9V+5Sqn7qm7wG/yMw9gY9Qe3/4ebEFi4hdgS8BIzJzCLWbPkzAz4suwaC96VoeD5+ZK4A1j4fXFiAzl2Tm7Gr5r9T+09yV2nvg+qrb9cDR1fJ44EdZMx3YPiJ26eCy1QEioj9wBPDDaj2ATwC3VF3Wfl+seb/cAhxc9VcXEhF9gI8D1wJk5orMfAU/L1S7OUWviOgGvAdYgp8XXYJBe9O19nj4XRtUixqo+vPd3sDvgZ0zc0m16c/AztWy75ctx78C5wGrq/UdgVcyc1W1Xv+zb3lfVNuXV/3VtewOLAX+rZpS9MOI2BY/L7ZombkY+Bfgv6gF7OXALPy86BIM2lI7iIjtgFuBL2fmq/XbsnZrH2/vswWJiCOBFzNzVqNr0WalG7APcFVm7g28zt+niQB+XmyJqjn546n9IvY+YFtgXEOLUrsxaG+6Nj0eXl1XRHSnFrJvyMzbquYX1vyJt/r+YtXu+2XLcABwVEQ8R2062Seozc3dvvrTMLz9Z9/yvqi29wGWdWTB6hBNQFNm/r5av4Va8PbzYst2CPBsZi7NzJXAbdQ+Q/y86AIM2pvOx8Nvwap5cdcCT2bmZXWb7gAmVssTgZ/XtX+uupvA/sDyuj8Zq4vIzK9mZv/MHEDtM+E3mXkS
cD9wbNVt7ffFmvfLsVV/RzW7mMz8M7AoIj5cNR0MzMfPiy3dfwH7R8R7qv9T1rwv/LzoAnxgTTuIiMOpzcdc83j4ixpckjpIRBwIPAjM5e9zcb9GbZ72zcAHgOeB4zPz5epD9PvU/iz4BnBqZs7s8MLVYSJiNPCVzDwyIvagNsK9A/AocHJmvhURPYF/pzbH/2VgQmY+06iaVU5EDKd2gew2wDPAqdQGvfy82IJFxDeAE6jdyepR4PPU5mL7edHJGbQlSZKkApw6IkmSJBVg0JYkSZIKMGhLkiRJBRi0JUmSpAIM2pIkSVIBBm1J6kARkRHx47r1bhGxNCLuamRdbRERrxU+/pcj4j0ddT5JKs2gLUkd63VgSET0qtYPxaf9rfFl4D3r7SVJnYRBW5I63j3AEdXyicCNazZExLYRcV1EzIiIRyNifNU+uGqbExGPR8TAqu/dEfFYRMyLiBOqvv8rIh6p2q6pHnxCRIys9p0TEd+JiHlV+9bV+iPV9kltfSER8cGI+EVEzIqIByNiz6p9WkRcHhH/NyKeiYhjq/atIuLKiPhDRNwbEfdExLER8SXgfcD9EXF/3fEvql7f9IjYeRP+zSWpwxm0Janj/QSYUD3hbRi1J4mu8U/UHqm8LzAG+E5EbAt8EfheZg4HRgBN1J4Y+KfM/EhmDgF+UR3j+5k5smrrBRxZtf8bMKk6RnPdOU+j9njvkcBI4AsRsXsbX8s1wNmZ+VHgK8CVddt2AQ6szn9x1fZpYAAwCPgsMAogMy8H/gSMycwxVd9tgemZ+RHgAeALbaxJkjYL3RpdgCRtaTLz8YgYQG00+561No8FjoqIr1TrPak9mvt3wD9FRH/gtsx8KiLmAt+NiEuAuzLzwWqfMRFxHrVpGDsAT0TEg0DvzPxd1ec/+HsAHwsMWzPqDPQBBgLPrut1RMR2wMeAn1aD5gA96rr8LDNXA/PrRqMPBH5atf+5fvS6FSuANXPXZ1GbZiNJnYZBW5Ia4w7gX4DRwI517QF8JjMXrNX/yYj4PbUpJ/dExKTM/E1E7AMcDkyOiF8Dl1IbVR6RmYsi4kJqYX1dgtqo9C838DVsBbxSjZC35q21zrGhVmZmVsvN+H+WpE7GqSOS1BjXAd/IzLlrtf8SOLtuXvXe1fc9gGeqKRY/pzYC/T7gjcz8MfAdYB/+HqpfqkacjwXIzFeAv0bEftX2CWud879HRPfqXB+qpqusU2a+CjwbEcdV+0VEfGQ9uz0MfKaaq70ztV801vgr0Ht955WkzsLRAUlqgMxsAi5vZdO3gH8FHo+IrahN3zgSOB74bESsBP4M/G9q86m/ExGrgZXAf8/MVyLiB8C8qt8jdcc+DfhB1f+3wPKq/YfU5k3PrgL+UuDoVmp7T0Q01a1fBpwEXBURFwDdqc0/f2wdL/1W4GBgPrAImF1XxzXALyLiT3XztCWp04q//1VOktSVRcR2mflatXw+sEtmntOoOiJiR2AGcEBm/rmj65Ck0hzRlqQtxxER8VVqn/3PA6c0qI67ImJ7YBvgW4ZsSV2VI9qSJElSAV4MKUmSJBVg0JYkSZIKMGhLkiRJBRi0JUmSpAIM2pIkSVIBBm1JkiSpgP8fA7kt3d7AxrMAAAAASUVORK5CYII=\n"},"metadata":{"needs_background":"light"}}]},{"cell_type":"markdown","source":["## From the above plot, we observe that mostly the length of the spam messages are usually shorter than the ham messages. 
Can we use this knowledge to classify the spam?"],"metadata":{"id":"T8MDvo1s9RZV"}},{"cell_type":"code","source":["sms[sms.label=='ham'].describe()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":300},"id":"x8RaHsLK9imm","executionInfo":{"status":"ok","timestamp":1663748815116,"user_tz":-330,"elapsed":12,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"96d504bf-3d0b-4a4e-94ae-b8ba4706f316"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" label_num message_len\n","count 4825.0 4825.000000\n","mean 0.0 71.023627\n","std 0.0 58.016023\n","min 0.0 2.000000\n","25% 0.0 33.000000\n","50% 0.0 52.000000\n","75% 0.0 92.000000\n","max 0.0 910.000000"],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
label_nummessage_len
count4825.04825.000000
mean0.071.023627
std0.058.016023
min0.02.000000
25%0.033.000000
50%0.052.000000
75%0.092.000000
max0.0910.000000
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":16}]},{"cell_type":"code","source":["sms[sms.label=='spam'].describe()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":300},"id":"b3gDgmLw9xOf","executionInfo":{"status":"ok","timestamp":1663748815922,"user_tz":-330,"elapsed":818,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"db0bc75f-098f-4076-f399-88a0f0a563f6"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" label_num message_len\n","count 747.0 747.000000\n","mean 1.0 138.866131\n","std 0.0 29.183082\n","min 1.0 13.000000\n","25% 1.0 132.500000\n","50% 1.0 149.000000\n","75% 1.0 157.000000\n","max 1.0 224.000000"],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
label_nummessage_len
count747.0747.000000
mean1.0138.866131
std0.029.183082
min1.013.000000
25%1.0132.500000
50%1.0149.000000
75%1.0157.000000
max1.0224.000000
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
 "]},"metadata":{},"execution_count":17}]},{"cell_type":"markdown","source":["## Our dataset consists of short messages. We need to process the data such that we remove the irrelevant tokens (such as punctuation and stop words)"],"metadata":{"id":"yHNG_VP8_N3F"}},{"cell_type":"code","source":["import string\n","from nltk.corpus import stopwords\n","import nltk\n","nltk.download('stopwords') # Downloading the stopwords\n","\n","def text_process(mess):\n"," \"\"\"\n"," Takes in a string of text, then performs the following:\n"," 1. Remove all punctuation\n"," 2. Remove all stopwords\n"," 3. Returns a list of the cleaned text\n"," \"\"\"\n"," STOPWORDS = stopwords.words('english') + ['u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure']\n"," # Check characters to see if they are in punctuation\n"," nopunc = [char for char in mess if char not in string.punctuation]\n","\n"," # Join the characters again to form the string.\n"," nopunc = ''.join(nopunc)\n"," \n"," # Now just remove any stopwords\n"," return ' '.join([word for word in nopunc.split() if word.lower() not in STOPWORDS])"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"CUKkMI4S_QMx","executionInfo":{"status":"ok","timestamp":1663748817000,"user_tz":-330,"elapsed":1083,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"50463b15-79ef-4710-8b6f-bf3dec3e2e42"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package stopwords to /root/nltk_data...\n","[nltk_data] Unzipping corpora/stopwords.zip.\n"]}]},{"cell_type":"code","source":["sms['clean_msg'] = sms.message.apply(text_process)"],"metadata":{"id":"bnbNs41N_bv5"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# This is our processed 
text\n","sms.head()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"NHY6pxgS_oNQ","executionInfo":{"status":"ok","timestamp":1663748817623,"user_tz":-330,"elapsed":20,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"e3b0a345-b9d3-4cf8-a1d8-9b3337fd36f3"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" label message label_num \\\n","0 ham Go until jurong point, crazy.. Available only ... 0 \n","1 ham Ok lar... Joking wif u oni... 0 \n","2 spam Free entry in 2 a wkly comp to win FA Cup fina... 1 \n","3 ham U dun say so early hor... U c already then say... 0 \n","4 ham Nah I don't think he goes to usf, he lives aro... 0 \n","\n"," message_len clean_msg \n","0 111 Go jurong point crazy Available bugis n great ... \n","1 29 Ok lar Joking wif oni \n","2 155 Free entry wkly comp win FA Cup final tkts 21s... \n","3 49 dun say early hor c already say \n","4 61 Nah think goes usf lives around though "],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
labelmessagelabel_nummessage_lenclean_msg
0hamGo until jurong point, crazy.. Available only ...0111Go jurong point crazy Available bugis n great ...
1hamOk lar... Joking wif u oni...029Ok lar Joking wif oni
2spamFree entry in 2 a wkly comp to win FA Cup fina...1155Free entry wkly comp win FA Cup final tkts 21s...
3hamU dun say so early hor... U c already then say...049dun say early hor c already say
4hamNah I don't think he goes to usf, he lives aro...061Nah think goes usf lives around though
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":20}]},{"cell_type":"markdown","source":["## Creating the word count after removing stop words and punctuations. We need to represent each message as a vector so that we can perform the ML prediction. (Why?)"],"metadata":{"id":"elDxYRDxA8Jw"}},{"cell_type":"code","source":["from collections import Counter\n","\n","words = sms[sms.label=='ham'].clean_msg.apply(lambda x: [word.lower() for word in x.split()])\n","ham_words = Counter()\n","\n","for msg in words:\n"," ham_words.update(msg)\n"," \n","print(ham_words.most_common(50))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"QzErNBuI_4QU","executionInfo":{"status":"ok","timestamp":1663748817624,"user_tz":-330,"elapsed":19,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"d81ae0f7-6dbc-4f38-bcea-90fffd49b183"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["[('get', 303), ('ltgt', 276), ('ok', 272), ('go', 247), ('ill', 236), ('know', 232), ('got', 231), ('like', 229), ('call', 229), ('come', 224), ('good', 222), ('time', 189), ('day', 187), ('love', 185), ('going', 167), ('want', 163), ('one', 162), ('home', 160), ('lor', 160), ('need', 156), ('sorry', 153), ('still', 146), ('see', 137), ('n', 134), ('later', 134), ('da', 131), ('r', 131), ('back', 129), ('think', 128), ('well', 126), ('today', 125), ('send', 123), ('tell', 121), ('cant', 118), ('ì', 117), ('hi', 117), ('take', 112), ('much', 112), ('oh', 111), ('night', 107), ('hey', 106), ('happy', 105), ('great', 100), ('way', 100), ('hope', 99), ('pls', 98), ('work', 96), ('wat', 95), ('thats', 94), ('dear', 94)]\n"]}]},{"cell_type":"code","source":["words = sms[sms.label=='spam'].clean_msg.apply(lambda x: [word.lower() for word in x.split()])\n","spam_words = Counter()\n","\n","for msg in words:\n"," spam_words.update(msg)\n"," 
\n","print(spam_words.most_common(50))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"V5Xj6E-pBEJt","executionInfo":{"status":"ok","timestamp":1663748817624,"user_tz":-330,"elapsed":15,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"042a4b40-c0ce-464f-d32c-d96bcfd8603e"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["[('call', 347), ('free', 216), ('txt', 150), ('mobile', 123), ('text', 120), ('claim', 113), ('stop', 113), ('reply', 101), ('prize', 92), ('get', 83), ('new', 69), ('send', 67), ('nokia', 65), ('urgent', 63), ('cash', 62), ('win', 60), ('contact', 56), ('service', 55), ('please', 52), ('guaranteed', 50), ('customer', 49), ('16', 49), ('week', 49), ('tone', 48), ('per', 46), ('phone', 45), ('18', 43), ('chat', 42), ('awarded', 38), ('draw', 38), ('latest', 36), ('å£1000', 35), ('line', 35), ('150ppm', 34), ('mins', 34), ('receive', 33), ('camera', 33), ('1', 33), ('every', 33), ('message', 32), ('holiday', 32), ('landline', 32), ('shows', 31), ('å£2000', 31), ('go', 31), ('box', 30), ('number', 30), ('apply', 29), ('code', 29), ('live', 29)]\n"]}]},{"cell_type":"markdown","source":["## Next we will create a vector notation for each message. For that we will use the Bag-of-Words approach that we learnt in Module-11 (data engineering). 
We will use SciKit Learn's CountVectorizer for that."],"metadata":{"id":"g86r3NiuBWtT"}},{"cell_type":"code","source":["# how to define X and y (from the SMS data) for use with COUNTVECTORIZER\n","X = sms.clean_msg\n","y = sms.label_num\n","print(X.shape)\n","print(y.shape)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"xL9ZqWuBBlau","executionInfo":{"status":"ok","timestamp":1663748817624,"user_tz":-330,"elapsed":13,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"18541063-4009-4ff7-8335-43daf7309c2f"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["(5572,)\n","(5572,)\n"]}]},{"cell_type":"code","source":["# split X and y into training and testing sets \n","from sklearn.model_selection import train_test_split\n","X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)\n","print(X_train.shape)\n","print(X_test.shape)\n","print(y_train.shape)\n","print(y_test.shape)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"3h8rdX1lBymv","executionInfo":{"status":"ok","timestamp":1663748817625,"user_tz":-330,"elapsed":12,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"f95cc771-2e70-4e2b-82bf-01c508f9b816"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["(4179,)\n","(1393,)\n","(4179,)\n","(1393,)\n"]}]},{"cell_type":"code","source":["from sklearn.feature_extraction.text import CountVectorizer\n","\n","# instantiate the vectorizer\n","vect = CountVectorizer()\n","vect.fit(X_train)\n","print(vect)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"HfVRecVxB8wn","executionInfo":{"status":"ok","timestamp":1663748848752,"user_tz":-330,"elapsed":515,"user":{"displayName":"Amit Arjun 
Verma","userId":"00059363672616543285"}},"outputId":"9dc1b9c7-3024-4f3e-9120-87d70b999eb9"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["CountVectorizer()\n"]}]},{"cell_type":"code","source":["# learn training data vocabulary, then use it to create a document-term matrix\n","X_train_dtm = vect.transform(X_train)"],"metadata":{"id":"WqUL-HNZCGXu"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# equivalently: combine fit and transform into a single step\n","X_train_dtm = vect.fit_transform(X_train)"],"metadata":{"id":"jDGtB4phCLUM"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# transform testing data (using fitted vocabulary) into a document-term matrix\n","X_test_dtm = vect.transform(X_test)"],"metadata":{"id":"dwBRqhtkCOrM"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["from sklearn.feature_extraction.text import TfidfTransformer\n","\n","tfidf_transformer = TfidfTransformer()\n","tfidf_transformer.fit(X_train_dtm)\n","tfidf_transformer.transform(X_train_dtm)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"qU_SS7MpCWGP","executionInfo":{"status":"ok","timestamp":1663748817627,"user_tz":-330,"elapsed":11,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"88107aa7-5c0a-4f36-fe43-2d6626c97f69"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["<4179x7996 sparse matrix of type ''\n","\twith 34796 stored elements in Compressed Sparse Row format>"]},"metadata":{},"execution_count":29}]},{"cell_type":"markdown","source":["## Building and evaluating the model"],"metadata":{"id":"VJtmmr_5CcnY"}},{"cell_type":"code","source":["# We will first use the Multinomial Naive Bayes model\n","# import and instantiate a Multinomial Naive Bayes model\n","from sklearn.naive_bayes import MultinomialNB\n","nb = 
MultinomialNB()"],"metadata":{"id":"HnvEPIWdCZtB"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# train the model using X_train_dtm (timing it with an IPython \"magic command\")\n","%time nb.fit(X_train_dtm, y_train)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"NDM_Oo4lCpDc","executionInfo":{"status":"ok","timestamp":1663748817627,"user_tz":-330,"elapsed":10,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"e4e394a6-e5f5-4df0-91ba-07899f96e7cd"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["CPU times: user 5.99 ms, sys: 846 µs, total: 6.84 ms\n","Wall time: 8.84 ms\n"]},{"output_type":"execute_result","data":{"text/plain":["MultinomialNB()"]},"metadata":{},"execution_count":31}]},{"cell_type":"code","source":["# make class predictions for X_test_dtm\n","y_pred_class = nb.predict(X_test_dtm)"],"metadata":{"id":"fI045FuyCvsd"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# calculate accuracy of class predictions\n","from sklearn import metrics\n","metrics.accuracy_score(y_test, y_pred_class)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ey-bMtsrC1Eh","executionInfo":{"status":"ok","timestamp":1663748818854,"user_tz":-330,"elapsed":23,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"f5b36a96-d6cd-4bc1-8f5c-6bcbcef80425"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0.9827709978463748"]},"metadata":{},"execution_count":33}]},{"cell_type":"code","source":["# print the confusion matrix\n","metrics.confusion_matrix(y_test, y_pred_class)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"gUB4ar1mC37B","executionInfo":{"status":"ok","timestamp":1663748818854,"user_tz":-330,"elapsed":22,"user":{"displayName":"Amit Arjun 
Verma","userId":"00059363672616543285"}},"outputId":"04480ea2-e4b2-4c6f-8891-ed65965ed663"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[1205, 8],\n"," [ 16, 164]])"]},"metadata":{},"execution_count":34}]},{"cell_type":"code","source":["# print message text for false positives (ham incorrectly classified as spam)\n","# X_test[(y_pred_class==1) & (y_test==0)]\n","X_test[y_pred_class > y_test]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Zwa9SNSYC_Yi","executionInfo":{"status":"ok","timestamp":1663748818855,"user_tz":-330,"elapsed":21,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"d4c2a17c-c0a1-411f-bb79-09fad99f3c58"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["2418 Madamregret disturbancemight receive reference...\n","4598 laid airtel line rest\n","386 Customer place call\n","1289 HeyGreat dealFarm tour 9am 5pm 95pax 50 deposi...\n","5094 Hi ShanilRakhesh herethanksi exchanged uncut d...\n","494 free nowcan call\n","759 Call youcarlos isare phones vibrate acting mig...\n","3140 Customer place call\n","Name: clean_msg, dtype: object"]},"metadata":{},"execution_count":35}]},{"cell_type":"code","source":["# print message text for false negatives (spam incorrectly classified as ham)\n","X_test[y_pred_class < y_test]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"hVwHND2iDHEi","executionInfo":{"status":"ok","timestamp":1663748818855,"user_tz":-330,"elapsed":19,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"92187196-e7f6-4746-8994-97fe59b771ba"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["4674 Hi babe Chloe r smashed saturday night great w...\n","3528 Xmas New Years Eve tickets sale club day 10am ...\n","3417 LIFE never much fun great came made truly spec...\n","2773 come takes little time child afraid dark becom...\n","1960 
Guess Somebody know secretly fancies Wanna fin...\n","5 FreeMsg Hey darling 3 weeks word back Id like ...\n","2078 85233 FREERingtoneReply REAL\n","1457 CLAIRE havin borin time alone wanna cum 2nite ...\n","190 unique enough Find 30th August wwwareyouunique...\n","2429 Guess IThis first time created web page WWWASJ...\n","3057 unsubscribed services Get tons sexy babes hunk...\n","1021 Guess Somebody know secretly fancies Wanna fin...\n","4067 TBSPERSOLVO chasing us since Sept forå£38 defi...\n","3358 Sorry missed call lets talk time 07090201529\n","2821 ROMCAPspam Everyone around responding well pre...\n","2247 Back work 2morro half term C 2nite sexy passio...\n","Name: clean_msg, dtype: object"]},"metadata":{},"execution_count":36}]},{"cell_type":"code","source":["# calculate predicted probabilities for X_test_dtm (poorly calibrated)\n","y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]\n","\n","# calculate AUC\n","metrics.roc_auc_score(y_test, y_pred_prob)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"oOHAYN6kDLL9","executionInfo":{"status":"ok","timestamp":1663748818855,"user_tz":-330,"elapsed":17,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"a77daf27-7872-4b59-b768-38e9e1a656a5"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0.9774342768159751"]},"metadata":{},"execution_count":37}]},{"cell_type":"code","source":["from sklearn.feature_extraction.text import TfidfTransformer\n","from sklearn.pipeline import Pipeline\n","\n","pipe = Pipeline([('bow', CountVectorizer()), \n"," ('tfid', TfidfTransformer()), \n"," ('model', MultinomialNB())])\n","pipe.fit(X_train, y_train)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"aV8PZWMeDbAM","executionInfo":{"status":"ok","timestamp":1663749704897,"user_tz":-330,"elapsed":726,"user":{"displayName":"Amit Arjun 
Verma","userId":"00059363672616543285"}},"outputId":"870684a5-27ad-482e-8991-c0ba68d136a7"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["Pipeline(steps=[('bow', CountVectorizer()), ('tfid', TfidfTransformer()),\n"," ('model', MultinomialNB())])"]},"metadata":{},"execution_count":49}]},{"cell_type":"code","source":["y_pred = pipe.predict(X_test)"],"metadata":{"id":"sOLETGjrFDcj"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["pipe.predict([\"you have won a prize\"])"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Y01GdlBhG2cc","executionInfo":{"status":"ok","timestamp":1663748818856,"user_tz":-330,"elapsed":15,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"ea5b86fc-f36a-48e5-ef68-dc2a9aa0517c"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([1])"]},"metadata":{},"execution_count":40}]},{"cell_type":"code","source":["metrics.accuracy_score(y_test, y_pred)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"o-zyrGbjFGp_","executionInfo":{"status":"ok","timestamp":1663748818856,"user_tz":-330,"elapsed":13,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"64e3ef6e-7ba5-4c09-adb7-f5aa2302948b"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0.9669777458722182"]},"metadata":{},"execution_count":41}]},{"cell_type":"code","source":["metrics.confusion_matrix(y_test, y_pred)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"6dAVFnxGFLGG","executionInfo":{"status":"ok","timestamp":1663748818856,"user_tz":-330,"elapsed":12,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"587f5e95-3f11-41be-c569-5d1e0396d28b"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[1213, 0],\n"," [ 46, 
134]])"]},"metadata":{},"execution_count":42}]},{"cell_type":"code","source":["X_test"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"38nMmyUXFzvt","executionInfo":{"status":"ok","timestamp":1663748818857,"user_tz":-330,"elapsed":12,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"cd62db05-94fb-489c-f7e9-de3a488d1d5e"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["1078 Convey regards\n","4028 ‰Û anyway many good evenings\n","958 sort code acc bank natwest reply confirm ive s...\n","4642 Sorry din lock keypad\n","4674 Hi babe Chloe r smashed saturday night great w...\n"," ... \n","3207 Oops phone died didnt even know Yeah like better\n","4655 K Ill work something\n","1140 Ohas usual vijay film different\n","1793 bad girl still remember\n","1710 promises though havent even gotten dinner yet\n","Name: clean_msg, Length: 1393, dtype: object"]},"metadata":{},"execution_count":43}]},{"cell_type":"code","source":["y_pred"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"n9DLYH0HF0Lz","executionInfo":{"status":"ok","timestamp":1663748818857,"user_tz":-330,"elapsed":11,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"1fa38ca8-0934-4d54-8252-c4a49730a00c"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([0, 0, 0, ..., 0, 0, 0])"]},"metadata":{},"execution_count":44}]},{"cell_type":"code","source":["import joblib\n","joblib.dump(pipe, '/content/drive/MyDrive/guvi/project-proposal/final-project/model.joblib')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"GKIxX6TidiAm","executionInfo":{"status":"ok","timestamp":1663748819792,"user_tz":-330,"elapsed":942,"user":{"displayName":"Amit Arjun 
Verma","userId":"00059363672616543285"}},"outputId":"276653a1-cf8c-436c-d57d-b10154111d76"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["['/content/drive/MyDrive/guvi/project-proposal/final-project/model.joblib']"]},"metadata":{},"execution_count":45}]},{"cell_type":"code","source":["my_model = joblib.load('/content/drive/MyDrive/guvi/project-proposal/final-project/model.joblib')\n","my_model.predict([\"You won a prize\"])"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"zAMt3aXaeYPq","executionInfo":{"status":"ok","timestamp":1663748819793,"user_tz":-330,"elapsed":7,"user":{"displayName":"Amit Arjun Verma","userId":"00059363672616543285"}},"outputId":"a138aac6-f282-4859-8da1-27bd2327cbcb"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([1])"]},"metadata":{},"execution_count":46}]}]} --------------------------------------------------------------------------------