├── README.md └── QuoraInsincere.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # DMW-Quora-Insincere-Questions-Classifications 2 | DMW mini project Quora Insincere Questions Classification 3 | -------------------------------------------------------------------------------- /QuoraInsincere.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load in \n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the \"../input/\" directory.\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# Any results you write to the current directory are saved as 
output.","execution_count":1,"outputs":[{"output_type":"stream","text":"/kaggle/input/quora-insincere-questions-classification/train.csv\n/kaggle/input/quora-insincere-questions-classification/sample_submission.csv\n/kaggle/input/quora-insincere-questions-classification/test.csv\n/kaggle/input/quora-insincere-questions-classification/embeddings/paragram_300_sl999/paragram_300_sl999.txt\n/kaggle/input/quora-insincere-questions-classification/embeddings/paragram_300_sl999/README.txt\n/kaggle/input/quora-insincere-questions-classification/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec\n/kaggle/input/quora-insincere-questions-classification/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin\n/kaggle/input/quora-insincere-questions-classification/embeddings/glove.840B.300d/glove.840B.300d.txt\n","name":"stdout"}]},{"metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","trusted":true},"cell_type":"code","source":"import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport re\nimport nltk\nnltk.download('stopwords')\nnltk.download('punkt')\nnltk.download('wordnet')","execution_count":2,"outputs":[{"output_type":"stream","text":"[nltk_data] Downloading package stopwords to /usr/share/nltk_data...\n[nltk_data] Package stopwords is already up-to-date!\n[nltk_data] Downloading package punkt to /usr/share/nltk_data...\n[nltk_data] Package punkt is already up-to-date!\n[nltk_data] Downloading package wordnet to /usr/share/nltk_data...\n[nltk_data] Package wordnet is already up-to-date!\n","name":"stdout"},{"output_type":"execute_result","execution_count":2,"data":{"text/plain":"True"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"import seaborn as sns\nimport string\nimport warnings \nwarnings.filterwarnings(\"ignore\", category=DeprecationWarning)\n\n%matplotlib 
inline","execution_count":3,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"import spacy\nfrom collections import Counter","execution_count":4,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"import pickle","execution_count":5,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"train_df = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/train.csv')\ntest_df = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/test.csv')","execution_count":6,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"train_df.head()","execution_count":7,"outputs":[{"output_type":"execute_result","execution_count":7,"data":{"text/plain":" qid question_text \\\n0 00002165364db923c7e6 How did Quebec nationalists see their province... \n1 000032939017120e6e44 Do you have an adopted dog, how would you enco... \n2 0000412ca6e4628ce2cf Why does velocity affect time? Does velocity a... \n3 000042bf85aa498cd78e How did Otto von Guericke used the Magdeburg h... \n4 0000455dfa3e01eae3af Can I convert montra helicon D to a mountain b... \n\n target \n0 0 \n1 0 \n2 0 \n3 0 \n4 0 ","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
qidquestion_texttarget
000002165364db923c7e6How did Quebec nationalists see their province...0
1000032939017120e6e44Do you have an adopted dog, how would you enco...0
20000412ca6e4628ce2cfWhy does velocity affect time? Does velocity a...0
3000042bf85aa498cd78eHow did Otto von Guericke used the Magdeburg h...0
40000455dfa3e01eae3afCan I convert montra helicon D to a mountain b...0
\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"test_df.head()","execution_count":8,"outputs":[{"output_type":"execute_result","execution_count":8,"data":{"text/plain":" qid question_text\n0 0000163e3ea7c7a74cd7 Why do so many women become so rude and arroga...\n1 00002bd4fb5d505b9161 When should I apply for RV college of engineer...\n2 00007756b4a147d2b0b3 What is it really like to be a nurse practitio...\n3 000086e4b7e1c7146103 Who are entrepreneurs?\n4 0000c4c3fbe8785a3090 Is education really making good people nowadays?","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
qidquestion_text
00000163e3ea7c7a74cd7Why do so many women become so rude and arroga...
100002bd4fb5d505b9161When should I apply for RV college of engineer...
200007756b4a147d2b0b3What is it really like to be a nurse practitio...
3000086e4b7e1c7146103Who are entrepreneurs?
40000c4c3fbe8785a3090Is education really making good people nowadays?
\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"train_df['target'].value_counts()","execution_count":9,"outputs":[{"output_type":"execute_result","execution_count":9,"data":{"text/plain":"0 1225312\n1 80810\nName: target, dtype: int64"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"print('Average word length of questions in train is {0:.0f}.'.format(np.mean(train_df['question_text'].apply(lambda x: len(x.split())))))\nprint('Average word length of questions in test is {0:.0f}.'.format(np.mean(test_df['question_text'].apply(lambda x: len(x.split())))))","execution_count":10,"outputs":[{"output_type":"stream","text":"Average word length of questions in train is 13.\nAverage word length of questions in test is 13.\n","name":"stdout"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"print('Max word length of questions in train is {0:.0f}.'.format(np.max(train_df['question_text'].apply(lambda x: len(x.split())))))\nprint('Max word length of questions in test is {0:.0f}.'.format(np.max(test_df['question_text'].apply(lambda x: len(x.split())))))","execution_count":11,"outputs":[{"output_type":"stream","text":"Max word length of questions in train is 134.\nMax word length of questions in test is 87.\n","name":"stdout"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"print('Average character length of questions in train is {0:.0f}.'.format(np.mean(train_df['question_text'].apply(lambda x: len(x)))))\nprint('Average character length of questions in test is {0:.0f}.'.format(np.mean(test_df['question_text'].apply(lambda x: len(x)))))","execution_count":12,"outputs":[{"output_type":"stream","text":"Average character length of questions in train is 71.\nAverage character length of questions in test is 71.\n","name":"stdout"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"# using seaborns countplot to show distribution of questions in dataset\nfig, ax = plt.subplots()\ng = 
sns.countplot(train_df.target, palette='viridis')\ng.set_xticklabels(['Sincere', 'Insincere'])\ng.set_yticklabels([])\n\n# function to show values on bars\ndef show_values_on_bars(axs):\n def _show_on_single_plot(ax): \n for p in ax.patches:\n _x = p.get_x() + p.get_width() / 2\n _y = p.get_y() + p.get_height()\n value = '{:.0f}'.format(p.get_height())\n ax.text(_x, _y, value, ha=\"center\") \n\n if isinstance(axs, np.ndarray):\n for idx, ax in np.ndenumerate(axs):\n _show_on_single_plot(ax)\n else:\n _show_on_single_plot(axs)\nshow_values_on_bars(ax)\n\nsns.despine(left=True, bottom=True)\nplt.xlabel('')\nplt.ylabel('')\nplt.title('Distribution of Questions', fontsize=30)\nplt.tick_params(axis='x', which='major', labelsize=15)\nfig.savefig('classes.png')\nplt.show()","execution_count":13,"outputs":[{"output_type":"display_data","data":{"text/plain":"
","image/png":"\n"},"metadata":{"needs_background":"light"}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"# insincere questions (target == 1) per 100 sincere questions (target == 0); this is a class ratio, not the share of all questions\n(len(train_df.loc[train_df.target==1])) / (len(train_df.loc[train_df.target == 0])) * 100","execution_count":14,"outputs":[{"output_type":"execute_result","execution_count":14,"data":{"text/plain":"6.595054973753624"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"X, y = train_df['question_text'], train_df['target']","execution_count":15,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df1 = train_df.sample(frac =.3) ","execution_count":16,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# using seaborns countplot to show distribution of questions in dataset\nfig, ax = plt.subplots()\ng = sns.countplot(df1.target, palette='viridis')\ng.set_xticklabels(['Sincere', 'Insincere'])\ng.set_yticklabels([])\n\n# function to show values on bars\ndef show_values_on_bars(axs):\n def _show_on_single_plot(ax): \n for p in ax.patches:\n _x = p.get_x() + p.get_width() / 2\n _y = p.get_y() + p.get_height()\n value = '{:.0f}'.format(p.get_height())\n ax.text(_x, _y, value, ha=\"center\") \n\n if isinstance(axs, np.ndarray):\n for idx, ax in np.ndenumerate(axs):\n _show_on_single_plot(ax)\n else:\n _show_on_single_plot(axs)\nshow_values_on_bars(ax)\n\nsns.despine(left=True, bottom=True)\nplt.xlabel('')\nplt.ylabel('')\nplt.title('Distribution of Questions (30% of train data)', fontsize=30)\nplt.tick_params(axis='x', which='major', labelsize=15)\nfig.savefig('classes.png')\nplt.show()","execution_count":17,"outputs":[{"output_type":"display_data","data":{"text/plain":"
","image/png":"\n"},"metadata":{"needs_background":"light"}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"from imblearn.over_sampling import SMOTE","execution_count":18,"outputs":[{"output_type":"stream","text":"Using TensorFlow backend.\n","name":"stderr"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"from nltk import word_tokenize\nfrom nltk.stem import WordNetLemmatizer","execution_count":19,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"#Lemmatizer\nclass LemmaTokenizer(object):\n def __init__(self):\n self.wnl = WordNetLemmatizer()\n def __call__(self, doc):\n return [self.wnl.lemmatize(t) for t in \n word_tokenize(doc)]","execution_count":20,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from sklearn.feature_extraction.text import TfidfVectorizer","execution_count":21,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"vectorizer =TfidfVectorizer(stop_words='english',tokenizer=LemmaTokenizer())","execution_count":22,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"x = vectorizer.fit_transform(df1['question_text'])","execution_count":23,"outputs":[{"output_type":"stream","text":"/opt/conda/lib/python3.6/site-packages/sklearn/feature_extraction/text.py:300: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['ha', 'le', 'u', 'wa'] not in stop_words.\n 'stop_words.' 
% sorted(inconsistent))\n","name":"stderr"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"#Minority oversampling with SMOTE. NOTE(review): this resamples all of df1 before the train/test split that follows, so synthetic minority rows can end up in the test set and inflate the scores - confirm and consider resampling only the training split\nsm = SMOTE(random_state=42)\nx,y = sm.fit_sample(x,df1['target'])","execution_count":24,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# using seaborn's countplot to show the class distribution of y after SMOTE oversampling\nfig, ax = plt.subplots()\ng = sns.countplot(y, palette='viridis')\ng.set_xticklabels(['Sincere', 'Insincere'])\ng.set_yticklabels([])\n\n# function to show values on bars\ndef show_values_on_bars(axs):\n def _show_on_single_plot(ax): \n for p in ax.patches:\n _x = p.get_x() + p.get_width() / 2\n _y = p.get_y() + p.get_height()\n value = '{:.0f}'.format(p.get_height())\n ax.text(_x, _y, value, ha=\"center\") \n\n if isinstance(axs, np.ndarray):\n for idx, ax in np.ndenumerate(axs):\n _show_on_single_plot(ax)\n else:\n _show_on_single_plot(axs)\nshow_values_on_bars(ax)\n\nsns.despine(left=True, bottom=True)\nplt.xlabel('')\nplt.ylabel('')\nplt.title('Distribution of Questions after sampling', fontsize=30)\nplt.tick_params(axis='x', which='major', labelsize=15)\nfig.savefig('classes.png')\nplt.show()","execution_count":25,"outputs":[{"output_type":"display_data","data":{"text/plain":"
","image/png":"\n"},"metadata":{"needs_background":"light"}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"from sklearn.model_selection import train_test_split","execution_count":26,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=5)","execution_count":27,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from sklearn.svm import LinearSVC\nmodel = LinearSVC(random_state=42,tol=5,fit_intercept=False)\nmodel.fit(x_train,y_train)\nsvcpredictions = model.predict(x_test)","execution_count":30,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,confusion_matrix\nmnbaccuracy = accuracy_score(y_test, svcpredictions)\nprint('Confusion Matrix:',format(confusion_matrix(y_test,svcpredictions)))\nprint('Accuracy score: ', format(accuracy_score(y_test, svcpredictions)))\nprint('Precision score: ', format(precision_score(y_test, svcpredictions)))\nprint('Recall score: ', format(recall_score(y_test, svcpredictions)))\nprint('F1 score: ', format(f1_score(y_test, svcpredictions)))","execution_count":31,"outputs":[{"output_type":"stream","text":"Confusion Matrix: [[83430 8343]\n [ 3346 88651]]\nAccuracy score: 0.9363933177341242\nPrecision score: 0.9139843701672269\nRecall score: 0.9636292487798516\nF1 score: 0.9381504939388648\n","name":"stdout"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"labels = ['Sincere', 'Insincere']\ncm = confusion_matrix(y_test, svcpredictions)\nprint(cm)\nfig = plt.figure()\nax = fig.add_subplot(111)\ncax = ax.matshow(cm)\nplt.title('Confusion matrix of the classifier')\nfig.colorbar(cax)\nax.set_xticklabels([''] + labels)\nax.set_yticklabels([''] + labels)\nplt.xlabel('Predicted')\nplt.ylabel('True')\nplt.show()","execution_count":39,"outputs":[{"output_type":"stream","text":"[[83430 8343]\n [ 3346 
88651]]\n","name":"stdout"},{"output_type":"display_data","data":{"text/plain":"
","image/png":"\n"},"metadata":{"needs_background":"light"}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"from sklearn.naive_bayes import MultinomialNB\nnaive_bayes = MultinomialNB()\nnaive_bayes.fit(x_train,y_train)","execution_count":32,"outputs":[{"output_type":"execute_result","execution_count":32,"data":{"text/plain":"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"predictions = naive_bayes.predict(x_test)","execution_count":33,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,confusion_matrix\nmnbaccuracy = accuracy_score(y_test, predictions)\nprint('Confusion Matrix:',format(confusion_matrix(y_test,predictions)))\nprint('Accuracy score: ', format(accuracy_score(y_test, predictions)))\nprint('Precision score: ', format(precision_score(y_test, predictions)))\nprint('Recall score: ', format(recall_score(y_test, predictions)))\nprint('F1 score: ', format(f1_score(y_test, predictions)))","execution_count":34,"outputs":[{"output_type":"stream","text":"Confusion Matrix: [[75833 15940]\n [ 3766 88231]]\nAccuracy score: 0.8927681340806443\nPrecision score: 0.8469823655335939\nRecall score: 0.9590638825179082\nF1 score: 0.8995452877125729\n","name":"stdout"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"from sklearn.metrics import confusion_matrix\n\nlabels = ['Sincere', 'Insincere']\ncm = confusion_matrix(y_test, predictions)\nprint(cm)\nfig = plt.figure()\nax = fig.add_subplot(111)\ncax = ax.matshow(cm)\nplt.title('Confusion matrix of the classifier')\nfig.colorbar(cax)\nax.set_xticklabels([''] + labels)\nax.set_yticklabels([''] + labels)\nplt.xlabel('Predicted')\nplt.ylabel('True')\nplt.show()","execution_count":35,"outputs":[{"output_type":"stream","text":"[[75833 15940]\n [ 3766 
88231]]\n","name":"stdout"},{"output_type":"display_data","data":{"text/plain":"
","image/png":"\n"},"metadata":{"needs_background":"light"}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"from sklearn.linear_model import LogisticRegression\nlr = LogisticRegression()\nlr.fit(x_train,y_train)\nlrpredicted = lr.predict(x_test)","execution_count":36,"outputs":[{"output_type":"stream","text":"/opt/conda/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n FutureWarning)\n","name":"stderr"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,confusion_matrix\nlr_accuracy = accuracy_score(lrpredicted,y_test )\nprint('Confusion Matrix:',format(confusion_matrix(y_test,lrpredicted)))\nprint('Accuracy score: ', format(accuracy_score(lrpredicted,y_test )))\nprint('Precision score: ', format(precision_score(y_test,lrpredicted)))\nprint('Recall score: ', format(recall_score(y_test, lrpredicted)))\nprint('F1 score: ', format(f1_score(y_test, lrpredicted)))","execution_count":37,"outputs":[{"output_type":"stream","text":"Confusion Matrix: [[83457 8316]\n [ 4831 87166]]\nAccuracy score: 0.9284594874027317\nPrecision score: 0.9129050501665236\nRecall score: 0.9474874180679804\nF1 score: 0.929874812645683\n","name":"stdout"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"labels = ['Sincere', 'Insincere']\ncm = confusion_matrix(y_test, lrpredicted)\nprint(cm)\nfig = plt.figure()\nax = fig.add_subplot(111)\ncax = ax.matshow(cm)\nplt.title('Confusion matrix of the classifier')\nfig.colorbar(cax)\nax.set_xticklabels([''] + labels)\nax.set_yticklabels([''] + labels)\nplt.xlabel('Predicted')\nplt.ylabel('True')\nplt.show()","execution_count":38,"outputs":[{"output_type":"stream","text":"[[83457 8316]\n [ 4831 87166]]\n","name":"stdout"},{"output_type":"display_data","data":{"text/plain":"
","image/png":"\n"},"metadata":{"needs_background":"light"}}]}],"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat":4,"nbformat_minor":1} --------------------------------------------------------------------------------