├── requirements.txt
├── install.sh
├── get_data.sh
├── README.md
├── nbsvm.py
├── translate.ipynb
├── visuals.py
├── convai_feature.ipynb
├── super_nbsvm.ipynb
├── badwords.ipynb
├── add_covaai.ipynb
├── RIDGE.ipynb
├── Untitled.ipynb
├── fasttext_direct.ipynb
└── LGBM_LOGREG_XGB_STACK_LOGREG.ipynb
/requirements.txt:
--------------------------------------------------------------------------------
1 | keras
2 | tensorflow-gpu
3 | splitter
4 | gensim
5 | nltk
6 | jupyter
7 | pyenchant
8 |
--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | sudo apt-get install cython htop
2 | wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.zip
3 | git clone https://github.com/facebookresearch/fastText.git
4 | cd fastText
5 | pip install .
6 |
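7 | # The pretrained vectors arrive as wiki.en.zip (containing wiki.en.vec and wiki.en.bin);
8 | # extract them before running the embedding notebooks, e.g.:
9 | # unzip wiki.en.zip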
--------------------------------------------------------------------------------
/get_data.sh:
--------------------------------------------------------------------------------
1 | curl -L -o final_train.csv "https://drive.google.com/uc?export=download&id=19WFLluCA0YNXLOAOVwN4iKul83-AXpPv"
2 | curl -L -o test.csv "https://drive.google.com/uc?export=download&id=19WFLluCA0YNXLOAOVwN4iKul83-AXpPv"
3 |
4 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Toxic Comment Classification
2 |
3 | This is my code for the Toxic Comment Classification Challenge hosted on [Kaggle](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge). It is heavily modified from the base code [here](https://github.com/conversationai/unintended-ml-bias-analysis/tree/master/unintended_ml_bias).
4 |
5 |
6 | To download the datasets, run `get_data.sh`.
7 | ## The Task
8 | The dataset comprises comments from Wikipedia’s talk page edits: a large collection of comments that have been labeled by human raters for toxic behavior. The types of toxicity are:
9 |
10 | > * `toxic`
11 | > * `severe_toxic`
12 | > * `obscene`
13 | > * `threat`
14 | > * `insult`
15 | > * `identity_hate`
16 |
17 |
18 | ## The Approach
19 |
20 | Build an ensemble model that predicts the probability of each type of toxicity for each comment. A full explanation of my approach is documented [here](https://medium.com/@dickson_chin93/my-solution-to-achieve-top-1-in-a-novel-data-science-nlp-competition-db8db2ee356a).
21 |
22 |
23 |
24 | ## Install Pre-requisites
25 |
26 | Run `install.sh`, then install the Python dependencies with
27 | `pip install -r requirements.txt`
28 |
29 | ## Tips
30 |
31 | - Make sure the same preprocessing that was used to build the pretrained embeddings is applied to your text, so that the highest possible percentage of tokens can be matched to an embedding vector (see the coverage-check sketch below)
32 |
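33 | As a rough sketch (Python 3), assuming the pretrained `wiki.en.vec` (from `install.sh`) and a training CSV with a `comment_text` column (e.g. `final_train.csv` from `get_data.sh`) are in the working directory, the following estimates what fraction of comment tokens can be matched to a fastText vector, using naive whitespace tokenization:
34 |
35 | ```python
36 | import pandas as pd
37 |
38 | # Build the embedding vocabulary from the fastText text-format vectors.
39 | with open('wiki.en.vec', encoding='utf-8') as f:
40 |     next(f)  # skip the "<count> <dim>" header line
41 |     emb_vocab = {line.split(' ', 1)[0] for line in f}
42 |
43 | # Tokenize the training comments naively and measure token coverage.
44 | texts = pd.read_csv('final_train.csv')['comment_text'].fillna('unknown')
45 | tokens = [w for t in texts for w in t.split()]
46 | covered = sum(w in emb_vocab for w in tokens)
47 | print('token coverage: %.1f%%' % (100.0 * covered / len(tokens)))
48 | ```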
--------------------------------------------------------------------------------
/nbsvm.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.base import BaseEstimator, ClassifierMixin
3 | from sklearn.utils.validation import check_X_y, check_is_fitted
4 | from sklearn.linear_model import LogisticRegression
5 | from scipy import sparse
5 | class NbSvmClassifier(BaseEstimator, ClassifierMixin):
6 | def __init__(self, C=1.0, dual=False, n_jobs=1):
7 | self.C = C
8 | self.dual = dual
9 | self.n_jobs = n_jobs
10 |
11 | def predict(self, x):
12 | # Verify that model has been fit
13 | check_is_fitted(self, ['_r', '_clf'])
14 | return self._clf.predict(x.multiply(self._r))
15 |
16 | def predict_proba(self, x):
17 | # Verify that model has been fit
18 | check_is_fitted(self, ['_r', '_clf'])
19 | return self._clf.predict_proba(x.multiply(self._r))
20 |
21 | def fit(self, x, y):
22 | # Check that X and y have correct shape
23 |         y = y.values if hasattr(y, 'values') else y  # accept a pandas Series or a plain array
24 | x, y = check_X_y(x, y, accept_sparse=True)
25 |
26 | def pr(x, y_i, y):
27 | p = x[y==y_i].sum(0)
28 | return (p+1) / ((y==y_i).sum()+1)
29 |
30 | self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
31 | x_nb = x.multiply(self._r)
32 | self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
33 | return self
34 |
35 | #EXAMPLE USAGE
36 | #model = NbSvmClassifier(C=4, dual=True, n_jobs=-1).fit(training_features, training_labels)
37 |
38 |
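39 | # A fuller, illustrative sketch (variable names are placeholders): assuming `train_term_doc`
40 | # and `test_term_doc` are sparse TF-IDF matrices and `train['toxic']` is a 0/1 pandas Series:
41 | # model = NbSvmClassifier(C=4, dual=True, n_jobs=-1)
42 | # model.fit(train_term_doc, train['toxic'])
43 | # test_probs = model.predict_proba(test_term_doc)[:, 1]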
--------------------------------------------------------------------------------
/translate.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from preprocessing import split_train_data, translate_data"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 2,
15 | "metadata": {
16 | "scrolled": true
17 | },
18 | "outputs": [
19 | {
20 | "name": "stdout",
21 | "output_type": "stream",
22 | "text": [
23 | "Requirement already satisfied: translation in /home/stgc/anaconda2/lib/python2.7/site-packages\r\n",
24 | "Requirement already satisfied: requests in /home/stgc/anaconda2/lib/python2.7/site-packages (from translation)\r\n"
25 | ]
26 | }
27 | ],
28 | "source": [
29 | "!pip install translation"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 2,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "translate_data(\"data\", filename='sp_check_train.csv', filename_translated='train_translated_sp.csv')"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 3,
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "translate_data(\"data\", filename='sp_check_test.csv', filename_translated='test_translated_sp.csv')"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": []
56 | }
57 | ],
58 | "metadata": {
59 | "kernelspec": {
60 | "display_name": "Python 2",
61 | "language": "python",
62 | "name": "python2"
63 | },
64 | "language_info": {
65 | "codemirror_mode": {
66 | "name": "ipython",
67 | "version": 2
68 | },
69 | "file_extension": ".py",
70 | "mimetype": "text/x-python",
71 | "name": "python",
72 | "nbconvert_exporter": "python",
73 | "pygments_lexer": "ipython2",
74 | "version": "2.7.14"
75 | }
76 | },
77 | "nbformat": 4,
78 | "nbformat_minor": 2
79 | }
80 |
--------------------------------------------------------------------------------
/visuals.py:
--------------------------------------------------------------------------------
1 | ###########################################
2 | # Suppress matplotlib user warnings
3 | # Necessary for newer version of matplotlib
4 | import warnings
5 | warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib")
6 | #
7 | # Display inline matplotlib plots with IPython
8 | from IPython import get_ipython
9 | get_ipython().run_line_magic('matplotlib', 'inline')
10 | ###########################################
11 |
12 | import matplotlib.pyplot as pl
13 | import matplotlib.patches as mpatches
14 | import numpy as np
15 | import pandas as pd
16 | from time import time
17 | from sklearn.metrics import f1_score, accuracy_score
18 |
19 |
20 | def distribution(data, transformed = False):
21 | """
22 | Visualization code for displaying skewed distributions of features
23 | """
24 |
25 | # Create figure
26 | fig = pl.figure(figsize = (11,5));
27 |
28 | # Skewed feature plotting
29 | for i, feature in enumerate(['capitals','num_unique_words']):
30 | ax = fig.add_subplot(1, 2, i+1)
31 | ax.hist(data[feature], bins = 25, color = '#00A0A0')
32 | ax.set_title("'%s' Feature Distribution"%(feature), fontsize = 14)
33 | ax.set_xlabel("Value")
34 | ax.set_ylabel("Number of Records")
35 | ax.set_ylim((0, 2000))
36 | ax.set_yticks([0, 500, 1000, 1500, 2000])
37 | ax.set_yticklabels([0, 500, 1000, 1500, ">2000"])
38 |
39 | # Plot aesthetics
40 | if transformed:
41 |         fig.suptitle("Log-transformed Distributions of Continuous Comment Features", \
42 | fontsize = 16, y = 1.03)
43 | else:
44 |         fig.suptitle("Skewed Distributions of Continuous Comment Features", \
45 | fontsize = 16, y = 1.03)
46 |
47 | fig.tight_layout()
48 | fig.show()
49 |
50 |
51 | def evaluate(results, accuracy, f1):
52 | """
53 | Visualization code to display results of various learners.
54 |
55 | inputs:
56 | - learners: a list of supervised learners
57 | - stats: a list of dictionaries of the statistic results from 'train_predict()'
58 | - accuracy: The score for the naive predictor
59 | - f1: The score for the naive predictor
60 | """
61 |
62 | # Create figure
63 | fig, ax = pl.subplots(2, 3, figsize = (11,7))
64 |
65 | # Constants
66 | bar_width = 0.3
67 | colors = ['#A00000','#00A0A0','#00A000']
68 |
69 | # Super loop to plot four panels of data
70 | for k, learner in enumerate(results.keys()):
71 | for j, metric in enumerate(['train_time', 'acc_train', 'f_train', 'pred_time', 'acc_test', 'f_test']):
72 | for i in np.arange(3):
73 |
74 | # Creative plot code
75 |                 ax[j//3, j%3].bar(i+k*bar_width, results[learner][i][metric], width = bar_width, color = colors[k])
76 |                 ax[j//3, j%3].set_xticks([0.45, 1.45, 2.45])
77 |                 ax[j//3, j%3].set_xticklabels(["1%", "10%", "100%"])
78 |                 ax[j//3, j%3].set_xlabel("Training Set Size")
79 |                 ax[j//3, j%3].set_xlim((-0.1, 3.0))
80 |
81 | # Add unique y-labels
82 | ax[0, 0].set_ylabel("Time (in seconds)")
83 | ax[0, 1].set_ylabel("Accuracy Score")
84 | ax[0, 2].set_ylabel("F-score")
85 | ax[1, 0].set_ylabel("Time (in seconds)")
86 | ax[1, 1].set_ylabel("Accuracy Score")
87 | ax[1, 2].set_ylabel("F-score")
88 |
89 | # Add titles
90 | ax[0, 0].set_title("Model Training")
91 | ax[0, 1].set_title("Accuracy Score on Training Subset")
92 | ax[0, 2].set_title("F-score on Training Subset")
93 | ax[1, 0].set_title("Model Predicting")
94 | ax[1, 1].set_title("Accuracy Score on Testing Set")
95 | ax[1, 2].set_title("F-score on Testing Set")
96 |
97 | # Add horizontal lines for naive predictors
98 | ax[0, 1].axhline(y = accuracy, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
99 | ax[1, 1].axhline(y = accuracy, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
100 | ax[0, 2].axhline(y = f1, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
101 | ax[1, 2].axhline(y = f1, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
102 |
103 | # Set y-limits for score panels
104 | ax[0, 1].set_ylim((0, 1))
105 | ax[0, 2].set_ylim((0, 1))
106 | ax[1, 1].set_ylim((0, 1))
107 | ax[1, 2].set_ylim((0, 1))
108 |
109 | # Create patches for the legend
110 | patches = []
111 | for i, learner in enumerate(results.keys()):
112 | patches.append(mpatches.Patch(color = colors[i], label = learner))
113 | pl.legend(handles = patches, bbox_to_anchor = (-.80, 2.53), \
114 | loc = 'upper center', borderaxespad = 0., ncol = 3, fontsize = 'x-large')
115 |
116 | # Aesthetics
117 | pl.suptitle("Performance Metrics for Three Supervised Learning Models", fontsize = 16, y = 1.10)
118 | pl.tight_layout()
119 | pl.show()
120 |
121 |
122 | def feature_plot(importances, X_train, y_train):
123 |
124 | # Display the five most important features
125 | indices = np.argsort(importances)[::-1]
126 | columns = X_train.columns.values[indices[:5]]
127 | values = importances[indices][:5]
128 |
129 |     # Create the plot
130 | fig = pl.figure(figsize = (9,5))
131 | pl.title("Normalized Weights for First Five Most Predictive Features", fontsize = 16)
132 | pl.bar(np.arange(5), values, width = 0.6, align="center", color = '#00A000', \
133 | label = "Feature Weight")
134 | pl.bar(np.arange(5) - 0.3, np.cumsum(values), width = 0.2, align = "center", color = '#00A0A0', \
135 | label = "Cumulative Feature Weight")
136 | pl.xticks(np.arange(5), columns)
137 | pl.xlim((-0.5, 4.5))
138 | pl.ylabel("Weight", fontsize = 12)
139 | pl.xlabel("Feature", fontsize = 12)
140 |
141 | pl.legend(loc = 'upper center')
142 | pl.tight_layout()
143 | pl.show()
144 |
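145 | # Illustrative shape of the `results` argument that evaluate() expects (names and values
146 | # are placeholders): one entry per learner, indexed 0-2 by training-set size, each holding
147 | # the six metrics read above.
148 | # results = {
149 | #     'LogisticRegression': {
150 | #         0: {'train_time': ..., 'acc_train': ..., 'f_train': ...,
151 | #             'pred_time': ..., 'acc_test': ..., 'f_test': ...},
152 | #         1: {...}, 2: {...}},
153 | # }
154 | # evaluate(results, accuracy=naive_accuracy, f1=naive_f1)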
--------------------------------------------------------------------------------
/convai_feature.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "metadata": {
5 | "_cell_guid": "59afa460-67bb-4083-8f64-4e8bdc7d688e",
6 | "_uuid": "e1f3247609398721b4c6b37da205be55e341b899"
7 | },
8 | "cell_type": "markdown",
9 | "source": "This is a basic LogisticRegression model trained using the data from https://www.kaggle.com/eoveson/convai-datasets-baseline-models\n\nThe baseline model in that kernal is tuned a little to get the data for this kernal This kernal scored 0.044 in the LB"
10 | },
11 | {
12 | "metadata": {
13 | "_cell_guid": "eb9acbb1-40db-4a60-9c00-7e1134408cb1",
14 | "_uuid": "7e97dad72af19207237cb816bc898ca5818f4389",
15 | "collapsed": true,
16 | "trusted": false
17 | },
18 | "cell_type": "code",
19 | "source": "# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load in \n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the \"../input/\" directory.\n# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom scipy import sparse\n# set stopwords\n\nfrom subprocess import check_output\nprint(check_output([\"ls\", \"../input\"]).decode(\"utf8\"))\n\n# Any results you write to the current directory are saved as output.",
20 | "execution_count": null,
21 | "outputs": []
22 | },
23 | {
24 | "metadata": {
25 | "_cell_guid": "bb967e03-d30b-46ec-b9d2-c0f5d4c0ee68",
26 | "_uuid": "97b399586c43626b73bc77b50e58b952d86ea8da",
27 | "collapsed": true,
28 | "trusted": false
29 | },
30 | "cell_type": "code",
31 | "source": "train = pd.read_csv('../input/dataset/train_with_convai.csv')\ntest = pd.read_csv('../input/dataset/test_with_convai.csv')\n",
32 | "execution_count": null,
33 | "outputs": []
34 | },
35 | {
36 | "metadata": {
37 | "_cell_guid": "1eebb207-607e-4985-908e-9848888808b1",
38 | "_uuid": "3e90295dde0dd25158ea9e3464165aa8ea62fd1c",
39 | "collapsed": true,
40 | "trusted": false
41 | },
42 | "cell_type": "code",
43 | "source": "feats_to_concat = ['comment_text', 'toxic_level', 'attack', 'aggression']\n# combining test and train\nalldata = pd.concat([train[feats_to_concat], test[feats_to_concat]], axis=0)\nalldata.comment_text.fillna('unknown', inplace=True)",
44 | "execution_count": null,
45 | "outputs": []
46 | },
47 | {
48 | "metadata": {
49 | "_cell_guid": "88a8e609-b287-4a7e-b72d-5dcac6f4a55f",
50 | "_uuid": "741273ee4b5122a37d978708ba29e16879e5b33f",
51 | "collapsed": true,
52 | "trusted": false
53 | },
54 | "cell_type": "code",
55 | "source": "vect_words = TfidfVectorizer(max_features=50000, analyzer='word', ngram_range=(1, 1))\nvect_chars = TfidfVectorizer(max_features=20000, analyzer='char', ngram_range=(1, 3))",
56 | "execution_count": null,
57 | "outputs": []
58 | },
59 | {
60 | "metadata": {
61 | "_cell_guid": "6db22032-8e99-4848-8978-be7c68a1e936",
62 | "_uuid": "cf10b99072cef22bf87ee92c9aa51f035a26e893",
63 | "collapsed": true,
64 | "trusted": false
65 | },
66 | "cell_type": "code",
67 | "source": "all_words = vect_words.fit_transform(alldata.comment_text)\nall_chars = vect_chars.fit_transform(alldata.comment_text)",
68 | "execution_count": null,
69 | "outputs": []
70 | },
71 | {
72 | "metadata": {
73 | "_cell_guid": "8f42e0d7-5938-4bb0-beb7-7ddf9f85685d",
74 | "_uuid": "d074b6b6c5271f462c129c534980c5a0d287599f",
75 | "collapsed": true,
76 | "trusted": false
77 | },
78 | "cell_type": "code",
79 | "source": "train_new = train\ntest_new = test",
80 | "execution_count": null,
81 | "outputs": []
82 | },
83 | {
84 | "metadata": {
85 | "_cell_guid": "c068c9bb-bf28-4342-aa71-e575c6d93788",
86 | "_uuid": "09975f14757c51e19876dab638a39671dfd555e4",
87 | "collapsed": true,
88 | "trusted": false
89 | },
90 | "cell_type": "code",
91 | "source": "train_words = all_words[:len(train_new)]\ntest_words = all_words[len(train_new):]\n\ntrain_chars = all_chars[:len(train_new)]\ntest_chars = all_chars[len(train_new):]",
92 | "execution_count": null,
93 | "outputs": []
94 | },
95 | {
96 | "metadata": {
97 | "_cell_guid": "5d55e152-e1cb-4cf0-aa41-e3eec5850b3a",
98 | "_uuid": "0338f2d0b8f09c751f97afebf1cf8e77d8a10fe3",
99 | "collapsed": true,
100 | "trusted": false
101 | },
102 | "cell_type": "code",
103 | "source": "feats = ['toxic_level', 'attack']\n# make sparse matrix with needed data for train and test\ntrain_feats = sparse.hstack([train_words, train_chars, alldata[feats][:len(train_new)]])\ntest_feats = sparse.hstack([test_words, test_chars, alldata[feats][len(train_new):]])",
104 | "execution_count": null,
105 | "outputs": []
106 | },
107 | {
108 | "metadata": {
109 | "_cell_guid": "350aad79-ee6f-44bc-9d85-4e9652956bd3",
110 | "_uuid": "da2082c68a367369fac28ddc09eec2e5b6c718bb",
111 | "scrolled": false,
112 | "collapsed": true,
113 | "trusted": false
114 | },
115 | "cell_type": "code",
116 | "source": "col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\n\nonly_col = ['toxic']\n\npreds = np.zeros((test_new.shape[0], len(col)))\n\nfor i, j in enumerate(col):\n print('===Fit '+j)\n \n model = LogisticRegression(C=4.0, solver='sag')\n print('Fitting model')\n model.fit(train_feats, train_new[j])\n \n print('Predicting on test')\n preds[:,i] = model.predict_proba(test_feats)[:,1]",
117 | "execution_count": null,
118 | "outputs": []
119 | },
120 | {
121 | "metadata": {
122 | "_cell_guid": "9d84b909-d93b-4778-b432-701f65a73d3c",
123 | "_uuid": "3605ca797e6d5e4d05ac2c63d70766c23d2a8cf1",
124 | "collapsed": true,
125 | "trusted": false
126 | },
127 | "cell_type": "code",
128 | "source": "subm = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv')\n\nsubmid = pd.DataFrame({'id': subm[\"id\"]})\nsubmission = pd.concat([submid, pd.DataFrame(preds, columns = col)], axis=1)\nsubmission.to_csv('feat_lr_2cols.csv', index=False)",
129 | "execution_count": null,
130 | "outputs": []
131 | },
132 | {
133 | "metadata": {
134 | "_cell_guid": "6d350714-1262-4f91-af11-a7f95750ec84",
135 | "_uuid": "be385cfe2683246d05dc872d7b09cb4608b73337",
136 | "collapsed": true,
137 | "trusted": false
138 | },
139 | "cell_type": "code",
140 | "source": "",
141 | "execution_count": null,
142 | "outputs": []
143 | }
144 | ],
145 | "metadata": {
146 | "language_info": {
147 | "name": "python",
148 | "version": "3.6.4",
149 | "mimetype": "text/x-python",
150 | "codemirror_mode": {
151 | "name": "ipython",
152 | "version": 3
153 | },
154 | "pygments_lexer": "ipython3",
155 | "nbconvert_exporter": "python",
156 | "file_extension": ".py"
157 | },
158 | "kernelspec": {
159 | "display_name": "Python 3",
160 | "language": "python",
161 | "name": "python3"
162 | }
163 | },
164 | "nbformat": 4,
165 | "nbformat_minor": 1
166 | }
--------------------------------------------------------------------------------
/super_nbsvm.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "metadata": {
5 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
6 | "collapsed": true,
7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
8 | "trusted": false
9 | },
10 | "cell_type": "code",
11 | "source": "# Inspiration 1: https://www.kaggle.com/tunguz/logistic-regression-with-words-and-char-n-grams/code\n# Inspiration 2: https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline\n\nimport pandas as pd\nimport numpy as np\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\nimport re, string\nimport time\nfrom scipy.sparse import hstack\nfrom scipy.special import logit, expit\n\nfrom sklearn.model_selection import StratifiedKFold\nfrom sklearn.metrics import roc_auc_score",
12 | "execution_count": null,
13 | "outputs": []
14 | },
15 | {
16 | "metadata": {
17 | "_uuid": "7a9def5093420ee0cb33e3c4dbdb284af0e763b9",
18 | "collapsed": true,
19 | "_cell_guid": "890ecc79-0b1e-422a-beab-c20b5b588ae4",
20 | "trusted": false
21 | },
22 | "cell_type": "code",
23 | "source": "# Functions\ndef tokenize(s): return re_tok.sub(r' \\1 ', s).split()\n\n\ndef pr(y_i, y, x):\n p = x[y==y_i].sum(0)\n return (p+1) / ((y==y_i).sum()+1)\n\n\ndef get_mdl(y,x, c0 = 4):\n y = y.values\n r = np.log(pr(1,y,x) / pr(0,y,x))\n m = LogisticRegression(C= c0, dual=True)\n x_nb = x.multiply(r)\n return m.fit(x_nb, y), r\n\n\ndef multi_roc_auc_score(y_true, y_pred):\n assert y_true.shape == y_pred.shape\n columns = y_true.shape[1]\n column_losses = []\n for i in range(0, columns):\n column_losses.append(roc_auc_score(y_true[:, i], y_pred[:, i]))\n return np.array(column_losses).mean()",
24 | "execution_count": null,
25 | "outputs": []
26 | },
27 | {
28 | "metadata": {
29 | "_uuid": "648a414512e100686384f54a335f279ffb60dc25",
30 | "collapsed": true,
31 | "_cell_guid": "2e91fb5c-aa54-4509-adf1-02100d4d59e3",
32 | "trusted": false
33 | },
34 | "cell_type": "code",
35 | "source": "model_type = 'lrchar'\ntodate = time.strftime(\"%d%m\")",
36 | "execution_count": null,
37 | "outputs": []
38 | },
39 | {
40 | "metadata": {
41 | "_uuid": "1ce08d3aa6e6582ae66286a7bb870c23c133ca86",
42 | "_cell_guid": "10154f60-38e2-4ba2-ac3f-337cdabcc677"
43 | },
44 | "cell_type": "markdown",
45 | "source": "# Data"
46 | },
47 | {
48 | "metadata": {
49 | "_uuid": "a81bbe05d6bb731e198b6f3c753620532be4d600",
50 | "collapsed": true,
51 | "_cell_guid": "ad29a64e-d548-4b14-8c19-5a5adbab3e74",
52 | "trusted": false
53 | },
54 | "cell_type": "code",
55 | "source": "# read data\ntrain = pd.read_csv('../input/train.csv')\ntest = pd.read_csv('../input/test.csv')\nsubm = pd.read_csv('../input/sample_submission.csv')\n\nid_train = train['id'].copy()\nid_test = test['id'].copy()\n\n# add empty label for None\nlabel_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\ntrain['none'] = 1-train[label_cols].max(axis=1)\n# fill missing values\nCOMMENT = 'comment_text'\ntrain[COMMENT].fillna(\"unknown\", inplace=True)\ntest[COMMENT].fillna(\"unknown\", inplace=True)",
56 | "execution_count": null,
57 | "outputs": []
58 | },
59 | {
60 | "metadata": {
61 | "_uuid": "375e268097baecafa125fc7fe8c879d40a25efa8",
62 | "collapsed": true,
63 | "_cell_guid": "f2b78b34-b627-4788-a0f1-2571eab44e3b",
64 | "trusted": false
65 | },
66 | "cell_type": "code",
67 | "source": "# Tf-idf\n# prepare tokenizer\nre_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')\n\n# create sparse matrices\nn = train.shape[0]\n#vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize, min_df=3, max_df=0.9, strip_accents='unicode',\n# use_idf=1, smooth_idf=1, sublinear_tf=1 )\n\nword_vectorizer = TfidfVectorizer(\n tokenizer=tokenize,\n sublinear_tf=True,\n strip_accents='unicode',\n analyzer='word', \n min_df = 5,\n token_pattern=r'\\w{1,}',\n ngram_range=(1, 3))\n# ,\n# max_features=250000)\n\nall1 = pd.concat([train[COMMENT], test[COMMENT]])\nword_vectorizer.fit(all1)\nxtrain1 = word_vectorizer.transform(train[COMMENT])\nxtest1 = word_vectorizer.transform(test[COMMENT])\n\nchar_vectorizer = TfidfVectorizer(\n sublinear_tf=True,\n strip_accents='unicode',\n analyzer='char',\n min_df = 3,\n ngram_range=(1, 6))\n# ,\n# max_features=250000)\n\nall1 = pd.concat([train[COMMENT], test[COMMENT]])\nchar_vectorizer.fit(all1)\n\nxtrain2 = char_vectorizer.transform(train[COMMENT])\nxtest2 = char_vectorizer.transform(test[COMMENT])\n",
68 | "execution_count": null,
69 | "outputs": []
70 | },
71 | {
72 | "metadata": {
73 | "_uuid": "bfdb92b679fed1e718a8a3b6e4d61caa8f7aa2ee",
74 | "_cell_guid": "f948babf-3396-4aa9-91c8-adde0c62ff0a"
75 | },
76 | "cell_type": "markdown",
77 | "source": "# Model"
78 | },
79 | {
80 | "metadata": {
81 | "_uuid": "fc94a43d0a5a0613d3bdbce0c15c25454573eac6",
82 | "collapsed": true,
83 | "_cell_guid": "17517658-06fa-4437-ab4d-9e0d011e7753",
84 | "trusted": false
85 | },
86 | "cell_type": "code",
87 | "source": "nfolds = 5\nxseed = 29\ncval = 4\n\n# data setup\nxtrain = hstack([xtrain1, xtrain2], format='csr')\nxtest = hstack([xtest1,xtest2], format='csr')\nytrain = np.array(train[label_cols].copy())\n\n# stratified split\nskf = StratifiedKFold(n_splits= nfolds, random_state= xseed)\n\n# storage structures for prval / prfull\npredval = np.zeros((xtrain.shape[0], len(label_cols)))\npredfull = np.zeros((xtest.shape[0], len(label_cols)))\nscoremat = np.zeros((nfolds,len(label_cols) ))\nscore_vec = np.zeros((len(label_cols),1))",
88 | "execution_count": null,
89 | "outputs": []
90 | },
91 | {
92 | "metadata": {
93 | "_uuid": "c6b3c62063bda4ed1a8a6b8845454278766929fd",
94 | "collapsed": true,
95 | "_cell_guid": "a8d92c33-64f8-4302-bf49-e38712fd6b8f",
96 | "trusted": false
97 | },
98 | "cell_type": "code",
99 | "source": "for (lab_ind,lab) in enumerate(label_cols): \n y = train[lab].copy()\n print('label:' + str(lab_ind))\n for (f, (train_index, test_index)) in enumerate(skf.split(xtrain, y)):\n # split \n x0, x1 = xtrain[train_index], xtrain[test_index]\n y0, y1 = y[train_index], y[test_index] \n # fit model for prval\n m,r = get_mdl(y0,x0, c0 = cval)\n predval[test_index,lab_ind] = m.predict_proba(x1.multiply(r))[:,1]\n scoremat[f,lab_ind] = roc_auc_score(y1,predval[test_index,lab_ind])\n # fit model full\n m,r = get_mdl(y,xtrain, c0 = cval)\n predfull[:,lab_ind] += m.predict_proba(xtest.multiply(r))[:,1]\n print('fit:'+ str(lab) + ' fold:' + str(f) + ' score:%.6f' %(scoremat[f,lab_ind]))\n# break\npredfull /= nfolds ",
100 | "execution_count": null,
101 | "outputs": []
102 | },
103 | {
104 | "metadata": {
105 | "_uuid": "b14075b625915e7dc8d6b8eff44c79d4b075065d",
106 | "collapsed": true,
107 | "_cell_guid": "0a45a046-7e09-40df-b9a1-116397cf4d09",
108 | "trusted": false
109 | },
110 | "cell_type": "code",
111 | "source": "score_vec = np.zeros((len(label_cols),1))\nfor ii in range(len(label_cols)):\n score_vec[ii] = roc_auc_score(ymat[:,ii], predval[:,ii])\nprint(score_vec.mean())\nprint(multi_roc_auc_score(ymat, predval))",
112 | "execution_count": null,
113 | "outputs": []
114 | },
115 | {
116 | "metadata": {
117 | "_uuid": "586123023ff44c843d2c765475b344a1d5be5922",
118 | "_cell_guid": "0b575f63-b40d-448a-8542-e4d753bd7d10"
119 | },
120 | "cell_type": "markdown",
121 | "source": "# Store resultss"
122 | },
123 | {
124 | "metadata": {
125 | "_uuid": "1eaae93a1bd8569eeefc61c5b7207cccb526db2f",
126 | "collapsed": true,
127 | "_cell_guid": "65f0fee6-bad7-4eef-b5ca-0f3b96cad666",
128 | "trusted": false
129 | },
130 | "cell_type": "code",
131 | "source": "# store prval\nprval = pd.DataFrame(predval)\nprval.columns = label_cols\nprval['id'] = id_train\nprval.to_csv('prval_'+model_type+'x'+str(cval)+'f'+str(nfolds)+'_'+todate+'.csv', index= False)\n\n# store prfull\nprfull = pd.DataFrame(predfull)\nprfull.columns = label_cols\nprfull['id'] = id_test\nprfull.to_csv('prfull_'+model_type+'x'+str(cval)+'f'+str(nfolds)+'_'+todate+'.csv', index= False)\n\n# store submission\nsubmid = pd.DataFrame({'id': subm[\"id\"]})\nsubmission = pd.concat([submid, pd.DataFrame(prfull, columns = label_cols)], axis=1)\nsubmission.to_csv('sub_'+model_type+'x'+str(cval)+'f'+str(nfolds)+'_'+todate+'.csv', index= False)",
132 | "execution_count": null,
133 | "outputs": []
134 |     }
184 | ],
185 | "metadata": {
186 | "kernelspec": {
187 | "display_name": "Python 3",
188 | "language": "python",
189 | "name": "python3"
190 | },
191 | "language_info": {
192 | "codemirror_mode": {
193 | "name": "ipython",
194 | "version": 3
195 | },
196 | "name": "python",
197 | "mimetype": "text/x-python",
198 | "nbconvert_exporter": "python",
199 | "file_extension": ".py",
200 | "pygments_lexer": "ipython3",
201 | "version": "3.6.4"
202 | }
203 | },
204 | "nbformat": 4,
205 | "nbformat_minor": 1
206 | }
--------------------------------------------------------------------------------
/badwords.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 6,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "data": {
10 | "text/html": [
11 | "
\n",
12 | "\n",
25 | "
\n",
26 | " \n",
27 | " \n",
28 | " | \n",
29 | " 0 | \n",
30 | " 1 | \n",
31 | " 2 | \n",
32 | " 3 | \n",
33 | " 4 | \n",
34 | "
\n",
35 | " \n",
36 | " \n",
37 | " \n",
38 | " | 0 | \n",
39 | " NaN | \n",
40 | " 0 | \n",
41 | " 4 | \n",
42 | " 0 | \n",
43 | " anus | \n",
44 | "
\n",
45 | " \n",
46 | " | 1 | \n",
47 | " NaN | \n",
48 | " 0 | \n",
49 | " 4 | \n",
50 | " 0 | \n",
51 | " arse | \n",
52 | "
\n",
53 | " \n",
54 | " | 2 | \n",
55 | " NaN | \n",
56 | " 0 | \n",
57 | " 4 | \n",
58 | " 0 | \n",
59 | " arsehole | \n",
60 | "
\n",
61 | " \n",
62 | " | 3 | \n",
63 | " NaN | \n",
64 | " 0 | \n",
65 | " 4 | \n",
66 | " 0 | \n",
67 | " asshole | \n",
68 | "
\n",
69 | " \n",
70 | " | 4 | \n",
71 | " NaN | \n",
72 | " 0 | \n",
73 | " 4 | \n",
74 | " 0 | \n",
75 | " axe-wound | \n",
76 | "
\n",
77 | " \n",
78 | " | 5 | \n",
79 | " NaN | \n",
80 | " 0 | \n",
81 | " 4 | \n",
82 | " 0 | \n",
83 | " axewound | \n",
84 | "
\n",
85 | " \n",
86 | " | 6 | \n",
87 | " NaN | \n",
88 | " 0 | \n",
89 | " 4 | \n",
90 | " 0 | \n",
91 | " bastard | \n",
92 | "
\n",
93 | " \n",
94 | " | 7 | \n",
95 | " NaN | \n",
96 | " 0 | \n",
97 | " 4 | \n",
98 | " 0 | \n",
99 | " basterd | \n",
100 | "
\n",
101 | " \n",
102 | " | 8 | \n",
103 | " NaN | \n",
104 | " 0 | \n",
105 | " 4 | \n",
106 | " 0 | \n",
107 | " bastird | \n",
108 | "
\n",
109 | " \n",
110 | " | 9 | \n",
111 | " NaN | \n",
112 | " 0 | \n",
113 | " 4 | \n",
114 | " 0 | \n",
115 | " blow job | \n",
116 | "
\n",
117 | " \n",
118 | "
\n",
119 | "
"
120 | ],
121 | "text/plain": [
122 | " 0 1 2 3 4\n",
123 | "0 NaN 0 4 0 anus\n",
124 | "1 NaN 0 4 0 arse\n",
125 | "2 NaN 0 4 0 arsehole\n",
126 | "3 NaN 0 4 0 asshole\n",
127 | "4 NaN 0 4 0 axe-wound\n",
128 | "5 NaN 0 4 0 axewound\n",
129 | "6 NaN 0 4 0 bastard\n",
130 | "7 NaN 0 4 0 basterd\n",
131 | "8 NaN 0 4 0 bastird\n",
132 | "9 NaN 0 4 0 blow job"
133 | ]
134 | },
135 | "execution_count": 6,
136 | "metadata": {},
137 | "output_type": "execute_result"
138 | }
139 | ],
140 | "source": [
141 | "import pandas as pd\n",
142 | "bad = pd.read_csv('badwords/bad_words.txt', header=None)\n",
143 | "bad.head(n=10)"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 8,
149 | "metadata": {
150 | "collapsed": true
151 | },
152 | "outputs": [],
153 | "source": [
154 | "strip = bad[4]\n",
155 | "strip.to_csv('badwords.csv', index=False)"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 23,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "s = pd.read_csv('badwords/badwords.csv', header=None)\n",
165 | "ss = pd.read_csv('badwords/en.txt', header=None)\n",
166 | "sss = pd.read_csv('badwords/full-list-of-bad-words-banned-by-google-txt-file_2013_11_26_04_53_31_867.txt', header=None)\n",
167 | "z = pd.read_csv('badwords/negative-words.txt', header=None)"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": 17,
173 | "metadata": {},
174 | "outputs": [
175 | {
176 | "data": {
177 | "text/html": [
178 | "\n",
179 | "\n",
192 | "
\n",
193 | " \n",
194 | " \n",
195 | " | \n",
196 | " 0 | \n",
197 | "
\n",
198 | " \n",
199 | " \n",
200 | " \n",
201 | " | 0 | \n",
202 | " anus | \n",
203 | "
\n",
204 | " \n",
205 | " | 1 | \n",
206 | " arse | \n",
207 | "
\n",
208 | " \n",
209 | " | 2 | \n",
210 | " arsehole | \n",
211 | "
\n",
212 | " \n",
213 | " | 3 | \n",
214 | " asshole | \n",
215 | "
\n",
216 | " \n",
217 | " | 4 | \n",
218 | " axe-wound | \n",
219 | "
\n",
220 | " \n",
221 | " | 5 | \n",
222 | " axewound | \n",
223 | "
\n",
224 | " \n",
225 | " | 6 | \n",
226 | " bastard | \n",
227 | "
\n",
228 | " \n",
229 | " | 7 | \n",
230 | " basterd | \n",
231 | "
\n",
232 | " \n",
233 | " | 8 | \n",
234 | " bastird | \n",
235 | "
\n",
236 | " \n",
237 | " | 9 | \n",
238 | " blow job | \n",
239 | "
\n",
240 | " \n",
241 | "
\n",
242 | "
"
243 | ],
244 | "text/plain": [
245 | " 0\n",
246 | "0 anus\n",
247 | "1 arse\n",
248 | "2 arsehole\n",
249 | "3 asshole\n",
250 | "4 axe-wound\n",
251 | "5 axewound\n",
252 | "6 bastard\n",
253 | "7 basterd\n",
254 | "8 bastird\n",
255 | "9 blow job"
256 | ]
257 | },
258 | "execution_count": 17,
259 | "metadata": {},
260 | "output_type": "execute_result"
261 | }
262 | ],
263 | "source": [
264 | "s.head(n=10)"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 18,
270 | "metadata": {
271 | "collapsed": true
272 | },
273 | "outputs": [],
274 | "source": [
275 | "new = s.append(ss)"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 19,
281 | "metadata": {},
282 | "outputs": [
283 | {
284 | "data": {
285 | "text/html": [
286 | "\n",
287 | "\n",
300 | "
\n",
301 | " \n",
302 | " \n",
303 | " | \n",
304 | " 0 | \n",
305 | "
\n",
306 | " \n",
307 | " \n",
308 | " \n",
309 | " | 0 | \n",
310 | " anus | \n",
311 | "
\n",
312 | " \n",
313 | " | 1 | \n",
314 | " arse | \n",
315 | "
\n",
316 | " \n",
317 | "
\n",
318 | "
"
319 | ],
320 | "text/plain": [
321 | " 0\n",
322 | "0 anus\n",
323 | "1 arse"
324 | ]
325 | },
326 | "execution_count": 19,
327 | "metadata": {},
328 | "output_type": "execute_result"
329 | }
330 | ],
331 | "source": [
332 | "new.head(n=2)"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": 21,
338 | "metadata": {
339 | "collapsed": true
340 | },
341 | "outputs": [],
342 | "source": [
343 | "new = new.append(sss)"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": 22,
349 | "metadata": {},
350 | "outputs": [
351 | {
352 | "data": {
353 | "text/plain": [
354 | "(1218, 1)"
355 | ]
356 | },
357 | "execution_count": 22,
358 | "metadata": {},
359 | "output_type": "execute_result"
360 | }
361 | ],
362 | "source": [
363 | "new.shape"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": 24,
369 | "metadata": {
370 | "collapsed": true
371 | },
372 | "outputs": [],
373 | "source": [
374 | "new = new.append(z)"
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": 25,
380 | "metadata": {},
381 | "outputs": [
382 | {
383 | "data": {
384 | "text/plain": [
385 | "(6001, 1)"
386 | ]
387 | },
388 | "execution_count": 25,
389 | "metadata": {},
390 | "output_type": "execute_result"
391 | }
392 | ],
393 | "source": [
394 | "new.shape"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": null,
400 | "metadata": {
401 | "collapsed": true
402 | },
403 | "outputs": [],
404 | "source": []
405 | }
406 | ],
407 | "metadata": {
408 | "kernelspec": {
409 | "display_name": "Python 2",
410 | "language": "python",
411 | "name": "python2"
412 | },
413 | "language_info": {
414 | "codemirror_mode": {
415 | "name": "ipython",
416 | "version": 2
417 | },
418 | "file_extension": ".py",
419 | "mimetype": "text/x-python",
420 | "name": "python",
421 | "nbconvert_exporter": "python",
422 | "pygments_lexer": "ipython2",
423 | "version": "2.7.14"
424 | }
425 | },
426 | "nbformat": 4,
427 | "nbformat_minor": 2
428 | }
429 |
--------------------------------------------------------------------------------
/add_covaai.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "metadata": {
5 | "_uuid": "41cd31e255867428bddad43ead3e766b2837948b",
6 | "_cell_guid": "93c24171-18f6-42d4-af00-5c856774b347",
7 | "collapsed": true
8 | },
9 | "cell_type": "markdown",
10 | "source": "Note: in the Discussion section they said that data from figshare has some overlap with the current test set (https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/46177). So it's possible that using features/scores based on this data may overfit to the current test set. Once they change the test set, the LB scores may change.\nSo at this point, I think it's hard to tell whether using features based on these datasets will ultimately help your LB score. It may still help, but we won't know for sure until the new test set is released."
11 | },
12 | {
13 | "metadata": {
14 | "_uuid": "7e224f68ef5f256e4be2566e9a8311d3b47470d1",
15 | "collapsed": true,
16 | "_cell_guid": "1d2bb910-0444-487d-89b5-6fa47b8a6b27"
17 | },
18 | "cell_type": "markdown",
19 | "source": "**The idea for this kernel is to use the public datasets at https://conversationai.github.io/ to train models and use those models to score the train and test sets for this challenge. You can then use the scores as features when training the real models. So the output of this kernel isn't meant to be submitted as is. The output is the original train/test datasets, with additional columns/features.**\n\nUsing these enhanced train/test sets improved my logistic-regression based models from 0.047 to 0.044 log-loss. I haven't done much if any tuning for these models below, so you should be able to tweak things and get even better results.\n\nI understand that there are PerspectiveAPI models that may be similar. But rather than wait for an API key, and so I could play around with the models more myself, I trained the models in this kernel."
20 | },
21 | {
22 | "metadata": {
23 | "_uuid": "96c136c2572dbcea17f38c260d0ea5e3c563706d",
24 | "collapsed": true,
25 | "_cell_guid": "9471f39f-ba45-4db8-b610-9bfe18770bc3",
26 | "trusted": false
27 | },
28 | "cell_type": "code",
29 | "source": "import numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)",
30 | "execution_count": null,
31 | "outputs": []
32 | },
33 | {
34 | "metadata": {
35 | "_uuid": "f7a153a8288d190481c26a7bfcf824036e517756",
36 | "collapsed": true,
37 | "_cell_guid": "cc063548-d203-491d-95d3-cfccd58cedf8",
38 | "trusted": false
39 | },
40 | "cell_type": "code",
41 | "source": "toxic_cmt = pd.read_table('../input/conversationaidataset/toxicity_annotated_comments.tsv')\ntoxic_annot = pd.read_table('../input/conversationaidataset/toxicity_annotations.tsv')\naggr_cmt = pd.read_table('../input/conversationaidataset/aggression_annotated_comments.tsv')\naggr_annot = pd.read_table('../input/conversationaidataset/aggression_annotations.tsv')\nattack_cmt = pd.read_table('../input/conversationaidataset/attack_annotated_comments.tsv')\nattack_annot = pd.read_table('../input/conversationaidataset/attack_annotations.tsv')",
42 | "execution_count": null,
43 | "outputs": []
44 | },
45 | {
46 | "metadata": {
47 | "_uuid": "46ff20d6c42c21b1eb1595e539776bd7823bd59d",
48 | "_cell_guid": "b158b96a-8e5e-438f-be60-a4855eb0842b"
49 | },
50 | "cell_type": "markdown",
51 | "source": "**Find the mean score for toxicity, aggression, attack, and join with the corresponding comment**\nFor each comment/rev_id, multiple workers have labeld/annotated. So then you have to decide what your overall label is for a given comment/rev_id. I simply took the mean value, and will train a regression model. You could try other aggregations/methods. You could, e.g., instead go with majority vote, and train binary classifiers, etc."
52 | },
53 | {
54 | "metadata": {
55 | "_uuid": "931f4c3e6c8a7ed28be8f6ff9694a9375ed00bec",
56 | "collapsed": true,
57 | "_cell_guid": "b635bf75-09e2-491c-8aa4-5512f696bded",
58 | "trusted": false
59 | },
60 | "cell_type": "code",
61 | "source": "def JoinAndSanitize(cmt, annot):\n df = cmt.set_index('rev_id').join(annot.groupby(['rev_id']).mean())\n df = Sanitize(df)\n return df",
62 | "execution_count": null,
63 | "outputs": []
64 | },
65 | {
66 | "metadata": {
67 | "_uuid": "5deb6f4f888ac2393462bd243c945d2fcbf9391d",
68 | "_cell_guid": "223c29f8-229b-42fd-ad5d-61427673dbe7"
69 | },
70 | "cell_type": "markdown",
71 | "source": "**Basic cleaning/standardizing -- can potentially do more (or less) here**"
72 | },
73 | {
74 | "metadata": {
75 | "_uuid": "585418f67b3e73c47af85228f7a9e62682cb2816",
76 | "collapsed": true,
77 | "_cell_guid": "d3b0bf5e-37aa-49f5-94cc-bee6d9e06fe1",
78 | "trusted": false
79 | },
80 | "cell_type": "code",
81 | "source": "def Sanitize(df):\n comment = 'comment' if 'comment' in df else 'comment_text'\n df[comment] = df[comment].str.lower().str.replace('newline_token', ' ')\n df[comment] = df[comment].fillna('erikov')\n return df",
82 | "execution_count": null,
83 | "outputs": []
84 | },
85 | {
86 | "metadata": {
87 | "_uuid": "d56061bf13b65b1587e125152d87d71790031fda",
88 | "collapsed": true,
89 | "_cell_guid": "185561a1-5b3d-49ea-b737-1f68504d285f",
90 | "trusted": false
91 | },
92 | "cell_type": "code",
93 | "source": "toxic = JoinAndSanitize(toxic_cmt, toxic_annot)\nattack = JoinAndSanitize(attack_cmt, attack_annot)\naggression = JoinAndSanitize(aggr_cmt, aggr_annot)",
94 | "execution_count": null,
95 | "outputs": []
96 | },
97 | {
98 | "metadata": {
99 | "_uuid": "8b9e981ccef00bdbb71e852eaeb80e3f2b4cd439",
100 | "_cell_guid": "bc38ff04-0274-4bdf-9bff-1e5625ac0983"
101 | },
102 | "cell_type": "markdown",
103 | "source": "**The attack and aggression labeled datasets are actually the same with only very slightly different annotations/labels**\nSo probably only the scores from one model will be needed, but I left both here for completeness."
104 | },
105 | {
106 | "metadata": {
107 | "_uuid": "cb884f34312b44a95038df85adbd08b8df5693ef",
108 | "collapsed": true,
109 | "_cell_guid": "30eed731-3c48-492b-84b0-fe677e24fb37",
110 | "trusted": false
111 | },
112 | "cell_type": "code",
113 | "source": "len(attack), len(aggression)",
114 | "execution_count": null,
115 | "outputs": []
116 | },
117 | {
118 | "metadata": {
119 | "_uuid": "e97f9c8f5d65713f01131ecaa306dea2eee7f1b4",
120 | "collapsed": true,
121 | "_cell_guid": "8a7db6ee-6988-4f13-8b22-011ac4fc2baf",
122 | "trusted": false
123 | },
124 | "cell_type": "code",
125 | "source": "attack['comment'].equals(aggression['comment'])",
126 | "execution_count": null,
127 | "outputs": []
128 | },
129 | {
130 | "metadata": {
131 | "_uuid": "845182cb1a08c406f03ef7373188b14ecf7d8d31",
132 | "_cell_guid": "448b0597-005b-4bce-bacc-c4e8d6ba46ec"
133 | },
134 | "cell_type": "markdown",
135 | "source": "Check how correlated the mean value for the annotations between the attack and aggression datasets are"
136 | },
137 | {
138 | "metadata": {
139 | "_uuid": "96134949dd9f12ab7a34955bed2449a2f1a7ded4",
140 | "collapsed": true,
141 | "_cell_guid": "47639bda-b46b-464a-91c3-3e2020c763ca",
142 | "trusted": false
143 | },
144 | "cell_type": "code",
145 | "source": "attack['attack'].corr(aggression['aggression'])",
146 | "execution_count": null,
147 | "outputs": []
148 | },
149 | {
150 | "metadata": {
151 | "_uuid": "dbcefead0cd7d82f2e63cb607d21bbf73f14a14f",
152 | "_cell_guid": "ad881513-707b-4c9f-aa36-cf9b5464bf75"
153 | },
154 | "cell_type": "markdown",
155 | "source": "**Check dataset**"
156 | },
157 | {
158 | "metadata": {
159 | "_uuid": "29496c0653e608686a9170cbaf6ba16825091323",
160 | "collapsed": true,
161 | "_cell_guid": "cfb2b58b-369a-4934-ac88-33b7c6fc026e",
162 | "trusted": false
163 | },
164 | "cell_type": "code",
165 | "source": "toxic.head()\n#attack.head()\n#aggression.head()",
166 | "execution_count": null,
167 | "outputs": []
168 | },
169 | {
170 | "metadata": {
171 | "_uuid": "66404a538b7b0f751d3c5ca887038ecaf6116f43",
172 | "collapsed": true,
173 | "_cell_guid": "57fe5c62-bbb8-433e-8b0d-48feb38c1ca5",
174 | "trusted": false
175 | },
176 | "cell_type": "code",
177 | "source": "from sklearn.feature_extraction.text import TfidfVectorizer\n\ndef Tfidfize(df):\n # can tweak these as desired\n max_vocab = 200000\n split = 0.1\n\n comment = 'comment' if 'comment' in df else 'comment_text'\n \n tfidfer = TfidfVectorizer(ngram_range=(1,2), max_features=max_vocab,\n use_idf=1, stop_words='english',\n smooth_idf=1, sublinear_tf=1 )\n tfidf = tfidfer.fit_transform(df[comment])\n\n return tfidf, tfidfer",
178 | "execution_count": null,
179 | "outputs": []
180 | },
181 | {
182 | "metadata": {
183 | "_uuid": "b267bdc10ea39fbffada53ddc707562b3e8a0ad6",
184 | "_cell_guid": "3496e5fe-0fb9-4cce-887c-46679ba71dcc"
185 | },
186 | "cell_type": "markdown",
187 | "source": "Get the tfidf values for the training sets, as well as the fit tfidf vectorizer to be used later to transform the train/test sets for the real challenge datasets."
188 | },
189 | {
190 | "metadata": {
191 | "_uuid": "bfeab05149e6f8e61ed12dd56284cb919dc94cd4",
192 | "collapsed": true,
193 | "_cell_guid": "2f334a1c-ce4b-4a9a-bd91-e70d9d617553",
194 | "trusted": false
195 | },
196 | "cell_type": "code",
197 | "source": "X_toxic, tfidfer_toxic = Tfidfize(toxic)\ny_toxic = toxic['toxicity'].values\nX_attack, tfidfer_attack = Tfidfize(attack)\ny_attack = attack['attack'].values\nX_aggression, tfidfer_aggression = Tfidfize(aggression)\ny_aggression = aggression['aggression'].values",
198 | "execution_count": null,
199 | "outputs": []
200 | },
201 | {
202 | "metadata": {
203 | "_uuid": "59cd67be9dbcc8939d59a60a21c14899cdcf219d",
204 | "_cell_guid": "c170e36d-f34f-40c3-bec2-b87f500394ae"
205 | },
206 | "cell_type": "markdown",
207 | "source": "**Model Training Strategy**\n\nRather than converting the 'toxicity', 'attack', 'aggression' into a binary label (e.g., >= 0.5), let's train a regression model to use as much information as possible. The output score from these models could be used as features in training the further refined models in the current challenge ('severe_toxic', 'obscene', etc.).\n\nThe toxicity/attack/aggression may not have a 1-1 mapping with the desired targets for the challenge, but they may be features that can help."
208 | },
209 | {
210 | "metadata": {
211 | "_uuid": "4afbde5720479525399c5a3fa3f7539506dec2ec",
212 | "collapsed": true,
213 | "_cell_guid": "e219786f-3591-4572-946c-70834f410c2a",
214 | "trusted": false
215 | },
216 | "cell_type": "code",
217 | "source": "from sklearn.linear_model import Ridge\nfrom sklearn.model_selection import cross_val_score\n\nridge = Ridge()\nmse_toxic = -cross_val_score(ridge, X_toxic, y_toxic, scoring='neg_mean_squared_error')\nmse_attack = -cross_val_score(ridge, X_attack, y_attack, scoring='neg_mean_squared_error')\nmse_aggression = -cross_val_score(ridge, X_aggression, y_aggression, scoring='neg_mean_squared_error')",
218 | "execution_count": null,
219 | "outputs": []
220 | },
221 | {
222 | "metadata": {
223 | "_uuid": "a7028c84de71f59f163c552776cd3ab7cc6fc7b2",
224 | "collapsed": true,
225 | "_cell_guid": "a76fdcba-ebe8-4413-8e52-926d0197555b",
226 | "trusted": false
227 | },
228 | "cell_type": "code",
229 | "source": "mse_toxic.mean(), mse_attack.mean(), mse_aggression.mean()",
230 | "execution_count": null,
231 | "outputs": []
232 | },
233 | {
234 | "metadata": {
235 | "_uuid": "6a93d0923ad96dcd55144c3d9e23fc006f555d29",
236 | "_cell_guid": "109ca84b-7fab-4c92-91c2-4e32e891170d"
237 | },
238 | "cell_type": "markdown",
239 | "source": "**If the cross-validation scores look okay, train on the full dataset**"
240 | },
241 | {
242 | "metadata": {
243 | "_uuid": "d4d8274bd65f2db4f7c44d585f9e0aeda2ac2985",
244 | "collapsed": true,
245 | "_cell_guid": "9e412176-f80b-4a8b-9dd6-da41673e7b38",
246 | "trusted": false
247 | },
248 | "cell_type": "code",
249 | "source": "model_toxic = ridge.fit(X_toxic, y_toxic)\nmodel_attack = ridge.fit(X_attack, y_attack)\nmodel_aggression = ridge.fit(X_aggression, y_aggression)",
250 | "execution_count": null,
251 | "outputs": []
252 | },
253 | {
254 | "metadata": {
255 | "_uuid": "9c98b0e8f7896edebed257d5d16ec1efc07edf6a",
256 | "_cell_guid": "dfae494c-51ca-4360-8150-baa193123442"
257 | },
258 | "cell_type": "markdown",
259 | "source": "**Now score the original train and test sets, and save out as an additional feature for those datasets. (These can then be used when training/scoring with our real model**"
260 | },
261 | {
262 | "metadata": {
263 | "_uuid": "71279eba76cc6dd72dbd0a8bac14ace806d7d938",
264 | "collapsed": true,
265 | "_cell_guid": "d626a4d3-59e9-4049-aefc-81b61f812d4d",
266 | "trusted": false
267 | },
268 | "cell_type": "code",
269 | "source": "train_orig = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv')\ntest_orig = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv')",
270 | "execution_count": null,
271 | "outputs": []
272 | },
273 | {
274 | "metadata": {
275 | "_uuid": "4af40d3a6f3e17e706b5ce81dd07dbd03e9619ce",
276 | "collapsed": true,
277 | "_cell_guid": "baac45ac-88aa-4252-81ca-3f3a27a9ccbc",
278 | "trusted": false
279 | },
280 | "cell_type": "code",
281 | "source": "train_orig = Sanitize(train_orig)\ntest_orig = Sanitize(test_orig)",
282 | "execution_count": null,
283 | "outputs": []
284 | },
285 | {
286 | "metadata": {
287 | "_uuid": "f2d7f004cd7bd0a3c8461c8106e34c07b2d5348d",
288 | "collapsed": true,
289 | "_cell_guid": "c97079c3-0df4-4bfe-8308-81cbdcda48d3",
290 | "trusted": false
291 | },
292 | "cell_type": "code",
293 | "source": "def TfidfAndPredict(tfidfer, model):\n tfidf_train = tfidfer.transform(train_orig['comment_text'])\n tfidf_test = tfidfer.transform(test_orig['comment_text'])\n train_scores = model.predict(tfidf_train)\n test_scores = model.predict(tfidf_test)\n \n return train_scores, test_scores",
294 | "execution_count": null,
295 | "outputs": []
296 | },
297 | {
298 | "metadata": {
299 | "_uuid": "5dd3cc4349133aa1c3bf648ed2f41c5c14261b6f",
300 | "collapsed": true,
301 | "_cell_guid": "aa06f163-9e0d-48fe-98fe-eab940d1f13b",
302 | "trusted": false
303 | },
304 | "cell_type": "code",
305 | "source": "toxic_tr_scores, toxic_t_scores = TfidfAndPredict(tfidfer_toxic, model_toxic)",
306 | "execution_count": null,
307 | "outputs": []
308 | },
309 | {
310 | "metadata": {
311 | "_uuid": "95859856265bc344769fe837c435b0af8f222326",
312 | "collapsed": true,
313 | "_cell_guid": "9d6e0ce2-6e55-42fb-ab12-81437b0923b6",
314 | "trusted": false
315 | },
316 | "cell_type": "code",
317 | "source": "toxic_tr_scores.shape, toxic_t_scores.shape",
318 | "execution_count": null,
319 | "outputs": []
320 | },
321 | {
322 | "metadata": {
323 | "_uuid": "81ce8b860fe6cc402ccb2eb2f8a5b4e8391cdeba",
324 | "collapsed": true,
325 | "_cell_guid": "d4531c50-96a0-4180-81f1-78f6ae81ee20",
326 | "trusted": false
327 | },
328 | "cell_type": "code",
329 | "source": "attack_tr_scores, attack_t_scores = TfidfAndPredict(tfidfer_attack, model_attack)",
330 | "execution_count": null,
331 | "outputs": []
332 | },
333 | {
334 | "metadata": {
335 | "_uuid": "b25fd9d4bf26223a25e1cd4d33fa85e2859f44d4",
336 | "collapsed": true,
337 | "_cell_guid": "fb049953-c535-42fd-849d-42faeaaa0072",
338 | "trusted": false
339 | },
340 | "cell_type": "code",
341 | "source": "attack_tr_scores.shape, attack_t_scores.shape",
342 | "execution_count": null,
343 | "outputs": []
344 | },
345 | {
346 | "metadata": {
347 | "_uuid": "deb4311c8f47d2414ef24a47570611aad9da8e01",
348 | "collapsed": true,
349 | "_cell_guid": "8ca296a0-21c9-4863-be29-d8eff0474e8b",
350 | "trusted": false
351 | },
352 | "cell_type": "code",
353 | "source": "aggression_tr_scores, aggression_t_scores = TfidfAndPredict(tfidfer_aggression, model_aggression)",
354 | "execution_count": null,
355 | "outputs": []
356 | },
357 | {
358 | "metadata": {
359 | "_uuid": "eeb5ad47a06592db00d32b3b93ada9442053bcac",
360 | "collapsed": true,
361 | "_cell_guid": "fb80020b-ea0f-4129-afb5-a106aeb45e85",
362 | "trusted": false
363 | },
364 | "cell_type": "code",
365 | "source": "aggression_tr_scores.shape, aggression_t_scores.shape",
366 | "execution_count": null,
367 | "outputs": []
368 | },
369 | {
370 | "metadata": {
371 | "_uuid": "790503a6bc2b0a9cf6def1bd9ac5083a6e55c710",
372 | "_cell_guid": "8ea53ca4-fdb8-4f61-b268-c6aea94d24b4"
373 | },
374 | "cell_type": "markdown",
375 | "source": "**Ok, now write out these scores alongside the original train and test datasets**"
376 | },
377 | {
378 | "metadata": {
379 | "_uuid": "e7cf48859d9338ffb3abbd1c8a55a126df4e3307",
380 | "collapsed": true,
381 | "_cell_guid": "5d4bb41e-daff-43c5-b4c8-0e7e5e9c70b3",
382 | "trusted": false
383 | },
384 | "cell_type": "code",
385 | "source": "# toxic_level, to not be confused with original label 'toxic'\ntrain_orig['toxic_level'] = toxic_tr_scores\ntrain_orig['attack'] = attack_tr_scores\ntrain_orig['aggression'] = aggression_tr_scores\ntest_orig['toxic_level'] = toxic_t_scores\ntest_orig['attack'] = attack_t_scores\ntest_orig['aggression'] = aggression_t_scores\n",
386 | "execution_count": null,
387 | "outputs": []
388 | },
389 | {
390 | "metadata": {
391 | "_uuid": "c041fffab51281b288a221245f07bd5437aea35d",
392 | "collapsed": true,
393 | "_cell_guid": "88afa0c6-23ca-4393-88c5-b143acfa32a3",
394 | "trusted": false
395 | },
396 | "cell_type": "code",
397 | "source": "train_orig.to_csv('train_with_convai.csv', index=False)\ntest_orig.to_csv('test_with_convai.csv', index=False)",
398 | "execution_count": null,
399 | "outputs": []
400 | }
401 | ],
402 | "metadata": {
403 | "language_info": {
404 | "pygments_lexer": "ipython3",
405 | "file_extension": ".py",
406 | "codemirror_mode": {
407 | "version": 3,
408 | "name": "ipython"
409 | },
410 | "mimetype": "text/x-python",
411 | "version": "3.6.3",
412 | "nbconvert_exporter": "python",
413 | "name": "python"
414 | },
415 | "kernelspec": {
416 | "display_name": "Python 3",
417 | "language": "python",
418 | "name": "python3"
419 | }
420 | },
421 | "nbformat": 4,
422 | "nbformat_minor": 1
423 | }
--------------------------------------------------------------------------------
/RIDGE.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 4,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "Tfidf word vector\n",
13 | "Tfidf char vector\n",
14 | "stack both\n"
15 | ]
16 | }
17 | ],
18 | "source": [
19 | "import numpy as np\n",
20 | "import pandas as pd\n",
21 | "\n",
22 | "from sklearn.feature_extraction.text import TfidfVectorizer\n",
23 | "from sklearn.linear_model import Ridge\n",
24 | "from sklearn.model_selection import cross_val_score\n",
25 | "from scipy.sparse import hstack\n",
26 | "from sklearn.metrics import log_loss, matthews_corrcoef, roc_auc_score\n",
27 | "from sklearn.model_selection import cross_val_score\n",
28 | "from sklearn.model_selection import StratifiedKFold\n",
29 | "from sklearn.model_selection import KFold\n",
30 | "from sklearn.linear_model import Lasso\n",
31 | "from sklearn.linear_model import ElasticNet\n",
32 | "\n",
33 | "class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\n",
34 | "\n",
35 | "train = pd.read_csv('train_translated_sp_clean.csv').fillna(' ')\n",
36 | "test = pd.read_csv('test_translated_sp_clean.csv').fillna(' ')\n",
37 | "\n",
38 | "train_text = train['comment_text']\n",
39 | "test_text = test['comment_text']\n",
40 | "all_text = pd.concat([train_text, test_text])\n",
41 | "\n",
42 | "class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\n",
43 | "tr_ids = train[['id']]\n",
44 | "train[class_names] = train[class_names].astype(np.int8)\n",
45 | "target = train[class_names]\n",
46 | "\n",
47 | "print('Tfidf word vector')\n",
48 | "word_vectorizer = TfidfVectorizer(\n",
49 | " sublinear_tf=True,\n",
50 | " strip_accents='unicode',\n",
51 | " analyzer='word',\n",
52 | " token_pattern=r'\\w{1,}',\n",
53 | " stop_words='english',\n",
54 | " ngram_range=(1, 1),\n",
55 | " max_features=10000)\n",
56 | "word_vectorizer.fit(all_text)\n",
57 | "train_word_features = word_vectorizer.transform(train_text)\n",
58 | "test_word_features = word_vectorizer.transform(test_text)\n",
59 | "\n",
60 | "print('Tfidf char vector')\n",
61 | "char_vectorizer = TfidfVectorizer(\n",
62 | " sublinear_tf=True,\n",
63 | " strip_accents='unicode',\n",
64 | " analyzer='char',\n",
65 | " stop_words='english',\n",
66 | " ngram_range=(2, 6),\n",
67 | " max_features=50000)\n",
68 | "char_vectorizer.fit(all_text)\n",
69 | "train_char_features = char_vectorizer.transform(train_text)\n",
70 | "test_char_features = char_vectorizer.transform(test_text)\n",
71 | "\n",
72 | "print('stack both')\n",
73 | "#train_features = hstack([train_char_features, train_word_features])\n",
74 | "#test_features = hstack([test_char_features, test_word_features])\n",
75 | "\n",
76 | "#train_features = train_word_features\n",
77 | "#test_features = test_word_features\n",
78 | "\n",
79 | "train_features = hstack([train_char_features, train_word_features]).tocsr()\n",
80 | "test_features = hstack([test_char_features, test_word_features]).tocsr()\n",
81 | "\n",
82 | "scores = []\n",
83 | "scores_classes = np.zeros((len(class_names), 10))\n",
84 | "\n",
85 | "submission = pd.DataFrame.from_dict({'id': test['id']})\n",
86 | "submission_oof = train[['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]\n",
87 | "\n",
88 | "idpred = tr_ids\n",
89 | "number_of_folds = 10\n",
90 | "\n",
91 | "#kfolder=StratifiedKFold(train_text, n_folds=number_of_folds,shuffle=True, random_state=15)\n",
92 | "\n"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 8,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "from sklearn.model_selection import StratifiedKFold\n",
102 | "\n",
103 | "number_of_folds = 10\n",
104 | "#kfolder = KFold(n_splits=number_of_folds, shuffle=True, random_state=239)\n",
105 | "kfolder= StratifiedKFold(n_splits=number_of_folds,shuffle=True, random_state=15)\n",
106 | "scores_classes = np.zeros((len(class_names), 10))\n"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 9,
112 | "metadata": {},
113 | "outputs": [
114 | {
115 | "name": "stdout",
116 | "output_type": "stream",
117 | "text": [
118 | "class_name is: toxic\n",
119 | "[ 0 1 2 ... 159568 159569 159570]\n",
120 | "[ 4 63 64 ... 159490 159503 159510]\n",
121 | "\n",
122 | " Fold 01 class toxic AUC: 0.978946\n",
123 | "[ 0 1 2 ... 159568 159569 159570]\n",
124 | "[ 10 26 32 ... 159548 159557 159563]\n",
125 | "\n",
126 | " Fold 02 class toxic AUC: 0.980277\n",
127 | "[ 0 2 3 ... 159568 159569 159570]\n",
128 | "[ 1 16 18 ... 159542 159547 159564]\n",
129 | "\n",
130 | " Fold 03 class toxic AUC: 0.979126\n",
131 | "[ 0 1 2 ... 159568 159569 159570]\n",
132 | "[ 17 21 30 ... 159536 159555 159566]\n",
133 | "\n",
134 | " Fold 04 class toxic AUC: 0.976820\n",
135 | "[ 0 1 3 ... 159568 159569 159570]\n",
136 | "[ 2 8 9 ... 159559 159560 159562]\n",
137 | "\n",
138 | " Fold 05 class toxic AUC: 0.977847\n",
139 | "[ 0 1 2 ... 159568 159569 159570]\n",
140 | "[ 20 22 24 ... 159546 159551 159558]\n",
141 | "\n",
142 | " Fold 06 class toxic AUC: 0.979972\n",
143 | "[ 1 2 3 ... 159566 159569 159570]\n",
144 | "[ 0 5 23 ... 159550 159567 159568]\n",
145 | "\n",
146 | " Fold 07 class toxic AUC: 0.981081\n",
147 | "[ 0 1 2 ... 159567 159568 159570]\n",
148 | "[ 3 6 11 ... 159554 159561 159569]\n",
149 | "\n",
150 | " Fold 08 class toxic AUC: 0.981545\n",
151 | "[ 0 1 2 ... 159567 159568 159569]\n",
152 | "[ 7 14 15 ... 159549 159553 159570]\n",
153 | "\n",
154 | " Fold 09 class toxic AUC: 0.982126\n",
155 | "[ 0 1 2 ... 159568 159569 159570]\n",
156 | "[ 31 46 54 ... 159517 159539 159565]\n",
157 | "\n",
158 | " Fold 10 class toxic AUC: 0.978534\n",
159 | "\n",
160 | " Average class toxic AUC:\t0.979627\n",
161 | " Out-of-fold class toxic AUC:\t0.979634\n"
162 | ]
163 | },
164 | {
165 | "name": "stderr",
166 | "output_type": "stream",
167 | "text": [
168 | "/home/stgc/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:33: SettingWithCopyWarning: \n",
169 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
170 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
171 | "\n",
172 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
173 | ]
174 | },
175 | {
176 | "name": "stdout",
177 | "output_type": "stream",
178 | "text": [
179 | "class_name is: severe_toxic\n",
180 | "[ 0 1 2 ... 159568 159569 159570]\n",
181 | "[ 4 7 32 ... 159548 159549 159563]\n",
182 | "\n",
183 | " Fold 01 class severe_toxic AUC: 0.988811\n",
184 | "[ 0 1 2 ... 159568 159569 159570]\n",
185 | "[ 10 15 24 ... 159534 159535 159560]\n",
186 | "\n",
187 | " Fold 02 class severe_toxic AUC: 0.989085\n",
188 | "[ 0 1 2 ... 159568 159569 159570]\n",
189 | "[ 6 9 12 ... 159531 159555 159559]\n",
190 | "\n",
191 | " Fold 03 class severe_toxic AUC: 0.988796\n",
192 | "[ 0 2 3 ... 159568 159569 159570]\n",
193 | "[ 1 16 39 ... 159546 159550 159564]\n",
194 | "\n",
195 | " Fold 04 class severe_toxic AUC: 0.987581\n",
196 | "[ 0 1 3 ... 159568 159569 159570]\n",
197 | "[ 2 17 19 ... 159517 159538 159566]\n",
198 | "\n",
199 | " Fold 05 class severe_toxic AUC: 0.981197\n",
200 | "[ 0 1 2 ... 159568 159569 159570]\n",
201 | "[ 8 18 20 ... 159544 159556 159562]\n",
202 | "\n",
203 | " Fold 06 class severe_toxic AUC: 0.990249\n",
204 | "[ 0 1 2 ... 159567 159568 159569]\n",
205 | "[ 5 22 40 ... 159557 159558 159570]\n",
206 | "\n",
207 | " Fold 07 class severe_toxic AUC: 0.988764\n",
208 | "[ 1 2 3 ... 159566 159569 159570]\n",
209 | "[ 0 21 27 ... 159528 159567 159568]\n",
210 | "\n",
211 | " Fold 08 class severe_toxic AUC: 0.978003\n",
212 | "[ 0 1 2 ... 159567 159568 159570]\n",
213 | "[ 3 11 26 ... 159554 159561 159569]\n",
214 | "\n",
215 | " Fold 09 class severe_toxic AUC: 0.989705\n",
216 | "[ 0 1 2 ... 159568 159569 159570]\n",
217 | "[ 14 23 29 ... 159541 159542 159565]\n",
218 | "\n",
219 | " Fold 10 class severe_toxic AUC: 0.991349\n",
220 | "\n",
221 | " Average class severe_toxic AUC:\t0.987354\n",
222 | " Out-of-fold class severe_toxic AUC:\t0.987371\n",
223 | "class_name is: obscene\n",
224 | "[ 0 1 2 ... 159568 159569 159570]\n",
225 | "[ 4 7 10 ... 159547 159548 159557]\n",
226 | "\n",
227 | " Fold 01 class obscene AUC: 0.992491\n",
228 | "[ 0 1 2 ... 159568 159569 159570]\n",
229 | "[ 24 30 33 ... 159559 159560 159563]\n",
230 | "\n",
231 | " Fold 02 class obscene AUC: 0.992949\n",
232 | "[ 0 1 2 ... 159568 159569 159570]\n",
233 | "[ 9 34 39 ... 159530 159531 159543]\n",
234 | "\n",
235 | " Fold 03 class obscene AUC: 0.992516\n",
236 | "[ 0 2 3 ... 159568 159569 159570]\n",
237 | "[ 1 16 28 ... 159538 159555 159564]\n",
238 | "\n",
239 | " Fold 04 class obscene AUC: 0.992658\n",
240 | "[ 0 1 3 ... 159568 159569 159570]\n",
241 | "[ 2 8 15 ... 159561 159562 159566]\n",
242 | "\n",
243 | " Fold 05 class obscene AUC: 0.993123\n",
244 | "[ 0 1 2 ... 159568 159569 159570]\n",
245 | "[ 6 18 20 ... 159515 159535 159541]\n",
246 | "\n",
247 | " Fold 06 class obscene AUC: 0.994014\n",
248 | "[ 1 2 3 ... 159566 159568 159569]\n",
249 | "[ 0 5 22 ... 159558 159567 159570]\n",
250 | "\n",
251 | " Fold 07 class obscene AUC: 0.994354\n",
252 | "[ 0 1 2 ... 159566 159567 159570]\n",
253 | "[ 3 11 21 ... 159552 159568 159569]\n",
254 | "\n",
255 | " Fold 08 class obscene AUC: 0.991395\n",
256 | "[ 0 1 2 ... 159568 159569 159570]\n",
257 | "[ 12 13 37 ... 159520 159536 159553]\n",
258 | "\n",
259 | " Fold 09 class obscene AUC: 0.991344\n",
260 | "[ 0 1 2 ... 159568 159569 159570]\n",
261 | "[ 14 23 29 ... 159542 159554 159565]\n",
262 | "\n",
263 | " Fold 10 class obscene AUC: 0.992631\n",
264 | "\n",
265 | " Average class obscene AUC:\t0.992748\n",
266 | " Out-of-fold class obscene AUC:\t0.992748\n",
267 | "class_name is: threat\n",
268 | "[ 0 1 2 ... 159568 159569 159570]\n",
269 | "[ 4 6 31 ... 159525 159547 159549]\n",
270 | "\n",
271 | " Fold 01 class threat AUC: 0.975375\n",
272 | "[ 0 1 2 ... 159568 159569 159570]\n",
273 | "[ 9 14 23 ... 159527 159532 159560]\n",
274 | "\n",
275 | " Fold 02 class threat AUC: 0.978111\n",
276 | "[ 0 1 2 ... 159568 159569 159570]\n",
277 | "[ 8 11 12 ... 159533 159555 159556]\n",
278 | "\n",
279 | " Fold 03 class threat AUC: 0.975833\n",
280 | "[ 0 2 3 ... 159568 159569 159570]\n",
281 | "[ 1 15 38 ... 159559 159564 159567]\n",
282 | "\n",
283 | " Fold 04 class threat AUC: 0.993123\n",
284 | "[ 0 1 3 ... 159568 159569 159570]\n",
285 | "[ 2 16 18 ... 159517 159538 159566]\n",
286 | "\n",
287 | " Fold 05 class threat AUC: 0.992293\n",
288 | "[ 0 1 2 ... 159568 159569 159570]\n",
289 | "[ 7 17 19 ... 159561 159562 159563]\n",
290 | "\n",
291 | " Fold 06 class threat AUC: 0.990553\n",
292 | "[ 0 1 2 ... 159567 159568 159569]\n",
293 | "[ 5 21 39 ... 159557 159558 159570]\n",
294 | "\n",
295 | " Fold 07 class threat AUC: 0.985213\n",
296 | "[ 1 2 3 ... 159567 159569 159570]\n",
297 | "[ 0 20 26 ... 159528 159551 159568]\n",
298 | "\n",
299 | " Fold 08 class threat AUC: 0.992530\n",
300 | "[ 0 1 2 ... 159567 159568 159570]\n",
301 | "[ 3 10 25 ... 159553 159554 159569]\n",
302 | "\n",
303 | " Fold 09 class threat AUC: 0.993273\n",
304 | "[ 0 1 2 ... 159568 159569 159570]\n",
305 | "[ 13 22 28 ... 159541 159542 159565]\n",
306 | "\n",
307 | " Fold 10 class threat AUC: 0.965121\n",
308 | "\n",
309 | " Average class threat AUC:\t0.984142\n",
310 | " Out-of-fold class threat AUC:\t0.984083\n",
311 | "class_name is: insult\n",
312 | "[ 0 1 2 ... 159568 159569 159570]\n",
313 | "[ 4 7 10 ... 159517 159537 159567]\n",
314 | "\n",
315 | " Fold 01 class insult AUC: 0.983587\n",
316 | "[ 0 1 2 ... 159568 159569 159570]\n",
317 | "[ 24 30 33 ... 159548 159554 159557]\n",
318 | "\n",
319 | " Fold 02 class insult AUC: 0.987092\n",
320 | "[ 0 1 2 ... 159568 159569 159570]\n",
321 | "[ 9 34 39 ... 159529 159542 159556]\n",
322 | "\n",
323 | " Fold 03 class insult AUC: 0.982300\n",
324 | "[ 0 2 3 ... 159568 159569 159570]\n",
325 | "[ 1 16 28 ... 159541 159555 159564]\n",
326 | "\n",
327 | " Fold 04 class insult AUC: 0.982773\n",
328 | "[ 0 1 3 ... 159568 159569 159570]\n",
329 | "[ 2 8 15 ... 159559 159562 159566]\n",
330 | "\n",
331 | " Fold 05 class insult AUC: 0.982130\n",
332 | "[ 0 1 2 ... 159568 159569 159570]\n",
333 | "[ 6 18 20 ... 159543 159546 159561]\n",
334 | "\n",
335 | " Fold 06 class insult AUC: 0.985141\n",
336 | "[ 1 2 3 ... 159567 159568 159569]\n",
337 | "[ 0 5 22 ... 159551 159558 159570]\n",
338 | "\n",
339 | " Fold 07 class insult AUC: 0.983921\n",
340 | "[ 0 1 2 ... 159566 159567 159570]\n",
341 | "[ 3 11 21 ... 159563 159568 159569]\n",
342 | "\n",
343 | " Fold 08 class insult AUC: 0.987698\n",
344 | "[ 0 1 2 ... 159568 159569 159570]\n",
345 | "[ 12 13 37 ... 159519 159535 159553]\n",
346 | "\n",
347 | " Fold 09 class insult AUC: 0.983639\n",
348 | "[ 0 1 2 ... 159568 159569 159570]\n",
349 | "[ 14 23 29 ... 159539 159540 159565]\n",
350 | "\n",
351 | " Fold 10 class insult AUC: 0.987121\n",
352 | "\n",
353 | " Average class insult AUC:\t0.984540\n",
354 | " Out-of-fold class insult AUC:\t0.984541\n",
355 | "class_name is: identity_hate\n",
356 | "[ 0 1 2 ... 159568 159569 159570]\n",
357 | "[ 4 6 31 ... 159555 159560 159561]\n",
358 | "\n",
359 | " Fold 01 class identity_hate AUC: 0.982784\n",
360 | "[ 0 1 2 ... 159568 159569 159570]\n",
361 | "[ 9 14 23 ... 159548 159549 159559]\n",
362 | "\n",
363 | " Fold 02 class identity_hate AUC: 0.977440\n",
364 | "[ 0 1 2 ... 159568 159569 159570]\n",
365 | "[ 8 11 12 ... 159539 159547 159550]\n",
366 | "\n",
367 | " Fold 03 class identity_hate AUC: 0.970181\n",
368 | "[ 0 2 3 ... 159568 159569 159570]\n",
369 | "[ 1 15 38 ... 159534 159543 159564]\n",
370 | "\n",
371 | " Fold 04 class identity_hate AUC: 0.984155\n",
372 | "[ 0 1 3 ... 159568 159569 159570]\n",
373 | "[ 2 16 18 ... 159525 159538 159566]\n",
374 | "\n",
375 | " Fold 05 class identity_hate AUC: 0.980120\n",
376 | "[ 0 1 2 ... 159568 159569 159570]\n",
377 | "[ 7 17 19 ... 159546 159562 159567]\n",
378 | "\n",
379 | " Fold 06 class identity_hate AUC: 0.976925\n",
380 | "[ 0 1 2 ... 159567 159568 159569]\n",
381 | "[ 5 21 39 ... 159557 159558 159570]\n",
382 | "\n",
383 | " Fold 07 class identity_hate AUC: 0.980144\n",
384 | "[ 1 2 3 ... 159567 159569 159570]\n",
385 | "[ 0 20 26 ... 159528 159563 159568]\n",
386 | "\n",
387 | " Fold 08 class identity_hate AUC: 0.978127\n",
388 | "[ 0 1 2 ... 159567 159568 159570]\n",
389 | "[ 3 10 25 ... 159553 159554 159569]\n",
390 | "\n",
391 | " Fold 09 class identity_hate AUC: 0.986655\n",
392 | "[ 0 1 2 ... 159568 159569 159570]\n",
393 | "[ 13 22 28 ... 159541 159542 159565]\n",
394 | "\n",
395 | " Fold 10 class identity_hate AUC: 0.984381\n",
396 | "\n",
397 | " Average class identity_hate AUC:\t0.980091\n",
398 | " Out-of-fold class identity_hate AUC:\t0.980032\n"
399 | ]
400 | }
401 | ],
402 | "source": [
403 | "\n",
404 | "for j, (class_name) in enumerate(class_names):\n",
405 | " \n",
406 | " print('class_name is: ' + class_name)\n",
407 | " avreal = target[class_name]\n",
408 | " lr_cv_sum = 0\n",
409 | " lr_test_pred = np.zeros(test.shape[0])\n",
410 | " lr_avpred = np.zeros(train.shape[0])\n",
411 | " \n",
412 | " for i, (train_index, val_index) in enumerate(kfolder.split(train_features, avreal)):\n",
413 | " print(train_index)\n",
414 | " print(val_index)\n",
415 | " X_train, X_val = train_features[train_index], train_features[val_index]\n",
416 | " y_train, y_val = target.loc[train_index], target.loc[val_index]\n",
417 | "\n",
418 | " classifier = Ridge(alpha=20, copy_X=True, fit_intercept=True, solver='auto',max_iter=100,normalize=False, random_state=0, tol=0.0025)\n",
419 | " \n",
420 | " #classifier = Lasso(alpha=0.1,normalize=True, max_iter=1e5)\n",
421 | " # classifier = ElasticNet(alpha=1.0, l1_ratio =0.5)\n",
422 | " classifier.fit(X_train, y_train[class_name])\n",
423 | " scores_val = classifier.predict(X_val)\n",
424 | " lr_avpred[val_index] = scores_val\n",
425 | " lr_test_pred += classifier.predict(test_features)\n",
426 | " scores_classes[j][i] = roc_auc_score(y_val[class_name], scores_val)\n",
427 | " print('\\n Fold %02d class %s AUC: %.6f' % ((i+1), class_name, scores_classes[j][i]))\n",
428 | "\n",
429 | " lr_cv_score = (lr_cv_sum / number_of_folds)\n",
430 | " lr_oof_auc = roc_auc_score(avreal, lr_avpred)\n",
431 | " print('\\n Average class %s AUC:\\t%.6f' % (class_name, np.mean(scores_classes[j])))\n",
432 | " print(' Out-of-fold class %s AUC:\\t%.6f' % (class_name, lr_oof_auc))\n",
433 | "\n",
434 | " submission[class_name] = lr_test_pred / number_of_folds\n",
435 | " submission_oof[class_name] = lr_avpred\n",
436 | "\n",
437 | "#print('\\n Overall AUC:\\t%.6f' % (np.mean(scores_classes)))\n",
438 | "submission.to_csv('10-fold_elast_test.csv', index=False)\n",
439 | "submission_oof.to_csv('10-fold_ridge_train.csv', index=False)"
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": null,
445 | "metadata": {},
446 | "outputs": [],
447 | "source": []
448 | }
449 | ],
450 | "metadata": {
451 | "kernelspec": {
452 | "display_name": "Python 2",
453 | "language": "python",
454 | "name": "python2"
455 | },
456 | "language_info": {
457 | "codemirror_mode": {
458 | "name": "ipython",
459 | "version": 2
460 | },
461 | "file_extension": ".py",
462 | "mimetype": "text/x-python",
463 | "name": "python",
464 | "nbconvert_exporter": "python",
465 | "pygments_lexer": "ipython2",
466 | "version": "2.7.14"
467 | }
468 | },
469 | "nbformat": 4,
470 | "nbformat_minor": 2
471 | }
472 |
--------------------------------------------------------------------------------
/Untitled.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "metadata": {},
17 | "outputs": [
18 | {
19 | "name": "stdout",
20 | "output_type": "stream",
21 | "text": [
22 | "(4995, 6)\n",
23 | "(100202, 6)\n"
24 | ]
25 | }
26 | ],
27 | "source": [
28 | "predictions = pd.read_csv('nb36.csv', index_col=0)\n",
29 | "test = pd.read_csv('test_translated_clean.csv', index_col=0)\n",
30 | "\n",
31 | "def find_good_predicts(valids, column):\n",
32 | " b1 = valids[column] > 0.96\n",
33 | " b2 = valids[column] < 0.1\n",
34 | " c = valids[b1|b2]\n",
35 | " print(valids[b1].shape)\n",
36 | " print(valids[b2].shape)\n",
37 | " return c\n",
38 | "\n",
39 | "good_predictions = find_good_predicts(predictions,'toxic')\n",
40 | "g = good_predictions.join(test)\n",
41 | "train = pd.read_csv('train_translated_clean.csv',index_col=0)\n",
42 | "new_train = pd.concat([train,g])\n",
43 | "\n"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 4,
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "data": {
53 | "text/html": [
54 | "\n",
55 | "\n",
68 | "
\n",
69 | " \n",
70 | " \n",
71 | " | \n",
72 | " comment_text | \n",
73 | " comment_text_english | \n",
74 | " id | \n",
75 | " identity_hate | \n",
76 | " insult | \n",
77 | " lang | \n",
78 | " obscene | \n",
79 | " severe_toxic | \n",
80 | " threat | \n",
81 | " toxic | \n",
82 | "
\n",
83 | " \n",
84 | " \n",
85 | " \n",
86 | " | 0 | \n",
87 | " explanation why the edits made under my userna... | \n",
88 | " Explanation\\nWhy the edits made under my usern... | \n",
89 | " 0000997932d777bf | \n",
90 | " 0.0 | \n",
91 | " 0.0 | \n",
92 | " en | \n",
93 | " 0.0 | \n",
94 | " 0.0 | \n",
95 | " 0.0 | \n",
96 | " 0.0 | \n",
97 | "
\n",
98 | " \n",
99 | " | 1 | \n",
100 | " d aww he matches this background colour i am s... | \n",
101 | " D'aww! He matches this background colour I'm s... | \n",
102 | " 000103f0d9cfb60f | \n",
103 | " 0.0 | \n",
104 | " 0.0 | \n",
105 | " en | \n",
106 | " 0.0 | \n",
107 | " 0.0 | \n",
108 | " 0.0 | \n",
109 | " 0.0 | \n",
110 | "
\n",
111 | " \n",
112 | " | 2 | \n",
113 | " hey man i am really not trying to edit war it ... | \n",
114 | " Hey man, I'm really not trying to edit war. It... | \n",
115 | " 000113f07ec002fd | \n",
116 | " 0.0 | \n",
117 | " 0.0 | \n",
118 | " en | \n",
119 | " 0.0 | \n",
120 | " 0.0 | \n",
121 | " 0.0 | \n",
122 | " 0.0 | \n",
123 | "
\n",
124 | " \n",
125 | " | 3 | \n",
126 | " more i can not make any real suggestions on im... | \n",
127 | " \"\\nMore\\nI can't make any real suggestions on ... | \n",
128 | " 0001b41b1c6bb37e | \n",
129 | " 0.0 | \n",
130 | " 0.0 | \n",
131 | " en | \n",
132 | " 0.0 | \n",
133 | " 0.0 | \n",
134 | " 0.0 | \n",
135 | " 0.0 | \n",
136 | "
\n",
137 | " \n",
138 | " | 4 | \n",
139 | " you sir are my hero any chance you remember wh... | \n",
140 | " You, sir, are my hero. Any chance you remember... | \n",
141 | " 0001d958c54c6e35 | \n",
142 | " 0.0 | \n",
143 | " 0.0 | \n",
144 | " en | \n",
145 | " 0.0 | \n",
146 | " 0.0 | \n",
147 | " 0.0 | \n",
148 | " 0.0 | \n",
149 | "
\n",
150 | " \n",
151 | " | 5 | \n",
152 | " congratulations from me as well use the tools ... | \n",
153 | " \"\\n\\nCongratulations from me as well, use the ... | \n",
154 | " 00025465d4725e87 | \n",
155 | " 0.0 | \n",
156 | " 0.0 | \n",
157 | " en | \n",
158 | " 0.0 | \n",
159 | " 0.0 | \n",
160 | " 0.0 | \n",
161 | " 0.0 | \n",
162 | "
\n",
163 | " \n",
164 | " | 6 | \n",
165 | " cocksucker before you piss around on my work | \n",
166 | " COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK | \n",
167 | " 0002bcb3da6cb337 | \n",
168 | " 0.0 | \n",
169 | " 1.0 | \n",
170 | " en | \n",
171 | " 1.0 | \n",
172 | " 1.0 | \n",
173 | " 0.0 | \n",
174 | " 1.0 | \n",
175 | "
\n",
176 | " \n",
177 | " | 7 | \n",
178 | " your vandalism to the matt shirvington article... | \n",
179 | " Your vandalism to the Matt Shirvington article... | \n",
180 | " 00031b1e95af7921 | \n",
181 | " 0.0 | \n",
182 | " 0.0 | \n",
183 | " en | \n",
184 | " 0.0 | \n",
185 | " 0.0 | \n",
186 | " 0.0 | \n",
187 | " 0.0 | \n",
188 | "
\n",
189 | " \n",
190 | " | 8 | \n",
191 | " sorry if the word nonsense was offensive to yo... | \n",
192 | " Sorry if the word 'nonsense' was offensive to ... | \n",
193 | " 00037261f536c51d | \n",
194 | " 0.0 | \n",
195 | " 0.0 | \n",
196 | " en | \n",
197 | " 0.0 | \n",
198 | " 0.0 | \n",
199 | " 0.0 | \n",
200 | " 0.0 | \n",
201 | "
\n",
202 | " \n",
203 | " | 9 | \n",
204 | " alignment on this subject and which are contra... | \n",
205 | " alignment on this subject and which are contra... | \n",
206 | " 00040093b2687caa | \n",
207 | " 0.0 | \n",
208 | " 0.0 | \n",
209 | " en | \n",
210 | " 0.0 | \n",
211 | " 0.0 | \n",
212 | " 0.0 | \n",
213 | " 0.0 | \n",
214 | "
\n",
215 | " \n",
216 | " | 10 | \n",
217 | " fair use rationale for image wonju jpg thanks ... | \n",
218 | " \"\\nFair use rationale for Image:Wonju.jpg\\n\\nT... | \n",
219 | " 0005300084f90edc | \n",
220 | " 0.0 | \n",
221 | " 0.0 | \n",
222 | " en | \n",
223 | " 0.0 | \n",
224 | " 0.0 | \n",
225 | " 0.0 | \n",
226 | " 0.0 | \n",
227 | "
\n",
228 | " \n",
229 | " | 11 | \n",
230 | " bbq be a man and lets discuss it maybe over th... | \n",
231 | " bbq \\n\\nbe a man and lets discuss it-maybe ove... | \n",
232 | " 00054a5e18b50dd4 | \n",
233 | " 0.0 | \n",
234 | " 0.0 | \n",
235 | " en | \n",
236 | " 0.0 | \n",
237 | " 0.0 | \n",
238 | " 0.0 | \n",
239 | " 0.0 | \n",
240 | "
\n",
241 | " \n",
242 | " | 12 | \n",
243 | " hey what is it talk what is it an exclusive gr... | \n",
244 | " Hey... what is it..\\n@ | talk .\\nWhat is it...... | \n",
245 | " 0005c987bdfc9d4b | \n",
246 | " 0.0 | \n",
247 | " 0.0 | \n",
248 | " en | \n",
249 | " 0.0 | \n",
250 | " 0.0 | \n",
251 | " 0.0 | \n",
252 | " 1.0 | \n",
253 | "
\n",
254 | " \n",
255 | " | 13 | \n",
256 | " before you start throwing accusations and warn... | \n",
257 | " Before you start throwing accusations and warn... | \n",
258 | " 0006f16e4e9f292e | \n",
259 | " 0.0 | \n",
260 | " 0.0 | \n",
261 | " en | \n",
262 | " 0.0 | \n",
263 | " 0.0 | \n",
264 | " 0.0 | \n",
265 | " 0.0 | \n",
266 | "
\n",
267 | " \n",
268 | " | 14 | \n",
269 | " oh and the girl above started her arguments wi... | \n",
270 | " Oh, and the girl above started her arguments w... | \n",
271 | " 00070ef96486d6f9 | \n",
272 | " 0.0 | \n",
273 | " 0.0 | \n",
274 | " en | \n",
275 | " 0.0 | \n",
276 | " 0.0 | \n",
277 | " 0.0 | \n",
278 | " 0.0 | \n",
279 | "
\n",
280 | " \n",
281 | " | 15 | \n",
282 | " juelz santanas age in two zero zero two juelz ... | \n",
283 | " \"\\n\\nJuelz Santanas Age\\n\\nIn 2002, Juelz Sant... | \n",
284 | " 00078f8ce7eb276d | \n",
285 | " 0.0 | \n",
286 | " 0.0 | \n",
287 | " en | \n",
288 | " 0.0 | \n",
289 | " 0.0 | \n",
290 | " 0.0 | \n",
291 | " 0.0 | \n",
292 | "
\n",
293 | " \n",
294 | " | 16 | \n",
295 | " bye do not look come or think of comming back ... | \n",
296 | " Bye! \\n\\nDon't look, come or think of comming ... | \n",
297 | " 0007e25b2121310b | \n",
298 | " 0.0 | \n",
299 | " 0.0 | \n",
300 | " en | \n",
301 | " 0.0 | \n",
302 | " 0.0 | \n",
303 | " 0.0 | \n",
304 | " 1.0 | \n",
305 | "
\n",
306 | " \n",
307 | " | 17 | \n",
308 | " redirect talk voydan pop georgiev chernodrinski | \n",
309 | " REDIRECT Talk:Voydan Pop Georgiev- Chernodrinski | \n",
310 | " 000897889268bc93 | \n",
311 | " 0.0 | \n",
312 | " 0.0 | \n",
313 | " af | \n",
314 | " 0.0 | \n",
315 | " 0.0 | \n",
316 | " 0.0 | \n",
317 | " 0.0 | \n",
318 | "
\n",
319 | " \n",
320 | " | 18 | \n",
321 | " the mitsurugi point made no sense why not argu... | \n",
322 | " The Mitsurugi point made no sense - why not ar... | \n",
323 | " 0009801bd85e5806 | \n",
324 | " 0.0 | \n",
325 | " 0.0 | \n",
326 | " en | \n",
327 | " 0.0 | \n",
328 | " 0.0 | \n",
329 | " 0.0 | \n",
330 | " 0.0 | \n",
331 | "
\n",
332 | " \n",
333 | " | 19 | \n",
334 | " do not mean to bother you i see that you re wr... | \n",
335 | " Don't mean to bother you \\n\\nI see that you're... | \n",
336 | " 0009eaea3325de8c | \n",
337 | " 0.0 | \n",
338 | " 0.0 | \n",
339 | " en | \n",
340 | " 0.0 | \n",
341 | " 0.0 | \n",
342 | " 0.0 | \n",
343 | " 0.0 | \n",
344 | "
\n",
345 | " \n",
346 | "
\n",
347 | "
"
348 | ],
349 | "text/plain": [
350 | " comment_text \\\n",
351 | "0 explanation why the edits made under my userna... \n",
352 | "1 d aww he matches this background colour i am s... \n",
353 | "2 hey man i am really not trying to edit war it ... \n",
354 | "3 more i can not make any real suggestions on im... \n",
355 | "4 you sir are my hero any chance you remember wh... \n",
356 | "5 congratulations from me as well use the tools ... \n",
357 | "6 cocksucker before you piss around on my work \n",
358 | "7 your vandalism to the matt shirvington article... \n",
359 | "8 sorry if the word nonsense was offensive to yo... \n",
360 | "9 alignment on this subject and which are contra... \n",
361 | "10 fair use rationale for image wonju jpg thanks ... \n",
362 | "11 bbq be a man and lets discuss it maybe over th... \n",
363 | "12 hey what is it talk what is it an exclusive gr... \n",
364 | "13 before you start throwing accusations and warn... \n",
365 | "14 oh and the girl above started her arguments wi... \n",
366 | "15 juelz santanas age in two zero zero two juelz ... \n",
367 | "16 bye do not look come or think of comming back ... \n",
368 | "17 redirect talk voydan pop georgiev chernodrinski \n",
369 | "18 the mitsurugi point made no sense why not argu... \n",
370 | "19 do not mean to bother you i see that you re wr... \n",
371 | "\n",
372 | " comment_text_english id \\\n",
373 | "0 Explanation\\nWhy the edits made under my usern... 0000997932d777bf \n",
374 | "1 D'aww! He matches this background colour I'm s... 000103f0d9cfb60f \n",
375 | "2 Hey man, I'm really not trying to edit war. It... 000113f07ec002fd \n",
376 | "3 \"\\nMore\\nI can't make any real suggestions on ... 0001b41b1c6bb37e \n",
377 | "4 You, sir, are my hero. Any chance you remember... 0001d958c54c6e35 \n",
378 | "5 \"\\n\\nCongratulations from me as well, use the ... 00025465d4725e87 \n",
379 | "6 COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK 0002bcb3da6cb337 \n",
380 | "7 Your vandalism to the Matt Shirvington article... 00031b1e95af7921 \n",
381 | "8 Sorry if the word 'nonsense' was offensive to ... 00037261f536c51d \n",
382 | "9 alignment on this subject and which are contra... 00040093b2687caa \n",
383 | "10 \"\\nFair use rationale for Image:Wonju.jpg\\n\\nT... 0005300084f90edc \n",
384 | "11 bbq \\n\\nbe a man and lets discuss it-maybe ove... 00054a5e18b50dd4 \n",
385 | "12 Hey... what is it..\\n@ | talk .\\nWhat is it...... 0005c987bdfc9d4b \n",
386 | "13 Before you start throwing accusations and warn... 0006f16e4e9f292e \n",
387 | "14 Oh, and the girl above started her arguments w... 00070ef96486d6f9 \n",
388 | "15 \"\\n\\nJuelz Santanas Age\\n\\nIn 2002, Juelz Sant... 00078f8ce7eb276d \n",
389 | "16 Bye! \\n\\nDon't look, come or think of comming ... 0007e25b2121310b \n",
390 | "17 REDIRECT Talk:Voydan Pop Georgiev- Chernodrinski 000897889268bc93 \n",
391 | "18 The Mitsurugi point made no sense - why not ar... 0009801bd85e5806 \n",
392 | "19 Don't mean to bother you \\n\\nI see that you're... 0009eaea3325de8c \n",
393 | "\n",
394 | " identity_hate insult lang obscene severe_toxic threat toxic \n",
395 | "0 0.0 0.0 en 0.0 0.0 0.0 0.0 \n",
396 | "1 0.0 0.0 en 0.0 0.0 0.0 0.0 \n",
397 | "2 0.0 0.0 en 0.0 0.0 0.0 0.0 \n",
398 | "3 0.0 0.0 en 0.0 0.0 0.0 0.0 \n",
399 | "4 0.0 0.0 en 0.0 0.0 0.0 0.0 \n",
400 | "5 0.0 0.0 en 0.0 0.0 0.0 0.0 \n",
401 | "6 0.0 1.0 en 1.0 1.0 0.0 1.0 \n",
402 | "7 0.0 0.0 en 0.0 0.0 0.0 0.0 \n",
403 | "8 0.0 0.0 en 0.0 0.0 0.0 0.0 \n",
404 | "9 0.0 0.0 en 0.0 0.0 0.0 0.0 \n",
405 | "10 0.0 0.0 en 0.0 0.0 0.0 0.0 \n",
406 | "11 0.0 0.0 en 0.0 0.0 0.0 0.0 \n",
407 | "12 0.0 0.0 en 0.0 0.0 0.0 1.0 \n",
408 | "13 0.0 0.0 en 0.0 0.0 0.0 0.0 \n",
409 | "14 0.0 0.0 en 0.0 0.0 0.0 0.0 \n",
410 | "15 0.0 0.0 en 0.0 0.0 0.0 0.0 \n",
411 | "16 0.0 0.0 en 0.0 0.0 0.0 1.0 \n",
412 | "17 0.0 0.0 af 0.0 0.0 0.0 0.0 \n",
413 | "18 0.0 0.0 en 0.0 0.0 0.0 0.0 \n",
414 | "19 0.0 0.0 en 0.0 0.0 0.0 0.0 "
415 | ]
416 | },
417 | "execution_count": 4,
418 | "metadata": {},
419 | "output_type": "execute_result"
420 | }
421 | ],
422 | "source": [
423 | "new_train.head(n=20)"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": 9,
429 | "metadata": {},
430 | "outputs": [],
431 | "source": [
432 | "fn = 'train_' + str(new_train.shape[0]) + '.csv'\n",
433 | "new_train.to_csv(fn)"
434 | ]
435 | },
436 | {
437 | "cell_type": "code",
438 | "execution_count": 18,
439 | "metadata": {},
440 | "outputs": [],
441 | "source": [
442 | "train = 'clean_train_ori_third.csv'\n",
443 | "train = pd.read_csv(train)\n",
444 | "labels = ['toxic']\n",
445 | "train_target = train[labels].values\n",
446 | "kf_label = np.ones(train_target.shape)\n",
447 | "for i in range(train_target.shape[1]):\n",
448 | " kf_label[:,i] = 2**i\n",
449 | "kf_label = np.sum(kf_label, axis=1)"
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "execution_count": 19,
455 | "metadata": {},
456 | "outputs": [
457 | {
458 | "name": "stdout",
459 | "output_type": "stream",
460 | "text": [
461 | "[ 1. 1. 1. ..., 1. 1. 1.]\n"
462 | ]
463 | }
464 | ],
465 | "source": [
466 | "print(kf_label)"
467 | ]
468 | },
469 | {
470 | "cell_type": "code",
471 | "execution_count": 17,
472 | "metadata": {},
473 | "outputs": [
474 | {
475 | "data": {
476 | "text/plain": [
477 | "6"
478 | ]
479 | },
480 | "execution_count": 17,
481 | "metadata": {},
482 | "output_type": "execute_result"
483 | }
484 | ],
485 | "source": [
486 | "train_target.shape[1]"
487 | ]
488 | },
489 | {
490 | "cell_type": "code",
491 | "execution_count": null,
492 | "metadata": {},
493 | "outputs": [],
494 | "source": []
495 | }
496 | ],
497 | "metadata": {
498 | "kernelspec": {
499 | "display_name": "Python 2",
500 | "language": "python",
501 | "name": "python2"
502 | },
503 | "language_info": {
504 | "codemirror_mode": {
505 | "name": "ipython",
506 | "version": 2
507 | },
508 | "file_extension": ".py",
509 | "mimetype": "text/x-python",
510 | "name": "python",
511 | "nbconvert_exporter": "python",
512 | "pygments_lexer": "ipython2",
513 | "version": "2.7.14"
514 | }
515 | },
516 | "nbformat": 4,
517 | "nbformat_minor": 2
518 | }
519 |
--------------------------------------------------------------------------------
/fasttext_direct.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "_cell_guid": "5c86b9bc-a478-4da9-adcc-b97ca0fbd0c9",
7 | "_uuid": "53d59528c17da4c1d9759786326a82d0c8765804"
8 | },
9 | "source": [
10 | "# Using FastText models (not vectors) for robust embeddings\n",
11 | "\n",
12 | "I'd like to explain my approach of using pretrained FastText models as input to Keras Neural Networks. FastText is a word embedding not unlike Word2Vec or GloVe, but the cool thing is that each word vector is based on sub-word character n-grams. That means that even for previously unseen words (e.g. due to typos), the model can make an educated guess towards its meaning. To find out more about FastText, check out both their [Github](https://github.com/facebookresearch/fastText/) and [website](https://fasttext.cc/).\n",
13 | "\n",
14 | "To do this, we won't be using the classic Keras embedding layer and instead hand-craft the embedding for each example. As a result, we need to write more code and invest some time into preprocessing, but that is easily justified by the results.\n",
15 | "\n",
16 | "**Disclaimer: Loading the FastText model will take some serious memory! I recommend having at least 60 GB of RAM. EC2's p2.xlarge instance should have no problems with this, but you can always [add some swap](https://stackoverflow.com/questions/17173972/how-do-you-add-swap-to-an-ec2-instance) for good measure. I also added a section below to build a training generator for this.**"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "_cell_guid": "b88f84eb-051c-4fe8-8fc0-70365a0b9693",
23 | "_uuid": "168f1efdabebb51c2922aab58231921599f9348f"
24 | },
25 | "source": [
26 | "## Preparations: Getting FastText and the model\n",
27 | "\n",
28 | "First, build FastText from sources as described [here](https://github.com/facebookresearch/fastText#requirements). Don't worry, there's nothing crazy you have to do and it will finish in less than a minute. Next, install the Python package in your virtualenv following [these instructions](https://github.com/facebookresearch/fastText/tree/master/python).\n",
29 | "\n",
30 | "For the model, I use the one pretrained on English Wikipedia. I'd love to have one trained on Twitter or similar, since it might be more internet-slangy, but I haven't found any yet and don't feel like pretraining one myself. Download the model [here](https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md). Make sure you get the bin, not just the vec (text) file. I'll assume you placed it (or a symlink to it) into your code directory and named it `ft_model.bin`."
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {
36 | "_cell_guid": "eea9d34b-3205-421f-bfd8-6e29f8092ddd",
37 | "_uuid": "e4542f71483666b9168dacc6949bcbaff8b66642"
38 | },
39 | "source": [
40 | "## Preparations: Exploring the model\n",
41 | "\n",
42 | "Let's explore the model! Go to your FastText directory and run `./fasttext nn `. Now you can enter some terms and see the nearest neighbors to this word in the embedding space. Here are some examples:\n",
43 | "\n",
44 | "```\n",
45 | "Query word? queen\n",
46 | "—queen 0.719091\n",
47 | "‘queen 0.692849\n",
48 | "#queen 0.656498\n",
49 | "queena 0.650313\n",
50 | "king 0.64931\n",
51 | "queen`s 0.63954\n",
52 | "king/queen 0.634855\n",
53 | "s/queen 0.627386\n",
54 | "princess 0.623889\n",
55 | "queeny 0.620919\n",
56 | "```\n",
57 | "\n",
58 | "Ok that looks pretty ugly. I suppose Facebook was not very exact in their cleaning of the input data. But some sensible suggestions are there: `king` and `princess`! Let's try a typo that is unlikely to have appeared in the original data:\n",
59 | "\n",
60 | "```\n",
61 | "Query word? dimensionnallity\n",
62 | "dimension, 0.722278\n",
63 | "dimensionality 0.708645\n",
64 | "dimensionful 0.698573\n",
65 | "codimension 0.689754\n",
66 | "codimensions 0.67555\n",
67 | "twodimensional 0.674745\n",
68 | "dimension 0.67258\n",
69 | "\\,kdimensional 0.668848\n",
70 | "‘dimensions 0.665725\n",
71 | "two–dimensional 0.665109\n",
72 | "```\n",
73 | "\n",
74 | "Sweet! Even though it has never seen that word, it recognizes it to be related with \"dimensionality\". Let's try some something mean:\n",
75 | "\n",
76 | "```\n",
77 | "Query word? dumb\n",
78 | "stupid 0.746051\n",
79 | "dumber 0.732965\n",
80 | "clueless 0.662594\n",
81 | "idiotic 0.64993\n",
82 | "silly 0.632314\n",
83 | "stupidstitious 0.628875\n",
84 | "stupidly 0.622968\n",
85 | "moronic 0.621633\n",
86 | "ignorant 0.620475\n",
87 | "stupider 0.617377\n",
88 | "```\n",
89 | "\n",
90 | "Nice! Even though this was trained on Wikipedia, we're getting at least some basic insults. I'll leave it to you to explore the really hateful words. They all seem to be there ;)\n",
91 | "\n",
92 | "**Note:** Keep in mind that exploring the nearest neighbors is a very superficial approach to understanding the model! The embedding space has 300 dimensions, and we boil them down to a single distance metric. We can't be sure in which dimensions these words are related to each other, but we can trust in the model to have learnt something sensible.\n",
93 | "\n",
94 | "**Pro tip:** Our data should be cleaned and normalized in a similar way as Facebook did before they trained this model. We can query the model to get some insights into what they did, e.g.\n",
95 | "\n",
96 | "```\n",
97 | "Query word? 1\n",
98 | "insel 0.483141\n",
99 | "inseln 0.401125\n",
100 | "...\n",
101 | "Query word? one\n",
102 | "two 0.692744\n",
103 | "three 0.676568\n",
104 | "...\n",
105 | "```\n",
106 | "\n",
107 | "This tells us they converted all numbers to their text equivalent, and so should we!"
108 | ]
109 | },
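110 | {
111 | "cell_type": "markdown",
112 | "metadata": {},
113 | "source": [
114 | "The same check can be run from Python once the model file is available. Below is a minimal sketch (illustrative only, not part of the original pipeline): it compares a misspelled word with its correct spelling via cosine similarity, assuming the `wiki.en.bin` file that is downloaded and loaded later in this notebook, and it only uses `load_model` and `get_word_vector`, the same calls used further down."
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "# Minimal sketch (illustrative only): cosine similarity between an out-of-vocabulary\n",
124 | "# typo and the real word. Assumes wiki.en.bin is available in the working directory.\n",
125 | "import numpy as np\n",
126 | "from fastText import load_model\n",
127 | "\n",
128 | "ft = load_model('wiki.en.bin')\n",
129 | "\n",
130 | "def cosine(a, b):\n",
131 | "    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))\n",
132 | "\n",
133 | "v_typo = ft.get_word_vector('dimensionnallity')  # never seen during training\n",
134 | "v_real = ft.get_word_vector('dimensionality')\n",
135 | "print(cosine(v_typo, v_real))  # clearly positive thanks to shared char n-grams"
136 | ]
137 | },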
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {
113 | "_cell_guid": "2f03f7c4-ecc8-47d3-8c59-9e4482e7684a",
114 | "_uuid": "ae7a32ce6f66748656faecde4d079857b992ac1e"
115 | },
116 | "source": [
117 | "## Loading and cleaning the data\n",
118 | "\n",
119 | "We define a method `normalize` to clean and prepare a single string. We will use it later to prepare our string data. Also, we load the data as we're used to:"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 73,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": [
128 | "\n",
129 | "from __future__ import absolute_import\n",
130 | "from __future__ import division\n",
131 | "from __future__ import print_function\n",
132 | "import cPickle\n",
133 | "import json\n",
134 | "import os\n",
135 | "import numpy as np\n",
136 | "from keras.callbacks import EarlyStopping\n",
137 | "from keras.callbacks import ModelCheckpoint\n",
138 | "from keras.layers import Conv1D\n",
139 | "from keras.layers import Dense\n",
140 | "from keras.layers import Reshape\n",
141 | "from keras.layers import Dropout\n",
142 | "from keras.layers import Bidirectional\n",
143 | "from keras.layers import LSTM\n",
144 | "from keras.layers import concatenate\n",
145 | "from keras.layers import Embedding\n",
146 | "from keras.layers import Embedding\n",
147 | "from keras import regularizers\n",
148 | "from keras.layers import Flatten\n",
149 | "from keras.layers import GlobalMaxPooling1D\n",
150 | "from keras.layers import Input\n",
151 | "from keras.layers import MaxPooling1D\n",
152 | "from keras.layers import CuDNNGRU\n",
153 | "from keras.models import load_model\n",
154 | "from keras.models import Model\n",
155 | "from keras.optimizers import RMSprop\n",
156 | "from keras.optimizers import Adam\n",
157 | "from keras.preprocessing.sequence import pad_sequences\n",
158 | "from keras.preprocessing.text import Tokenizer\n",
159 | "from keras.utils import to_categorical\n",
160 | "from keras import backend as K\n",
161 | "from keras.engine.topology import Layer\n",
162 | "from keras import initializers, regularizers, constraints\n",
163 | "from keras.layers.normalization import BatchNormalization\n",
164 | "from keras.layers.advanced_activations import LeakyReLU, PReLU\n",
165 | "from keras.models import Sequential\n",
166 | "import nltk\n",
167 | "from keras.optimizers import Nadam\n"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": 13,
173 | "metadata": {
174 | "_cell_guid": "423398cb-482b-4dc3-9904-82c2d17d2e2c",
175 | "_kg_hide-output": true,
176 | "_uuid": "fa74d030d08aff58c32455cd4218d9ff0ef494d9"
177 | },
178 | "outputs": [
179 | {
180 | "name": "stdout",
181 | "output_type": "stream",
182 | "text": [
183 | "\n",
184 | "Loading data\n"
185 | ]
186 | }
187 | ],
188 | "source": [
189 | "import re\n",
190 | "import numpy as np\n",
191 | "import pandas as pd\n",
192 | "from fastText import load_model\n",
193 | "from keras import backend as K\n",
194 | "from keras.models import Model\n",
195 | "from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate\n",
196 | "from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, CuDNNGRU\n",
197 | "from keras.models import Sequential\n",
198 | "\n",
199 | "from keras.preprocessing import text, sequence\n",
200 | "from keras.callbacks import Callback\n",
201 | "from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer\n",
202 | "from sklearn.metrics import log_loss\n",
203 | "\n",
204 | "\n",
205 | "window_length = 200 # The amount of words we look at per example. Experiment with this.\n",
206 | "\n",
207 | "def normalize(s):\n",
208 | " \"\"\"\n",
209 | " Given a text, cleans and normalizes it. Feel free to add your own stuff.\n",
210 | " \"\"\"\n",
211 | " s = s.lower()\n",
212 | " # Replace ips\n",
213 | " s = re.sub(r'\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}', ' _ip_ ', s)\n",
214 | " # Isolate punctuation\n",
215 | " s = re.sub(r'([\\'\\\"\\.\\(\\)\\!\\?\\-\\\\\\/\\,])', r' \\1 ', s)\n",
216 | " # Remove some special characters\n",
217 | " s = re.sub(r'([\\;\\:\\|•«\\n])', ' ', s)\n",
218 | " # Replace numbers and symbols with language\n",
219 | " s = s.replace('&', ' and ')\n",
220 | " s = s.replace('@', ' at ')\n",
221 | " s = s.replace('0', ' zero ')\n",
222 | " s = s.replace('1', ' one ')\n",
223 | " s = s.replace('2', ' two ')\n",
224 | " s = s.replace('3', ' three ')\n",
225 | " s = s.replace('4', ' four ')\n",
226 | " s = s.replace('5', ' five ')\n",
227 | " s = s.replace('6', ' six ')\n",
228 | " s = s.replace('7', ' seven ')\n",
229 | " s = s.replace('8', ' eight ')\n",
230 | " s = s.replace('9', ' nine ')\n",
231 | " return s\n",
232 | "\n",
233 | "print('\\nLoading data')\n",
234 | "train = pd.read_csv('cleaned_final_train_clean.csv')\n",
235 | "test = pd.read_csv('cleaned_test_clean.csv')\n",
236 | "train['comment_text'] = train['comment_text'].fillna('_empty_')\n",
237 | "test['comment_text'] = test['comment_text'].fillna('_empty_')"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {},
244 | "outputs": [],
245 | "source": []
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": null,
250 | "metadata": {},
251 | "outputs": [],
252 | "source": [
253 | "vect_word = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',\n",
254 | " stop_words= 'english',ngram_range=(1,3),dtype=np.float32)\n",
255 | "vect_char = TfidfVectorizer(max_features=40000, lowercase=True, analyzer='char',\n",
256 | " stop_words= 'english',ngram_range=(3,6),dtype=np.float32)\n",
257 | "\n",
258 | "# Word ngram vector\n",
259 | "tr_vect = vect_word.fit_transform(train['comment_text'])\n",
260 | "ts_vect = vect_word.transform(test['comment_text'])\n",
261 | "\n",
262 | "# Character n gram vector\n",
263 | "tr_vect_char = vect_char.fit_transform(train['comment_text'])\n",
264 | "ts_vect_char = vect_char.transform(test['comment_text'])\n",
265 | "\n",
266 | "\n",
267 | "X = sparse.hstack([tr_vect, tr_vect_char])\n",
268 | "x_test = sparse.hstack([ts_vect, ts_vect_char])\n",
269 | "\n"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 8,
275 | "metadata": {},
276 | "outputs": [
277 | {
278 | "name": "stdout",
279 | "output_type": "stream",
280 | "text": [
281 | "--2018-02-25 02:32:20-- https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.zip\n",
282 | "Resolving s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)... 52.219.20.137\n",
283 | "Connecting to s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)|52.219.20.137|:443... connected.\n",
284 | "HTTP request sent, awaiting response... 200 OK\n",
285 | "Length: 10356881291 (9.6G) [application/zip]\n",
286 | "Saving to: ‘wiki.en.zip’\n",
287 | "\n",
288 | "wiki.en.zip 100%[===================>] 9.65G 8.10MB/s in 25m 35s \n",
289 | "\n",
290 | "2018-02-25 02:57:56 (6.44 MB/s) - ‘wiki.en.zip’ saved [10356881291/10356881291]\n",
291 | "\n"
292 | ]
293 | }
294 | ],
295 | "source": [
296 | "!wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.zip"
297 | ]
298 | },
299 | {
300 | "cell_type": "markdown",
301 | "metadata": {
302 | "_cell_guid": "d98e8058-b635-408f-98ca-c0b999bc310c",
303 | "_uuid": "a4099da988bbe670ee7b389071e924ca1891cec2"
304 | },
305 | "source": [
306 | "Ok next, let's load the FastText model and define methods that convert text to a sequence of vectors. Note that I'm just considering the last n words of each text. You could play with this, too."
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": 3,
312 | "metadata": {
313 | "_cell_guid": "5792a3ad-8b04-435f-bd3d-f54d7449921b",
314 | "_kg_hide-output": true,
315 | "_uuid": "d5a8656a2cb0b9cde191230f477faa17934180c9"
316 | },
317 | "outputs": [
318 | {
319 | "name": "stdout",
320 | "output_type": "stream",
321 | "text": [
322 | "\n",
323 | "Loading FT model\n"
324 | ]
325 | }
326 | ],
327 | "source": [
328 | "classes = [\n",
329 | " 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'\n",
330 | "]\n",
331 | "\n",
332 | "print('\\nLoading FT model')\n",
333 | "ft_model = load_model('wiki.en.bin')\n",
334 | "n_features = ft_model.get_dimension()\n",
335 | "\n",
336 | "def text_to_vector(text):\n",
337 | " \"\"\"\n",
338 | " Given a string, normalizes it, then splits it into words and finally converts\n",
339 | " it to a sequence of word vectors.\n",
340 | " \"\"\"\n",
341 | " text = normalize(text)\n",
342 | " words = text.split()\n",
343 | " window = words[-window_length:]\n",
344 | " \n",
345 | " x = np.zeros((window_length, n_features))\n",
346 | "\n",
347 | " for i, word in enumerate(window):\n",
348 | " x[i, :] = ft_model.get_word_vector(word).astype('float32')\n",
349 | "\n",
350 | " return x\n",
351 | "\n",
352 | "def df_to_data(df):\n",
353 | " \"\"\"\n",
354 | " Convert a given dataframe to a dataset of inputs for the NN.\n",
355 | " \"\"\"\n",
356 | " x = np.zeros((len(df), window_length, n_features), dtype='float32')\n",
357 | "\n",
358 | " for i, comment in enumerate(df['comment_text'].values):\n",
359 | " x[i, :] = text_to_vector(comment)\n",
360 | "\n",
361 | " return x"
362 | ]
363 | },
364 | {
365 | "cell_type": "markdown",
366 | "metadata": {
367 | "_cell_guid": "d9bc312a-e444-4058-b4ac-3f30d3d98940",
368 | "_uuid": "ae35399a098729b8e68d76a953d4c097917438bb"
369 | },
370 | "source": [
371 | "To convert an input dataframe to an input vector, just call `df_to_data`. This will result in the shape `(n_examples, window_length, n_features)`. Here, for each row we would have 200 words a 300 features each.\n",
372 | "\n",
373 | "**EDIT/NOTE:** This will probably not fit into your memory, so don't bother executing it :) Instead, read my generator guide below."
374 | ]
375 | },
376 | {
377 | "cell_type": "code",
378 | "execution_count": 4,
379 | "metadata": {
380 | "_cell_guid": "1c98dd07-36b1-4b7f-b806-6a5d8bdabae1",
381 | "_kg_hide-output": true,
382 | "_uuid": "6a34513a82b5140d5e5c258f5c6427b83ae62245"
383 | },
384 | "outputs": [
385 | {
386 | "ename": "MemoryError",
387 | "evalue": "",
388 | "output_type": "error",
389 | "traceback": [
390 | "\u001b[0;31m------------------------------------------------------------------------\u001b[0m",
391 | "\u001b[0;31mMemoryError\u001b[0m Traceback (most recent call last)",
392 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mx_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_to_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0my_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mclasses\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mx_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_to_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0my_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mclasses\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
393 | "\u001b[0;32m\u001b[0m in \u001b[0;36mdf_to_data\u001b[0;34m(df)\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0mConvert\u001b[0m \u001b[0ma\u001b[0m \u001b[0mgiven\u001b[0m \u001b[0mdataframe\u001b[0m \u001b[0mto\u001b[0m \u001b[0ma\u001b[0m \u001b[0mdataset\u001b[0m \u001b[0mof\u001b[0m \u001b[0minputs\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mNN\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 28\u001b[0m \"\"\"\n\u001b[0;32m---> 29\u001b[0;31m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzeros\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwindow_length\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_features\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'float32'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 30\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcomment\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'comment_text'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
394 | "\u001b[0;31mMemoryError\u001b[0m: "
395 | ]
396 | }
397 | ],
398 | "source": [
399 | "x_train = df_to_data(train)\n",
400 | "y_train = train[classes].values\n",
401 | "\n",
402 | "x_test = df_to_data(test)\n",
403 | "y_test = test[classes].values"
404 | ]
405 | },
406 | {
407 | "cell_type": "markdown",
408 | "metadata": {
409 | "_cell_guid": "4f582bb5-ac4b-4b67-8e34-6fa022416e57",
410 | "_uuid": "7b41f089ef0d93d41d565e99b03c0f19b30f26ce"
411 | },
412 | "source": [
413 | "And now you should be good to go! Train this as usual. You don't need an `EmbeddingLayer`, but you need to pass `input_shape=(window_length, n_features)` to the first layer in your NN.\n",
414 | "\n",
415 | "I'm still in the process of experimenting, but I already achieved a single-model LB score of `0.9842` with something very simple. Bagging multiple of these models got me into the top 100 easily. Good luck!"
416 | ]
417 | },
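418 | {
419 | "cell_type": "markdown",
420 | "metadata": {},
421 | "source": [
422 | "For reference, here is a minimal sketch of such a network (an illustrative example, not the exact model used further down): the first layer simply takes `input_shape=(window_length, n_features)` and there is no `Embedding` layer, because the FastText vectors are fed in directly. Layer sizes and dropout are arbitrary placeholders."
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": null,
428 | "metadata": {},
429 | "outputs": [],
430 | "source": [
431 | "# Minimal sketch: a model that consumes pre-computed FastText vectors directly,\n",
432 | "# so no Embedding layer is needed. Hyperparameters here are placeholders.\n",
433 | "from keras.models import Model\n",
434 | "from keras.layers import Input, Bidirectional, GRU, GlobalMaxPooling1D, Dropout, Dense\n",
435 | "\n",
436 | "def build_sketch_model(window_length=200, n_features=300):\n",
437 | "    inp = Input(shape=(window_length, n_features))  # sequences of word vectors\n",
438 | "    x = Bidirectional(GRU(64, return_sequences=True))(inp)\n",
439 | "    x = GlobalMaxPooling1D()(x)\n",
440 | "    x = Dropout(0.3)(x)\n",
441 | "    out = Dense(6, activation='sigmoid')(x)  # one sigmoid per toxicity label\n",
442 | "    model = Model(inputs=inp, outputs=out)\n",
443 | "    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
444 | "    return model"
445 | ]
446 | },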
418 | {
419 | "cell_type": "markdown",
420 | "metadata": {
421 | "_cell_guid": "4bb7ba9a-7c32-43db-9c52-f4fca204ab12",
422 | "_uuid": "4dfea6e85ca933626a14ce2df8e937051b863d0a"
423 | },
424 | "source": [
425 | "### PS: Using a generator so you don't have to keep the whole damn thing in memory\n",
426 | "As @liujilong pointed out, not even the p2.xlarge machine with 64 GB can hold both the training and test set for window sizes longer than ~100 words. It seems I underestimated how much memory this monster model eats! Also, locally I had long [added swap space](https://stackoverflow.com/questions/17173972/how-do-you-add-swap-to-an-ec2-instance) and switched to generators so I wouldn't have to keep the whole thing memory. Let me show you how to implement the generator part. This is also useful to add some randomization later on.\n",
427 | "\n",
428 | "The idea is that instead of converting the whole training set to one large array, we can write a function that just spits out one batch of data at a time, infinitely. Keras can automaticaly spin up a separate thread for this method (note though that \"threads\" in Python are ridiculous and do not give any speedup whatsoever). This means that we have to write some more code and training will be slightly slower, but we need only a fraction of the memory and we can add some cool randomization to each batch later on (see ideas section below).\n",
429 | "\n",
430 | "We can keep all the code from above. This generator method works only for training data, not for validation data, so you will need to split by hand. Let's do that now:"
431 | ]
432 | },
433 | {
434 | "cell_type": "code",
435 | "execution_count": 6,
436 | "metadata": {
437 | "_cell_guid": "f71c2d37-2a1e-483a-af4c-5e7c820ad29c",
438 | "_kg_hide-output": true,
439 | "_uuid": "74becdb6b32d5676e1f4a0d67b38a0ba355dac97"
440 | },
441 | "outputs": [],
442 | "source": [
443 | "# Split the dataset:\n",
444 | "#split_index = round(len(train) * 0.9)\n",
445 | "#shuffled_train = train.sample(frac=1)\n",
446 | "#df_train = shuffled_train.iloc[:split_index]\n",
447 | "#df_val = shuffled_train.iloc[split_index:]\n",
448 | "\n",
449 | "# Convert validation set to fixed array\n",
450 | "#x_val = df_to_data(df_val)\n",
451 | "#y_val = df_val[classes].values\n",
452 | "\n",
453 | "def data_generator(df, batch_size):\n",
454 | " \"\"\"\n",
455 | " Given a raw dataframe, generates infinite batches of FastText vectors.\n",
456 | " \"\"\"\n",
457 | " batch_i = 0 # Counter inside the current batch vector\n",
458 | " batch_x = None # The current batch's x data\n",
459 | " batch_y = None # The current batch's y data\n",
460 | " \n",
461 | " while True: # Loop forever\n",
462 | " df = df.sample(frac=1) # Shuffle df each epoch\n",
463 | " \n",
464 | " for i, row in df.iterrows():\n",
465 | " comment = row['comment_text']\n",
466 | " \n",
467 | " if batch_x is None:\n",
468 | " batch_x = np.zeros((batch_size, window_length, n_features), dtype='float32')\n",
469 | " batch_y = np.zeros((batch_size, len(classes)), dtype='float32')\n",
470 | " \n",
471 | " batch_x[batch_i] = text_to_vector(comment)\n",
472 | " batch_y[batch_i] = row[classes].values\n",
473 | " batch_i += 1\n",
474 | "\n",
475 | " if batch_i == batch_size:\n",
476 | " # Ready to yield the batch\n",
477 | " yield batch_x, batch_y\n",
478 | " batch_x = None\n",
479 | " batch_y = None\n",
480 | " batch_i = 0"
481 | ]
482 | },
483 | {
484 | "cell_type": "markdown",
485 | "metadata": {
486 | "_cell_guid": "3e35540c-7ae8-4f38-ae70-a2d38bc1e189",
487 | "_uuid": "07d54b2346e24ebe9fd6dad691e129bd721a8b45"
488 | },
489 | "source": [
490 | "Alright, now we can use this generator to train the network. To make sure that one epoch has approxamitely the same number of examples as are in the training set, we need to set the `steps_per_epoch` to the number of batches we expect to cover the whole dataset. Here's the code:"
491 | ]
492 | },
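493 | {
494 | "cell_type": "markdown",
495 | "metadata": {},
496 | "source": [
497 | "A minimal sketch of that training call (assuming the `df_train`/`x_val`/`y_val` split from the commented-out cell above and any compiled `model`; `batch_size` and `epochs` here are illustrative, not tuned values):"
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "execution_count": null,
503 | "metadata": {},
504 | "outputs": [],
505 | "source": [
506 | "# Sketch only: wire data_generator into Keras training, with steps_per_epoch chosen\n",
507 | "# so one epoch covers the training set once. Assumes df_train/x_val/y_val and a compiled model.\n",
508 | "batch_size = 128\n",
509 | "training_steps_per_epoch = int(np.ceil(len(df_train) / float(batch_size)))\n",
510 | "\n",
511 | "model.fit_generator(\n",
512 | "    data_generator(df_train, batch_size),\n",
513 | "    steps_per_epoch=training_steps_per_epoch,\n",
514 | "    epochs=10,\n",
515 | "    validation_data=(x_val, y_val),\n",
516 | "    verbose=1)"
517 | ]
518 | },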
493 | {
494 | "cell_type": "code",
495 | "execution_count": 8,
496 | "metadata": {
497 | "_cell_guid": "48f4dcad-b7fe-4cd6-81a7-9ac41e9a0daa",
498 | "_uuid": "1b55016b60964208c999381f09600af1a5ae96ed"
499 | },
500 | "outputs": [],
501 | "source": [
502 | "from keras.engine.topology import Layer\n",
503 | "\n",
504 | "class Attention(Layer):\n",
505 | " def __init__(self, step_dim,\n",
506 | " W_regularizer=None, b_regularizer=None,\n",
507 | " W_constraint=None, b_constraint=None,\n",
508 | " bias=True, **kwargs):\n",
509 | " self.supports_masking = True\n",
510 | " self.init = initializers.get('glorot_uniform')\n",
511 | "\n",
512 | " self.W_regularizer = regularizers.get(W_regularizer)\n",
513 | " self.b_regularizer = regularizers.get(b_regularizer)\n",
514 | "\n",
515 | " self.W_constraint = constraints.get(W_constraint)\n",
516 | " self.b_constraint = constraints.get(b_constraint)\n",
517 | "\n",
518 | " self.bias = bias\n",
519 | " self.step_dim = step_dim\n",
520 | " self.features_dim = 0\n",
521 | " super(Attention, self).__init__(**kwargs)\n",
522 | "\n",
523 | " def build(self, input_shape):\n",
524 | " assert len(input_shape) == 3\n",
525 | "\n",
526 | " self.W = self.add_weight((input_shape[-1],),\n",
527 | " initializer=self.init,\n",
528 | " name='{}_W'.format(self.name),\n",
529 | " regularizer=self.W_regularizer,\n",
530 | " constraint=self.W_constraint)\n",
531 | " self.features_dim = input_shape[-1]\n",
532 | "\n",
533 | " if self.bias:\n",
534 | " self.b = self.add_weight((input_shape[1],),\n",
535 | " initializer='zero',\n",
536 | " name='{}_b'.format(self.name),\n",
537 | " regularizer=self.b_regularizer,\n",
538 | " constraint=self.b_constraint)\n",
539 | " else:\n",
540 | " self.b = None\n",
541 | "\n",
542 | " self.built = True\n",
543 | "\n",
544 | " def compute_mask(self, input, input_mask=None):\n",
545 | " return None\n",
546 | "\n",
547 | " def call(self, x, mask=None):\n",
548 | " features_dim = self.features_dim\n",
549 | " step_dim = self.step_dim\n",
550 | "\n",
551 | " eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),\n",
552 | " K.reshape(self.W, (features_dim, 1))), (-1, step_dim))\n",
553 | "\n",
554 | " if self.bias:\n",
555 | " eij += self.b\n",
556 | "\n",
557 | " eij = K.tanh(eij)\n",
558 | "\n",
559 | " a = K.exp(eij)\n",
560 | "\n",
561 | " if mask is not None:\n",
562 | " a *= K.cast(mask, K.floatx())\n",
563 | "\n",
564 | " a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())\n",
565 | "\n",
566 | " a = K.expand_dims(a)\n",
567 | " weighted_input = x * a\n",
568 | " return K.sum(weighted_input, axis=1)\n",
569 | "\n",
570 | " def compute_output_shape(self, input_shape):\n",
571 | " return input_shape[0], self.features_dim"
572 | ]
573 | },
574 | {
575 | "cell_type": "code",
576 | "execution_count": 81,
577 | "metadata": {
578 | "_cell_guid": "53202998-9f82-48eb-a8a7-86c0dcdd92fa",
579 | "_uuid": "876ed29154a6bdf94048af7dc0e6f5115b4f9f2e"
580 | },
581 | "outputs": [],
582 | "source": [
583 | "def build_model():\n",
584 | " inputs = Input(shape=(150,))\n",
585 | " inp = Reshape((1,150,))(inputs)\n",
586 | " x = Bidirectional(GRU(80, return_sequences=True))(inp)\n",
587 | " #att = Attention(150)(x)\n",
588 | " avg_pool = GlobalAveragePooling1D()(x)\n",
589 | " max_pool = GlobalMaxPooling1D()(x)\n",
590 | " conc = concatenate([avg_pool, max_pool])\n",
591 | " output = Dropout(0.5)(conc)\n",
592 | " output = BatchNormalization()(output)\n",
593 | " outp = Dense(6, activation=\"sigmoid\")(output)\n",
594 | " nadam = Nadam(lr=0.001)\n",
595 | " model = Model(inputs=inputs, outputs=outp)\n",
596 | " model.compile(loss='binary_crossentropy',\n",
597 | " optimizer=nadam,\n",
598 | " metrics=['accuracy'])\n",
599 | " \n",
600 | " return model"
601 | ]
602 | },
603 | {
604 | "cell_type": "code",
605 | "execution_count": 84,
606 | "metadata": {},
607 | "outputs": [],
608 | "source": [
609 | "def train_folds(X, y, fold_count, model_list, model_name):\n",
610 | " fold_size = len(X) // fold_count\n",
611 | " models = []\n",
612 | " total_meta = []\n",
613 | " for fold_id in range(0, fold_count):\n",
614 | " fold_start = fold_size * fold_id\n",
615 | " fold_end = fold_start + fold_size\n",
616 | " \n",
617 | " if fold_id == fold_count - 1:\n",
618 | " fold_end = len(X)\n",
619 | "\n",
620 | " train_x = np.concatenate([X[:fold_start], X[fold_end:]])\n",
621 | " train_y = np.concatenate([y[:fold_start], y[fold_end:]])\n",
622 | "\n",
623 | " val_x = X[fold_start:fold_end]\n",
624 | " val_y = y[fold_start:fold_end]\n",
625 | " \n",
626 | " save_path = os.path.join('models', '%s_model.h5' % (model_name + str(fold_id)))\n",
627 | " callbacks = [\n",
628 | " ModelCheckpoint(\n",
629 | " save_path, save_best_only=True, verbose=True)\n",
630 | " ]\n",
631 | " #train_x = np.reshape(train_x, train_x.shape + (1,))\n",
632 | " training_generator = data_generator(train_x, 128)\n",
633 | " x_tra = len(train_x)\n",
634 | " training_steps_per_epoch = round(len(train_x) / batch_size)\n",
635 | " model = train_model(model_list[fold_id], training_generator, x_tra, train_y, val_x, val_y,callbacks, training_steps_per_epoch)\n",
636 | " meta = model.predict(val_x, batch_size=128)\n",
637 | " if (fold_id == 0):\n",
638 | " total_meta = meta\n",
639 | " else:\n",
640 | " total_meta = np.concatenate((total_meta, meta), axis=0)\n",
641 | " model_path = os.path.join('models', \"model{0}_weights.npy\".format(fold_id))\n",
642 | " np.save(model_path, model.get_weights())\n",
643 | " models.append(model)\n",
644 | "\n",
645 | " return models, total_meta\n",
646 | "\n",
647 | "def train_model(model, training_generator,x_tra, train_y, val_x, val_y, callbacks, training_steps_per_epoch):\n",
648 | " best_loss = -1\n",
649 | " best_weights = None\n",
650 | " best_epoch = 0\n",
651 | "\n",
652 | " current_epoch = 0\n",
653 | " #charCNN:LSTM\n",
654 | " #train_x = np.reshape(train_x, train_x.shape + (1,))\n",
655 | " #val_x = np.reshape(val_x, val_x.shape + (1,))\n",
656 | " exp_decay = lambda init, fin, steps: (init/fin)**(1/(steps-1)) - 1\n",
657 | " steps = int(x_tra/batch_size) * 1000\n",
658 | " lr_init, lr_fin = 0.001, 0.0005\n",
659 | " lr_decay = exp_decay(lr_init, lr_fin, steps)\n",
660 | " #K.set_value(model.optimizer.lr, lr_init)\n",
661 | " #K.set_value(model.optimizer.decay, lr_decay)\n",
662 | "\n",
663 | " while True:\n",
664 | " model.fit_generator(\n",
665 | " training_generator,\n",
666 | " steps_per_epoch=training_steps_per_epoch,\n",
667 | " epochs=1,\n",
668 | " validation_data=(val_x, val_y),\n",
669 | " callbacks=callbacks,\n",
670 | " verbose=2\n",
671 | " )\n",
672 | " \n",
673 | " y_pred = model.predict(val_x, batch_size=128)\n",
674 | "\n",
675 | " total_loss = 0\n",
676 | " for j in range(6):\n",
677 | " loss = log_loss(val_y[:, j], y_pred[:, j])\n",
678 | " total_loss += loss\n",
679 | "\n",
680 | " total_loss /= 6.\n",
681 | "\n",
682 | " print(\"Epoch {0} auc {1} best_auc {2}\".format(current_epoch, total_loss, best_loss))\n",
683 | " \n",
684 | "\n",
685 | " current_epoch += 1\n",
686 | " if total_loss < best_loss or best_loss == -1:\n",
687 | " best_loss = total_loss\n",
688 | " best_weights = model.get_weights()\n",
689 | " best_epoch = current_epoch\n",
690 | " else:\n",
691 | " if current_epoch - best_epoch == 5:\n",
692 | " break\n",
693 | "\n",
694 | " model.set_weights(best_weights)\n",
695 | " return model"
696 | ]
697 | },
698 | {
699 | "cell_type": "code",
700 | "execution_count": 85,
701 | "metadata": {
702 | "_cell_guid": "1be4d47d-6b43-4422-b77f-f103e784e721",
703 | "_kg_hide-output": true,
704 | "_uuid": "f5b8b97f068e2884bc5e877a19afbc721ba135da"
705 | },
706 | "outputs": [
707 | {
708 | "ename": "ValueError",
709 | "evalue": "Error when checking input: expected input_175 to have shape (150,) but got array with shape (1,)",
710 | "output_type": "error",
711 | "traceback": [
712 | "\u001b[0;31m------------------------------------------------------------------------\u001b[0m",
713 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
714 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mmodel_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"bigru\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mtraining_steps_per_epoch\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mround\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'comment_text'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0mmodels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtotal_meta\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain_folds\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'comment_text'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain_labels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfolds\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist_models\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;32mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Model trained!'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
715 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtrain_folds\u001b[0;34m(X, y, fold_count, model_list, model_name)\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0mx_tra\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_x\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0mtraining_steps_per_epoch\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mround\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_x\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel_list\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfold_id\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtraining_generator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx_tra\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain_y\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mval_x\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mval_y\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mcallbacks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtraining_steps_per_epoch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 28\u001b[0m \u001b[0mmeta\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mval_x\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m128\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mfold_id\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
716 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtrain_model\u001b[0;34m(model, training_generator, x_tra, train_y, val_x, val_y, callbacks, training_steps_per_epoch)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0mvalidation_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mval_x\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mval_y\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mcallbacks\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcallbacks\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 62\u001b[0;31m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 63\u001b[0m )\n\u001b[1;32m 64\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
717 | "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/keras/legacy/interfaces.pyc\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 89\u001b[0m warnings.warn('Update your `' + object_name +\n\u001b[1;32m 90\u001b[0m '` call to the Keras 2 API: ' + signature, stacklevel=2)\n\u001b[0;32m---> 91\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 92\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_original_function\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
718 | "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/keras/engine/training.pyc\u001b[0m in \u001b[0;36mfit_generator\u001b[0;34m(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)\u001b[0m\n\u001b[1;32m 2181\u001b[0m str(validation_data))\n\u001b[1;32m 2182\u001b[0m val_x, val_y, val_sample_weights = self._standardize_user_data(\n\u001b[0;32m-> 2183\u001b[0;31m val_x, val_y, val_sample_weight)\n\u001b[0m\u001b[1;32m 2184\u001b[0m \u001b[0mval_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mval_x\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mval_y\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mval_sample_weights\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2185\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muses_learning_phase\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mK\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlearning_phase\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
719 | "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/keras/engine/training.pyc\u001b[0m in \u001b[0;36m_standardize_user_data\u001b[0;34m(self, x, y, sample_weight, class_weight, check_array_lengths, batch_size)\u001b[0m\n\u001b[1;32m 1481\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_feed_input_shapes\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1482\u001b[0m \u001b[0mcheck_batch_axis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1483\u001b[0;31m exception_prefix='input')\n\u001b[0m\u001b[1;32m 1484\u001b[0m y = _standardize_input_data(y, self._feed_output_names,\n\u001b[1;32m 1485\u001b[0m \u001b[0moutput_shapes\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
720 | "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/keras/engine/training.pyc\u001b[0m in \u001b[0;36m_standardize_input_data\u001b[0;34m(data, names, shapes, check_batch_axis, exception_prefix)\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[0;34m': expected '\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mnames\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m' to have shape '\u001b[0m \u001b[0;34m+\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 122\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m' but got array with shape '\u001b[0m \u001b[0;34m+\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 123\u001b[0;31m str(data_shape))\n\u001b[0m\u001b[1;32m 124\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
721 | "\u001b[0;31mValueError\u001b[0m: Error when checking input: expected input_175 to have shape (150,) but got array with shape (1,)"
722 | ]
723 | }
724 | ],
725 | "source": [
726 | "list_models = []\n",
727 | "folds = 10\n",
728 | "for fold in range(0, folds):\n",
729 | " model = build_model()\n",
730 | " list_models.append(model)\n",
731 | "train_labels = train[['toxic','severe_toxic','obscene', 'threat', 'insult', 'identity_hate']].values\n",
732 | "batch_size = 128\n",
733 | "model_name = \"bigru\"\n",
734 | "training_steps_per_epoch = round(len(train['comment_text']) / batch_size)\n",
735 | "models, total_meta = train_folds(train['comment_text'], train_labels, folds, list_models,model_name) \n",
736 | "print('Model trained!')\n"
737 | ]
738 | },
739 | {
740 | "cell_type": "code",
741 | "execution_count": null,
742 | "metadata": {},
743 | "outputs": [],
744 | "source": [
745 | "print(\"Predicting results...\")\n",
746 | "random_test = pd.read_csv('cleaned_test_clean.csv')\n",
747 | "#random_test = self.Sanitize(random_test)\n",
748 | "#random_test.to_csv('cleaned_test_clean.csv', index=False)\n",
749 | "X_test = random_test['comment_text'].fillna('_empty_')\n",
750 | " X_test = self.prep_text(X_test)\n",
751 | " #X_test = self.load_data(X_test)\n",
752 | " test_predicts_list = []\n",
753 | " for fold_id, model in enumerate(models):\n",
754 | " model_path = os.path.join(self.model_dir, \"model{0}_weights.npy\".format(fold_id))\n",
755 | " np.save(model_path, model.get_weights())\n",
756 | " \n",
757 | " test_predicts_path = os.path.join(self.model_dir, \"test_predicts{0}.npy\".format(fold_id))\n",
758 | " test_predicts = model.predict(X_test, batch_size=self.hparams['batch_size'])\n",
759 | " test_predicts_list.append(test_predicts)\n",
760 | " np.save(test_predicts_path, test_predicts)\n",
761 | "\n",
762 | " test_predicts = np.ones(test_predicts_list[0].shape)\n",
763 | " for fold_predict in test_predicts_list:\n",
764 | " test_predicts *= fold_predict\n",
765 | "\n",
766 | " test_predicts **= (1. / len(test_predicts_list))\n",
767 | " test_ids = random_test[\"id\"].values\n",
768 | " test_ids = test_ids.reshape((len(test_ids), 1))\n",
769 | " CLASSES = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]\n",
770 | " test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)\n",
771 | " test_predicts[\"id\"] = test_ids\n",
772 | " test_predicts = test_predicts[[\"id\"] + CLASSES]\n",
773 | " test_predicts.to_csv('augmentori_pred_fasttext_gru_cv_output_two.csv', index=False)\n",
774 | " print('predicted !')\n"
775 | ]
776 | },
777 | {
778 | "cell_type": "markdown",
779 | "metadata": {
780 | "_cell_guid": "1af33152-8655-4aba-976e-b34a573940e8",
781 | "_uuid": "e2b848800743d084611e07d9f93fefe444eb2f88"
782 | },
783 | "source": [
784 | "And there you go, this should work on p2.xlarge even for long window lengths!"
785 | ]
786 | },
787 | {
788 | "cell_type": "markdown",
789 | "metadata": {
790 | "_cell_guid": "e9697a31-c4aa-4ec7-85f8-f93f95b57228",
791 | "_uuid": "24d94ddbe3041de9a0ae252adb22b2d1354b3cff"
792 | },
793 | "source": [
794 | "### More stuff to try:\n",
795 | "Some suggestions. I've tried most of these and found them helpful:\n",
796 | "\n",
797 | "* Add random but common typos to strings before converting to FT vectors. That way, the model can learn in which way typos affect the embeddings. Use the training generator so you can adjust this over time.\n",
798 | "* Add more string preprocessing to our `normalize` function\n",
799 | "* Randomize the windows instead of using the end (great that we already have a generator!)\n",
800 | "* Use FastText's sentence vector feature to summarize parts of the text outside the window\n",
801 | "* Add other features ontop of the FT ones, e.g. capitalization etc."
802 | ]
803 | },
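{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hedged sketch of the 'randomize the windows' idea from the list above; it is not\n",
"# part of the original pipeline. window_length is assumed to be the same window\n",
"# size used by the data generator earlier in this notebook.\n",
"import random\n",
"\n",
"def random_window(words, window_length):\n",
"    # short comments are kept whole; longer ones get a random contiguous window\n",
"    if len(words) <= window_length:\n",
"        return words\n",
"    start = random.randint(0, len(words) - window_length)\n",
"    return words[start:start + window_length]\n",
"\n",
"print(random_window('a b c d e f g'.split(), 3))"
]
}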
804 | ],
805 | "metadata": {
806 | "kernelspec": {
807 | "display_name": "Python 2",
808 | "language": "python",
809 | "name": "python2"
810 | },
811 | "language_info": {
812 | "codemirror_mode": {
813 | "name": "ipython",
814 | "version": 2
815 | },
816 | "file_extension": ".py",
817 | "mimetype": "text/x-python",
818 | "name": "python",
819 | "nbconvert_exporter": "python",
820 | "pygments_lexer": "ipython2",
821 | "version": "2.7.12"
822 | }
823 | },
824 | "nbformat": 4,
825 | "nbformat_minor": 1
826 | }
827 |
--------------------------------------------------------------------------------
/LGBM_LOGREG_XGB_STACK_LOGREG.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "def read_predictions(prediction_dir, concat_mode='concat', per_label=False):\n",
10 | " labels = pd.read_csv(os.path.join(prediction_dir, 'labels.csv'))\n",
11 | "\n",
12 | " filepaths_train, filepaths_test = [], []\n",
13 | " for filepath in sorted(glob.glob('{}/*'.format(prediction_dir))):\n",
14 | " if filepath.endswith('predictions_test_oof.csv'):\n",
15 | " filepaths_test.append(filepath)\n",
16 | "\n",
17 | " test_dfs = []\n",
18 | " for filepath in filepaths_test:\n",
19 | " test_dfs.append(pd.read_csv(filepath))\n",
20 | " test_dfs = reduce(lambda df1, df2: pd.merge(df1, df2, on=['id', 'fold_id']), test_dfs)\n",
21 | " test_dfs.columns = _clean_columns(test_dfs, keep_colnames = ['id','fold_id'])\n",
22 | "\n",
23 | " return train_dfs, test_dfs\n",
24 | "\n",
25 | "def get_fold_xy(test,i):\n",
26 | " #train_split = train[train['fold_id'] != i]\n",
27 | " #valid_split = train[train['fold_id'] == i]\n",
28 | " test_split = test[test['fold_id'] == i]\n",
29 | "\n",
30 | " #y_train = train_split[label_columns].values\n",
31 | " #y_valid = valid_split[label_columns].values\n",
32 | " #columns_to_drop_train = label_columns + ['id','fold_id']\n",
33 | " #X_train = train_split.drop(columns_to_drop_train, axis=1).values\n",
34 | " #X_valid = valid_split.drop(columns_to_drop_train, axis=1).values\n",
35 | "\n",
36 | " columns_to_drop_test = ['id','fold_id']\n",
37 | " X_test = test_split.drop(columns_to_drop_test, axis=1).values\n",
38 | " return X_test"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 40,
44 | "metadata": {},
45 | "outputs": [
46 | {
47 | "ename": "IOError",
48 | "evalue": "File newstageone/13/word2vec_scnn_predictions_test_oof.csv does not exist",
49 | "output_type": "error",
50 | "traceback": [
51 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
52 | "\u001b[0;31mIOError\u001b[0m Traceback (most recent call last)",
53 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mpr_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'newstageone/13/word2vec_scnn_predictions_test_oof.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mtest_predicts_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mfold\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
54 | "\u001b[0;32m/home/dcek/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36mparser_f\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)\u001b[0m\n\u001b[1;32m 707\u001b[0m skip_blank_lines=skip_blank_lines)\n\u001b[1;32m 708\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 709\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 710\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 711\u001b[0m \u001b[0mparser_f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
55 | "\u001b[0;32m/home/dcek/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 447\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 448\u001b[0m \u001b[0;31m# Create the parser.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 449\u001b[0;31m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 450\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 451\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
56 | "\u001b[0;32m/home/dcek/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 816\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_index_names'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_index_names'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 817\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 818\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 819\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 820\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
57 | "\u001b[0;32m/home/dcek/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, engine)\u001b[0m\n\u001b[1;32m 1047\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mengine\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'c'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1048\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'c'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1049\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCParserWrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1050\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1051\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'python'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
58 | "\u001b[0;32m/home/dcek/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, src, **kwds)\u001b[0m\n\u001b[1;32m 1693\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'allow_leading_cols'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex_col\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1694\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1695\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparsers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTextReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1696\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1697\u001b[0m \u001b[0;31m# XXX\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
59 | "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.__cinit__\u001b[0;34m()\u001b[0m\n",
60 | "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._setup_parser_source\u001b[0;34m()\u001b[0m\n",
61 | "\u001b[0;31mIOError\u001b[0m: File newstageone/13/word2vec_scnn_predictions_test_oof.csv does not exist"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "import pandas as pd\n",
67 | "import numpy as np\n",
68 | "\n",
69 | "pr_file = pd.read_csv('newstageone/13/word2vec_scnn_predictions_test_oof.csv')\n",
70 | "test_predicts_list = []\n",
71 | "for fold in range(0,10):\n",
72 | " get_fold_xy(pr_file,fold)\n",
73 | " test_predicts_list.append(get_fold_xy(pr_file,fold))\n",
74 | "#pr_file.to_csv(\"newstageone/13/bad_word_logreg_predictions_test_oof.csv\", index=False)\n"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 39,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "test = pd.read_csv('cleaned_test.csv')\n",
84 | "test_predicts = np.ones(test_predicts_list[0].shape)\n",
85 | "for fold_predict in test_predicts_list:\n",
86 | " test_predicts *= fold_predict\n",
87 | "\n",
88 | "test_predicts **= (1. / len(test_predicts_list))\n",
89 | "test_ids = test[\"id\"].values\n",
90 | "test_ids = test_ids.reshape((len(test_ids), 1))\n",
91 | "CLASSES = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]\n",
92 | "test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)\n",
93 | "test_predicts[\"id\"] = test_ids\n",
94 | "test_predicts = test_predicts[[\"id\"] + CLASSES]\n",
95 | "test_predicts.to_csv('newstageone/13/word2vec_scnn_predictions_test_oof.csv', index=False)"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 72,
101 | "metadata": {},
102 | "outputs": [
103 | {
104 | "data": {
105 | "text/plain": [
106 | "159571"
107 | ]
108 | },
109 | "execution_count": 72,
110 | "metadata": {},
111 | "output_type": "execute_result"
112 | }
113 | ],
114 | "source": [
115 | "test = pd.read_csv('newstageone/OOF/oof20.csv')\n",
116 | "test.head()\n",
117 | "len(test)"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 1,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "import pandas as pd\n",
127 | "import numpy as np\n",
128 | "import re\n",
129 | "import lightgbm as lgb\n",
130 | "import warnings\n",
131 | "warnings.filterwarnings(action='ignore', category=DeprecationWarning, module='sklearn')\n",
132 | "from sklearn.model_selection import cross_val_score, StratifiedKFold\n",
133 | "\n",
134 | "from sklearn.preprocessing import StandardScaler\n",
135 | "from sklearn.model_selection import cross_val_score\n",
136 | "\n",
137 | "\n",
138 | "#######################\n",
139 | "# FEATURE ENGINEERING #\n",
140 | "#######################\n",
141 | "\"\"\"\n",
142 | "Main function\n",
143 | "Input: pandas Series and a feature engineering function\n",
144 | "Output: pandas Series\n",
145 | "\"\"\"\n",
146 | "def engineer_feature(series, func, normalize=True):\n",
147 | " feature = series.apply(func)\n",
148 | " \n",
149 | " if normalize:\n",
150 | " feature = pd.Series(z_normalize(feature.values.reshape(-1,1)).reshape(-1,))\n",
151 | " feature.name = func.__name__ \n",
152 | " return feature\n",
153 | "\n",
154 | "\"\"\"\n",
155 | "Engineer features\n",
156 | "Input: pandas Series and a list of feature engineering functions\n",
157 | "Output: pandas DataFrame\n",
158 | "\"\"\"\n",
159 | "def engineer_features(series, funclist, normalize=True):\n",
160 | " features = pd.DataFrame()\n",
161 | " for func in funclist:\n",
162 | " feature = engineer_feature(series, func, normalize)\n",
163 | " features[feature.name] = feature\n",
164 | " return features\n",
165 | "\n",
166 | "\"\"\"\n",
167 | "Normalizer\n",
168 | "Input: NumPy array\n",
169 | "Output: NumPy array\n",
170 | "\"\"\"\n",
171 | "scaler = StandardScaler()\n",
172 | "def z_normalize(data):\n",
173 | " scaler.fit(data)\n",
174 | " return scaler.transform(data)\n",
175 | " \n",
176 | "\"\"\"\n",
177 | "Feature functions\n",
178 | "\"\"\"\n",
179 | "def asterix_freq(x):\n",
180 | " return x.count('!')/len(x)\n",
181 | "\n",
182 | "def uppercase_freq(x):\n",
183 | " return len(re.findall(r'[A-Z]',x))/len(x)\n",
184 | " \n",
185 | "\"\"\"\n",
186 | "Import submission and OOF files\n",
187 | "\"\"\"\n",
188 | "def get_subs(nums):\n",
189 | " subs = np.hstack([np.array(pd.read_csv(\"SUB/sub\" + str(num) + \".csv\")[LABELS]) for num in subnums])\n",
190 | " oofs = np.hstack([np.array(pd.read_csv(\"OOF/oof\" + str(num) + \".csv\")[LABELS]) for num in subnums])\n",
191 | " return subs, oofs\n",
192 | "\n"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "def train_folds(X, y, fold_count, model_list):\n",
202 | " fold_size = len(X) // fold_count\n",
203 | " models = []\n",
204 | " total_meta = []\n",
205 | " auc_list = []\n",
206 | " for fold_id in range(0, fold_count):\n",
207 | " print(\"FOLD {}\".format(fold_id))\n",
208 | " fold_start = fold_size * fold_id\n",
209 | " fold_end = fold_start + fold_size\n",
210 | " \n",
211 | " if fold_id == fold_count - 1:\n",
212 | " fold_end = len(X)\n",
213 | "\n",
214 | " train_x = np.concatenate([X[:fold_start], X[fold_end:]])\n",
215 | " train_y = np.concatenate([y[:fold_start], y[fold_end:]])\n",
216 | "\n",
217 | " val_x = X[fold_start:fold_end]\n",
218 | " val_y = y[fold_start:fold_end]\n",
219 | " \n",
220 | " \n",
221 | " model, best_auc = _train_model(model_list[fold_id], train_x, train_y, val_x, val_y,callbacks)\n",
222 | " \n",
223 | " meta = model.predict(val_x, batch_size=128)\n",
224 | " if (fold_id == 0):\n",
225 | " total_meta = meta\n",
226 | " else:\n",
227 | " total_meta = np.concatenate((total_meta, meta), axis=0)\n",
228 | " model_path = os.path.join('models', \"model{0}_weights.npy\".format(fold_id))\n",
229 | " np.save(model_path, model.get_weights())\n",
230 | " models.append(model)\n",
231 | " auc_list.append(best_auc)\n",
232 | "\n",
233 | " return models, total_meta, auc_list\n",
234 | "\n",
235 | "def _train_model(model, train_x, train_y, val_x, val_y):\n",
236 | " for label in LABELS:\n",
237 | " model = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n",
238 | " model.fit(\n",
239 | " train_x,\n",
240 | " train_x[label])\n",
241 | " \n",
242 | " y_pred = model.predict_proba(val_x)[:,1]\n",
243 | "\n",
244 | " total_auc = 0\n",
245 | " for j in range(6):\n",
246 | " auc = compute_auc(val_y[:, j], y_pred[:, j])\n",
247 | " total_auc += auc\n",
248 | "\n",
249 | " total_loss /= 6.\n",
250 | " total_auc /= 6.\n",
251 | " return model, total_auc\n",
252 | " \n",
253 | "def build_model():\n",
254 | " stackers = []\n",
255 | " for fold in range(0,10):\n",
256 | " stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n",
257 | " stackers.append(stacker)\n",
258 | " return "
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": null,
264 | "metadata": {},
265 | "outputs": [],
266 | "source": [
267 | "with timer(\"Scoring Light GBM\"):\n",
268 | " scores = []\n",
269 | " folds = KFold(n_splits=4, shuffle=True, random_state=1)\n",
270 | " lgb_round_dict = defaultdict(int)\n",
271 | " trn_lgbset = lgb.Dataset(csr_trn, free_raw_data=False)\n",
272 | " del csr_trn\n",
273 | " gc.collect()\n",
274 | " \n",
275 | " for class_name in class_names:\n",
276 | " print(\"Class %s scores : \" % class_name)\n",
277 | " class_pred = np.zeros(len(train))\n",
278 | " train_target = train[class_name]\n",
279 | " trn_lgbset.set_label(train_target.values)\n",
280 | " \n",
281 | " lgb_rounds = 500\n",
282 | "\n",
283 | " for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, train_target)):\n",
284 | " watchlist = [\n",
285 | " trn_lgbset.subset(trn_idx),\n",
286 | " trn_lgbset.subset(val_idx)\n",
287 | " ]\n",
288 | " # Train lgb l1\n",
289 | " model = lgb.train(\n",
290 | " params=params,\n",
291 | " train_set=watchlist[0],\n",
292 | " num_boost_round=lgb_rounds,\n",
293 | " valid_sets=watchlist,\n",
294 | " early_stopping_rounds=50,\n",
295 | " verbose_eval=0\n",
296 | " )\n",
297 | " class_pred[val_idx] = model.predict(trn_lgbset.data[val_idx], num_iteration=model.best_iteration)\n",
298 | " score = roc_auc_score(train_target.values[val_idx], class_pred[val_idx])\n",
299 | " \n",
300 | " # Compute mean rounds over folds for each class\n",
301 | " # So that it can be re-used for test predictions\n",
302 | " lgb_round_dict[class_name] += model.best_iteration\n",
303 | " print(\"\\t Fold %d : %.6f in %3d rounds\" % (n_fold + 1, score, model.best_iteration))\n",
304 | " \n",
305 | " print(\"full score : %.6f\" % roc_auc_score(train_target, class_pred))\n",
306 | " scores.append(roc_auc_score(train_target, class_pred))\n",
307 | " train[class_name + \"_oof\"] = class_pred\n",
308 | " submission[class_name] = lr_pred / folds\n",
309 | "\n",
310 | " # Save OOF predictions - may be interesting for stacking...\n",
311 | " train[[\"id\"] + class_names + [f + \"_oof\" for f in class_names]].to_csv(\"lvl0_lgbm_clean_oof.csv\",\n",
312 | " index=False,\n",
313 | " float_format=\"%.8f\")\n",
314 | "\n",
315 | " print('Total CV score is {}'.format(np.mean(scores)))\n",
316 | "\n",
317 | "with timer(\"Predicting probabilities\"):\n",
318 | " # Go through all classes and reuse computed number of rounds for each class\n",
319 | " for class_name in class_names:\n",
320 | " with timer(\"Predicting probabilities for %s\" % class_name):\n",
321 | " train_target = train[class_name]\n",
322 | " trn_lgbset.set_label(train_target.values)\n",
323 | " # Train lgb\n",
324 | " model = lgb.train(\n",
325 | " params=params,\n",
326 | " train_set=trn_lgbset,\n",
327 | " num_boost_round=int(lgb_round_dict[class_name] / folds.n_splits)\n",
328 | " )\n",
329 | " submission[class_name] = model.predict(csr_sub, num_iteration=model.best_iteration)\n",
330 | "\n",
331 | "submission.to_csv(\"lvl0_lgbm_clean_sub.csv\", index=False, float_format=\"%.8f\")"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": null,
337 | "metadata": {},
338 | "outputs": [],
339 | "source": []
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 39,
344 | "metadata": {},
345 | "outputs": [
346 | {
347 | "name": "stdout",
348 | "output_type": "stream",
349 | "text": [
350 | "\n",
351 | " Average class toxic AUC:\t0.987802\n",
352 | " Out-of-fold class toxic AUC:\t0.987650\n"
353 | ]
354 | },
355 | {
356 | "name": "stderr",
357 | "output_type": "stream",
358 | "text": [
359 | "/home/stgc/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:61: SettingWithCopyWarning: \n",
360 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
361 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
362 | "\n",
363 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
364 | ]
365 | },
366 | {
367 | "name": "stdout",
368 | "output_type": "stream",
369 | "text": [
370 | "\n",
371 | " Average class severe_toxic AUC:\t0.991559\n",
372 | " Out-of-fold class severe_toxic AUC:\t0.991465\n",
373 | "\n",
374 | " Average class obscene AUC:\t0.995286\n",
375 | " Out-of-fold class obscene AUC:\t0.995268\n",
376 | "\n",
377 | " Average class threat AUC:\t0.991848\n",
378 | " Out-of-fold class threat AUC:\t0.991016\n",
379 | "\n",
380 | " Average class insult AUC:\t0.989942\n",
381 | " Out-of-fold class insult AUC:\t0.989909\n",
382 | "\n",
383 | " Average class identity_hate AUC:\t0.990874\n",
384 | " Out-of-fold class identity_hate AUC:\t0.990406\n",
385 | "\n",
386 | " Overall AUC:\t0.991219\n"
387 | ]
388 | }
389 | ],
390 | "source": [
391 | "from sklearn.model_selection import cross_val_score, StratifiedKFold\n",
392 | "from sklearn.metrics import log_loss, matthews_corrcoef, roc_auc_score\n",
393 | "\n",
394 | "\n",
395 | "if __name__ == \"__main__\":\n",
396 | " \n",
397 | " train = pd.read_csv('train.csv').fillna(' ')\n",
398 | " test = pd.read_csv('test.csv').fillna(' ')\n",
399 | " sub = pd.read_csv('sample_submission.csv')\n",
400 | " submission = pd.DataFrame.from_dict({'id': test['id']})\n",
401 | " submission_oof = train[['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]\n",
402 | "\n",
403 | " INPUT_COLUMN = \"comment_text\"\n",
404 | " LABELS = train.columns[2:]\n",
405 | " \n",
406 | " # Import submissions and OOF files\n",
407 | " # 29: LightGBM trained on Fasttext (CV: 0.9765, LB: 0.9620)\n",
408 | " # 51: Logistic regression with word and char n-grams (CV: 0.9858, LB: ?)\n",
409 | " # 52: LSTM trained on Fasttext (CV: ?, LB: 0.9851)\n",
410 | " subnums = [1,2,3,4,5,6,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29]\n",
411 | " subs, oofs = get_subs(subnums)\n",
412 | " \n",
413 | " # Engineer features\n",
414 | " feature_functions = [len, asterix_freq, uppercase_freq]\n",
415 | " features = [f.__name__ for f in feature_functions]\n",
416 | " F_train = engineer_features(train[INPUT_COLUMN], feature_functions)\n",
417 | " F_test = engineer_features(test[INPUT_COLUMN], feature_functions)\n",
418 | " \n",
419 | " train_features = np.hstack([F_train[features].as_matrix(), oofs])\n",
420 | " X_test = np.hstack([F_test[features].as_matrix(), subs]) \n",
421 | " skf = StratifiedKFold(n_splits=10, shuffle=False)\n",
422 | " \n",
423 | " scores_classes = np.zeros((len(LABELS), 10))\n",
424 | " for j, (class_name) in enumerate(LABELS):\n",
425 | " avreal = train[class_name]\n",
426 | " lr_avpred = np.zeros(train.shape[0])\n",
427 | " #stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n",
428 | " for i, (train_index, val_index) in enumerate(skf.split(train_features, train[class_name].values)):\n",
429 | " #print(train_index)\n",
430 | " stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n",
431 | " X_train, X_val = train_features[train_index], train_features[val_index]\n",
432 | " y_train, y_val = train.loc[train_index], train.loc[val_index]\n",
433 | " stacker.fit(X_train, y_train[class_name])\n",
434 | " \n",
435 | " scores_val = stacker.predict_proba(X_val)[:, 1]\n",
436 | " scores_classes[j][i] = roc_auc_score(y_val[class_name], scores_val)\n",
437 | " lr_avpred[val_index] = scores_val\n",
438 | " \n",
439 | " lr_y_pred = stacker.predict_proba(X_test)[:, 1]\n",
440 | " if i > 0:\n",
441 | " lr_fpred = lr_pred + lr_y_pred\n",
442 | " else:\n",
443 | " lr_fpred = lr_y_pred\n",
444 | "\n",
445 | " lr_pred = lr_fpred\n",
446 | " \n",
447 | " lr_oof_auc = roc_auc_score(avreal, lr_avpred)\n",
448 | " print('\\n Average class %s AUC:\\t%.6f' % (class_name, np.mean(scores_classes[j])))\n",
449 | " print(' Out-of-fold class %s AUC:\\t%.6f' % (class_name, lr_oof_auc))\n",
450 | " submission[class_name] = lr_pred / 10\n",
451 | " submission_oof[class_name] = lr_avpred\n",
452 | " \n",
453 | " print('\\n Overall AUC:\\t%.6f' % (np.mean(scores_classes)))\n",
454 | " submission.to_csv('lgb_stacktwo_pred.csv', index=False)\n",
455 | " submission_oof.to_csv('lgb_stacktwo_meta.csv', index=False)\n",
456 | "\n",
457 | " '''\n",
458 | " stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n",
459 | " \n",
460 | " # Fit and submit\n",
461 | " \n",
462 | " scores = []\n",
463 | " for label in LABELS:\n",
464 | " print(label)\n",
465 | " score = cross_val_score(stacker, X_train, train[label], cv=10, scoring='roc_auc')\n",
466 | " print(\"AUC:\", score)\n",
467 | " scores.append(np.mean(score))\n",
468 | " stacker.fit(X_train, train[label])\n",
469 | " sub[label] = stacker.predict_proba(X_test)[:,1]\n",
470 | " print(\"CV score:\", np.mean(scores))\n",
471 | " \n",
472 | " sub.to_csv(\"29modelstack.csv\", index=False)\n",
473 | " '''"
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": 40,
479 | "metadata": {},
480 | "outputs": [
481 | {
482 | "name": "stdout",
483 | "output_type": "stream",
484 | "text": [
485 | "\n",
486 | " Average class toxic AUC:\t0.987147\n",
487 | " Out-of-fold class toxic AUC:\t0.987087\n"
488 | ]
489 | },
490 | {
491 | "name": "stderr",
492 | "output_type": "stream",
493 | "text": [
494 | "/home/stgc/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:79: SettingWithCopyWarning: \n",
495 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
496 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
497 | "\n",
498 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
499 | ]
500 | },
501 | {
502 | "name": "stdout",
503 | "output_type": "stream",
504 | "text": [
505 | "\n",
506 | " Average class severe_toxic AUC:\t0.991326\n",
507 | " Out-of-fold class severe_toxic AUC:\t0.991270\n",
508 | "\n",
509 | " Average class obscene AUC:\t0.994503\n",
510 | " Out-of-fold class obscene AUC:\t0.994479\n",
511 | "\n",
512 | " Average class threat AUC:\t0.988594\n",
513 | " Out-of-fold class threat AUC:\t0.988375\n",
514 | "\n",
515 | " Average class insult AUC:\t0.988594\n",
516 | " Out-of-fold class insult AUC:\t0.988550\n",
517 | "\n",
518 | " Average class identity_hate AUC:\t0.988470\n",
519 | " Out-of-fold class identity_hate AUC:\t0.988419\n",
520 | "\n",
521 | " Overall AUC:\t0.989772\n"
522 | ]
523 | }
524 | ],
525 | "source": [
526 | "from sklearn.model_selection import cross_val_score, StratifiedKFold\n",
527 | "from sklearn.metrics import log_loss, matthews_corrcoef, roc_auc_score\n",
528 | "from sklearn.linear_model import LogisticRegression\n",
529 | "\n",
530 | "\n",
531 | "if __name__ == \"__main__\":\n",
532 | " \n",
533 | " train = pd.read_csv('train.csv').fillna(' ')\n",
534 | " test = pd.read_csv('test.csv').fillna(' ')\n",
535 | " sub = pd.read_csv('sample_submission.csv')\n",
536 | " submission = pd.DataFrame.from_dict({'id': test['id']})\n",
537 | " submission_oof = train[['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]\n",
538 | "\n",
539 | " INPUT_COLUMN = \"comment_text\"\n",
540 | " LABELS = train.columns[2:]\n",
541 | " \n",
542 | " # Import submissions and OOF files\n",
543 | " # 29: LightGBM trained on Fasttext (CV: 0.9765, LB: 0.9620)\n",
544 | " # 51: Logistic regression with word and char n-grams (CV: 0.9858, LB: ?)\n",
545 | " # 52: LSTM trained on Fasttext (CV: ?, LB: 0.9851)\n",
546 | " subnums = [1,2,3,4,5,6,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29]\n",
547 | " subs, oofs = get_subs(subnums)\n",
548 | " \n",
549 | " # Engineer features\n",
550 | " feature_functions = [len, asterix_freq, uppercase_freq]\n",
551 | " features = [f.__name__ for f in feature_functions]\n",
552 | " F_train = engineer_features(train[INPUT_COLUMN], feature_functions)\n",
553 | " F_test = engineer_features(test[INPUT_COLUMN], feature_functions)\n",
554 | " \n",
555 | " train_features = np.hstack([F_train[features].as_matrix(), oofs])\n",
556 | " X_test = np.hstack([F_test[features].as_matrix(), subs]) \n",
557 | " skf = StratifiedKFold(n_splits=10, shuffle=False)\n",
558 | " \n",
559 | " scores_classes = np.zeros((len(LABELS), 10))\n",
560 | " all_parameters = {\n",
561 | " 'C' : [1.048113, 0.1930, 0.596362, 0.25595, 0.449843, 0.25595],\n",
562 | " 'tol' : [0.1, 0.1, 0.046416, 0.0215443, 0.1, 0.01],\n",
563 | " 'solver' : ['lbfgs', 'newton-cg', 'lbfgs', 'newton-cg', 'newton-cg', 'lbfgs'],\n",
564 | " 'fit_intercept' : [True, True, True, True, True, True],\n",
565 | " 'penalty' : ['l2', 'l2', 'l2', 'l2', 'l2', 'l2'],\n",
566 | " 'class_weight' : [None, 'balanced', 'balanced', 'balanced', 'balanced', 'balanced'],\n",
567 | " }\n",
568 | " for j, (class_name) in enumerate(LABELS):\n",
569 | " avreal = train[class_name]\n",
570 | " lr_avpred = np.zeros(train.shape[0])\n",
571 | " #stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n",
572 | " for i, (train_index, val_index) in enumerate(skf.split(train_features, train[class_name].values)):\n",
573 | " #print(train_index)\n",
574 | " stacker = LogisticRegression(\n",
575 | " C=all_parameters['C'][j],\n",
576 | " max_iter=200,\n",
577 | " tol=all_parameters['tol'][j],\n",
578 | " solver=all_parameters['solver'][j],\n",
579 | " fit_intercept=all_parameters['fit_intercept'][j],\n",
580 | " penalty=all_parameters['penalty'][j],\n",
581 | " dual=False,\n",
582 | " class_weight=all_parameters['class_weight'][j],\n",
583 | " verbose=0)\n",
584 | " X_train, X_val = train_features[train_index], train_features[val_index]\n",
585 | " y_train, y_val = train.loc[train_index], train.loc[val_index]\n",
586 | " stacker.fit(X_train, y_train[class_name])\n",
587 | " \n",
588 | " scores_val = stacker.predict_proba(X_val)[:, 1]\n",
589 | " scores_classes[j][i] = roc_auc_score(y_val[class_name], scores_val)\n",
590 | " lr_avpred[val_index] = scores_val\n",
591 | " \n",
592 | " lr_y_pred = stacker.predict_proba(X_test)[:, 1]\n",
593 | " if i > 0:\n",
594 | " lr_fpred = lr_pred + lr_y_pred\n",
595 | " else:\n",
596 | " lr_fpred = lr_y_pred\n",
597 | "\n",
598 | " lr_pred = lr_fpred\n",
599 | " \n",
600 | " lr_oof_auc = roc_auc_score(avreal, lr_avpred)\n",
601 | " print('\\n Average class %s AUC:\\t%.6f' % (class_name, np.mean(scores_classes[j])))\n",
602 | " print(' Out-of-fold class %s AUC:\\t%.6f' % (class_name, lr_oof_auc))\n",
603 | " submission[class_name] = lr_pred / 10\n",
604 | " submission_oof[class_name] = lr_avpred\n",
605 | " \n",
606 | " print('\\n Overall AUC:\\t%.6f' % (np.mean(scores_classes)))\n",
607 | " submission.to_csv('logisticreg_stacktwo_pred.csv', index=False)\n",
608 | " submission_oof.to_csv('logisticreg_stacktwo_meta.csv', index=False)\n",
609 | "\n",
610 | " '''\n",
611 | " stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n",
612 | " \n",
613 | " # Fit and submit\n",
614 | " \n",
615 | " scores = []\n",
616 | " for label in LABELS:\n",
617 | " print(label)\n",
618 | " score = cross_val_score(stacker, X_train, train[label], cv=10, scoring='roc_auc')\n",
619 | " print(\"AUC:\", score)\n",
620 | " scores.append(np.mean(score))\n",
621 | " stacker.fit(X_train, train[label])\n",
622 | " sub[label] = stacker.predict_proba(X_test)[:,1]\n",
623 | " print(\"CV score:\", np.mean(scores))\n",
624 | " \n",
625 | " sub.to_csv(\"29modelstack.csv\", index=False)\n",
626 | " '''"
627 | ]
628 | },
629 | {
630 | "cell_type": "code",
631 | "execution_count": 44,
632 | "metadata": {},
633 | "outputs": [
634 | {
635 | "name": "stdout",
636 | "output_type": "stream",
637 | "text": [
638 | "\n",
639 | " Average class toxic AUC:\t0.987652\n",
640 | " Out-of-fold class toxic AUC:\t0.987478\n"
641 | ]
642 | },
643 | {
644 | "name": "stderr",
645 | "output_type": "stream",
646 | "text": [
647 | "/home/stgc/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:74: SettingWithCopyWarning: \n",
648 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
649 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
650 | "\n",
651 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
652 | ]
653 | },
654 | {
655 | "name": "stdout",
656 | "output_type": "stream",
657 | "text": [
658 | "\n",
659 | " Average class severe_toxic AUC:\t0.991304\n",
660 | " Out-of-fold class severe_toxic AUC:\t0.991147\n",
661 | "\n",
662 | " Average class obscene AUC:\t0.995268\n",
663 | " Out-of-fold class obscene AUC:\t0.995247\n",
664 | "\n",
665 | " Average class threat AUC:\t0.989995\n",
666 | " Out-of-fold class threat AUC:\t0.989601\n",
667 | "\n",
668 | " Average class insult AUC:\t0.989765\n",
669 | " Out-of-fold class insult AUC:\t0.989735\n",
670 | "\n",
671 | " Average class identity_hate AUC:\t0.990650\n",
672 | " Out-of-fold class identity_hate AUC:\t0.990048\n",
673 | "\n",
674 | " Overall AUC:\t0.990772\n"
675 | ]
676 | }
677 | ],
678 | "source": [
679 | "from sklearn.model_selection import cross_val_score, StratifiedKFold\n",
680 | "from sklearn.metrics import log_loss, matthews_corrcoef, roc_auc_score\n",
681 | "from sklearn.linear_model import LogisticRegression\n",
682 | "from xgboost import XGBClassifier\n",
683 | "\n",
684 | "\n",
685 | "if __name__ == \"__main__\":\n",
686 | " \n",
687 | " train = pd.read_csv('train.csv').fillna(' ')\n",
688 | " test = pd.read_csv('test.csv').fillna(' ')\n",
689 | " sub = pd.read_csv('sample_submission.csv')\n",
690 | " submission = pd.DataFrame.from_dict({'id': test['id']})\n",
691 | " submission_oof = train[['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]\n",
692 | "\n",
693 | " INPUT_COLUMN = \"comment_text\"\n",
694 | " LABELS = train.columns[2:]\n",
695 | " \n",
696 | " # Import submissions and OOF files\n",
697 | " # 29: LightGBM trained on Fasttext (CV: 0.9765, LB: 0.9620)\n",
698 | " # 51: Logistic regression with word and char n-grams (CV: 0.9858, LB: ?)\n",
699 | " # 52: LSTM trained on Fasttext (CV: ?, LB: 0.9851)\n",
700 | " subnums = [1,2,3,4,5,6,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29]\n",
701 | " subs, oofs = get_subs(subnums)\n",
702 | " \n",
703 | " # Engineer features\n",
704 | " feature_functions = [len, asterix_freq, uppercase_freq]\n",
705 | " features = [f.__name__ for f in feature_functions]\n",
706 | " F_train = engineer_features(train[INPUT_COLUMN], feature_functions)\n",
707 | " F_test = engineer_features(test[INPUT_COLUMN], feature_functions)\n",
708 | " \n",
709 | " train_features = np.hstack([F_train[features].as_matrix(), oofs])\n",
710 | " X_test = np.hstack([F_test[features].as_matrix(), subs]) \n",
711 | " skf = StratifiedKFold(n_splits=10, shuffle=False)\n",
712 | " \n",
713 | " scores_classes = np.zeros((len(LABELS), 10))\n",
714 | "\n",
715 | " for j, (class_name) in enumerate(LABELS):\n",
716 | " avreal = train[class_name]\n",
717 | " lr_avpred = np.zeros(train.shape[0])\n",
718 | " #stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n",
719 | " for i, (train_index, val_index) in enumerate(skf.split(train_features, train[class_name].values)):\n",
720 | " #print(train_index)\n",
721 | " n_estimators = 200\n",
722 | " stacker = clf = XGBClassifier(n_estimators=n_estimators,\n",
723 | " max_depth=4,\n",
724 | " objective=\"binary:logistic\",\n",
725 | " learning_rate=.1, \n",
726 | " subsample=.8, \n",
727 | " colsample_bytree=.8,\n",
728 | " gamma=1,\n",
729 | " reg_alpha=0,\n",
730 | " reg_lambda=1,\n",
731 | " nthread=2)\n",
732 | " X_train, X_val = train_features[train_index], train_features[val_index]\n",
733 | " y_train, y_val = train.loc[train_index], train.loc[val_index]\n",
734 | " stacker.fit(X_train, y_train[class_name])\n",
735 | " \n",
736 | " scores_val = stacker.predict_proba(X_val)[:, 1]\n",
737 | " scores_classes[j][i] = roc_auc_score(y_val[class_name], scores_val)\n",
738 | " lr_avpred[val_index] = scores_val\n",
739 | " \n",
740 | " lr_y_pred = stacker.predict_proba(X_test)[:, 1]\n",
741 | " if i > 0:\n",
742 | " lr_fpred = lr_pred + lr_y_pred\n",
743 | " else:\n",
744 | " lr_fpred = lr_y_pred\n",
745 | "\n",
746 | " lr_pred = lr_fpred\n",
747 | " \n",
748 | " lr_oof_auc = roc_auc_score(avreal, lr_avpred)\n",
749 | " print('\\n Average class %s AUC:\\t%.6f' % (class_name, np.mean(scores_classes[j])))\n",
750 | " print(' Out-of-fold class %s AUC:\\t%.6f' % (class_name, lr_oof_auc))\n",
751 | " submission[class_name] = lr_pred / 10\n",
752 | " submission_oof[class_name] = lr_avpred\n",
753 | " \n",
754 | " print('\\n Overall AUC:\\t%.6f' % (np.mean(scores_classes)))\n",
755 | " submission.to_csv('XGB_stacktwo_pred.csv', index=False)\n",
756 | " submission_oof.to_csv('XGB_stacktwo_meta.csv', index=False)\n",
757 | "\n",
758 | " '''\n",
759 | " stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n",
760 | " \n",
761 | " # Fit and submit\n",
762 | " \n",
763 | " scores = []\n",
764 | " for label in LABELS:\n",
765 | " print(label)\n",
766 | " score = cross_val_score(stacker, X_train, train[label], cv=10, scoring='roc_auc')\n",
767 | " print(\"AUC:\", score)\n",
768 | " scores.append(np.mean(score))\n",
769 | " stacker.fit(X_train, train[label])\n",
770 | " sub[label] = stacker.predict_proba(X_test)[:,1]\n",
771 | " print(\"CV score:\", np.mean(scores))\n",
772 | " \n",
773 | " sub.to_csv(\"29modelstack.csv\", index=False)\n",
774 | " '''"
775 | ]
776 | },
777 | {
778 | "cell_type": "code",
779 | "execution_count": 52,
780 | "metadata": {},
781 | "outputs": [],
782 | "source": [
783 | "def golden_features(data):\n",
784 | " df = pd.DataFrame()\n",
785 | " df['total_length'] = data['comment_text'].apply(len)\n",
786 | " df['capitals'] = data['comment_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))\n",
787 | " df['caps_vs_length'] = df.apply(lambda row: float(row['capitals'])/float(row['total_length']),\n",
788 | " axis=1)\n",
789 | " df['num_exclamation_marks'] = data['comment_text'].apply(lambda comment: comment.count('!'))\n",
790 | " df['num_question_marks'] = data['comment_text'].apply(lambda comment: comment.count('?'))\n",
791 | " df['num_punctuation'] = data['comment_text'].apply(\n",
792 | " lambda comment: sum(comment.count(w) for w in '.,;:'))\n",
793 | " df['num_symbols'] = data['comment_text'].apply(\n",
794 | " lambda comment: sum(comment.count(w) for w in '*&$%'))\n",
795 | " df['num_words'] = data['comment_text'].apply(lambda comment: len(comment.split()))\n",
796 | " df['num_unique_words'] = data['comment_text'].apply(\n",
797 | " lambda comment: len(set(w for w in comment.split())))\n",
798 | " df['words_vs_unique'] = df['num_unique_words'] / df['num_words']\n",
799 | " df['num_smilies'] = data['comment_text'].apply(\n",
800 | " lambda comment: sum(comment.count(w) for w in (':-)', ':)', ';-)', ';)')))\n",
801 | " return df"
802 | ]
803 | },
804 | {
805 | "cell_type": "code",
806 | "execution_count": 57,
807 | "metadata": {},
808 | "outputs": [
809 | {
810 | "name": "stdout",
811 | "output_type": "stream",
812 | "text": [
813 | "\n",
814 | " Average class toxic AUC:\t0.988588\n",
815 | " Out-of-fold class toxic AUC:\t0.988451\n"
816 | ]
817 | },
818 | {
819 | "name": "stderr",
820 | "output_type": "stream",
821 | "text": [
822 | "/home/stgc/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:82: SettingWithCopyWarning: \n",
823 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
824 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
825 | "\n",
826 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
827 | ]
828 | },
829 | {
830 | "name": "stdout",
831 | "output_type": "stream",
832 | "text": [
833 | "\n",
834 | " Average class severe_toxic AUC:\t0.991879\n",
835 | " Out-of-fold class severe_toxic AUC:\t0.991808\n",
836 | "\n",
837 | " Average class obscene AUC:\t0.995325\n",
838 | " Out-of-fold class obscene AUC:\t0.995282\n",
839 | "\n",
840 | " Average class threat AUC:\t0.990770\n",
841 | " Out-of-fold class threat AUC:\t0.990069\n",
842 | "\n",
843 | " Average class insult AUC:\t0.989822\n",
844 | " Out-of-fold class insult AUC:\t0.989714\n",
845 | "\n",
846 | " Average class identity_hate AUC:\t0.990839\n",
847 | " Out-of-fold class identity_hate AUC:\t0.990254\n",
848 | "\n",
849 | " Overall AUC:\t0.991204\n",
850 | "toxic\n",
851 | "('AUC:', array([0.98889795, 0.98861989, 0.98971198, 0.9868848 , 0.98862791,\n",
852 | " 0.9890668 , 0.98846785, 0.98917377, 0.98753618, 0.9888971 ]))\n",
853 | "severe_toxic\n",
854 | "('AUC:', array([0.99273583, 0.99111359, 0.99293898, 0.99054152, 0.99058346,\n",
855 | " 0.99194787, 0.99232698, 0.99152614, 0.99322736, 0.99184623]))\n",
856 | "obscene\n",
857 | "('AUC:', array([0.99580642, 0.99464038, 0.99561256, 0.99475678, 0.9959103 ,\n",
858 | " 0.99596112, 0.99524442, 0.99531185, 0.99552059, 0.99448298]))\n",
859 | "threat\n",
860 | "('AUC:', array([0.98517704, 0.99588571, 0.99611617, 0.98161418, 0.98804791,\n",
861 | " 0.98691385, 0.9933096 , 0.99055304, 0.99403656, 0.99604131]))\n",
862 | "insult\n",
863 | "('AUC:', array([0.98973901, 0.99019773, 0.98993672, 0.98867237, 0.98993016,\n",
864 | " 0.99099737, 0.9898002 , 0.98913562, 0.99063964, 0.9891739 ]))\n",
865 | "identity_hate\n",
866 | "('AUC:', array([0.98968835, 0.99280467, 0.98910903, 0.99082099, 0.99160926,\n",
867 | " 0.99028667, 0.98763187, 0.98932094, 0.99433892, 0.99277969]))\n",
868 | "('CV score:', 0.9912038016791836)\n"
869 | ]
870 | }
871 | ],
872 | "source": [
873 | "from sklearn.model_selection import cross_val_score, StratifiedKFold\n",
874 | "from sklearn.metrics import log_loss, matthews_corrcoef, roc_auc_score\n",
875 | "from sklearn.linear_model import LogisticRegression\n",
876 | "\n",
877 | "def get_subs(nums):\n",
878 | " subs = np.hstack([np.array(pd.read_csv(\"SUB_two/sub\" + str(num) + \".csv\")[LABELS]) for num in subnums])\n",
879 | " oofs = np.hstack([np.array(pd.read_csv(\"OOF_two/oof\" + str(num) + \".csv\")[LABELS]) for num in subnums])\n",
880 | " return subs, oofs\n",
881 | "\n",
882 | "if __name__ == \"__main__\":\n",
883 | " \n",
884 | " train = pd.read_csv('train.csv').fillna(' ')\n",
885 | " test = pd.read_csv('test.csv').fillna(' ')\n",
886 | " sub = pd.read_csv('sample_submission.csv')\n",
887 | " submission = pd.DataFrame.from_dict({'id': test['id']})\n",
888 | " submission_oof = train[['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]\n",
889 | "\n",
890 | " INPUT_COLUMN = \"comment_text\"\n",
891 | " LABELS = train.columns[2:]\n",
892 | " \n",
893 | " # Import submissions and OOF files\n",
894 | " # 29: LightGBM trained on Fasttext (CV: 0.9765, LB: 0.9620)\n",
895 | " # 51: Logistic regression with word and char n-grams (CV: 0.9858, LB: ?)\n",
896 | " # 52: LSTM trained on Fasttext (CV: ?, LB: 0.9851)\n",
897 | " subnums = [1,2,3]\n",
898 | " subs, oofs = get_subs(subnums)\n",
899 | " \n",
900 | " # Engineer features\n",
901 | " feature_functions = [len, asterix_freq, uppercase_freq]\n",
902 | " features = [f.__name__ for f in feature_functions]\n",
903 | " F_train = engineer_features(train[INPUT_COLUMN], feature_functions)\n",
904 | " F_test = engineer_features(test[INPUT_COLUMN], feature_functions)\n",
905 | " \n",
906 | " \n",
907 | "\n",
908 | " train_features_pri = np.hstack([F_train[features].as_matrix(), oofs])\n",
909 | " X_test_pri = np.hstack([F_test[features].as_matrix(), subs]) \n",
910 | " \n",
911 | " gold_F = ('total_length', 'capitals', 'caps_vs_length', 'num_exclamation_marks',\n",
912 | " 'num_question_marks', 'num_punctuation', 'num_words', 'num_unique_words',\n",
913 | " 'words_vs_unique', 'num_smilies', 'num_symbols')\n",
914 | " gold_Feature = [g for g in gold_F]\n",
915 | " \n",
916 | " G_train = golden_features(train)\n",
917 | " G_test = golden_features(test)\n",
918 | " \n",
919 | " train_features = np.hstack([G_train[gold_Feature].as_matrix(), train_features_pri])\n",
920 | " X_test = np.hstack([G_test[gold_Feature].as_matrix(), X_test_pri])\n",
921 | " \n",
922 | " \n",
923 | " skf = StratifiedKFold(n_splits=10, shuffle=False)\n",
924 | " \n",
925 | " scores_classes = np.zeros((len(LABELS), 10))\n",
926 | "\n",
927 | " for j, (class_name) in enumerate(LABELS):\n",
928 | " avreal = train[class_name]\n",
929 | " lr_avpred = np.zeros(train.shape[0])\n",
930 | " #stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n",
931 | " for i, (train_index, val_index) in enumerate(skf.split(train_features, train[class_name].values)):\n",
932 | " #print(train_index)\n",
933 | " stacker = stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n",
934 | " X_train, X_val = train_features[train_index], train_features[val_index]\n",
935 | " y_train, y_val = train.loc[train_index], train.loc[val_index]\n",
936 | " stacker.fit(X_train, y_train[class_name])\n",
937 | " \n",
938 | " scores_val = stacker.predict_proba(X_val)[:, 1]\n",
939 | " scores_classes[j][i] = roc_auc_score(y_val[class_name], scores_val)\n",
940 | " lr_avpred[val_index] = scores_val\n",
941 | " \n",
942 | " lr_y_pred = stacker.predict_proba(X_test)[:, 1]\n",
943 | " if i > 0:\n",
944 | " lr_fpred = lr_pred + lr_y_pred\n",
945 | " else:\n",
946 | " lr_fpred = lr_y_pred\n",
947 | "\n",
948 | " lr_pred = lr_fpred\n",
949 | " \n",
950 | " lr_oof_auc = roc_auc_score(avreal, lr_avpred)\n",
951 | " print('\\n Average class %s AUC:\\t%.6f' % (class_name, np.mean(scores_classes[j])))\n",
952 | " print(' Out-of-fold class %s AUC:\\t%.6f' % (class_name, lr_oof_auc))\n",
953 | " submission[class_name] = lr_pred / 10\n",
954 | " submission_oof[class_name] = lr_avpred\n",
955 | " \n",
956 | " print('\\n Overall AUC:\\t%.6f' % (np.mean(scores_classes)))\n",
957 | " submission.to_csv('lgbm_stack_final_pred.csv', index=False)\n",
958 | " submission_oof.to_csv('LGBM_stack_final_meta.csv', index=False)\n",
959 | "\n",
960 | " \n",
961 | " stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n",
962 | " \n",
963 | " # Fit and submit\n",
964 | " \n",
965 | " scores = []\n",
966 | " for label in LABELS:\n",
967 | " print(label)\n",
968 | " score = cross_val_score(stacker, train_features, train[label], cv=10, scoring='roc_auc')\n",
969 | " print(\"AUC:\", score)\n",
970 | " scores.append(np.mean(score))\n",
971 | " stacker.fit(train_features, train[label])\n",
972 | " sub[label] = stacker.predict_proba(X_test)[:,1]\n",
973 | " print(\"CV score:\", np.mean(scores))\n",
974 | " \n",
975 | " sub.to_csv(\"TRAIN_ALL_STACK_LOG_REG.csv\", index=False)\n",
976 | " "
977 | ]
978 | },
979 | {
980 | "cell_type": "code",
981 | "execution_count": 41,
982 | "metadata": {},
983 | "outputs": [
984 | {
985 | "data": {
986 | "text/html": [
987 | "\n",
988 | "\n",
1001 | "
\n",
1002 | " \n",
1003 | " \n",
1004 | " | \n",
1005 | " id | \n",
1006 | " toxic | \n",
1007 | " severe_toxic | \n",
1008 | " obscene | \n",
1009 | " threat | \n",
1010 | " insult | \n",
1011 | " identity_hate | \n",
1012 | "
\n",
1013 | " \n",
1014 | " \n",
1015 | " \n",
1016 | " | 0 | \n",
1017 | " 00001cee341fdb12 | \n",
1018 | " 0.999646 | \n",
1019 | " 0.992186 | \n",
1020 | " 0.999660 | \n",
1021 | " 0.849823 | \n",
1022 | " 0.996217 | \n",
1023 | " 0.997916 | \n",
1024 | "
\n",
1025 | " \n",
1026 | " | 1 | \n",
1027 | " 0000247867823ef7 | \n",
1028 | " 0.006185 | \n",
1029 | " 0.001481 | \n",
1030 | " 0.008564 | \n",
1031 | " 0.016933 | \n",
1032 | " 0.017819 | \n",
1033 | " 0.020447 | \n",
1034 | "
\n",
1035 | " \n",
1036 | " | 2 | \n",
1037 | " 00013b17ad220c46 | \n",
1038 | " 0.006356 | \n",
1039 | " 0.001386 | \n",
1040 | " 0.009035 | \n",
1041 | " 0.017988 | \n",
1042 | " 0.016962 | \n",
1043 | " 0.018510 | \n",
1044 | "
\n",
1045 | " \n",
1046 | " | 3 | \n",
1047 | " 00017563c3f7919a | \n",
1048 | " 0.004452 | \n",
1049 | " 0.001463 | \n",
1050 | " 0.008542 | \n",
1051 | " 0.027373 | \n",
1052 | " 0.013220 | \n",
1053 | " 0.017579 | \n",
1054 | "
\n",
1055 | " \n",
1056 | " | 4 | \n",
1057 | " 00017695ad8997eb | \n",
1058 | " 0.005858 | \n",
1059 | " 0.001344 | \n",
1060 | " 0.010354 | \n",
1061 | " 0.025551 | \n",
1062 | " 0.014992 | \n",
1063 | " 0.019761 | \n",
1064 | "
\n",
1065 | " \n",
1066 | "
\n",
1067 | "
"
1068 | ],
1069 | "text/plain": [
1070 | " id toxic severe_toxic obscene threat insult \\\n",
1071 | "0 00001cee341fdb12 0.999646 0.992186 0.999660 0.849823 0.996217 \n",
1072 | "1 0000247867823ef7 0.006185 0.001481 0.008564 0.016933 0.017819 \n",
1073 | "2 00013b17ad220c46 0.006356 0.001386 0.009035 0.017988 0.016962 \n",
1074 | "3 00017563c3f7919a 0.004452 0.001463 0.008542 0.027373 0.013220 \n",
1075 | "4 00017695ad8997eb 0.005858 0.001344 0.010354 0.025551 0.014992 \n",
1076 | "\n",
1077 | " identity_hate \n",
1078 | "0 0.997916 \n",
1079 | "1 0.020447 \n",
1080 | "2 0.018510 \n",
1081 | "3 0.017579 \n",
1082 | "4 0.019761 "
1083 | ]
1084 | },
1085 | "execution_count": 41,
1086 | "metadata": {},
1087 | "output_type": "execute_result"
1088 | }
1089 | ],
1090 | "source": [
1091 | "test = pd.read_csv('SUB_two/logisticreg_stacktwo_pred.csv')\n",
1092 | "test.head()\n",
1093 | "#columns_to_drop_test = ['toxic','severe_toxic','threat','insult','identity_hate','obscene']\n",
1094 | "#columns_to_drop_test = ['comment_text']\n",
1095 | "\n",
1096 | "#test = test.drop(columns_to_drop_test, axis=1)\n",
1097 | "\n",
1098 | "#test = test.rename(columns={'toxic_oof': 'toxic', 'severe_toxic_oof': 'severe_toxic', 'obscene_oof': 'obscene', 'threat_oof': 'threat', 'insult_oof': 'insult', 'identity_hate_oof': 'identity_hate'})\n",
1099 | "#test.to_csv(\"29modelstack.csv\", index=False)\n"
1100 | ]
1101 | },
1102 | {
1103 | "cell_type": "code",
1104 | "execution_count": 58,
1105 | "metadata": {},
1106 | "outputs": [
1107 | {
1108 | "ename": "IOError",
1109 | "evalue": "File SUB_two/sub4.csv does not exist",
1110 | "output_type": "error",
1111 | "traceback": [
1112 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1113 | "\u001b[0;31mIOError\u001b[0m Traceback (most recent call last)",
1114 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msubnums\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0msubs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moofs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_subs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msubnums\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
1115 | "\u001b[0;32m\u001b[0m in \u001b[0;36mget_subs\u001b[0;34m(nums)\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_subs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnums\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0msubs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"SUB_two/sub\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\".csv\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mLABELS\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mnum\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msubnums\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0moofs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"OOF_two/oof\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\".csv\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mLABELS\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mnum\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msubnums\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msubs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moofs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1116 | "\u001b[0;32m/home/stgc/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36mparser_f\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)\u001b[0m\n\u001b[1;32m 707\u001b[0m skip_blank_lines=skip_blank_lines)\n\u001b[1;32m 708\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 709\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 710\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 711\u001b[0m \u001b[0mparser_f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1117 | "\u001b[0;32m/home/stgc/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 447\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 448\u001b[0m \u001b[0;31m# Create the parser.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 449\u001b[0;31m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 450\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 451\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1118 | "\u001b[0;32m/home/stgc/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 816\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_index_names'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_index_names'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 817\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 818\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 819\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 820\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1119 | "\u001b[0;32m/home/stgc/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, engine)\u001b[0m\n\u001b[1;32m 1047\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mengine\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'c'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1048\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'c'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1049\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCParserWrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1050\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1051\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'python'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1120 | "\u001b[0;32m/home/stgc/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, src, **kwds)\u001b[0m\n\u001b[1;32m 1693\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'allow_leading_cols'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex_col\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1694\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1695\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparsers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTextReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1696\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1697\u001b[0m \u001b[0;31m# XXX\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1121 | "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.__cinit__\u001b[0;34m()\u001b[0m\n",
1122 | "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._setup_parser_source\u001b[0;34m()\u001b[0m\n",
1123 | "\u001b[0;31mIOError\u001b[0m: File SUB_two/sub4.csv does not exist"
1124 | ]
1125 | }
1126 | ],
1127 | "source": [
1128 | "subnums = [4]\n",
1129 | "subs, oofs = get_subs(subnums)\n"
1130 | ]
1131 | },
1132 | {
1133 | "cell_type": "code",
1134 | "execution_count": null,
1135 | "metadata": {},
1136 | "outputs": [],
1137 | "source": [
1138 | "test = pd.read_csv('OOF/oof28.csv')\n"
1139 | ]
1140 | }
1141 | ],
1142 | "metadata": {
1143 | "kernelspec": {
1144 | "display_name": "Python 2",
1145 | "language": "python",
1146 | "name": "python2"
1147 | },
1148 | "language_info": {
1149 | "codemirror_mode": {
1150 | "name": "ipython",
1151 | "version": 2
1152 | },
1153 | "file_extension": ".py",
1154 | "mimetype": "text/x-python",
1155 | "name": "python",
1156 | "nbconvert_exporter": "python",
1157 | "pygments_lexer": "ipython2",
1158 | "version": "2.7.14"
1159 | }
1160 | },
1161 | "nbformat": 4,
1162 | "nbformat_minor": 2
1163 | }
1164 |
--------------------------------------------------------------------------------