├── EXTRATREES_CLASSIFIER.ipynb ├── HillCLIMBENSEMBLE.ipynb ├── LGBM_LOGREG_XGB_STACK_LOGREG.ipynb ├── README.md ├── RIDGE.ipynb ├── Train Toxicity Model.ipynb ├── Untitled.ipynb ├── XGBOOST.ipynb ├── add_covaai.ipynb ├── badwords.ipynb ├── bagging.ipynb ├── conv.ipynb ├── convai_feature.ipynb ├── ensemble.ipynb ├── fasttext_direct.ipynb ├── feature_engineering.ipynb ├── get_data.sh ├── install.sh ├── model_tool.py ├── nbsvm.ipynb ├── nbsvm.py ├── requirements.txt ├── sample_submission.csv ├── super_nbsvm.ipynb ├── translate.ipynb └── visuals.py /LGBM_LOGREG_XGB_STACK_LOGREG.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "def read_predictions(prediction_dir, concat_mode='concat', per_label=False):\n", 10 | " labels = pd.read_csv(os.path.join(prediction_dir, 'labels.csv'))\n", 11 | "\n", 12 | " filepaths_train, filepaths_test = [], []\n", 13 | " for filepath in sorted(glob.glob('{}/*'.format(prediction_dir))):\n", 14 | " if filepath.endswith('predictions_test_oof.csv'):\n", 15 | " filepaths_test.append(filepath)\n", 16 | "\n", 17 | " test_dfs = []\n", 18 | " for filepath in filepaths_test:\n", 19 | " test_dfs.append(pd.read_csv(filepath))\n", 20 | " test_dfs = reduce(lambda df1, df2: pd.merge(df1, df2, on=['id', 'fold_id']), test_dfs)\n", 21 | " test_dfs.columns = _clean_columns(test_dfs, keep_colnames = ['id','fold_id'])\n", 22 | "\n", 23 | " return train_dfs, test_dfs\n", 24 | "\n", 25 | "def get_fold_xy(test,i):\n", 26 | " #train_split = train[train['fold_id'] != i]\n", 27 | " #valid_split = train[train['fold_id'] == i]\n", 28 | " test_split = test[test['fold_id'] == i]\n", 29 | "\n", 30 | " #y_train = train_split[label_columns].values\n", 31 | " #y_valid = valid_split[label_columns].values\n", 32 | " #columns_to_drop_train = label_columns + ['id','fold_id']\n", 33 | " #X_train = train_split.drop(columns_to_drop_train, axis=1).values\n", 34 | " #X_valid = valid_split.drop(columns_to_drop_train, axis=1).values\n", 35 | "\n", 36 | " columns_to_drop_test = ['id','fold_id']\n", 37 | " X_test = test_split.drop(columns_to_drop_test, axis=1).values\n", 38 | " return X_test" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 40, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "ename": "IOError", 48 | "evalue": "File newstageone/13/word2vec_scnn_predictions_test_oof.csv does not exist", 49 | "output_type": "error", 50 | "traceback": [ 51 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 52 | "\u001b[0;31mIOError\u001b[0m Traceback (most recent call last)", 53 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mpr_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'newstageone/13/word2vec_scnn_predictions_test_oof.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mtest_predicts_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mfor\u001b[0m 
\u001b[0mfold\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 54 | "\u001b[0;32m/home/dcek/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36mparser_f\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)\u001b[0m\n\u001b[1;32m 707\u001b[0m skip_blank_lines=skip_blank_lines)\n\u001b[1;32m 708\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 709\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 710\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 711\u001b[0m \u001b[0mparser_f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 55 | "\u001b[0;32m/home/dcek/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 447\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 448\u001b[0m \u001b[0;31m# Create the parser.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 449\u001b[0;31m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 450\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 451\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 56 | "\u001b[0;32m/home/dcek/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 816\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_index_names'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_index_names'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 817\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 818\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 819\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 820\u001b[0m \u001b[0;32mdef\u001b[0m 
\u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 57 | "\u001b[0;32m/home/dcek/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, engine)\u001b[0m\n\u001b[1;32m 1047\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mengine\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'c'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1048\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'c'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1049\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCParserWrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1050\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1051\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'python'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 58 | "\u001b[0;32m/home/dcek/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, src, **kwds)\u001b[0m\n\u001b[1;32m 1693\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'allow_leading_cols'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex_col\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1694\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1695\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparsers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTextReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1696\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1697\u001b[0m \u001b[0;31m# XXX\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 59 | "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.__cinit__\u001b[0;34m()\u001b[0m\n", 60 | "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._setup_parser_source\u001b[0;34m()\u001b[0m\n", 61 | "\u001b[0;31mIOError\u001b[0m: File newstageone/13/word2vec_scnn_predictions_test_oof.csv does not exist" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "import pandas as pd\n", 67 | "import numpy as np\n", 68 | "\n", 69 | "pr_file = pd.read_csv('newstageone/13/word2vec_scnn_predictions_test_oof.csv')\n", 70 | "test_predicts_list = []\n", 71 | "for fold in range(0,10):\n", 72 | " get_fold_xy(pr_file,fold)\n", 73 | " test_predicts_list.append(get_fold_xy(pr_file,fold))\n", 74 | "#pr_file.to_csv(\"newstageone/13/bad_word_logreg_predictions_test_oof.csv\", index=False)\n" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 39, 80 | "metadata": {}, 81 | 
"outputs": [], 82 | "source": [ 83 | "test = pd.read_csv('cleaned_test.csv')\n", 84 | "test_predicts = np.ones(test_predicts_list[0].shape)\n", 85 | "for fold_predict in test_predicts_list:\n", 86 | " test_predicts *= fold_predict\n", 87 | "\n", 88 | "test_predicts **= (1. / len(test_predicts_list))\n", 89 | "test_ids = test[\"id\"].values\n", 90 | "test_ids = test_ids.reshape((len(test_ids), 1))\n", 91 | "CLASSES = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]\n", 92 | "test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)\n", 93 | "test_predicts[\"id\"] = test_ids\n", 94 | "test_predicts = test_predicts[[\"id\"] + CLASSES]\n", 95 | "test_predicts.to_csv('newstageone/13/word2vec_scnn_predictions_test_oof.csv', index=False)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 72, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "159571" 107 | ] 108 | }, 109 | "execution_count": 72, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "test = pd.read_csv('newstageone/OOF/oof20.csv')\n", 116 | "test.head()\n", 117 | "len(test)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 1, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "import pandas as pd\n", 127 | "import numpy as np\n", 128 | "import re\n", 129 | "import lightgbm as lgb\n", 130 | "import warnings\n", 131 | "warnings.filterwarnings(action='ignore', category=DeprecationWarning, module='sklearn')\n", 132 | "from sklearn.model_selection import cross_val_score, StratifiedKFold\n", 133 | "\n", 134 | "from sklearn.preprocessing import StandardScaler\n", 135 | "from sklearn.model_selection import cross_val_score\n", 136 | "\n", 137 | "\n", 138 | "#######################\n", 139 | "# FEATURE ENGINEERING #\n", 140 | "#######################\n", 141 | "\"\"\"\n", 142 | "Main function\n", 143 | "Input: pandas Series and a feature engineering function\n", 144 | "Output: pandas Series\n", 145 | "\"\"\"\n", 146 | "def engineer_feature(series, func, normalize=True):\n", 147 | " feature = series.apply(func)\n", 148 | " \n", 149 | " if normalize:\n", 150 | " feature = pd.Series(z_normalize(feature.values.reshape(-1,1)).reshape(-1,))\n", 151 | " feature.name = func.__name__ \n", 152 | " return feature\n", 153 | "\n", 154 | "\"\"\"\n", 155 | "Engineer features\n", 156 | "Input: pandas Series and a list of feature engineering functions\n", 157 | "Output: pandas DataFrame\n", 158 | "\"\"\"\n", 159 | "def engineer_features(series, funclist, normalize=True):\n", 160 | " features = pd.DataFrame()\n", 161 | " for func in funclist:\n", 162 | " feature = engineer_feature(series, func, normalize)\n", 163 | " features[feature.name] = feature\n", 164 | " return features\n", 165 | "\n", 166 | "\"\"\"\n", 167 | "Normalizer\n", 168 | "Input: NumPy array\n", 169 | "Output: NumPy array\n", 170 | "\"\"\"\n", 171 | "scaler = StandardScaler()\n", 172 | "def z_normalize(data):\n", 173 | " scaler.fit(data)\n", 174 | " return scaler.transform(data)\n", 175 | " \n", 176 | "\"\"\"\n", 177 | "Feature functions\n", 178 | "\"\"\"\n", 179 | "def asterix_freq(x):\n", 180 | " return x.count('!')/len(x)\n", 181 | "\n", 182 | "def uppercase_freq(x):\n", 183 | " return len(re.findall(r'[A-Z]',x))/len(x)\n", 184 | " \n", 185 | "\"\"\"\n", 186 | "Import submission and OOF files\n", 187 | "\"\"\"\n", 188 | "def get_subs(nums):\n", 189 | " subs = 
np.hstack([np.array(pd.read_csv(\"SUB/sub\" + str(num) + \".csv\")[LABELS]) for num in subnums])\n", 190 | " oofs = np.hstack([np.array(pd.read_csv(\"OOF/oof\" + str(num) + \".csv\")[LABELS]) for num in subnums])\n", 191 | " return subs, oofs\n", 192 | "\n" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "def train_folds(X, y, fold_count, model_list):\n", 202 | " fold_size = len(X) // fold_count\n", 203 | " models = []\n", 204 | " total_meta = []\n", 205 | " auc_list = []\n", 206 | " for fold_id in range(0, fold_count):\n", 207 | " print(\"FOLD {}\".format(fold_id))\n", 208 | " fold_start = fold_size * fold_id\n", 209 | " fold_end = fold_start + fold_size\n", 210 | " \n", 211 | " if fold_id == fold_count - 1:\n", 212 | " fold_end = len(X)\n", 213 | "\n", 214 | " train_x = np.concatenate([X[:fold_start], X[fold_end:]])\n", 215 | " train_y = np.concatenate([y[:fold_start], y[fold_end:]])\n", 216 | "\n", 217 | " val_x = X[fold_start:fold_end]\n", 218 | " val_y = y[fold_start:fold_end]\n", 219 | " \n", 220 | " \n", 221 | " model, best_auc = _train_model(model_list[fold_id], train_x, train_y, val_x, val_y,callbacks)\n", 222 | " \n", 223 | " meta = model.predict(val_x, batch_size=128)\n", 224 | " if (fold_id == 0):\n", 225 | " total_meta = meta\n", 226 | " else:\n", 227 | " total_meta = np.concatenate((total_meta, meta), axis=0)\n", 228 | " model_path = os.path.join('models', \"model{0}_weights.npy\".format(fold_id))\n", 229 | " np.save(model_path, model.get_weights())\n", 230 | " models.append(model)\n", 231 | " auc_list.append(best_auc)\n", 232 | "\n", 233 | " return models, total_meta, auc_list\n", 234 | "\n", 235 | "def _train_model(model, train_x, train_y, val_x, val_y):\n", 236 | " for label in LABELS:\n", 237 | " model = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n", 238 | " model.fit(\n", 239 | " train_x,\n", 240 | " train_x[label])\n", 241 | " \n", 242 | " y_pred = model.predict_proba(val_x)[:,1]\n", 243 | "\n", 244 | " total_auc = 0\n", 245 | " for j in range(6):\n", 246 | " auc = compute_auc(val_y[:, j], y_pred[:, j])\n", 247 | " total_auc += auc\n", 248 | "\n", 249 | " total_loss /= 6.\n", 250 | " total_auc /= 6.\n", 251 | " return model, total_auc\n", 252 | " \n", 253 | "def build_model():\n", 254 | " stackers = []\n", 255 | " for fold in range(0,10):\n", 256 | " stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n", 257 | " stackers.append(stacker)\n", 258 | " return " 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "with timer(\"Scoring Light GBM\"):\n", 268 | " scores = []\n", 269 | " folds = KFold(n_splits=4, shuffle=True, random_state=1)\n", 270 | " lgb_round_dict = defaultdict(int)\n", 271 | " trn_lgbset = lgb.Dataset(csr_trn, free_raw_data=False)\n", 272 | " del csr_trn\n", 273 | " gc.collect()\n", 274 | " \n", 275 | " for class_name in class_names:\n", 276 | " print(\"Class %s scores : \" % class_name)\n", 277 | " class_pred = np.zeros(len(train))\n", 278 | " train_target = train[class_name]\n", 279 | " 
trn_lgbset.set_label(train_target.values)\n", 280 | " \n", 281 | " lgb_rounds = 500\n", 282 | "\n", 283 | " for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, train_target)):\n", 284 | " watchlist = [\n", 285 | " trn_lgbset.subset(trn_idx),\n", 286 | " trn_lgbset.subset(val_idx)\n", 287 | " ]\n", 288 | " # Train lgb l1\n", 289 | " model = lgb.train(\n", 290 | " params=params,\n", 291 | " train_set=watchlist[0],\n", 292 | " num_boost_round=lgb_rounds,\n", 293 | " valid_sets=watchlist,\n", 294 | " early_stopping_rounds=50,\n", 295 | " verbose_eval=0\n", 296 | " )\n", 297 | " class_pred[val_idx] = model.predict(trn_lgbset.data[val_idx], num_iteration=model.best_iteration)\n", 298 | " score = roc_auc_score(train_target.values[val_idx], class_pred[val_idx])\n", 299 | " \n", 300 | " # Compute mean rounds over folds for each class\n", 301 | " # So that it can be re-used for test predictions\n", 302 | " lgb_round_dict[class_name] += model.best_iteration\n", 303 | " print(\"\\t Fold %d : %.6f in %3d rounds\" % (n_fold + 1, score, model.best_iteration))\n", 304 | " \n", 305 | " print(\"full score : %.6f\" % roc_auc_score(train_target, class_pred))\n", 306 | " scores.append(roc_auc_score(train_target, class_pred))\n", 307 | " train[class_name + \"_oof\"] = class_pred\n", 308 | " submission[class_name] = lr_pred / folds\n", 309 | "\n", 310 | " # Save OOF predictions - may be interesting for stacking...\n", 311 | " train[[\"id\"] + class_names + [f + \"_oof\" for f in class_names]].to_csv(\"lvl0_lgbm_clean_oof.csv\",\n", 312 | " index=False,\n", 313 | " float_format=\"%.8f\")\n", 314 | "\n", 315 | " print('Total CV score is {}'.format(np.mean(scores)))\n", 316 | "\n", 317 | "with timer(\"Predicting probabilities\"):\n", 318 | " # Go through all classes and reuse computed number of rounds for each class\n", 319 | " for class_name in class_names:\n", 320 | " with timer(\"Predicting probabilities for %s\" % class_name):\n", 321 | " train_target = train[class_name]\n", 322 | " trn_lgbset.set_label(train_target.values)\n", 323 | " # Train lgb\n", 324 | " model = lgb.train(\n", 325 | " params=params,\n", 326 | " train_set=trn_lgbset,\n", 327 | " num_boost_round=int(lgb_round_dict[class_name] / folds.n_splits)\n", 328 | " )\n", 329 | " submission[class_name] = model.predict(csr_sub, num_iteration=model.best_iteration)\n", 330 | "\n", 331 | "submission.to_csv(\"lvl0_lgbm_clean_sub.csv\", index=False, float_format=\"%.8f\")" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 39, 344 | "metadata": {}, 345 | "outputs": [ 346 | { 347 | "name": "stdout", 348 | "output_type": "stream", 349 | "text": [ 350 | "\n", 351 | " Average class toxic AUC:\t0.987802\n", 352 | " Out-of-fold class toxic AUC:\t0.987650\n" 353 | ] 354 | }, 355 | { 356 | "name": "stderr", 357 | "output_type": "stream", 358 | "text": [ 359 | "/home/stgc/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:61: SettingWithCopyWarning: \n", 360 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 361 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 362 | "\n", 363 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 364 | ] 365 | }, 366 | { 367 | "name": "stdout", 368 | "output_type": "stream", 369 | "text": [ 370 | "\n", 371 | " Average class severe_toxic 
AUC:\t0.991559\n", 372 | " Out-of-fold class severe_toxic AUC:\t0.991465\n", 373 | "\n", 374 | " Average class obscene AUC:\t0.995286\n", 375 | " Out-of-fold class obscene AUC:\t0.995268\n", 376 | "\n", 377 | " Average class threat AUC:\t0.991848\n", 378 | " Out-of-fold class threat AUC:\t0.991016\n", 379 | "\n", 380 | " Average class insult AUC:\t0.989942\n", 381 | " Out-of-fold class insult AUC:\t0.989909\n", 382 | "\n", 383 | " Average class identity_hate AUC:\t0.990874\n", 384 | " Out-of-fold class identity_hate AUC:\t0.990406\n", 385 | "\n", 386 | " Overall AUC:\t0.991219\n" 387 | ] 388 | } 389 | ], 390 | "source": [ 391 | "from sklearn.model_selection import cross_val_score, StratifiedKFold\n", 392 | "from sklearn.metrics import log_loss, matthews_corrcoef, roc_auc_score\n", 393 | "\n", 394 | "\n", 395 | "if __name__ == \"__main__\":\n", 396 | " \n", 397 | " train = pd.read_csv('train.csv').fillna(' ')\n", 398 | " test = pd.read_csv('test.csv').fillna(' ')\n", 399 | " sub = pd.read_csv('sample_submission.csv')\n", 400 | " submission = pd.DataFrame.from_dict({'id': test['id']})\n", 401 | " submission_oof = train[['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]\n", 402 | "\n", 403 | " INPUT_COLUMN = \"comment_text\"\n", 404 | " LABELS = train.columns[2:]\n", 405 | " \n", 406 | " # Import submissions and OOF files\n", 407 | " # 29: LightGBM trained on Fasttext (CV: 0.9765, LB: 0.9620)\n", 408 | " # 51: Logistic regression with word and char n-grams (CV: 0.9858, LB: ?)\n", 409 | " # 52: LSTM trained on Fasttext (CV: ?, LB: 0.9851)\n", 410 | " subnums = [1,2,3,4,5,6,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29]\n", 411 | " subs, oofs = get_subs(subnums)\n", 412 | " \n", 413 | " # Engineer features\n", 414 | " feature_functions = [len, asterix_freq, uppercase_freq]\n", 415 | " features = [f.__name__ for f in feature_functions]\n", 416 | " F_train = engineer_features(train[INPUT_COLUMN], feature_functions)\n", 417 | " F_test = engineer_features(test[INPUT_COLUMN], feature_functions)\n", 418 | " \n", 419 | " train_features = np.hstack([F_train[features].as_matrix(), oofs])\n", 420 | " X_test = np.hstack([F_test[features].as_matrix(), subs]) \n", 421 | " skf = StratifiedKFold(n_splits=10, shuffle=False)\n", 422 | " \n", 423 | " scores_classes = np.zeros((len(LABELS), 10))\n", 424 | " for j, (class_name) in enumerate(LABELS):\n", 425 | " avreal = train[class_name]\n", 426 | " lr_avpred = np.zeros(train.shape[0])\n", 427 | " #stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n", 428 | " for i, (train_index, val_index) in enumerate(skf.split(train_features, train[class_name].values)):\n", 429 | " #print(train_index)\n", 430 | " stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n", 431 | " X_train, X_val = train_features[train_index], train_features[val_index]\n", 432 | " y_train, y_val = train.loc[train_index], train.loc[val_index]\n", 433 | " stacker.fit(X_train, y_train[class_name])\n", 434 | " \n", 435 | " scores_val = stacker.predict_proba(X_val)[:, 1]\n", 436 | " scores_classes[j][i] = roc_auc_score(y_val[class_name], scores_val)\n", 437 | " lr_avpred[val_index] = scores_val\n", 438 
| " \n", 439 | " lr_y_pred = stacker.predict_proba(X_test)[:, 1]\n", 440 | " if i > 0:\n", 441 | " lr_fpred = lr_pred + lr_y_pred\n", 442 | " else:\n", 443 | " lr_fpred = lr_y_pred\n", 444 | "\n", 445 | " lr_pred = lr_fpred\n", 446 | " \n", 447 | " lr_oof_auc = roc_auc_score(avreal, lr_avpred)\n", 448 | " print('\\n Average class %s AUC:\\t%.6f' % (class_name, np.mean(scores_classes[j])))\n", 449 | " print(' Out-of-fold class %s AUC:\\t%.6f' % (class_name, lr_oof_auc))\n", 450 | " submission[class_name] = lr_pred / 10\n", 451 | " submission_oof[class_name] = lr_avpred\n", 452 | " \n", 453 | " print('\\n Overall AUC:\\t%.6f' % (np.mean(scores_classes)))\n", 454 | " submission.to_csv('lgb_stacktwo_pred.csv', index=False)\n", 455 | " submission_oof.to_csv('lgb_stacktwo_meta.csv', index=False)\n", 456 | "\n", 457 | " '''\n", 458 | " stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n", 459 | " \n", 460 | " # Fit and submit\n", 461 | " \n", 462 | " scores = []\n", 463 | " for label in LABELS:\n", 464 | " print(label)\n", 465 | " score = cross_val_score(stacker, X_train, train[label], cv=10, scoring='roc_auc')\n", 466 | " print(\"AUC:\", score)\n", 467 | " scores.append(np.mean(score))\n", 468 | " stacker.fit(X_train, train[label])\n", 469 | " sub[label] = stacker.predict_proba(X_test)[:,1]\n", 470 | " print(\"CV score:\", np.mean(scores))\n", 471 | " \n", 472 | " sub.to_csv(\"29modelstack.csv\", index=False)\n", 473 | " '''" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 40, 479 | "metadata": {}, 480 | "outputs": [ 481 | { 482 | "name": "stdout", 483 | "output_type": "stream", 484 | "text": [ 485 | "\n", 486 | " Average class toxic AUC:\t0.987147\n", 487 | " Out-of-fold class toxic AUC:\t0.987087\n" 488 | ] 489 | }, 490 | { 491 | "name": "stderr", 492 | "output_type": "stream", 493 | "text": [ 494 | "/home/stgc/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:79: SettingWithCopyWarning: \n", 495 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 496 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 497 | "\n", 498 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 499 | ] 500 | }, 501 | { 502 | "name": "stdout", 503 | "output_type": "stream", 504 | "text": [ 505 | "\n", 506 | " Average class severe_toxic AUC:\t0.991326\n", 507 | " Out-of-fold class severe_toxic AUC:\t0.991270\n", 508 | "\n", 509 | " Average class obscene AUC:\t0.994503\n", 510 | " Out-of-fold class obscene AUC:\t0.994479\n", 511 | "\n", 512 | " Average class threat AUC:\t0.988594\n", 513 | " Out-of-fold class threat AUC:\t0.988375\n", 514 | "\n", 515 | " Average class insult AUC:\t0.988594\n", 516 | " Out-of-fold class insult AUC:\t0.988550\n", 517 | "\n", 518 | " Average class identity_hate AUC:\t0.988470\n", 519 | " Out-of-fold class identity_hate AUC:\t0.988419\n", 520 | "\n", 521 | " Overall AUC:\t0.989772\n" 522 | ] 523 | } 524 | ], 525 | "source": [ 526 | "from sklearn.model_selection import cross_val_score, StratifiedKFold\n", 527 | "from sklearn.metrics import log_loss, matthews_corrcoef, roc_auc_score\n", 528 | "from sklearn.linear_model import LogisticRegression\n", 529 | "\n", 530 | "\n", 531 | "if __name__ == \"__main__\":\n", 532 | " \n", 533 | " train = 
pd.read_csv('train.csv').fillna(' ')\n", 534 | " test = pd.read_csv('test.csv').fillna(' ')\n", 535 | " sub = pd.read_csv('sample_submission.csv')\n", 536 | " submission = pd.DataFrame.from_dict({'id': test['id']})\n", 537 | " submission_oof = train[['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]\n", 538 | "\n", 539 | " INPUT_COLUMN = \"comment_text\"\n", 540 | " LABELS = train.columns[2:]\n", 541 | " \n", 542 | " # Import submissions and OOF files\n", 543 | " # 29: LightGBM trained on Fasttext (CV: 0.9765, LB: 0.9620)\n", 544 | " # 51: Logistic regression with word and char n-grams (CV: 0.9858, LB: ?)\n", 545 | " # 52: LSTM trained on Fasttext (CV: ?, LB: 0.9851)\n", 546 | " subnums = [1,2,3,4,5,6,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29]\n", 547 | " subs, oofs = get_subs(subnums)\n", 548 | " \n", 549 | " # Engineer features\n", 550 | " feature_functions = [len, asterix_freq, uppercase_freq]\n", 551 | " features = [f.__name__ for f in feature_functions]\n", 552 | " F_train = engineer_features(train[INPUT_COLUMN], feature_functions)\n", 553 | " F_test = engineer_features(test[INPUT_COLUMN], feature_functions)\n", 554 | " \n", 555 | " train_features = np.hstack([F_train[features].as_matrix(), oofs])\n", 556 | " X_test = np.hstack([F_test[features].as_matrix(), subs]) \n", 557 | " skf = StratifiedKFold(n_splits=10, shuffle=False)\n", 558 | " \n", 559 | " scores_classes = np.zeros((len(LABELS), 10))\n", 560 | " all_parameters = {\n", 561 | " 'C' : [1.048113, 0.1930, 0.596362, 0.25595, 0.449843, 0.25595],\n", 562 | " 'tol' : [0.1, 0.1, 0.046416, 0.0215443, 0.1, 0.01],\n", 563 | " 'solver' : ['lbfgs', 'newton-cg', 'lbfgs', 'newton-cg', 'newton-cg', 'lbfgs'],\n", 564 | " 'fit_intercept' : [True, True, True, True, True, True],\n", 565 | " 'penalty' : ['l2', 'l2', 'l2', 'l2', 'l2', 'l2'],\n", 566 | " 'class_weight' : [None, 'balanced', 'balanced', 'balanced', 'balanced', 'balanced'],\n", 567 | " }\n", 568 | " for j, (class_name) in enumerate(LABELS):\n", 569 | " avreal = train[class_name]\n", 570 | " lr_avpred = np.zeros(train.shape[0])\n", 571 | " #stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n", 572 | " for i, (train_index, val_index) in enumerate(skf.split(train_features, train[class_name].values)):\n", 573 | " #print(train_index)\n", 574 | " stacker = LogisticRegression(\n", 575 | " C=all_parameters['C'][j],\n", 576 | " max_iter=200,\n", 577 | " tol=all_parameters['tol'][j],\n", 578 | " solver=all_parameters['solver'][j],\n", 579 | " fit_intercept=all_parameters['fit_intercept'][j],\n", 580 | " penalty=all_parameters['penalty'][j],\n", 581 | " dual=False,\n", 582 | " class_weight=all_parameters['class_weight'][j],\n", 583 | " verbose=0)\n", 584 | " X_train, X_val = train_features[train_index], train_features[val_index]\n", 585 | " y_train, y_val = train.loc[train_index], train.loc[val_index]\n", 586 | " stacker.fit(X_train, y_train[class_name])\n", 587 | " \n", 588 | " scores_val = stacker.predict_proba(X_val)[:, 1]\n", 589 | " scores_classes[j][i] = roc_auc_score(y_val[class_name], scores_val)\n", 590 | " lr_avpred[val_index] = scores_val\n", 591 | " \n", 592 | " lr_y_pred = stacker.predict_proba(X_test)[:, 1]\n", 593 | " if i > 0:\n", 594 | " lr_fpred = lr_pred + lr_y_pred\n", 595 | " else:\n", 596 | " lr_fpred = lr_y_pred\n", 597 | "\n", 598 | " lr_pred 
= lr_fpred\n", 599 | " \n", 600 | " lr_oof_auc = roc_auc_score(avreal, lr_avpred)\n", 601 | " print('\\n Average class %s AUC:\\t%.6f' % (class_name, np.mean(scores_classes[j])))\n", 602 | " print(' Out-of-fold class %s AUC:\\t%.6f' % (class_name, lr_oof_auc))\n", 603 | " submission[class_name] = lr_pred / 10\n", 604 | " submission_oof[class_name] = lr_avpred\n", 605 | " \n", 606 | " print('\\n Overall AUC:\\t%.6f' % (np.mean(scores_classes)))\n", 607 | " submission.to_csv('logisticreg_stacktwo_pred.csv', index=False)\n", 608 | " submission_oof.to_csv('logisticreg_stacktwo_meta.csv', index=False)\n", 609 | "\n", 610 | " '''\n", 611 | " stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n", 612 | " \n", 613 | " # Fit and submit\n", 614 | " \n", 615 | " scores = []\n", 616 | " for label in LABELS:\n", 617 | " print(label)\n", 618 | " score = cross_val_score(stacker, X_train, train[label], cv=10, scoring='roc_auc')\n", 619 | " print(\"AUC:\", score)\n", 620 | " scores.append(np.mean(score))\n", 621 | " stacker.fit(X_train, train[label])\n", 622 | " sub[label] = stacker.predict_proba(X_test)[:,1]\n", 623 | " print(\"CV score:\", np.mean(scores))\n", 624 | " \n", 625 | " sub.to_csv(\"29modelstack.csv\", index=False)\n", 626 | " '''" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": 44, 632 | "metadata": {}, 633 | "outputs": [ 634 | { 635 | "name": "stdout", 636 | "output_type": "stream", 637 | "text": [ 638 | "\n", 639 | " Average class toxic AUC:\t0.987652\n", 640 | " Out-of-fold class toxic AUC:\t0.987478\n" 641 | ] 642 | }, 643 | { 644 | "name": "stderr", 645 | "output_type": "stream", 646 | "text": [ 647 | "/home/stgc/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:74: SettingWithCopyWarning: \n", 648 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 649 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 650 | "\n", 651 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 652 | ] 653 | }, 654 | { 655 | "name": "stdout", 656 | "output_type": "stream", 657 | "text": [ 658 | "\n", 659 | " Average class severe_toxic AUC:\t0.991304\n", 660 | " Out-of-fold class severe_toxic AUC:\t0.991147\n", 661 | "\n", 662 | " Average class obscene AUC:\t0.995268\n", 663 | " Out-of-fold class obscene AUC:\t0.995247\n", 664 | "\n", 665 | " Average class threat AUC:\t0.989995\n", 666 | " Out-of-fold class threat AUC:\t0.989601\n", 667 | "\n", 668 | " Average class insult AUC:\t0.989765\n", 669 | " Out-of-fold class insult AUC:\t0.989735\n", 670 | "\n", 671 | " Average class identity_hate AUC:\t0.990650\n", 672 | " Out-of-fold class identity_hate AUC:\t0.990048\n", 673 | "\n", 674 | " Overall AUC:\t0.990772\n" 675 | ] 676 | } 677 | ], 678 | "source": [ 679 | "from sklearn.model_selection import cross_val_score, StratifiedKFold\n", 680 | "from sklearn.metrics import log_loss, matthews_corrcoef, roc_auc_score\n", 681 | "from sklearn.linear_model import LogisticRegression\n", 682 | "from xgboost import XGBClassifier\n", 683 | "\n", 684 | "\n", 685 | "if __name__ == \"__main__\":\n", 686 | " \n", 687 | " train = pd.read_csv('train.csv').fillna(' ')\n", 688 | " test = pd.read_csv('test.csv').fillna(' ')\n", 689 | " sub = pd.read_csv('sample_submission.csv')\n", 690 | " submission = 
pd.DataFrame.from_dict({'id': test['id']})\n", 691 | " submission_oof = train[['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]\n", 692 | "\n", 693 | " INPUT_COLUMN = \"comment_text\"\n", 694 | " LABELS = train.columns[2:]\n", 695 | " \n", 696 | " # Import submissions and OOF files\n", 697 | " # 29: LightGBM trained on Fasttext (CV: 0.9765, LB: 0.9620)\n", 698 | " # 51: Logistic regression with word and char n-grams (CV: 0.9858, LB: ?)\n", 699 | " # 52: LSTM trained on Fasttext (CV: ?, LB: 0.9851)\n", 700 | " subnums = [1,2,3,4,5,6,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29]\n", 701 | " subs, oofs = get_subs(subnums)\n", 702 | " \n", 703 | " # Engineer features\n", 704 | " feature_functions = [len, asterix_freq, uppercase_freq]\n", 705 | " features = [f.__name__ for f in feature_functions]\n", 706 | " F_train = engineer_features(train[INPUT_COLUMN], feature_functions)\n", 707 | " F_test = engineer_features(test[INPUT_COLUMN], feature_functions)\n", 708 | " \n", 709 | " train_features = np.hstack([F_train[features].as_matrix(), oofs])\n", 710 | " X_test = np.hstack([F_test[features].as_matrix(), subs]) \n", 711 | " skf = StratifiedKFold(n_splits=10, shuffle=False)\n", 712 | " \n", 713 | " scores_classes = np.zeros((len(LABELS), 10))\n", 714 | "\n", 715 | " for j, (class_name) in enumerate(LABELS):\n", 716 | " avreal = train[class_name]\n", 717 | " lr_avpred = np.zeros(train.shape[0])\n", 718 | " #stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n", 719 | " for i, (train_index, val_index) in enumerate(skf.split(train_features, train[class_name].values)):\n", 720 | " #print(train_index)\n", 721 | " n_estimators = 200\n", 722 | " stacker = clf = XGBClassifier(n_estimators=n_estimators,\n", 723 | " max_depth=4,\n", 724 | " objective=\"binary:logistic\",\n", 725 | " learning_rate=.1, \n", 726 | " subsample=.8, \n", 727 | " colsample_bytree=.8,\n", 728 | " gamma=1,\n", 729 | " reg_alpha=0,\n", 730 | " reg_lambda=1,\n", 731 | " nthread=2)\n", 732 | " X_train, X_val = train_features[train_index], train_features[val_index]\n", 733 | " y_train, y_val = train.loc[train_index], train.loc[val_index]\n", 734 | " stacker.fit(X_train, y_train[class_name])\n", 735 | " \n", 736 | " scores_val = stacker.predict_proba(X_val)[:, 1]\n", 737 | " scores_classes[j][i] = roc_auc_score(y_val[class_name], scores_val)\n", 738 | " lr_avpred[val_index] = scores_val\n", 739 | " \n", 740 | " lr_y_pred = stacker.predict_proba(X_test)[:, 1]\n", 741 | " if i > 0:\n", 742 | " lr_fpred = lr_pred + lr_y_pred\n", 743 | " else:\n", 744 | " lr_fpred = lr_y_pred\n", 745 | "\n", 746 | " lr_pred = lr_fpred\n", 747 | " \n", 748 | " lr_oof_auc = roc_auc_score(avreal, lr_avpred)\n", 749 | " print('\\n Average class %s AUC:\\t%.6f' % (class_name, np.mean(scores_classes[j])))\n", 750 | " print(' Out-of-fold class %s AUC:\\t%.6f' % (class_name, lr_oof_auc))\n", 751 | " submission[class_name] = lr_pred / 10\n", 752 | " submission_oof[class_name] = lr_avpred\n", 753 | " \n", 754 | " print('\\n Overall AUC:\\t%.6f' % (np.mean(scores_classes)))\n", 755 | " submission.to_csv('XGB_stacktwo_pred.csv', index=False)\n", 756 | " submission_oof.to_csv('XGB_stacktwo_meta.csv', index=False)\n", 757 | "\n", 758 | " '''\n", 759 | " stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, 
boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n", 760 | " \n", 761 | " # Fit and submit\n", 762 | " \n", 763 | " scores = []\n", 764 | " for label in LABELS:\n", 765 | " print(label)\n", 766 | " score = cross_val_score(stacker, X_train, train[label], cv=10, scoring='roc_auc')\n", 767 | " print(\"AUC:\", score)\n", 768 | " scores.append(np.mean(score))\n", 769 | " stacker.fit(X_train, train[label])\n", 770 | " sub[label] = stacker.predict_proba(X_test)[:,1]\n", 771 | " print(\"CV score:\", np.mean(scores))\n", 772 | " \n", 773 | " sub.to_csv(\"29modelstack.csv\", index=False)\n", 774 | " '''" 775 | ] 776 | }, 777 | { 778 | "cell_type": "code", 779 | "execution_count": 52, 780 | "metadata": {}, 781 | "outputs": [], 782 | "source": [ 783 | "def golden_features(data):\n", 784 | " df = pd.DataFrame()\n", 785 | " df['total_length'] = data['comment_text'].apply(len)\n", 786 | " df['capitals'] = data['comment_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))\n", 787 | " df['caps_vs_length'] = df.apply(lambda row: float(row['capitals'])/float(row['total_length']),\n", 788 | " axis=1)\n", 789 | " df['num_exclamation_marks'] = data['comment_text'].apply(lambda comment: comment.count('!'))\n", 790 | " df['num_question_marks'] = data['comment_text'].apply(lambda comment: comment.count('?'))\n", 791 | " df['num_punctuation'] = data['comment_text'].apply(\n", 792 | " lambda comment: sum(comment.count(w) for w in '.,;:'))\n", 793 | " df['num_symbols'] = data['comment_text'].apply(\n", 794 | " lambda comment: sum(comment.count(w) for w in '*&$%'))\n", 795 | " df['num_words'] = data['comment_text'].apply(lambda comment: len(comment.split()))\n", 796 | " df['num_unique_words'] = data['comment_text'].apply(\n", 797 | " lambda comment: len(set(w for w in comment.split())))\n", 798 | " df['words_vs_unique'] = df['num_unique_words'] / df['num_words']\n", 799 | " df['num_smilies'] = data['comment_text'].apply(\n", 800 | " lambda comment: sum(comment.count(w) for w in (':-)', ':)', ';-)', ';)')))\n", 801 | " return df" 802 | ] 803 | }, 804 | { 805 | "cell_type": "code", 806 | "execution_count": 57, 807 | "metadata": {}, 808 | "outputs": [ 809 | { 810 | "name": "stdout", 811 | "output_type": "stream", 812 | "text": [ 813 | "\n", 814 | " Average class toxic AUC:\t0.988588\n", 815 | " Out-of-fold class toxic AUC:\t0.988451\n" 816 | ] 817 | }, 818 | { 819 | "name": "stderr", 820 | "output_type": "stream", 821 | "text": [ 822 | "/home/stgc/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:82: SettingWithCopyWarning: \n", 823 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 824 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 825 | "\n", 826 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 827 | ] 828 | }, 829 | { 830 | "name": "stdout", 831 | "output_type": "stream", 832 | "text": [ 833 | "\n", 834 | " Average class severe_toxic AUC:\t0.991879\n", 835 | " Out-of-fold class severe_toxic AUC:\t0.991808\n", 836 | "\n", 837 | " Average class obscene AUC:\t0.995325\n", 838 | " Out-of-fold class obscene AUC:\t0.995282\n", 839 | "\n", 840 | " Average class threat AUC:\t0.990770\n", 841 | " Out-of-fold class threat AUC:\t0.990069\n", 842 | "\n", 843 | " Average class insult AUC:\t0.989822\n", 844 | " Out-of-fold class insult AUC:\t0.989714\n", 845 | "\n", 846 | " Average class 
identity_hate AUC:\t0.990839\n", 847 | " Out-of-fold class identity_hate AUC:\t0.990254\n", 848 | "\n", 849 | " Overall AUC:\t0.991204\n", 850 | "toxic\n", 851 | "('AUC:', array([0.98889795, 0.98861989, 0.98971198, 0.9868848 , 0.98862791,\n", 852 | " 0.9890668 , 0.98846785, 0.98917377, 0.98753618, 0.9888971 ]))\n", 853 | "severe_toxic\n", 854 | "('AUC:', array([0.99273583, 0.99111359, 0.99293898, 0.99054152, 0.99058346,\n", 855 | " 0.99194787, 0.99232698, 0.99152614, 0.99322736, 0.99184623]))\n", 856 | "obscene\n", 857 | "('AUC:', array([0.99580642, 0.99464038, 0.99561256, 0.99475678, 0.9959103 ,\n", 858 | " 0.99596112, 0.99524442, 0.99531185, 0.99552059, 0.99448298]))\n", 859 | "threat\n", 860 | "('AUC:', array([0.98517704, 0.99588571, 0.99611617, 0.98161418, 0.98804791,\n", 861 | " 0.98691385, 0.9933096 , 0.99055304, 0.99403656, 0.99604131]))\n", 862 | "insult\n", 863 | "('AUC:', array([0.98973901, 0.99019773, 0.98993672, 0.98867237, 0.98993016,\n", 864 | " 0.99099737, 0.9898002 , 0.98913562, 0.99063964, 0.9891739 ]))\n", 865 | "identity_hate\n", 866 | "('AUC:', array([0.98968835, 0.99280467, 0.98910903, 0.99082099, 0.99160926,\n", 867 | " 0.99028667, 0.98763187, 0.98932094, 0.99433892, 0.99277969]))\n", 868 | "('CV score:', 0.9912038016791836)\n" 869 | ] 870 | } 871 | ], 872 | "source": [ 873 | "from sklearn.model_selection import cross_val_score, StratifiedKFold\n", 874 | "from sklearn.metrics import log_loss, matthews_corrcoef, roc_auc_score\n", 875 | "from sklearn.linear_model import LogisticRegression\n", 876 | "\n", 877 | "def get_subs(nums):\n", 878 | " subs = np.hstack([np.array(pd.read_csv(\"SUB_two/sub\" + str(num) + \".csv\")[LABELS]) for num in subnums])\n", 879 | " oofs = np.hstack([np.array(pd.read_csv(\"OOF_two/oof\" + str(num) + \".csv\")[LABELS]) for num in subnums])\n", 880 | " return subs, oofs\n", 881 | "\n", 882 | "if __name__ == \"__main__\":\n", 883 | " \n", 884 | " train = pd.read_csv('train.csv').fillna(' ')\n", 885 | " test = pd.read_csv('test.csv').fillna(' ')\n", 886 | " sub = pd.read_csv('sample_submission.csv')\n", 887 | " submission = pd.DataFrame.from_dict({'id': test['id']})\n", 888 | " submission_oof = train[['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]\n", 889 | "\n", 890 | " INPUT_COLUMN = \"comment_text\"\n", 891 | " LABELS = train.columns[2:]\n", 892 | " \n", 893 | " # Import submissions and OOF files\n", 894 | " # 29: LightGBM trained on Fasttext (CV: 0.9765, LB: 0.9620)\n", 895 | " # 51: Logistic regression with word and char n-grams (CV: 0.9858, LB: ?)\n", 896 | " # 52: LSTM trained on Fasttext (CV: ?, LB: 0.9851)\n", 897 | " subnums = [1,2,3]\n", 898 | " subs, oofs = get_subs(subnums)\n", 899 | " \n", 900 | " # Engineer features\n", 901 | " feature_functions = [len, asterix_freq, uppercase_freq]\n", 902 | " features = [f.__name__ for f in feature_functions]\n", 903 | " F_train = engineer_features(train[INPUT_COLUMN], feature_functions)\n", 904 | " F_test = engineer_features(test[INPUT_COLUMN], feature_functions)\n", 905 | " \n", 906 | " \n", 907 | "\n", 908 | " train_features_pri = np.hstack([F_train[features].as_matrix(), oofs])\n", 909 | " X_test_pri = np.hstack([F_test[features].as_matrix(), subs]) \n", 910 | " \n", 911 | " gold_F = ('total_length', 'capitals', 'caps_vs_length', 'num_exclamation_marks',\n", 912 | " 'num_question_marks', 'num_punctuation', 'num_words', 'num_unique_words',\n", 913 | " 'words_vs_unique', 'num_smilies', 'num_symbols')\n", 914 | " gold_Feature = [g for g in gold_F]\n", 915 | " 
\n", 916 | " G_train = golden_features(train)\n", 917 | " G_test = golden_features(test)\n", 918 | " \n", 919 | " train_features = np.hstack([G_train[gold_Feature].as_matrix(), train_features_pri])\n", 920 | " X_test = np.hstack([G_test[gold_Feature].as_matrix(), X_test_pri])\n", 921 | " \n", 922 | " \n", 923 | " skf = StratifiedKFold(n_splits=10, shuffle=False)\n", 924 | " \n", 925 | " scores_classes = np.zeros((len(LABELS), 10))\n", 926 | "\n", 927 | " for j, (class_name) in enumerate(LABELS):\n", 928 | " avreal = train[class_name]\n", 929 | " lr_avpred = np.zeros(train.shape[0])\n", 930 | " #stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n", 931 | " for i, (train_index, val_index) in enumerate(skf.split(train_features, train[class_name].values)):\n", 932 | " #print(train_index)\n", 933 | " stacker = stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n", 934 | " X_train, X_val = train_features[train_index], train_features[val_index]\n", 935 | " y_train, y_val = train.loc[train_index], train.loc[val_index]\n", 936 | " stacker.fit(X_train, y_train[class_name])\n", 937 | " \n", 938 | " scores_val = stacker.predict_proba(X_val)[:, 1]\n", 939 | " scores_classes[j][i] = roc_auc_score(y_val[class_name], scores_val)\n", 940 | " lr_avpred[val_index] = scores_val\n", 941 | " \n", 942 | " lr_y_pred = stacker.predict_proba(X_test)[:, 1]\n", 943 | " if i > 0:\n", 944 | " lr_fpred = lr_pred + lr_y_pred\n", 945 | " else:\n", 946 | " lr_fpred = lr_y_pred\n", 947 | "\n", 948 | " lr_pred = lr_fpred\n", 949 | " \n", 950 | " lr_oof_auc = roc_auc_score(avreal, lr_avpred)\n", 951 | " print('\\n Average class %s AUC:\\t%.6f' % (class_name, np.mean(scores_classes[j])))\n", 952 | " print(' Out-of-fold class %s AUC:\\t%.6f' % (class_name, lr_oof_auc))\n", 953 | " submission[class_name] = lr_pred / 10\n", 954 | " submission_oof[class_name] = lr_avpred\n", 955 | " \n", 956 | " print('\\n Overall AUC:\\t%.6f' % (np.mean(scores_classes)))\n", 957 | " submission.to_csv('lgbm_stack_final_pred.csv', index=False)\n", 958 | " submission_oof.to_csv('LGBM_stack_final_meta.csv', index=False)\n", 959 | "\n", 960 | " \n", 961 | " stacker = lgb.LGBMClassifier(max_depth=3, metric=\"auc\", n_estimators=125, num_leaves=10, boosting_type=\"gbdt\", learning_rate=0.1, feature_fraction=0.45, colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)\n", 962 | " \n", 963 | " # Fit and submit\n", 964 | " \n", 965 | " scores = []\n", 966 | " for label in LABELS:\n", 967 | " print(label)\n", 968 | " score = cross_val_score(stacker, train_features, train[label], cv=10, scoring='roc_auc')\n", 969 | " print(\"AUC:\", score)\n", 970 | " scores.append(np.mean(score))\n", 971 | " stacker.fit(train_features, train[label])\n", 972 | " sub[label] = stacker.predict_proba(X_test)[:,1]\n", 973 | " print(\"CV score:\", np.mean(scores))\n", 974 | " \n", 975 | " sub.to_csv(\"TRAIN_ALL_STACK_LOG_REG.csv\", index=False)\n", 976 | " " 977 | ] 978 | }, 979 | { 980 | "cell_type": "code", 981 | "execution_count": 41, 982 | "metadata": {}, 983 | "outputs": [ 984 | { 985 | "data": { 986 | "text/html": [ 987 | "
\n", 988 | "\n", 1001 | "\n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | "
idtoxicsevere_toxicobscenethreatinsultidentity_hate
000001cee341fdb120.9996460.9921860.9996600.8498230.9962170.997916
10000247867823ef70.0061850.0014810.0085640.0169330.0178190.020447
200013b17ad220c460.0063560.0013860.0090350.0179880.0169620.018510
300017563c3f7919a0.0044520.0014630.0085420.0273730.0132200.017579
400017695ad8997eb0.0058580.0013440.0103540.0255510.0149920.019761
\n", 1067 | "
" 1068 | ], 1069 | "text/plain": [ 1070 | " id toxic severe_toxic obscene threat insult \\\n", 1071 | "0 00001cee341fdb12 0.999646 0.992186 0.999660 0.849823 0.996217 \n", 1072 | "1 0000247867823ef7 0.006185 0.001481 0.008564 0.016933 0.017819 \n", 1073 | "2 00013b17ad220c46 0.006356 0.001386 0.009035 0.017988 0.016962 \n", 1074 | "3 00017563c3f7919a 0.004452 0.001463 0.008542 0.027373 0.013220 \n", 1075 | "4 00017695ad8997eb 0.005858 0.001344 0.010354 0.025551 0.014992 \n", 1076 | "\n", 1077 | " identity_hate \n", 1078 | "0 0.997916 \n", 1079 | "1 0.020447 \n", 1080 | "2 0.018510 \n", 1081 | "3 0.017579 \n", 1082 | "4 0.019761 " 1083 | ] 1084 | }, 1085 | "execution_count": 41, 1086 | "metadata": {}, 1087 | "output_type": "execute_result" 1088 | } 1089 | ], 1090 | "source": [ 1091 | "test = pd.read_csv('SUB_two/logisticreg_stacktwo_pred.csv')\n", 1092 | "test.head()\n", 1093 | "#columns_to_drop_test = ['toxic','severe_toxic','threat','insult','identity_hate','obscene']\n", 1094 | "#columns_to_drop_test = ['comment_text']\n", 1095 | "\n", 1096 | "#test = test.drop(columns_to_drop_test, axis=1)\n", 1097 | "\n", 1098 | "#test = test.rename(columns={'toxic_oof': 'toxic', 'severe_toxic_oof': 'severe_toxic', 'obscene_oof': 'obscene', 'threat_oof': 'threat', 'insult_oof': 'insult', 'identity_hate_oof': 'identity_hate'})\n", 1099 | "#test.to_csv(\"29modelstack.csv\", index=False)\n" 1100 | ] 1101 | }, 1102 | { 1103 | "cell_type": "code", 1104 | "execution_count": 58, 1105 | "metadata": {}, 1106 | "outputs": [ 1107 | { 1108 | "ename": "IOError", 1109 | "evalue": "File SUB_two/sub4.csv does not exist", 1110 | "output_type": "error", 1111 | "traceback": [ 1112 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 1113 | "\u001b[0;31mIOError\u001b[0m Traceback (most recent call last)", 1114 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msubnums\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0msubs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moofs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_subs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msubnums\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 1115 | "\u001b[0;32m\u001b[0m in \u001b[0;36mget_subs\u001b[0;34m(nums)\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_subs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnums\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0msubs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"SUB_two/sub\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\".csv\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mLABELS\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mnum\u001b[0m \u001b[0;32min\u001b[0m 
\u001b[0msubnums\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0moofs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"OOF_two/oof\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\".csv\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mLABELS\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mnum\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msubnums\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msubs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moofs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1116 | "\u001b[0;32m/home/stgc/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36mparser_f\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)\u001b[0m\n\u001b[1;32m 707\u001b[0m skip_blank_lines=skip_blank_lines)\n\u001b[1;32m 708\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 709\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 710\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 711\u001b[0m \u001b[0mparser_f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1117 | "\u001b[0;32m/home/stgc/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 447\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 448\u001b[0m \u001b[0;31m# Create the parser.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 449\u001b[0;31m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 450\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 451\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1118 | "\u001b[0;32m/home/stgc/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in 
\u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 816\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_index_names'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_index_names'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 817\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 818\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 819\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 820\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1119 | "\u001b[0;32m/home/stgc/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, engine)\u001b[0m\n\u001b[1;32m 1047\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mengine\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'c'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1048\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'c'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1049\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCParserWrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1050\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1051\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'python'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1120 | "\u001b[0;32m/home/stgc/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, src, **kwds)\u001b[0m\n\u001b[1;32m 1693\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'allow_leading_cols'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex_col\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1694\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1695\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparsers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTextReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1696\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1697\u001b[0m \u001b[0;31m# XXX\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1121 | "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in 
\u001b[0;36mpandas._libs.parsers.TextReader.__cinit__\u001b[0;34m()\u001b[0m\n", 1122 | "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._setup_parser_source\u001b[0;34m()\u001b[0m\n", 1123 | "\u001b[0;31mIOError\u001b[0m: File SUB_two/sub4.csv does not exist" 1124 | ] 1125 | } 1126 | ], 1127 | "source": [ 1128 | "subnums = [4]\n", 1129 | "subs, oofs = get_subs(subnums)\n" 1130 | ] 1131 | }, 1132 | { 1133 | "cell_type": "code", 1134 | "execution_count": null, 1135 | "metadata": {}, 1136 | "outputs": [], 1137 | "source": [ 1138 | "test = pd.read_csv('OOF/oof28.csv')\n" 1139 | ] 1140 | } 1141 | ], 1142 | "metadata": { 1143 | "kernelspec": { 1144 | "display_name": "Python 2", 1145 | "language": "python", 1146 | "name": "python2" 1147 | }, 1148 | "language_info": { 1149 | "codemirror_mode": { 1150 | "name": "ipython", 1151 | "version": 2 1152 | }, 1153 | "file_extension": ".py", 1154 | "mimetype": "text/x-python", 1155 | "name": "python", 1156 | "nbconvert_exporter": "python", 1157 | "pygments_lexer": "ipython2", 1158 | "version": "2.7.14" 1159 | } 1160 | }, 1161 | "nbformat": 4, 1162 | "nbformat_minor": 2 1163 | } 1164 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Toxic Comment Classification 2 | 3 | This is my codes for the toxic comment classification competition hosted in [Kaggle](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge). Fully modified to another level from the base code [here](https://github.com/conversationai/unintended-ml-bias-analysis/tree/master/unintended_ml_bias) 4 | 5 | 6 | To download datasets please run get_data.sh 7 | ## The Task 8 | The dataset comprises of comments from Wikipedia’s talk page edits. It is a large number of Wikipedia comments which have been labeled by human raters for toxic behavior. 
The types of toxicity are: 9 | 10 | > * `toxic` 11 | > * `severe_toxic` 12 | > * `obscene` 13 | > * `threat` 14 | > * `insult` 15 | > * `identity_hate` 16 | 17 | 18 | ## The Approach 19 | 20 | Creating an ensemble model which predicts a probability of each type of toxicity for each comment.Full explaination of my approach is documented [here](https://medium.com/@dickson_chin93/my-solution-to-achieve-top-1-in-a-novel-data-science-nlp-competition-db8db2ee356a) 21 | 22 | 23 | 24 | ## Install Pre-requisites 25 | 26 | run install.sh and then run 27 | pip install -r requirements.txt 28 | 29 | ## Tips 30 | 31 | - Make sure embeddings original preprocessing is used to ensure highest percentage of embeddings can be imported 32 | -------------------------------------------------------------------------------- /RIDGE.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Tfidf word vector\n", 13 | "Tfidf char vector\n", 14 | "stack both\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "\n", 22 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 23 | "from sklearn.linear_model import Ridge\n", 24 | "from sklearn.model_selection import cross_val_score\n", 25 | "from scipy.sparse import hstack\n", 26 | "from sklearn.metrics import log_loss, matthews_corrcoef, roc_auc_score\n", 27 | "from sklearn.model_selection import cross_val_score\n", 28 | "from sklearn.model_selection import StratifiedKFold\n", 29 | "from sklearn.model_selection import KFold\n", 30 | "from sklearn.linear_model import Lasso\n", 31 | "from sklearn.linear_model import ElasticNet\n", 32 | "\n", 33 | "class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\n", 34 | "\n", 35 | "train = pd.read_csv('train_translated_sp_clean.csv').fillna(' ')\n", 36 | "test = pd.read_csv('test_translated_sp_clean.csv').fillna(' ')\n", 37 | "\n", 38 | "train_text = train['comment_text']\n", 39 | "test_text = test['comment_text']\n", 40 | "all_text = pd.concat([train_text, test_text])\n", 41 | "\n", 42 | "class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\n", 43 | "tr_ids = train[['id']]\n", 44 | "train[class_names] = train[class_names].astype(np.int8)\n", 45 | "target = train[class_names]\n", 46 | "\n", 47 | "print('Tfidf word vector')\n", 48 | "word_vectorizer = TfidfVectorizer(\n", 49 | " sublinear_tf=True,\n", 50 | " strip_accents='unicode',\n", 51 | " analyzer='word',\n", 52 | " token_pattern=r'\\w{1,}',\n", 53 | " stop_words='english',\n", 54 | " ngram_range=(1, 1),\n", 55 | " max_features=10000)\n", 56 | "word_vectorizer.fit(all_text)\n", 57 | "train_word_features = word_vectorizer.transform(train_text)\n", 58 | "test_word_features = word_vectorizer.transform(test_text)\n", 59 | "\n", 60 | "print('Tfidf char vector')\n", 61 | "char_vectorizer = TfidfVectorizer(\n", 62 | " sublinear_tf=True,\n", 63 | " strip_accents='unicode',\n", 64 | " analyzer='char',\n", 65 | " stop_words='english',\n", 66 | " ngram_range=(2, 6),\n", 67 | " max_features=50000)\n", 68 | "char_vectorizer.fit(all_text)\n", 69 | "train_char_features = char_vectorizer.transform(train_text)\n", 70 | "test_char_features = char_vectorizer.transform(test_text)\n", 71 | "\n", 72 | "print('stack both')\n", 73 | "#train_features = 
hstack([train_char_features, train_word_features])\n", 74 | "#test_features = hstack([test_char_features, test_word_features])\n", 75 | "\n", 76 | "#train_features = train_word_features\n", 77 | "#test_features = test_word_features\n", 78 | "\n", 79 | "train_features = hstack([train_char_features, train_word_features]).tocsr()\n", 80 | "test_features = hstack([test_char_features, test_word_features]).tocsr()\n", 81 | "\n", 82 | "scores = []\n", 83 | "scores_classes = np.zeros((len(class_names), 10))\n", 84 | "\n", 85 | "submission = pd.DataFrame.from_dict({'id': test['id']})\n", 86 | "submission_oof = train[['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]\n", 87 | "\n", 88 | "idpred = tr_ids\n", 89 | "number_of_folds = 10\n", 90 | "\n", 91 | "#kfolder=StratifiedKFold(train_text, n_folds=number_of_folds,shuffle=True, random_state=15)\n", 92 | "\n" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 8, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "from sklearn.model_selection import StratifiedKFold\n", 102 | "\n", 103 | "number_of_folds = 10\n", 104 | "#kfolder = KFold(n_splits=number_of_folds, shuffle=True, random_state=239)\n", 105 | "kfolder= StratifiedKFold(n_splits=number_of_folds,shuffle=True, random_state=15)\n", 106 | "scores_classes = np.zeros((len(class_names), 10))\n" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 9, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "class_name is: toxic\n", 119 | "[ 0 1 2 ... 159568 159569 159570]\n", 120 | "[ 4 63 64 ... 159490 159503 159510]\n", 121 | "\n", 122 | " Fold 01 class toxic AUC: 0.978946\n", 123 | "[ 0 1 2 ... 159568 159569 159570]\n", 124 | "[ 10 26 32 ... 159548 159557 159563]\n", 125 | "\n", 126 | " Fold 02 class toxic AUC: 0.980277\n", 127 | "[ 0 2 3 ... 159568 159569 159570]\n", 128 | "[ 1 16 18 ... 159542 159547 159564]\n", 129 | "\n", 130 | " Fold 03 class toxic AUC: 0.979126\n", 131 | "[ 0 1 2 ... 159568 159569 159570]\n", 132 | "[ 17 21 30 ... 159536 159555 159566]\n", 133 | "\n", 134 | " Fold 04 class toxic AUC: 0.976820\n", 135 | "[ 0 1 3 ... 159568 159569 159570]\n", 136 | "[ 2 8 9 ... 159559 159560 159562]\n", 137 | "\n", 138 | " Fold 05 class toxic AUC: 0.977847\n", 139 | "[ 0 1 2 ... 159568 159569 159570]\n", 140 | "[ 20 22 24 ... 159546 159551 159558]\n", 141 | "\n", 142 | " Fold 06 class toxic AUC: 0.979972\n", 143 | "[ 1 2 3 ... 159566 159569 159570]\n", 144 | "[ 0 5 23 ... 159550 159567 159568]\n", 145 | "\n", 146 | " Fold 07 class toxic AUC: 0.981081\n", 147 | "[ 0 1 2 ... 159567 159568 159570]\n", 148 | "[ 3 6 11 ... 159554 159561 159569]\n", 149 | "\n", 150 | " Fold 08 class toxic AUC: 0.981545\n", 151 | "[ 0 1 2 ... 159567 159568 159569]\n", 152 | "[ 7 14 15 ... 159549 159553 159570]\n", 153 | "\n", 154 | " Fold 09 class toxic AUC: 0.982126\n", 155 | "[ 0 1 2 ... 159568 159569 159570]\n", 156 | "[ 31 46 54 ... 
159517 159539 159565]\n", 157 | "\n", 158 | " Fold 10 class toxic AUC: 0.978534\n", 159 | "\n", 160 | " Average class toxic AUC:\t0.979627\n", 161 | " Out-of-fold class toxic AUC:\t0.979634\n" 162 | ] 163 | }, 164 | { 165 | "name": "stderr", 166 | "output_type": "stream", 167 | "text": [ 168 | "/home/stgc/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:33: SettingWithCopyWarning: \n", 169 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 170 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 171 | "\n", 172 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 173 | ] 174 | }, 175 | { 176 | "name": "stdout", 177 | "output_type": "stream", 178 | "text": [ 179 | "class_name is: severe_toxic\n", 180 | "[ 0 1 2 ... 159568 159569 159570]\n", 181 | "[ 4 7 32 ... 159548 159549 159563]\n", 182 | "\n", 183 | " Fold 01 class severe_toxic AUC: 0.988811\n", 184 | "[ 0 1 2 ... 159568 159569 159570]\n", 185 | "[ 10 15 24 ... 159534 159535 159560]\n", 186 | "\n", 187 | " Fold 02 class severe_toxic AUC: 0.989085\n", 188 | "[ 0 1 2 ... 159568 159569 159570]\n", 189 | "[ 6 9 12 ... 159531 159555 159559]\n", 190 | "\n", 191 | " Fold 03 class severe_toxic AUC: 0.988796\n", 192 | "[ 0 2 3 ... 159568 159569 159570]\n", 193 | "[ 1 16 39 ... 159546 159550 159564]\n", 194 | "\n", 195 | " Fold 04 class severe_toxic AUC: 0.987581\n", 196 | "[ 0 1 3 ... 159568 159569 159570]\n", 197 | "[ 2 17 19 ... 159517 159538 159566]\n", 198 | "\n", 199 | " Fold 05 class severe_toxic AUC: 0.981197\n", 200 | "[ 0 1 2 ... 159568 159569 159570]\n", 201 | "[ 8 18 20 ... 159544 159556 159562]\n", 202 | "\n", 203 | " Fold 06 class severe_toxic AUC: 0.990249\n", 204 | "[ 0 1 2 ... 159567 159568 159569]\n", 205 | "[ 5 22 40 ... 159557 159558 159570]\n", 206 | "\n", 207 | " Fold 07 class severe_toxic AUC: 0.988764\n", 208 | "[ 1 2 3 ... 159566 159569 159570]\n", 209 | "[ 0 21 27 ... 159528 159567 159568]\n", 210 | "\n", 211 | " Fold 08 class severe_toxic AUC: 0.978003\n", 212 | "[ 0 1 2 ... 159567 159568 159570]\n", 213 | "[ 3 11 26 ... 159554 159561 159569]\n", 214 | "\n", 215 | " Fold 09 class severe_toxic AUC: 0.989705\n", 216 | "[ 0 1 2 ... 159568 159569 159570]\n", 217 | "[ 14 23 29 ... 159541 159542 159565]\n", 218 | "\n", 219 | " Fold 10 class severe_toxic AUC: 0.991349\n", 220 | "\n", 221 | " Average class severe_toxic AUC:\t0.987354\n", 222 | " Out-of-fold class severe_toxic AUC:\t0.987371\n", 223 | "class_name is: obscene\n", 224 | "[ 0 1 2 ... 159568 159569 159570]\n", 225 | "[ 4 7 10 ... 159547 159548 159557]\n", 226 | "\n", 227 | " Fold 01 class obscene AUC: 0.992491\n", 228 | "[ 0 1 2 ... 159568 159569 159570]\n", 229 | "[ 24 30 33 ... 159559 159560 159563]\n", 230 | "\n", 231 | " Fold 02 class obscene AUC: 0.992949\n", 232 | "[ 0 1 2 ... 159568 159569 159570]\n", 233 | "[ 9 34 39 ... 159530 159531 159543]\n", 234 | "\n", 235 | " Fold 03 class obscene AUC: 0.992516\n", 236 | "[ 0 2 3 ... 159568 159569 159570]\n", 237 | "[ 1 16 28 ... 159538 159555 159564]\n", 238 | "\n", 239 | " Fold 04 class obscene AUC: 0.992658\n", 240 | "[ 0 1 3 ... 159568 159569 159570]\n", 241 | "[ 2 8 15 ... 159561 159562 159566]\n", 242 | "\n", 243 | " Fold 05 class obscene AUC: 0.993123\n", 244 | "[ 0 1 2 ... 159568 159569 159570]\n", 245 | "[ 6 18 20 ... 159515 159535 159541]\n", 246 | "\n", 247 | " Fold 06 class obscene AUC: 0.994014\n", 248 | "[ 1 2 3 ... 159566 159568 159569]\n", 249 | "[ 0 5 22 ... 
159558 159567 159570]\n", 250 | "\n", 251 | " Fold 07 class obscene AUC: 0.994354\n", 252 | "[ 0 1 2 ... 159566 159567 159570]\n", 253 | "[ 3 11 21 ... 159552 159568 159569]\n", 254 | "\n", 255 | " Fold 08 class obscene AUC: 0.991395\n", 256 | "[ 0 1 2 ... 159568 159569 159570]\n", 257 | "[ 12 13 37 ... 159520 159536 159553]\n", 258 | "\n", 259 | " Fold 09 class obscene AUC: 0.991344\n", 260 | "[ 0 1 2 ... 159568 159569 159570]\n", 261 | "[ 14 23 29 ... 159542 159554 159565]\n", 262 | "\n", 263 | " Fold 10 class obscene AUC: 0.992631\n", 264 | "\n", 265 | " Average class obscene AUC:\t0.992748\n", 266 | " Out-of-fold class obscene AUC:\t0.992748\n", 267 | "class_name is: threat\n", 268 | "[ 0 1 2 ... 159568 159569 159570]\n", 269 | "[ 4 6 31 ... 159525 159547 159549]\n", 270 | "\n", 271 | " Fold 01 class threat AUC: 0.975375\n", 272 | "[ 0 1 2 ... 159568 159569 159570]\n", 273 | "[ 9 14 23 ... 159527 159532 159560]\n", 274 | "\n", 275 | " Fold 02 class threat AUC: 0.978111\n", 276 | "[ 0 1 2 ... 159568 159569 159570]\n", 277 | "[ 8 11 12 ... 159533 159555 159556]\n", 278 | "\n", 279 | " Fold 03 class threat AUC: 0.975833\n", 280 | "[ 0 2 3 ... 159568 159569 159570]\n", 281 | "[ 1 15 38 ... 159559 159564 159567]\n", 282 | "\n", 283 | " Fold 04 class threat AUC: 0.993123\n", 284 | "[ 0 1 3 ... 159568 159569 159570]\n", 285 | "[ 2 16 18 ... 159517 159538 159566]\n", 286 | "\n", 287 | " Fold 05 class threat AUC: 0.992293\n", 288 | "[ 0 1 2 ... 159568 159569 159570]\n", 289 | "[ 7 17 19 ... 159561 159562 159563]\n", 290 | "\n", 291 | " Fold 06 class threat AUC: 0.990553\n", 292 | "[ 0 1 2 ... 159567 159568 159569]\n", 293 | "[ 5 21 39 ... 159557 159558 159570]\n", 294 | "\n", 295 | " Fold 07 class threat AUC: 0.985213\n", 296 | "[ 1 2 3 ... 159567 159569 159570]\n", 297 | "[ 0 20 26 ... 159528 159551 159568]\n", 298 | "\n", 299 | " Fold 08 class threat AUC: 0.992530\n", 300 | "[ 0 1 2 ... 159567 159568 159570]\n", 301 | "[ 3 10 25 ... 159553 159554 159569]\n", 302 | "\n", 303 | " Fold 09 class threat AUC: 0.993273\n", 304 | "[ 0 1 2 ... 159568 159569 159570]\n", 305 | "[ 13 22 28 ... 159541 159542 159565]\n", 306 | "\n", 307 | " Fold 10 class threat AUC: 0.965121\n", 308 | "\n", 309 | " Average class threat AUC:\t0.984142\n", 310 | " Out-of-fold class threat AUC:\t0.984083\n", 311 | "class_name is: insult\n", 312 | "[ 0 1 2 ... 159568 159569 159570]\n", 313 | "[ 4 7 10 ... 159517 159537 159567]\n", 314 | "\n", 315 | " Fold 01 class insult AUC: 0.983587\n", 316 | "[ 0 1 2 ... 159568 159569 159570]\n", 317 | "[ 24 30 33 ... 159548 159554 159557]\n", 318 | "\n", 319 | " Fold 02 class insult AUC: 0.987092\n", 320 | "[ 0 1 2 ... 159568 159569 159570]\n", 321 | "[ 9 34 39 ... 159529 159542 159556]\n", 322 | "\n", 323 | " Fold 03 class insult AUC: 0.982300\n", 324 | "[ 0 2 3 ... 159568 159569 159570]\n", 325 | "[ 1 16 28 ... 159541 159555 159564]\n", 326 | "\n", 327 | " Fold 04 class insult AUC: 0.982773\n", 328 | "[ 0 1 3 ... 159568 159569 159570]\n", 329 | "[ 2 8 15 ... 159559 159562 159566]\n", 330 | "\n", 331 | " Fold 05 class insult AUC: 0.982130\n", 332 | "[ 0 1 2 ... 159568 159569 159570]\n", 333 | "[ 6 18 20 ... 159543 159546 159561]\n", 334 | "\n", 335 | " Fold 06 class insult AUC: 0.985141\n", 336 | "[ 1 2 3 ... 159567 159568 159569]\n", 337 | "[ 0 5 22 ... 159551 159558 159570]\n", 338 | "\n", 339 | " Fold 07 class insult AUC: 0.983921\n", 340 | "[ 0 1 2 ... 159566 159567 159570]\n", 341 | "[ 3 11 21 ... 
159563 159568 159569]\n", 342 | "\n", 343 | " Fold 08 class insult AUC: 0.987698\n", 344 | "[ 0 1 2 ... 159568 159569 159570]\n", 345 | "[ 12 13 37 ... 159519 159535 159553]\n", 346 | "\n", 347 | " Fold 09 class insult AUC: 0.983639\n", 348 | "[ 0 1 2 ... 159568 159569 159570]\n", 349 | "[ 14 23 29 ... 159539 159540 159565]\n", 350 | "\n", 351 | " Fold 10 class insult AUC: 0.987121\n", 352 | "\n", 353 | " Average class insult AUC:\t0.984540\n", 354 | " Out-of-fold class insult AUC:\t0.984541\n", 355 | "class_name is: identity_hate\n", 356 | "[ 0 1 2 ... 159568 159569 159570]\n", 357 | "[ 4 6 31 ... 159555 159560 159561]\n", 358 | "\n", 359 | " Fold 01 class identity_hate AUC: 0.982784\n", 360 | "[ 0 1 2 ... 159568 159569 159570]\n", 361 | "[ 9 14 23 ... 159548 159549 159559]\n", 362 | "\n", 363 | " Fold 02 class identity_hate AUC: 0.977440\n", 364 | "[ 0 1 2 ... 159568 159569 159570]\n", 365 | "[ 8 11 12 ... 159539 159547 159550]\n", 366 | "\n", 367 | " Fold 03 class identity_hate AUC: 0.970181\n", 368 | "[ 0 2 3 ... 159568 159569 159570]\n", 369 | "[ 1 15 38 ... 159534 159543 159564]\n", 370 | "\n", 371 | " Fold 04 class identity_hate AUC: 0.984155\n", 372 | "[ 0 1 3 ... 159568 159569 159570]\n", 373 | "[ 2 16 18 ... 159525 159538 159566]\n", 374 | "\n", 375 | " Fold 05 class identity_hate AUC: 0.980120\n", 376 | "[ 0 1 2 ... 159568 159569 159570]\n", 377 | "[ 7 17 19 ... 159546 159562 159567]\n", 378 | "\n", 379 | " Fold 06 class identity_hate AUC: 0.976925\n", 380 | "[ 0 1 2 ... 159567 159568 159569]\n", 381 | "[ 5 21 39 ... 159557 159558 159570]\n", 382 | "\n", 383 | " Fold 07 class identity_hate AUC: 0.980144\n", 384 | "[ 1 2 3 ... 159567 159569 159570]\n", 385 | "[ 0 20 26 ... 159528 159563 159568]\n", 386 | "\n", 387 | " Fold 08 class identity_hate AUC: 0.978127\n", 388 | "[ 0 1 2 ... 159567 159568 159570]\n", 389 | "[ 3 10 25 ... 159553 159554 159569]\n", 390 | "\n", 391 | " Fold 09 class identity_hate AUC: 0.986655\n", 392 | "[ 0 1 2 ... 159568 159569 159570]\n", 393 | "[ 13 22 28 ... 
159541 159542 159565]\n", 394 | "\n", 395 | " Fold 10 class identity_hate AUC: 0.984381\n", 396 | "\n", 397 | " Average class identity_hate AUC:\t0.980091\n", 398 | " Out-of-fold class identity_hate AUC:\t0.980032\n" 399 | ] 400 | } 401 | ], 402 | "source": [ 403 | "\n", 404 | "for j, (class_name) in enumerate(class_names):\n", 405 | " \n", 406 | " print('class_name is: ' + class_name)\n", 407 | " avreal = target[class_name]\n", 408 | " lr_cv_sum = 0\n", 409 | " lr_test_pred = np.zeros(test.shape[0])\n", 410 | " lr_avpred = np.zeros(train.shape[0])\n", 411 | " \n", 412 | " for i, (train_index, val_index) in enumerate(kfolder.split(train_features, avreal)):\n", 413 | " print(train_index)\n", 414 | " print(val_index)\n", 415 | " X_train, X_val = train_features[train_index], train_features[val_index]\n", 416 | " y_train, y_val = target.loc[train_index], target.loc[val_index]\n", 417 | "\n", 418 | " classifier = Ridge(alpha=20, copy_X=True, fit_intercept=True, solver='auto',max_iter=100,normalize=False, random_state=0, tol=0.0025)\n", 419 | " \n", 420 | " #classifier = Lasso(alpha=0.1,normalize=True, max_iter=1e5)\n", 421 | " # classifier = ElasticNet(alpha=1.0, l1_ratio =0.5)\n", 422 | " classifier.fit(X_train, y_train[class_name])\n", 423 | " scores_val = classifier.predict(X_val)\n", 424 | " lr_avpred[val_index] = scores_val\n", 425 | " lr_test_pred += classifier.predict(test_features)\n", 426 | " scores_classes[j][i] = roc_auc_score(y_val[class_name], scores_val)\n", 427 | " print('\\n Fold %02d class %s AUC: %.6f' % ((i+1), class_name, scores_classes[j][i]))\n", 428 | "\n", 429 | " lr_cv_score = (lr_cv_sum / number_of_folds)\n", 430 | " lr_oof_auc = roc_auc_score(avreal, lr_avpred)\n", 431 | " print('\\n Average class %s AUC:\\t%.6f' % (class_name, np.mean(scores_classes[j])))\n", 432 | " print(' Out-of-fold class %s AUC:\\t%.6f' % (class_name, lr_oof_auc))\n", 433 | "\n", 434 | " submission[class_name] = lr_test_pred / number_of_folds\n", 435 | " submission_oof[class_name] = lr_avpred\n", 436 | "\n", 437 | "#print('\\n Overall AUC:\\t%.6f' % (np.mean(scores_classes)))\n", 438 | "submission.to_csv('10-fold_elast_test.csv', index=False)\n", 439 | "submission_oof.to_csv('10-fold_ridge_train.csv', index=False)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [] 448 | } 449 | ], 450 | "metadata": { 451 | "kernelspec": { 452 | "display_name": "Python 2", 453 | "language": "python", 454 | "name": "python2" 455 | }, 456 | "language_info": { 457 | "codemirror_mode": { 458 | "name": "ipython", 459 | "version": 2 460 | }, 461 | "file_extension": ".py", 462 | "mimetype": "text/x-python", 463 | "name": "python", 464 | "nbconvert_exporter": "python", 465 | "pygments_lexer": "ipython2", 466 | "version": "2.7.14" 467 | } 468 | }, 469 | "nbformat": 4, 470 | "nbformat_minor": 2 471 | } 472 | -------------------------------------------------------------------------------- /Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "(4995, 6)\n", 23 | "(100202, 6)\n" 24 | ] 25 | } 26 | ], 27 | "source": 
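Note on the Ridge stack above: `Ridge.predict()` returns unbounded regression scores rather than probabilities. Per-class ROC AUC depends only on the ranking, so the fold scores printed above are unaffected, but the OOF/test CSVs written at the end of RIDGE.ipynb (the test file is named `10-fold_elast_test.csv` even though the classifier is Ridge) may be easier to blend with the other models if the scores are first mapped onto a common (0, 1) scale. A minimal self-contained sketch of a rank-based rescaling; all names here are illustrative and not part of the notebook:

```python
# Sketch: rank-transform unbounded Ridge scores into (0, 1).
# The ordering is preserved, so per-class ROC AUC is unchanged.
import numpy as np
from scipy.stats import rankdata

def to_unit_interval(scores):
    """Map raw regression scores to (0, 1) by average rank."""
    scores = np.asarray(scores, dtype=float)
    return rankdata(scores) / (len(scores) + 1.0)

print(to_unit_interval([-3.2, 0.1, 7.5]))  # [0.25 0.5  0.75]
```

Also worth noting: `lr_cv_sum` in that training loop is initialised to 0 and never accumulated, so the derived `lr_cv_score` is always 0; it is harmless only because the printed averages use `scores_classes` instead.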
[ 28 | "predictions = pd.read_csv('nb36.csv', index_col=0)\n", 29 | "test = pd.read_csv('test_translated_clean.csv', index_col=0)\n", 30 | "\n", 31 | "def find_good_predicts(valids, column):\n", 32 | " b1 = valids[column] > 0.96\n", 33 | " b2 = valids[column] < 0.1\n", 34 | " c = valids[b1|b2]\n", 35 | " print(valids[b1].shape)\n", 36 | " print(valids[b2].shape)\n", 37 | " return c\n", 38 | "\n", 39 | "good_predictions = find_good_predicts(predictions,'toxic')\n", 40 | "g = good_predictions.join(test)\n", 41 | "train = pd.read_csv('train_translated_clean.csv',index_col=0)\n", 42 | "new_train = pd.concat([train,g])\n", 43 | "\n" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/html": [ 54 | "
\n", 55 | "\n", 68 | "\n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " 
\n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | "
comment_textcomment_text_englishididentity_hateinsultlangobscenesevere_toxicthreattoxic
0explanation why the edits made under my userna...Explanation\\nWhy the edits made under my usern...0000997932d777bf0.00.0en0.00.00.00.0
1d aww he matches this background colour i am s...D'aww! He matches this background colour I'm s...000103f0d9cfb60f0.00.0en0.00.00.00.0
2hey man i am really not trying to edit war it ...Hey man, I'm really not trying to edit war. It...000113f07ec002fd0.00.0en0.00.00.00.0
3more i can not make any real suggestions on im...\"\\nMore\\nI can't make any real suggestions on ...0001b41b1c6bb37e0.00.0en0.00.00.00.0
4you sir are my hero any chance you remember wh...You, sir, are my hero. Any chance you remember...0001d958c54c6e350.00.0en0.00.00.00.0
5congratulations from me as well use the tools ...\"\\n\\nCongratulations from me as well, use the ...00025465d4725e870.00.0en0.00.00.00.0
6cocksucker before you piss around on my workCOCKSUCKER BEFORE YOU PISS AROUND ON MY WORK0002bcb3da6cb3370.01.0en1.01.00.01.0
7your vandalism to the matt shirvington article...Your vandalism to the Matt Shirvington article...00031b1e95af79210.00.0en0.00.00.00.0
8sorry if the word nonsense was offensive to yo...Sorry if the word 'nonsense' was offensive to ...00037261f536c51d0.00.0en0.00.00.00.0
9alignment on this subject and which are contra...alignment on this subject and which are contra...00040093b2687caa0.00.0en0.00.00.00.0
10fair use rationale for image wonju jpg thanks ...\"\\nFair use rationale for Image:Wonju.jpg\\n\\nT...0005300084f90edc0.00.0en0.00.00.00.0
11bbq be a man and lets discuss it maybe over th...bbq \\n\\nbe a man and lets discuss it-maybe ove...00054a5e18b50dd40.00.0en0.00.00.00.0
12hey what is it talk what is it an exclusive gr...Hey... what is it..\\n@ | talk .\\nWhat is it......0005c987bdfc9d4b0.00.0en0.00.00.01.0
13before you start throwing accusations and warn...Before you start throwing accusations and warn...0006f16e4e9f292e0.00.0en0.00.00.00.0
14oh and the girl above started her arguments wi...Oh, and the girl above started her arguments w...00070ef96486d6f90.00.0en0.00.00.00.0
15juelz santanas age in two zero zero two juelz ...\"\\n\\nJuelz Santanas Age\\n\\nIn 2002, Juelz Sant...00078f8ce7eb276d0.00.0en0.00.00.00.0
16bye do not look come or think of comming back ...Bye! \\n\\nDon't look, come or think of comming ...0007e25b2121310b0.00.0en0.00.00.01.0
17redirect talk voydan pop georgiev chernodrinskiREDIRECT Talk:Voydan Pop Georgiev- Chernodrinski000897889268bc930.00.0af0.00.00.00.0
18the mitsurugi point made no sense why not argu...The Mitsurugi point made no sense - why not ar...0009801bd85e58060.00.0en0.00.00.00.0
19do not mean to bother you i see that you re wr...Don't mean to bother you \\n\\nI see that you're...0009eaea3325de8c0.00.0en0.00.00.00.0
\n", 347 | "
" 348 | ], 349 | "text/plain": [ 350 | " comment_text \\\n", 351 | "0 explanation why the edits made under my userna... \n", 352 | "1 d aww he matches this background colour i am s... \n", 353 | "2 hey man i am really not trying to edit war it ... \n", 354 | "3 more i can not make any real suggestions on im... \n", 355 | "4 you sir are my hero any chance you remember wh... \n", 356 | "5 congratulations from me as well use the tools ... \n", 357 | "6 cocksucker before you piss around on my work \n", 358 | "7 your vandalism to the matt shirvington article... \n", 359 | "8 sorry if the word nonsense was offensive to yo... \n", 360 | "9 alignment on this subject and which are contra... \n", 361 | "10 fair use rationale for image wonju jpg thanks ... \n", 362 | "11 bbq be a man and lets discuss it maybe over th... \n", 363 | "12 hey what is it talk what is it an exclusive gr... \n", 364 | "13 before you start throwing accusations and warn... \n", 365 | "14 oh and the girl above started her arguments wi... \n", 366 | "15 juelz santanas age in two zero zero two juelz ... \n", 367 | "16 bye do not look come or think of comming back ... \n", 368 | "17 redirect talk voydan pop georgiev chernodrinski \n", 369 | "18 the mitsurugi point made no sense why not argu... \n", 370 | "19 do not mean to bother you i see that you re wr... \n", 371 | "\n", 372 | " comment_text_english id \\\n", 373 | "0 Explanation\\nWhy the edits made under my usern... 0000997932d777bf \n", 374 | "1 D'aww! He matches this background colour I'm s... 000103f0d9cfb60f \n", 375 | "2 Hey man, I'm really not trying to edit war. It... 000113f07ec002fd \n", 376 | "3 \"\\nMore\\nI can't make any real suggestions on ... 0001b41b1c6bb37e \n", 377 | "4 You, sir, are my hero. Any chance you remember... 0001d958c54c6e35 \n", 378 | "5 \"\\n\\nCongratulations from me as well, use the ... 00025465d4725e87 \n", 379 | "6 COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK 0002bcb3da6cb337 \n", 380 | "7 Your vandalism to the Matt Shirvington article... 00031b1e95af7921 \n", 381 | "8 Sorry if the word 'nonsense' was offensive to ... 00037261f536c51d \n", 382 | "9 alignment on this subject and which are contra... 00040093b2687caa \n", 383 | "10 \"\\nFair use rationale for Image:Wonju.jpg\\n\\nT... 0005300084f90edc \n", 384 | "11 bbq \\n\\nbe a man and lets discuss it-maybe ove... 00054a5e18b50dd4 \n", 385 | "12 Hey... what is it..\\n@ | talk .\\nWhat is it...... 0005c987bdfc9d4b \n", 386 | "13 Before you start throwing accusations and warn... 0006f16e4e9f292e \n", 387 | "14 Oh, and the girl above started her arguments w... 00070ef96486d6f9 \n", 388 | "15 \"\\n\\nJuelz Santanas Age\\n\\nIn 2002, Juelz Sant... 00078f8ce7eb276d \n", 389 | "16 Bye! \\n\\nDon't look, come or think of comming ... 0007e25b2121310b \n", 390 | "17 REDIRECT Talk:Voydan Pop Georgiev- Chernodrinski 000897889268bc93 \n", 391 | "18 The Mitsurugi point made no sense - why not ar... 0009801bd85e5806 \n", 392 | "19 Don't mean to bother you \\n\\nI see that you're... 
0009eaea3325de8c \n", 393 | "\n", 394 | " identity_hate insult lang obscene severe_toxic threat toxic \n", 395 | "0 0.0 0.0 en 0.0 0.0 0.0 0.0 \n", 396 | "1 0.0 0.0 en 0.0 0.0 0.0 0.0 \n", 397 | "2 0.0 0.0 en 0.0 0.0 0.0 0.0 \n", 398 | "3 0.0 0.0 en 0.0 0.0 0.0 0.0 \n", 399 | "4 0.0 0.0 en 0.0 0.0 0.0 0.0 \n", 400 | "5 0.0 0.0 en 0.0 0.0 0.0 0.0 \n", 401 | "6 0.0 1.0 en 1.0 1.0 0.0 1.0 \n", 402 | "7 0.0 0.0 en 0.0 0.0 0.0 0.0 \n", 403 | "8 0.0 0.0 en 0.0 0.0 0.0 0.0 \n", 404 | "9 0.0 0.0 en 0.0 0.0 0.0 0.0 \n", 405 | "10 0.0 0.0 en 0.0 0.0 0.0 0.0 \n", 406 | "11 0.0 0.0 en 0.0 0.0 0.0 0.0 \n", 407 | "12 0.0 0.0 en 0.0 0.0 0.0 1.0 \n", 408 | "13 0.0 0.0 en 0.0 0.0 0.0 0.0 \n", 409 | "14 0.0 0.0 en 0.0 0.0 0.0 0.0 \n", 410 | "15 0.0 0.0 en 0.0 0.0 0.0 0.0 \n", 411 | "16 0.0 0.0 en 0.0 0.0 0.0 1.0 \n", 412 | "17 0.0 0.0 af 0.0 0.0 0.0 0.0 \n", 413 | "18 0.0 0.0 en 0.0 0.0 0.0 0.0 \n", 414 | "19 0.0 0.0 en 0.0 0.0 0.0 0.0 " 415 | ] 416 | }, 417 | "execution_count": 4, 418 | "metadata": {}, 419 | "output_type": "execute_result" 420 | } 421 | ], 422 | "source": [ 423 | "new_train.head(n=20)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 9, 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "fn = 'train_' + str(new_train.shape[0]) + '.csv'\n", 433 | "new_train.to_csv(fn)" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 18, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "train = 'clean_train_ori_third.csv'\n", 443 | "train = pd.read_csv(train)\n", 444 | "labels = ['toxic']\n", 445 | "train_target = train[labels].values\n", 446 | "kf_label = np.ones(train_target.shape)\n", 447 | "for i in range(train_target.shape[1]):\n", 448 | " kf_label[:,i] = 2**i\n", 449 | "kf_label = np.sum(kf_label, axis=1)" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 19, 455 | "metadata": {}, 456 | "outputs": [ 457 | { 458 | "name": "stdout", 459 | "output_type": "stream", 460 | "text": [ 461 | "[ 1. 1. 1. ..., 1. 1. 1.]\n" 462 | ] 463 | } 464 | ], 465 | "source": [ 466 | "print(kf_label)" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 17, 472 | "metadata": {}, 473 | "outputs": [ 474 | { 475 | "data": { 476 | "text/plain": [ 477 | "6" 478 | ] 479 | }, 480 | "execution_count": 17, 481 | "metadata": {}, 482 | "output_type": "execute_result" 483 | } 484 | ], 485 | "source": [ 486 | "train_target.shape[1]" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": null, 492 | "metadata": {}, 493 | "outputs": [], 494 | "source": [] 495 | } 496 | ], 497 | "metadata": { 498 | "kernelspec": { 499 | "display_name": "Python 2", 500 | "language": "python", 501 | "name": "python2" 502 | }, 503 | "language_info": { 504 | "codemirror_mode": { 505 | "name": "ipython", 506 | "version": 2 507 | }, 508 | "file_extension": ".py", 509 | "mimetype": "text/x-python", 510 | "name": "python", 511 | "nbconvert_exporter": "python", 512 | "pygments_lexer": "ipython2", 513 | "version": "2.7.14" 514 | } 515 | }, 516 | "nbformat": 4, 517 | "nbformat_minor": 2 518 | } 519 | -------------------------------------------------------------------------------- /add_covaai.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": { 5 | "_uuid": "41cd31e255867428bddad43ead3e766b2837948b", 6 | "_cell_guid": "93c24171-18f6-42d4-af00-5c856774b347", 7 | "collapsed": true 8 | }, 9 | "cell_type": "markdown", 10 | "source": "

Note: in the Discussion section they said that data from figshare has some overlap with the current test set (https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/46177). So it's possible that using features/scores based on this data may overfit to the current test set. Once they change the test set, the LB scores may change.\nSo at this point, I think it's hard to tell whether using features based on these datasets will ultimately help your LB score. It may still help, but we won't know for sure until the new test set is released.

" 11 | }, 12 | { 13 | "metadata": { 14 | "_uuid": "7e224f68ef5f256e4be2566e9a8311d3b47470d1", 15 | "collapsed": true, 16 | "_cell_guid": "1d2bb910-0444-487d-89b5-6fa47b8a6b27" 17 | }, 18 | "cell_type": "markdown", 19 | "source": "**The idea for this kernel is to use the public datasets at https://conversationai.github.io/ to train models and use those models to score the train and test sets for this challenge. You can then use the scores as features when training the real models. So the output of this kernel isn't meant to be submitted as is. The output is the original train/test datasets, with additional columns/features.**\n\nUsing these enhanced train/test sets improved my logistic-regression based models from 0.047 to 0.044 log-loss. I haven't done much if any tuning for these models below, so you should be able to tweak things and get even better results.\n\nI understand that there are PerspectiveAPI models that may be similar. But rather than wait for an API key, and so I could play around with the models more myself, I trained the models in this kernel." 20 | }, 21 | { 22 | "metadata": { 23 | "_uuid": "96c136c2572dbcea17f38c260d0ea5e3c563706d", 24 | "collapsed": true, 25 | "_cell_guid": "9471f39f-ba45-4db8-b610-9bfe18770bc3", 26 | "trusted": false 27 | }, 28 | "cell_type": "code", 29 | "source": "import numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)", 30 | "execution_count": null, 31 | "outputs": [] 32 | }, 33 | { 34 | "metadata": { 35 | "_uuid": "f7a153a8288d190481c26a7bfcf824036e517756", 36 | "collapsed": true, 37 | "_cell_guid": "cc063548-d203-491d-95d3-cfccd58cedf8", 38 | "trusted": false 39 | }, 40 | "cell_type": "code", 41 | "source": "toxic_cmt = pd.read_table('../input/conversationaidataset/toxicity_annotated_comments.tsv')\ntoxic_annot = pd.read_table('../input/conversationaidataset/toxicity_annotations.tsv')\naggr_cmt = pd.read_table('../input/conversationaidataset/aggression_annotated_comments.tsv')\naggr_annot = pd.read_table('../input/conversationaidataset/aggression_annotations.tsv')\nattack_cmt = pd.read_table('../input/conversationaidataset/attack_annotated_comments.tsv')\nattack_annot = pd.read_table('../input/conversationaidataset/attack_annotations.tsv')", 42 | "execution_count": null, 43 | "outputs": [] 44 | }, 45 | { 46 | "metadata": { 47 | "_uuid": "46ff20d6c42c21b1eb1595e539776bd7823bd59d", 48 | "_cell_guid": "b158b96a-8e5e-438f-be60-a4855eb0842b" 49 | }, 50 | "cell_type": "markdown", 51 | "source": "**Find the mean score for toxicity, aggression, attack, and join with the corresponding comment**\nFor each comment/rev_id, multiple workers have labeld/annotated. So then you have to decide what your overall label is for a given comment/rev_id. I simply took the mean value, and will train a regression model. You could try other aggregations/methods. You could, e.g., instead go with majority vote, and train binary classifiers, etc." 
52 | }, 53 | { 54 | "metadata": { 55 | "_uuid": "931f4c3e6c8a7ed28be8f6ff9694a9375ed00bec", 56 | "collapsed": true, 57 | "_cell_guid": "b635bf75-09e2-491c-8aa4-5512f696bded", 58 | "trusted": false 59 | }, 60 | "cell_type": "code", 61 | "source": "def JoinAndSanitize(cmt, annot):\n df = cmt.set_index('rev_id').join(annot.groupby(['rev_id']).mean())\n df = Sanitize(df)\n return df", 62 | "execution_count": null, 63 | "outputs": [] 64 | }, 65 | { 66 | "metadata": { 67 | "_uuid": "5deb6f4f888ac2393462bd243c945d2fcbf9391d", 68 | "_cell_guid": "223c29f8-229b-42fd-ad5d-61427673dbe7" 69 | }, 70 | "cell_type": "markdown", 71 | "source": "**Basic cleaning/standardizing -- can potentially do more (or less) here**" 72 | }, 73 | { 74 | "metadata": { 75 | "_uuid": "585418f67b3e73c47af85228f7a9e62682cb2816", 76 | "collapsed": true, 77 | "_cell_guid": "d3b0bf5e-37aa-49f5-94cc-bee6d9e06fe1", 78 | "trusted": false 79 | }, 80 | "cell_type": "code", 81 | "source": "def Sanitize(df):\n comment = 'comment' if 'comment' in df else 'comment_text'\n df[comment] = df[comment].str.lower().str.replace('newline_token', ' ')\n df[comment] = df[comment].fillna('erikov')\n return df", 82 | "execution_count": null, 83 | "outputs": [] 84 | }, 85 | { 86 | "metadata": { 87 | "_uuid": "d56061bf13b65b1587e125152d87d71790031fda", 88 | "collapsed": true, 89 | "_cell_guid": "185561a1-5b3d-49ea-b737-1f68504d285f", 90 | "trusted": false 91 | }, 92 | "cell_type": "code", 93 | "source": "toxic = JoinAndSanitize(toxic_cmt, toxic_annot)\nattack = JoinAndSanitize(attack_cmt, attack_annot)\naggression = JoinAndSanitize(aggr_cmt, aggr_annot)", 94 | "execution_count": null, 95 | "outputs": [] 96 | }, 97 | { 98 | "metadata": { 99 | "_uuid": "8b9e981ccef00bdbb71e852eaeb80e3f2b4cd439", 100 | "_cell_guid": "bc38ff04-0274-4bdf-9bff-1e5625ac0983" 101 | }, 102 | "cell_type": "markdown", 103 | "source": "**The attack and aggression labeled datasets are actually the same with only very slightly different annotations/labels**\nSo probably only the scores from one model will be needed, but I left both here for completeness." 
104 | }, 105 | { 106 | "metadata": { 107 | "_uuid": "cb884f34312b44a95038df85adbd08b8df5693ef", 108 | "collapsed": true, 109 | "_cell_guid": "30eed731-3c48-492b-84b0-fe677e24fb37", 110 | "trusted": false 111 | }, 112 | "cell_type": "code", 113 | "source": "len(attack), len(aggression)", 114 | "execution_count": null, 115 | "outputs": [] 116 | }, 117 | { 118 | "metadata": { 119 | "_uuid": "e97f9c8f5d65713f01131ecaa306dea2eee7f1b4", 120 | "collapsed": true, 121 | "_cell_guid": "8a7db6ee-6988-4f13-8b22-011ac4fc2baf", 122 | "trusted": false 123 | }, 124 | "cell_type": "code", 125 | "source": "attack['comment'].equals(aggression['comment'])", 126 | "execution_count": null, 127 | "outputs": [] 128 | }, 129 | { 130 | "metadata": { 131 | "_uuid": "845182cb1a08c406f03ef7373188b14ecf7d8d31", 132 | "_cell_guid": "448b0597-005b-4bce-bacc-c4e8d6ba46ec" 133 | }, 134 | "cell_type": "markdown", 135 | "source": "Check how correlated the mean value for the annotations between the attack and aggression datasets are" 136 | }, 137 | { 138 | "metadata": { 139 | "_uuid": "96134949dd9f12ab7a34955bed2449a2f1a7ded4", 140 | "collapsed": true, 141 | "_cell_guid": "47639bda-b46b-464a-91c3-3e2020c763ca", 142 | "trusted": false 143 | }, 144 | "cell_type": "code", 145 | "source": "attack['attack'].corr(aggression['aggression'])", 146 | "execution_count": null, 147 | "outputs": [] 148 | }, 149 | { 150 | "metadata": { 151 | "_uuid": "dbcefead0cd7d82f2e63cb607d21bbf73f14a14f", 152 | "_cell_guid": "ad881513-707b-4c9f-aa36-cf9b5464bf75" 153 | }, 154 | "cell_type": "markdown", 155 | "source": "**Check dataset**" 156 | }, 157 | { 158 | "metadata": { 159 | "_uuid": "29496c0653e608686a9170cbaf6ba16825091323", 160 | "collapsed": true, 161 | "_cell_guid": "cfb2b58b-369a-4934-ac88-33b7c6fc026e", 162 | "trusted": false 163 | }, 164 | "cell_type": "code", 165 | "source": "toxic.head()\n#attack.head()\n#aggression.head()", 166 | "execution_count": null, 167 | "outputs": [] 168 | }, 169 | { 170 | "metadata": { 171 | "_uuid": "66404a538b7b0f751d3c5ca887038ecaf6116f43", 172 | "collapsed": true, 173 | "_cell_guid": "57fe5c62-bbb8-433e-8b0d-48feb38c1ca5", 174 | "trusted": false 175 | }, 176 | "cell_type": "code", 177 | "source": "from sklearn.feature_extraction.text import TfidfVectorizer\n\ndef Tfidfize(df):\n # can tweak these as desired\n max_vocab = 200000\n split = 0.1\n\n comment = 'comment' if 'comment' in df else 'comment_text'\n \n tfidfer = TfidfVectorizer(ngram_range=(1,2), max_features=max_vocab,\n use_idf=1, stop_words='english',\n smooth_idf=1, sublinear_tf=1 )\n tfidf = tfidfer.fit_transform(df[comment])\n\n return tfidf, tfidfer", 178 | "execution_count": null, 179 | "outputs": [] 180 | }, 181 | { 182 | "metadata": { 183 | "_uuid": "b267bdc10ea39fbffada53ddc707562b3e8a0ad6", 184 | "_cell_guid": "3496e5fe-0fb9-4cce-887c-46679ba71dcc" 185 | }, 186 | "cell_type": "markdown", 187 | "source": "Get the tfidf values for the training sets, as well as the fit tfidf vectorizer to be used later to transform the train/test sets for the real challenge datasets." 
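The important detail here is that each vectorizer is fit once on the auxiliary corpus and afterwards only `transform` is called on the challenge train/test comments, so every dataset lands in the same vocabulary and column order. A tiny self-contained sketch of that pattern (the toy strings are only illustrative):

```python
# Sketch: learn the vocabulary once, reuse it everywhere else.
from sklearn.feature_extraction.text import TfidfVectorizer

aux_comments = ["you are great", "you are an idiot"]            # stands in for the ConvAI comments
challenge_comments = ["what an idiot", "thanks for the help"]   # stands in for comment_text

tfidfer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)
X_aux = tfidfer.fit_transform(aux_comments)          # vocabulary is learned here only
X_challenge = tfidfer.transform(challenge_comments)  # projected into the same feature space
print(X_aux.shape[1] == X_challenge.shape[1])        # True
```

One thing to watch in the full-data training cell further down: `model_toxic = ridge.fit(...)`, `model_attack = ridge.fit(...)` and `model_aggression = ridge.fit(...)` all reuse the single `ridge` object. Because scikit-learn's `fit` refits the estimator in place and returns `self`, the three names end up pointing at the same model, holding only the last (aggression) fit. If that is not intended, one independent estimator per target avoids it; a minimal sketch with toy data:

```python
# Sketch: one independent Ridge per target instead of refitting a shared instance.
import numpy as np
from sklearn.linear_model import Ridge

X = np.array([[0.0, 1.0], [1.0, 0.0], [0.5, 0.5]])   # toy features
model_toxic = Ridge().fit(X, [0.0, 1.0, 0.2])        # toy targets
model_attack = Ridge().fit(X, [0.1, 0.9, 0.3])
print(model_toxic is model_attack)                   # False: genuinely separate estimators
```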
188 | }, 189 | { 190 | "metadata": { 191 | "_uuid": "bfeab05149e6f8e61ed12dd56284cb919dc94cd4", 192 | "collapsed": true, 193 | "_cell_guid": "2f334a1c-ce4b-4a9a-bd91-e70d9d617553", 194 | "trusted": false 195 | }, 196 | "cell_type": "code", 197 | "source": "X_toxic, tfidfer_toxic = Tfidfize(toxic)\ny_toxic = toxic['toxicity'].values\nX_attack, tfidfer_attack = Tfidfize(attack)\ny_attack = attack['attack'].values\nX_aggression, tfidfer_aggression = Tfidfize(aggression)\ny_aggression = aggression['aggression'].values", 198 | "execution_count": null, 199 | "outputs": [] 200 | }, 201 | { 202 | "metadata": { 203 | "_uuid": "59cd67be9dbcc8939d59a60a21c14899cdcf219d", 204 | "_cell_guid": "c170e36d-f34f-40c3-bec2-b87f500394ae" 205 | }, 206 | "cell_type": "markdown", 207 | "source": "**Model Training Strategy**\n\nRather than converting the 'toxicity', 'attack', 'aggression' into a binary label (e.g., >= 0.5), let's train a regression model to use as much information as possible. The output score from these models could be used as features in training the further refined models in the current challenge ('severe_toxic', 'obscene', etc.).\n\nThe toxicity/attack/aggression may not have a 1-1 mapping with the desired targets for the challenge, but they may be features that can help." 208 | }, 209 | { 210 | "metadata": { 211 | "_uuid": "4afbde5720479525399c5a3fa3f7539506dec2ec", 212 | "collapsed": true, 213 | "_cell_guid": "e219786f-3591-4572-946c-70834f410c2a", 214 | "trusted": false 215 | }, 216 | "cell_type": "code", 217 | "source": "from sklearn.linear_model import Ridge\nfrom sklearn.model_selection import cross_val_score\n\nridge = Ridge()\nmse_toxic = -cross_val_score(ridge, X_toxic, y_toxic, scoring='neg_mean_squared_error')\nmse_attack = -cross_val_score(ridge, X_attack, y_attack, scoring='neg_mean_squared_error')\nmse_aggression = -cross_val_score(ridge, X_aggression, y_aggression, scoring='neg_mean_squared_error')", 218 | "execution_count": null, 219 | "outputs": [] 220 | }, 221 | { 222 | "metadata": { 223 | "_uuid": "a7028c84de71f59f163c552776cd3ab7cc6fc7b2", 224 | "collapsed": true, 225 | "_cell_guid": "a76fdcba-ebe8-4413-8e52-926d0197555b", 226 | "trusted": false 227 | }, 228 | "cell_type": "code", 229 | "source": "mse_toxic.mean(), mse_attack.mean(), mse_aggression.mean()", 230 | "execution_count": null, 231 | "outputs": [] 232 | }, 233 | { 234 | "metadata": { 235 | "_uuid": "6a93d0923ad96dcd55144c3d9e23fc006f555d29", 236 | "_cell_guid": "109ca84b-7fab-4c92-91c2-4e32e891170d" 237 | }, 238 | "cell_type": "markdown", 239 | "source": "**If the cross-validation scores look okay, train on the full dataset**" 240 | }, 241 | { 242 | "metadata": { 243 | "_uuid": "d4d8274bd65f2db4f7c44d585f9e0aeda2ac2985", 244 | "collapsed": true, 245 | "_cell_guid": "9e412176-f80b-4a8b-9dd6-da41673e7b38", 246 | "trusted": false 247 | }, 248 | "cell_type": "code", 249 | "source": "model_toxic = ridge.fit(X_toxic, y_toxic)\nmodel_attack = ridge.fit(X_attack, y_attack)\nmodel_aggression = ridge.fit(X_aggression, y_aggression)", 250 | "execution_count": null, 251 | "outputs": [] 252 | }, 253 | { 254 | "metadata": { 255 | "_uuid": "9c98b0e8f7896edebed257d5d16ec1efc07edf6a", 256 | "_cell_guid": "dfae494c-51ca-4360-8150-baa193123442" 257 | }, 258 | "cell_type": "markdown", 259 | "source": "**Now score the original train and test sets, and save out as an additional feature for those datasets. 
(These can then be used when training/scoring with our real model**" 260 | }, 261 | { 262 | "metadata": { 263 | "_uuid": "71279eba76cc6dd72dbd0a8bac14ace806d7d938", 264 | "collapsed": true, 265 | "_cell_guid": "d626a4d3-59e9-4049-aefc-81b61f812d4d", 266 | "trusted": false 267 | }, 268 | "cell_type": "code", 269 | "source": "train_orig = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv')\ntest_orig = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv')", 270 | "execution_count": null, 271 | "outputs": [] 272 | }, 273 | { 274 | "metadata": { 275 | "_uuid": "4af40d3a6f3e17e706b5ce81dd07dbd03e9619ce", 276 | "collapsed": true, 277 | "_cell_guid": "baac45ac-88aa-4252-81ca-3f3a27a9ccbc", 278 | "trusted": false 279 | }, 280 | "cell_type": "code", 281 | "source": "train_orig = Sanitize(train_orig)\ntest_orig = Sanitize(test_orig)", 282 | "execution_count": null, 283 | "outputs": [] 284 | }, 285 | { 286 | "metadata": { 287 | "_uuid": "f2d7f004cd7bd0a3c8461c8106e34c07b2d5348d", 288 | "collapsed": true, 289 | "_cell_guid": "c97079c3-0df4-4bfe-8308-81cbdcda48d3", 290 | "trusted": false 291 | }, 292 | "cell_type": "code", 293 | "source": "def TfidfAndPredict(tfidfer, model):\n tfidf_train = tfidfer.transform(train_orig['comment_text'])\n tfidf_test = tfidfer.transform(test_orig['comment_text'])\n train_scores = model.predict(tfidf_train)\n test_scores = model.predict(tfidf_test)\n \n return train_scores, test_scores", 294 | "execution_count": null, 295 | "outputs": [] 296 | }, 297 | { 298 | "metadata": { 299 | "_uuid": "5dd3cc4349133aa1c3bf648ed2f41c5c14261b6f", 300 | "collapsed": true, 301 | "_cell_guid": "aa06f163-9e0d-48fe-98fe-eab940d1f13b", 302 | "trusted": false 303 | }, 304 | "cell_type": "code", 305 | "source": "toxic_tr_scores, toxic_t_scores = TfidfAndPredict(tfidfer_toxic, model_toxic)", 306 | "execution_count": null, 307 | "outputs": [] 308 | }, 309 | { 310 | "metadata": { 311 | "_uuid": "95859856265bc344769fe837c435b0af8f222326", 312 | "collapsed": true, 313 | "_cell_guid": "9d6e0ce2-6e55-42fb-ab12-81437b0923b6", 314 | "trusted": false 315 | }, 316 | "cell_type": "code", 317 | "source": "toxic_tr_scores.shape, toxic_t_scores.shape", 318 | "execution_count": null, 319 | "outputs": [] 320 | }, 321 | { 322 | "metadata": { 323 | "_uuid": "81ce8b860fe6cc402ccb2eb2f8a5b4e8391cdeba", 324 | "collapsed": true, 325 | "_cell_guid": "d4531c50-96a0-4180-81f1-78f6ae81ee20", 326 | "trusted": false 327 | }, 328 | "cell_type": "code", 329 | "source": "attack_tr_scores, attack_t_scores = TfidfAndPredict(tfidfer_attack, model_attack)", 330 | "execution_count": null, 331 | "outputs": [] 332 | }, 333 | { 334 | "metadata": { 335 | "_uuid": "b25fd9d4bf26223a25e1cd4d33fa85e2859f44d4", 336 | "collapsed": true, 337 | "_cell_guid": "fb049953-c535-42fd-849d-42faeaaa0072", 338 | "trusted": false 339 | }, 340 | "cell_type": "code", 341 | "source": "attack_tr_scores.shape, attack_t_scores.shape", 342 | "execution_count": null, 343 | "outputs": [] 344 | }, 345 | { 346 | "metadata": { 347 | "_uuid": "deb4311c8f47d2414ef24a47570611aad9da8e01", 348 | "collapsed": true, 349 | "_cell_guid": "8ca296a0-21c9-4863-be29-d8eff0474e8b", 350 | "trusted": false 351 | }, 352 | "cell_type": "code", 353 | "source": "aggression_tr_scores, aggression_t_scores = TfidfAndPredict(tfidfer_aggression, model_aggression)", 354 | "execution_count": null, 355 | "outputs": [] 356 | }, 357 | { 358 | "metadata": { 359 | "_uuid": "eeb5ad47a06592db00d32b3b93ada9442053bcac", 360 | "collapsed": true, 
361 | "_cell_guid": "fb80020b-ea0f-4129-afb5-a106aeb45e85", 362 | "trusted": false 363 | }, 364 | "cell_type": "code", 365 | "source": "aggression_tr_scores.shape, aggression_t_scores.shape", 366 | "execution_count": null, 367 | "outputs": [] 368 | }, 369 | { 370 | "metadata": { 371 | "_uuid": "790503a6bc2b0a9cf6def1bd9ac5083a6e55c710", 372 | "_cell_guid": "8ea53ca4-fdb8-4f61-b268-c6aea94d24b4" 373 | }, 374 | "cell_type": "markdown", 375 | "source": "**Ok, now write out these scores alongside the original train and test datasets**" 376 | }, 377 | { 378 | "metadata": { 379 | "_uuid": "e7cf48859d9338ffb3abbd1c8a55a126df4e3307", 380 | "collapsed": true, 381 | "_cell_guid": "5d4bb41e-daff-43c5-b4c8-0e7e5e9c70b3", 382 | "trusted": false 383 | }, 384 | "cell_type": "code", 385 | "source": "# toxic_level, to not be confused with original label 'toxic'\ntrain_orig['toxic_level'] = toxic_tr_scores\ntrain_orig['attack'] = attack_tr_scores\ntrain_orig['aggression'] = aggression_tr_scores\ntest_orig['toxic_level'] = toxic_t_scores\ntest_orig['attack'] = attack_t_scores\ntest_orig['aggression'] = aggression_t_scores\n", 386 | "execution_count": null, 387 | "outputs": [] 388 | }, 389 | { 390 | "metadata": { 391 | "_uuid": "c041fffab51281b288a221245f07bd5437aea35d", 392 | "collapsed": true, 393 | "_cell_guid": "88afa0c6-23ca-4393-88c5-b143acfa32a3", 394 | "trusted": false 395 | }, 396 | "cell_type": "code", 397 | "source": "train_orig.to_csv('train_with_convai.csv', index=False)\ntest_orig.to_csv('test_with_convai.csv', index=False)", 398 | "execution_count": null, 399 | "outputs": [] 400 | } 401 | ], 402 | "metadata": { 403 | "language_info": { 404 | "pygments_lexer": "ipython3", 405 | "file_extension": ".py", 406 | "codemirror_mode": { 407 | "version": 3, 408 | "name": "ipython" 409 | }, 410 | "mimetype": "text/x-python", 411 | "version": "3.6.3", 412 | "nbconvert_exporter": "python", 413 | "name": "python" 414 | }, 415 | "kernelspec": { 416 | "display_name": "Python 3", 417 | "language": "python", 418 | "name": "python3" 419 | } 420 | }, 421 | "nbformat": 4, 422 | "nbformat_minor": 1 423 | } -------------------------------------------------------------------------------- /badwords.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/html": [ 11 | "
[HTML preview of bad.head(n=10) lost its markup in this export; the same table follows as text/plain below]
" 120 | ], 121 | "text/plain": [ 122 | " 0 1 2 3 4\n", 123 | "0 NaN 0 4 0 anus\n", 124 | "1 NaN 0 4 0 arse\n", 125 | "2 NaN 0 4 0 arsehole\n", 126 | "3 NaN 0 4 0 asshole\n", 127 | "4 NaN 0 4 0 axe-wound\n", 128 | "5 NaN 0 4 0 axewound\n", 129 | "6 NaN 0 4 0 bastard\n", 130 | "7 NaN 0 4 0 basterd\n", 131 | "8 NaN 0 4 0 bastird\n", 132 | "9 NaN 0 4 0 blow job" 133 | ] 134 | }, 135 | "execution_count": 6, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "import pandas as pd\n", 142 | "bad = pd.read_csv('badwords/bad_words.txt', header=None)\n", 143 | "bad.head(n=10)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 8, 149 | "metadata": { 150 | "collapsed": true 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "strip = bad[4]\n", 155 | "strip.to_csv('badwords.csv', index=False)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 23, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "s = pd.read_csv('badwords/badwords.csv', header=None)\n", 165 | "ss = pd.read_csv('badwords/en.txt', header=None)\n", 166 | "sss = pd.read_csv('badwords/full-list-of-bad-words-banned-by-google-txt-file_2013_11_26_04_53_31_867.txt', header=None)\n", 167 | "z = pd.read_csv('badwords/negative-words.txt', header=None)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 17, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/html": [ 178 | "
[HTML preview of s.head(n=10) lost its markup in this export; the same table follows as text/plain below]
" 243 | ], 244 | "text/plain": [ 245 | " 0\n", 246 | "0 anus\n", 247 | "1 arse\n", 248 | "2 arsehole\n", 249 | "3 asshole\n", 250 | "4 axe-wound\n", 251 | "5 axewound\n", 252 | "6 bastard\n", 253 | "7 basterd\n", 254 | "8 bastird\n", 255 | "9 blow job" 256 | ] 257 | }, 258 | "execution_count": 17, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "s.head(n=10)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 18, 270 | "metadata": { 271 | "collapsed": true 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "new = s.append(ss)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 19, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/html": [ 286 | "
[HTML preview of new.head(n=2) lost its markup in this export; the same table follows as text/plain below]
" 319 | ], 320 | "text/plain": [ 321 | " 0\n", 322 | "0 anus\n", 323 | "1 arse" 324 | ] 325 | }, 326 | "execution_count": 19, 327 | "metadata": {}, 328 | "output_type": "execute_result" 329 | } 330 | ], 331 | "source": [ 332 | "new.head(n=2)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 21, 338 | "metadata": { 339 | "collapsed": true 340 | }, 341 | "outputs": [], 342 | "source": [ 343 | "new = new.append(sss)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 22, 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/plain": [ 354 | "(1218, 1)" 355 | ] 356 | }, 357 | "execution_count": 22, 358 | "metadata": {}, 359 | "output_type": "execute_result" 360 | } 361 | ], 362 | "source": [ 363 | "new.shape" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 24, 369 | "metadata": { 370 | "collapsed": true 371 | }, 372 | "outputs": [], 373 | "source": [ 374 | "new = new.append(z)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 25, 380 | "metadata": {}, 381 | "outputs": [ 382 | { 383 | "data": { 384 | "text/plain": [ 385 | "(6001, 1)" 386 | ] 387 | }, 388 | "execution_count": 25, 389 | "metadata": {}, 390 | "output_type": "execute_result" 391 | } 392 | ], 393 | "source": [ 394 | "new.shape" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": { 401 | "collapsed": true 402 | }, 403 | "outputs": [], 404 | "source": [] 405 | } 406 | ], 407 | "metadata": { 408 | "kernelspec": { 409 | "display_name": "Python 2", 410 | "language": "python", 411 | "name": "python2" 412 | }, 413 | "language_info": { 414 | "codemirror_mode": { 415 | "name": "ipython", 416 | "version": 2 417 | }, 418 | "file_extension": ".py", 419 | "mimetype": "text/x-python", 420 | "name": "python", 421 | "nbconvert_exporter": "python", 422 | "pygments_lexer": "ipython2", 423 | "version": "2.7.14" 424 | } 425 | }, 426 | "nbformat": 4, 427 | "nbformat_minor": 2 428 | } 429 | -------------------------------------------------------------------------------- /convai_feature.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": { 5 | "_cell_guid": "59afa460-67bb-4083-8f64-4e8bdc7d688e", 6 | "_uuid": "e1f3247609398721b4c6b37da205be55e341b899" 7 | }, 8 | "cell_type": "markdown", 9 | "source": "This is a basic LogisticRegression model trained using the data from https://www.kaggle.com/eoveson/convai-datasets-baseline-models\n\nThe baseline model in that kernal is tuned a little to get the data for this kernal This kernal scored 0.044 in the LB" 10 | }, 11 | { 12 | "metadata": { 13 | "_cell_guid": "eb9acbb1-40db-4a60-9c00-7e1134408cb1", 14 | "_uuid": "7e97dad72af19207237cb816bc898ca5818f4389", 15 | "collapsed": true, 16 | "trusted": false 17 | }, 18 | "cell_type": "code", 19 | "source": "# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load in \n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\n\n# Input data files are available in the \"../input/\" directory.\n# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom scipy import sparse\n# set stopwords\n\nfrom subprocess import check_output\nprint(check_output([\"ls\", \"../input\"]).decode(\"utf8\"))\n\n# Any results you write to the current directory are saved as output.", 20 | "execution_count": null, 21 | "outputs": [] 22 | }, 23 | { 24 | "metadata": { 25 | "_cell_guid": "bb967e03-d30b-46ec-b9d2-c0f5d4c0ee68", 26 | "_uuid": "97b399586c43626b73bc77b50e58b952d86ea8da", 27 | "collapsed": true, 28 | "trusted": false 29 | }, 30 | "cell_type": "code", 31 | "source": "train = pd.read_csv('../input/dataset/train_with_convai.csv')\ntest = pd.read_csv('../input/dataset/test_with_convai.csv')\n", 32 | "execution_count": null, 33 | "outputs": [] 34 | }, 35 | { 36 | "metadata": { 37 | "_cell_guid": "1eebb207-607e-4985-908e-9848888808b1", 38 | "_uuid": "3e90295dde0dd25158ea9e3464165aa8ea62fd1c", 39 | "collapsed": true, 40 | "trusted": false 41 | }, 42 | "cell_type": "code", 43 | "source": "feats_to_concat = ['comment_text', 'toxic_level', 'attack', 'aggression']\n# combining test and train\nalldata = pd.concat([train[feats_to_concat], test[feats_to_concat]], axis=0)\nalldata.comment_text.fillna('unknown', inplace=True)", 44 | "execution_count": null, 45 | "outputs": [] 46 | }, 47 | { 48 | "metadata": { 49 | "_cell_guid": "88a8e609-b287-4a7e-b72d-5dcac6f4a55f", 50 | "_uuid": "741273ee4b5122a37d978708ba29e16879e5b33f", 51 | "collapsed": true, 52 | "trusted": false 53 | }, 54 | "cell_type": "code", 55 | "source": "vect_words = TfidfVectorizer(max_features=50000, analyzer='word', ngram_range=(1, 1))\nvect_chars = TfidfVectorizer(max_features=20000, analyzer='char', ngram_range=(1, 3))", 56 | "execution_count": null, 57 | "outputs": [] 58 | }, 59 | { 60 | "metadata": { 61 | "_cell_guid": "6db22032-8e99-4848-8978-be7c68a1e936", 62 | "_uuid": "cf10b99072cef22bf87ee92c9aa51f035a26e893", 63 | "collapsed": true, 64 | "trusted": false 65 | }, 66 | "cell_type": "code", 67 | "source": "all_words = vect_words.fit_transform(alldata.comment_text)\nall_chars = vect_chars.fit_transform(alldata.comment_text)", 68 | "execution_count": null, 69 | "outputs": [] 70 | }, 71 | { 72 | "metadata": { 73 | "_cell_guid": "8f42e0d7-5938-4bb0-beb7-7ddf9f85685d", 74 | "_uuid": "d074b6b6c5271f462c129c534980c5a0d287599f", 75 | "collapsed": true, 76 | "trusted": false 77 | }, 78 | "cell_type": "code", 79 | "source": "train_new = train\ntest_new = test", 80 | "execution_count": null, 81 | "outputs": [] 82 | }, 83 | { 84 | "metadata": { 85 | "_cell_guid": "c068c9bb-bf28-4342-aa71-e575c6d93788", 86 | "_uuid": "09975f14757c51e19876dab638a39671dfd555e4", 87 | "collapsed": true, 88 | "trusted": false 89 | }, 90 | "cell_type": "code", 91 | "source": "train_words = all_words[:len(train_new)]\ntest_words = all_words[len(train_new):]\n\ntrain_chars = all_chars[:len(train_new)]\ntest_chars = all_chars[len(train_new):]", 92 | "execution_count": null, 93 | "outputs": [] 94 | }, 95 | { 96 | "metadata": { 97 | "_cell_guid": "5d55e152-e1cb-4cf0-aa41-e3eec5850b3a", 98 | "_uuid": "0338f2d0b8f09c751f97afebf1cf8e77d8a10fe3", 99 | "collapsed": true, 100 | "trusted": false 101 | }, 102 | "cell_type": "code", 103 | "source": "feats = ['toxic_level', 'attack']\n# make 
sparse matrix with needed data for train and test\ntrain_feats = sparse.hstack([train_words, train_chars, alldata[feats][:len(train_new)]])\ntest_feats = sparse.hstack([test_words, test_chars, alldata[feats][len(train_new):]])", 104 | "execution_count": null, 105 | "outputs": [] 106 | }, 107 | { 108 | "metadata": { 109 | "_cell_guid": "350aad79-ee6f-44bc-9d85-4e9652956bd3", 110 | "_uuid": "da2082c68a367369fac28ddc09eec2e5b6c718bb", 111 | "scrolled": false, 112 | "collapsed": true, 113 | "trusted": false 114 | }, 115 | "cell_type": "code", 116 | "source": "col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\n\nonly_col = ['toxic']\n\npreds = np.zeros((test_new.shape[0], len(col)))\n\nfor i, j in enumerate(col):\n print('===Fit '+j)\n \n model = LogisticRegression(C=4.0, solver='sag')\n print('Fitting model')\n model.fit(train_feats, train_new[j])\n \n print('Predicting on test')\n preds[:,i] = model.predict_proba(test_feats)[:,1]", 117 | "execution_count": null, 118 | "outputs": [] 119 | }, 120 | { 121 | "metadata": { 122 | "_cell_guid": "9d84b909-d93b-4778-b432-701f65a73d3c", 123 | "_uuid": "3605ca797e6d5e4d05ac2c63d70766c23d2a8cf1", 124 | "collapsed": true, 125 | "trusted": false 126 | }, 127 | "cell_type": "code", 128 | "source": "subm = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv')\n\nsubmid = pd.DataFrame({'id': subm[\"id\"]})\nsubmission = pd.concat([submid, pd.DataFrame(preds, columns = col)], axis=1)\nsubmission.to_csv('feat_lr_2cols.csv', index=False)", 129 | "execution_count": null, 130 | "outputs": [] 131 | }, 132 | { 133 | "metadata": { 134 | "_cell_guid": "6d350714-1262-4f91-af11-a7f95750ec84", 135 | "_uuid": "be385cfe2683246d05dc872d7b09cb4608b73337", 136 | "collapsed": true, 137 | "trusted": false 138 | }, 139 | "cell_type": "code", 140 | "source": "", 141 | "execution_count": null, 142 | "outputs": [] 143 | } 144 | ], 145 | "metadata": { 146 | "language_info": { 147 | "name": "python", 148 | "version": "3.6.4", 149 | "mimetype": "text/x-python", 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 3 153 | }, 154 | "pygments_lexer": "ipython3", 155 | "nbconvert_exporter": "python", 156 | "file_extension": ".py" 157 | }, 158 | "kernelspec": { 159 | "display_name": "Python 3", 160 | "language": "python", 161 | "name": "python3" 162 | } 163 | }, 164 | "nbformat": 4, 165 | "nbformat_minor": 1 166 | } -------------------------------------------------------------------------------- /fasttext_direct.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "_cell_guid": "5c86b9bc-a478-4da9-adcc-b97ca0fbd0c9", 7 | "_uuid": "53d59528c17da4c1d9759786326a82d0c8765804" 8 | }, 9 | "source": [ 10 | "# Using FastText models (not vectors) for robust embeddings\n", 11 | "\n", 12 | "I'd like to explain my approach of using pretrained FastText models as input to Keras Neural Networks. FastText is a word embedding not unlike Word2Vec or GloVe, but the cool thing is that each word vector is based on sub-word character n-grams. That means that even for previously unseen words (e.g. due to typos), the model can make an educated guess towards its meaning. 
To find out more about FastText, check out both their [Github](https://github.com/facebookresearch/fastText/) and [website](https://fasttext.cc/).\n", 13 | "\n", 14 | "To do this, we won't be using the classic Keras embedding layer and instead hand-craft the embedding for each example. As a result, we need to write more code and invest some time into preprocessing, but that is easily justified by the results.\n", 15 | "\n", 16 | "**Disclaimer: Loading the FastText model will take some serious memory! I recommend having at least 60 GB of RAM. EC2's p2.xlarge instance should have no problems with this, but you can always [add some swap](https://stackoverflow.com/questions/17173972/how-do-you-add-swap-to-an-ec2-instance) for good measure. I also added a section below to build a training generator for this.**" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "_cell_guid": "b88f84eb-051c-4fe8-8fc0-70365a0b9693", 23 | "_uuid": "168f1efdabebb51c2922aab58231921599f9348f" 24 | }, 25 | "source": [ 26 | "## Preparations: Getting FastText and the model\n", 27 | "\n", 28 | "First, build FastText from sources as described [here](https://github.com/facebookresearch/fastText#requirements). Don't worry, there's nothing crazy you have to do and it will finish in less than a minute. Next, install the Python package in your virtualenv following [these instructions](https://github.com/facebookresearch/fastText/tree/master/python).\n", 29 | "\n", 30 | "For the model, I use the one pretrained on English Wikipedia. I'd love to have one trained on Twitter or similar, since it might be more internet-slangy, but I haven't found any yet and don't feel like pretraining one myself. Download the model [here](https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md). Make sure you get the bin, not just the vec (text) file. I'll assume you placed it (or a symlink to it) into your code directory and named it `ft_model.bin`." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": { 36 | "_cell_guid": "eea9d34b-3205-421f-bfd8-6e29f8092ddd", 37 | "_uuid": "e4542f71483666b9168dacc6949bcbaff8b66642" 38 | }, 39 | "source": [ 40 | "## Preparations: Exploring the model\n", 41 | "\n", 42 | "Let's explore the model! Go to your FastText directory and run `./fasttext nn `. Now you can enter some terms and see the nearest neighbors to this word in the embedding space. Here are some examples:\n", 43 | "\n", 44 | "```\n", 45 | "Query word? queen\n", 46 | "—queen 0.719091\n", 47 | "‘queen 0.692849\n", 48 | "#queen 0.656498\n", 49 | "queena 0.650313\n", 50 | "king 0.64931\n", 51 | "queen`s 0.63954\n", 52 | "king/queen 0.634855\n", 53 | "s/queen 0.627386\n", 54 | "princess 0.623889\n", 55 | "queeny 0.620919\n", 56 | "```\n", 57 | "\n", 58 | "Ok that looks pretty ugly. I suppose Facebook was not very exact in their cleaning of the input data. But some sensible suggestions are there: `king` and `princess`! Let's try a typo that is unlikely to have appeared in the original data:\n", 59 | "\n", 60 | "```\n", 61 | "Query word? dimensionnallity\n", 62 | "dimension, 0.722278\n", 63 | "dimensionality 0.708645\n", 64 | "dimensionful 0.698573\n", 65 | "codimension 0.689754\n", 66 | "codimensions 0.67555\n", 67 | "twodimensional 0.674745\n", 68 | "dimension 0.67258\n", 69 | "\\,kdimensional 0.668848\n", 70 | "‘dimensions 0.665725\n", 71 | "two–dimensional 0.665109\n", 72 | "```\n", 73 | "\n", 74 | "Sweet! 
Even though it has never seen that word, it recognizes it to be related with \"dimensionality\". Let's try some something mean:\n", 75 | "\n", 76 | "```\n", 77 | "Query word? dumb\n", 78 | "stupid 0.746051\n", 79 | "dumber 0.732965\n", 80 | "clueless 0.662594\n", 81 | "idiotic 0.64993\n", 82 | "silly 0.632314\n", 83 | "stupidstitious 0.628875\n", 84 | "stupidly 0.622968\n", 85 | "moronic 0.621633\n", 86 | "ignorant 0.620475\n", 87 | "stupider 0.617377\n", 88 | "```\n", 89 | "\n", 90 | "Nice! Even though this was trained on Wikipedia, we're getting at least some basic insults. I'll leave it to you to explore the really hateful words. They all seem to be there ;)\n", 91 | "\n", 92 | "**Note:** Keep in mind that exploring the nearest neighbors is a very superficial approach to understanding the model! The embedding space has 300 dimensions, and we boil them down to a single distance metric. We can't be sure in which dimensions these words are related to each other, but we can trust in the model to have learnt something sensible.\n", 93 | "\n", 94 | "**Pro tip:** Our data should be cleaned and normalized in a similar way as Facebook did before they trained this model. We can query the model to get some insights into what they did, e.g.\n", 95 | "\n", 96 | "```\n", 97 | "Query word? 1\n", 98 | "insel 0.483141\n", 99 | "inseln 0.401125\n", 100 | "...\n", 101 | "Query word? one\n", 102 | "two 0.692744\n", 103 | "three 0.676568\n", 104 | "...\n", 105 | "```\n", 106 | "\n", 107 | "This tells us they converted all numbers to their text equivalent, and so should we!" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": { 113 | "_cell_guid": "2f03f7c4-ecc8-47d3-8c59-9e4482e7684a", 114 | "_uuid": "ae7a32ce6f66748656faecde4d079857b992ac1e" 115 | }, 116 | "source": [ 117 | "## Loading and cleaning the data\n", 118 | "\n", 119 | "We define a method `normalize` to clean and prepare a single string. We will use it later to prepare our string data. 
Also, we load the data as we're used to:" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 73, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "\n", 129 | "from __future__ import absolute_import\n", 130 | "from __future__ import division\n", 131 | "from __future__ import print_function\n", 132 | "import cPickle\n", 133 | "import json\n", 134 | "import os\n", 135 | "import numpy as np\n", 136 | "from keras.callbacks import EarlyStopping\n", 137 | "from keras.callbacks import ModelCheckpoint\n", 138 | "from keras.layers import Conv1D\n", 139 | "from keras.layers import Dense\n", 140 | "from keras.layers import Reshape\n", 141 | "from keras.layers import Dropout\n", 142 | "from keras.layers import Bidirectional\n", 143 | "from keras.layers import LSTM\n", 144 | "from keras.layers import concatenate\n", 145 | "from keras.layers import Embedding\n", 146 | "from keras.layers import Embedding\n", 147 | "from keras import regularizers\n", 148 | "from keras.layers import Flatten\n", 149 | "from keras.layers import GlobalMaxPooling1D\n", 150 | "from keras.layers import Input\n", 151 | "from keras.layers import MaxPooling1D\n", 152 | "from keras.layers import CuDNNGRU\n", 153 | "from keras.models import load_model\n", 154 | "from keras.models import Model\n", 155 | "from keras.optimizers import RMSprop\n", 156 | "from keras.optimizers import Adam\n", 157 | "from keras.preprocessing.sequence import pad_sequences\n", 158 | "from keras.preprocessing.text import Tokenizer\n", 159 | "from keras.utils import to_categorical\n", 160 | "from keras import backend as K\n", 161 | "from keras.engine.topology import Layer\n", 162 | "from keras import initializers, regularizers, constraints\n", 163 | "from keras.layers.normalization import BatchNormalization\n", 164 | "from keras.layers.advanced_activations import LeakyReLU, PReLU\n", 165 | "from keras.models import Sequential\n", 166 | "import nltk\n", 167 | "from keras.optimizers import Nadam\n" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 13, 173 | "metadata": { 174 | "_cell_guid": "423398cb-482b-4dc3-9904-82c2d17d2e2c", 175 | "_kg_hide-output": true, 176 | "_uuid": "fa74d030d08aff58c32455cd4218d9ff0ef494d9" 177 | }, 178 | "outputs": [ 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "\n", 184 | "Loading data\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "import re\n", 190 | "import numpy as np\n", 191 | "import pandas as pd\n", 192 | "from fastText import load_model\n", 193 | "from keras import backend as K\n", 194 | "from keras.models import Model\n", 195 | "from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate\n", 196 | "from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, CuDNNGRU\n", 197 | "from keras.models import Sequential\n", 198 | "\n", 199 | "from keras.preprocessing import text, sequence\n", 200 | "from keras.callbacks import Callback\n", 201 | "from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer\n", 202 | "from sklearn.metrics import log_loss\n", 203 | "\n", 204 | "\n", 205 | "window_length = 200 # The amount of words we look at per example. Experiment with this.\n", 206 | "\n", 207 | "def normalize(s):\n", 208 | " \"\"\"\n", 209 | " Given a text, cleans and normalizes it. 
Feel free to add your own stuff.\n", 210 | " \"\"\"\n", 211 | " s = s.lower()\n", 212 | " # Replace ips\n", 213 | " s = re.sub(r'\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}', ' _ip_ ', s)\n", 214 | " # Isolate punctuation\n", 215 | " s = re.sub(r'([\\'\\\"\\.\\(\\)\\!\\?\\-\\\\\\/\\,])', r' \\1 ', s)\n", 216 | " # Remove some special characters\n", 217 | " s = re.sub(r'([\\;\\:\\|•«\\n])', ' ', s)\n", 218 | " # Replace numbers and symbols with language\n", 219 | " s = s.replace('&', ' and ')\n", 220 | " s = s.replace('@', ' at ')\n", 221 | " s = s.replace('0', ' zero ')\n", 222 | " s = s.replace('1', ' one ')\n", 223 | " s = s.replace('2', ' two ')\n", 224 | " s = s.replace('3', ' three ')\n", 225 | " s = s.replace('4', ' four ')\n", 226 | " s = s.replace('5', ' five ')\n", 227 | " s = s.replace('6', ' six ')\n", 228 | " s = s.replace('7', ' seven ')\n", 229 | " s = s.replace('8', ' eight ')\n", 230 | " s = s.replace('9', ' nine ')\n", 231 | " return s\n", 232 | "\n", 233 | "print('\\nLoading data')\n", 234 | "train = pd.read_csv('cleaned_final_train_clean.csv')\n", 235 | "test = pd.read_csv('cleaned_test_clean.csv')\n", 236 | "train['comment_text'] = train['comment_text'].fillna('_empty_')\n", 237 | "test['comment_text'] = test['comment_text'].fillna('_empty_')" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "vect_word = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',\n", 254 | " stop_words= 'english',ngram_range=(1,3),dtype=np.float32)\n", 255 | "vect_char = TfidfVectorizer(max_features=40000, lowercase=True, analyzer='char',\n", 256 | " stop_words= 'english',ngram_range=(3,6),dtype=np.float32)\n", 257 | "\n", 258 | "# Word ngram vector\n", 259 | "tr_vect = vect_word.fit_transform(train['comment_text'])\n", 260 | "ts_vect = vect_word.transform(test['comment_text'])\n", 261 | "\n", 262 | "# Character n gram vector\n", 263 | "tr_vect_char = vect_char.fit_transform(train['comment_text'])\n", 264 | "ts_vect_char = vect_char.transform(test['comment_text'])\n", 265 | "\n", 266 | "\n", 267 | "X = sparse.hstack([tr_vect, tr_vect_char])\n", 268 | "x_test = sparse.hstack([ts_vect, ts_vect_char])\n", 269 | "\n" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 8, 275 | "metadata": {}, 276 | "outputs": [ 277 | { 278 | "name": "stdout", 279 | "output_type": "stream", 280 | "text": [ 281 | "--2018-02-25 02:32:20-- https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.zip\n", 282 | "Resolving s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)... 52.219.20.137\n", 283 | "Connecting to s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)|52.219.20.137|:443... connected.\n", 284 | "HTTP request sent, awaiting response... 
200 OK\n", 285 | "Length: 10356881291 (9.6G) [application/zip]\n", 286 | "Saving to: ‘wiki.en.zip’\n", 287 | "\n", 288 | "wiki.en.zip 100%[===================>] 9.65G 8.10MB/s in 25m 35s \n", 289 | "\n", 290 | "2018-02-25 02:57:56 (6.44 MB/s) - ‘wiki.en.zip’ saved [10356881291/10356881291]\n", 291 | "\n" 292 | ] 293 | } 294 | ], 295 | "source": [ 296 | "!wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.zip" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": { 302 | "_cell_guid": "d98e8058-b635-408f-98ca-c0b999bc310c", 303 | "_uuid": "a4099da988bbe670ee7b389071e924ca1891cec2" 304 | }, 305 | "source": [ 306 | "Ok next, let's load the FastText model and define methods that convert text to a sequence of vectors. Note that I'm just considering the last n words of each text. You could play with this, too." 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 3, 312 | "metadata": { 313 | "_cell_guid": "5792a3ad-8b04-435f-bd3d-f54d7449921b", 314 | "_kg_hide-output": true, 315 | "_uuid": "d5a8656a2cb0b9cde191230f477faa17934180c9" 316 | }, 317 | "outputs": [ 318 | { 319 | "name": "stdout", 320 | "output_type": "stream", 321 | "text": [ 322 | "\n", 323 | "Loading FT model\n" 324 | ] 325 | } 326 | ], 327 | "source": [ 328 | "classes = [\n", 329 | " 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'\n", 330 | "]\n", 331 | "\n", 332 | "print('\\nLoading FT model')\n", 333 | "ft_model = load_model('wiki.en.bin')\n", 334 | "n_features = ft_model.get_dimension()\n", 335 | "\n", 336 | "def text_to_vector(text):\n", 337 | " \"\"\"\n", 338 | " Given a string, normalizes it, then splits it into words and finally converts\n", 339 | " it to a sequence of word vectors.\n", 340 | " \"\"\"\n", 341 | " text = normalize(text)\n", 342 | " words = text.split()\n", 343 | " window = words[-window_length:]\n", 344 | " \n", 345 | " x = np.zeros((window_length, n_features))\n", 346 | "\n", 347 | " for i, word in enumerate(window):\n", 348 | " x[i, :] = ft_model.get_word_vector(word).astype('float32')\n", 349 | "\n", 350 | " return x\n", 351 | "\n", 352 | "def df_to_data(df):\n", 353 | " \"\"\"\n", 354 | " Convert a given dataframe to a dataset of inputs for the NN.\n", 355 | " \"\"\"\n", 356 | " x = np.zeros((len(df), window_length, n_features), dtype='float32')\n", 357 | "\n", 358 | " for i, comment in enumerate(df['comment_text'].values):\n", 359 | " x[i, :] = text_to_vector(comment)\n", 360 | "\n", 361 | " return x" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": { 367 | "_cell_guid": "d9bc312a-e444-4058-b4ac-3f30d3d98940", 368 | "_uuid": "ae35399a098729b8e68d76a953d4c097917438bb" 369 | }, 370 | "source": [ 371 | "To convert an input dataframe to an input vector, just call `df_to_data`. This will result in the shape `(n_examples, window_length, n_features)`. Here, for each row we would have 200 words a 300 features each.\n", 372 | "\n", 373 | "**EDIT/NOTE:** This will probably not fit into your memory, so don't bother executing it :) Instead, read my generator guide below." 
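A quick back-of-the-envelope check makes the memory warning above concrete. This is a rough sketch, not part of the original notebook; the ~160k row count is an assumed size for the training CSV, while the window length and vector dimension are the values used in this notebook.

```
# Rough size of the dense array df_to_data() would allocate (float32 = 4 bytes).
# n_rows ~ 160k is an assumption about the training set; 200 and 300 match the
# window_length and FastText dimension used above.
n_rows, window_length, n_features = 160000, 200, 300
gib = n_rows * window_length * n_features * 4.0 / 1024 ** 3
print('~%.0f GiB for the training set alone' % gib)  # roughly 36 GiB
```

That is before the test set is converted, which is why the generator later in the notebook feeds small batches instead.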
374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 4, 379 | "metadata": { 380 | "_cell_guid": "1c98dd07-36b1-4b7f-b806-6a5d8bdabae1", 381 | "_kg_hide-output": true, 382 | "_uuid": "6a34513a82b5140d5e5c258f5c6427b83ae62245" 383 | }, 384 | "outputs": [ 385 | { 386 | "ename": "MemoryError", 387 | "evalue": "", 388 | "output_type": "error", 389 | "traceback": [ 390 | "\u001b[0;31m------------------------------------------------------------------------\u001b[0m", 391 | "\u001b[0;31mMemoryError\u001b[0m Traceback (most recent call last)", 392 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mx_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_to_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0my_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mclasses\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mx_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_to_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0my_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mclasses\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 393 | "\u001b[0;32m\u001b[0m in \u001b[0;36mdf_to_data\u001b[0;34m(df)\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0mConvert\u001b[0m \u001b[0ma\u001b[0m \u001b[0mgiven\u001b[0m \u001b[0mdataframe\u001b[0m \u001b[0mto\u001b[0m \u001b[0ma\u001b[0m \u001b[0mdataset\u001b[0m \u001b[0mof\u001b[0m \u001b[0minputs\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mNN\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 28\u001b[0m \"\"\"\n\u001b[0;32m---> 29\u001b[0;31m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzeros\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwindow_length\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_features\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'float32'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 30\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcomment\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'comment_text'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 394 | "\u001b[0;31mMemoryError\u001b[0m: " 395 | ] 396 | } 397 | ], 398 | "source": [ 399 | "x_train = df_to_data(train)\n", 400 | "y_train = train[classes].values\n", 401 | "\n", 402 | "x_test = df_to_data(test)\n", 403 | "y_test = test[classes].values" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": { 409 | "_cell_guid": "4f582bb5-ac4b-4b67-8e34-6fa022416e57", 410 | "_uuid": "7b41f089ef0d93d41d565e99b03c0f19b30f26ce" 411 | }, 412 
| "source": [ 413 | "And now you should be good to go! Train this as usual. You don't need an `EmbeddingLayer`, but you need to pass `input_shape=(window_length, n_features)` to the first layer in your NN.\n", 414 | "\n", 415 | "I'm still in the process of experimenting, but I already achieved a single-model LB score of `0.9842` with something very simple. Bagging multiple of these models got me into the top 100 easily. Good luck!" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": { 421 | "_cell_guid": "4bb7ba9a-7c32-43db-9c52-f4fca204ab12", 422 | "_uuid": "4dfea6e85ca933626a14ce2df8e937051b863d0a" 423 | }, 424 | "source": [ 425 | "### PS: Using a generator so you don't have to keep the whole damn thing in memory\n", 426 | "As @liujilong pointed out, not even the p2.xlarge machine with 64 GB can hold both the training and test set for window sizes longer than ~100 words. It seems I underestimated how much memory this monster model eats! Also, locally I had long [added swap space](https://stackoverflow.com/questions/17173972/how-do-you-add-swap-to-an-ec2-instance) and switched to generators so I wouldn't have to keep the whole thing memory. Let me show you how to implement the generator part. This is also useful to add some randomization later on.\n", 427 | "\n", 428 | "The idea is that instead of converting the whole training set to one large array, we can write a function that just spits out one batch of data at a time, infinitely. Keras can automaticaly spin up a separate thread for this method (note though that \"threads\" in Python are ridiculous and do not give any speedup whatsoever). This means that we have to write some more code and training will be slightly slower, but we need only a fraction of the memory and we can add some cool randomization to each batch later on (see ideas section below).\n", 429 | "\n", 430 | "We can keep all the code from above. This generator method works only for training data, not for validation data, so you will need to split by hand. 
Let's do that now:" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 6, 436 | "metadata": { 437 | "_cell_guid": "f71c2d37-2a1e-483a-af4c-5e7c820ad29c", 438 | "_kg_hide-output": true, 439 | "_uuid": "74becdb6b32d5676e1f4a0d67b38a0ba355dac97" 440 | }, 441 | "outputs": [], 442 | "source": [ 443 | "# Split the dataset:\n", 444 | "#split_index = round(len(train) * 0.9)\n", 445 | "#shuffled_train = train.sample(frac=1)\n", 446 | "#df_train = shuffled_train.iloc[:split_index]\n", 447 | "#df_val = shuffled_train.iloc[split_index:]\n", 448 | "\n", 449 | "# Convert validation set to fixed array\n", 450 | "#x_val = df_to_data(df_val)\n", 451 | "#y_val = df_val[classes].values\n", 452 | "\n", 453 | "def data_generator(df, batch_size):\n", 454 | " \"\"\"\n", 455 | " Given a raw dataframe, generates infinite batches of FastText vectors.\n", 456 | " \"\"\"\n", 457 | " batch_i = 0 # Counter inside the current batch vector\n", 458 | " batch_x = None # The current batch's x data\n", 459 | " batch_y = None # The current batch's y data\n", 460 | " \n", 461 | " while True: # Loop forever\n", 462 | " df = df.sample(frac=1) # Shuffle df each epoch\n", 463 | " \n", 464 | " for i, row in df.iterrows():\n", 465 | " comment = row['comment_text']\n", 466 | " \n", 467 | " if batch_x is None:\n", 468 | " batch_x = np.zeros((batch_size, window_length, n_features), dtype='float32')\n", 469 | " batch_y = np.zeros((batch_size, len(classes)), dtype='float32')\n", 470 | " \n", 471 | " batch_x[batch_i] = text_to_vector(comment)\n", 472 | " batch_y[batch_i] = row[classes].values\n", 473 | " batch_i += 1\n", 474 | "\n", 475 | " if batch_i == batch_size:\n", 476 | " # Ready to yield the batch\n", 477 | " yield batch_x, batch_y\n", 478 | " batch_x = None\n", 479 | " batch_y = None\n", 480 | " batch_i = 0" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": { 486 | "_cell_guid": "3e35540c-7ae8-4f38-ae70-a2d38bc1e189", 487 | "_uuid": "07d54b2346e24ebe9fd6dad691e129bd721a8b45" 488 | }, 489 | "source": [ 490 | "Alright, now we can use this generator to train the network. To make sure that one epoch has approxamitely the same number of examples as are in the training set, we need to set the `steps_per_epoch` to the number of batches we expect to cover the whole dataset. 
Here's the code:" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 8, 496 | "metadata": { 497 | "_cell_guid": "48f4dcad-b7fe-4cd6-81a7-9ac41e9a0daa", 498 | "_uuid": "1b55016b60964208c999381f09600af1a5ae96ed" 499 | }, 500 | "outputs": [], 501 | "source": [ 502 | "from keras.engine.topology import Layer\n", 503 | "\n", 504 | "class Attention(Layer):\n", 505 | " def __init__(self, step_dim,\n", 506 | " W_regularizer=None, b_regularizer=None,\n", 507 | " W_constraint=None, b_constraint=None,\n", 508 | " bias=True, **kwargs):\n", 509 | " self.supports_masking = True\n", 510 | " self.init = initializers.get('glorot_uniform')\n", 511 | "\n", 512 | " self.W_regularizer = regularizers.get(W_regularizer)\n", 513 | " self.b_regularizer = regularizers.get(b_regularizer)\n", 514 | "\n", 515 | " self.W_constraint = constraints.get(W_constraint)\n", 516 | " self.b_constraint = constraints.get(b_constraint)\n", 517 | "\n", 518 | " self.bias = bias\n", 519 | " self.step_dim = step_dim\n", 520 | " self.features_dim = 0\n", 521 | " super(Attention, self).__init__(**kwargs)\n", 522 | "\n", 523 | " def build(self, input_shape):\n", 524 | " assert len(input_shape) == 3\n", 525 | "\n", 526 | " self.W = self.add_weight((input_shape[-1],),\n", 527 | " initializer=self.init,\n", 528 | " name='{}_W'.format(self.name),\n", 529 | " regularizer=self.W_regularizer,\n", 530 | " constraint=self.W_constraint)\n", 531 | " self.features_dim = input_shape[-1]\n", 532 | "\n", 533 | " if self.bias:\n", 534 | " self.b = self.add_weight((input_shape[1],),\n", 535 | " initializer='zero',\n", 536 | " name='{}_b'.format(self.name),\n", 537 | " regularizer=self.b_regularizer,\n", 538 | " constraint=self.b_constraint)\n", 539 | " else:\n", 540 | " self.b = None\n", 541 | "\n", 542 | " self.built = True\n", 543 | "\n", 544 | " def compute_mask(self, input, input_mask=None):\n", 545 | " return None\n", 546 | "\n", 547 | " def call(self, x, mask=None):\n", 548 | " features_dim = self.features_dim\n", 549 | " step_dim = self.step_dim\n", 550 | "\n", 551 | " eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),\n", 552 | " K.reshape(self.W, (features_dim, 1))), (-1, step_dim))\n", 553 | "\n", 554 | " if self.bias:\n", 555 | " eij += self.b\n", 556 | "\n", 557 | " eij = K.tanh(eij)\n", 558 | "\n", 559 | " a = K.exp(eij)\n", 560 | "\n", 561 | " if mask is not None:\n", 562 | " a *= K.cast(mask, K.floatx())\n", 563 | "\n", 564 | " a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())\n", 565 | "\n", 566 | " a = K.expand_dims(a)\n", 567 | " weighted_input = x * a\n", 568 | " return K.sum(weighted_input, axis=1)\n", 569 | "\n", 570 | " def compute_output_shape(self, input_shape):\n", 571 | " return input_shape[0], self.features_dim" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 81, 577 | "metadata": { 578 | "_cell_guid": "53202998-9f82-48eb-a8a7-86c0dcdd92fa", 579 | "_uuid": "876ed29154a6bdf94048af7dc0e6f5115b4f9f2e" 580 | }, 581 | "outputs": [], 582 | "source": [ 583 | "def build_model():\n", 584 | " inputs = Input(shape=(150,))\n", 585 | " inp = Reshape((1,150,))(inputs)\n", 586 | " x = Bidirectional(GRU(80, return_sequences=True))(inp)\n", 587 | " #att = Attention(150)(x)\n", 588 | " avg_pool = GlobalAveragePooling1D()(x)\n", 589 | " max_pool = GlobalMaxPooling1D()(x)\n", 590 | " conc = concatenate([avg_pool, max_pool])\n", 591 | " output = Dropout(0.5)(conc)\n", 592 | " output = BatchNormalization()(output)\n", 593 | " outp = Dense(6, 
activation=\"sigmoid\")(output)\n", 594 | " nadam = Nadam(lr=0.001)\n", 595 | " model = Model(inputs=inputs, outputs=outp)\n", 596 | " model.compile(loss='binary_crossentropy',\n", 597 | " optimizer=nadam,\n", 598 | " metrics=['accuracy'])\n", 599 | " \n", 600 | " return model" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 84, 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [ 609 | "def train_folds(X, y, fold_count, model_list, model_name):\n", 610 | " fold_size = len(X) // fold_count\n", 611 | " models = []\n", 612 | " total_meta = []\n", 613 | " for fold_id in range(0, fold_count):\n", 614 | " fold_start = fold_size * fold_id\n", 615 | " fold_end = fold_start + fold_size\n", 616 | " \n", 617 | " if fold_id == fold_count - 1:\n", 618 | " fold_end = len(X)\n", 619 | "\n", 620 | " train_x = np.concatenate([X[:fold_start], X[fold_end:]])\n", 621 | " train_y = np.concatenate([y[:fold_start], y[fold_end:]])\n", 622 | "\n", 623 | " val_x = X[fold_start:fold_end]\n", 624 | " val_y = y[fold_start:fold_end]\n", 625 | " \n", 626 | " save_path = os.path.join('models', '%s_model.h5' % (model_name + str(fold_id)))\n", 627 | " callbacks = [\n", 628 | " ModelCheckpoint(\n", 629 | " save_path, save_best_only=True, verbose=True)\n", 630 | " ]\n", 631 | " #train_x = np.reshape(train_x, train_x.shape + (1,))\n", 632 | " training_generator = data_generator(train_x, 128)\n", 633 | " x_tra = len(train_x)\n", 634 | " training_steps_per_epoch = round(len(train_x) / batch_size)\n", 635 | " model = train_model(model_list[fold_id], training_generator, x_tra, train_y, val_x, val_y,callbacks, training_steps_per_epoch)\n", 636 | " meta = model.predict(val_x, batch_size=128)\n", 637 | " if (fold_id == 0):\n", 638 | " total_meta = meta\n", 639 | " else:\n", 640 | " total_meta = np.concatenate((total_meta, meta), axis=0)\n", 641 | " model_path = os.path.join('models', \"model{0}_weights.npy\".format(fold_id))\n", 642 | " np.save(model_path, model.get_weights())\n", 643 | " models.append(model)\n", 644 | "\n", 645 | " return models, total_meta\n", 646 | "\n", 647 | "def train_model(model, training_generator,x_tra, train_y, val_x, val_y, callbacks, training_steps_per_epoch):\n", 648 | " best_loss = -1\n", 649 | " best_weights = None\n", 650 | " best_epoch = 0\n", 651 | "\n", 652 | " current_epoch = 0\n", 653 | " #charCNN:LSTM\n", 654 | " #train_x = np.reshape(train_x, train_x.shape + (1,))\n", 655 | " #val_x = np.reshape(val_x, val_x.shape + (1,))\n", 656 | " exp_decay = lambda init, fin, steps: (init/fin)**(1/(steps-1)) - 1\n", 657 | " steps = int(x_tra/batch_size) * 1000\n", 658 | " lr_init, lr_fin = 0.001, 0.0005\n", 659 | " lr_decay = exp_decay(lr_init, lr_fin, steps)\n", 660 | " #K.set_value(model.optimizer.lr, lr_init)\n", 661 | " #K.set_value(model.optimizer.decay, lr_decay)\n", 662 | "\n", 663 | " while True:\n", 664 | " model.fit_generator(\n", 665 | " training_generator,\n", 666 | " steps_per_epoch=training_steps_per_epoch,\n", 667 | " epochs=1,\n", 668 | " validation_data=(val_x, val_y),\n", 669 | " callbacks=callbacks,\n", 670 | " verbose=2\n", 671 | " )\n", 672 | " \n", 673 | " y_pred = model.predict(val_x, batch_size=128)\n", 674 | "\n", 675 | " total_loss = 0\n", 676 | " for j in range(6):\n", 677 | " loss = log_loss(val_y[:, j], y_pred[:, j])\n", 678 | " total_loss += loss\n", 679 | "\n", 680 | " total_loss /= 6.\n", 681 | "\n", 682 | " print(\"Epoch {0} auc {1} best_auc {2}\".format(current_epoch, total_loss, best_loss))\n", 683 | " \n", 684 | "\n", 685 | " 
current_epoch += 1\n", 686 | " if total_loss < best_loss or best_loss == -1:\n", 687 | " best_loss = total_loss\n", 688 | " best_weights = model.get_weights()\n", 689 | " best_epoch = current_epoch\n", 690 | " else:\n", 691 | " if current_epoch - best_epoch == 5:\n", 692 | " break\n", 693 | "\n", 694 | " model.set_weights(best_weights)\n", 695 | " return model" 696 | ] 697 | }, 698 | { 699 | "cell_type": "code", 700 | "execution_count": 85, 701 | "metadata": { 702 | "_cell_guid": "1be4d47d-6b43-4422-b77f-f103e784e721", 703 | "_kg_hide-output": true, 704 | "_uuid": "f5b8b97f068e2884bc5e877a19afbc721ba135da" 705 | }, 706 | "outputs": [ 707 | { 708 | "ename": "ValueError", 709 | "evalue": "Error when checking input: expected input_175 to have shape (150,) but got array with shape (1,)", 710 | "output_type": "error", 711 | "traceback": [ 712 | "\u001b[0;31m------------------------------------------------------------------------\u001b[0m", 713 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", 714 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mmodel_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"bigru\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mtraining_steps_per_epoch\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mround\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'comment_text'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0mmodels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtotal_meta\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain_folds\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'comment_text'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain_labels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfolds\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist_models\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;32mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Model trained!'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 715 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtrain_folds\u001b[0;34m(X, y, fold_count, model_list, model_name)\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0mx_tra\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_x\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0mtraining_steps_per_epoch\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mround\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_x\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel_list\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfold_id\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtraining_generator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx_tra\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain_y\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mval_x\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mval_y\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mcallbacks\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mtraining_steps_per_epoch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 28\u001b[0m \u001b[0mmeta\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mval_x\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m128\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mfold_id\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 716 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtrain_model\u001b[0;34m(model, training_generator, x_tra, train_y, val_x, val_y, callbacks, training_steps_per_epoch)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0mvalidation_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mval_x\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mval_y\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mcallbacks\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcallbacks\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 62\u001b[0;31m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 63\u001b[0m )\n\u001b[1;32m 64\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 717 | "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/keras/legacy/interfaces.pyc\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 89\u001b[0m warnings.warn('Update your `' + object_name +\n\u001b[1;32m 90\u001b[0m '` call to the Keras 2 API: ' + signature, stacklevel=2)\n\u001b[0;32m---> 91\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 92\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_original_function\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 718 | "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/keras/engine/training.pyc\u001b[0m in \u001b[0;36mfit_generator\u001b[0;34m(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)\u001b[0m\n\u001b[1;32m 2181\u001b[0m str(validation_data))\n\u001b[1;32m 2182\u001b[0m val_x, val_y, val_sample_weights = self._standardize_user_data(\n\u001b[0;32m-> 2183\u001b[0;31m val_x, val_y, val_sample_weight)\n\u001b[0m\u001b[1;32m 2184\u001b[0m \u001b[0mval_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mval_x\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mval_y\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mval_sample_weights\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2185\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muses_learning_phase\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mK\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlearning_phase\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mint\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 719 | "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/keras/engine/training.pyc\u001b[0m in \u001b[0;36m_standardize_user_data\u001b[0;34m(self, x, y, sample_weight, class_weight, check_array_lengths, batch_size)\u001b[0m\n\u001b[1;32m 1481\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_feed_input_shapes\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1482\u001b[0m \u001b[0mcheck_batch_axis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1483\u001b[0;31m exception_prefix='input')\n\u001b[0m\u001b[1;32m 1484\u001b[0m y = _standardize_input_data(y, self._feed_output_names,\n\u001b[1;32m 1485\u001b[0m \u001b[0moutput_shapes\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 720 | "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/keras/engine/training.pyc\u001b[0m in \u001b[0;36m_standardize_input_data\u001b[0;34m(data, names, shapes, check_batch_axis, exception_prefix)\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[0;34m': expected '\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mnames\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m' to have shape '\u001b[0m \u001b[0;34m+\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 122\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m' but got array with shape '\u001b[0m \u001b[0;34m+\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 123\u001b[0;31m str(data_shape))\n\u001b[0m\u001b[1;32m 124\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 721 | "\u001b[0;31mValueError\u001b[0m: Error when checking input: expected input_175 to have shape (150,) but got array with shape (1,)" 722 | ] 723 | } 724 | ], 725 | "source": [ 726 | "list_models = []\n", 727 | "folds = 10\n", 728 | "for fold in range(0, folds):\n", 729 | " model = build_model()\n", 730 | " list_models.append(model)\n", 731 | "train_labels = train[['toxic','severe_toxic','obscene', 'threat', 'insult', 'identity_hate']].values\n", 732 | "batch_size = 128\n", 733 | "model_name = \"bigru\"\n", 734 | "training_steps_per_epoch = round(len(train['comment_text']) / batch_size)\n", 735 | "models, total_meta = train_folds(train['comment_text'], train_labels, folds, list_models,model_name) \n", 736 | "print('Model trained!')\n" 737 | ] 738 | }, 739 | { 740 | "cell_type": "code", 741 | "execution_count": null, 742 | "metadata": {}, 743 | "outputs": [], 744 | "source": [ 745 | "print(\"Predicting results...\")\n", 746 | "random_test = pd.read_csv('cleaned_test_clean.csv')\n", 747 | "#random_test = self.Sanitize(random_test)\n", 748 | "#random_test.to_csv('cleaned_test_clean.csv', index=False)\n", 749 | "X_test = random_test['comment_text'].fillna('_empty_')\n", 750 | " X_test = self.prep_text(X_test)\n", 751 | " #X_test = self.load_data(X_test)\n", 752 | " test_predicts_list = []\n", 753 | " for fold_id, model in enumerate(models):\n", 754 | " model_path = os.path.join(self.model_dir, \"model{0}_weights.npy\".format(fold_id))\n", 755 | " np.save(model_path, model.get_weights())\n", 756 | " \n", 757 | " test_predicts_path = os.path.join(self.model_dir, \"test_predicts{0}.npy\".format(fold_id))\n", 758 | " test_predicts = 
model.predict(X_test, batch_size=self.hparams['batch_size'])\n", 759 | " test_predicts_list.append(test_predicts)\n", 760 | " np.save(test_predicts_path, test_predicts)\n", 761 | "\n", 762 | " test_predicts = np.ones(test_predicts_list[0].shape)\n", 763 | " for fold_predict in test_predicts_list:\n", 764 | " test_predicts *= fold_predict\n", 765 | "\n", 766 | " test_predicts **= (1. / len(test_predicts_list))\n", 767 | " test_ids = random_test[\"id\"].values\n", 768 | " test_ids = test_ids.reshape((len(test_ids), 1))\n", 769 | " CLASSES = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]\n", 770 | " test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)\n", 771 | " test_predicts[\"id\"] = test_ids\n", 772 | " test_predicts = test_predicts[[\"id\"] + CLASSES]\n", 773 | " test_predicts.to_csv('augmentori_pred_fasttext_gru_cv_output_two.csv', index=False)\n", 774 | " print('predicted !')\n" 775 | ] 776 | }, 777 | { 778 | "cell_type": "markdown", 779 | "metadata": { 780 | "_cell_guid": "1af33152-8655-4aba-976e-b34a573940e8", 781 | "_uuid": "e2b848800743d084611e07d9f93fefe444eb2f88" 782 | }, 783 | "source": [ 784 | "And there you go, this should work on p2.xlarge even for long window lengths!" 785 | ] 786 | }, 787 | { 788 | "cell_type": "markdown", 789 | "metadata": { 790 | "_cell_guid": "e9697a31-c4aa-4ec7-85f8-f93f95b57228", 791 | "_uuid": "24d94ddbe3041de9a0ae252adb22b2d1354b3cff" 792 | }, 793 | "source": [ 794 | "### More stuff to try:\n", 795 | "Some suggestions. I've tried most of these and found them helpful:\n", 796 | "\n", 797 | "* Add random but common typos to strings before converting to FT vectors. That way, the model can learn in which way typos affect the embeddings. Use the training generator so you can adjust this over time.\n", 798 | "* Add more string preprocessing to our `normalize` function\n", 799 | "* Randomize the windows instead of using the end (great that we already have a generator!)\n", 800 | "* Use FastText's sentence vector feature to summarize parts of the text outside the window\n", 801 | "* Add other features ontop of the FT ones, e.g. capitalization etc." 802 | ] 803 | } 804 | ], 805 | "metadata": { 806 | "kernelspec": { 807 | "display_name": "Python 2", 808 | "language": "python", 809 | "name": "python2" 810 | }, 811 | "language_info": { 812 | "codemirror_mode": { 813 | "name": "ipython", 814 | "version": 2 815 | }, 816 | "file_extension": ".py", 817 | "mimetype": "text/x-python", 818 | "name": "python", 819 | "nbconvert_exporter": "python", 820 | "pygments_lexer": "ipython2", 821 | "version": "2.7.12" 822 | } 823 | }, 824 | "nbformat": 4, 825 | "nbformat_minor": 1 826 | } 827 | -------------------------------------------------------------------------------- /get_data.sh: -------------------------------------------------------------------------------- 1 | curl -L -o final_train.csv "https://drive.google.com/uc?export=download&id=19WFLluCA0YNXLOAOVwN4iKul83-AXpPv" 2 | curl -L -o test.csv "https://drive.google.com/uc?export=download&id=19WFLluCA0YNXLOAOVwN4iKul83-AXpPv" 3 | 4 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | sudo apt-get install cython htop \ 2 | wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.zip \ 3 | git clone https://github.com/facebookresearch/fastText.git \ 4 | cd fastText \ 5 | pip install . 
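One note on install.sh just above: the trailing backslashes join all five lines into a single apt-get invocation, so the wget, git clone, and pip steps presumably need to run as separate commands. Once fastText is built and the wiki model is unzipped, a minimal sanity check (a sketch that assumes wiki.en.bin sits in the working directory, using the same calls as fasttext_direct.ipynb):

```
# Minimal check that the fastText Python bindings and the pretrained model load.
# Assumes wiki.en.zip downloaded by install.sh has been unzipped to wiki.en.bin here.
from fastText import load_model

ft_model = load_model('wiki.en.bin')
print(ft_model.get_dimension())                          # 300 for the wiki.en model
print(ft_model.get_word_vector('dimensionnallity')[:5])  # sub-word n-grams cover typos
```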
--------------------------------------------------------------------------------
/nbsvm.py:
--------------------------------------------------------------------------------
1 | from sklearn.base import BaseEstimator, ClassifierMixin
2 | from sklearn.utils.validation import check_X_y, check_is_fitted
3 | from sklearn.linear_model import LogisticRegression
4 | from scipy import sparse
5 | import numpy as np
6 | class NbSvmClassifier(BaseEstimator, ClassifierMixin):
7 |     def __init__(self, C=1.0, dual=False, n_jobs=1):
8 |         self.C = C
9 |         self.dual = dual
10 |         self.n_jobs = n_jobs
11 | 
12 |     def predict(self, x):
13 |         # Verify that model has been fit
14 |         check_is_fitted(self, ['_r', '_clf'])
15 |         return self._clf.predict(x.multiply(self._r))
16 | 
17 |     def predict_proba(self, x):
18 |         # Verify that model has been fit
19 |         check_is_fitted(self, ['_r', '_clf'])
20 |         return self._clf.predict_proba(x.multiply(self._r))
21 | 
22 |     def fit(self, x, y):
23 |         # Check that X and y have correct shape (y is expected to be a pandas Series)
24 |         y = y.values
25 |         x, y = check_X_y(x, y, accept_sparse=True)
26 | 
27 |         def pr(x, y_i, y):
28 |             p = x[y==y_i].sum(0)
29 |             return (p+1) / ((y==y_i).sum()+1)
30 | 
31 |         self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
32 |         x_nb = x.multiply(self._r)
33 |         self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
34 |         return self
35 | 
36 | #EXAMPLE USAGE
37 | #model = NbSvmClassifier(C=4, dual=True, n_jobs=-1).fit(training_features, training_labels)
38 | 
39 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | keras
2 | tensorflow-gpu
3 | splitter
4 | gensim
5 | nltk
6 | jupyter
7 | pyenchant
--------------------------------------------------------------------------------
/super_nbsvm.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "metadata": {
5 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
6 | "collapsed": true,
7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
8 | "trusted": false
9 | },
10 | "cell_type": "code",
11 | "source": "# Inspiration 1: https://www.kaggle.com/tunguz/logistic-regression-with-words-and-char-n-grams/code\n# Inspiration 2: https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline\n\nimport pandas as pd\nimport numpy as np\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\nimport re, string\nimport time\nfrom scipy.sparse import hstack\nfrom scipy.special import logit, expit\n\nfrom sklearn.model_selection import StratifiedKFold\nfrom sklearn.metrics import roc_auc_score",
12 | "execution_count": null,
13 | "outputs": []
14 | },
15 | {
16 | "metadata": {
17 | "_uuid": "7a9def5093420ee0cb33e3c4dbdb284af0e763b9",
18 | "collapsed": true,
19 | "_cell_guid": "890ecc79-0b1e-422a-beab-c20b5b588ae4",
20 | "trusted": false
21 | },
22 | "cell_type": "code",
23 | "source": "# Functions\ndef tokenize(s): return re_tok.sub(r' \\1 ', s).split()\n\n\ndef pr(y_i, y, x):\n    p = x[y==y_i].sum(0)\n    return (p+1) / ((y==y_i).sum()+1)\n\n\ndef get_mdl(y,x, c0 = 4):\n    y = y.values\n    r = np.log(pr(1,y,x) / pr(0,y,x))\n    m = LogisticRegression(C= c0, dual=True)\n    x_nb = x.multiply(r)\n    return m.fit(x_nb, y), r\n\n\ndef multi_roc_auc_score(y_true, y_pred):\n    assert y_true.shape == y_pred.shape\n    columns = y_true.shape[1]\n    column_losses = []\n    for i in range(0, columns):\n
column_losses.append(roc_auc_score(y_true[:, i], y_pred[:, i]))\n return np.array(column_losses).mean()", 24 | "execution_count": null, 25 | "outputs": [] 26 | }, 27 | { 28 | "metadata": { 29 | "_uuid": "648a414512e100686384f54a335f279ffb60dc25", 30 | "collapsed": true, 31 | "_cell_guid": "2e91fb5c-aa54-4509-adf1-02100d4d59e3", 32 | "trusted": false 33 | }, 34 | "cell_type": "code", 35 | "source": "model_type = 'lrchar'\ntodate = time.strftime(\"%d%m\")", 36 | "execution_count": null, 37 | "outputs": [] 38 | }, 39 | { 40 | "metadata": { 41 | "_uuid": "1ce08d3aa6e6582ae66286a7bb870c23c133ca86", 42 | "_cell_guid": "10154f60-38e2-4ba2-ac3f-337cdabcc677" 43 | }, 44 | "cell_type": "markdown", 45 | "source": "# Data" 46 | }, 47 | { 48 | "metadata": { 49 | "_uuid": "a81bbe05d6bb731e198b6f3c753620532be4d600", 50 | "collapsed": true, 51 | "_cell_guid": "ad29a64e-d548-4b14-8c19-5a5adbab3e74", 52 | "trusted": false 53 | }, 54 | "cell_type": "code", 55 | "source": "# read data\ntrain = pd.read_csv('../input/train.csv')\ntest = pd.read_csv('../input/test.csv')\nsubm = pd.read_csv('../input/sample_submission.csv')\n\nid_train = train['id'].copy()\nid_test = test['id'].copy()\n\n# add empty label for None\nlabel_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\ntrain['none'] = 1-train[label_cols].max(axis=1)\n# fill missing values\nCOMMENT = 'comment_text'\ntrain[COMMENT].fillna(\"unknown\", inplace=True)\ntest[COMMENT].fillna(\"unknown\", inplace=True)", 56 | "execution_count": null, 57 | "outputs": [] 58 | }, 59 | { 60 | "metadata": { 61 | "_uuid": "375e268097baecafa125fc7fe8c879d40a25efa8", 62 | "collapsed": true, 63 | "_cell_guid": "f2b78b34-b627-4788-a0f1-2571eab44e3b", 64 | "trusted": false 65 | }, 66 | "cell_type": "code", 67 | "source": "# Tf-idf\n# prepare tokenizer\nre_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')\n\n# create sparse matrices\nn = train.shape[0]\n#vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize, min_df=3, max_df=0.9, strip_accents='unicode',\n# use_idf=1, smooth_idf=1, sublinear_tf=1 )\n\nword_vectorizer = TfidfVectorizer(\n tokenizer=tokenize,\n sublinear_tf=True,\n strip_accents='unicode',\n analyzer='word', \n min_df = 5,\n token_pattern=r'\\w{1,}',\n ngram_range=(1, 3))\n# ,\n# max_features=250000)\n\nall1 = pd.concat([train[COMMENT], test[COMMENT]])\nword_vectorizer.fit(all1)\nxtrain1 = word_vectorizer.transform(train[COMMENT])\nxtest1 = word_vectorizer.transform(test[COMMENT])\n\nchar_vectorizer = TfidfVectorizer(\n sublinear_tf=True,\n strip_accents='unicode',\n analyzer='char',\n min_df = 3,\n ngram_range=(1, 6))\n# ,\n# max_features=250000)\n\nall1 = pd.concat([train[COMMENT], test[COMMENT]])\nchar_vectorizer.fit(all1)\n\nxtrain2 = char_vectorizer.transform(train[COMMENT])\nxtest2 = char_vectorizer.transform(test[COMMENT])\n", 68 | "execution_count": null, 69 | "outputs": [] 70 | }, 71 | { 72 | "metadata": { 73 | "_uuid": "bfdb92b679fed1e718a8a3b6e4d61caa8f7aa2ee", 74 | "_cell_guid": "f948babf-3396-4aa9-91c8-adde0c62ff0a" 75 | }, 76 | "cell_type": "markdown", 77 | "source": "# Model" 78 | }, 79 | { 80 | "metadata": { 81 | "_uuid": "fc94a43d0a5a0613d3bdbce0c15c25454573eac6", 82 | "collapsed": true, 83 | "_cell_guid": "17517658-06fa-4437-ab4d-9e0d011e7753", 84 | "trusted": false 85 | }, 86 | "cell_type": "code", 87 | "source": "nfolds = 5\nxseed = 29\ncval = 4\n\n# data setup\nxtrain = hstack([xtrain1, xtrain2], format='csr')\nxtest = hstack([xtest1,xtest2], format='csr')\nytrain = 
np.array(train[label_cols].copy())\n\n# stratified split\nskf = StratifiedKFold(n_splits= nfolds, random_state= xseed)\n\n# storage structures for prval / prfull\npredval = np.zeros((xtrain.shape[0], len(label_cols)))\npredfull = np.zeros((xtest.shape[0], len(label_cols)))\nscoremat = np.zeros((nfolds,len(label_cols) ))\nscore_vec = np.zeros((len(label_cols),1))",
88 | "execution_count": null,
89 | "outputs": []
90 | },
91 | {
92 | "metadata": {
93 | "_uuid": "c6b3c62063bda4ed1a8a6b8845454278766929fd",
94 | "collapsed": true,
95 | "_cell_guid": "a8d92c33-64f8-4302-bf49-e38712fd6b8f",
96 | "trusted": false
97 | },
98 | "cell_type": "code",
99 | "source": "for (lab_ind,lab) in enumerate(label_cols): \n    y = train[lab].copy()\n    print('label:' + str(lab_ind))\n    for (f, (train_index, test_index)) in enumerate(skf.split(xtrain, y)):\n        # split \n        x0, x1 = xtrain[train_index], xtrain[test_index]\n        y0, y1 = y[train_index], y[test_index]    \n        # fit model for prval\n        m,r = get_mdl(y0,x0, c0 = cval)\n        predval[test_index,lab_ind] = m.predict_proba(x1.multiply(r))[:,1]\n        scoremat[f,lab_ind] = roc_auc_score(y1,predval[test_index,lab_ind])\n        # fit model full\n        m,r = get_mdl(y,xtrain, c0 = cval)\n        predfull[:,lab_ind] += m.predict_proba(xtest.multiply(r))[:,1]\n        print('fit:'+ str(lab) + ' fold:' + str(f) + ' score:%.6f' %(scoremat[f,lab_ind]))\n#    break\npredfull /= nfolds ",
100 | "execution_count": null,
101 | "outputs": []
102 | },
103 | {
104 | "metadata": {
105 | "_uuid": "b14075b625915e7dc8d6b8eff44c79d4b075065d",
106 | "collapsed": true,
107 | "_cell_guid": "0a45a046-7e09-40df-b9a1-116397cf4d09",
108 | "trusted": false
109 | },
110 | "cell_type": "code",
111 | "source": "score_vec = np.zeros((len(label_cols),1))\nfor ii in range(len(label_cols)):\n    score_vec[ii] = roc_auc_score(ytrain[:,ii], predval[:,ii])\nprint(score_vec.mean())\nprint(multi_roc_auc_score(ytrain, predval))",
112 | "execution_count": null,
113 | "outputs": []
114 | },
115 | {
116 | "metadata": {
117 | "_uuid": "586123023ff44c843d2c765475b344a1d5be5922",
118 | "_cell_guid": "0b575f63-b40d-448a-8542-e4d753bd7d10"
119 | },
120 | "cell_type": "markdown",
121 | "source": "# Store results"
122 | },
123 | {
124 | "metadata": {
125 | "_uuid": "1eaae93a1bd8569eeefc61c5b7207cccb526db2f",
126 | "collapsed": true,
127 | "_cell_guid": "65f0fee6-bad7-4eef-b5ca-0f3b96cad666",
128 | "trusted": false
129 | },
130 | "cell_type": "code",
131 | "source": "# store prval\nprval = pd.DataFrame(predval)\nprval.columns = label_cols\nprval['id'] = id_train\nprval.to_csv('prval_'+model_type+'x'+str(cval)+'f'+str(nfolds)+'_'+todate+'.csv', index= False)\n\n# store prfull\nprfull = pd.DataFrame(predfull)\nprfull.columns = label_cols\nprfull['id'] = id_test\nprfull.to_csv('prfull_'+model_type+'x'+str(cval)+'f'+str(nfolds)+'_'+todate+'.csv', index= False)\n\n# store submission\nsubmid = pd.DataFrame({'id': subm[\"id\"]})\nsubmission = pd.concat([submid, pd.DataFrame(prfull, columns = label_cols)], axis=1)\nsubmission.to_csv('sub_'+model_type+'x'+str(cval)+'f'+str(nfolds)+'_'+todate+'.csv', index= False)",
132 | "execution_count": null,
133 | "outputs": []
134 | },
135 | {
136 | "metadata": {
137 | "_uuid": "a7f182e6d38bf01d2ab905c839ef5fe635c89412",
138 | "_cell_guid": "192daff3-4d61-4c9b-9ca9-a7c2840947a3"
139 | },
140 | "cell_type": "markdown",
141 | "source": ""
142 | },
143 | {
144 | "metadata": {
145 | "_uuid": "47fcbff298a283564216632bac24f1bfba81c28b",
146 | "_cell_guid": "d353d366-049e-4b6a-b13f-261d8f1852bc"
147 | },
148 | "cell_type": "markdown",
149 | 
"source": "" 150 | }, 151 | { 152 | "metadata": { 153 | "_uuid": "b3612a10b1146a4bbab18d465d2cc52e7ec1cfd1", 154 | "_cell_guid": "e4773030-4e38-4345-90fe-0187175e70fa" 155 | }, 156 | "cell_type": "markdown", 157 | "source": "" 158 | }, 159 | { 160 | "metadata": { 161 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a", 162 | "collapsed": true, 163 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0" 164 | }, 165 | "cell_type": "markdown", 166 | "source": "" 167 | }, 168 | { 169 | "metadata": { 170 | "_uuid": "5b73a04728cd61f0aefb38eca18b1d50116d8630", 171 | "_cell_guid": "102a72e7-ed1f-43db-87b7-4aa00d1899d5" 172 | }, 173 | "cell_type": "markdown", 174 | "source": "" 175 | }, 176 | { 177 | "metadata": { 178 | "_uuid": "5a86027bcd41e4c5898e284cf32afb3f722df3e1", 179 | "_cell_guid": "a9c3c701-b750-4ae2-a9e6-7b8a3c68096c" 180 | }, 181 | "cell_type": "markdown", 182 | "source": "" 183 | } 184 | ], 185 | "metadata": { 186 | "kernelspec": { 187 | "display_name": "Python 3", 188 | "language": "python", 189 | "name": "python3" 190 | }, 191 | "language_info": { 192 | "codemirror_mode": { 193 | "name": "ipython", 194 | "version": 3 195 | }, 196 | "name": "python", 197 | "mimetype": "text/x-python", 198 | "nbconvert_exporter": "python", 199 | "file_extension": ".py", 200 | "pygments_lexer": "ipython3", 201 | "version": "3.6.4" 202 | } 203 | }, 204 | "nbformat": 4, 205 | "nbformat_minor": 1 206 | } -------------------------------------------------------------------------------- /translate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from preprocessing import split_train_data, translate_data" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": { 16 | "scrolled": true 17 | }, 18 | "outputs": [ 19 | { 20 | "name": "stdout", 21 | "output_type": "stream", 22 | "text": [ 23 | "Requirement already satisfied: translation in /home/stgc/anaconda2/lib/python2.7/site-packages\r\n", 24 | "Requirement already satisfied: requests in /home/stgc/anaconda2/lib/python2.7/site-packages (from translation)\r\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "!pip install translation" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "translate_data(\"data\", filename='sp_check_train.csv', filename_translated='train_translated_sp.csv')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "translate_data(\"data\", filename='sp_check_test.csv', filename_translated='test_translated_sp.csv')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [] 56 | } 57 | ], 58 | "metadata": { 59 | "kernelspec": { 60 | "display_name": "Python 2", 61 | "language": "python", 62 | "name": "python2" 63 | }, 64 | "language_info": { 65 | "codemirror_mode": { 66 | "name": "ipython", 67 | "version": 2 68 | }, 69 | "file_extension": ".py", 70 | "mimetype": "text/x-python", 71 | "name": "python", 72 | "nbconvert_exporter": "python", 73 | "pygments_lexer": "ipython2", 74 | "version": "2.7.14" 75 | } 76 | }, 77 | "nbformat": 4, 78 | "nbformat_minor": 2 79 | } 80 | -------------------------------------------------------------------------------- /visuals.py: 
--------------------------------------------------------------------------------
1 | ###########################################
2 | # Suppress matplotlib user warnings
3 | # Necessary for newer version of matplotlib
4 | import warnings
5 | warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib")
6 | #
7 | # Display inline matplotlib plots with IPython
8 | from IPython import get_ipython
9 | get_ipython().run_line_magic('matplotlib', 'inline')
10 | ###########################################
11 | 
12 | import matplotlib.pyplot as pl
13 | import matplotlib.patches as mpatches
14 | import numpy as np
15 | import pandas as pd
16 | from time import time
17 | from sklearn.metrics import f1_score, accuracy_score
18 | 
19 | 
20 | def distribution(data, transformed = False):
21 |     """
22 |     Visualization code for displaying skewed distributions of features
23 |     """
24 | 
25 |     # Create figure
26 |     fig = pl.figure(figsize = (11,5));
27 | 
28 |     # Skewed feature plotting
29 |     for i, feature in enumerate(['capitals','num_unique_words']):
30 |         ax = fig.add_subplot(1, 2, i+1)
31 |         ax.hist(data[feature], bins = 25, color = '#00A0A0')
32 |         ax.set_title("'%s' Feature Distribution"%(feature), fontsize = 14)
33 |         ax.set_xlabel("Value")
34 |         ax.set_ylabel("Number of Records")
35 |         ax.set_ylim((0, 2000))
36 |         ax.set_yticks([0, 500, 1000, 1500, 2000])
37 |         ax.set_yticklabels([0, 500, 1000, 1500, ">2000"])
38 | 
39 |     # Plot aesthetics
40 |     if transformed:
41 |         fig.suptitle("Log-transformed Distributions of Continuous Comment Features", \
42 |             fontsize = 16, y = 1.03)
43 |     else:
44 |         fig.suptitle("Skewed Distributions of Continuous Comment Features", \
45 |             fontsize = 16, y = 1.03)
46 | 
47 |     fig.tight_layout()
48 |     fig.show()
49 | 
50 | 
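# Example (illustrative values only) of the nested dict that evaluate() below
# expects -- it indexes results[learner][i][metric], with one entry per learner
# (up to the three plot colors defined in the function):
#
#     results = {
#         'LogisticRegression': {
#             0: {'train_time': 1.3, 'acc_train': 0.91, 'f_train': 0.79,
#                 'pred_time': 0.02, 'acc_test': 0.90, 'f_test': 0.76},
#             1: {...},   # trained on 10% of the data
#             2: {...},   # trained on 100% of the data
#         },
#     }
#
# The keys 0/1/2 correspond to the 1%/10%/100% training-set sizes used as
# x-tick labels in the plot.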
51 | def evaluate(results, accuracy, f1):
52 |     """
53 |     Visualization code to display results of various learners.
54 | 
55 |     inputs:
56 |       - results: a dictionary of dictionaries with the statistic results from 'train_predict()',
57 |         keyed by learner name and by index 0/1/2 for the 1%/10%/100% training-set sizes
58 |       - accuracy: The accuracy score for the naive predictor
59 |       - f1: The F-score for the naive predictor
60 |     """
61 | 
62 |     # Create figure
63 |     fig, ax = pl.subplots(2, 3, figsize = (11,7))
64 | 
65 |     # Constants
66 |     bar_width = 0.3
67 |     colors = ['#A00000','#00A0A0','#00A000']
68 | 
69 |     # Super loop to plot six panels of data
70 |     for k, learner in enumerate(results.keys()):
71 |         for j, metric in enumerate(['train_time', 'acc_train', 'f_train', 'pred_time', 'acc_test', 'f_test']):
72 |             for i in np.arange(3):
73 | 
74 |                 # Creative plot code
75 |                 ax[j//3, j%3].bar(i+k*bar_width, results[learner][i][metric], width = bar_width, color = colors[k])
76 |                 ax[j//3, j%3].set_xticks([0.45, 1.45, 2.45])
77 |                 ax[j//3, j%3].set_xticklabels(["1%", "10%", "100%"])
78 |                 ax[j//3, j%3].set_xlabel("Training Set Size")
79 |                 ax[j//3, j%3].set_xlim((-0.1, 3.0))
80 | 
81 |     # Add unique y-labels
82 |     ax[0, 0].set_ylabel("Time (in seconds)")
83 |     ax[0, 1].set_ylabel("Accuracy Score")
84 |     ax[0, 2].set_ylabel("F-score")
85 |     ax[1, 0].set_ylabel("Time (in seconds)")
86 |     ax[1, 1].set_ylabel("Accuracy Score")
87 |     ax[1, 2].set_ylabel("F-score")
88 | 
89 |     # Add titles
90 |     ax[0, 0].set_title("Model Training")
91 |     ax[0, 1].set_title("Accuracy Score on Training Subset")
92 |     ax[0, 2].set_title("F-score on Training Subset")
93 |     ax[1, 0].set_title("Model Predicting")
94 |     ax[1, 1].set_title("Accuracy Score on Testing Set")
95 |     ax[1, 2].set_title("F-score on Testing Set")
96 | 
97 |     # Add horizontal lines for naive predictors
98 |     ax[0, 1].axhline(y = accuracy, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
99 |     ax[1, 1].axhline(y = accuracy, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
100 |     ax[0, 2].axhline(y = f1, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
101 |     ax[1, 2].axhline(y = f1, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
102 | 
103 |     # Set y-limits for score panels
104 |     ax[0, 1].set_ylim((0, 1))
105 |     ax[0, 2].set_ylim((0, 1))
106 |     ax[1, 1].set_ylim((0, 1))
107 |     ax[1, 2].set_ylim((0, 1))
108 | 
109 |     # Create patches for the legend
110 |     patches = []
111 |     for i, learner in enumerate(results.keys()):
112 |         patches.append(mpatches.Patch(color = colors[i], label = learner))
113 |     pl.legend(handles = patches, bbox_to_anchor = (-.80, 2.53), \
114 |         loc = 'upper center', borderaxespad = 0., ncol = 3, fontsize = 'x-large')
115 | 
116 |     # Aesthetics
117 |     pl.suptitle("Performance Metrics for Three Supervised Learning Models", fontsize = 16, y = 1.10)
118 |     pl.tight_layout()
119 |     pl.show()
120 | 
121 | 
122 | def feature_plot(importances, X_train, y_train):
123 | 
124 |     # Display the five most important features
125 |     indices = np.argsort(importances)[::-1]
126 |     columns = X_train.columns.values[indices[:5]]
127 |     values = importances[indices][:5]
128 | 
129 |     # Create the plot
130 |     fig = pl.figure(figsize = (9,5))
131 |     pl.title("Normalized Weights for First Five Most Predictive Features", fontsize = 16)
132 |     pl.bar(np.arange(5), values, width = 0.6, align="center", color = '#00A000', \
133 |         label = "Feature Weight")
134 |     pl.bar(np.arange(5) - 0.3, np.cumsum(values), width = 0.2, align = "center", color = '#00A0A0', \
135 |         label = "Cumulative Feature Weight")
136 |     pl.xticks(np.arange(5), columns)
137 |     pl.xlim((-0.5, 4.5))
138 |     pl.ylabel("Weight", fontsize = 
12) 139 | pl.xlabel("Feature", fontsize = 12) 140 | 141 | pl.legend(loc = 'upper center') 142 | pl.tight_layout() 143 | pl.show() 144 | --------------------------------------------------------------------------------
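Finally, nbsvm.py only hints at usage in a trailing comment. Below is a self-contained sketch of how NbSvmClassifier is typically wired to TF-IDF features, one binary model per label. It is an illustration, not part of the original pipeline: the file paths and column names follow the conventions of get_data.sh and the notebooks above and may need adjusting, and dual=True with C=4 mirrors the repository's own example, which assumes the liblinear solver used by older scikit-learn versions.

# Illustrative usage sketch; not part of the repository.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nbsvm import NbSvmClassifier   # assumes this runs from the repository root

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train = pd.read_csv('final_train.csv')   # paths follow get_data.sh; adjust as needed
test = pd.read_csv('test.csv')

# Word-level TF-IDF features shared by all six labels
vec = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9,
                      strip_accents='unicode', sublinear_tf=True)
xtrain = vec.fit_transform(train['comment_text'].fillna('unknown'))
xtest = vec.transform(test['comment_text'].fillna('unknown'))

# One NB-SVM per label; predict_proba()[:, 1] is the probability of the label
preds = pd.DataFrame({'id': test['id']})
for label in label_cols:
    model = NbSvmClassifier(C=4, dual=True, n_jobs=-1).fit(xtrain, train[label])
    preds[label] = model.predict_proba(xtest)[:, 1]

preds.to_csv('nbsvm_submission.csv', index=False)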