├── EDA.ipynb ├── EDA_profiler.ipynb ├── Final_Data_Prediction_Model_SG.py ├── GMM - Clusters 6 - Normalized input.ipynb ├── GMM - Clusters 6 - Normalized using mean (remove 30 days variable).ipynb ├── GMM - Cluster number selection.ipynb ├── GMM - trials.ipynb ├── README.md ├── basic_io ├── eda_helpers.py ├── encoder_code.py ├── encoding.py ├── linux.txt ├── models_run.py └── regression_model_basics /EDA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import csv\n", 12 | "pd.options.display.max_rows = 4000\n", 13 | "pd.set_option('display.float_format', lambda x: '%.2f' % x)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# Read files from the directory (one by one)\n", 23 | "import glob\n", 24 | "files1 = glob.glob('Z:\\Payment Arrangement Recommender Files\\*.txt')\n", 25 | "files2 = glob.glob('Z:\\Treatment Files\\*.txt')\n", 26 | "files3 = glob.glob('Z:\\WLS Dec 2017\\*.txt')\n", 27 | "files4 = glob.glob('Z:\\WLS Jul 2016\\*.txt')\n", 28 | "files5 = glob.glob('Z:\\WLS Jun 2017\\*.txt')\n", 29 | "files6 = glob.glob('Z:\\WLS Mar 2017\\*.txt')\n", 30 | "files7 = glob.glob('Z:\\WLS Oct 2017\\*.txt')\n", 31 | "files8 = glob.glob('Z:\\WLS Sep 2017\\*.txt')" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "files = files1+ files2+ files3+ files4 + files5 + files6 + files7 + files8\n" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "import glob\n", 50 | "files1_ = glob.glob('Z:\\WLS Oct 2017\\Oct_TRT\\*.txt')\n", 51 | "files2_ = glob.glob('Z:\\WLS Oct 2017\\Oct_PAR\\*.txt')\n", 52 | "files_ = files1_ + files2_" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 5, 58 | "metadata": { 59 | "scrolled": true 60 | }, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "Z:\\WLS Dec 2017\\WLS AR STRATA DEC 2017 RES.txt\n", 67 | "Z:\\WLS Dec 2017\\WLS COHORT DEC 2017 RES.txt\n" 68 | ] 69 | }, 70 | { 71 | "name": "stderr", 72 | "output_type": "stream", 73 | "text": [ 74 | "C:\\Users\\sg641p\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2785: DtypeWarning: Columns (8,9) have mixed types. Specify dtype option on import or set low_memory=False.\n", 75 | " interactivity=interactivity, compiler=compiler, result=result)\n" 76 | ] 77 | }, 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "Z:\\WLS Dec 2017\\WLS DD STRATA DEC 2017 data.txt\n", 83 | "Z:\\WLS Dec 2017\\WLS PAY DEC 2017 RES.txt\n", 84 | "Z:\\WLS Dec 2017\\WLS PERFORMANCE DEC 2017 RES.txt\n", 85 | "Z:\\WLS Dec 2017\\WLS Treatment Dec 2017 RES.txt\n", 86 | "Z:\\WLS Dec 2017\\WLS_PA_DEC_2017.txt\n" 87 | ] 88 | }, 89 | { 90 | "name": "stderr", 91 | "output_type": "stream", 92 | "text": [ 93 | "C:\\Users\\sg641p\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2785: DtypeWarning: Columns (35) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", 94 | "  interactivity=interactivity, compiler=compiler, result=result)\n" 95 | ] 96 | }, 97 | { 98 | "ename": "MemoryError", 99 | "evalue": "", 100 | "output_type": "error", 101 | "traceback": [
 102 | "---------------------------------------------------------------------------",
 103 | "MemoryError                               Traceback (most recent call last)",
 104 | "<ipython-input> in <module>()\n      6     cols = df.columns.tolist()\n----> 7     sample_value = df.values[0].tolist()\n      8     is_null = df.isnull().any().tolist()",
 105 | "~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\generic.py in values(self)\n-> 4633     return self._data.as_array(transpose=self._AXIS_REVERSED)",
 106 | "~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in as_array(self, transpose, items)\n-> 3949     arr = mgr._interleave()",
 107 | "~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in _interleave(self)\n-> 3978     result[rl.indexer] = blk.get_values(dtype)",
 108 | "~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in get_values(self, dtype)\n--> 219     return self.values.astype(object)",
 109 | "MemoryError: " 110 | ] 111 | } 112 | ], 113 | "source": [
 114 | "for f in files:\n",
 115 | "    print(f)\n",
 116 | "    df = pd.read_csv(f, sep = '|')\n",
 117 | "    \n",
 118 | "    # Populate columns\n",
 119 | "    cols = df.columns.tolist()\n",
 120 | "    sample_value = df.values[0].tolist()\n",
 121 | "    is_null = df.isnull().any().tolist()\n",
 122 | "    num_nulls = df.isnull().sum(axis = 0)\n",
 123 | "    per_nulls = df.isnull().sum(axis = 0)*100/len(df)\n",
 124 | "    rows = zip(cols,sample_value,is_null,num_nulls,per_nulls)\n",
 125 | "    \n",
 126 | "    header = ['Column Name', 'Sample Values', 'Is null?', '# of Nulls', '% of Nulls'] \n",
 127 | "    start = '\\\\' \n",
 128 | "    end = '.'\n",
 129 | "\n",
 130 | "    filename = 
(f.split(start))[2].split(end)[0]\n", 131 | " \n", 132 | " newfilePath = '../../3.analysis/2.analysis/EDA/' + filename + '_EDA.csv'\n", 133 | " with open(newfilePath, \"w\") as file:\n", 134 | " writer = csv.writer(file)\n", 135 | " writer.writerow(header)\n", 136 | " for row in rows:\n", 137 | " writer.writerow(row)\n", 138 | " del(df)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "Z:\\WLS Oct 2017\\Oct_TRT\\WLS Treatment Oct 2017 RES HIST.txt\n", 151 | "Z:\\WLS Oct 2017\\Oct_TRT\\WLS Treatment Oct 2017 RES.txt\n", 152 | "Z:\\WLS Oct 2017\\Oct_TRT\\WLS Treatment Oct 2017 RES_Updated.txt\n", 153 | "Z:\\WLS Oct 2017\\Oct_TRT\\WLS_TRT_Mar_Sep RES.txt\n", 154 | "Z:\\WLS Oct 2017\\Oct_PAR\\WLS Pay Oct 2017 RES.txt\n", 155 | "Z:\\WLS Oct 2017\\Oct_PAR\\WLS_PA_Oct_Cohort_Perfromance.txt\n" 156 | ] 157 | } 158 | ], 159 | "source": [ 160 | "for f in files_:\n", 161 | " print(f)\n", 162 | " df = pd.read_csv(f, sep = '|')\n", 163 | " \n", 164 | " # Populate columns\n", 165 | " cols = df.columns.tolist()\n", 166 | " sample_value = df.values[0].tolist()\n", 167 | " is_null = df.isnull().any().tolist()\n", 168 | " num_nulls = df.isnull().sum(axis = 0)\n", 169 | " per_nulls = df.isnull().sum(axis = 0)*100/len(df)\n", 170 | " rows = zip(cols,sample_value,is_null,num_nulls,per_nulls)\n", 171 | " \n", 172 | " header = ['Column Name', 'Sample Values', 'Is null?', '# of Nulls', '% of Nulls'] \n", 173 | " start = '\\\\' \n", 174 | " end = '.'\n", 175 | "\n", 176 | " filename = (f.split(start))[3].split(end)[0]\n", 177 | " \n", 178 | " newfilePath = '../../3.analysis/2.analysis/EDA/' + filename + '_EDA.csv'\n", 179 | " with open(newfilePath, \"w\") as file:\n", 180 | " writer = csv.writer(file)\n", 181 | " writer.writerow(header)\n", 182 | " for row in rows:\n", 183 | " writer.writerow(row)\n", 184 | " del(df)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [] 193 | } 194 | ], 195 | "metadata": { 196 | "kernelspec": { 197 | "display_name": "Python 3", 198 | "language": "python", 199 | "name": "python3" 200 | }, 201 | "language_info": { 202 | "codemirror_mode": { 203 | "name": "ipython", 204 | "version": 3 205 | }, 206 | "file_extension": ".py", 207 | "mimetype": "text/x-python", 208 | "name": "python", 209 | "nbconvert_exporter": "python", 210 | "pygments_lexer": "ipython3", 211 | "version": "3.7.0" 212 | } 213 | }, 214 | "nbformat": 4, 215 | "nbformat_minor": 2 216 | } 217 | -------------------------------------------------------------------------------- /EDA_profiler.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "from collections import defaultdict\n", 14 | "import getopt, sys\n", 15 | "\n", 16 | "\n", 17 | "cth = 15\n", 18 | "sth = 60\n", 19 | "dlm = \",\"\n", 20 | "MISSING_VALUES = [\"\", \" \", \"N/A\", \"#N/A\", \"nan\"]\n", 21 | "\n", 22 | "ifile_name = \"./in_data/R4642-1-COHORT_PA_RECOMMENDER2_new.txt\"\n", 23 | "ofile_name = \"./eda_results/R4642-1-COHORT_PA_RECOMMENDER2_new_profiler.csv\"\n", 24 | "\n", 25 | "# ifile_name = \"./in_data/R4642-5-Call_Data.txt\"\n", 26 | "# ofile_name 
= \"./eda_results/R4642-5-Call_Data_profiler.csv\"\n", 27 | "\n", 28 | "try:\n", 29 | " ofile = open(ofile_name, 'w')\n", 30 | " df = pd.read_csv(ifile_name, na_values=MISSING_VALUES, sep= '|', low_memory = False)\n", 31 | "except:\n", 32 | " print('Parameter Error\\n')\n", 33 | " sys.exit(2)\n", 34 | "\n", 35 | "ofile.write(\"Input File Name,\" + ifile_name)\n", 36 | "ofile.write(\"\\nProfile File Name,\" + ofile_name)\n", 37 | "ofile.write(\"\\nNote\\nAll blanks; N/A; #N/A will be treated as missing values\")\n", 38 | "ofile.write(\"\\nAll statistics are computed on observed values\")\n", 39 | "ofile.write(\"\\nNumeric columns with not more than %d unique values will be considered as categorical\" % cth)\n", 40 | "ofile.write(\"\\nCharacter columns with more than %d unique values will be considered as string\" % sth)\n", 41 | "\n", 42 | "\n", 43 | "class Numeric:\n", 44 | " def __init__(self, series):\n", 45 | " self.valid_list = [x for x in series if ~np.isnan(x)]\n", 46 | " self.missing = len(series) - len(self.valid_list)\n", 47 | " self.observed = len(self.valid_list)\n", 48 | " self.mean = np.mean(self.valid_list)\n", 49 | " self.std = np.std(self.valid_list)\n", 50 | " self.min = np.min(self.valid_list)\n", 51 | " self.max = np.max(self.valid_list)\n", 52 | " self.p5 = np.percentile(self.valid_list, 5)\n", 53 | " self.p25 = np.percentile(self.valid_list, 25)\n", 54 | " self.p50 = np.percentile(self.valid_list, 50)\n", 55 | " self.p75 = np.percentile(self.valid_list, 75)\n", 56 | " self.p95 = np.percentile(self.valid_list, 95)\n", 57 | "\n", 58 | "\n", 59 | "class Categorical:\n", 60 | " def __init__(self, series):\n", 61 | " self.valid_list = [x for x in series if pd.notnull(x)]\n", 62 | " self.missing = len(series) - len(self.valid_list)\n", 63 | " self.observed = len(self.valid_list)\n", 64 | " self.num_categ = len(set(self.valid_list))\n", 65 | " self.cnt_categ = defaultdict(float)\n", 66 | " for each in self.valid_list:\n", 67 | " self.cnt_categ[each] += 1\n", 68 | " \n", 69 | "\n", 70 | "(rows, cols) = df.shape\n", 71 | "ofile.write(\"\\n\\nData Shape\\nRows,\" + str(rows) + \"\\nColumns,\" + str(cols))\n", 72 | "DATA_TYPE = {}\n", 73 | "for column_name in df.columns:\n", 74 | " if df[column_name].dtype == \"object\":\n", 75 | " df[column_name] = df[column_name].str.strip()\n", 76 | " if df[column_name].nunique() > sth:\n", 77 | " DATA_TYPE[column_name] = \"String/Text\"\n", 78 | " else:\n", 79 | " DATA_TYPE[column_name] = \"Categorical\"\n", 80 | " elif len([x for x in pd.unique(df[column_name].ravel()) if ~np.isnan(x)]) < cth:\n", 81 | " DATA_TYPE[column_name] = \"Categorical\"\n", 82 | " else:\n", 83 | " DATA_TYPE[column_name] = \"Numeric (int64)\" if df[column_name].dtype == 'int64' else \"Numeric (float64)\"\n", 84 | "\n", 85 | "ofile.write(\n", 86 | " \"\\n\\nNumeric variables\\nVariable, #Records, #Missing, #Observed, Mean, StdDev, Min, Max, Percentile_5, Percentile_25, Percentile_50, Percentile_75, Percentile_95\")\n", 87 | "for column_name in DATA_TYPE.keys():\n", 88 | " if DATA_TYPE[column_name][:7] == \"Numeric\":\n", 89 | " temp = Numeric(df[column_name])\n", 90 | " output = [column_name, rows, temp.missing, temp.observed, temp.mean, temp.std, temp.min, temp.max, temp.p5,\n", 91 | " temp.p25, temp.p50, temp.p75, temp.p95]\n", 92 | " ofile.write(\"\\n\" + \",\".join(map(str, output)))\n", 93 | " \n", 94 | "\n", 95 | "ofile.write(\n", 96 | " \"\\n\\nCategorical variables\\nVariable, #Records, #Missing, #Observed, #Categories, %C1, %C2, %C3, %C4, %C5, %C6, %C7, 
%C8\")\n", 97 | "for column_name in DATA_TYPE.keys():\n", 98 | " if DATA_TYPE[column_name][:11] == \"Categorical\":\n", 99 | " temp = Categorical(df[column_name])\n", 100 | " output = [column_name, rows, temp.missing, temp.observed, temp.num_categ]\n", 101 | " unord_list = []\n", 102 | " for each in temp.cnt_categ.keys():\n", 103 | " unord_list.append((each, round(temp.cnt_categ[each] / temp.observed, 4)))\n", 104 | " ord_list = sorted(unord_list, key=lambda x: x[1], reverse=True)\n", 105 | " for each in ord_list[:8]:\n", 106 | " output.append(str(each[0]) + \" # \" + str(each[1]))\n", 107 | " ofile.write(\"\\n\" + \",\".join(map(str, output)))\n", 108 | " \n", 109 | "\n", 110 | "ofile.write(\n", 111 | " \"\\n\\nString/Text variables\\nVariable, #Records, #Missing, #Observed, #Categories\") # , %C1, %C2, %C3, %C4, %C5, %C6, %C7, %C8\")\n", 112 | "for column_name in DATA_TYPE.keys():\n", 113 | " if DATA_TYPE[column_name][:11] == \"String/Text\":\n", 114 | " temp = Categorical(df[column_name])\n", 115 | " output = [column_name, rows, temp.missing, temp.observed, temp.num_categ]\n", 116 | " ofile.write(\"\\n\" + \",\".join(map(str, output)))\n", 117 | "\n", 118 | "ofile.close()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": true 126 | }, 127 | "outputs": [], 128 | "source": [] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": "Python 3", 134 | "language": "python", 135 | "name": "python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.6.0" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 2 152 | } 153 | -------------------------------------------------------------------------------- /Final_Data_Prediction_Model_SG.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import seaborn as seabornInstance 11 | from sklearn.model_selection import train_test_split 12 | from sklearn.linear_model import LinearRegression 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.preprocessing import LabelEncoder 15 | from sklearn import metrics 16 | from sklearn.tree import DecisionTreeRegressor 17 | from sklearn.ensemble import RandomForestClassifier 18 | from sklearn.preprocessing import OneHotEncoder 19 | from sklearn.model_selection import train_test_split 20 | from sklearn.metrics import r2_score,mean_squared_error 21 | 22 | import pickle 23 | get_ipython().run_line_magic('matplotlib', 'inline') 24 | 25 | 26 | # In[2]: 27 | 28 | 29 | # Read dataset 30 | Train_data_1 = pd.read_csv("new_123_789.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 31 | Train_data_2 = pd.read_csv("Train_Data2.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 32 | Train_data_3 = pd.read_csv("Train_data3.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 33 | Train_data_4 = pd.read_csv("Train_data4.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 34 | dataset = pd.concat([Train_data_1, Train_data_2, Train_data_3, Train_data_4], ignore_index = True) 35 | 36 | 37 | # In[3]: 38 
| 39 | 40 | # Split train, test and validation 41 | target ='ELGBL_EXPNS_AMT' 42 | X = dataset.drop(columns = target) 43 | y = dataset[target] 44 | 45 | 46 | # In[6]: 47 | 48 | 49 | same_value_columns = same_values(X, 0.975) 50 | 51 | 52 | # In[7]: 53 | 54 | 55 | same_value_columns 56 | 57 | 58 | # In[8]: 59 | 60 | 61 | X['MDFR_1_CD'].value_counts() 62 | 63 | 64 | # In[ ]: 65 | 66 | 67 | 68 | 69 | 70 | # In[47]: 71 | 72 | 73 | def pre_process_data(df): 74 | #Columns to remove based on business logic 75 | #BILLD_CHRGD_AMT 76 | columns_to_remove = ['PAYMNT_AMT', 'NOT_CVRD_AMT', 'BSIC_CPAYMNT_AMT', 'MM_CPAYMNT_AMT', 77 | 'MM_DDCTBL_AMT', 'CPAYMNT_AMT', 'CPAYMNT_TYPE_AMT','BSIC_DDCTBL_AMT', 'PN_ID', 'PN_VRTN_ID', 78 | 'MEM_RESP', 'AUTO_ADUJ', 'COB_SGMNT_CNT', 'MEDCR_CNT', 'DTL_SGMNT_CNT', 'EOB_DNL_CD'] 79 | exception_cols = ['BILLD_CHRGD_AMT'] 80 | df.drop(columns = columns_to_remove, inplace = True) 81 | 82 | #Columns which have the same value for 97.5% of the rows 83 | same_value_columns = same_values(df, 0.975) 84 | df.drop(columns = same_value_columns, inplace = True) 85 | 86 | # Convert to int (Manual identification) 87 | df['TOTL_UNITS_PRCD_CNT'] = df['TOTL_UNITS_PRCD_CNT'].astype('float64') 88 | 89 | # Create variables and convert to string 90 | df['yr'] = pd.DatetimeIndex(df['SRVC_THRU_DT']).year.astype('str') 91 | df['mnth'] = pd.DatetimeIndex(df['SRVC_THRU_DT']).month.astype('str') 92 | df['day_of_week'] = pd.DatetimeIndex(df['SRVC_THRU_DT']).dayofweek.astype('str') 93 | # Drop date variables 94 | df.drop(columns = ['SRVC_THRU_DT','SRVC_FROM_DT'], inplace = True) 95 | 96 | # String columns with less than or equal to 15 unique values (for OHE) 97 | unique_cols = unique_counts(df,15) 98 | 99 | # Convert to OHE 100 | df = ohe(df,unique_cols,exception_cols) 101 | 102 | # Columns which are highly correlated above a certain threshold. 
(manually identify one variable to keep) 103 | columns_highly_correlated = correlation(df,0.85) 104 | df.drop(columns = columns_highly_correlated, inplace = True) 105 | 106 | return (df) 107 | 108 | 109 | # In[48]: 110 | 111 | 112 | X_ = pre_process_data(X) 113 | 114 | 115 | # In[50]: 116 | 117 | 118 | train_X, test_X_, train_Y, test_Y_ = train_test_split(X_, y, test_size=0.4, random_state=42) 119 | test_X, val_X, test_Y, val_Y = train_test_split(test_X_, test_Y_, test_size=0.5, random_state=42) 120 | 121 | 122 | # In[51]: 123 | 124 | 125 | from sklearn.ensemble import RandomForestRegressor 126 | regr = RandomForestRegressor(max_depth=5, random_state=42) 127 | regr.fit(train_X, train_Y) 128 | 129 | 130 | # In[88]: 131 | 132 | 133 | test_pred = regr.predict(test_X) 134 | 135 | 136 | # In[89]: 137 | 138 | 139 | mse = mean_squared_error(test_pred, c) 140 | rmse = np.sqrt(mse) 141 | print(rmse) 142 | 143 | 144 | # In[92]: 145 | 146 | 147 | regr.score(test_X,test_Y) 148 | 149 | 150 | # In[93]: 151 | 152 | 153 | model.score(test_X,test_Y) 154 | 155 | 156 | # In[83]: 157 | 158 | 159 | for i,col in enumerate(test_X.columns.values.tolist()): 160 | print(col,'{0:.4f}'.format(regr.feature_importances_[i])) 161 | 162 | 163 | # In[98]: 164 | 165 | 166 | model = DecisionTreeRegressor(max_depth=5) 167 | 168 | 169 | # In[99]: 170 | 171 | 172 | model.fit(train_X, train_Y) 173 | 174 | 175 | # In[100]: 176 | 177 | 178 | test_pred = model.predict(test_X) 179 | 180 | 181 | # In[101]: 182 | 183 | 184 | mse = mean_squared_error(test_pred, test_Y) 185 | rmse = np.sqrt(mse) 186 | print(rmse) 187 | 188 | 189 | # In[90]: 190 | 191 | 192 | for i,col in enumerate(test_X.columns.values.tolist()): 193 | print(col,'{0:.4f}'.format(model.feature_importances_[i])) 194 | 195 | 196 | # In[18]: 197 | 198 | 199 | from sklearn.metrics import roc_auc_score 200 | # Actual class predictions 201 | rf_predictions = model.predict(test_X) 202 | # Probabilities for each class 203 | rf_probs = model.predict_proba(test_X)[:, 1] 204 | 205 | 206 | # In[36]: 207 | 208 | 209 | test_pred.value_counts() 210 | 211 | 212 | # In[35]: 213 | 214 | 215 | test_Y.value_counts() 216 | 217 | 218 | # In[81]: 219 | 220 | 221 | metrics.accuracy_score(test_Y_.tolist(),test_pred) 222 | 223 | 224 | # In[ ]: 225 | 226 | 227 | 228 | 229 | 230 | # In[4]: 231 | 232 | 233 | def same_values(df, threshold): 234 | cols = [] 235 | for col in df.columns.values.tolist(): 236 | null_pct = len(df.loc[df[col].isna() == True])/len(df) 237 | if(null_pct >= threshold): 238 | cols.append(col) 239 | else: 240 | same_pct = df[col].value_counts()[0]/len(df) 241 | if(same_pct >= threshold): 242 | cols.append(col) 243 | return cols 244 | 245 | 246 | # In[38]: 247 | 248 | 249 | def unique_counts(df, threshold): 250 | unique_cols= [] 251 | for col in df.columns.values.tolist(): 252 | if (df[col].nunique() <= threshold): 253 | unique_cols.append(col) 254 | return (unique_cols) 255 | 256 | 257 | # In[12]: 258 | 259 | 260 | def correlation(df, threshold): 261 | corr = df.corr() 262 | correlations = [] 263 | for col in corr.columns.values.tolist(): 264 | for col_row in corr.index.values.tolist(): 265 | if (col != col_row): 266 | if (corr[col][col_row] >= threshold): 267 | correlations.append(col) 268 | correlations.append(col_row) 269 | return list(set(correlations)) 270 | 271 | 272 | # In[45]: 273 | 274 | 275 | def ohe(df,col_list): 276 | columns_list = df.columns.values.tolist() 277 | columns_list.remove('BILLD_CHRGD_AMT') 278 | for col in columns_list: 279 | if col in col_list: 280 | 
ohe = pd.DataFrame() 281 | if(df[col].dtypes == 'object'): 282 | ohe = pd.get_dummies(df[col]) 283 | ohe.columns = [col + "_" + ohe_col for ohe_col in ohe.columns.values.tolist()] 284 | df = df.join(ohe) 285 | df.drop(col,axis = 1, inplace = True) 286 | return df 287 | 288 | 289 | # #### BTS 290 | 291 | # In[ ]: 292 | 293 | 294 | ## Finding variables which are NULL 295 | 296 | 297 | # In[3]: 298 | 299 | 300 | ## Finding variables which are highly correlated 301 | corr = dataset.corr() 302 | for col in dataset.columns.values.tolist(): 303 | for col_row in corr.index.values.tolist(): 304 | if (col != col_row): 305 | if (corr[col][col_row] > 0.84): 306 | print(str(corr[col][col_row]), col, col_row) 307 | correlations = [] 308 | for col in corr.columns.values.tolist(): 309 | for col_row in corr.index.values.tolist(): 310 | correlation = [] 311 | if (col != col_row): 312 | if (corr[col][col_row] >= 0.8): 313 | correlation.append(col) 314 | correlation.append(col_row) 315 | correlations.append(correlation) 316 | 317 | 318 | # In[4]: 319 | 320 | 321 | ## Find string variables with less than 16 unique values 322 | for col in dataset.columns.values.tolist(): 323 | if (dataset[col].nunique() <= 15): 324 | print ("\'" + col +"\',") 325 | 326 | 327 | # In[ ]: 328 | 329 | 330 | # Convert to OHE - def 331 | 332 | 333 | # In[ ]: 334 | 335 | 336 | test_X = tes 337 | 338 | 339 | # In[5]: 340 | 341 | 342 | dataset = pd.concat([Train_data_1, Train_data_2, Train_data_3, Train_data_4], ignore_index = True) 343 | 344 | 345 | # In[6]: 346 | 347 | 348 | # Remove columns 349 | dataset.drop(columns = columns_to_remove, inplace = True) 350 | dataset.drop(columns = columns_97_5, inplace = True) 351 | 352 | 353 | # In[7]: 354 | 355 | 356 | # Convert to int 357 | dataset['TOTL_UNITS_PRCD_CNT'] = dataset['TOTL_UNITS_PRCD_CNT'].astype('float64') 358 | 359 | 360 | # In[8]: 361 | 362 | 363 | # Create variables 364 | dataset['yr'] = pd.DatetimeIndex(dataset['SRVC_THRU_DT']).year 365 | dataset['mnth'] = pd.DatetimeIndex(dataset['SRVC_THRU_DT']).month 366 | dataset['day_of_week'] = pd.DatetimeIndex(dataset['SRVC_THRU_DT']).dayofweek 367 | 368 | 369 | # In[9]: 370 | 371 | 372 | # Convert to string 373 | dataset['yr'] = dataset['yr'].astype('str') 374 | dataset['mnth'] = dataset['mnth'].astype('str') 375 | dataset['day_of_week'] = dataset['day_of_week'].astype('str') 376 | 377 | 378 | # In[10]: 379 | 380 | 381 | # Drop date variables 382 | dataset.drop(columns = ['SRVC_THRU_DT','SRVC_FROM_DT'], inplace = True) 383 | 384 | 385 | # In[11]: 386 | 387 | 388 | # Drop string variables with more than 50 values. 
Go into detail of each one and try to club them to create indicators 389 | 390 | 391 | # In[12]: 392 | 393 | 394 | columns_under_15 = ['PROV_TAX_ID','PROV_NM','PROV_STR_ADRS','ROV_ZIP_5_CD','PROV_PAYENT_LCTN_CD','MX_PRCG_VRTN_CD', 395 | 'SCRN_FRMT_CD','MIXER_PARG_IND','CLM_TYP','NUM_LINES','HCFA_PT_CD','CLM_TYPE_CD','TELEHEALTH', 396 | 'PROD_DESC','NEW_CLM_TYP','UM_RQRD_IND','CLM_PAYMNT_ACTN_1_CD','yr','mnth','day_of_week'] 397 | 398 | 399 | # In[13]: 400 | 401 | 402 | columns_highly_correlated = ['PROV_NM', 'PROV_STR_ADRS', 'ROV_ZIP_5_CD', 'PROV_PAYENT_LCTN_CD', 'CLM_TYP', 403 | 'UM_RQRD_IND', 'MX_PRCG_VRTN_CD', 'MIXER_PARG_IND', 'HCFA_PT_CD', 'TELEHEALTH'] 404 | 405 | 406 | # In[14]: 407 | 408 | 409 | dataset.drop(columns = columns_highly_correlated, inplace = True) 410 | 411 | 412 | # In[15]: 413 | 414 | 415 | train_X = dataset.drop(columns = target) 416 | train_Y = dataset[target] 417 | 418 | 419 | # In[16]: 420 | 421 | 422 | # Convert to OHE 423 | for col in train_X.columns.values.tolist(): 424 | if col in columns_under_15: 425 | ohe = pd.DataFrame() 426 | if(train_X[col].dtypes == 'object'): 427 | ohe = pd.get_dummies(train_X[col]) 428 | ohe.columns = [col + "_" + ohe_col for ohe_col in ohe.columns.values.tolist()] 429 | train_X = train_X.join(ohe) 430 | train_X.drop(col,axis = 1, inplace = True) 431 | 432 | 433 | # In[17]: 434 | 435 | 436 | dataset.shape 437 | 438 | 439 | # In[18]: 440 | 441 | 442 | # Find correlation of all columns with each other 443 | corr = dataset.corr() 444 | 445 | 446 | # In[19]: 447 | 448 | 449 | for col in dataset.columns.values.tolist(): 450 | for col_row in corr.index.values.tolist(): 451 | if (col != col_row): 452 | if (corr[col][col_row] > 0.84): 453 | print(str(corr[col][col_row]), col, col_row) 454 | 455 | 456 | # In[20]: 457 | 458 | 459 | # train without correlation 460 | 461 | 462 | # In[21]: 463 | 464 | 465 | # Find correlation and create a list of list 466 | correlations = [] 467 | for col in corr.columns.values.tolist(): 468 | for col_row in corr.index.values.tolist(): 469 | correlation = [] 470 | if (col != col_row): 471 | if (corr[col][col_row] >= 0.8): 472 | correlation.append(col) 473 | correlation.append(col_row) 474 | correlations.append(correlation) 475 | 476 | 477 | # In[19]: 478 | 479 | 480 | model = RandomForestClassifier(n_estimators=100, bootstrap = True) 481 | 482 | 483 | # In[20]: 484 | 485 | 486 | model.fit(train_X, train_Y) 487 | 488 | 489 | # In[21]: 490 | 491 | 492 | 493 | 494 | 495 | # In[ ]: 496 | 497 | 498 | from sklearn.metrics import roc_auc_score 499 | # Actual class predictions 500 | rf_predictions = model.predict(test) 501 | # Probabilities for each class 502 | rf_probs = model.predict_proba(test)[:, 1] 503 | 504 | 505 | # In[14]: 506 | 507 | 508 | Process_path = 'Train' 509 | 510 | 511 | # In[15]: 512 | 513 | 514 | if Process_path == "Train": 515 | ### Training Dataset ############ 516 | Train_data_1 = pd.read_csv("new_123_789.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 517 | Train_data_2 = pd.read_csv("Train_Data2.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 518 | Train_data_3 = pd.read_csv("Train_data3.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 519 | Train_data_4 = pd.read_csv("Train_data4.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 520 | Train_data_4 = Train_data_4[:9203] 521 | dataset = pd.concat([Train_data_1, Train_data_2, Train_data_3, Train_data_4], ignore_index = True) 522 | print(dataset.shape) 523 | 
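    # --- Added note: handle_non_numerical_data() is called a few lines below
    # (and handle_non_numerical_data_Test() in the Test path) but neither is
    # defined anywhere in this file. A hypothetical reconstruction, consistent
    # with how the encoded output is used downstream (every non-numeric column
    # mapped to small integers); the original helper may well differ.
    def handle_non_numerical_data(df):
        # With dtype='unicode' every column loads as object, so each column
        # gets its unique values enumerated and replaced by integer codes.
        df = df.copy()
        for col in df.columns:
            if df[col].dtype == 'object':
                mapping = {val: i for i, val in enumerate(pd.unique(df[col]))}
                df[col] = df[col].map(mapping)
        return df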
dataset.insert(0, 'New_ID', range(800000000, 800000000 + len(dataset))) 524 | dataset.drop(dataset[dataset['CLM_PAYMNT_ACTN_1_CD'] > 'P'].index, inplace = True) 525 | dataset1 = dataset[['New_ID',"DAIG1","PROC_CD","PRCG_ZIP_ST_CD","PROV_TAX_ID","PROV_ST_CD","PROV_SPCLTY_CD","BILLG_NPI","RNDRG_NPI","PROV_PAYENT_LCTN_CD","SRVC_FCLTY_LCTN_ID","SRVC_FCLTY_LCTN_NPI","POT_CD","PN_ID","PN_VRTN_ID","MBR_CNTRCT_CD","MBR_CVRG_PRCG_VRTN_CD","MBR_PROD_CD","HCFA_PT_CD","CLM_TYPE_CD","RNDRG_LINE_1_ADRS","RNDRG_CITY_NM","SRC_CD","PROV_RGN_CD","UM_RQRD_IND","NEW_CLM_TYP_1","CASE_NBR","ROV_ZIP_5_CD","PROV_PRC_ZIP_4_CD"]] 526 | dataset2 = dataset[["BILLD_CHRGD_AMT","ELGBL_EXPNS_AMT","TOTL_UNITS_PRCD_CNT"]] 527 | New_dataset1 = handle_non_numerical_data(dataset1) 528 | New_dataset2 = dataset2 529 | Final_Dataset = pd.concat([New_dataset1, New_dataset2], axis=1, join='inner') 530 | X_train = Final_Dataset[['New_ID',"DAIG1","PROC_CD","PRCG_ZIP_ST_CD","PROV_TAX_ID","PROV_ST_CD","PROV_SPCLTY_CD","BILLG_NPI","RNDRG_NPI","PROV_PAYENT_LCTN_CD","SRVC_FCLTY_LCTN_ID","SRVC_FCLTY_LCTN_NPI","POT_CD","PN_ID","PN_VRTN_ID","MBR_CNTRCT_CD","MBR_CVRG_PRCG_VRTN_CD","MBR_PROD_CD","HCFA_PT_CD","CLM_TYPE_CD","RNDRG_LINE_1_ADRS","RNDRG_CITY_NM","SRC_CD","PROV_RGN_CD","UM_RQRD_IND","NEW_CLM_TYP_1","CASE_NBR","ROV_ZIP_5_CD","PROV_PRC_ZIP_4_CD","BILLD_CHRGD_AMT","TOTL_UNITS_PRCD_CNT"]].values 531 | y_train = Final_Dataset["ELGBL_EXPNS_AMT"].values 532 | #### LINEAR Regression Model#### (Doestn't work with different types of data, need to change the New_dataset2 to Float) 533 | #model = LinearRegression() 534 | #model.fit(X_train, y_train) 535 | 536 | ## Descision Model ### (Getting 100% Accuracy Lower accuracy as test Dataset increses, Able to handle larger data set) 537 | #model = DecisionTreeClassifier(random_state=RSEED) 538 | #model = DecisionTreeClassifier(criterion="entropy", max_depth=25) 539 | model = DecisionTreeClassifier(criterion="entropy") 540 | #model = DecisionTreeClassifier(criterion="entropy",max_depth=25,random_state = 100,max_features = "auto", min_samples_leaf = 50) 541 | model.fit(X_train, y_train) 542 | 543 | ######## Random Forest Model ## (getting Around 90% of accuracy) 544 | ######################################### 545 | #### Create the model with 100 trees 546 | #model = RandomForestClassifier(n_estimators=100, bootstrap = True, max_features = 'sqrt') 547 | #model.fit(X_train, y_train) 548 | 549 | #######Logestic Regression Model ################ (Only getting 10% Accuracy) 550 | #model = LogisticRegression(C=0.7,random_state=42) 551 | #model.fit(X_train, y_train) 552 | 553 | #### Load Model to the drive ####### 554 | pickle.dump(model, open(filename, 'wb')) 555 | 556 | 557 | # In[16]: 558 | 559 | 560 | ### Training Dataset ############ 561 | Train_data_1 = pd.read_csv("new_123_789.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 562 | Train_data_2 = pd.read_csv("Train_Data2.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 563 | Train_data_3 = pd.read_csv("Train_data3.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 564 | Train_data_4 = pd.read_csv("Train_data4.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 565 | Train_data_4 = Train_data_4[:9203] 566 | dataset = pd.concat([Train_data_1, Train_data_2, Train_data_3, Train_data_4], ignore_index = True) 567 | print(dataset.shape) 568 | dataset.insert(0, 'New_ID', range(800000000, 800000000 + len(dataset))) 569 | dataset.drop(dataset[dataset['CLM_PAYMNT_ACTN_1_CD'] > 
'P'].index, inplace = True) 570 | dataset1 = dataset[['New_ID',"DAIG1","PROC_CD","PRCG_ZIP_ST_CD","PROV_TAX_ID","PROV_ST_CD","PROV_SPCLTY_CD","BILLG_NPI","RNDRG_NPI","PROV_PAYENT_LCTN_CD","SRVC_FCLTY_LCTN_ID","SRVC_FCLTY_LCTN_NPI","POT_CD","PN_ID","PN_VRTN_ID","MBR_CNTRCT_CD","MBR_CVRG_PRCG_VRTN_CD","MBR_PROD_CD","HCFA_PT_CD","CLM_TYPE_CD","RNDRG_LINE_1_ADRS","RNDRG_CITY_NM","SRC_CD","PROV_RGN_CD","UM_RQRD_IND","NEW_CLM_TYP_1","CASE_NBR","ROV_ZIP_5_CD","PROV_PRC_ZIP_4_CD"]] 571 | dataset2 = dataset[["BILLD_CHRGD_AMT","ELGBL_EXPNS_AMT","TOTL_UNITS_PRCD_CNT"]] 572 | New_dataset1 = handle_non_numerical_data(dataset1) 573 | New_dataset2 = dataset2 574 | Final_Dataset = pd.concat([New_dataset1, New_dataset2], axis=1, join='inner') 575 | 576 | 577 | # In[51]: 578 | 579 | 580 | columns_train = ['New_ID',"DAIG1","PROC_CD","PRCG_ZIP_ST_CD","PROV_TAX_ID","PROV_ST_CD","PROV_SPCLTY_CD","BILLG_NPI","RNDRG_NPI","PROV_PAYENT_LCTN_CD","SRVC_FCLTY_LCTN_ID","SRVC_FCLTY_LCTN_NPI","POT_CD","PN_ID","PN_VRTN_ID","MBR_CNTRCT_CD","MBR_CVRG_PRCG_VRTN_CD","MBR_PROD_CD","HCFA_PT_CD","CLM_TYPE_CD","RNDRG_LINE_1_ADRS","RNDRG_CITY_NM","SRC_CD","PROV_RGN_CD","UM_RQRD_IND","NEW_CLM_TYP_1","CASE_NBR","ROV_ZIP_5_CD","PROV_PRC_ZIP_4_CD","TOTL_UNITS_PRCD_CNT", 'BILLD_CHRGD_AMT'] 581 | 582 | 583 | # In[52]: 584 | 585 | 586 | for col in columns_train: 587 | print(col) 588 | 589 | 590 | # In[44]: 591 | 592 | 593 | X_train = Final_Dataset[columns_train] 594 | 595 | 596 | # In[41]: 597 | 598 | 599 | Final_Dataset['BILLD_CHRGD_AMT'] = Final_Dataset['BILLD_CHRGD_AMT'].astype('float') 600 | Final_Dataset['TOTL_UNITS_PRCD_CNT'] = Final_Dataset['TOTL_UNITS_PRCD_CNT'].astype('float') 601 | Final_Dataset['ELGBL_EXPNS_AMT'] = Final_Dataset['ELGBL_EXPNS_AMT'].astype('str') 602 | 603 | 604 | # In[42]: 605 | 606 | 607 | for col in X_train: 608 | # print(col,type(Final_Dataset[col][0])) 609 | print(col, Final_Dataset['ELGBL_EXPNS_AMT'].corr(Final_Dataset[col])) 610 | 611 | 612 | # In[ ]: 613 | 614 | 615 | 616 | 617 | 618 | # In[ ]: 619 | 620 | 621 | 622 | 623 | 624 | # In[46]: 625 | 626 | 627 | Final_Dataset['ELGBL_EXPNS_AMT'] = Final_Dataset['ELGBL_EXPNS_AMT'].astype('str') 628 | 629 | 630 | # In[47]: 631 | 632 | 633 | 634 | 635 | y_train = Final_Dataset["ELGBL_EXPNS_AMT"].values 636 | #### LINEAR Regression Model#### (Doestn't work with different types of data, need to change the New_dataset2 to Float) 637 | #model = LinearRegression() 638 | #model.fit(X_train, y_train) 639 | 640 | ## Descision Model ### (Getting 100% Accuracy Lower accuracy as test Dataset increses, Able to handle larger data set) 641 | #model = DecisionTreeClassifier(random_state=RSEED) 642 | #model = DecisionTreeClassifier(criterion="entropy", max_depth=25) 643 | model = DecisionTreeClassifier(criterion="entropy") 644 | #model = DecisionTreeClassifier(criterion="entropy",max_depth=25,random_state = 100,max_features = "auto", min_samples_leaf = 50) 645 | model.fit(X_train, y_train) 646 | 647 | ######## Random Forest Model ## (getting Around 90% of accuracy) 648 | ######################################### 649 | #### Create the model with 100 trees 650 | #model = RandomForestClassifier(n_estimators=100, bootstrap = True, max_features = 'sqrt') 651 | #model.fit(X_train, y_train) 652 | 653 | #######Logestic Regression Model ################ (Only getting 10% Accuracy) 654 | #model = LogisticRegression(C=0.7,random_state=42) 655 | #model.fit(X_train, y_train) 656 | 657 | #### Load Model to the drive ####### 658 | pickle.dump(model, open(filename, 'wb')) 659 | 660 | 661 | # 
In[ ]: 662 | 663 | 664 | 665 | 666 | 667 | # In[ ]: 668 | 669 | 670 | 671 | 672 | 673 | # In[48]: 674 | 675 | 676 | Process_path = 'Test' 677 | 678 | 679 | # In[49]: 680 | 681 | 682 | if Process_path == "Test": 683 | ###### Testing Dataset ############ 684 | #testset = pd.read_csv("Test_Date.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 685 | # testset = pd.read_csv("Test_Data_3.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 686 | Train_data_4 = pd.read_csv("Train_data4.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 687 | testset = Train_data_4[:500] 688 | testset.insert(0, 'New_ID', range(900000000, 900000000 + len(testset))) 689 | #testset.drop(testset[testset['CLM_PAYMNT_ACTN_1_CD'] > 'P'].index, inplace = True) 690 | testset1 = testset[['New_ID',"DAIG1","PROC_CD","PRCG_ZIP_ST_CD","PROV_TAX_ID","PROV_ST_CD","PROV_SPCLTY_CD","BILLG_NPI","RNDRG_NPI","PROV_PAYENT_LCTN_CD","SRVC_FCLTY_LCTN_ID","SRVC_FCLTY_LCTN_NPI","POT_CD","PN_ID","PN_VRTN_ID","MBR_CNTRCT_CD","MBR_CVRG_PRCG_VRTN_CD","MBR_PROD_CD","HCFA_PT_CD","CLM_TYPE_CD","RNDRG_LINE_1_ADRS","RNDRG_CITY_NM","SRC_CD","PROV_RGN_CD","UM_RQRD_IND","NEW_CLM_TYP_1","CASE_NBR","ROV_ZIP_5_CD","PROV_PRC_ZIP_4_CD"]] 691 | testset2 = testset[["BILLD_CHRGD_AMT","ELGBL_EXPNS_AMT","TOTL_UNITS_PRCD_CNT"]] 692 | #testset1 = testset[["DAIG1","PROC_CD","PROV_TAX_ID"]] 693 | New_testset1 = handle_non_numerical_data_Test(testset1) 694 | New_testset2 = testset2 695 | Final_testset = pd.concat([New_testset1, New_testset2], axis=1, join='inner') 696 | X_test = Final_testset[['New_ID',"DAIG1","PROC_CD","PRCG_ZIP_ST_CD","PROV_TAX_ID","PROV_ST_CD","PROV_SPCLTY_CD","BILLG_NPI","RNDRG_NPI","PROV_PAYENT_LCTN_CD","SRVC_FCLTY_LCTN_ID","SRVC_FCLTY_LCTN_NPI","POT_CD","PN_ID","PN_VRTN_ID","MBR_CNTRCT_CD","MBR_CVRG_PRCG_VRTN_CD","MBR_PROD_CD","HCFA_PT_CD","CLM_TYPE_CD","RNDRG_LINE_1_ADRS","RNDRG_CITY_NM","SRC_CD","PROV_RGN_CD","UM_RQRD_IND","NEW_CLM_TYP_1","CASE_NBR","ROV_ZIP_5_CD","PROV_PRC_ZIP_4_CD","TOTL_UNITS_PRCD_CNT"]].values 697 | y_test = Final_testset["ELGBL_EXPNS_AMT"].values 698 | #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.001, random_state=RSEED) 699 | #### Read Model from Drive ######## 700 | model = pickle.load(open(filename, 'rb')) 701 | 702 | #Actual Pridiction 703 | y_pred = model.predict(X_test) 704 | new_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}) 705 | print("Test Data Accuracy: ({0:.4f})".format(metrics.accuracy_score(y_test,y_pred))) 706 | ##print("Test Data Accuracy: ({0:.4f})".format(metrics.accuracy_score(y_test,y_pred))) 707 | new_df1 = pd.DataFrame({'New_ID': [i[0] for i in X_test],'Actual': y_test, 'Predicted': y_pred}) 708 | ##Final_Resultset = pd.merge(testset, new_df1, on='New_ID') 709 | Final_Resultset = pd.merge(testset, new_df1, on='New_ID') 710 | Final_Resultset1 = Final_Resultset[["New_ID","KEY_CHK_DCN_NBR","KEY_CHK_DCN_ITEM_CD","KEY_CHK_DCN_CENTRY_CD","BILLD_CHRGD_AMT","Actual","Predicted"]] 711 | #print(Final_Resultset1) 712 | 713 | Final_Resultset1.head(20) 714 | 715 | 716 | # In[ ]: 717 | 718 | 719 | 720 | 721 | 722 | # #### Misc 723 | 724 | # In[ ]: 725 | 726 | 727 | strings_vars = ['KEY_CHK_DCN_NBR','DAIG1', 'DAIG2', 'DAIG3', 'DAIG4', 'DAIG5', 'PROC_CD', 'PROV_SCNDRY_NM', 728 | 'PROV_PRC_ZIP_4_CD', 'RNDRG_NPI', 'PROV_SPCLTY_CD', 'SRVC_FCLTY_LCTN_NPI', 'MBR_CNTRCT_CD', 729 | 'MBR_CVRG_PRCG_VRTN_CD', 'MBR_PROD_CD', 'RNDRG_LINE_1_ADRS', 'RNDRG_CITY_NM', 'CLM_PAYMNT_ACTN_2_6_CD', 730 | 'CASE_NBR', 'SRVC_FROM_DT', 
'SRVC_THRU_DT'] 731 | categorical_vars = ['KEY_CHK_DCN_ITEM_CD', 'KEY_CHK_DCN_CENTRY_CD', 'MDFR_1_CD', 'MDFR_2_CD', 'MDFR_3_CD', 732 | 'PRCG_ZIP_ST_CD', 'PROV_TAX_ID', 'PROV_NM', 'PROV_STR_ADRS', 'ROV_ZIP_5_CD', 'PROV_ST_CD', 733 | 'BILLG_NPI', 'PROV_PAYENT_LCTN_CD', 'SRVC_FCLTY_LCTN_ID', 'BSIC_DDCTBL_AMT', 'POT_CD', 734 | 'MX_PRCG_VRTN_CD', 'MX_PROV_PRCG_PROD_CD', 'PN_ID', 'PN_VRTN_ID', 'SCRN_FRMT_CD', 'MIXER_PARG_IND', 735 | 'MEM_RESP', 'CLM_TYP', 'NUM_LINES', 'HCFA_PT_CD', 'CLM_TYPE_CD', 'AUTO_ADUJ', 'HCPCS_MDFR_CD', 736 | 'PAY_AUTHRZN_CD', 'COB_SGMNT_CNT', 'MEDCR_CNT', 'DTL_SGMNT_CNT', 'PROV_GROUP', 'RNDRG_LINE_2_ADRS', 737 | 'TELEHEALTH', 'SRC_CD', 'PROD_DESC', 'NEW_CLM_TYP', 'PROV_RGN_CD', 'UM_RFRL_TYPE_RQRD_IND', 738 | 'UM_RQRD_IND', 'NEW_CLM_TYP_1', 'CLM_PAYMNT_ACTN_1_CD', 'EOB_DNL_CD'] 739 | 740 | 741 | # In[ ]: 742 | 743 | 744 | def pre_process_data(df): 745 | #Columns to remove based on business logic 746 | columns_to_remove = ['BILLD_CHRGD_AMT', 'PAYMNT_AMT', 'NOT_CVRD_AMT', 'BSIC_CPAYMNT_AMT', 747 | 'MM_CPAYMNT_AMT', 'MM_DDCTBL_AMT', 'CPAYMNT_AMT', 'CPAYMNT_TYPE_AMT','BSIC_DDCTBL_AMT', 748 | 'PN_ID', 'PN_VRTN_ID', 'MEM_RESP', 'AUTO_ADUJ', 'COB_SGMNT_CNT', 'MEDCR_CNT', 'DTL_SGMNT_CNT', 749 | 'EOB_DNL_CD'] 750 | #Columns which have the same value for 97.5% of the rows 751 | same_value_columns = same_values(dataset, 0.975) 752 | df.drop(columns = columns_to_remove, inplace = True) 753 | df.drop(columns = same_value_columns, inplace = True) 754 | # Convert to int (Manual identification) 755 | df['TOTL_UNITS_PRCD_CNT'] = df['TOTL_UNITS_PRCD_CNT'].astype('float64') 756 | # Create variables 757 | df['yr'] = pd.DatetimeIndex(df['SRVC_THRU_DT']).year 758 | df['mnth'] = pd.DatetimeIndex(df['SRVC_THRU_DT']).month 759 | df['day_of_week'] = pd.DatetimeIndex(df['SRVC_THRU_DT']).dayofweek 760 | # Convert to string 761 | df['yr'] = df['yr'].astype('str') 762 | df['mnth'] = df['mnth'].astype('str') 763 | df['day_of_week'] = df['day_of_week'].astype('str') 764 | # Drop date variables 765 | df.drop(columns = ['SRVC_THRU_DT','SRVC_FROM_DT'], inplace = True) 766 | # String columns with less than 16 unique values (for OHE) 767 | unique_cols = ['PROV_TAX_ID','PROV_NM','PROV_STR_ADRS','ROV_ZIP_5_CD','PROV_PAYENT_LCTN_CD','MX_PRCG_VRTN_CD', 768 | 'SCRN_FRMT_CD','MIXER_PARG_IND','CLM_TYP','NUM_LINES','HCFA_PT_CD','CLM_TYPE_CD','TELEHEALTH', 769 | 'PROD_DESC','NEW_CLM_TYP','UM_RQRD_IND','CLM_PAYMNT_ACTN_1_CD','yr','mnth','day_of_week'] 770 | columns_highly_correlated = ['PROV_NM', 'PROV_STR_ADRS', 'ROV_ZIP_5_CD', 'PROV_PAYENT_LCTN_CD', 'CLM_TYP', 771 | 'UM_RQRD_IND', 'MX_PRCG_VRTN_CD', 'MIXER_PARG_IND', 'HCFA_PT_CD', 'TELEHEALTH'] 772 | dataset.drop(columns = columns_highly_correlated, inplace = True) 773 | ohe 774 | 775 | 776 | # #### Questions 777 | 778 | # In[37]: 779 | 780 | 781 | #. 1. Service through and from date are same for all values 782 | dataset['duration_of_treatment'] = pd.to_datetime(dataset['SRVC_THRU_DT']) - pd.to_datetime(dataset['SRVC_FROM_DT']) 783 | dataset['date_check'] = dataset['SRVC_THRU_DT']==dataset['SRVC_FROM_DT'] 784 | 785 | 786 | # In[ ]: 787 | 788 | 789 | # 2. DTL_LINE_NBR - number 01 vs 1 - is there any difference? 
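# A quick check of question 2 (assumes DTL_LINE_NBR is still present in
# dataset; with dtype='unicode' every column is read as strings, so '01'
# and '1' count as distinct values until they are parsed):
print(dataset['DTL_LINE_NBR'].nunique(),
      pd.to_numeric(dataset['DTL_LINE_NBR']).nunique())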
790 | # Note - I think this can be an integer 791 | 792 | -------------------------------------------------------------------------------- /GMM - Clusters 6 - Normalized input.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "pd.options.display.max_rows = 4000\n", 23 | "pd.set_option('display.float_format', lambda x: '%.2f' % x)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": { 30 | "collapsed": false, 31 | "scrolled": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "df = pd.read_csv('../../0.Data/1.Interim/New/cluster.csv')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 4, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "cols = ['TOT_DUE_AMT', 'Promise_%', 'A_avg', 'P_avg', 'BAN',\n", 47 | " 'acct_bhvr_scr_nbr', 'ar_bhvr_scr_nbr', 'avg_paid_full_dy_cnt',\n", 48 | " 'crdt_buru_scr_nbr', 'cust_bhvr_scr_nbr', 'cust_recls_scr_nbr',\n", 49 | " 'pmt_arng_scr_nbr', 'wirls_ln_cnt', 'excpt_ovrd_ind',\n", 50 | " 'pyarr_scr_nbr', 'lst_bhvr_scr_nbr', 'preferred_month_CALL_DT',\n", 51 | " 'preferred_day_of_monthCALL_DT', 'preferred_weekdayCALL_DT',\n", 52 | " 'return_itm_180_dy_cnt', 'return_itm_30_dy_cnt', 'Tenure',\n", 53 | " 'MOBILITY_REGION_NAME_flagCentral', 'MOBILITY_REGION_NAME_flagEast',\n", 54 | " 'MOBILITY_REGION_NAME_flagWest', 'ACCT_STS_AT_CALL_DATE_flagN',\n", 55 | " 'ACCT_STS_AT_CALL_DATE_flagO', 'ACCT_STS_AT_CALL_DATE_flagS']" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 5, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "array(['TOT_DUE_AMT', 'Promise_%', 'Adjust_%', 'A_avg', 'P_avg',\n", 69 | " 'acct_bhvr_scr_nbr', 'ar_bhvr_scr_nbr', 'avg_paid_full_dy_cnt',\n", 70 | " 'crdt_buru_scr_nbr', 'cust_bhvr_scr_nbr', 'cust_recls_scr_nbr',\n", 71 | " 'pmt_arng_scr_nbr', 'wirls_ln_cnt', 'excpt_ovrd_ind',\n", 72 | " 'pyarr_scr_nbr', 'lst_bhvr_scr_nbr', 'preferred_month_CALL_DT',\n", 73 | " 'preferred_day_of_monthCALL_DT', 'preferred_weekdayCALL_DT',\n", 74 | " 'return_itm_180_dy_cnt', 'return_itm_30_dy_cnt', 'Tenure',\n", 75 | " 'MOBILITY_REGION_NAME_flagCentral', 'MOBILITY_REGION_NAME_flagEast',\n", 76 | " 'MOBILITY_REGION_NAME_flagWest', 'ACCT_STS_AT_CALL_DATE_flagN',\n", 77 | " 'ACCT_STS_AT_CALL_DATE_flagO', 'ACCT_STS_AT_CALL_DATE_flagS', 'BAN',\n", 78 | " 'labels'], dtype=object)" 79 | ] 80 | }, 81 | "execution_count": 5, 82 | "metadata": {}, 83 | "output_type": "execute_result" 84 | } 85 | ], 86 | "source": [ 87 | "df.columns.values" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 6, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "df_ = df[cols]" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 8, 104 | "metadata": { 105 | "collapsed": true 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "df_ =((df_-df_.min())/(df_.max()-df_.min()))*100" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 17, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [ 
119 | { 120 | "data": { 121 | "text/html": [ 122 | "
\n", 123 | "\n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | "
TOT_DUE_AMTPromise_%A_avgP_avgBANacct_bhvr_scr_nbrar_bhvr_scr_nbravg_paid_full_dy_cntcrdt_buru_scr_nbrcust_bhvr_scr_nbr...preferred_weekdayCALL_DTreturn_itm_180_dy_cntreturn_itm_30_dy_cntTenureMOBILITY_REGION_NAME_flagCentralMOBILITY_REGION_NAME_flagEastMOBILITY_REGION_NAME_flagWestACCT_STS_AT_CALL_DATE_flagNACCT_STS_AT_CALL_DATE_flagOACCT_STS_AT_CALL_DATE_flagS
09.1750.0076.0099.0524.89100.00100.004.9891.41100.00...33.330.000.0017.27100.000.000.000.00100.000.00
19.0989.4776.3299.3988.55100.00100.005.2779.96100.00...66.674.350.0036.920.00100.000.000.00100.000.00
29.2240.0075.8998.690.06100.00100.003.9193.05100.00...66.670.000.0079.46100.000.000.000.00100.000.00
38.9945.0076.0099.440.1370.1462.536.640.00100.00...66.674.350.0028.270.00100.000.000.00100.000.00
49.0950.0076.0199.3872.30100.00100.003.0364.11100.00...66.670.000.000.580.00100.000.000.00100.000.00
\n", 273 | "

5 rows × 28 columns

\n", 274 | "
" 275 | ], 276 | "text/plain": [ 277 | " TOT_DUE_AMT Promise_% A_avg P_avg BAN acct_bhvr_scr_nbr \\\n", 278 | "0 9.17 50.00 76.00 99.05 24.89 100.00 \n", 279 | "1 9.09 89.47 76.32 99.39 88.55 100.00 \n", 280 | "2 9.22 40.00 75.89 98.69 0.06 100.00 \n", 281 | "3 8.99 45.00 76.00 99.44 0.13 70.14 \n", 282 | "4 9.09 50.00 76.01 99.38 72.30 100.00 \n", 283 | "\n", 284 | " ar_bhvr_scr_nbr avg_paid_full_dy_cnt crdt_buru_scr_nbr \\\n", 285 | "0 100.00 4.98 91.41 \n", 286 | "1 100.00 5.27 79.96 \n", 287 | "2 100.00 3.91 93.05 \n", 288 | "3 62.53 6.64 0.00 \n", 289 | "4 100.00 3.03 64.11 \n", 290 | "\n", 291 | " cust_bhvr_scr_nbr ... preferred_weekdayCALL_DT \\\n", 292 | "0 100.00 ... 33.33 \n", 293 | "1 100.00 ... 66.67 \n", 294 | "2 100.00 ... 66.67 \n", 295 | "3 100.00 ... 66.67 \n", 296 | "4 100.00 ... 66.67 \n", 297 | "\n", 298 | " return_itm_180_dy_cnt return_itm_30_dy_cnt Tenure \\\n", 299 | "0 0.00 0.00 17.27 \n", 300 | "1 4.35 0.00 36.92 \n", 301 | "2 0.00 0.00 79.46 \n", 302 | "3 4.35 0.00 28.27 \n", 303 | "4 0.00 0.00 0.58 \n", 304 | "\n", 305 | " MOBILITY_REGION_NAME_flagCentral MOBILITY_REGION_NAME_flagEast \\\n", 306 | "0 100.00 0.00 \n", 307 | "1 0.00 100.00 \n", 308 | "2 100.00 0.00 \n", 309 | "3 0.00 100.00 \n", 310 | "4 0.00 100.00 \n", 311 | "\n", 312 | " MOBILITY_REGION_NAME_flagWest ACCT_STS_AT_CALL_DATE_flagN \\\n", 313 | "0 0.00 0.00 \n", 314 | "1 0.00 0.00 \n", 315 | "2 0.00 0.00 \n", 316 | "3 0.00 0.00 \n", 317 | "4 0.00 0.00 \n", 318 | "\n", 319 | " ACCT_STS_AT_CALL_DATE_flagO ACCT_STS_AT_CALL_DATE_flagS \n", 320 | "0 100.00 0.00 \n", 321 | "1 100.00 0.00 \n", 322 | "2 100.00 0.00 \n", 323 | "3 100.00 0.00 \n", 324 | "4 100.00 0.00 \n", 325 | "\n", 326 | "[5 rows x 28 columns]" 327 | ] 328 | }, 329 | "execution_count": 17, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "df_.head()" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 10, 341 | "metadata": { 342 | "collapsed": true 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "# Run GMM \n", 347 | "from sklearn.mixture import GMM" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 11, 353 | "metadata": { 354 | "collapsed": false 355 | }, 356 | "outputs": [ 357 | { 358 | "name": "stderr", 359 | "output_type": "stream", 360 | "text": [ 361 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:52: DeprecationWarning: Class GMM is deprecated; The class GMM is deprecated in 0.18 and will be removed in 0.20. 
Use class GaussianMixture instead.\n", 362 | "  warnings.warn(msg, category=DeprecationWarning)\n" 363 | ] 364 | }, 365 | { 366 | "ename": "KeyboardInterrupt", 367 | "evalue": "", 368 | "output_type": "error", 369 | "traceback": [
 370 | "---------------------------------------------------------------------------",
 371 | "KeyboardInterrupt                         Traceback (most recent call last)",
 372 | "<ipython-input> in <module>()\n----> 1 get_ipython().run_cell_magic('time', '', 'gmm = GMM(n_components=9).fit(df_)\\nlabels = gmm.predict(df_)')",
 373 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/mixture/gmm.py in fit(self, X, y)\n--> 597     self._fit(X, y)",
 374 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/mixture/gmm.py in _fit(self, X, y, do_prediction)\n    491     self.means_ = cluster.KMeans(\n    492         n_clusters=self.n_components,\n--> 493         random_state=self.random_state).fit(X).cluster_centers_",
 375 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/cluster/k_means_.py in fit(self, X, y)\n--> 889     return_n_iter=True)",
 376 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/cluster/k_means_.py in k_means(X, n_clusters, ...)\n--> 345     x_squared_norms=x_squared_norms, random_state=random_state)",
 377 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/cluster/k_means_.py in _kmeans_single_elkan(...)\n--> 399     max_iter=max_iter, verbose=verbose)",
 378 | "sklearn/cluster/_k_means_elkan.pyx in sklearn.cluster._k_means_elkan.k_means_elkan 
(sklearn/cluster/_k_means_elkan.c:7470)\u001b[0;34m()\u001b[0m\n", 384 | "\u001b[0;32m/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/metrics/pairwise.py\u001b[0m in \u001b[0;36meuclidean_distances\u001b[0;34m(X, Y, Y_norm_squared, squared, X_norm_squared)\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[0;31m# Pairwise distances\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 162\u001b[0;31m def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,\n\u001b[0m\u001b[1;32m 163\u001b[0m X_norm_squared=None):\n\u001b[1;32m 164\u001b[0m \"\"\"\n", 385 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "%%time\n", 391 | "gmm = GMM(n_components=9).fit(df_)\n", 392 | "labels = gmm.predict(df_)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 21, 398 | "metadata": { 399 | "collapsed": true 400 | }, 401 | "outputs": [], 402 | "source": [ 403 | "cluster_normalized = df_.copy()" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 58, 409 | "metadata": { 410 | "collapsed": false 411 | }, 412 | "outputs": [ 413 | { 414 | "data": { 415 | "text/plain": [ 416 | "TOT_DUE_AMT 9.09\n", 417 | "Promise_% 63.69\n", 418 | "A_avg 76.00\n", 419 | "P_avg 99.09\n", 420 | "BAN 57.30\n", 421 | "acct_bhvr_scr_nbr 98.03\n", 422 | "ar_bhvr_scr_nbr 97.95\n", 423 | "avg_paid_full_dy_cnt 8.73\n", 424 | "crdt_buru_scr_nbr 83.87\n", 425 | "cust_bhvr_scr_nbr 98.27\n", 426 | "cust_recls_scr_nbr 89.64\n", 427 | "pmt_arng_scr_nbr 82.87\n", 428 | "wirls_ln_cnt 0.56\n", 429 | "excpt_ovrd_ind 0.75\n", 430 | "pyarr_scr_nbr 82.87\n", 431 | "lst_bhvr_scr_nbr 77.48\n", 432 | "preferred_month_CALL_DT 60.62\n", 433 | "preferred_day_of_monthCALL_DT 56.81\n", 434 | "preferred_weekdayCALL_DT 42.45\n", 435 | "return_itm_180_dy_cnt 0.45\n", 436 | "return_itm_30_dy_cnt 0.27\n", 437 | "Tenure 21.54\n", 438 | "MOBILITY_REGION_NAME_flagCentral 36.56\n", 439 | "MOBILITY_REGION_NAME_flagEast 44.19\n", 440 | "MOBILITY_REGION_NAME_flagWest 19.25\n", 441 | "ACCT_STS_AT_CALL_DATE_flagN 0.21\n", 442 | "ACCT_STS_AT_CALL_DATE_flagO 99.60\n", 443 | "ACCT_STS_AT_CALL_DATE_flagS 0.20\n", 444 | "labels 2.68\n", 445 | "dtype: float64" 446 | ] 447 | }, 448 | "execution_count": 58, 449 | "metadata": {}, 450 | "output_type": "execute_result" 451 | } 452 | ], 453 | "source": [ 454 | "cluster_normalized.mean()" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "metadata": { 461 | "collapsed": true 462 | }, 463 | "outputs": [], 464 | "source": [] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": { 470 | "collapsed": true 471 | }, 472 | "outputs": [], 473 | "source": [] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": { 479 | "collapsed": true 480 | }, 481 | "outputs": [], 482 | "source": [] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": 22, 487 | "metadata": { 488 | "collapsed": true 489 | }, 490 | "outputs": [], 491 | "source": [ 492 | "cluster_normalized['labels'] = labels" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": 23, 498 | "metadata": { 499 | "collapsed": false 500 | }, 501 | "outputs": [], 502 | "source": [ 503 | "c1 = cluster_normalized.loc[cluster_normalized['labels'] == 0]\n", 504 | "c2 = cluster_normalized.loc[cluster_normalized['labels'] == 1]\n", 505 | "c3 = 
cluster_normalized.loc[cluster_normalized['labels'] == 2]\n", 506 | "c4 = cluster_normalized.loc[cluster_normalized['labels'] == 3]\n", 507 | "c5 = cluster_normalized.loc[cluster_normalized['labels'] == 4]\n", 508 | "c6 = cluster_normalized.loc[cluster_normalized['labels'] == 5]" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 24, 514 | "metadata": { 515 | "collapsed": true 516 | }, 517 | "outputs": [], 518 | "source": [ 519 | "df = cluster_normalized.drop('labels', axis = 1)" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 34, 525 | "metadata": { 526 | "collapsed": true 527 | }, 528 | "outputs": [ 529 | { 530 | "data": { 531 | "text/plain": [ 532 | "TOT_DUE_AMT 9.09\n", 533 | "Promise_% 63.69\n", 534 | "A_avg 76.00\n", 535 | "P_avg 99.09\n", 536 | "BAN 57.30\n", 537 | "acct_bhvr_scr_nbr 98.03\n", 538 | "ar_bhvr_scr_nbr 97.95\n", 539 | "avg_paid_full_dy_cnt 8.73\n", 540 | "crdt_buru_scr_nbr 83.87\n", 541 | "cust_bhvr_scr_nbr 98.27\n", 542 | "cust_recls_scr_nbr 89.64\n", 543 | "pmt_arng_scr_nbr 82.87\n", 544 | "wirls_ln_cnt 0.56\n", 545 | "excpt_ovrd_ind 0.75\n", 546 | "pyarr_scr_nbr 82.87\n", 547 | "lst_bhvr_scr_nbr 77.48\n", 548 | "preferred_month_CALL_DT 60.62\n", 549 | "preferred_day_of_monthCALL_DT 56.81\n", 550 | "preferred_weekdayCALL_DT 42.45\n", 551 | "return_itm_180_dy_cnt 0.45\n", 552 | "return_itm_30_dy_cnt 0.27\n", 553 | "Tenure 21.54\n", 554 | "MOBILITY_REGION_NAME_flagCentral 36.56\n", 555 | "MOBILITY_REGION_NAME_flagEast 44.19\n", 556 | "MOBILITY_REGION_NAME_flagWest 19.25\n", 557 | "ACCT_STS_AT_CALL_DATE_flagN 0.21\n", 558 | "ACCT_STS_AT_CALL_DATE_flagO 99.60\n", 559 | "ACCT_STS_AT_CALL_DATE_flagS 0.20\n", 560 | "dtype: float64" 561 | ] 562 | }, 563 | "execution_count": 34, 564 | "metadata": {}, 565 | "output_type": "execute_result" 566 | } 567 | ], 568 | "source": [ 569 | "df_.mean()" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 36, 575 | "metadata": { 576 | "collapsed": true 577 | }, 578 | "outputs": [], 579 | "source": [ 580 | "df_.to_csv('../../0.Data/1.Interim/New/merge_normalized.csv', index = False)" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": 38, 586 | "metadata": { 587 | "collapsed": true 588 | }, 589 | "outputs": [], 590 | "source": [ 591 | "cluster_normalized.to_csv('../../0.Data/1.Interim/New/cluster_normalized.csv', index = False)" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 37, 597 | "metadata": { 598 | "collapsed": false 599 | }, 600 | "outputs": [ 601 | { 602 | "data": { 603 | "text/plain": [ 604 | "array(['TOT_DUE_AMT', 'Promise_%', 'A_avg', 'P_avg', 'BAN',\n", 605 | " 'acct_bhvr_scr_nbr', 'ar_bhvr_scr_nbr', 'avg_paid_full_dy_cnt',\n", 606 | " 'crdt_buru_scr_nbr', 'cust_bhvr_scr_nbr', 'cust_recls_scr_nbr',\n", 607 | " 'pmt_arng_scr_nbr', 'wirls_ln_cnt', 'excpt_ovrd_ind',\n", 608 | " 'pyarr_scr_nbr', 'lst_bhvr_scr_nbr', 'preferred_month_CALL_DT',\n", 609 | " 'preferred_day_of_monthCALL_DT', 'preferred_weekdayCALL_DT',\n", 610 | " 'return_itm_180_dy_cnt', 'return_itm_30_dy_cnt', 'Tenure',\n", 611 | " 'MOBILITY_REGION_NAME_flagCentral', 'MOBILITY_REGION_NAME_flagEast',\n", 612 | " 'MOBILITY_REGION_NAME_flagWest', 'ACCT_STS_AT_CALL_DATE_flagN',\n", 613 | " 'ACCT_STS_AT_CALL_DATE_flagO', 'ACCT_STS_AT_CALL_DATE_flagS',\n", 614 | " 'labels'], dtype=object)" 615 | ] 616 | }, 617 | "execution_count": 37, 618 | "metadata": {}, 619 | "output_type": "execute_result" 620 | } 621 | ], 622 | "source": [ 623 | 
"cluster_normalized.columns.values" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": 25, 629 | "metadata": { 630 | "collapsed": false 631 | }, 632 | "outputs": [ 633 | { 634 | "data": { 635 | "text/plain": [ 636 | "array(['TOT_DUE_AMT', 'Promise_%', 'A_avg', 'P_avg', 'BAN',\n", 637 | " 'acct_bhvr_scr_nbr', 'ar_bhvr_scr_nbr', 'avg_paid_full_dy_cnt',\n", 638 | " 'crdt_buru_scr_nbr', 'cust_bhvr_scr_nbr', 'cust_recls_scr_nbr',\n", 639 | " 'pmt_arng_scr_nbr', 'wirls_ln_cnt', 'excpt_ovrd_ind',\n", 640 | " 'pyarr_scr_nbr', 'lst_bhvr_scr_nbr', 'preferred_month_CALL_DT',\n", 641 | " 'preferred_day_of_monthCALL_DT', 'preferred_weekdayCALL_DT',\n", 642 | " 'return_itm_180_dy_cnt', 'return_itm_30_dy_cnt', 'Tenure',\n", 643 | " 'MOBILITY_REGION_NAME_flagCentral', 'MOBILITY_REGION_NAME_flagEast',\n", 644 | " 'MOBILITY_REGION_NAME_flagWest', 'ACCT_STS_AT_CALL_DATE_flagN',\n", 645 | " 'ACCT_STS_AT_CALL_DATE_flagO', 'ACCT_STS_AT_CALL_DATE_flagS'], dtype=object)" 646 | ] 647 | }, 648 | "execution_count": 25, 649 | "metadata": {}, 650 | "output_type": "execute_result" 651 | } 652 | ], 653 | "source": [ 654 | "df.columns.values" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 27, 660 | "metadata": { 661 | "collapsed": false 662 | }, 663 | "outputs": [], 664 | "source": [ 665 | "cluster_grp = []\n", 666 | "for col in df.columns.values.tolist():\n", 667 | " orig = round(df[col].mean(),2)\n", 668 | " c1_val = round(c1[col].mean(),2)\n", 669 | " c2_val = round(c2[col].mean(),2)\n", 670 | " c3_val = round(c3[col].mean(),2)\n", 671 | " c4_val = round(c4[col].mean(),2)\n", 672 | " c5_val = round(c5[col].mean(),2)\n", 673 | " c6_val = round(c6[col].mean(),2)\n", 674 | " if(orig == 0):\n", 675 | " c1_change = None\n", 676 | " c2_change = None\n", 677 | " c3_change = None\n", 678 | " c4_change = None\n", 679 | " c5_change = None\n", 680 | " c6_change = None\n", 681 | " else:\n", 682 | " c1_change = round(((c1_val-orig)*100/orig),2)\n", 683 | " c2_change = round(((c2_val-orig)*100/orig),2)\n", 684 | " c3_change = round(((c3_val-orig)*100/orig),2)\n", 685 | " c4_change = round(((c4_val-orig)*100/orig),2)\n", 686 | " c5_change = round(((c5_val-orig)*100/orig),2)\n", 687 | " c6_change = round(((c6_val-orig)*100/orig),2)\n", 688 | " cluster_grp.append((col, orig, c1_val, c1_change , c2_val, c2_change, c3_val, c3_change, c4_val, c4_change, c5_val, c5_change,\\\n", 689 | " c6_val, c6_change))\n", 690 | "\n" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 28, 696 | "metadata": { 697 | "collapsed": false 698 | }, 699 | "outputs": [], 700 | "source": [ 701 | "cluster_group = pd.DataFrame(cluster_grp, columns = ['Column', 'Entire dataset value', 'c1_value', 'c1_change_%', \\\n", 702 | " 'c2_value', 'c2_change_%', 'c3_value', 'c3_change_%', 'c4_value', 'c4_change_%', \\\n", 703 | " 'c5_value', 'c5_change_%', 'c6_value' , 'c6_change_%'])" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 29, 709 | "metadata": { 710 | "collapsed": false 711 | }, 712 | "outputs": [ 713 | { 714 | "data": { 715 | "text/plain": [ 716 | "28" 717 | ] 718 | }, 719 | "execution_count": 29, 720 | "metadata": {}, 721 | "output_type": "execute_result" 722 | } 723 | ], 724 | "source": [ 725 | "len(cluster_group)" 726 | ] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": 30, 731 | "metadata": { 732 | "collapsed": true 733 | }, 734 | "outputs": [], 735 | "source": [ 736 | 
"cluster_group.to_csv('../3.Analysis/cluster_gmm_1_normalized.csv', index = False)" 737 | ] 738 | }, 739 | { 740 | "cell_type": "code", 741 | "execution_count": 45, 742 | "metadata": { 743 | "collapsed": false 744 | }, 745 | "outputs": [ 746 | { 747 | "data": { 748 | "text/plain": [ 749 | "array(['TOT_DUE_AMT', 'Promise_%', 'A_avg', 'P_avg', 'BAN',\n", 750 | " 'acct_bhvr_scr_nbr', 'ar_bhvr_scr_nbr', 'avg_paid_full_dy_cnt',\n", 751 | " 'crdt_buru_scr_nbr', 'cust_bhvr_scr_nbr', 'cust_recls_scr_nbr',\n", 752 | " 'pmt_arng_scr_nbr', 'wirls_ln_cnt', 'excpt_ovrd_ind',\n", 753 | " 'pyarr_scr_nbr', 'lst_bhvr_scr_nbr', 'preferred_month_CALL_DT',\n", 754 | " 'preferred_day_of_monthCALL_DT', 'preferred_weekdayCALL_DT',\n", 755 | " 'return_itm_180_dy_cnt', 'return_itm_30_dy_cnt', 'Tenure',\n", 756 | " 'MOBILITY_REGION_NAME_flagCentral', 'MOBILITY_REGION_NAME_flagEast',\n", 757 | " 'MOBILITY_REGION_NAME_flagWest', 'ACCT_STS_AT_CALL_DATE_flagN',\n", 758 | " 'ACCT_STS_AT_CALL_DATE_flagO', 'ACCT_STS_AT_CALL_DATE_flagS'], dtype=object)" 759 | ] 760 | }, 761 | "execution_count": 45, 762 | "metadata": {}, 763 | "output_type": "execute_result" 764 | } 765 | ], 766 | "source": [ 767 | "df_.columns.values" 768 | ] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "execution_count": 57, 773 | "metadata": { 774 | "collapsed": false 775 | }, 776 | "outputs": [ 777 | { 778 | "data": { 779 | "text/plain": [ 780 | "0.00 0.96\n", 781 | "6.67 0.03\n", 782 | "13.33 0.00\n", 783 | "20.00 0.00\n", 784 | "26.67 0.00\n", 785 | "33.33 0.00\n", 786 | "40.00 0.00\n", 787 | "46.67 0.00\n", 788 | "93.33 0.00\n", 789 | "100.00 0.00\n", 790 | "53.33 0.00\n", 791 | "Name: return_itm_30_dy_cnt, dtype: float64" 792 | ] 793 | }, 794 | "execution_count": 57, 795 | "metadata": {}, 796 | "output_type": "execute_result" 797 | } 798 | ], 799 | "source": [ 800 | "df_.return_itm_30_dy_cnt.value_counts()/len(df_)" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": 47, 806 | "metadata": { 807 | "collapsed": false 808 | }, 809 | "outputs": [], 810 | "source": [ 811 | "df_['labels'] = labels" 812 | ] 813 | }, 814 | { 815 | "cell_type": "code", 816 | "execution_count": 48, 817 | "metadata": { 818 | "collapsed": true 819 | }, 820 | "outputs": [], 821 | "source": [ 822 | "c1_ = df_.loc[df_['labels'] == 0]\n", 823 | "c2_ = df_.loc[df_['labels'] == 1]\n", 824 | "c3_ = df_.loc[df_['labels'] == 2]\n", 825 | "c4_ = df_.loc[df_['labels'] == 3]\n", 826 | "c5_ = df_.loc[df_['labels'] == 4]\n", 827 | "c6_ = df_.loc[df_['labels'] == 5]" 828 | ] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": 51, 833 | "metadata": { 834 | "collapsed": false 835 | }, 836 | "outputs": [ 837 | { 838 | "data": { 839 | "text/plain": [ 840 | "54.54545454545454" 841 | ] 842 | }, 843 | "execution_count": 51, 844 | "metadata": {}, 845 | "output_type": "execute_result" 846 | } 847 | ], 848 | "source": [ 849 | "c5_.wirls_ln_cnt.max()" 850 | ] 851 | }, 852 | { 853 | "cell_type": "code", 854 | "execution_count": 52, 855 | "metadata": { 856 | "collapsed": false 857 | }, 858 | "outputs": [ 859 | { 860 | "data": { 861 | "text/plain": [ 862 | "0.0" 863 | ] 864 | }, 865 | "execution_count": 52, 866 | "metadata": {}, 867 | "output_type": "execute_result" 868 | } 869 | ], 870 | "source": [ 871 | "c5_.wirls_ln_cnt.min()" 872 | ] 873 | }, 874 | { 875 | "cell_type": "code", 876 | "execution_count": 53, 877 | "metadata": { 878 | "collapsed": true 879 | }, 880 | "outputs": [ 881 | { 882 | "data": { 883 | "text/plain": [ 884 | "0.17 368\n", 885 | "0.00 
347\n", 886 | "0.34 314\n", 887 | "0.51 278\n", 888 | "0.69 214\n", 889 | "0.86 189\n", 890 | "1.03 122\n", 891 | "1.20 80\n", 892 | "1.37 51\n", 893 | "1.54 34\n", 894 | "1.72 31\n", 895 | "1.89 16\n", 896 | "2.23 16\n", 897 | "2.40 16\n", 898 | "3.43 14\n", 899 | "3.95 13\n", 900 | "2.74 13\n", 901 | "2.92 11\n", 902 | "3.09 10\n", 903 | "2.06 9\n", 904 | "3.26 7\n", 905 | "4.29 7\n", 906 | "2.57 7\n", 907 | "3.77 6\n", 908 | "5.15 6\n", 909 | "4.12 6\n", 910 | "3.60 6\n", 911 | "4.63 5\n", 912 | "5.83 4\n", 913 | "5.66 3\n", 914 | "5.49 3\n", 915 | "4.80 3\n", 916 | "8.06 3\n", 917 | "4.97 3\n", 918 | "6.35 2\n", 919 | "6.86 2\n", 920 | "5.32 2\n", 921 | "6.17 2\n", 922 | "7.03 1\n", 923 | "17.50 1\n", 924 | "10.81 1\n", 925 | "16.12 1\n", 926 | "4.46 1\n", 927 | "6.69 1\n", 928 | "6.00 1\n", 929 | "14.07 1\n", 930 | "10.12 1\n", 931 | "32.59 1\n", 932 | "8.40 1\n", 933 | "10.46 1\n", 934 | "13.89 1\n", 935 | "11.84 1\n", 936 | "54.55 1\n", 937 | "14.75 1\n", 938 | "11.15 1\n", 939 | "20.41 1\n", 940 | "9.43 1\n", 941 | "6.52 1\n", 942 | "23.67 1\n", 943 | "29.33 1\n", 944 | "12.01 1\n", 945 | "8.23 1\n", 946 | "19.38 1\n", 947 | "19.90 1\n", 948 | "7.38 1\n", 949 | "7.55 1\n", 950 | "22.30 1\n", 951 | "9.61 1\n", 952 | "7.72 1\n", 953 | "9.95 1\n", 954 | "12.86 1\n", 955 | "Name: wirls_ln_cnt, dtype: int64" 956 | ] 957 | }, 958 | "execution_count": 53, 959 | "metadata": {}, 960 | "output_type": "execute_result" 961 | } 962 | ], 963 | "source": [ 964 | "c5_.wirls_ln_cnt.value_counts()" 965 | ] 966 | }, 967 | { 968 | "cell_type": "code", 969 | "execution_count": null, 970 | "metadata": { 971 | "collapsed": true 972 | }, 973 | "outputs": [], 974 | "source": [] 975 | } 976 | ], 977 | "metadata": { 978 | "kernelspec": { 979 | "display_name": "Python 3", 980 | "language": "python", 981 | "name": "python3" 982 | }, 983 | "language_info": { 984 | "codemirror_mode": { 985 | "name": "ipython", 986 | "version": 3 987 | }, 988 | "file_extension": ".py", 989 | "mimetype": "text/x-python", 990 | "name": "python", 991 | "nbconvert_exporter": "python", 992 | "pygments_lexer": "ipython3", 993 | "version": "3.6.0" 994 | } 995 | }, 996 | "nbformat": 4, 997 | "nbformat_minor": 1 998 | } 999 | -------------------------------------------------------------------------------- /GMM - Clusters 6 - Normalized using mean (remove 30 days variable).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 4, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "pd.options.display.max_rows = 4000\n", 23 | "pd.set_option('display.float_format', lambda x: '%.2f' % x)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 5, 29 | "metadata": { 30 | "collapsed": false, 31 | "scrolled": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "df = pd.read_csv('../../0.Data/1.Interim/New/cluster.csv')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 1, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "cols = ['TOT_DUE_AMT', 'Promise_%', 'A_avg', 'P_avg',\n", 47 | " 'acct_bhvr_scr_nbr', 'ar_bhvr_scr_nbr', 'avg_paid_full_dy_cnt',\n", 48 | " 'crdt_buru_scr_nbr', 'cust_bhvr_scr_nbr', 'cust_recls_scr_nbr',\n", 49 | " 
'pmt_arng_scr_nbr', 'wirls_ln_cnt', 'excpt_ovrd_ind',\n", 50 | " 'pyarr_scr_nbr', 'lst_bhvr_scr_nbr', 'preferred_month_CALL_DT',\n", 51 | " 'preferred_day_of_monthCALL_DT', 'preferred_weekdayCALL_DT',\n", 52 | " 'return_itm_180_dy_cnt', 'Tenure',\n", 53 | " 'MOBILITY_REGION_NAME_flagCentral', 'MOBILITY_REGION_NAME_flagEast',\n", 54 | " 'MOBILITY_REGION_NAME_flagWest', 'ACCT_STS_AT_CALL_DATE_flagN',\n", 55 | " 'ACCT_STS_AT_CALL_DATE_flagO', 'ACCT_STS_AT_CALL_DATE_flagS']" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 6, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "array(['TOT_DUE_AMT', 'Promise_%', 'Adjust_%', 'A_avg', 'P_avg',\n", 69 | " 'acct_bhvr_scr_nbr', 'ar_bhvr_scr_nbr', 'avg_paid_full_dy_cnt',\n", 70 | " 'crdt_buru_scr_nbr', 'cust_bhvr_scr_nbr', 'cust_recls_scr_nbr',\n", 71 | " 'pmt_arng_scr_nbr', 'wirls_ln_cnt', 'excpt_ovrd_ind',\n", 72 | " 'pyarr_scr_nbr', 'lst_bhvr_scr_nbr', 'preferred_month_CALL_DT',\n", 73 | " 'preferred_day_of_monthCALL_DT', 'preferred_weekdayCALL_DT',\n", 74 | " 'return_itm_180_dy_cnt', 'return_itm_30_dy_cnt', 'Tenure',\n", 75 | " 'MOBILITY_REGION_NAME_flagCentral', 'MOBILITY_REGION_NAME_flagEast',\n", 76 | " 'MOBILITY_REGION_NAME_flagWest', 'ACCT_STS_AT_CALL_DATE_flagN',\n", 77 | " 'ACCT_STS_AT_CALL_DATE_flagO', 'ACCT_STS_AT_CALL_DATE_flagS', 'BAN',\n", 78 | " 'labels'], dtype=object)" 79 | ] 80 | }, 81 | "execution_count": 6, 82 | "metadata": {}, 83 | "output_type": "execute_result" 84 | } 85 | ], 86 | "source": [ 87 | "df.columns.values" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 7, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "df_ = df[cols]" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 8, 104 | "metadata": { 105 | "collapsed": true 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "# Run GMM \n", 110 | "from sklearn.mixture import GMM" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 16, 116 | "metadata": { 117 | "collapsed": true 118 | }, 119 | "outputs": [ 120 | { 121 | "name": "stderr", 122 | "output_type": "stream", 123 | "text": [ 124 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:52: DeprecationWarning: Class GMM is deprecated; The class GMM is deprecated in 0.18 and will be removed in 0.20. 
Use class GaussianMixture instead.\n", 125 | " warnings.warn(msg, category=DeprecationWarning)\n", 126 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function distribute_covar_matrix_to_match_covariance_type is deprecated; The functon distribute_covar_matrix_to_match_covariance_typeis deprecated in 0.18 and will be removed in 0.20.\n", 127 | " warnings.warn(msg, category=DeprecationWarning)\n", 128 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 129 | " warnings.warn(msg, category=DeprecationWarning)\n", 130 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 131 | " warnings.warn(msg, category=DeprecationWarning)\n", 132 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 133 | " warnings.warn(msg, category=DeprecationWarning)\n", 134 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 135 | " warnings.warn(msg, category=DeprecationWarning)\n", 136 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 137 | " warnings.warn(msg, category=DeprecationWarning)\n", 138 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 139 | " warnings.warn(msg, category=DeprecationWarning)\n", 140 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 141 | " warnings.warn(msg, category=DeprecationWarning)\n", 142 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 143 | " warnings.warn(msg, category=DeprecationWarning)\n", 144 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 145 | " warnings.warn(msg, category=DeprecationWarning)\n", 146 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: 
DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 147 | " warnings.warn(msg, category=DeprecationWarning)\n", 148 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 149 | " warnings.warn(msg, category=DeprecationWarning)\n", 150 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 151 | " warnings.warn(msg, category=DeprecationWarning)\n", 152 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 153 | " warnings.warn(msg, category=DeprecationWarning)\n", 154 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 155 | " warnings.warn(msg, category=DeprecationWarning)\n", 156 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 157 | " warnings.warn(msg, category=DeprecationWarning)\n", 158 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 159 | " warnings.warn(msg, category=DeprecationWarning)\n", 160 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 161 | " warnings.warn(msg, category=DeprecationWarning)\n", 162 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 163 | " warnings.warn(msg, category=DeprecationWarning)\n", 164 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 165 | " warnings.warn(msg, category=DeprecationWarning)\n", 166 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 167 | " warnings.warn(msg, 
category=DeprecationWarning)\n", 168 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 169 | " warnings.warn(msg, category=DeprecationWarning)\n", 170 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 171 | " warnings.warn(msg, category=DeprecationWarning)\n", 172 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 173 | " warnings.warn(msg, category=DeprecationWarning)\n", 174 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 175 | " warnings.warn(msg, category=DeprecationWarning)\n" 176 | ] 177 | }, 178 | { 179 | "name": "stdout", 180 | "output_type": "stream", 181 | "text": [ 182 | "CPU times: user 2min 44s, sys: 24 s, total: 3min 8s\n", 183 | "Wall time: 50.1 s\n" 184 | ] 185 | } 186 | ], 187 | "source": [ 188 | "%%time\n", 189 | "gmm = GMM(n_components=6).fit(df_)\n", 190 | "labels = gmm.predict(df_)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 17, 196 | "metadata": { 197 | "collapsed": true 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "cluster_normalized = df_.copy()" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 18, 207 | "metadata": { 208 | "collapsed": true 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "cluster_normalized['labels'] = labels" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 19, 218 | "metadata": { 219 | "collapsed": false 220 | }, 221 | "outputs": [], 222 | "source": [ 223 | "c1 = cluster_normalized.loc[cluster_normalized['labels'] == 0]\n", 224 | "c2 = cluster_normalized.loc[cluster_normalized['labels'] == 1]\n", 225 | "c3 = cluster_normalized.loc[cluster_normalized['labels'] == 2]\n", 226 | "c4 = cluster_normalized.loc[cluster_normalized['labels'] == 3]\n", 227 | "c5 = cluster_normalized.loc[cluster_normalized['labels'] == 4]\n", 228 | "c6 = cluster_normalized.loc[cluster_normalized['labels'] == 5]" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 20, 234 | "metadata": { 235 | "collapsed": false 236 | }, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "3 0.34\n", 242 | "5 0.26\n", 243 | "1 0.15\n", 244 | "0 0.14\n", 245 | "2 0.07\n", 246 | "4 0.04\n", 247 | "Name: labels, dtype: float64" 248 | ] 249 | }, 250 | "execution_count": 20, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "cluster_normalized['labels'].value_counts()/len(cluster_normalized)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "collapsed": true 264 | }, 265 | "outputs": [], 266 | "source": [] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | 
"metadata": { 272 | "collapsed": true 273 | }, 274 | "outputs": [], 275 | "source": [] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 24, 280 | "metadata": { 281 | "collapsed": true 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "df = cluster_normalized.drop('labels', axis = 1)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 34, 291 | "metadata": { 292 | "collapsed": true 293 | }, 294 | "outputs": [ 295 | { 296 | "data": { 297 | "text/plain": [ 298 | "TOT_DUE_AMT 9.09\n", 299 | "Promise_% 63.69\n", 300 | "A_avg 76.00\n", 301 | "P_avg 99.09\n", 302 | "BAN 57.30\n", 303 | "acct_bhvr_scr_nbr 98.03\n", 304 | "ar_bhvr_scr_nbr 97.95\n", 305 | "avg_paid_full_dy_cnt 8.73\n", 306 | "crdt_buru_scr_nbr 83.87\n", 307 | "cust_bhvr_scr_nbr 98.27\n", 308 | "cust_recls_scr_nbr 89.64\n", 309 | "pmt_arng_scr_nbr 82.87\n", 310 | "wirls_ln_cnt 0.56\n", 311 | "excpt_ovrd_ind 0.75\n", 312 | "pyarr_scr_nbr 82.87\n", 313 | "lst_bhvr_scr_nbr 77.48\n", 314 | "preferred_month_CALL_DT 60.62\n", 315 | "preferred_day_of_monthCALL_DT 56.81\n", 316 | "preferred_weekdayCALL_DT 42.45\n", 317 | "return_itm_180_dy_cnt 0.45\n", 318 | "return_itm_30_dy_cnt 0.27\n", 319 | "Tenure 21.54\n", 320 | "MOBILITY_REGION_NAME_flagCentral 36.56\n", 321 | "MOBILITY_REGION_NAME_flagEast 44.19\n", 322 | "MOBILITY_REGION_NAME_flagWest 19.25\n", 323 | "ACCT_STS_AT_CALL_DATE_flagN 0.21\n", 324 | "ACCT_STS_AT_CALL_DATE_flagO 99.60\n", 325 | "ACCT_STS_AT_CALL_DATE_flagS 0.20\n", 326 | "dtype: float64" 327 | ] 328 | }, 329 | "execution_count": 34, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "df_.mean()" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 36, 341 | "metadata": { 342 | "collapsed": true 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "df_.to_csv('../../0.Data/1.Interim/New/merge_normalized.csv', index = False)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 38, 352 | "metadata": { 353 | "collapsed": true 354 | }, 355 | "outputs": [], 356 | "source": [ 357 | "cluster_normalized.to_csv('../../0.Data/1.Interim/New/cluster_normalized.csv', index = False)" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 37, 363 | "metadata": { 364 | "collapsed": false 365 | }, 366 | "outputs": [ 367 | { 368 | "data": { 369 | "text/plain": [ 370 | "array(['TOT_DUE_AMT', 'Promise_%', 'A_avg', 'P_avg', 'BAN',\n", 371 | " 'acct_bhvr_scr_nbr', 'ar_bhvr_scr_nbr', 'avg_paid_full_dy_cnt',\n", 372 | " 'crdt_buru_scr_nbr', 'cust_bhvr_scr_nbr', 'cust_recls_scr_nbr',\n", 373 | " 'pmt_arng_scr_nbr', 'wirls_ln_cnt', 'excpt_ovrd_ind',\n", 374 | " 'pyarr_scr_nbr', 'lst_bhvr_scr_nbr', 'preferred_month_CALL_DT',\n", 375 | " 'preferred_day_of_monthCALL_DT', 'preferred_weekdayCALL_DT',\n", 376 | " 'return_itm_180_dy_cnt', 'return_itm_30_dy_cnt', 'Tenure',\n", 377 | " 'MOBILITY_REGION_NAME_flagCentral', 'MOBILITY_REGION_NAME_flagEast',\n", 378 | " 'MOBILITY_REGION_NAME_flagWest', 'ACCT_STS_AT_CALL_DATE_flagN',\n", 379 | " 'ACCT_STS_AT_CALL_DATE_flagO', 'ACCT_STS_AT_CALL_DATE_flagS',\n", 380 | " 'labels'], dtype=object)" 381 | ] 382 | }, 383 | "execution_count": 37, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "cluster_normalized.columns.values" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 25, 395 | "metadata": { 396 | "collapsed": false 397 | }, 398 | "outputs": [ 399 | { 400 | "data": 
{ 401 | "text/plain": [ 402 | "array(['TOT_DUE_AMT', 'Promise_%', 'A_avg', 'P_avg', 'BAN',\n", 403 | " 'acct_bhvr_scr_nbr', 'ar_bhvr_scr_nbr', 'avg_paid_full_dy_cnt',\n", 404 | " 'crdt_buru_scr_nbr', 'cust_bhvr_scr_nbr', 'cust_recls_scr_nbr',\n", 405 | " 'pmt_arng_scr_nbr', 'wirls_ln_cnt', 'excpt_ovrd_ind',\n", 406 | " 'pyarr_scr_nbr', 'lst_bhvr_scr_nbr', 'preferred_month_CALL_DT',\n", 407 | " 'preferred_day_of_monthCALL_DT', 'preferred_weekdayCALL_DT',\n", 408 | " 'return_itm_180_dy_cnt', 'return_itm_30_dy_cnt', 'Tenure',\n", 409 | " 'MOBILITY_REGION_NAME_flagCentral', 'MOBILITY_REGION_NAME_flagEast',\n", 410 | " 'MOBILITY_REGION_NAME_flagWest', 'ACCT_STS_AT_CALL_DATE_flagN',\n", 411 | " 'ACCT_STS_AT_CALL_DATE_flagO', 'ACCT_STS_AT_CALL_DATE_flagS'], dtype=object)" 412 | ] 413 | }, 414 | "execution_count": 25, 415 | "metadata": {}, 416 | "output_type": "execute_result" 417 | } 418 | ], 419 | "source": [ 420 | "df.columns.values" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 27, 426 | "metadata": { 427 | "collapsed": false 428 | }, 429 | "outputs": [], 430 | "source": [ 431 | "cluster_grp = []\n", 432 | "for col in df.columns.values.tolist():\n", 433 | " orig = round(df[col].mean(),2)\n", 434 | " c1_val = round(c1[col].mean(),2)\n", 435 | " c2_val = round(c2[col].mean(),2)\n", 436 | " c3_val = round(c3[col].mean(),2)\n", 437 | " c4_val = round(c4[col].mean(),2)\n", 438 | " c5_val = round(c5[col].mean(),2)\n", 439 | " c6_val = round(c6[col].mean(),2)\n", 440 | " if(orig == 0):\n", 441 | " c1_change = None\n", 442 | " c2_change = None\n", 443 | " c3_change = None\n", 444 | " c4_change = None\n", 445 | " c5_change = None\n", 446 | " c6_change = None\n", 447 | " else:\n", 448 | " c1_change = round(((c1_val-orig)*100/orig),2)\n", 449 | " c2_change = round(((c2_val-orig)*100/orig),2)\n", 450 | " c3_change = round(((c3_val-orig)*100/orig),2)\n", 451 | " c4_change = round(((c4_val-orig)*100/orig),2)\n", 452 | " c5_change = round(((c5_val-orig)*100/orig),2)\n", 453 | " c6_change = round(((c6_val-orig)*100/orig),2)\n", 454 | " cluster_grp.append((col, orig, c1_val, c1_change , c2_val, c2_change, c3_val, c3_change, c4_val, c4_change, c5_val, c5_change,\\\n", 455 | " c6_val, c6_change))\n", 456 | "\n" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 28, 462 | "metadata": { 463 | "collapsed": false 464 | }, 465 | "outputs": [], 466 | "source": [ 467 | "cluster_group = pd.DataFrame(cluster_grp, columns = ['Column', 'Entire dataset value', 'c1_value', 'c1_change_%', \\\n", 468 | " 'c2_value', 'c2_change_%', 'c3_value', 'c3_change_%', 'c4_value', 'c4_change_%', \\\n", 469 | " 'c5_value', 'c5_change_%', 'c6_value' , 'c6_change_%'])" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 29, 475 | "metadata": { 476 | "collapsed": false 477 | }, 478 | "outputs": [ 479 | { 480 | "data": { 481 | "text/plain": [ 482 | "28" 483 | ] 484 | }, 485 | "execution_count": 29, 486 | "metadata": {}, 487 | "output_type": "execute_result" 488 | } 489 | ], 490 | "source": [ 491 | "len(cluster_group)" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 30, 497 | "metadata": { 498 | "collapsed": true 499 | }, 500 | "outputs": [], 501 | "source": [ 502 | "cluster_group.to_csv('../3.Analysis/cluster_gmm_1_normalized.csv', index = False)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 45, 508 | "metadata": { 509 | "collapsed": false 510 | }, 511 | "outputs": [ 512 | { 513 | "data": { 514 | 
"text/plain": [ 515 | "array(['TOT_DUE_AMT', 'Promise_%', 'A_avg', 'P_avg', 'BAN',\n", 516 | " 'acct_bhvr_scr_nbr', 'ar_bhvr_scr_nbr', 'avg_paid_full_dy_cnt',\n", 517 | " 'crdt_buru_scr_nbr', 'cust_bhvr_scr_nbr', 'cust_recls_scr_nbr',\n", 518 | " 'pmt_arng_scr_nbr', 'wirls_ln_cnt', 'excpt_ovrd_ind',\n", 519 | " 'pyarr_scr_nbr', 'lst_bhvr_scr_nbr', 'preferred_month_CALL_DT',\n", 520 | " 'preferred_day_of_monthCALL_DT', 'preferred_weekdayCALL_DT',\n", 521 | " 'return_itm_180_dy_cnt', 'return_itm_30_dy_cnt', 'Tenure',\n", 522 | " 'MOBILITY_REGION_NAME_flagCentral', 'MOBILITY_REGION_NAME_flagEast',\n", 523 | " 'MOBILITY_REGION_NAME_flagWest', 'ACCT_STS_AT_CALL_DATE_flagN',\n", 524 | " 'ACCT_STS_AT_CALL_DATE_flagO', 'ACCT_STS_AT_CALL_DATE_flagS'], dtype=object)" 525 | ] 526 | }, 527 | "execution_count": 45, 528 | "metadata": {}, 529 | "output_type": "execute_result" 530 | } 531 | ], 532 | "source": [ 533 | "df_.columns.values" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 57, 539 | "metadata": { 540 | "collapsed": false 541 | }, 542 | "outputs": [ 543 | { 544 | "data": { 545 | "text/plain": [ 546 | "0.00 0.96\n", 547 | "6.67 0.03\n", 548 | "13.33 0.00\n", 549 | "20.00 0.00\n", 550 | "26.67 0.00\n", 551 | "33.33 0.00\n", 552 | "40.00 0.00\n", 553 | "46.67 0.00\n", 554 | "93.33 0.00\n", 555 | "100.00 0.00\n", 556 | "53.33 0.00\n", 557 | "Name: return_itm_30_dy_cnt, dtype: float64" 558 | ] 559 | }, 560 | "execution_count": 57, 561 | "metadata": {}, 562 | "output_type": "execute_result" 563 | } 564 | ], 565 | "source": [ 566 | "df_.return_itm_30_dy_cnt.value_counts()/len(df_)" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 47, 572 | "metadata": { 573 | "collapsed": false 574 | }, 575 | "outputs": [], 576 | "source": [ 577 | "df_['labels'] = labels" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": 48, 583 | "metadata": { 584 | "collapsed": true 585 | }, 586 | "outputs": [], 587 | "source": [ 588 | "c1_ = df_.loc[df_['labels'] == 0]\n", 589 | "c2_ = df_.loc[df_['labels'] == 1]\n", 590 | "c3_ = df_.loc[df_['labels'] == 2]\n", 591 | "c4_ = df_.loc[df_['labels'] == 3]\n", 592 | "c5_ = df_.loc[df_['labels'] == 4]\n", 593 | "c6_ = df_.loc[df_['labels'] == 5]" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": 51, 599 | "metadata": { 600 | "collapsed": false 601 | }, 602 | "outputs": [ 603 | { 604 | "data": { 605 | "text/plain": [ 606 | "54.54545454545454" 607 | ] 608 | }, 609 | "execution_count": 51, 610 | "metadata": {}, 611 | "output_type": "execute_result" 612 | } 613 | ], 614 | "source": [ 615 | "c5_.wirls_ln_cnt.max()" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": 52, 621 | "metadata": { 622 | "collapsed": false 623 | }, 624 | "outputs": [ 625 | { 626 | "data": { 627 | "text/plain": [ 628 | "0.0" 629 | ] 630 | }, 631 | "execution_count": 52, 632 | "metadata": {}, 633 | "output_type": "execute_result" 634 | } 635 | ], 636 | "source": [ 637 | "c5_.wirls_ln_cnt.min()" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": 53, 643 | "metadata": { 644 | "collapsed": true 645 | }, 646 | "outputs": [ 647 | { 648 | "data": { 649 | "text/plain": [ 650 | "0.17 368\n", 651 | "0.00 347\n", 652 | "0.34 314\n", 653 | "0.51 278\n", 654 | "0.69 214\n", 655 | "0.86 189\n", 656 | "1.03 122\n", 657 | "1.20 80\n", 658 | "1.37 51\n", 659 | "1.54 34\n", 660 | "1.72 31\n", 661 | "1.89 16\n", 662 | "2.23 16\n", 663 | "2.40 16\n", 664 | "3.43 14\n", 665 | 
"3.95 13\n", 666 | "2.74 13\n", 667 | "2.92 11\n", 668 | "3.09 10\n", 669 | "2.06 9\n", 670 | "3.26 7\n", 671 | "4.29 7\n", 672 | "2.57 7\n", 673 | "3.77 6\n", 674 | "5.15 6\n", 675 | "4.12 6\n", 676 | "3.60 6\n", 677 | "4.63 5\n", 678 | "5.83 4\n", 679 | "5.66 3\n", 680 | "5.49 3\n", 681 | "4.80 3\n", 682 | "8.06 3\n", 683 | "4.97 3\n", 684 | "6.35 2\n", 685 | "6.86 2\n", 686 | "5.32 2\n", 687 | "6.17 2\n", 688 | "7.03 1\n", 689 | "17.50 1\n", 690 | "10.81 1\n", 691 | "16.12 1\n", 692 | "4.46 1\n", 693 | "6.69 1\n", 694 | "6.00 1\n", 695 | "14.07 1\n", 696 | "10.12 1\n", 697 | "32.59 1\n", 698 | "8.40 1\n", 699 | "10.46 1\n", 700 | "13.89 1\n", 701 | "11.84 1\n", 702 | "54.55 1\n", 703 | "14.75 1\n", 704 | "11.15 1\n", 705 | "20.41 1\n", 706 | "9.43 1\n", 707 | "6.52 1\n", 708 | "23.67 1\n", 709 | "29.33 1\n", 710 | "12.01 1\n", 711 | "8.23 1\n", 712 | "19.38 1\n", 713 | "19.90 1\n", 714 | "7.38 1\n", 715 | "7.55 1\n", 716 | "22.30 1\n", 717 | "9.61 1\n", 718 | "7.72 1\n", 719 | "9.95 1\n", 720 | "12.86 1\n", 721 | "Name: wirls_ln_cnt, dtype: int64" 722 | ] 723 | }, 724 | "execution_count": 53, 725 | "metadata": {}, 726 | "output_type": "execute_result" 727 | } 728 | ], 729 | "source": [ 730 | "c5_.wirls_ln_cnt.value_counts()" 731 | ] 732 | }, 733 | { 734 | "cell_type": "code", 735 | "execution_count": null, 736 | "metadata": { 737 | "collapsed": true 738 | }, 739 | "outputs": [], 740 | "source": [] 741 | } 742 | ], 743 | "metadata": { 744 | "kernelspec": { 745 | "display_name": "Python 3", 746 | "language": "python", 747 | "name": "python3" 748 | }, 749 | "language_info": { 750 | "codemirror_mode": { 751 | "name": "ipython", 752 | "version": 3 753 | }, 754 | "file_extension": ".py", 755 | "mimetype": "text/x-python", 756 | "name": "python", 757 | "nbconvert_exporter": "python", 758 | "pygments_lexer": "ipython3", 759 | "version": "3.6.0" 760 | } 761 | }, 762 | "nbformat": 4, 763 | "nbformat_minor": 1 764 | } 765 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ML_CodeBase_Python 2 | Code snippets for ML in python 3 | -------------------------------------------------------------------------------- /basic_io: -------------------------------------------------------------------------------- 1 | import pandas as pd, numpy as np, csv, sys, json, pickle, re 2 | from sklearn.model_selection import train_test_split #For stratified sampling 3 | 4 | from PARAMETERS_global import * 5 | 6 | def read_encoder(encoder_type = 'label/le', variable_name = 'default'): 7 | '''loads classes from an encoder''' 8 | return pickle.load(open(path_data_output + '/encoders/' + encoder_type + '_' + variable_name + '.pkl','rb')) 9 | 10 | def save_csv(df, path, index = False, compression = None): 11 | df.to_csv(path, index = index, compression = compression) 12 | return 13 | 14 | def write_encoder(encoder, encoder_type = 'label/le', variable_name = 'default'): 15 | '''Takes the encoder and saves it as .pkl file''' 16 | with open(path_data_output + '/encoders/' + encoder_type + '_' + variable_name + '.pkl', 'wb') as outfile: 17 | pickle.dump(encoder,outfile) 18 | return 19 | 20 | def intersection(list_a, list_b): 21 | return list(set(list_a).intersection(set(list_b))) 22 | 23 | def difference(list_a, list_b): 24 | # alternate implementatiion - [x for x in list_a if x not in list_b] 25 | return list(set(list_a).difference(set(list_b))) 26 | 27 | def union(list_a, list_b): 28 | # alternate 
-------------------------------------------------------------------------------- /eda_helpers.py: -------------------------------------------------------------------------------- 1 | import pandas as pd, csv, scipy.stats as ss, seaborn as sns, numpy as np, sys 2 | sys.path.insert(0,'/ds/data/ve2/dtl/aixx/phi/no_gbd/r000/work/pocep_ds/pocep_ds') 3 | from config.PARAMETERS_global import * 4 | from utils.BASIC_input_output import * 5 | 6 | def select_dtype(df, dtypes): 7 | df = df.select_dtypes(include=dtypes) 8 | return df 9 | 10 | def remove_dtype(df, dtypes): 11 | df = df.select_dtypes(exclude=dtypes) 12 | return df 13 | 14 | def df_correlation(df, columns): 15 | for col in columns: 16 | print(col) 17 | tmp = df[columns].apply(lambda x: x.corr(df[col])) 18 | #print(tmp) 19 | print(tmp.loc[tmp >= 0.5]) 20 | print("Processing ended for column: {}".format(col)) 21 | return 22 | 23 | def df_skewed(df, columns): 24 | for col in columns: 25 | print(col) 26 | tmp = df[col].value_counts().head(20)/len(df) 27 | print(tmp.loc[tmp >= 0.8]) 28 | #print("Processing ended for column: {}".format(col)) 29 | return 30 | 31 | def df_unique(df, columns): 32 | for col in columns: 33 | print(col, len(df[col].unique())) 34 | return 35 | 36 | def df_outlier(df, columns): 37 | for col in columns: 38 | print(col) 39 | tmp = df[col].value_counts().tail(20)/len(df) 40 | print(tmp.loc[tmp <= 0.01]) 41 | #print("Processing ended for column: {}".format(col)) 42 | return 43 | 44 | def generate_temporal_vars(df, cols): 45 | for col in cols: 46 | df[col] = df[col].astype('datetime64') 47 | df['fe_' + str(col)+'_weekday'] = df[col].dt.weekday 48 | df['fe_' + str(col)+'_month'] = df[col].dt.month 49 | df['fe_' + str(col)+'_year'] = df[col].dt.year 50 | df['fe_' + str(col)+'_day'] = df[col].dt.day 51 | return df 52 | 53 | def df_count_nan(df): 54 | tmp = df.apply(lambda x : x.isnull().sum(axis=0)) 55 | print(tmp.loc[tmp > 0]) 56 | return 57 | 58 | def df_strip_values(df): 59 | object_cols = select_dtype(df, ['object']).columns.values.tolist() 60 | df[object_cols] = df[object_cols].apply(lambda x : x.str.strip()) 61 | return df 62 | 63 | def df_number_stats(df): 64 | df_int = select_dtype(df, ['int64', 'float64']) 65 | for col in df_int: 66 | print(col) 67 | print(round(df_int[col].describe(),2)) 68 | return 69 | 70 | def df_object_describe(df): 71 | df_obj = select_dtype(df, ['object']) 72 | for col in df_obj: 73 | print(col) 74 | print((df_obj[col].describe())) 75 | return 76 | 77 | ''' 78 | Cramer's V method to calculate categorical correlation 79 | ''' 80 | def cramers_v(x, y): 81 | confusion_matrix = pd.crosstab(x,y) 82 | chi2 = ss.chi2_contingency(confusion_matrix)[0] 83 | n = confusion_matrix.sum().sum() 84 | phi2 = chi2/n 85 | r,k = confusion_matrix.shape 86 | phi2corr = max(0, phi2-((k-1)*(r-1))/(n-1)) 87 | rcorr = r-((r-1)**2)/(n-1) 88 | kcorr = k-((k-1)**2)/(n-1) 89 | return np.sqrt(phi2corr/min((kcorr-1),(rcorr-1))) 90 |
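# A quick, hedged illustration of cramers_v on a toy frame (column names invented).
# The two columns below are perfectly associated; on a sample this small the bias
# correction above keeps the statistic below 1, but it still sits far above an
# unrelated pair.
# >>> toy = pd.DataFrame({'color': ['r', 'r', 'r', 'b', 'b', 'b'],
# ...                     'size': ['S', 'S', 'S', 'L', 'L', 'L']})
# >>> cramers_v(toy['color'], toy['size'])   # value in [0, 1]; higher = stronger association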
91 | ''' 92 | Correlation among categorical variables 93 | ''' 94 | def categorical_correlation(df, exceptions): 95 | df_obj = select_dtype(df, ['object']) 96 | object_cols = df_obj.columns.values.tolist() 97 | object_cols = difference(object_cols,exceptions) 98 | corr = {} 99 | for col1 in object_cols: 100 | for col2 in object_cols: 101 | try: 102 | correlation = cramers_v(df[col1],df[col2]) 103 | except ValueError: 104 | print("Value error occurred for columns {} and {}".format(col1, col2)); continue # skip pairs that cannot be tabulated 105 | corr[str(col1) + "-" + str(col2)] = correlation 106 | if((col1!=col2) & (correlation >= 0.5)): 107 | print(col1, col2, corr[str(col1) + "-" + str(col2)]) 108 | return 109 | 110 | ''' 111 | Correlation of categorical features with a categorical target 112 | ''' 113 | def categorical_correlation_w_target(df, exceptions, target = target_aa): 114 | df_obj = select_dtype(df, ['object']) 115 | object_cols = df_obj.columns.values.tolist() 116 | object_cols = difference(object_cols,exceptions) 117 | for col in object_cols: 118 | print(col) 119 | print(cramers_v(df[col],df[target])) 120 | return 121 | 122 | ''' 123 | Returns a df where modifier code is not * and does not match procedure modifier code 124 | ''' 125 | def modifier_analysis(df, var1, var2): 126 | df_temp = df.loc[df[var1] != df[var2]][[var1,var2]] 127 | df_temp['combined'] = df_temp[var1] + df_temp[var2] 128 | df_temp_ = df_temp.loc[df_temp[var2] != '* '] 129 | print(len(df_temp_), len(df_temp)) 130 | return df_temp_ 131 |
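A hedged sketch of how these helpers chain together in a first EDA pass. The frame name, file name, and key column below are illustrative stand-ins, not names from this repo:

claims = pd.read_csv('merged_claims.csv')                  # hypothetical merged extract
claims = df_strip_values(claims)                           # trim stray whitespace in object columns
df_count_nan(claims)                                       # which columns carry nulls, and how many
df_number_stats(claims)                                    # describe() for each numeric column
categorical_correlation(claims, exceptions=['claim_key'])  # prints pairs with Cramer's V >= 0.5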
random_state = 42) 80 | 81 | 82 | # In[12]: 83 | 84 | 85 | merge = merge_.copy(deep = True) 86 | 87 | 88 | # In[13]: 89 | 90 | 91 | merge['binned_DTL_LINE_NBR'] = pd.cut(x = merge['DTL_LINE_NBR'], bins = [0,1,2,3,4,50]) 92 | 93 | keys = merge['binned_DTL_LINE_NBR'].value_counts().sort_index().index.tolist() # sort by bin edge so the labels below line up with the intervals 94 | values = ['1','2','3','4','4+'] 95 | 96 | values_dict = dict(zip(keys, values)) 97 | merge['binned_DTL_LINE_NBR'] = merge['binned_DTL_LINE_NBR'].map(values_dict) 98 | 99 | 100 | # In[14]: 101 | 102 | 103 | merge.drop(columns = ['KEY_CHK_DCN_ITEM_CD', 'DTL_LINE_NBR'], inplace = True) 104 | 105 | 106 | # In[15]: 107 | 108 | 109 | get_ipython().run_cell_magic('time', '', 'X_train, X_test, y_train, y_test = train_test_split_ratio(merge, target)') 110 | 111 | 112 | # In[16]: 113 | 114 | 115 | print(X_train.shape, X_test.shape) 116 | 117 | 118 | # In[17]: 119 | 120 | 121 | model_name 122 | 123 | 124 | # ###### OHE Encoding 125 | 126 | # In[18]: 127 | 128 | 129 | key_ = key_dtl if(model=='aa') else key_hdr 130 | dtypes = {**dict(dtl_dtypes), **dict(hdr_dtypes)} # dict.update() returns None, so merge the two dtype maps instead 131 | thresh = 0.001 #all the values which are less than size*thresh will be dropped 132 | train = pd.concat([X_train, y_train], axis = 1) 133 | test = pd.concat([X_test, y_test], axis = 1) 134 | 135 | 136 | # In[19]: 137 | 138 | 139 | print(train.shape, test.shape) 140 | 141 | 142 | # In[20]: 143 | 144 | 145 | path = path_data_output + '/encoders/ohe/' + model_name 146 | path_train = path + '/train' 147 | path_test = path + '/test' 148 | path_pre_process_train = model_name + '/train/pre_process' 149 | path_pre_process_test = model_name + '/test/pre_process' 150 | 151 | 152 | # In[21]: 153 | 154 | 155 | ##Create folder structure for new model 156 | try: 157 | os.mkdir(path) 158 | os.mkdir(path + '/train') 159 | os.mkdir(path + '/test') 160 | os.mkdir(path + '/train/pre_process') 161 | os.mkdir(path + '/test/pre_process') 162 | except FileExistsError: 163 | print('Directory already present') 164 | else: 165 | print('Directory created') 166 | 167 | 168 | # Train Processing 169 | 170 | # In[22]: 171 | 172 | 173 | get_ipython().run_cell_magic('time', '', "n_claims = train.shape[0] # Number of rows\ntrain['row'] = (range(n_claims)) # add a row column with s.no.") 174 | 175 | 176 | # In[23]: 177 | 178 | 179 | get_ipython().run_cell_magic('time', '', "remove_num_cols = [key_common[0], target, 'row','HCFA_PT_CD','PAT_MBR_CD'] # -- Exceptions for numeric processing\n# remove_num_cols = [key_common[0], target, 'row', 'dtl_fe_month_MBR_CNTRCT_END_DT','dtl_fe_month_MBR_CNTRCT_EFCTV_DT',\n# 'dtl_fe_month_SRVC_FROM_DT','hdr_fe_year_ILNS_ONSET_DT','dtl_fe_month_CLM_CMPLTN_DT',\n# 'hdr_fe_month_PAT_BRTH_DT','hdr_fe_year_PAT_BRTH_DT','dtl_fe_year_MBR_CNTRCT_EFCTV_DT',\n# 'hdr_fe_month_SRVC_FROM_DT','dtl_fe_year_CLM_CMPLTN_DT','HCFA_PT_CD','hdr_fe_month_SRVC_THRU_DT',\n# 'dtl_fe_month_SRVC_TO_DT','dtl_fe_year_MBR_CNTRCT_END_DT','dtl_fe_year_SRVC_TO_DT',\n# 'hdr_fe_year_SRVC_THRU_DT','dtl_fe_year_SRVC_FROM_DT','hdr_fe_year_SRVC_FROM_DT','PAT_MBR_CD',\n# 'hdr_fe_year_CLM_CMPLTN_DT','hdr_fe_month_CLM_CMPLTN_DT','hdr_fe_month_ILNS_ONSET_DT'] # -- Exceptions for numeric processing\nnum_processed, numeric_cols = process_numerical(train, ['row'], dtypes, remove_num_cols)\ntarget_processed = process_target(train, ['row'], target)") 180 | 181 | 182 | # In[24]: 183 | 184 | 185 | get_ipython().run_cell_magic('time', '', "columns_to_remove = [key_common[0], target, 'row']\ncolumns_to_remove = columns_to_remove + numeric_cols + [target]\ncat_size = pre_process_categorical(train, ['row'], 
path_pre_process_train , dtypes, columns_to_remove)") 186 | 187 | 188 | # In[25]: 189 | 190 | 191 | get_ipython().run_cell_magic('time', '', "cat_processed = process_categorical(train, cat_size, n_claims*thresh, path_train + '/pre_process', dtypes, columns_to_remove)") 192 | 193 | 194 | # In[26]: 195 | 196 | 197 | get_ipython().run_cell_magic('time', '', "merge_to_model_ready(target_processed, num_processed, cat_processed, path_train, 'row')") 198 | 199 | 200 | # In[27]: 201 | 202 | 203 | train_X_path = '/ds/data/ve2/dtl/aixx/phi/no_gbd/r000/work/pocep_ds/pocep_ds/data/output/encoders/ohe/'+model_name+'/train/X.npz' 204 | df_train = sparse.load_npz(train_X_path) 205 | df_train.shape 206 | 207 | 208 | # In[28]: 209 | 210 | 211 | #Select the dataset to delete 212 | delete = 'train' 213 | 214 | 215 | # In[29]: 216 | 217 | 218 | #Remove pre_process files and directory 219 | files = glob.glob(path + '/' + delete + '/pre_process/ohe_*.pkl') 220 | for f in files: 221 | os.remove(f) 222 | os.rmdir(path + '/' + delete + '/pre_process/') 223 | 224 | #Remove all other files 225 | # files = glob.glob(path + '/' + delete + '/*') 226 | # for f in files: 227 | # os.remove(f) 228 | # os.rmdir(path + '/' + delete) 229 | 230 | 231 | # Test Processing 232 | 233 | # In[30]: 234 | 235 | 236 | get_ipython().run_cell_magic('time', '', "#test.drop_duplicates(subset = key_, inplace = True) # df which has key_hdr values have been de-duplicated\nn_claims = test.shape[0] # Number of rows\ntest['row'] = (range(n_claims)) # add a row column with s.no.") 237 | 238 | 239 | # In[31]: 240 | 241 | 242 | get_ipython().run_cell_magic('time', '', "num_processed, numeric_cols = process_numerical(test, ['row'], dtypes, remove_num_cols)\ntarget_processed = process_target(test, ['row'], target)") 243 | 244 | 245 | # In[32]: 246 | 247 | 248 | get_ipython().run_cell_magic('time', '', "col_dict = pd.read_csv(path_train + '/col_dict.csv')\ncat_size = pre_process_categorical_test(test, ['row'], path_pre_process_test , dtypes, columns_to_remove, col_dict)") 249 | 250 | 251 | # In[33]: 252 | 253 | 254 | get_ipython().run_cell_magic('time', '', "cat_processed = process_categorical(test,cat_size, n_claims*thresh, path_test + '/pre_process', dtypes, columns_to_remove)") 255 | 256 | 257 | # In[34]: 258 | 259 | 260 | get_ipython().run_cell_magic('time', '', "merge_to_model_ready_test(target_processed, num_processed, cat_processed, path_test, path_train, 'row')") 261 | 262 | 263 | # In[35]: 264 | 265 | 266 | test_X_path = '/ds/data/ve2/dtl/aixx/phi/no_gbd/r000/work/pocep_ds/pocep_ds/data/output/encoders/ohe/'+model_name+'/test/X.npz' 267 | df_test = sparse.load_npz(test_X_path) 268 | df_test.shape 269 | 270 | 271 | # In[36]: 272 | 273 | 274 | #Select the dataset to delete 275 | delete = 'test' 276 | 277 | 278 | # In[37]: 279 | 280 | 281 | #Remove pre_process files and directory 282 | files = glob.glob(path + '/' + delete + '/pre_process/ohe_*.pkl') 283 | for f in files: 284 | os.remove(f) 285 | os.rmdir(path + '/' + delete + '/pre_process/') 286 | 287 | #Remove all other files 288 | # files = glob.glob(path + '/' + delete + '/*') 289 | # for f in files: 290 | # os.remove(f) 291 | # os.rmdir(path + '/' + delete) 292 | 293 | 294 | # In[41]: 295 | 296 | 297 | #Remove main directory 298 | # os.rmdir(path) 299 | 300 | 301 | # In[96]: 302 | 303 | 304 | 305 | 306 | 307 | # In[ ]: 308 | 309 | 310 | 311 | 312 | 313 | # ###### Label Encoding 314 | 315 | # In[9]: 316 | 317 | 318 | thresh = 0.0001 #determines the count for 'rare' classification 319 | 320 | 
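# label_encoding_fit / label_encoding_transform come from utils.FUNCTIONS_feature_encoding,
# which is not included in this dump. A minimal sketch of the assumed pattern -- bucket
# values rarer than len(X)*thresh into a single level, then fit one LabelEncoder per object
# column (names and behaviour here are illustrative assumptions, not the project code):
#
#     from sklearn.preprocessing import LabelEncoder
#
#     def label_encoding_fit_sketch(X, thresh):
#         encoders = {}
#         for col in X.select_dtypes(include=['object']).columns:
#             counts = X[col].value_counts()
#             rare = set(counts[counts < len(X) * thresh].index)   # values below the rare cutoff
#             col_vals = X[col].where(~X[col].isin(rare), 'rare')  # collapse them to one token
#             encoders[col] = (rare, LabelEncoder().fit(col_vals))
#         return encoders
#
# The matching transform would map rare/unseen values to the 'rare' token before calling
# encoders[col][1].transform(...).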
321 | # In[10]: 322 | 323 | 324 | get_ipython().run_cell_magic('time', '', 'label_encoding_fit(X_train, thresh)') 325 | 326 | 327 | # In[11]: 328 | 329 | 330 | get_ipython().run_cell_magic('time', '', 'X_train_ = label_encoding_transform(X_train)') 331 | 332 | 333 | # In[12]: 334 | 335 | 336 | get_ipython().run_cell_magic('time', '', 'X_test_ = label_encoding_transform(X_test)') 337 | 338 | 339 | # ###### Target Encoding 340 | 341 | # In[7]: 342 | 343 | 344 | get_ipython().run_cell_magic('time', '', 'target_encoding_fit(X_train, y_train, model)') 345 | 346 | 347 | # In[6]: 348 | 349 | 350 | get_ipython().run_cell_magic('time', '', 'X_train_ = target_encoding_transform(X_train, model)') 351 | 352 | 353 | # In[7]: 354 | 355 | 356 | get_ipython().run_cell_magic('time', '', 'X_test_ = target_encoding_transform(X_test, model)') 357 | 358 | 359 | # In[10]: 360 | 361 | 362 | model_ready = path_data_model_ready + '/' + model_name + '/' 363 | save_csv(X_train_, model_ready + 'target_encoded_X_train.csv.gz', compression = 'gzip') 364 | save_csv(X_test_, model_ready + 'target_encoded_X_test.csv.gz', compression = 'gzip') 365 | save_csv(y_train, model_ready + 'target_encoded_y_train.csv.gz', compression = 'gzip') 366 | save_csv(y_test, model_ready + 'target_encoded_y_test.csv.gz', compression = 'gzip') 367 | 368 | 369 | # In[ ]: 370 | 371 | 372 | 373 | 374 | 375 | # In[ ]: 376 | 377 | 378 | 379 | 380 | 381 | # In[ ]: 382 | 383 | 384 | 385 | 386 | 387 | # In[ ]: 388 | 389 | 390 | 391 | 392 | 393 | # In[ ]: 394 | 395 | 396 | def sparse_binning(df, columns, thresh): 397 | '''takes columns to bin using the threshold as the % of sparsity over which the variables will be binned''' 398 | for col in columns: 399 | sparse_name = str(col) + str('_sparse') 400 | value_counts = df[col].value_counts()*100/len(df) 401 | values = value_counts.loc[value_counts < thresh].index.tolist() 402 | df[col] = df[col].replace(values, sparse_name) # assumed completion of the truncated original: bucket levels rarer than thresh% into one sparse value 403 | return df 404 | -------------------------------------------------------------------------------- /encoding.py: -------------------------------------------------------------------------------- 150 | print(len(keep_err.unique()), " variables processed") 151 | df_col = df_col[df_col['VAR'].isin(keep_err)] 152 | #write_encoder(df_num, 'ohe/cat_processed', '') 153 | return df_col 154 | 155 | def merge_to_model_ready(target, num, cat, path, key): 156 | #target = pd.read_pickle(path + 'target_processed.pkl') 157 | #num = pd.read_pickle(path + 'num_processed.pkl') 158 | #cat = pd.read_pickle(path + 'cat_processed.pkl') 159 | 160 | model_data = target[key] 161 | n_claims = model_data.shape[0] 162 | model_data = pd.merge(model_data, cat, on=key, how='left') 163 | model_data.fillna('No_VAR', inplace=True) 164 | n = model_data.shape[0] 165 | col_dict = model_data[['VAR']].drop_duplicates().sort_values(by=['VAR']) 166 | n_vars = col_dict.shape[0] 167 | col_dict['col'] = (range(n_vars)) 168 | model_data = pd.merge(model_data, col_dict, on='VAR', how='left') 169 | vals = np.ones(n, dtype=float) 170 | rows = model_data['row'] 171 | cols = model_data['col'] 172 | 173 | for name in num.columns.values.tolist(): 174 | if name != 'row': 175 | vals = np.concatenate((vals, num[name])) 176 | rows = np.concatenate((rows, num['row'])) 177 | cols = np.concatenate((cols, n_vars*np.ones(len(num[name]), dtype=int))) 178 | new_col = pd.DataFrame({'VAR': [name], 'col': [n_vars]}) 179 | col_dict = pd.concat([col_dict, new_col]) 180 | n_vars = n_vars + 1 181 | 182 | X = sparse.csr_matrix((vals, (rows, cols)), shape=(n_claims, n_vars)) 183 | 184 | sparse.save_npz(path + "/X.npz", X) 185 | save_csv(target, path + '/Y.csv') 186 | save_csv(col_dict, path + '/col_dict.csv') 187 | return 188 | 189 | def merge_to_model_ready_test(target, num, cat, path_test, path_train, key): 190 | 
#target = pd.read_pickle(path + 'target_processed.pkl') 191 | #num = pd.read_pickle(path + 'num_processed.pkl') 192 | #cat = pd.read_pickle(path + 'cat_processed.pkl') 193 | 194 | model_data = target[key] 195 | n_claims = model_data.shape[0] 196 | model_data = pd.merge(model_data, cat, on=key, how='left') 197 | model_data.fillna('No_VAR', inplace=True) 198 | ## Variable differences 199 | col_dict_train = pd.read_csv(path_train + '/col_dict.csv') 200 | vars_train = col_dict_train.VAR.unique() 201 | vars_test = model_data.VAR.unique() 202 | vars_not_in_train = difference(vars_test,vars_train) 203 | vars_not_in_test = difference(vars_train,vars_test) 204 | vars_not_in_test_and_num = difference(vars_not_in_test,num.columns.values.tolist()) 205 | 206 | ## Remove variables present in test but not in train 207 | size = model_data.shape[0] 208 | model_data = model_data.loc[~model_data.VAR.isin(vars_not_in_train)] 209 | print("{} rows removed for variables which were present in test but not in train". 210 | format(size-model_data.shape[0])) 211 | 212 | # Add variables present in train but not in test ## DEFER 213 | model_data = model_data.append(pd.DataFrame({'row':[model_data.row.max()+1]*len(vars_not_in_test_and_num), 'VAR':vars_not_in_test_and_num})) 214 | 215 | col_dict = model_data[['VAR']].drop_duplicates().sort_values(by=['VAR']) 216 | n_vars = col_dict.shape[0] 217 | col_dict['col'] = (range(n_vars)) 218 | model_data = pd.merge(model_data, col_dict, on='VAR', how='left') 219 | n = model_data.shape[0] 220 | vals = np.ones(n, dtype=float) 221 | rows = model_data['row'] 222 | cols = model_data['col'] 223 | 224 | for name in num.columns.values.tolist(): 225 | if name != 'row': 226 | vals = np.concatenate((vals, num[name])) 227 | rows = np.concatenate((rows, num['row'])) 228 | cols = np.concatenate((cols, n_vars*np.ones(len(num[name]), dtype=int))) 229 | new_col = pd.DataFrame({'VAR': [name], 'col': [n_vars]}) 230 | col_dict = pd.concat([col_dict, new_col]) 231 | n_vars = n_vars + 1 232 | 233 | n_claims += 1 234 | 235 | X = sparse.csr_matrix((vals, (rows, cols)), shape=(n_claims, n_vars)) 236 | sparse.save_npz(path_test + "/X.npz", X) 237 | save_csv(target, path_test + '/Y.csv') 238 | save_csv(col_dict, path_test + '/col_dict.csv') 239 | return 240 | -------------------------------------------------------------------------------- /linux.txt: -------------------------------------------------------------------------------- 1 | awk < R4826-201801_RCMND_SPSH2.txt '{print $98}' | sort | uniq | wc -l 2 | head -1 R4826-201801_RCMND_SPSH.txt | tr '|' '\n' | cat -n | grep "rcmnd_prtf_cd" 3 | awk -F "|" '{ if(($145 == "N")||($145 == "pmt_arng_ind")) { print } }' R4826-201801_RCMND_SPSH.txt > R4826-201801_RCMND_SPSH_n.txt 4 | awk -F '|' '{print $98}' R4826-201801_RCMND_SPSH.txt | sort | uniq -c 5 | while read p; do head -1 R4826-201801_RCMND_SPSH2.txt | tr '|' '\n' | cat -n | grep "$p"; done < cols.txt 6 | wc -l 7 | -------------------------------------------------------------------------------- /models_run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | get_ipython().run_line_magic('load_ext', 'autoreload') 8 | get_ipython().run_line_magic('autoreload', '2') 9 | 10 | import pandas as pd, numpy as np, sys, os 11 | from sklearn import metrics 12 | from scipy import sparse 13 | from sklearn.metrics import r2_score,mean_squared_error 14 | from sklearn.model_selection import RandomizedSearchCV 15 | 
import csv, warnings, time, pickle 16 | 17 | sys.path.insert(0,'/ds/data/ve2/dtl/aixx/phi/no_gbd/r000/work/pocep_ds/pocep_ds') 18 | sys.path.insert(0,'/ds/data/ve2/dtl/aixx/phi/no_gbd/r000/work/pocep_ds/pocep_ds/code') 19 | sys.path.insert(0,'/ds/data/ve2/dtl/aixx/phi/no_gbd/r000/work/pocep_ds/pocep_ds/code/models') 20 | 21 | from config.PARAMETERS_global import * 22 | from config.LOOKUP_objects import * 23 | from utils.BASIC_input_output import * 24 | from utils.MODEL_basics import * 25 | 26 | warnings.filterwarnings('ignore') 27 | pd.options.display.max_rows = 100 28 | 29 | 30 | # In[2]: 31 | 32 | 33 | get_ipython().run_line_magic('reload_ext', 'autoreload') 34 | 35 | 36 | # In[3]: 37 | 38 | 39 | model_name = 'model_aapm_basic_subset_ap_prof_claims_1MM_num_cap_y_dates_n_binning_n_fe_proc_cd_grp_nbr' 40 | 41 | 42 | # In[4]: 43 | 44 | 45 | # model_name = model_aapm_version ## This will be the name of the folder 46 | path = path_data_output + '/encoders/ohe/' + model_name + '/' 47 | path_train = path + 'train/' 48 | path_test = path + 'test/' 49 | X_train = sparse.load_npz(path_train + 'X.npz') 50 | y_train = pd.read_csv(path_train + 'Y.csv')[target_aa] 51 | X_test = sparse.load_npz(path_test + 'X.npz') 52 | y_test = pd.read_csv(path_test + 'Y.csv')[target_aa] 53 | col_dict = pd.read_csv(path_train + 'col_dict.csv') 54 | 55 | 56 | # In[5]: 57 | 58 | 59 | print(X_test.shape, y_test.shape) 60 | X_test_ = X_test[:len(y_test)] #remove last row 61 | X_train.shape[0] + X_test_.shape[0] 62 | 63 | 64 | # #### Linear Regression 65 | 66 | # In[6]: 67 | 68 | 69 | from sklearn.linear_model import LinearRegression 70 | 71 | 72 | # In[7]: 73 | 74 | 75 | get_ipython().run_cell_magic('time', '', 'reg = LinearRegression().fit(X_train, y_train)') 76 | 77 | 78 | # In[8]: 79 | 80 | 81 | get_ipython().run_cell_magic('time', '', 'test_pred = reg.predict(X_test_)\ntrain_pred = reg.predict(X_train)') 82 | 83 | 84 | # In[9]: 85 | 86 | 87 | get_ipython().run_cell_magic('time', '', 'calculate_performance(y_test, test_pred)\ncalculate_performance(y_train, train_pred) ') 88 | 89 | 90 | # In[10]: 91 | 92 | 93 | calculate_performance_for_hypo_testing(y_test, test_pred) 94 | generate_deviation_stats_for_hypo_testing(test_pred, y_test) 95 | 96 | 97 | # In[121]: 98 | 99 | 100 | generate_deviation_stats(train_pred, y_train) 101 | 102 | 103 | # In[122]: 104 | 105 | 106 | generate_deviation_stats(test_pred, y_test) 107 | 108 | 109 | # In[123]: 110 | 111 | 112 | for i,col in enumerate(col_dict.VAR): 113 | print(col,'{0:.4f}'.format(reg.coef_[i])) 114 | 115 | 116 | # In[124]: 117 | 118 | 119 | calculate_performance_for_hypo_testing(y_test, test_pred) 120 | generate_deviation_stats_for_hypo_testing(test_pred, y_test) 121 | 122 | 123 | # In[87]: 124 | 125 | 126 | calculate_performance_for_hypo_testing(y_test, test_pred) 127 | generate_deviation_stats_for_hypo_testing(test_pred, y_test) 128 | 129 | 130 | # In[40]: 131 | 132 | 133 | calculate_performance_for_hypo_testing(y_test, test_pred) 134 | generate_deviation_stats_for_hypo_testing(test_pred, y_test) 135 | 136 | 137 | # #### GBM 138 | 139 | # In[11]: 140 | 141 | 142 | from sklearn.ensemble import GradientBoostingRegressor 143 | 144 | 145 | # In[28]: 146 | 147 | 148 | gbr = GradientBoostingRegressor(n_estimators = 50, max_depth=7, learning_rate = 0.1, max_features = 'sqrt', random_state=42) 149 | 150 | 151 | # In[29]: 152 | 153 | 154 | get_ipython().run_cell_magic('time', '', 'gbr.fit(X_train, y_train)') 155 | 156 | 157 | # In[30]: 158 | 159 | 160 | preds_train = 
gbr.predict(X_train) 161 | rmse_train = np.sqrt(mean_squared_error(y_train, preds_train)) 162 | r2_train = r2_score(y_train, preds_train) 163 | preds_test = gbr.predict(X_test_) 164 | rmse_test = np.sqrt(mean_squared_error(y_test, preds_test)) 165 | r2_test = r2_score(y_test, preds_test) 166 | 167 | 168 | # In[31]: 169 | 170 | 171 | calculate_performance(y_test, preds_test) 172 | calculate_performance(y_train, preds_train) 173 | 174 | 175 | # In[32]: 176 | 177 | 178 | generate_deviation_stats(preds_train, y_train) 179 | 180 | 181 | # In[33]: 182 | 183 | 184 | generate_deviation_stats(preds_test, y_test) 185 | 186 | 187 | # In[35]: 188 | 189 | 190 | for i,col in enumerate(col_dict.VAR): 191 | print(col,'{0:.4f}'.format(gbr.feature_importances_[i])) 192 | 193 | 194 | # In[20]: 195 | 196 | 197 | top_10_features(gbr) 198 | 199 | 200 | # In[ ]: 201 | 202 | 203 | top_10_features(gbr) 204 | 205 | 206 | # In[34]: 207 | 208 | 209 | calculate_performance_for_hypo_testing(y_test, preds_test) 210 | generate_deviation_stats_for_hypo_testing(preds_test, y_test) 211 | 212 | 213 | # In[16]: 214 | 215 | 216 | calculate_performance_for_hypo_testing(y_test, preds_test) 217 | generate_deviation_stats_for_hypo_testing(preds_test, y_test) 218 | 219 | 220 | # In[133]: 221 | 222 | 223 | calculate_performance_for_hypo_testing(y_test, preds_test) 224 | generate_deviation_stats_for_hypo_testing(preds_test, y_test) 225 | 226 | 227 | # In[71]: 228 | 229 | 230 | calculate_performance_for_hypo_testing(y_test, preds_test) 231 | generate_deviation_stats_for_hypo_testing(preds_test, y_test) 232 | 233 | 234 | # In[51]: 235 | 236 | 237 | calculate_performance_for_hypo_testing(y_test, preds_test) 238 | generate_deviation_stats_for_hypo_testing(preds_test, y_test) 239 | 240 | 241 | # #### GBM - 2 (Hyperparameters changed) 242 | 243 | # In[11]: 244 | 245 | 246 | from sklearn.ensemble import GradientBoostingRegressor 247 | 248 | 249 | # In[12]: 250 | 251 | 252 | gbr = GradientBoostingRegressor(n_estimators = 500, max_depth=7, learning_rate = 0.1, random_state=42) 253 | 254 | 255 | # In[13]: 256 | 257 | 258 | get_ipython().run_cell_magic('time', '', 'gbr.fit(X_train, y_train)') 259 | 260 | 261 | # In[14]: 262 | 263 | 264 | preds_train = gbr.predict(X_train) 265 | rmse_train = np.sqrt(mean_squared_error(y_train, preds_train)) 266 | r2_train = r2_score(y_train, preds_train) 267 | preds_test = gbr.predict(X_test_) 268 | rmse_test = np.sqrt(mean_squared_error(y_test, preds_test)) 269 | r2_test = r2_score(y_test, preds_test) 270 | 271 | 272 | # In[15]: 273 | 274 | 275 | calculate_performance(y_test, preds_test) 276 | calculate_performance(y_train, preds_train) 277 | 278 | 279 | # In[16]: 280 | 281 | 282 | generate_deviation_stats(preds_train, y_train) 283 | 284 | 285 | # In[17]: 286 | 287 | 288 | generate_deviation_stats(preds_test, y_test) 289 | 290 | 291 | # In[18]: 292 | 293 | 294 | for i,col in enumerate(col_dict.VAR): 295 | print(col,'{0:.4f}'.format(gbr.feature_importances_[i])) 296 | 297 | 298 | # In[19]: 299 | 300 | 301 | top_10_features(gbr) 302 | 303 | 304 | # In[20]: 305 | 306 | 307 | calculate_performance_for_hypo_testing(y_test, preds_test) 308 | generate_deviation_stats_for_hypo_testing(preds_test, y_test) 309 | 310 | 311 | # ### XG Boost 312 | 313 | # In[6]: 314 | 315 | 316 | import xgboost 317 | from xgboost import plot_importance 318 | 319 | 320 | # In[16]: 321 | 322 | 323 | model = xgboost.XGBRegressor(colsample_bytree=0.4, 324 | gamma=0, 325 | learning_rate=0.09, 326 | max_depth=7, 327 | min_child_weight=1.5, 328 | 
n_estimators=10000, 329 | reg_alpha=0.75, 330 | reg_lambda=0.45, 331 | subsample=0.6, 332 | seed=42) 333 | 334 | 335 | # In[ ]: 336 | 337 | 338 | get_ipython().run_cell_magic('time', '', 'model.fit(X_train, y_train)') 339 | 340 | 341 | # In[ ]: 342 | 343 | 344 | get_ipython().run_cell_magic('time', '', 'preds_train = model.predict(X_train)\npreds_test = model.predict(X_test_)') 345 | 346 | 347 | # In[ ]: 348 | 349 | 350 | calculate_performance_for_hypo_testing(y_test, preds_test) 351 | generate_deviation_stats_for_hypo_testing(preds_test, y_test) 352 | 353 | 354 | # In[ ]: 355 | 356 | 357 | 358 | 359 | 360 | # In[27]: 361 | 362 | 363 | model2 = xgboost.XGBRegressor(colsample_bytree=0.4, 364 | gamma=0, 365 | learning_rate=0.07, 366 | max_depth=6, 367 | min_child_weight=1, 368 | n_estimators=10000, 369 | reg_alpha=0.75, 370 | reg_lambda=0.45, 371 | subsample=0.6, 372 | seed=42) 373 | 374 | 375 | # In[28]: 376 | 377 | 378 | get_ipython().run_cell_magic('time', '', 'model2.fit(X_train, y_train)') 379 | 380 | 381 | # In[29]: 382 | 383 | 384 | preds_train = model2.predict(X_train) 385 | preds_test = model2.predict(X_test_) 386 | 387 | 388 | # In[30]: 389 | 390 | 391 | calculate_performance_for_hypo_testing(y_test, preds_test) 392 | generate_deviation_stats_for_hypo_testing(preds_test, y_test) 393 | 394 | 395 | # In[31]: 396 | 397 | 398 | model3 = xgboost.XGBRegressor(colsample_bytree=0.4, 399 | gamma=0, 400 | learning_rate=0.07, 401 | max_depth=10, 402 | min_child_weight=1.5, 403 | n_estimators=10000, 404 | reg_alpha=0.75, 405 | reg_lambda=0.45, 406 | subsample=0.6, 407 | seed=42) 408 | 409 | 410 | # In[32]: 411 | 412 | 413 | get_ipython().run_cell_magic('time', '', 'model3.fit(X_train, y_train)') 414 | 415 | 416 | # In[33]: 417 | 418 | 419 | preds_train = model3.predict(X_train) 420 | preds_test = model3.predict(X_test_) 421 | 422 | 423 | # In[34]: 424 | 425 | 426 | calculate_performance_for_hypo_testing(y_test, preds_test) 427 | generate_deviation_stats_for_hypo_testing(preds_test, y_test) 428 | 429 | 430 | # In[43]: 431 | 432 | 433 | pickle.dump(model3, open('xgboost_best_perf.pkl', 'wb')) 434 | 435 | 436 | # In[ ]: 437 | 438 | 439 | 440 | 441 | 442 | # In[37]: 443 | 444 | 445 | model4 = xgboost.XGBRegressor(colsample_bytree=0.4, 446 | gamma=1, 447 | learning_rate=0.07, 448 | max_depth=10, 449 | min_child_weight=1.5, 450 | n_estimators=10000, 451 | reg_alpha=0.75, 452 | reg_lambda=0.45, 453 | subsample=0.6, 454 | seed=42) 455 | 456 | 457 | # In[38]: 458 | 459 | 460 | get_ipython().run_cell_magic('time', '', 'model4.fit(X_train, y_train)') 461 | 462 | 463 | # In[39]: 464 | 465 | 466 | preds_train = model4.predict(X_train) 467 | preds_test = model4.predict(X_test_) 468 | 469 | 470 | # In[40]: 471 | 472 | 473 | calculate_performance_for_hypo_testing(y_test, preds_test) 474 | generate_deviation_stats_for_hypo_testing(preds_test, y_test) 475 | 476 | 477 | # In[41]: 478 | 479 | 480 | calculate_performance_for_hypo_testing(y_train, preds_train) 481 | generate_deviation_stats_for_hypo_testing(preds_train, y_train) 482 | 483 | 484 | # In[44]: 485 | 486 | 487 | 488 | 489 | 490 | # In[46]: 491 | 492 | 493 | feats = model4.feature_importances_ 494 | 495 | 496 | # In[50]: 497 | 498 | 499 | feats_df = pd.DataFrame(feats) 500 | 501 | 502 | # In[53]: 503 | 504 | 505 | feats_df.to_csv('feats.csv', index = False) 506 | 507 | 508 | # In[52]: 509 | 510 | 511 | feats_df['col'] = col_dict.VAR.tolist() 512 | 513 | 514 | # In[45]: 515 | 516 | 517 | for i,col in enumerate(col_dict.VAR): 518 | 
print(col,'{0:.4f}'.format(model4.feature_importances_[i])) 519 | 520 | 521 | # In[54]: 522 | 523 | 524 | feats_df['col'] 525 | 526 | 527 | # #### RF 528 | 529 | # In[34]: 530 | 531 | 532 | from sklearn.ensemble import RandomForestRegressor 533 | 534 | 535 | # In[63]: 536 | 537 | 538 | regr = RandomForestRegressor(n_estimators = 500, max_depth=5,max_features = 'auto', random_state=42) 539 | 540 | 541 | # In[64]: 542 | 543 | 544 | get_ipython().run_cell_magic('time', '', 'regr.fit(X_train, y_train)') 545 | 546 | 547 | # In[65]: 548 | 549 | 550 | preds_train = regr.predict(X_train) 551 | rmse_train = np.sqrt(mean_squared_error(y_train, preds_train)) 552 | r2_train = r2_score(y_train, preds_train) 553 | 554 | preds_test = regr.predict(X_test_) 555 | rmse_test = np.sqrt(mean_squared_error(y_test, preds_test)) 556 | r2_test = r2_score(y_test, preds_test) 557 | 558 | 559 | # In[66]: 560 | 561 | 562 | calculate_performance(y_test, preds_test) 563 | calculate_performance(y_train, preds_train) 564 | 565 | 566 | # In[67]: 567 | 568 | 569 | generate_deviation_stats(preds_train, y_train) 570 | 571 | 572 | # In[68]: 573 | 574 | 575 | generate_deviation_stats(preds_test, y_test) 576 | 577 | 578 | # In[27]: 579 | 580 | 581 | for i,col in enumerate(col_dict.VAR): 582 | print(col,'{0:.4f}'.format(regr.feature_importances_[i])) 583 | 584 | 585 | # #### Ridge 586 | 587 | # In[92]: 588 | 589 | 590 | from sklearn.linear_model import Ridge 591 | 592 | 593 | # In[99]: 594 | 595 | 596 | get_ipython().run_cell_magic('time', '', 'reg = Ridge(alpha=1).fit(X_train, y_train)') 597 | 598 | 599 | # In[100]: 600 | 601 | 602 | get_ipython().run_cell_magic('time', '', 'test_pred = reg.predict(X_test_)\ntrain_pred = reg.predict(X_train)') 603 | 604 | 605 | # In[101]: 606 | 607 | 608 | get_ipython().run_cell_magic('time', '', 'calculate_performance(y_test, test_pred)\ncalculate_performance(y_train, train_pred) ') 609 | 610 | 611 | # In[102]: 612 | 613 | 614 | for i,col in enumerate(col_dict.VAR): 615 | print(col,'{0:.4f}'.format(reg.coef_[i])) 616 | 617 | 618 | # In[103]: 619 | 620 | 621 | generate_deviation_stats(train_pred, y_train) 622 | 623 | 624 | # In[104]: 625 | 626 | 627 | generate_deviation_stats(test_pred, y_test) 628 | 629 | 630 | # In[ ]: 631 | 632 | 633 | 634 | 635 | 636 | # #### Lasso 637 | 638 | # In[105]: 639 | 640 | 641 | from sklearn.linear_model import Lasso 642 | 643 | 644 | # In[106]: 645 | 646 | 647 | get_ipython().run_cell_magic('time', '', 'reg = Lasso(alpha = 1).fit(X_train, y_train)') 648 | 649 | 650 | # In[107]: 651 | 652 | 653 | get_ipython().run_cell_magic('time', '', 'test_pred = reg.predict(X_test_)\ntrain_pred = reg.predict(X_train)') 654 | 655 | 656 | # In[108]: 657 | 658 | 659 | get_ipython().run_cell_magic('time', '', 'calculate_performance(y_test, test_pred)\ncalculate_performance(y_train, train_pred) ') 660 | 661 | 662 | # In[66]: 663 | 664 | 665 | for i,col in enumerate(col_dict.VAR): 666 | print(col,'{0:.4f}'.format(reg.coef_[i])) 667 | 668 | 669 | # In[109]: 670 | 671 | 672 | generate_deviation_stats(train_pred, y_train) 673 | 674 | 675 | # In[110]: 676 | 677 | 678 | generate_deviation_stats(test_pred, y_test) 679 | 680 | 681 | # In[12]: 682 | 683 | 684 | round(y_train.describe(),2) 685 | 686 | 687 | # In[ ]: 688 | 689 | 690 | 691 | 692 | -------------------------------------------------------------------------------- /regression_model_basics: -------------------------------------------------------------------------------- 1 | import numpy as np, pandas as pd 2 | from sklearn import metrics 
3 | from sklearn.metrics import r2_score,mean_squared_error, mean_absolute_error 4 | 5 | def mean_absolute_percentage_error(y_true, y_pred): 6 | y_true, y_pred = np.array(y_true), np.array(y_pred) 7 | return np.mean(np.abs((y_true - y_pred) / (y_true + 0.0000001))) * 100 8 | 9 | def calculate_performance(test_Y, test_pred): 10 | mse = mean_squared_error(test_Y, test_pred) 11 | rmse = np.sqrt(mse) 12 | print("RMSE: ", round(rmse,2)) 13 | print("R2: ", round(r2_score(test_Y, test_pred),2)) 14 | print('MAE: ', round(mean_absolute_error(test_Y, test_pred),2)) 15 | print('MAPE: ', round(mean_absolute_percentage_error(test_Y, test_pred),2)) 16 | return 17 | 18 | ## Calculate the deviation 19 | def generate_deviation_stats(pred, actual): 20 | df = pd.DataFrame(pred) 21 | df['actual'] = actual 22 | df.rename(columns = {0: 'pred'}, inplace = True) 23 | df['diff'] = abs(df['actual']- df['pred']) 24 | df['actual'] = df['actual'].replace(0,1) 25 | df['pct_deviation'] = round(df['diff']*100/df['actual'],4) 26 | #df['pct_deviation'].value_counts()/len(df) # no-op outside a notebook 27 | print('max percentage deviation {}%'.format(df['pct_deviation'].max())) 28 | deviations = [0,2,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100,150,200] 29 | len_deviations = [] 30 | for i in deviations: 31 | dev = df.loc[df['pct_deviation'] <= i] 32 | len_ = len(dev) 33 | len_deviations.append(len_) 34 | #for i,index in enumerate(deviations): 35 | # len_deviations[i+1] = len_deviations[i+1]-len_deviations[i] 36 | # if(i==len(deviations)-2): 37 | # break; 38 | for i,index in enumerate(deviations): 39 | print('{} % \tdeviation - \tcount: {}, \tpct {}%'.format(index,len_deviations[i],round(len_deviations[i]*100/len(df),2))) 40 | return 41 | 42 | #Remove claims with 0s 43 | def generate_deviation_stats_wo_0(pred, actual): 44 | df = pd.DataFrame(pred) 45 | df['actual'] = actual 46 | df.rename(columns = {0: 'pred'}, inplace = True) 47 | df['diff'] = abs(df['actual']- df['pred']) 48 | df = df.loc[df['actual'] != 0] 49 | #df['actual'] = df['actual'].replace(0,1) 50 | df['pct_deviation'] = round(df['diff']*100/df['actual'],4) 51 | #df['pct_deviation'].value_counts()/len(df) # no-op outside a notebook 52 | print('max percentage deviation {}%'.format(df['pct_deviation'].max())) 53 | deviations = [0,2,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100,150,200] 54 | len_deviations = [] 55 | for i in deviations: 56 | dev = df.loc[df['pct_deviation'] <= i] 57 | len_ = len(dev) 58 | len_deviations.append(len_) 59 | #for i,index in enumerate(deviations): 60 | # len_deviations[i+1] = len_deviations[i+1]-len_deviations[i] 61 | # if(i==len(deviations)-2): 62 | # break; 63 | for i,index in enumerate(deviations): 64 | print('{} % \tdeviation - \tcount: {}, \tpct {}%'.format(index,len_deviations[i],round(len_deviations[i]*100/len(df),2))) 65 | return 66 | --------------------------------------------------------------------------------
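A quick usage sketch for the helpers in regression_model_basics, assuming its definitions have been loaded into a Python session; the synthetic target below is an illustrative assumption, not project data:

import numpy as np

rng = np.random.RandomState(42)
y_true = rng.gamma(shape=2.0, scale=50.0, size=1000)   # synthetic allowed-amount-like target
y_pred = y_true * rng.normal(1.0, 0.15, size=1000)     # predictions with roughly 15% relative noise

calculate_performance(y_true, y_pred)                  # prints RMSE, R2, MAE, MAPE
generate_deviation_stats(y_pred, y_true)               # prints the share of rows within each %-deviation band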