├── EDA.ipynb ├── EDA_profiler.ipynb ├── Final_Data_Prediction_Model_SG.py ├── GMM - Clusters 6 - Normalized input.ipynb ├── GMM - Clusters 6 - Normalized using mean (remove 30 days variable).ipynb ├── GMM - Cluster number selection.ipynb ├── GMM - trials.ipynb ├── README.md ├── basic_io ├── eda_helpers.py ├── encoder_code.py ├── encoding.py ├── linux.txt ├── models_run.py └── regression_model_basics /EDA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import csv\n", 12 | "pd.options.display.max_rows = 4000\n", 13 | "pd.set_option('display.float_format', lambda x: '%.2f' % x)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# Read files from the directory (one by one)\n", 23 | "import glob\n", 24 | "files1 = glob.glob('Z:\\Payment Arrangement Recommender Files\\*.txt')\n", 25 | "files2 = glob.glob('Z:\\Treatment Files\\*.txt')\n", 26 | "files3 = glob.glob('Z:\\WLS Dec 2017\\*.txt')\n", 27 | "files4 = glob.glob('Z:\\WLS Jul 2016\\*.txt')\n", 28 | "files5 = glob.glob('Z:\\WLS Jun 2017\\*.txt')\n", 29 | "files6 = glob.glob('Z:\\WLS Mar 2017\\*.txt')\n", 30 | "files7 = glob.glob('Z:\\WLS Oct 2017\\*.txt')\n", 31 | "files8 = glob.glob('Z:\\WLS Sep 2017\\*.txt')" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "files = files1+ files2+ files3+ files4 + files5 + files6 + files7 + files8\n" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "import glob\n", 50 | "files1_ = glob.glob('Z:\\WLS Oct 2017\\Oct_TRT\\*.txt')\n", 51 | "files2_ = glob.glob('Z:\\WLS Oct 2017\\Oct_PAR\\*.txt')\n", 52 | "files_ = files1_ + files2_" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 5, 58 | "metadata": { 59 | "scrolled": true 60 | }, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "Z:\\WLS Dec 2017\\WLS AR STRATA DEC 2017 RES.txt\n", 67 | "Z:\\WLS Dec 2017\\WLS COHORT DEC 2017 RES.txt\n" 68 | ] 69 | }, 70 | { 71 | "name": "stderr", 72 | "output_type": "stream", 73 | "text": [ 74 | "C:\\Users\\sg641p\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2785: DtypeWarning: Columns (8,9) have mixed types. Specify dtype option on import or set low_memory=False.\n", 75 | " interactivity=interactivity, compiler=compiler, result=result)\n" 76 | ] 77 | }, 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "Z:\\WLS Dec 2017\\WLS DD STRATA DEC 2017 data.txt\n", 83 | "Z:\\WLS Dec 2017\\WLS PAY DEC 2017 RES.txt\n", 84 | "Z:\\WLS Dec 2017\\WLS PERFORMANCE DEC 2017 RES.txt\n", 85 | "Z:\\WLS Dec 2017\\WLS Treatment Dec 2017 RES.txt\n", 86 | "Z:\\WLS Dec 2017\\WLS_PA_DEC_2017.txt\n" 87 | ] 88 | }, 89 | { 90 | "name": "stderr", 91 | "output_type": "stream", 92 | "text": [ 93 | "C:\\Users\\sg641p\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2785: DtypeWarning: Columns (35) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", 94 | "  interactivity=interactivity, compiler=compiler, result=result)\n" 95 | ] 96 | }, 97 | { 98 | "ename": "MemoryError", 99 | "evalue": "", 100 | "output_type": "error", 101 | "traceback": [
 102 | "---------------------------------------------------------------------------",
 103 | "MemoryError                               Traceback (most recent call last)",
 104 | "<ipython-input> in <module>()\n      6     cols = df.columns.tolist()\n----> 7     sample_value = df.values[0].tolist()\n      8     is_null = df.isnull().any().tolist()",
 105 | "~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\generic.py in values(self)\n-> 4633     return self._data.as_array(transpose=self._AXIS_REVERSED)",
 106 | "~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in as_array(self, transpose, items)\n-> 3949     arr = mgr._interleave()",
 107 | "~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in _interleave(self)\n-> 3978     result[rl.indexer] = blk.get_values(dtype)",
 108 | "~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in get_values(self, dtype)\n--> 219     return self.values.astype(object)",
 109 | "MemoryError: " 110 | ] 111 | } 112 | ], 113 | "source": [
 114 | "for f in files:\n",
 115 | "    print(f)\n",
 116 | "    df = pd.read_csv(f, sep = '|')\n",
 117 | "    \n",
 118 | "    # Populate columns\n",
 119 | "    cols = df.columns.tolist()\n",
 120 | "    sample_value = df.values[0].tolist()\n",
 121 | "    is_null = df.isnull().any().tolist()\n",
 122 | "    num_nulls = df.isnull().sum(axis = 0)\n",
 123 | "    per_nulls = df.isnull().sum(axis = 0)*100/len(df)\n",
 124 | "    rows = zip(cols,sample_value,is_null,num_nulls,per_nulls)\n",
 125 | "    \n",
 126 | "    header = ['Column Name', 'Sample Values', 'Is null?', '# of Nulls', '% of Nulls'] \n",
 127 | "    start = '\\\\' \n",
 128 | "    end = '.'\n",
 129 | "\n",
 130 | "    filename = 
(f.split(start))[2].split(end)[0]\n", 131 | " \n", 132 | " newfilePath = '../../3.analysis/2.analysis/EDA/' + filename + '_EDA.csv'\n", 133 | " with open(newfilePath, \"w\") as file:\n", 134 | " writer = csv.writer(file)\n", 135 | " writer.writerow(header)\n", 136 | " for row in rows:\n", 137 | " writer.writerow(row)\n", 138 | " del(df)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "Z:\\WLS Oct 2017\\Oct_TRT\\WLS Treatment Oct 2017 RES HIST.txt\n", 151 | "Z:\\WLS Oct 2017\\Oct_TRT\\WLS Treatment Oct 2017 RES.txt\n", 152 | "Z:\\WLS Oct 2017\\Oct_TRT\\WLS Treatment Oct 2017 RES_Updated.txt\n", 153 | "Z:\\WLS Oct 2017\\Oct_TRT\\WLS_TRT_Mar_Sep RES.txt\n", 154 | "Z:\\WLS Oct 2017\\Oct_PAR\\WLS Pay Oct 2017 RES.txt\n", 155 | "Z:\\WLS Oct 2017\\Oct_PAR\\WLS_PA_Oct_Cohort_Perfromance.txt\n" 156 | ] 157 | } 158 | ], 159 | "source": [ 160 | "for f in files_:\n", 161 | " print(f)\n", 162 | " df = pd.read_csv(f, sep = '|')\n", 163 | " \n", 164 | " # Populate columns\n", 165 | " cols = df.columns.tolist()\n", 166 | " sample_value = df.values[0].tolist()\n", 167 | " is_null = df.isnull().any().tolist()\n", 168 | " num_nulls = df.isnull().sum(axis = 0)\n", 169 | " per_nulls = df.isnull().sum(axis = 0)*100/len(df)\n", 170 | " rows = zip(cols,sample_value,is_null,num_nulls,per_nulls)\n", 171 | " \n", 172 | " header = ['Column Name', 'Sample Values', 'Is null?', '# of Nulls', '% of Nulls'] \n", 173 | " start = '\\\\' \n", 174 | " end = '.'\n", 175 | "\n", 176 | " filename = (f.split(start))[3].split(end)[0]\n", 177 | " \n", 178 | " newfilePath = '../../3.analysis/2.analysis/EDA/' + filename + '_EDA.csv'\n", 179 | " with open(newfilePath, \"w\") as file:\n", 180 | " writer = csv.writer(file)\n", 181 | " writer.writerow(header)\n", 182 | " for row in rows:\n", 183 | " writer.writerow(row)\n", 184 | " del(df)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [] 193 | } 194 | ], 195 | "metadata": { 196 | "kernelspec": { 197 | "display_name": "Python 3", 198 | "language": "python", 199 | "name": "python3" 200 | }, 201 | "language_info": { 202 | "codemirror_mode": { 203 | "name": "ipython", 204 | "version": 3 205 | }, 206 | "file_extension": ".py", 207 | "mimetype": "text/x-python", 208 | "name": "python", 209 | "nbconvert_exporter": "python", 210 | "pygments_lexer": "ipython3", 211 | "version": "3.7.0" 212 | } 213 | }, 214 | "nbformat": 4, 215 | "nbformat_minor": 2 216 | } 217 | -------------------------------------------------------------------------------- /EDA_profiler.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "from collections import defaultdict\n", 14 | "import getopt, sys\n", 15 | "\n", 16 | "\n", 17 | "cth = 15\n", 18 | "sth = 60\n", 19 | "dlm = \",\"\n", 20 | "MISSING_VALUES = [\"\", \" \", \"N/A\", \"#N/A\", \"nan\"]\n", 21 | "\n", 22 | "ifile_name = \"./in_data/R4642-1-COHORT_PA_RECOMMENDER2_new.txt\"\n", 23 | "ofile_name = \"./eda_results/R4642-1-COHORT_PA_RECOMMENDER2_new_profiler.csv\"\n", 24 | "\n", 25 | "# ifile_name = \"./in_data/R4642-5-Call_Data.txt\"\n", 26 | "# ofile_name 
= \"./eda_results/R4642-5-Call_Data_profiler.csv\"\n", 27 | "\n", 28 | "try:\n", 29 | " ofile = open(ofile_name, 'w')\n", 30 | " df = pd.read_csv(ifile_name, na_values=MISSING_VALUES, sep= '|', low_memory = False)\n", 31 | "except:\n", 32 | " print('Parameter Error\\n')\n", 33 | " sys.exit(2)\n", 34 | "\n", 35 | "ofile.write(\"Input File Name,\" + ifile_name)\n", 36 | "ofile.write(\"\\nProfile File Name,\" + ofile_name)\n", 37 | "ofile.write(\"\\nNote\\nAll blanks; N/A; #N/A will be treated as missing values\")\n", 38 | "ofile.write(\"\\nAll statistics are computed on observed values\")\n", 39 | "ofile.write(\"\\nNumeric columns with not more than %d unique values will be considered as categorical\" % cth)\n", 40 | "ofile.write(\"\\nCharacter columns with more than %d unique values will be considered as string\" % sth)\n", 41 | "\n", 42 | "\n", 43 | "class Numeric:\n", 44 | " def __init__(self, series):\n", 45 | " self.valid_list = [x for x in series if ~np.isnan(x)]\n", 46 | " self.missing = len(series) - len(self.valid_list)\n", 47 | " self.observed = len(self.valid_list)\n", 48 | " self.mean = np.mean(self.valid_list)\n", 49 | " self.std = np.std(self.valid_list)\n", 50 | " self.min = np.min(self.valid_list)\n", 51 | " self.max = np.max(self.valid_list)\n", 52 | " self.p5 = np.percentile(self.valid_list, 5)\n", 53 | " self.p25 = np.percentile(self.valid_list, 25)\n", 54 | " self.p50 = np.percentile(self.valid_list, 50)\n", 55 | " self.p75 = np.percentile(self.valid_list, 75)\n", 56 | " self.p95 = np.percentile(self.valid_list, 95)\n", 57 | "\n", 58 | "\n", 59 | "class Categorical:\n", 60 | " def __init__(self, series):\n", 61 | " self.valid_list = [x for x in series if pd.notnull(x)]\n", 62 | " self.missing = len(series) - len(self.valid_list)\n", 63 | " self.observed = len(self.valid_list)\n", 64 | " self.num_categ = len(set(self.valid_list))\n", 65 | " self.cnt_categ = defaultdict(float)\n", 66 | " for each in self.valid_list:\n", 67 | " self.cnt_categ[each] += 1\n", 68 | " \n", 69 | "\n", 70 | "(rows, cols) = df.shape\n", 71 | "ofile.write(\"\\n\\nData Shape\\nRows,\" + str(rows) + \"\\nColumns,\" + str(cols))\n", 72 | "DATA_TYPE = {}\n", 73 | "for column_name in df.columns:\n", 74 | " if df[column_name].dtype == \"object\":\n", 75 | " df[column_name] = df[column_name].str.strip()\n", 76 | " if df[column_name].nunique() > sth:\n", 77 | " DATA_TYPE[column_name] = \"String/Text\"\n", 78 | " else:\n", 79 | " DATA_TYPE[column_name] = \"Categorical\"\n", 80 | " elif len([x for x in pd.unique(df[column_name].ravel()) if ~np.isnan(x)]) < cth:\n", 81 | " DATA_TYPE[column_name] = \"Categorical\"\n", 82 | " else:\n", 83 | " DATA_TYPE[column_name] = \"Numeric (int64)\" if df[column_name].dtype == 'int64' else \"Numeric (float64)\"\n", 84 | "\n", 85 | "ofile.write(\n", 86 | " \"\\n\\nNumeric variables\\nVariable, #Records, #Missing, #Observed, Mean, StdDev, Min, Max, Percentile_5, Percentile_25, Percentile_50, Percentile_75, Percentile_95\")\n", 87 | "for column_name in DATA_TYPE.keys():\n", 88 | " if DATA_TYPE[column_name][:7] == \"Numeric\":\n", 89 | " temp = Numeric(df[column_name])\n", 90 | " output = [column_name, rows, temp.missing, temp.observed, temp.mean, temp.std, temp.min, temp.max, temp.p5,\n", 91 | " temp.p25, temp.p50, temp.p75, temp.p95]\n", 92 | " ofile.write(\"\\n\" + \",\".join(map(str, output)))\n", 93 | " \n", 94 | "\n", 95 | "ofile.write(\n", 96 | " \"\\n\\nCategorical variables\\nVariable, #Records, #Missing, #Observed, #Categories, %C1, %C2, %C3, %C4, %C5, %C6, %C7, 
%C8\")\n", 97 | "for column_name in DATA_TYPE.keys():\n", 98 | " if DATA_TYPE[column_name][:11] == \"Categorical\":\n", 99 | " temp = Categorical(df[column_name])\n", 100 | " output = [column_name, rows, temp.missing, temp.observed, temp.num_categ]\n", 101 | " unord_list = []\n", 102 | " for each in temp.cnt_categ.keys():\n", 103 | " unord_list.append((each, round(temp.cnt_categ[each] / temp.observed, 4)))\n", 104 | " ord_list = sorted(unord_list, key=lambda x: x[1], reverse=True)\n", 105 | " for each in ord_list[:8]:\n", 106 | " output.append(str(each[0]) + \" # \" + str(each[1]))\n", 107 | " ofile.write(\"\\n\" + \",\".join(map(str, output)))\n", 108 | " \n", 109 | "\n", 110 | "ofile.write(\n", 111 | " \"\\n\\nString/Text variables\\nVariable, #Records, #Missing, #Observed, #Categories\") # , %C1, %C2, %C3, %C4, %C5, %C6, %C7, %C8\")\n", 112 | "for column_name in DATA_TYPE.keys():\n", 113 | " if DATA_TYPE[column_name][:11] == \"String/Text\":\n", 114 | " temp = Categorical(df[column_name])\n", 115 | " output = [column_name, rows, temp.missing, temp.observed, temp.num_categ]\n", 116 | " ofile.write(\"\\n\" + \",\".join(map(str, output)))\n", 117 | "\n", 118 | "ofile.close()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": true 126 | }, 127 | "outputs": [], 128 | "source": [] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": "Python 3", 134 | "language": "python", 135 | "name": "python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.6.0" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 2 152 | } 153 | -------------------------------------------------------------------------------- /Final_Data_Prediction_Model_SG.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import seaborn as seabornInstance 11 | from sklearn.model_selection import train_test_split 12 | from sklearn.linear_model import LinearRegression 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.preprocessing import LabelEncoder 15 | from sklearn import metrics 16 | from sklearn.tree import DecisionTreeRegressor 17 | from sklearn.ensemble import RandomForestClassifier 18 | from sklearn.preprocessing import OneHotEncoder 19 | from sklearn.model_selection import train_test_split 20 | from sklearn.metrics import r2_score,mean_squared_error 21 | 22 | import pickle 23 | get_ipython().run_line_magic('matplotlib', 'inline') 24 | 25 | 26 | # In[2]: 27 | 28 | 29 | # Read dataset 30 | Train_data_1 = pd.read_csv("new_123_789.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 31 | Train_data_2 = pd.read_csv("Train_Data2.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 32 | Train_data_3 = pd.read_csv("Train_data3.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 33 | Train_data_4 = pd.read_csv("Train_data4.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 34 | dataset = pd.concat([Train_data_1, Train_data_2, Train_data_3, Train_data_4], ignore_index = True) 35 | 36 | 37 | # In[3]: 38 
| 39 | 40 | # Split train, test and validation 41 | target ='ELGBL_EXPNS_AMT' 42 | X = dataset.drop(columns = target) 43 | y = dataset[target] 44 | 45 | 46 | # In[6]: 47 | 48 | 49 | same_value_columns = same_values(X, 0.975) 50 | 51 | 52 | # In[7]: 53 | 54 | 55 | same_value_columns 56 | 57 | 58 | # In[8]: 59 | 60 | 61 | X['MDFR_1_CD'].value_counts() 62 | 63 | 64 | # In[ ]: 65 | 66 | 67 | 68 | 69 | 70 | # In[47]: 71 | 72 | 73 | def pre_process_data(df): 74 | #Columns to remove based on business logic 75 | #BILLD_CHRGD_AMT 76 | columns_to_remove = ['PAYMNT_AMT', 'NOT_CVRD_AMT', 'BSIC_CPAYMNT_AMT', 'MM_CPAYMNT_AMT', 77 | 'MM_DDCTBL_AMT', 'CPAYMNT_AMT', 'CPAYMNT_TYPE_AMT','BSIC_DDCTBL_AMT', 'PN_ID', 'PN_VRTN_ID', 78 | 'MEM_RESP', 'AUTO_ADUJ', 'COB_SGMNT_CNT', 'MEDCR_CNT', 'DTL_SGMNT_CNT', 'EOB_DNL_CD'] 79 | exception_cols = ['BILLD_CHRGD_AMT'] 80 | df.drop(columns = columns_to_remove, inplace = True) 81 | 82 | #Columns which have the same value for 97.5% of the rows 83 | same_value_columns = same_values(df, 0.975) 84 | df.drop(columns = same_value_columns, inplace = True) 85 | 86 | # Convert to int (Manual identification) 87 | df['TOTL_UNITS_PRCD_CNT'] = df['TOTL_UNITS_PRCD_CNT'].astype('float64') 88 | 89 | # Create variables and convert to string 90 | df['yr'] = pd.DatetimeIndex(df['SRVC_THRU_DT']).year.astype('str') 91 | df['mnth'] = pd.DatetimeIndex(df['SRVC_THRU_DT']).month.astype('str') 92 | df['day_of_week'] = pd.DatetimeIndex(df['SRVC_THRU_DT']).dayofweek.astype('str') 93 | # Drop date variables 94 | df.drop(columns = ['SRVC_THRU_DT','SRVC_FROM_DT'], inplace = True) 95 | 96 | # String columns with less than or equal to 15 unique values (for OHE) 97 | unique_cols = unique_counts(df,15) 98 | 99 | # Convert to OHE 100 | df = ohe(df,unique_cols,exception_cols) 101 | 102 | # Columns which are highly correlated above a certain threshold. 
(manually identify one variable to keep) 103 | columns_highly_correlated = correlation(df,0.85) 104 | df.drop(columns = columns_highly_correlated, inplace = True) 105 | 106 | return (df) 107 | 108 | 109 | # In[48]: 110 | 111 | 112 | X_ = pre_process_data(X) 113 | 114 | 115 | # In[50]: 116 | 117 | 118 | train_X, test_X_, train_Y, test_Y_ = train_test_split(X_, y, test_size=0.4, random_state=42) 119 | test_X, val_X, test_Y, val_Y = train_test_split(test_X_, test_Y_, test_size=0.5, random_state=42) 120 | 121 | 122 | # In[51]: 123 | 124 | 125 | from sklearn.ensemble import RandomForestRegressor 126 | regr = RandomForestRegressor(max_depth=5, random_state=42) 127 | regr.fit(train_X, train_Y) 128 | 129 | 130 | # In[88]: 131 | 132 | 133 | test_pred = regr.predict(test_X) 134 | 135 | 136 | # In[89]: 137 | 138 | 139 | mse = mean_squared_error(test_pred, c) 140 | rmse = np.sqrt(mse) 141 | print(rmse) 142 | 143 | 144 | # In[92]: 145 | 146 | 147 | regr.score(test_X,test_Y) 148 | 149 | 150 | # In[93]: 151 | 152 | 153 | model.score(test_X,test_Y) 154 | 155 | 156 | # In[83]: 157 | 158 | 159 | for i,col in enumerate(test_X.columns.values.tolist()): 160 | print(col,'{0:.4f}'.format(regr.feature_importances_[i])) 161 | 162 | 163 | # In[98]: 164 | 165 | 166 | model = DecisionTreeRegressor(max_depth=5) 167 | 168 | 169 | # In[99]: 170 | 171 | 172 | model.fit(train_X, train_Y) 173 | 174 | 175 | # In[100]: 176 | 177 | 178 | test_pred = model.predict(test_X) 179 | 180 | 181 | # In[101]: 182 | 183 | 184 | mse = mean_squared_error(test_pred, test_Y) 185 | rmse = np.sqrt(mse) 186 | print(rmse) 187 | 188 | 189 | # In[90]: 190 | 191 | 192 | for i,col in enumerate(test_X.columns.values.tolist()): 193 | print(col,'{0:.4f}'.format(model.feature_importances_[i])) 194 | 195 | 196 | # In[18]: 197 | 198 | 199 | from sklearn.metrics import roc_auc_score 200 | # Actual class predictions 201 | rf_predictions = model.predict(test_X) 202 | # Probabilities for each class 203 | rf_probs = model.predict_proba(test_X)[:, 1] 204 | 205 | 206 | # In[36]: 207 | 208 | 209 | test_pred.value_counts() 210 | 211 | 212 | # In[35]: 213 | 214 | 215 | test_Y.value_counts() 216 | 217 | 218 | # In[81]: 219 | 220 | 221 | metrics.accuracy_score(test_Y_.tolist(),test_pred) 222 | 223 | 224 | # In[ ]: 225 | 226 | 227 | 228 | 229 | 230 | # In[4]: 231 | 232 | 233 | def same_values(df, threshold): 234 | cols = [] 235 | for col in df.columns.values.tolist(): 236 | null_pct = len(df.loc[df[col].isna() == True])/len(df) 237 | if(null_pct >= threshold): 238 | cols.append(col) 239 | else: 240 | same_pct = df[col].value_counts()[0]/len(df) 241 | if(same_pct >= threshold): 242 | cols.append(col) 243 | return cols 244 | 245 | 246 | # In[38]: 247 | 248 | 249 | def unique_counts(df, threshold): 250 | unique_cols= [] 251 | for col in df.columns.values.tolist(): 252 | if (df[col].nunique() <= threshold): 253 | unique_cols.append(col) 254 | return (unique_cols) 255 | 256 | 257 | # In[12]: 258 | 259 | 260 | def correlation(df, threshold): 261 | corr = df.corr() 262 | correlations = [] 263 | for col in corr.columns.values.tolist(): 264 | for col_row in corr.index.values.tolist(): 265 | if (col != col_row): 266 | if (corr[col][col_row] >= threshold): 267 | correlations.append(col) 268 | correlations.append(col_row) 269 | return list(set(correlations)) 270 | 271 | 272 | # In[45]: 273 | 274 | 275 | def ohe(df,col_list): 276 | columns_list = df.columns.values.tolist() 277 | columns_list.remove('BILLD_CHRGD_AMT') 278 | for col in columns_list: 279 | if col in col_list: 280 | 
ohe = pd.DataFrame() 281 | if(df[col].dtypes == 'object'): 282 | ohe = pd.get_dummies(df[col]) 283 | ohe.columns = [col + "_" + ohe_col for ohe_col in ohe.columns.values.tolist()] 284 | df = df.join(ohe) 285 | df.drop(col,axis = 1, inplace = True) 286 | return df 287 | 288 | 289 | # #### BTS 290 | 291 | # In[ ]: 292 | 293 | 294 | ## Finding variables which are NULL 295 | 296 | 297 | # In[3]: 298 | 299 | 300 | ## Finding variables which are highly correlated 301 | corr = dataset.corr() 302 | for col in dataset.columns.values.tolist(): 303 | for col_row in corr.index.values.tolist(): 304 | if (col != col_row): 305 | if (corr[col][col_row] > 0.84): 306 | print(str(corr[col][col_row]), col, col_row) 307 | correlations = [] 308 | for col in corr.columns.values.tolist(): 309 | for col_row in corr.index.values.tolist(): 310 | correlation = [] 311 | if (col != col_row): 312 | if (corr[col][col_row] >= 0.8): 313 | correlation.append(col) 314 | correlation.append(col_row) 315 | correlations.append(correlation) 316 | 317 | 318 | # In[4]: 319 | 320 | 321 | ## Find string variables with less than 16 unique values 322 | for col in dataset.columns.values.tolist(): 323 | if (dataset[col].nunique() <= 15): 324 | print ("\'" + col +"\',") 325 | 326 | 327 | # In[ ]: 328 | 329 | 330 | # Convert to OHE - def 331 | 332 | 333 | # In[ ]: 334 | 335 | 336 | test_X = tes 337 | 338 | 339 | # In[5]: 340 | 341 | 342 | dataset = pd.concat([Train_data_1, Train_data_2, Train_data_3, Train_data_4], ignore_index = True) 343 | 344 | 345 | # In[6]: 346 | 347 | 348 | # Remove columns 349 | dataset.drop(columns = columns_to_remove, inplace = True) 350 | dataset.drop(columns = columns_97_5, inplace = True) 351 | 352 | 353 | # In[7]: 354 | 355 | 356 | # Convert to int 357 | dataset['TOTL_UNITS_PRCD_CNT'] = dataset['TOTL_UNITS_PRCD_CNT'].astype('float64') 358 | 359 | 360 | # In[8]: 361 | 362 | 363 | # Create variables 364 | dataset['yr'] = pd.DatetimeIndex(dataset['SRVC_THRU_DT']).year 365 | dataset['mnth'] = pd.DatetimeIndex(dataset['SRVC_THRU_DT']).month 366 | dataset['day_of_week'] = pd.DatetimeIndex(dataset['SRVC_THRU_DT']).dayofweek 367 | 368 | 369 | # In[9]: 370 | 371 | 372 | # Convert to string 373 | dataset['yr'] = dataset['yr'].astype('str') 374 | dataset['mnth'] = dataset['mnth'].astype('str') 375 | dataset['day_of_week'] = dataset['day_of_week'].astype('str') 376 | 377 | 378 | # In[10]: 379 | 380 | 381 | # Drop date variables 382 | dataset.drop(columns = ['SRVC_THRU_DT','SRVC_FROM_DT'], inplace = True) 383 | 384 | 385 | # In[11]: 386 | 387 | 388 | # Drop string variables with more than 50 values. 
Go into detail of each one and try to club them to create indicators 389 | 390 | 391 | # In[12]: 392 | 393 | 394 | columns_under_15 = ['PROV_TAX_ID','PROV_NM','PROV_STR_ADRS','ROV_ZIP_5_CD','PROV_PAYENT_LCTN_CD','MX_PRCG_VRTN_CD', 395 | 'SCRN_FRMT_CD','MIXER_PARG_IND','CLM_TYP','NUM_LINES','HCFA_PT_CD','CLM_TYPE_CD','TELEHEALTH', 396 | 'PROD_DESC','NEW_CLM_TYP','UM_RQRD_IND','CLM_PAYMNT_ACTN_1_CD','yr','mnth','day_of_week'] 397 | 398 | 399 | # In[13]: 400 | 401 | 402 | columns_highly_correlated = ['PROV_NM', 'PROV_STR_ADRS', 'ROV_ZIP_5_CD', 'PROV_PAYENT_LCTN_CD', 'CLM_TYP', 403 | 'UM_RQRD_IND', 'MX_PRCG_VRTN_CD', 'MIXER_PARG_IND', 'HCFA_PT_CD', 'TELEHEALTH'] 404 | 405 | 406 | # In[14]: 407 | 408 | 409 | dataset.drop(columns = columns_highly_correlated, inplace = True) 410 | 411 | 412 | # In[15]: 413 | 414 | 415 | train_X = dataset.drop(columns = target) 416 | train_Y = dataset[target] 417 | 418 | 419 | # In[16]: 420 | 421 | 422 | # Convert to OHE 423 | for col in train_X.columns.values.tolist(): 424 | if col in columns_under_15: 425 | ohe = pd.DataFrame() 426 | if(train_X[col].dtypes == 'object'): 427 | ohe = pd.get_dummies(train_X[col]) 428 | ohe.columns = [col + "_" + ohe_col for ohe_col in ohe.columns.values.tolist()] 429 | train_X = train_X.join(ohe) 430 | train_X.drop(col,axis = 1, inplace = True) 431 | 432 | 433 | # In[17]: 434 | 435 | 436 | dataset.shape 437 | 438 | 439 | # In[18]: 440 | 441 | 442 | # Find correlation of all columns with each other 443 | corr = dataset.corr() 444 | 445 | 446 | # In[19]: 447 | 448 | 449 | for col in dataset.columns.values.tolist(): 450 | for col_row in corr.index.values.tolist(): 451 | if (col != col_row): 452 | if (corr[col][col_row] > 0.84): 453 | print(str(corr[col][col_row]), col, col_row) 454 | 455 | 456 | # In[20]: 457 | 458 | 459 | # train without correlation 460 | 461 | 462 | # In[21]: 463 | 464 | 465 | # Find correlation and create a list of list 466 | correlations = [] 467 | for col in corr.columns.values.tolist(): 468 | for col_row in corr.index.values.tolist(): 469 | correlation = [] 470 | if (col != col_row): 471 | if (corr[col][col_row] >= 0.8): 472 | correlation.append(col) 473 | correlation.append(col_row) 474 | correlations.append(correlation) 475 | 476 | 477 | # In[19]: 478 | 479 | 480 | model = RandomForestClassifier(n_estimators=100, bootstrap = True) 481 | 482 | 483 | # In[20]: 484 | 485 | 486 | model.fit(train_X, train_Y) 487 | 488 | 489 | # In[21]: 490 | 491 | 492 | 493 | 494 | 495 | # In[ ]: 496 | 497 | 498 | from sklearn.metrics import roc_auc_score 499 | # Actual class predictions 500 | rf_predictions = model.predict(test) 501 | # Probabilities for each class 502 | rf_probs = model.predict_proba(test)[:, 1] 503 | 504 | 505 | # In[14]: 506 | 507 | 508 | Process_path = 'Train' 509 | 510 | 511 | # In[15]: 512 | 513 | 514 | if Process_path == "Train": 515 | ### Training Dataset ############ 516 | Train_data_1 = pd.read_csv("new_123_789.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 517 | Train_data_2 = pd.read_csv("Train_Data2.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 518 | Train_data_3 = pd.read_csv("Train_data3.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 519 | Train_data_4 = pd.read_csv("Train_data4.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 520 | Train_data_4 = Train_data_4[:9203] 521 | dataset = pd.concat([Train_data_1, Train_data_2, Train_data_3, Train_data_4], ignore_index = True) 522 | print(dataset.shape) 523 | 
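    # --- Added note: handle_non_numerical_data() is called a few lines below
    # (and handle_non_numerical_data_Test() in the Test path) but neither is
    # defined anywhere in this file. A hypothetical reconstruction, consistent
    # with how the encoded output is used downstream (every non-numeric column
    # mapped to small integers); the original helper may well differ.
    def handle_non_numerical_data(df):
        # With dtype='unicode' every column loads as object, so each column
        # gets its unique values enumerated and replaced by integer codes.
        df = df.copy()
        for col in df.columns:
            if df[col].dtype == 'object':
                mapping = {val: i for i, val in enumerate(pd.unique(df[col]))}
                df[col] = df[col].map(mapping)
        return df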
dataset.insert(0, 'New_ID', range(800000000, 800000000 + len(dataset))) 524 | dataset.drop(dataset[dataset['CLM_PAYMNT_ACTN_1_CD'] > 'P'].index, inplace = True) 525 | dataset1 = dataset[['New_ID',"DAIG1","PROC_CD","PRCG_ZIP_ST_CD","PROV_TAX_ID","PROV_ST_CD","PROV_SPCLTY_CD","BILLG_NPI","RNDRG_NPI","PROV_PAYENT_LCTN_CD","SRVC_FCLTY_LCTN_ID","SRVC_FCLTY_LCTN_NPI","POT_CD","PN_ID","PN_VRTN_ID","MBR_CNTRCT_CD","MBR_CVRG_PRCG_VRTN_CD","MBR_PROD_CD","HCFA_PT_CD","CLM_TYPE_CD","RNDRG_LINE_1_ADRS","RNDRG_CITY_NM","SRC_CD","PROV_RGN_CD","UM_RQRD_IND","NEW_CLM_TYP_1","CASE_NBR","ROV_ZIP_5_CD","PROV_PRC_ZIP_4_CD"]] 526 | dataset2 = dataset[["BILLD_CHRGD_AMT","ELGBL_EXPNS_AMT","TOTL_UNITS_PRCD_CNT"]] 527 | New_dataset1 = handle_non_numerical_data(dataset1) 528 | New_dataset2 = dataset2 529 | Final_Dataset = pd.concat([New_dataset1, New_dataset2], axis=1, join='inner') 530 | X_train = Final_Dataset[['New_ID',"DAIG1","PROC_CD","PRCG_ZIP_ST_CD","PROV_TAX_ID","PROV_ST_CD","PROV_SPCLTY_CD","BILLG_NPI","RNDRG_NPI","PROV_PAYENT_LCTN_CD","SRVC_FCLTY_LCTN_ID","SRVC_FCLTY_LCTN_NPI","POT_CD","PN_ID","PN_VRTN_ID","MBR_CNTRCT_CD","MBR_CVRG_PRCG_VRTN_CD","MBR_PROD_CD","HCFA_PT_CD","CLM_TYPE_CD","RNDRG_LINE_1_ADRS","RNDRG_CITY_NM","SRC_CD","PROV_RGN_CD","UM_RQRD_IND","NEW_CLM_TYP_1","CASE_NBR","ROV_ZIP_5_CD","PROV_PRC_ZIP_4_CD","BILLD_CHRGD_AMT","TOTL_UNITS_PRCD_CNT"]].values 531 | y_train = Final_Dataset["ELGBL_EXPNS_AMT"].values 532 | #### LINEAR Regression Model#### (Doestn't work with different types of data, need to change the New_dataset2 to Float) 533 | #model = LinearRegression() 534 | #model.fit(X_train, y_train) 535 | 536 | ## Descision Model ### (Getting 100% Accuracy Lower accuracy as test Dataset increses, Able to handle larger data set) 537 | #model = DecisionTreeClassifier(random_state=RSEED) 538 | #model = DecisionTreeClassifier(criterion="entropy", max_depth=25) 539 | model = DecisionTreeClassifier(criterion="entropy") 540 | #model = DecisionTreeClassifier(criterion="entropy",max_depth=25,random_state = 100,max_features = "auto", min_samples_leaf = 50) 541 | model.fit(X_train, y_train) 542 | 543 | ######## Random Forest Model ## (getting Around 90% of accuracy) 544 | ######################################### 545 | #### Create the model with 100 trees 546 | #model = RandomForestClassifier(n_estimators=100, bootstrap = True, max_features = 'sqrt') 547 | #model.fit(X_train, y_train) 548 | 549 | #######Logestic Regression Model ################ (Only getting 10% Accuracy) 550 | #model = LogisticRegression(C=0.7,random_state=42) 551 | #model.fit(X_train, y_train) 552 | 553 | #### Load Model to the drive ####### 554 | pickle.dump(model, open(filename, 'wb')) 555 | 556 | 557 | # In[16]: 558 | 559 | 560 | ### Training Dataset ############ 561 | Train_data_1 = pd.read_csv("new_123_789.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 562 | Train_data_2 = pd.read_csv("Train_Data2.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 563 | Train_data_3 = pd.read_csv("Train_data3.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 564 | Train_data_4 = pd.read_csv("Train_data4.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 565 | Train_data_4 = Train_data_4[:9203] 566 | dataset = pd.concat([Train_data_1, Train_data_2, Train_data_3, Train_data_4], ignore_index = True) 567 | print(dataset.shape) 568 | dataset.insert(0, 'New_ID', range(800000000, 800000000 + len(dataset))) 569 | dataset.drop(dataset[dataset['CLM_PAYMNT_ACTN_1_CD'] > 
'P'].index, inplace = True) 570 | dataset1 = dataset[['New_ID',"DAIG1","PROC_CD","PRCG_ZIP_ST_CD","PROV_TAX_ID","PROV_ST_CD","PROV_SPCLTY_CD","BILLG_NPI","RNDRG_NPI","PROV_PAYENT_LCTN_CD","SRVC_FCLTY_LCTN_ID","SRVC_FCLTY_LCTN_NPI","POT_CD","PN_ID","PN_VRTN_ID","MBR_CNTRCT_CD","MBR_CVRG_PRCG_VRTN_CD","MBR_PROD_CD","HCFA_PT_CD","CLM_TYPE_CD","RNDRG_LINE_1_ADRS","RNDRG_CITY_NM","SRC_CD","PROV_RGN_CD","UM_RQRD_IND","NEW_CLM_TYP_1","CASE_NBR","ROV_ZIP_5_CD","PROV_PRC_ZIP_4_CD"]] 571 | dataset2 = dataset[["BILLD_CHRGD_AMT","ELGBL_EXPNS_AMT","TOTL_UNITS_PRCD_CNT"]] 572 | New_dataset1 = handle_non_numerical_data(dataset1) 573 | New_dataset2 = dataset2 574 | Final_Dataset = pd.concat([New_dataset1, New_dataset2], axis=1, join='inner') 575 | 576 | 577 | # In[51]: 578 | 579 | 580 | columns_train = ['New_ID',"DAIG1","PROC_CD","PRCG_ZIP_ST_CD","PROV_TAX_ID","PROV_ST_CD","PROV_SPCLTY_CD","BILLG_NPI","RNDRG_NPI","PROV_PAYENT_LCTN_CD","SRVC_FCLTY_LCTN_ID","SRVC_FCLTY_LCTN_NPI","POT_CD","PN_ID","PN_VRTN_ID","MBR_CNTRCT_CD","MBR_CVRG_PRCG_VRTN_CD","MBR_PROD_CD","HCFA_PT_CD","CLM_TYPE_CD","RNDRG_LINE_1_ADRS","RNDRG_CITY_NM","SRC_CD","PROV_RGN_CD","UM_RQRD_IND","NEW_CLM_TYP_1","CASE_NBR","ROV_ZIP_5_CD","PROV_PRC_ZIP_4_CD","TOTL_UNITS_PRCD_CNT", 'BILLD_CHRGD_AMT'] 581 | 582 | 583 | # In[52]: 584 | 585 | 586 | for col in columns_train: 587 | print(col) 588 | 589 | 590 | # In[44]: 591 | 592 | 593 | X_train = Final_Dataset[columns_train] 594 | 595 | 596 | # In[41]: 597 | 598 | 599 | Final_Dataset['BILLD_CHRGD_AMT'] = Final_Dataset['BILLD_CHRGD_AMT'].astype('float') 600 | Final_Dataset['TOTL_UNITS_PRCD_CNT'] = Final_Dataset['TOTL_UNITS_PRCD_CNT'].astype('float') 601 | Final_Dataset['ELGBL_EXPNS_AMT'] = Final_Dataset['ELGBL_EXPNS_AMT'].astype('str') 602 | 603 | 604 | # In[42]: 605 | 606 | 607 | for col in X_train: 608 | # print(col,type(Final_Dataset[col][0])) 609 | print(col, Final_Dataset['ELGBL_EXPNS_AMT'].corr(Final_Dataset[col])) 610 | 611 | 612 | # In[ ]: 613 | 614 | 615 | 616 | 617 | 618 | # In[ ]: 619 | 620 | 621 | 622 | 623 | 624 | # In[46]: 625 | 626 | 627 | Final_Dataset['ELGBL_EXPNS_AMT'] = Final_Dataset['ELGBL_EXPNS_AMT'].astype('str') 628 | 629 | 630 | # In[47]: 631 | 632 | 633 | 634 | 635 | y_train = Final_Dataset["ELGBL_EXPNS_AMT"].values 636 | #### LINEAR Regression Model#### (Doestn't work with different types of data, need to change the New_dataset2 to Float) 637 | #model = LinearRegression() 638 | #model.fit(X_train, y_train) 639 | 640 | ## Descision Model ### (Getting 100% Accuracy Lower accuracy as test Dataset increses, Able to handle larger data set) 641 | #model = DecisionTreeClassifier(random_state=RSEED) 642 | #model = DecisionTreeClassifier(criterion="entropy", max_depth=25) 643 | model = DecisionTreeClassifier(criterion="entropy") 644 | #model = DecisionTreeClassifier(criterion="entropy",max_depth=25,random_state = 100,max_features = "auto", min_samples_leaf = 50) 645 | model.fit(X_train, y_train) 646 | 647 | ######## Random Forest Model ## (getting Around 90% of accuracy) 648 | ######################################### 649 | #### Create the model with 100 trees 650 | #model = RandomForestClassifier(n_estimators=100, bootstrap = True, max_features = 'sqrt') 651 | #model.fit(X_train, y_train) 652 | 653 | #######Logestic Regression Model ################ (Only getting 10% Accuracy) 654 | #model = LogisticRegression(C=0.7,random_state=42) 655 | #model.fit(X_train, y_train) 656 | 657 | #### Load Model to the drive ####### 658 | pickle.dump(model, open(filename, 'wb')) 659 | 660 | 661 | # 
In[ ]: 662 | 663 | 664 | 665 | 666 | 667 | # In[ ]: 668 | 669 | 670 | 671 | 672 | 673 | # In[48]: 674 | 675 | 676 | Process_path = 'Test' 677 | 678 | 679 | # In[49]: 680 | 681 | 682 | if Process_path == "Test": 683 | ###### Testing Dataset ############ 684 | #testset = pd.read_csv("Test_Date.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 685 | # testset = pd.read_csv("Test_Data_3.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 686 | Train_data_4 = pd.read_csv("Train_data4.csv",sep=',', error_bad_lines=False, index_col=False, dtype='unicode') 687 | testset = Train_data_4[:500] 688 | testset.insert(0, 'New_ID', range(900000000, 900000000 + len(testset))) 689 | #testset.drop(testset[testset['CLM_PAYMNT_ACTN_1_CD'] > 'P'].index, inplace = True) 690 | testset1 = testset[['New_ID',"DAIG1","PROC_CD","PRCG_ZIP_ST_CD","PROV_TAX_ID","PROV_ST_CD","PROV_SPCLTY_CD","BILLG_NPI","RNDRG_NPI","PROV_PAYENT_LCTN_CD","SRVC_FCLTY_LCTN_ID","SRVC_FCLTY_LCTN_NPI","POT_CD","PN_ID","PN_VRTN_ID","MBR_CNTRCT_CD","MBR_CVRG_PRCG_VRTN_CD","MBR_PROD_CD","HCFA_PT_CD","CLM_TYPE_CD","RNDRG_LINE_1_ADRS","RNDRG_CITY_NM","SRC_CD","PROV_RGN_CD","UM_RQRD_IND","NEW_CLM_TYP_1","CASE_NBR","ROV_ZIP_5_CD","PROV_PRC_ZIP_4_CD"]] 691 | testset2 = testset[["BILLD_CHRGD_AMT","ELGBL_EXPNS_AMT","TOTL_UNITS_PRCD_CNT"]] 692 | #testset1 = testset[["DAIG1","PROC_CD","PROV_TAX_ID"]] 693 | New_testset1 = handle_non_numerical_data_Test(testset1) 694 | New_testset2 = testset2 695 | Final_testset = pd.concat([New_testset1, New_testset2], axis=1, join='inner') 696 | X_test = Final_testset[['New_ID',"DAIG1","PROC_CD","PRCG_ZIP_ST_CD","PROV_TAX_ID","PROV_ST_CD","PROV_SPCLTY_CD","BILLG_NPI","RNDRG_NPI","PROV_PAYENT_LCTN_CD","SRVC_FCLTY_LCTN_ID","SRVC_FCLTY_LCTN_NPI","POT_CD","PN_ID","PN_VRTN_ID","MBR_CNTRCT_CD","MBR_CVRG_PRCG_VRTN_CD","MBR_PROD_CD","HCFA_PT_CD","CLM_TYPE_CD","RNDRG_LINE_1_ADRS","RNDRG_CITY_NM","SRC_CD","PROV_RGN_CD","UM_RQRD_IND","NEW_CLM_TYP_1","CASE_NBR","ROV_ZIP_5_CD","PROV_PRC_ZIP_4_CD","TOTL_UNITS_PRCD_CNT"]].values 697 | y_test = Final_testset["ELGBL_EXPNS_AMT"].values 698 | #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.001, random_state=RSEED) 699 | #### Read Model from Drive ######## 700 | model = pickle.load(open(filename, 'rb')) 701 | 702 | #Actual Pridiction 703 | y_pred = model.predict(X_test) 704 | new_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}) 705 | print("Test Data Accuracy: ({0:.4f})".format(metrics.accuracy_score(y_test,y_pred))) 706 | ##print("Test Data Accuracy: ({0:.4f})".format(metrics.accuracy_score(y_test,y_pred))) 707 | new_df1 = pd.DataFrame({'New_ID': [i[0] for i in X_test],'Actual': y_test, 'Predicted': y_pred}) 708 | ##Final_Resultset = pd.merge(testset, new_df1, on='New_ID') 709 | Final_Resultset = pd.merge(testset, new_df1, on='New_ID') 710 | Final_Resultset1 = Final_Resultset[["New_ID","KEY_CHK_DCN_NBR","KEY_CHK_DCN_ITEM_CD","KEY_CHK_DCN_CENTRY_CD","BILLD_CHRGD_AMT","Actual","Predicted"]] 711 | #print(Final_Resultset1) 712 | 713 | Final_Resultset1.head(20) 714 | 715 | 716 | # In[ ]: 717 | 718 | 719 | 720 | 721 | 722 | # #### Misc 723 | 724 | # In[ ]: 725 | 726 | 727 | strings_vars = ['KEY_CHK_DCN_NBR','DAIG1', 'DAIG2', 'DAIG3', 'DAIG4', 'DAIG5', 'PROC_CD', 'PROV_SCNDRY_NM', 728 | 'PROV_PRC_ZIP_4_CD', 'RNDRG_NPI', 'PROV_SPCLTY_CD', 'SRVC_FCLTY_LCTN_NPI', 'MBR_CNTRCT_CD', 729 | 'MBR_CVRG_PRCG_VRTN_CD', 'MBR_PROD_CD', 'RNDRG_LINE_1_ADRS', 'RNDRG_CITY_NM', 'CLM_PAYMNT_ACTN_2_6_CD', 730 | 'CASE_NBR', 'SRVC_FROM_DT', 
'SRVC_THRU_DT'] 731 | categorical_vars = ['KEY_CHK_DCN_ITEM_CD', 'KEY_CHK_DCN_CENTRY_CD', 'MDFR_1_CD', 'MDFR_2_CD', 'MDFR_3_CD', 732 | 'PRCG_ZIP_ST_CD', 'PROV_TAX_ID', 'PROV_NM', 'PROV_STR_ADRS', 'ROV_ZIP_5_CD', 'PROV_ST_CD', 733 | 'BILLG_NPI', 'PROV_PAYENT_LCTN_CD', 'SRVC_FCLTY_LCTN_ID', 'BSIC_DDCTBL_AMT', 'POT_CD', 734 | 'MX_PRCG_VRTN_CD', 'MX_PROV_PRCG_PROD_CD', 'PN_ID', 'PN_VRTN_ID', 'SCRN_FRMT_CD', 'MIXER_PARG_IND', 735 | 'MEM_RESP', 'CLM_TYP', 'NUM_LINES', 'HCFA_PT_CD', 'CLM_TYPE_CD', 'AUTO_ADUJ', 'HCPCS_MDFR_CD', 736 | 'PAY_AUTHRZN_CD', 'COB_SGMNT_CNT', 'MEDCR_CNT', 'DTL_SGMNT_CNT', 'PROV_GROUP', 'RNDRG_LINE_2_ADRS', 737 | 'TELEHEALTH', 'SRC_CD', 'PROD_DESC', 'NEW_CLM_TYP', 'PROV_RGN_CD', 'UM_RFRL_TYPE_RQRD_IND', 738 | 'UM_RQRD_IND', 'NEW_CLM_TYP_1', 'CLM_PAYMNT_ACTN_1_CD', 'EOB_DNL_CD'] 739 | 740 | 741 | # In[ ]: 742 | 743 | 744 | def pre_process_data(df): 745 | #Columns to remove based on business logic 746 | columns_to_remove = ['BILLD_CHRGD_AMT', 'PAYMNT_AMT', 'NOT_CVRD_AMT', 'BSIC_CPAYMNT_AMT', 747 | 'MM_CPAYMNT_AMT', 'MM_DDCTBL_AMT', 'CPAYMNT_AMT', 'CPAYMNT_TYPE_AMT','BSIC_DDCTBL_AMT', 748 | 'PN_ID', 'PN_VRTN_ID', 'MEM_RESP', 'AUTO_ADUJ', 'COB_SGMNT_CNT', 'MEDCR_CNT', 'DTL_SGMNT_CNT', 749 | 'EOB_DNL_CD'] 750 | #Columns which have the same value for 97.5% of the rows 751 | same_value_columns = same_values(dataset, 0.975) 752 | df.drop(columns = columns_to_remove, inplace = True) 753 | df.drop(columns = same_value_columns, inplace = True) 754 | # Convert to int (Manual identification) 755 | df['TOTL_UNITS_PRCD_CNT'] = df['TOTL_UNITS_PRCD_CNT'].astype('float64') 756 | # Create variables 757 | df['yr'] = pd.DatetimeIndex(df['SRVC_THRU_DT']).year 758 | df['mnth'] = pd.DatetimeIndex(df['SRVC_THRU_DT']).month 759 | df['day_of_week'] = pd.DatetimeIndex(df['SRVC_THRU_DT']).dayofweek 760 | # Convert to string 761 | df['yr'] = df['yr'].astype('str') 762 | df['mnth'] = df['mnth'].astype('str') 763 | df['day_of_week'] = df['day_of_week'].astype('str') 764 | # Drop date variables 765 | df.drop(columns = ['SRVC_THRU_DT','SRVC_FROM_DT'], inplace = True) 766 | # String columns with less than 16 unique values (for OHE) 767 | unique_cols = ['PROV_TAX_ID','PROV_NM','PROV_STR_ADRS','ROV_ZIP_5_CD','PROV_PAYENT_LCTN_CD','MX_PRCG_VRTN_CD', 768 | 'SCRN_FRMT_CD','MIXER_PARG_IND','CLM_TYP','NUM_LINES','HCFA_PT_CD','CLM_TYPE_CD','TELEHEALTH', 769 | 'PROD_DESC','NEW_CLM_TYP','UM_RQRD_IND','CLM_PAYMNT_ACTN_1_CD','yr','mnth','day_of_week'] 770 | columns_highly_correlated = ['PROV_NM', 'PROV_STR_ADRS', 'ROV_ZIP_5_CD', 'PROV_PAYENT_LCTN_CD', 'CLM_TYP', 771 | 'UM_RQRD_IND', 'MX_PRCG_VRTN_CD', 'MIXER_PARG_IND', 'HCFA_PT_CD', 'TELEHEALTH'] 772 | dataset.drop(columns = columns_highly_correlated, inplace = True) 773 | ohe 774 | 775 | 776 | # #### Questions 777 | 778 | # In[37]: 779 | 780 | 781 | #. 1. Service through and from date are same for all values 782 | dataset['duration_of_treatment'] = pd.to_datetime(dataset['SRVC_THRU_DT']) - pd.to_datetime(dataset['SRVC_FROM_DT']) 783 | dataset['date_check'] = dataset['SRVC_THRU_DT']==dataset['SRVC_FROM_DT'] 784 | 785 | 786 | # In[ ]: 787 | 788 | 789 | # 2. DTL_LINE_NBR - number 01 vs 1 - is there any difference? 
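# A quick check of question 2 (assumes DTL_LINE_NBR is still present in
# dataset; with dtype='unicode' every column is read as strings, so '01'
# and '1' count as distinct values until they are parsed):
print(dataset['DTL_LINE_NBR'].nunique(),
      pd.to_numeric(dataset['DTL_LINE_NBR']).nunique())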
790 | # Note - I think this can be an integer 791 | 792 | -------------------------------------------------------------------------------- /GMM - Clusters 6 - Normalized input.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "pd.options.display.max_rows = 4000\n", 23 | "pd.set_option('display.float_format', lambda x: '%.2f' % x)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": { 30 | "collapsed": false, 31 | "scrolled": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "df = pd.read_csv('../../0.Data/1.Interim/New/cluster.csv')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 4, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "cols = ['TOT_DUE_AMT', 'Promise_%', 'A_avg', 'P_avg', 'BAN',\n", 47 | " 'acct_bhvr_scr_nbr', 'ar_bhvr_scr_nbr', 'avg_paid_full_dy_cnt',\n", 48 | " 'crdt_buru_scr_nbr', 'cust_bhvr_scr_nbr', 'cust_recls_scr_nbr',\n", 49 | " 'pmt_arng_scr_nbr', 'wirls_ln_cnt', 'excpt_ovrd_ind',\n", 50 | " 'pyarr_scr_nbr', 'lst_bhvr_scr_nbr', 'preferred_month_CALL_DT',\n", 51 | " 'preferred_day_of_monthCALL_DT', 'preferred_weekdayCALL_DT',\n", 52 | " 'return_itm_180_dy_cnt', 'return_itm_30_dy_cnt', 'Tenure',\n", 53 | " 'MOBILITY_REGION_NAME_flagCentral', 'MOBILITY_REGION_NAME_flagEast',\n", 54 | " 'MOBILITY_REGION_NAME_flagWest', 'ACCT_STS_AT_CALL_DATE_flagN',\n", 55 | " 'ACCT_STS_AT_CALL_DATE_flagO', 'ACCT_STS_AT_CALL_DATE_flagS']" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 5, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "array(['TOT_DUE_AMT', 'Promise_%', 'Adjust_%', 'A_avg', 'P_avg',\n", 69 | " 'acct_bhvr_scr_nbr', 'ar_bhvr_scr_nbr', 'avg_paid_full_dy_cnt',\n", 70 | " 'crdt_buru_scr_nbr', 'cust_bhvr_scr_nbr', 'cust_recls_scr_nbr',\n", 71 | " 'pmt_arng_scr_nbr', 'wirls_ln_cnt', 'excpt_ovrd_ind',\n", 72 | " 'pyarr_scr_nbr', 'lst_bhvr_scr_nbr', 'preferred_month_CALL_DT',\n", 73 | " 'preferred_day_of_monthCALL_DT', 'preferred_weekdayCALL_DT',\n", 74 | " 'return_itm_180_dy_cnt', 'return_itm_30_dy_cnt', 'Tenure',\n", 75 | " 'MOBILITY_REGION_NAME_flagCentral', 'MOBILITY_REGION_NAME_flagEast',\n", 76 | " 'MOBILITY_REGION_NAME_flagWest', 'ACCT_STS_AT_CALL_DATE_flagN',\n", 77 | " 'ACCT_STS_AT_CALL_DATE_flagO', 'ACCT_STS_AT_CALL_DATE_flagS', 'BAN',\n", 78 | " 'labels'], dtype=object)" 79 | ] 80 | }, 81 | "execution_count": 5, 82 | "metadata": {}, 83 | "output_type": "execute_result" 84 | } 85 | ], 86 | "source": [ 87 | "df.columns.values" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 6, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "df_ = df[cols]" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 8, 104 | "metadata": { 105 | "collapsed": true 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "df_ =((df_-df_.min())/(df_.max()-df_.min()))*100" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 17, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [ 
119 | { 120 | "data": { 121 | "text/html": [ 122 | "
\n", 123 | "\n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | "
TOT_DUE_AMTPromise_%A_avgP_avgBANacct_bhvr_scr_nbrar_bhvr_scr_nbravg_paid_full_dy_cntcrdt_buru_scr_nbrcust_bhvr_scr_nbr...preferred_weekdayCALL_DTreturn_itm_180_dy_cntreturn_itm_30_dy_cntTenureMOBILITY_REGION_NAME_flagCentralMOBILITY_REGION_NAME_flagEastMOBILITY_REGION_NAME_flagWestACCT_STS_AT_CALL_DATE_flagNACCT_STS_AT_CALL_DATE_flagOACCT_STS_AT_CALL_DATE_flagS
09.1750.0076.0099.0524.89100.00100.004.9891.41100.00...33.330.000.0017.27100.000.000.000.00100.000.00
19.0989.4776.3299.3988.55100.00100.005.2779.96100.00...66.674.350.0036.920.00100.000.000.00100.000.00
29.2240.0075.8998.690.06100.00100.003.9193.05100.00...66.670.000.0079.46100.000.000.000.00100.000.00
38.9945.0076.0099.440.1370.1462.536.640.00100.00...66.674.350.0028.270.00100.000.000.00100.000.00
49.0950.0076.0199.3872.30100.00100.003.0364.11100.00...66.670.000.000.580.00100.000.000.00100.000.00
\n", 273 | "

5 rows × 28 columns

\n", 274 | "
" 275 | ], 276 | "text/plain": [ 277 | " TOT_DUE_AMT Promise_% A_avg P_avg BAN acct_bhvr_scr_nbr \\\n", 278 | "0 9.17 50.00 76.00 99.05 24.89 100.00 \n", 279 | "1 9.09 89.47 76.32 99.39 88.55 100.00 \n", 280 | "2 9.22 40.00 75.89 98.69 0.06 100.00 \n", 281 | "3 8.99 45.00 76.00 99.44 0.13 70.14 \n", 282 | "4 9.09 50.00 76.01 99.38 72.30 100.00 \n", 283 | "\n", 284 | " ar_bhvr_scr_nbr avg_paid_full_dy_cnt crdt_buru_scr_nbr \\\n", 285 | "0 100.00 4.98 91.41 \n", 286 | "1 100.00 5.27 79.96 \n", 287 | "2 100.00 3.91 93.05 \n", 288 | "3 62.53 6.64 0.00 \n", 289 | "4 100.00 3.03 64.11 \n", 290 | "\n", 291 | " cust_bhvr_scr_nbr ... preferred_weekdayCALL_DT \\\n", 292 | "0 100.00 ... 33.33 \n", 293 | "1 100.00 ... 66.67 \n", 294 | "2 100.00 ... 66.67 \n", 295 | "3 100.00 ... 66.67 \n", 296 | "4 100.00 ... 66.67 \n", 297 | "\n", 298 | " return_itm_180_dy_cnt return_itm_30_dy_cnt Tenure \\\n", 299 | "0 0.00 0.00 17.27 \n", 300 | "1 4.35 0.00 36.92 \n", 301 | "2 0.00 0.00 79.46 \n", 302 | "3 4.35 0.00 28.27 \n", 303 | "4 0.00 0.00 0.58 \n", 304 | "\n", 305 | " MOBILITY_REGION_NAME_flagCentral MOBILITY_REGION_NAME_flagEast \\\n", 306 | "0 100.00 0.00 \n", 307 | "1 0.00 100.00 \n", 308 | "2 100.00 0.00 \n", 309 | "3 0.00 100.00 \n", 310 | "4 0.00 100.00 \n", 311 | "\n", 312 | " MOBILITY_REGION_NAME_flagWest ACCT_STS_AT_CALL_DATE_flagN \\\n", 313 | "0 0.00 0.00 \n", 314 | "1 0.00 0.00 \n", 315 | "2 0.00 0.00 \n", 316 | "3 0.00 0.00 \n", 317 | "4 0.00 0.00 \n", 318 | "\n", 319 | " ACCT_STS_AT_CALL_DATE_flagO ACCT_STS_AT_CALL_DATE_flagS \n", 320 | "0 100.00 0.00 \n", 321 | "1 100.00 0.00 \n", 322 | "2 100.00 0.00 \n", 323 | "3 100.00 0.00 \n", 324 | "4 100.00 0.00 \n", 325 | "\n", 326 | "[5 rows x 28 columns]" 327 | ] 328 | }, 329 | "execution_count": 17, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "df_.head()" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 10, 341 | "metadata": { 342 | "collapsed": true 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "# Run GMM \n", 347 | "from sklearn.mixture import GMM" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 11, 353 | "metadata": { 354 | "collapsed": false 355 | }, 356 | "outputs": [ 357 | { 358 | "name": "stderr", 359 | "output_type": "stream", 360 | "text": [ 361 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:52: DeprecationWarning: Class GMM is deprecated; The class GMM is deprecated in 0.18 and will be removed in 0.20. 
Use class GaussianMixture instead.\n", 362 | "  warnings.warn(msg, category=DeprecationWarning)\n" 363 | ] 364 | }, 365 | { 366 | "ename": "KeyboardInterrupt", 367 | "evalue": "", 368 | "output_type": "error", 369 | "traceback": [
 370 | "---------------------------------------------------------------------------",
 371 | "KeyboardInterrupt                         Traceback (most recent call last)",
 372 | "<ipython-input> in <module>()\n----> 1 get_ipython().run_cell_magic('time', '', 'gmm = GMM(n_components=9).fit(df_)\\nlabels = gmm.predict(df_)')",
 373 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/mixture/gmm.py in fit(self, X, y)\n--> 597     self._fit(X, y)",
 374 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/mixture/gmm.py in _fit(self, X, y, do_prediction)\n    491     self.means_ = cluster.KMeans(\n    492         n_clusters=self.n_components,\n--> 493         random_state=self.random_state).fit(X).cluster_centers_",
 375 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/cluster/k_means_.py in fit(self, X, y)\n--> 889     return_n_iter=True)",
 376 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/cluster/k_means_.py in k_means(X, n_clusters, ...)\n--> 345     x_squared_norms=x_squared_norms, random_state=random_state)",
 377 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/cluster/k_means_.py in _kmeans_single_elkan(...)\n--> 399     max_iter=max_iter, verbose=verbose)",
 378 | "sklearn/cluster/_k_means_elkan.pyx in sklearn.cluster._k_means_elkan.k_means_elkan 
(sklearn/cluster/_k_means_elkan.c:7470)\u001b[0;34m()\u001b[0m\n", 384 | "\u001b[0;32m/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/metrics/pairwise.py\u001b[0m in \u001b[0;36meuclidean_distances\u001b[0;34m(X, Y, Y_norm_squared, squared, X_norm_squared)\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[0;31m# Pairwise distances\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 162\u001b[0;31m def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,\n\u001b[0m\u001b[1;32m 163\u001b[0m X_norm_squared=None):\n\u001b[1;32m 164\u001b[0m \"\"\"\n", 385 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "%%time\n", 391 | "gmm = GMM(n_components=9).fit(df_)\n", 392 | "labels = gmm.predict(df_)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 21, 398 | "metadata": { 399 | "collapsed": true 400 | }, 401 | "outputs": [], 402 | "source": [ 403 | "cluster_normalized = df_.copy()" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 58, 409 | "metadata": { 410 | "collapsed": false 411 | }, 412 | "outputs": [ 413 | { 414 | "data": { 415 | "text/plain": [ 416 | "TOT_DUE_AMT 9.09\n", 417 | "Promise_% 63.69\n", 418 | "A_avg 76.00\n", 419 | "P_avg 99.09\n", 420 | "BAN 57.30\n", 421 | "acct_bhvr_scr_nbr 98.03\n", 422 | "ar_bhvr_scr_nbr 97.95\n", 423 | "avg_paid_full_dy_cnt 8.73\n", 424 | "crdt_buru_scr_nbr 83.87\n", 425 | "cust_bhvr_scr_nbr 98.27\n", 426 | "cust_recls_scr_nbr 89.64\n", 427 | "pmt_arng_scr_nbr 82.87\n", 428 | "wirls_ln_cnt 0.56\n", 429 | "excpt_ovrd_ind 0.75\n", 430 | "pyarr_scr_nbr 82.87\n", 431 | "lst_bhvr_scr_nbr 77.48\n", 432 | "preferred_month_CALL_DT 60.62\n", 433 | "preferred_day_of_monthCALL_DT 56.81\n", 434 | "preferred_weekdayCALL_DT 42.45\n", 435 | "return_itm_180_dy_cnt 0.45\n", 436 | "return_itm_30_dy_cnt 0.27\n", 437 | "Tenure 21.54\n", 438 | "MOBILITY_REGION_NAME_flagCentral 36.56\n", 439 | "MOBILITY_REGION_NAME_flagEast 44.19\n", 440 | "MOBILITY_REGION_NAME_flagWest 19.25\n", 441 | "ACCT_STS_AT_CALL_DATE_flagN 0.21\n", 442 | "ACCT_STS_AT_CALL_DATE_flagO 99.60\n", 443 | "ACCT_STS_AT_CALL_DATE_flagS 0.20\n", 444 | "labels 2.68\n", 445 | "dtype: float64" 446 | ] 447 | }, 448 | "execution_count": 58, 449 | "metadata": {}, 450 | "output_type": "execute_result" 451 | } 452 | ], 453 | "source": [ 454 | "cluster_normalized.mean()" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "metadata": { 461 | "collapsed": true 462 | }, 463 | "outputs": [], 464 | "source": [] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": { 470 | "collapsed": true 471 | }, 472 | "outputs": [], 473 | "source": [] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": { 479 | "collapsed": true 480 | }, 481 | "outputs": [], 482 | "source": [] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": 22, 487 | "metadata": { 488 | "collapsed": true 489 | }, 490 | "outputs": [], 491 | "source": [ 492 | "cluster_normalized['labels'] = labels" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": 23, 498 | "metadata": { 499 | "collapsed": false 500 | }, 501 | "outputs": [], 502 | "source": [ 503 | "c1 = cluster_normalized.loc[cluster_normalized['labels'] == 0]\n", 504 | "c2 = cluster_normalized.loc[cluster_normalized['labels'] == 1]\n", 505 | "c3 = 
cluster_normalized.loc[cluster_normalized['labels'] == 2]\n", 506 | "c4 = cluster_normalized.loc[cluster_normalized['labels'] == 3]\n", 507 | "c5 = cluster_normalized.loc[cluster_normalized['labels'] == 4]\n", 508 | "c6 = cluster_normalized.loc[cluster_normalized['labels'] == 5]" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 24, 514 | "metadata": { 515 | "collapsed": true 516 | }, 517 | "outputs": [], 518 | "source": [ 519 | "df = cluster_normalized.drop('labels', axis = 1)" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 34, 525 | "metadata": { 526 | "collapsed": true 527 | }, 528 | "outputs": [ 529 | { 530 | "data": { 531 | "text/plain": [ 532 | "TOT_DUE_AMT 9.09\n", 533 | "Promise_% 63.69\n", 534 | "A_avg 76.00\n", 535 | "P_avg 99.09\n", 536 | "BAN 57.30\n", 537 | "acct_bhvr_scr_nbr 98.03\n", 538 | "ar_bhvr_scr_nbr 97.95\n", 539 | "avg_paid_full_dy_cnt 8.73\n", 540 | "crdt_buru_scr_nbr 83.87\n", 541 | "cust_bhvr_scr_nbr 98.27\n", 542 | "cust_recls_scr_nbr 89.64\n", 543 | "pmt_arng_scr_nbr 82.87\n", 544 | "wirls_ln_cnt 0.56\n", 545 | "excpt_ovrd_ind 0.75\n", 546 | "pyarr_scr_nbr 82.87\n", 547 | "lst_bhvr_scr_nbr 77.48\n", 548 | "preferred_month_CALL_DT 60.62\n", 549 | "preferred_day_of_monthCALL_DT 56.81\n", 550 | "preferred_weekdayCALL_DT 42.45\n", 551 | "return_itm_180_dy_cnt 0.45\n", 552 | "return_itm_30_dy_cnt 0.27\n", 553 | "Tenure 21.54\n", 554 | "MOBILITY_REGION_NAME_flagCentral 36.56\n", 555 | "MOBILITY_REGION_NAME_flagEast 44.19\n", 556 | "MOBILITY_REGION_NAME_flagWest 19.25\n", 557 | "ACCT_STS_AT_CALL_DATE_flagN 0.21\n", 558 | "ACCT_STS_AT_CALL_DATE_flagO 99.60\n", 559 | "ACCT_STS_AT_CALL_DATE_flagS 0.20\n", 560 | "dtype: float64" 561 | ] 562 | }, 563 | "execution_count": 34, 564 | "metadata": {}, 565 | "output_type": "execute_result" 566 | } 567 | ], 568 | "source": [ 569 | "df_.mean()" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 36, 575 | "metadata": { 576 | "collapsed": true 577 | }, 578 | "outputs": [], 579 | "source": [ 580 | "df_.to_csv('../../0.Data/1.Interim/New/merge_normalized.csv', index = False)" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": 38, 586 | "metadata": { 587 | "collapsed": true 588 | }, 589 | "outputs": [], 590 | "source": [ 591 | "cluster_normalized.to_csv('../../0.Data/1.Interim/New/cluster_normalized.csv', index = False)" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 37, 597 | "metadata": { 598 | "collapsed": false 599 | }, 600 | "outputs": [ 601 | { 602 | "data": { 603 | "text/plain": [ 604 | "array(['TOT_DUE_AMT', 'Promise_%', 'A_avg', 'P_avg', 'BAN',\n", 605 | " 'acct_bhvr_scr_nbr', 'ar_bhvr_scr_nbr', 'avg_paid_full_dy_cnt',\n", 606 | " 'crdt_buru_scr_nbr', 'cust_bhvr_scr_nbr', 'cust_recls_scr_nbr',\n", 607 | " 'pmt_arng_scr_nbr', 'wirls_ln_cnt', 'excpt_ovrd_ind',\n", 608 | " 'pyarr_scr_nbr', 'lst_bhvr_scr_nbr', 'preferred_month_CALL_DT',\n", 609 | " 'preferred_day_of_monthCALL_DT', 'preferred_weekdayCALL_DT',\n", 610 | " 'return_itm_180_dy_cnt', 'return_itm_30_dy_cnt', 'Tenure',\n", 611 | " 'MOBILITY_REGION_NAME_flagCentral', 'MOBILITY_REGION_NAME_flagEast',\n", 612 | " 'MOBILITY_REGION_NAME_flagWest', 'ACCT_STS_AT_CALL_DATE_flagN',\n", 613 | " 'ACCT_STS_AT_CALL_DATE_flagO', 'ACCT_STS_AT_CALL_DATE_flagS',\n", 614 | " 'labels'], dtype=object)" 615 | ] 616 | }, 617 | "execution_count": 37, 618 | "metadata": {}, 619 | "output_type": "execute_result" 620 | } 621 | ], 622 | "source": [ 623 | 
"cluster_normalized.columns.values" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": 25, 629 | "metadata": { 630 | "collapsed": false 631 | }, 632 | "outputs": [ 633 | { 634 | "data": { 635 | "text/plain": [ 636 | "array(['TOT_DUE_AMT', 'Promise_%', 'A_avg', 'P_avg', 'BAN',\n", 637 | " 'acct_bhvr_scr_nbr', 'ar_bhvr_scr_nbr', 'avg_paid_full_dy_cnt',\n", 638 | " 'crdt_buru_scr_nbr', 'cust_bhvr_scr_nbr', 'cust_recls_scr_nbr',\n", 639 | " 'pmt_arng_scr_nbr', 'wirls_ln_cnt', 'excpt_ovrd_ind',\n", 640 | " 'pyarr_scr_nbr', 'lst_bhvr_scr_nbr', 'preferred_month_CALL_DT',\n", 641 | " 'preferred_day_of_monthCALL_DT', 'preferred_weekdayCALL_DT',\n", 642 | " 'return_itm_180_dy_cnt', 'return_itm_30_dy_cnt', 'Tenure',\n", 643 | " 'MOBILITY_REGION_NAME_flagCentral', 'MOBILITY_REGION_NAME_flagEast',\n", 644 | " 'MOBILITY_REGION_NAME_flagWest', 'ACCT_STS_AT_CALL_DATE_flagN',\n", 645 | " 'ACCT_STS_AT_CALL_DATE_flagO', 'ACCT_STS_AT_CALL_DATE_flagS'], dtype=object)" 646 | ] 647 | }, 648 | "execution_count": 25, 649 | "metadata": {}, 650 | "output_type": "execute_result" 651 | } 652 | ], 653 | "source": [ 654 | "df.columns.values" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 27, 660 | "metadata": { 661 | "collapsed": false 662 | }, 663 | "outputs": [], 664 | "source": [ 665 | "cluster_grp = []\n", 666 | "for col in df.columns.values.tolist():\n", 667 | " orig = round(df[col].mean(),2)\n", 668 | " c1_val = round(c1[col].mean(),2)\n", 669 | " c2_val = round(c2[col].mean(),2)\n", 670 | " c3_val = round(c3[col].mean(),2)\n", 671 | " c4_val = round(c4[col].mean(),2)\n", 672 | " c5_val = round(c5[col].mean(),2)\n", 673 | " c6_val = round(c6[col].mean(),2)\n", 674 | " if(orig == 0):\n", 675 | " c1_change = None\n", 676 | " c2_change = None\n", 677 | " c3_change = None\n", 678 | " c4_change = None\n", 679 | " c5_change = None\n", 680 | " c6_change = None\n", 681 | " else:\n", 682 | " c1_change = round(((c1_val-orig)*100/orig),2)\n", 683 | " c2_change = round(((c2_val-orig)*100/orig),2)\n", 684 | " c3_change = round(((c3_val-orig)*100/orig),2)\n", 685 | " c4_change = round(((c4_val-orig)*100/orig),2)\n", 686 | " c5_change = round(((c5_val-orig)*100/orig),2)\n", 687 | " c6_change = round(((c6_val-orig)*100/orig),2)\n", 688 | " cluster_grp.append((col, orig, c1_val, c1_change , c2_val, c2_change, c3_val, c3_change, c4_val, c4_change, c5_val, c5_change,\\\n", 689 | " c6_val, c6_change))\n", 690 | "\n" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 28, 696 | "metadata": { 697 | "collapsed": false 698 | }, 699 | "outputs": [], 700 | "source": [ 701 | "cluster_group = pd.DataFrame(cluster_grp, columns = ['Column', 'Entire dataset value', 'c1_value', 'c1_change_%', \\\n", 702 | " 'c2_value', 'c2_change_%', 'c3_value', 'c3_change_%', 'c4_value', 'c4_change_%', \\\n", 703 | " 'c5_value', 'c5_change_%', 'c6_value' , 'c6_change_%'])" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 29, 709 | "metadata": { 710 | "collapsed": false 711 | }, 712 | "outputs": [ 713 | { 714 | "data": { 715 | "text/plain": [ 716 | "28" 717 | ] 718 | }, 719 | "execution_count": 29, 720 | "metadata": {}, 721 | "output_type": "execute_result" 722 | } 723 | ], 724 | "source": [ 725 | "len(cluster_group)" 726 | ] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": 30, 731 | "metadata": { 732 | "collapsed": true 733 | }, 734 | "outputs": [], 735 | "source": [ 736 | 
"cluster_group.to_csv('../3.Analysis/cluster_gmm_1_normalized.csv', index = False)" 737 | ] 738 | }, 739 | { 740 | "cell_type": "code", 741 | "execution_count": 45, 742 | "metadata": { 743 | "collapsed": false 744 | }, 745 | "outputs": [ 746 | { 747 | "data": { 748 | "text/plain": [ 749 | "array(['TOT_DUE_AMT', 'Promise_%', 'A_avg', 'P_avg', 'BAN',\n", 750 | " 'acct_bhvr_scr_nbr', 'ar_bhvr_scr_nbr', 'avg_paid_full_dy_cnt',\n", 751 | " 'crdt_buru_scr_nbr', 'cust_bhvr_scr_nbr', 'cust_recls_scr_nbr',\n", 752 | " 'pmt_arng_scr_nbr', 'wirls_ln_cnt', 'excpt_ovrd_ind',\n", 753 | " 'pyarr_scr_nbr', 'lst_bhvr_scr_nbr', 'preferred_month_CALL_DT',\n", 754 | " 'preferred_day_of_monthCALL_DT', 'preferred_weekdayCALL_DT',\n", 755 | " 'return_itm_180_dy_cnt', 'return_itm_30_dy_cnt', 'Tenure',\n", 756 | " 'MOBILITY_REGION_NAME_flagCentral', 'MOBILITY_REGION_NAME_flagEast',\n", 757 | " 'MOBILITY_REGION_NAME_flagWest', 'ACCT_STS_AT_CALL_DATE_flagN',\n", 758 | " 'ACCT_STS_AT_CALL_DATE_flagO', 'ACCT_STS_AT_CALL_DATE_flagS'], dtype=object)" 759 | ] 760 | }, 761 | "execution_count": 45, 762 | "metadata": {}, 763 | "output_type": "execute_result" 764 | } 765 | ], 766 | "source": [ 767 | "df_.columns.values" 768 | ] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "execution_count": 57, 773 | "metadata": { 774 | "collapsed": false 775 | }, 776 | "outputs": [ 777 | { 778 | "data": { 779 | "text/plain": [ 780 | "0.00 0.96\n", 781 | "6.67 0.03\n", 782 | "13.33 0.00\n", 783 | "20.00 0.00\n", 784 | "26.67 0.00\n", 785 | "33.33 0.00\n", 786 | "40.00 0.00\n", 787 | "46.67 0.00\n", 788 | "93.33 0.00\n", 789 | "100.00 0.00\n", 790 | "53.33 0.00\n", 791 | "Name: return_itm_30_dy_cnt, dtype: float64" 792 | ] 793 | }, 794 | "execution_count": 57, 795 | "metadata": {}, 796 | "output_type": "execute_result" 797 | } 798 | ], 799 | "source": [ 800 | "df_.return_itm_30_dy_cnt.value_counts()/len(df_)" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": 47, 806 | "metadata": { 807 | "collapsed": false 808 | }, 809 | "outputs": [], 810 | "source": [ 811 | "df_['labels'] = labels" 812 | ] 813 | }, 814 | { 815 | "cell_type": "code", 816 | "execution_count": 48, 817 | "metadata": { 818 | "collapsed": true 819 | }, 820 | "outputs": [], 821 | "source": [ 822 | "c1_ = df_.loc[df_['labels'] == 0]\n", 823 | "c2_ = df_.loc[df_['labels'] == 1]\n", 824 | "c3_ = df_.loc[df_['labels'] == 2]\n", 825 | "c4_ = df_.loc[df_['labels'] == 3]\n", 826 | "c5_ = df_.loc[df_['labels'] == 4]\n", 827 | "c6_ = df_.loc[df_['labels'] == 5]" 828 | ] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": 51, 833 | "metadata": { 834 | "collapsed": false 835 | }, 836 | "outputs": [ 837 | { 838 | "data": { 839 | "text/plain": [ 840 | "54.54545454545454" 841 | ] 842 | }, 843 | "execution_count": 51, 844 | "metadata": {}, 845 | "output_type": "execute_result" 846 | } 847 | ], 848 | "source": [ 849 | "c5_.wirls_ln_cnt.max()" 850 | ] 851 | }, 852 | { 853 | "cell_type": "code", 854 | "execution_count": 52, 855 | "metadata": { 856 | "collapsed": false 857 | }, 858 | "outputs": [ 859 | { 860 | "data": { 861 | "text/plain": [ 862 | "0.0" 863 | ] 864 | }, 865 | "execution_count": 52, 866 | "metadata": {}, 867 | "output_type": "execute_result" 868 | } 869 | ], 870 | "source": [ 871 | "c5_.wirls_ln_cnt.min()" 872 | ] 873 | }, 874 | { 875 | "cell_type": "code", 876 | "execution_count": 53, 877 | "metadata": { 878 | "collapsed": true 879 | }, 880 | "outputs": [ 881 | { 882 | "data": { 883 | "text/plain": [ 884 | "0.17 368\n", 885 | "0.00 
347\n", 886 | "0.34 314\n", 887 | "0.51 278\n", 888 | "0.69 214\n", 889 | "0.86 189\n", 890 | "1.03 122\n", 891 | "1.20 80\n", 892 | "1.37 51\n", 893 | "1.54 34\n", 894 | "1.72 31\n", 895 | "1.89 16\n", 896 | "2.23 16\n", 897 | "2.40 16\n", 898 | "3.43 14\n", 899 | "3.95 13\n", 900 | "2.74 13\n", 901 | "2.92 11\n", 902 | "3.09 10\n", 903 | "2.06 9\n", 904 | "3.26 7\n", 905 | "4.29 7\n", 906 | "2.57 7\n", 907 | "3.77 6\n", 908 | "5.15 6\n", 909 | "4.12 6\n", 910 | "3.60 6\n", 911 | "4.63 5\n", 912 | "5.83 4\n", 913 | "5.66 3\n", 914 | "5.49 3\n", 915 | "4.80 3\n", 916 | "8.06 3\n", 917 | "4.97 3\n", 918 | "6.35 2\n", 919 | "6.86 2\n", 920 | "5.32 2\n", 921 | "6.17 2\n", 922 | "7.03 1\n", 923 | "17.50 1\n", 924 | "10.81 1\n", 925 | "16.12 1\n", 926 | "4.46 1\n", 927 | "6.69 1\n", 928 | "6.00 1\n", 929 | "14.07 1\n", 930 | "10.12 1\n", 931 | "32.59 1\n", 932 | "8.40 1\n", 933 | "10.46 1\n", 934 | "13.89 1\n", 935 | "11.84 1\n", 936 | "54.55 1\n", 937 | "14.75 1\n", 938 | "11.15 1\n", 939 | "20.41 1\n", 940 | "9.43 1\n", 941 | "6.52 1\n", 942 | "23.67 1\n", 943 | "29.33 1\n", 944 | "12.01 1\n", 945 | "8.23 1\n", 946 | "19.38 1\n", 947 | "19.90 1\n", 948 | "7.38 1\n", 949 | "7.55 1\n", 950 | "22.30 1\n", 951 | "9.61 1\n", 952 | "7.72 1\n", 953 | "9.95 1\n", 954 | "12.86 1\n", 955 | "Name: wirls_ln_cnt, dtype: int64" 956 | ] 957 | }, 958 | "execution_count": 53, 959 | "metadata": {}, 960 | "output_type": "execute_result" 961 | } 962 | ], 963 | "source": [ 964 | "c5_.wirls_ln_cnt.value_counts()" 965 | ] 966 | }, 967 | { 968 | "cell_type": "code", 969 | "execution_count": null, 970 | "metadata": { 971 | "collapsed": true 972 | }, 973 | "outputs": [], 974 | "source": [] 975 | } 976 | ], 977 | "metadata": { 978 | "kernelspec": { 979 | "display_name": "Python 3", 980 | "language": "python", 981 | "name": "python3" 982 | }, 983 | "language_info": { 984 | "codemirror_mode": { 985 | "name": "ipython", 986 | "version": 3 987 | }, 988 | "file_extension": ".py", 989 | "mimetype": "text/x-python", 990 | "name": "python", 991 | "nbconvert_exporter": "python", 992 | "pygments_lexer": "ipython3", 993 | "version": "3.6.0" 994 | } 995 | }, 996 | "nbformat": 4, 997 | "nbformat_minor": 1 998 | } 999 | -------------------------------------------------------------------------------- /GMM - Clusters 6 - Normalized using mean (remove 30 days variable).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 4, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "pd.options.display.max_rows = 4000\n", 23 | "pd.set_option('display.float_format', lambda x: '%.2f' % x)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 5, 29 | "metadata": { 30 | "collapsed": false, 31 | "scrolled": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "df = pd.read_csv('../../0.Data/1.Interim/New/cluster.csv')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 1, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "cols = ['TOT_DUE_AMT', 'Promise_%', 'A_avg', 'P_avg',\n", 47 | " 'acct_bhvr_scr_nbr', 'ar_bhvr_scr_nbr', 'avg_paid_full_dy_cnt',\n", 48 | " 'crdt_buru_scr_nbr', 'cust_bhvr_scr_nbr', 'cust_recls_scr_nbr',\n", 49 | " 
'pmt_arng_scr_nbr', 'wirls_ln_cnt', 'excpt_ovrd_ind',\n", 50 | " 'pyarr_scr_nbr', 'lst_bhvr_scr_nbr', 'preferred_month_CALL_DT',\n", 51 | " 'preferred_day_of_monthCALL_DT', 'preferred_weekdayCALL_DT',\n", 52 | " 'return_itm_180_dy_cnt', 'Tenure',\n", 53 | " 'MOBILITY_REGION_NAME_flagCentral', 'MOBILITY_REGION_NAME_flagEast',\n", 54 | " 'MOBILITY_REGION_NAME_flagWest', 'ACCT_STS_AT_CALL_DATE_flagN',\n", 55 | " 'ACCT_STS_AT_CALL_DATE_flagO', 'ACCT_STS_AT_CALL_DATE_flagS']" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 6, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "array(['TOT_DUE_AMT', 'Promise_%', 'Adjust_%', 'A_avg', 'P_avg',\n", 69 | " 'acct_bhvr_scr_nbr', 'ar_bhvr_scr_nbr', 'avg_paid_full_dy_cnt',\n", 70 | " 'crdt_buru_scr_nbr', 'cust_bhvr_scr_nbr', 'cust_recls_scr_nbr',\n", 71 | " 'pmt_arng_scr_nbr', 'wirls_ln_cnt', 'excpt_ovrd_ind',\n", 72 | " 'pyarr_scr_nbr', 'lst_bhvr_scr_nbr', 'preferred_month_CALL_DT',\n", 73 | " 'preferred_day_of_monthCALL_DT', 'preferred_weekdayCALL_DT',\n", 74 | " 'return_itm_180_dy_cnt', 'return_itm_30_dy_cnt', 'Tenure',\n", 75 | " 'MOBILITY_REGION_NAME_flagCentral', 'MOBILITY_REGION_NAME_flagEast',\n", 76 | " 'MOBILITY_REGION_NAME_flagWest', 'ACCT_STS_AT_CALL_DATE_flagN',\n", 77 | " 'ACCT_STS_AT_CALL_DATE_flagO', 'ACCT_STS_AT_CALL_DATE_flagS', 'BAN',\n", 78 | " 'labels'], dtype=object)" 79 | ] 80 | }, 81 | "execution_count": 6, 82 | "metadata": {}, 83 | "output_type": "execute_result" 84 | } 85 | ], 86 | "source": [ 87 | "df.columns.values" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 7, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "df_ = df[cols]" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 8, 104 | "metadata": { 105 | "collapsed": true 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "# Run GMM \n", 110 | "from sklearn.mixture import GMM" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 16, 116 | "metadata": { 117 | "collapsed": true 118 | }, 119 | "outputs": [ 120 | { 121 | "name": "stderr", 122 | "output_type": "stream", 123 | "text": [ 124 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:52: DeprecationWarning: Class GMM is deprecated; The class GMM is deprecated in 0.18 and will be removed in 0.20. 
Use class GaussianMixture instead.\n", 125 | " warnings.warn(msg, category=DeprecationWarning)\n", 126 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function distribute_covar_matrix_to_match_covariance_type is deprecated; The functon distribute_covar_matrix_to_match_covariance_typeis deprecated in 0.18 and will be removed in 0.20.\n", 127 | " warnings.warn(msg, category=DeprecationWarning)\n", 128 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 129 | " warnings.warn(msg, category=DeprecationWarning)\n", 130 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 131 | " warnings.warn(msg, category=DeprecationWarning)\n", 132 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 133 | " warnings.warn(msg, category=DeprecationWarning)\n", 134 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 135 | " warnings.warn(msg, category=DeprecationWarning)\n", 136 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 137 | " warnings.warn(msg, category=DeprecationWarning)\n", 138 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 139 | " warnings.warn(msg, category=DeprecationWarning)\n", 140 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 141 | " warnings.warn(msg, category=DeprecationWarning)\n", 142 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 143 | " warnings.warn(msg, category=DeprecationWarning)\n", 144 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 145 | " warnings.warn(msg, category=DeprecationWarning)\n", 146 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: 
DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 147 | " warnings.warn(msg, category=DeprecationWarning)\n", 148 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 149 | " warnings.warn(msg, category=DeprecationWarning)\n", 150 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 151 | " warnings.warn(msg, category=DeprecationWarning)\n", 152 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 153 | " warnings.warn(msg, category=DeprecationWarning)\n", 154 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 155 | " warnings.warn(msg, category=DeprecationWarning)\n", 156 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 157 | " warnings.warn(msg, category=DeprecationWarning)\n", 158 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 159 | " warnings.warn(msg, category=DeprecationWarning)\n", 160 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 161 | " warnings.warn(msg, category=DeprecationWarning)\n", 162 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 163 | " warnings.warn(msg, category=DeprecationWarning)\n", 164 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 165 | " warnings.warn(msg, category=DeprecationWarning)\n", 166 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 167 | " warnings.warn(msg, 
category=DeprecationWarning)\n", 168 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 169 | " warnings.warn(msg, category=DeprecationWarning)\n", 170 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 171 | " warnings.warn(msg, category=DeprecationWarning)\n", 172 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 173 | " warnings.warn(msg, category=DeprecationWarning)\n", 174 | "/opt/app/anaconda2/python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:70: DeprecationWarning: Function log_multivariate_normal_density is deprecated; The function log_multivariate_normal_density is deprecated in 0.18 and will be removed in 0.20.\n", 175 | " warnings.warn(msg, category=DeprecationWarning)\n" 176 | ] 177 | }, 178 | { 179 | "name": "stdout", 180 | "output_type": "stream", 181 | "text": [ 182 | "CPU times: user 2min 44s, sys: 24 s, total: 3min 8s\n", 183 | "Wall time: 50.1 s\n" 184 | ] 185 | } 186 | ], 187 | "source": [ 188 | "%%time\n", 189 | "gmm = GMM(n_components=6).fit(df_)\n", 190 | "labels = gmm.predict(df_)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 17, 196 | "metadata": { 197 | "collapsed": true 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "cluster_normalized = df_.copy()" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 18, 207 | "metadata": { 208 | "collapsed": true 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "cluster_normalized['labels'] = labels" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 19, 218 | "metadata": { 219 | "collapsed": false 220 | }, 221 | "outputs": [], 222 | "source": [ 223 | "c1 = cluster_normalized.loc[cluster_normalized['labels'] == 0]\n", 224 | "c2 = cluster_normalized.loc[cluster_normalized['labels'] == 1]\n", 225 | "c3 = cluster_normalized.loc[cluster_normalized['labels'] == 2]\n", 226 | "c4 = cluster_normalized.loc[cluster_normalized['labels'] == 3]\n", 227 | "c5 = cluster_normalized.loc[cluster_normalized['labels'] == 4]\n", 228 | "c6 = cluster_normalized.loc[cluster_normalized['labels'] == 5]" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 20, 234 | "metadata": { 235 | "collapsed": false 236 | }, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "3 0.34\n", 242 | "5 0.26\n", 243 | "1 0.15\n", 244 | "0 0.14\n", 245 | "2 0.07\n", 246 | "4 0.04\n", 247 | "Name: labels, dtype: float64" 248 | ] 249 | }, 250 | "execution_count": 20, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "cluster_normalized['labels'].value_counts()/len(cluster_normalized)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "collapsed": true 264 | }, 265 | "outputs": [], 266 | "source": [] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | 
"metadata": { 272 | "collapsed": true 273 | }, 274 | "outputs": [], 275 | "source": [] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 24, 280 | "metadata": { 281 | "collapsed": true 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "df = cluster_normalized.drop('labels', axis = 1)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 34, 291 | "metadata": { 292 | "collapsed": true 293 | }, 294 | "outputs": [ 295 | { 296 | "data": { 297 | "text/plain": [ 298 | "TOT_DUE_AMT 9.09\n", 299 | "Promise_% 63.69\n", 300 | "A_avg 76.00\n", 301 | "P_avg 99.09\n", 302 | "BAN 57.30\n", 303 | "acct_bhvr_scr_nbr 98.03\n", 304 | "ar_bhvr_scr_nbr 97.95\n", 305 | "avg_paid_full_dy_cnt 8.73\n", 306 | "crdt_buru_scr_nbr 83.87\n", 307 | "cust_bhvr_scr_nbr 98.27\n", 308 | "cust_recls_scr_nbr 89.64\n", 309 | "pmt_arng_scr_nbr 82.87\n", 310 | "wirls_ln_cnt 0.56\n", 311 | "excpt_ovrd_ind 0.75\n", 312 | "pyarr_scr_nbr 82.87\n", 313 | "lst_bhvr_scr_nbr 77.48\n", 314 | "preferred_month_CALL_DT 60.62\n", 315 | "preferred_day_of_monthCALL_DT 56.81\n", 316 | "preferred_weekdayCALL_DT 42.45\n", 317 | "return_itm_180_dy_cnt 0.45\n", 318 | "return_itm_30_dy_cnt 0.27\n", 319 | "Tenure 21.54\n", 320 | "MOBILITY_REGION_NAME_flagCentral 36.56\n", 321 | "MOBILITY_REGION_NAME_flagEast 44.19\n", 322 | "MOBILITY_REGION_NAME_flagWest 19.25\n", 323 | "ACCT_STS_AT_CALL_DATE_flagN 0.21\n", 324 | "ACCT_STS_AT_CALL_DATE_flagO 99.60\n", 325 | "ACCT_STS_AT_CALL_DATE_flagS 0.20\n", 326 | "dtype: float64" 327 | ] 328 | }, 329 | "execution_count": 34, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "df_.mean()" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 36, 341 | "metadata": { 342 | "collapsed": true 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "df_.to_csv('../../0.Data/1.Interim/New/merge_normalized.csv', index = False)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 38, 352 | "metadata": { 353 | "collapsed": true 354 | }, 355 | "outputs": [], 356 | "source": [ 357 | "cluster_normalized.to_csv('../../0.Data/1.Interim/New/cluster_normalized.csv', index = False)" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 37, 363 | "metadata": { 364 | "collapsed": false 365 | }, 366 | "outputs": [ 367 | { 368 | "data": { 369 | "text/plain": [ 370 | "array(['TOT_DUE_AMT', 'Promise_%', 'A_avg', 'P_avg', 'BAN',\n", 371 | " 'acct_bhvr_scr_nbr', 'ar_bhvr_scr_nbr', 'avg_paid_full_dy_cnt',\n", 372 | " 'crdt_buru_scr_nbr', 'cust_bhvr_scr_nbr', 'cust_recls_scr_nbr',\n", 373 | " 'pmt_arng_scr_nbr', 'wirls_ln_cnt', 'excpt_ovrd_ind',\n", 374 | " 'pyarr_scr_nbr', 'lst_bhvr_scr_nbr', 'preferred_month_CALL_DT',\n", 375 | " 'preferred_day_of_monthCALL_DT', 'preferred_weekdayCALL_DT',\n", 376 | " 'return_itm_180_dy_cnt', 'return_itm_30_dy_cnt', 'Tenure',\n", 377 | " 'MOBILITY_REGION_NAME_flagCentral', 'MOBILITY_REGION_NAME_flagEast',\n", 378 | " 'MOBILITY_REGION_NAME_flagWest', 'ACCT_STS_AT_CALL_DATE_flagN',\n", 379 | " 'ACCT_STS_AT_CALL_DATE_flagO', 'ACCT_STS_AT_CALL_DATE_flagS',\n", 380 | " 'labels'], dtype=object)" 381 | ] 382 | }, 383 | "execution_count": 37, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "cluster_normalized.columns.values" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 25, 395 | "metadata": { 396 | "collapsed": false 397 | }, 398 | "outputs": [ 399 | { 400 | "data": 
{ 401 | "text/plain": [ 402 | "array(['TOT_DUE_AMT', 'Promise_%', 'A_avg', 'P_avg', 'BAN',\n", 403 | " 'acct_bhvr_scr_nbr', 'ar_bhvr_scr_nbr', 'avg_paid_full_dy_cnt',\n", 404 | " 'crdt_buru_scr_nbr', 'cust_bhvr_scr_nbr', 'cust_recls_scr_nbr',\n", 405 | " 'pmt_arng_scr_nbr', 'wirls_ln_cnt', 'excpt_ovrd_ind',\n", 406 | " 'pyarr_scr_nbr', 'lst_bhvr_scr_nbr', 'preferred_month_CALL_DT',\n", 407 | " 'preferred_day_of_monthCALL_DT', 'preferred_weekdayCALL_DT',\n", 408 | " 'return_itm_180_dy_cnt', 'return_itm_30_dy_cnt', 'Tenure',\n", 409 | " 'MOBILITY_REGION_NAME_flagCentral', 'MOBILITY_REGION_NAME_flagEast',\n", 410 | " 'MOBILITY_REGION_NAME_flagWest', 'ACCT_STS_AT_CALL_DATE_flagN',\n", 411 | " 'ACCT_STS_AT_CALL_DATE_flagO', 'ACCT_STS_AT_CALL_DATE_flagS'], dtype=object)" 412 | ] 413 | }, 414 | "execution_count": 25, 415 | "metadata": {}, 416 | "output_type": "execute_result" 417 | } 418 | ], 419 | "source": [ 420 | "df.columns.values" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 27, 426 | "metadata": { 427 | "collapsed": false 428 | }, 429 | "outputs": [], 430 | "source": [ 431 | "cluster_grp = []\n", 432 | "for col in df.columns.values.tolist():\n", 433 | " orig = round(df[col].mean(),2)\n", 434 | " c1_val = round(c1[col].mean(),2)\n", 435 | " c2_val = round(c2[col].mean(),2)\n", 436 | " c3_val = round(c3[col].mean(),2)\n", 437 | " c4_val = round(c4[col].mean(),2)\n", 438 | " c5_val = round(c5[col].mean(),2)\n", 439 | " c6_val = round(c6[col].mean(),2)\n", 440 | " if(orig == 0):\n", 441 | " c1_change = None\n", 442 | " c2_change = None\n", 443 | " c3_change = None\n", 444 | " c4_change = None\n", 445 | " c5_change = None\n", 446 | " c6_change = None\n", 447 | " else:\n", 448 | " c1_change = round(((c1_val-orig)*100/orig),2)\n", 449 | " c2_change = round(((c2_val-orig)*100/orig),2)\n", 450 | " c3_change = round(((c3_val-orig)*100/orig),2)\n", 451 | " c4_change = round(((c4_val-orig)*100/orig),2)\n", 452 | " c5_change = round(((c5_val-orig)*100/orig),2)\n", 453 | " c6_change = round(((c6_val-orig)*100/orig),2)\n", 454 | " cluster_grp.append((col, orig, c1_val, c1_change , c2_val, c2_change, c3_val, c3_change, c4_val, c4_change, c5_val, c5_change,\\\n", 455 | " c6_val, c6_change))\n", 456 | "\n" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 28, 462 | "metadata": { 463 | "collapsed": false 464 | }, 465 | "outputs": [], 466 | "source": [ 467 | "cluster_group = pd.DataFrame(cluster_grp, columns = ['Column', 'Entire dataset value', 'c1_value', 'c1_change_%', \\\n", 468 | " 'c2_value', 'c2_change_%', 'c3_value', 'c3_change_%', 'c4_value', 'c4_change_%', \\\n", 469 | " 'c5_value', 'c5_change_%', 'c6_value' , 'c6_change_%'])" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 29, 475 | "metadata": { 476 | "collapsed": false 477 | }, 478 | "outputs": [ 479 | { 480 | "data": { 481 | "text/plain": [ 482 | "28" 483 | ] 484 | }, 485 | "execution_count": 29, 486 | "metadata": {}, 487 | "output_type": "execute_result" 488 | } 489 | ], 490 | "source": [ 491 | "len(cluster_group)" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 30, 497 | "metadata": { 498 | "collapsed": true 499 | }, 500 | "outputs": [], 501 | "source": [ 502 | "cluster_group.to_csv('../3.Analysis/cluster_gmm_1_normalized.csv', index = False)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 45, 508 | "metadata": { 509 | "collapsed": false 510 | }, 511 | "outputs": [ 512 | { 513 | "data": { 514 | 
"text/plain": [ 515 | "array(['TOT_DUE_AMT', 'Promise_%', 'A_avg', 'P_avg', 'BAN',\n", 516 | " 'acct_bhvr_scr_nbr', 'ar_bhvr_scr_nbr', 'avg_paid_full_dy_cnt',\n", 517 | " 'crdt_buru_scr_nbr', 'cust_bhvr_scr_nbr', 'cust_recls_scr_nbr',\n", 518 | " 'pmt_arng_scr_nbr', 'wirls_ln_cnt', 'excpt_ovrd_ind',\n", 519 | " 'pyarr_scr_nbr', 'lst_bhvr_scr_nbr', 'preferred_month_CALL_DT',\n", 520 | " 'preferred_day_of_monthCALL_DT', 'preferred_weekdayCALL_DT',\n", 521 | " 'return_itm_180_dy_cnt', 'return_itm_30_dy_cnt', 'Tenure',\n", 522 | " 'MOBILITY_REGION_NAME_flagCentral', 'MOBILITY_REGION_NAME_flagEast',\n", 523 | " 'MOBILITY_REGION_NAME_flagWest', 'ACCT_STS_AT_CALL_DATE_flagN',\n", 524 | " 'ACCT_STS_AT_CALL_DATE_flagO', 'ACCT_STS_AT_CALL_DATE_flagS'], dtype=object)" 525 | ] 526 | }, 527 | "execution_count": 45, 528 | "metadata": {}, 529 | "output_type": "execute_result" 530 | } 531 | ], 532 | "source": [ 533 | "df_.columns.values" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 57, 539 | "metadata": { 540 | "collapsed": false 541 | }, 542 | "outputs": [ 543 | { 544 | "data": { 545 | "text/plain": [ 546 | "0.00 0.96\n", 547 | "6.67 0.03\n", 548 | "13.33 0.00\n", 549 | "20.00 0.00\n", 550 | "26.67 0.00\n", 551 | "33.33 0.00\n", 552 | "40.00 0.00\n", 553 | "46.67 0.00\n", 554 | "93.33 0.00\n", 555 | "100.00 0.00\n", 556 | "53.33 0.00\n", 557 | "Name: return_itm_30_dy_cnt, dtype: float64" 558 | ] 559 | }, 560 | "execution_count": 57, 561 | "metadata": {}, 562 | "output_type": "execute_result" 563 | } 564 | ], 565 | "source": [ 566 | "df_.return_itm_30_dy_cnt.value_counts()/len(df_)" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 47, 572 | "metadata": { 573 | "collapsed": false 574 | }, 575 | "outputs": [], 576 | "source": [ 577 | "df_['labels'] = labels" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": 48, 583 | "metadata": { 584 | "collapsed": true 585 | }, 586 | "outputs": [], 587 | "source": [ 588 | "c1_ = df_.loc[df_['labels'] == 0]\n", 589 | "c2_ = df_.loc[df_['labels'] == 1]\n", 590 | "c3_ = df_.loc[df_['labels'] == 2]\n", 591 | "c4_ = df_.loc[df_['labels'] == 3]\n", 592 | "c5_ = df_.loc[df_['labels'] == 4]\n", 593 | "c6_ = df_.loc[df_['labels'] == 5]" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": 51, 599 | "metadata": { 600 | "collapsed": false 601 | }, 602 | "outputs": [ 603 | { 604 | "data": { 605 | "text/plain": [ 606 | "54.54545454545454" 607 | ] 608 | }, 609 | "execution_count": 51, 610 | "metadata": {}, 611 | "output_type": "execute_result" 612 | } 613 | ], 614 | "source": [ 615 | "c5_.wirls_ln_cnt.max()" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": 52, 621 | "metadata": { 622 | "collapsed": false 623 | }, 624 | "outputs": [ 625 | { 626 | "data": { 627 | "text/plain": [ 628 | "0.0" 629 | ] 630 | }, 631 | "execution_count": 52, 632 | "metadata": {}, 633 | "output_type": "execute_result" 634 | } 635 | ], 636 | "source": [ 637 | "c5_.wirls_ln_cnt.min()" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": 53, 643 | "metadata": { 644 | "collapsed": true 645 | }, 646 | "outputs": [ 647 | { 648 | "data": { 649 | "text/plain": [ 650 | "0.17 368\n", 651 | "0.00 347\n", 652 | "0.34 314\n", 653 | "0.51 278\n", 654 | "0.69 214\n", 655 | "0.86 189\n", 656 | "1.03 122\n", 657 | "1.20 80\n", 658 | "1.37 51\n", 659 | "1.54 34\n", 660 | "1.72 31\n", 661 | "1.89 16\n", 662 | "2.23 16\n", 663 | "2.40 16\n", 664 | "3.43 14\n", 665 | 
"3.95 13\n", 666 | "2.74 13\n", 667 | "2.92 11\n", 668 | "3.09 10\n", 669 | "2.06 9\n", 670 | "3.26 7\n", 671 | "4.29 7\n", 672 | "2.57 7\n", 673 | "3.77 6\n", 674 | "5.15 6\n", 675 | "4.12 6\n", 676 | "3.60 6\n", 677 | "4.63 5\n", 678 | "5.83 4\n", 679 | "5.66 3\n", 680 | "5.49 3\n", 681 | "4.80 3\n", 682 | "8.06 3\n", 683 | "4.97 3\n", 684 | "6.35 2\n", 685 | "6.86 2\n", 686 | "5.32 2\n", 687 | "6.17 2\n", 688 | "7.03 1\n", 689 | "17.50 1\n", 690 | "10.81 1\n", 691 | "16.12 1\n", 692 | "4.46 1\n", 693 | "6.69 1\n", 694 | "6.00 1\n", 695 | "14.07 1\n", 696 | "10.12 1\n", 697 | "32.59 1\n", 698 | "8.40 1\n", 699 | "10.46 1\n", 700 | "13.89 1\n", 701 | "11.84 1\n", 702 | "54.55 1\n", 703 | "14.75 1\n", 704 | "11.15 1\n", 705 | "20.41 1\n", 706 | "9.43 1\n", 707 | "6.52 1\n", 708 | "23.67 1\n", 709 | "29.33 1\n", 710 | "12.01 1\n", 711 | "8.23 1\n", 712 | "19.38 1\n", 713 | "19.90 1\n", 714 | "7.38 1\n", 715 | "7.55 1\n", 716 | "22.30 1\n", 717 | "9.61 1\n", 718 | "7.72 1\n", 719 | "9.95 1\n", 720 | "12.86 1\n", 721 | "Name: wirls_ln_cnt, dtype: int64" 722 | ] 723 | }, 724 | "execution_count": 53, 725 | "metadata": {}, 726 | "output_type": "execute_result" 727 | } 728 | ], 729 | "source": [ 730 | "c5_.wirls_ln_cnt.value_counts()" 731 | ] 732 | }, 733 | { 734 | "cell_type": "code", 735 | "execution_count": null, 736 | "metadata": { 737 | "collapsed": true 738 | }, 739 | "outputs": [], 740 | "source": [] 741 | } 742 | ], 743 | "metadata": { 744 | "kernelspec": { 745 | "display_name": "Python 3", 746 | "language": "python", 747 | "name": "python3" 748 | }, 749 | "language_info": { 750 | "codemirror_mode": { 751 | "name": "ipython", 752 | "version": 3 753 | }, 754 | "file_extension": ".py", 755 | "mimetype": "text/x-python", 756 | "name": "python", 757 | "nbconvert_exporter": "python", 758 | "pygments_lexer": "ipython3", 759 | "version": "3.6.0" 760 | } 761 | }, 762 | "nbformat": 4, 763 | "nbformat_minor": 1 764 | } 765 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ML_CodeBase_Python 2 | Code snippets for ML in python 3 | -------------------------------------------------------------------------------- /basic_io: -------------------------------------------------------------------------------- 1 | import pandas as pd, numpy as np, csv, sys, json, pickle, re 2 | from sklearn.model_selection import train_test_split #For stratified sampling 3 | 4 | from PARAMETERS_global import * 5 | 6 | def read_encoder(encoder_type = 'label/le', variable_name = 'default'): 7 | '''loads classes from an encoder''' 8 | return pickle.load(open(path_data_output + '/encoders/' + encoder_type + '_' + variable_name + '.pkl','rb')) 9 | 10 | def save_csv(df, path, index = False, compression = None): 11 | df.to_csv(path, index = index, compression = compression) 12 | return 13 | 14 | def write_encoder(encoder, encoder_type = 'label/le', variable_name = 'default'): 15 | '''Takes the encoder and saves it as .pkl file''' 16 | with open(path_data_output + '/encoders/' + encoder_type + '_' + variable_name + '.pkl', 'wb') as outfile: 17 | pickle.dump(encoder,outfile) 18 | return 19 | 20 | def intersection(list_a, list_b): 21 | return list(set(list_a).intersection(set(list_b))) 22 | 23 | def difference(list_a, list_b): 24 | # alternate implementatiion - [x for x in list_a if x not in list_b] 25 | return list(set(list_a).difference(set(list_b))) 26 | 27 | def union(list_a, list_b): 28 | # alternate 
-------------------------------------------------------------------------------- /eda_helpers.py: -------------------------------------------------------------------------------- 1 | import pandas as pd, csv, scipy.stats as ss, seaborn as sns, numpy as np, sys 2 | sys.path.insert(0,'/ds/data/ve2/dtl/aixx/phi/no_gbd/r000/work/pocep_ds/pocep_ds') 3 | from config.PARAMETERS_global import * 4 | from utils.BASIC_input_output import * 5 | 6 | def select_dtype(df, dtypes): 7 | df = df.select_dtypes(include=dtypes) 8 | return df 9 | 10 | def remove_dtype(df, dtypes): 11 | df = df.select_dtypes(exclude=dtypes) 12 | return df 13 | 14 | def df_correlation(df, columns): 15 | for col in columns: 16 | print(col) 17 | tmp = df[columns].apply(lambda x: x.corr(df[col])) 18 | #print(tmp) 19 | print(tmp.loc[tmp >= 0.5]) 20 | print("Processing ended for column: {}".format(col)) 21 | return 22 | 23 | def df_skewed(df, columns): 24 | for col in columns: 25 | print(col) 26 | tmp = df[col].value_counts().head(20)/len(df) 27 | print(tmp.loc[tmp >= 0.8]) 28 | #print("Processing ended for column: {}".format(col)) 29 | return 30 | 31 | def df_unique(df, columns): 32 | for col in columns: 33 | print(col, len(df[col].unique())) 34 | return 35 | 36 | def df_outlier(df, columns): 37 | for col in columns: 38 | print(col) 39 | tmp = df[col].value_counts().tail(20)/len(df) 40 | print(tmp.loc[tmp <= 0.01]) 41 | #print("Processing ended for column: {}".format(col)) 42 | return 43 | 44 | def generate_temporal_vars(df, cols): 45 | for col in cols: 46 | df[col] = df[col].astype('datetime64') 47 | df['fe_' + str(col)+'_weekday'] = df[col].dt.weekday 48 | df['fe_' + str(col)+'_month'] = df[col].dt.month 49 | df['fe_' + str(col)+'_year'] = df[col].dt.year 50 | df['fe_' + str(col)+'_day'] = df[col].dt.day 51 | return df 52 | 53 | def df_count_nan(df): 54 | tmp = df.apply(lambda x : x.isnull().sum(axis=0)) 55 | print(tmp.loc[tmp > 0]) 56 | return 57 | 58 | def df_strip_values(df): 59 | object_cols = select_dtype(df, ['object']).columns.values.tolist() 60 | df[object_cols] = df[object_cols].apply(lambda x : x.str.strip()) 61 | return df 62 | 63 | def df_number_stats(df): 64 | df_int = select_dtype(df, ['int64', 'float64']) 65 | for col in df_int: 66 | print(col) 67 | print(round(df_int[col].describe(),2)) 68 | return 69 | 70 | def df_object_describe(df): 71 | df_obj = select_dtype(df, ['object']) 72 | for col in df_obj: 73 | print(col) 74 | print((df_obj[col].describe())) 75 | return 76 | 77 | ''' 78 | Cramer's V method to calculate categorical correlation 79 | ''' 80 | def cramers_v(x, y): 81 | confusion_matrix = pd.crosstab(x,y) 82 | chi2 = ss.chi2_contingency(confusion_matrix)[0] 83 | n = confusion_matrix.sum().sum() 84 | phi2 = chi2/n 85 | r,k = confusion_matrix.shape 86 | phi2corr = max(0, phi2-((k-1)*(r-1))/(n-1)) 87 | rcorr = r-((r-1)**2)/(n-1) 88 | kcorr = k-((k-1)**2)/(n-1) 89 | return np.sqrt(phi2corr/min((kcorr-1),(rcorr-1))) 90 |
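# A quick, hedged illustration of cramers_v on a toy frame (column names invented).
# The two columns below are perfectly associated; on a sample this small the bias
# correction above keeps the statistic below 1, but it still sits far above an
# unrelated pair.
# >>> toy = pd.DataFrame({'color': ['r', 'r', 'r', 'b', 'b', 'b'],
# ...                     'size': ['S', 'S', 'S', 'L', 'L', 'L']})
# >>> cramers_v(toy['color'], toy['size'])   # value in [0, 1]; higher = stronger association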
91 | ''' 92 | Correlation among categorical variables 93 | ''' 94 | def categorical_correlation(df, exceptions): 95 | df_obj = select_dtype(df, ['object']) 96 | object_cols = df_obj.columns.values.tolist() 97 | object_cols = difference(object_cols,exceptions) 98 | corr = {} 99 | for col1 in object_cols: 100 | for col2 in object_cols: 101 | try: 102 | correlation = cramers_v(df[col1],df[col2]) 103 | except ValueError: 104 | print("Value error occurred for columns {} and {}".format(col1, col2)); continue # skip pairs that cannot be tabulated 105 | corr[str(col1) + "-" + str(col2)] = correlation 106 | if((col1!=col2) & (correlation >= 0.5)): 107 | print(col1, col2, corr[str(col1) + "-" + str(col2)]) 108 | return 109 | 110 | ''' 111 | Correlation of categorical features with a categorical target 112 | ''' 113 | def categorical_correlation_w_target(df, exceptions, target = target_aa): 114 | df_obj = select_dtype(df, ['object']) 115 | object_cols = df_obj.columns.values.tolist() 116 | object_cols = difference(object_cols,exceptions) 117 | for col in object_cols: 118 | print(col) 119 | print(cramers_v(df[col],df[target])) 120 | return 121 | 122 | ''' 123 | Returns a df where modifier code is not * and does not match procedure modifier code 124 | ''' 125 | def modifier_analysis(df, var1, var2): 126 | df_temp = df.loc[df[var1] != df[var2]][[var1,var2]] 127 | df_temp['combined'] = df_temp[var1] + df_temp[var2] 128 | df_temp_ = df_temp.loc[df_temp[var2] != '* '] 129 | print(len(df_temp_), len(df_temp)) 130 | return df_temp_ 131 |
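A hedged sketch of how these helpers chain together in a first EDA pass. The frame name, file name, and key column below are illustrative stand-ins, not names from this repo:

claims = pd.read_csv('merged_claims.csv')                  # hypothetical merged extract
claims = df_strip_values(claims)                           # trim stray whitespace in object columns
df_count_nan(claims)                                       # which columns carry nulls, and how many
df_number_stats(claims)                                    # describe() for each numeric column
categorical_correlation(claims, exceptions=['claim_key'])  # prints pairs with Cramer's V >= 0.5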
random_state = 42) 80 | 81 | 82 | # In[12]: 83 | 84 | 85 | merge = merge_.copy(deep = True) 86 | 87 | 88 | # In[13]: 89 | 90 | 91 | merge['binned_DTL_LINE_NBR'] = pd.cut(x = merge['DTL_LINE_NBR'], bins = [0,1,2,3,4,50]) 92 | 93 | keys = merge['binned_DTL_LINE_NBR'].value_counts().sort_index().index.tolist() # sort by bin edge so the labels below line up with the intervals 94 | values = ['1','2','3','4','4+'] 95 | 96 | values_dict = dict(zip(keys, values)) 97 | merge['binned_DTL_LINE_NBR'] = merge['binned_DTL_LINE_NBR'].map(values_dict) 98 | 99 | 100 | # In[14]: 101 | 102 | 103 | merge.drop(columns = ['KEY_CHK_DCN_ITEM_CD', 'DTL_LINE_NBR'], inplace = True) 104 | 105 | 106 | # In[15]: 107 | 108 | 109 | get_ipython().run_cell_magic('time', '', 'X_train, X_test, y_train, y_test = train_test_split_ratio(merge, target)') 110 | 111 | 112 | # In[16]: 113 | 114 | 115 | print(X_train.shape, X_test.shape) 116 | 117 | 118 | # In[17]: 119 | 120 | 121 | model_name 122 | 123 | 124 | # ###### OHE Encoding 125 | 126 | # In[18]: 127 | 128 | 129 | key_ = key_dtl if(model=='aa') else key_hdr 130 | dtypes = {**dict(dtl_dtypes), **dict(hdr_dtypes)} # dict.update() returns None, so merge the two dtype maps instead 131 | thresh = 0.001 #all the values which are less than size*thresh will be dropped 132 | train = pd.concat([X_train, y_train], axis = 1) 133 | test = pd.concat([X_test, y_test], axis = 1) 134 | 135 | 136 | # In[19]: 137 | 138 | 139 | print(train.shape, test.shape) 140 | 141 | 142 | # In[20]: 143 | 144 | 145 | path = path_data_output + '/encoders/ohe/' + model_name 146 | path_train = path + '/train' 147 | path_test = path + '/test' 148 | path_pre_process_train = model_name + '/train/pre_process' 149 | path_pre_process_test = model_name + '/test/pre_process' 150 | 151 | 152 | # In[21]: 153 | 154 | 155 | ##Create folder structure for new model 156 | try: 157 | os.mkdir(path) 158 | os.mkdir(path + '/train') 159 | os.mkdir(path + '/test') 160 | os.mkdir(path + '/train/pre_process') 161 | os.mkdir(path + '/test/pre_process') 162 | except FileExistsError: 163 | print('Directory already present') 164 | else: 165 | print('Directory created') 166 | 167 | 168 | # Train Processing 169 | 170 | # In[22]: 171 | 172 | 173 | get_ipython().run_cell_magic('time', '', "n_claims = train.shape[0] # Number of rows\ntrain['row'] = (range(n_claims)) # add a row column with s.no.") 174 | 175 | 176 | # In[23]: 177 | 178 | 179 | get_ipython().run_cell_magic('time', '', "remove_num_cols = [key_common[0], target, 'row','HCFA_PT_CD','PAT_MBR_CD'] # -- Exceptions for numeric processing\n# remove_num_cols = [key_common[0], target, 'row', 'dtl_fe_month_MBR_CNTRCT_END_DT','dtl_fe_month_MBR_CNTRCT_EFCTV_DT',\n# 'dtl_fe_month_SRVC_FROM_DT','hdr_fe_year_ILNS_ONSET_DT','dtl_fe_month_CLM_CMPLTN_DT',\n# 'hdr_fe_month_PAT_BRTH_DT','hdr_fe_year_PAT_BRTH_DT','dtl_fe_year_MBR_CNTRCT_EFCTV_DT',\n# 'hdr_fe_month_SRVC_FROM_DT','dtl_fe_year_CLM_CMPLTN_DT','HCFA_PT_CD','hdr_fe_month_SRVC_THRU_DT',\n# 'dtl_fe_month_SRVC_TO_DT','dtl_fe_year_MBR_CNTRCT_END_DT','dtl_fe_year_SRVC_TO_DT',\n# 'hdr_fe_year_SRVC_THRU_DT','dtl_fe_year_SRVC_FROM_DT','hdr_fe_year_SRVC_FROM_DT','PAT_MBR_CD',\n# 'hdr_fe_year_CLM_CMPLTN_DT','hdr_fe_month_CLM_CMPLTN_DT','hdr_fe_month_ILNS_ONSET_DT'] # -- Exceptions for numeric processing\nnum_processed, numeric_cols = process_numerical(train, ['row'], dtypes, remove_num_cols)\ntarget_processed = process_target(train, ['row'], target)") 180 | 181 | 182 | # In[24]: 183 | 184 | 185 | get_ipython().run_cell_magic('time', '', "columns_to_remove = [key_common[0], target, 'row']\ncolumns_to_remove = columns_to_remove + numeric_cols + [target]\ncat_size = pre_process_categorical(train, ['row'], 
path_pre_process_train , dtypes, columns_to_remove)") 186 | 187 | 188 | # In[25]: 189 | 190 | 191 | get_ipython().run_cell_magic('time', '', "cat_processed = process_categorical(train, cat_size, n_claims*thresh, path_train + '/pre_process', dtypes, columns_to_remove)") 192 | 193 | 194 | # In[26]: 195 | 196 | 197 | get_ipython().run_cell_magic('time', '', "merge_to_model_ready(target_processed, num_processed, cat_processed, path_train, 'row')") 198 | 199 | 200 | # In[27]: 201 | 202 | 203 | train_X_path = '/ds/data/ve2/dtl/aixx/phi/no_gbd/r000/work/pocep_ds/pocep_ds/data/output/encoders/ohe/'+model_name+'/train/X.npz' 204 | df_train = sparse.load_npz(train_X_path) 205 | df_train.shape 206 | 207 | 208 | # In[28]: 209 | 210 | 211 | #Select the dataset to delete 212 | delete = 'train' 213 | 214 | 215 | # In[29]: 216 | 217 | 218 | #Remove pre_process files and directory 219 | files = glob.glob(path + '/' + delete + '/pre_process/ohe_*.pkl') 220 | for f in files: 221 | os.remove(f) 222 | os.rmdir(path + '/' + delete + '/pre_process/') 223 | 224 | #Remove all other files 225 | # files = glob.glob(path + '/' + delete + '/*') 226 | # for f in files: 227 | # os.remove(f) 228 | # os.rmdir(path + '/' + delete) 229 | 230 | 231 | # Test Processing 232 | 233 | # In[30]: 234 | 235 | 236 | get_ipython().run_cell_magic('time', '', "#test.drop_duplicates(subset = key_, inplace = True) # df which has key_hdr values have been de-duplicated\nn_claims = test.shape[0] # Number of rows\ntest['row'] = (range(n_claims)) # add a row column with s.no.") 237 | 238 | 239 | # In[31]: 240 | 241 | 242 | get_ipython().run_cell_magic('time', '', "num_processed, numeric_cols = process_numerical(test, ['row'], dtypes, remove_num_cols)\ntarget_processed = process_target(test, ['row'], target)") 243 | 244 | 245 | # In[32]: 246 | 247 | 248 | get_ipython().run_cell_magic('time', '', "col_dict = pd.read_csv(path_train + '/col_dict.csv')\ncat_size = pre_process_categorical_test(test, ['row'], path_pre_process_test , dtypes, columns_to_remove, col_dict)") 249 | 250 | 251 | # In[33]: 252 | 253 | 254 | get_ipython().run_cell_magic('time', '', "cat_processed = process_categorical(test,cat_size, n_claims*thresh, path_test + '/pre_process', dtypes, columns_to_remove)") 255 | 256 | 257 | # In[34]: 258 | 259 | 260 | get_ipython().run_cell_magic('time', '', "merge_to_model_ready_test(target_processed, num_processed, cat_processed, path_test, path_train, 'row')") 261 | 262 | 263 | # In[35]: 264 | 265 | 266 | test_X_path = '/ds/data/ve2/dtl/aixx/phi/no_gbd/r000/work/pocep_ds/pocep_ds/data/output/encoders/ohe/'+model_name+'/test/X.npz' 267 | df_test = sparse.load_npz(test_X_path) 268 | df_test.shape 269 | 270 | 271 | # In[36]: 272 | 273 | 274 | #Select the dataset to delete 275 | delete = 'test' 276 | 277 | 278 | # In[37]: 279 | 280 | 281 | #Remove pre_process files and directory 282 | files = glob.glob(path + '/' + delete + '/pre_process/ohe_*.pkl') 283 | for f in files: 284 | os.remove(f) 285 | os.rmdir(path + '/' + delete + '/pre_process/') 286 | 287 | #Remove all other files 288 | # files = glob.glob(path + '/' + delete + '/*') 289 | # for f in files: 290 | # os.remove(f) 291 | # os.rmdir(path + '/' + delete) 292 | 293 | 294 | # In[41]: 295 | 296 | 297 | #Remove main directory 298 | # os.rmdir(path) 299 | 300 | 301 | # In[96]: 302 | 303 | 304 | 305 | 306 | 307 | # In[ ]: 308 | 309 | 310 | 311 | 312 | 313 | # ###### Label Encoding 314 | 315 | # In[9]: 316 | 317 | 318 | thresh = 0.0001 #determines the count for 'rare' classification 319 | 320 | 
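# label_encoding_fit / label_encoding_transform come from utils.FUNCTIONS_feature_encoding,
# which is not included in this dump. A minimal sketch of the assumed pattern -- bucket
# values rarer than len(X)*thresh into a single level, then fit one LabelEncoder per object
# column (names and behaviour here are illustrative assumptions, not the project code):
#
#     from sklearn.preprocessing import LabelEncoder
#
#     def label_encoding_fit_sketch(X, thresh):
#         encoders = {}
#         for col in X.select_dtypes(include=['object']).columns:
#             counts = X[col].value_counts()
#             rare = set(counts[counts < len(X) * thresh].index)   # values below the rare cutoff
#             col_vals = X[col].where(~X[col].isin(rare), 'rare')  # collapse them to one token
#             encoders[col] = (rare, LabelEncoder().fit(col_vals))
#         return encoders
#
# The matching transform would map rare/unseen values to the 'rare' token before calling
# encoders[col][1].transform(...).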
321 | # In[10]: 322 | 323 | 324 | get_ipython().run_cell_magic('time', '', 'label_encoding_fit(X_train, thresh)') 325 | 326 | 327 | # In[11]: 328 | 329 | 330 | get_ipython().run_cell_magic('time', '', 'X_train_ = label_encoding_transform(X_train)') 331 | 332 | 333 | # In[12]: 334 | 335 | 336 | get_ipython().run_cell_magic('time', '', 'X_test_ = label_encoding_transform(X_test)') 337 | 338 | 339 | # ###### Target Encoding 340 | 341 | # In[7]: 342 | 343 | 344 | get_ipython().run_cell_magic('time', '', 'target_encoding_fit(X_train, y_train, model)') 345 | 346 | 347 | # In[6]: 348 | 349 | 350 | get_ipython().run_cell_magic('time', '', 'X_train_ = target_encoding_transform(X_train, model)') 351 | 352 | 353 | # In[7]: 354 | 355 | 356 | get_ipython().run_cell_magic('time', '', 'X_test_ = target_encoding_transform(X_test, model)') 357 | 358 | 359 | # In[10]: 360 | 361 | 362 | model_ready = path_data_model_ready + '/' + model_name + '/' 363 | save_csv(X_train_, model_ready + 'target_encoded_X_train.csv.gz', compression = 'gzip') 364 | save_csv(X_test_, model_ready + 'target_encoded_X_test.csv.gz', compression = 'gzip') 365 | save_csv(y_train, model_ready + 'target_encoded_y_train.csv.gz', compression = 'gzip') 366 | save_csv(y_test, model_ready + 'target_encoded_y_test.csv.gz', compression = 'gzip') 367 | 368 | 369 | # In[ ]: 370 | 371 | 372 | 373 | 374 | 375 | # In[ ]: 376 | 377 | 378 | 379 | 380 | 381 | # In[ ]: 382 | 383 | 384 | 385 | 386 | 387 | # In[ ]: 388 | 389 | 390 | 391 | 392 | 393 | # In[ ]: 394 | 395 | 396 | def sparse_binning(df, columns, thresh): 397 | '''takes columns to bin using the threshold as the % of sparsity over which the variables will be binned''' 398 | for col in columns: 399 | sparse_name = str(col) + str('_sparse') 400 | value_counts = df[col].value_counts()*100/len(df) 401 | values = value_counts.loc[value_counts < thresh].index.tolist() 402 | df[col] = df[col].replace(values, sparse_name) # assumed completion of the truncated original: bucket levels rarer than thresh% into one sparse value 403 | return df 404 | -------------------------------------------------------------------------------- /encoding.py: -------------------------------------------------------------------------------- 150 | print(len(keep_err.unique()), " variables processed") 151 | df_col = df_col[df_col['VAR'].isin(keep_err)] 152 | #write_encoder(df_num, 'ohe/cat_processed', '') 153 | return df_col 154 | 155 | def merge_to_model_ready(target, num, cat, path, key): 156 | #target = pd.read_pickle(path + 'target_processed.pkl') 157 | #num = pd.read_pickle(path + 'num_processed.pkl') 158 | #cat = pd.read_pickle(path + 'cat_processed.pkl') 159 | 160 | model_data = target[key] 161 | n_claims = model_data.shape[0] 162 | model_data = pd.merge(model_data, cat, on=key, how='left') 163 | model_data.fillna('No_VAR', inplace=True) 164 | n = model_data.shape[0] 165 | col_dict = model_data[['VAR']].drop_duplicates().sort_values(by=['VAR']) 166 | n_vars = col_dict.shape[0] 167 | col_dict['col'] = (range(n_vars)) 168 | model_data = pd.merge(model_data, col_dict, on='VAR', how='left') 169 | vals = np.ones(n, dtype=float) 170 | rows = model_data['row'] 171 | cols = model_data['col'] 172 | 173 | for name in num.columns.values.tolist(): 174 | if name != 'row': 175 | vals = np.concatenate((vals, num[name])) 176 | rows = np.concatenate((rows, num['row'])) 177 | cols = np.concatenate((cols, n_vars*np.ones(len(num[name]), dtype=int))) 178 | new_col = pd.DataFrame({'VAR': [name], 'col': [n_vars]}) 179 | col_dict = pd.concat([col_dict, new_col]) 180 | n_vars = n_vars + 1 181 | 182 | X = sparse.csr_matrix((vals, (rows, cols)), shape=(n_claims, n_vars)) 183 | 184 | sparse.save_npz(path + "/X.npz", X) 185 | save_csv(target, path + '/Y.csv') 186 | save_csv(col_dict, path + '/col_dict.csv') 187 | return 188 | 189 | def merge_to_model_ready_test(target, num, cat, path_test, path_train, key): 190 | 
#target = pd.read_pickle(path + 'target_processed.pkl') 191 | #num = pd.read_pickle(path + 'num_processed.pkl') 192 | #cat = pd.read_pickle(path + 'cat_processed.pkl') 193 | 194 | model_data = target[key] 195 | n_claims = model_data.shape[0] 196 | model_data = pd.merge(model_data, cat, on=key, how='left') 197 | model_data.fillna('No_VAR', inplace=True) 198 | ## Variable differences 199 | col_dict_train = pd.read_csv(path_train + '/col_dict.csv') 200 | vars_train = col_dict_train.VAR.unique() 201 | vars_test = model_data.VAR.unique() 202 | vars_not_in_train = difference(vars_test,vars_train) 203 | vars_not_in_test = difference(vars_train,vars_test) 204 | vars_not_in_test_and_num = difference(vars_not_in_test,num.columns.values.tolist()) 205 | 206 | ## Remove variables present in test but not in train 207 | size = model_data.shape[0] 208 | model_data = model_data.loc[~model_data.VAR.isin(vars_not_in_train)] 209 | print("{} rows removed for variables which were present in test but not in train". 210 | format(size-model_data.shape[0])) 211 | 212 | # Add variables present in train but not in test ## DEFER 213 | model_data = model_data.append(pd.DataFrame({'row':[model_data.row.max()+1]*len(vars_not_in_test_and_num), 'VAR':vars_not_in_test_and_num})) 214 | 215 | col_dict = model_data[['VAR']].drop_duplicates().sort_values(by=['VAR']) 216 | n_vars = col_dict.shape[0] 217 | col_dict['col'] = (range(n_vars)) 218 | model_data = pd.merge(model_data, col_dict, on='VAR', how='left') 219 | n = model_data.shape[0] 220 | vals = np.ones(n, dtype=float) 221 | rows = model_data['row'] 222 | cols = model_data['col'] 223 | 224 | for name in num.columns.values.tolist(): 225 | if name != 'row': 226 | vals = np.concatenate((vals, num[name])) 227 | rows = np.concatenate((rows, num['row'])) 228 | cols = np.concatenate((cols, n_vars*np.ones(len(num[name]), dtype=int))) 229 | new_col = pd.DataFrame({'VAR': [name], 'col': [n_vars]}) 230 | col_dict = pd.concat([col_dict, new_col]) 231 | n_vars = n_vars + 1 232 | 233 | n_claims += 1 234 | 235 | X = sparse.csr_matrix((vals, (rows, cols)), shape=(n_claims, n_vars)) 236 | sparse.save_npz(path_test + "/X.npz", X) 237 | save_csv(target, path_test + '/Y.csv') 238 | save_csv(col_dict, path_test + '/col_dict.csv') 239 | return 240 | -------------------------------------------------------------------------------- /linux.txt: -------------------------------------------------------------------------------- 1 | awk < R4826-201801_RCMND_SPSH2.txt '{print $98}' | sort | uniq | wc -l 2 | head -1 R4826-201801_RCMND_SPSH.txt | tr '|' '\n' | cat -n | grep "rcmnd_prtf_cd" 3 | awk -F "|" '{ if(($145 == "N")||($145 == "pmt_arng_ind")) { print } }' R4826-201801_RCMND_SPSH.txt > R4826-201801_RCMND_SPSH_n.txt 4 | awk -F '|' '{print $98}' R4826-201801_RCMND_SPSH.txt | sort | uniq -c 5 | while read p; do head -1 R4826-201801_RCMND_SPSH2.txt | tr '|' '\n' | cat -n | grep "$p"; done < cols.txt 6 | wc -l 7 | -------------------------------------------------------------------------------- /models_run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | get_ipython().run_line_magic('load_ext', 'autoreload') 8 | get_ipython().run_line_magic('autoreload', '2') 9 | 10 | import pandas as pd, numpy as np, sys, os 11 | from sklearn import metrics 12 | from scipy import sparse 13 | from sklearn.metrics import r2_score,mean_squared_error 14 | from sklearn.model_selection import RandomizedSearchCV 15 | 
import csv, warnings, time, pickle 16 | 17 | sys.path.insert(0,'/ds/data/ve2/dtl/aixx/phi/no_gbd/r000/work/pocep_ds/pocep_ds') 18 | sys.path.insert(0,'/ds/data/ve2/dtl/aixx/phi/no_gbd/r000/work/pocep_ds/pocep_ds/code') 19 | sys.path.insert(0,'/ds/data/ve2/dtl/aixx/phi/no_gbd/r000/work/pocep_ds/pocep_ds/code/models') 20 | 21 | from config.PARAMETERS_global import * 22 | from config.LOOKUP_objects import * 23 | from utils.BASIC_input_output import * 24 | from utils.MODEL_basics import * 25 | 26 | warnings.filterwarnings('ignore') 27 | pd.options.display.max_rows = 100 28 | 29 | 30 | # In[2]: 31 | 32 | 33 | get_ipython().run_line_magic('reload_ext', 'autoreload') 34 | 35 | 36 | # In[3]: 37 | 38 | 39 | model_name = 'model_aapm_basic_subset_ap_prof_claims_1MM_num_cap_y_dates_n_binning_n_fe_proc_cd_grp_nbr' 40 | 41 | 42 | # In[4]: 43 | 44 | 45 | # model_name = model_aapm_version ## This will be the name of the folder 46 | path = path_data_output + '/encoders/ohe/' + model_name + '/' 47 | path_train = path + 'train/' 48 | path_test = path + 'test/' 49 | X_train = sparse.load_npz(path_train + 'X.npz') 50 | y_train = pd.read_csv(path_train + 'Y.csv')[target_aa] 51 | X_test = sparse.load_npz(path_test + 'X.npz') 52 | y_test = pd.read_csv(path_test + 'Y.csv')[target_aa] 53 | col_dict = pd.read_csv(path_train + 'col_dict.csv') 54 | 55 | 56 | # In[5]: 57 | 58 | 59 | print(X_test.shape, y_test.shape) 60 | X_test_ = X_test[:len(y_test)] #remove last row 61 | X_train.shape[0] + X_test_.shape[0] 62 | 63 | 64 | # #### Linear Regression 65 | 66 | # In[6]: 67 | 68 | 69 | from sklearn.linear_model import LinearRegression 70 | 71 | 72 | # In[7]: 73 | 74 | 75 | get_ipython().run_cell_magic('time', '', 'reg = LinearRegression().fit(X_train, y_train)') 76 | 77 | 78 | # In[8]: 79 | 80 | 81 | get_ipython().run_cell_magic('time', '', 'test_pred = reg.predict(X_test_)\ntrain_pred = reg.predict(X_train)') 82 | 83 | 84 | # In[9]: 85 | 86 | 87 | get_ipython().run_cell_magic('time', '', 'calculate_performance(y_test, test_pred)\ncalculate_performance(y_train, train_pred) ') 88 | 89 | 90 | # In[10]: 91 | 92 | 93 | calculate_performance_for_hypo_testing(y_test, test_pred) 94 | generate_deviation_stats_for_hypo_testing(test_pred, y_test) 95 | 96 | 97 | # In[121]: 98 | 99 | 100 | generate_deviation_stats(train_pred, y_train) 101 | 102 | 103 | # In[122]: 104 | 105 | 106 | generate_deviation_stats(test_pred, y_test) 107 | 108 | 109 | # In[123]: 110 | 111 | 112 | for i,col in enumerate(col_dict.VAR): 113 | print(col,'{0:.4f}'.format(reg.coef_[i])) 114 | 115 | 116 | # In[124]: 117 | 118 | 119 | calculate_performance_for_hypo_testing(y_test, test_pred) 120 | generate_deviation_stats_for_hypo_testing(test_pred, y_test) 121 | 122 | 123 | # In[87]: 124 | 125 | 126 | calculate_performance_for_hypo_testing(y_test, test_pred) 127 | generate_deviation_stats_for_hypo_testing(test_pred, y_test) 128 | 129 | 130 | # In[40]: 131 | 132 | 133 | calculate_performance_for_hypo_testing(y_test, test_pred) 134 | generate_deviation_stats_for_hypo_testing(test_pred, y_test) 135 | 136 | 137 | # #### GBM 138 | 139 | # In[11]: 140 | 141 | 142 | from sklearn.ensemble import GradientBoostingRegressor 143 | 144 | 145 | # In[28]: 146 | 147 | 148 | gbr = GradientBoostingRegressor(n_estimators = 50, max_depth=7, learning_rate = 0.1, max_features = 'sqrt', random_state=42) 149 | 150 | 151 | # In[29]: 152 | 153 | 154 | get_ipython().run_cell_magic('time', '', 'gbr.fit(X_train, y_train)') 155 | 156 | 157 | # In[30]: 158 | 159 | 160 | preds_train = 
gbr.predict(X_train) 161 | rmse_train = np.sqrt(mean_squared_error(y_train, preds_train)) 162 | r2_train = r2_score(y_train, preds_train) 163 | preds_test = gbr.predict(X_test_) 164 | rmse_test = np.sqrt(mean_squared_error(y_test, preds_test)) 165 | r2_test = r2_score(y_test, preds_test) 166 | 167 | 168 | # In[31]: 169 | 170 | 171 | calculate_performance(y_test, preds_test) 172 | calculate_performance(y_train, preds_train) 173 | 174 | 175 | # In[32]: 176 | 177 | 178 | generate_deviation_stats(preds_train, y_train) 179 | 180 | 181 | # In[33]: 182 | 183 | 184 | generate_deviation_stats(preds_test, y_test) 185 | 186 | 187 | # In[35]: 188 | 189 | 190 | for i,col in enumerate(col_dict.VAR): 191 | print(col,'{0:.4f}'.format(gbr.feature_importances_[i])) 192 | 193 | 194 | # In[20]: 195 | 196 | 197 | top_10_features(gbr) 198 | 199 | 200 | # In[ ]: 201 | 202 | 203 | top_10_features(gbr) 204 | 205 | 206 | # In[34]: 207 | 208 | 209 | calculate_performance_for_hypo_testing(y_test, preds_test) 210 | generate_deviation_stats_for_hypo_testing(preds_test, y_test) 211 | 212 | 213 | # In[16]: 214 | 215 | 216 | calculate_performance_for_hypo_testing(y_test, preds_test) 217 | generate_deviation_stats_for_hypo_testing(preds_test, y_test) 218 | 219 | 220 | # In[133]: 221 | 222 | 223 | calculate_performance_for_hypo_testing(y_test, preds_test) 224 | generate_deviation_stats_for_hypo_testing(preds_test, y_test) 225 | 226 | 227 | # In[71]: 228 | 229 | 230 | calculate_performance_for_hypo_testing(y_test, preds_test) 231 | generate_deviation_stats_for_hypo_testing(preds_test, y_test) 232 | 233 | 234 | # In[51]: 235 | 236 | 237 | calculate_performance_for_hypo_testing(y_test, preds_test) 238 | generate_deviation_stats_for_hypo_testing(preds_test, y_test) 239 | 240 | 241 | # #### GBM - 2 (Hyperparameters changed) 242 | 243 | # In[11]: 244 | 245 | 246 | from sklearn.ensemble import GradientBoostingRegressor 247 | 248 | 249 | # In[12]: 250 | 251 | 252 | gbr = GradientBoostingRegressor(n_estimators = 500, max_depth=7, learning_rate = 0.1, random_state=42) 253 | 254 | 255 | # In[13]: 256 | 257 | 258 | get_ipython().run_cell_magic('time', '', 'gbr.fit(X_train, y_train)') 259 | 260 | 261 | # In[14]: 262 | 263 | 264 | preds_train = gbr.predict(X_train) 265 | rmse_train = np.sqrt(mean_squared_error(y_train, preds_train)) 266 | r2_train = r2_score(y_train, preds_train) 267 | preds_test = gbr.predict(X_test_) 268 | rmse_test = np.sqrt(mean_squared_error(y_test, preds_test)) 269 | r2_test = r2_score(y_test, preds_test) 270 | 271 | 272 | # In[15]: 273 | 274 | 275 | calculate_performance(y_test, preds_test) 276 | calculate_performance(y_train, preds_train) 277 | 278 | 279 | # In[16]: 280 | 281 | 282 | generate_deviation_stats(preds_train, y_train) 283 | 284 | 285 | # In[17]: 286 | 287 | 288 | generate_deviation_stats(preds_test, y_test) 289 | 290 | 291 | # In[18]: 292 | 293 | 294 | for i,col in enumerate(col_dict.VAR): 295 | print(col,'{0:.4f}'.format(gbr.feature_importances_[i])) 296 | 297 | 298 | # In[19]: 299 | 300 | 301 | top_10_features(gbr) 302 | 303 | 304 | # In[20]: 305 | 306 | 307 | calculate_performance_for_hypo_testing(y_test, preds_test) 308 | generate_deviation_stats_for_hypo_testing(preds_test, y_test) 309 | 310 | 311 | # ### XG Boost 312 | 313 | # In[6]: 314 | 315 | 316 | import xgboost 317 | from xgboost import plot_importance 318 | 319 | 320 | # In[16]: 321 | 322 | 323 | model = xgboost.XGBRegressor(colsample_bytree=0.4, 324 | gamma=0, 325 | learning_rate=0.09, 326 | max_depth=7, 327 | min_child_weight=1.5, 328 | 
n_estimators=10000, 329 | reg_alpha=0.75, 330 | reg_lambda=0.45, 331 | subsample=0.6, 332 | seed=42) 333 | 334 | 335 | # In[ ]: 336 | 337 | 338 | get_ipython().run_cell_magic('time', '', 'model.fit(X_train, y_train)') 339 | 340 | 341 | # In[ ]: 342 | 343 | 344 | get_ipython().run_cell_magic('time', '', 'preds_train = model.predict(X_train)\npreds_test = model.predict(X_test_)') 345 | 346 | 347 | # In[ ]: 348 | 349 | 350 | calculate_performance_for_hypo_testing(y_test, preds_test) 351 | generate_deviation_stats_for_hypo_testing(preds_test, y_test) 352 | 353 | 354 | # In[ ]: 355 | 356 | 357 | 358 | 359 | 360 | # In[27]: 361 | 362 | 363 | model2 = xgboost.XGBRegressor(colsample_bytree=0.4, 364 | gamma=0, 365 | learning_rate=0.07, 366 | max_depth=6, 367 | min_child_weight=1, 368 | n_estimators=10000, 369 | reg_alpha=0.75, 370 | reg_lambda=0.45, 371 | subsample=0.6, 372 | seed=42) 373 | 374 | 375 | # In[28]: 376 | 377 | 378 | get_ipython().run_cell_magic('time', '', 'model2.fit(X_train, y_train)') 379 | 380 | 381 | # In[29]: 382 | 383 | 384 | preds_train = model2.predict(X_train) 385 | preds_test = model2.predict(X_test_) 386 | 387 | 388 | # In[30]: 389 | 390 | 391 | calculate_performance_for_hypo_testing(y_test, preds_test) 392 | generate_deviation_stats_for_hypo_testing(preds_test, y_test) 393 | 394 | 395 | # In[31]: 396 | 397 | 398 | model3 = xgboost.XGBRegressor(colsample_bytree=0.4, 399 | gamma=0, 400 | learning_rate=0.07, 401 | max_depth=10, 402 | min_child_weight=1.5, 403 | n_estimators=10000, 404 | reg_alpha=0.75, 405 | reg_lambda=0.45, 406 | subsample=0.6, 407 | seed=42) 408 | 409 | 410 | # In[32]: 411 | 412 | 413 | get_ipython().run_cell_magic('time', '', 'model3.fit(X_train, y_train)') 414 | 415 | 416 | # In[33]: 417 | 418 | 419 | preds_train = model3.predict(X_train) 420 | preds_test = model3.predict(X_test_) 421 | 422 | 423 | # In[34]: 424 | 425 | 426 | calculate_performance_for_hypo_testing(y_test, preds_test) 427 | generate_deviation_stats_for_hypo_testing(preds_test, y_test) 428 | 429 | 430 | # In[43]: 431 | 432 | 433 | pickle.dump(model3, open('xgboost_best_perf.pkl', 'wb')) 434 | 435 | 436 | # In[ ]: 437 | 438 | 439 | 440 | 441 | 442 | # In[37]: 443 | 444 | 445 | model4 = xgboost.XGBRegressor(colsample_bytree=0.4, 446 | gamma=1, 447 | learning_rate=0.07, 448 | max_depth=10, 449 | min_child_weight=1.5, 450 | n_estimators=10000, 451 | reg_alpha=0.75, 452 | reg_lambda=0.45, 453 | subsample=0.6, 454 | seed=42) 455 | 456 | 457 | # In[38]: 458 | 459 | 460 | get_ipython().run_cell_magic('time', '', 'model4.fit(X_train, y_train)') 461 | 462 | 463 | # In[39]: 464 | 465 | 466 | preds_train = model4.predict(X_train) 467 | preds_test = model4.predict(X_test_) 468 | 469 | 470 | # In[40]: 471 | 472 | 473 | calculate_performance_for_hypo_testing(y_test, preds_test) 474 | generate_deviation_stats_for_hypo_testing(preds_test, y_test) 475 | 476 | 477 | # In[41]: 478 | 479 | 480 | calculate_performance_for_hypo_testing(y_train, preds_train) 481 | generate_deviation_stats_for_hypo_testing(preds_train, y_train) 482 | 483 | 484 | # In[44]: 485 | 486 | 487 | 488 | 489 | 490 | # In[46]: 491 | 492 | 493 | feats = model4.feature_importances_ 494 | 495 | 496 | # In[50]: 497 | 498 | 499 | feats_df = pd.DataFrame(feats) 500 | 501 | 502 | # In[53]: 503 | 504 | 505 | feats_df.to_csv('feats.csv', index = False) 506 | 507 | 508 | # In[52]: 509 | 510 | 511 | feats_df['col'] = col_dict.VAR.tolist() 512 | 513 | 514 | # In[45]: 515 | 516 | 517 | for i,col in enumerate(col_dict.VAR): 518 | 
print(col,'{0:.4f}'.format(model4.feature_importances_[i])) 519 | 520 | 521 | # In[54]: 522 | 523 | 524 | feats_df['col'] 525 | 526 | 527 | # #### RF 528 | 529 | # In[34]: 530 | 531 | 532 | from sklearn.ensemble import RandomForestRegressor 533 | 534 | 535 | # In[63]: 536 | 537 | 538 | regr = RandomForestRegressor(n_estimators = 500, max_depth=5,max_features = 'auto', random_state=42) 539 | 540 | 541 | # In[64]: 542 | 543 | 544 | get_ipython().run_cell_magic('time', '', 'regr.fit(X_train, y_train)') 545 | 546 | 547 | # In[65]: 548 | 549 | 550 | preds_train = regr.predict(X_train) 551 | rmse_train = np.sqrt(mean_squared_error(y_train, preds_train)) 552 | r2_train = r2_score(y_train, preds_train) 553 | 554 | preds_test = regr.predict(X_test_) 555 | rmse_test = np.sqrt(mean_squared_error(y_test, preds_test)) 556 | r2_test = r2_score(y_test, preds_test) 557 | 558 | 559 | # In[66]: 560 | 561 | 562 | calculate_performance(y_test, preds_test) 563 | calculate_performance(y_train, preds_train) 564 | 565 | 566 | # In[67]: 567 | 568 | 569 | generate_deviation_stats(preds_train, y_train) 570 | 571 | 572 | # In[68]: 573 | 574 | 575 | generate_deviation_stats(preds_test, y_test) 576 | 577 | 578 | # In[27]: 579 | 580 | 581 | for i,col in enumerate(col_dict.VAR): 582 | print(col,'{0:.4f}'.format(regr.feature_importances_[i])) 583 | 584 | 585 | # #### Ridge 586 | 587 | # In[92]: 588 | 589 | 590 | from sklearn.linear_model import Ridge 591 | 592 | 593 | # In[99]: 594 | 595 | 596 | get_ipython().run_cell_magic('time', '', 'reg = Ridge(alpha=1).fit(X_train, y_train)') 597 | 598 | 599 | # In[100]: 600 | 601 | 602 | get_ipython().run_cell_magic('time', '', 'test_pred = reg.predict(X_test_)\ntrain_pred = reg.predict(X_train)') 603 | 604 | 605 | # In[101]: 606 | 607 | 608 | get_ipython().run_cell_magic('time', '', 'calculate_performance(y_test, test_pred)\ncalculate_performance(y_train, train_pred) ') 609 | 610 | 611 | # In[102]: 612 | 613 | 614 | for i,col in enumerate(col_dict.VAR): 615 | print(col,'{0:.4f}'.format(reg.coef_[i])) 616 | 617 | 618 | # In[103]: 619 | 620 | 621 | generate_deviation_stats(train_pred, y_train) 622 | 623 | 624 | # In[104]: 625 | 626 | 627 | generate_deviation_stats(test_pred, y_test) 628 | 629 | 630 | # In[ ]: 631 | 632 | 633 | 634 | 635 | 636 | # #### Lasso 637 | 638 | # In[105]: 639 | 640 | 641 | from sklearn.linear_model import Lasso 642 | 643 | 644 | # In[106]: 645 | 646 | 647 | get_ipython().run_cell_magic('time', '', 'reg = Lasso(alpha = 1).fit(X_train, y_train)') 648 | 649 | 650 | # In[107]: 651 | 652 | 653 | get_ipython().run_cell_magic('time', '', 'test_pred = reg.predict(X_test_)\ntrain_pred = reg.predict(X_train)') 654 | 655 | 656 | # In[108]: 657 | 658 | 659 | get_ipython().run_cell_magic('time', '', 'calculate_performance(y_test, test_pred)\ncalculate_performance(y_train, train_pred) ') 660 | 661 | 662 | # In[66]: 663 | 664 | 665 | for i,col in enumerate(col_dict.VAR): 666 | print(col,'{0:.4f}'.format(reg.coef_[i])) 667 | 668 | 669 | # In[109]: 670 | 671 | 672 | generate_deviation_stats(train_pred, y_train) 673 | 674 | 675 | # In[110]: 676 | 677 | 678 | generate_deviation_stats(test_pred, y_test) 679 | 680 | 681 | # In[12]: 682 | 683 | 684 | round(y_train.describe(),2) 685 | 686 | 687 | # In[ ]: 688 | 689 | 690 | 691 | 692 | -------------------------------------------------------------------------------- /regression_model_basics: -------------------------------------------------------------------------------- 1 | import numpy as np, pandas as pd 2 | from sklearn import metrics 
3 | from sklearn.metrics import r2_score,mean_squared_error, mean_absolute_error 4 | 5 | def mean_absolute_percentage_error(y_true, y_pred): 6 | y_true, y_pred = np.array(y_true), np.array(y_pred) 7 | return np.mean(np.abs((y_true - y_pred) / (y_true + 0.0000001))) * 100 8 | 9 | def calculate_performance(test_Y, test_pred): 10 | mse = mean_squared_error(test_Y, test_pred) 11 | rmse = np.sqrt(mse) 12 | print("RMSE: ", round(rmse,2)) 13 | print("R2: ", round(r2_score(test_Y, test_pred),2)) 14 | print('MAE: ', round(mean_absolute_error(test_Y, test_pred),2)) 15 | print('MAPE: ', round(mean_absolute_percentage_error(test_Y, test_pred),2)) 16 | return 17 | 18 | ## Calculate the deviation 19 | def generate_deviation_stats(pred, actual): 20 | df = pd.DataFrame(pred) 21 | df['actual'] = actual 22 | df.rename(columns = {0: 'pred'}, inplace = True) 23 | df['diff'] = abs(df['actual']- df['pred']) 24 | df['actual'] = df['actual'].replace(0,1) 25 | df['pct_deviation'] = round(df['diff']*100/df['actual'],4) 26 | #df['pct_deviation'].value_counts()/len(df) # no-op outside a notebook 27 | print('max percentage deviation {}%'.format(df['pct_deviation'].max())) 28 | deviations = [0,2,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100,150,200] 29 | len_deviations = [] 30 | for i in deviations: 31 | dev = df.loc[df['pct_deviation'] <= i] 32 | len_ = len(dev) 33 | len_deviations.append(len_) 34 | #for i,index in enumerate(deviations): 35 | # len_deviations[i+1] = len_deviations[i+1]-len_deviations[i] 36 | # if(i==len(deviations)-2): 37 | # break; 38 | for i,index in enumerate(deviations): 39 | print('{} % \tdeviation - \tcount: {}, \tpct {}%'.format(index,len_deviations[i],round(len_deviations[i]*100/len(df),2))) 40 | return 41 | 42 | #Remove claims with 0s 43 | def generate_deviation_stats_wo_0(pred, actual): 44 | df = pd.DataFrame(pred) 45 | df['actual'] = actual 46 | df.rename(columns = {0: 'pred'}, inplace = True) 47 | df['diff'] = abs(df['actual']- df['pred']) 48 | df = df.loc[df['actual'] != 0] 49 | #df['actual'] = df['actual'].replace(0,1) 50 | df['pct_deviation'] = round(df['diff']*100/df['actual'],4) 51 | #df['pct_deviation'].value_counts()/len(df) # no-op outside a notebook 52 | print('max percentage deviation {}%'.format(df['pct_deviation'].max())) 53 | deviations = [0,2,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100,150,200] 54 | len_deviations = [] 55 | for i in deviations: 56 | dev = df.loc[df['pct_deviation'] <= i] 57 | len_ = len(dev) 58 | len_deviations.append(len_) 59 | #for i,index in enumerate(deviations): 60 | # len_deviations[i+1] = len_deviations[i+1]-len_deviations[i] 61 | # if(i==len(deviations)-2): 62 | # break; 63 | for i,index in enumerate(deviations): 64 | print('{} % \tdeviation - \tcount: {}, \tpct {}%'.format(index,len_deviations[i],round(len_deviations[i]*100/len(df),2))) 65 | return 66 | --------------------------------------------------------------------------------
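A quick usage sketch for the helpers in regression_model_basics, assuming its definitions have been loaded into a Python session; the synthetic target below is an illustrative assumption, not project data:

import numpy as np

rng = np.random.RandomState(42)
y_true = rng.gamma(shape=2.0, scale=50.0, size=1000)   # synthetic allowed-amount-like target
y_pred = y_true * rng.normal(1.0, 0.15, size=1000)     # predictions with roughly 15% relative noise

calculate_performance(y_true, y_pred)                  # prints RMSE, R2, MAE, MAPE
generate_deviation_stats(y_pred, y_true)               # prints the share of rows within each %-deviation band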