├── Rank_1_Team_Creed
│   ├── readMe.docx
│   └── code
│       ├── __pycache__
│       │   └── ltfs_finhack.cpython-37.pyc
│       ├── call.py
│       └── ltfs_finhack.py
├── Rank_2_Team_Demolition_Crew
│   ├── ReadMe.txt
│   ├── 4_ensemble_all.R
│   ├── 2_2_seg_1_Tbats.R
│   ├── 2_1_tbats.ipynb
│   ├── 1_seg_1_xgb.R
│   └── 3_seg_2_xgb.R
├── README.md
└── Rank_3_Zishan
    └── requirements.txt

/Rank_1_Team_Creed/readMe.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kunalj101/LTFS-Data-Science-FinHack2/master/Rank_1_Team_Creed/readMe.docx
--------------------------------------------------------------------------------
/Rank_1_Team_Creed/code/__pycache__/ltfs_finhack.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kunalj101/LTFS-Data-Science-FinHack2/master/Rank_1_Team_Creed/code/__pycache__/ltfs_finhack.cpython-37.pyc
--------------------------------------------------------------------------------
/Rank_1_Team_Creed/code/call.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from ltfs_finhack import predict_finhack_data
3 | 
4 | input_train = pd.read_csv('../input/ltfs_train.csv')
5 | input_test = pd.read_csv('../input/ltfs_test.csv')
6 | 
7 | try:
8 |     festival_data = pd.read_csv('../input/festival_dates.csv')
9 | except:
10 |     festival_data = None
11 | 
12 | try:
13 |     predicted_test_data = predict_finhack_data(input_train, input_test, festival_data)
14 |     predicted_test_data.to_csv('../output/predicted_test_data.csv', index = False)
15 | except Exception as e:
16 |     print('Execution failed :(')
17 |     print(e)
18 | 
19 | 
20 | 
21 | 
22 | 
--------------------------------------------------------------------------------
/Rank_2_Team_Demolition_Crew/ReadMe.txt:
--------------------------------------------------------------------------------
1 | Packages required, along with their versions:
2 | 
3 | data.table_1.12.2
4 | dplyr_0.8.3
5 | xgboost_0.90.0.2
6 | MLmetrics_1.1.1
7 | rPython_0.0-6
8 | reticulate_1.13
9 | fpp_0.5
10 | imputeTS_3.0
11 | forecast_8.10
12 | zoo_1.8-6
13 | 
14 | 
15 | Steps to generate the final prediction:
16 | 1. Place the train, test, sample submission and holiday files (external data, shared over mail, source: https://www.officeholidays.com/countries/india) in the data folder, and set the folder containing data as the working directory (the scripts read from './data/').
17 | 
18 | 2. Then execute the scripts in the sequence below:
19 | 1_seg_1_xgb.R
20 | 2_1_tbats.ipynb
21 | 2_2_seg_1_Tbats.R
22 | 3_seg_2_xgb.R
23 | 4_ensemble_all.R
24 | 
25 | Running the last script (4_ensemble_all.R) generates the final submission, 'final_pred_seg_1_2.csv', in the data folder.
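For reference, a minimal sketch of step 2 driven from a single R session. This runner is not part of the original submission; it assumes the working directory is set as in step 1, and that 2_1_tbats.ipynb is run separately in Jupyter after the first script, since the notebook writes preprocessed_train_for_tbats.csv before the TBATS script reads it.

# Hypothetical runner (not in the repo): source the R scripts in the documented order.
# Run 2_1_tbats.ipynb in Jupyter after 1_seg_1_xgb.R and before 2_2_seg_1_Tbats.R.
scripts <- c("1_seg_1_xgb.R", "2_2_seg_1_Tbats.R", "3_seg_2_xgb.R", "4_ensemble_all.R")
for (s in scripts) {
  message("Running ", s)
  source(s, echo = TRUE)
}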
26 | -------------------------------------------------------------------------------- /Rank_2_Team_Demolition_Crew/4_ensemble_all.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(dplyr) 3 | 4 | # segment 1 xgb 5 | seg_1_xgb <- fread('./data/pred_seg_1_xgb.csv') 6 | names(seg_1_xgb)[names(seg_1_xgb)=='case_count'] <- 'xgb_case_count' 7 | 8 | 9 | # segment 1 Tbats 10 | seg_1_tbats <- fread('./data/tbats_seg1.csv') 11 | names(seg_1_tbats)[names(seg_1_tbats)=='case_count'] <- 'tbats_case_count' 12 | 13 | seg_1_xgb_tbats <- left_join(seg_1_xgb, seg_1_tbats) 14 | seg_1_xgb_tbats$case_count <- seg_1_xgb_tbats$xgb_case_count*0.2 + seg_1_xgb_tbats$tbats_case_count*0.8 15 | 16 | 17 | # segment 2 xgb 18 | seg_2_xgb <- fread('./data/final_pred_seg_2.csv') 19 | 20 | final_pred_seg_1_2 <- rbind(seg_1_xgb_tbats[,c('id','application_date','segment','case_count')], 21 | seg_2_xgb) 22 | 23 | fwrite(final_pred_seg_1_2, './data/final_pred_seg_1_2.csv') 24 | 25 | 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LTFS-Data-Science-FinHack2 2 | 3 | ## Winners Solutions 4 | 5 | ## Problem Statement 6 | 7 | LTFS receives a lot of requests for its various finance offerings that include housing loan, two-wheeler loan, real estate financing and micro loans. The number of applications received is something that varies a lot with season. Going through these applications is a manual process and is tedious. Accurately forecasting the number of cases received can help with resource and manpower management resulting into quick response on applications and more efficient processing. 8 | 9 | You have been appointed with the task of forecasting daily cases for next 3 months for 2 different business segments aggregated at the country level keeping in consideration the following major Indian festivals (inclusive but not exhaustive list): Diwali, Dussehra, Ganesh Chaturthi, Navratri, Holi etc. (You are free to use any publicly available open source external datasets). Some other examples could be: 10 | 11 | Weather 12 | Macroeconomic variables 13 | 14 | Note that the external dataset must belong to a reliable source. 
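The winning solutions below act on this hint by joining a public holiday calendar to the country-level daily series. A minimal sketch of that idea in R, assuming the competition's train.csv and a user-supplied holidays.csv with date (dd/mm/yyyy) and type columns, for example scraped from officeholidays.com as the Rank 2 team did:

library(data.table)

# Aggregate applications to one row per segment and day (country level).
train <- fread("train.csv")
daily <- train[, .(case_count = sum(case_count)), by = .(segment, application_date)]
daily[, application_date := as.IDate(application_date)]

# Join a public holiday calendar and derive simple calendar/festival features.
holidays <- fread("holidays.csv")                        # assumed columns: date, type
holidays[, date := as.IDate(date, format = "%d/%m/%Y")]
daily[, holiday_flag := as.integer(application_date %in% holidays$date)]
daily[, `:=`(weekday = wday(application_date), month_day = mday(application_date))]

Features of exactly this kind (holiday flags, weekday, day of month) appear in the Rank 2 XGBoost scripts further down.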
15 | -------------------------------------------------------------------------------- /Rank_2_Team_Demolition_Crew/2_2_seg_1_Tbats.R: -------------------------------------------------------------------------------- 1 | library(reticulate) 2 | library(data.table) 3 | library(fpp) #0.5 4 | library(imputeTS) #3.0 5 | library(dplyr) #0.8.3 6 | library(forecast) #8.10 7 | library(zoo) 8 | 9 | 10 | 11 | ########## loading data ################### 12 | train=read.csv("./data/preprocessed_train_for_tbats.csv") 13 | test=read.csv("./data/test.csv") 14 | sub=read.csv("./data/sample_submission.csv") 15 | 16 | #create a weekday column 17 | train$weekday<- weekdays(as.Date(train$application_date)) 18 | 19 | ########### preprocessed the data ########## 20 | num2=data.frame("branch_id"=c(),"application_date"=c(),"case_count"=c(),"case_count_preprocessed"=c()) 21 | for(i in 1:length(unique(train$branch_id))){ 22 | a=train[train$branch_id==unique(train$branch_id)[i],"case_count"] 23 | q1=length(a) 24 | x2=ts(a, frequency = 7) 25 | x2[!is.na(a)]=cumsum(a[!is.na(a)]) 26 | a=na.trim(a[x2!=0], sides = c("left"), is.na = c("any", "all")) 27 | q2=length(a) 28 | if(q2==0) next 29 | b=train[train$branch_id==unique(train$branch_id)[i],"application_date"] 30 | b=b[(q1-q2+1):q1] 31 | num=data.frame(b, a) 32 | names(num)[1] <- "application_date" 33 | names(num)[2] <- "case_count" 34 | num$case_count_preprocessed=c(round(tsclean(ts(a, frequency = 7)),2)) 35 | num$branch_id=unique(train$branch_id)[i] 36 | num2=rbind(num2,num) 37 | } 38 | 39 | 40 | num2[num2$case_count_preprocessed<=0,"case_count_preprocessed"]=0 41 | num2[is.na(num2$case_count),"case_count"] = 0 42 | 43 | 44 | ################################ merge and groupby ########################################################## 45 | num3=merge(num2,unique(train[,c("branch_id","segment")]),by="branch_id") 46 | num4=num3 %>% 47 | dplyr::group_by(segment,application_date) %>% 48 | dplyr::summarise(case_count_sum = sum(case_count), 49 | case_count_preprocessed_sum = sum(case_count_preprocessed)) 50 | 51 | num4_seg1=subset(num4,num4$segment==1) 52 | num4_seg2=subset(num4,num4$segment==2) 53 | num4_seg2$weekday<- weekdays(as.Date(num4_seg2$application_date)) 54 | 55 | ########################## function to calculate mape ########################################################## 56 | mape = function(a,b){ 57 | return(accuracy(b[b!=0],a[b!=0])[5]) 58 | } 59 | num4_seg1=num4_seg1[-nrow(num4_seg1),] 60 | 61 | ###################################### Submission ########################################################## 62 | #For Segment 1 63 | y.msts <- msts(num4_seg1$case_count_preprocessed_sum,seasonal.periods=c(7)) 64 | y.msts=tail(y.msts,420) 65 | month=4 66 | 67 | fit <- tbats(y.msts, use.box.cox=NULL, use.parallel=TRUE, num.cores = NULL, 68 | use.trend=NULL, use.damped.trend=NULL, use.arma.errors=TRUE, 69 | model=NULL) 70 | 71 | fc <- forecast(fit,(30*month)) 72 | 73 | a=seq(as.Date(max(as.Date(num4_seg1$application_date))), as.Date(max(as.Date(sub[sub$segment==1,"application_date"]))), by="days") 74 | a=a[2:length(a)] 75 | b=head(fc$mean,length(a)) 76 | sub_seg1=data.frame("application_date"=a,"case_count"=b) 77 | sub_seg1$segment=1 78 | 79 | sub_seg1_raw=sub[sub$segment==1,] 80 | sub_seg1_raw$case_count=NULL 81 | sub_seg1_raw$application_date=as.Date(sub_seg1_raw$application_date) 82 | 83 | 84 | sub5=merge(sub_seg1_raw,sub_seg1,on=c("application_date","segment")) 85 | sub5=sub5[order(sub5$id),] 86 | sub5=sub5[,c("id","application_date","segment","case_count")] 87 | 
fwrite(sub5,'./data/tbats_seg1.csv',row.names = FALSE) 88 | 89 | 90 | -------------------------------------------------------------------------------- /Rank_2_Team_Demolition_Crew/2_1_tbats.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "from datetime import date, timedelta\n", 12 | "\n", 13 | "data = pd.read_csv('./data/train.csv')\n", 14 | "data.head()\n", 15 | "\n", 16 | "new_branch_id=[2,4,6,12,22,23,24,25,26,27,28,30,31,32,33,37]\n", 17 | "old_branch_id=list(set(data.branch_id.unique())-set(new_branch_id))\n", 18 | "data['application_date']=pd.to_datetime(data['application_date'])\n", 19 | "data['branch_id_new']=data.loc[data.branch_id.isnull(),'state'].map({'TRIPURA':2,\n", 20 | " 'TAMIL NADU':4,\n", 21 | " 'PUNJAB':6,\n", 22 | " 'ASSAM':12,\n", 23 | " 'UTTAR PRADESH':22,\n", 24 | " 'MAHARASHTRA':23,\n", 25 | " 'ORISSA':24,\n", 26 | " 'KARNATAKA':25,\n", 27 | " 'KERALA':26,\n", 28 | " 'MADHYA PRADESH':27,\n", 29 | " 'HARYANA':28,\n", 30 | " 'BIHAR':30,\n", 31 | " 'GUJARAT':31,\n", 32 | " 'CHHATTISGARH':32,\n", 33 | " 'WEST BENGAL':33,\n", 34 | " 'JHARKHAND':37})\n", 35 | "data.loc[data.branch_id.isnull(),'branch_id']= data['branch_id_new']\n", 36 | "data=data.drop('branch_id_new',axis=1)\n", 37 | "\n", 38 | "d1 = data['application_date'].min()\n", 39 | "d2 = pd.to_datetime(data.groupby('branch_id')['application_date'].max().unique())\n", 40 | "\n", 41 | "# this will give you a list containing all of the dates\n", 42 | "dd_newlist = [d1 + timedelta(days=x) for x in range((d2[1]-d1).days + 1)]\n", 43 | "dd_oldlist = [d1 + timedelta(days=x) for x in range((d2[0]-d1).days + 1)]\n", 44 | "\n", 45 | "new=pd.DataFrame({'branch_id':np.repeat(old_branch_id,len(dd_oldlist)),\n", 46 | " 'application_date':dd_oldlist*len(old_branch_id)}).dropna()\n", 47 | "new = new.append(pd.DataFrame({'branch_id':np.repeat(new_branch_id,len(dd_newlist)),\n", 48 | " 'application_date':dd_newlist*len(new_branch_id)}), ignore_index=True)\n", 49 | "\n", 50 | "branch=data[['branch_id','segment','state']].drop_duplicates().copy()\n", 51 | "state=data[['state','zone']].drop_duplicates().dropna().copy()\n", 52 | "state=state[~((state.state == 'ORISSA') & (state.zone == 'SOUTH'))]\n", 53 | "branch=pd.merge(branch,state,on='state',how='left')\n", 54 | "\n", 55 | "new=pd.merge(new,branch,on='branch_id',how='left')\n", 56 | "\n", 57 | "data=data.drop(['state','zone'],axis=1)\n", 58 | "data=pd.merge(new,data,on=['branch_id','segment','application_date'],how='left')\n", 59 | "data.head()\n", 60 | "data.to_csv('preprocessed_train_for_tbats.csv',index=False)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [] 69 | } 70 | ], 71 | "metadata": { 72 | "kernelspec": { 73 | "display_name": "Python 3", 74 | "language": "python", 75 | "name": "python3" 76 | }, 77 | "language_info": { 78 | "codemirror_mode": { 79 | "name": "ipython", 80 | "version": 3 81 | }, 82 | "file_extension": ".py", 83 | "mimetype": "text/x-python", 84 | "name": "python", 85 | "nbconvert_exporter": "python", 86 | "pygments_lexer": "ipython3", 87 | "version": "3.7.3" 88 | } 89 | }, 90 | "nbformat": 4, 91 | "nbformat_minor": 2 92 | } 93 | -------------------------------------------------------------------------------- 
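The preprocessed_train_for_tbats.csv written by this notebook feeds the TBATS step in 2_2_seg_1_Tbats.R shown earlier. Stripped of the cleaning and submission plumbing, that step is essentially a weekly-seasonal TBATS fit followed by a roughly three-month forecast; a minimal sketch, where y is a placeholder standing in for the cleaned daily segment 1 series:

library(forecast)

# y: cleaned daily case counts for segment 1 (random placeholder data, illustration only).
y <- rpois(420, lambda = 500)

# One seasonal period of 7 days, ARMA errors allowed, then forecast ~3 months ahead.
y_msts <- msts(y, seasonal.periods = 7)
fit <- tbats(y_msts, use.arma.errors = TRUE)
fc <- forecast(fit, h = 90)
head(fc$mean)   # point forecasts written out as the TBATS case_count for segment 1

In the actual script only the last 420 observations are modeled, and 4_ensemble_all.R blends these TBATS forecasts with the segment 1 XGBoost predictions at weights 0.8 and 0.2 to produce the segment 1 submission.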
/Rank_3_Zishan/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.9.0 2 | alabaster==0.7.12 3 | anaconda-client==1.7.2 4 | anaconda-navigator==1.9.7 5 | anaconda-project==0.8.3 6 | asn1crypto==1.0.1 7 | astor==0.7.1 8 | astroid==2.3.1 9 | astropy==3.2.1 10 | atomicwrites==1.3.0 11 | attrs==19.2.0 12 | Babel==2.7.0 13 | backcall==0.1.0 14 | backports.functools-lru-cache==1.5 15 | backports.os==0.1.1 16 | backports.shutil-get-terminal-size==1.0.0 17 | backports.tempfile==1.0 18 | backports.weakref==1.0.post1 19 | beautifulsoup4==4.8.0 20 | bitarray==1.0.1 21 | bkcharts==0.2 22 | bleach==3.1.0 23 | bokeh==1.3.4 24 | boto==2.49.0 25 | Bottleneck==1.2.1 26 | catboost==0.16.5 27 | certifi==2019.9.11 28 | cffi==1.12.3 29 | chardet==3.0.4 30 | chart-studio==1.0.0 31 | Click==7.0 32 | cloudpickle==1.2.2 33 | clyent==1.2.2 34 | colorama==0.4.1 35 | colorlover==0.3.0 36 | comtypes==1.1.7 37 | conda==4.8.0 38 | conda-build==3.18.9 39 | conda-package-handling==1.6.0 40 | conda-verify==3.4.2 41 | contextlib2==0.6.0 42 | convertdate==2.1.3 43 | cryptography==2.7 44 | cufflinks==0.17.0 45 | cycler==0.10.0 46 | Cython==0.29.13 47 | cytoolz==0.10.0 48 | dask==2.5.2 49 | decorator==4.4.0 50 | defusedxml==0.6.0 51 | distributed==2.5.2 52 | docutils==0.15.2 53 | entrypoints==0.3 54 | ephem==3.7.7.0 55 | et-xmlfile==1.0.1 56 | fastcache==1.1.0 57 | fbprophet==0.5 58 | filelock==3.0.12 59 | Flask==1.1.1 60 | fsspec==0.5.2 61 | future==0.17.1 62 | gast==0.2.2 63 | gevent==1.4.0 64 | glob2==0.7 65 | google-pasta==0.1.8 66 | graphviz==0.13.2 67 | greenlet==0.4.15 68 | grpcio==1.23.0 69 | h5py==2.9.0 70 | HeapDict==1.0.1 71 | holidays==0.9.11 72 | html5lib==1.0.1 73 | idna==2.8 74 | imageio==2.6.0 75 | imagesize==1.1.0 76 | importlib-metadata==0.23 77 | ipykernel==5.1.2 78 | ipython==7.8.0 79 | ipython-genutils==0.2.0 80 | ipywidgets==7.5.1 81 | isort==4.3.21 82 | itsdangerous==1.1.0 83 | jdcal==1.4.1 84 | jedi==0.15.1 85 | Jinja2==2.10.3 86 | joblib==0.13.2 87 | json5==0.8.5 88 | jsonschema==3.0.2 89 | jupyter==1.0.0 90 | jupyter-client==5.3.3 91 | jupyter-console==6.0.0 92 | jupyter-core==4.5.0 93 | jupyterlab==1.1.4 94 | jupyterlab-server==1.0.6 95 | Keras==2.3.1 96 | Keras-Applications==1.0.8 97 | Keras-Preprocessing==1.1.0 98 | keyring==18.0.0 99 | kiwisolver==1.1.0 100 | lazy-object-proxy==1.4.2 101 | libarchive-c==2.8 102 | lightgbm==2.3.1 103 | llvmlite==0.29.0 104 | locket==0.2.0 105 | lunardate==0.2.0 106 | lxml==4.4.1 107 | Mako==1.1.0 108 | Markdown==3.1.1 109 | MarkupSafe==1.1.1 110 | matplotlib==3.1.1 111 | mccabe==0.6.1 112 | menuinst==1.4.16 113 | mistune==0.8.4 114 | mkl-fft==1.0.14 115 | mkl-random==1.1.0 116 | mkl-service==2.3.0 117 | mock==3.0.5 118 | more-itertools==7.2.0 119 | mpmath==1.1.0 120 | msgpack==0.6.1 121 | multipledispatch==0.6.0 122 | navigator-updater==0.2.1 123 | nbconvert==5.6.0 124 | nbformat==4.4.0 125 | networkx==2.3 126 | nltk==3.4.5 127 | nose==1.3.7 128 | notebook==6.0.1 129 | numba==0.45.1 130 | numexpr==2.7.0 131 | numpy==1.16.5 132 | numpydoc==0.9.1 133 | olefile==0.46 134 | openpyxl==3.0.0 135 | opt-einsum==0+untagged.49.gdbede45.dirty 136 | packaging==19.2 137 | pandas==0.25.1 138 | pandocfilters==1.4.2 139 | parso==0.5.1 140 | partd==1.0.0 141 | path.py==12.0.1 142 | pathlib2==2.3.5 143 | patsy==0.5.1 144 | pep8==1.7.1 145 | pickleshare==0.7.5 146 | Pillow==6.2.0 147 | pkginfo==1.5.0.1 148 | plotly==4.4.1 149 | pluggy==0.13.0 150 | ply==3.11 151 | pmdarima==1.5.2 152 | prometheus-client==0.7.1 153 | 
prompt-toolkit==2.0.10 154 | protobuf==3.11.2 155 | psutil==5.6.3 156 | py==1.8.0 157 | pycodestyle==2.5.0 158 | pycosat==0.6.3 159 | pycparser==2.19 160 | pycrypto==2.6.1 161 | pycurl==7.43.0.3 162 | pyflakes==2.1.1 163 | Pygments==2.4.2 164 | pygpu==0.7.6 165 | pylint==2.4.2 166 | pyodbc==4.0.27 167 | pyOpenSSL==19.0.0 168 | pyparsing==2.4.2 169 | pyreadline==2.1 170 | pyrsistent==0.15.4 171 | PySocks==1.7.1 172 | pystan==2.19.0.0 173 | pytest==5.2.1 174 | pytest-arraydiff==0.3 175 | pytest-astropy==0.5.0 176 | pytest-doctestplus==0.4.0 177 | pytest-openfiles==0.4.0 178 | pytest-remotedata==0.3.2 179 | python-dateutil==2.8.0 180 | pytz==2019.3 181 | PyWavelets==1.0.3 182 | pywin32==223 183 | pywinpty==0.5.5 184 | PyYAML==5.1.2 185 | pyzmq==18.1.0 186 | QtAwesome==0.6.0 187 | qtconsole==4.5.5 188 | QtPy==1.9.0 189 | requests==2.22.0 190 | retrying==1.3.3 191 | rope==0.14.0 192 | ruamel-yaml==0.15.46 193 | scikit-image==0.15.0 194 | scikit-learn==0.21.3 195 | scipy==1.3.1 196 | seaborn==0.9.0 197 | Send2Trash==1.5.0 198 | simplegeneric==0.8.1 199 | singledispatch==3.4.0.3 200 | six==1.12.0 201 | snowballstemmer==2.0.0 202 | sortedcollections==1.1.2 203 | sortedcontainers==2.1.0 204 | soupsieve==1.9.3 205 | Sphinx==2.2.0 206 | sphinxcontrib-applehelp==1.0.1 207 | sphinxcontrib-devhelp==1.0.1 208 | sphinxcontrib-htmlhelp==1.0.2 209 | sphinxcontrib-jsmath==1.0.1 210 | sphinxcontrib-qthelp==1.0.2 211 | sphinxcontrib-serializinghtml==1.1.3 212 | sphinxcontrib-websupport==1.1.2 213 | spyder==3.3.6 214 | spyder-kernels==0.5.2 215 | SQLAlchemy==1.3.9 216 | statsmodels==0.10.1 217 | sympy==1.4 218 | tables==3.5.2 219 | tblib==1.4.0 220 | tensorboard==2.0.0 221 | tensorflow==2.0.0 222 | tensorflow-estimator==2.0.0 223 | termcolor==1.1.0 224 | terminado==0.8.2 225 | testpath==0.4.2 226 | Theano==1.0.4 227 | toolz==0.10.0 228 | tornado==6.0.3 229 | tqdm==4.36.1 230 | traitlets==4.3.3 231 | unicodecsv==0.14.1 232 | urllib3==1.24.2 233 | wcwidth==0.1.7 234 | webencodings==0.5.1 235 | Werkzeug==0.16.0 236 | widgetsnbextension==3.5.1 237 | win-inet-pton==1.1.0 238 | win-unicode-console==0.5 239 | wincertstore==0.2 240 | wrapt==1.11.2 241 | xgboost==0.90 242 | xlrd==1.2.0 243 | XlsxWriter==1.2.1 244 | xlwings==0.15.10 245 | xlwt==1.3.0 246 | zict==1.0.0 247 | zipp==0.6.0 248 | -------------------------------------------------------------------------------- /Rank_2_Team_Demolition_Crew/1_seg_1_xgb.R: -------------------------------------------------------------------------------- 1 | library(data.table) # data.table_1.12.2 2 | library(dplyr) # dplyr_0.8.3 3 | library(xgboost) # xgboost_0.90.0.2 4 | library(MLmetrics) # MLmetrics_1.1.1 5 | 6 | 7 | ############# loading data ############### 8 | train_data <- fread('./data/train.csv', na.strings = c('',' ', NA)) 9 | test_data <- fread('./data/test.csv' , na.strings = c('',' ', NA)) 10 | submission <- fread('./data/sample_submission.csv' , na.strings = c('',' ', NA)) 11 | 12 | train_data$days <- weekdays(as.Date(train_data$application_date)) 13 | 14 | holidays_mapping <- fread('./data/holiday_mapping.csv') # source: https://www.officeholidays.com/countries/india 15 | holidays_mapping$date <- as.Date(holidays_mapping$date, format = '%d/%m/%Y') 16 | holidays_mapping$holiday_flag <- 1 17 | holidays_mapping$type <- ifelse(holidays_mapping$type =='Observance, Christian','Observance', holidays_mapping$type) 18 | holidays_mapping$type <- ifelse(holidays_mapping$type =='Muslim, Common local holiday','local', holidays_mapping$type) 19 | holidays_mapping$type <- 
tolower(holidays_mapping$type) 20 | holidays_mapping$type <- gsub(' ','_', holidays_mapping$type) 21 | holiday_type_binary_flag <- dcast(setDT(holidays_mapping), date ~ type, value.var = c("holiday_flag")) 22 | holidays_mapping <- left_join(holidays_mapping, holiday_type_binary_flag) 23 | ########################################## 24 | 25 | 26 | ############# branch, state and zone mapping ############ 27 | branch_state_zone_mapping <- unique(train_data[train_data$segment==1,c('branch_id','state','zone')]) 28 | 29 | state_zone_mapping <- unique(train_data[train_data$segment==1,c('state','zone')]) 30 | state_zone_mapping$rm_it_flag <- ifelse(state_zone_mapping$state == 'ORISSA' & state_zone_mapping$zone == 'EAST',1,0) 31 | state_zone_mapping <- state_zone_mapping[state_zone_mapping$rm_it_flag !=1, ] 32 | state_zone_mapping$rm_it_flag <- NULL 33 | ######################################################### 34 | 35 | 36 | ####################### Data Preparation ################# 37 | # segment 1 38 | seg_1_min_date <- min(train_data$application_date[train_data$segment==1]) 39 | seg_1_train_date <- max(train_data$application_date[train_data$segment==1]) 40 | seg_1_max_date <- max(submission$application_date[submission$segment==1]) 41 | seg_1_dates <- as.character(seq(from=as.Date(seg_1_min_date),to=as.Date(seg_1_max_date),by="1 day")) 42 | seg_1_master_data_set <- merge(unique(train_data$branch_id[train_data$segment==1]), seg_1_dates, all = T) 43 | names(seg_1_master_data_set) <- c('branch_id','application_date') 44 | seg_1_master_data_set$application_date <- as.character(seg_1_master_data_set$application_date) 45 | seg_1_master_data_set$segment <- 1 46 | seg_1_master_data_set <- left_join(seg_1_master_data_set, branch_state_zone_mapping) 47 | seg_1_master_data_set <- left_join(seg_1_master_data_set, train_data) 48 | seg_1_master_data_set$days <- NULL 49 | seg_1_master_data_set <- seg_1_master_data_set[order(seg_1_master_data_set$branch_id, seg_1_master_data_set$application_date), ] 50 | seg_1_master_data_set$case_count[is.na(seg_1_master_data_set$case_count)] <- 0 51 | 52 | 53 | train_valid_test_split <- unique(data.frame(seg_1_master_data_set[,c('application_date')])) 54 | names(train_valid_test_split)[1] <- 'application_date' 55 | train_valid_test_split$application_date <- as.character(train_valid_test_split$application_date) 56 | train_valid_test_split$data_type <- ifelse(train_valid_test_split$application_date < max(train_data$application_date[train_data$segment==1]), 'train','submission') 57 | validation_data_seg_1 <- tail(train_valid_test_split[train_valid_test_split$data_type=='train',], n = 87) 58 | validation_data_seg_1$data_type <- ifelse(validation_data_seg_1$application_date < as.Date(min(validation_data_seg_1$application_date))+29, 'validation_M1', 59 | ifelse(validation_data_seg_1$application_date < as.Date(min(validation_data_seg_1$application_date))+58, 'validation_M2','validation_M3')) 60 | 61 | table(validation_data_seg_1$data_type) 62 | validation_data_seg_1$segment <- 1 63 | 64 | seg_1_master_data_set <- left_join(seg_1_master_data_set, validation_data_seg_1) 65 | seg_1_master_data_set$data_type <- ifelse(seg_1_master_data_set$application_date > seg_1_train_date,'submission', 66 | ifelse(is.na(seg_1_master_data_set$data_type) & seg_1_master_data_set$application_date < seg_1_train_date, 'train', seg_1_master_data_set$data_type )) 67 | seg_1_master_data_set$data_type[is.na(seg_1_master_data_set$data_type)] <- 'Others' 68 | 69 | 70 | # segment 2 71 | seg_2_min_date <- 
min(train_data$application_date[train_data$segment==2]) 72 | seg_2_train_date <- max(train_data$application_date[train_data$segment==2]) 73 | seg_2_max_date <- max(submission$application_date[submission$segment==2]) 74 | seg_2_dates <- as.character(seq(from=as.Date(seg_2_min_date),to=as.Date(seg_2_max_date),by="1 day")) 75 | seg_2_master_data_set <- merge(unique(train_data$state[train_data$segment==2]), seg_2_dates, all = T) 76 | names(seg_2_master_data_set) <- c('state','application_date') 77 | seg_2_master_data_set$state <- as.character(seg_2_master_data_set$state) 78 | seg_2_master_data_set$application_date <- as.character(seg_2_master_data_set$application_date) 79 | seg_2_master_data_set$segment <- 2 80 | 81 | seg_2_master_data_set <- seg_2_master_data_set[order(seg_2_master_data_set$state, seg_2_master_data_set$application_date), ] 82 | seg_2_master_data_set <- left_join(seg_2_master_data_set, train_data[train_data$segment==2, c('application_date','segment','state','case_count')]) 83 | sort(colSums(is.na(seg_2_master_data_set))) 84 | seg_2_master_data_set$case_count[is.na(seg_2_master_data_set$case_count)] <- 0 85 | 86 | 87 | dim(train_data[train_data$segment==1, ]) # 66898 88 | dim(train_data[train_data$segment==2, ]) # 13504 89 | 90 | dim(seg_1_master_data_set) # 75779 91 | dim(seg_2_master_data_set) # 14992 92 | 93 | rm(train_valid_test_split, branch_state_zone_mapping, state_zone_mapping, validation_data_seg_1) 94 | ############################################################### 95 | 96 | 97 | ###################### date features ############### 98 | date_data <- data.frame(date = unique(c(seg_1_master_data_set$application_date, seg_2_master_data_set$application_date))) 99 | summary(as.Date(date_data$date)) 100 | date_data$date <- as.Date(date_data$date) 101 | date_data <- data.table(date_data) 102 | 103 | date_data[, ":="( month_alpha = format(date,'%B'), 104 | weekdays = weekdays(date), 105 | month_week_number = ceiling(as.numeric(format(date, "%d"))/7), 106 | weekdays_num = wday(date), 107 | month = month(date), 108 | quarter = quarter(date), 109 | year = year(date), 110 | 111 | month_day = as.numeric(format(date, "%d")), 112 | year_week_number = as.numeric(format(date, "%W")) 113 | )] 114 | 115 | date_data$week_number_day = paste(date_data$month_week_number, date_data$weekdays, sep = '_') 116 | unique(date_data$week_number_day) 117 | date_data$week_number_day <- as.numeric(as.factor(date_data$week_number_day)) 118 | 119 | dim(date_data) 120 | names(holidays_mapping) 121 | date_data <- left_join(date_data, holidays_mapping[,c('date','name','type','gazetted_holiday','local','observance','restricted_holiday','season')]) 122 | 123 | date_data <- date_data %>% dplyr::group_by(year, month) %>% dplyr::mutate(month_end_date = max(month_day)) 124 | date_data$month_end_date <- NULL 125 | 126 | names(date_data)[names(date_data)=='date'] <- 'application_date' 127 | date_data$application_date <- as.character(date_data$application_date) 128 | 129 | date_data$gazetted_holiday[is.na(date_data$gazetted_holiday)] <- 0 130 | date_data$local[is.na(date_data$local)] <- 0 131 | date_data$observance[is.na(date_data$observance)] <- 0 132 | date_data$restricted_holiday[is.na(date_data$restricted_holiday)] <- 0 133 | date_data$season[is.na(date_data$season)] <- 0 134 | sort(colSums(is.na(date_data))) 135 | 136 | 137 | exclude_cols <- c() 138 | date_data$f_holiday <- ifelse(date_data$observance>0 | date_data$gazetted_holiday>0, 1, 0) 139 | temp <- date_data %>% 140 | dplyr::mutate(flag_cumulative = 
cumsum(f_holiday)) %>% 141 | dplyr::group_by(flag_cumulative) %>% 142 | dplyr::mutate(days_elapsed_since_last_holiday = 1:n() -1) %>% 143 | dplyr::mutate(days_elapsed_since_last_holiday=rank(days_elapsed_since_last_holiday)/length(days_elapsed_since_last_holiday)) %>% 144 | dplyr::ungroup() %>% 145 | dplyr::select(-flag_cumulative) 146 | 147 | date_data <- left_join(date_data, unique(temp[,c('application_date','days_elapsed_since_last_holiday')])) 148 | unique(date_data$type) 149 | date_data$holiday_flag_factor <- as.numeric(as.factor(date_data$type)) 150 | ############################################################## 151 | 152 | head(date_data) 153 | date_seg_1_master <- seg_1_master_data_set %>% dplyr::group_by(application_date,segment,data_type) %>% dplyr::summarise(case_count = sum(case_count, na.rm = T)) 154 | head(seg_1_master_data_set) 155 | 156 | date_seg_master <- left_join(date_seg_1_master, date_data) 157 | sort(colSums(is.na(date_seg_master))) 158 | 159 | 160 | ##################### creating lag feature ############# 161 | # lag feature (previous year) 162 | date_seg_master <- date_seg_master %>% 163 | dplyr::group_by(segment) %>% 164 | dplyr::mutate(seg_lag_365 = lag(case_count, 365)) 165 | ######################################################## 166 | 167 | 168 | names(date_seg_master) 169 | cols_to_select <- c('data_type','segment', 170 | 'month_week_number','weekdays_num',"month" , "quarter", "year","month_day", "year_week_number","day_of_year", 'week_number_day','case_count') 171 | 172 | lag_feat <- c('seg_lag_365') 173 | holiday_feat <- c('days_elapsed_since_last_holiday','holiday_flag_factor') 174 | 175 | cols_to_select <- unique(c(cols_to_select, lag_feat, holiday_feat)) 176 | cols_to_select 177 | 178 | date_seg_master_1 <- date_seg_master 179 | dim(date_seg_master_1) 180 | date_seg_master_1 <- date_seg_master_1[, names(date_seg_master_1) %in% cols_to_select] 181 | 182 | 183 | date_seg_1_master <- date_seg_master_1[date_seg_master_1$segment==1, ] 184 | date_seg_1_master$segment <- NULL 185 | 186 | 187 | ############## MAPE function ################ 188 | xgb_mape <- function(preds, dtrain){ 189 | labels <- xgboost::getinfo(dtrain, 'label') 190 | err <- sum(abs((as.numeric(labels) - as.numeric(preds))/as.numeric(labels))) 191 | err <- err*100/length(labels) 192 | return(list(metric='MAPE_custom', value=err)) 193 | } 194 | ########################################### 195 | 196 | 197 | ########## Model for segment 1 ################## 198 | # 1. CV train 199 | train_ids <- c('train', 'validation_M1') 200 | X_train <- date_seg_1_master[date_seg_1_master$data_type %in% train_ids, ] 201 | Y_train <- date_seg_1_master$case_count[date_seg_1_master$data_type %in% train_ids] 202 | 203 | X_train$data_type <- NULL 204 | X_train$case_count <- NULL 205 | xgtrain <- xgb.DMatrix(data = as.matrix(X_train), label = Y_train, missing = NA) 206 | 207 | 208 | 209 | # 2. validation 210 | validation_month <- 'validation_M2' 211 | X_test <- date_seg_1_master[date_seg_1_master$data_type %in% validation_month, ] 212 | Y_test <- date_seg_1_master$case_count[date_seg_1_master$data_type %in% validation_month] 213 | 214 | X_test$data_type <- NULL 215 | X_test$case_count <- NULL 216 | 217 | str(X_test) 218 | xgtest <- xgb.DMatrix(data = as.matrix(X_test), label = Y_test, missing = NA) 219 | watchlist <- list(train = xgtrain, valid = xgtest) 220 | 221 | 222 | # 3. 
OOS 223 | validation_month <- 'validation_M3' 224 | valid_X <- date_seg_1_master[date_seg_1_master$data_type %in% validation_month, ] 225 | valid_Y <- date_seg_1_master$case_count[date_seg_1_master$data_type %in% validation_month] 226 | 227 | valid_X$data_type <- NULL 228 | valid_X$case_count <- NULL 229 | 230 | 231 | # 4. final train and test 232 | train <- date_seg_1_master[date_seg_1_master$data_type=='train' | 233 | date_seg_1_master$data_type %like% 'validation', ] 234 | test <- date_seg_1_master[date_seg_1_master$data_type %like% 'submission', ] 235 | 236 | target <- train$case_count 237 | train$case_count <- NULL 238 | test$case_count <- NULL 239 | 240 | train$data_type <- NULL 241 | test$data_type <- NULL 242 | 243 | 244 | ####################################### creating xgb model ######################################## 245 | names(train) 246 | names(test) 247 | table(names(train)==names(test)) 248 | table(names(train)==names(X_train)) 249 | str(data.frame(train)) 250 | 251 | # model hyperparameters 252 | param = list("objective" = "reg:linear", 253 | # eval_metric = "rmse" , 254 | "eval_metric" = xgb_mape, "nthread" = -1, 255 | "eta" = 0.1, "max_depth" = 10, "subsample" = 0.95, "colsample_bytree" =1, 256 | "gamma" = 0,"min_child_weight" = 1) 257 | 258 | #---------------------------------------- Holdout CV ---------------------------------------------# 259 | set.seed(300) 260 | num_round <- 5000 261 | bst_cv <- xgb.train(param, 262 | data = xgtrain, 263 | watchlist = watchlist, 264 | num_round, 265 | early_stopping_round = 20, 266 | maximize = F, 267 | missing = NA) 268 | 269 | bst_nrounds <- bst_cv$best_iteration # optimal number of iteration 270 | 271 | # pred on OOS data 272 | OOS_pred <- predict (bst_cv, as.matrix(valid_X), missing = NA) 273 | mape_oos <- MAPE(OOS_pred, valid_Y)*100 274 | mape_oos 275 | 276 | 277 | #--------------------------------- Final Model for Test data ----------------------# 278 | # training the final model 279 | set.seed(500) 280 | bst <- xgboost(param=param, 281 | data =as.matrix(train), 282 | label = target, 283 | nrounds=bst_nrounds, 284 | missing=NA) 285 | 286 | # making predicton on test data 287 | pred = predict(bst, as.matrix(test),missing=NA) 288 | pred <- ifelse(pred>3000, pred*1.1, pred) 289 | sum(pred) # 295881 290 | 291 | 292 | # segment 1 submission file 293 | seg_1_subm <- submission[submission$segment==1, ] 294 | seg_1_subm$case_count <- pred 295 | sum(seg_1_subm$case_count) 296 | 297 | fwrite(seg_1_subm, './data/pred_seg_1_xgb.csv') 298 | 299 | 300 | ################ Feature importance ############### 301 | names = names(train) 302 | importance_matrix = xgb.importance(names, model=bst) 303 | gp = xgb.plot.importance(importance_matrix) 304 | print(gp) 305 | ################################################# 306 | 307 | -------------------------------------------------------------------------------- /Rank_2_Team_Demolition_Crew/3_seg_2_xgb.R: -------------------------------------------------------------------------------- 1 | library(data.table) # data.table_1.12.2 2 | library(dplyr) # dplyr_0.8.3 3 | library(xgboost) # xgboost_0.90.0.2 4 | library(MLmetrics) # MLmetrics_1.1.1 5 | 6 | 7 | ############# loading data ############### 8 | train_data <- fread('./data/train.csv', na.strings = c('',' ', NA)) 9 | test_data <- fread('./data/test.csv' , na.strings = c('',' ', NA)) 10 | submission <- fread('./data/sample_submission.csv' , na.strings = c('',' ', NA)) 11 | 12 | train_data$days <- weekdays(as.Date(train_data$application_date)) 13 | 14 | 
holidays_mapping <- fread('./data/holiday_mapping.csv') 15 | holidays_mapping$date <- as.Date(holidays_mapping$date, format = '%d/%m/%Y') 16 | summary(holidays_mapping$date) 17 | holidays_mapping$holiday_flag <- 1 18 | unique(holidays_mapping$type) 19 | sort(table(holidays_mapping$type)) 20 | holidays_mapping$type <- ifelse(holidays_mapping$type =='Observance, Christian','Observance', holidays_mapping$type) 21 | holidays_mapping$type <- ifelse(holidays_mapping$type =='Muslim, Common local holiday','local', holidays_mapping$type) 22 | 23 | holidays_mapping$type <- tolower(holidays_mapping$type) 24 | holidays_mapping$type <- gsub(' ','_', holidays_mapping$type) 25 | 26 | holiday_type_binary_flag <- dcast(setDT(holidays_mapping), date ~ type, value.var = c("holiday_flag")) 27 | names(holiday_type_binary_flag) 28 | 29 | dim(holidays_mapping) 30 | holidays_mapping <- left_join(holidays_mapping, holiday_type_binary_flag) 31 | ########################################## 32 | 33 | 34 | ############# branch, state and zone mapping ############ 35 | names(train_data) 36 | branch_state_zone_mapping <- unique(train_data[train_data$segment==1,c('branch_id','state','zone')]) 37 | 38 | state_zone_mapping <- unique(train_data[train_data$segment==1,c('state','zone')]) 39 | state_zone_mapping$rm_it_flag <- ifelse(state_zone_mapping$state == 'ORISSA' & state_zone_mapping$zone == 'EAST',1,0) 40 | state_zone_mapping <- state_zone_mapping[state_zone_mapping$rm_it_flag !=1, ] 41 | state_zone_mapping$rm_it_flag <- NULL 42 | ######################################################### 43 | 44 | 45 | ####################### Data Preparation ################# 46 | # segment 1 47 | seg_1_min_date <- min(train_data$application_date[train_data$segment==1]) 48 | seg_1_train_date <- max(train_data$application_date[train_data$segment==1]) 49 | seg_1_max_date <- max(submission$application_date[submission$segment==1]) 50 | seg_1_dates <- as.character(seq(from=as.Date(seg_1_min_date),to=as.Date(seg_1_max_date),by="1 day")) 51 | seg_1_master_data_set <- merge(unique(train_data$branch_id[train_data$segment==1]), seg_1_dates, all = T) 52 | names(seg_1_master_data_set) <- c('branch_id','application_date') 53 | seg_1_master_data_set$application_date <- as.character(seg_1_master_data_set$application_date) 54 | seg_1_master_data_set$segment <- 1 55 | seg_1_master_data_set <- left_join(seg_1_master_data_set, branch_state_zone_mapping) 56 | sort(colSums(is.na(seg_1_master_data_set))) 57 | seg_1_master_data_set <- left_join(seg_1_master_data_set, train_data) 58 | seg_1_master_data_set$days <- NULL 59 | seg_1_master_data_set <- seg_1_master_data_set[order(seg_1_master_data_set$branch_id, seg_1_master_data_set$application_date), ] 60 | seg_1_master_data_set$case_count[is.na(seg_1_master_data_set$case_count)] <- 0 61 | 62 | 63 | train_valid_test_split <- unique(data.frame(seg_1_master_data_set[,c('application_date')])) 64 | names(train_valid_test_split)[1] <- 'application_date' 65 | train_valid_test_split$application_date <- as.character(train_valid_test_split$application_date) 66 | train_valid_test_split$data_type <- ifelse(train_valid_test_split$application_date < max(train_data$application_date[train_data$segment==1]), 'train','submission') 67 | dim(train_valid_test_split) 68 | validation_data_seg_1 <- tail(train_valid_test_split[train_valid_test_split$data_type=='train',], n = 87) 69 | validation_data_seg_1$data_type <- ifelse(validation_data_seg_1$application_date < as.Date(min(validation_data_seg_1$application_date))+29, 
'validation_M1', 70 | ifelse(validation_data_seg_1$application_date < as.Date(min(validation_data_seg_1$application_date))+58, 'validation_M2','validation_M3')) 71 | 72 | table(validation_data_seg_1$data_type) 73 | validation_data_seg_1$segment <- 1 74 | 75 | seg_1_master_data_set <- left_join(seg_1_master_data_set, validation_data_seg_1) 76 | seg_1_master_data_set$data_type <- ifelse(seg_1_master_data_set$application_date > seg_1_train_date,'submission', 77 | ifelse(is.na(seg_1_master_data_set$data_type) & seg_1_master_data_set$application_date < seg_1_train_date, 'train', seg_1_master_data_set$data_type )) 78 | seg_1_master_data_set$data_type[is.na(seg_1_master_data_set$data_type)] <- 'Others' 79 | 80 | 81 | # segment 2 82 | seg_2_min_date <- min(train_data$application_date[train_data$segment==2]) 83 | seg_2_train_date <- max(train_data$application_date[train_data$segment==2]) 84 | seg_2_max_date <- max(submission$application_date[submission$segment==2]) 85 | seg_2_dates <- as.character(seq(from=as.Date(seg_2_min_date),to=as.Date(seg_2_max_date),by="1 day")) 86 | seg_2_master_data_set <- merge(unique(train_data$state[train_data$segment==2]), seg_2_dates, all = T) 87 | names(seg_2_master_data_set) <- c('state','application_date') 88 | seg_2_master_data_set$state <- as.character(seg_2_master_data_set$state) 89 | seg_2_master_data_set$application_date <- as.character(seg_2_master_data_set$application_date) 90 | seg_2_master_data_set$segment <- 2 91 | 92 | seg_2_master_data_set <- seg_2_master_data_set[order(seg_2_master_data_set$state, seg_2_master_data_set$application_date), ] 93 | seg_2_master_data_set <- left_join(seg_2_master_data_set, train_data[train_data$segment==2, c('application_date','segment','state','case_count')]) 94 | sort(colSums(is.na(seg_2_master_data_set))) 95 | seg_2_master_data_set$case_count[is.na(seg_2_master_data_set$case_count)] <- 0 96 | 97 | 98 | 99 | train_valid_test_split <- unique(data.frame(seg_2_master_data_set[,c('application_date')])) 100 | names(train_valid_test_split)[1] <- 'application_date' 101 | train_valid_test_split$application_date <- as.character(train_valid_test_split$application_date) 102 | train_valid_test_split$data_type <- ifelse(train_valid_test_split$application_date <= max(train_data$application_date[train_data$segment==2]), 'train','submission') 103 | dim(train_valid_test_split) 104 | validation_data_seg_2 <- tail(train_valid_test_split[train_valid_test_split$data_type=='train',], n = 93) 105 | validation_data_seg_2$data_type <- ifelse(validation_data_seg_2$application_date <= as.Date(min(validation_data_seg_2$application_date))+30, 'validation_M1', 106 | ifelse(validation_data_seg_2$application_date <= as.Date(min(validation_data_seg_2$application_date))+61, 'validation_M2','validation_M3')) 107 | 108 | table(validation_data_seg_2$data_type) 109 | validation_data_seg_2$segment <- 2 110 | 111 | seg_2_master_data_set <- left_join(seg_2_master_data_set, validation_data_seg_2) 112 | seg_2_master_data_set$data_type <- ifelse(seg_2_master_data_set$application_date > seg_2_train_date,'submission', 113 | ifelse(is.na(seg_2_master_data_set$data_type) & seg_2_master_data_set$application_date <= seg_2_train_date, 'train', seg_2_master_data_set$data_type )) 114 | 115 | table(seg_2_master_data_set$data_type) 116 | 117 | dim(train_data[train_data$segment==1, ]) # 66898 118 | dim(train_data[train_data$segment==2, ]) # 13504 119 | 120 | dim(seg_1_master_data_set) # 75779 121 | dim(seg_2_master_data_set) # 14992 122 | 123 | rm(train_valid_test_split, 
branch_state_zone_mapping, state_zone_mapping, validation_data_seg_1, validation_data_seg_2) 124 | gc() 125 | ############################################################### 126 | 127 | 128 | ###################### date features ############### 129 | date_data <- data.frame(date = unique(c(seg_1_master_data_set$application_date, seg_2_master_data_set$application_date))) 130 | summary(as.Date(date_data$date)) 131 | date_data$date <- as.Date(date_data$date) 132 | date_data <- data.table(date_data) 133 | 134 | date_data[, ":="( month_alpha = format(date,'%B'), 135 | weekdays = weekdays(date), 136 | month_week_number = ceiling(as.numeric(format(date, "%d"))/7), 137 | weekdays_num = wday(date), 138 | month = month(date), 139 | quarter = quarter(date), 140 | year = year(date), 141 | 142 | month_day = as.numeric(format(date, "%d")), 143 | year_week_number = as.numeric(format(date, "%W")) 144 | )] 145 | 146 | names(holidays_mapping) 147 | date_data <- left_join(date_data, holidays_mapping[,c('date','name','type','gazetted_holiday','local','observance','restricted_holiday','season')]) 148 | names(date_data) 149 | date_data <- date_data %>% dplyr::group_by(year, month) %>% dplyr::mutate(month_end_date = max(month_day)) 150 | date_data$month_end_date <- ifelse(date_data$month_alpha=='October' & date_data$year==2019, 31, date_data$month_end_date) 151 | date_data$temp_key_1 <- ifelse(date_data$month_day <= 7, 'week_1', 152 | ifelse(date_data$month_day > 7 & date_data$month_day <= 10, 'week_2_8_10', 153 | ifelse(date_data$month_day > 10 & date_data$month_day <= 18, 'week_3', 154 | ifelse(date_data$month_day > 18 & date_data$month_day <= 25, 'week_4','week_5') ))) 155 | 156 | 157 | date_data$custom_date_key <- paste(date_data$temp_key_1, date_data$weekdays, sep = '_') 158 | date_data$custom_date_key <- ifelse(date_data$month_day == date_data$month_end_date, 'month_end', date_data$custom_date_key) 159 | date_data$month_end_date <- NULL 160 | names(date_data)[names(date_data)=='date'] <- 'application_date' 161 | date_data$application_date <- as.character(date_data$application_date) 162 | 163 | date_data$gazetted_holiday[is.na(date_data$gazetted_holiday)] <- 0 164 | date_data$local[is.na(date_data$local)] <- 0 165 | date_data$observance[is.na(date_data$observance)] <- 0 166 | date_data$restricted_holiday[is.na(date_data$restricted_holiday)] <- 0 167 | date_data$season[is.na(date_data$season)] <- 0 168 | 169 | sort(colSums(is.na(date_data))) 170 | ############################################################## 171 | 172 | head(date_data) 173 | date_seg_2_master <- seg_2_master_data_set %>% dplyr::group_by(application_date,segment,data_type) %>% dplyr::summarise(case_count = sum(case_count, na.rm = T)) 174 | date_seg_2_master <- left_join(date_seg_2_master, date_data) 175 | 176 | date_seg_master <- date_seg_2_master 177 | sort(colSums(is.na(date_seg_master))) 178 | 179 | 180 | ##################### creating yearly lag feature (custom_date_key level) ############# 181 | names(date_seg_master) 182 | yearly_lag_feature_df <- date_seg_master %>% dplyr::group_by(segment, year, month, custom_date_key) %>% dplyr::summarise(lag_365 = round(mean(case_count))) 183 | yearly_lag_feature_df$year <- yearly_lag_feature_df$year + 1 184 | 185 | date_seg_master <- left_join(date_seg_master, yearly_lag_feature_df) 186 | sort(colSums(is.na(date_seg_master))) 187 | ###################################################################################### 188 | 189 | 190 | names(date_seg_master) 191 | base_feat <- 
c('data_type','segment','month_week_number','weekdays_num',"month" , "quarter", "year","month_day", 192 | "year_week_number","day_of_year", 'case_count') 193 | 194 | holiday_flag_feat <- c("gazetted_holiday", "local", "observance","restricted_holiday", "season") 195 | lag_feat <- c('custom_date_key','lag_365') 196 | 197 | cols_to_select <- unique(c(base_feat, holiday_flag_feat, lag_feat)) 198 | date_seg_master <- date_seg_master[, names(date_seg_master) %in% cols_to_select] 199 | 200 | date_seg_2_master <- date_seg_master[date_seg_master$segment==2, ] 201 | date_seg_2_master$segment <- NULL 202 | date_seg_2_master$custom_date_key <- as.numeric(as.factor(date_seg_2_master$custom_date_key)) 203 | 204 | 205 | 206 | ############## MAPE function ################ 207 | xgb_mape <- function(preds, dtrain){ 208 | labels <- xgboost::getinfo(dtrain, 'label') 209 | err <- sum(abs((as.numeric(labels) - as.numeric(preds))/as.numeric(labels))) 210 | err <- err*100/length(labels) 211 | return(list(metric='MAPE_custom', value=err)) 212 | } 213 | ########################################### 214 | 215 | 216 | ########## Model for segment 2 ################## 217 | # 1. CV train 218 | train_ids <- c('train', 'validation_M1') 219 | X_train <- date_seg_2_master[date_seg_2_master$data_type %in% train_ids, ] 220 | Y_train <- date_seg_2_master$case_count[date_seg_2_master$data_type %in% train_ids] 221 | 222 | X_train$data_type <- NULL 223 | X_train$case_count <- NULL 224 | xgtrain <- xgb.DMatrix(data = as.matrix(X_train), label = Y_train, missing = NA) 225 | 226 | 227 | 228 | # 2. validation 229 | validation_month <- 'validation_M2' 230 | X_test <- date_seg_2_master[date_seg_2_master$data_type %in% validation_month, ] 231 | Y_test <- date_seg_2_master$case_count[date_seg_2_master$data_type %in% validation_month] 232 | 233 | X_test$data_type <- NULL 234 | X_test$case_count <- NULL 235 | 236 | str(X_test) 237 | xgtest <- xgb.DMatrix(data = as.matrix(X_test), label = Y_test, missing = NA) 238 | watchlist <- list(train = xgtrain, valid = xgtest) 239 | 240 | 241 | # 3. OOS 242 | validation_month <- 'validation_M3' 243 | valid_X <- date_seg_2_master[date_seg_2_master$data_type %in% validation_month, ] 244 | valid_Y <- date_seg_2_master$case_count[date_seg_2_master$data_type %in% validation_month] 245 | 246 | valid_X$data_type <- NULL 247 | valid_X$case_count <- NULL 248 | 249 | 250 | # 4. 
final train and test 251 | train <- date_seg_2_master[date_seg_2_master$data_type=='train' | 252 | date_seg_2_master$data_type %like% 'validation', ] 253 | 254 | test <- date_seg_2_master[date_seg_2_master$data_type %like% 'submission', ] 255 | 256 | target <- train$case_count 257 | train$case_count <- NULL 258 | test$case_count <- NULL 259 | 260 | 261 | train$data_type <- NULL 262 | test$data_type <- NULL 263 | 264 | 265 | ####################################### creating the xgb model ######################################## 266 | # model hyperparameters 267 | param = list("objective" = "reg:linear", 268 | # eval_metric = "rmse" , 269 | "eval_metric" = xgb_mape, "nthread" = -1, 270 | "eta" = 0.1, "max_depth" = 10, "subsample" = 0.95, "colsample_bytree" =1, 271 | "gamma" = 0,"min_child_weight" = 1) 272 | 273 | #---------------------------------------- Holdout CV ---------------------------------------------# 274 | set.seed(300) 275 | num_round <- 5000 276 | bst_cv <- xgb.train(param, 277 | data = xgtrain, 278 | watchlist = watchlist, 279 | num_round, 280 | early_stopping_round = 20, 281 | maximize = F, 282 | missing = NA) 283 | 284 | bst_nrounds <- bst_cv$best_iteration # optimal number of iteration 285 | orig_bst_nrounds <- bst_nrounds 286 | 287 | # pred on OOS data 288 | OOS_pred <- predict (bst_cv, as.matrix(valid_X), missing = NA) 289 | mape_oos <- MAPE(OOS_pred, valid_Y)*100 290 | mape_oos 291 | 292 | 293 | # training again and then prediction on validation data 294 | set.seed(500) 295 | bst_model <- xgboost(param=param, 296 | data =as.matrix(rbind(X_train, X_test)), 297 | label = c(Y_train,Y_test), 298 | nrounds=bst_nrounds, 299 | missing=NA) 300 | 301 | # making predicton on test data 302 | OOS_pred_2 = predict(bst_model, as.matrix(valid_X),missing=NA) 303 | mape_oos_2 <- MAPE(OOS_pred_2, valid_Y)*100 304 | mape_oos_2 305 | cat('tr score: ',c(bst_cv$evaluation_log[orig_bst_nrounds])[[2]] , 'cv score: ',bst_cv$best_score,'MAPE1: ',mape_oos, 'MAPE2: ', mape_oos_2) 306 | 307 | 308 | 309 | #--------------------------------- Final Model for Test data ----------------------# 310 | # training the final model 311 | set.seed(500) 312 | bst <- xgboost(param=param, 313 | data =as.matrix(train), 314 | label = target, 315 | nrounds=bst_nrounds/0.8, 316 | missing=NA) 317 | 318 | # making predicton on test data 319 | pred = predict(bst, as.matrix(test),missing=NA) 320 | summary(pred) 321 | summary(target) 322 | sum(pred) 323 | 324 | # segment 2 submission file 325 | seg_2_subm <- submission[submission$segment==2, ] 326 | seg_2_subm$case_count <- pred 327 | sum(seg_2_subm$case_count) 328 | 329 | fwrite(seg_2_subm, './data/final_pred_seg_2.csv') 330 | 331 | 332 | ############## Feature importance plot ########## 333 | names = names(train) 334 | importance_matrix = xgb.importance(names, model=bst) 335 | importance_matrix = xgb.importance(names, model=bst) 336 | gp = xgb.plot.importance(importance_matrix) 337 | print(gp) 338 | ################################################# 339 | 340 | -------------------------------------------------------------------------------- /Rank_1_Team_Creed/code/ltfs_finhack.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import datetime 4 | from datetime import timedelta 5 | from sklearn.linear_model import LinearRegression 6 | from dateutil.relativedelta import relativedelta 7 | 8 | def predict_finhack_data(input_train, input_test, festive_dates): 9 | """ 10 | """ 11 | # time stamp 
conversion 12 | input_train['application_date'] = pd.to_datetime(input_train['application_date'], format = "%Y-%m-%d") 13 | input_test['application_date'] = pd.to_datetime(input_test['application_date'], format = "%Y-%m-%d") 14 | print('Aggregating input done on country level...') 15 | input_train = aggregate_input_data(input_train.copy(deep = True)) 16 | print(' Done!') 17 | final_test = None 18 | for each_segment, each_segment_data in input_train.groupby(['seg']): 19 | print('Analyzing segment', each_segment, '...') 20 | each_segment_test = input_test[input_test['segment'] == each_segment] 21 | each_segment_data = each_segment_data.reset_index(drop = True) 22 | each_segment_test = each_segment_test.reset_index(drop = True) 23 | predicted_result = predict_on_test_data(each_segment, each_segment_data, each_segment_test, festive_dates) 24 | final_test = pd.concat([predicted_result, final_test]) 25 | print(' Done!') 26 | 27 | # final_test = final_test.sort_values() 28 | return final_test 29 | # input_train = compute_festive_variation(each_segment_data, festive_dates) 30 | # ===================================== 31 | def aggregate_input_data(input_train): 32 | """ 33 | """ 34 | new_df = [] 35 | for i, j in input_train.groupby(['application_date','segment']): 36 | add = sum(j['case_count'].tolist()) 37 | temp = {'date': i[0], 'seg': i[1], 'apps': add} 38 | new_df.append(temp) 39 | 40 | new_df = pd.DataFrame(new_df, columns = new_df[0].keys()) 41 | return new_df 42 | # ===================================== 43 | def predict_on_test_data(each_segment, each_segment_data, each_segment_test, festive_dates): 44 | """ 45 | """ 46 | # add temporal values to the data set 47 | # import pdb; pdb.set_trace() 48 | print(' Finding pattern...') 49 | each_segment_data.loc[:, 'weekday'] = each_segment_data.date.dt.weekday.tolist() 50 | each_segment_data.loc[:, 'year'] = each_segment_data.date.dt.year.tolist() 51 | each_segment_data.loc[:, 'day'] = each_segment_data.date.dt.day.tolist() 52 | each_segment_data.loc[:, 'month'] = each_segment_data.date.dt.month.tolist() 53 | 54 | each_segment_test.loc[:, 'weekday'] = each_segment_test.application_date.dt.weekday.tolist() 55 | each_segment_test.loc[:, 'year'] = each_segment_test.application_date.dt.year.tolist() 56 | each_segment_test.loc[:, 'day'] = each_segment_test.application_date.dt.day.tolist() 57 | each_segment_test.loc[:, 'month'] = each_segment_test.application_date.dt.month.tolist() 58 | 59 | # Special temporal values 60 | k = each_segment_data.loc[:, 'year'].astype(str)+"-"+each_segment_data.loc[:, 'month'].astype(str) 61 | kk = each_segment_test.loc[:, 'year'].astype(str)+"-"+each_segment_test.loc[:, 'month'].astype(str) 62 | l = pd.to_datetime(k, format = '%Y-%m')+pd.offsets.MonthEnd(1) 63 | ll = pd.to_datetime(kk, format = '%Y-%m')+pd.offsets.MonthEnd(1) 64 | # Getting last but one day of month 65 | l1 = l-timedelta(days=1) 66 | l1 = list(set(l1.to_list())) 67 | ll1 = ll-timedelta(days =1) 68 | ll1 = list(set(ll1.to_list())) 69 | each_segment_data.loc[:,'special'] = each_segment_data['day'].astype(str) 70 | each_segment_data.loc[:,'bankSpecific'] = ['others']*each_segment_data.shape[0] 71 | each_segment_test.loc[:,'special'] = each_segment_test['day'].astype(str) 72 | each_segment_test.loc[:,'bankSpecific'] =['others']*each_segment_test.shape[0] 73 | if each_segment == 2: 74 | each_segment_data.loc[each_segment_data['day'] <= 10, 'bankSpecific'] = 'others1' 75 | each_segment_data.loc[each_segment_data['day'] > 10, 'bankSpecific'] = 'others2' 76 | 
each_segment_test.loc[each_segment_test['day'] <= 10, 'bankSpecific'] = 'others1' 77 | each_segment_test.loc[each_segment_test['day'] > 10, 'bankSpecific'] = 'others2' 78 | 79 | each_segment_data.loc[each_segment_data['day'] == 1, 'special'] = 'startOfMonth' 80 | each_segment_data.loc[each_segment_data['day'] == 15, 'special'] = 'midOfMonth' 81 | each_segment_data.loc[each_segment_data['date'].isin(l), 'special'] = 'endOfMonth' 82 | each_segment_data.loc[each_segment_data['date'].isin(l1), 'special'] = 'endOfMonth1' 83 | 84 | each_segment_test.loc[each_segment_test['day'] == 1, 'special'] = 'startOfMonth' 85 | each_segment_test.loc[each_segment_test['day'] == 15, 'special'] = 'midOfMonth' 86 | each_segment_test.loc[each_segment_test['application_date'].isin(ll), 'special'] = 'endOfMonth' 87 | each_segment_test.loc[each_segment_test['application_date'].isin(ll1), 'special'] = 'endOfMonth1' 88 | 89 | pattern_matrix = None 90 | # predcition part 91 | data_x = each_segment_data.copy(deep = True) 92 | # Considering behavioural chagne 93 | data_x = data_x[data_x['year'] == 2019] 94 | data_x = data_x[data_x['month'] > 3] 95 | if each_segment == 2: 96 | # data have two clusters dividing the clusters() 97 | part_1 = data_x[data_x['bankSpecific'] == 'others1'] 98 | part_2 = data_x[data_x['bankSpecific'] == 'others2'] 99 | in_data = {'others2': part_2, 'others1': part_1} 100 | if each_segment == 1: 101 | part_1 = data_x.copy(deep = True) 102 | in_data = {'others': part_1} 103 | for part, each_part in in_data.items(): 104 | each_part_data = each_part.copy(deep = True) 105 | # weekay 106 | unique_wdays = list(set(each_part_data.weekday.tolist())) 107 | wday_dict = [] 108 | sub_in_data = each_part_data[each_part_data['special'] != 'endOfMonth'] 109 | sub_in_data = sub_in_data[sub_in_data['special'] != 'endOfMonth1'] 110 | if part == 'others': 111 | sub_in_data = sub_in_data[sub_in_data['special'] != 'startOfMonth'] 112 | sub_in_data = sub_in_data[sub_in_data['special'] != 'midOfMonth'] 113 | seg = str(each_segment) 114 | key = part 115 | # =============================================================================== 116 | for ii in unique_wdays: 117 | # print('======') 118 | values = sub_in_data.loc[sub_in_data['weekday'] == ii, 'apps'] 119 | if len(values) == 1: 120 | avg = values.values[0] 121 | rep_value = avg 122 | std = 0 123 | else: 124 | vals = values.tolist() 125 | avg = np.percentile(vals, 75) 126 | rep_value = linear_regression(vals) 127 | avgg = np.mean(vals) 128 | if pd.isnull(avg): 129 | avg = vals[0] 130 | std = values.std() 131 | variation = std/avgg 132 | m_m_variation = max(values.values) - min(values.values) 133 | temp = {'pattern_value': ii, 'avg':avg, 'std': std, 'variation': variation, 'min_max_diff': m_m_variation, 134 | 'pattern':'DOW', 'data': seg+'_'+key ,'rep_value': rep_value} 135 | wday_dict.append(temp) 136 | wday_dict = pd.DataFrame(wday_dict, columns = wday_dict[0].keys()) 137 | # ================================================================================= 138 | # special 139 | unique_sdays = list(set(each_part_data.special.tolist())) 140 | sub_in_data = each_part_data[each_part_data['weekday'] != 6] 141 | sday_dict = [] 142 | for iii in unique_sdays: 143 | values = sub_in_data.loc[sub_in_data['special'] == iii, 'apps'] 144 | if len(values) == 1: 145 | avg = values.values[0] 146 | rep_value = avg 147 | std = 0 148 | else: 149 | vals = values.tolist() 150 | if len(vals) > 2: 151 | vals = vals[-2:] 152 | rep_value = linear_regression(vals) 153 | # print(iii, 
228 | def identify_best_k_for_a_measure(values, measure, CONST_K_RANGE = np.linspace(-2,2,41),
229 |                                   CONST_QUANTILE_RANGE = np.linspace(30, 80, 6)):
230 |     """
231 |     Identify the best k for the given measure (mean, median or quantile)
232 |     Input: values, measure, CONST_K_RANGE
233 |     Output: best k, best_value
234 |     """
235 |     absolute_distance_one = np.array([])
236 |     absolute_distances = np.array([])
237 |     all_representative_values = np.array([])
238 |     if (measure == "mean"):
239 |         avg_val = np.mean(values)
240 |         sd_val = np.std(values)
241 |         all_representative_values = (avg_val + sd_val * CONST_K_RANGE)
242 |     if (measure == "median"):
243 |         median_val = np.percentile(values, 50)
244 |         mad_val = np.percentile(np.abs(values - np.percentile(values, 50)), 50)
245 |         all_representative_values = (median_val + mad_val * CONST_K_RANGE)
246 |     if (measure == "quantile"):
247 |         all_representative_values = np.percentile(values, CONST_QUANTILE_RANGE)
248 |     for x in all_representative_values:
249 |         absolute_distance_one = np.sum(np.absolute(values - x))
250 |         absolute_distances = np.append(absolute_distances, absolute_distance_one)
251 |     best_value = min(absolute_distances)
252 |     best_measure_index = np.argmin(absolute_distances)
253 |     if measure == "mean":
254 |         best_k = CONST_K_RANGE[(best_measure_index)]
255 |     if measure == "median":
256 |         best_k = CONST_K_RANGE[(best_measure_index)]
257 |     if measure == "quantile":
258 |         best_k = CONST_QUANTILE_RANGE[(best_measure_index)]
259 |
260 |     return list([best_k, best_value])
261 |
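To make the grid search above concrete, here is a small check (illustrative only; it assumes identify_best_k_for_a_measure from this module is in scope). For the 'median' measure the candidates are median + k * MAD, and the L1-optimal candidate for the toy series below is the median itself, so the returned k should be approximately zero.

import numpy as np

values = np.array([10, 12, 50])            # median = 12, MAD = 2
best_k, best_dist = identify_best_k_for_a_measure(values, "median")
print(best_k, best_dist)                   # expected: roughly 0.0 and 40.0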
262 | def compute_representative_value(values, CONST_ALL_MEASURES, CONST_K_RANGE = np.linspace(-2,2,41),
263 |                                  CONST_QUANTILE_RANGE = np.linspace(20, 90, 15)):
264 |     """
265 |     # Compute representative value of each dimension value in given timeseries
266 |     # Input: values
267 |     # Output: representative value
268 |     """
269 |     if len(values) == 1:
270 |         return values.values[0]
271 |     best_k = np.array([])
272 |     best_measure = np.array([])
273 |     best_value_mean = np.array([])
274 |     best_value_mad = np.array([])
275 |     best_value_quantile = np.array([])
276 |
277 |     # compute representative value of each dimension value in given timeseries
278 |     # Identify best measure and best k
279 |     for one_measure in CONST_ALL_MEASURES:  # mohit : test by putting only 2 measures in CONST
280 |         # Identify best k for each measure for mean
281 |         if one_measure == "mean":
282 |             (best_k_mean, best_value_mean) = identify_best_k_for_a_measure(values,
283 |                                                                            one_measure,
284 |                                                                            CONST_K_RANGE,
285 |                                                                            CONST_QUANTILE_RANGE)
286 |         # Identify best k for each measure for median
287 |         if one_measure == "median":
288 |             (best_k_mad, best_value_mad) = identify_best_k_for_a_measure(values,
289 |                                                                          one_measure,
290 |                                                                          CONST_K_RANGE,
291 |                                                                          CONST_QUANTILE_RANGE)
292 |         if one_measure == "quantile":
293 |             # Identify best k for each measure for quantile
294 |             (best_k_quantile, best_value_quantile) = identify_best_k_for_a_measure(values,
295 |                                                                                    one_measure,
296 |                                                                                    CONST_K_RANGE,
297 |                                                                                    CONST_QUANTILE_RANGE)
298 |
299 |     # Identify best measure and best_k
300 |     all_measures_values = np.array([best_value_mean, best_value_mad, best_value_quantile])  # stacked in the order mean, median, quantile
301 |     best_of_all_measure_index = np.argmin(all_measures_values)
302 |     best_measure = CONST_ALL_MEASURES[best_of_all_measure_index]
303 |     if best_measure == "mean":
304 |         best_k = best_k_mean
305 |     if best_measure == "median":
306 |         best_k = best_k_mad
307 |     if best_measure == "quantile":
308 |         best_k = best_k_quantile
309 |
310 |     # Compute representative values as per best measure and best k
311 |     # Input: values, best_measure, best_k
312 |     # Output: representative value
313 |     if best_measure == "mean":
314 |         representative_val = np.mean(values) + best_k * np.std(values)
315 |     if best_measure == "median":
316 |         median_val = np.percentile(values, 50)
317 |         mad_val = np.percentile(np.abs(values - np.percentile(values, 50)), 50)
318 |         representative_val = median_val + best_k * mad_val
319 |     if best_measure == "quantile":
320 |         representative_val = np.percentile(values, best_k)
321 |
322 |     return representative_val
323 | # =====================================================================
324 | def linear_regression(n):
325 |     """Fit a linear trend to n, take a representative value of the detrended series, then re-apply the trend."""
326 |
327 |     # print('====', n)
328 |     m = list(range(1,len(n)+1))
329 |     # print(m)
330 |     import numpy as np
331 |     from sklearn.linear_model import LinearRegression
332 |     x = np.array(m).reshape((-1, 1))
333 |     y = np.array(n)
334 |     model = LinearRegression()
335 |     try:
336 |         model.fit(x, y)
337 |     except Exception as e:
338 |         print(e)
339 |         raise(e)
340 |     intercept = model.intercept_
341 |     # print(intercept)
342 |     slope = model.coef_[0]
343 |     # detrend values to get the representative value
344 |     values = np.array(n)-(np.array(m)*slope)
345 |     try:
346 |         rep_value = compute_representative_value(values, ['mean', 'quantile', 'median'])
347 |     except Exception as e:
348 |         print(e)
349 |         raise(e)
350 |     # Add trend to the representative value
351 |     rep_value = rep_value+(np.mean(m)*slope)
352 |     return rep_value
353 | # =====================================================================
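As a sanity check of the detrend-and-reattach logic in linear_regression (illustrative only, assuming the helpers above are in scope): for a perfectly linear series the detrended values are constant, so the result should be the trend value at the middle of the window.

# slope = 10, detrended series = [90, 90, 90, 90], representative value = 90,
# re-adding mean(position) * slope = 2.5 * 10 gives ~115.
print(linear_regression([100, 110, 120, 130]))   # expected: approximately 115.0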
354 | def compute_festival_variation(arg_input_ts, arg_input_festival_list, input_temporal_pattern):
355 |     CONST_MONTHS_UNDER_CONSIDERATION = 3
356 |     # Add day of week and day of month:
357 |     arg_input_ts['dow'] = [x.weekday() for x in arg_input_ts['timestamp']]
358 |     arg_input_ts['dom'] = [x.day for x in arg_input_ts['timestamp']]
359 |     arg_input_ts['festival'] = None
360 |     arg_input_ts['variationPercentage'] = None
361 |     festival_list_grouped = arg_input_festival_list.groupby(['festival'])
362 |     DAYS_TO_LOOK = CONST_MONTHS_UNDER_CONSIDERATION*31
363 |     # --------------------------------------------------
364 |     for one_festival in festival_list_grouped:
365 |         # Prepare a subset of each festival
366 |         one_festival_name = one_festival[0]
367 |         one_festival_subset = one_festival[1]
368 |
369 |         one_festival_previous_values_subset = pd.DataFrame()
370 |         one_festival_unique_dates = one_festival_subset['date'].unique()
371 |
372 |         # Find data of the previous CONST_MONTHS_UNDER_CONSIDERATION months for each festival date:
373 |         for one_festival_one_date in one_festival_subset.groupby(['date']):
374 |             one_festival_date = one_festival_one_date[0]
375 |
376 |             # Find the day of month and day of week of this festival date:
377 |             one_date_value = one_festival_date.day
378 |             one_dow_value = one_festival_date.weekday()
379 |
380 |             # Find date range:
381 |             one_date_months_back = one_festival_date - relativedelta(days=DAYS_TO_LOOK)
382 |
383 |             # Subset the timeseries to the window between the lower date limit and the festival date:
384 |             one_date_value_subset = arg_input_ts[(arg_input_ts['timestamp'] >= one_date_months_back) &
385 |                                                  (arg_input_ts['timestamp']<= one_festival_date)]
386 |
387 |             if one_date_value_subset.shape[0] == 0:
388 |                 continue
389 |
390 |             # Prepare a subset of values corresponding to one date value:
391 |             if input_temporal_pattern == "dom":
392 |                 one_date_value_subset = one_date_value_subset[one_date_value_subset['dom'] == one_date_value]
393 |
394 |             if input_temporal_pattern == "dow":
395 |                 one_date_value_subset = one_date_value_subset[one_date_value_subset['dow'] == one_dow_value]
396 |
397 |             if one_date_value_subset.shape[0]==0:
398 |                 continue
399 |
400 |             # Mark the festival date under consideration:
401 |             one_date_value_subset.loc[one_date_value_subset['timestamp'] == one_festival_date, 'festival'] = one_festival_name
402 |
403 |             # Check if festival date has been marked or not
404 |             # If one festival date is not present, continue
405 |             check_festival_mark = one_date_value_subset[one_date_value_subset['festival'] == one_festival_name]
406 |             if check_festival_mark.shape[0] == 0:
407 |                 continue
408 |
409 |             # Pick out the value observed on the festival date itself
410 |             one_festival_one_date_metric_value = one_date_value_subset.loc[
411 |                 one_date_value_subset['festival'] == one_festival_name, 'values'].values[0]
412 |
413 |
414 |             # Find the difference between values of previous months and the festival date value
415 |             one_date_value_subset['absoluteVariation'] = one_date_value_subset['values'] - one_festival_one_date_metric_value
416 |             one_date_value_subset['denominator'] = one_date_value_subset['values']
417 |
418 |             # Accumulate this window; the festival's own row is separated out below
419 |             one_festival_previous_values_subset = one_festival_previous_values_subset.append(one_date_value_subset)
420 |
421 |         # Compute variation percentage:
422 |
423 |         if one_festival_previous_values_subset.shape[0] == 0:
424 |             continue
425 |
426 |         one_festival_all_values_subset = one_festival_previous_values_subset[one_festival_previous_values_subset['festival'] == one_festival_name]
427 |         # Create a separate subset with current festival values removed
428 |         one_festival_values_removed_subset = one_festival_previous_values_subset[one_festival_previous_values_subset['festival'] != one_festival_name]
429 |
430 |         one_festival_sum_abs_variation = np.sum(one_festival_values_removed_subset['absoluteVariation'])
431 |         one_festival_sum_denom = np.sum(one_festival_values_removed_subset['denominator'])
432 |         one_festival_percentageVariation = one_festival_sum_abs_variation/one_festival_sum_denom
433 |
434 |         one_festival_input_data_index = one_festival_all_values_subset.index
435 |
436 |         # Update one festival name and variation percentage in the list
437 |         arg_input_ts.loc[one_festival_input_data_index,'variationPercentage'] = one_festival_percentageVariation
438 |         arg_input_ts.loc[one_festival_input_data_index,'festival'] = one_festival_name
439 |
440 |     # ----------------------------------------------
441 |     columns_of_interest = ['timestamp', 'values', 'festival', 'variationPercentage']
442 |     arg_input_ts = arg_input_ts[columns_of_interest]
443 |     return arg_input_ts
444 | # ==========================================================================
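compute_festival_variation expects the timeseries with 'timestamp' and 'values' columns and the festival list with 'festival' and 'date' columns. The call below is an illustrative sketch only (constant values, a single placeholder festival date) and assumes the environment matches the original code: a pandas version that still provides DataFrame.append, plus the module's own imports such as relativedelta.

import pandas as pd

ts = pd.DataFrame({'timestamp': pd.date_range('2018-08-01', '2018-11-30', freq='D'),
                   'values': 100.0})
festivals = pd.DataFrame({'festival': ['Diwali'],
                          'date': [pd.Timestamp('2018-11-07')]})
marked = compute_festival_variation(ts, festivals, 'dow')
print(marked[marked['festival'].notnull()])   # the festival row; variationPercentage is 0.0 for a flat series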
445 | def adjust_festive_dates(each_segment_data, predicted_test, each_segment, festive_dates):
446 |     """Estimate each festival's historical impact and adjust the forecast on upcoming festival dates."""
447 |
448 |     input_data = each_segment_data.copy(deep = True)
449 |     input_data = input_data[['date', 'apps']]
450 |     input_data.columns = ['timestamp', 'values']
451 |     festive_dates.loc[:, 'date'] = pd.to_datetime(festive_dates['date'], format = "%m/%d/%Y")
452 |     arg_input_festival_list = festive_dates[festive_dates['future_flag'] == False]
453 |     arg_input_festival_list = arg_input_festival_list[['festival', 'date']]
454 |     # arg_input_festival_list['date'] = pd.to_datetime(arg_input_festival_list['date'], format = CONST_INPUT_DATE_FORMAT, errors = 'coerce')
455 |     input_temporal_pattern = "dom"
456 |     if each_segment == 1:
457 |         input_temporal_pattern = "dow"
458 |
459 |     # import pdb; pdb.set_trace()
460 |     try:
461 |         print(' Computing festival significance...')
462 |         return_data = compute_festival_variation(input_data, arg_input_festival_list, input_temporal_pattern)
463 |         print(' Done!')
464 |         # import pdb; pdb.set_trace()
465 |     except Exception as e:
466 |         print(e)
467 |         raise(e)
468 |     # print('---1')
469 |     # return_data = return_data.dropna()
470 |     print(' Applying festival significance...')
471 |     return_data = return_data.reset_index(drop =True)
472 |     return_data['year'] = return_data.timestamp.dt.year.tolist()
473 |     return_data = return_data[return_data['year'] == 2018]
474 |     return_data = return_data.reset_index(drop =True)
475 |     return_data = return_data.reset_index(drop = True)
476 |     return_data = return_data[['festival', 'variationPercentage', 'values']]
477 |     festive_dates_future = festive_dates[festive_dates['future_flag'] == True]
478 |     predicted_test['festive'] = ['']*predicted_test.shape[0]
479 |     for i, j in festive_dates_future.iterrows():
480 |         # import pdb; pdb.set_trace()
481 |         kkk = j['date']
482 |         kk = []
483 |         kk.append(kkk)
484 |         predicted_test.loc[predicted_test['application_date'].isin(kk),'festive'] = j['festival']
485 |
486 |     for key, val in predicted_test.iterrows():
487 |
488 |         if val['festive'] == '':
489 |             continue
490 |
491 |         # import pdb; pdb.set_trace()
492 |         flag = festive_dates_future[festive_dates_future['festival'] == val['festive']]
493 |         if flag.shape[0] == 0:
494 |             continue
495 |
496 |         flag = flag['date_change_flag'].values[0]
497 |         if flag == False:
498 |             new_value = return_data.loc[return_data['festival'] == val['festive'], 'values'].tolist()[0]
499 |         else:
500 |             current_value = val['case_count']
501 |             diff = return_data.loc[return_data['festival'] == val['festive'], 'variationPercentage'].tolist()[0]
502 |             new_value = round(current_value*(1-diff), 0)
503 |             # print(val['festive'], 'is adjusted from', current_value,new_value)
504 |         if pd.isnull(new_value):
505 |             continue
506 |         predicted_test.loc[predicted_test['id'] == val['id'], 'case_count'] = new_value
507 |
508 |     predicted_test = predicted_test.drop(['festive'], axis = 1)
509 |     print(' Done!')
510 |     return predicted_test
511 |
512 |
513 |
514 |
--------------------------------------------------------------------------------
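adjust_festive_dates reads four columns from the festival file: 'festival', 'date' (in %m/%d/%Y format), 'future_flag' and 'date_change_flag'. The frame below is only an illustrative sketch of that structure; the rows are placeholders, not data shipped with the solution.

import pandas as pd

# future_flag=False rows are historical festival dates used to measure the variation;
# future_flag=True rows are upcoming dates whose forecast gets adjusted.
# date_change_flag=False reuses the 2018 value directly; otherwise the forecast is
# scaled by (1 - variationPercentage).
festive_dates = pd.DataFrame({'festival': ['Diwali', 'Diwali'],
                              'date': ['11/07/2018', '10/27/2019'],
                              'future_flag': [False, True],
                              'date_change_flag': [False, False]})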