├── readme.md ├── batch.sh ├── r5_vwrun.sh ├── r0_itemstore.py ├── r1_baseline.r ├── holiday_names.txt ├── r3c_features.py ├── r3a_rollingmean.py ├── r6_submission.py ├── r4_vwtxt_creator.py ├── holidays.txt ├── r2_preprocess.py └── r3b_zeros.py /readme.md: -------------------------------------------------------------------------------- 1 | Code for the Kaggle competition 2 | https://www.kaggle.com/c/walmart-recruiting-sales-in-stormy-weather 3 | 4 | ### Requirements 5 | 6 | R, Python, Vowpal Wabbit, and the data from Kaggle 7 | 8 | ### Setup 9 | 10 | Put the code files in a working directory. 11 | Put the data from Kaggle in the data/ directory. 12 | 13 | ### Usage 14 | 15 | Run `source batch.sh` -------------------------------------------------------------------------------- /batch.sh: -------------------------------------------------------------------------------- 1 | mkdir model 2 | mkdir submission 3 | ipython r0_itemstore.py 4 | R --vanilla < r1_baseline.r 5 | ipython r2_preprocess.py 6 | ipython r3a_rollingmean.py 7 | ipython r3b_zeros.py 8 | ipython r3c_features.py 9 | ipython r4_vwtxt_creator.py 10 | source r5_vwrun.sh 11 | ipython r6_submission.py 12 | #sort submission/p.csv > submission/sortp.csv 13 | #diff submission/sortp.csv answer/sortp.csv > temp.txt -------------------------------------------------------------------------------- /r5_vwrun.sh: -------------------------------------------------------------------------------- 1 | 2 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib 3 | export PATH=/usr/local/bin:$PATH 4 | 5 | cd model 6 | 7 | # linear regression with Vowpal Wabbit 8 | # caveat: unexpected features might be used in the test-data prediction, because I forgot to add --ignore to the test prediction command. 9 | 10 | vw -d vwdata.vwtxt -c -k -P 1000000 --passes 650 -q AB -q AC -q BM -q CM -q BK -q CK --ignore F --ignore I -f vwdata.vwmdl --l1 0.0000001 11 | vw -d vwdata.vwtxt -t -i vwdata.vwmdl --invert_hash vwdata.vwih 12 | 13 | vw -d vwdata_test.vwtxt -t -i vwdata.vwmdl -p vwdata.predict.txt 14 | 15 | cd .. 
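# Gloss of the vw flags above, based on the namespaces written by r4_vwtxt_creator.py:
#   -q AB -q AC -q BM -q CM -q BK -q CK  build quadratic interaction features between namespaces,
#     where A = weekday/weekend/holiday flags, B = item_nbr, C = store_nbr,
#     M = day/month/year and K = the around-Black-Friday marker.
#   --ignore F --ignore I  drop the holiday-name namespace (F) and the bookkeeping namespace
#     (I: id, include_prediction flag, baseline) from the learned model.
#   -c -k --passes 650  rebuild the example cache and run many online passes; --l1 1e-7 adds mild
#     L1 regularization, and -P 1000000 only controls how often progress is printed.
#   The second vw call re-reads the training data in test-only mode (-t) just to dump readable
#   weights via --invert_hash; the third call scores the test file and writes vwdata.predict.txt.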
16 | -------------------------------------------------------------------------------- /r0_itemstore.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | def create_vaild_item_store_combinations(_df): 5 | df = _df.copy() 6 | df['log1p'] = np.log(df['units'] + 1) 7 | 8 | g = df.groupby(["store_nbr", "item_nbr"])['log1p'].mean() 9 | g = g[g > 0.0] 10 | 11 | store_nbrs = g.index.get_level_values(0) 12 | item_nbrs = g.index.get_level_values(1) 13 | 14 | store_item_nbrs = sorted(zip(store_nbrs, item_nbrs), key = lambda t: t[1] * 10000 + t[0] ) 15 | 16 | with open(store_item_nbrs_path, 'wb') as f: 17 | f.write("store_nbr,item_nbr\n") 18 | for sno, ino in store_item_nbrs: 19 | f.write("{},{}\n".format(sno, ino)) 20 | 21 | store_item_nbrs_path = 'model/store_item_nbrs.csv' 22 | df_train = pd.read_csv("data/train.csv") 23 | create_vaild_item_store_combinations(df_train) 24 | -------------------------------------------------------------------------------- /r1_baseline.r: -------------------------------------------------------------------------------- 1 | # read files 2 | df <- read.table("data/train.csv", sep=',', header=T) 3 | store_item_nbrs <- read.table("model/store_item_nbrs.csv", sep=',', header=T) 4 | 5 | # calculate log1p 6 | df$log1p <- log1p(df$units) 7 | 8 | # calculate days from 2012-01-01 9 | origin <- as.integer(floor(julian(as.POSIXlt('2012-01-01')))) 10 | df$date2j <- as.integer(floor(julian((as.POSIXlt(df$date))))) - origin 11 | 12 | # exclude 2013-12-25 13 | date_excl <- as.integer(floor(julian(as.POSIXlt('2013-12-25')))) - origin 14 | df <- df[df$date2j != date_excl, ] 15 | 16 | # for each item_nbr/store_nbrs, fitting by ppr function 17 | df_fitted <- data.frame(date2j=c(), sno=c(), ino=c()) 18 | 19 | rng <- 1:nrow(store_item_nbrs) 20 | 21 | for (i in rng) { 22 | ino <- store_item_nbrs[i, "item_nbr"] 23 | sno <- store_item_nbrs[i, "store_nbr"] 24 | df0 <- subset(df, store_nbr == sno & item_nbr == ino) 25 | df0.ppr <- ppr(log1p ~ date2j, data = df0, nterms=3, max.terms=5) 26 | 27 | df1 <- data.frame(date2j=0:1034, store_nbr=sno, item_nbr=ino) 28 | df1$ppr_fitted <- predict(df0.ppr, df1) 29 | 30 | #plot(df0$date2j, df0$log1p, main=paste(c("result", ino, sno))) 31 | #lines(newdf$date2j, newdf$gampred, col="red") 32 | #lines(newdf$date2j, newdf$pprpred, col="blue") 33 | 34 | df_fitted <- rbind(df_fitted, df1) 35 | } 36 | 37 | write.table(df_fitted, "model/baseline.csv", quote=F, col.names=T, append=F, sep=",", row.names=F) 38 | 39 | cat("curve fitting finished") 40 | 41 | q("no") -------------------------------------------------------------------------------- /holiday_names.txt: -------------------------------------------------------------------------------- 1 | 2012 Jan 1 NewYearsDay 2 | 2012 Jan 16 MartinLutherKingDay 3 | 2012 Feb 14 ValentinesDay 4 | 2012 Feb 20 PresidentsDay 5 | 2012 Apr 8 EasterSunday 6 | 2012 May 13 MothersDay 7 | 2012 May 28 MemorialDay 8 | 2012 Jun 17 FathersDay 9 | 2012 Jul 4 IndependenceDay 10 | 2012 Sep 3 LaborDay 11 | 2012 Oct 8 ColumbusDay 12 | 2012 Oct 31 Halloween 13 | 2012 Nov 11 VeteransDay 14 | 2012 Nov 20 BlackFridayM3 15 | 2012 Nov 21 BlackFridayM2 16 | 2012 Nov 22 ThanksgivingDay 17 | 2012 Nov 23 BlackFriday 18 | 2012 Nov 24 BlackFriday1 19 | 2012 Nov 25 BlackFriday2 20 | 2012 Nov 26 BlackFriday3 21 | 2012 Dec 24 ChristmasEve 22 | 2012 Dec 25 ChristmasDay 23 | 2012 Dec 31 NewYearsEve 24 | 2013 Jan 1 NewYearsDay 25 | 2013 Jan 21 MartinLutherKingDay 26 | 2013 Feb 
14 ValentinesDay 27 | 2013 Feb 18 PresidentsDay 28 | 2013 Mar 31 EasterSunday 29 | 2013 May 12 MothersDay 30 | 2013 May 27 MemorialDay 31 | 2013 Jun 16 FathersDay 32 | 2013 Jul 4 IndependenceDay 33 | 2013 Sep 2 LaborDay 34 | 2013 Oct 14 ColumbusDay 35 | 2013 Oct 31 Halloween 36 | 2013 Nov 11 VeteransDay 37 | 2013 Nov 26 BlackFridayM3 38 | 2013 Nov 27 BlackFridayM2 39 | 2013 Nov 28 ThanksgivingDay 40 | 2013 Nov 29 BlackFriday 41 | 2013 Nov 30 BlackFriday1 42 | 2013 Dec 1 BlackFriday2 43 | 2013 Dec 2 BlackFriday3 44 | 2013 Dec 24 ChristmasEve 45 | 2013 Dec 25 ChristmasDay 46 | 2013 Dec 31 NewYearsEve 47 | 2014 Jan 1 NewYearsDay 48 | 2014 Jan 20 MartinLutherKingDay 49 | 2014 Feb 14 ValentinesDay 50 | 2014 Feb 17 PresidentsDay 51 | 2014 Apr 20 EasterSunday 52 | 2014 May 11 MothersDay 53 | 2014 May 26 MemorialDay 54 | 2014 Jun 15 FathersDay 55 | 2014 Jul 4 IndependenceDay 56 | 2014 Sep 1 LaborDay 57 | 2014 Oct 13 ColumbusDay 58 | 2014 Oct 31 Halloween -------------------------------------------------------------------------------- /r3c_features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pickle 4 | 5 | def create_features(): 6 | 7 | dfs = [] 8 | 9 | for sno, ino in store_items: 10 | 11 | alldates = pd.date_range('2012-01-01', '2014-10-31', freq='D') 12 | alldates.name = 'date2' 13 | df = pd.DataFrame(None, index = alldates) 14 | df = df.reset_index() 15 | df['store_nbr'] = sno 16 | df['item_nbr'] = ino 17 | df['date2j'] = (df.date2 - pd.to_datetime("2012-01-01")).dt.days 18 | 19 | df = df.merge(df_baseline[['item_nbr', 'store_nbr', 'date2j', 'ppr_fitted']], 20 | how = 'left', 21 | on = ['item_nbr', 'store_nbr', 'date2j']) 22 | 23 | df = df.merge(df_rollingmean[['item_nbr', 'store_nbr', 'date2', 'rmean', 'include1', 'include2']], 24 | how = 'left', 25 | on = ['item_nbr', 'store_nbr', 'date2']) 26 | 27 | df = df.merge(df_zeros[['item_nbr', 'store_nbr', 'date2', 'include_zeros']], 28 | how = 'left', 29 | on = ['item_nbr', 'store_nbr', 'date2']) 30 | 31 | df['include3'] = (df.include2 & df.include_zeros) 32 | 33 | df = df.reset_index(drop = True) 34 | dfs.append(df) 35 | 36 | return pd.concat(dfs, ignore_index=True) 37 | 38 | df_baseline = pd.read_csv("model/baseline.csv", sep=",") 39 | df_rollingmean = pd.read_pickle('model/df_rollingmean.pkl') 40 | df_zeros = pd.read_pickle("model/df_zeros.pkl") 41 | 42 | store_item_nbrs_path = 'model/store_item_nbrs.csv' 43 | store_item_nbrs = pd.read_csv(store_item_nbrs_path) 44 | store_items = zip(store_item_nbrs.store_nbr, store_item_nbrs.item_nbr) 45 | 46 | df_features = create_features() 47 | df_features.to_pickle('model/df_features.pkl') -------------------------------------------------------------------------------- /r3a_rollingmean.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pickle 4 | 5 | def create_rollingmean(_df): 6 | 7 | dfs = [] 8 | 9 | for sno, ino in store_items: 10 | 11 | exclude_date = pd.to_datetime("2013-12-25") 12 | 13 | df = _df[(_df.store_nbr == sno) & (_df.item_nbr == ino)].copy() 14 | df = df.set_index('date2', drop=False) 15 | df = df.sort_index() 16 | df = df[df.date2 != exclude_date] # exclude 2013-12-25 17 | 18 | # calculate rolling mean 19 | window = 21 20 | df['rmean'] = pd.rolling_mean(df.log1p, window, center=True) 21 | df['rmean'] = df['rmean'].interpolate() 22 | df['rmean'] = df['rmean'].ffill() 23 | df['rmean'] = df['rmean'].bfill() 
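        # (pd.rolling_mean is the legacy pre-0.18 pandas API; on a current pandas the equivalent
        # call would be df.log1p.rolling(window, center=True).mean().)
        # The interpolate/ffill/bfill chain above fills the dates at the start and end of each
        # series that the centered 21-day window leaves as NaN, so every observed date keeps a
        # smoothed level.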
24 | 25 | # alldates 26 | alldates = pd.date_range('2012-01-01', '2014-10-31', freq='D') 27 | alldates.name = 'date2' 28 | df2 = pd.DataFrame(None, index = alldates) 29 | 30 | df2['store_nbr'] = sno 31 | df2['item_nbr'] = ino 32 | 33 | df2['log1p'] = df.log1p 34 | df2['rmean'] = df.rmean 35 | df2['rmean'] = df2['rmean'].interpolate() 36 | df2['rmean'] = df2['rmean'].ffill() 37 | df2['rmean'] = df2['rmean'].bfill() 38 | df2 = df2.reset_index() 39 | 40 | EPS = 0.000001 41 | df2['include1'] = (df2.rmean > EPS) 42 | # exclude 2013-12-25 43 | df2['include2'] = df2['include1'] & (df2.date2 != exclude_date) 44 | 45 | dfs.append(df2) 46 | 47 | return pd.concat(dfs, ignore_index=True) 48 | 49 | df_train = pd.read_pickle("model/train2.pkl") 50 | 51 | store_item_nbrs_path = 'model/store_item_nbrs.csv' 52 | store_item_nbrs = pd.read_csv(store_item_nbrs_path) 53 | store_items = zip(store_item_nbrs.store_nbr, store_item_nbrs.item_nbr) 54 | 55 | df_rollingmean = create_rollingmean(df_train) 56 | df_rollingmean.to_pickle('model/df_rollingmean.pkl') -------------------------------------------------------------------------------- /r6_submission.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pickle 4 | 5 | class SubmissionCreator(object): 6 | 7 | def create_id(self, row): 8 | date = row["date"] 9 | sno = row["store_nbr"] 10 | ino = row["item_nbr"] 11 | id = "{}_{}_{}".format(sno, ino, date) 12 | return id 13 | 14 | def create_id2(self, row): 15 | date = row["date"] 16 | s_no = row["store_nbr"] 17 | i_no = row["item_nbr"] 18 | id = str(i_no) + "_" + str(s_no) + "_" + date[0:4] + date[5:7] + date[8:10] 19 | return id 20 | 21 | def create_prediction_dict(self, fname_test, fname_p): 22 | d = dict() 23 | 24 | f_test = open(fname_test) 25 | f_p = open(fname_p) 26 | lines_test = f_test.readlines() 27 | lines_p = f_p.readlines() 28 | 29 | for line_test, line_p in zip(lines_test, lines_p): 30 | p_from_baseline = float(line_p.strip()) 31 | 32 | I = line_test.strip().split("|")[-1] 33 | id2 = I.split(" ")[2] 34 | notsold = I.split(" ")[4] 35 | baseline = float(I.split(" ")[-1]) 36 | 37 | if notsold == "True": 38 | pred = p_from_baseline + baseline 39 | else: 40 | pred = 0.0 41 | 42 | d[id2] = np.max([pred, 0.0]) 43 | 44 | return d 45 | 46 | def create_submission(self, df_test, fname_submission): 47 | df = df_test 48 | 49 | fw = open(fname_submission, "w") 50 | fw.write("id,units\n") 51 | 52 | for index, row in df.iterrows(): 53 | id = self.create_id(row) 54 | id2 = self.create_id2(row) 55 | if prediction_dict.has_key(id2): 56 | log1p = prediction_dict[id2] 57 | else: 58 | log1p = 0.0 59 | units = np.exp(log1p) - 1 60 | fw.write("{},{}\n".format(id, units)) 61 | 62 | fw.close() 63 | print "finished {}".format(fname_submission) 64 | 65 | 66 | submission_creator = SubmissionCreator() 67 | df_test = pd.read_csv("data/test.csv") 68 | prediction_dict = submission_creator.create_prediction_dict("model/vwdata_test.vwtxt", "model/vwdata.predict.txt") 69 | submission_creator.create_submission(df_test, "submission/p.csv") 70 | -------------------------------------------------------------------------------- /r4_vwtxt_creator.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import numpy as np 4 | import pickle 5 | 6 | class VWtxtCreator(object): 7 | 8 | def write_train(self, _df_train, fname): 9 | df = self.create_df_vw(_df_train, True) 10 | self.write_txt(df, fname) 11 
| return df 12 | 13 | def write_test(self, _df_test, fname): 14 | df = self.create_df_vw(_df_test, False) 15 | self.write_txt(df, fname) 16 | return df 17 | 18 | def create_df_vw(self, _df, is_train): 19 | 20 | df = _df.copy() 21 | df['datestr'] = map(lambda d:d.strftime('%Y%m%d'), df.date2) # it's slow.. 22 | df['id2'] = ( np.char.array(df.item_nbr) + "_" + np.char.array(df.store_nbr) + "_" + df.datestr ) 23 | 24 | df = df.merge( 25 | df_features[ ['item_nbr', 'store_nbr', 'date2', 26 | 'ppr_fitted', 'rmean', 'include1', 'include2', 'include3'] ], 27 | how = 'left', 28 | on = ['item_nbr', 'store_nbr', 'date2'] 29 | ) 30 | 31 | df['baseline'] = df.ppr_fitted 32 | df['include'] = df.include2 33 | df['include_prediction'] = df.include3 # use for training, but predict as zero 34 | 35 | # set index again 36 | df = df.set_index(_df.index) 37 | 38 | # drop not merged rows 39 | df = df.dropna() 40 | 41 | # set y (only when train) 42 | if is_train: 43 | df['y'] = df.log1p - df.baseline 44 | else: 45 | df['y'] = 0.0 46 | 47 | # exclude dates not effective for linear regression 48 | df = df[df.include] 49 | 50 | return df 51 | 52 | def write_txt(self, _df, fname): 53 | import csv 54 | 55 | f = open( fname, 'wb' ) 56 | wtr = csv.writer(f) 57 | for i, row in _df.iterrows(): 58 | newline = "{}".format(row.y) 59 | newline += (" |A wd{} we:{} hol:{} holwd:{} holwe:{}" 60 | .format(row.weekday, row.is_weekend, 61 | row.is_holiday, row.is_holiday_weekday, row.is_holiday_weekend)) 62 | newline += " |B ino{}".format(int(row.item_nbr)) 63 | newline += " |C sno{}".format(int(row.store_nbr)) 64 | newline += " |D date{}".format(int(row.datestr)) 65 | newline += " |F {}".format(row.holiday_name) 66 | newline += " |K {}".format(row.around_BlackFriday) 67 | newline += " |M day{} month{} year{}".format(row.day, row.month, row.year) 68 | newline += " |W isRS:{} departF:{}".format(row.preciptotal_flag, row.depart_flag) 69 | newline += " |I id {} avl4 {} rmean {}".format(row.id2, row.include_prediction, row.baseline) 70 | wtr.writerow( [newline] ) 71 | 72 | f.close() 73 | 74 | vwtxt_creator = VWtxtCreator() 75 | 76 | df_features = pd.read_pickle("model/df_features.pkl") 77 | df_train = pd.read_pickle('model/train2.pkl') 78 | vwtxt_creator.write_train(df_train, "model/vwdata.vwtxt") 79 | print "finished write train" 80 | 81 | df_test = pd.read_pickle('model/test2.pkl') 82 | vwtxt_creator.write_test(df_test, "model/vwdata_test.vwtxt") 83 | print "finished write test" 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /holidays.txt: -------------------------------------------------------------------------------- 1 | 2012 Jan 1 Sunday New Year's Day National holiday 2 | 2012 Jan 2 Monday New Year's Day observed National holiday 3 | 2012 Jan 16 Monday Martin Luther King Day National holiday 4 | 2012 Feb 14 Tuesday Valentine's Day Observance 5 | 2012 Feb 20 Monday Presidents' Day (Washington's Birthday) National holiday 6 | 2012 Apr 8 Sunday Easter Sunday Observance, Christian 7 | 2012 May 13 Sunday Mothers' Day Observance 8 | 2012 May 28 Monday Memorial Day National holiday 9 | 2012 Jun 17 Sunday Fathers' Day Observance 10 | 2012 Jul 4 Wednesday Independence Day National holiday 11 | 2012 Sep 3 Monday Labor Day National holiday 12 | 2012 Oct 8 Monday Columbus Day National holiday All except AK, AR, CA, DE, FL, HI, MI, NV, 2012 OR, SD, TX, WA 13 | 2012 Oct 31 Wednesday Halloween Observance 14 | 2012 Nov 6 Tuesday Election Day Observance 15 | 2012 Nov 11 Sunday Veterans Day 
National holiday 16 | 2012 Nov 12 Monday Veterans Day observed National holiday 17 | 2012 Nov 22 Thursday Thanksgiving Day National holiday 18 | 2012 Dec 24 Monday Christmas Eve Observance, Christian 19 | 2012 Dec 25 Tuesday Christmas Day National holiday, Christian 20 | 2012 Dec 31 Monday New Year's Eve Observance 21 | 2013 Jan 1 Tuesday New Year's Day National holiday 22 | 2013 Jan 21 Monday Martin Luther King Day National holiday 23 | 2013 Feb 14 Thursday Valentine's Day Observance 24 | 2013 Feb 18 Monday Presidents' Day (Washington's Birthday) National holiday 25 | 2013 Mar 31 Sunday Easter Sunday Observance, Christian 26 | 2013 May 12 Sunday Mothers' Day Observance 27 | 2013 May 27 Monday Memorial Day National holiday 28 | 2013 Jun 16 Sunday Fathers' Day Observance 29 | 2013 Jul 4 Thursday Independence Day National holiday 30 | 2013 Sep 2 Monday Labor Day National holiday 31 | 2013 Oct 14 Monday Columbus Day National holiday All except AK, AR, CA, DE, FL, HI, MI, NV, OR, SD, TX, WA 32 | 2013 Oct 31 Thursday Halloween Observance 33 | 2013 Nov 11 Monday Veterans Day National holiday 34 | 2013 Nov 28 Thursday Thanksgiving Day National holiday 35 | 2013 Dec 24 Tuesday Christmas Eve Observance, Christian 36 | 2013 Dec 25 Wednesday Christmas Day National holiday, Christian 37 | 2013 Dec 31 Tuesday New Year's Eve Observance 38 | 2014 Jan 1 Wednesday New Year's Day National holiday 39 | 2014 Jan 20 Monday Martin Luther King Day National holiday 40 | 2014 Feb 14 Friday Valentine's Day Observance 41 | 2014 Feb 17 Monday Presidents' Day (Washington's Birthday) National holiday 42 | 2014 Apr 13 Sunday Thomas Jefferson's Birthday Observance 43 | 2014 Apr 20 Sunday Easter Sunday Observance, Christian 44 | 2014 May 11 Sunday Mothers' Day Observance 45 | 2014 May 26 Monday Memorial Day National holiday 46 | 2014 Jun 15 Sunday Fathers' Day Observance 47 | 2014 Jul 4 Friday Independence Day National holiday 48 | 2014 Sep 1 Monday Labor Day National holiday 49 | 2014 Oct 13 Monday Columbus Day National holiday All except AK, AR, CA, DE, FL, HI, MI, MN, ND, NV, OR, SD, TX, VT, WA, WI, WY 50 | 2014 Oct 31 Friday Halloween Observance 51 | 2014 Nov 11 Tuesday Veterans Day National holiday 52 | 2014 Nov 27 Thursday Thanksgiving Day National holiday 53 | 2014 Dec 24 Wednesday Christmas Eve Observance, Christian 54 | 2014 Dec 25 Thursday Christmas Day National holiday, Christian 55 | 2014 Dec 31 Wednesday New Year's Eve Observance -------------------------------------------------------------------------------- /r2_preprocess.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pickle 4 | import os.path 5 | 6 | def get_holidays(fpath): 7 | # holidays are from http://www.timeanddate.com/holidays/us/ , holidays and some observances 8 | 9 | f = open(fpath) 10 | lines = f.readlines() 11 | lines = [line.split(" ")[:3] for line in lines] 12 | lines = ["{} {} {}".format(line[0], line[1], line[2]) for line in lines] 13 | lines = pd.to_datetime(lines) 14 | return pd.DataFrame({"date2":lines}) 15 | 16 | def get_holiday_names(fpath): 17 | # holiday_names are holidays + around Black Fridays 18 | 19 | f = open(fpath) 20 | lines = f.readlines() 21 | lines = [line.strip().split(" ")[:4] for line in lines] 22 | lines_dt = ["{} {} {}".format(line[0], line[1], line[2]) for line in lines] 23 | lines_dt = pd.to_datetime(lines_dt) 24 | lines_hol = [line[3] for line in lines] 25 | return pd.DataFrame({"date2":lines_dt, 
"holiday_name":lines_hol}) 26 | 27 | def to_float(series, replace_value_for_M, replace_value_for_T): 28 | series = series.map(lambda s : s.strip()) 29 | series[series == 'M'] = replace_value_for_M 30 | series[series == 'T'] = replace_value_for_T 31 | return series.astype(float) 32 | 33 | def preprocess(_df, is_train): 34 | 35 | df = _df.copy() 36 | 37 | # log1p 38 | if is_train: 39 | df['log1p'] = np.log(df['units'] + 1) 40 | 41 | # date 42 | df['date2'] = pd.to_datetime(df['date']) 43 | 44 | # weather features 45 | wtr['date2'] = pd.to_datetime(wtr.date) 46 | wtr["preciptotal2"] = to_float(wtr["preciptotal"], 0.00, 0.005) 47 | wtr["preciptotal_flag"] = np.where(wtr["preciptotal2"] > 0.2, 1.0, 0.0) 48 | 49 | wtr["depart2"] = to_float(wtr.depart, np.nan, 0.00) 50 | wtr["depart_flag"] = 0.0 51 | wtr["depart_flag"] = np.where(wtr["depart2"] < -8.0, -1, wtr["depart_flag"]) 52 | wtr["depart_flag"] = np.where(wtr["depart2"] > 8.0 , 1, wtr["depart_flag"]) 53 | df = pd.merge(df, key, on='store_nbr') 54 | df = pd.merge(df, wtr[["date2", "station_nbr", "preciptotal_flag", "depart_flag"]], 55 | on=["date2", "station_nbr"]) 56 | 57 | # weekday 58 | df['weekday'] = df.date2.dt.weekday 59 | df['is_weekend'] = df.date2.dt.weekday.isin([5,6]) 60 | df['is_holiday'] = df.date2.isin(holidays.date2) 61 | df['is_holiday_weekday'] = df.is_holiday & (df.is_weekend == False) 62 | df['is_holiday_weekend'] = df.is_holiday & df.is_weekend 63 | 64 | # bool to int (maybe no meaning) 65 | df.is_weekend = np.where(df.is_weekend, 1, 0) 66 | df.is_holiday = np.where(df.is_holiday, 1, 0) 67 | df.is_holiday_weekday = np.where(df.is_holiday_weekday, 1, 0) 68 | df.is_holiday_weekend = np.where(df.is_holiday_weekend, 1, 0) 69 | 70 | # day, month, year 71 | df['day'] = df.date2.dt.day 72 | df['month'] = df.date2.dt.month 73 | df['year'] = df.date2.dt.year 74 | 75 | # around BlackFriday 76 | df = pd.merge(df, holiday_names, on='date2', how = 'left') 77 | df.loc[df.holiday_name.isnull(), "holiday_name"] = "" 78 | 79 | around_BlackFriday = ["BlackFridayM3", "BlackFridayM2", "ThanksgivingDay", "BlackFriday", 80 | "BlackFriday1", "BlackFriday2", "BlackFriday3"] 81 | df["around_BlackFriday"] = np.where(df.holiday_name.isin(around_BlackFriday), 82 | df.holiday_name, "Else") 83 | 84 | return df 85 | 86 | # read dataframes 87 | key = pd.read_csv("data/key.csv") 88 | wtr = pd.read_csv("data/weather.csv") 89 | holidays = get_holidays("holidays.txt") 90 | holiday_names = get_holiday_names("holiday_names.txt") 91 | 92 | store_item_nbrs_path = 'model/store_item_nbrs.csv' 93 | store_item_nbrs = pd.read_csv(store_item_nbrs_path) 94 | valid_store_items = set(zip(store_item_nbrs.store_nbr, store_item_nbrs.item_nbr)) 95 | 96 | # preprocess 97 | df_train = pd.read_csv("data/train.csv") 98 | mask_train = [(sno_ino in valid_store_items) for sno_ino in zip(df_train['store_nbr'], df_train['item_nbr']) ] 99 | df_train = df_train[mask_train].copy() 100 | preprocess(df_train, True).to_pickle('model/train2.pkl') 101 | 102 | df_test = pd.read_csv("data/test.csv") 103 | mask_test = [(sno_ino in valid_store_items) for sno_ino in zip(df_test['store_nbr'], df_test['item_nbr']) ] 104 | df_test = df_test[mask_test].copy() 105 | preprocess(df_test, False).to_pickle('model/test2.pkl') -------------------------------------------------------------------------------- /r3b_zeros.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pickle 4 | import os.path 5 | 6 | def 
create_zeros_parameters(_df): 7 | 8 | df = _df[['date2','item_nbr','store_nbr','log1p']].copy() 9 | df = df.set_index('date2', drop=True) 10 | df['is_zero'] = np.where(_df.log1p == 0.0, 1, 0) 11 | 12 | loop_range = range(1, 11) 13 | 14 | # calculate forward successive zeros 15 | # start/end of the dates is treated as non-zero 16 | cols_forward = ['f{}'.format(i) for i in loop_range] 17 | for i in loop_range: 18 | col = cols_forward[i-1] 19 | colp = cols_forward[i-2] 20 | df[col] = df.is_zero.shift(-i) 21 | df[col] = df[col].fillna(0) 22 | if (i-2 >= 0) : df[col] = df[colp] * df[col] 23 | df['forward_zeros'] = df[cols_forward].sum(axis=1) 24 | 25 | # calculate back successive zeros 26 | # start/end of the dates is treated as non-zero 27 | cols_back = ['b{}'.format(i) for i in loop_range] 28 | for i in loop_range: 29 | col = cols_back[i-1] 30 | colp = cols_back[i-2] 31 | df[col] = df.is_zero.shift(i) 32 | df[col] = df[col].fillna(0) 33 | if (i-2 >= 0) : df[col] = df[colp] * df[col] 34 | df['back_zeros'] = df[cols_back].sum(axis=1) 35 | 36 | df['min_zeros'] = np.minimum(df.back_zeros, df.forward_zeros) 37 | 38 | g = df[df.is_zero == 0].groupby(['min_zeros']) 39 | max_bothside_zeros = g['min_zeros'].count().index.max() 40 | max_bothside_zeros = np.min([9, np.max([1, max_bothside_zeros])]) 41 | 42 | df['max_bothside_zeros'] = max_bothside_zeros 43 | df = df.drop(cols_forward, axis=1) 44 | df = df.drop(cols_back, axis=1) 45 | return df 46 | 47 | 48 | def create_zeros(_df): 49 | 50 | dfs = [] 51 | 52 | for sno, ino in store_items: 53 | df = _df[ (_df.item_nbr == ino) & (_df.store_nbr == sno) ] 54 | dfn = create_zeros_parameters(df) 55 | 56 | # all dates 57 | alldates = pd.date_range('2012-01-01', '2014-10-31', freq='D') 58 | alldates.name = 'date2' 59 | dfn2 = pd.DataFrame(dfn, index = alldates) 60 | 61 | # fill same values 62 | dfn2[['item_nbr', 'store_nbr']] = dfn2[['item_nbr', 'store_nbr']].ffill() 63 | dfn2[['item_nbr', 'store_nbr']] = dfn2[['item_nbr', 'store_nbr']].bfill() 64 | 65 | dfn2[['max_bothside_zeros']] = dfn2[['max_bothside_zeros']].ffill() 66 | dfn2[['max_bothside_zeros']] = dfn2[['max_bothside_zeros']].bfill() 67 | 68 | # calculate previous and next is zero or not 69 | not_train = dfn2.log1p.isnull() 70 | 71 | dfn2['is_zero_prev'] = dfn2['is_zero'] 72 | dfn2['is_zero_prev'] = dfn2['is_zero_prev'].ffill() 73 | dfn2['is_zero_prev'] = dfn2['is_zero_prev'].bfill() 74 | dfn2['is_zero_next'] = dfn2['is_zero'] 75 | dfn2['is_zero_next'] = dfn2['is_zero_next'].bfill() 76 | dfn2['is_zero_next'] = dfn2['is_zero_next'].ffill() 77 | 78 | dfn2['back_zeros'] = dfn2['back_zeros'].interpolate(method='ffill') 79 | dfn2['back_zeros'] = dfn2['back_zeros'].bfill() 80 | dfn2.loc[not_train, 'back_zeros'] = np.where(dfn2.loc[not_train, 'is_zero_prev'], 81 | dfn2.loc[not_train, 'back_zeros'] + 1, 0) 82 | dfn2['back_zeros'] = np.minimum(dfn2['back_zeros'], 10) 83 | 84 | dfn2['forward_zeros'] = dfn2['forward_zeros'].interpolate(method='bfill') 85 | dfn2['forward_zeros'] = dfn2['forward_zeros'].ffill() 86 | dfn2.loc[not_train, 'forward_zeros'] = np.where(dfn2.loc[not_train, 'is_zero_next'], 87 | dfn2.loc[not_train, 'forward_zeros'] + 1, 0) 88 | dfn2['forward_zeros'] = np.minimum(dfn2['forward_zeros'], 10) 89 | 90 | dfn2['min_zeros'] = np.minimum(dfn2.back_zeros, dfn2.forward_zeros) 91 | 92 | dfn2['include_zeros'] = (dfn2.min_zeros <= dfn2.max_bothside_zeros) 93 | dfn2 = dfn2.reset_index(drop = False) 94 | 95 | dfs.append(dfn2) 96 | 97 | return pd.concat(dfs, ignore_index=True) 98 | 99 | df_train = 
pd.read_pickle("model/train2.pkl") 100 | 101 | store_item_nbrs_path = 'model/store_item_nbrs.csv' 102 | store_item_nbrs = pd.read_csv(store_item_nbrs_path) 103 | store_items = zip(store_item_nbrs.store_nbr, store_item_nbrs.item_nbr) 104 | 105 | df_zeros = create_zeros(df_train) 106 | df_zeros.to_pickle("model/df_zeros.pkl") 107 | --------------------------------------------------------------------------------
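Overall, the pipeline models log1p(units) for each valid (store_nbr, item_nbr) pair as a smooth ppr baseline (r1_baseline.r) plus a residual that Vowpal Wabbit learns from calendar, holiday and weather features (r4_vwtxt_creator.py, r5_vwrun.sh); r6_submission.py then adds the baseline back and undoes the log transform. The snippet below is only a minimal sketch of that final reconstruction step for a single prediction; the function name and the numbers in the usage line are invented for illustration and are not part of the repository.

import numpy as np

def reconstruct_units(vw_residual, ppr_baseline, include_prediction):
    # r4_vwtxt_creator.py trains on y = log1p(units) - baseline, so the predicted
    # log1p(units) is the VW output plus the ppr baseline carried in the |I namespace.
    pred = vw_residual + ppr_baseline if include_prediction else 0.0
    # r6_submission.py clips the prediction at zero before leaving log space.
    pred = max(pred, 0.0)
    # Undo the log1p transform applied in r2_preprocess.py (np.exp(x) - 1 == np.expm1(x)).
    return np.expm1(pred)

# Invented numbers, purely to show the shape of the computation:
print(reconstruct_units(vw_residual=0.12, ppr_baseline=2.3, include_prediction=True))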