├── LICENSE ├── get_importance.R ├── model.R ├── README.md ├── preprocessing.R ├── lgbm_psudeo_labeling.R └── function.R /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Shuo-Jen, Chang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /get_importance.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Split 4 | tem1 <- data.table(tem1, df7) 5 | train <- tem1[1:nrow(train),] 6 | test <- tem1[(nrow(train)+1) : nrow(tem),] 7 | 8 | 9 | # =============================================================================================================== 10 | # Train model 11 | 12 | df_tem <- 1:nrow(train) ; oof <- 1:nrow(train) 13 | c <- 0 14 | a <- vector() 15 | result_vec <- vector() 16 | ppte <- 0 17 | ncol(train) 18 | i = 1 19 | gc() 20 | print(i) 21 | set.seed(88) 22 | folds <- KFold(y, nfolds = 5, stratified = F , seed = 71) 23 | 24 | names(folds)[i] <- "test" 25 | test_st <- train[folds$test,] 26 | 27 | dtrain <- lgb.Dataset(data.matrix( train[-c(folds$test), ] ), label = y[-c(folds$test)] ) 28 | dval <- lgb.Dataset(data.matrix( train[folds$test, ] ), label = y[c(folds$test)] ) 29 | 30 | lgb_param <- list(boosting_type = 'gbdt', 31 | objective = "huber", 32 | boost_from_average = 'false', 33 | metric = "none", 34 | learning_rate = 0.05, 35 | num_leaves = 128, 36 | # min_gain_to_split = 0.01, 37 | feature_fraction = 0.05, 38 | # feature_fraction_seed = seed, 39 | bagging_freq = 1, 40 | bagging_fraction = 1, 41 | min_sum_hessian_in_leaf = 5, 42 | # min_data_in_leaf = 100, 43 | lambda_l1 = 0, 44 | lambda_l2 = 0, 45 | alpha = 0.3 46 | 47 | ) 48 | 49 | valids <- list(valid = dval) 50 | lgb <- lgb.train(params = lgb_param, data = dtrain, nrounds = 20000 , eval = "RMSE", 51 | eval_freq = 50, valids = valids, early_stopping_rounds = 300, verbose = -1) 52 | 53 | 54 | impp <- lgb.importance(lgb) 55 | write.csv(impp, "imp.csv", row.names = F) 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /model.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Select top 700 5 | train <- 
train[,impp$Feature[1:700], with=F ] 6 | test <- test[,impp$Feature[1:700], with=F ] 7 | 8 | # =============================================================================================================== 9 | # Train model 10 | df_tem <- 1:nrow(train) ; oof <- 1:nrow(train) 11 | c <- 0 12 | a <- vector() 13 | result_vec <- vector() 14 | ppte <- 0 15 | ncol(train) 16 | 17 | for(i in 1:5){ 18 | #i = 1 19 | gc() 20 | print(i) 21 | set.seed(88) 22 | folds <- KFold(y, nfolds = 5, stratified = F , seed = 71) 23 | 24 | names(folds)[i] <- "test" 25 | test_st <- train[folds$test,] 26 | 27 | dtrain <- lgb.Dataset(data.matrix( train[-c(folds$test), ] ), label = y[-c(folds$test)] ) 28 | dval <- lgb.Dataset(data.matrix( train[folds$test, ] ), label = y[c(folds$test)] ) 29 | 30 | lgb_param <- list(boosting_type = 'gbdt', 31 | objective = "huber", 32 | boost_from_average = 'false', 33 | metric = "none", 34 | learning_rate = 0.008, 35 | num_leaves = 128, 36 | # min_gain_to_split = 0.01, 37 | feature_fraction = 0.05, 38 | # feature_fraction_seed = seed, 39 | bagging_freq = 1, 40 | bagging_fraction = 1, 41 | min_sum_hessian_in_leaf = 5, 42 | # min_data_in_leaf = 100, 43 | lambda_l1 = 0, 44 | lambda_l2 = 0, 45 | alpha = 0.3 46 | 47 | ) 48 | 49 | valids <- list(valid = dval) 50 | 51 | #a <- as.numeric(Sys.time()) 52 | lgb <- lgb.train(params = lgb_param, data = dtrain, nrounds = 50000 , eval = "RMSE", 53 | eval_freq = 50, valids = valids, early_stopping_rounds = 500, verbose = -1) 54 | #cat("Train time :" , ( as.numeric(Sys.time()) - a ) / 60, "min", "\n" ) 55 | 56 | pp <- predict(lgb, data.matrix(test_st)) 57 | ppte1 <- predict(lgb, data.matrix(test)) 58 | ppte <- ppte + ppte1 59 | 60 | df_tem[as.numeric(folds$test)] <- as.numeric(unlist(pp)) 61 | a[i] <- RMSLE( expm1(df_tem[as.numeric(folds$test)]) , expm1(y[as.numeric(folds$test)]) ) 62 | cat("best iter :" , lgb$best_iter, "best score :", a[i] ,"\n" ) 63 | invisible(gc()) 64 | } 65 | 66 | # CV score 67 | mean(a) 68 | 69 | # 
Submission 70 | ppte <- expm1(ppte/5) ; ppte[ppte<0] <- 1e-6 71 | sub <- data.frame(id = test_id, y = ppte) 72 | write.csv(sub,"this_should_work.csv",row.names = F) 73 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Probspace---Re_estate---1st-place-solution 2 | 3 | https://prob.space/competitions/re_real_estate_2020/ranking 4 | 5 | 1st solution of "Re:不動産取引価格予測" competition on Probspace 6 | 7 | 8 | 1. Run `function.R` 9 | 2. Run `preprocessing.R` 10 | 3. Run `lgbm_psudeo_labeling.R` 11 | 4. Run `get_importance.R`, this will generate the feature importance file (imp.csv) 12 | 5. Run `model.R` 13 | 14 | # Solution 15 | ### About me 16 | Currently, I am a Master's student at Tokyo Tech. For further information, please refer to my [Kaggle Profile](https://www.kaggle.com/andrew60909). 17 | 18 | ### Summary 19 | My final score (Public: 0.25848 / Private: 0.25854 / CV: 0.261616) is based on a single LGBM (5-fold bagging) built by selecting the top 700 features from the LGBM feature importance. I only use LightGBM here since it is good for dealing with tabular data and relatively fast compared to XGBoost and CatBoost, which allows you to test more ideas in limited time. For the validation scheme, I simply use 5-fold cross-validation and it works very well; the CV score always aligns with the LB score. 20 | 21 | ### Feature engineering 22 | The data is a little bit dirty, but compared to the data from the Signate student cup 2019, it was not a problem at all for me. I just spent some time transforming it from 全角 to 半角, then separating it into individual features so that we can do some feature engineering on them. 23 | 24 | My whole FE is composed of 6 parts : 25 | 26 | - Group method (numeric2cate) : Apply statistics of numeric features in different categorical features group. For example, applying "**mean**" on "**面積(㎡)**" group by "**市区町村コード**". 
27 | The statistics functions I used : 28 | + Mean, max, min, std, sum, skewness, kurtosis 29 | + Bayes mean 30 | + IQR : q75 - q25 31 | + IQR_ratio : q75 / q25 32 | + Median absolute deviation : median( abs(x - median(x)) ) 33 | + Mean variance : std(x) / mean(x) 34 | + hl_ratio : The ratio of numbers of the samples that higher and lower than the mean [(Ref, Table 2)](https://arxiv.org/pdf/1801.07323.pdf). 35 | + MAD : Median Absolute Deviation : median( |x - median(x)| ) 36 | + Beyond1std : Calculating the ratio beyond 1 std 37 | + Range : max - min 38 | + Range_ratio : max / min 39 | + Shapiro-Wilk Statistic 40 | + diff and ratio : "x - mean(x)" or "x / mean(x)" 41 | + Z-score : ( x-mean(x) ) / std(x) 42 | 43 | - Group method (cate2cate) : Apply statistics of categorical features in different categorical features group. For example, applying "**entropy**" on the frequency table of "**最寄駅:名称**" group by "**市区町村コード**". 44 | The statistics functions I used : 45 | + n_distinct : number of unique 46 | + Entropy : apply entropy on frequency table 47 | + freq1name : the number of most frequently appeared category 48 | + freq1ratio : the number of most frequently appeared category / group size 49 | 50 | - Target encoding : [Reference](http://helios.mm.di.uoa.gr/~rouvas/ssi/sigkdd/sigkdd.vol3.1/barreca.pdf) and [code](https://www.kaggle.com/brandenkmurray/it-is-lit) 51 | 52 | - Count encoding : This works very well on some categorical features like "**取引の事情等**" 53 | 54 | - Feature from land_price.csv : Making features by 2 different "Group method" that I have mentioned above. Applying the statistics on the features that is grouped by "**所在地コード**", then just merge it to our train+test data 55 | 56 | - Feature pseudo-labeling : Build a LGBM model to predict the important features (I used "**sq_meter**", "**land__mean__ON__h31_price**", "**nobeyuka_m2**", "**Age of house**","**time_to_nearest_aki**"), and then take the oof predictions. 
57 | 58 | 59 | ### Hyper-parameter 60 | Suprisingly that tuning "alpha" in huber loss give me really a big boost (~0.001). In huber loss, alpha=1 basically means absolute loss (same formula). So if we lower the alpha value, it will make your model less sensitive to those "outlier cases". 61 | 62 | ``` 63 | lgb_param <- list(boosting_type = 'gbdt', 64 | objective = "huber", 65 | boost_from_average = 'false', 66 | metric = "none", 67 | learning_rate = 0.008, 68 | num_leaves = 128, 69 | # min_gain_to_split = 0.01, 70 | feature_fraction = 0.05, 71 | # feature_fraction_seed = 666666, 72 | bagging_freq = 1, 73 | bagging_fraction = 1, 74 | min_sum_hessian_in_leaf = 5, 75 | # min_data_in_leaf = 100, 76 | lambda_l1 = 0, 77 | lambda_l2 = 0, 78 | alpha = 0.3 79 | ) 80 | ``` 81 | 82 | 83 |   84 | -------------------------------------------------------------------------------- /preprocessing.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # system setting and call library 5 | Sys.setlocale("LC_CTYPE", locale="Japanese") 6 | 7 | library(tidyverse) 8 | library(lightgbm) 9 | library(moments) 10 | library(entropy) 11 | library(rBayesianOptimization) 12 | library(MLmetrics) 13 | library(data.table) 14 | library(caret) 15 | 16 | # Read data 17 | train <- fread("train_data.csv", encoding = "UTF-8") 18 | test <- fread("test_data.csv", encoding = "UTF-8") 19 | land_price <- fread("published_land_price.csv", encoding = "UTF-8") 20 | y <- log1p(train$y) ; train$y <- NULL 21 | test_id <- test$id 22 | 23 | tem <- rbind(train,test) 24 | 25 | # ========================================================================================== 26 | # Data Cleaning 27 | tem <- tem %>% as.data.frame() %>% 28 | mutate( 29 | 30 | # 全角 -> 半角 31 | 間取り = z2h(間取り), 32 | 33 | # 最寄駅:距離(分) (time_to_nearest_aki) 34 | time_to_nearest_aki = ifelse(`最寄駅:距離(分)` == "30分?60分", 45, `最寄駅:距離(分)`), 35 | time_to_nearest_aki = ifelse(time_to_nearest_aki == "1H?1H30", 75, 
time_to_nearest_aki), 36 | time_to_nearest_aki = ifelse(time_to_nearest_aki == "1H30?2H", 105, time_to_nearest_aki), 37 | 38 | # 間取り 39 | room_type_normal = as.numeric(str_extract(間取り,"[0-9]+")), 40 | room_type_R = ifelse(grepl("R",間取り), 1, 0 ), 41 | room_type_L = ifelse(grepl("L",間取り), 1, 0 ), 42 | room_type_D = ifelse(grepl("D",間取り), 1, 0 ), 43 | room_type_K = ifelse(grepl("K",間取り), 1, 0 ), 44 | room_type_S = ifelse(grepl("S",間取り), 1, 0 ), 45 | 46 | # 面積(㎡) (sq_meter) 47 | sq_meter = ifelse(`面積(㎡)` == "2000㎡以上", 2500, `面積(㎡)`), 48 | sq_meter = as.numeric(ifelse(sq_meter == "5000㎡以上", 5500, sq_meter)), 49 | 50 | # 間口 (maguchi) 51 | maguchi = as.numeric(ifelse(間口 == "50.0m以上", 75, 間口)), 52 | 53 | # 延床面積(㎡)(nobeyuka_m2) 54 | nobeyuka_m2 = as.numeric(ifelse(`延床面積(㎡)` == "2000㎡以上", 2500, `延床面積(㎡)`)), 55 | 56 | # 建築年 (built_year) 57 | year1 = str_remove_all(建築年,'[0-9]|年'), 58 | year1 = ifelse(year1 == "昭和", 1925, year1), 59 | year1 = ifelse(year1 == "平成", 1988, year1), 60 | built_year_b4ww2 = ifelse(year1 == "戦前", 1, 0), 61 | year1 = as.numeric(ifelse(year1 == "戦前", NA, year1)), 62 | year2 = as.numeric(str_extract_all(建築年, '[0-9]+')), 63 | built_year = year1 + year2, 64 | 65 | # 取引時点 (transaction date) 66 | trans_date_yr = as.numeric(str_extract_all(取引時点, "[0-9]+")), 67 | trans_date_q = as.numeric(z2h(str_remove_all(取引時点, "[0-9]|年第|四半期"))), 68 | trans_date = trans_date_yr + 0.2 * trans_date_q, 69 | 70 | 71 | 72 | ) %>% 73 | select(-`最寄駅:距離(分)`, -間取り, -`面積(㎡)`, -間口, -`延床面積(㎡)`, -建築年, -year1, -year2, -取引時点) 74 | 75 | 76 | # JPN -> ENG 77 | # 種類, 地域, 市区町村コード, 都道府県名, 市区町村名, 地区名, 最寄駅:名称, 土地の形状, 建物の構造, 用途, 今後の利用目的, 78 | # 前面道路:方位, 前面道路:種類, 前面道路:幅員(m), 都市計画, 建ぺい率(%), 容積率(%), 改装, 取引の事情等 79 | 80 | colnames(tem)[c(2:20)] <- c("type", "land_type", "area_code", "city", "area", "sub_area", "nearest_aki_name", "land_shape", 81 | "house_backbone", "use", "future_use", "faced_direction", "faced_road_type", "faced_road_width", "city_plan", 82 | "tapei_ratio", "floor_area_ratio", 
"reconstructed", "note" 83 | ) 84 | 85 | tem <- tem %>% mutate( 86 | room_total = apply( tem[,c("room_type_normal", "room_type_R","room_type_L","room_type_D","room_type_K","room_type_S")], 1, 87 | function(x){sum(x,na.rm = T)} ) , 88 | square_meter_per_room = sq_meter / room_total , 89 | square_meter_per_room_v2 = sq_meter / room_type_normal 90 | ) %>% 91 | mutate_all(funs(ifelse(is.nan(.), NA, .))) %>% 92 | mutate_all(funs(ifelse(is.infinite(.), NA, .))) %>% 93 | select(-area, -id, -city) %>% 94 | as.data.table() 95 | 96 | 97 | cols <- names(which(lapply(tem,class) == "character")) 98 | tem[, (cols) := lapply(.SD, function(z) as.integer(as.factor(z))), .SDcols = cols] 99 | df6 <- extract_feature_land(land_price) 100 | tem <- merge(tem, df6, by = "area_code", all.x=TRUE, sort=FALSE) 101 | # =============================================================================================================== 102 | # Making features : part 1~5 103 | fun_list1 <- c("mean", "sd", "max", "min", "sum","skewness", "kurtosis", 104 | "IQR", "iqr_ratio", "beyond1std_ratio", "mean_var", "range_diff", "range_per", "hl_ratio", "sw_stat", 105 | "x_diff", "x_ratio", "x_zscore", "entropy") 106 | fun_list2 <- c("n_distinct", "freq1ratio", "freq1count", "entropy_freqtable") 107 | 108 | numer_list <- c("sq_meter", "nobeyuka_m2", "square_meter_per_room", "built_year", "faced_road_width", "floor_area_ratio", 109 | "maguchi" ,"square_meter_per_room_v2", "trans_date", "land__mean__ON__h31_price") 110 | 111 | cate_list <- c("area_code", "nearest_aki_name", "land_type", "use", "sub_area", "trans_date_yr") 112 | 113 | 114 | df1 <- get_group_feature_numer2cate(tem, fun_list1, numer_list, cate_list) 115 | df2 <- get_group_feature_cate2cate(tem, fun_list2, cate_list) 116 | df3 <- get_bayes_mean(tem, numer_list, cate_list) 117 | df4 <- get_count_encoding(tem, cate_list = cols) 118 | df5 <- get_target_encoding(tem, cate_list = cols) 119 | tem1 <- data.table(tem, df1, df2, df3, df4, df5) 120 | 121 | 122 
| 123 | -------------------------------------------------------------------------------- /lgbm_psudeo_labeling.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # sq_meter, land__mean__ON__h31_price, nobeyuka_m2, built_year 6 | 7 | lgb_pred_have_na <- function(data, target, target_name){ 8 | 9 | cat("input col :", ncol(data),"\n") 10 | data <- as.data.frame(data) 11 | non_na_idx <- which(is.na( as.numeric(unlist(data[,target_name])) ) == F ) 12 | na_idx <- which(is.na( as.numeric(unlist(data[,target_name])) ) == T ) 13 | 14 | if(target_name == "land__mean__ON__h31_price"){ data <- data[,which(grepl("_price",colnames(data)) == F )] } 15 | cleaned_target <- target[is.na(target) == F] 16 | cleaned_data <- data[,which(grepl(target_name,colnames(data)) == F )] 17 | cat("cleaned col :", ncol(cleaned_data),"\n") 18 | 19 | non_na_data <- cleaned_data[non_na_idx,] 20 | na_data <- cleaned_data[na_idx,] 21 | 22 | df_tem <- 1:nrow(non_na_data) 23 | c <- 0 24 | a <- vector() 25 | ppte <- 0 26 | 27 | for(i in 1:5){ 28 | invisible(gc()) 29 | print(i) 30 | set.seed(77) 31 | folds <- KFold(cleaned_target, nfolds = 5, stratified = FALSE, seed = 0) 32 | 33 | names(folds)[i] <- "test" 34 | test_st <- non_na_data[folds$test,] 35 | 36 | d0 <- lgb.Dataset(data.matrix( non_na_data[-c(folds$test),] ), label = cleaned_target[-c(folds$test)] ) 37 | dval <- lgb.Dataset(data.matrix( non_na_data[folds$test,] ), label = cleaned_target[c(folds$test)] ) 38 | 39 | lgb_param <- list(boosting_type = 'gbdt', 40 | objective = "regression" , 41 | metric = "RMSE", 42 | learning_rate = 0.1, 43 | num_leaves = 96, 44 | feature_fraction = 0.05, 45 | bagging_freq = 1, 46 | bagging_fraction = 1, 47 | min_data_in_leaf = 200 48 | ) 49 | 50 | valids <- list(valid = dval) 51 | lgb <- lgb.train(params = lgb_param, data = d0, nrounds = 2000, 52 | eval_freq = 200, valids = valids, early_stopping_rounds = 300, verbose = 1) 53 | 54 | pp <- predict(lgb, 
data.matrix(test_st)) 55 | ppte1 <- predict(lgb, data.matrix(na_data)) 56 | ppte <- ppte + ppte1 57 | 58 | df_tem[as.numeric(folds$test)] <- as.numeric(unlist(pp)) 59 | cat( ">>>> END <<<< ","\n") 60 | } 61 | ppte <- ppte/5 62 | return(list(df_tem,ppte)) 63 | } 64 | 65 | 66 | lgb_pred_no_na <- function(data, target, target_name){ 67 | 68 | data <- as.data.frame(data) 69 | cat("input col :", ncol(data),"\n") 70 | cleaned_data <- data[,which(grepl(target_name,colnames(data)) == F )] 71 | if(target_name == "land__mean__ON__h31_price"){ data <- data[,which(grepl("square_meter",colnames(data)) == F )] } 72 | cat("cleaned col :", ncol(cleaned_data),"\n") 73 | 74 | df_tem <- 1:nrow(cleaned_data) 75 | c <- 0 76 | a <- vector() 77 | ppte <- 0 78 | target <- log1p(target) 79 | 80 | for(i in 1:5){ 81 | invisible(gc()) 82 | print(i) 83 | set.seed(77) 84 | folds <- KFold(target, nfolds = 5, stratified = FALSE, seed = 0) 85 | 86 | names(folds)[i] <- "test" 87 | test_st <- cleaned_data[folds$test,] 88 | 89 | d0 <- lgb.Dataset(data.matrix( cleaned_data[-c(folds$test),] ), label = target[-c(folds$test)] ) 90 | dval <- lgb.Dataset(data.matrix( cleaned_data[folds$test,] ), label = target[c(folds$test)] ) 91 | 92 | lgb_param <- list(boosting_type = 'gbdt', 93 | objective = "regression" , 94 | metric = "RMSE", 95 | learning_rate = 0.1, 96 | num_leaves = 96, 97 | feature_fraction = 0.05, 98 | bagging_freq = 1, 99 | bagging_fraction = 1, 100 | min_data_in_leaf = 200 101 | ) 102 | 103 | valids <- list(valid = dval) 104 | lgb <- lgb.train(params = lgb_param, data = d0, nrounds = 1000, 105 | eval_freq = 200, valids = valids, early_stopping_rounds = 300, verbose = 1) 106 | 107 | pp <- predict(lgb, data.matrix(test_st)) 108 | 109 | df_tem[as.numeric(folds$test)] <- as.numeric(unlist(pp)) 110 | cat( ">>>> END <<<< ","\n") 111 | } 112 | return(df_tem) 113 | } 114 | 115 | 116 | get_pred0 <- function(data){ 117 | 118 | data <- as.data.frame(data) 119 | c <- 0 120 | tmp2 <- 1:nrow(data) 121 | 
pred_data <- data.frame(0) 122 | col <- data[,c("land__mean__ON__h31_price", "nobeyuka_m2", "built_year")] 123 | for(target_name in colnames(col)){ 124 | 125 | library(lightgbm) 126 | c <- c+1 127 | tmp1 <- lgb_pred_have_na(data, as.numeric(unlist(col[,c])), target_name) 128 | tmp2[ which(is.na( as.numeric(unlist(col[,target_name])) ) == F) ] <- tmp1[[1]] 129 | tmp2[ which(is.na( as.numeric(unlist(col[,target_name])) ) == T) ] <- tmp1[[2]] 130 | pred_data <- data.frame(pred_data,tmp2) 131 | lgb.unloader(wipe = T) 132 | } 133 | pred_data <- pred_data[,-1] ; colnames(pred_data) <- paste0(colnames(col),"___pred0_by_lgbm",sep="") 134 | return(pred_data) 135 | } 136 | 137 | 138 | df7 <- get_pred0(tem1) 139 | df8 <- data.table(expm1(lgb_pred_no_na(tem1, tem1$sq_meter, "sq_meter")), 140 | expm1(lgb_pred_no_na(tem1, tem1$time_to_nearest_aki, "time_to_nearest_aki")) 141 | ) 142 | 143 | colnames(df8) <- paste0(c("sq_meter", "time_to_nearest_aki"),"___pred0_by_lgbm",sep="") 144 | 145 | df7 <- data.table(df7, df8) 146 | write.csv(df7, "df7.csv", row.names=F) 147 | 148 | 149 | 150 | -------------------------------------------------------------------------------- /function.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Some function 4 | beyond1std_ratio <- function(x){ return( sum(ifelse(x > (mean(x,na.rm=T) + sd(x,na.rm=T)),1,0)) / length(x) )} 5 | iqr_ratio <- function(x){ return( quantile(x,0.75,na.rm=T) / quantile(x,0.25,na.rm=T) )} 6 | mean_var <- function(x){ return( sd(x,na.rm=T) / mean(x,na.rm=T) )} 7 | range_diff <- function(x){ return( max(x,na.rm=T) - min(x,na.rm=T) )} 8 | range_per <- function(x){ return( max(x,na.rm=T) / min(x,na.rm=T) )} 9 | hl_ratio <- function(x){ return( sum(ifelse(x > mean(x),1,0)) / sum(ifelse(x >= mean(x),0,1)) )} 10 | sw_stat <- function(x){ return( ifelse(sum(is.na(x) == F) > 3 & sum(is.na(x) == F) < 5000 & 11 | length(x) > 3 & length(x) < 5000 & sum(diff(x),na.rm=T)!=0 , 12 | 
shapiro.test(as.numeric(x))$statistic, NA) )} 13 | x_diff <- function(x){ return(x - mean(x,na.rm=T)) } 14 | x_ratio <- function(x){ return(x / mean(x,na.rm=T)) } 15 | x_zscore <- function(x){ return( (x-mean(x,na.rm=T)) / sd(x,na.rm=T)) } 16 | 17 | freq1ratio <- function(x){ return( 18 | ifelse(sort(table(x),decreasing = T)[1] == sort(table(x),decreasing = T)[2], 19 | NA, as.numeric( sort(table(x),decreasing = T)[1] / length(x) ) ) )} 20 | freq1count <- function(x){ return( 21 | ifelse(sort(table(x),decreasing = T)[1] == sort(table(x),decreasing = T)[2], 22 | NA, ( sort(table(x),decreasing = T)[1] ) ) )} 23 | 24 | entropy_freqtable <- function(x){ return( as.numeric(entropy(table(x)))) } 25 | 26 | 27 | # Revised version of function "zen2han" from "Nippon" package 28 | z2h <- function (s) 29 | { 30 | s <- ifelse(s == "", NA, s) 31 | ori_s <- s 32 | s <- na.omit(s) 33 | if (any(Encoding(s) != "UTF-8")) 34 | s <- iconv(s, from = "", to = "UTF-8") 35 | s <- paste(s, sep = "") 36 | y <- sapply(unlist(strsplit(s, split = ",")), function(x) { 37 | i <- utf8ToInt(x) 38 | if (i >= 65281 && i <= 65374) { 39 | return(intToUtf8(i - 65248)) 40 | } 41 | else { 42 | return(x) 43 | } 44 | }) 45 | ori_s[!is.na(ori_s)] <- paste(y) 46 | return(ori_s) 47 | } 48 | 49 | # Get group function 50 | # can acclerate whole process by using %dopar% 51 | get_group_feature_numer2cate <- function(data, fun_list, numer_list, cate_list){ 52 | 53 | tmp1 <- data.table(row_idx = 1:nrow(data)) 54 | 55 | for(fun_name in fun_list){ 56 | cat("- Processing : ", fun_name, "\n") 57 | for(cate in cate_list){ 58 | for(numer in numer_list){ 59 | 60 | if(cate == numer) next 61 | data2 <- data[,c(cate,numer), with=F] 62 | 63 | if(fun_name %in% c("x_diff", "x_ratio", "x_zscore")){ 64 | tmp <- data2[ , col := get(fun_name)( get(numer) ), by = cate] 65 | } else{ tmp <- data2[ , col := get(fun_name)( na.omit(get(numer)) ), by = cate] } 66 | 67 | colnames(tmp)[3] <- paste0(fun_name, "__ON__", numer, "__BY__", cate) 
68 | tmp1 <- data.table(tmp1,tmp[,-c(1:2)]) 69 | } 70 | } 71 | } 72 | tmp1 <- tmp1[, lapply(.SD, function(x){ ifelse(is.infinite(x) | is.nan(x), NA, x) } )] 73 | cat("==== Process completed, ignore the warning message from max and min function (returning Inf) ====", "\n") 74 | return(tmp1[,-1]) 75 | } 76 | 77 | # Get group function 78 | # can acclerate whole process by using %dopar% 79 | get_group_feature_cate2cate <- function(data, fun_list, cate_list){ 80 | 81 | tmp1 <- data.table(row_idx = 1:nrow(data)) 82 | data2 <- data[,c(cate_list), with=F] 83 | 84 | 85 | for(fun_name in fun_list){ 86 | cat("- Processing : ", fun_name, "\n") 87 | for(cate1 in cate_list){ 88 | for(cate2 in cate_list){ 89 | 90 | if(cate1 == cate2) next 91 | if(cate1 == "sub_area" & cate2 == "area_code") next 92 | tmp <- data2[, col := NULL][ , col := get(fun_name)( get(cate2) ), by = cate1][ , "col", with=F] 93 | colnames(tmp) <- paste0(fun_name, "__ON__", cate2, "__BY__", cate1) 94 | tmp1 <- data.table(tmp1,tmp) 95 | } 96 | } 97 | } 98 | cat("==== Process completed ====", "\n") 99 | return(tmp1[,-1]) 100 | } 101 | 102 | 103 | get_bayes_mean <- function(data, numer_list, cate_list, prior=20){ 104 | 105 | tmp1 <- data.table(row_idx = 1:nrow(data)) 106 | 107 | for(cate in cate_list){ 108 | for(numer in numer_list){ 109 | 110 | data2 <- data[,c(cate,numer), with=F] 111 | tmp <- data2[ , col := ( ( ( get(numer) / mean(na.omit(get(numer))) ) * .N) + prior ) / (.N + prior) , by = cate][is.na(col), col:= 1] 112 | colnames(tmp)[3] <- paste0("bayes_mean", "__ON__", numer, "__BY__", cate) 113 | tmp1 <- data.table(tmp1,tmp[,-c(1:2)]) 114 | } 115 | } 116 | 117 | tmp1 <- tmp1[, lapply(.SD, function(x){ ifelse(is.infinite(x) | is.nan(x) , NA, x) } )] 118 | cat("==== Process completed ====", "\n") 119 | return(tmp1[,-1]) 120 | } 121 | 122 | 123 | get_count_encoding <- function(data, cate_list){ 124 | tmp1 <- data.table(row_idx = 1:nrow(data)) 125 | for(cate in cate_list){ 126 | tmp <- data[ , col := .N, 
by=cate ][ , "col", with=F] 127 | colnames(tmp) <- paste0("count_encoding__OF__", cate) 128 | tmp1 <- data.table(tmp1, tmp) 129 | } 130 | cat("==== Process completed ====", "\n") 131 | return(tmp1[,-1]) 132 | } 133 | 134 | 135 | 136 | # Author of the code : Branden Murray 137 | # https://www.kaggle.com/brandenkmurray/it-is-lit 138 | # This is a modified version for regression task 139 | catNWayAvgCV <- function(data, varList, y, pred0, filter, k, f, g=1, lambda=NULL, r_k, cv=NULL){ 140 | 141 | # It is probably best to sort your dataset first by filter and then by ID (or index) 142 | n <- length(varList) 143 | varNames <- paste0("v",seq(n)) 144 | ind <- unlist(cv, use.names=FALSE) 145 | oof <- NULL 146 | if (length(cv) > 0){ 147 | for (i in 1:length(cv)){ 148 | sub1 <- data.table(v1=data[,varList,with=FALSE], y=data[,y,with=FALSE], pred0=data[,pred0,with=FALSE], filt=filter) 149 | sub1 <- sub1[sub1$filt==TRUE,] 150 | sub1[,filt:=NULL] 151 | colnames(sub1) <- c(varNames,"y","pred0") 152 | sub2 <- sub1[cv[[i]],] 153 | sub1 <- sub1[-cv[[i]],] 154 | sum1 <- sub1[,list(sumy=sum(y), avgY=mean(y), cnt=length(y)), by=varNames] 155 | tmp1 <- merge(sub2, sum1, by = varNames, all.x=TRUE, sort=FALSE) 156 | set(tmp1, i=which(is.na(tmp1[,cnt])), j="cnt", value=0) 157 | set(tmp1, i=which(is.na(tmp1[,sumy])), j="sumy", value=0) 158 | if(!is.null(lambda)) tmp1[beta:=lambda] else tmp1[,beta:= 1/(g+exp((tmp1[,cnt] - k)/f))] 159 | tmp1[,adj_avg:=((1-beta)*avgY+beta*pred0)] 160 | set(tmp1, i=which(is.na(tmp1[["avgY"]])), j="avgY", value=tmp1[is.na(tmp1[["avgY"]]), pred0]) 161 | set(tmp1, i=which(is.na(tmp1[["adj_avg"]])), j="adj_avg", value=tmp1[is.na(tmp1[["adj_avg"]]), pred0]) 162 | set(tmp1, i=NULL, j="adj_avg", value=tmp1$adj_avg*(1+(runif(nrow(sub2))-0.5)*r_k)) 163 | oof <- c(oof, tmp1$adj_avg) 164 | } 165 | } 166 | oofInd <- data.frame(ind, oof) 167 | oofInd <- oofInd[order(oofInd$ind),] 168 | sub1 <- data.table(v1=data[,varList,with=FALSE], y=data[,y,with=FALSE], 
pred0=data[,pred0,with=FALSE], filt=filter) 169 | colnames(sub1) <- c(varNames,"y","pred0","filt") 170 | sub2 <- sub1[sub1$filt==F,] 171 | sub1 <- sub1[sub1$filt==T,] 172 | sum1 <- sub1[,list(sumy=sum(y), avgY=mean(y), cnt=length(y)), by=varNames] 173 | tmp1 <- merge(sub2, sum1, by = varNames, all.x=TRUE, sort=FALSE) 174 | tmp1$cnt[is.na(tmp1$cnt)] <- 0 175 | tmp1$sumy[is.na(tmp1$sumy)] <- 0 176 | if(!is.null(lambda)) tmp1$beta <- lambda else tmp1$beta <- 1/(g+exp((tmp1$cnt - k)/f)) 177 | tmp1$adj_avg <- (1-tmp1$beta)*tmp1$avgY + tmp1$beta*tmp1$pred0 178 | tmp1$avgY[is.na(tmp1$avgY)] <- tmp1$pred0[is.na(tmp1$avgY)] 179 | tmp1$adj_avg[is.na(tmp1$adj_avg)] <- tmp1$pred0[is.na(tmp1$adj_avg)] 180 | # Combine train and test into one vector 181 | return(c(oofInd$oof, tmp1$adj_avg)) 182 | } 183 | 184 | 185 | get_target_encoding <- function(data, cate_list){ 186 | 187 | data <- tem ; cate_list <- cols 188 | 189 | for_enc <- data.table(data[ ,cate_list, with=F], # takes only low cardinaility categorical features 190 | y = c(y,rep(NA,nrow(test))), # target 191 | pred0 = rep(mean(y),nrow(tem)), # global mean 192 | filter=c(rep(0,nrow(train)),rep(1,nrow(test))) # Indicater, to show where is train and test 193 | ) 194 | 195 | cvFoldsList <- createFolds(y, k=5, list=TRUE, returnTrain=FALSE) # create 5 folds 196 | 197 | tmp1 <- data.table(row_idx = 1:nrow(data)) 198 | for(cate in cate_list){ 199 | tmp <- data.table(fea = catNWayAvgCV(for_enc, varList=cate, y="y", pred0="pred0", 200 | filter=for_enc$filter==0, k=30, f=10, r_k=0.02, cv=cvFoldsList)) 201 | colnames(tmp) <- paste0("target_encoding__OF__", cate) 202 | tmp1 <- data.table(tmp1, tmp) 203 | } 204 | cat("==== Process completed ====", "\n") 205 | return(tmp1[,-1]) 206 | } 207 | 208 | # Extract feature from land_price.csv 209 | extract_feature_land <- function(data){ 210 | 211 | # Numeric : 2,3,14,20,22,34,40,41,(43) 44:49,50:80 212 | # Cate : 5,6,17,18,24,32,37 213 | 214 | 
colnames(data)[c(2:6,14,17,18,20,22,24,32,34,37,40,41,43,44:49,50:80 )] <- 215 | c("longitude", "latitude", "area_code", "use", "seq_num", "chiseki", "backbone", "facility", "maguchi_ratio", #20 216 | "floor", "faced_road_type", "envir", "distance_to_aki", "cityplan_type", "tapei_ratio", "floor_area_ratio", "selected_yr", #43 217 | paste0("s",58:63,"_price"), paste0("h",1:31,"_price") ) 218 | 219 | # Data cleaning 220 | df_tmp <- data[,c(paste0("s",58:63,"_price"), paste0("h",1:31,"_price")), with=F] 221 | df_tmp[df_tmp == 0] <- NA 222 | data[,c(paste0("s",58:63,"_price"), paste0("h",1:31,"_price"))]<- df_tmp 223 | invisible(gc()) 224 | 225 | data[, `:=`(selected_yr = as.numeric(str_count(selected_yr, "1")), 226 | past15yr_price_mean = apply(data[, paste0("h",c(16:31),"_price"), with=F], 1, mean, na.rm=T), 227 | past15yr_price_sd = apply(data[, paste0("h",c(16:31),"_price"), with=F], 1, sd, na.rm=T), 228 | overall_price_mean = apply(data[, c(paste0("s",58:63,"_price"), paste0("h",1:31,"_price")), with=F], 1, mean, na.rm=T), 229 | overall_price_sd = apply(data[, c(paste0("s",58:63,"_price"), paste0("h",1:31,"_price")), with=F], 1, sd, na.rm=T) 230 | )] 231 | 232 | fun_list1 <- c("mean", "sd", "max", "min", "sum","skewness", "kurtosis", 233 | "IQR", "iqr_ratio", "mean_var", "range_diff", "range_per", "hl_ratio") 234 | fun_list2 <- c("n_distinct", "entropy_freqtable") 235 | tar_col_numer <- c("longitude", "latitude", "chiseki", "maguchi_ratio", "floor", "distance_to_aki","tapei_ratio", "floor_area_ratio", 236 | "selected_yr", paste0("h",c(1,11,21,31),"_price"), 237 | "overall_price_mean", "overall_price_sd", "past15yr_price_mean", "past15yr_price_sd") 238 | tar_col_cate <- c("use", "seq_num", "backbone", "facility", "faced_road_type", "envir", "cityplan_type") 239 | 240 | # Making features 241 | tmp <- data[ , .( col = .N ), by = "area_code"][,1] 242 | 243 | for(funfun in fun_list1){ 244 | for(tar_col in tar_col_numer){ 245 | df <- data[ , .( col = get(funfun)( 
as.double(na.omit(get(tar_col))) ) ) , by = "area_code" ] 246 | colnames(df)[2] <- paste0( "land__", funfun , "__ON__", tar_col ) 247 | tmp <- merge(tmp, df, by = "area_code", all.x=TRUE, sort=FALSE) 248 | } 249 | } 250 | 251 | for(funfun in fun_list2){ 252 | for(tar_col in tar_col_cate){ 253 | df <- data[, col := NULL][ , .( col = get(funfun)( get(tar_col) ) ) , by = "area_code" ] 254 | colnames(df)[2] <- paste0( "land__", funfun , "__ON__", tar_col ) 255 | tmp <- merge(tmp, df, by = "area_code", all.x=TRUE, sort=FALSE) 256 | } 257 | } 258 | tmp <- tmp[, lapply(.SD, function(x){ ifelse(is.infinite(x) | is.nan(x) , NA, x) } )] 259 | return(tmp) 260 | } 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | --------------------------------------------------------------------------------