├── LICENSE ├── get_importance.R ├── model.R ├── README.md ├── preprocessing.R ├── lgbm_psudeo_labeling.R └── function.R /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Shuo-Jen, Chang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /get_importance.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Split 4 | tem1 <- data.table(tem1, df7) 5 | train <- tem1[1:nrow(train),] 6 | test <- tem1[(nrow(train)+1) : nrow(tem),] 7 | 8 | 9 | # =============================================================================================================== 10 | # Train model 11 | 12 | df_tem <- 1:nrow(train) ; oof <- 1:nrow(train) 13 | c <- 0 14 | a <- vector() 15 | result_vec <- vector() 16 | ppte <- 0 17 | ncol(train) 18 | i = 1 19 | gc() 20 | print(i) 21 | set.seed(88) 22 | folds <- KFold(y, nfolds = 5, stratified = F , seed = 71) 23 | 24 | names(folds)[i] <- "test" 25 | test_st <- train[folds$test,] 26 | 27 | dtrain <- lgb.Dataset(data.matrix( train[-c(folds$test), ] ), label = y[-c(folds$test)] ) 28 | dval <- lgb.Dataset(data.matrix( train[folds$test, ] ), label = y[c(folds$test)] ) 29 | 30 | lgb_param <- list(boosting_type = 'gbdt', 31 | objective = "huber", 32 | boost_from_average = 'false', 33 | metric = "none", 34 | learning_rate = 0.05, 35 | num_leaves = 128, 36 | # min_gain_to_split = 0.01, 37 | feature_fraction = 0.05, 38 | # feature_fraction_seed = seed, 39 | bagging_freq = 1, 40 | bagging_fraction = 1, 41 | min_sum_hessian_in_leaf = 5, 42 | # min_data_in_leaf = 100, 43 | lambda_l1 = 0, 44 | lambda_l2 = 0, 45 | alpha = 0.3 46 | 47 | ) 48 | 49 | valids <- list(valid = dval) 50 | lgb <- lgb.train(params = lgb_param, data = dtrain, nrounds = 20000 , eval = "RMSE", 51 | eval_freq = 50, valids = valids, early_stopping_rounds = 300, verbose = -1) 52 | 53 | 54 | impp <- lgb.importance(lgb) 55 | write.csv(impp, "imp.csv", row.names = F) 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /model.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Select top 700 5 | train <- 
train[,impp$Feature[1:700], with=F ] 6 | test <- test[,impp$Feature[1:700], with=F ] 7 | 8 | # =============================================================================================================== 9 | # Train model 10 | df_tem <- 1:nrow(train) ; oof <- 1:nrow(train) 11 | c <- 0 12 | a <- vector() 13 | result_vec <- vector() 14 | ppte <- 0 15 | ncol(train) 16 | 17 | for(i in 1:5){ 18 | #i = 1 19 | gc() 20 | print(i) 21 | set.seed(88) 22 | folds <- KFold(y, nfolds = 5, stratified = F , seed = 71) 23 | 24 | names(folds)[i] <- "test" 25 | test_st <- train[folds$test,] 26 | 27 | dtrain <- lgb.Dataset(data.matrix( train[-c(folds$test), ] ), label = y[-c(folds$test)] ) 28 | dval <- lgb.Dataset(data.matrix( train[folds$test, ] ), label = y[c(folds$test)] ) 29 | 30 | lgb_param <- list(boosting_type = 'gbdt', 31 | objective = "huber", 32 | boost_from_average = 'false', 33 | metric = "none", 34 | learning_rate = 0.008, 35 | num_leaves = 128, 36 | # min_gain_to_split = 0.01, 37 | feature_fraction = 0.05, 38 | # feature_fraction_seed = seed, 39 | bagging_freq = 1, 40 | bagging_fraction = 1, 41 | min_sum_hessian_in_leaf = 5, 42 | # min_data_in_leaf = 100, 43 | lambda_l1 = 0, 44 | lambda_l2 = 0, 45 | alpha = 0.3 46 | 47 | ) 48 | 49 | valids <- list(valid = dval) 50 | 51 | #a <- as.numeric(Sys.time()) 52 | lgb <- lgb.train(params = lgb_param, data = dtrain, nrounds = 50000 , eval = "RMSE", 53 | eval_freq = 50, valids = valids, early_stopping_rounds = 500, verbose = -1) 54 | #cat("Train time :" , ( as.numeric(Sys.time()) - a ) / 60, "min", "\n" ) 55 | 56 | pp <- predict(lgb, data.matrix(test_st)) 57 | ppte1 <- predict(lgb, data.matrix(test)) 58 | ppte <- ppte + ppte1 59 | 60 | df_tem[as.numeric(folds$test)] <- as.numeric(unlist(pp)) 61 | a[i] <- RMSLE( expm1(df_tem[as.numeric(folds$test)]) , expm1(y[as.numeric(folds$test)]) ) 62 | cat("best iter :" , lgb$best_iter, "best score :", a[i] ,"\n" ) 63 | invisible(gc()) 64 | } 65 | 66 | # CV score 67 | mean(a) 68 | 69 | # 
Submission 70 | ppte <- expm1(ppte/5) ; ppte[ppte<0] <- 1e-6 71 | sub <- data.frame(id = test_id, y = ppte) 72 | write.csv(sub,"this_should_work.csv",row.names = F) 73 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Probspace---Re_estate---1st-place-solution 2 | 3 | https://prob.space/competitions/re_real_estate_2020/ranking 4 | 5 | 1st solution of "Re:不動産取引価格予測" competition on Probspace 6 | 7 | 8 | 1. Run `function.R` 9 | 2. Run `preprocessing.R` 10 | 3. Run `lgbm_psudeo_labeling.R` 11 | 4. Run `get_importance.R`, this will generate the feature importance file (imp.csv) 12 | 5. Run `model.R` 13 | 14 | # Solution 15 | ### About me 16 | Currently, I am a Master's student at Tokyo Tech. For further information, please refer to my [Kaggle Profile](https://www.kaggle.com/andrew60909). 17 | 18 | ### Summary 19 | My final score (Public: 0.25848 / Private: 0.25854 / CV: 0.261616) is based on a single LGBM (5-fold bagging) built by selecting the top 700 features from the LGBM feature importance. I only use LightGBM here since it is good for dealing with tabular data and relatively fast compared to XGBoost and CatBoost, which allows you to test more ideas in limited time. For the validation scheme, I simply use 5-fold cross-validation and it works very well; the CV score always aligns with the LB score. 20 | 21 | ### Feature engineering 22 | The data is a little bit dirty, but compared to the data from the Signate student cup 2019, it was not a problem at all for me. I just spent some time transforming it from 全角 to 半角, then separating it into individual features so that we can do some feature engineering on them. 23 | 24 | My whole FE is composed of 6 parts : 25 | 26 | - Group method (numeric2cate) : Apply statistics of numeric features in different categorical features group. For example, applying "**mean**" on "**面積(㎡)**" group by "**市区町村コード**". 
27 | The statistics functions I used : 28 | + Mean, max, min, std, sum, skewness, kurtosis 29 | + Bayes mean 30 | + IQR : q75 - q25 31 | + IQR_ratio : q75 / q25 32 | + Median absolute deviation : median( abs(x - median(x)) ) 33 | + Mean variance : std(x) / mean(x) 34 | + hl_ratio : The ratio of numbers of the samples that higher and lower than the mean [(Ref, Table 2)](https://arxiv.org/pdf/1801.07323.pdf). 35 | + MAD : Median Absolute Deviation : median( |x - median(x)| ) 36 | + Beyond1std : Calculating the ratio beyond 1 std 37 | + Range : max - min 38 | + Range_ratio : max / min 39 | + Shapiro-Wilk Statistic 40 | + diff and ratio : "x - mean(x)" or "x / mean(x)" 41 | + Z-score : ( x-mean(x) ) / std(x) 42 | 43 | - Group method (cate2cate) : Apply statistics of categorical features in different categorical features group. For example, applying "**entropy**" on the frequency table of "**最寄駅:名称**" group by "**市区町村コード**". 44 | The statistics functions I used : 45 | + n_distinct : number of unique 46 | + Entropy : apply entropy on frequency table 47 | + freq1name : the number of most frequently appeared category 48 | + freq1ratio : the number of most frequently appeared category / group size 49 | 50 | - Target encoding : [Reference](http://helios.mm.di.uoa.gr/~rouvas/ssi/sigkdd/sigkdd.vol3.1/barreca.pdf) and [code](https://www.kaggle.com/brandenkmurray/it-is-lit) 51 | 52 | - Count encoding : This works very well on some categorical features like "**取引の事情等**" 53 | 54 | - Feature from land_price.csv : Making features by 2 different "Group method" that I have mentioned above. Applying the statistics on the features that is grouped by "**所在地コード**", then just merge it to our train+test data 55 | 56 | - Feature pseudo-labeling : Build a LGBM model to predict the important features (I used "**sq_meter**", "**land__mean__ON__h31_price**", "**nobeyuka_m2**", "**Age of house**","**time_to_nearest_aki**"), and then take the oof predictions. 
57 | 58 | 59 | ### Hyper-parameter 60 | Suprisingly that tuning "alpha" in huber loss give me really a big boost (~0.001). In huber loss, alpha=1 basically means absolute loss (same formula). So if we lower the alpha value, it will make your model less sensitive to those "outlier cases". 61 | 62 | ``` 63 | lgb_param <- list(boosting_type = 'gbdt', 64 | objective = "huber", 65 | boost_from_average = 'false', 66 | metric = "none", 67 | learning_rate = 0.008, 68 | num_leaves = 128, 69 | # min_gain_to_split = 0.01, 70 | feature_fraction = 0.05, 71 | # feature_fraction_seed = 666666, 72 | bagging_freq = 1, 73 | bagging_fraction = 1, 74 | min_sum_hessian_in_leaf = 5, 75 | # min_data_in_leaf = 100, 76 | lambda_l1 = 0, 77 | lambda_l2 = 0, 78 | alpha = 0.3 79 | ) 80 | ``` 81 | 82 | 83 |   84 | -------------------------------------------------------------------------------- /preprocessing.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # system setting and call library 5 | Sys.setlocale("LC_CTYPE", locale="Japanese") 6 | 7 | library(tidyverse) 8 | library(lightgbm) 9 | library(moments) 10 | library(entropy) 11 | library(rBayesianOptimization) 12 | library(MLmetrics) 13 | library(data.table) 14 | library(caret) 15 | 16 | # Read data 17 | train <- fread("train_data.csv", encoding = "UTF-8") 18 | test <- fread("test_data.csv", encoding = "UTF-8") 19 | land_price <- fread("published_land_price.csv", encoding = "UTF-8") 20 | y <- log1p(train$y) ; train$y <- NULL 21 | test_id <- test$id 22 | 23 | tem <- rbind(train,test) 24 | 25 | # ========================================================================================== 26 | # Data Cleaning 27 | tem <- tem %>% as.data.frame() %>% 28 | mutate( 29 | 30 | # 全角 -> 半角 31 | 間取り = z2h(間取り), 32 | 33 | # 最寄駅:距離(分) (time_to_nearest_aki) 34 | time_to_nearest_aki = ifelse(`最寄駅:距離(分)` == "30分?60分", 45, `最寄駅:距離(分)`), 35 | time_to_nearest_aki = ifelse(time_to_nearest_aki == "1H?1H30", 75, 
time_to_nearest_aki), 36 | time_to_nearest_aki = ifelse(time_to_nearest_aki == "1H30?2H", 105, time_to_nearest_aki), 37 | 38 | # 間取り 39 | room_type_normal = as.numeric(str_extract(間取り,"[0-9]+")), 40 | room_type_R = ifelse(grepl("R",間取り), 1, 0 ), 41 | room_type_L = ifelse(grepl("L",間取り), 1, 0 ), 42 | room_type_D = ifelse(grepl("D",間取り), 1, 0 ), 43 | room_type_K = ifelse(grepl("K",間取り), 1, 0 ), 44 | room_type_S = ifelse(grepl("S",間取り), 1, 0 ), 45 | 46 | # 面積(㎡) (sq_meter) 47 | sq_meter = ifelse(`面積(㎡)` == "2000㎡以上", 2500, `面積(㎡)`), 48 | sq_meter = as.numeric(ifelse(sq_meter == "5000㎡以上", 5500, sq_meter)), 49 | 50 | # 間口 (maguchi) 51 | maguchi = as.numeric(ifelse(間口 == "50.0m以上", 75, 間口)), 52 | 53 | # 延床面積(㎡)(nobeyuka_m2) 54 | nobeyuka_m2 = as.numeric(ifelse(`延床面積(㎡)` == "2000㎡以上", 2500, `延床面積(㎡)`)), 55 | 56 | # 建築年 (built_year) 57 | year1 = str_remove_all(建築年,'[0-9]|年'), 58 | year1 = ifelse(year1 == "昭和", 1925, year1), 59 | year1 = ifelse(year1 == "平成", 1988, year1), 60 | built_year_b4ww2 = ifelse(year1 == "戦前", 1, 0), 61 | year1 = as.numeric(ifelse(year1 == "戦前", NA, year1)), 62 | year2 = as.numeric(str_extract_all(建築年, '[0-9]+')), 63 | built_year = year1 + year2, 64 | 65 | # 取引時点 (transaction date) 66 | trans_date_yr = as.numeric(str_extract_all(取引時点, "[0-9]+")), 67 | trans_date_q = as.numeric(z2h(str_remove_all(取引時点, "[0-9]|年第|四半期"))), 68 | trans_date = trans_date_yr + 0.2 * trans_date_q, 69 | 70 | 71 | 72 | ) %>% 73 | select(-`最寄駅:距離(分)`, -間取り, -`面積(㎡)`, -間口, -`延床面積(㎡)`, -建築年, -year1, -year2, -取引時点) 74 | 75 | 76 | # JPN -> ENG 77 | # 種類, 地域, 市区町村コード, 都道府県名, 市区町村名, 地区名, 最寄駅:名称, 土地の形状, 建物の構造, 用途, 今後の利用目的, 78 | # 前面道路:方位, 前面道路:種類, 前面道路:幅員(m), 都市計画, 建ぺい率(%), 容積率(%), 改装, 取引の事情等 79 | 80 | colnames(tem)[c(2:20)] <- c("type", "land_type", "area_code", "city", "area", "sub_area", "nearest_aki_name", "land_shape", 81 | "house_backbone", "use", "future_use", "faced_direction", "faced_road_type", "faced_road_width", "city_plan", 82 | "tapei_ratio", "floor_area_ratio", 
"reconstructed", "note" 83 | ) 84 | 85 | tem <- tem %>% mutate( 86 | room_total = apply( tem[,c("room_type_normal", "room_type_R","room_type_L","room_type_D","room_type_K","room_type_S")], 1, 87 | function(x){sum(x,na.rm = T)} ) , 88 | square_meter_per_room = sq_meter / room_total , 89 | square_meter_per_room_v2 = sq_meter / room_type_normal 90 | ) %>% 91 | mutate_all(funs(ifelse(is.nan(.), NA, .))) %>% 92 | mutate_all(funs(ifelse(is.infinite(.), NA, .))) %>% 93 | select(-area, -id, -city) %>% 94 | as.data.table() 95 | 96 | 97 | cols <- names(which(lapply(tem,class) == "character")) 98 | tem[, (cols) := lapply(.SD, function(z) as.integer(as.factor(z))), .SDcols = cols] 99 | df6 <- extract_feature_land(land_price) 100 | tem <- merge(tem, df6, by = "area_code", all.x=TRUE, sort=FALSE) 101 | # =============================================================================================================== 102 | # Making features : part 1~5 103 | fun_list1 <- c("mean", "sd", "max", "min", "sum","skewness", "kurtosis", 104 | "IQR", "iqr_ratio", "beyond1std_ratio", "mean_var", "range_diff", "range_per", "hl_ratio", "sw_stat", 105 | "x_diff", "x_ratio", "x_zscore", "entropy") 106 | fun_list2 <- c("n_distinct", "freq1ratio", "freq1count", "entropy_freqtable") 107 | 108 | numer_list <- c("sq_meter", "nobeyuka_m2", "square_meter_per_room", "built_year", "faced_road_width", "floor_area_ratio", 109 | "maguchi" ,"square_meter_per_room_v2", "trans_date", "land__mean__ON__h31_price") 110 | 111 | cate_list <- c("area_code", "nearest_aki_name", "land_type", "use", "sub_area", "trans_date_yr") 112 | 113 | 114 | df1 <- get_group_feature_numer2cate(tem, fun_list1, numer_list, cate_list) 115 | df2 <- get_group_feature_cate2cate(tem, fun_list2, cate_list) 116 | df3 <- get_bayes_mean(tem, numer_list, cate_list) 117 | df4 <- get_count_encoding(tem, cate_list = cols) 118 | df5 <- get_target_encoding(tem, cate_list = cols) 119 | tem1 <- data.table(tem, df1, df2, df3, df4, df5) 120 | 121 | 122 
| 123 | -------------------------------------------------------------------------------- /lgbm_psudeo_labeling.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # sq_meter, land__mean__ON__h31_price, nobeyuka_m2, built_year 6 | 7 | lgb_pred_have_na <- function(data, target, target_name){ 8 | 9 | cat("input col :", ncol(data),"\n") 10 | data <- as.data.frame(data) 11 | non_na_idx <- which(is.na( as.numeric(unlist(data[,target_name])) ) == F ) 12 | na_idx <- which(is.na( as.numeric(unlist(data[,target_name])) ) == T ) 13 | 14 | if(target_name == "land__mean__ON__h31_price"){ data <- data[,which(grepl("_price",colnames(data)) == F )] } 15 | cleaned_target <- target[is.na(target) == F] 16 | cleaned_data <- data[,which(grepl(target_name,colnames(data)) == F )] 17 | cat("cleaned col :", ncol(cleaned_data),"\n") 18 | 19 | non_na_data <- cleaned_data[non_na_idx,] 20 | na_data <- cleaned_data[na_idx,] 21 | 22 | df_tem <- 1:nrow(non_na_data) 23 | c <- 0 24 | a <- vector() 25 | ppte <- 0 26 | 27 | for(i in 1:5){ 28 | invisible(gc()) 29 | print(i) 30 | set.seed(77) 31 | folds <- KFold(cleaned_target, nfolds = 5, stratified = FALSE, seed = 0) 32 | 33 | names(folds)[i] <- "test" 34 | test_st <- non_na_data[folds$test,] 35 | 36 | d0 <- lgb.Dataset(data.matrix( non_na_data[-c(folds$test),] ), label = cleaned_target[-c(folds$test)] ) 37 | dval <- lgb.Dataset(data.matrix( non_na_data[folds$test,] ), label = cleaned_target[c(folds$test)] ) 38 | 39 | lgb_param <- list(boosting_type = 'gbdt', 40 | objective = "regression" , 41 | metric = "RMSE", 42 | learning_rate = 0.1, 43 | num_leaves = 96, 44 | feature_fraction = 0.05, 45 | bagging_freq = 1, 46 | bagging_fraction = 1, 47 | min_data_in_leaf = 200 48 | ) 49 | 50 | valids <- list(valid = dval) 51 | lgb <- lgb.train(params = lgb_param, data = d0, nrounds = 2000, 52 | eval_freq = 200, valids = valids, early_stopping_rounds = 300, verbose = 1) 53 | 54 | pp <- predict(lgb, 
data.matrix(test_st)) 55 | ppte1 <- predict(lgb, data.matrix(na_data)) 56 | ppte <- ppte + ppte1 57 | 58 | df_tem[as.numeric(folds$test)] <- as.numeric(unlist(pp)) 59 | cat( ">>>> END <<<< ","\n") 60 | } 61 | ppte <- ppte/5 62 | return(list(df_tem,ppte)) 63 | } 64 | 65 | 66 | lgb_pred_no_na <- function(data, target, target_name){ 67 | 68 | data <- as.data.frame(data) 69 | cat("input col :", ncol(data),"\n") 70 | cleaned_data <- data[,which(grepl(target_name,colnames(data)) == F )] 71 | if(target_name == "land__mean__ON__h31_price"){ data <- data[,which(grepl("square_meter",colnames(data)) == F )] } 72 | cat("cleaned col :", ncol(cleaned_data),"\n") 73 | 74 | df_tem <- 1:nrow(cleaned_data) 75 | c <- 0 76 | a <- vector() 77 | ppte <- 0 78 | target <- log1p(target) 79 | 80 | for(i in 1:5){ 81 | invisible(gc()) 82 | print(i) 83 | set.seed(77) 84 | folds <- KFold(target, nfolds = 5, stratified = FALSE, seed = 0) 85 | 86 | names(folds)[i] <- "test" 87 | test_st <- cleaned_data[folds$test,] 88 | 89 | d0 <- lgb.Dataset(data.matrix( cleaned_data[-c(folds$test),] ), label = target[-c(folds$test)] ) 90 | dval <- lgb.Dataset(data.matrix( cleaned_data[folds$test,] ), label = target[c(folds$test)] ) 91 | 92 | lgb_param <- list(boosting_type = 'gbdt', 93 | objective = "regression" , 94 | metric = "RMSE", 95 | learning_rate = 0.1, 96 | num_leaves = 96, 97 | feature_fraction = 0.05, 98 | bagging_freq = 1, 99 | bagging_fraction = 1, 100 | min_data_in_leaf = 200 101 | ) 102 | 103 | valids <- list(valid = dval) 104 | lgb <- lgb.train(params = lgb_param, data = d0, nrounds = 1000, 105 | eval_freq = 200, valids = valids, early_stopping_rounds = 300, verbose = 1) 106 | 107 | pp <- predict(lgb, data.matrix(test_st)) 108 | 109 | df_tem[as.numeric(folds$test)] <- as.numeric(unlist(pp)) 110 | cat( ">>>> END <<<< ","\n") 111 | } 112 | return(df_tem) 113 | } 114 | 115 | 116 | get_pred0 <- function(data){ 117 | 118 | data <- as.data.frame(data) 119 | c <- 0 120 | tmp2 <- 1:nrow(data) 121 | 
pred_data <- data.frame(0) 122 | col <- data[,c("land__mean__ON__h31_price", "nobeyuka_m2", "built_year")] 123 | for(target_name in colnames(col)){ 124 | 125 | library(lightgbm) 126 | c <- c+1 127 | tmp1 <- lgb_pred_have_na(data, as.numeric(unlist(col[,c])), target_name) 128 | tmp2[ which(is.na( as.numeric(unlist(col[,target_name])) ) == F) ] <- tmp1[[1]] 129 | tmp2[ which(is.na( as.numeric(unlist(col[,target_name])) ) == T) ] <- tmp1[[2]] 130 | pred_data <- data.frame(pred_data,tmp2) 131 | lgb.unloader(wipe = T) 132 | } 133 | pred_data <- pred_data[,-1] ; colnames(pred_data) <- paste0(colnames(col),"___pred0_by_lgbm",sep="") 134 | return(pred_data) 135 | } 136 | 137 | 138 | df7 <- get_pred0(tem1) 139 | df8 <- data.table(expm1(lgb_pred_no_na(tem1, tem1$sq_meter, "sq_meter")), 140 | expm1(lgb_pred_no_na(tem1, tem1$time_to_nearest_aki, "time_to_nearest_aki")) 141 | ) 142 | 143 | colnames(df8) <- paste0(c("sq_meter", "time_to_nearest_aki"),"___pred0_by_lgbm",sep="") 144 | 145 | df7 <- data.table(df7, df8) 146 | write.csv(df7, "df7.csv", row.names=F) 147 | 148 | 149 | 150 | -------------------------------------------------------------------------------- /function.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Some function 4 | beyond1std_ratio <- function(x){ return( sum(ifelse(x > (mean(x,na.rm=T) + sd(x,na.rm=T)),1,0)) / length(x) )} 5 | iqr_ratio <- function(x){ return( quantile(x,0.75,na.rm=T) / quantile(x,0.25,na.rm=T) )} 6 | mean_var <- function(x){ return( sd(x,na.rm=T) / mean(x,na.rm=T) )} 7 | range_diff <- function(x){ return( max(x,na.rm=T) - min(x,na.rm=T) )} 8 | range_per <- function(x){ return( max(x,na.rm=T) / min(x,na.rm=T) )} 9 | hl_ratio <- function(x){ return( sum(ifelse(x > mean(x),1,0)) / sum(ifelse(x >= mean(x),0,1)) )} 10 | sw_stat <- function(x){ return( ifelse(sum(is.na(x) == F) > 3 & sum(is.na(x) == F) < 5000 & 11 | length(x) > 3 & length(x) < 5000 & sum(diff(x),na.rm=T)!=0 , 12 | 
shapiro.test(as.numeric(x))$statistic, NA) )} 13 | x_diff <- function(x){ return(x - mean(x,na.rm=T)) } 14 | x_ratio <- function(x){ return(x / mean(x,na.rm=T)) } 15 | x_zscore <- function(x){ return( (x-mean(x,na.rm=T)) / sd(x,na.rm=T)) } 16 | 17 | freq1ratio <- function(x){ return( 18 | ifelse(sort(table(x),decreasing = T)[1] == sort(table(x),decreasing = T)[2], 19 | NA, as.numeric( sort(table(x),decreasing = T)[1] / length(x) ) ) )} 20 | freq1count <- function(x){ return( 21 | ifelse(sort(table(x),decreasing = T)[1] == sort(table(x),decreasing = T)[2], 22 | NA, ( sort(table(x),decreasing = T)[1] ) ) )} 23 | 24 | entropy_freqtable <- function(x){ return( as.numeric(entropy(table(x)))) } 25 | 26 | 27 | # Revised version of function "zen2han" from "Nippon" package 28 | z2h <- function (s) 29 | { 30 | s <- ifelse(s == "", NA, s) 31 | ori_s <- s 32 | s <- na.omit(s) 33 | if (any(Encoding(s) != "UTF-8")) 34 | s <- iconv(s, from = "", to = "UTF-8") 35 | s <- paste(s, sep = "") 36 | y <- sapply(unlist(strsplit(s, split = ",")), function(x) { 37 | i <- utf8ToInt(x) 38 | if (i >= 65281 && i <= 65374) { 39 | return(intToUtf8(i - 65248)) 40 | } 41 | else { 42 | return(x) 43 | } 44 | }) 45 | ori_s[!is.na(ori_s)] <- paste(y) 46 | return(ori_s) 47 | } 48 | 49 | # Get group function 50 | # can acclerate whole process by using %dopar% 51 | get_group_feature_numer2cate <- function(data, fun_list, numer_list, cate_list){ 52 | 53 | tmp1 <- data.table(row_idx = 1:nrow(data)) 54 | 55 | for(fun_name in fun_list){ 56 | cat("- Processing : ", fun_name, "\n") 57 | for(cate in cate_list){ 58 | for(numer in numer_list){ 59 | 60 | if(cate == numer) next 61 | data2 <- data[,c(cate,numer), with=F] 62 | 63 | if(fun_name %in% c("x_diff", "x_ratio", "x_zscore")){ 64 | tmp <- data2[ , col := get(fun_name)( get(numer) ), by = cate] 65 | } else{ tmp <- data2[ , col := get(fun_name)( na.omit(get(numer)) ), by = cate] } 66 | 67 | colnames(tmp)[3] <- paste0(fun_name, "__ON__", numer, "__BY__", cate) 
68 | tmp1 <- data.table(tmp1,tmp[,-c(1:2)]) 69 | } 70 | } 71 | } 72 | tmp1 <- tmp1[, lapply(.SD, function(x){ ifelse(is.infinite(x) | is.nan(x), NA, x) } )] 73 | cat("==== Process completed, ignore the warning message from max and min function (returning Inf) ====", "\n") 74 | return(tmp1[,-1]) 75 | } 76 | 77 | # Get group function 78 | # can acclerate whole process by using %dopar% 79 | get_group_feature_cate2cate <- function(data, fun_list, cate_list){ 80 | 81 | tmp1 <- data.table(row_idx = 1:nrow(data)) 82 | data2 <- data[,c(cate_list), with=F] 83 | 84 | 85 | for(fun_name in fun_list){ 86 | cat("- Processing : ", fun_name, "\n") 87 | for(cate1 in cate_list){ 88 | for(cate2 in cate_list){ 89 | 90 | if(cate1 == cate2) next 91 | if(cate1 == "sub_area" & cate2 == "area_code") next 92 | tmp <- data2[, col := NULL][ , col := get(fun_name)( get(cate2) ), by = cate1][ , "col", with=F] 93 | colnames(tmp) <- paste0(fun_name, "__ON__", cate2, "__BY__", cate1) 94 | tmp1 <- data.table(tmp1,tmp) 95 | } 96 | } 97 | } 98 | cat("==== Process completed ====", "\n") 99 | return(tmp1[,-1]) 100 | } 101 | 102 | 103 | get_bayes_mean <- function(data, numer_list, cate_list, prior=20){ 104 | 105 | tmp1 <- data.table(row_idx = 1:nrow(data)) 106 | 107 | for(cate in cate_list){ 108 | for(numer in numer_list){ 109 | 110 | data2 <- data[,c(cate,numer), with=F] 111 | tmp <- data2[ , col := ( ( ( get(numer) / mean(na.omit(get(numer))) ) * .N) + prior ) / (.N + prior) , by = cate][is.na(col), col:= 1] 112 | colnames(tmp)[3] <- paste0("bayes_mean", "__ON__", numer, "__BY__", cate) 113 | tmp1 <- data.table(tmp1,tmp[,-c(1:2)]) 114 | } 115 | } 116 | 117 | tmp1 <- tmp1[, lapply(.SD, function(x){ ifelse(is.infinite(x) | is.nan(x) , NA, x) } )] 118 | cat("==== Process completed ====", "\n") 119 | return(tmp1[,-1]) 120 | } 121 | 122 | 123 | get_count_encoding <- function(data, cate_list){ 124 | tmp1 <- data.table(row_idx = 1:nrow(data)) 125 | for(cate in cate_list){ 126 | tmp <- data[ , col := .N, 
by=cate ][ , "col", with=F] 127 | colnames(tmp) <- paste0("count_encoding__OF__", cate) 128 | tmp1 <- data.table(tmp1, tmp) 129 | } 130 | cat("==== Process completed ====", "\n") 131 | return(tmp1[,-1]) 132 | } 133 | 134 | 135 | 136 | # Author of the code : Branden Murray 137 | # https://www.kaggle.com/brandenkmurray/it-is-lit 138 | # This is a modified version for regression task 139 | catNWayAvgCV <- function(data, varList, y, pred0, filter, k, f, g=1, lambda=NULL, r_k, cv=NULL){ 140 | 141 | # It is probably best to sort your dataset first by filter and then by ID (or index) 142 | n <- length(varList) 143 | varNames <- paste0("v",seq(n)) 144 | ind <- unlist(cv, use.names=FALSE) 145 | oof <- NULL 146 | if (length(cv) > 0){ 147 | for (i in 1:length(cv)){ 148 | sub1 <- data.table(v1=data[,varList,with=FALSE], y=data[,y,with=FALSE], pred0=data[,pred0,with=FALSE], filt=filter) 149 | sub1 <- sub1[sub1$filt==TRUE,] 150 | sub1[,filt:=NULL] 151 | colnames(sub1) <- c(varNames,"y","pred0") 152 | sub2 <- sub1[cv[[i]],] 153 | sub1 <- sub1[-cv[[i]],] 154 | sum1 <- sub1[,list(sumy=sum(y), avgY=mean(y), cnt=length(y)), by=varNames] 155 | tmp1 <- merge(sub2, sum1, by = varNames, all.x=TRUE, sort=FALSE) 156 | set(tmp1, i=which(is.na(tmp1[,cnt])), j="cnt", value=0) 157 | set(tmp1, i=which(is.na(tmp1[,sumy])), j="sumy", value=0) 158 | if(!is.null(lambda)) tmp1[beta:=lambda] else tmp1[,beta:= 1/(g+exp((tmp1[,cnt] - k)/f))] 159 | tmp1[,adj_avg:=((1-beta)*avgY+beta*pred0)] 160 | set(tmp1, i=which(is.na(tmp1[["avgY"]])), j="avgY", value=tmp1[is.na(tmp1[["avgY"]]), pred0]) 161 | set(tmp1, i=which(is.na(tmp1[["adj_avg"]])), j="adj_avg", value=tmp1[is.na(tmp1[["adj_avg"]]), pred0]) 162 | set(tmp1, i=NULL, j="adj_avg", value=tmp1$adj_avg*(1+(runif(nrow(sub2))-0.5)*r_k)) 163 | oof <- c(oof, tmp1$adj_avg) 164 | } 165 | } 166 | oofInd <- data.frame(ind, oof) 167 | oofInd <- oofInd[order(oofInd$ind),] 168 | sub1 <- data.table(v1=data[,varList,with=FALSE], y=data[,y,with=FALSE], 
pred0=data[,pred0,with=FALSE], filt=filter) 169 | colnames(sub1) <- c(varNames,"y","pred0","filt") 170 | sub2 <- sub1[sub1$filt==F,] 171 | sub1 <- sub1[sub1$filt==T,] 172 | sum1 <- sub1[,list(sumy=sum(y), avgY=mean(y), cnt=length(y)), by=varNames] 173 | tmp1 <- merge(sub2, sum1, by = varNames, all.x=TRUE, sort=FALSE) 174 | tmp1$cnt[is.na(tmp1$cnt)] <- 0 175 | tmp1$sumy[is.na(tmp1$sumy)] <- 0 176 | if(!is.null(lambda)) tmp1$beta <- lambda else tmp1$beta <- 1/(g+exp((tmp1$cnt - k)/f)) 177 | tmp1$adj_avg <- (1-tmp1$beta)*tmp1$avgY + tmp1$beta*tmp1$pred0 178 | tmp1$avgY[is.na(tmp1$avgY)] <- tmp1$pred0[is.na(tmp1$avgY)] 179 | tmp1$adj_avg[is.na(tmp1$adj_avg)] <- tmp1$pred0[is.na(tmp1$adj_avg)] 180 | # Combine train and test into one vector 181 | return(c(oofInd$oof, tmp1$adj_avg)) 182 | } 183 | 184 | 185 | get_target_encoding <- function(data, cate_list){ 186 | 187 | data <- tem ; cate_list <- cols 188 | 189 | for_enc <- data.table(data[ ,cate_list, with=F], # takes only low cardinaility categorical features 190 | y = c(y,rep(NA,nrow(test))), # target 191 | pred0 = rep(mean(y),nrow(tem)), # global mean 192 | filter=c(rep(0,nrow(train)),rep(1,nrow(test))) # Indicater, to show where is train and test 193 | ) 194 | 195 | cvFoldsList <- createFolds(y, k=5, list=TRUE, returnTrain=FALSE) # create 5 folds 196 | 197 | tmp1 <- data.table(row_idx = 1:nrow(data)) 198 | for(cate in cate_list){ 199 | tmp <- data.table(fea = catNWayAvgCV(for_enc, varList=cate, y="y", pred0="pred0", 200 | filter=for_enc$filter==0, k=30, f=10, r_k=0.02, cv=cvFoldsList)) 201 | colnames(tmp) <- paste0("target_encoding__OF__", cate) 202 | tmp1 <- data.table(tmp1, tmp) 203 | } 204 | cat("==== Process completed ====", "\n") 205 | return(tmp1[,-1]) 206 | } 207 | 208 | # Extract feature from land_price.csv 209 | extract_feature_land <- function(data){ 210 | 211 | # Numeric : 2,3,14,20,22,34,40,41,(43) 44:49,50:80 212 | # Cate : 5,6,17,18,24,32,37 213 | 214 | 
colnames(data)[c(2:6,14,17,18,20,22,24,32,34,37,40,41,43,44:49,50:80 )] <- 215 | c("longitude", "latitude", "area_code", "use", "seq_num", "chiseki", "backbone", "facility", "maguchi_ratio", #20 216 | "floor", "faced_road_type", "envir", "distance_to_aki", "cityplan_type", "tapei_ratio", "floor_area_ratio", "selected_yr", #43 217 | paste0("s",58:63,"_price"), paste0("h",1:31,"_price") ) 218 | 219 | # Data cleaning 220 | df_tmp <- data[,c(paste0("s",58:63,"_price"), paste0("h",1:31,"_price")), with=F] 221 | df_tmp[df_tmp == 0] <- NA 222 | data[,c(paste0("s",58:63,"_price"), paste0("h",1:31,"_price"))]<- df_tmp 223 | invisible(gc()) 224 | 225 | data[, `:=`(selected_yr = as.numeric(str_count(selected_yr, "1")), 226 | past15yr_price_mean = apply(data[, paste0("h",c(16:31),"_price"), with=F], 1, mean, na.rm=T), 227 | past15yr_price_sd = apply(data[, paste0("h",c(16:31),"_price"), with=F], 1, sd, na.rm=T), 228 | overall_price_mean = apply(data[, c(paste0("s",58:63,"_price"), paste0("h",1:31,"_price")), with=F], 1, mean, na.rm=T), 229 | overall_price_sd = apply(data[, c(paste0("s",58:63,"_price"), paste0("h",1:31,"_price")), with=F], 1, sd, na.rm=T) 230 | )] 231 | 232 | fun_list1 <- c("mean", "sd", "max", "min", "sum","skewness", "kurtosis", 233 | "IQR", "iqr_ratio", "mean_var", "range_diff", "range_per", "hl_ratio") 234 | fun_list2 <- c("n_distinct", "entropy_freqtable") 235 | tar_col_numer <- c("longitude", "latitude", "chiseki", "maguchi_ratio", "floor", "distance_to_aki","tapei_ratio", "floor_area_ratio", 236 | "selected_yr", paste0("h",c(1,11,21,31),"_price"), 237 | "overall_price_mean", "overall_price_sd", "past15yr_price_mean", "past15yr_price_sd") 238 | tar_col_cate <- c("use", "seq_num", "backbone", "facility", "faced_road_type", "envir", "cityplan_type") 239 | 240 | # Making features 241 | tmp <- data[ , .( col = .N ), by = "area_code"][,1] 242 | 243 | for(funfun in fun_list1){ 244 | for(tar_col in tar_col_numer){ 245 | df <- data[ , .( col = get(funfun)( 
as.double(na.omit(get(tar_col))) ) ) , by = "area_code" ] 246 | colnames(df)[2] <- paste0( "land__", funfun , "__ON__", tar_col ) 247 | tmp <- merge(tmp, df, by = "area_code", all.x=TRUE, sort=FALSE) 248 | } 249 | } 250 | 251 | for(funfun in fun_list2){ 252 | for(tar_col in tar_col_cate){ 253 | df <- data[, col := NULL][ , .( col = get(funfun)( get(tar_col) ) ) , by = "area_code" ] 254 | colnames(df)[2] <- paste0( "land__", funfun , "__ON__", tar_col ) 255 | tmp <- merge(tmp, df, by = "area_code", all.x=TRUE, sort=FALSE) 256 | } 257 | } 258 | tmp <- tmp[, lapply(.SD, function(x){ ifelse(is.infinite(x) | is.nan(x) , NA, x) } )] 259 | return(tmp) 260 | } 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | --------------------------------------------------------------------------------