├── AnalyticsVidhya ├── readme.md ├── WNS-analytics-wizard-2019 │ └── README.md ├── amexpert-2019-machine-learning-hackathon │ └── code │ │ ├── README.md │ │ ├── agg_feature_2_merge.R │ │ ├── agg_feature.R │ │ └── agg_feature_2.R ├── Knocktober │ └── readme.md ├── Date-your-Data │ ├── feature_df_all_CountOfApplications.R │ ├── 11_Ensemble_Models.R │ ├── feature_df_all_Match_Internship_Location_with_other_locations.R │ ├── BUILD_FINAL_SUBMISSION.R │ ├── README.md │ ├── 4_feature_internship_SkillsCoding.R │ ├── 3_feature_internship_Profile_Coding.R │ ├── 6_feature_student_degreeCoding.R │ ├── 7_feature_student_ExperienceCoding.R │ ├── 2_feature_internship_Profile_WordCount.R │ ├── 5_feature_student_StreamsCoding.R │ ├── 9_model_XGB_1.R │ ├── 10_model_XGB_1.R │ └── 1_internship_WordCorrection.R └── AVDatafest_XtremeML │ ├── input │ └── holiday.csv │ └── README.md ├── Kaggle ├── readme.md └── Avito Duplicate Ad Detection │ ├── input │ └── README.txt │ ├── output │ └── README.txt │ ├── cache │ └── README.txt │ ├── tokenizers │ └── punkt │ │ └── PY3 │ │ └── russian.pickle │ ├── Documentation - TheQuants Team - Avito Contest.pdf │ ├── code │ ├── 5_data_postprocessing.py │ ├── feature_verification.py │ ├── 3_feature_set1d_interaction.R │ ├── libavito.py │ ├── models │ │ ├── libavito.py │ │ ├── marios_xgregv3.py │ │ ├── marios_xgson_v4.py │ │ ├── marios_xgsonv2_v5.py │ │ ├── marios_xgrank_v2.py │ │ ├── marios_xgson_v2.py │ │ ├── marios_xgson_v3.py │ │ └── marios_xgrank_v3.py │ ├── 3_feature_set4a_fuzzy.py │ ├── 3_feature_set1g_capitalLetters.R │ ├── 3_feature_set4b_fuzzy_clean.py │ ├── 3_feature_set1f_SpecialCounting.R │ ├── legacy │ │ └── 3_feature_set4e_count3way_clean.py │ ├── 3_feature_set3d_json1.py │ ├── 3_feature_set3c_json.py │ ├── 3_feature_set3b_title.py │ ├── 3_feature_set3a_description.py │ ├── 3_feature_set1e_attribute.R │ ├── 3_json_to_cols.py │ ├── functions.R │ ├── 3_feature_set3f_hamming.py │ ├── 2_image_info.py │ ├── 3_feature_set1a_ngram.R │ ├── 3_feature_set1b_nchar.R │ ├── 5_consolidate_features.R │ ├── 3_feature_set4c_alternate.py │ ├── 3_feature_set1c_misc.R │ └── 1_data_preprocessing.py │ ├── config.cfg │ ├── README.md │ └── runAll.sh ├── README.md ├── HackerEarth ├── Predict Lanes from LIDAR data │ ├── Rplot.png │ ├── final_1_calculateHaversineDistance.R │ ├── README.md │ ├── final_2_buildData.R │ └── final_3_buildModel.R └── Loan Default ML Challenge │ └── README.md ├── HackerRank └── Walmart-Codesprint │ └── readme.md └── Microsoft └── Womens-Health-Risk-Assessment ├── README.md └── Predict.R /AnalyticsVidhya/readme.md: -------------------------------------------------------------------------------- 1 | Analytics Vidhya Hackathons 2 | -------------------------------------------------------------------------------- /Kaggle/readme.md: -------------------------------------------------------------------------------- 1 | Repository for all my Kaggle Competitions 2 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/input/README.txt: -------------------------------------------------------------------------------- 1 | This is the default location for input files 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Competitions 2 | Repository of various competitions I participate 3 | 4 | (c) Sonny Laskar 5 | 
-------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/output/README.txt: -------------------------------------------------------------------------------- 1 | This is the default location for output files such as submission files 2 | -------------------------------------------------------------------------------- /AnalyticsVidhya/WNS-analytics-wizard-2019/README.md: -------------------------------------------------------------------------------- 1 | # Repo for WNS Hackathon # 2 | https://github.com/sonnylaskar/wns-analytics-wizard-2019 3 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/cache/README.txt: -------------------------------------------------------------------------------- 1 | This is the default location for cache files such as models and cleaned data/features 2 | -------------------------------------------------------------------------------- /HackerEarth/Predict Lanes from LIDAR data/Rplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sonnylaskar/Competitions/HEAD/HackerEarth/Predict Lanes from LIDAR data/Rplot.png -------------------------------------------------------------------------------- /HackerRank/Walmart-Codesprint/readme.md: -------------------------------------------------------------------------------- 1 | # Walmart Codesprint 2 | 3 | #Final Rank: 14 / 132 4 | 5 | https://www.hackerrank.com/contests/walmart-codesprint-ml/challenges 6 | 7 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/tokenizers/punkt/PY3/russian.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sonnylaskar/Competitions/HEAD/Kaggle/Avito Duplicate Ad Detection/tokenizers/punkt/PY3/russian.pickle -------------------------------------------------------------------------------- /AnalyticsVidhya/amexpert-2019-machine-learning-hackathon/code/README.md: -------------------------------------------------------------------------------- 1 | American Express Coupon Conversion Hackathon 2 | https://datahack.analyticsvidhya.com/contest/amexpert-2019-machine-learning-hackathon/ 3 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/Documentation - TheQuants Team - Avito Contest.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sonnylaskar/Competitions/HEAD/Kaggle/Avito Duplicate Ad Detection/Documentation - TheQuants Team - Avito Contest.pdf -------------------------------------------------------------------------------- /AnalyticsVidhya/Knocktober/readme.md: -------------------------------------------------------------------------------- 1 | # Competition: 2 | https://datahack.analyticsvidhya.com/contest/knocktober-2016/ 3 | 4 | Problem Type: 5 | Binary Classification 6 | 7 | # Models: 8 | 9 | 2 Bags of XGBoost 10 | 11 | 2 Bags of GBM 12 | 13 | Equal Weighted Rank Average of the above models. 
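For illustration, an equal-weighted rank average of two prediction files can be computed as in the sketch below (a minimal sketch, not the actual contest code; `pred_xgb`, `pred_gbm`, `ID` and `Outcome` are placeholder names):

```r
# Equal-weighted rank average of two sets of predicted probabilities.
# pred_xgb and pred_gbm are assumed to be data frames holding the two
# bagged model outputs, with an ID column and a score column Outcome.
rank_avg <- (rank(pred_xgb$Outcome) + rank(pred_gbm$Outcome)) / 2

# Rescale the averaged ranks to [0, 1] so they can be submitted as scores
submission <- data.frame(ID = pred_xgb$ID,
                         Outcome = rank_avg / max(rank_avg))
write.csv(submission, "rank_average_submission.csv", row.names = FALSE)
```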
14 | 15 | # Score: 16 | Public LB: 0.8362 (Rank 4) 17 | 18 | Private LB: 0.7685 (Rank 3) 19 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/feature_df_all_CountOfApplications.R: -------------------------------------------------------------------------------- 1 | #Feature 2 | #Add a column of how many applications received for any Internship_ID 3 | Intern_Freq <- data.frame(table(df_all$Internship_ID)) 4 | names(Intern_Freq) <- c("Internship_ID", "Internship_ApplicationCount") 5 | Intern_Freq$Internship_ID <- as.integer(as.character(Intern_Freq$Internship_ID)) 6 | df_all <- left_join(df_all, Intern_Freq, by = "Internship_ID" ) 7 | rm(Intern_Freq) 8 | #END -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/11_Ensemble_Models.R: -------------------------------------------------------------------------------- 1 | library(readr) 2 | 3 | #Ensemble the 2 XGB models 4 | MODEL_1 <- read_csv("../Submissions/XGB_MODEL_S123_N526.csv") 5 | MODEL_2 <- read_csv("../Submissions/XGB_MODEL_S500_N710.csv") 6 | 7 | MEANSCORE <- (MODEL_1$Is_Shortlisted + MODEL_2$Is_Shortlisted) / 2 8 | 9 | #SAVE 10 | submission <- data.frame(Internship_ID = MODEL_1$Internship_ID, 11 | Student_ID = MODEL_1$Student_ID, 12 | Is_Shortlisted = MEANSCORE) 13 | write_csv(submission,"../Submissions/FINAL_SUBMISSION.csv") 14 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/feature_df_all_Match_Internship_Location_with_other_locations.R: -------------------------------------------------------------------------------- 1 | #Feature 2 | #Add if InternLocation matches with hometomeLocation, 3 | #if InternLocation matches with InstitudeLocationCode 4 | #if InternLocation matches with PreferredLocationCode 5 | 6 | df_all$isIntern_Loc_Match_HomeTown <- ifelse(df_all$LocationCode == df_all$hometownLocationCode, 1, 0) 7 | df_all$isIntern_Loc_Match_InstitudeLocationCode <- ifelse(df_all$LocationCode == df_all$InstitudeLocationCode, 1, 0) 8 | df_all$isIntern_Loc_Match_PreferredLocationCode <- ifelse(df_all$LocationCode == df_all$PreferredLocationCode, 1, 0) 9 | -------------------------------------------------------------------------------- /HackerEarth/Loan Default ML Challenge/README.md: -------------------------------------------------------------------------------- 1 | Code for the HackerEarth competition on detecting loan defaulters 2 | https://www.hackerearth.com/challenge/competitive/machine-learning-challenge-one/ 3 | 4 | # The code was built on the below platform: 5 | OS: Linux (CentOS) 6 | RAM: 16GB 7 | CPU Core: 8 8 | 9 | Software: 10 | R 3.3.2 11 | 12 | R Packages: 13 | readr,dplyr,caret,xgboost,gbm,data.table,lightgbm,tm,stringr,ModelMetrics) 14 | 15 | To generate the final submission: 16 | 1) Create the folders: input, code, output 17 | 2) Put the data files in input folder 18 | 3) Save the R files in code folder 19 | 4) Execute the below command : 20 | Rscript final_model.R 21 | 5) It will take around 1 hour to complete 22 | 6) The final submission file will be created in output folder 23 | 24 | 25 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/BUILD_FINAL_SUBMISSION.R: -------------------------------------------------------------------------------- 1 | #This will build the Final Solution 2 | #Will take some time 3 | 4 | source("1_internship_WordCorrection.R") 5 | 
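#Feature engineering: scripts 2-4 build internship features, scripts 5-7 build student features, and script 8 preprocesses the data for modelling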
source("2_feature_internship_Profile_WordCount.R") 6 | source("3_feature_internship_Profile_Coding.R") 7 | source("4_feature_internship_SkillsCoding.R") 8 | source("5_feature_student_StreamsCoding.R") 9 | source("6_feature_student_degreeCoding.R") 10 | source("7_feature_student_ExperienceCoding.R") 11 | source("8_preprocessing.R") 12 | 13 | print("Building First XGB model") 14 | source("9_model_XGB_1.R") 15 | print("Building Second XGB model") 16 | source("10_model_XGB_1.R") 17 | 18 | print("Calculating the Average of the 2 models") 19 | source("11_Ensemble_Models.R") 20 | print("Huh!!!, I am done!") 21 | print("Check out FINAL_SUBMISSION FILE in Submission FOlder") -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/5_data_postprocessing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import libavito as a 4 | import feather as f 5 | import time 6 | 7 | cache_loc = a.get_config().cache_loc 8 | 9 | start = time.time() 10 | print('Transforming training data ... ', end='', flush=True) 11 | df = f.read_dataframe(cache_loc + 'final_featureSet_train.fthr') 12 | df.replace([np.nan, None], -1, inplace=True) 13 | df.replace([np.inf, -np.inf], 9999.99, inplace=True) 14 | f.write_dataframe(df, cache_loc + 'final_featureSet_train.fthr') 15 | del df 16 | a.print_elapsed(start) 17 | 18 | start = time.time() 19 | print('Transforming testing data ... ', end='', flush=True) 20 | df = f.read_dataframe(cache_loc + 'final_featureSet_test.fthr') 21 | df.replace([np.nan, None], -1, inplace=True) 22 | df.replace([np.inf, -np.inf], 9999.99, inplace=True) 23 | f.write_dataframe(df, cache_loc + 'final_featureSet_test.fthr') 24 | a.print_elapsed(start) 25 | 26 | -------------------------------------------------------------------------------- /AnalyticsVidhya/amexpert-2019-machine-learning-hackathon/code/agg_feature_2_merge.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(lubridate) 3 | 4 | createDf <- function(file_names) { 5 | df <- tibble() 6 | for (i in file_names) { 7 | tmp <- suppressMessages(read_csv(i)) 8 | if (nrow(df) == 0) { 9 | df <- tmp 10 | } else { 11 | df <- left_join(df, tmp, by = c("CampaignDate", "customer_id", "item_id")) 12 | } 13 | rm(tmp) 14 | gc() 15 | } 16 | df 17 | } 18 | 19 | #[1] 26 27 28 29 30 1 2 3 4 5 6 7 8 9 10 11 12 13 16 17 18 19 20 21 22 23 24 25 20 | #assign("df_2", df_1) 21 | 22 | df <- tibble() 23 | for (i in 1:30) { 24 | print(i) 25 | tmp <- createDf(list.files(path = "../input/", 26 | pattern = paste0("agg_feat_",i,"_"), 27 | full.names = T)) 28 | #assign(paste0("df_", i), df) 29 | if (nrow(df) == 0) { 30 | df <- tmp 31 | } else { 32 | df <- bind_rows(df, tmp) 33 | } 34 | rm(tmp) 35 | gc() 36 | } 37 | 38 | write_csv(df, "../input/agg_v2.csv") 39 | 40 | -------------------------------------------------------------------------------- /Microsoft/Womens-Health-Risk-Assessment/README.md: -------------------------------------------------------------------------------- 1 | # Microsoft - Womens Health Risk Assessment Machine Learning Competition 2 | https://gallery.cortanaintelligence.com/Competition/Women-s-Health-Risk-Assessment-1 3 | 4 | # Problem 5 | ## Type of Problem: 6 | Supervised Multiclass Classification Problem 7 | 8 | ## Problem Description: 9 | 10 | The objective of this machine learning competition is to build machine learning models to assign a 
young woman subject (15-30 years old) in one of the 9 underdeveloped regions into a risk segment, and a subgroup within the segment. 11 | After the accurate assignments of the risk segment and subgroup in each region, a healthcare practitioner can deliver services to prevent the subject from the health risks, specifically sexual and reproductive health risks (like HIV infections). The types of services are personalized, based on the risk segment and subgroup assignments. 12 | 13 | ## Evaluation: 14 | Accuracy 15 | 16 | ## Score: 17 | Public Leaderboard : _87.316611_ Rank: _7_ / 493 18 | 19 | Private Leaderboard : _87.144886_ Rank: _7_ / 493 20 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/config.cfg: -------------------------------------------------------------------------------- 1 | # All file locations must be unix locations - FOLDERS MUST END IN '/' 2 | 3 | ########################### 4 | ##### MACHINE CONFIG ##### 5 | ########################## 6 | # When preprocessing_nthreads is set to 1, you will receive 7 | # more progress/speed updates. 8 | preprocessing_nthreads = 12 9 | model_nthreads = 12 10 | 11 | # Set to the folder where config.cfg resides 12 | BASE_DIR = '/path/to/directory/' 13 | 14 | # Location to store intermediate files (eg. models or processed features) - SSD suggested 15 | cache_loc = './cache/' 16 | 17 | # Location to put output files 18 | output_loc = './output/' 19 | 20 | ####################### 21 | ##### INPUT FILES ##### 22 | ####################### 23 | 24 | train_ItemInfo = './input/ItemInfo_train.csv' 25 | train_ItemPairs = './input/ItemPairs_train.csv' 26 | 27 | test_ItemInfo = './input/ItemInfo_test.csv' 28 | test_ItemPairs = './input/ItemPairs_test.csv' 29 | 30 | category_csv = './input/Category.csv' 31 | location_csv = './input/Location.csv' 32 | 33 | images_root = '/path/to/images/' 34 | -------------------------------------------------------------------------------- /HackerEarth/Predict Lanes from LIDAR data/final_1_calculateHaversineDistance.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(stringr) 3 | library(geosphere) 4 | 5 | label <- read_csv("../input/labels.csv") 6 | label$roadCoordinates <- NULL 7 | train <- read_csv("../input/train.csv") 8 | test <- read_csv("../input/test.csv") 9 | df_all <- bind_rows(train, test) 10 | 11 | getDis <- function(x) { 12 | x <- as.data.frame(matrix(as.numeric(unlist(strsplit(unlist(strsplit(x, "\\|")), " "))), byrow = T, ncol = 2)) 13 | x$V1 <- ifelse(x$V1 < -90, -90, x$V1) 14 | x$V2 <- ifelse(x$V2 < -90, -90, x$V2) 15 | x$V1 <- ifelse(x$V1 > 90, 90, x$V1) 16 | x$V2 <- ifelse(x$V2 > 90, 90, x$V2) 17 | x <- arrange(x, V1, V2)[c(1, nrow(x)), ] 18 | distHaversine(x[, 1], x[, 2]) 19 | } 20 | 21 | 22 | getHaversineDistance <- function(id) { 23 | median(sapply(df_all$laneLineCoordinates[df_all$roadId == id], getDis, USE.NAMES = F) ) 24 | } 25 | 26 | roads <- data_frame(roadId = unique(df_all$roadId)) 27 | roads$haversineDistance <- (sapply(roads$roadId, getHaversineDistance)) 28 | 29 | write_csv(roads, "../input/roadsDistance.csv") 30 | 31 | 32 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/README.md: -------------------------------------------------------------------------------- 1 | # AnalyticsVidhya Date Your Data Contest 2 | This repository contains the code used by me in the "Date Your Data Contest". 
This scored 0.7006 on the Private Leaderboard and secured 3rd position in the contest 3 | 4 | https://www.analyticsvidhya.com/blog/2016/03/winning-solutions-dyd-competition-xgboost-ruled/ 5 | 6 | A) Prerequisites 7 | 8 | Ensure that the following packages are installed: 9 | dplyr, tidy, xgboost, tm, SnowBallC, readr, qdap, stringr, stylo, caret 10 | 11 | B) Build Submission File 12 | 13 | 1) Ensure that all datasets i.e. Student, Internships, train and test are 14 | present in the "data" folder. Download from the link in the description of this repository 15 | 2) Execute the RScript "BUILD_FINAL_SUBMISSION.R" 16 | 3) Wait for some time to complete 17 | 4) Check the "Submissions" folder for the FINAL Submission file 18 | 19 | C) Improvements 20 | 21 | 1) I couldn't try other models for lack of time. Building other models might also be helpful 22 | 2) I have build a long list of features but I haven't removed the unnecessary files. Feature Selection would have us reduce the feature Sets 23 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/feature_verification.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import libavito as a 4 | from multiprocessing import Pool 5 | 6 | df1 = pd.read_csv('') 7 | df2 = pd.read_csv('') 8 | 9 | def find_best_feature(c): 10 | ftr = df1[c].values 11 | 12 | high_correl = 0 13 | high_ftr = '' 14 | num_995 = 0 15 | for c2 in df2.columns: 16 | cor = np.corrcoef(ftr, df2[c2])[0, 1] 17 | if cor > 0.995: 18 | num_995 += 1 19 | if cor > high_correl: 20 | high_correl = cor 21 | high_ftr = c2 22 | 23 | return high_correl, high_ftr, num_995 24 | 25 | for c in df1.columns: 26 | hc, hf, n995 = find_best_feature(c) 27 | 28 | if hc == 1: 29 | print(a.c.OKGREEN + (c + ' -> ' + hf).ljust(60) + ' | CORREL 1' + a.c.END) 30 | elif hc > 0.995: 31 | print(a.c.OKBLUE + (c + ' -> ' + hf).ljust(60) + ' | CORREL ' + str(hc) + a.c.END) 32 | elif hc > 0.95: 33 | print(a.c.WARNING + (c + ' -> ' + hf).ljust(60) + ' | CORREL ' + str(hc) + a.c.END) 34 | else: 35 | print(a.c.FAIL + (c + ' -> ???? 
').ljust(60) + ' | ' + str(hc) + ' ' + hf) 36 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set1d_interaction.R: -------------------------------------------------------------------------------- 1 | # This script is called by 3_feature_set1d_misc.R script 2 | # DO NOT CALL it Directly 3 | # Start Interaction feature script 4 | print("Starting Interaction feature script") 5 | featureList <- c("isMetroIdSame", "isLocationIDSame", "isRegionIDSame", "isLongitudeSame", "isLatitudeSame", "isTitleSame", "isdescriptionSame") 6 | featureList <- combn(featureList, 2) 7 | 8 | create_interaction <- function(x) { 9 | i <- x[1] 10 | j <- x[2] 11 | print(c(i, j)) 12 | columnName <- paste("interaction", i, j, sep = "_") 13 | set1d[[columnName]] <<- ifelse(set1d[[i]] == 1 & set1d[[j]] == 1, 1, 0) 14 | return(NULL) 15 | } 16 | apply(featureList, 2, create_interaction) 17 | 18 | set1d <- set1d[, grep("interaction", names(set1d), value = T)] #Filter only interaction features 19 | names(set1d) <- paste("set1d", names(set1d), sep = "_") 20 | 21 | 22 | ######## Add Primary Columns ItemID1 and ItemID2 23 | set1d <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], set1d) 24 | print("Saving Interaction features features") 25 | write_feather(set1d, paste(cache_loc, "/", "features_", trainOrTest, "_set1d_", "interaction.fthr", sep = "" )) 26 | 27 | #END 28 | 29 | -------------------------------------------------------------------------------- /AnalyticsVidhya/amexpert-2019-machine-learning-hackathon/code/agg_feature.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(lubridate) 3 | 4 | campaign_data <- read_csv("../input/campaign_data.csv") 5 | campaign_data$start_date <- dmy(campaign_data$start_date) 6 | campaign_data$end_date <- dmy(campaign_data$end_date) 7 | campaign_data <- arrange(campaign_data, start_date) 8 | 9 | 10 | customer_transaction_data <- read_csv("../input/customer_transaction_data.csv") 11 | 12 | 13 | x <- unique(customer_transaction_data$date)[1] 14 | campaignDates <- campaign_data$start_date 15 | roundToNearestCampaignDate <- function(x) { 16 | campaignDates[campaignDates > x][1] 17 | } 18 | 19 | df_dates <- tibble(date = unique(customer_transaction_data$date)) 20 | df_dates <- df_dates %>% 21 | rowwise() %>% 22 | mutate(nextCampaignDate = roundToNearestCampaignDate(date)) 23 | 24 | customer_transaction_data <- left_join(customer_transaction_data, df_dates, by = "date") 25 | 26 | customer_transaction_df <- customer_transaction_data %>% 27 | #head(100000) %>% 28 | group_by(nextCampaignDate, customer_id, item_id) %>% 29 | summarise(quantity_sum = sum(quantity, na.rm = T), 30 | selling_price_sum = sum(selling_price, na.rm = T), 31 | other_discount_sum = sum(other_discount, na.rm = T), 32 | coupon_discount_sum = sum(coupon_discount, na.rm = T)) 33 | 34 | write_csv(customer_transaction_df, "../input/customer_transaction_df.csv") 35 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/4_feature_internship_SkillsCoding.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(tidyr) 3 | library(readr) 4 | 5 | #LOAD DATA 6 | internship <- read_csv("../data/Internship.csv", na = c("", "NA", "NULL")) 7 | NCOL <- ncol(internship) 8 | 9 | #With the below code we checked how the words look like in the Skills column 10 | 
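#Keep only the most frequent skill tokens (top 5% by frequency rank), then drop single-character tokens and English stopwords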
unlist(strsplit(unlist(strsplit(internship$Skills_required, " ")), ",")) %>% 11 | table() %>% 12 | data.frame() %>% 13 | arrange(-Freq) %>% 14 | mutate(perc.weight = percent_rank(Freq)) %>% 15 | filter(perc.weight > 0.95) -> aList 16 | 17 | aList$NCHAR <- nchar(as.character(aList$.)) 18 | aList <- filter(aList, NCHAR > 1) 19 | StringsForSkills <- setdiff(as.character(aList$.), stopwords("english")) 20 | 21 | #Add 4 columns to Student dataframe 22 | internship$Skills_requiredCode <- NA 23 | 24 | for (i in StringsForSkills) { 25 | print(i) 26 | internship$Skills_requiredCode[grep(i, internship$Skills_required, ignore.case = TRUE)] <- i 27 | } 28 | 29 | ##Dummy Variables for StreamsCode 30 | for (i in c("Skills_requiredCode")) { 31 | print(i) 32 | for(level in unique(internship[[i]])){ 33 | internship[paste("dummy", i, level, sep = "_")] <- ifelse(internship[[i]] == level, 1, 0) 34 | } 35 | internship[[i]] <- NULL #Drop this column 36 | } 37 | 38 | 39 | #SAVE FILES 40 | write.csv(internship[, (NCOL+1):ncol(internship)], "../data/Features_internship_SkillsCode.csv", row.names = F) 41 | 42 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/3_feature_internship_Profile_Coding.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(tidyr) 3 | library(readr) 4 | library(tm) 5 | 6 | #LOAD DATA 7 | internship <- read_csv("../data/Internship_Processed.csv", na = c("", "NA", "NULL")) 8 | NCOL <- ncol(internship) 9 | 10 | #With the below code we checked how the words look like in the Skills column 11 | unlist(strsplit(unlist(strsplit(internship$Internship_Profile, " ")), ",")) %>% 12 | table() %>% 13 | data.frame() %>% 14 | arrange(-Freq) %>% 15 | mutate(perc.weight = percent_rank(Freq)) %>% 16 | filter(perc.weight > 0.95) -> aList 17 | 18 | aList$NCHAR <- nchar(as.character(aList$.)) 19 | aList <- filter(aList, NCHAR > 1) 20 | StringsForProfile <- setdiff(as.character(aList$.), stopwords("english")) 21 | 22 | #Add 4 columns to Student dataframe 23 | internship$InternshipProfile_Code <- NA 24 | 25 | for (i in StringsForProfile) { 26 | print(i) 27 | internship$InternshipProfile_Code[grep(i, internship$Internship_Profile, ignore.case = TRUE)] <- i 28 | } 29 | 30 | ##Dummy Variables for StreamsCode 31 | for (i in c("InternshipProfile_Code")) { 32 | print(i) 33 | for(level in unique(internship[[i]])){ 34 | internship[paste("dummy", i, level, sep = "_")] <- ifelse(internship[[i]] == level, 1, 0) 35 | } 36 | internship[[i]] <- NULL #Drop this column 37 | } 38 | 39 | 40 | #SAVE FILES 41 | write.csv(internship[, (NCOL+1):ncol(internship)], "../data/Features_internship_ProfileCode.csv", row.names = F) 42 | 43 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/6_feature_student_degreeCoding.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(tidyr) 3 | library(readr) 4 | 5 | #LOAD DATA 6 | student <- read_csv("../data/Student.csv", na = c("", "NA", "NULL")) 7 | 8 | #With the below code we checked how the words look like in the degree column 9 | #table(student$Degree) %>% data.frame() %>% arrange(-Freq) %>% View() 10 | 11 | #We will create 4 binary columns to identify the following: 12 | #1) IsUnderGraduate 13 | #2) IsPostGraduate 14 | #3) IsTechbackground 15 | #4) IsNonTechbackground 16 | 17 | StringsForUG <- c("BE|B.|Bachelor|Undergrad|BCA|UG|BBA|LLB") 18 | 19 | 
StringsForPG <- c("MBA|Management|M.|MCA|MBA|Post Graduate|Master|Ph.D") 20 | 21 | StringsForTech <- c("MCA|M.Tech|M. Tech|BCA|B.E.|B. E.|B.Tech|B. Tech|Science|Technology|Engineer|Software") 22 | 23 | StringsForNonTech <- c("MBA|Management|BBA|LLB|Business|Journalism|Mass|Arts|Pharma|Chartered|Dental|Social|English|Finance|Sports|Media|Fashion|Psychology") 24 | 25 | NCOL <- ncol(student) 26 | #Add 4 columns to Student dataframe 27 | student$IsUnderGraduate <- 0 28 | student$IsPostGraduate <- 0 29 | student$IsTechbackground <- 0 30 | student$IsNonTechbackground <- 0 31 | 32 | student$IsUnderGraduate[grep(StringsForUG, student$Degree, ignore.case = TRUE)] <- 1 33 | student$IsPostGraduate[grep(StringsForPG, student$Degree, ignore.case = TRUE)] <- 1 34 | student$IsTechbackground[grep(StringsForTech, student$Degree, ignore.case = TRUE)] <- 1 35 | student$IsNonTechbackground[grep(StringsForNonTech, student$Degree, ignore.case = TRUE)] <- 1 36 | 37 | #SAVE FILES 38 | write.csv(student[, (NCOL+1):ncol(student)], "../data/Features_student_DegreeCode.csv", row.names = F) 39 | 40 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/README.md: -------------------------------------------------------------------------------- 1 | # Kaggle Avito Duplicate Ad Detection Contest 2 | Winning Solution Blog : https://blog.kaggle.com/2016/08/31/avito-duplicate-ads-detection-winners-interview-2nd-place-team-the-quants-mikel-peter-marios-sonny/ 3 | 4 | Contest Link: https://www.kaggle.com/c/avito-duplicate-ads-detection/ 5 | 6 | Private Leaderboard Score - _0.95294_ ( Rank 2 / 548) 7 | 8 | Final solution of Avito Duplicate Ad Detection - TheQuants 9 | 10 | ##Prerequisites: 11 | **OS:** Any Linux Distribution (Ubuntu 14.04 Preferred) 12 | **RAM:** 128GB+ (64GB for feature extraction) 13 | **CPU:** 36 cores+ (Preferred) 14 | **GPU:** CUDA-compatible NVIDIA GPU with Compute Capability 3.5+ (TITAN X Preferred) 15 | **Storage:** 64GB+ (not including input data) - Images on SSD _highly recommended_ 16 | 17 | **R Version:** 3.1+ 18 | **R Packages:** data.table, dplyr, dummies, feather, Hmisc, igraph, jsonlite, parallel, raster, readr, reshape2, stringdist, stringr, stylo, textreuse, tidyr, tm, xgboost 19 | 20 | **Python Version:** 3.5.1 21 | **Python Packages:** scikit-learn, numpy, pandas, python-Levenshtein, codecs, OpenCV, feather-format, jellyfish, nltk, PIL, fuzzywuzzy, stop_words, haversine 22 | 23 | **Python Version:** 2.7.1 24 | **Python Packages:** scikit-learn, feather-format, numpy, pandas 25 | XGBoost (0.4.0) 26 | Keras (0.3.2) 27 | Theano (0.8.0rc1) 28 | 29 | ##How to Generate the Submission File 30 | 1) Update `config.cfg` and set all config parameters 31 | 2) Ensure all directories mentioned in config.cfg are write-able 32 | 3) Run `RunAll.sh` 33 | 34 | _Note_: In order to generate the full submission including models, it may take several weeks and needs at least 128GB of RAM 35 | -------------------------------------------------------------------------------- /HackerEarth/Predict Lanes from LIDAR data/README.md: -------------------------------------------------------------------------------- 1 | # Approach for [HackerEarth India Hacks Machine Learning Competition - Semi Finals](https://www.hackerearth.com/challenge/test/indiahacks-2017-machine-learning-round-2/) - (12-13 August 2017, Bangalore, India) 2 | ## (c) [Sonny Laskar](https://github.com/sonnylaskar) 3 | ## Model scored #1 on Public Leaderboard and #2 on Private Leaderboard 4 | 5 
| ## Pre-requisites: 6 | ``` 7 | R 3.3+ 8 | Packages: xgboost, tidyverse, feather, geosphere 9 | ``` 10 | ## Approach 11 | 12 | ### Directory 13 | ``` 14 | Create folders - code, input, output 15 | Copy all input files in input folder 16 | Copy all code files in code folder 17 | ``` 18 | 19 | ### Scripts 20 | Execute *Rscript final_1_calculateHaversineDistance.R* to calculate the length of each line by finding the Haversine distance between the two extreme coordinates for each line
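The core of that calculation is sketched below (simplified, not a line-for-line copy of the script; `coords` is assumed to be a numeric matrix with one row per point and columns longitude/latitude):

```r
library(geosphere)

# Approximate a lane line's length by the Haversine distance between its
# two extreme coordinates (after sorting the points).
line_length <- function(coords) {
  coords <- coords[order(coords[, 1], coords[, 2]), , drop = FALSE]
  extremes <- coords[c(1, nrow(coords)), ]
  distHaversine(extremes[1, ], extremes[2, ])  # distance in metres
}

# Toy example with two made-up coordinates
line_length(matrix(c(77.5946, 12.9716,
                     77.6010, 12.9790), byrow = TRUE, ncol = 2))
```

In the full script this is computed for every lane line and the median value per road is stored as `haversineDistance`.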
21 | 22 | Execute *Rscript final_2_buildData.R* 23 |
This script builds all features and prepares the data for the final model 24 | 25 | ### Feature Engineering: 26 | ``` 27 | sumOfDistanceFromLeft = Sum of all distances towards the left 28 | sumOfDistanceFromRight = Sum of all distances towards the right 29 | r_sumOfDistanceFromLR = Ratio of the above two 30 | int_distLR = Intersection between the left and right distances 31 | latCounter = Unique count of latitudes after rounding to 4 digits 32 | lonCounter = Unique count of longitudes after rounding to 4 digits 33 | uniq_linesLeft = Unique lines on the left 34 | uniq_linesRight = Unique lines on the right 35 | totalLaneLinesMean = Mean of the total lane lines 36 | haversineDistance = Haversine length of each lane line, aggregated per road (median), then scaled by dividing by the mean lane-line length 37 | [Refer to the feature importance plot (Rplot.png) for relative importance] 38 | ``` 39 | Execute *final_3_buildModel.R* to build the final model
40 | XGBoost models are built with 10 different seeds and their predictions are averaged. 41 | The final submission file will be written to the output folder. 42 | 43 |
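The seed-averaging step in *final_3_buildModel.R* follows the pattern below (a simplified sketch; `param`, `dtrain` and `dtest` are assumed to have been prepared as in that script):

```r
library(xgboost)

# Train the same multiclass XGBoost model with 10 different seeds and
# average the predicted class probabilities (simplified from
# final_3_buildModel.R; param, dtrain and dtest are assumed to exist).
seeds <- (1:10) * 1000
pred <- 0
for (s in seeds) {
  set.seed(s)
  bst <- xgb.train(params = param, data = dtrain, nrounds = 100)
  pred <- pred + predict(bst, dtest)
}
pred <- pred / length(seeds)  # averaged probabilities, one block per class
```

The class with the highest averaged probability is then chosen for each road.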
44 | Cheers :-) 45 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/7_feature_student_ExperienceCoding.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(tidyr) 3 | library(readr) 4 | library(tm) 5 | 6 | #LOAD DATA 7 | student <- read_csv("../data/Student.csv", na = c("", "NA", "NULL")) 8 | NCOL <- ncol(student) 9 | 10 | #Filter only the Experience-related columns 11 | student <- student[, c(1,15:19)] 12 | 13 | #####SECTION FOR EXPERIENCE ############ 14 | #Create columns for each type of Experience and make a single row for each Student ID 15 | student$Experience_Type[is.na(student$Experience_Type)] <- "NoExperience" 16 | student %>% 17 | select(Student_ID, Experience_Type) %>% 18 | mutate(yesno = 1) %>% 19 | distinct %>% 20 | spread(Experience_Type, yesno, fill = 0) -> studentExperience 21 | 22 | #####SECTION FOR PROFILE ############ 23 | unlist(strsplit(unlist(strsplit(student$Profile, " ")), ",")) %>% 24 | table() %>% 25 | data.frame() %>% 26 | arrange(-Freq) %>% 27 | mutate(perc.weight = percent_rank(Freq)) %>% 28 | filter(perc.weight > 0.98) -> aList 29 | 30 | aList$NCHAR <- nchar(as.character(aList$.)) 31 | aList <- filter(aList, NCHAR > 1) 32 | aList <- unique(tolower(stemDocument(as.character(aList$.)))) 33 | StringsForExperienceProfile <- setdiff(aList, stopwords("english")) 34 | 35 | student$Experience_Profile_Type <- NA 36 | for (i in StringsForExperienceProfile) { 37 | print(i) 38 | student$Experience_Profile_Type[grep(i, student$Profile, ignore.case = TRUE)] <- i 39 | } 40 | 41 | #Create columns for each type of Profile and make a single row for each Student ID 42 | student$Experience_Profile_Type[is.na(student$Experience_Profile_Type)] <- "NoProfile" 43 | student %>% 44 | select(Student_ID, Experience_Profile_Type) %>% 45 | mutate(yesno = 1) %>% 46 | distinct %>% 47 | spread(Experience_Profile_Type, yesno, fill = 0) -> studentExperienceProfile 48 | 49 | #JOIN 50 | studentExperience <- left_join(studentExperience, studentExperienceProfile, by = "Student_ID") 51 | #SAVE FILES 52 | write.csv(studentExperience, "../data/Features_student_Experience.csv", row.names = F) 53 | 54 | -------------------------------------------------------------------------------- /AnalyticsVidhya/AVDatafest_XtremeML/input/holiday.csv: -------------------------------------------------------------------------------- 1 | Date,f_Holiday 2 | 2010-01-01,1.0 3 | 2010-01-06,1.0 4 | 2010-04-02,1.0 5 | 2010-05-01,1.0 6 | 2010-08-15,1.0 7 | 2010-09-11,1.0 8 | 2010-10-12,1.0 9 | 2010-11-01,1.0 10 | 2010-12-06,1.0 11 | 2010-12-08,1.0 12 | 2010-12-25,1.0 13 | 2010-12-27,1.0 14 | 2011-01-01,1.0 15 | 2011-01-06,1.0 16 | 2011-04-22,1.0 17 | 2011-05-01,1.0 18 | 2011-08-15,1.0 19 | 2011-09-11,1.0 20 | 2011-10-12,1.0 21 | 2011-11-01,1.0 22 | 2011-12-06,1.0 23 | 2011-12-08,1.0 24 | 2011-12-25,1.0 25 | 2011-12-06,1.0 26 | 2012-01-01,1.0 27 | 2012-01-06,1.0 28 | 2012-03-19,1.0 29 | 2012-04-06,1.0 30 | 2012-05-01,1.0 31 | 2012-05-06,1.0 32 | 2012-08-15,1.0 33 | 2012-09-11,1.0 34 | 2012-10-12,1.0 35 | 2012-11-01,1.0 36 | 2012-12-06,1.0 37 | 2012-12-08,1.0 38 | 2012-12-25,1.0 39 | 2013-01-01,1.0 40 | 2013-01-06,1.0 41 | 2013-03-19,1.0 42 | 2013-03-29,1.0 43 | 2013-05-01,1.0 44 | 2013-05-05,1.0 45 | 2013-06-24,1.0 46 | 2013-08-15,1.0 47 | 2013-09-11,1.0 48 | 2013-10-12,1.0 49 | 2013-11-01,1.0 50 | 2013-12-06,1.0 51 | 2013-12-08,1.0 52 | 2013-12-25,1.0 53 | 2014-01-01,1.0 54 | 2014-01-06,1.0 55 | 
2014-03-19,1.0 56 | 2014-04-18,1.0 57 | 2014-04-21,1.0 58 | 2014-05-01,1.0 59 | 2014-05-04,1.0 60 | 2014-06-24,1.0 61 | 2014-08-15,1.0 62 | 2014-09-11,1.0 63 | 2014-10-12,1.0 64 | 2014-11-01,1.0 65 | 2014-12-06,1.0 66 | 2014-12-08,1.0 67 | 2014-12-25,1.0 68 | 2015-01-01,1.0 69 | 2015-01-06,1.0 70 | 2015-03-19,1.0 71 | 2015-04-03,1.0 72 | 2015-04-06,1.0 73 | 2015-05-01,1.0 74 | 2015-05-03,1.0 75 | 2015-06-24,1.0 76 | 2015-09-11,1.0 77 | 2015-10-12,1.0 78 | 2015-11-01,1.0 79 | 2015-12-06,1.0 80 | 2015-12-08,1.0 81 | 2015-12-25,1.0 82 | 2015-12-26,1.0 83 | 2016-01-01,1.0 84 | 2016-01-06,1.0 85 | 2016-03-19,1.0 86 | 2016-03-25,1.0 87 | 2016-03-28,1.0 88 | 2016-05-01,1.0 89 | 2016-05-16,1.0 90 | 2016-06-24,1.0 91 | 2016-08-15,1.0 92 | 2016-09-11,1.0 93 | 2016-10-12,1.0 94 | 2016-11-01,1.0 95 | 2016-12-06,1.0 96 | 2016-12-08,1.0 97 | 2016-12-25,1.0 98 | 2016-12-26,1.0 99 | 2017-01-01,1.0 100 | 2017-01-06,1.0 101 | 2017-03-19,1.0 102 | 2017-04-14,1.0 103 | 2017-04-17,1.0 104 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/2_feature_internship_Profile_WordCount.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(tidyr) 3 | library(readr) 4 | library(stylo) 5 | library(stringr) 6 | 7 | #LOAD DATA 8 | internship <- read_csv("../data/Internship_Processed.csv", na = c("", "NA", "NULL")) 9 | 10 | ######## 11 | getNGrams <- function(my.text, n = 1) { 12 | # which can be split into a vector of consecutive words: 13 | #my.vector.of.words = txt.to.words(my.text) #Removed this single is would replace all numbers 14 | #my.vector.of.words <- unlist(strsplit(gsub("\\s+", " ", str_trim(my.text)), " ")) 15 | my.vector.of.words <- unlist(strsplit(gsub("\\s+", " ", my.text), " ")) 16 | #my.vector.of.words <- unlist(strsplit(my.text, " ")) 17 | # now, we create a vector of word 2-grams: 18 | if (length(my.vector.of.words) >= n) { 19 | make.ngrams(my.vector.of.words, ngram.size = n) 20 | } else { 21 | return(NULL) 22 | } 23 | } 24 | 25 | ################################### 26 | getNgramsCount <- function(words, n) { 27 | ####################################### 28 | # COUNTING NGRAMS FEATURES 29 | ####################################### 30 | #Generate Ngrams 31 | NgramsProfile <- getNGrams(words, n) 32 | 33 | #Count of Ngrams 34 | countOfNgramsInProfile <- length(NgramsProfile) 35 | 36 | #Count of Unique NGrams 37 | countOfUniqueNgramsInProfile <- length(unique(NgramsProfile)) 38 | 39 | return(c(countOfNgramsInProfile, countOfUniqueNgramsInProfile)) 40 | } 41 | 42 | NCOL <- ncol(internship) 43 | for ( n in 1:2) { 44 | print(n) 45 | internship_words <- as.data.frame(t(mapply(getNgramsCount, internship$Internship_Profile, n))) 46 | colnames(internship_words) <- c(paste("countOf_", n, "_gramsInProfile", sep = ""), 47 | paste("countOfUnique_", n, "_gramsInProfile", sep = "") 48 | ) 49 | row.names(internship_words) <- NULL 50 | internship <- cbind(internship, internship_words) 51 | } 52 | 53 | 54 | write.csv(internship[, (NCOL+1):ncol(internship)], "../data/Features_internship_Profile_WordCount.csv", row.names = F) 55 | 56 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/libavito.py: -------------------------------------------------------------------------------- 1 | #### Copyright 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants 2 | #### Avito Duplicate Ad Detection 3 | # Author: Mikel 4 | # This 
file contains various functions which are used in multiple scripts 5 | 6 | from imp import load_source 7 | from time import time 8 | import sys 9 | 10 | # Terminal output colours for use in scripts 11 | class c: 12 | HEADER = '\033[95m' 13 | OKBLUE = '\033[94m' 14 | OKGREEN = '\033[92m' 15 | WARNING = '\033[93m' 16 | FAIL = '\033[91m' 17 | END = '\033[0m' 18 | BOLD = '\033[1m' 19 | UNDERLINE = '\033[4m' 20 | 21 | # Function to read the config file 22 | def read_config(): 23 | conf = load_source('config.cfg', 'config.cfg') 24 | conf.nthreads = conf.model_nthreads 25 | conf.debug = 0 26 | # except Exception as e: 27 | # #print(bcol.FAIL + 'Failed to parse config file:' + bcol.END) 28 | # print(e.message, e.args) 29 | # raise Exception(bcol.FAIL + 'Failed to parse config file:' + bcol.END) 30 | return conf 31 | 32 | # Just an alias 33 | def get_config(): 34 | return read_config() 35 | 36 | # Function which reads '--train' or '--test' launch arguments 37 | def get_mode(argv, name='Script'): 38 | if len(argv) != 2: 39 | raise RuntimeError(name + ' must be called with either --train or --test') 40 | if argv[1] == '--train': 41 | mode = 0 42 | elif argv[1] == '--test': 43 | mode = 1 44 | else: 45 | raise RuntimeError(name + ' must be called with either --train or --test') 46 | assert mode == 0 or mode == 1 47 | return mode 48 | 49 | # Function which prints current status and time remaining: 50 | def print_progress(k, start, o): 51 | if k != 0: 52 | dur_per_k = (time() - start) / k 53 | rem_dur = dur_per_k * (o - k) 54 | rem_mins = int(rem_dur / 60) 55 | rem_secs = rem_dur % 60 56 | toprint = str(k) + " items processed - " + str(rem_mins) + "m" + str(int(rem_secs)) + "s remaining. " 57 | sys.stdout.write(toprint + '\r') 58 | sys.stdout.flush() 59 | 60 | def print_elapsed(start): 61 | print(str(round(time() - start, 1)) + 's elapsed') 62 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/models/libavito.py: -------------------------------------------------------------------------------- 1 | #### Copyright 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants 2 | #### Avito Duplicate Ad Detection 3 | # Author: Mikel 4 | # This file contains various functions which are used in multiple scripts 5 | 6 | from imp import load_source 7 | from time import time 8 | import sys 9 | 10 | # Terminal output colours for use in scripts 11 | class c: 12 | HEADER = '\033[95m' 13 | OKBLUE = '\033[94m' 14 | OKGREEN = '\033[92m' 15 | WARNING = '\033[93m' 16 | FAIL = '\033[91m' 17 | END = '\033[0m' 18 | BOLD = '\033[1m' 19 | UNDERLINE = '\033[4m' 20 | 21 | # Function to read the config file 22 | def read_config(): 23 | conf = load_source('config.cfg', 'config.cfg') 24 | conf.nthreads = conf.model_nthreads 25 | conf.debug = 0 26 | # except Exception as e: 27 | # #print(bcol.FAIL + 'Failed to parse config file:' + bcol.END) 28 | # print(e.message, e.args) 29 | # raise Exception(bcol.FAIL + 'Failed to parse config file:' + bcol.END) 30 | return conf 31 | 32 | # Just an alias 33 | def get_config(): 34 | return read_config() 35 | 36 | # Function which reads '--train' or '--test' launch arguments 37 | def get_mode(argv, name='Script'): 38 | if len(argv) != 2: 39 | raise RuntimeError(name + ' must be called with either --train or --test') 40 | if argv[1] == '--train': 41 | mode = 0 42 | elif argv[1] == '--test': 43 | mode = 1 44 | else: 45 | raise RuntimeError(name + ' must be called with either --train or --test') 46 | assert mode == 0 or 
mode == 1 47 | return mode 48 | 49 | # Function which prints current status and time remaining: 50 | def print_progress(k, start, o): 51 | if k != 0: 52 | dur_per_k = (time() - start) / k 53 | rem_dur = dur_per_k * (o - k) 54 | rem_mins = int(rem_dur / 60) 55 | rem_secs = rem_dur % 60 56 | toprint = str(k) + " items processed - " + str(rem_mins) + "m" + str(int(rem_secs)) + "s remaining. " 57 | sys.stdout.write(toprint + '\r') 58 | sys.stdout.flush() 59 | 60 | def print_elapsed(start): 61 | print(str(round(time() - start, 1)) + 's elapsed') 62 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/5_feature_student_StreamsCoding.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(tidyr) 3 | library(readr) 4 | 5 | #LOAD DATA 6 | student <- read_csv("../data/Student.csv", na = c("", "NA", "NULL")) 7 | 8 | #With the below code we checked how the words look like in the degree column 9 | #table(student$Stream) %>% data.frame() %>% arrange(-Freq) %>% View() 10 | 11 | NCOL <- ncol(student) 12 | #We will create binary columns for most popular streams 13 | 14 | #Add the Temporary Column 15 | student$StreamCode <- NA 16 | 17 | StringsForStreams <- c("Computer", 18 | "Electronics", 19 | "Mechanical", 20 | "Commerce", 21 | "Information", 22 | "Marketing", 23 | "Electrical", 24 | "Civil", 25 | "Finance", 26 | "Arts", 27 | "Science", 28 | "Economics", 29 | "Humanities", 30 | "Management", 31 | "English", 32 | "Human", 33 | "Software", 34 | "Bio", 35 | "Mass", 36 | "Operations", 37 | "Architecture", 38 | "Instrumentation", 39 | "Mathematics", 40 | "Physics", 41 | "Media", 42 | "Accounts", 43 | "Statistics", 44 | "Chemistry", 45 | "Political Science", 46 | "Psychology", 47 | "Fashion", 48 | "journalism" 49 | ) 50 | 51 | for (i in StringsForStreams) { 52 | print(i) 53 | student$StreamCode[grep(i, student$Stream, ignore.case = TRUE)] <- i 54 | } 55 | 56 | ##Dummy Variables for StreamsCode 57 | for (i in c("StreamCode")) { 58 | print(i) 59 | for(level in unique(student[[i]])){ 60 | student[paste("dummy", i, level, sep = "_")] <- ifelse(student[[i]] == level, 1, 0) 61 | } 62 | student[[i]] <- NULL #Drop this column 63 | } 64 | 65 | #SAVE FILES 66 | write.csv(student[, (NCOL+1):ncol(student)], "../data/Features_student_StreamCode.csv", row.names = F) 67 | 68 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/9_model_XGB_1.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(tidyr) 3 | library(readr) 4 | library(xgboost) 5 | library(pROC) 6 | library(caret) 7 | 8 | #MODEL DESCRIPTION 9 | #XGBOOST MODEL SEED = 123 and NROUND = 526 10 | #LOAD DATA 11 | train <- read.csv("../data/train_processed.csv", header = TRUE, stringsAsFactors = FALSE) 12 | test <- read.csv("../data/test_processed.csv", header = TRUE, stringsAsFactors = FALSE) 13 | 14 | #DONT NEED THESE COLUMNS ANY MORE 15 | train$Earliest_Start_Date <- NULL 16 | train$Internship_deadline <- NULL 17 | train$Start_Date <- NULL 18 | train$End_Date <- NULL 19 | train$End.Date <- NULL 20 | train$Start.Date <- NULL 21 | 22 | test$Earliest_Start_Date <- NULL 23 | test$Internship_deadline <- NULL 24 | test$Start_Date <- NULL 25 | test$End_Date <- NULL 26 | test$End.Date <- NULL 27 | test$Start.Date <- NULL 28 | 29 | #Validation Set 30 | set.seed(123) 31 | inTrain <- createDataPartition(y = train$Is_Shortlisted, p = .70, 
list = FALSE) 32 | trainSet <- train[inTrain, ] 33 | validateSet <- train[-inTrain, ] 34 | ##### 35 | 36 | dtrain <- xgb.DMatrix(data = data.matrix(train[, c(2:ncol(train))]), 37 | label = data.matrix(train$Is_Shortlisted), 38 | missing=NA) 39 | dvalidate <- xgb.DMatrix(data = data.matrix(validateSet[, c(2:ncol(validateSet))]), 40 | label = data.matrix(validateSet$Is_Shortlisted), 41 | missing=NA) 42 | watchlist <- list(train = dtrain, test = dvalidate) 43 | param <- list("objective" = "binary:logistic", 44 | "eval_metric" = "auc", 45 | "eta" = 0.1, 46 | "max_depth" = 10, 47 | "subsample" = 1, 48 | "min_child_weight" = 1, 49 | "colsample_bytree" = 0.2 50 | ) 51 | cv.nround <- 526 52 | 53 | t <- Sys.time() 54 | set.seed(123) 55 | bst <- xgb.train(param = param, 56 | data = dtrain, 57 | nrounds = cv.nround, 58 | maximize = TRUE) 59 | print(Sys.time() - t) 60 | 61 | 62 | test_target_xgb <- predict(bst, 63 | data.matrix(test[, c(2:ncol(test))]), 64 | missing=NA) 65 | submission <- data.frame(Internship_ID = test$Internship_ID, 66 | Student_ID = test$Student_ID, 67 | Is_Shortlisted = test_target_xgb) 68 | write_csv(submission,"../Submissions/XGB_MODEL_S123_N526.csv") 69 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/10_model_XGB_1.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(tidyr) 3 | library(readr) 4 | library(xgboost) 5 | library(pROC) 6 | library(caret) 7 | 8 | #MODEL DESCRIPTION 9 | #XGBOOST MODEL SEED = 500 and NROUND = 710 10 | 11 | #LOAD DATA 12 | train <- read.csv("../data/train_processes.csv", header = TRUE, stringsAsFactors = FALSE) 13 | test <- read.csv("../data/test_processes.csv", header = TRUE, stringsAsFactors = FALSE) 14 | 15 | #DONT NEED THESE COLUMNS ANY MORE 16 | train$Earliest_Start_Date <- NULL 17 | train$Internship_deadline <- NULL 18 | train$Start_Date <- NULL 19 | train$End_Date <- NULL 20 | train$End.Date <- NULL 21 | train$Start.Date <- NULL 22 | 23 | test$Earliest_Start_Date <- NULL 24 | test$Internship_deadline <- NULL 25 | test$Start_Date <- NULL 26 | test$End_Date <- NULL 27 | test$End.Date <- NULL 28 | test$Start.Date <- NULL 29 | 30 | #Validation Set 31 | set.seed(123) 32 | inTrain <- createDataPartition(y = train$Is_Shortlisted, p = .70, list = FALSE) 33 | trainSet <- train[inTrain, ] 34 | validateSet <- train[-inTrain, ] 35 | ##### 36 | 37 | dtrain <- xgb.DMatrix(data = data.matrix(train[, c(2:ncol(train))]), 38 | label = data.matrix(train$Is_Shortlisted), 39 | missing=NA) 40 | dvalidate <- xgb.DMatrix(data = data.matrix(validateSet[, c(2:ncol(validateSet))]), 41 | label = data.matrix(validateSet$Is_Shortlisted), 42 | missing=NA) 43 | watchlist <- list(train = dtrain, test = dvalidate) 44 | param <- list("objective" = "binary:logistic", 45 | "eval_metric" = "auc", 46 | "eta" = 0.1, 47 | "max_depth" = 10, 48 | "subsample" = 1, 49 | "min_child_weight" = 1, 50 | "colsample_bytree" = 0.2 51 | ) 52 | cv.nround <- 710 53 | 54 | t <- Sys.time() 55 | set.seed(500) 56 | bst <- xgb.train(param = param, 57 | data = dtrain, 58 | nrounds = cv.nround, 59 | maximize = TRUE) 60 | print(Sys.time() - t) 61 | 62 | 63 | test_target_xgb <- predict(bst, 64 | data.matrix(test[, c(2:ncol(test))]), 65 | missing=NA) 66 | submission <- data.frame(Internship_ID = test$Internship_ID, 67 | Student_ID = test$Student_ID, 68 | Is_Shortlisted = test_target_xgb) 69 | write_csv(submission,"../Submissions/XGB_MODEL_S500_N710.csv") 70 | 
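#The two submission files written by 9_model_XGB_1.R and this script are combined by 11_Ensemble_Models.R, which takes their simple mean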
-------------------------------------------------------------------------------- /AnalyticsVidhya/AVDatafest_XtremeML/README.md: -------------------------------------------------------------------------------- 1 | # Winning Solution for Analytics Vidhya Machine Learning Competition - [Xtreme ML Hack](https://datahack.analyticsvidhya.com/contest/machine-learning-hackathon/) 2 | 3 | (c) [Sonny](https://github.com/sonnylaskar) 4 | 5 | This model scored 60.9 on the Public Leaderboard, 61.7 on the [Private Leaderboard]("https://datahack.analyticsvidhya.com/contest/machine-learning-hackathon/lb") and ranked #2. 6 | 7 | ## Prerequisites: 8 | 1. R version 3.3.3 9 | 2. R Packages: readr, lubridate, dplyr, tidyr, xgboost 10 | 11 | ## Problem Statement: 12 | The largest water supplier of Barcelona wants to leverage machine learning to effectively predict daywise-mediumwise-departmentwise breakdown of predictions of how many contacts (tickets/enquiries) would it receive and how many resolutions would it make so that they can size their team properly and improve customer satisfaction. 13 | 14 | ## Approach: 15 | While this looked to be a time-series problem, it did not work out for me to solve it by leveraging various time series modelling techniques like ARIMA, etc. Hence I switched to solving it with regression. But the issue was that the Test dataset was in future and literally no information was known in future. We were allowed to use external data in this contest and Holiday calender seemed to be an obvious parameter that should surely affect such problems. 16 | 17 | ### Feature Engineering: 18 | 1. Date features like weekday, quarter, etc. 19 | 2. Whether a Day was a holiday in Spain? 20 | 3. How many days were elaped since the last holiday (in rank_percent)? 21 | 4. Lagged features of # of contacts and resolutions of 75 days, 90 days and 120 days (Since the prediction to be made was upto 75 days in future, hence I decided not to include any lag value less than 75 days) 22 | 23 | ### Modeling: 24 | Xgboost is the first model that I try everytime I have to solve any such problem. As always, it gave a significant score. For cross validation, I used the last 4 months data. 25 | 26 | ## Steps to reproduce the submission: 27 | 1. Copy all Train files in the folder _"input/Train"_ 28 | 2. Copy all Test files in the folder _"input/Test"_ 29 | 3. External data: I used holiday list of Spain as an external data from [here](http://www.officeholidays.com/countries/spain/regional.php?list_year=2010&list_region=catalonia "Calender") 30 | 4. Ensure folder _"output"_ exists 31 | 5. Run the Rscript _final_model.R_ from the _code_ directory 32 | 6. 
The final files will be created in the _"output"_ folder 33 | 34 | Enjoy :smile: 35 | 36 | 37 | Regards 38 | 39 | Sonny 40 | -------------------------------------------------------------------------------- /HackerEarth/Predict Lanes from LIDAR data/final_2_buildData.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(feather) 3 | library(stringr) 4 | 5 | label <- read_csv("../input/labels.csv") 6 | label$roadCoordinates <- NULL 7 | train <- read_csv("../input/train.csv") 8 | test <- read_csv("../input/test.csv") 9 | df_all <- bind_rows(train, test) 10 | roadsDistance <- read_csv("../input/roadsDistance.csv") 11 | 12 | getLatLong <- function(x, t = "lat") { 13 | a <- matrix(as.numeric(unlist(strsplit(unlist(strsplit(x, "\\|")), " "))), byrow = T, ncol = 2) 14 | if (t == "lon") { 15 | apply(a, 2, mean)[1] 16 | } else { 17 | apply(a, 2, mean)[2] 18 | } 19 | } 20 | 21 | 22 | df_all$meanLat <- sapply(df_all$laneLineCoordinates, getLatLong, t = "lat", USE.NAMES = F) 23 | df_all$meanLon <- sapply(df_all$laneLineCoordinates, getLatLong, t = "lon", USE.NAMES = F) 24 | 25 | df_all %>% 26 | group_by(roadId) %>% 27 | summarise( 28 | sumOfDistanceFromLeft = sum(distFromLaneLineOnLeft, na.rm = T), 29 | sumOfDistanceFromRight = sum(distFromLaneLineOnRight, na.rm = T), 30 | r_sumOfDistanceFromLR = sumOfDistanceFromLeft / sumOfDistanceFromRight, 31 | int_distLR = length(intersect(distFromLaneLineOnLeft, distFromLaneLineOnRight)), 32 | 33 | latCounter = length(unique(round(meanLat, 4))), 34 | lonCounter = length(unique(round(meanLon, 4))), 35 | 36 | int_TotalLinesLR = length(intersect(totalLinesOnLeft, totalLaneLinesOnRight)), 37 | uniq_linesLeft = length(unique(totalLinesOnLeft)), 38 | uniq_linesRight = length(unique(totalLaneLinesOnRight)), 39 | totalLaneLinesMean = mean(totalLaneLines), 40 | totalLinesOnLeftMax = max(totalLinesOnLeft), 41 | 42 | uniq_lineId = length(unique(laneLineId)) / length((laneLineId)), 43 | roadCategory = unique(roadCategory), 44 | 45 | r_lineToRoadLength = sum(laneLineLength / roadLength < 0.8), 46 | r_lineToRoadLength2 = sum(laneLineLength / roadLength >= 0.8), 47 | laneLineLengthMean = mean(laneLineLength), 48 | 49 | sum_interSectingLines = sum(noOfIntersectingLaneLinesLeft, noOfIntersectingLaneLinesRight), 50 | noOfIntersectingLaneLinesLeftMean = mean(noOfIntersectingLaneLinesLeft), 51 | 52 | sum_isIntersectingWithRoadGeometryTrue = sum(isIntersectingWithRoadGeometry == "true"), 53 | sum_isIntersectingWithRoadGeometryFalse = sum(isIntersectingWithRoadGeometry == "false") 54 | ) -> df2 55 | 56 | 57 | 58 | df2$data <- ifelse(df2$roadId %in% train$roadId, "train", "test") 59 | df2 <- left_join(df2, roadsDistance, by = "roadId") 60 | df2$haversineDistance <- df2$haversineDistance / df2$laneLineLengthMean 61 | df2 <- left_join(df2, label, by = "roadId") 62 | 63 | write_feather(df2, "../input/df_all.fthr") 64 | 65 | -------------------------------------------------------------------------------- /HackerEarth/Predict Lanes from LIDAR data/final_3_buildModel.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(feather) 3 | library(xgboost) 4 | 5 | nthread <- parallel::detectCores() 6 | 7 | df_all <- read_feather("../input/df_all.fthr") 8 | TARGET <- "noOfLanes" 9 | NAString <- NA 10 | model_features <- setdiff(names(df_all), c("roadId", TARGET, "data")) 11 | 12 | df_all_train <- df_all[df_all$data == "train", ] 13 | df_all_test <- df_all[df_all$data == 
"test", ] 14 | #rm(df_all) 15 | gc() 16 | 17 | ####### XGBOOST ############ 18 | EARLY_STOPPING <- 100 19 | print.every.n <- 10 20 | df_all_train[[TARGET]] <- as.factor(df_all_train[[TARGET]] - 1) 21 | num_class <- length(levels(df_all_train[[TARGET]])) 22 | 23 | param <- list( 24 | objective = "multi:softprob", 25 | booster = "gbtree", 26 | eval_metric = "mlogloss", 27 | num_class = num_class, 28 | eta = 0.1, 29 | max_depth = 5, 30 | subsample = 0.9, 31 | min_child_weight = 1, 32 | colsample_bytree = 1.0, 33 | gamma = 0, 34 | nthread = nthread, 35 | num_parallel_tree = 2 36 | ) 37 | 38 | if (param$eval_metric != "auc") { 39 | isMaximize <- F 40 | } else { 41 | isMaximize <- T 42 | } 43 | nrounds <- 100 44 | seed <- (1:10)*1000 45 | 46 | dtrain <- xgb.DMatrix( data = data.matrix(df_all_train[, model_features]), 47 | label = data.matrix(df_all_train[[TARGET]]), 48 | missing = NAString) 49 | watchlist <- list(train = dtrain) 50 | 51 | t <- Sys.time() 52 | print(param) 53 | test_xgb_model <- rep(0, nrow(df_all_test)) 54 | for (s in seed) { 55 | cat("Generating XGB seed", s, "\n", sep = " ") 56 | set.seed(s) 57 | bst <- xgb.train( params = param, 58 | data = dtrain, 59 | nrounds = nrounds, 60 | verbose = 1, 61 | print_every_n = print.every.n, 62 | early_stopping_rounds = EARLY_STOPPING, 63 | watchlist = watchlist, 64 | maximize = isMaximize 65 | ) 66 | print(format(Sys.time() - t, format = "%H:%M") ) 67 | dtest <- xgb.DMatrix( data = data.matrix(df_all_test[, model_features]), 68 | missing = NAString) 69 | tmp <- predict(bst, dtest) 70 | tmp <- ifelse(tmp < 0, 0, tmp) 71 | test_xgb_model <- test_xgb_model + tmp 72 | } 73 | xgb_1 <- test_xgb_model / length(seed) 74 | 75 | 76 | xgb_1 <- apply(matrix(xgb_1, byrow = T, ncol = num_class), 1, which.max) 77 | xgb_1 <- data.frame(roadId = df_all_test$roadId, noOfLanes = xgb_1) 78 | write_csv(xgb_1, "../output/finalSubmission.csv") 79 | 80 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set4a_fuzzy.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Marios & Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 3_feature_set4a_fuzzy.py 5 | # Creates text features using the fuzzywuzzy python packages 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import sys 10 | import time 11 | import gc 12 | import feather 13 | from fuzzywuzzy import fuzz 14 | from multiprocessing import Pool 15 | 16 | import libavito as a 17 | 18 | def process_row(row): 19 | values = [] 20 | values.append(row[0]) 21 | values.append(row[1]) 22 | 23 | # Not black magic, iterate over title/description/json 24 | for d in [2, 4, 6]: 25 | st_1 = str(row[d]) 26 | st_2 = str(row[d + 1]) 27 | values.append(fuzz.partial_ratio(st_1, st_2)) 28 | values.append(fuzz.token_set_ratio(st_1, st_2)) 29 | values.append(fuzz.ratio(st_1, st_2)) 30 | values.append(fuzz.token_sort_ratio(st_1, st_2)) 31 | return values 32 | 33 | print(a.c.BOLD + 'Extracting set4a fuzzy text features ...' 
+ a.c.END) 34 | 35 | # Get train/test mode from launch argument 36 | mode = a.get_mode(sys.argv, '3_feature_set4a_fuzzy.py') 37 | 38 | ## Read settings required by script 39 | config = a.read_config() 40 | nthreads = config.preprocessing_nthreads 41 | cache_loc = config.cache_loc 42 | debug = config.debug 43 | if mode == 0: 44 | root = config.train_images_root 45 | df = feather.read_dataframe(cache_loc + 'train.fthr') 46 | if mode == 1: 47 | root = config.test_images_root 48 | df = feather.read_dataframe(cache_loc + 'test.fthr') 49 | 50 | df = df[['itemID_1', 'itemID_2', 'title_1', 'title_2', 'description_1', 'description_2', 'attrsJSON_1', 'attrsJSON_2']] 51 | 52 | ftrs = [] 53 | 54 | start = time.time() 55 | o = len(df.index) 56 | if nthreads == 1: 57 | print('Extracting features with 1 thread ...') 58 | k = 0 59 | # Iterate over files 60 | ftrs = [] 61 | for row in df.values: 62 | x = process_row(row) 63 | ftrs.append(x) 64 | k += 1 65 | if k % 100 == 0: 66 | a.print_progress(k, start, o) 67 | 68 | # Otherwise perform multi-threaded mapping 69 | else: 70 | print('Extracting features multi-threaded ... ', end='', flush=True) 71 | pool = Pool(nthreads) 72 | ftrs = pool.map(process_row, df.values) 73 | pool.close() 74 | gc.collect() 75 | 76 | a.print_elapsed(start) 77 | 78 | ftrs = pd.DataFrame(ftrs) 79 | cols = ['itemID_1', 'itemID_2'] + ['set4a_X' + str(i) for i in range(1, len(ftrs.columns.tolist()) - 1)] 80 | print(cols) 81 | ftrs.columns = cols 82 | 83 | # Save updated dataset 84 | if mode == 0: 85 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set4a_fuzzy.fthr') 86 | if mode == 1: 87 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set4a_fuzzy.fthr') 88 | 89 | a.print_elapsed(start) 90 | print('set4a extraction complete!') 91 | 92 | # Write status to status file so master script knows whether to proceed. 
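
# --- Illustrative example (not executed as part of the pipeline) ---
# A rough sketch of how the four fuzzywuzzy scores used by process_row() above differ.
# The strings below are made up and exact scores depend on the fuzzywuzzy version,
# so treat the numbers as indicative only:
#
#   from fuzzywuzzy import fuzz
#   s1 = 'iphone 6 16gb black'
#   s2 = 'black iphone 6 16gb'
#   fuzz.ratio(s1, s2)             # similarity of the raw strings, below 100 here
#   fuzz.partial_ratio(s1, s2)     # best-matching substring, tolerant of extra text on one side
#   fuzz.token_sort_ratio(s1, s2)  # tokens sorted before comparing, so a pure reordering scores 100
#   fuzz.token_set_ratio(s1, s2)   # set-based comparison, also tolerant of repeated tokens
#
# All four return integers in [0, 100]; process_row() emits them for each of the three
# field pairs (title, description, attrsJSON), giving the 12 set4a feature columns.
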
93 | f = open(cache_loc + 'status.txt', 'a') 94 | f.write('feature_set4a_OK\n') 95 | f.close() 96 | -------------------------------------------------------------------------------- /AnalyticsVidhya/amexpert-2019-machine-learning-hackathon/code/agg_feature_2.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(lubridate) 3 | 4 | campaign_data <- read_csv("../input/campaign_data.csv") 5 | campaign_data$start_date <- dmy(campaign_data$start_date) 6 | campaign_data$end_date <- dmy(campaign_data$end_date) 7 | campaign_data <- arrange(campaign_data, start_date) 8 | 9 | 10 | customer_transaction_data <- read_csv("../input/customer_transaction_data.csv") 11 | 12 | 13 | #x <- unique(customer_transaction_data$date) 14 | #campaignDates <- campaign_data$start_date 15 | #roundToNearestCampaignDate <- function(x) { 16 | # campaignDates[campaignDates > x][1] 17 | #} 18 | 19 | #df_dates <- tibble(date = unique(customer_transaction_data$date)) 20 | #df_dates <- df_dates %>% 21 | # rowwise() %>% 22 | # mutate(nextCampaignDate = roundToNearestCampaignDate(date)) 23 | 24 | #customer_transaction_data <- left_join(customer_transaction_data, df_dates, by = "date") 25 | 26 | #customer_transaction_df <- customer_transaction_data %>% 27 | #head(100000) %>% 28 | # group_by(nextCampaignDate, customer_id, item_id) %>% 29 | # summarise(quantity_sum = sum(quantity, na.rm = T), 30 | # selling_price_sum = sum(selling_price, na.rm = T), 31 | # other_discount_sum = sum(other_discount, na.rm = T), 32 | # coupon_discount_sum = sum(coupon_discount, na.rm = T)) 33 | 34 | #write_csv(customer_transaction_df, "../input/customer_transaction_df.csv") 35 | 36 | #df_dates <- tibble(campaignDates = campaignDates) 37 | #df_dates$date_1m <- df_dates$campaignDates - 30 38 | #df_dates$date_2m <- df_dates$campaignDates - 60 39 | 40 | for (i in unique(campaign_data$campaign_id)) { 41 | customer_transaction_data[[paste0("campaign_id_", i)]] <- campaign_data$start_date[campaign_data$campaign_id == i] 42 | } 43 | 44 | #[1] 26 27 28 29 30 1 2 3 4 5 6 7 8 9 10 11 12 13 16 17 18 19 20 21 22 23 24 25 45 | 46 | #customer_transaction_df <- tibble() 47 | for (i in unique(campaign_data$campaign_id)) { 48 | for (lagDays in c(seq(30, 30*12, 30))) { 49 | print(paste(i, lagDays)) 50 | customer_transaction_data$CampaignDate <- customer_transaction_data[[paste0("campaign_id_", i)]] 51 | tmp <- customer_transaction_data %>% 52 | filter(date >= CampaignDate - lagDays & date < CampaignDate) %>% 53 | group_by(CampaignDate, customer_id, item_id) %>% 54 | summarise(quantity_sum = sum(quantity, na.rm = T), 55 | selling_price_sum = sum(selling_price, na.rm = T), 56 | other_discount_sum = sum(other_discount, na.rm = T), 57 | coupon_discount_sum = sum(coupon_discount, na.rm = T), 58 | quantity_mean = mean(quantity, na.rm = T), 59 | selling_price_mean = mean(selling_price, na.rm = T), 60 | other_discount_mean = mean(other_discount, na.rm = T), 61 | coupon_discount_mean = mean(coupon_discount, na.rm = T)) 62 | 63 | 64 | if (nrow(tmp) > 0) { 65 | names(tmp)[-(1:3)] <- paste(names(tmp)[-(1:3)], lagDays, sep = "_") 66 | #customer_transaction_df <- bind_rows(customer_transaction_df, tmp) 67 | write_csv(tmp, paste("../input/agg_feat",i, lagDays, ".csv", sep = "_")) 68 | rm(tmp) 69 | } 70 | gc() 71 | } 72 | } 73 | 74 | #write_csv(customer_transaction_df, "../input/agg_feat_2.csv") 75 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad 
Detection/code/3_feature_set1g_capitalLetters.R: --------------------------------------------------------------------------------
1 | ################################################################################################
2 | ################################################################################################
3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants
4 | #### Competition: Avito Duplicate Ad Detection
5 | # Filename : 3_feature_set1g_capitalLetters.R
6 | # Description: This Rscript generates Capital Letter Features
7 | # Usage:
8 | # Rscript ./code/3_feature_set1g_capitalLetters.R train
9 | # Rscript ./code/3_feature_set1g_capitalLetters.R test
10 | # Default argument is test
11 | ################################################################################################
12 | ################################################################################################
13 | 
14 | args <- commandArgs(trailingOnly = F)
15 | BASE <- normalizePath(dirname(sub("^--file=", "", args[grep("^--file=", args)])))
16 | 
17 | 
18 | # Source Config and functions.R file
19 | source(paste(BASE, "/../config.cfg", sep = ""))
20 | source(paste(BASE_DIR, "/code/functions.R", sep = ""))
21 | 
22 | #Load any additional packages
23 | library(parallel)
24 | 
25 | # Read argument for train or test
26 | trainOrTest <- commandArgs(trailingOnly = TRUE)
27 | if (length(trainOrTest) > 1) {
28 |   stop("ERROR: I need only 1 argument : train or test")
29 | }
30 | 
31 | if (length(trainOrTest) == 0) {
32 |   print("No Arguments passed, Assuming you mean test")
33 |   trainOrTest <- "test"
34 | }
35 | 
36 | #Load data
37 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "")
38 | cat("Reading file ", FILENAME, "\n", sep = " ")
39 | dat <- read_csv(FILENAME)
40 | 
41 | 
42 | #Function to generate Capital Letter count features for one text field
43 | getCapitalLetterFeatures <- function(x) {
44 |   wordsWithCapitalLetters <- length(grep("[[:upper:]]", unlist(strsplit(x, " "))))
45 |   countOfCapitalLetters <- length(grep("[[:upper:]]", unlist(strsplit(x, ""))))
46 |   return(c(wordsWithCapitalLetters, countOfCapitalLetters))
47 | }
48 | 
49 | df2 <- data.frame(ID = 1:nrow(dat)) #Else cbind will not work
50 | for (Field in c("title_1", "title_2", "description_1", "description_2")) {
51 |   print(Field)
52 |   df2_temp <- as.data.frame(t(mcmapply(getCapitalLetterFeatures, dat[[Field]], USE.NAMES = F)))
53 |   names(df2_temp) <- c(paste("wordsWithCapitalLetters", Field, sep = "_"), paste("countOfCapitalLetters", Field, sep = "_"))
54 |   df2 <- cbind(df2, df2_temp)
55 | }
56 | for (i in c("title", "description")) {
57 |   for (j in c("wordsWithCapitalLetters", "countOfCapitalLetters")) {
58 |     #print(c(i,j))
59 |     NewField1 <- paste(j, "_", i,"_1", sep = "")
60 |     NewField2 <- paste(j, "_", i,"_2", sep = "")
61 |     #print(c(NewField1,NewField2))
62 |     NewFieldName <- paste("ratio", NewField1, NewField2, sep = "_")
63 |     print(NewFieldName)
64 |     df2[[NewFieldName]] <- df2[[NewField1]] / df2[[NewField2]]
65 |     df2[[NewFieldName]] <- round(ifelse(df2[[NewFieldName]] > 1, 1/df2[[NewFieldName]], df2[[NewFieldName]]), 2) #Fold ratios above 1 (x -> 1/x) so the feature is symmetric in the two items
66 |   }
67 | }
68 | 
69 | df2$ID <- NULL
70 | names(df2) <- paste("set1g", names(df2), sep = "_")
71 | 
72 | 
73 | ######## Add Primary Columns ItemID1 and ItemID2
74 | df2 <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df2)
75 | print("Saving Capital Letter features")
76 | write_feather(df2, paste(cache_loc, "/", "features_", trainOrTest, "_set1g_", "capitalLetters.fthr", sep = "" ))
77 | 
78 | #END
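
# --- Illustrative example (not executed as part of the pipeline) ---
# A small, made-up example of the two raw counts and the folded ratio above:
#
#   getCapitalLetterFeatures("New iPhone 6S")
#   # returns c(3, 3): three words contain a capital letter, three capital letters in total
#
# If title_1 has 3 capital letters and title_2 has 12, the raw ratio is 3/12 = 0.25;
# folding values above 1 (x -> 1/x) keeps every ratio in [0, 1] and makes the feature
# symmetric, so swapping the two ads in a pair gives the same value.
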
79 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set4b_fuzzy_clean.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Marios & Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 3_feature_set4b_fuzzy_clean.py 5 | # Creates clean text features using the fuzzywuzzy python packages 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import sys 10 | import jellyfish 11 | import feather 12 | import time 13 | import gc 14 | from fuzzywuzzy import fuzz 15 | from multiprocessing import Pool 16 | 17 | import libavito as a 18 | 19 | def process_row(row): 20 | values = [] 21 | values.append(row[0]) 22 | values.append(row[1]) 23 | 24 | # iterate over cleaned title/descs/jsons 25 | for d in [2, 4, 6]: 26 | s1 = str(row[d]) 27 | s2 = str(row[d + 1]) 28 | values.append(jellyfish.levenshtein_distance(s1, s2)) 29 | values.append(jellyfish.jaro_distance(s1, s2)) 30 | #values.append(float(jellyfish.damerau_levenshtein_distance(s1,s2)) ) 31 | values.append(fuzz.partial_ratio(s1, s2)) 32 | values.append(fuzz.token_set_ratio(s1, s2)) 33 | values.append(fuzz.ratio(s1, s2)) 34 | values.append(fuzz.token_sort_ratio(s1, s2)) 35 | return values 36 | 37 | print(a.c.BOLD + 'Extracting set4b fuzzy cleaned text features ...' + a.c.END) 38 | 39 | # Get train/test mode from launch argument 40 | mode = a.get_mode(sys.argv, '3_feature_set4b_fuzzy_clean.py') 41 | 42 | ## Read settings required by script 43 | config = a.read_config() 44 | nthreads = config.preprocessing_nthreads 45 | cache_loc = config.cache_loc 46 | debug = config.debug 47 | if mode == 0: 48 | root = config.train_images_root 49 | df = feather.read_dataframe(cache_loc + 'train.fthr') 50 | if mode == 1: 51 | root = config.test_images_root 52 | df = feather.read_dataframe(cache_loc + 'test.fthr') 53 | 54 | df = df[['itemID_1', 'itemID_2', 'cleantitle_1', 'cleantitle_2', 'cleandesc_1', 'cleandesc_2', 'cleanjson_1', 'cleanjson_2']] 55 | 56 | ftrs = [] 57 | 58 | start = time.time() 59 | o = len(df.index) 60 | if nthreads == 1: 61 | print('Extracting features with 1 thread ...') 62 | k = 0 63 | # Iterate over files 64 | ftrs = [] 65 | for row in df.values: 66 | x = process_row(row) 67 | ftrs.append(x) 68 | k += 1 69 | if k % 100 == 0: 70 | a.print_progress(k, start, o) 71 | 72 | # Otherwise perform multi-threaded mapping 73 | else: 74 | print('Extracting features multi-threaded ... ', end='', flush=True) 75 | pool = Pool(nthreads) 76 | ftrs = pool.map(process_row, df.values) 77 | pool.close() 78 | gc.collect() 79 | 80 | a.print_elapsed(start) 81 | 82 | ftrs = pd.DataFrame(ftrs) 83 | cols = ['itemID_1', 'itemID_2'] + ['set4b_X' + str(i) for i in range(1, len(ftrs.columns.tolist()) - 1)] 84 | print(cols) 85 | ftrs.columns = cols 86 | 87 | # Save updated dataset 88 | if mode == 0: 89 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set4b_fuzzy_clean.fthr') 90 | if mode == 1: 91 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set4b_fuzzy_clean.fthr') 92 | 93 | a.print_elapsed(start) 94 | print('set4b extraction complete!') 95 | 96 | # Write status to status file so master script knows whether to proceed. 
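
# --- Illustrative example (not executed as part of the pipeline) ---
# set4b mirrors set4a but works on the cleaned text and adds two jellyfish metrics.
# Note the different scales (the example strings are made up):
#
#   import jellyfish
#   jellyfish.levenshtein_distance('kitten', 'sitting')  # 3, an absolute edit count (unbounded)
#   jellyfish.jaro_distance('kitten', 'sitting')         # despite the name, a similarity in [0, 1], 1.0 only for identical strings
#
# The fuzzywuzzy scores that follow in process_row() are again integers in [0, 100],
# so the set4b columns mix distance-like and similarity-like values; tree-based models
# downstream are largely insensitive to this, but it matters when eyeballing the features.
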
97 | f = open(cache_loc + 'status.txt', 'a') 98 | f.write('feature_set4b_OK\n') 99 | f.close() 100 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set1f_SpecialCounting.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | ################################################################################################ 3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants 4 | #### Competition: Avito Duplicate Ad Detection 5 | # Filename : 3_feature_set1f_SpecialCounting.R 6 | # Description: This Rscript generates all Special Character Counting Features 7 | # Usage: 8 | # Rscript ./code/3_feature_set1f_SpecialCounting.R train 9 | # Rscript ./code/3_feature_set1f_SpecialCounting.R test 10 | # Default argument is test 11 | ################################################################################################ 12 | ################################################################################################ 13 | 14 | args <- commandArgs(trailingOnly = F) 15 | BASE <- normalizePath(dirname(sub("^--file=", "", args[grep("^--file=", args)]))) 16 | 17 | 18 | # Source Config and functions.R file 19 | source(paste(BASE, "/../config.cfg", sep = "")) 20 | source(paste(BASE_DIR, "/code/functions.R", sep = "")) 21 | 22 | #Load any additional packages 23 | library(parallel) 24 | library(stylo) 25 | library(stringr) 26 | library(tm) 27 | 28 | # Read argument for train or test 29 | trainOrTest <- commandArgs(trailingOnly = TRUE) 30 | if (length(trainOrTest) > 1) { 31 | stop("ERROR: I need only 1 argument : train or test") 32 | } 33 | 34 | if (length(trainOrTest) == 0) { 35 | print("No Arguments passed, Assuming you mean test") 36 | trainOrTest <- "test" 37 | } 38 | 39 | #Load data 40 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "") 41 | cat("Reading file ", FILENAME, "\n", sep = " ") 42 | dat <- read_csv(FILENAME) 43 | 44 | 45 | 46 | # Function to generate Features 47 | getFeatures <- function(x, type) { 48 | if (type == "digit") { 49 | lengths((regmatches(x, gregexpr("[[:digit:]]+",x)))) 50 | } else if (type == "cntrl") { 51 | lengths((regmatches(x, gregexpr("[[:cntrl:]]+",x)))) 52 | } else if (type == "graph") { 53 | lengths((regmatches(x, gregexpr("[[:graph:]]+",x)))) 54 | } else if (type == "punct") { 55 | lengths((regmatches(x, gregexpr("[[:punct:]]+",x)))) 56 | } else if (type == "xdigit") { 57 | lengths((regmatches(x, gregexpr("[[:xdigit:]]+",x)))) 58 | } else { 59 | return(NA) 60 | } 61 | } 62 | 63 | print("Generating Count Features") 64 | for (i in c("digit", "graph", "punct", "xdigit")) { 65 | for (j in c("cleantitle_1", "cleantitle_2", "cleandesc_1", "cleandesc_2")) { 66 | print(c(i,j)) 67 | assign( 68 | paste("countOf", i, "In", j , sep = "_"), 69 | sapply(dat[[j]], getFeatures, type = i, USE.NAMES = FALSE) 70 | ) 71 | } 72 | } 73 | 74 | print("Generating Ratio Features") 75 | for (i in c("_digit", "_graph_", "_punct_", "_xdigit_")) { 76 | for (j in c("title", "desc")) { 77 | print(c(i, j)) 78 | f_name <- grep(i, grep(j, ls(), value = T), value = T) 79 | ratio <- get(f_name[1]) / get(f_name[2]) 80 | ratio <- ifelse(ratio > 1, 1/ratio, ratio) 81 | assign( 82 | paste("ratioOfcountOf", i, "In", j , sep = "_"), 83 | round(ratio, 2) 84 | ) 85 | } 86 | } 87 | 88 | df_master <- as.data.frame(do.call(cbind, 
list(sapply(grep("countOf", ls(), value = T), get, USE.NAMES = T))))
89 | names(df_master) <- paste("set1f", names(df_master), sep = "_")
90 | 
91 | ######## Add Primary Columns ItemID1 and ItemID2
92 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master)
93 | print("Saving Special Counting features")
94 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1f_", "specialCounting.fthr", sep = "" ))
95 | 
96 | #END
97 | 
-------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/legacy/3_feature_set4e_count3way_clean.py: --------------------------------------------------------------------------------
1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants
2 | #### Author: Marios & Mikel
3 | #### Avito Duplicate Ad Detection
4 | # 3_feature_set4e_count3way_clean.py
5 | # Counts how many 3-random-grams in item1 appear in item2
6 | 
7 | import numpy as np
8 | import pandas as pd
9 | import sys
10 | import jellyfish
11 | import feather
12 | import time
13 | import gc
14 | import re
15 | import math
16 | from collections import Counter
17 | from fuzzywuzzy import fuzz
18 | from multiprocessing import Pool
19 | 
20 | import libavito as a
21 | 
22 | def count_3words(words, text):
23 |     # Count how many 3-word combinations from `words` have all three words present in `text`; returns -1 if either side has fewer than 3 tokens.
24 |     count3 = 0
25 |     if len(words) < 3 or len(text) < 3:
26 |         return -1
27 |     else:
28 |         for m in range(0, len(words) - 2):
29 |             words1 = words[m]
30 |             for n in range(m + 1, len(words) - 1):
31 |                 words2 = words[n]
32 |                 for z in range(m + 2, len(words)):
33 |                     words3 = words[z]
34 |                     if words1 in text and words2 in text and words3 in text:
35 |                         count3 += 1
36 |         return count3
37 | 
38 | def process_row(row):
39 | 
40 |     title = 2
41 |     desc = 4
42 |     json = 6
43 | 
44 |     values = []
45 | 
46 |     values.append(row[0])
47 |     values.append(row[1])
48 | 
49 |     for d in [title, desc, json]:
50 |         st_1 = str(row[d]).replace(":", " ").replace('"', ' ')
51 |         st_2 = str(row[d + 1]).replace(":", " ").replace('"', ' ')
52 |         values.append(count_3words(st_1.split(" "), st_2.split(" ")))
53 | 
54 |     return values
55 | 
56 | print(a.c.BOLD + 'Extracting set4e 3-way word count features ...' + a.c.END)
57 | 
58 | # Get train/test mode from launch argument
59 | mode = a.get_mode(sys.argv, '3_feature_set4e_count3way_clean.py')
60 | 
61 | ## Read settings required by script
62 | config = a.read_config()
63 | nthreads = config.preprocessing_nthreads
64 | cache_loc = config.cache_loc
65 | debug = config.debug
66 | if mode == 0:
67 |     root = config.train_images_root
68 |     df = feather.read_dataframe(cache_loc + 'train.fthr')
69 | if mode == 1:
70 |     root = config.test_images_root
71 |     df = feather.read_dataframe(cache_loc + 'test.fthr')[:1000]  # NOTE: only the first 1000 test rows are read here
72 | 
73 | df = df[['itemID_1', 'itemID_2', 'cleantitle_1', 'cleantitle_2', 'cleandesc_1', 'cleandesc_2', 'cleanjson_1', 'cleanjson_2']]
74 | 
75 | ftrs = []
76 | 
77 | start = time.time()
78 | o = len(df.index)
79 | if nthreads == 1:
80 |     print('Extracting features with 1 thread ...')
81 |     k = 0
82 |     # Iterate over files
83 |     ftrs = []
84 |     for row in df.values:
85 |         x = process_row(row)
86 |         ftrs.append(x)
87 |         k += 1
88 |         if k % 1 == 0:
89 |             a.print_progress(k, start, o)
90 | 
91 | # Otherwise perform multi-threaded mapping
92 | else:
93 |     print('Extracting features multi-threaded ... 
', end='', flush=True) 94 | pool = Pool(nthreads) 95 | ftrs = pool.map(process_row, df.values) 96 | pool.close() 97 | gc.collect() 98 | 99 | a.print_elapsed(start) 100 | 101 | ftrs = pd.DataFrame(ftrs) 102 | cols = ['itemID_1', 'itemID_2'] + ['set4e_X' + str(i) for i in range(1, len(ftrs.columns.tolist()) - 1)] 103 | print(cols) 104 | ftrs.columns = cols 105 | 106 | # Save updated dataset 107 | if mode == 0: 108 | feather.write_dataframe(ftrs, cache_loc + 'feature_set4e_train.fthr') 109 | if mode == 1: 110 | feather.write_dataframe(ftrs, cache_loc + 'feature_set4e_test.fthr') 111 | 112 | a.print_elapsed(start) 113 | print('set4e extraction complete!') 114 | 115 | # Write status to status file so master script knows whether to proceed. 116 | f = open(cache_loc + 'status.txt', 'a') 117 | f.write('feature_set4e_OK\n') 118 | f.close() 119 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set3d_json1.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Peter & Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 3_feature_set3d_json1.py 5 | # Creates json jaccard similarity 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import nltk 10 | import sklearn 11 | import json 12 | import sys 13 | import gc 14 | import feather 15 | from pandas.io.json import json_normalize 16 | import unicodedata 17 | from stop_words import get_stop_words 18 | import time 19 | 20 | import libavito as a 21 | 22 | stopwords = get_stop_words('ru') 23 | punctutation_cats = set(['Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po']) 24 | sno = nltk.stem.SnowballStemmer('russian') 25 | 26 | def get_clean_tokens(text): 27 | newtext = [] 28 | text0 = nltk.word_tokenize(text, 'russian') 29 | for y in text0: 30 | y = ''.join(x for x in y 31 | if unicodedata.category(x) not in punctutation_cats) 32 | if len(y) > 0 and y not in stopwords: 33 | newtext.append(sno.stem(y)) 34 | return newtext 35 | 36 | def jaccard_similarity(x, y): 37 | intersection_cardinality = len(set.intersection(*[set(x), set(y)])) 38 | union_cardinality = len(set.union(*[set(x), set(y)])) 39 | if union_cardinality == 0: 40 | return -1.0 41 | else: 42 | return intersection_cardinality / float(union_cardinality) 43 | 44 | def ratio_of_matches(x, y): 45 | intersection_cardinality = len(set.intersection(*[set(x), set(y)])) 46 | x_cardinality = len(x) 47 | if x_cardinality == 0: 48 | return -1.0 49 | else: 50 | return intersection_cardinality / float(x_cardinality) 51 | 52 | print(a.c.BOLD + 'Extracting set3d JSON features ...' 
+ a.c.END) 53 | 54 | # Get train/test mode from launch argument 55 | mode = a.get_mode(sys.argv, '3_feature_set3d_json1.py') 56 | 57 | ## Read settings required by script 58 | config = a.read_config() 59 | nthreads = config.preprocessing_nthreads 60 | cache_loc = config.cache_loc 61 | debug = config.debug 62 | if mode == 0: 63 | root = config.train_images_root 64 | df = feather.read_dataframe(cache_loc + 'train.fthr') 65 | if mode == 1: 66 | root = config.test_images_root 67 | df = feather.read_dataframe(cache_loc + 'test.fthr') 68 | 69 | train = df[['itemID_1', 'itemID_2', 'attrsJSON_1', 'attrsJSON_2']] 70 | del df 71 | gc.collect() 72 | 73 | train = train.fillna('') 74 | 75 | ftrs = [] 76 | 77 | print('Calculating features ...') 78 | t0 = time.time() 79 | for i in range(0, len(train.index)): 80 | if i % 10000 == 0: 81 | a.print_progress(i, t0, len(train.index)) 82 | try: 83 | jx = train.iloc[i]['attrsJSON_1'].lower() 84 | jy = train.iloc[i]['attrsJSON_2'].lower() 85 | resx = json.loads(jx) 86 | resy = json.loads(jy) 87 | similarkeys = jaccard_similarity(resx.keys(), resy.keys()) 88 | similarvals = jaccard_similarity(resx.values(), resy.values()) 89 | #out = str(train.iloc[i]['itemID_1']) + " " + str(train.iloc[i]['itemID_2']) + " " + str(similarkeys) + " " + str(similarvals)+ " " + str(len(resx)) + " " + str(len(resy)) + "\n" 90 | ftrs.append([train.iloc[i]['itemID_1'], train.iloc[i]['itemID_2'], similarkeys, similarvals, len(resx), len(resy)]) 91 | except: 92 | pass 93 | 94 | start = time.time() 95 | print('Caching data to disk ... ', end='', flush=True) 96 | ftrs = pd.DataFrame(ftrs) 97 | ftrs.columns = ['itemID_1', 'itemID_2', 'similarkeys', 'similarvals', 'nkey1', 'nkey2'] 98 | 99 | # Save updated dataset 100 | if mode == 0: 101 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set3d.fthr') 102 | if mode == 1: 103 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set3d.fthr') 104 | 105 | a.print_elapsed(start) 106 | print('set3d extraction complete!') 107 | 108 | # Write status to status file so master script knows whether to proceed. 
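
# --- Illustrative example (not executed as part of the pipeline) ---
# A worked example of the two similarities computed above, with made-up attrsJSON dicts:
#
#   x = {"color": "black", "size": "m"}
#   y = {"color": "white", "size": "m", "brand": "acme"}
#   jaccard_similarity(x.keys(), y.keys())      # {color, size} vs {color, size, brand} -> 2/3
#   jaccard_similarity(x.values(), y.values())  # {black, m} vs {white, m, acme}        -> 1/4
#
# Together with the two key counts len(resx) and len(resy), these form the set3d columns.
# Rows whose JSON fails to parse are skipped by the bare try/except above, so the output
# can contain fewer rows than the input pairs.
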
109 | f = open(cache_loc + 'status.txt', 'a') 110 | f.write('feature_set3d_OK\n') 111 | f.close() 112 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set3c_json.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Peter & Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 3_feature_set3c_json.py 5 | # Creates some features from clean jsons 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import nltk 10 | import sklearn 11 | import json 12 | import sys 13 | import gc 14 | import feather 15 | from pandas.io.json import json_normalize 16 | import unicodedata 17 | from stop_words import get_stop_words 18 | import time 19 | from multiprocessing import Pool 20 | 21 | import libavito as a 22 | 23 | stopwords = get_stop_words('ru') 24 | punctutation_cats = set(['Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po']) 25 | sno = nltk.stem.SnowballStemmer('russian') 26 | 27 | def get_clean_tokens(text): 28 | newtext = [] 29 | text0 = nltk.word_tokenize(text, 'russian') 30 | for y in text0: 31 | y = ''.join(x for x in y 32 | if unicodedata.category(x) not in punctutation_cats) 33 | if len(y) > 0 and y not in stopwords: 34 | newtext.append(sno.stem(y)) 35 | return newtext 36 | 37 | def jaccard_similarity(x, y): 38 | intersection_cardinality = len(set.intersection(*[set(x), set(y)])) 39 | union_cardinality = len(set.union(*[set(x), set(y)])) 40 | if union_cardinality == 0: 41 | return -1.0 42 | else: 43 | return intersection_cardinality / float(union_cardinality) 44 | 45 | def ratio_of_matches(x, y): 46 | intersection_cardinality = len(set.intersection(*[set(x), set(y)])) 47 | x_cardinality = len(x) 48 | if x_cardinality == 0: 49 | return -1.0 50 | else: 51 | return intersection_cardinality / float(x_cardinality) 52 | 53 | print(a.c.BOLD + 'Extracting set3c JSON features ...' + a.c.END) 54 | 55 | # Get train/test mode from launch argument 56 | mode = a.get_mode(sys.argv, '3_feature_set3c_json.py') 57 | 58 | ## Read settings required by script 59 | config = a.read_config() 60 | nthreads = config.preprocessing_nthreads 61 | cache_loc = config.cache_loc 62 | debug = config.debug 63 | if mode == 0: 64 | root = config.train_images_root 65 | df = feather.read_dataframe(cache_loc + 'train.fthr') 66 | if mode == 1: 67 | root = config.test_images_root 68 | df = feather.read_dataframe(cache_loc + 'test.fthr') 69 | 70 | train = df[['itemID_1', 'itemID_2', 'attrsJSON_1', 'attrsJSON_2']] 71 | del df 72 | gc.collect() 73 | 74 | train = train.fillna('') 75 | 76 | ftrs = [] 77 | 78 | def process_row(i): 79 | jx = get_clean_tokens(train.iloc[i]['attrsJSON_1']) 80 | jy = get_clean_tokens(train.iloc[i]['attrsJSON_2']) 81 | sim_j = jaccard_similarity(jx, jy) 82 | mat1_j = ratio_of_matches(jx, jy) 83 | mat2_j = ratio_of_matches(jy, jx) 84 | return [train.iloc[i]['itemID_1'], train.iloc[i]['itemID_2'], sim_j, mat1_j, mat2_j] 85 | 86 | t0 = time.time() 87 | if nthreads == 1: 88 | print('Extracting features with 1 thread ...') 89 | for i in range(0, len(train.index)): 90 | if i % 10000 == 0: 91 | a.print_progress(i, t0, len(train.index)) 92 | ftrs.append(process_row(i)) 93 | else: 94 | print('Extracting features multi-threaded ... 
', end='', flush=True) 95 | pool = Pool(nthreads) 96 | ftrs = pool.map(process_row, range(0, len(train.index))) 97 | pool.close() 98 | a.print_elapsed(t0) 99 | 100 | start = time.time() 101 | print('Caching data to disk ... ', end='', flush=True) 102 | ftrs = pd.DataFrame(ftrs) 103 | ftrs.columns = ['itemID_1', 'itemID_2', 'simjson', 'matjson1', 'matjson2'] 104 | 105 | # Save updated dataset 106 | if mode == 0: 107 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set3c.fthr') 108 | if mode == 1: 109 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set3c.fthr') 110 | 111 | a.print_elapsed(start) 112 | print('set3c extraction complete!') 113 | 114 | # Write status to status file so master script knows whether to proceed. 115 | f = open(cache_loc + 'status.txt', 'a') 116 | f.write('feature_set3c_OK\n') 117 | f.close() 118 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set3b_title.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Peter & Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 3_feature_set3b_title.py 5 | # Creates some features from clean titles 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import nltk 10 | import sklearn 11 | import json 12 | import sys 13 | import gc 14 | import feather 15 | from pandas.io.json import json_normalize 16 | import unicodedata 17 | from stop_words import get_stop_words 18 | import time 19 | from multiprocessing import Pool 20 | 21 | import libavito as a 22 | 23 | stopwords = get_stop_words('ru') 24 | punctutation_cats = set(['Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po']) 25 | sno = nltk.stem.SnowballStemmer('russian') 26 | 27 | def get_clean_tokens(text): 28 | newtext = [] 29 | text0 = nltk.word_tokenize(text, 'russian') 30 | for y in text0: 31 | y = ''.join(x for x in y 32 | if unicodedata.category(x) not in punctutation_cats) 33 | if len(y) > 0 and y not in stopwords: 34 | newtext.append(sno.stem(y)) 35 | return newtext 36 | 37 | def jaccard_similarity(x, y): 38 | intersection_cardinality = len(set.intersection(*[set(x), set(y)])) 39 | union_cardinality = len(set.union(*[set(x), set(y)])) 40 | if union_cardinality == 0: 41 | return -1.0 42 | else: 43 | return intersection_cardinality / float(union_cardinality) 44 | 45 | def ratio_of_matches(x, y): 46 | intersection_cardinality = len(set.intersection(*[set(x), set(y)])) 47 | x_cardinality = len(x) 48 | if x_cardinality == 0: 49 | return -1.0 50 | else: 51 | return intersection_cardinality / float(x_cardinality) 52 | 53 | print(a.c.BOLD + 'Extracting set3b title features ...' 
+ a.c.END) 54 | 55 | # Get train/test mode from launch argument 56 | mode = a.get_mode(sys.argv, '3_feature_set3b_title.py') 57 | 58 | ## Read settings required by script 59 | config = a.read_config() 60 | nthreads = config.preprocessing_nthreads 61 | cache_loc = config.cache_loc 62 | debug = config.debug 63 | if mode == 0: 64 | root = config.train_images_root 65 | df = feather.read_dataframe(cache_loc + 'train.fthr') 66 | if mode == 1: 67 | root = config.test_images_root 68 | df = feather.read_dataframe(cache_loc + 'test.fthr') 69 | 70 | train = df[['itemID_1', 'itemID_2', 'cleantitle_1', 'cleantitle_2']] 71 | del df 72 | gc.collect() 73 | 74 | train = train.fillna('') 75 | 76 | ftrs = [] 77 | 78 | def process_row(i): 79 | tx = train.iloc[i]['cleantitle_1'].split(' ') 80 | ty = train.iloc[i]['cleantitle_2'].split(' ') 81 | sim_t = jaccard_similarity(tx, ty) 82 | mat1_t = ratio_of_matches(tx, ty) 83 | mat2_t = ratio_of_matches(ty, tx) 84 | return [train.iloc[i]['itemID_1'], train.iloc[i]['itemID_2'], sim_t, mat1_t, mat2_t, len(tx), len(ty)] 85 | 86 | t0 = time.time() 87 | if nthreads == 1: 88 | print('Extracting features with 1 thread ...') 89 | for i in range(0, len(train.index)): 90 | if i % 10000 == 0: 91 | a.print_progress(i, t0, len(train.index)) 92 | ftrs.append(process_row(i)) 93 | else: 94 | print('Extracting features multi-threaded ... ', end='', flush=True) 95 | pool = Pool(nthreads) 96 | ftrs = pool.map(process_row, range(0, len(train.index))) 97 | pool.close() 98 | a.print_elapsed(t0) 99 | 100 | start = time.time() 101 | print('Caching data to disk ... ', end='', flush=True) 102 | ftrs = pd.DataFrame(ftrs) 103 | ftrs.columns = ['itemID_1', 'itemID_2', 'simtitle', 'mattitle1', 'mattitle2', 'nwords1', 'nwords2'] 104 | 105 | # Save updated dataset 106 | if mode == 0: 107 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set3b.fthr') 108 | if mode == 1: 109 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set3b.fthr') 110 | 111 | a.print_elapsed(start) 112 | print('set3b extraction complete!') 113 | 114 | # Write status to status file so master script knows whether to proceed. 
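
# --- Illustrative example (not executed as part of the pipeline) ---
# jaccard_similarity() is symmetric while ratio_of_matches() is not, which is why both
# directions are kept as separate title features. With made-up token lists:
#
#   tx = ['red', 'dress', 'size', 'm']
#   ty = ['red', 'dress']
#   jaccard_similarity(tx, ty)  # 2 shared tokens / 4 in the union -> 0.5
#   ratio_of_matches(tx, ty)    # 2 shared tokens / len(tx) = 4    -> 0.5
#   ratio_of_matches(ty, tx)    # 2 shared tokens / len(ty) = 2    -> 1.0
#
# The two directions diverge whenever one title is close to a subset of the other,
# for example when one ad's title is a truncated version of the other's.
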
115 | f = open(cache_loc + 'status.txt', 'a') 116 | f.write('feature_set3b_OK\n') 117 | f.close() 118 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/1_internship_WordCorrection.R: -------------------------------------------------------------------------------- 1 | library(qdap) 2 | library(dplyr) 3 | library(tidyr) 4 | library(readr) 5 | library(stringr) 6 | library(tm) 7 | 8 | #LOAD DATA 9 | internship <- read_csv("../data/Internship.csv", na = c("", "NA", "NULL")) 10 | 11 | SPELLINGERRORS <- check_spelling(internship$Internship_Profile, 12 | assume.first.correct = TRUE, 13 | n.suggests = 4) 14 | SPELLINGERRORS <- data.frame(lapply(SPELLINGERRORS, as.character), 15 | stringsAsFactors=FALSE) %>% 16 | select(not.found, suggestion) 17 | #Remove Duplicate rows 18 | SPELLINGERRORS <- SPELLINGERRORS[!duplicated(SPELLINGERRORS[1:2]), ] 19 | 20 | #Now check sort(SPELLINGERRORS$not.found) and see which are actual spelling mistakes, which are correct but need modification 21 | #Below are what I have observed: 22 | SPELL_MISTAKES <- c("activites", "ambassodor", "andoid", "andorid", "andriod", "anubhava","autid","bussiness","chemsitry", 23 | "coordinaing","cosnulting","develoment","developement","develpoment","enrolment","facilitation", 24 | "finanace","managemnt","managment","mangement","marekting","markting","notejs","nutritionist","oflline","optimaization", 25 | "optimization","optmization","pharmacovigilance","reasearch","recruiter","professonal","requirment","retreival","socia", 26 | "trbology","tution","varification","vertification","writitng") 27 | 28 | SPELLINGERRORS <- SPELLINGERRORS[(SPELLINGERRORS$not.found %in% SPELL_MISTAKES), ] 29 | SIMILAR_WORDS <- list( 30 | c("apps", "app"), 31 | c("Accounting", "Accountant"), 32 | c("back-end", "backend"), 33 | c("beckend", "backend"), 34 | c("back end", "backend"), 35 | c("blog", "blogger"), 36 | c("blogging", "blogger"), 37 | c("blogs", "blogger"), 38 | c("cataloguing" ,"catalogue"), 39 | c("curating", "curation"), 40 | c("desiging", "design"), 41 | c("desigining", "design"), 42 | c("designe", "design"), 43 | c("telecalling", "telecaller"), 44 | c("telecommunications", "telecom"), 45 | c("trbology" , "tribology"), 46 | c("oflline", "offline") 47 | ) 48 | m <- matrix(unlist(SIMILAR_WORDS), byrow = TRUE, ncol = 2) 49 | colnames(m) <- c("not.found", "suggestion") 50 | SPELLINGERRORS <- rbind(SPELLINGERRORS, m) 51 | 52 | #Function to replace Spelling errors 53 | replaceSpellingErrors <- function(words) { 54 | b <- c() 55 | for (i in unlist(strsplit(words, " "))) { 56 | if (i %in% SPELLINGERRORS$not.found) { 57 | b <- append(b, SPELLINGERRORS$suggestion[SPELLINGERRORS$not.found == i]) 58 | } else { 59 | b <- append(b, i) 60 | } 61 | } 62 | return(paste(b, collapse = " ")) 63 | } 64 | 65 | #Function to remove all unwanted stuff 66 | cleanUpText <- function(words, stem = TRUE) { 67 | #Remove all graph characters 68 | words <- str_replace_all(words,"[^[:graph:]]", " ") 69 | words <- gsub("[^[:alpha:][:space:]]*", "", words) 70 | words <- tolower(words) 71 | #Remove Punctuation except Hyphen - 72 | words <- gsub("([-])|[[:punct:]]", '\\1', words) 73 | #Remove all extra whitespace 74 | gsub("\\s+", " ", str_trim(words)) 75 | #Replace all spelling errors 76 | words <- replaceSpellingErrors(words) 77 | #Stemming if stem = TRUE 78 | stemList <- c() 79 | if (stem) { 80 | for (i in words) { 81 | i <- gsub("[[:punct:]]$", "", i) #Remove any trailing punctuation mark 82 | i <- gsub("^[[:punct:]]", 
"", i) #Remove any leading punctuation mark 83 | j <- paste(stemDocument(unlist(strsplit(i," "))), collapse = " ") 84 | stemList <- append(stemList, j) 85 | } 86 | return(stemList) 87 | } else { 88 | return(words) 89 | } 90 | } 91 | 92 | t <- Sys.time() 93 | for (i in c("Internship_Profile")) { 94 | print(i) 95 | #internship[[i]] <- cleanUpText(internship[[i]], stem = TRUE) 96 | internship[[i]] <- sapply(internship[[i]], cleanUpText) 97 | } 98 | print(Sys.time()-t) 99 | 100 | #Save file 101 | write.csv(internship, "../data/Internship_Processed.csv", row.names = FALSE) 102 | 103 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set3a_description.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Peter & Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 3_feature_set3a_description.py 5 | # Creates some features from clean descriptions 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import nltk 10 | import sklearn 11 | import json 12 | import sys 13 | import gc 14 | import feather 15 | from pandas.io.json import json_normalize 16 | import unicodedata 17 | from stop_words import get_stop_words 18 | import time 19 | from multiprocessing import Pool 20 | 21 | import libavito as a 22 | 23 | stopwords = get_stop_words('ru') 24 | punctutation_cats = set(['Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po']) 25 | sno = nltk.stem.SnowballStemmer('russian') 26 | 27 | def get_clean_tokens(text): 28 | newtext = [] 29 | text0 = nltk.word_tokenize(text, 'russian') 30 | for y in text0: 31 | y = ''.join(x for x in y 32 | if unicodedata.category(x) not in punctutation_cats) 33 | if len(y) > 0 and y not in stopwords: 34 | newtext.append(sno.stem(y)) 35 | return newtext 36 | 37 | def jaccard_similarity(x, y): 38 | intersection_cardinality = len(set.intersection(*[set(x), set(y)])) 39 | union_cardinality = len(set.union(*[set(x), set(y)])) 40 | if union_cardinality == 0: 41 | return -1.0 42 | else: 43 | return intersection_cardinality / float(union_cardinality) 44 | 45 | def ratio_of_matches(x, y): 46 | intersection_cardinality = len(set.intersection(*[set(x), set(y)])) 47 | x_cardinality = len(x) 48 | if x_cardinality == 0: 49 | return -1.0 50 | else: 51 | return intersection_cardinality / float(x_cardinality) 52 | 53 | print(a.c.BOLD + 'Extracting set3a description features ...' 
+ a.c.END) 54 | 55 | # Get train/test mode from launch argument 56 | mode = a.get_mode(sys.argv, '3_feature_set3a_description.py') 57 | 58 | ## Read settings required by script 59 | config = a.read_config() 60 | nthreads = config.preprocessing_nthreads 61 | cache_loc = config.cache_loc 62 | debug = config.debug 63 | if mode == 0: 64 | root = config.train_images_root 65 | df = feather.read_dataframe(cache_loc + 'train.fthr') 66 | if mode == 1: 67 | root = config.test_images_root 68 | df = feather.read_dataframe(cache_loc + 'test.fthr') 69 | 70 | train = df[['itemID_1', 'itemID_2', 'cleandesc_1', 'cleandesc_2']] 71 | del df 72 | gc.collect() 73 | 74 | train = train.fillna('') 75 | 76 | ftrs = [] 77 | 78 | def process_row(i): 79 | dx = train.iloc[i]['cleandesc_1'].split(' ') 80 | dy = train.iloc[i]['cleandesc_2'].split(' ') 81 | sim_d = jaccard_similarity(dx, dy) 82 | mat1_d = ratio_of_matches(dx, dy) 83 | mat2_d = ratio_of_matches(dy, dx) 84 | return [train.iloc[i]['itemID_1'], train.iloc[i]['itemID_2'], sim_d, mat1_d, mat2_d, len(dx), len(dy)] 85 | 86 | # print('Calculating features ...') 87 | t0 = time.time() 88 | if nthreads == 1: 89 | print('Extracting features with 1 thread ...') 90 | for i in range(0, len(train.index)): 91 | if i % 10000 == 0: 92 | a.print_progress(i, t0, len(train.index)) 93 | ftrs.append(process_row(i)) 94 | else: 95 | print('Extracting features multi-threaded ... ', end='', flush=True) 96 | pool = Pool(nthreads) 97 | ftrs = pool.map(process_row, range(0, len(train.index))) 98 | pool.close() 99 | a.print_elapsed(t0) 100 | 101 | start = time.time() 102 | print('Caching data to disk ... ', end='', flush=True) 103 | ftrs = pd.DataFrame(ftrs) 104 | ftrs.columns = ['itemID_1', 'itemID_2', 'simdesc', 'mat1_d', 'mat2_d', 'nwords1', 'nwords2'] 105 | 106 | # Save updated dataset 107 | if mode == 0: 108 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set3a.fthr') 109 | if mode == 1: 110 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set3a.fthr') 111 | 112 | a.print_elapsed(start) 113 | print('set3a extraction complete!') 114 | 115 | # Write status to status file so master script knows whether to proceed. 
116 | f = open(cache_loc + 'status.txt', 'a') 117 | f.write('feature_set3a_OK\n') 118 | f.close() 119 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set1e_attribute.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | ################################################################################################ 3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants 4 | #### Competition: Avito Duplicate Ad Detection 5 | # Filename : 3_feature_set1e_attribute.R 6 | # Description: This Rscript generates all Attribute (Json) features 7 | # Usage: 8 | # Rscript ./code/3_feature_set1e_attribute.R train 9 | # Rscript ./code/3_feature_set1e_attribute.R test 10 | # Default argument is test 11 | ################################################################################################ 12 | ################################################################################################ 13 | args <- commandArgs(trailingOnly = F) 14 | BASE <- normalizePath(dirname(sub("^--file=", "", args[grep("^--file=", args)]))) 15 | 16 | 17 | # Source Config and functions.R file 18 | source(paste(BASE, "/../config.cfg", sep = "")) 19 | source(paste(BASE_DIR, "/code/functions.R", sep = "")) 20 | 21 | #Load any additional packages 22 | library(parallel) 23 | library(jsonlite) 24 | 25 | # Read argument for train or test 26 | trainOrTest <- commandArgs(trailingOnly = TRUE) 27 | if (length(trainOrTest) > 1) { 28 | stop("ERROR: I need only 1 argument : train or test") 29 | } 30 | 31 | if (length(trainOrTest) == 0) { 32 | print("No Arguments passed, Assuming you mean test") 33 | trainOrTest <- "test" 34 | } 35 | 36 | #Load data 37 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "") 38 | cat("Reading file ", FILENAME, "\n", sep = " ") 39 | dat <- read_csv(FILENAME) 40 | 41 | 42 | 43 | #Function to generate Attribute Features 44 | attribute_feature <- function(w) { 45 | x <- w[1] 46 | y <- w[2] 47 | if (is.na(x) | is.na(y) | x == "[]" | y == "[]") { 48 | return(rep(NA,8)) 49 | } 50 | x <- paste("[", x, "]", sep = "") 51 | y <- paste("[", y, "]", sep = "") 52 | x.df <- fromJSON(x, simplifyDataFrame = TRUE) 53 | y.df <- fromJSON(y, simplifyDataFrame = TRUE) 54 | N_Attr_x <- ncol(x.df) 55 | N_Attr_y <- ncol(y.df) 56 | if (N_Attr_x == 0 | N_Attr_y == 0) { 57 | return(rep(NA,8)) 58 | } 59 | L <- length(intersect(names(x.df), names(y.df))) 60 | ratioOfPercentageOfMatchingAttributesNames <- L / min(N_Attr_x, N_Attr_y) 61 | ratioOfPercentageOfMatchingAttributesValues <- NA 62 | c <- 0 63 | if (ratioOfPercentageOfMatchingAttributesNames > 0) { 64 | for (i in intersect(names(x.df), names(y.df))) { 65 | if (x.df[[i]] == y.df[[i]]) { 66 | c <- c + 1 67 | } 68 | } 69 | ratioOfPercentageOfMatchingAttributesValues <- c / L 70 | } 71 | numberOfAttributes_sum <- N_Attr_x + N_Attr_y 72 | numberOfAttributes_diff <- abs(N_Attr_x - N_Attr_y) 73 | numberOfAttributes_min <- min(N_Attr_x, N_Attr_y) 74 | numberOfAttributes_max <- max(N_Attr_x, N_Attr_y) 75 | 76 | return(c( 77 | numberOfAttributes_sum, 78 | numberOfAttributes_diff, 79 | numberOfAttributes_min, 80 | numberOfAttributes_max, 81 | L, 82 | ratioOfPercentageOfMatchingAttributesNames, 83 | c, 84 | ratioOfPercentageOfMatchingAttributesValues 85 | )) 86 | 87 | 88 | } 89 | 90 | print("Generating Features") 91 | 
#This can be made Parallel , I didnt do that as of now 92 | df_master <- as.data.frame(t(apply(dat[, c("cleanjson_1", "cleanjson_2")], 1, attribute_feature))) 93 | names(df_master) <- c( 94 | "numberOfAttributes_sum", 95 | "numberOfAttributes_diff", 96 | "numberOfAttributes_min", 97 | "numberOfAttributes_max", 98 | "NoOfMatchingAttributesNames", 99 | "ratioOfPercentageOfMatchingAttributesNames", 100 | "NoOfMatchingAttributesValues", 101 | "ratioOfPercentageOfMatchingAttributesValues" 102 | ) 103 | 104 | names(df_master) <- paste("set1e", names(df_master), sep = "_") 105 | 106 | ######## Add Primary Columns ItemID1 and ItemID2 107 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master) 108 | print("Saving Attributes features") 109 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1e_", "attributes.fthr", sep = "" )) 110 | 111 | #END 112 | -------------------------------------------------------------------------------- /Microsoft/Womens-Health-Risk-Assessment/Predict.R: -------------------------------------------------------------------------------- 1 | #(c) Sonny Laskar (sonnylaskar at gmail Dot Com) 2 | #Create a zip file with all packages which are not available in Microsoft Azure environment and upload the zip. 3 | #The zip file is available in "src" folder. My zip was named downloaded_packages.zip 4 | install.packages("src/downloaded_packages/stringi_1.1.1.zip", lib = ".", repos = NULL, verbose = TRUE) 5 | install.packages("src/downloaded_packages/magrittr_1.5.zip", lib = ".", repos = NULL, verbose = TRUE) 6 | install.packages("src/downloaded_packages/xgboost_0.4-4.zip", lib = ".", repos = NULL, verbose = TRUE) 7 | 8 | library(xgboost, lib.loc=".", verbose=TRUE) 9 | library(dplyr) 10 | library(gbm) 11 | library(randomForest) 12 | # Map 1-based optional input ports to variables 13 | dataset1 <- maml.mapInputPort(1) # class: data.frame 14 | dataset1$segment <- NULL 15 | dataset1$subgroup <- NULL 16 | cat("Original dim: ", dim(dataset1), "\n") 17 | 18 | 19 | encode_religion <- function(dat) { 20 | #Input: Character Vector for religion 21 | #Output: Numeric Vector 22 | dat <- ifelse(dat == "Buddhist", 1, dat) 23 | dat <- ifelse(dat == "Evangelical/Bo", 2, dat) 24 | dat <- ifelse(dat == "Hindu", 3, dat) 25 | dat <- ifelse(dat == "Jewish", 4, dat) 26 | dat <- ifelse(dat == "Muslim", 5, dat) 27 | dat <- ifelse(dat == "Other", 6, dat) 28 | dat <- ifelse(dat == "Other Christia", 7, dat) 29 | dat <- ifelse(dat == "Roman Catholic", 8, dat) 30 | dat <- ifelse(dat == "Russian/Easter", 9, dat) 31 | dat <- ifelse(dat == "Traditional/An", 10, dat) 32 | dat <- ifelse(dat == "", NA, dat) 33 | dat <- as.integer(dat) 34 | return(dat) 35 | } 36 | 37 | manual_encode_religion <- function(dat) { 38 | #Input: Character Vector for religion 39 | #Output: Numeric Vector 40 | RELIGION <- c("Hindu", "Evangelical/Bo", "Muslim", "Roman Catholic", "Other Christia", "Buddhist", "Russian/Easter", "Traditional/An", "Other", "Jewish") 41 | for (i in RELIGION) { 42 | c <- paste("religion", i, sep = ".") 43 | print(c) 44 | dat[[c]] <- ifelse(dat$religion == i, 1, 0) 45 | } 46 | dat$religion <- encode_religion(dat$religion) 47 | return(dat) 48 | } 49 | 50 | featureEngineering <- function(dat) { 51 | dat$INTNR <- NULL 52 | dat$geo <- as.integer(dat$geo) 53 | dat <- manual_encode_religion(dat) 54 | dat$segment <- NULL 55 | dat$subgroup <- NULL 56 | dat[is.na(dat)] <- -1 57 | dat$christian <- as.numeric(dat$christian) #Xgboost needs at least one column as numeric 58 | 
#Random Forest cannot handle / and space in colnames 59 | names(dat) <- gsub("/", "_", names(dat)) 60 | names(dat) <- gsub(" ", "_", names(dat)) 61 | 62 | return(dat) 63 | } 64 | dataset1 <- featureEngineering(dataset1) 65 | cat("New dim: ", dim(dataset1), "\n") 66 | 67 | 68 | sub <- data.frame(patientID = NULL, geo = NULL, class = NULL) 69 | for (GEO in 1:9) { 70 | print(GEO) 71 | dat <- dataset1[dataset1$geo == GEO, ] 72 | cat("New dim: ", dim(dat), "\n") 73 | if (nrow(dat) == 0) next 74 | patientID <- dat$patientID 75 | dat$patientID <- NULL 76 | 77 | if (GEO == 1) classes <- c("11","21","22") 78 | if (GEO == 2) classes <- c("11","12","21","22","31","41") 79 | if (GEO == 3) classes <- c("11","12","21","22") 80 | if (GEO == 4) classes <- c("11","12") 81 | if (GEO == 5) classes <- c("11","12","22","31","32") 82 | if (GEO == 6) classes <- c("11","12","21") 83 | if (GEO == 7) classes <- c("11","12","21","22","31") 84 | if (GEO == 8) classes <- c("11","21","31","41") 85 | if (GEO == 9) classes <- c("11","12","21","31","32") 86 | #LOAD XGB Model 87 | xgb_1000 <- readRDS(paste("src/downloaded_packages/xgb_geo_", GEO ,"_seed1000.model", sep = "")) 88 | 89 | xgb_test <- predict(xgb_1000, data.matrix(dat), missing=NA) 90 | xgb_test <- as.data.frame(matrix(xgb_test, 91 | nrow=nrow(dat), 92 | byrow = TRUE)) 93 | colnames(xgb_test) <- classes 94 | 95 | #LOAD RF Model 96 | rf_1000 <- readRDS(paste("src/downloaded_packages/rf_geo_", GEO ,"_seed1000.model", sep = "")) 97 | rf_test <- as.data.frame(predict(rf_1000, 98 | dat, 99 | type= "prob")) 100 | colnames(rf_test) <- classes 101 | 102 | #Combined Weightage 103 | final <- (xgb_test*0.4 + rf_test*0.6) 104 | final$NEW <- apply(final, 1, function(x) { 105 | m <- which.max(x) 106 | names(final)[m] 107 | }) 108 | sub <- rbind(sub, data.frame(patientID = patientID, geo = dat$geo, class = final$NEW)) 109 | } 110 | 111 | data.set <- data.frame(patientID = sub$patientID, 112 | Geo_Pred = sub$geo, 113 | Segment_Pred = as.integer(substring(sub$class, 1, 1)), 114 | Subgroup_Pred = as.integer(substring(sub$class, 2, 2)) 115 | ) 116 | 117 | print(str(data.set)) 118 | maml.mapOutputPort("data.set"); 119 | 120 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_json_to_cols.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Peter & Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 3_json_to_cols.py 5 | # Encodes json key similarity into a sparse format for feature extraction 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import sklearn 10 | import json 11 | from pandas.io.json import json_normalize 12 | import unicodedata 13 | import time 14 | import codecs 15 | import feather 16 | 17 | import libavito as a 18 | 19 | def jaccard_similarity(x, y): 20 | intersection_cardinality = len(set.intersection(*[set(x), set(y)])) 21 | union_cardinality = len(set.union(*[set(x), set(y)])) 22 | if union_cardinality == 0: 23 | return -1.0 24 | else: 25 | return intersection_cardinality / float(union_cardinality) 26 | 27 | ## Read settings required by script 28 | config = a.read_config() 29 | nthreads = config.preprocessing_nthreads 30 | cache_loc = config.cache_loc 31 | debug = config.debug 32 | df_train = feather.read_dataframe(cache_loc + 'train.fthr') 33 | df_test = feather.read_dataframe(cache_loc + 'test.fthr') 34 | 35 | df_train = 
df_train[['itemID_1', 'itemID_2', 'cleanjson_1', 'cleanjson_2']] 36 | df_test = df_test[['itemID_1', 'itemID_2', 'cleanjson_1', 'cleanjson_2']] 37 | 38 | df = pd.concat([df_train, df_test]) 39 | 40 | clean_jsons = df['cleanjson_1'].tolist() + df['cleanjson_2'].tolist() 41 | 42 | print('Creating key dict ... ') 43 | allkey = {} 44 | pa = 0 45 | t0 = time.time() 46 | for i in range(0, len(clean_jsons)): 47 | if i % 100000 == 0: 48 | a.print_progress(i, t0, len(clean_jsons)) 49 | try: 50 | jx = clean_jsons[i].replace("'", "") 51 | resx = json.loads(jx) 52 | for x in resx.keys(): 53 | if x in allkey: 54 | allkey[x] = allkey[x] + 1 55 | else: 56 | allkey[x] = 1 57 | except KeyboardInterrupt: 58 | raise 59 | except Exception as e: 60 | pa += 1 61 | 62 | t0 = time.time() 63 | print('Transforming key dict ... ', end='', flush=True) 64 | icount = 0 65 | keydict = {} 66 | for k, n in allkey.items(): 67 | keydict[k] = icount 68 | icount += 1 69 | a.print_elapsed(t0) 70 | 71 | ftrs_train = [] 72 | print('Generating for train ... ') 73 | t0 = time.time() 74 | pa = 0 75 | for i in range(0, len(df_train.index)): 76 | if i % 10000 == 0: 77 | a.print_progress(i, t0, len(df_train.index)) 78 | try: 79 | jx = df_train.iloc[i]['cleanjson_1'].replace("'", "") 80 | jy = df_train.iloc[i]['cleanjson_2'].replace("'", "") 81 | resx = json.loads(jx) 82 | resy = json.loads(jy) 83 | except KeyboardInterrupt: 84 | raise 85 | except: 86 | continue 87 | 88 | if resx != [] and resy != []: 89 | for key in set.union(*[set(resx.keys()), set(resy.keys())]): 90 | if key in resx.keys() and key in resy.keys(): 91 | c = resx[key] 92 | b = resy[key] 93 | res = jaccard_similarity(c, b) 94 | else: 95 | res = -1 96 | ftrs_train.append([df_train.iloc[i]['itemID_1'], df_train.iloc[i]['itemID_2'], str(keydict[key]), str(res)]) 97 | else: 98 | pa += 1 99 | 100 | ftrs_test = [] 101 | print('Generating for test ... ') 102 | t0 = time.time() 103 | for i in range(0, len(df_test.index)): 104 | if i % 10000 == 0: 105 | a.print_progress(i, t0, len(df_test.index)) 106 | try: 107 | jx = df_test.iloc[i]['cleanjson_1'].replace("'", '') 108 | jy = df_test.iloc[i]['cleanjson_2'].replace("'", '') 109 | resx = json.loads(jx) 110 | resy = json.loads(jy) 111 | except KeyboardInterrupt: 112 | raise 113 | except: 114 | continue 115 | 116 | if resx != [] and resy != []: 117 | for key in set.union(*[set(resx.keys()), set(resy.keys())]): 118 | if key in resx.keys() and key in resy.keys(): 119 | c = resx[key] 120 | b = resy[key] 121 | res = jaccard_similarity(c, b) 122 | else: 123 | res = -1 124 | ftrs_test.append([df_test.iloc[i]['itemID_1'], df_test.iloc[i]['itemID_2'], str(keydict[key]), str(res)]) 125 | else: 126 | pa += 1 127 | 128 | print("\nError rows: " + str(pa)) 129 | 130 | print(len(ftrs_train)) 131 | print(len(ftrs_test)) 132 | 133 | print('Tranforming features ... ', end='', flush=True) 134 | t0 = time.time() 135 | ftrs_train = pd.DataFrame(ftrs_train) 136 | ftrs_test = pd.DataFrame(ftrs_test) 137 | ftrs_train.columns = ['itemID_1', 'itemID_2', 'keyID', 'value'] 138 | ftrs_test.columns = ['itemID_1', 'itemID_2', 'keyID', 'value'] 139 | a.print_elapsed(t0) 140 | 141 | print('Caching data to disk ... 
', end='', flush=True) 142 | t0 = time.time() 143 | feather.write_dataframe(ftrs_train, cache_loc + 'json_vals_train_v2.fthr') 144 | feather.write_dataframe(ftrs_test, cache_loc + 'json_vals_test_v2.fthr') 145 | a.print_elapsed(t0) 146 | 147 | print('json_to_cols Complete!') 148 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/functions.R: -------------------------------------------------------------------------------- 1 | #### Copyright 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants 2 | #### Avito Duplicate Ad Detection 3 | # functions.R 4 | # TODO: WRITE DESCRIPTION OF SCRIPT HERE 5 | 6 | #Load Basic packages needed by all R scripts 7 | library(readr) 8 | library(dplyr) 9 | library(tidyr) 10 | library(feather) 11 | 12 | ######## GET NGRAMS FUNCTIONS 13 | getNGrams <- function(my.text, n = 1) { 14 | # which can be split into a vector of consecutive words: 15 | my.vector.of.words <- stemDocument(unlist(strsplit(gsub("\\s+", " ", str_trim(my.text)), " "))) 16 | # now, we create a vector of word n-grams: 17 | if (length(my.vector.of.words) >= n) { 18 | make.ngrams(my.vector.of.words, ngram.size = n) 19 | } else { 20 | return(NULL) 21 | } 22 | } 23 | ######## GET NCHARS FUNCTIONS 24 | getNGramsChars <- function(my.text, n = 1) { 25 | # which can be split into a vector of consecutive words: 26 | my.vector.of.words <- stemDocument(unlist(strsplit(gsub("\\s+", " ", str_trim(my.text)), " "))) 27 | # now, we create a vector of word n-grams: 28 | if (length(my.vector.of.words) >= n) { 29 | my.vector.of.chars = txt.to.features(my.vector.of.words, features = "c") 30 | make.ngrams(my.vector.of.chars, ngram.size = n) 31 | } else { 32 | return(NULL) 33 | } 34 | } 35 | 36 | ## NGRAMS 37 | getNgramsCount <- function(string1, string2, n = 1) { 38 | ####################################### 39 | # COUNTING NGRAMS FEATURES 40 | ####################################### 41 | #Generate Ngrams 42 | NgramsString1 <- getNGrams(tolower(string1), n) 43 | NgramsString2 <- getNGrams(tolower(string2), n) 44 | 45 | #Count of Ngrams 46 | countOfNgramsInString1 <- length(NgramsString1) 47 | countOfNgramsInString2 <- length(NgramsString2) 48 | ratioOfNgrams_String1_String2 <- round(countOfNgramsInString1 / countOfNgramsInString2, 3) 49 | 50 | #Count of Unique NGrams 51 | countOfUniqueNgramsInString1 <- length(unique(NgramsString1)) 52 | countOfUniqueNgramsInString2 <- length(unique(NgramsString2)) 53 | ratioOfUniqueNgrams_String1_String2 <- round(countOfUniqueNgramsInString1 / countOfUniqueNgramsInString2, 3) 54 | 55 | ratioOfIntersect_Ngrams_String1_in_String2 <- round(sum(NgramsString1 %in% NgramsString2) / countOfNgramsInString1, 3) 56 | ratioOfIntersect_Ngrams_String2_in_String1 <- round(sum(NgramsString2 %in% NgramsString1) / countOfNgramsInString2, 3) 57 | 58 | countOfNgramsInString_min <- min( countOfNgramsInString1, countOfNgramsInString2 ) 59 | countOfNgramsInString_max <- max( countOfNgramsInString1, countOfNgramsInString2 ) 60 | countOfNgramsInString_sum <- ( countOfNgramsInString1 + countOfNgramsInString2 ) 61 | countOfNgramsInString_diff <- abs( countOfNgramsInString1 - countOfNgramsInString2 ) 62 | 63 | return(c( 64 | countOfNgramsInString_min, 65 | countOfNgramsInString_max, 66 | countOfNgramsInString_sum, 67 | countOfNgramsInString_diff, 68 | countOfNgramsInString1, 69 | countOfNgramsInString2, 70 | countOfUniqueNgramsInString1, 71 | countOfUniqueNgramsInString2, 72 | ratioOfNgrams_String1_String2, 73 | 
ratioOfUniqueNgrams_String1_String2, 74 | ratioOfIntersect_Ngrams_String1_in_String2, 75 | ratioOfIntersect_Ngrams_String2_in_String1 76 | )) 77 | } 78 | 79 | ## NCHARS 80 | getNcharsCount <- function(string1, string2, n = 1) { 81 | ####################################### 82 | # COUNTING Nchars FEATURES 83 | ####################################### 84 | #Generate Nchars 85 | NcharsString1 <- getNGramsChars(tolower(string1), n) 86 | NcharsString2 <- getNGramsChars(tolower(string2), n) 87 | 88 | #Count of Nchars 89 | countOfNcharsInString1 <- length(NcharsString1) 90 | countOfNcharsInString2 <- length(NcharsString2) 91 | ratioOfNchars_String1_String2 <- round(countOfNcharsInString1 / countOfNcharsInString2, 3) 92 | 93 | #Count of Unique Nchars 94 | countOfUniqueNcharsInString1 <- length(unique(NcharsString1)) 95 | countOfUniqueNcharsInString2 <- length(unique(NcharsString2)) 96 | ratioOfUniqueNchars_String1_String2 <- round(countOfUniqueNcharsInString1 / countOfUniqueNcharsInString2, 3) 97 | 98 | ratioOfIntersect_Nchars_String1_in_String2 <- round(sum(NcharsString1 %in% NcharsString2) / countOfNcharsInString1, 3) 99 | ratioOfIntersect_Nchars_String2_in_String1 <- round(sum(NcharsString2 %in% NcharsString1) / countOfNcharsInString2, 3) 100 | 101 | countOfNcharsInString_min <- min( countOfNcharsInString1, countOfNcharsInString2 ) 102 | countOfNcharsInString_max <- max( countOfNcharsInString1, countOfNcharsInString2 ) 103 | countOfNcharsInString_sum <- ( countOfNcharsInString1 + countOfNcharsInString2 ) 104 | countOfNcharsInString_diff <- abs(( countOfNcharsInString1 - countOfNcharsInString2 )) 105 | 106 | return(c( 107 | countOfNcharsInString_min, 108 | countOfNcharsInString_max, 109 | countOfNcharsInString_sum, 110 | countOfNcharsInString_diff, 111 | countOfNcharsInString1, 112 | countOfNcharsInString2, 113 | countOfUniqueNcharsInString1, 114 | countOfUniqueNcharsInString2, 115 | ratioOfNchars_String1_String2, 116 | ratioOfUniqueNchars_String1_String2, 117 | ratioOfIntersect_Nchars_String1_in_String2, 118 | ratioOfIntersect_Nchars_String2_in_String1 119 | )) 120 | } 121 | 122 | 123 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set3f_hamming.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Peter & Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 3_feature_set3f_hamming.py 5 | # Creates features from image dHashes 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import sys 10 | import feather 11 | import time 12 | import gc 13 | from multiprocessing import Pool 14 | 15 | import libavito as a 16 | 17 | def debug(s): 18 | print(str(s)) 19 | time.sleep(1) 20 | 21 | print(a.c.BOLD + 'Extracting set3f image hamming features ...' 
+ a.c.END) 22 | 23 | # Get train/test mode from launch argument 24 | mode = a.get_mode(sys.argv, '3_feature_set3f_hamming.py') 25 | 26 | ## Read settings required by script 27 | config = a.read_config() 28 | nthreads = config.preprocessing_nthreads 29 | cache_loc = config.cache_loc 30 | #debug = config.debug 31 | if mode == 0: 32 | df = feather.read_dataframe(cache_loc + 'train.fthr') 33 | if mode == 1: 34 | df = feather.read_dataframe(cache_loc + 'test.fthr') 35 | 36 | root = config.images_root 37 | image_db = feather.read_dataframe(cache_loc + 'image_database.fthr') 38 | 39 | df = df[['itemID_1', 'itemID_2', 'images_array_1', 'images_array_2']] 40 | 41 | start = time.time() 42 | print('Preparing imageDB ... ', end='', flush=True) 43 | image_db.index = image_db['image'] 44 | nhash = image_db['FreqOfHash'].to_dict() 45 | ihash = image_db['imagehash'].to_dict() 46 | a.print_elapsed(start) 47 | 48 | def process_row(row): 49 | id1 = row[0] 50 | id2 = row[1] 51 | array_x = row[2] 52 | array_y = row[3] 53 | 54 | if array_x is not None: 55 | aux_x = array_x.replace(' ', '').split(',') 56 | else: 57 | aux_x = [] 58 | if array_y is not None: 59 | aux_y = array_y.replace(' ', '').split(',') 60 | else: 61 | aux_y = [] 62 | 63 | icount = [] 64 | missing = 0 65 | minhamming = 99999 66 | minhamming30 = 99999 67 | minhamming50 = 99999 68 | minhamming100 = 99999 69 | #maxn = 0 70 | for k in range(0, 9): 71 | icount.append(0) 72 | 73 | # Find out if some images are repeated very often 74 | maxnx = 0 75 | maxny = 0 76 | for ix in aux_x: 77 | ix = int(ix) 78 | if ix in nhash: 79 | if maxnx < nhash[ix]: 80 | maxnx = nhash[ix] 81 | 82 | for iy in aux_y: 83 | iy = int(iy) 84 | if iy in nhash: 85 | if maxny < nhash[iy]: 86 | maxny = nhash[iy] 87 | 88 | for ix in aux_x: 89 | for iy in aux_y: 90 | if ix in ihash and iy in ihash: 91 | try: 92 | a = int('0x' + ihash[ix], 16) 93 | b = int('0x' + ihash[iy], 16) 94 | hamming = bin(a ^ b).count("1") 95 | if hamming < 9: 96 | icount[hamming] = icount[hamming] + 1 97 | 98 | if hamming < minhamming: 99 | minhamming = hamming 100 | 101 | if nhash[ix] < 100 and nhash[iy] < 100: 102 | if minhamming100 > hamming: 103 | minhamming100 = hamming 104 | 105 | if nhash[ix] < 30 and nhash[iy] < 30: 106 | if minhamming30 > hamming: 107 | minhamming30 = hamming 108 | 109 | if nhash[ix] < 50 and nhash[iy] < 50: 110 | if minhamming50 > hamming: 111 | minhamming50 = hamming 112 | 113 | except: 114 | pass 115 | #debug(['break', ix, iy]) 116 | else: 117 | #debug(['missing', ix, iy]) 118 | missing = missing + 1 119 | 120 | vals = [id1, id2] + icount + [missing, minhamming, maxnx, maxny, minhamming30, minhamming50, minhamming100] 121 | if min(len(aux_x), len(aux_y)) > 0: 122 | return vals 123 | else: 124 | return [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 125 | 126 | ftrs = [] 127 | 128 | start = time.time() 129 | o = len(df.index) 130 | if nthreads == 1: 131 | print('Extracting features with 1 thread ...') 132 | k = 0 133 | # Iterate over files 134 | ftrs = [] 135 | for row in df.values: 136 | x = process_row(row) 137 | ftrs.append(x) 138 | k += 1 139 | if k % 100 == 0: 140 | a.print_progress(k, start, o) 141 | 142 | # Otherwise perform multi-threaded mapping 143 | else: 144 | print('Extracting features multi-threaded ... 
', end='', flush=True) 145 | pool = Pool(nthreads) 146 | ftrs = pool.map(process_row, df.values) 147 | pool.close() 148 | gc.collect() 149 | 150 | a.print_elapsed(start) 151 | 152 | ftrs = pd.DataFrame(ftrs) 153 | ftrs = ftrs.loc[ftrs[0] > 0] 154 | cols = ['itemID_1', 'itemID_2'] + [str(c) for c in ['ham' + str(i) for i in range(9)] + ['miss', 'minham', 'maxnx', 'maxny', 'minham30', 'minham50', 'minham100']] 155 | print(cols) 156 | ftrs.columns = cols 157 | 158 | # Save updated dataset 159 | if mode == 0: 160 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set3f.fthr') 161 | if mode == 1: 162 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set3f.fthr') 163 | 164 | a.print_elapsed(start) 165 | print('set3f extraction complete!') 166 | 167 | # Write status to status file so master script knows whether to proceed. 168 | f = open(cache_loc + 'status.txt', 'a') 169 | f.write('feature_set3f_OK\n') 170 | f.close() 171 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/2_image_info.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 2_image_info.py 5 | # Creates a database of images and metadata about them, including dHash 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import cv2 10 | import feather 11 | import glob 12 | import sys 13 | import time 14 | import os 15 | import gc 16 | from multiprocessing import Pool 17 | from PIL import Image 18 | from collections import Counter 19 | 20 | import libavito as a 21 | 22 | print(a.c.BOLD + 'Generating image info ...' + a.c.END) 23 | 24 | # Get train/test mode from launch argument 25 | mode = a.get_mode(sys.argv, '2_image_info.py') 26 | 27 | ## Read settings required by script 28 | config = a.read_config() 29 | nthreads = config.preprocessing_nthreads 30 | cache_loc = config.cache_loc 31 | debug = config.debug 32 | root = config.images_root 33 | 34 | # Function to compute difference hash of image 35 | def DifferenceHash(img): 36 | theImage = Image.fromarray(img) 37 | # Convert the image to 8-bit grayscale. 38 | theImage = theImage.convert("L") # 8-bit grayscale 39 | # Squeeze it down to an 8x8 image. 40 | theImage = theImage.resize((8, 8), Image.ANTIALIAS) 41 | # Go through the image pixel by pixel. 42 | # Return 1-bits when a pixel is equal to or brighter than the previous 43 | # pixel, and 0-bits when it's below. 44 | # Use the 64th pixel as the 0th pixel. 45 | previousPixel = theImage.getpixel((0, 7)) 46 | differenceHash = 0 47 | for row in range(0, 8, 2): 48 | # Go left to right on odd rows. 49 | for col in range(8): 50 | differenceHash <<= 1 51 | pixel = theImage.getpixel((col, row)) 52 | differenceHash |= 1 * (pixel >= previousPixel) 53 | previousPixel = pixel 54 | row += 1 55 | # Go right to left on even rows. 
56 | for col in range(7, -1, -1): 57 | differenceHash <<= 1 58 | pixel = theImage.getpixel((col, row)) 59 | differenceHash |= 1 * (pixel >= previousPixel) 60 | previousPixel = pixel 61 | return differenceHash 62 | 63 | def get_info(file_loc): 64 | try: 65 | # Get size of image 66 | size = os.path.getsize(file_loc) 67 | 68 | # Attempt to load image 69 | img = cv2.imread(file_loc) 70 | try: 71 | # Test if image is corrupt 72 | assert img.shape[0] * img.shape[1] > 0 73 | except: 74 | print('[WARNING] Image ' + file_loc + ' is corrupt, skipping.') 75 | raise 76 | 77 | # Get image metadata 78 | width = img.shape[1] 79 | height = img.shape[0] 80 | 81 | # Get ratio of image dimensions 82 | ratio = round(min(width, height) / max(width, height), 2) 83 | 84 | # Compute difference hash of image and convert to hex 85 | dhash = '%(hash)016x' % {"hash": DifferenceHash(img)} 86 | 87 | return [width, height, ratio, dhash, size] 88 | 89 | except KeyboardInterrupt: 90 | raise 91 | except: 92 | print('[WARNING] Image ' + file_loc + ' failed to process.') 93 | return [np.nan, np.nan, np.nan, np.nan, np.nan] 94 | 95 | def process_line(f): 96 | # Get image ID 97 | img_id = f.split('/')[-1].split('.')[0] 98 | # Retrieve info for image 99 | d = get_info(f) 100 | # Construct list and return 101 | info = [] 102 | info.append(img_id) 103 | info.extend(d) 104 | return info 105 | 106 | # Recursively glob for jpeg files in the image root 107 | start = time.time() 108 | print('Looking for images in ' + root + ' ... ', end='', flush=True) 109 | files = glob.glob(root + '**/*.jpg', recursive=True) 110 | a.print_elapsed(start) 111 | 112 | print('Found ' + str(len(files)) + ' images.') 113 | 114 | l_id = [] 115 | l_width = [] 116 | l_height = [] 117 | l_ratio = [] 118 | l_hash = [] 119 | l_size = [] 120 | o = len(files) 121 | if nthreads == 1: 122 | print('Extracting image info with 1 thread ...') 123 | k = 0 124 | # Iterate over files 125 | for f in files: 126 | x = process_line(f) 127 | l_id.append(x[0]) 128 | l_width.append(x[1]) 129 | l_height.append(x[2]) 130 | l_ratio.append(x[3]) 131 | l_hash.append(x[4]) 132 | l_size.append(x[5]) 133 | k += 1 134 | if k % 1000 == 0: 135 | a.print_progress(k, start, o) 136 | # Otherwise perform multi-threaded mapping 137 | else: 138 | print('Extracting image info multi-threaded ... ', end='', flush=True) 139 | pool = Pool(nthreads) 140 | newdata = pool.map(process_line, files) 141 | pool.close() 142 | for x in newdata: 143 | l_id.append(x[0]) 144 | l_width.append(x[1]) 145 | l_height.append(x[2]) 146 | l_ratio.append(x[3]) 147 | l_hash.append(x[4]) 148 | l_size.append(x[5]) 149 | del newdata 150 | gc.collect() 151 | 152 | a.print_elapsed(start) 153 | 154 | print('Finding hash-counts ...', end='', flush=True) 155 | start = time.time() 156 | counttable = Counter(l_hash) 157 | l_hashcount = [] 158 | for h in l_hash: 159 | l_hashcount.append(counttable[h]) 160 | a.print_elapsed(start) 161 | 162 | # Bind lists to dataframe 163 | df = pd.DataFrame() 164 | df['image'] = l_id 165 | df['width'] = l_width 166 | df['height'] = l_height 167 | df['ratioOfDimension'] = l_ratio 168 | df['imagehash'] = l_hash 169 | df['FreqOfHash'] = l_hashcount 170 | df['imagesize'] = l_size 171 | 172 | start = time.time() 173 | print('Caching image data ... 
', end='', flush=True) 174 | 175 | # Save updated dataset 176 | feather.write_dataframe(df, cache_loc + 'image_database.fthr') 177 | df.to_csv(cache_loc + 'image_database.csv', index=False) 178 | 179 | a.print_elapsed(start) 180 | print('Image info extraction complete!') 181 | 182 | # Write status to status file so master script knows whether to proceed. 183 | f = open(cache_loc + 'status.txt', 'a') 184 | f.write('image_info_OK\n') 185 | f.close() 186 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set1a_ngram.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | ################################################################################################ 3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants 4 | #### Competition: Avito Duplicate Ad Detection 5 | # Filename : 3_feature_set1a_ngram.R 6 | # Description: This Rscript generates all ngram features 7 | # Usage: 8 | # Rscript ./code/3_feature_set1a_ngram.R train 9 | # Rscript ./code/3_feature_set1a_ngram.R test 10 | # Default argument is test 11 | ################################################################################################ 12 | ################################################################################################ 13 | 14 | args <- commandArgs(trailingOnly = F) 15 | BASE <- normalizePath(dirname(sub("^--file=", "", args[grep("^--file=", args)]))) 16 | 17 | # Source Config and functions.R file 18 | source(paste(BASE, "/../config.cfg", sep = "")) 19 | source(paste(BASE_DIR, "/code/functions.R", sep = "")) 20 | 21 | #Load any additional packages 22 | library(parallel) 23 | library(stylo) 24 | library(stringr) 25 | library(tm) 26 | 27 | # Read argument for train or test 28 | trainOrTest <- commandArgs(trailingOnly = TRUE) 29 | if (length(trainOrTest) > 1) { 30 | stop("ERROR: I need only 1 argument : train or test") 31 | } 32 | 33 | if (length(trainOrTest) == 0) { 34 | print("No Arguments passed, Assuming you mean test") 35 | trainOrTest <- "test" 36 | } 37 | 38 | #Load data 39 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "") 40 | cat("Reading file ", FILENAME, "\n", sep = " ") 41 | dat <- read_csv(FILENAME) 42 | 43 | ####################################### 44 | # Start generating Features for DESCRIPTION columns 45 | print("Start generating nGrams Features for DESCRIPTION columns") 46 | for (n in 1:3) { 47 | print(n) 48 | df2 <- data.frame(t(mcmapply(getNgramsCount, dat$cleandesc_1, dat$cleandesc_2, n, USE.NAMES = FALSE, mc.cores = preprocessing_nthreads))) 49 | colnames(df2) <- c( 50 | paste("countOf_", n, "_Grams_description_min", sep = ""), 51 | paste("countOf_", n, "_Grams_description_max", sep = ""), 52 | paste("countOf_", n, "_Grams_description_sum", sep = ""), 53 | paste("countOf_", n, "_Grams_description_diff", sep = ""), 54 | 55 | paste("countOf_", n, "_Grams_cleandesc_1", sep = ""), 56 | paste("countOf_", n, "_Grams_cleandesc_2", sep = ""), 57 | paste("countOfUnique_", n, "_Grams_cleandesc_1", sep = ""), 58 | paste("countOfUnique_", n, "_Grams_cleandesc_2", sep = ""), 59 | paste("ratioOf_", n, "_Grams_cleandesc_1_cleandesc_2", sep = ""), 60 | paste("ratioOfUnique_", n, "_Grams_cleandesc_1_cleandesc_2", sep = ""), 61 | paste("ratioOfIntersect_", n, "_Grams_cleandesc_1_in_cleandesc_2", sep = ""), 62 | 
paste("ratioOfIntersect_", n, "_Grams_cleandesc_2_in_cleandesc_1", sep = "") 63 | ) 64 | if (nrow(df2) != nrow(dat)) { 65 | cat("Expecting", nrow(dat), "Got", nrow(df2), "\n", sep = " ") 66 | stop("mcmapply is behaving weird. Getting less results") 67 | } 68 | 69 | if (exists("df_master")) { 70 | df_master <- bind_cols(df_master, df2) 71 | } else { 72 | df_master <- df2 73 | } 74 | } 75 | names(df_master) <- paste("set1a", names(df_master), sep = "_") 76 | 77 | ######## Add Primary Columns ItemID1 and ItemID2 78 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master) 79 | print("Saving Description ngrams features") 80 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1a_", "ngram_description.fthr", sep = "" )) 81 | rm(df_master, df2) 82 | gc() 83 | 84 | 85 | ####################################### 86 | # Start generating Features for TITLE columns 87 | print("Start generating nGrams Features for TITLE columns") 88 | for (n in 1:3) { 89 | print(n) 90 | df2 <- data.frame(t(mcmapply(getNgramsCount, dat$cleantitle_1, dat$cleantitle_2, n, USE.NAMES = FALSE, mc.cores = preprocessing_nthreads))) 91 | colnames(df2) <- c( 92 | paste("countOf_", n, "_Grams_title_min", sep = ""), 93 | paste("countOf_", n, "_Grams_title_max", sep = ""), 94 | paste("countOf_", n, "_Grams_title_sum", sep = ""), 95 | paste("countOf_", n, "_Grams_title_diff", sep = ""), 96 | 97 | paste("countOf_", n, "_Grams_cleantitle_1", sep = ""), 98 | paste("countOf_", n, "_Grams_cleantitle_2", sep = ""), 99 | paste("countOfUnique_", n, "_Grams_cleantitle_1", sep = ""), 100 | paste("countOfUnique_", n, "_Grams_cleantitle_2", sep = ""), 101 | paste("ratioOf_", n, "_Grams_cleantitle_1_cleantitle_2", sep = ""), 102 | paste("ratioOfUnique_", n, "_Grams_cleantitle_1_cleantitle_2", sep = ""), 103 | paste("ratioOfIntersect_", n, "_Grams_cleantitle_1_in_cleantitle_2", sep = ""), 104 | paste("ratioOfIntersect_", n, "_Grams_cleantitle_2_in_cleantitle_1", sep = "") 105 | ) 106 | 107 | if (nrow(df2) != nrow(dat)) { 108 | cat("Expecting", nrow(dat), "Got", nrow(df2), "\n", sep = " ") 109 | stop("mcmapply is behaving weird. 
Getting less results") 110 | } 111 | 112 | if (exists("df_master")) { 113 | df_master <- bind_cols(df_master, df2) 114 | } else { 115 | df_master <- df2 116 | } 117 | } 118 | names(df_master) <- paste("set1a", names(df_master), sep = "_") 119 | 120 | ######## Add Primary Columns ItemID1 and ItemID2 121 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master) 122 | print("Saving Title ngrams features") 123 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1a_", "ngram_title.fthr", sep = "" )) 124 | rm(df_master, df2) 125 | gc() 126 | 127 | #END 128 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set1b_nchar.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | ################################################################################################ 3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants 4 | #### Competition: Avito Duplicate Ad Detection 5 | # Filename : 3_feature_set1b_nchar.R 6 | # Description: This Rscript generates all nchar features 7 | # Usage: 8 | # Rscript ./code/3_feature_set1b_nchar.R train 9 | # Rscript ./code/3_feature_set1b_nchar.R test 10 | # Default argument is test 11 | ################################################################################################ 12 | ################################################################################################ 13 | 14 | args <- commandArgs(trailingOnly = F) 15 | BASE <- normalizePath(dirname(sub("^--file=", "", args[grep("^--file=", args)]))) 16 | 17 | # Source Config and functions.R file 18 | source(paste(BASE, "/../config.cfg", sep = "")) 19 | source(paste(BASE_DIR, "/code/functions.R", sep = "")) 20 | 21 | #Load any additional packages 22 | library(parallel) 23 | library(stylo) 24 | library(stringr) 25 | library(tm) 26 | 27 | # Read argument for train or test 28 | trainOrTest <- commandArgs(trailingOnly = TRUE) 29 | if (length(trainOrTest) > 1) { 30 | stop("ERROR: I need only 1 argument : train or test") 31 | } 32 | 33 | if (length(trainOrTest) == 0) { 34 | print("No Arguments passed, Assuming you mean test") 35 | trainOrTest <- "test" 36 | } 37 | 38 | #Load data 39 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "") 40 | cat("Reading file ", FILENAME, "\n", sep = " ") 41 | dat <- read_csv(FILENAME) 42 | 43 | 44 | ####################################### 45 | # Start generating Features for DESCRIPTION columns 46 | print("Start generating nChars Features for DESCRIPTION columns") 47 | for (n in 1:3) { 48 | print(n) 49 | df2 <- data.frame(t(mcmapply(getNcharsCount, dat$cleandesc_1, dat$cleandesc_2, n, USE.NAMES = FALSE, mc.cores = preprocessing_nthreads))) 50 | colnames(df2) <- c( 51 | paste("countOf_", n, "_Chars_description_min", sep = ""), 52 | paste("countOf_", n, "_Chars_description_max", sep = ""), 53 | paste("countOf_", n, "_Chars_description_sum", sep = ""), 54 | paste("countOf_", n, "_Chars_description_diff", sep = ""), 55 | 56 | paste("countOf_", n, "_Chars_cleandesc_1", sep = ""), 57 | paste("countOf_", n, "_Chars_cleandesc_2", sep = ""), 58 | paste("countOfUnique_", n, "_Chars_cleandesc_1", sep = ""), 59 | paste("countOfUnique_", n, "_Chars_cleandesc_2", sep = ""), 60 | paste("ratioOf_", n, "_Chars_cleandesc_1_cleandesc_2", sep = ""), 61 | paste("ratioOfUnique_", 
n, "_Chars_cleandesc_1_cleandesc_2", sep = ""), 62 | paste("ratioOfIntersect_", n, "_chars_cleandesc_1_in_cleandesc_2", sep = ""), 63 | paste("ratioOfIntersect_", n, "_chars_cleandesc_2_in_cleandesc_1", sep = "") 64 | ) 65 | if (nrow(df2) != nrow(dat)) { 66 | cat("Expecting", nrow(dat), "Got", nrow(df2), "\n", sep = " ") 67 | stop("mcmapply is behaving weird. Getting less results") 68 | } 69 | 70 | if (exists("df_master")) { 71 | df_master <- bind_cols(df_master, df2) 72 | } else { 73 | df_master <- df2 74 | } 75 | } 76 | 77 | names(df_master) <- paste("set1b", names(df_master), sep = "_") 78 | 79 | ######## Add Primary Columns ItemID1 and ItemID2 80 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master) 81 | print("Saving Description nchars features") 82 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1b_", "nchar_description.fthr", sep = "" )) 83 | rm(df_master, df2) 84 | gc() 85 | 86 | ####################################### 87 | # Start generating Features for TITLE columns 88 | print("Start generating nChars Features for TITLE columns") 89 | for (n in 1:3) { 90 | print(n) 91 | df2 <- data.frame(t(mcmapply(getNcharsCount, dat$cleantitle_1, dat$cleantitle_2, n, USE.NAMES = FALSE, mc.cores = preprocessing_nthreads))) 92 | colnames(df2) <- c( 93 | paste("countOf_", n, "_Chars_title_min", sep = ""), 94 | paste("countOf_", n, "_Chars_title_max", sep = ""), 95 | paste("countOf_", n, "_Chars_title_sum", sep = ""), 96 | paste("countOf_", n, "_Chars_title_diff", sep = ""), 97 | 98 | paste("countOf_", n, "_Chars_cleantitle_1", sep = ""), 99 | paste("countOf_", n, "_Chars_cleantitle_2", sep = ""), 100 | paste("countOfUnique_", n, "_Chars_cleantitle_1", sep = ""), 101 | paste("countOfUnique_", n, "_Chars_cleantitle_2", sep = ""), 102 | paste("ratioOf_", n, "_Chars_cleantitle_1_cleantitle_2", sep = ""), 103 | paste("ratioOfUnique_", n, "_Chars_cleantitle_1_cleantitle_2", sep = ""), 104 | paste("ratioOfIntersect_", n, "_chars_cleantitle_1_in_cleantitle_2", sep = ""), 105 | paste("ratioOfIntersect_", n, "_chars_cleantitle_2_in_cleantitle_1", sep = "") 106 | ) 107 | if (nrow(df2) != nrow(dat)) { 108 | cat("Expecting", nrow(dat), "Got", nrow(df2), "\n", sep = " ") 109 | stop("mcmapply is behaving weird. 
Getting less results") 110 | } 111 | 112 | if (exists("df_master")) { 113 | df_master <- bind_cols(df_master, df2) 114 | } else { 115 | df_master <- df2 116 | } 117 | } 118 | names(df_master) <- paste("set1b", names(df_master), sep = "_") 119 | 120 | ######## Add Primary Columns ItemID1 and ItemID2 121 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master) 122 | print("Saving Title nchars features") 123 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1b_", "nchar_title.fthr", sep = "" )) 124 | rm(df_master, df2) 125 | gc() 126 | 127 | #END 128 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/5_consolidate_features.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | ################################################################################################ 3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants 4 | #### Competition: Avito Duplicate Ad Detection 5 | # Filename : 5_consolidate_features.R 6 | # Description: This Rscript joins all generated feature sets into the final feature file 7 | # Usage: 8 | # Rscript ./code/5_consolidate_features.R train 9 | # Rscript ./code/5_consolidate_features.R test 10 | # Default argument is test 11 | ################################################################################################ 12 | ################################################################################################ 13 | 14 | # Source Config and functions.R file 15 | source("config.cfg") 16 | source("./code/functions.R") 17 | 18 | library(readr) 19 | library(dplyr) 20 | library(feather) 21 | 22 | 23 | # Read argument for train or test 24 | trainOrTest <- commandArgs(trailingOnly = TRUE) 25 | if (length(trainOrTest) > 1) { 26 | stop("ERROR: I need only 1 argument : train or test") 27 | } 28 | 29 | if (length(trainOrTest) == 0) { 30 | print("No Arguments passed, Assuming you mean test") 31 | trainOrTest <- "test" 32 | } 33 | 34 | #Load data 35 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "") 36 | cat("Reading file ", FILENAME, "\n", sep = " ") 37 | completeDate <- read_csv(FILENAME) 38 | if (trainOrTest == "train") { 39 | completeDate <- completeDate[, c("itemID_1", "itemID_2", "isDuplicate")] 40 | gc() 41 | } else { 42 | completeDate <- completeDate[, c("id", "itemID_1", "itemID_2")] 43 | gc() 44 | } 45 | 46 | ngram_title <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1a_ngram_title.fthr", sep = "" )) 47 | completeDate <- left_join(completeDate, ngram_title, by = c("itemID_1", "itemID_2")) 48 | rm(ngram_title) 49 | 50 | ngram_description <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1a_ngram_description.fthr", sep = "" )) 51 | completeDate <- left_join(completeDate, ngram_description, by = c("itemID_1", "itemID_2")) 52 | rm(ngram_description) 53 | 54 | nchar_title <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1b_nchar_title.fthr", sep = "" )) 55 | completeDate <- left_join(completeDate, nchar_title, by = c("itemID_1", "itemID_2")) 56 | rm(nchar_title) 57 | 58 | nchar_description <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1b_nchar_description.fthr", sep = "" )) 59 | completeDate <- left_join(completeDate, nchar_description, by = c("itemID_1", "itemID_2")) 60 | rm(nchar_description) 61 | 62 | misc <- 
read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1c_misc.fthr", sep = "" )) 63 | completeDate <- left_join(completeDate, misc, by = c("itemID_1", "itemID_2")) 64 | rm(misc) 65 | 66 | interaction <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1d_interaction.fthr", sep = "" )) 67 | completeDate <- left_join(completeDate, interaction, by = c("itemID_1", "itemID_2")) 68 | rm(interaction) 69 | 70 | attributes <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1e_attributes.fthr", sep = "" )) 71 | completeDate <- left_join(completeDate, attributes, by = c("itemID_1", "itemID_2")) 72 | rm(attributes) 73 | 74 | specialCounting <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1f_specialCounting.fthr", sep = "" )) 75 | completeDate <- left_join(completeDate, specialCounting, by = c("itemID_1", "itemID_2")) 76 | rm(specialCounting) 77 | 78 | capitalLetters <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1g_capitalLetters.fthr", sep = "" )) 79 | completeDate <- left_join(completeDate, capitalLetters, by = c("itemID_1", "itemID_2")) 80 | rm(capitalLetters) 81 | 82 | image <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1h_image.fthr", sep = "" )) 83 | completeDate <- left_join(completeDate, image, by = c("itemID_1", "itemID_2")) 84 | rm(image) 85 | 86 | imageSize <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1i_imageSize.fthr", sep = "" )) 87 | completeDate <- left_join(completeDate, imageSize, by = c("itemID_1", "itemID_2")) 88 | rm(imageSize) 89 | 90 | 91 | 92 | location_levenshtein <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set2a_location_levenshtein.fthr", sep = "" )) 93 | completeDate <- left_join(completeDate, location_levenshtein, by = c("itemID_1", "itemID_2")) 94 | rm(location_levenshtein) 95 | 96 | brisk <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set2b_brisk.fthr", sep = "" )) 97 | completeDate <- left_join(completeDate, brisk, by = c("itemID_1", "itemID_2")) 98 | rm(brisk) 99 | 100 | histogram <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set2c_histogram.fthr", sep = "" )) 101 | completeDate <- left_join(completeDate, histogram, by = c("itemID_1", "itemID_2")) 102 | rm(histogram) 103 | 104 | 105 | consolidated <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set3_consolidated.fthr", sep = "" )) 106 | completeDate <- left_join(completeDate, consolidated, by = c("itemID_1", "itemID_2")) 107 | rm(consolidated) 108 | 109 | 110 | fuzzy <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set4a_fuzzy.fthr", sep = "" )) 111 | completeDate <- left_join(completeDate, fuzzy, by = c("itemID_1", "itemID_2")) 112 | rm(fuzzy) 113 | 114 | fuzzy_clean <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set4b_fuzzy_clean.fthr", sep = "" )) 115 | completeDate <- left_join(completeDate, fuzzy_clean, by = c("itemID_1", "itemID_2")) 116 | rm(fuzzy_clean) 117 | 118 | alternate <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set4c_alternate.fthr", sep = "" )) 119 | completeDate <- left_join(completeDate, alternate, by = c("itemID_1", "itemID_2")) 120 | rm(alternate) 121 | 122 | similarity <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set4d_similarity.fthr", sep = "" )) 123 | completeDate <- left_join(completeDate, similarity, by = c("itemID_1", "itemID_2")) 124 | rm(similarity) 125 | gc() 126 | 127 | print("Saving Final Files") 128 | write_feather(completeDate, paste("cache/final_featureSet_", 
trainOrTest, ".fthr", sep = "" )) 129 | print("DONE") 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set4c_alternate.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Marios & Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 3_feature_set4b_fuzzy_clean.py 5 | # Creates various text similarity features 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import sys 10 | import jellyfish 11 | import feather 12 | import time 13 | import gc 14 | import re 15 | import math 16 | from collections import Counter 17 | from fuzzywuzzy import fuzz 18 | from multiprocessing import Pool 19 | 20 | import libavito as a 21 | 22 | WORD = re.compile(r'\w+') 23 | 24 | def get_cosine(text1, text2): 25 | vec1 = text_to_vector(text1) 26 | vec2 = text_to_vector(text2) 27 | intersection = set(vec1.keys()) & set(vec2.keys()) 28 | numerator = sum([vec1[x] * vec2[x] for x in intersection]) 29 | 30 | sum1 = sum([vec1[x]**2 for x in vec1.keys()]) 31 | sum2 = sum([vec2[x]**2 for x in vec2.keys()]) 32 | denominator = math.sqrt(sum1) * math.sqrt(sum2) 33 | 34 | if not denominator: 35 | return 0.0 36 | else: 37 | return float(numerator) / denominator 38 | 39 | def text_to_vector(text): 40 | words = WORD.findall(text) 41 | return Counter(words) 42 | 43 | 44 | def count_2words_together(words, text, ranges): 45 | count2 = 0 46 | if len(words) < 2 or len(text) < 2: 47 | return -1 48 | else: 49 | for m in range(0, len(words) - 1): 50 | words1 = words[m] 51 | for n in range(m + 1, len(words)): 52 | words2 = words[n] 53 | if words1 in text: 54 | ind = text.index(words1) 55 | try: 56 | words2 in text[ind + 1:ind + 1 + ranges] 57 | count2 += 1 58 | except: 59 | pass 60 | return count2 61 | 62 | def count_2words(words, text): 63 | # To count how many times of the search terms having two words at least showing in texts. 
64 | count2 = 0 65 | if len(words) < 2 or len(text) < 2: 66 | return -1 67 | else: 68 | for m in range(0, len(words) - 1): 69 | words1 = words[m] 70 | for n in range(m + 1, len(words)): 71 | words2 = words[n] 72 | if words1 in text and words2 in text: 73 | count2 += 1 74 | return count2 75 | 76 | def calculate_similarity_simple(str1, str2): 77 | count = 0 78 | if str1 in str2: 79 | count = 1 80 | return count 81 | 82 | def calculate_similarity_split(str1, str2): 83 | count = 0 84 | countabs = 0 85 | countper = 0 86 | split1 = str1.split(" ") 87 | split2 = str2.split(" ") 88 | for s1 in split1: 89 | for s2 in split2: 90 | if s1 in s2: 91 | count += 1 92 | if s1 == s2: 93 | countabs += 1 94 | countper += 1 95 | 96 | return count, countabs, countabs / (countper + 1) 97 | 98 | def process_row(row): 99 | 100 | title = 2 101 | desc = 4 102 | json = 6 103 | 104 | pairs = [[title, desc], [desc, title], [title, json], [json, title], [desc, json], [json, desc]] 105 | values = [] 106 | # string feature counts 107 | 108 | values.append(row[0]) 109 | values.append(row[1]) 110 | 111 | for d, s in pairs: 112 | st_1 = str(row[d]).replace(":", " ") 113 | st_2 = str(row[s + 1]).replace(":", " ") 114 | values.append(calculate_similarity_simple(st_1, st_2)) 115 | val1, val2, val3 = calculate_similarity_split(st_1, st_2) 116 | values.append(val1) 117 | values.append(val2) 118 | values.append(val3) 119 | st_1_array = st_1.split(" ") 120 | st_2_array = st_2.split(" ") 121 | values.append(count_2words(st_1_array, st_2_array)) 122 | values.append(get_cosine(st_1, st_2)) 123 | values.append(count_2words_together(st_1_array, st_2_array, 1)) 124 | values.append(count_2words_together(st_1_array, st_2_array, 5)) 125 | 126 | return values 127 | 128 | print(a.c.BOLD + 'Extracting set4c alternate text features ...' + a.c.END) 129 | 130 | # Get train/test mode from launch argument 131 | mode = a.get_mode(sys.argv, '3_feature_set4c_fuzzy_clean.py') 132 | 133 | ## Read settings required by script 134 | config = a.read_config() 135 | nthreads = config.preprocessing_nthreads 136 | cache_loc = config.cache_loc 137 | debug = config.debug 138 | if mode == 0: 139 | root = config.train_images_root 140 | df = feather.read_dataframe(cache_loc + 'train.fthr') 141 | if mode == 1: 142 | root = config.test_images_root 143 | df = feather.read_dataframe(cache_loc + 'test.fthr') 144 | 145 | df = df[['itemID_1', 'itemID_2', 'title_1', 'title_2', 'description_1', 'description_2', 'attrsJSON_1', 'attrsJSON_2']] 146 | 147 | ftrs = [] 148 | 149 | start = time.time() 150 | o = len(df.index) 151 | if nthreads == 1: 152 | print('Extracting features with 1 thread ...') 153 | k = 0 154 | # Iterate over files 155 | ftrs = [] 156 | for row in df.values: 157 | x = process_row(row) 158 | ftrs.append(x) 159 | k += 1 160 | if k % 100 == 0: 161 | a.print_progress(k, start, o) 162 | 163 | # Otherwise perform multi-threaded mapping 164 | else: 165 | print('Extracting features multi-threaded ... 
', end='', flush=True) 166 | pool = Pool(nthreads) 167 | ftrs = pool.map(process_row, df.values) 168 | pool.close() 169 | gc.collect() 170 | 171 | a.print_elapsed(start) 172 | 173 | ftrs = pd.DataFrame(ftrs) 174 | cols = ['itemID_1', 'itemID_2'] + ['set4c_X' + str(i) for i in range(1, len(ftrs.columns.tolist()) - 1)] 175 | print(cols) 176 | ftrs.columns = cols 177 | 178 | # Save updated dataset 179 | if mode == 0: 180 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set4c_alternate.fthr') 181 | if mode == 1: 182 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set4c_alternate.fthr') 183 | 184 | a.print_elapsed(start) 185 | print('set4c extraction complete!') 186 | 187 | # Write status to status file so master script knows whether to proceed. 188 | f = open(cache_loc + 'status.txt', 'a') 189 | f.write('feature_set4c_OK\n') 190 | f.close() 191 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set1c_misc.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | ################################################################################################ 3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants 4 | #### Competition: Avito Duplicate Ad Detection 5 | # Filename : 3_feature_set1c_misc.R 6 | # Description: This Rscript generates all ngram features 7 | # Usage: 8 | # Rscript ./code/3_feature_set1c_misc.R train 9 | # Rscript ./code/3_feature_set1c_misc.R test 10 | # Default argument is test 11 | ################################################################################################ 12 | ################################################################################################ 13 | 14 | args <- commandArgs(trailingOnly = F) 15 | BASE <- normalizePath(dirname(sub("^--file=", "", args[grep("^--file=", args)]))) 16 | 17 | # Source Config and functions.R file 18 | source(paste(BASE, "/../config.cfg", sep = "")) 19 | source(paste(BASE_DIR, "/code/functions.R", sep = "")) 20 | 21 | #Load any additional packages 22 | library(parallel) 23 | library(stylo) 24 | 25 | # Read argument for train or test 26 | trainOrTest <- commandArgs(trailingOnly = TRUE) 27 | if (length(trainOrTest) > 1) { 28 | stop("ERROR: I need only 1 argument : train or test") 29 | } 30 | 31 | if (length(trainOrTest) == 0) { 32 | print("No Arguments passed, Assuming you mean test") 33 | trainOrTest <- "test" 34 | } 35 | 36 | #Load data 37 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "") 38 | cat("Reading file ", FILENAME, "\n", sep = " ") 39 | dat <- read_csv(FILENAME) 40 | 41 | 42 | ######## IDs and Long and Lat Features 43 | print("Generating Binary features ") 44 | isMetroIdSame <- ifelse(dat$metroID_1 == dat$metroID_2, 1, 0) 45 | isLocationIDSame <- ifelse(dat$locationID_1 == dat$locationID_2, 1, 0) 46 | isRegionIDSame <- ifelse(dat$regionID_1 == dat$regionID_2, 1, 0) 47 | isLongitudeSame <- ifelse(round(dat$lon_1, 2) == round(dat$lon_2, 2), 1, 0) 48 | isLatitudeSame <- ifelse(round(dat$lat_1, 2) == round(dat$lat_2, 2), 1, 0) 49 | isTitleSame <- ifelse(tolower(dat$cleantitle_1) == tolower(dat$cleantitle_2), 1, 0) #isTitle Same 50 | isdescriptionSame <- ifelse(tolower(dat$cleandesc_1) == tolower(dat$cleandesc_2), 1, 0) #isdescription Same 51 | 52 | ######## PRICE Features 53 | print("Generating Price features ") 54 | 
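# The ratio features below are made order-invariant: ratioOfPrices (and, further down,
# ratioOfNumberOfImages) is inverted whenever it exceeds 1, so for positive values it
# always lies in (0, 1] and swapping item 1 with item 2 gives the same feature.
# For example, prices of 100 and 250 yield 100/250 = 0.4 in either order.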
priceDiff <- abs(dat$price_1 - dat$price_2) 55 | ratioOfPrices <- dat$price_1 / dat$price_2 56 | ratioOfPrices <- round(ifelse(ratioOfPrices > 1, 1/ratioOfPrices, ratioOfPrices), 3) 57 | both_price_na <- ifelse(is.na(dat$price_1) & is.na(dat$price_2), 1, 0) #Both Price NA 58 | one_price_na <- ifelse(is.na(dat$price_1) | is.na(dat$price_2), 1, 0) #One Price NA 59 | total_price <- (dat$price_1 + dat$price_2) #Total Price 60 | 61 | 62 | ######## IMAGE Features 63 | print("Generating Image features") 64 | library(stringr) 65 | imageCount_sum <- str_count(dat$images_array_1, '[0-9.]+') + str_count(dat$images_array_2, '[0-9.]+') 66 | imageCount_diff <- abs(str_count(dat$images_array_1, '[0-9.]+') - str_count(dat$images_array_2, '[0-9.]+')) 67 | imageCount_min <- pmin(str_count(dat$images_array_1, '[0-9.]+'), str_count(dat$images_array_2, '[0-9.]+'), na.rm = F) 68 | imageCount_max <- pmax(str_count(dat$images_array_1, '[0-9.]+'), str_count(dat$images_array_2, '[0-9.]+'), na.rm = F) 69 | ratioOfNumberOfImages <- str_count(dat$images_array_1, '[0-9.]+') / str_count(dat$images_array_2, '[0-9.]+') 70 | ratioOfNumberOfImages <- round(ifelse(ratioOfNumberOfImages > 1, 1/ratioOfNumberOfImages, ratioOfNumberOfImages), 3) 71 | 72 | ######## DISTANCE STRING Features 73 | library(stringdist) 74 | print("Generating Text Distance features for title") 75 | titleDistance_cosine <- round(mcmapply(stringdist, dat$cleantitle_1, dat$cleantitle_2, method = "cosine", USE.NAMES = F, mc.cores = preprocessing_nthreads), 3) 76 | titleDistance_hamming <- round(mcmapply(stringdist, dat$cleantitle_1, dat$cleantitle_2, method = "hamming", USE.NAMES = F, mc.cores = preprocessing_nthreads), 3) 77 | titleDistance_jaccard <- round(mcmapply(stringdist, dat$cleantitle_1, dat$cleantitle_2, method = "jaccard", USE.NAMES = F, mc.cores = preprocessing_nthreads), 3) 78 | 79 | print("Generating Text Distance features for description") 80 | descriptionDistance_cosine <- round(mcmapply(stringdist, dat$cleandesc_1, dat$cleandesc_2, method = "cosine", USE.NAMES = F, mc.cores = preprocessing_nthreads), 3) 81 | 82 | descriptionDistance_hamming <- round(mcmapply(stringdist, dat$cleandesc_1, dat$cleandesc_2, method = "hamming", USE.NAMES = F, mc.cores = preprocessing_nthreads), 3) 83 | 84 | descriptionDistance_jaccard <- round(mcmapply(stringdist, dat$cleandesc_1, dat$cleandesc_2, method = "jaccard", USE.NAMES = F, mc.cores = preprocessing_nthreads), 3) 85 | 86 | 87 | ######## DATA FRAME 88 | df_master <- data.frame( isMetroIdSame = isMetroIdSame, 89 | isLocationIDSame = isLocationIDSame, 90 | isRegionIDSame = isRegionIDSame, 91 | isLongitudeSame = isLongitudeSame, 92 | isLatitudeSame = isLatitudeSame, 93 | isTitleSame = isTitleSame, 94 | isdescriptionSame = isdescriptionSame, 95 | priceDiff = priceDiff, 96 | ratioOfPrices = ratioOfPrices, 97 | both_price_na = both_price_na, 98 | one_price_na = one_price_na, 99 | total_price = total_price, 100 | imageCount_sum = imageCount_sum, 101 | imageCount_diff = imageCount_diff, 102 | imageCount_min = imageCount_min, 103 | imageCount_max = imageCount_max, 104 | ratioOfNumberOfImages = ratioOfNumberOfImages, 105 | titleDistance_cosine = titleDistance_cosine, 106 | titleDistance_hamming = titleDistance_hamming, 107 | titleDistance_jaccard = titleDistance_jaccard, 108 | descriptionDistance_cosine = descriptionDistance_cosine, 109 | descriptionDistance_hamming = descriptionDistance_hamming, 110 | descriptionDistance_jaccard = descriptionDistance_jaccard 111 | ) 112 | 113 | set1d <- df_master #making a copy 
for geenrating interaction features. Need to do this before renaming columns 114 | 115 | names(df_master) <- paste("set1c", names(df_master), sep = "_") 116 | ######## Add Primary Columns ItemID1 and ItemID2 117 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master) 118 | print("Saving Misc features") 119 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1c_", "misc.fthr", sep = "" )) 120 | 121 | # Start Interaction feature script 122 | source("./code/3_feature_set1d_interaction.R") 123 | #END 124 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/1_data_preprocessing.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 1_data_preprocessing.py 5 | # Takes in input data, cleans text and merges itemIDs. 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import nltk 10 | import sklearn 11 | import json 12 | import math 13 | import feather #import pickle - feather used instead as it is compatible with R 14 | from pandas.io.json import json_normalize 15 | import unicodedata 16 | from stop_words import get_stop_words 17 | import time 18 | from multiprocessing import Pool 19 | import sys 20 | import gc 21 | from imp import load_source 22 | 23 | import libavito as a 24 | 25 | ######################### 26 | ##### SCRIPT CONFIG ##### 27 | ######################### 28 | 29 | # Define cleaning parameters 30 | stopwords = get_stop_words('ru') 31 | exclude_cats = set(['Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po', 'Sk', 'Sc', 'So', 'Co', 'Cf', 'Cc', 'Cs', 'Cn']) 32 | sno = nltk.stem.SnowballStemmer('russian') 33 | 34 | ######################### 35 | 36 | print(a.c.BOLD + 'Cleaning input data ...' + a.c.END) 37 | 38 | # Get train/test mode from launch argument 39 | mode = a.get_mode(sys.argv, '1_data_preprocessing.py') 40 | 41 | ## Read settings required by script 42 | config = a.read_config() 43 | nthreads = config.preprocessing_nthreads 44 | cache_loc = config.cache_loc 45 | category_loc = config.category_csv 46 | location_loc = config.location_csv 47 | debug = config.debug 48 | if mode == 0: 49 | data_loc = config.train_ItemInfo 50 | pairs_loc = config.train_ItemPairs 51 | if mode == 1: 52 | data_loc = config.test_ItemInfo 53 | pairs_loc = config.test_ItemPairs 54 | 55 | # Read file for processing into memory 56 | start = time.time() 57 | print('Reading input data ... 
', end='', flush=True) 58 | df = pd.read_csv(data_loc) 59 | a.print_elapsed(start) 60 | 61 | def get_clean_tokens(text): 62 | newtext = [] 63 | 64 | # lower text 65 | text = text.lower() 66 | 67 | # replace punctation 68 | text = ''.join(x if unicodedata.category(x) not in exclude_cats else ' ' for x in text) 69 | 70 | # replace some symbols 71 | text = ''.join(x if x not in ["'", '`', '>', '<', '=', '+'] else ' ' for x in text) 72 | 73 | # tokenize the text 74 | text0 = nltk.word_tokenize(text, 'russian') 75 | 76 | # word by word 77 | for y in text0: 78 | # remove stopwords and stemming 79 | if len(y) > 0 and y not in stopwords: 80 | newtext.append(sno.stem(y)) 81 | 82 | return newtext 83 | 84 | def process_line(i): 85 | # Lists to store tokens in 86 | tx = [] 87 | dx = [] 88 | resx = [] 89 | 90 | # Pluck initial strings from dataframe 91 | title = str(df.iloc[i]['title']) 92 | desc = str(df.iloc[i]['description']) 93 | jx = str(df.iloc[i]['attrsJSON']).lower() 94 | 95 | tx = get_clean_tokens(title) 96 | dx = get_clean_tokens(desc) 97 | 98 | # Process JSON 99 | try: 100 | resx = json.loads(jx) 101 | for key in resx.keys(): 102 | a = get_clean_tokens(resx[key]) 103 | resx[key] = " ".join(a) 104 | except: 105 | resx = [] 106 | if debug == 1: 107 | print('DEBUG: Failed to read JSON "' + json + '" at ' + str(i)) 108 | pass 109 | 110 | jxs = '' + json.dumps(resx, ensure_ascii=False) 111 | txs = ' '.join(tx) 112 | dxs = ' '.join(dx) 113 | 114 | del tx, resx, dx 115 | gc.collect() 116 | 117 | return [txs, dxs, jxs] 118 | 119 | # def process_line(i): 120 | # return ['empty', 'empty', 'empty'] 121 | 122 | newtitles = [] 123 | newdescs = [] 124 | newjson = [] 125 | ids = df['itemID'].values 126 | 127 | start = time.time() 128 | # If number of threads is equal to 1, output time remaining etc. 129 | o = len(df.index) 130 | if nthreads == 1: 131 | print('Cleaning text with 1 thread ...') 132 | k = 0 133 | # Iterate over lines 134 | for i in range(0, o): 135 | x = process_line(i) 136 | newtitles.append(x[0]) 137 | newdescs.append(x[1]) 138 | newjson.append(x[2]) 139 | k += 1 140 | if k % 100 == 0: 141 | a.print_progress(k, start, o) 142 | # Otherwise perform multi-threaded mapping 143 | else: 144 | print('Cleaning text multi-threaded ... ', end='', flush=True) 145 | pool = Pool(nthreads) 146 | newdata = pool.map(process_line, range(0, o)) 147 | pool.close() 148 | for x in newdata: 149 | newtitles.append(x[0]) 150 | newdescs.append(x[1]) 151 | newjson.append(x[2]) 152 | 153 | del newdata 154 | gc.collect() 155 | 156 | a.print_elapsed(start) 157 | 158 | ######################### 159 | 160 | print(a.c.BOLD + 'Joining input data ...' + a.c.END) 161 | 162 | # Joining cleaned data into original data 163 | df['cleandesc'] = newdescs 164 | df['cleantitle'] = newtitles 165 | df['cleanjson'] = newjson 166 | 167 | # Memory management 168 | del newdescs, newtitles, newjson 169 | gc.collect() 170 | 171 | start = time.time() 172 | print('Joining parentCategory ... ', end='', flush=True) 173 | category = pd.read_csv(category_loc) 174 | df = df.merge(category, on=['categoryID'], copy=False) 175 | a.print_elapsed(start) 176 | 177 | start = time.time() 178 | print('Joining regionID ... 
', end='', flush=True) 179 | location = pd.read_csv(location_loc) 180 | df = df.merge(location, on=['locationID'], copy=False) 181 | a.print_elapsed(start) 182 | 183 | start = time.time() 184 | print('Joining itemPairs ...', end='', flush=True) 185 | itemPairs = pd.read_csv(pairs_loc) 186 | df = pd.merge(pd.merge(itemPairs, df, how='inner', left_on='itemID_1', right_on='itemID'), df, how='inner', left_on='itemID_2', right_on='itemID') # , suffixes=('_1', '_2')) 187 | df.drop(['itemID_x', 'itemID_y'], axis=1, inplace=True) 188 | df.columns = [c.replace('_x', '_1').replace('_y', '_2') for c in df.columns] 189 | a.print_elapsed(start) 190 | 191 | start = time.time() 192 | print('Caching cleaned data ... ', end='', flush=True) 193 | 194 | # Save updated dataset 195 | if mode == 0: 196 | #pickle.dump(df, open(cache_loc + 'train.bin', 'wb'), protocol=4) 197 | feather.write_dataframe(df, cache_loc + 'train.fthr') 198 | df.to_csv(cache_loc + 'train.csv', index=False) 199 | if mode == 1: 200 | #pickle.dump(df, open(cache_loc + 'test.bin', 'wb'), protocol=4) 201 | feather.write_dataframe(df, cache_loc + 'test.fthr') 202 | df.to_csv(cache_loc + 'test.csv', index=False) 203 | 204 | a.print_elapsed(start) 205 | print('Data preprocessing complete!') 206 | 207 | # Write status to status file so master script knows whether to proceed. 208 | f = open(cache_loc + 'status.txt', 'a') 209 | f.write('data_preprocessing_OK\n') 210 | f.close() 211 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/runAll.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ################################################################################################ 3 | ################################################################################################ 4 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants 5 | #### Competition: Avito Duplicate Ad Detection 6 | # Filename : runAll.sh 7 | # Description: This bash script generates the submission files 8 | # Usage: 9 | # bash ./runAll.sh 10 | ################################################################################################ 11 | ################################################################################################ 12 | 13 | 14 | echo "`tput smso` Running Data Preprocessing`tput rmso`" 15 | python3 code/1_data_preprocessing.py --train 16 | python3 code/1_data_preprocessing.py --test 17 | 18 | echo "`tput smso` Running Image Processing`tput rmso`" 19 | python3 code/2_image_info.py 20 | 21 | echo "`tput smso` Extracting NGrams`tput rmso`" 22 | Rscript code/3_feature_set1a_ngram.R train 23 | Rscript code/3_feature_set1a_ngram.R test 24 | 25 | echo "`tput smso` Extracting NChars`tput rmso`" 26 | Rscript code/3_feature_set1b_nchar.R train 27 | Rscript code/3_feature_set1b_nchar.R test 28 | 29 | echo "`tput smso` Extracting Misc Features`tput rmso`" 30 | Rscript code/3_feature_set1c_misc.R train 31 | Rscript code/3_feature_set1c_misc.R test 32 | 33 | echo "`tput smso`Extracting Attributes `tput rmso`" 34 | Rscript code/3_feature_set1e_attribute.R train 35 | Rscript code/3_feature_set1e_attribute.R test 36 | 37 | echo "`tput smso`Extracting Special Counting Features `tput rmso`" 38 | Rscript code/3_feature_set1f_SpecialCounting.R train 39 | Rscript code/3_feature_set1f_SpecialCounting.R test 40 | 41 | echo "`tput smso` Extracting Capital Letters`tput rmso`" 42 | Rscript code/3_feature_set1g_capitalLetters.R train 43 | Rscript 
code/3_feature_set1g_capitalLetters.R test 44 | 45 | echo "`tput smso` Extracting hash features `tput rmso`" 46 | Rscript code/3_feature_set1h_images.R train 47 | Rscript code/3_feature_set1h_images.R test 48 | 49 | echo "`tput smso` Extracting Image Size Features `tput rmso`" 50 | Rscript code/3_feature_set1i_imagesSize.R train 51 | Rscript code/3_feature_set1i_imagesSize.R test 52 | 53 | echo "`tput smso` Extracting Location `tput rmso`" 54 | python3 code/3_feature_set2a_lev_loc.py --train 55 | python3 code/3_feature_set2a_lev_loc.py --test 56 | 57 | echo "`tput smso` Extracting BRISK`tput rmso`" 58 | python3 code/3_feature_set2b_brisk.py --train 59 | python3 code/3_feature_set2b_brisk.py --test 60 | 61 | echo "`tput smso`Extracting Histograms `tput rmso`" 62 | python3 code/3_feature_set2c_hist.py --train 63 | python3 code/3_feature_set2c_hist.py --test 64 | 65 | echo "`tput smso`Extracting Descriptions `tput rmso`" 66 | python3 code/3_feature_set3a_description.py --train 67 | python3 code/3_feature_set3a_description.py --test 68 | 69 | echo "`tput smso`Extracting Title `tput rmso`" 70 | python3 code/3_feature_set3b_title.py --train 71 | python3 code/3_feature_set3b_title.py --test 72 | 73 | echo "`tput smso` Extracting Json `tput rmso`" 74 | python3 code/3_feature_set3c_json.py --train 75 | python3 code/3_feature_set3c_json.py --test 76 | 77 | echo "`tput smso` Extracting Json part 2 `tput rmso`" 78 | python3 code/3_feature_set3d_json1.py --train 79 | python3 code/3_feature_set3d_json1.py --test 80 | 81 | echo "`tput smso`Extracting Hamming `tput rmso`" 82 | python3 code/3_feature_set3f_hamming.py --train 83 | python3 code/3_feature_set3f_hamming.py --test 84 | 85 | echo "`tput smso`Extracting Json to Cols `tput rmso`" 86 | python3 code/3_json_to_cols.py 87 | 88 | echo "`tput smso`Extracting WOE `tput rmso`" 89 | Rscript code/3_feature_set3g_json_to_cols_createWOE.R train 90 | Rscript code/3_feature_set3g_json_to_cols_createWOE.R test 91 | 92 | echo "`tput smso` Consolidating a few features `tput rmso`" 93 | Rscript code/3_feature_set3z_consolidate.R train 94 | Rscript code/3_feature_set3z_consolidate.R test 95 | 96 | echo "`tput smso` Extracting Fuzzy`tput rmso`" 97 | python3 code/3_feature_set4a_fuzzy.py --train 98 | python3 code/3_feature_set4a_fuzzy.py --test 99 | 100 | echo "`tput smso` Extracting Fuzzy Clean`tput rmso`" 101 | python3 code/3_feature_set4b_fuzzy_clean.py --train 102 | python3 code/3_feature_set4b_fuzzy_clean.py --test 103 | 104 | echo "`tput smso`Extracting Alternate `tput rmso`" 105 | python3 code/3_feature_set4c_alternate.py --train 106 | python3 code/3_feature_set4c_alternate.py --test 107 | 108 | echo "`tput smso` Extracting Similarity`tput rmso`" 109 | python3 code/3_feature_set4d_similarity_clean.py --train 110 | python3 code/3_feature_set4d_similarity_clean.py --test 111 | 112 | echo "`tput smso`Extracting BOW `tput rmso`" 113 | python3 code/4_bag_of_words.py 114 | 115 | 116 | 117 | ############################################################################################ 118 | ############################################################################################ 119 | #Consolidate All Features 120 | echo "`tput smso`CONSOLIDATING ALL FEATURES `tput rmso`" 121 | Rscript code/5_consolidate_features.R train 122 | Rscript code/5_consolidate_features.R test 123 | 124 | echo "`tput smso`Replacing all NaN and Inf`tput rmso`" 125 | python3 code/5_data_postprocessing.py --train 126 | python3 code/5_data_postprocessing.py --test 127 | 128 | echo "FEATURES DONE" 129 | 
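# Every feature script above appends a token such as 'data_preprocessing_OK' or
# 'feature_set3f_OK' to status.txt in the cache directory, so the completed stages can be
# verified before the consolidated feature files (final_featureSet_train.fthr and
# final_featureSet_test.fthr) are handed to the model scripts below.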
############################################################################################ 130 | echo "Running models" 131 | 132 | echo "`tput smso`Running logit_v2`tput rmso`" 133 | python2 code/models/marios_logit_v2.py 134 | 135 | echo "`tput smso`Running nn_v1`tput rmso`" 136 | python2 code/models/marios_nn_v1.py 137 | 138 | echo "`tput smso`Running nnnew_v2`tput rmso`" 139 | python2 code/models/marios_nnnew_v2.py 140 | 141 | echo "`tput smso`Running nnnew_v3`tput rmso`" 142 | python2 code/models/marios_nnnew_v3.py 143 | 144 | echo "`tput smso`Running nnnew_v4`tput rmso`" 145 | python2 code/models/marios_nnnew_v4.py 146 | 147 | echo "`tput smso`Running ridge_v2`tput rmso`" 148 | python2 code/models/marios_ridge_v2.py 149 | 150 | echo "`tput smso`Running sgd_v2`tput rmso`" 151 | python2 code/models/marios_sgd_v2.py 152 | 153 | echo "`tput smso`Running xg_v1`tput rmso`" 154 | python2 code/models/marios_xg_v1.py 155 | 156 | echo "`tput smso`Running xgrank_v2`tput rmso`" 157 | python2 code/models/marios_xgrank_v2.py 158 | 159 | echo "`tput smso`Running xgrank_v3`tput rmso`" 160 | python2 code/models/marios_xgrank_v3.py 161 | 162 | echo "`tput smso`Running xgregv3`tput rmso`" 163 | python2 code/models/marios_xgregv3.py 164 | 165 | echo "`tput smso`Running xgson_v2`tput rmso`" 166 | python2 code/models/marios_xgson_v2.py 167 | 168 | echo "`tput smso`Running xgson_v3`tput rmso`" 169 | python2 code/models/marios_xgson_v3.py 170 | 171 | echo "`tput smso`Running xgson_v4`tput rmso`" 172 | python2 code/models/marios_xgson_v4.py 173 | 174 | echo "`tput smso`Running xgson_v2_v5`tput rmso`" 175 | python2 code/models/marios_xgson_v2_v5.py 176 | 177 | echo "`tput smso`Running meta-model`tput rmso`" 178 | python2 code/models/meta_rf_v1.py 179 | 180 | echo "MODELS DONE" 181 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgregv3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.preprocessing import StandardScaler 3 | from sklearn.metrics import roc_auc_score 4 | import XGBoostClassifier as xg 5 | import os 6 | import libavito 7 | import feather 8 | 9 | # bagger for xgboost 10 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True): 11 | 12 | # create array object to hold predictions 13 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))] 14 | #loop for as many times as we want bags 15 | for n in range (0, estimators): 16 | #shuff;e first, aids in increasing variance and forces different results 17 | #X_t,y_c=shuffle(X,y, random_state=seed+n) 18 | 19 | if update_seed: # update seed if requested, to give a slightly different model 20 | model.set_params(random_state=seed + n) 21 | model.fit(X_t,y_c) # fit model0.0917411475506 22 | preds=model.predict(xt) # predict probabilities 23 | # update bag's array 24 | for j in range (0, (xt.shape[0])): 25 | baggedpred[j]+=preds[j] 26 | print("done bag %d " % (n)) 27 | # divide with number of bags to create an average estimate 28 | for j in range (0, len(baggedpred)): 29 | baggedpred[j]/=float(estimators) 30 | # return probabilities 31 | return np.array(baggedpred) 32 | 33 | 34 | 35 | 36 | def loadcolumn(filename,col=4, skip=1, floats=True): 37 | pred=[] 38 | op=open(filename,'r') 39 | if skip==1: 40 | op.readline() #header 41 | for line in op: 42 | line=line.replace('\n','') 43 | sps=line.split(',') 44 | #load always the last columns 45 | if floats: 46 | pred.append(float(sps[col])) 47 
| else : 48 | pred.append(str(sps[col])) 49 | op.close() 50 | return pred 51 | 52 | 53 | def printfilcsve(X, filename): 54 | 55 | np.savetxt(filename,X, fmt='%.5f') 56 | 57 | 58 | # read the train and test allclean.csv files. skip errors 59 | def readfile(name, index=0): 60 | dopen=open(name,"r") 61 | array=[] 62 | skip_firstrow=False 63 | if index!=0: 64 | skip_firstrow=True 65 | for i,line in enumerate(dopen): 66 | if i==0 and skip_firstrow: 67 | continue 68 | splits=line.replace("\n","").replace(" ","").split(",") 69 | ar=[] 70 | for k in splits: 71 | try: 72 | ar.append(float(k)) 73 | except: 74 | ar.append(0.0) 75 | print(" the string is %s ok?" % ((k))) 76 | array.append(ar)#[float(k)0.971474 if k!="0" else 0.0 for k in splits ]) 77 | if i%100000==0: 78 | print(" we are at " , str(i)) 79 | return np.array(array) 80 | 81 | 82 | def main(): 83 | 84 | config = libavito.get_config() 85 | cache_loc = config.cache_loc 86 | nthreads = config.nthreads 87 | 88 | Usecv=True # true will split the training data 66-33 and do cv 89 | SEED=15 90 | threads=nthreads # number of workers for parallelism 91 | 92 | ######### Load files ############ 93 | print("Loading input data") 94 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr') 95 | y = train['isDuplicate'].values 96 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values 97 | del train 98 | print(X.shape) 99 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr') 100 | ids = test['id'].values 101 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values 102 | del test 103 | print(X_test.shape) 104 | 105 | 106 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions 107 | if not os.path.exists(metafolder): #if it does not exists, we create it 108 | os.makedirs(metafolder) 109 | outset="marios_xgreg_v3" # predic of all files 110 | 111 | #model to use 112 | 113 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)] 114 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )] 115 | kfolder=[[idex1,idex2]] 116 | #Create Arrays for meta 117 | train_stacker=[ 0.0 for k in range (0,len(idex2)) ] 118 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))] 119 | # CHECK EVerything in five..it could be more efficient 120 | 121 | #create target variable 122 | mean_kapa = 0.0 123 | #kfolder=StratifiedKFold(y, n_folds=number_of_folds,shuffle=True, random_state=SEED) 124 | #number_of_folds=0 125 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time 126 | i=0 # iterator counter 127 | if Usecv: 128 | print ("starting cross validation") 129 | for train_index, test_index in kfolder: 130 | # creaning and validation sets 131 | X_train, X_cv = X[train_index], X[test_index] 132 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index] 133 | print (" train size: %d. 
test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) )) 134 | 135 | 136 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True) 137 | 138 | 139 | # compute Loglikelihood metric for this CV fold 140 | #scalepreds(preds) 141 | kapa = roc_auc_score(y_cv,preds) 142 | print "size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa) 143 | 144 | mean_kapa += kapa 145 | #save the results 146 | no=0 147 | for real_index in test_index: 148 | train_stacker[no]=(preds[no]) 149 | no+=1 150 | i+=1 151 | if Usecv: 152 | print (" Average AUC: %f" % (mean_kapa) ) 153 | print (" printing train datasets ") 154 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv") 155 | 156 | 157 | #preds=bagged_set(X, y,model, SEED, 1, X_test, update_seed=True) 158 | 159 | preds=bagged_set(X, y,model, SEED , 5, X_test, update_seed=True) 160 | 161 | 162 | for pr in range (0,len(preds)): 163 | test_stacker[pr]=(preds[pr]) 164 | 165 | preds=np.array(preds) 166 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv") 167 | 168 | 169 | print("Write results...") 170 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv" 171 | print("Writing submission to %s" % output_file) 172 | f = open(config.output_loc + output_file, "w") 173 | f.write("id,probability\n")# the header 174 | for g in range(0, len(preds)) : 175 | pr=preds[g] 176 | f.write("%d,%f\n" % (((ids[g]),pr ) ) ) 177 | f.close() 178 | print("Done.") 179 | 180 | 181 | 182 | 183 | 184 | 185 | if __name__=="__main__": 186 | main() 187 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgson_v4.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import numpy as np 4 | from sklearn.metrics import roc_auc_score 5 | import XGBoostClassifier as xg 6 | import os 7 | import libavito 8 | import feather 9 | 10 | #load a single column from file 11 | def loadcolumn(filename,col=4, skip=1, floats=True): 12 | pred=[] 13 | op=open(filename,'r') 14 | if skip==1: 15 | op.readline() #header 16 | for line in op: 17 | line=line.replace('\n','') 18 | sps=line.split(',') 19 | #load always the last columns 20 | if floats: 21 | pred.append(float(sps[col])) 22 | else : 23 | pred.append(str(sps[col])) 24 | op.close() 25 | return pred 26 | 27 | #export file in csv using numpy 28 | def printfilcsve(X, filename): 29 | 30 | np.savetxt(filename,X, fmt='%.5f') 31 | 32 | # read the train and test allclean.csv files. skip errors 33 | def readfile(name, index=0): 34 | dopen=open(name,"r") 35 | array=[] 36 | skip_firstrow=False 37 | if index!=0: 38 | skip_firstrow=True 39 | for i,line in enumerate(dopen): 40 | if i==0 and skip_firstrow: 41 | continue 42 | splits=line.replace("\n","").replace(" ","").split(",") 43 | ar=[] 44 | for k in splits: 45 | try: 46 | ar.append(float(k)) 47 | except: 48 | ar.append(0.0) 49 | print(" the string is %s ok?" 
% ((k))) 50 | array.append(ar)#[float(k) if k!="0" else 0.0 for k in splits ]) 51 | if i%100000==0: 52 | print(" we are at " , str(i)) 53 | return np.array(array) 54 | 55 | 56 | 57 | # bagger for xgboost 58 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True): 59 | 60 | # create array object to hold predictions 61 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))] 62 | #loop for as many times as we want bags 63 | for n in range (0, estimators): 64 | #shuff;e first, aids in increasing variance and forces different results 65 | #X_t,y_c=shuffle(X,y, random_state=seed+n) 66 | 67 | if update_seed: # update seed if requested, to give a slightly different model 68 | model.set_params(random_state=seed + n) 69 | model.fit(X_t,y_c) # fit model0.0917411475506 70 | preds=model.predict_proba(xt)[:,1] # predict probabilities 71 | # update bag's array 72 | for j in range (0, (xt.shape[0])): 73 | baggedpred[j]+=preds[j] 74 | print("done bag %d " % (n)) 75 | # divide with number of bags to create an average estimate 76 | for j in range (0, len(baggedpred)): 77 | baggedpred[j]/=float(estimators) 78 | # return probabilities 79 | return np.array(baggedpred) 80 | 81 | def main(): 82 | 83 | config = libavito.get_config() 84 | cache_loc = config.cache_loc 85 | nthreads = config.nthreads 86 | 87 | Usecv=True # true will split the training data 66-33 and do cv 88 | SEED=15 89 | threads=nthreads # number of workers for parallelism 90 | 91 | ######### Load files ############ 92 | print("Loading input data") 93 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr') 94 | y = train['isDuplicate'].values 95 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values 96 | del train 97 | print(X.shape) 98 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr') 99 | ids = test['id'].values 100 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values 101 | del test 102 | print(X_test.shape) 103 | 104 | 105 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions 106 | if not os.path.exists(metafolder): #if it does not exists, we create it 107 | os.makedirs(metafolder) 108 | outset="marios_xgson_v4" # predic of all files 109 | 110 | #model to use 111 | 112 | model=xg.XGBoostClassifier(num_round=1000 ,nthread=threads, eta=0.02, gamma=7.0,max_depth=20, min_child_weight=20, subsample=0.9, colsample_bytree=0.4,objective='binary:logistic',seed=1) 113 | 114 | #Create Arrays for meta 115 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)] # indices for trai 116 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )] #indices for test 117 | kfolder=[[idex1,idex2]] # create an object to put indices in 118 | 119 | #arrays to save predictions for validation and test for meta modelling (stacking) 120 | train_stacker=[ 0.0 for k in range (0,len(idex2)) ] 121 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))] 122 | 123 | #create target variable 124 | mean_kapa = 0.0 125 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time 126 | i=0 # iterator counter 127 | if Usecv: 128 | print ("starting cross validation" ) 129 | for train_index, test_index in kfolder: 130 | # creaning and validation sets 131 | X_train, X_cv = X[train_index], X[test_index] 132 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index] 133 | print (" train size: %d. 
test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) )) 134 | 135 | #use xgboost bagger 136 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True) 137 | 138 | # compute Loglikelihood metric for this CV fold 139 | #scalepreds(preds) 140 | kapa = roc_auc_score(y_cv,preds) 141 | print("size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa)) 142 | 143 | mean_kapa += kapa 144 | #save the results 145 | no=0 146 | for real_index in test_index: 147 | train_stacker[no]=(preds[no]) 148 | no+=1 149 | i+=1 150 | if (Usecv): 151 | #print the array of validation predictions for stacking later on inside the 'meta_folder' 152 | print (" Average AUC: %f" % (mean_kapa) ) 153 | print (" printing train datasets ") 154 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv") 155 | 156 | preds=bagged_set(X, y,model, SEED ,5, X_test, update_seed=True) 157 | 158 | 159 | for pr in range (0,len(preds)): 160 | test_stacker[pr]=(preds[pr]) 161 | #print prediction as numpy array for stacking later on 162 | preds=np.array(preds) 163 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv") 164 | 165 | #create submission file 166 | print("Write results...") 167 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv" 168 | print("Writing submission to %s" % output_file) 169 | f = open(config.output_loc + output_file, "w") 170 | f.write("id,probability\n")# the header 171 | for g in range(0, len(preds)) : 172 | pr=preds[g] 173 | f.write("%d,%f\n" % (((ids[g]),pr ) ) ) 174 | f.close() 175 | print("Done.") 176 | 177 | if __name__=="__main__": 178 | main() 179 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgsonv2_v5.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import numpy as np 4 | from sklearn.metrics import roc_auc_score 5 | import XGBoostClassifier as xg 6 | import os 7 | import libavito 8 | import feather 9 | 10 | #load a single column from file 11 | def loadcolumn(filename,col=4, skip=1, floats=True): 12 | pred=[] 13 | op=open(filename,'r') 14 | if skip==1: 15 | op.readline() #header 16 | for line in op: 17 | line=line.replace('\n','') 18 | sps=line.split(',') 19 | #load always the last columns 20 | if floats: 21 | pred.append(float(sps[col])) 22 | else : 23 | pred.append(str(sps[col])) 24 | op.close() 25 | return pred 26 | 27 | #export file in csv using numpy 28 | def printfilcsve(X, filename): 29 | 30 | np.savetxt(filename,X, fmt='%.5f') 31 | 32 | # read the train and test allclean.csv files. skip errors 33 | def readfile(name, index=0): 34 | dopen=open(name,"r") 35 | array=[] 36 | skip_firstrow=False 37 | if index!=0: 38 | skip_firstrow=True 39 | for i,line in enumerate(dopen): 40 | if i==0 and skip_firstrow: 41 | continue 42 | splits=line.replace("\n","").replace(" ","").split(",") 43 | ar=[] 44 | for k in splits: 45 | try: 46 | ar.append(float(k)) 47 | except: 48 | ar.append(0.0) 49 | print(" the string is %s ok?" 
% ((k))) 50 | array.append(ar)#[float(k) if k!="0" else 0.0 for k in splits ]) 51 | if i%100000==0: 52 | print(" we are at " , str(i)) 53 | return np.array(array) 54 | 55 | 56 | 57 | # bagger for xgboost 58 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True): 59 | 60 | # create array object to hold predictions 61 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))] 62 | #loop for as many times as we want bags 63 | for n in range (0, estimators): 64 | #shuff;e first, aids in increasing variance and forces different results 65 | #X_t,y_c=shuffle(X,y, random_state=seed+n) 66 | 67 | if update_seed: # update seed if requested, to give a slightly different model 68 | model.set_params(random_state=seed + n) 69 | model.fit(X_t,y_c) # fit model0.0917411475506 70 | preds=model.predict_proba(xt)[:,1] # predict probabilities 71 | # update bag's array 72 | for j in range (0, (xt.shape[0])): 73 | baggedpred[j]+=preds[j] 74 | print("done bag %d " % (n)) 75 | # divide with number of bags to create an average estimate 76 | for j in range (0, len(baggedpred)): 77 | baggedpred[j]/=float(estimators) 78 | # return probabilities 79 | return np.array(baggedpred) 80 | 81 | def main(): 82 | 83 | config = libavito.get_config() 84 | cache_loc = config.cache_loc 85 | nthreads = config.nthreads 86 | 87 | Usecv=True # true will split the training data 66-33 and do cv 88 | SEED=15 89 | threads=nthreads # number of workers for parallelism 90 | 91 | ######### Load files ############ 92 | print("Loading input data") 93 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr') 94 | y = train['isDuplicate'].values 95 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values 96 | del train 97 | print(X.shape) 98 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr') 99 | ids = test['id'].values 100 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values 101 | del test 102 | print(X_test.shape) 103 | 104 | 105 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions 106 | if not os.path.exists(metafolder): #if it does not exists, we create it 107 | os.makedirs(metafolder) 108 | outset="marios_xgsonv2_v5" # predic of all files 109 | 110 | #model to use 111 | 112 | model=xg.XGBoostClassifier(num_round=1000 ,nthread=threads, eta=0.1, gamma=0.0,max_depth=20, min_child_weight=1, subsample=1.0, 113 | colsample_bytree=0.9,objective='binary:logistic',silent=True, seed=1) 114 | 115 | #Create Arrays for meta 116 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)] # indices for trai 117 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )] #indices for test 118 | kfolder=[[idex1,idex2]] # create an object to put indices in 119 | 120 | #arrays to save predictions for validation and test for meta modelling (stacking) 121 | train_stacker=[ 0.0 for k in range (0,len(idex2)) ] 122 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))] 123 | 124 | #create target variable 125 | mean_kapa = 0.0 126 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time 127 | i=0 # iterator counter 128 | if Usecv: 129 | print ("starting cross validation" ) 130 | for train_index, test_index in kfolder: 131 | # creaning and validation sets 132 | X_train, X_cv = X[train_index], X[test_index] 133 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index] 134 | print (" train size: %d. 
test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) )) 135 | 136 | #use xgboost bagger 137 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True) 138 | 139 | # compute Loglikelihood metric for this CV fold 140 | #scalepreds(preds) 141 | kapa = roc_auc_score(y_cv,preds) 142 | print("size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa)) 143 | 144 | mean_kapa += kapa 145 | #save the results 146 | no=0 147 | for real_index in test_index: 148 | train_stacker[no]=(preds[no]) 149 | no+=1 150 | i+=1 151 | if (Usecv): 152 | #print the array of validation predictions for stacking later on inside the 'meta_folder' 153 | print (" Average AUC: %f" % (mean_kapa) ) 154 | print (" printing train datasets ") 155 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv") 156 | 157 | preds=bagged_set(X, y,model, SEED ,5, X_test, update_seed=True) 158 | 159 | 160 | for pr in range (0,len(preds)): 161 | test_stacker[pr]=(preds[pr]) 162 | #print prediction as numpy array for stacking later on 163 | preds=np.array(preds) 164 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv") 165 | 166 | #create submission file 167 | print("Write results...") 168 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv" 169 | print("Writing submission to %s" % output_file) 170 | f = open(config.output_loc + output_file, "w") 171 | f.write("id,probability\n")# the header 172 | for g in range(0, len(preds)) : 173 | pr=preds[g] 174 | f.write("%d,%f\n" % (((ids[g]),pr ) ) ) 175 | f.close() 176 | print("Done.") 177 | 178 | if __name__=="__main__": 179 | main() 180 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgrank_v2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.externals import joblib 3 | from sklearn.preprocessing import StandardScaler 4 | from sklearn.metrics import roc_auc_score 5 | import XGBoostClassifier as xg 6 | import os 7 | import libavito 8 | import feather 9 | 10 | # bagger for xgboost 11 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True): 12 | 13 | # create array object to hold predictions 14 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))] 15 | #loop for as many times as we want bags 16 | for n in range (0, estimators): 17 | #shuff;e first, aids in increasing variance and forces different results 18 | #X_t,y_c=shuffle(X,y, random_state=seed+n) 19 | 20 | if update_seed: # update seed if requested, to give a slightly different model 21 | model.set_params(random_state=seed + n) 22 | model.fit(X_t,y_c) # fit model0.0917411475506 23 | preds=model.predict(xt) # predict probabilities 24 | # update bag's array 25 | for j in range (0, (xt.shape[0])): 26 | baggedpred[j]+=preds[j] 27 | print("done bag %d " % (n)) 28 | # divide with number of bags to create an average estimate 29 | for j in range (0, len(baggedpred)): 30 | baggedpred[j]/=float(estimators) 31 | # return probabilities 32 | return np.array(baggedpred) 33 | 34 | 35 | 36 | 37 | def loadcolumn(filename,col=4, skip=1, floats=True): 38 | pred=[] 39 | op=open(filename,'r') 40 | if skip==1: 41 | op.readline() #header 42 | for line in op: 43 | line=line.replace('\n','') 44 | sps=line.split(',') 45 | #load always the last columns 46 | if floats: 47 | pred.append(float(sps[col])) 48 | else : 49 | pred.append(str(sps[col])) 50 | 
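# ---------------------------------------------------------------------------------
# Illustrative sketch (an aside, not code from this repository) of the validation
# scheme every model script here uses: a single 66/33 holdout by row index
# (idex1/idex2), with no shuffling because the data is ordered by time. Under
# Python 2 the `(n * 2) / 3` cut point is integer division; the standalone helper
# below reproduces that split explicitly.
def time_ordered_holdout(n_rows):
    cut = (n_rows * 2) // 3             # same cut as (n_rows * 2) / 3 under Python 2
    idex1 = list(range(0, cut))         # first two thirds -> training indices
    idex2 = list(range(cut, n_rows))    # last third -> validation / meta indices
    return [[idex1, idex2]]             # one "fold", shaped like the kfolder object
# ---------------------------------------------------------------------------------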
op.close() 51 | return pred 52 | 53 | def printfilcsve(X, filename): 54 | 55 | np.savetxt(filename,X, fmt='%.5f') 56 | 57 | 58 | # read the train and test allclean.csv files. skip errors 59 | def readfile(name, index=0): 60 | dopen=open(name,"r") 61 | array=[] 62 | skip_firstrow=False 63 | if index!=0: 64 | skip_firstrow=True 65 | for i,line in enumerate(dopen): 66 | if i==0 and skip_firstrow: 67 | continue 68 | splits=line.replace("\n","").replace(" ","").split(",") 69 | ar=[] 70 | for k in splits: 71 | try: 72 | ar.append(float(k)) 73 | except: 74 | ar.append(0.0) 75 | print(" the string is %s ok?" % ((k))) 76 | array.append(ar)#[float(k) if k!="0" else 0.0 for k in splits ]) 77 | if i%100000==0: 78 | print(" we are at " , str(i)) 79 | return np.array(array) 80 | 81 | 82 | def main(): 83 | 84 | config = libavito.get_config() 85 | cache_loc = config.cache_loc 86 | nthreads = config.nthreads 87 | 88 | Usecv=True # true will split the training data 66-33 and do cv 89 | SEED=15 90 | threads=nthreads # number of workers for parallelism 91 | 92 | ######### Load files ############ 93 | print("Loading input data") 94 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr') 95 | y = train['isDuplicate'].values 96 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values 97 | del train 98 | print(X.shape) 99 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr') 100 | ids = test['id'].values 101 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values 102 | del test 103 | print(X_test.shape) 104 | 105 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions 106 | 107 | model=xg.XGBoostClassifier(num_round=1000 ,nthread=threads, eta=0.02, gamma=7.0,max_depth=15, min_child_weight=20, subsample=0.9, 108 | colsample_bytree=0.4,objective='rank:pairwise',seed=1) 109 | 110 | #create meta folder to drop predictions for train and test 111 | if not os.path.exists(metafolder): #if it does not exist, we create it 112 | os.makedirs(metafolder) 113 | 114 | outset="marios_xgrank_v2" # prefix for all output files 115 | 116 | #model to use 117 | 118 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)] 119 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )] 120 | kfolder=[[idex1,idex2]] 121 | #Create Arrays for meta 122 | train_stacker=[ 0.0 for k in range (0,len(idex2)) ] 123 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))] 124 | # CHECK Everything in five..it could be more efficient 125 | 126 | #create target variable 127 | mean_kapa = 0.0 128 | #kfolder=StratifiedKFold(y, n_folds=number_of_folds,shuffle=True, random_state=SEED) 129 | #number_of_folds=0 130 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time 131 | i=0 # iterator counter 132 | if Usecv: 133 | print ("starting cross validation") 134 | for train_index, test_index in kfolder: 135 | # training and validation sets 136 | X_train, X_cv = X[train_index], X[test_index] 137 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index] 138 | print (" train size: %d. 
test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) )) 139 | 140 | 141 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True) 142 | 143 | 144 | # compute Loglikelihood metric for this CV fold 145 | #scalepreds(preds) 146 | kapa = roc_auc_score(y_cv,preds) 147 | print "size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa) 148 | 149 | mean_kapa += kapa 150 | #save the results 151 | no=0 152 | for real_index in test_index: 153 | train_stacker[no]=(preds[no]) 154 | no+=1 155 | i+=1 156 | if Usecv: 157 | print (" Average AUC: %f" % (mean_kapa) ) 158 | print (" printing train datasets ") 159 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv") 160 | 161 | 162 | #preds=bagged_set(X, y,model, SEED, 1, X_test, update_seed=True) 163 | 164 | preds=bagged_set(X, y,model, SEED , 5, X_test, update_seed=True) 165 | 166 | 167 | for pr in range (0,len(preds)): 168 | test_stacker[pr]=(preds[pr]) 169 | 170 | preds=np.array(preds) 171 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv") 172 | 173 | 174 | print("Write results...") 175 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv" 176 | print("Writing submission to %s" % output_file) 177 | f = open(config.output_loc + output_file, "w") 178 | f.write("id,probability\n")# the header 179 | for g in range(0, len(preds)) : 180 | pr=preds[g] 181 | f.write("%d,%f\n" % (((ids[g]),pr ) ) ) 182 | f.close() 183 | print("Done.") 184 | 185 | 186 | 187 | 188 | 189 | 190 | if __name__=="__main__": 191 | main() 192 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgson_v2.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import numpy as np 4 | from sklearn.externals import joblib 5 | from sklearn.metrics import roc_auc_score 6 | import XGBoostClassifier as xg 7 | import os 8 | import libavito 9 | import feather 10 | 11 | #load a single column from file 12 | def loadcolumn(filename,col=4, skip=1, floats=True): 13 | pred=[] 14 | op=open(filename,'r') 15 | if skip==1: 16 | op.readline() #header 17 | for line in op: 18 | line=line.replace('\n','') 19 | sps=line.split(',') 20 | #load always the last columns 21 | if floats: 22 | pred.append(float(sps[col])) 23 | else : 24 | pred.append(str(sps[col])) 25 | op.close() 26 | return pred 27 | 28 | 29 | #export file in csv using numpy 30 | def printfilcsve(X, filename): 31 | 32 | np.savetxt(filename,X, fmt='%.5f') 33 | 34 | # read the train and test allclean.csv files. skip errors 35 | def readfile(name, index=0): 36 | dopen=open(name,"r") 37 | array=[] 38 | skip_firstrow=False 39 | if index!=0: 40 | skip_firstrow=True 41 | for i,line in enumerate(dopen): 42 | if i==0 and skip_firstrow: 43 | continue 44 | splits=line.replace("\n","").replace(" ","").split(",") 45 | ar=[] 46 | for k in splits: 47 | try: 48 | ar.append(float(k)) 49 | except: 50 | ar.append(0.0) 51 | print(" the string is %s ok?" 
% ((k))) 52 | array.append(ar)#[float(k) if k!="0" else 0.0 for k in splits ]) 53 | if i%100000==0: 54 | print(" we are at " , str(i)) 55 | return np.array(array) 56 | 57 | 58 | # bagger for xgboost 59 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True): 60 | 61 | # create array object to hold predictions 62 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))] 63 | #loop for as many times as we want bags 64 | for n in range (0, estimators): 65 | #shuff;e first, aids in increasing variance and forces different results 66 | #X_t,y_c=shuffle(X,y, random_state=seed+n) 67 | 68 | if update_seed: # update seed if requested, to give a slightly different model 69 | model.set_params(random_state=seed + n) 70 | model.fit(X_t,y_c) # fit model0.0917411475506 71 | preds=model.predict_proba(xt)[:,1] # predict probabilities 72 | # update bag's array 73 | for j in range (0, (xt.shape[0])): 74 | baggedpred[j]+=preds[j] 75 | print("done bag %d " % (n)) 76 | # divide with number of bags to create an average estimate 77 | for j in range (0, len(baggedpred)): 78 | baggedpred[j]/=float(estimators) 79 | # return probabilities 80 | return np.array(baggedpred) 81 | 82 | 83 | def main(): 84 | config = libavito.get_config() 85 | cache_loc = config.cache_loc 86 | nthreads = config.nthreads 87 | 88 | Usecv=True # true will split the training data 66-33 and do cv 89 | SEED=15 90 | threads=nthreads # number of workers for parallelism 91 | 92 | ######### Load files ############ 93 | print("Loading input data") 94 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr') 95 | y = train['isDuplicate'].values 96 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values 97 | del train 98 | print(X.shape) 99 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr') 100 | ids = test['id'].values 101 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values 102 | del test 103 | print(X_test.shape) 104 | 105 | 106 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions 107 | if not os.path.exists(metafolder): #if it does not exists, we create it 108 | os.makedirs(metafolder) 109 | outset="marios_xgson_v2" # predic of all files 110 | 111 | #model to use 112 | 113 | model=xg.XGBoostClassifier(num_round=1000 ,nthread=threads, eta=0.02, gamma=7.0,max_depth=20, min_child_weight=20, subsample=0.9, 114 | colsample_bytree=0.4,objective='binary:logistic',seed=1) 115 | 116 | #Create Arrays for meta 117 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)] # indices for trai 118 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )] #indices for test 119 | kfolder=[[idex1,idex2]] # create an object to put indices in 120 | 121 | #arrays to save predictions for validation and test for meta modelling (stacking) 122 | train_stacker=[ 0.0 for k in range (0,(X.shape[0])) ] 123 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))] 124 | 125 | #create target variable 126 | mean_kapa = 0.0 127 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time 128 | i=0 # iterator counter 129 | if Usecv: 130 | print ("starting cross validation" ) 131 | for train_index, test_index in kfolder: 132 | # creaning and validation sets 133 | X_train, X_cv = X[train_index], X[test_index] 134 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index] 135 | print (" train size: %d. 
test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) )) 136 | 137 | #use xgboost bagger 138 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True) 139 | 140 | # compute Loglikelihood metric for this CV fold 141 | #scalepreds(preds) 142 | kapa = roc_auc_score(y_cv,preds) 143 | print "size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa) 144 | 145 | mean_kapa += kapa 146 | #save the results 147 | no=0 148 | for real_index in test_index: 149 | train_stacker[no]=(preds[no]) 150 | no+=1 151 | i+=1 152 | if (Usecv): 153 | #print the array of validation predictions for stacking later on inside the 'meta_folder' 154 | print (" Average AUC: %f" % (mean_kapa) ) 155 | print (" printing train datasets ") 156 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv") 157 | 158 | preds=bagged_set(X, y,model, SEED ,5, X_test, update_seed=True) 159 | 160 | 161 | for pr in range (0,len(preds)): 162 | test_stacker[pr]=(preds[pr]) 163 | #print prediction as numpy array for stacking later on 164 | preds=np.array(preds) 165 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv") 166 | 167 | #create submission file 168 | print("Write results...") 169 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv" 170 | print("Writing submission to %s" % output_file) 171 | f = open(config.output_loc + output_file, "w") 172 | f.write("id,probability\n")# the header 173 | for g in range(0, len(preds)) : 174 | pr=preds[g] 175 | f.write("%d,%f\n" % (((ids[g]),pr ) ) ) 176 | f.close() 177 | print("Done.") 178 | 179 | 180 | 181 | 182 | 183 | 184 | if __name__=="__main__": 185 | main() 186 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgson_v3.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import numpy as np 4 | from sklearn.externals import joblib 5 | from sklearn.metrics import roc_auc_score 6 | import XGBoostClassifier as xg 7 | import os 8 | import libavito 9 | import feather 10 | 11 | #load a single column from file 12 | def loadcolumn(filename,col=4, skip=1, floats=True): 13 | pred=[] 14 | op=open(filename,'r') 15 | if skip==1: 16 | op.readline() #header 17 | for line in op: 18 | line=line.replace('\n','') 19 | sps=line.split(',') 20 | #load always the last columns 21 | if floats: 22 | pred.append(float(sps[col])) 23 | else : 24 | pred.append(str(sps[col])) 25 | op.close() 26 | return pred 27 | 28 | 29 | 30 | #export file in csv using numpy 31 | def printfilcsve(X, filename): 32 | 33 | np.savetxt(filename,X, fmt='%.5f') 34 | 35 | # read the train and test allclean.csv files. skip errors 36 | def readfile(name, index=0): 37 | dopen=open(name,"r") 38 | array=[] 39 | skip_firstrow=False 40 | if index!=0: 41 | skip_firstrow=True 42 | for i,line in enumerate(dopen): 43 | if i==0 and skip_firstrow: 44 | continue 45 | splits=line.replace("\n","").replace(" ","").split(",") 46 | ar=[] 47 | for k in splits: 48 | try: 49 | ar.append(float(k)) 50 | except: 51 | ar.append(0.0) 52 | print(" the string is %s ok?" 
% ((k))) 53 | array.append(ar)#[float(k) if k!="0" else 0.0 for k in splits ]) 54 | if i%100000==0: 55 | print(" we are at " , str(i)) 56 | return np.array(array) 57 | 58 | 59 | # bagger for xgboost 60 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True): 61 | 62 | # create array object to hold predictions 63 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))] 64 | #loop for as many times as we want bags 65 | for n in range (0, estimators): 66 | #shuffle first, aids in increasing variance and forces different results 67 | #X_t,y_c=shuffle(X,y, random_state=seed+n) 68 | 69 | if update_seed: # update seed if requested, to give a slightly different model 70 | model.set_params(random_state=seed + n) 71 | model.fit(X_t,y_c) # fit model 72 | preds=model.predict_proba(xt)[:,1] # predict probabilities 73 | # update bag's array 74 | for j in range (0, (xt.shape[0])): 75 | baggedpred[j]+=preds[j] 76 | print("done bag %d " % (n)) 77 | # divide with number of bags to create an average estimate 78 | for j in range (0, len(baggedpred)): 79 | baggedpred[j]/=float(estimators) 80 | # return probabilities 81 | return np.array(baggedpred) 82 | 83 | 84 | def main(): 85 | 86 | config = libavito.get_config() 87 | cache_loc = config.cache_loc 88 | nthreads = config.nthreads 89 | 90 | Usecv=True # true will split the training data 66-33 and do cv 91 | SEED=15 92 | threads=nthreads # number of workers for parallelism 93 | 94 | ######### Load files ############ 95 | print("Loading input data") 96 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr') 97 | y = train['isDuplicate'].values 98 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values 99 | del train 100 | print(X.shape) 101 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr') 102 | ids = test['id'].values 103 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values 104 | del test 105 | print(X_test.shape) 106 | 107 | 108 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions 109 | if not os.path.exists(metafolder): #if it does not exist, we create it 110 | os.makedirs(metafolder) 111 | outset="marios_xgson_v3" # prefix for all output files 112 | 113 | #model to use 114 | 115 | model=xg.XGBoostClassifier(num_round=1000 ,nthread=threads, eta=0.02, gamma=7.0,max_depth=20, min_child_weight=20, subsample=0.9, 116 | colsample_bytree=0.4,objective='binary:logistic',seed=1) 117 | 118 | #Create Arrays for meta 119 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)] # indices for train 120 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )] #indices for test 121 | kfolder=[[idex1,idex2]] # create an object to put indices in 122 | 123 | #arrays to save predictions for validation and test for meta modelling (stacking) 124 | train_stacker=[ 0.0 for k in range (0,len(idex2)) ] 125 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))] 126 | 127 | #create target variable 128 | mean_kapa = 0.0 129 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time 130 | i=0 # iterator counter 131 | if Usecv: 132 | print ("starting cross validation" ) 133 | for train_index, test_index in kfolder: 134 | # training and validation sets 135 | X_train, X_cv = X[train_index], X[test_index] 136 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index] 137 | print (" train size: %d. 
test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) )) 138 | 139 | #use xgboost bagger 140 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True) 141 | 142 | # compute AUC metric for this CV fold 143 | #scalepreds(preds) 144 | kapa = roc_auc_score(y_cv,preds) 145 | print "size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa) 146 | 147 | mean_kapa += kapa 148 | #save the results 149 | no=0 150 | for real_index in test_index: 151 | train_stacker[no]=(preds[no]) 152 | no+=1 153 | i+=1 154 | if (Usecv): 155 | #print the array of validation predictions for stacking later on inside the 'meta_folder' 156 | print (" Average AUC: %f" % (mean_kapa) ) 157 | print (" printing train datasets ") 158 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv") 159 | 160 | preds=bagged_set(X, y,model, SEED ,5, X_test, update_seed=True) 161 | 162 | 163 | for pr in range (0,len(preds)): 164 | test_stacker[pr]=(preds[pr]) 165 | #print prediction as numpy array for stacking later on 166 | preds=np.array(preds) 167 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv") 168 | 169 | #create submission file 170 | print("Write results...") 171 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv" 172 | print("Writing submission to %s" % output_file) 173 | f = open(config.output_loc + output_file, "w") 174 | f.write("id,probability\n")# the header 175 | for g in range(0, len(preds)) : 176 | pr=preds[g] 177 | f.write("%d,%f\n" % (((ids[g]),pr ) ) ) 178 | f.close() 179 | print("Done.") 180 | 181 | 182 | 183 | 184 | 185 | 186 | if __name__=="__main__": 187 | main() 188 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgrank_v3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.preprocessing import StandardScaler 3 | from sklearn.metrics import roc_auc_score 4 | import XGBoostClassifier as xg 5 | import os 6 | import libavito 7 | import feather 8 | 9 | # bagger for xgboost 10 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True): 11 | 12 | # create array object to hold predictions 13 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))] 14 | #loop for as many times as we want bags 15 | for n in range (0, estimators): 16 | #shuffle first, aids in increasing variance and forces different results 17 | #X_t,y_c=shuffle(X,y, random_state=seed+n) 18 | 19 | if update_seed: # update seed if requested, to give a slightly different model 20 | model.set_params(random_state=seed + n) 21 | model.fit(X_t,y_c) # fit model 22 | preds=model.predict(xt) # predict probabilities 23 | # update bag's array 24 | for j in range (0, (xt.shape[0])): 25 | baggedpred[j]+=preds[j] 26 | print("done bag %d " % (n)) 27 | # divide with number of bags to create an average estimate 28 | for j in range (0, len(baggedpred)): 29 | baggedpred[j]/=float(estimators) 30 | # return probabilities 31 | return np.array(baggedpred) 32 | 33 | def loadcolumn(filename,col=4, skip=1, floats=True): 34 | pred=[] 35 | op=open(filename,'r') 36 | if skip==1: 37 | op.readline() #header 38 | for line in op: 39 | line=line.replace('\n','') 40 | sps=line.split(',') 41 | #load always the last columns 42 | if floats: 43 | pred.append(float(sps[col])) 44 | else : 45 | pred.append(str(sps[col])) 46 | op.close() 47 | return pred 48 | 49 
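# ---------------------------------------------------------------------------------
# Illustrative sketch (an aside, not code from this repository) of the two artefacts
# each model script writes: per-model meta files (<outset>train.csv / <outset>test.csv
# in cache_loc/meta_folder/, raw probabilities for the level-2 stacker) and a Kaggle
# submission with an "id,probability" header. The submission loop above can be
# isolated as a small helper; `ids` and `preds` are assumed to be the same arrays
# the scripts already build.
def write_submission(path, ids, preds):
    f = open(path, "w")
    f.write("id,probability\n")                     # header expected by the competition
    for item_id, p in zip(ids, preds):
        f.write("%d,%f\n" % (int(item_id), p))      # same "%d,%f" formatting as above
    f.close()
# ---------------------------------------------------------------------------------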
| 50 | def printfilcsve(X, filename): 51 | 52 | np.savetxt(filename,X, fmt='%.5f') 53 | 54 | 55 | # read the train and test allclean.csv files. skip errors 56 | def readfile(name, index=0): 57 | dopen=open(name,"r") 58 | array=[] 59 | skip_firstrow=False 60 | if index!=0: 61 | skip_firstrow=True 62 | for i,line in enumerate(dopen): 63 | if i==0 and skip_firstrow: 64 | continue 65 | splits=line.replace("\n","").replace(" ","").split(",") 66 | ar=[] 67 | for k in splits: 68 | try: 69 | ar.append(float(k)) 70 | except: 71 | ar.append(0.0) 72 | print(" the string is %s ok?" % ((k))) 73 | array.append(ar)#[float(k) if k!="0" else 0.0 for k in splits ]) 74 | if i%100000==0: 75 | print(" we are at " , str(i)) 76 | return np.array(array) 77 | 78 | 79 | def main(): 80 | config = libavito.get_config(); cache_loc = config.cache_loc; nthreads = config.nthreads # read cache location and thread count from the shared config 81 | Use_scale=True 82 | Usecv=True # true will split the training data 66-33 and do cv 83 | SEED=15 84 | threads=nthreads # number of workers for parallelism 85 | 86 | ######### Load files ############ 87 | print("Loading input data") 88 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr') 89 | y = train['isDuplicate'].values 90 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values 91 | del train 92 | print(X.shape) 93 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr') 94 | ids = test['id'].values 95 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values 96 | del test 97 | print(X_test.shape) 98 | 99 | 100 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions 101 | 102 | 103 | model=xg.XGBoostClassifier(num_round=1000 ,nthread=threads, eta=0.02, gamma=7.0,max_depth=15, min_child_weight=20, subsample=0.9, 104 | colsample_bytree=0.4,objective='rank:pairwise',seed=1) 105 | 106 | if not os.path.exists(metafolder): #if it does not exist, we create it 107 | os.makedirs(metafolder) 108 | 109 | outset="marios_xgrank_v3" # prefix for all output files 110 | 111 | #model to use 112 | 113 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)] 114 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )] 115 | kfolder=[[idex1,idex2]] 116 | #Create Arrays for meta 117 | train_stacker=[ 0.0 for k in range (0,len(idex2)) ] 118 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))] 119 | # CHECK Everything in five..it could be more efficient 120 | 121 | #create target variable 122 | mean_kapa = 0.0 123 | #kfolder=StratifiedKFold(y, n_folds=number_of_folds,shuffle=True, random_state=SEED) 124 | #number_of_folds=0 125 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time 126 | i=0 # iterator counter 127 | if Usecv: 128 | print ("starting cross validation") 129 | for train_index, test_index in kfolder: 130 | # training and validation sets 131 | X_train, X_cv = X[train_index], X[test_index] 132 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index] 133 | print (" train size: %d. 
test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) )) 134 | 135 | if Use_scale: 136 | stda=StandardScaler() 137 | X_train=stda.fit_transform(X_train) 138 | X_cv=stda.transform(X_cv) 139 | 140 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True) 141 | 142 | 143 | # compute Loglikelihood metric for this CV fold 144 | #scalepreds(preds) 145 | kapa = roc_auc_score(y_cv,preds) 146 | print "size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa) 147 | 148 | mean_kapa += kapa 149 | #save the results 150 | no=0 151 | for real_index in test_index: 152 | train_stacker[no]=(preds[no]) 153 | no+=1 154 | i+=1 155 | if Usecv: 156 | print (" Average AUC: %f" % (mean_kapa) ) 157 | print (" printing train datasets ") 158 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv") 159 | 160 | if Use_scale: 161 | stda=StandardScaler() 162 | X=stda.fit_transform(X) 163 | X_test=stda.transform(X_test) 164 | 165 | #preds=bagged_set(X, y,model, SEED, 1, X_test, update_seed=True) 166 | 167 | preds=bagged_set(X, y,model, SEED , 5, X_test, update_seed=True) 168 | 169 | 170 | for pr in range (0,len(preds)): 171 | test_stacker[pr]=(preds[pr]) 172 | 173 | preds=np.array(preds) 174 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv") 175 | 176 | 177 | print("Write results...") 178 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv" 179 | print("Writing submission to %s" % output_file) 180 | f = open(config.output_loc + output_file, "w") 181 | f.write("id,probability\n")# the header 182 | for g in range(0, len(preds)) : 183 | pr=preds[g] 184 | f.write("%d,%f\n" % (((ids[g]),pr ) ) ) 185 | f.close() 186 | print("Done.") 187 | 188 | 189 | 190 | 191 | 192 | 193 | if __name__=="__main__": 194 | main() 195 | --------------------------------------------------------------------------------
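marios_xgrank_v3 optionally standardises the feature matrices (Use_scale): the scaler is fitted on the training rows only and then re-applied to the validation and test rows, so no statistics leak from the held-out data. A minimal sketch of that pattern in isolation, assuming only scikit-learn; the helper name is illustrative:

from sklearn.preprocessing import StandardScaler

def scale_pair(X_fit, X_apply):
    stda = StandardScaler()
    X_fit_scaled = stda.fit_transform(X_fit)      # learn mean/std on the fitting split only
    X_apply_scaled = stda.transform(X_apply)      # reuse the same transform on other splits
    return X_fit_scaled, X_apply_scaled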
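The meta-model step in runAll.sh (code/models/meta_rf_v1.py, not included in this listing) consumes the single-column <outset>train.csv / <outset>test.csv files that each base model drops into cache_loc/meta_folder/. A minimal sketch of how those files could be assembled into level-2 feature matrices; the helper name and the model list in the usage comment are illustrative assumptions, not code from the repository:

import numpy as np

def load_meta_features(meta_folder, model_names):
    # each file holds one prediction per row, written above by np.savetxt(fmt='%.5f')
    train_cols = [np.loadtxt(meta_folder + m + "train.csv") for m in model_names]
    test_cols = [np.loadtxt(meta_folder + m + "test.csv") for m in model_names]
    return np.column_stack(train_cols), np.column_stack(test_cols)

# e.g. load_meta_features(cache_loc + "meta_folder/",
#                         ["marios_xgson_v2", "marios_xgson_v3", "marios_xgrank_v2"])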