├── AnalyticsVidhya ├── readme.md ├── WNS-analytics-wizard-2019 │ └── README.md ├── amexpert-2019-machine-learning-hackathon │ └── code │ │ ├── README.md │ │ ├── agg_feature_2_merge.R │ │ ├── agg_feature.R │ │ └── agg_feature_2.R ├── Knocktober │ └── readme.md ├── Date-your-Data │ ├── feature_df_all_CountOfApplications.R │ ├── 11_Ensemble_Models.R │ ├── feature_df_all_Match_Internship_Location_with_other_locations.R │ ├── BUILD_FINAL_SUBMISSION.R │ ├── README.md │ ├── 4_feature_internship_SkillsCoding.R │ ├── 3_feature_internship_Profile_Coding.R │ ├── 6_feature_student_degreeCoding.R │ ├── 7_feature_student_ExperienceCoding.R │ ├── 2_feature_internship_Profile_WordCount.R │ ├── 5_feature_student_StreamsCoding.R │ ├── 9_model_XGB_1.R │ ├── 10_model_XGB_1.R │ └── 1_internship_WordCorrection.R └── AVDatafest_XtremeML │ ├── input │ └── holiday.csv │ └── README.md ├── Kaggle ├── readme.md └── Avito Duplicate Ad Detection │ ├── input │ └── README.txt │ ├── output │ └── README.txt │ ├── cache │ └── README.txt │ ├── tokenizers │ └── punkt │ │ └── PY3 │ │ └── russian.pickle │ ├── Documentation - TheQuants Team - Avito Contest.pdf │ ├── code │ ├── 5_data_postprocessing.py │ ├── feature_verification.py │ ├── 3_feature_set1d_interaction.R │ ├── libavito.py │ ├── models │ │ ├── libavito.py │ │ ├── marios_xgregv3.py │ │ ├── marios_xgson_v4.py │ │ ├── marios_xgsonv2_v5.py │ │ ├── marios_xgrank_v2.py │ │ ├── marios_xgson_v2.py │ │ ├── marios_xgson_v3.py │ │ └── marios_xgrank_v3.py │ ├── 3_feature_set4a_fuzzy.py │ ├── 3_feature_set1g_capitalLetters.R │ ├── 3_feature_set4b_fuzzy_clean.py │ ├── 3_feature_set1f_SpecialCounting.R │ ├── legacy │ │ └── 3_feature_set4e_count3way_clean.py │ ├── 3_feature_set3d_json1.py │ ├── 3_feature_set3c_json.py │ ├── 3_feature_set3b_title.py │ ├── 3_feature_set3a_description.py │ ├── 3_feature_set1e_attribute.R │ ├── 3_json_to_cols.py │ ├── functions.R │ ├── 3_feature_set3f_hamming.py │ ├── 2_image_info.py │ ├── 3_feature_set1a_ngram.R │ ├── 3_feature_set1b_nchar.R │ ├── 5_consolidate_features.R │ ├── 3_feature_set4c_alternate.py │ ├── 3_feature_set1c_misc.R │ └── 1_data_preprocessing.py │ ├── config.cfg │ ├── README.md │ └── runAll.sh ├── README.md ├── HackerEarth ├── Predict Lanes from LIDAR data │ ├── Rplot.png │ ├── final_1_calculateHaversineDistance.R │ ├── README.md │ ├── final_2_buildData.R │ └── final_3_buildModel.R └── Loan Default ML Challenge │ └── README.md ├── HackerRank └── Walmart-Codesprint │ └── readme.md └── Microsoft └── Womens-Health-Risk-Assessment ├── README.md └── Predict.R /AnalyticsVidhya/readme.md: -------------------------------------------------------------------------------- 1 | Analytics Vidhya Hackathons 2 | -------------------------------------------------------------------------------- /Kaggle/readme.md: -------------------------------------------------------------------------------- 1 | Repository for all my Kaggle Competitions 2 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/input/README.txt: -------------------------------------------------------------------------------- 1 | This is the default location for input files 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Competitions 2 | Repository of various competitions I participate 3 | 4 | (c) Sonny Laskar 5 | 
-------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/output/README.txt: -------------------------------------------------------------------------------- 1 | This is the default location for output files such as submission files 2 | -------------------------------------------------------------------------------- /AnalyticsVidhya/WNS-analytics-wizard-2019/README.md: -------------------------------------------------------------------------------- 1 | # Repo for WNS Hackathon # 2 | https://github.com/sonnylaskar/wns-analytics-wizard-2019 3 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/cache/README.txt: -------------------------------------------------------------------------------- 1 | This is the default location for cache files such as models and cleaned data/features 2 | -------------------------------------------------------------------------------- /HackerEarth/Predict Lanes from LIDAR data/Rplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sonnylaskar/Competitions/HEAD/HackerEarth/Predict Lanes from LIDAR data/Rplot.png -------------------------------------------------------------------------------- /HackerRank/Walmart-Codesprint/readme.md: -------------------------------------------------------------------------------- 1 | # Walmart Codesprint 2 | 3 | #Final Rank: 14 / 132 4 | 5 | https://www.hackerrank.com/contests/walmart-codesprint-ml/challenges 6 | 7 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/tokenizers/punkt/PY3/russian.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sonnylaskar/Competitions/HEAD/Kaggle/Avito Duplicate Ad Detection/tokenizers/punkt/PY3/russian.pickle -------------------------------------------------------------------------------- /AnalyticsVidhya/amexpert-2019-machine-learning-hackathon/code/README.md: -------------------------------------------------------------------------------- 1 | American Express Coupon Conversion Hackathon 2 | https://datahack.analyticsvidhya.com/contest/amexpert-2019-machine-learning-hackathon/ 3 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/Documentation - TheQuants Team - Avito Contest.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sonnylaskar/Competitions/HEAD/Kaggle/Avito Duplicate Ad Detection/Documentation - TheQuants Team - Avito Contest.pdf -------------------------------------------------------------------------------- /AnalyticsVidhya/Knocktober/readme.md: -------------------------------------------------------------------------------- 1 | # Competition: 2 | https://datahack.analyticsvidhya.com/contest/knocktober-2016/ 3 | 4 | Problem Type: 5 | Binary Classification 6 | 7 | # Models: 8 | 9 | 2 Bags of XGBoost 10 | 11 | 2 Bags of GBM 12 | 13 | Equal Weighted Rank Average of the above models. 
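For illustration, an equal-weighted rank average of two prediction files can be computed as in the sketch below (a minimal sketch, not the actual contest code; `pred_xgb`, `pred_gbm`, `ID` and `Outcome` are placeholder names):

```r
# Equal-weighted rank average of two sets of predicted probabilities.
# pred_xgb and pred_gbm are assumed to be data frames holding the two
# bagged model outputs, with an ID column and a score column Outcome.
rank_avg <- (rank(pred_xgb$Outcome) + rank(pred_gbm$Outcome)) / 2

# Rescale the averaged ranks to [0, 1] so they can be submitted as scores
submission <- data.frame(ID = pred_xgb$ID,
                         Outcome = rank_avg / max(rank_avg))
write.csv(submission, "rank_average_submission.csv", row.names = FALSE)
```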
14 | 15 | # Score: 16 | Public LB: 0.8362 (Rank 4) 17 | 18 | Private LB: 0.7685 (Rank 3) 19 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/feature_df_all_CountOfApplications.R: -------------------------------------------------------------------------------- 1 | #Feature 2 | #Add a column of how many applications received for any Internship_ID 3 | Intern_Freq <- data.frame(table(df_all$Internship_ID)) 4 | names(Intern_Freq) <- c("Internship_ID", "Internship_ApplicationCount") 5 | Intern_Freq$Internship_ID <- as.integer(as.character(Intern_Freq$Internship_ID)) 6 | df_all <- left_join(df_all, Intern_Freq, by = "Internship_ID" ) 7 | rm(Intern_Freq) 8 | #END -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/11_Ensemble_Models.R: -------------------------------------------------------------------------------- 1 | library(readr) 2 | 3 | #Ensemble the 2 XGB models 4 | MODEL_1 <- read_csv("../Submissions/XGB_MODEL_S123_N526.csv") 5 | MODEL_2 <- read_csv("../Submissions/XGB_MODEL_S500_N710.csv") 6 | 7 | MEANSCORE <- (MODEL_1$Is_Shortlisted + MODEL_2$Is_Shortlisted) / 2 8 | 9 | #SAVE 10 | submission <- data.frame(Internship_ID = MODEL_1$Internship_ID, 11 | Student_ID = MODEL_1$Student_ID, 12 | Is_Shortlisted = MEANSCORE) 13 | write_csv(submission,"../Submissions/FINAL_SUBMISSION.csv") 14 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/feature_df_all_Match_Internship_Location_with_other_locations.R: -------------------------------------------------------------------------------- 1 | #Feature 2 | #Add if InternLocation matches with hometomeLocation, 3 | #if InternLocation matches with InstitudeLocationCode 4 | #if InternLocation matches with PreferredLocationCode 5 | 6 | df_all$isIntern_Loc_Match_HomeTown <- ifelse(df_all$LocationCode == df_all$hometownLocationCode, 1, 0) 7 | df_all$isIntern_Loc_Match_InstitudeLocationCode <- ifelse(df_all$LocationCode == df_all$InstitudeLocationCode, 1, 0) 8 | df_all$isIntern_Loc_Match_PreferredLocationCode <- ifelse(df_all$LocationCode == df_all$PreferredLocationCode, 1, 0) 9 | -------------------------------------------------------------------------------- /HackerEarth/Loan Default ML Challenge/README.md: -------------------------------------------------------------------------------- 1 | Code for the HackerEarth competition on detecting loan defaulters 2 | https://www.hackerearth.com/challenge/competitive/machine-learning-challenge-one/ 3 | 4 | # The code was built on the below platform: 5 | OS: Linux (CentOS) 6 | RAM: 16GB 7 | CPU Core: 8 8 | 9 | Software: 10 | R 3.3.2 11 | 12 | R Packages: 13 | readr,dplyr,caret,xgboost,gbm,data.table,lightgbm,tm,stringr,ModelMetrics) 14 | 15 | To generate the final submission: 16 | 1) Create the folders: input, code, output 17 | 2) Put the data files in input folder 18 | 3) Save the R files in code folder 19 | 4) Execute the below command : 20 | Rscript final_model.R 21 | 5) It will take around 1 hour to complete 22 | 6) The final submission file will be created in output folder 23 | 24 | 25 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/BUILD_FINAL_SUBMISSION.R: -------------------------------------------------------------------------------- 1 | #This will build the Final Solution 2 | #Will take some time 3 | 4 | source("1_internship_WordCorrection.R") 5 | 
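#Feature engineering: scripts 2-4 build internship features, scripts 5-7 build student features, and script 8 preprocesses the data for modelling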
source("2_feature_internship_Profile_WordCount.R") 6 | source("3_feature_internship_Profile_Coding.R") 7 | source("4_feature_internship_SkillsCoding.R") 8 | source("5_feature_student_StreamsCoding.R") 9 | source("6_feature_student_degreeCoding.R") 10 | source("7_feature_student_ExperienceCoding.R") 11 | source("8_preprocessing.R") 12 | 13 | print("Building First XGB model") 14 | source("9_model_XGB_1.R") 15 | print("Building Second XGB model") 16 | source("10_model_XGB_1.R") 17 | 18 | print("Calculating the Average of the 2 models") 19 | source("11_Ensemble_Models.R") 20 | print("Huh!!!, I am done!") 21 | print("Check out FINAL_SUBMISSION FILE in Submission FOlder") -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/5_data_postprocessing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import libavito as a 4 | import feather as f 5 | import time 6 | 7 | cache_loc = a.get_config().cache_loc 8 | 9 | start = time.time() 10 | print('Transforming training data ... ', end='', flush=True) 11 | df = f.read_dataframe(cache_loc + 'final_featureSet_train.fthr') 12 | df.replace([np.nan, None], -1, inplace=True) 13 | df.replace([np.inf, -np.inf], 9999.99, inplace=True) 14 | f.write_dataframe(df, cache_loc + 'final_featureSet_train.fthr') 15 | del df 16 | a.print_elapsed(start) 17 | 18 | start = time.time() 19 | print('Transforming testing data ... ', end='', flush=True) 20 | df = f.read_dataframe(cache_loc + 'final_featureSet_test.fthr') 21 | df.replace([np.nan, None], -1, inplace=True) 22 | df.replace([np.inf, -np.inf], 9999.99, inplace=True) 23 | f.write_dataframe(df, cache_loc + 'final_featureSet_test.fthr') 24 | a.print_elapsed(start) 25 | 26 | -------------------------------------------------------------------------------- /AnalyticsVidhya/amexpert-2019-machine-learning-hackathon/code/agg_feature_2_merge.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(lubridate) 3 | 4 | createDf <- function(file_names) { 5 | df <- tibble() 6 | for (i in file_names) { 7 | tmp <- suppressMessages(read_csv(i)) 8 | if (nrow(df) == 0) { 9 | df <- tmp 10 | } else { 11 | df <- left_join(df, tmp, by = c("CampaignDate", "customer_id", "item_id")) 12 | } 13 | rm(tmp) 14 | gc() 15 | } 16 | df 17 | } 18 | 19 | #[1] 26 27 28 29 30 1 2 3 4 5 6 7 8 9 10 11 12 13 16 17 18 19 20 21 22 23 24 25 20 | #assign("df_2", df_1) 21 | 22 | df <- tibble() 23 | for (i in 1:30) { 24 | print(i) 25 | tmp <- createDf(list.files(path = "../input/", 26 | pattern = paste0("agg_feat_",i,"_"), 27 | full.names = T)) 28 | #assign(paste0("df_", i), df) 29 | if (nrow(df) == 0) { 30 | df <- tmp 31 | } else { 32 | df <- bind_rows(df, tmp) 33 | } 34 | rm(tmp) 35 | gc() 36 | } 37 | 38 | write_csv(df, "../input/agg_v2.csv") 39 | 40 | -------------------------------------------------------------------------------- /Microsoft/Womens-Health-Risk-Assessment/README.md: -------------------------------------------------------------------------------- 1 | # Microsoft - Womens Health Risk Assessment Machine Learning Competition 2 | https://gallery.cortanaintelligence.com/Competition/Women-s-Health-Risk-Assessment-1 3 | 4 | # Problem 5 | ## Type of Problem: 6 | Supervised Multiclass Classification Problem 7 | 8 | ## Problem Description: 9 | 10 | The objective of this machine learning competition is to build machine learning models to assign a 
young woman subject (15-30 years old) in one of the 9 underdeveloped regions into a risk segment, and a subgroup within the segment. 11 | After the accurate assignments of the risk segment and subgroup in each region, a healthcare practitioner can deliver services to prevent the subject from the health risks, specifically sexual and reproductive health risks (like HIV infections). The types of services are personalized, based on the risk segment and subgroup assignments. 12 | 13 | ## Evaluation: 14 | Accuracy 15 | 16 | ## Score: 17 | Public Leaderboard : _87.316611_ Rank: _7_ / 493 18 | 19 | Private Leaderboard : _87.144886_ Rank: _7_ / 493 20 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/config.cfg: -------------------------------------------------------------------------------- 1 | # All file locations must be unix locations - FOLDERS MUST END IN '/' 2 | 3 | ########################### 4 | ##### MACHINE CONFIG ##### 5 | ########################## 6 | # When preprocessing_nthreads is set to 1, you will receive 7 | # more progress/speed updates. 8 | preprocessing_nthreads = 12 9 | model_nthreads = 12 10 | 11 | # Set to the folder where config.cfg resides 12 | BASE_DIR = '/path/to/directory/' 13 | 14 | # Location to store intermediate files (eg. models or processed features) - SSD suggested 15 | cache_loc = './cache/' 16 | 17 | # Location to put output files 18 | output_loc = './output/' 19 | 20 | ####################### 21 | ##### INPUT FILES ##### 22 | ####################### 23 | 24 | train_ItemInfo = './input/ItemInfo_train.csv' 25 | train_ItemPairs = './input/ItemPairs_train.csv' 26 | 27 | test_ItemInfo = './input/ItemInfo_test.csv' 28 | test_ItemPairs = './input/ItemPairs_test.csv' 29 | 30 | category_csv = './input/Category.csv' 31 | location_csv = './input/Location.csv' 32 | 33 | images_root = '/path/to/images/' 34 | -------------------------------------------------------------------------------- /HackerEarth/Predict Lanes from LIDAR data/final_1_calculateHaversineDistance.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(stringr) 3 | library(geosphere) 4 | 5 | label <- read_csv("../input/labels.csv") 6 | label$roadCoordinates <- NULL 7 | train <- read_csv("../input/train.csv") 8 | test <- read_csv("../input/test.csv") 9 | df_all <- bind_rows(train, test) 10 | 11 | getDis <- function(x) { 12 | x <- as.data.frame(matrix(as.numeric(unlist(strsplit(unlist(strsplit(x, "\\|")), " "))), byrow = T, ncol = 2)) 13 | x$V1 <- ifelse(x$V1 < -90, -90, x$V1) 14 | x$V2 <- ifelse(x$V2 < -90, -90, x$V2) 15 | x$V1 <- ifelse(x$V1 > 90, 90, x$V1) 16 | x$V2 <- ifelse(x$V2 > 90, 90, x$V2) 17 | x <- arrange(x, V1, V2)[c(1, nrow(x)), ] 18 | distHaversine(x[, 1], x[, 2]) 19 | } 20 | 21 | 22 | getHaversineDistance <- function(id) { 23 | median(sapply(df_all$laneLineCoordinates[df_all$roadId == id], getDis, USE.NAMES = F) ) 24 | } 25 | 26 | roads <- data_frame(roadId = unique(df_all$roadId)) 27 | roads$haversineDistance <- (sapply(roads$roadId, getHaversineDistance)) 28 | 29 | write_csv(roads, "../input/roadsDistance.csv") 30 | 31 | 32 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/README.md: -------------------------------------------------------------------------------- 1 | # AnalyticsVidhya Date Your Data Contest 2 | This repository contains the code used by me in the "Date Your Data Contest". 
This scored 0.7006 on the Private Leaderboard and secured 3rd position in the contest 3 | 4 | https://www.analyticsvidhya.com/blog/2016/03/winning-solutions-dyd-competition-xgboost-ruled/ 5 | 6 | A) Prerequisites 7 | 8 | Ensure that the following packages are installed: 9 | dplyr, tidy, xgboost, tm, SnowBallC, readr, qdap, stringr, stylo, caret 10 | 11 | B) Build Submission File 12 | 13 | 1) Ensure that all datasets i.e. Student, Internships, train and test are 14 | present in the "data" folder. Download from the link in the description of this repository 15 | 2) Execute the RScript "BUILD_FINAL_SUBMISSION.R" 16 | 3) Wait for some time to complete 17 | 4) Check the "Submissions" folder for the FINAL Submission file 18 | 19 | C) Improvements 20 | 21 | 1) I couldn't try other models for lack of time. Building other models might also be helpful 22 | 2) I have build a long list of features but I haven't removed the unnecessary files. Feature Selection would have us reduce the feature Sets 23 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/feature_verification.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import libavito as a 4 | from multiprocessing import Pool 5 | 6 | df1 = pd.read_csv('') 7 | df2 = pd.read_csv('') 8 | 9 | def find_best_feature(c): 10 | ftr = df1[c].values 11 | 12 | high_correl = 0 13 | high_ftr = '' 14 | num_995 = 0 15 | for c2 in df2.columns: 16 | cor = np.corrcoef(ftr, df2[c2])[0, 1] 17 | if cor > 0.995: 18 | num_995 += 1 19 | if cor > high_correl: 20 | high_correl = cor 21 | high_ftr = c2 22 | 23 | return high_correl, high_ftr, num_995 24 | 25 | for c in df1.columns: 26 | hc, hf, n995 = find_best_feature(c) 27 | 28 | if hc == 1: 29 | print(a.c.OKGREEN + (c + ' -> ' + hf).ljust(60) + ' | CORREL 1' + a.c.END) 30 | elif hc > 0.995: 31 | print(a.c.OKBLUE + (c + ' -> ' + hf).ljust(60) + ' | CORREL ' + str(hc) + a.c.END) 32 | elif hc > 0.95: 33 | print(a.c.WARNING + (c + ' -> ' + hf).ljust(60) + ' | CORREL ' + str(hc) + a.c.END) 34 | else: 35 | print(a.c.FAIL + (c + ' -> ???? 
').ljust(60) + ' | ' + str(hc) + ' ' + hf) 36 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set1d_interaction.R: -------------------------------------------------------------------------------- 1 | # This script is called by 3_feature_set1d_misc.R script 2 | # DO NOT CALL it Directly 3 | # Start Interaction feature script 4 | print("Starting Interaction feature script") 5 | featureList <- c("isMetroIdSame", "isLocationIDSame", "isRegionIDSame", "isLongitudeSame", "isLatitudeSame", "isTitleSame", "isdescriptionSame") 6 | featureList <- combn(featureList, 2) 7 | 8 | create_interaction <- function(x) { 9 | i <- x[1] 10 | j <- x[2] 11 | print(c(i, j)) 12 | columnName <- paste("interaction", i, j, sep = "_") 13 | set1d[[columnName]] <<- ifelse(set1d[[i]] == 1 & set1d[[j]] == 1, 1, 0) 14 | return(NULL) 15 | } 16 | apply(featureList, 2, create_interaction) 17 | 18 | set1d <- set1d[, grep("interaction", names(set1d), value = T)] #Filter only interaction features 19 | names(set1d) <- paste("set1d", names(set1d), sep = "_") 20 | 21 | 22 | ######## Add Primary Columns ItemID1 and ItemID2 23 | set1d <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], set1d) 24 | print("Saving Interaction features features") 25 | write_feather(set1d, paste(cache_loc, "/", "features_", trainOrTest, "_set1d_", "interaction.fthr", sep = "" )) 26 | 27 | #END 28 | 29 | -------------------------------------------------------------------------------- /AnalyticsVidhya/amexpert-2019-machine-learning-hackathon/code/agg_feature.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(lubridate) 3 | 4 | campaign_data <- read_csv("../input/campaign_data.csv") 5 | campaign_data$start_date <- dmy(campaign_data$start_date) 6 | campaign_data$end_date <- dmy(campaign_data$end_date) 7 | campaign_data <- arrange(campaign_data, start_date) 8 | 9 | 10 | customer_transaction_data <- read_csv("../input/customer_transaction_data.csv") 11 | 12 | 13 | x <- unique(customer_transaction_data$date)[1] 14 | campaignDates <- campaign_data$start_date 15 | roundToNearestCampaignDate <- function(x) { 16 | campaignDates[campaignDates > x][1] 17 | } 18 | 19 | df_dates <- tibble(date = unique(customer_transaction_data$date)) 20 | df_dates <- df_dates %>% 21 | rowwise() %>% 22 | mutate(nextCampaignDate = roundToNearestCampaignDate(date)) 23 | 24 | customer_transaction_data <- left_join(customer_transaction_data, df_dates, by = "date") 25 | 26 | customer_transaction_df <- customer_transaction_data %>% 27 | #head(100000) %>% 28 | group_by(nextCampaignDate, customer_id, item_id) %>% 29 | summarise(quantity_sum = sum(quantity, na.rm = T), 30 | selling_price_sum = sum(selling_price, na.rm = T), 31 | other_discount_sum = sum(other_discount, na.rm = T), 32 | coupon_discount_sum = sum(coupon_discount, na.rm = T)) 33 | 34 | write_csv(customer_transaction_df, "../input/customer_transaction_df.csv") 35 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/4_feature_internship_SkillsCoding.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(tidyr) 3 | library(readr) 4 | 5 | #LOAD DATA 6 | internship <- read_csv("../data/Internship.csv", na = c("", "NA", "NULL")) 7 | NCOL <- ncol(internship) 8 | 9 | #With the below code we checked how the words look like in the Skills column 10 | 
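#Keep only the most frequent skill tokens (top 5% by frequency rank), then drop single-character tokens and English stopwords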
unlist(strsplit(unlist(strsplit(internship$Skills_required, " ")), ",")) %>% 11 | table() %>% 12 | data.frame() %>% 13 | arrange(-Freq) %>% 14 | mutate(perc.weight = percent_rank(Freq)) %>% 15 | filter(perc.weight > 0.95) -> aList 16 | 17 | aList$NCHAR <- nchar(as.character(aList$.)) 18 | aList <- filter(aList, NCHAR > 1) 19 | StringsForSkills <- setdiff(as.character(aList$.), stopwords("english")) 20 | 21 | #Add 4 columns to Student dataframe 22 | internship$Skills_requiredCode <- NA 23 | 24 | for (i in StringsForSkills) { 25 | print(i) 26 | internship$Skills_requiredCode[grep(i, internship$Skills_required, ignore.case = TRUE)] <- i 27 | } 28 | 29 | ##Dummy Variables for StreamsCode 30 | for (i in c("Skills_requiredCode")) { 31 | print(i) 32 | for(level in unique(internship[[i]])){ 33 | internship[paste("dummy", i, level, sep = "_")] <- ifelse(internship[[i]] == level, 1, 0) 34 | } 35 | internship[[i]] <- NULL #Drop this column 36 | } 37 | 38 | 39 | #SAVE FILES 40 | write.csv(internship[, (NCOL+1):ncol(internship)], "../data/Features_internship_SkillsCode.csv", row.names = F) 41 | 42 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/3_feature_internship_Profile_Coding.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(tidyr) 3 | library(readr) 4 | library(tm) 5 | 6 | #LOAD DATA 7 | internship <- read_csv("../data/Internship_Processed.csv", na = c("", "NA", "NULL")) 8 | NCOL <- ncol(internship) 9 | 10 | #With the below code we checked how the words look like in the Skills column 11 | unlist(strsplit(unlist(strsplit(internship$Internship_Profile, " ")), ",")) %>% 12 | table() %>% 13 | data.frame() %>% 14 | arrange(-Freq) %>% 15 | mutate(perc.weight = percent_rank(Freq)) %>% 16 | filter(perc.weight > 0.95) -> aList 17 | 18 | aList$NCHAR <- nchar(as.character(aList$.)) 19 | aList <- filter(aList, NCHAR > 1) 20 | StringsForProfile <- setdiff(as.character(aList$.), stopwords("english")) 21 | 22 | #Add 4 columns to Student dataframe 23 | internship$InternshipProfile_Code <- NA 24 | 25 | for (i in StringsForProfile) { 26 | print(i) 27 | internship$InternshipProfile_Code[grep(i, internship$Internship_Profile, ignore.case = TRUE)] <- i 28 | } 29 | 30 | ##Dummy Variables for StreamsCode 31 | for (i in c("InternshipProfile_Code")) { 32 | print(i) 33 | for(level in unique(internship[[i]])){ 34 | internship[paste("dummy", i, level, sep = "_")] <- ifelse(internship[[i]] == level, 1, 0) 35 | } 36 | internship[[i]] <- NULL #Drop this column 37 | } 38 | 39 | 40 | #SAVE FILES 41 | write.csv(internship[, (NCOL+1):ncol(internship)], "../data/Features_internship_ProfileCode.csv", row.names = F) 42 | 43 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/6_feature_student_degreeCoding.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(tidyr) 3 | library(readr) 4 | 5 | #LOAD DATA 6 | student <- read_csv("../data/Student.csv", na = c("", "NA", "NULL")) 7 | 8 | #With the below code we checked how the words look like in the degree column 9 | #table(student$Degree) %>% data.frame() %>% arrange(-Freq) %>% View() 10 | 11 | #We will create 4 binary columns to identify the following: 12 | #1) IsUnderGraduate 13 | #2) IsPostGraduate 14 | #3) IsTechbackground 15 | #4) IsNonTechbackground 16 | 17 | StringsForUG <- c("BE|B.|Bachelor|Undergrad|BCA|UG|BBA|LLB") 18 | 19 | 
StringsForPG <- c("MBA|Management|M.|MCA|MBA|Post Graduate|Master|Ph.D") 20 | 21 | StringsForTech <- c("MCA|M.Tech|M. Tech|BCA|B.E.|B. E.|B.Tech|B. Tech|Science|Technology|Engineer|Software") 22 | 23 | StringsForNonTech <- c("MBA|Management|BBA|LLB|Business|Journalism|Mass|Arts|Pharma|Chartered|Dental|Social|English|Finance|Sports|Media|Fashion|Psychology") 24 | 25 | NCOL <- ncol(student) 26 | #Add 4 columns to Student dataframe 27 | student$IsUnderGraduate <- 0 28 | student$IsPostGraduate <- 0 29 | student$IsTechbackground <- 0 30 | student$IsNonTechbackground <- 0 31 | 32 | student$IsUnderGraduate[grep(StringsForUG, student$Degree, ignore.case = TRUE)] <- 1 33 | student$IsPostGraduate[grep(StringsForPG, student$Degree, ignore.case = TRUE)] <- 1 34 | student$IsTechbackground[grep(StringsForTech, student$Degree, ignore.case = TRUE)] <- 1 35 | student$IsNonTechbackground[grep(StringsForNonTech, student$Degree, ignore.case = TRUE)] <- 1 36 | 37 | #SAVE FILES 38 | write.csv(student[, (NCOL+1):ncol(student)], "../data/Features_student_DegreeCode.csv", row.names = F) 39 | 40 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/README.md: -------------------------------------------------------------------------------- 1 | # Kaggle Avito Duplicate Ad Detection Contest 2 | Winning Solution Blog : https://blog.kaggle.com/2016/08/31/avito-duplicate-ads-detection-winners-interview-2nd-place-team-the-quants-mikel-peter-marios-sonny/ 3 | 4 | Contest Link: https://www.kaggle.com/c/avito-duplicate-ads-detection/ 5 | 6 | Private Leaderboard Score - _0.95294_ ( Rank 2 / 548) 7 | 8 | Final solution of Avito Duplicate Ad Detection - TheQuants 9 | 10 | ##Prerequisites: 11 | **OS:** Any Linux Distribution (Ubuntu 14.04 Preferred) 12 | **RAM:** 128GB+ (64GB for feature extraction) 13 | **CPU:** 36 cores+ (Preferred) 14 | **GPU:** CUDA-compatible NVIDIA GPU with Compute Capability 3.5+ (TITAN X Preferred) 15 | **Storage:** 64GB+ (not including input data) - Images on SSD _highly recommended_ 16 | 17 | **R Version:** 3.1+ 18 | **R Packages:** data.table, dplyr, dummies, feather, Hmisc, igraph, jsonlite, parallel, raster, readr, reshape2, stringdist, stringr, stylo, textreuse, tidyr, tm, xgboost 19 | 20 | **Python Version:** 3.5.1 21 | **Python Packages:** scikit-learn, numpy, pandas, python-Levenshtein, codecs, OpenCV, feather-format, jellyfish, nltk, PIL, fuzzywuzzy, stop_words, haversine 22 | 23 | **Python Version:** 2.7.1 24 | **Python Packages:** scikit-learn, feather-format, numpy, pandas 25 | XGBoost (0.4.0) 26 | Keras (0.3.2) 27 | Theano (0.8.0rc1) 28 | 29 | ##How to Generate the Submission File 30 | 1) Update `config.cfg` and set all config parameters 31 | 2) Ensure all directories mentioned in config.cfg are write-able 32 | 3) Run `RunAll.sh` 33 | 34 | _Note_: In order to generate the full submission including models, it may take several weeks and needs at least 128GB of RAM 35 | -------------------------------------------------------------------------------- /HackerEarth/Predict Lanes from LIDAR data/README.md: -------------------------------------------------------------------------------- 1 | # Approach for [HackerEarth India Hacks Machine Learning Competition - Semi Finals](https://www.hackerearth.com/challenge/test/indiahacks-2017-machine-learning-round-2/) - (12-13 August 2017, Bangalore, India) 2 | ## (c) [Sonny Laskar](https://github.com/sonnylaskar) 3 | ## Model scored #1 on Public Leaderboard and #2 on Private Leaderboard 4 | 5 
| ## Pre-requisites: 6 | ``` 7 | R 3.3+ 8 | Packages: xgboost, tidyverse, feather, geosphere 9 | ``` 10 | ## Approach 11 | 12 | ### Directory 13 | ``` 14 | Create folders - code, input, output 15 | Copy all input files in input folder 16 | Copy all code files in code folder 17 | ``` 18 | 19 | ### Scripts 20 | Execute *Rscript final_1_calculateHaversineDistance.R* to calculate the length of each line by finding the Haversine distance between the two extreme coordinates for each line
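The core of that calculation is sketched below (simplified, not a line-for-line copy of the script; `coords` is assumed to be a numeric matrix with one row per point and columns longitude/latitude):

```r
library(geosphere)

# Approximate a lane line's length by the Haversine distance between its
# two extreme coordinates (after sorting the points).
line_length <- function(coords) {
  coords <- coords[order(coords[, 1], coords[, 2]), , drop = FALSE]
  extremes <- coords[c(1, nrow(coords)), ]
  distHaversine(extremes[1, ], extremes[2, ])  # distance in metres
}

# Toy example with two made-up coordinates
line_length(matrix(c(77.5946, 12.9716,
                     77.6010, 12.9790), byrow = TRUE, ncol = 2))
```

In the full script this is computed for every lane line and the median value per road is stored as `haversineDistance`.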
21 | 22 | Execute *Rscript final_2_buildData.R* 23 |
This script builds all features and prepares the data for the final model 24 | 25 | ### Feature Engineering: 26 | ``` 27 | sumOfDistanceFromLeft = Sum of all distances towards the left 28 | sumOfDistanceFromRight = Sum of all distances towards the right 29 | r_sumOfDistanceFromLR = Ratio of the above two 30 | int_distLR = Intersection between the left and right distances 31 | latCounter = Unique count of latitudes after rounding to 4 digits 32 | lonCounter = Unique count of longitudes after rounding to 4 digits 33 | uniq_linesLeft = Unique lines on the left 34 | uniq_linesRight = Unique lines on the right 35 | totalLaneLinesMean = Mean of the total lane lines 36 | haversineDistance = Haversine length of each lane line, aggregated per road (median), then scaled by dividing by the mean lane-line length 37 | [Refer to the feature importance plot (Rplot.png) for relative importance] 38 | ``` 39 | Execute *final_3_buildModel.R* to build the final model
40 | XGBoost models are built with 10 different seeds and their predictions are averaged. 41 | The final submission file will be written to the output folder. 42 | 43 |
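The seed-averaging step in *final_3_buildModel.R* follows the pattern below (a simplified sketch; `param`, `dtrain` and `dtest` are assumed to have been prepared as in that script):

```r
library(xgboost)

# Train the same multiclass XGBoost model with 10 different seeds and
# average the predicted class probabilities (simplified from
# final_3_buildModel.R; param, dtrain and dtest are assumed to exist).
seeds <- (1:10) * 1000
pred <- 0
for (s in seeds) {
  set.seed(s)
  bst <- xgb.train(params = param, data = dtrain, nrounds = 100)
  pred <- pred + predict(bst, dtest)
}
pred <- pred / length(seeds)  # averaged probabilities, one block per class
```

The class with the highest averaged probability is then chosen for each road.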
44 | Cheers :-) 45 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/7_feature_student_ExperienceCoding.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(tidyr) 3 | library(readr) 4 | library(tm) 5 | 6 | #LOAD DATA 7 | student <- read_csv("../data/Student.csv", na = c("", "NA", "NULL")) 8 | NCOL <- ncol(student) 9 | 10 | #Filter only the Experience-related columns 11 | student <- student[, c(1,15:19)] 12 | 13 | #####SECTION FOR EXPERIENCE ############ 14 | #Create columns for each type of Experience and make a single row for each Student ID 15 | student$Experience_Type[is.na(student$Experience_Type)] <- "NoExperience" 16 | student %>% 17 | select(Student_ID, Experience_Type) %>% 18 | mutate(yesno = 1) %>% 19 | distinct %>% 20 | spread(Experience_Type, yesno, fill = 0) -> studentExperience 21 | 22 | #####SECTION FOR PROFILE ############ 23 | unlist(strsplit(unlist(strsplit(student$Profile, " ")), ",")) %>% 24 | table() %>% 25 | data.frame() %>% 26 | arrange(-Freq) %>% 27 | mutate(perc.weight = percent_rank(Freq)) %>% 28 | filter(perc.weight > 0.98) -> aList 29 | 30 | aList$NCHAR <- nchar(as.character(aList$.)) 31 | aList <- filter(aList, NCHAR > 1) 32 | aList <- unique(tolower(stemDocument(as.character(aList$.)))) 33 | StringsForExperienceProfile <- setdiff(aList, stopwords("english")) 34 | 35 | student$Experience_Profile_Type <- NA 36 | for (i in StringsForExperienceProfile) { 37 | print(i) 38 | student$Experience_Profile_Type[grep(i, student$Profile, ignore.case = TRUE)] <- i 39 | } 40 | 41 | #Create columns for each type of Profile and make a single row for each Student ID 42 | student$Experience_Profile_Type[is.na(student$Experience_Profile_Type)] <- "NoProfile" 43 | student %>% 44 | select(Student_ID, Experience_Profile_Type) %>% 45 | mutate(yesno = 1) %>% 46 | distinct %>% 47 | spread(Experience_Profile_Type, yesno, fill = 0) -> studentExperienceProfile 48 | 49 | #JOIN 50 | studentExperience <- left_join(studentExperience, studentExperienceProfile, by = "Student_ID") 51 | #SAVE FILES 52 | write.csv(studentExperience, "../data/Features_student_Experience.csv", row.names = F) 53 | 54 | -------------------------------------------------------------------------------- /AnalyticsVidhya/AVDatafest_XtremeML/input/holiday.csv: -------------------------------------------------------------------------------- 1 | Date,f_Holiday 2 | 2010-01-01,1.0 3 | 2010-01-06,1.0 4 | 2010-04-02,1.0 5 | 2010-05-01,1.0 6 | 2010-08-15,1.0 7 | 2010-09-11,1.0 8 | 2010-10-12,1.0 9 | 2010-11-01,1.0 10 | 2010-12-06,1.0 11 | 2010-12-08,1.0 12 | 2010-12-25,1.0 13 | 2010-12-27,1.0 14 | 2011-01-01,1.0 15 | 2011-01-06,1.0 16 | 2011-04-22,1.0 17 | 2011-05-01,1.0 18 | 2011-08-15,1.0 19 | 2011-09-11,1.0 20 | 2011-10-12,1.0 21 | 2011-11-01,1.0 22 | 2011-12-06,1.0 23 | 2011-12-08,1.0 24 | 2011-12-25,1.0 25 | 2011-12-06,1.0 26 | 2012-01-01,1.0 27 | 2012-01-06,1.0 28 | 2012-03-19,1.0 29 | 2012-04-06,1.0 30 | 2012-05-01,1.0 31 | 2012-05-06,1.0 32 | 2012-08-15,1.0 33 | 2012-09-11,1.0 34 | 2012-10-12,1.0 35 | 2012-11-01,1.0 36 | 2012-12-06,1.0 37 | 2012-12-08,1.0 38 | 2012-12-25,1.0 39 | 2013-01-01,1.0 40 | 2013-01-06,1.0 41 | 2013-03-19,1.0 42 | 2013-03-29,1.0 43 | 2013-05-01,1.0 44 | 2013-05-05,1.0 45 | 2013-06-24,1.0 46 | 2013-08-15,1.0 47 | 2013-09-11,1.0 48 | 2013-10-12,1.0 49 | 2013-11-01,1.0 50 | 2013-12-06,1.0 51 | 2013-12-08,1.0 52 | 2013-12-25,1.0 53 | 2014-01-01,1.0 54 | 2014-01-06,1.0 55 | 
2014-03-19,1.0 56 | 2014-04-18,1.0 57 | 2014-04-21,1.0 58 | 2014-05-01,1.0 59 | 2014-05-04,1.0 60 | 2014-06-24,1.0 61 | 2014-08-15,1.0 62 | 2014-09-11,1.0 63 | 2014-10-12,1.0 64 | 2014-11-01,1.0 65 | 2014-12-06,1.0 66 | 2014-12-08,1.0 67 | 2014-12-25,1.0 68 | 2015-01-01,1.0 69 | 2015-01-06,1.0 70 | 2015-03-19,1.0 71 | 2015-04-03,1.0 72 | 2015-04-06,1.0 73 | 2015-05-01,1.0 74 | 2015-05-03,1.0 75 | 2015-06-24,1.0 76 | 2015-09-11,1.0 77 | 2015-10-12,1.0 78 | 2015-11-01,1.0 79 | 2015-12-06,1.0 80 | 2015-12-08,1.0 81 | 2015-12-25,1.0 82 | 2015-12-26,1.0 83 | 2016-01-01,1.0 84 | 2016-01-06,1.0 85 | 2016-03-19,1.0 86 | 2016-03-25,1.0 87 | 2016-03-28,1.0 88 | 2016-05-01,1.0 89 | 2016-05-16,1.0 90 | 2016-06-24,1.0 91 | 2016-08-15,1.0 92 | 2016-09-11,1.0 93 | 2016-10-12,1.0 94 | 2016-11-01,1.0 95 | 2016-12-06,1.0 96 | 2016-12-08,1.0 97 | 2016-12-25,1.0 98 | 2016-12-26,1.0 99 | 2017-01-01,1.0 100 | 2017-01-06,1.0 101 | 2017-03-19,1.0 102 | 2017-04-14,1.0 103 | 2017-04-17,1.0 104 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/2_feature_internship_Profile_WordCount.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(tidyr) 3 | library(readr) 4 | library(stylo) 5 | library(stringr) 6 | 7 | #LOAD DATA 8 | internship <- read_csv("../data/Internship_Processed.csv", na = c("", "NA", "NULL")) 9 | 10 | ######## 11 | getNGrams <- function(my.text, n = 1) { 12 | # which can be split into a vector of consecutive words: 13 | #my.vector.of.words = txt.to.words(my.text) #Removed this single is would replace all numbers 14 | #my.vector.of.words <- unlist(strsplit(gsub("\\s+", " ", str_trim(my.text)), " ")) 15 | my.vector.of.words <- unlist(strsplit(gsub("\\s+", " ", my.text), " ")) 16 | #my.vector.of.words <- unlist(strsplit(my.text, " ")) 17 | # now, we create a vector of word 2-grams: 18 | if (length(my.vector.of.words) >= n) { 19 | make.ngrams(my.vector.of.words, ngram.size = n) 20 | } else { 21 | return(NULL) 22 | } 23 | } 24 | 25 | ################################### 26 | getNgramsCount <- function(words, n) { 27 | ####################################### 28 | # COUNTING NGRAMS FEATURES 29 | ####################################### 30 | #Generate Ngrams 31 | NgramsProfile <- getNGrams(words, n) 32 | 33 | #Count of Ngrams 34 | countOfNgramsInProfile <- length(NgramsProfile) 35 | 36 | #Count of Unique NGrams 37 | countOfUniqueNgramsInProfile <- length(unique(NgramsProfile)) 38 | 39 | return(c(countOfNgramsInProfile, countOfUniqueNgramsInProfile)) 40 | } 41 | 42 | NCOL <- ncol(internship) 43 | for ( n in 1:2) { 44 | print(n) 45 | internship_words <- as.data.frame(t(mapply(getNgramsCount, internship$Internship_Profile, n))) 46 | colnames(internship_words) <- c(paste("countOf_", n, "_gramsInProfile", sep = ""), 47 | paste("countOfUnique_", n, "_gramsInProfile", sep = "") 48 | ) 49 | row.names(internship_words) <- NULL 50 | internship <- cbind(internship, internship_words) 51 | } 52 | 53 | 54 | write.csv(internship[, (NCOL+1):ncol(internship)], "../data/Features_internship_Profile_WordCount.csv", row.names = F) 55 | 56 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/libavito.py: -------------------------------------------------------------------------------- 1 | #### Copyright 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants 2 | #### Avito Duplicate Ad Detection 3 | # Author: Mikel 4 | # This 
file contains various functions which are used in multiple scripts 5 | 6 | from imp import load_source 7 | from time import time 8 | import sys 9 | 10 | # Terminal output colours for use in scripts 11 | class c: 12 | HEADER = '\033[95m' 13 | OKBLUE = '\033[94m' 14 | OKGREEN = '\033[92m' 15 | WARNING = '\033[93m' 16 | FAIL = '\033[91m' 17 | END = '\033[0m' 18 | BOLD = '\033[1m' 19 | UNDERLINE = '\033[4m' 20 | 21 | # Function to read the config file 22 | def read_config(): 23 | conf = load_source('config.cfg', 'config.cfg') 24 | conf.nthreads = conf.model_nthreads 25 | conf.debug = 0 26 | # except Exception as e: 27 | # #print(bcol.FAIL + 'Failed to parse config file:' + bcol.END) 28 | # print(e.message, e.args) 29 | # raise Exception(bcol.FAIL + 'Failed to parse config file:' + bcol.END) 30 | return conf 31 | 32 | # Just an alias 33 | def get_config(): 34 | return read_config() 35 | 36 | # Function which reads '--train' or '--test' launch arguments 37 | def get_mode(argv, name='Script'): 38 | if len(argv) != 2: 39 | raise RuntimeError(name + ' must be called with either --train or --test') 40 | if argv[1] == '--train': 41 | mode = 0 42 | elif argv[1] == '--test': 43 | mode = 1 44 | else: 45 | raise RuntimeError(name + ' must be called with either --train or --test') 46 | assert mode == 0 or mode == 1 47 | return mode 48 | 49 | # Function which prints current status and time remaining: 50 | def print_progress(k, start, o): 51 | if k != 0: 52 | dur_per_k = (time() - start) / k 53 | rem_dur = dur_per_k * (o - k) 54 | rem_mins = int(rem_dur / 60) 55 | rem_secs = rem_dur % 60 56 | toprint = str(k) + " items processed - " + str(rem_mins) + "m" + str(int(rem_secs)) + "s remaining. " 57 | sys.stdout.write(toprint + '\r') 58 | sys.stdout.flush() 59 | 60 | def print_elapsed(start): 61 | print(str(round(time() - start, 1)) + 's elapsed') 62 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/models/libavito.py: -------------------------------------------------------------------------------- 1 | #### Copyright 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants 2 | #### Avito Duplicate Ad Detection 3 | # Author: Mikel 4 | # This file contains various functions which are used in multiple scripts 5 | 6 | from imp import load_source 7 | from time import time 8 | import sys 9 | 10 | # Terminal output colours for use in scripts 11 | class c: 12 | HEADER = '\033[95m' 13 | OKBLUE = '\033[94m' 14 | OKGREEN = '\033[92m' 15 | WARNING = '\033[93m' 16 | FAIL = '\033[91m' 17 | END = '\033[0m' 18 | BOLD = '\033[1m' 19 | UNDERLINE = '\033[4m' 20 | 21 | # Function to read the config file 22 | def read_config(): 23 | conf = load_source('config.cfg', 'config.cfg') 24 | conf.nthreads = conf.model_nthreads 25 | conf.debug = 0 26 | # except Exception as e: 27 | # #print(bcol.FAIL + 'Failed to parse config file:' + bcol.END) 28 | # print(e.message, e.args) 29 | # raise Exception(bcol.FAIL + 'Failed to parse config file:' + bcol.END) 30 | return conf 31 | 32 | # Just an alias 33 | def get_config(): 34 | return read_config() 35 | 36 | # Function which reads '--train' or '--test' launch arguments 37 | def get_mode(argv, name='Script'): 38 | if len(argv) != 2: 39 | raise RuntimeError(name + ' must be called with either --train or --test') 40 | if argv[1] == '--train': 41 | mode = 0 42 | elif argv[1] == '--test': 43 | mode = 1 44 | else: 45 | raise RuntimeError(name + ' must be called with either --train or --test') 46 | assert mode == 0 or 
mode == 1 47 | return mode 48 | 49 | # Function which prints current status and time remaining: 50 | def print_progress(k, start, o): 51 | if k != 0: 52 | dur_per_k = (time() - start) / k 53 | rem_dur = dur_per_k * (o - k) 54 | rem_mins = int(rem_dur / 60) 55 | rem_secs = rem_dur % 60 56 | toprint = str(k) + " items processed - " + str(rem_mins) + "m" + str(int(rem_secs)) + "s remaining. " 57 | sys.stdout.write(toprint + '\r') 58 | sys.stdout.flush() 59 | 60 | def print_elapsed(start): 61 | print(str(round(time() - start, 1)) + 's elapsed') 62 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/5_feature_student_StreamsCoding.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(tidyr) 3 | library(readr) 4 | 5 | #LOAD DATA 6 | student <- read_csv("../data/Student.csv", na = c("", "NA", "NULL")) 7 | 8 | #With the below code we checked how the words look like in the degree column 9 | #table(student$Stream) %>% data.frame() %>% arrange(-Freq) %>% View() 10 | 11 | NCOL <- ncol(student) 12 | #We will create binary columns for most popular streams 13 | 14 | #Add the Temporary Column 15 | student$StreamCode <- NA 16 | 17 | StringsForStreams <- c("Computer", 18 | "Electronics", 19 | "Mechanical", 20 | "Commerce", 21 | "Information", 22 | "Marketing", 23 | "Electrical", 24 | "Civil", 25 | "Finance", 26 | "Arts", 27 | "Science", 28 | "Economics", 29 | "Humanities", 30 | "Management", 31 | "English", 32 | "Human", 33 | "Software", 34 | "Bio", 35 | "Mass", 36 | "Operations", 37 | "Architecture", 38 | "Instrumentation", 39 | "Mathematics", 40 | "Physics", 41 | "Media", 42 | "Accounts", 43 | "Statistics", 44 | "Chemistry", 45 | "Political Science", 46 | "Psychology", 47 | "Fashion", 48 | "journalism" 49 | ) 50 | 51 | for (i in StringsForStreams) { 52 | print(i) 53 | student$StreamCode[grep(i, student$Stream, ignore.case = TRUE)] <- i 54 | } 55 | 56 | ##Dummy Variables for StreamsCode 57 | for (i in c("StreamCode")) { 58 | print(i) 59 | for(level in unique(student[[i]])){ 60 | student[paste("dummy", i, level, sep = "_")] <- ifelse(student[[i]] == level, 1, 0) 61 | } 62 | student[[i]] <- NULL #Drop this column 63 | } 64 | 65 | #SAVE FILES 66 | write.csv(student[, (NCOL+1):ncol(student)], "../data/Features_student_StreamCode.csv", row.names = F) 67 | 68 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/9_model_XGB_1.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(tidyr) 3 | library(readr) 4 | library(xgboost) 5 | library(pROC) 6 | library(caret) 7 | 8 | #MODEL DESCRIPTION 9 | #XGBOOST MODEL SEED = 123 and NROUND = 526 10 | #LOAD DATA 11 | train <- read.csv("../data/train_processed.csv", header = TRUE, stringsAsFactors = FALSE) 12 | test <- read.csv("../data/test_processed.csv", header = TRUE, stringsAsFactors = FALSE) 13 | 14 | #DONT NEED THESE COLUMNS ANY MORE 15 | train$Earliest_Start_Date <- NULL 16 | train$Internship_deadline <- NULL 17 | train$Start_Date <- NULL 18 | train$End_Date <- NULL 19 | train$End.Date <- NULL 20 | train$Start.Date <- NULL 21 | 22 | test$Earliest_Start_Date <- NULL 23 | test$Internship_deadline <- NULL 24 | test$Start_Date <- NULL 25 | test$End_Date <- NULL 26 | test$End.Date <- NULL 27 | test$Start.Date <- NULL 28 | 29 | #Validation Set 30 | set.seed(123) 31 | inTrain <- createDataPartition(y = train$Is_Shortlisted, p = .70, 
list = FALSE) 32 | trainSet <- train[inTrain, ] 33 | validateSet <- train[-inTrain, ] 34 | ##### 35 | 36 | dtrain <- xgb.DMatrix(data = data.matrix(train[, c(2:ncol(train))]), 37 | label = data.matrix(train$Is_Shortlisted), 38 | missing=NA) 39 | dvalidate <- xgb.DMatrix(data = data.matrix(validateSet[, c(2:ncol(validateSet))]), 40 | label = data.matrix(validateSet$Is_Shortlisted), 41 | missing=NA) 42 | watchlist <- list(train = dtrain, test = dvalidate) 43 | param <- list("objective" = "binary:logistic", 44 | "eval_metric" = "auc", 45 | "eta" = 0.1, 46 | "max_depth" = 10, 47 | "subsample" = 1, 48 | "min_child_weight" = 1, 49 | "colsample_bytree" = 0.2 50 | ) 51 | cv.nround <- 526 52 | 53 | t <- Sys.time() 54 | set.seed(123) 55 | bst <- xgb.train(param = param, 56 | data = dtrain, 57 | nrounds = cv.nround, 58 | maximize = TRUE) 59 | print(Sys.time() - t) 60 | 61 | 62 | test_target_xgb <- predict(bst, 63 | data.matrix(test[, c(2:ncol(test))]), 64 | missing=NA) 65 | submission <- data.frame(Internship_ID = test$Internship_ID, 66 | Student_ID = test$Student_ID, 67 | Is_Shortlisted = test_target_xgb) 68 | write_csv(submission,"../Submissions/XGB_MODEL_S123_N526.csv") 69 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/10_model_XGB_1.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(tidyr) 3 | library(readr) 4 | library(xgboost) 5 | library(pROC) 6 | library(caret) 7 | 8 | #MODEL DESCRIPTION 9 | #XGBOOST MODEL SEED = 500 and NROUND = 710 10 | 11 | #LOAD DATA 12 | train <- read.csv("../data/train_processes.csv", header = TRUE, stringsAsFactors = FALSE) 13 | test <- read.csv("../data/test_processes.csv", header = TRUE, stringsAsFactors = FALSE) 14 | 15 | #DONT NEED THESE COLUMNS ANY MORE 16 | train$Earliest_Start_Date <- NULL 17 | train$Internship_deadline <- NULL 18 | train$Start_Date <- NULL 19 | train$End_Date <- NULL 20 | train$End.Date <- NULL 21 | train$Start.Date <- NULL 22 | 23 | test$Earliest_Start_Date <- NULL 24 | test$Internship_deadline <- NULL 25 | test$Start_Date <- NULL 26 | test$End_Date <- NULL 27 | test$End.Date <- NULL 28 | test$Start.Date <- NULL 29 | 30 | #Validation Set 31 | set.seed(123) 32 | inTrain <- createDataPartition(y = train$Is_Shortlisted, p = .70, list = FALSE) 33 | trainSet <- train[inTrain, ] 34 | validateSet <- train[-inTrain, ] 35 | ##### 36 | 37 | dtrain <- xgb.DMatrix(data = data.matrix(train[, c(2:ncol(train))]), 38 | label = data.matrix(train$Is_Shortlisted), 39 | missing=NA) 40 | dvalidate <- xgb.DMatrix(data = data.matrix(validateSet[, c(2:ncol(validateSet))]), 41 | label = data.matrix(validateSet$Is_Shortlisted), 42 | missing=NA) 43 | watchlist <- list(train = dtrain, test = dvalidate) 44 | param <- list("objective" = "binary:logistic", 45 | "eval_metric" = "auc", 46 | "eta" = 0.1, 47 | "max_depth" = 10, 48 | "subsample" = 1, 49 | "min_child_weight" = 1, 50 | "colsample_bytree" = 0.2 51 | ) 52 | cv.nround <- 710 53 | 54 | t <- Sys.time() 55 | set.seed(500) 56 | bst <- xgb.train(param = param, 57 | data = dtrain, 58 | nrounds = cv.nround, 59 | maximize = TRUE) 60 | print(Sys.time() - t) 61 | 62 | 63 | test_target_xgb <- predict(bst, 64 | data.matrix(test[, c(2:ncol(test))]), 65 | missing=NA) 66 | submission <- data.frame(Internship_ID = test$Internship_ID, 67 | Student_ID = test$Student_ID, 68 | Is_Shortlisted = test_target_xgb) 69 | write_csv(submission,"../Submissions/XGB_MODEL_S500_N710.csv") 70 | 
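#The two submission files written by 9_model_XGB_1.R and this script are combined by 11_Ensemble_Models.R, which takes their simple mean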
-------------------------------------------------------------------------------- /AnalyticsVidhya/AVDatafest_XtremeML/README.md: -------------------------------------------------------------------------------- 1 | # Winning Solution for Analytics Vidhya Machine Learning Competition - [Xtreme ML Hack](https://datahack.analyticsvidhya.com/contest/machine-learning-hackathon/) 2 | 3 | (c) [Sonny](https://github.com/sonnylaskar) 4 | 5 | This model scored 60.9 on the Public Leaderboard, 61.7 on the [Private Leaderboard]("https://datahack.analyticsvidhya.com/contest/machine-learning-hackathon/lb") and ranked #2. 6 | 7 | ## Prerequisites: 8 | 1. R version 3.3.3 9 | 2. R Packages: readr, lubridate, dplyr, tidyr, xgboost 10 | 11 | ## Problem Statement: 12 | The largest water supplier of Barcelona wants to leverage machine learning to effectively predict daywise-mediumwise-departmentwise breakdown of predictions of how many contacts (tickets/enquiries) would it receive and how many resolutions would it make so that they can size their team properly and improve customer satisfaction. 13 | 14 | ## Approach: 15 | While this looked to be a time-series problem, it did not work out for me to solve it by leveraging various time series modelling techniques like ARIMA, etc. Hence I switched to solving it with regression. But the issue was that the Test dataset was in future and literally no information was known in future. We were allowed to use external data in this contest and Holiday calender seemed to be an obvious parameter that should surely affect such problems. 16 | 17 | ### Feature Engineering: 18 | 1. Date features like weekday, quarter, etc. 19 | 2. Whether a Day was a holiday in Spain? 20 | 3. How many days were elaped since the last holiday (in rank_percent)? 21 | 4. Lagged features of # of contacts and resolutions of 75 days, 90 days and 120 days (Since the prediction to be made was upto 75 days in future, hence I decided not to include any lag value less than 75 days) 22 | 23 | ### Modeling: 24 | Xgboost is the first model that I try everytime I have to solve any such problem. As always, it gave a significant score. For cross validation, I used the last 4 months data. 25 | 26 | ## Steps to reproduce the submission: 27 | 1. Copy all Train files in the folder _"input/Train"_ 28 | 2. Copy all Test files in the folder _"input/Test"_ 29 | 3. External data: I used holiday list of Spain as an external data from [here](http://www.officeholidays.com/countries/spain/regional.php?list_year=2010&list_region=catalonia "Calender") 30 | 4. Ensure folder _"output"_ exists 31 | 5. Run the Rscript _final_model.R_ from the _code_ directory 32 | 6. 
The final files will be created in the _"output"_ folder 33 | 34 | Enjoy :smile: 35 | 36 | 37 | Regards 38 | 39 | Sonny 40 | -------------------------------------------------------------------------------- /HackerEarth/Predict Lanes from LIDAR data/final_2_buildData.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(feather) 3 | library(stringr) 4 | 5 | label <- read_csv("../input/labels.csv") 6 | label$roadCoordinates <- NULL 7 | train <- read_csv("../input/train.csv") 8 | test <- read_csv("../input/test.csv") 9 | df_all <- bind_rows(train, test) 10 | roadsDistance <- read_csv("../input/roadsDistance.csv") 11 | 12 | getLatLong <- function(x, t = "lat") { 13 | a <- matrix(as.numeric(unlist(strsplit(unlist(strsplit(x, "\\|")), " "))), byrow = T, ncol = 2) 14 | if (t == "lon") { 15 | apply(a, 2, mean)[1] 16 | } else { 17 | apply(a, 2, mean)[2] 18 | } 19 | } 20 | 21 | 22 | df_all$meanLat <- sapply(df_all$laneLineCoordinates, getLatLong, t = "lat", USE.NAMES = F) 23 | df_all$meanLon <- sapply(df_all$laneLineCoordinates, getLatLong, t = "lon", USE.NAMES = F) 24 | 25 | df_all %>% 26 | group_by(roadId) %>% 27 | summarise( 28 | sumOfDistanceFromLeft = sum(distFromLaneLineOnLeft, na.rm = T), 29 | sumOfDistanceFromRight = sum(distFromLaneLineOnRight, na.rm = T), 30 | r_sumOfDistanceFromLR = sumOfDistanceFromLeft / sumOfDistanceFromRight, 31 | int_distLR = length(intersect(distFromLaneLineOnLeft, distFromLaneLineOnRight)), 32 | 33 | latCounter = length(unique(round(meanLat, 4))), 34 | lonCounter = length(unique(round(meanLon, 4))), 35 | 36 | int_TotalLinesLR = length(intersect(totalLinesOnLeft, totalLaneLinesOnRight)), 37 | uniq_linesLeft = length(unique(totalLinesOnLeft)), 38 | uniq_linesRight = length(unique(totalLaneLinesOnRight)), 39 | totalLaneLinesMean = mean(totalLaneLines), 40 | totalLinesOnLeftMax = max(totalLinesOnLeft), 41 | 42 | uniq_lineId = length(unique(laneLineId)) / length((laneLineId)), 43 | roadCategory = unique(roadCategory), 44 | 45 | r_lineToRoadLength = sum(laneLineLength / roadLength < 0.8), 46 | r_lineToRoadLength2 = sum(laneLineLength / roadLength >= 0.8), 47 | laneLineLengthMean = mean(laneLineLength), 48 | 49 | sum_interSectingLines = sum(noOfIntersectingLaneLinesLeft, noOfIntersectingLaneLinesRight), 50 | noOfIntersectingLaneLinesLeftMean = mean(noOfIntersectingLaneLinesLeft), 51 | 52 | sum_isIntersectingWithRoadGeometryTrue = sum(isIntersectingWithRoadGeometry == "true"), 53 | sum_isIntersectingWithRoadGeometryFalse = sum(isIntersectingWithRoadGeometry == "false") 54 | ) -> df2 55 | 56 | 57 | 58 | df2$data <- ifelse(df2$roadId %in% train$roadId, "train", "test") 59 | df2 <- left_join(df2, roadsDistance, by = "roadId") 60 | df2$haversineDistance <- df2$haversineDistance / df2$laneLineLengthMean 61 | df2 <- left_join(df2, label, by = "roadId") 62 | 63 | write_feather(df2, "../input/df_all.fthr") 64 | 65 | -------------------------------------------------------------------------------- /HackerEarth/Predict Lanes from LIDAR data/final_3_buildModel.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(feather) 3 | library(xgboost) 4 | 5 | nthread <- parallel::detectCores() 6 | 7 | df_all <- read_feather("../input/df_all.fthr") 8 | TARGET <- "noOfLanes" 9 | NAString <- NA 10 | model_features <- setdiff(names(df_all), c("roadId", TARGET, "data")) 11 | 12 | df_all_train <- df_all[df_all$data == "train", ] 13 | df_all_test <- df_all[df_all$data == 
"test", ] 14 | #rm(df_all) 15 | gc() 16 | 17 | ####### XGBOOST ############ 18 | EARLY_STOPPING <- 100 19 | print.every.n <- 10 20 | df_all_train[[TARGET]] <- as.factor(df_all_train[[TARGET]] - 1) 21 | num_class <- length(levels(df_all_train[[TARGET]])) 22 | 23 | param <- list( 24 | objective = "multi:softprob", 25 | booster = "gbtree", 26 | eval_metric = "mlogloss", 27 | num_class = num_class, 28 | eta = 0.1, 29 | max_depth = 5, 30 | subsample = 0.9, 31 | min_child_weight = 1, 32 | colsample_bytree = 1.0, 33 | gamma = 0, 34 | nthread = nthread, 35 | num_parallel_tree = 2 36 | ) 37 | 38 | if (param$eval_metric != "auc") { 39 | isMaximize <- F 40 | } else { 41 | isMaximize <- T 42 | } 43 | nrounds <- 100 44 | seed <- (1:10)*1000 45 | 46 | dtrain <- xgb.DMatrix( data = data.matrix(df_all_train[, model_features]), 47 | label = data.matrix(df_all_train[[TARGET]]), 48 | missing = NAString) 49 | watchlist <- list(train = dtrain) 50 | 51 | t <- Sys.time() 52 | print(param) 53 | test_xgb_model <- rep(0, nrow(df_all_test)) 54 | for (s in seed) { 55 | cat("Generating XGB seed", s, "\n", sep = " ") 56 | set.seed(s) 57 | bst <- xgb.train( params = param, 58 | data = dtrain, 59 | nrounds = nrounds, 60 | verbose = 1, 61 | print_every_n = print.every.n, 62 | early_stopping_rounds = EARLY_STOPPING, 63 | watchlist = watchlist, 64 | maximize = isMaximize 65 | ) 66 | print(format(Sys.time() - t, format = "%H:%M") ) 67 | dtest <- xgb.DMatrix( data = data.matrix(df_all_test[, model_features]), 68 | missing = NAString) 69 | tmp <- predict(bst, dtest) 70 | tmp <- ifelse(tmp < 0, 0, tmp) 71 | test_xgb_model <- test_xgb_model + tmp 72 | } 73 | xgb_1 <- test_xgb_model / length(seed) 74 | 75 | 76 | xgb_1 <- apply(matrix(xgb_1, byrow = T, ncol = num_class), 1, which.max) 77 | xgb_1 <- data.frame(roadId = df_all_test$roadId, noOfLanes = xgb_1) 78 | write_csv(xgb_1, "../output/finalSubmission.csv") 79 | 80 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set4a_fuzzy.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Marios & Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 3_feature_set4a_fuzzy.py 5 | # Creates text features using the fuzzywuzzy python packages 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import sys 10 | import time 11 | import gc 12 | import feather 13 | from fuzzywuzzy import fuzz 14 | from multiprocessing import Pool 15 | 16 | import libavito as a 17 | 18 | def process_row(row): 19 | values = [] 20 | values.append(row[0]) 21 | values.append(row[1]) 22 | 23 | # Not black magic, iterate over title/description/json 24 | for d in [2, 4, 6]: 25 | st_1 = str(row[d]) 26 | st_2 = str(row[d + 1]) 27 | values.append(fuzz.partial_ratio(st_1, st_2)) 28 | values.append(fuzz.token_set_ratio(st_1, st_2)) 29 | values.append(fuzz.ratio(st_1, st_2)) 30 | values.append(fuzz.token_sort_ratio(st_1, st_2)) 31 | return values 32 | 33 | print(a.c.BOLD + 'Extracting set4a fuzzy text features ...' 
+ a.c.END) 34 | 35 | # Get train/test mode from launch argument 36 | mode = a.get_mode(sys.argv, '3_feature_set4a_fuzzy.py') 37 | 38 | ## Read settings required by script 39 | config = a.read_config() 40 | nthreads = config.preprocessing_nthreads 41 | cache_loc = config.cache_loc 42 | debug = config.debug 43 | if mode == 0: 44 | root = config.train_images_root 45 | df = feather.read_dataframe(cache_loc + 'train.fthr') 46 | if mode == 1: 47 | root = config.test_images_root 48 | df = feather.read_dataframe(cache_loc + 'test.fthr') 49 | 50 | df = df[['itemID_1', 'itemID_2', 'title_1', 'title_2', 'description_1', 'description_2', 'attrsJSON_1', 'attrsJSON_2']] 51 | 52 | ftrs = [] 53 | 54 | start = time.time() 55 | o = len(df.index) 56 | if nthreads == 1: 57 | print('Extracting features with 1 thread ...') 58 | k = 0 59 | # Iterate over files 60 | ftrs = [] 61 | for row in df.values: 62 | x = process_row(row) 63 | ftrs.append(x) 64 | k += 1 65 | if k % 100 == 0: 66 | a.print_progress(k, start, o) 67 | 68 | # Otherwise perform multi-threaded mapping 69 | else: 70 | print('Extracting features multi-threaded ... ', end='', flush=True) 71 | pool = Pool(nthreads) 72 | ftrs = pool.map(process_row, df.values) 73 | pool.close() 74 | gc.collect() 75 | 76 | a.print_elapsed(start) 77 | 78 | ftrs = pd.DataFrame(ftrs) 79 | cols = ['itemID_1', 'itemID_2'] + ['set4a_X' + str(i) for i in range(1, len(ftrs.columns.tolist()) - 1)] 80 | print(cols) 81 | ftrs.columns = cols 82 | 83 | # Save updated dataset 84 | if mode == 0: 85 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set4a_fuzzy.fthr') 86 | if mode == 1: 87 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set4a_fuzzy.fthr') 88 | 89 | a.print_elapsed(start) 90 | print('set4a extraction complete!') 91 | 92 | # Write status to status file so master script knows whether to proceed. 
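
# --- Illustrative example (not executed as part of the pipeline) ---
# A rough sketch of how the four fuzzywuzzy scores used by process_row() above differ.
# The strings below are made up and exact scores depend on the fuzzywuzzy version,
# so treat the numbers as indicative only:
#
#   from fuzzywuzzy import fuzz
#   s1 = 'iphone 6 16gb black'
#   s2 = 'black iphone 6 16gb'
#   fuzz.ratio(s1, s2)             # similarity of the raw strings, below 100 here
#   fuzz.partial_ratio(s1, s2)     # best-matching substring, tolerant of extra text on one side
#   fuzz.token_sort_ratio(s1, s2)  # tokens sorted before comparing, so a pure reordering scores 100
#   fuzz.token_set_ratio(s1, s2)   # set-based comparison, also tolerant of repeated tokens
#
# All four return integers in [0, 100]; process_row() emits them for each of the three
# field pairs (title, description, attrsJSON), giving the 12 set4a feature columns.
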
93 | f = open(cache_loc + 'status.txt', 'a') 94 | f.write('feature_set4a_OK\n') 95 | f.close() 96 | -------------------------------------------------------------------------------- /AnalyticsVidhya/amexpert-2019-machine-learning-hackathon/code/agg_feature_2.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(lubridate) 3 | 4 | campaign_data <- read_csv("../input/campaign_data.csv") 5 | campaign_data$start_date <- dmy(campaign_data$start_date) 6 | campaign_data$end_date <- dmy(campaign_data$end_date) 7 | campaign_data <- arrange(campaign_data, start_date) 8 | 9 | 10 | customer_transaction_data <- read_csv("../input/customer_transaction_data.csv") 11 | 12 | 13 | #x <- unique(customer_transaction_data$date) 14 | #campaignDates <- campaign_data$start_date 15 | #roundToNearestCampaignDate <- function(x) { 16 | # campaignDates[campaignDates > x][1] 17 | #} 18 | 19 | #df_dates <- tibble(date = unique(customer_transaction_data$date)) 20 | #df_dates <- df_dates %>% 21 | # rowwise() %>% 22 | # mutate(nextCampaignDate = roundToNearestCampaignDate(date)) 23 | 24 | #customer_transaction_data <- left_join(customer_transaction_data, df_dates, by = "date") 25 | 26 | #customer_transaction_df <- customer_transaction_data %>% 27 | #head(100000) %>% 28 | # group_by(nextCampaignDate, customer_id, item_id) %>% 29 | # summarise(quantity_sum = sum(quantity, na.rm = T), 30 | # selling_price_sum = sum(selling_price, na.rm = T), 31 | # other_discount_sum = sum(other_discount, na.rm = T), 32 | # coupon_discount_sum = sum(coupon_discount, na.rm = T)) 33 | 34 | #write_csv(customer_transaction_df, "../input/customer_transaction_df.csv") 35 | 36 | #df_dates <- tibble(campaignDates = campaignDates) 37 | #df_dates$date_1m <- df_dates$campaignDates - 30 38 | #df_dates$date_2m <- df_dates$campaignDates - 60 39 | 40 | for (i in unique(campaign_data$campaign_id)) { 41 | customer_transaction_data[[paste0("campaign_id_", i)]] <- campaign_data$start_date[campaign_data$campaign_id == i] 42 | } 43 | 44 | #[1] 26 27 28 29 30 1 2 3 4 5 6 7 8 9 10 11 12 13 16 17 18 19 20 21 22 23 24 25 45 | 46 | #customer_transaction_df <- tibble() 47 | for (i in unique(campaign_data$campaign_id)) { 48 | for (lagDays in c(seq(30, 30*12, 30))) { 49 | print(paste(i, lagDays)) 50 | customer_transaction_data$CampaignDate <- customer_transaction_data[[paste0("campaign_id_", i)]] 51 | tmp <- customer_transaction_data %>% 52 | filter(date >= CampaignDate - lagDays & date < CampaignDate) %>% 53 | group_by(CampaignDate, customer_id, item_id) %>% 54 | summarise(quantity_sum = sum(quantity, na.rm = T), 55 | selling_price_sum = sum(selling_price, na.rm = T), 56 | other_discount_sum = sum(other_discount, na.rm = T), 57 | coupon_discount_sum = sum(coupon_discount, na.rm = T), 58 | quantity_mean = mean(quantity, na.rm = T), 59 | selling_price_mean = mean(selling_price, na.rm = T), 60 | other_discount_mean = mean(other_discount, na.rm = T), 61 | coupon_discount_mean = mean(coupon_discount, na.rm = T)) 62 | 63 | 64 | if (nrow(tmp) > 0) { 65 | names(tmp)[-(1:3)] <- paste(names(tmp)[-(1:3)], lagDays, sep = "_") 66 | #customer_transaction_df <- bind_rows(customer_transaction_df, tmp) 67 | write_csv(tmp, paste("../input/agg_feat",i, lagDays, ".csv", sep = "_")) 68 | rm(tmp) 69 | } 70 | gc() 71 | } 72 | } 73 | 74 | #write_csv(customer_transaction_df, "../input/agg_feat_2.csv") 75 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad 
Detection/code/3_feature_set1g_capitalLetters.R: --------------------------------------------------------------------------------
1 | ################################################################################################
2 | ################################################################################################
3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants
4 | #### Competition: Avito Duplicate Ad Detection
5 | # Filename : 3_feature_set1g_capitalLetters.R
6 | # Description: This Rscript generates Capital Letter Features
7 | # Usage:
8 | # Rscript ./code/3_feature_set1g_capitalLetters.R train
9 | # Rscript ./code/3_feature_set1g_capitalLetters.R test
10 | # Default argument is test
11 | ################################################################################################
12 | ################################################################################################
13 | 
14 | args <- commandArgs(trailingOnly = F)
15 | BASE <- normalizePath(dirname(sub("^--file=", "", args[grep("^--file=", args)])))
16 | 
17 | 
18 | # Source Config and functions.R file
19 | source(paste(BASE, "/../config.cfg", sep = ""))
20 | source(paste(BASE_DIR, "/code/functions.R", sep = ""))
21 | 
22 | #Load any additional packages
23 | library(parallel)
24 | 
25 | # Read argument for train or test
26 | trainOrTest <- commandArgs(trailingOnly = TRUE)
27 | if (length(trainOrTest) > 1) {
28 |   stop("ERROR: I need only 1 argument : train or test")
29 | }
30 | 
31 | if (length(trainOrTest) == 0) {
32 |   print("No Arguments passed, Assuming you mean test")
33 |   trainOrTest <- "test"
34 | }
35 | 
36 | #Load data
37 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "")
38 | cat("Reading file ", FILENAME, "\n", sep = " ")
39 | dat <- read_csv(FILENAME)
40 | 
41 | 
42 | #Function to generate Capital Letter count features for one text field
43 | getCapitalLetterFeatures <- function(x) {
44 |   wordsWithCapitalLetters <- length(grep("[[:upper:]]", unlist(strsplit(x, " "))))
45 |   countOfCapitalLetters <- length(grep("[[:upper:]]", unlist(strsplit(x, ""))))
46 |   return(c(wordsWithCapitalLetters, countOfCapitalLetters))
47 | }
48 | 
49 | df2 <- data.frame(ID = 1:nrow(dat)) #Else cbind will not work
50 | for (Field in c("title_1", "title_2", "description_1", "description_2")) {
51 |   print(Field)
52 |   df2_temp <- as.data.frame(t(mcmapply(getCapitalLetterFeatures, dat[[Field]], USE.NAMES = F)))
53 |   names(df2_temp) <- c(paste("wordsWithCapitalLetters", Field, sep = "_"), paste("countOfCapitalLetters", Field, sep = "_"))
54 |   df2 <- cbind(df2, df2_temp)
55 | }
56 | for (i in c("title", "description")) {
57 |   for (j in c("wordsWithCapitalLetters", "countOfCapitalLetters")) {
58 |     #print(c(i,j))
59 |     NewField1 <- paste(j, "_", i,"_1", sep = "")
60 |     NewField2 <- paste(j, "_", i,"_2", sep = "")
61 |     #print(c(NewField1,NewField2))
62 |     NewFieldName <- paste("ratio", NewField1, NewField2, sep = "_")
63 |     print(NewFieldName)
64 |     df2[[NewFieldName]] <- df2[[NewField1]] / df2[[NewField2]]
65 |     df2[[NewFieldName]] <- round(ifelse(df2[[NewFieldName]] > 1, 1/df2[[NewFieldName]], df2[[NewFieldName]]), 2) #Fold ratios above 1 (x -> 1/x) so the feature is symmetric in the two items
66 |   }
67 | }
68 | 
69 | df2$ID <- NULL
70 | names(df2) <- paste("set1g", names(df2), sep = "_")
71 | 
72 | 
73 | ######## Add Primary Columns ItemID1 and ItemID2
74 | df2 <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df2)
75 | print("Saving Capital Letter features")
76 | write_feather(df2, paste(cache_loc, "/", "features_", trainOrTest, "_set1g_", "capitalLetters.fthr", sep = "" ))
77 | 
78 | #END
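
# --- Illustrative example (not executed as part of the pipeline) ---
# A small, made-up example of the two raw counts and the folded ratio above:
#
#   getCapitalLetterFeatures("New iPhone 6S")
#   # returns c(3, 3): three words contain a capital letter, three capital letters in total
#
# If title_1 has 3 capital letters and title_2 has 12, the raw ratio is 3/12 = 0.25;
# folding values above 1 (x -> 1/x) keeps every ratio in [0, 1] and makes the feature
# symmetric, so swapping the two ads in a pair gives the same value.
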
79 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set4b_fuzzy_clean.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Marios & Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 3_feature_set4b_fuzzy_clean.py 5 | # Creates clean text features using the fuzzywuzzy python packages 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import sys 10 | import jellyfish 11 | import feather 12 | import time 13 | import gc 14 | from fuzzywuzzy import fuzz 15 | from multiprocessing import Pool 16 | 17 | import libavito as a 18 | 19 | def process_row(row): 20 | values = [] 21 | values.append(row[0]) 22 | values.append(row[1]) 23 | 24 | # iterate over cleaned title/descs/jsons 25 | for d in [2, 4, 6]: 26 | s1 = str(row[d]) 27 | s2 = str(row[d + 1]) 28 | values.append(jellyfish.levenshtein_distance(s1, s2)) 29 | values.append(jellyfish.jaro_distance(s1, s2)) 30 | #values.append(float(jellyfish.damerau_levenshtein_distance(s1,s2)) ) 31 | values.append(fuzz.partial_ratio(s1, s2)) 32 | values.append(fuzz.token_set_ratio(s1, s2)) 33 | values.append(fuzz.ratio(s1, s2)) 34 | values.append(fuzz.token_sort_ratio(s1, s2)) 35 | return values 36 | 37 | print(a.c.BOLD + 'Extracting set4b fuzzy cleaned text features ...' + a.c.END) 38 | 39 | # Get train/test mode from launch argument 40 | mode = a.get_mode(sys.argv, '3_feature_set4b_fuzzy_clean.py') 41 | 42 | ## Read settings required by script 43 | config = a.read_config() 44 | nthreads = config.preprocessing_nthreads 45 | cache_loc = config.cache_loc 46 | debug = config.debug 47 | if mode == 0: 48 | root = config.train_images_root 49 | df = feather.read_dataframe(cache_loc + 'train.fthr') 50 | if mode == 1: 51 | root = config.test_images_root 52 | df = feather.read_dataframe(cache_loc + 'test.fthr') 53 | 54 | df = df[['itemID_1', 'itemID_2', 'cleantitle_1', 'cleantitle_2', 'cleandesc_1', 'cleandesc_2', 'cleanjson_1', 'cleanjson_2']] 55 | 56 | ftrs = [] 57 | 58 | start = time.time() 59 | o = len(df.index) 60 | if nthreads == 1: 61 | print('Extracting features with 1 thread ...') 62 | k = 0 63 | # Iterate over files 64 | ftrs = [] 65 | for row in df.values: 66 | x = process_row(row) 67 | ftrs.append(x) 68 | k += 1 69 | if k % 100 == 0: 70 | a.print_progress(k, start, o) 71 | 72 | # Otherwise perform multi-threaded mapping 73 | else: 74 | print('Extracting features multi-threaded ... ', end='', flush=True) 75 | pool = Pool(nthreads) 76 | ftrs = pool.map(process_row, df.values) 77 | pool.close() 78 | gc.collect() 79 | 80 | a.print_elapsed(start) 81 | 82 | ftrs = pd.DataFrame(ftrs) 83 | cols = ['itemID_1', 'itemID_2'] + ['set4b_X' + str(i) for i in range(1, len(ftrs.columns.tolist()) - 1)] 84 | print(cols) 85 | ftrs.columns = cols 86 | 87 | # Save updated dataset 88 | if mode == 0: 89 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set4b_fuzzy_clean.fthr') 90 | if mode == 1: 91 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set4b_fuzzy_clean.fthr') 92 | 93 | a.print_elapsed(start) 94 | print('set4b extraction complete!') 95 | 96 | # Write status to status file so master script knows whether to proceed. 
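
# --- Illustrative example (not executed as part of the pipeline) ---
# set4b mirrors set4a but works on the cleaned text and adds two jellyfish metrics.
# Note the different scales (the example strings are made up):
#
#   import jellyfish
#   jellyfish.levenshtein_distance('kitten', 'sitting')  # 3, an absolute edit count (unbounded)
#   jellyfish.jaro_distance('kitten', 'sitting')         # despite the name, a similarity in [0, 1], 1.0 only for identical strings
#
# The fuzzywuzzy scores that follow in process_row() are again integers in [0, 100],
# so the set4b columns mix distance-like and similarity-like values; tree-based models
# downstream are largely insensitive to this, but it matters when eyeballing the features.
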
97 | f = open(cache_loc + 'status.txt', 'a') 98 | f.write('feature_set4b_OK\n') 99 | f.close() 100 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set1f_SpecialCounting.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | ################################################################################################ 3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants 4 | #### Competition: Avito Duplicate Ad Detection 5 | # Filename : 3_feature_set1f_SpecialCounting.R 6 | # Description: This Rscript generates all Special Character Counting Features 7 | # Usage: 8 | # Rscript ./code/3_feature_set1f_SpecialCounting.R train 9 | # Rscript ./code/3_feature_set1f_SpecialCounting.R test 10 | # Default argument is test 11 | ################################################################################################ 12 | ################################################################################################ 13 | 14 | args <- commandArgs(trailingOnly = F) 15 | BASE <- normalizePath(dirname(sub("^--file=", "", args[grep("^--file=", args)]))) 16 | 17 | 18 | # Source Config and functions.R file 19 | source(paste(BASE, "/../config.cfg", sep = "")) 20 | source(paste(BASE_DIR, "/code/functions.R", sep = "")) 21 | 22 | #Load any additional packages 23 | library(parallel) 24 | library(stylo) 25 | library(stringr) 26 | library(tm) 27 | 28 | # Read argument for train or test 29 | trainOrTest <- commandArgs(trailingOnly = TRUE) 30 | if (length(trainOrTest) > 1) { 31 | stop("ERROR: I need only 1 argument : train or test") 32 | } 33 | 34 | if (length(trainOrTest) == 0) { 35 | print("No Arguments passed, Assuming you mean test") 36 | trainOrTest <- "test" 37 | } 38 | 39 | #Load data 40 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "") 41 | cat("Reading file ", FILENAME, "\n", sep = " ") 42 | dat <- read_csv(FILENAME) 43 | 44 | 45 | 46 | # Function to generate Features 47 | getFeatures <- function(x, type) { 48 | if (type == "digit") { 49 | lengths((regmatches(x, gregexpr("[[:digit:]]+",x)))) 50 | } else if (type == "cntrl") { 51 | lengths((regmatches(x, gregexpr("[[:cntrl:]]+",x)))) 52 | } else if (type == "graph") { 53 | lengths((regmatches(x, gregexpr("[[:graph:]]+",x)))) 54 | } else if (type == "punct") { 55 | lengths((regmatches(x, gregexpr("[[:punct:]]+",x)))) 56 | } else if (type == "xdigit") { 57 | lengths((regmatches(x, gregexpr("[[:xdigit:]]+",x)))) 58 | } else { 59 | return(NA) 60 | } 61 | } 62 | 63 | print("Generating Count Features") 64 | for (i in c("digit", "graph", "punct", "xdigit")) { 65 | for (j in c("cleantitle_1", "cleantitle_2", "cleandesc_1", "cleandesc_2")) { 66 | print(c(i,j)) 67 | assign( 68 | paste("countOf", i, "In", j , sep = "_"), 69 | sapply(dat[[j]], getFeatures, type = i, USE.NAMES = FALSE) 70 | ) 71 | } 72 | } 73 | 74 | print("Generating Ratio Features") 75 | for (i in c("_digit", "_graph_", "_punct_", "_xdigit_")) { 76 | for (j in c("title", "desc")) { 77 | print(c(i, j)) 78 | f_name <- grep(i, grep(j, ls(), value = T), value = T) 79 | ratio <- get(f_name[1]) / get(f_name[2]) 80 | ratio <- ifelse(ratio > 1, 1/ratio, ratio) 81 | assign( 82 | paste("ratioOfcountOf", i, "In", j , sep = "_"), 83 | round(ratio, 2) 84 | ) 85 | } 86 | } 87 | 88 | df_master <- as.data.frame(do.call(cbind, 
list(sapply(grep("countOf", ls(), value = T), get, USE.NAMES = T))))
89 | names(df_master) <- paste("set1f", names(df_master), sep = "_")
90 | 
91 | ######## Add Primary Columns ItemID1 and ItemID2
92 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master)
93 | print("Saving Special Counting features")
94 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1f_", "specialCounting.fthr", sep = "" ))
95 | 
96 | #END
97 | 
-------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/legacy/3_feature_set4e_count3way_clean.py: --------------------------------------------------------------------------------
1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants
2 | #### Author: Marios & Mikel
3 | #### Avito Duplicate Ad Detection
4 | # 3_feature_set4e_count3way_clean.py
5 | # Counts how many 3-random-grams in item1 appear in item2
6 | 
7 | import numpy as np
8 | import pandas as pd
9 | import sys
10 | import jellyfish
11 | import feather
12 | import time
13 | import gc
14 | import re
15 | import math
16 | from collections import Counter
17 | from fuzzywuzzy import fuzz
18 | from multiprocessing import Pool
19 | 
20 | import libavito as a
21 | 
22 | def count_3words(words, text):
23 |     # Count how many 3-word combinations from `words` have all three words present in `text`; returns -1 if either side has fewer than 3 tokens.
24 |     count3 = 0
25 |     if len(words) < 3 or len(text) < 3:
26 |         return -1
27 |     else:
28 |         for m in range(0, len(words) - 2):
29 |             words1 = words[m]
30 |             for n in range(m + 1, len(words) - 1):
31 |                 words2 = words[n]
32 |                 for z in range(m + 2, len(words)):
33 |                     words3 = words[z]
34 |                     if words1 in text and words2 in text and words3 in text:
35 |                         count3 += 1
36 |         return count3
37 | 
38 | def process_row(row):
39 | 
40 |     title = 2
41 |     desc = 4
42 |     json = 6
43 | 
44 |     values = []
45 | 
46 |     values.append(row[0])
47 |     values.append(row[1])
48 | 
49 |     for d in [title, desc, json]:
50 |         st_1 = str(row[d]).replace(":", " ").replace('"', ' ')
51 |         st_2 = str(row[d + 1]).replace(":", " ").replace('"', ' ')
52 |         values.append(count_3words(st_1.split(" "), st_2.split(" ")))
53 | 
54 |     return values
55 | 
56 | print(a.c.BOLD + 'Extracting set4e 3-way word count features ...' + a.c.END)
57 | 
58 | # Get train/test mode from launch argument
59 | mode = a.get_mode(sys.argv, '3_feature_set4e_count3way_clean.py')
60 | 
61 | ## Read settings required by script
62 | config = a.read_config()
63 | nthreads = config.preprocessing_nthreads
64 | cache_loc = config.cache_loc
65 | debug = config.debug
66 | if mode == 0:
67 |     root = config.train_images_root
68 |     df = feather.read_dataframe(cache_loc + 'train.fthr')
69 | if mode == 1:
70 |     root = config.test_images_root
71 |     df = feather.read_dataframe(cache_loc + 'test.fthr')[:1000]  # NOTE: only the first 1000 test rows are read here
72 | 
73 | df = df[['itemID_1', 'itemID_2', 'cleantitle_1', 'cleantitle_2', 'cleandesc_1', 'cleandesc_2', 'cleanjson_1', 'cleanjson_2']]
74 | 
75 | ftrs = []
76 | 
77 | start = time.time()
78 | o = len(df.index)
79 | if nthreads == 1:
80 |     print('Extracting features with 1 thread ...')
81 |     k = 0
82 |     # Iterate over files
83 |     ftrs = []
84 |     for row in df.values:
85 |         x = process_row(row)
86 |         ftrs.append(x)
87 |         k += 1
88 |         if k % 1 == 0:
89 |             a.print_progress(k, start, o)
90 | 
91 | # Otherwise perform multi-threaded mapping
92 | else:
93 |     print('Extracting features multi-threaded ... 
', end='', flush=True) 94 | pool = Pool(nthreads) 95 | ftrs = pool.map(process_row, df.values) 96 | pool.close() 97 | gc.collect() 98 | 99 | a.print_elapsed(start) 100 | 101 | ftrs = pd.DataFrame(ftrs) 102 | cols = ['itemID_1', 'itemID_2'] + ['set4e_X' + str(i) for i in range(1, len(ftrs.columns.tolist()) - 1)] 103 | print(cols) 104 | ftrs.columns = cols 105 | 106 | # Save updated dataset 107 | if mode == 0: 108 | feather.write_dataframe(ftrs, cache_loc + 'feature_set4e_train.fthr') 109 | if mode == 1: 110 | feather.write_dataframe(ftrs, cache_loc + 'feature_set4e_test.fthr') 111 | 112 | a.print_elapsed(start) 113 | print('set4e extraction complete!') 114 | 115 | # Write status to status file so master script knows whether to proceed. 116 | f = open(cache_loc + 'status.txt', 'a') 117 | f.write('feature_set4e_OK\n') 118 | f.close() 119 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set3d_json1.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Peter & Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 3_feature_set3d_json1.py 5 | # Creates json jaccard similarity 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import nltk 10 | import sklearn 11 | import json 12 | import sys 13 | import gc 14 | import feather 15 | from pandas.io.json import json_normalize 16 | import unicodedata 17 | from stop_words import get_stop_words 18 | import time 19 | 20 | import libavito as a 21 | 22 | stopwords = get_stop_words('ru') 23 | punctutation_cats = set(['Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po']) 24 | sno = nltk.stem.SnowballStemmer('russian') 25 | 26 | def get_clean_tokens(text): 27 | newtext = [] 28 | text0 = nltk.word_tokenize(text, 'russian') 29 | for y in text0: 30 | y = ''.join(x for x in y 31 | if unicodedata.category(x) not in punctutation_cats) 32 | if len(y) > 0 and y not in stopwords: 33 | newtext.append(sno.stem(y)) 34 | return newtext 35 | 36 | def jaccard_similarity(x, y): 37 | intersection_cardinality = len(set.intersection(*[set(x), set(y)])) 38 | union_cardinality = len(set.union(*[set(x), set(y)])) 39 | if union_cardinality == 0: 40 | return -1.0 41 | else: 42 | return intersection_cardinality / float(union_cardinality) 43 | 44 | def ratio_of_matches(x, y): 45 | intersection_cardinality = len(set.intersection(*[set(x), set(y)])) 46 | x_cardinality = len(x) 47 | if x_cardinality == 0: 48 | return -1.0 49 | else: 50 | return intersection_cardinality / float(x_cardinality) 51 | 52 | print(a.c.BOLD + 'Extracting set3d JSON features ...' 
+ a.c.END) 53 | 54 | # Get train/test mode from launch argument 55 | mode = a.get_mode(sys.argv, '3_feature_set3d_json1.py') 56 | 57 | ## Read settings required by script 58 | config = a.read_config() 59 | nthreads = config.preprocessing_nthreads 60 | cache_loc = config.cache_loc 61 | debug = config.debug 62 | if mode == 0: 63 | root = config.train_images_root 64 | df = feather.read_dataframe(cache_loc + 'train.fthr') 65 | if mode == 1: 66 | root = config.test_images_root 67 | df = feather.read_dataframe(cache_loc + 'test.fthr') 68 | 69 | train = df[['itemID_1', 'itemID_2', 'attrsJSON_1', 'attrsJSON_2']] 70 | del df 71 | gc.collect() 72 | 73 | train = train.fillna('') 74 | 75 | ftrs = [] 76 | 77 | print('Calculating features ...') 78 | t0 = time.time() 79 | for i in range(0, len(train.index)): 80 | if i % 10000 == 0: 81 | a.print_progress(i, t0, len(train.index)) 82 | try: 83 | jx = train.iloc[i]['attrsJSON_1'].lower() 84 | jy = train.iloc[i]['attrsJSON_2'].lower() 85 | resx = json.loads(jx) 86 | resy = json.loads(jy) 87 | similarkeys = jaccard_similarity(resx.keys(), resy.keys()) 88 | similarvals = jaccard_similarity(resx.values(), resy.values()) 89 | #out = str(train.iloc[i]['itemID_1']) + " " + str(train.iloc[i]['itemID_2']) + " " + str(similarkeys) + " " + str(similarvals)+ " " + str(len(resx)) + " " + str(len(resy)) + "\n" 90 | ftrs.append([train.iloc[i]['itemID_1'], train.iloc[i]['itemID_2'], similarkeys, similarvals, len(resx), len(resy)]) 91 | except: 92 | pass 93 | 94 | start = time.time() 95 | print('Caching data to disk ... ', end='', flush=True) 96 | ftrs = pd.DataFrame(ftrs) 97 | ftrs.columns = ['itemID_1', 'itemID_2', 'similarkeys', 'similarvals', 'nkey1', 'nkey2'] 98 | 99 | # Save updated dataset 100 | if mode == 0: 101 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set3d.fthr') 102 | if mode == 1: 103 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set3d.fthr') 104 | 105 | a.print_elapsed(start) 106 | print('set3d extraction complete!') 107 | 108 | # Write status to status file so master script knows whether to proceed. 
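
# --- Illustrative example (not executed as part of the pipeline) ---
# A worked example of the two similarities computed above, with made-up attrsJSON dicts:
#
#   x = {"color": "black", "size": "m"}
#   y = {"color": "white", "size": "m", "brand": "acme"}
#   jaccard_similarity(x.keys(), y.keys())      # {color, size} vs {color, size, brand} -> 2/3
#   jaccard_similarity(x.values(), y.values())  # {black, m} vs {white, m, acme}        -> 1/4
#
# Together with the two key counts len(resx) and len(resy), these form the set3d columns.
# Rows whose JSON fails to parse are skipped by the bare try/except above, so the output
# can contain fewer rows than the input pairs.
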
109 | f = open(cache_loc + 'status.txt', 'a') 110 | f.write('feature_set3d_OK\n') 111 | f.close() 112 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set3c_json.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Peter & Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 3_feature_set3c_json.py 5 | # Creates some features from clean jsons 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import nltk 10 | import sklearn 11 | import json 12 | import sys 13 | import gc 14 | import feather 15 | from pandas.io.json import json_normalize 16 | import unicodedata 17 | from stop_words import get_stop_words 18 | import time 19 | from multiprocessing import Pool 20 | 21 | import libavito as a 22 | 23 | stopwords = get_stop_words('ru') 24 | punctutation_cats = set(['Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po']) 25 | sno = nltk.stem.SnowballStemmer('russian') 26 | 27 | def get_clean_tokens(text): 28 | newtext = [] 29 | text0 = nltk.word_tokenize(text, 'russian') 30 | for y in text0: 31 | y = ''.join(x for x in y 32 | if unicodedata.category(x) not in punctutation_cats) 33 | if len(y) > 0 and y not in stopwords: 34 | newtext.append(sno.stem(y)) 35 | return newtext 36 | 37 | def jaccard_similarity(x, y): 38 | intersection_cardinality = len(set.intersection(*[set(x), set(y)])) 39 | union_cardinality = len(set.union(*[set(x), set(y)])) 40 | if union_cardinality == 0: 41 | return -1.0 42 | else: 43 | return intersection_cardinality / float(union_cardinality) 44 | 45 | def ratio_of_matches(x, y): 46 | intersection_cardinality = len(set.intersection(*[set(x), set(y)])) 47 | x_cardinality = len(x) 48 | if x_cardinality == 0: 49 | return -1.0 50 | else: 51 | return intersection_cardinality / float(x_cardinality) 52 | 53 | print(a.c.BOLD + 'Extracting set3c JSON features ...' + a.c.END) 54 | 55 | # Get train/test mode from launch argument 56 | mode = a.get_mode(sys.argv, '3_feature_set3c_json.py') 57 | 58 | ## Read settings required by script 59 | config = a.read_config() 60 | nthreads = config.preprocessing_nthreads 61 | cache_loc = config.cache_loc 62 | debug = config.debug 63 | if mode == 0: 64 | root = config.train_images_root 65 | df = feather.read_dataframe(cache_loc + 'train.fthr') 66 | if mode == 1: 67 | root = config.test_images_root 68 | df = feather.read_dataframe(cache_loc + 'test.fthr') 69 | 70 | train = df[['itemID_1', 'itemID_2', 'attrsJSON_1', 'attrsJSON_2']] 71 | del df 72 | gc.collect() 73 | 74 | train = train.fillna('') 75 | 76 | ftrs = [] 77 | 78 | def process_row(i): 79 | jx = get_clean_tokens(train.iloc[i]['attrsJSON_1']) 80 | jy = get_clean_tokens(train.iloc[i]['attrsJSON_2']) 81 | sim_j = jaccard_similarity(jx, jy) 82 | mat1_j = ratio_of_matches(jx, jy) 83 | mat2_j = ratio_of_matches(jy, jx) 84 | return [train.iloc[i]['itemID_1'], train.iloc[i]['itemID_2'], sim_j, mat1_j, mat2_j] 85 | 86 | t0 = time.time() 87 | if nthreads == 1: 88 | print('Extracting features with 1 thread ...') 89 | for i in range(0, len(train.index)): 90 | if i % 10000 == 0: 91 | a.print_progress(i, t0, len(train.index)) 92 | ftrs.append(process_row(i)) 93 | else: 94 | print('Extracting features multi-threaded ... 
', end='', flush=True) 95 | pool = Pool(nthreads) 96 | ftrs = pool.map(process_row, range(0, len(train.index))) 97 | pool.close() 98 | a.print_elapsed(t0) 99 | 100 | start = time.time() 101 | print('Caching data to disk ... ', end='', flush=True) 102 | ftrs = pd.DataFrame(ftrs) 103 | ftrs.columns = ['itemID_1', 'itemID_2', 'simjson', 'matjson1', 'matjson2'] 104 | 105 | # Save updated dataset 106 | if mode == 0: 107 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set3c.fthr') 108 | if mode == 1: 109 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set3c.fthr') 110 | 111 | a.print_elapsed(start) 112 | print('set3c extraction complete!') 113 | 114 | # Write status to status file so master script knows whether to proceed. 115 | f = open(cache_loc + 'status.txt', 'a') 116 | f.write('feature_set3c_OK\n') 117 | f.close() 118 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set3b_title.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Peter & Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 3_feature_set3b_title.py 5 | # Creates some features from clean titles 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import nltk 10 | import sklearn 11 | import json 12 | import sys 13 | import gc 14 | import feather 15 | from pandas.io.json import json_normalize 16 | import unicodedata 17 | from stop_words import get_stop_words 18 | import time 19 | from multiprocessing import Pool 20 | 21 | import libavito as a 22 | 23 | stopwords = get_stop_words('ru') 24 | punctutation_cats = set(['Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po']) 25 | sno = nltk.stem.SnowballStemmer('russian') 26 | 27 | def get_clean_tokens(text): 28 | newtext = [] 29 | text0 = nltk.word_tokenize(text, 'russian') 30 | for y in text0: 31 | y = ''.join(x for x in y 32 | if unicodedata.category(x) not in punctutation_cats) 33 | if len(y) > 0 and y not in stopwords: 34 | newtext.append(sno.stem(y)) 35 | return newtext 36 | 37 | def jaccard_similarity(x, y): 38 | intersection_cardinality = len(set.intersection(*[set(x), set(y)])) 39 | union_cardinality = len(set.union(*[set(x), set(y)])) 40 | if union_cardinality == 0: 41 | return -1.0 42 | else: 43 | return intersection_cardinality / float(union_cardinality) 44 | 45 | def ratio_of_matches(x, y): 46 | intersection_cardinality = len(set.intersection(*[set(x), set(y)])) 47 | x_cardinality = len(x) 48 | if x_cardinality == 0: 49 | return -1.0 50 | else: 51 | return intersection_cardinality / float(x_cardinality) 52 | 53 | print(a.c.BOLD + 'Extracting set3b title features ...' 
+ a.c.END) 54 | 55 | # Get train/test mode from launch argument 56 | mode = a.get_mode(sys.argv, '3_feature_set3b_title.py') 57 | 58 | ## Read settings required by script 59 | config = a.read_config() 60 | nthreads = config.preprocessing_nthreads 61 | cache_loc = config.cache_loc 62 | debug = config.debug 63 | if mode == 0: 64 | root = config.train_images_root 65 | df = feather.read_dataframe(cache_loc + 'train.fthr') 66 | if mode == 1: 67 | root = config.test_images_root 68 | df = feather.read_dataframe(cache_loc + 'test.fthr') 69 | 70 | train = df[['itemID_1', 'itemID_2', 'cleantitle_1', 'cleantitle_2']] 71 | del df 72 | gc.collect() 73 | 74 | train = train.fillna('') 75 | 76 | ftrs = [] 77 | 78 | def process_row(i): 79 | tx = train.iloc[i]['cleantitle_1'].split(' ') 80 | ty = train.iloc[i]['cleantitle_2'].split(' ') 81 | sim_t = jaccard_similarity(tx, ty) 82 | mat1_t = ratio_of_matches(tx, ty) 83 | mat2_t = ratio_of_matches(ty, tx) 84 | return [train.iloc[i]['itemID_1'], train.iloc[i]['itemID_2'], sim_t, mat1_t, mat2_t, len(tx), len(ty)] 85 | 86 | t0 = time.time() 87 | if nthreads == 1: 88 | print('Extracting features with 1 thread ...') 89 | for i in range(0, len(train.index)): 90 | if i % 10000 == 0: 91 | a.print_progress(i, t0, len(train.index)) 92 | ftrs.append(process_row(i)) 93 | else: 94 | print('Extracting features multi-threaded ... ', end='', flush=True) 95 | pool = Pool(nthreads) 96 | ftrs = pool.map(process_row, range(0, len(train.index))) 97 | pool.close() 98 | a.print_elapsed(t0) 99 | 100 | start = time.time() 101 | print('Caching data to disk ... ', end='', flush=True) 102 | ftrs = pd.DataFrame(ftrs) 103 | ftrs.columns = ['itemID_1', 'itemID_2', 'simtitle', 'mattitle1', 'mattitle2', 'nwords1', 'nwords2'] 104 | 105 | # Save updated dataset 106 | if mode == 0: 107 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set3b.fthr') 108 | if mode == 1: 109 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set3b.fthr') 110 | 111 | a.print_elapsed(start) 112 | print('set3b extraction complete!') 113 | 114 | # Write status to status file so master script knows whether to proceed. 
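
# --- Illustrative example (not executed as part of the pipeline) ---
# jaccard_similarity() is symmetric while ratio_of_matches() is not, which is why both
# directions are kept as separate title features. With made-up token lists:
#
#   tx = ['red', 'dress', 'size', 'm']
#   ty = ['red', 'dress']
#   jaccard_similarity(tx, ty)  # 2 shared tokens / 4 in the union -> 0.5
#   ratio_of_matches(tx, ty)    # 2 shared tokens / len(tx) = 4    -> 0.5
#   ratio_of_matches(ty, tx)    # 2 shared tokens / len(ty) = 2    -> 1.0
#
# The two directions diverge whenever one title is close to a subset of the other,
# for example when one ad's title is a truncated version of the other's.
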
115 | f = open(cache_loc + 'status.txt', 'a') 116 | f.write('feature_set3b_OK\n') 117 | f.close() 118 | -------------------------------------------------------------------------------- /AnalyticsVidhya/Date-your-Data/1_internship_WordCorrection.R: -------------------------------------------------------------------------------- 1 | library(qdap) 2 | library(dplyr) 3 | library(tidyr) 4 | library(readr) 5 | library(stringr) 6 | library(tm) 7 | 8 | #LOAD DATA 9 | internship <- read_csv("../data/Internship.csv", na = c("", "NA", "NULL")) 10 | 11 | SPELLINGERRORS <- check_spelling(internship$Internship_Profile, 12 | assume.first.correct = TRUE, 13 | n.suggests = 4) 14 | SPELLINGERRORS <- data.frame(lapply(SPELLINGERRORS, as.character), 15 | stringsAsFactors=FALSE) %>% 16 | select(not.found, suggestion) 17 | #Remove Duplicate rows 18 | SPELLINGERRORS <- SPELLINGERRORS[!duplicated(SPELLINGERRORS[1:2]), ] 19 | 20 | #Now check sort(SPELLINGERRORS$not.found) and see which are actual spelling mistakes, which are correct but need modification 21 | #Below are what I have observed: 22 | SPELL_MISTAKES <- c("activites", "ambassodor", "andoid", "andorid", "andriod", "anubhava","autid","bussiness","chemsitry", 23 | "coordinaing","cosnulting","develoment","developement","develpoment","enrolment","facilitation", 24 | "finanace","managemnt","managment","mangement","marekting","markting","notejs","nutritionist","oflline","optimaization", 25 | "optimization","optmization","pharmacovigilance","reasearch","recruiter","professonal","requirment","retreival","socia", 26 | "trbology","tution","varification","vertification","writitng") 27 | 28 | SPELLINGERRORS <- SPELLINGERRORS[(SPELLINGERRORS$not.found %in% SPELL_MISTAKES), ] 29 | SIMILAR_WORDS <- list( 30 | c("apps", "app"), 31 | c("Accounting", "Accountant"), 32 | c("back-end", "backend"), 33 | c("beckend", "backend"), 34 | c("back end", "backend"), 35 | c("blog", "blogger"), 36 | c("blogging", "blogger"), 37 | c("blogs", "blogger"), 38 | c("cataloguing" ,"catalogue"), 39 | c("curating", "curation"), 40 | c("desiging", "design"), 41 | c("desigining", "design"), 42 | c("designe", "design"), 43 | c("telecalling", "telecaller"), 44 | c("telecommunications", "telecom"), 45 | c("trbology" , "tribology"), 46 | c("oflline", "offline") 47 | ) 48 | m <- matrix(unlist(SIMILAR_WORDS), byrow = TRUE, ncol = 2) 49 | colnames(m) <- c("not.found", "suggestion") 50 | SPELLINGERRORS <- rbind(SPELLINGERRORS, m) 51 | 52 | #Function to replace Spelling errors 53 | replaceSpellingErrors <- function(words) { 54 | b <- c() 55 | for (i in unlist(strsplit(words, " "))) { 56 | if (i %in% SPELLINGERRORS$not.found) { 57 | b <- append(b, SPELLINGERRORS$suggestion[SPELLINGERRORS$not.found == i]) 58 | } else { 59 | b <- append(b, i) 60 | } 61 | } 62 | return(paste(b, collapse = " ")) 63 | } 64 | 65 | #Function to remove all unwanted stuff 66 | cleanUpText <- function(words, stem = TRUE) { 67 | #Remove all graph characters 68 | words <- str_replace_all(words,"[^[:graph:]]", " ") 69 | words <- gsub("[^[:alpha:][:space:]]*", "", words) 70 | words <- tolower(words) 71 | #Remove Punctuation except Hyphen - 72 | words <- gsub("([-])|[[:punct:]]", '\\1', words) 73 | #Remove all extra whitespace 74 | gsub("\\s+", " ", str_trim(words)) 75 | #Replace all spelling errors 76 | words <- replaceSpellingErrors(words) 77 | #Stemming if stem = TRUE 78 | stemList <- c() 79 | if (stem) { 80 | for (i in words) { 81 | i <- gsub("[[:punct:]]$", "", i) #Remove any trailing punctuation mark 82 | i <- gsub("^[[:punct:]]", 
"", i) #Remove any leading punctuation mark 83 | j <- paste(stemDocument(unlist(strsplit(i," "))), collapse = " ") 84 | stemList <- append(stemList, j) 85 | } 86 | return(stemList) 87 | } else { 88 | return(words) 89 | } 90 | } 91 | 92 | t <- Sys.time() 93 | for (i in c("Internship_Profile")) { 94 | print(i) 95 | #internship[[i]] <- cleanUpText(internship[[i]], stem = TRUE) 96 | internship[[i]] <- sapply(internship[[i]], cleanUpText) 97 | } 98 | print(Sys.time()-t) 99 | 100 | #Save file 101 | write.csv(internship, "../data/Internship_Processed.csv", row.names = FALSE) 102 | 103 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set3a_description.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Peter & Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 3_feature_set3a_description.py 5 | # Creates some features from clean descriptions 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import nltk 10 | import sklearn 11 | import json 12 | import sys 13 | import gc 14 | import feather 15 | from pandas.io.json import json_normalize 16 | import unicodedata 17 | from stop_words import get_stop_words 18 | import time 19 | from multiprocessing import Pool 20 | 21 | import libavito as a 22 | 23 | stopwords = get_stop_words('ru') 24 | punctutation_cats = set(['Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po']) 25 | sno = nltk.stem.SnowballStemmer('russian') 26 | 27 | def get_clean_tokens(text): 28 | newtext = [] 29 | text0 = nltk.word_tokenize(text, 'russian') 30 | for y in text0: 31 | y = ''.join(x for x in y 32 | if unicodedata.category(x) not in punctutation_cats) 33 | if len(y) > 0 and y not in stopwords: 34 | newtext.append(sno.stem(y)) 35 | return newtext 36 | 37 | def jaccard_similarity(x, y): 38 | intersection_cardinality = len(set.intersection(*[set(x), set(y)])) 39 | union_cardinality = len(set.union(*[set(x), set(y)])) 40 | if union_cardinality == 0: 41 | return -1.0 42 | else: 43 | return intersection_cardinality / float(union_cardinality) 44 | 45 | def ratio_of_matches(x, y): 46 | intersection_cardinality = len(set.intersection(*[set(x), set(y)])) 47 | x_cardinality = len(x) 48 | if x_cardinality == 0: 49 | return -1.0 50 | else: 51 | return intersection_cardinality / float(x_cardinality) 52 | 53 | print(a.c.BOLD + 'Extracting set3a description features ...' 
+ a.c.END) 54 | 55 | # Get train/test mode from launch argument 56 | mode = a.get_mode(sys.argv, '3_feature_set3a_description.py') 57 | 58 | ## Read settings required by script 59 | config = a.read_config() 60 | nthreads = config.preprocessing_nthreads 61 | cache_loc = config.cache_loc 62 | debug = config.debug 63 | if mode == 0: 64 | root = config.train_images_root 65 | df = feather.read_dataframe(cache_loc + 'train.fthr') 66 | if mode == 1: 67 | root = config.test_images_root 68 | df = feather.read_dataframe(cache_loc + 'test.fthr') 69 | 70 | train = df[['itemID_1', 'itemID_2', 'cleandesc_1', 'cleandesc_2']] 71 | del df 72 | gc.collect() 73 | 74 | train = train.fillna('') 75 | 76 | ftrs = [] 77 | 78 | def process_row(i): 79 | dx = train.iloc[i]['cleandesc_1'].split(' ') 80 | dy = train.iloc[i]['cleandesc_2'].split(' ') 81 | sim_d = jaccard_similarity(dx, dy) 82 | mat1_d = ratio_of_matches(dx, dy) 83 | mat2_d = ratio_of_matches(dy, dx) 84 | return [train.iloc[i]['itemID_1'], train.iloc[i]['itemID_2'], sim_d, mat1_d, mat2_d, len(dx), len(dy)] 85 | 86 | # print('Calculating features ...') 87 | t0 = time.time() 88 | if nthreads == 1: 89 | print('Extracting features with 1 thread ...') 90 | for i in range(0, len(train.index)): 91 | if i % 10000 == 0: 92 | a.print_progress(i, t0, len(train.index)) 93 | ftrs.append(process_row(i)) 94 | else: 95 | print('Extracting features multi-threaded ... ', end='', flush=True) 96 | pool = Pool(nthreads) 97 | ftrs = pool.map(process_row, range(0, len(train.index))) 98 | pool.close() 99 | a.print_elapsed(t0) 100 | 101 | start = time.time() 102 | print('Caching data to disk ... ', end='', flush=True) 103 | ftrs = pd.DataFrame(ftrs) 104 | ftrs.columns = ['itemID_1', 'itemID_2', 'simdesc', 'mat1_d', 'mat2_d', 'nwords1', 'nwords2'] 105 | 106 | # Save updated dataset 107 | if mode == 0: 108 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set3a.fthr') 109 | if mode == 1: 110 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set3a.fthr') 111 | 112 | a.print_elapsed(start) 113 | print('set3a extraction complete!') 114 | 115 | # Write status to status file so master script knows whether to proceed. 
116 | f = open(cache_loc + 'status.txt', 'a') 117 | f.write('feature_set3a_OK\n') 118 | f.close() 119 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set1e_attribute.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | ################################################################################################ 3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants 4 | #### Competition: Avito Duplicate Ad Detection 5 | # Filename : 3_feature_set1e_attribute.R 6 | # Description: This Rscript generates all Attribute (Json) features 7 | # Usage: 8 | # Rscript ./code/3_feature_set1e_attribute.R train 9 | # Rscript ./code/3_feature_set1e_attribute.R test 10 | # Default argument is test 11 | ################################################################################################ 12 | ################################################################################################ 13 | args <- commandArgs(trailingOnly = F) 14 | BASE <- normalizePath(dirname(sub("^--file=", "", args[grep("^--file=", args)]))) 15 | 16 | 17 | # Source Config and functions.R file 18 | source(paste(BASE, "/../config.cfg", sep = "")) 19 | source(paste(BASE_DIR, "/code/functions.R", sep = "")) 20 | 21 | #Load any additional packages 22 | library(parallel) 23 | library(jsonlite) 24 | 25 | # Read argument for train or test 26 | trainOrTest <- commandArgs(trailingOnly = TRUE) 27 | if (length(trainOrTest) > 1) { 28 | stop("ERROR: I need only 1 argument : train or test") 29 | } 30 | 31 | if (length(trainOrTest) == 0) { 32 | print("No Arguments passed, Assuming you mean test") 33 | trainOrTest <- "test" 34 | } 35 | 36 | #Load data 37 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "") 38 | cat("Reading file ", FILENAME, "\n", sep = " ") 39 | dat <- read_csv(FILENAME) 40 | 41 | 42 | 43 | #Function to generate Attribute Features 44 | attribute_feature <- function(w) { 45 | x <- w[1] 46 | y <- w[2] 47 | if (is.na(x) | is.na(y) | x == "[]" | y == "[]") { 48 | return(rep(NA,8)) 49 | } 50 | x <- paste("[", x, "]", sep = "") 51 | y <- paste("[", y, "]", sep = "") 52 | x.df <- fromJSON(x, simplifyDataFrame = TRUE) 53 | y.df <- fromJSON(y, simplifyDataFrame = TRUE) 54 | N_Attr_x <- ncol(x.df) 55 | N_Attr_y <- ncol(y.df) 56 | if (N_Attr_x == 0 | N_Attr_y == 0) { 57 | return(rep(NA,8)) 58 | } 59 | L <- length(intersect(names(x.df), names(y.df))) 60 | ratioOfPercentageOfMatchingAttributesNames <- L / min(N_Attr_x, N_Attr_y) 61 | ratioOfPercentageOfMatchingAttributesValues <- NA 62 | c <- 0 63 | if (ratioOfPercentageOfMatchingAttributesNames > 0) { 64 | for (i in intersect(names(x.df), names(y.df))) { 65 | if (x.df[[i]] == y.df[[i]]) { 66 | c <- c + 1 67 | } 68 | } 69 | ratioOfPercentageOfMatchingAttributesValues <- c / L 70 | } 71 | numberOfAttributes_sum <- N_Attr_x + N_Attr_y 72 | numberOfAttributes_diff <- abs(N_Attr_x - N_Attr_y) 73 | numberOfAttributes_min <- min(N_Attr_x, N_Attr_y) 74 | numberOfAttributes_max <- max(N_Attr_x, N_Attr_y) 75 | 76 | return(c( 77 | numberOfAttributes_sum, 78 | numberOfAttributes_diff, 79 | numberOfAttributes_min, 80 | numberOfAttributes_max, 81 | L, 82 | ratioOfPercentageOfMatchingAttributesNames, 83 | c, 84 | ratioOfPercentageOfMatchingAttributesValues 85 | )) 86 | 87 | 88 | } 89 | 90 | print("Generating Features") 91 | 
#This can be made Parallel , I didnt do that as of now 92 | df_master <- as.data.frame(t(apply(dat[, c("cleanjson_1", "cleanjson_2")], 1, attribute_feature))) 93 | names(df_master) <- c( 94 | "numberOfAttributes_sum", 95 | "numberOfAttributes_diff", 96 | "numberOfAttributes_min", 97 | "numberOfAttributes_max", 98 | "NoOfMatchingAttributesNames", 99 | "ratioOfPercentageOfMatchingAttributesNames", 100 | "NoOfMatchingAttributesValues", 101 | "ratioOfPercentageOfMatchingAttributesValues" 102 | ) 103 | 104 | names(df_master) <- paste("set1e", names(df_master), sep = "_") 105 | 106 | ######## Add Primary Columns ItemID1 and ItemID2 107 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master) 108 | print("Saving Attributes features") 109 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1e_", "attributes.fthr", sep = "" )) 110 | 111 | #END 112 | -------------------------------------------------------------------------------- /Microsoft/Womens-Health-Risk-Assessment/Predict.R: -------------------------------------------------------------------------------- 1 | #(c) Sonny Laskar (sonnylaskar at gmail Dot Com) 2 | #Create a zip file with all packages which are not available in Microsoft Azure environment and upload the zip. 3 | #The zip file is available in "src" folder. My zip was named downloaded_packages.zip 4 | install.packages("src/downloaded_packages/stringi_1.1.1.zip", lib = ".", repos = NULL, verbose = TRUE) 5 | install.packages("src/downloaded_packages/magrittr_1.5.zip", lib = ".", repos = NULL, verbose = TRUE) 6 | install.packages("src/downloaded_packages/xgboost_0.4-4.zip", lib = ".", repos = NULL, verbose = TRUE) 7 | 8 | library(xgboost, lib.loc=".", verbose=TRUE) 9 | library(dplyr) 10 | library(gbm) 11 | library(randomForest) 12 | # Map 1-based optional input ports to variables 13 | dataset1 <- maml.mapInputPort(1) # class: data.frame 14 | dataset1$segment <- NULL 15 | dataset1$subgroup <- NULL 16 | cat("Original dim: ", dim(dataset1), "\n") 17 | 18 | 19 | encode_religion <- function(dat) { 20 | #Input: Character Vector for religion 21 | #Output: Numeric Vector 22 | dat <- ifelse(dat == "Buddhist", 1, dat) 23 | dat <- ifelse(dat == "Evangelical/Bo", 2, dat) 24 | dat <- ifelse(dat == "Hindu", 3, dat) 25 | dat <- ifelse(dat == "Jewish", 4, dat) 26 | dat <- ifelse(dat == "Muslim", 5, dat) 27 | dat <- ifelse(dat == "Other", 6, dat) 28 | dat <- ifelse(dat == "Other Christia", 7, dat) 29 | dat <- ifelse(dat == "Roman Catholic", 8, dat) 30 | dat <- ifelse(dat == "Russian/Easter", 9, dat) 31 | dat <- ifelse(dat == "Traditional/An", 10, dat) 32 | dat <- ifelse(dat == "", NA, dat) 33 | dat <- as.integer(dat) 34 | return(dat) 35 | } 36 | 37 | manual_encode_religion <- function(dat) { 38 | #Input: Character Vector for religion 39 | #Output: Numeric Vector 40 | RELIGION <- c("Hindu", "Evangelical/Bo", "Muslim", "Roman Catholic", "Other Christia", "Buddhist", "Russian/Easter", "Traditional/An", "Other", "Jewish") 41 | for (i in RELIGION) { 42 | c <- paste("religion", i, sep = ".") 43 | print(c) 44 | dat[[c]] <- ifelse(dat$religion == i, 1, 0) 45 | } 46 | dat$religion <- encode_religion(dat$religion) 47 | return(dat) 48 | } 49 | 50 | featureEngineering <- function(dat) { 51 | dat$INTNR <- NULL 52 | dat$geo <- as.integer(dat$geo) 53 | dat <- manual_encode_religion(dat) 54 | dat$segment <- NULL 55 | dat$subgroup <- NULL 56 | dat[is.na(dat)] <- -1 57 | dat$christian <- as.numeric(dat$christian) #Xgboost needs at least one column as numeric 58 | 
#Random Forest cannot handle / and space in colnames 59 | names(dat) <- gsub("/", "_", names(dat)) 60 | names(dat) <- gsub(" ", "_", names(dat)) 61 | 62 | return(dat) 63 | } 64 | dataset1 <- featureEngineering(dataset1) 65 | cat("New dim: ", dim(dataset1), "\n") 66 | 67 | 68 | sub <- data.frame(patientID = NULL, geo = NULL, class = NULL) 69 | for (GEO in 1:9) { 70 | print(GEO) 71 | dat <- dataset1[dataset1$geo == GEO, ] 72 | cat("New dim: ", dim(dat), "\n") 73 | if (nrow(dat) == 0) next 74 | patientID <- dat$patientID 75 | dat$patientID <- NULL 76 | 77 | if (GEO == 1) classes <- c("11","21","22") 78 | if (GEO == 2) classes <- c("11","12","21","22","31","41") 79 | if (GEO == 3) classes <- c("11","12","21","22") 80 | if (GEO == 4) classes <- c("11","12") 81 | if (GEO == 5) classes <- c("11","12","22","31","32") 82 | if (GEO == 6) classes <- c("11","12","21") 83 | if (GEO == 7) classes <- c("11","12","21","22","31") 84 | if (GEO == 8) classes <- c("11","21","31","41") 85 | if (GEO == 9) classes <- c("11","12","21","31","32") 86 | #LOAD XGB Model 87 | xgb_1000 <- readRDS(paste("src/downloaded_packages/xgb_geo_", GEO ,"_seed1000.model", sep = "")) 88 | 89 | xgb_test <- predict(xgb_1000, data.matrix(dat), missing=NA) 90 | xgb_test <- as.data.frame(matrix(xgb_test, 91 | nrow=nrow(dat), 92 | byrow = TRUE)) 93 | colnames(xgb_test) <- classes 94 | 95 | #LOAD RF Model 96 | rf_1000 <- readRDS(paste("src/downloaded_packages/rf_geo_", GEO ,"_seed1000.model", sep = "")) 97 | rf_test <- as.data.frame(predict(rf_1000, 98 | dat, 99 | type= "prob")) 100 | colnames(rf_test) <- classes 101 | 102 | #Combined Weightage 103 | final <- (xgb_test*0.4 + rf_test*0.6) 104 | final$NEW <- apply(final, 1, function(x) { 105 | m <- which.max(x) 106 | names(final)[m] 107 | }) 108 | sub <- rbind(sub, data.frame(patientID = patientID, geo = dat$geo, class = final$NEW)) 109 | } 110 | 111 | data.set <- data.frame(patientID = sub$patientID, 112 | Geo_Pred = sub$geo, 113 | Segment_Pred = as.integer(substring(sub$class, 1, 1)), 114 | Subgroup_Pred = as.integer(substring(sub$class, 2, 2)) 115 | ) 116 | 117 | print(str(data.set)) 118 | maml.mapOutputPort("data.set"); 119 | 120 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_json_to_cols.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Peter & Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 3_json_to_cols.py 5 | # Encodes json key similarity into a sparse format for feature extraction 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import sklearn 10 | import json 11 | from pandas.io.json import json_normalize 12 | import unicodedata 13 | import time 14 | import codecs 15 | import feather 16 | 17 | import libavito as a 18 | 19 | def jaccard_similarity(x, y): 20 | intersection_cardinality = len(set.intersection(*[set(x), set(y)])) 21 | union_cardinality = len(set.union(*[set(x), set(y)])) 22 | if union_cardinality == 0: 23 | return -1.0 24 | else: 25 | return intersection_cardinality / float(union_cardinality) 26 | 27 | ## Read settings required by script 28 | config = a.read_config() 29 | nthreads = config.preprocessing_nthreads 30 | cache_loc = config.cache_loc 31 | debug = config.debug 32 | df_train = feather.read_dataframe(cache_loc + 'train.fthr') 33 | df_test = feather.read_dataframe(cache_loc + 'test.fthr') 34 | 35 | df_train = 
df_train[['itemID_1', 'itemID_2', 'cleanjson_1', 'cleanjson_2']] 36 | df_test = df_test[['itemID_1', 'itemID_2', 'cleanjson_1', 'cleanjson_2']] 37 | 38 | df = pd.concat([df_train, df_test]) 39 | 40 | clean_jsons = df['cleanjson_1'].tolist() + df['cleanjson_2'].tolist() 41 | 42 | print('Creating key dict ... ') 43 | allkey = {} 44 | pa = 0 45 | t0 = time.time() 46 | for i in range(0, len(clean_jsons)): 47 | if i % 100000 == 0: 48 | a.print_progress(i, t0, len(clean_jsons)) 49 | try: 50 | jx = clean_jsons[i].replace("'", "") 51 | resx = json.loads(jx) 52 | for x in resx.keys(): 53 | if x in allkey: 54 | allkey[x] = allkey[x] + 1 55 | else: 56 | allkey[x] = 1 57 | except KeyboardInterrupt: 58 | raise 59 | except Exception as e: 60 | pa += 1 61 | 62 | t0 = time.time() 63 | print('Transforming key dict ... ', end='', flush=True) 64 | icount = 0 65 | keydict = {} 66 | for k, n in allkey.items(): 67 | keydict[k] = icount 68 | icount += 1 69 | a.print_elapsed(t0) 70 | 71 | ftrs_train = [] 72 | print('Generating for train ... ') 73 | t0 = time.time() 74 | pa = 0 75 | for i in range(0, len(df_train.index)): 76 | if i % 10000 == 0: 77 | a.print_progress(i, t0, len(df_train.index)) 78 | try: 79 | jx = df_train.iloc[i]['cleanjson_1'].replace("'", "") 80 | jy = df_train.iloc[i]['cleanjson_2'].replace("'", "") 81 | resx = json.loads(jx) 82 | resy = json.loads(jy) 83 | except KeyboardInterrupt: 84 | raise 85 | except: 86 | continue 87 | 88 | if resx != [] and resy != []: 89 | for key in set.union(*[set(resx.keys()), set(resy.keys())]): 90 | if key in resx.keys() and key in resy.keys(): 91 | c = resx[key] 92 | b = resy[key] 93 | res = jaccard_similarity(c, b) 94 | else: 95 | res = -1 96 | ftrs_train.append([df_train.iloc[i]['itemID_1'], df_train.iloc[i]['itemID_2'], str(keydict[key]), str(res)]) 97 | else: 98 | pa += 1 99 | 100 | ftrs_test = [] 101 | print('Generating for test ... ') 102 | t0 = time.time() 103 | for i in range(0, len(df_test.index)): 104 | if i % 10000 == 0: 105 | a.print_progress(i, t0, len(df_test.index)) 106 | try: 107 | jx = df_test.iloc[i]['cleanjson_1'].replace("'", '') 108 | jy = df_test.iloc[i]['cleanjson_2'].replace("'", '') 109 | resx = json.loads(jx) 110 | resy = json.loads(jy) 111 | except KeyboardInterrupt: 112 | raise 113 | except: 114 | continue 115 | 116 | if resx != [] and resy != []: 117 | for key in set.union(*[set(resx.keys()), set(resy.keys())]): 118 | if key in resx.keys() and key in resy.keys(): 119 | c = resx[key] 120 | b = resy[key] 121 | res = jaccard_similarity(c, b) 122 | else: 123 | res = -1 124 | ftrs_test.append([df_test.iloc[i]['itemID_1'], df_test.iloc[i]['itemID_2'], str(keydict[key]), str(res)]) 125 | else: 126 | pa += 1 127 | 128 | print("\nError rows: " + str(pa)) 129 | 130 | print(len(ftrs_train)) 131 | print(len(ftrs_test)) 132 | 133 | print('Tranforming features ... ', end='', flush=True) 134 | t0 = time.time() 135 | ftrs_train = pd.DataFrame(ftrs_train) 136 | ftrs_test = pd.DataFrame(ftrs_test) 137 | ftrs_train.columns = ['itemID_1', 'itemID_2', 'keyID', 'value'] 138 | ftrs_test.columns = ['itemID_1', 'itemID_2', 'keyID', 'value'] 139 | a.print_elapsed(t0) 140 | 141 | print('Caching data to disk ... 
', end='', flush=True) 142 | t0 = time.time() 143 | feather.write_dataframe(ftrs_train, cache_loc + 'json_vals_train_v2.fthr') 144 | feather.write_dataframe(ftrs_test, cache_loc + 'json_vals_test_v2.fthr') 145 | a.print_elapsed(t0) 146 | 147 | print('json_to_cols Complete!') 148 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/functions.R: -------------------------------------------------------------------------------- 1 | #### Copyright 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants 2 | #### Avito Duplicate Ad Detection 3 | # functions.R 4 | # TODO: WRITE DESCRIPTION OF SCRIPT HERE 5 | 6 | #Load Basic packages needed by all R scripts 7 | library(readr) 8 | library(dplyr) 9 | library(tidyr) 10 | library(feather) 11 | 12 | ######## GET NGRAMS FUNCTIONS 13 | getNGrams <- function(my.text, n = 1) { 14 | # which can be split into a vector of consecutive words: 15 | my.vector.of.words <- stemDocument(unlist(strsplit(gsub("\\s+", " ", str_trim(my.text)), " "))) 16 | # now, we create a vector of word n-grams: 17 | if (length(my.vector.of.words) >= n) { 18 | make.ngrams(my.vector.of.words, ngram.size = n) 19 | } else { 20 | return(NULL) 21 | } 22 | } 23 | ######## GET NCHARS FUNCTIONS 24 | getNGramsChars <- function(my.text, n = 1) { 25 | # which can be split into a vector of consecutive words: 26 | my.vector.of.words <- stemDocument(unlist(strsplit(gsub("\\s+", " ", str_trim(my.text)), " "))) 27 | # now, we create a vector of word n-grams: 28 | if (length(my.vector.of.words) >= n) { 29 | my.vector.of.chars = txt.to.features(my.vector.of.words, features = "c") 30 | make.ngrams(my.vector.of.chars, ngram.size = n) 31 | } else { 32 | return(NULL) 33 | } 34 | } 35 | 36 | ## NGRAMS 37 | getNgramsCount <- function(string1, string2, n = 1) { 38 | ####################################### 39 | # COUNTING NGRAMS FEATURES 40 | ####################################### 41 | #Generate Ngrams 42 | NgramsString1 <- getNGrams(tolower(string1), n) 43 | NgramsString2 <- getNGrams(tolower(string2), n) 44 | 45 | #Count of Ngrams 46 | countOfNgramsInString1 <- length(NgramsString1) 47 | countOfNgramsInString2 <- length(NgramsString2) 48 | ratioOfNgrams_String1_String2 <- round(countOfNgramsInString1 / countOfNgramsInString2, 3) 49 | 50 | #Count of Unique NGrams 51 | countOfUniqueNgramsInString1 <- length(unique(NgramsString1)) 52 | countOfUniqueNgramsInString2 <- length(unique(NgramsString2)) 53 | ratioOfUniqueNgrams_String1_String2 <- round(countOfUniqueNgramsInString1 / countOfUniqueNgramsInString2, 3) 54 | 55 | ratioOfIntersect_Ngrams_String1_in_String2 <- round(sum(NgramsString1 %in% NgramsString2) / countOfNgramsInString1, 3) 56 | ratioOfIntersect_Ngrams_String2_in_String1 <- round(sum(NgramsString2 %in% NgramsString1) / countOfNgramsInString2, 3) 57 | 58 | countOfNgramsInString_min <- min( countOfNgramsInString1, countOfNgramsInString2 ) 59 | countOfNgramsInString_max <- max( countOfNgramsInString1, countOfNgramsInString2 ) 60 | countOfNgramsInString_sum <- ( countOfNgramsInString1 + countOfNgramsInString2 ) 61 | countOfNgramsInString_diff <- abs( countOfNgramsInString1 - countOfNgramsInString2 ) 62 | 63 | return(c( 64 | countOfNgramsInString_min, 65 | countOfNgramsInString_max, 66 | countOfNgramsInString_sum, 67 | countOfNgramsInString_diff, 68 | countOfNgramsInString1, 69 | countOfNgramsInString2, 70 | countOfUniqueNgramsInString1, 71 | countOfUniqueNgramsInString2, 72 | ratioOfNgrams_String1_String2, 73 | 
ratioOfUniqueNgrams_String1_String2, 74 | ratioOfIntersect_Ngrams_String1_in_String2, 75 | ratioOfIntersect_Ngrams_String2_in_String1 76 | )) 77 | } 78 | 79 | ## NCHARS 80 | getNcharsCount <- function(string1, string2, n = 1) { 81 | ####################################### 82 | # COUNTING Nchars FEATURES 83 | ####################################### 84 | #Generate Nchars 85 | NcharsString1 <- getNGramsChars(tolower(string1), n) 86 | NcharsString2 <- getNGramsChars(tolower(string2), n) 87 | 88 | #Count of Nchars 89 | countOfNcharsInString1 <- length(NcharsString1) 90 | countOfNcharsInString2 <- length(NcharsString2) 91 | ratioOfNchars_String1_String2 <- round(countOfNcharsInString1 / countOfNcharsInString2, 3) 92 | 93 | #Count of Unique Nchars 94 | countOfUniqueNcharsInString1 <- length(unique(NcharsString1)) 95 | countOfUniqueNcharsInString2 <- length(unique(NcharsString2)) 96 | ratioOfUniqueNchars_String1_String2 <- round(countOfUniqueNcharsInString1 / countOfUniqueNcharsInString2, 3) 97 | 98 | ratioOfIntersect_Nchars_String1_in_String2 <- round(sum(NcharsString1 %in% NcharsString2) / countOfNcharsInString1, 3) 99 | ratioOfIntersect_Nchars_String2_in_String1 <- round(sum(NcharsString2 %in% NcharsString1) / countOfNcharsInString2, 3) 100 | 101 | countOfNcharsInString_min <- min( countOfNcharsInString1, countOfNcharsInString2 ) 102 | countOfNcharsInString_max <- max( countOfNcharsInString1, countOfNcharsInString2 ) 103 | countOfNcharsInString_sum <- ( countOfNcharsInString1 + countOfNcharsInString2 ) 104 | countOfNcharsInString_diff <- abs(( countOfNcharsInString1 - countOfNcharsInString2 )) 105 | 106 | return(c( 107 | countOfNcharsInString_min, 108 | countOfNcharsInString_max, 109 | countOfNcharsInString_sum, 110 | countOfNcharsInString_diff, 111 | countOfNcharsInString1, 112 | countOfNcharsInString2, 113 | countOfUniqueNcharsInString1, 114 | countOfUniqueNcharsInString2, 115 | ratioOfNchars_String1_String2, 116 | ratioOfUniqueNchars_String1_String2, 117 | ratioOfIntersect_Nchars_String1_in_String2, 118 | ratioOfIntersect_Nchars_String2_in_String1 119 | )) 120 | } 121 | 122 | 123 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set3f_hamming.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Peter & Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 3_feature_set3f_hamming.py 5 | # Creates features from image dHashes 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import sys 10 | import feather 11 | import time 12 | import gc 13 | from multiprocessing import Pool 14 | 15 | import libavito as a 16 | 17 | def debug(s): 18 | print(str(s)) 19 | time.sleep(1) 20 | 21 | print(a.c.BOLD + 'Extracting set3f image hamming features ...' 
+ a.c.END) 22 | 23 | # Get train/test mode from launch argument 24 | mode = a.get_mode(sys.argv, '3_feature_set3f_hamming.py') 25 | 26 | ## Read settings required by script 27 | config = a.read_config() 28 | nthreads = config.preprocessing_nthreads 29 | cache_loc = config.cache_loc 30 | #debug = config.debug 31 | if mode == 0: 32 | df = feather.read_dataframe(cache_loc + 'train.fthr') 33 | if mode == 1: 34 | df = feather.read_dataframe(cache_loc + 'test.fthr') 35 | 36 | root = config.images_root 37 | image_db = feather.read_dataframe(cache_loc + 'image_database.fthr') 38 | 39 | df = df[['itemID_1', 'itemID_2', 'images_array_1', 'images_array_2']] 40 | 41 | start = time.time() 42 | print('Preparing imageDB ... ', end='', flush=True) 43 | image_db.index = image_db['image'] 44 | nhash = image_db['FreqOfHash'].to_dict() 45 | ihash = image_db['imagehash'].to_dict() 46 | a.print_elapsed(start) 47 | 48 | def process_row(row): 49 | id1 = row[0] 50 | id2 = row[1] 51 | array_x = row[2] 52 | array_y = row[3] 53 | 54 | if array_x is not None: 55 | aux_x = array_x.replace(' ', '').split(',') 56 | else: 57 | aux_x = [] 58 | if array_y is not None: 59 | aux_y = array_y.replace(' ', '').split(',') 60 | else: 61 | aux_y = [] 62 | 63 | icount = [] 64 | missing = 0 65 | minhamming = 99999 66 | minhamming30 = 99999 67 | minhamming50 = 99999 68 | minhamming100 = 99999 69 | #maxn = 0 70 | for k in range(0, 9): 71 | icount.append(0) 72 | 73 | # Find out if some images are repeated very often 74 | maxnx = 0 75 | maxny = 0 76 | for ix in aux_x: 77 | ix = int(ix) 78 | if ix in nhash: 79 | if maxnx < nhash[ix]: 80 | maxnx = nhash[ix] 81 | 82 | for iy in aux_y: 83 | iy = int(iy) 84 | if iy in nhash: 85 | if maxny < nhash[iy]: 86 | maxny = nhash[iy] 87 | 88 | for ix in aux_x: 89 | for iy in aux_y: 90 | if ix in ihash and iy in ihash: 91 | try: 92 | a = int('0x' + ihash[ix], 16) 93 | b = int('0x' + ihash[iy], 16) 94 | hamming = bin(a ^ b).count("1") 95 | if hamming < 9: 96 | icount[hamming] = icount[hamming] + 1 97 | 98 | if hamming < minhamming: 99 | minhamming = hamming 100 | 101 | if nhash[ix] < 100 and nhash[iy] < 100: 102 | if minhamming100 > hamming: 103 | minhamming100 = hamming 104 | 105 | if nhash[ix] < 30 and nhash[iy] < 30: 106 | if minhamming30 > hamming: 107 | minhamming30 = hamming 108 | 109 | if nhash[ix] < 50 and nhash[iy] < 50: 110 | if minhamming50 > hamming: 111 | minhamming50 = hamming 112 | 113 | except: 114 | pass 115 | #debug(['break', ix, iy]) 116 | else: 117 | #debug(['missing', ix, iy]) 118 | missing = missing + 1 119 | 120 | vals = [id1, id2] + icount + [missing, minhamming, maxnx, maxny, minhamming30, minhamming50, minhamming100] 121 | if min(len(aux_x), len(aux_y)) > 0: 122 | return vals 123 | else: 124 | return [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 125 | 126 | ftrs = [] 127 | 128 | start = time.time() 129 | o = len(df.index) 130 | if nthreads == 1: 131 | print('Extracting features with 1 thread ...') 132 | k = 0 133 | # Iterate over files 134 | ftrs = [] 135 | for row in df.values: 136 | x = process_row(row) 137 | ftrs.append(x) 138 | k += 1 139 | if k % 100 == 0: 140 | a.print_progress(k, start, o) 141 | 142 | # Otherwise perform multi-threaded mapping 143 | else: 144 | print('Extracting features multi-threaded ... 
', end='', flush=True) 145 | pool = Pool(nthreads) 146 | ftrs = pool.map(process_row, df.values) 147 | pool.close() 148 | gc.collect() 149 | 150 | a.print_elapsed(start) 151 | 152 | ftrs = pd.DataFrame(ftrs) 153 | ftrs = ftrs.loc[ftrs[0] > 0] 154 | cols = ['itemID_1', 'itemID_2'] + [str(c) for c in ['ham' + str(i) for i in range(9)] + ['miss', 'minham', 'maxnx', 'maxny', 'minham30', 'minham50', 'minham100']] 155 | print(cols) 156 | ftrs.columns = cols 157 | 158 | # Save updated dataset 159 | if mode == 0: 160 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set3f.fthr') 161 | if mode == 1: 162 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set3f.fthr') 163 | 164 | a.print_elapsed(start) 165 | print('set3f extraction complete!') 166 | 167 | # Write status to status file so master script knows whether to proceed. 168 | f = open(cache_loc + 'status.txt', 'a') 169 | f.write('feature_set3f_OK\n') 170 | f.close() 171 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/2_image_info.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 2_image_info.py 5 | # Creates a database of images and metadata about them, including dHash 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import cv2 10 | import feather 11 | import glob 12 | import sys 13 | import time 14 | import os 15 | import gc 16 | from multiprocessing import Pool 17 | from PIL import Image 18 | from collections import Counter 19 | 20 | import libavito as a 21 | 22 | print(a.c.BOLD + 'Generating image info ...' + a.c.END) 23 | 24 | # Get train/test mode from launch argument 25 | mode = a.get_mode(sys.argv, '2_image_info.py') 26 | 27 | ## Read settings required by script 28 | config = a.read_config() 29 | nthreads = config.preprocessing_nthreads 30 | cache_loc = config.cache_loc 31 | debug = config.debug 32 | root = config.images_root 33 | 34 | # Function to compute difference hash of image 35 | def DifferenceHash(img): 36 | theImage = Image.fromarray(img) 37 | # Convert the image to 8-bit grayscale. 38 | theImage = theImage.convert("L") # 8-bit grayscale 39 | # Squeeze it down to an 8x8 image. 40 | theImage = theImage.resize((8, 8), Image.ANTIALIAS) 41 | # Go through the image pixel by pixel. 42 | # Return 1-bits when a pixel is equal to or brighter than the previous 43 | # pixel, and 0-bits when it's below. 44 | # Use the 64th pixel as the 0th pixel. 45 | previousPixel = theImage.getpixel((0, 7)) 46 | differenceHash = 0 47 | for row in range(0, 8, 2): 48 | # Go left to right on odd rows. 49 | for col in range(8): 50 | differenceHash <<= 1 51 | pixel = theImage.getpixel((col, row)) 52 | differenceHash |= 1 * (pixel >= previousPixel) 53 | previousPixel = pixel 54 | row += 1 55 | # Go right to left on even rows. 
56 | for col in range(7, -1, -1): 57 | differenceHash <<= 1 58 | pixel = theImage.getpixel((col, row)) 59 | differenceHash |= 1 * (pixel >= previousPixel) 60 | previousPixel = pixel 61 | return differenceHash 62 | 63 | def get_info(file_loc): 64 | try: 65 | # Get size of image 66 | size = os.path.getsize(file_loc) 67 | 68 | # Attempt to load image 69 | img = cv2.imread(file_loc) 70 | try: 71 | # Test if image is corrupt 72 | assert img.shape[0] * img.shape[1] > 0 73 | except: 74 | print('[WARNING] Image ' + file_loc + ' is corrupt, skipping.') 75 | raise 76 | 77 | # Get image metadata 78 | width = img.shape[1] 79 | height = img.shape[0] 80 | 81 | # Get ratio of image dimensions 82 | ratio = round(min(width, height) / max(width, height), 2) 83 | 84 | # Compute difference hash of image and convert to hex 85 | dhash = '%(hash)016x' % {"hash": DifferenceHash(img)} 86 | 87 | return [width, height, ratio, dhash, size] 88 | 89 | except KeyboardInterrupt: 90 | raise 91 | except: 92 | print('[WARNING] Image ' + file_loc + ' failed to process.') 93 | return [np.nan, np.nan, np.nan, np.nan, np.nan] 94 | 95 | def process_line(f): 96 | # Get image ID 97 | img_id = f.split('/')[-1].split('.')[0] 98 | # Retrieve info for image 99 | d = get_info(f) 100 | # Construct list and return 101 | info = [] 102 | info.append(img_id) 103 | info.extend(d) 104 | return info 105 | 106 | # Recursively glob for jpeg files in the image root 107 | start = time.time() 108 | print('Looking for images in ' + root + ' ... ', end='', flush=True) 109 | files = glob.glob(root + '**/*.jpg', recursive=True) 110 | a.print_elapsed(start) 111 | 112 | print('Found ' + str(len(files)) + ' images.') 113 | 114 | l_id = [] 115 | l_width = [] 116 | l_height = [] 117 | l_ratio = [] 118 | l_hash = [] 119 | l_size = [] 120 | o = len(files) 121 | if nthreads == 1: 122 | print('Extracting image info with 1 thread ...') 123 | k = 0 124 | # Iterate over files 125 | for f in files: 126 | x = process_line(f) 127 | l_id.append(x[0]) 128 | l_width.append(x[1]) 129 | l_height.append(x[2]) 130 | l_ratio.append(x[3]) 131 | l_hash.append(x[4]) 132 | l_size.append(x[5]) 133 | k += 1 134 | if k % 1000 == 0: 135 | a.print_progress(k, start, o) 136 | # Otherwise perform multi-threaded mapping 137 | else: 138 | print('Extracting image info multi-threaded ... ', end='', flush=True) 139 | pool = Pool(nthreads) 140 | newdata = pool.map(process_line, files) 141 | pool.close() 142 | for x in newdata: 143 | l_id.append(x[0]) 144 | l_width.append(x[1]) 145 | l_height.append(x[2]) 146 | l_ratio.append(x[3]) 147 | l_hash.append(x[4]) 148 | l_size.append(x[5]) 149 | del newdata 150 | gc.collect() 151 | 152 | a.print_elapsed(start) 153 | 154 | print('Finding hash-counts ...', end='', flush=True) 155 | start = time.time() 156 | counttable = Counter(l_hash) 157 | l_hashcount = [] 158 | for h in l_hash: 159 | l_hashcount.append(counttable[h]) 160 | a.print_elapsed(start) 161 | 162 | # Bind lists to dataframe 163 | df = pd.DataFrame() 164 | df['image'] = l_id 165 | df['width'] = l_width 166 | df['height'] = l_height 167 | df['ratioOfDimension'] = l_ratio 168 | df['imagehash'] = l_hash 169 | df['FreqOfHash'] = l_hashcount 170 | df['imagesize'] = l_size 171 | 172 | start = time.time() 173 | print('Caching image data ... 
', end='', flush=True) 174 | 175 | # Save updated dataset 176 | feather.write_dataframe(df, cache_loc + 'image_database.fthr') 177 | df.to_csv(cache_loc + 'image_database.csv', index=False) 178 | 179 | a.print_elapsed(start) 180 | print('Image info extraction complete!') 181 | 182 | # Write status to status file so master script knows whether to proceed. 183 | f = open(cache_loc + 'status.txt', 'a') 184 | f.write('image_info_OK\n') 185 | f.close() 186 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set1a_ngram.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | ################################################################################################ 3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants 4 | #### Competition: Avito Duplicate Ad Detection 5 | # Filename : 3_feature_set1a_ngram.R 6 | # Description: This Rscript generates all ngram features 7 | # Usage: 8 | # Rscript ./code/3_feature_set1a_ngram.R train 9 | # Rscript ./code/3_feature_set1a_ngram.R test 10 | # Default argument is test 11 | ################################################################################################ 12 | ################################################################################################ 13 | 14 | args <- commandArgs(trailingOnly = F) 15 | BASE <- normalizePath(dirname(sub("^--file=", "", args[grep("^--file=", args)]))) 16 | 17 | # Source Config and functions.R file 18 | source(paste(BASE, "/../config.cfg", sep = "")) 19 | source(paste(BASE_DIR, "/code/functions.R", sep = "")) 20 | 21 | #Load any additional packages 22 | library(parallel) 23 | library(stylo) 24 | library(stringr) 25 | library(tm) 26 | 27 | # Read argument for train or test 28 | trainOrTest <- commandArgs(trailingOnly = TRUE) 29 | if (length(trainOrTest) > 1) { 30 | stop("ERROR: I need only 1 argument : train or test") 31 | } 32 | 33 | if (length(trainOrTest) == 0) { 34 | print("No Arguments passed, Assuming you mean test") 35 | trainOrTest <- "test" 36 | } 37 | 38 | #Load data 39 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "") 40 | cat("Reading file ", FILENAME, "\n", sep = " ") 41 | dat <- read_csv(FILENAME) 42 | 43 | ####################################### 44 | # Start generating Features for DESCRIPTION columns 45 | print("Start generating nGrams Features for DESCRIPTION columns") 46 | for (n in 1:3) { 47 | print(n) 48 | df2 <- data.frame(t(mcmapply(getNgramsCount, dat$cleandesc_1, dat$cleandesc_2, n, USE.NAMES = FALSE, mc.cores = preprocessing_nthreads))) 49 | colnames(df2) <- c( 50 | paste("countOf_", n, "_Grams_description_min", sep = ""), 51 | paste("countOf_", n, "_Grams_description_max", sep = ""), 52 | paste("countOf_", n, "_Grams_description_sum", sep = ""), 53 | paste("countOf_", n, "_Grams_description_diff", sep = ""), 54 | 55 | paste("countOf_", n, "_Grams_cleandesc_1", sep = ""), 56 | paste("countOf_", n, "_Grams_cleandesc_2", sep = ""), 57 | paste("countOfUnique_", n, "_Grams_cleandesc_1", sep = ""), 58 | paste("countOfUnique_", n, "_Grams_cleandesc_2", sep = ""), 59 | paste("ratioOf_", n, "_Grams_cleandesc_1_cleandesc_2", sep = ""), 60 | paste("ratioOfUnique_", n, "_Grams_cleandesc_1_cleandesc_2", sep = ""), 61 | paste("ratioOfIntersect_", n, "_Grams_cleandesc_1_in_cleandesc_2", sep = ""), 62 | 
paste("ratioOfIntersect_", n, "_Grams_cleandesc_2_in_cleandesc_1", sep = "") 63 | ) 64 | if (nrow(df2) != nrow(dat)) { 65 | cat("Expecting", nrow(dat), "Got", nrow(df2), "\n", sep = " ") 66 | stop("mcmapply is behaving weird. Getting less results") 67 | } 68 | 69 | if (exists("df_master")) { 70 | df_master <- bind_cols(df_master, df2) 71 | } else { 72 | df_master <- df2 73 | } 74 | } 75 | names(df_master) <- paste("set1a", names(df_master), sep = "_") 76 | 77 | ######## Add Primary Columns ItemID1 and ItemID2 78 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master) 79 | print("Saving Description ngrams features") 80 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1a_", "ngram_description.fthr", sep = "" )) 81 | rm(df_master, df2) 82 | gc() 83 | 84 | 85 | ####################################### 86 | # Start generating Features for TITLE columns 87 | print("Start generating nGrams Features for TITLE columns") 88 | for (n in 1:3) { 89 | print(n) 90 | df2 <- data.frame(t(mcmapply(getNgramsCount, dat$cleantitle_1, dat$cleantitle_2, n, USE.NAMES = FALSE, mc.cores = preprocessing_nthreads))) 91 | colnames(df2) <- c( 92 | paste("countOf_", n, "_Grams_title_min", sep = ""), 93 | paste("countOf_", n, "_Grams_title_max", sep = ""), 94 | paste("countOf_", n, "_Grams_title_sum", sep = ""), 95 | paste("countOf_", n, "_Grams_title_diff", sep = ""), 96 | 97 | paste("countOf_", n, "_Grams_cleantitle_1", sep = ""), 98 | paste("countOf_", n, "_Grams_cleantitle_2", sep = ""), 99 | paste("countOfUnique_", n, "_Grams_cleantitle_1", sep = ""), 100 | paste("countOfUnique_", n, "_Grams_cleantitle_2", sep = ""), 101 | paste("ratioOf_", n, "_Grams_cleantitle_1_cleantitle_2", sep = ""), 102 | paste("ratioOfUnique_", n, "_Grams_cleantitle_1_cleantitle_2", sep = ""), 103 | paste("ratioOfIntersect_", n, "_Grams_cleantitle_1_in_cleantitle_2", sep = ""), 104 | paste("ratioOfIntersect_", n, "_Grams_cleantitle_2_in_cleantitle_1", sep = "") 105 | ) 106 | 107 | if (nrow(df2) != nrow(dat)) { 108 | cat("Expecting", nrow(dat), "Got", nrow(df2), "\n", sep = " ") 109 | stop("mcmapply is behaving weird. 
Getting less results") 110 | } 111 | 112 | if (exists("df_master")) { 113 | df_master <- bind_cols(df_master, df2) 114 | } else { 115 | df_master <- df2 116 | } 117 | } 118 | names(df_master) <- paste("set1a", names(df_master), sep = "_") 119 | 120 | ######## Add Primary Columns ItemID1 and ItemID2 121 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master) 122 | print("Saving Title ngrams features") 123 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1a_", "ngram_title.fthr", sep = "" )) 124 | rm(df_master, df2) 125 | gc() 126 | 127 | #END 128 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set1b_nchar.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | ################################################################################################ 3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants 4 | #### Competition: Avito Duplicate Ad Detection 5 | # Filename : 3_feature_set1b_nchar.R 6 | # Description: This Rscript generates all nchar features 7 | # Usage: 8 | # Rscript ./code/3_feature_set1b_nchar.R train 9 | # Rscript ./code/3_feature_set1b_nchar.R test 10 | # Default argument is test 11 | ################################################################################################ 12 | ################################################################################################ 13 | 14 | args <- commandArgs(trailingOnly = F) 15 | BASE <- normalizePath(dirname(sub("^--file=", "", args[grep("^--file=", args)]))) 16 | 17 | # Source Config and functions.R file 18 | source(paste(BASE, "/../config.cfg", sep = "")) 19 | source(paste(BASE_DIR, "/code/functions.R", sep = "")) 20 | 21 | #Load any additional packages 22 | library(parallel) 23 | library(stylo) 24 | library(stringr) 25 | library(tm) 26 | 27 | # Read argument for train or test 28 | trainOrTest <- commandArgs(trailingOnly = TRUE) 29 | if (length(trainOrTest) > 1) { 30 | stop("ERROR: I need only 1 argument : train or test") 31 | } 32 | 33 | if (length(trainOrTest) == 0) { 34 | print("No Arguments passed, Assuming you mean test") 35 | trainOrTest <- "test" 36 | } 37 | 38 | #Load data 39 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "") 40 | cat("Reading file ", FILENAME, "\n", sep = " ") 41 | dat <- read_csv(FILENAME) 42 | 43 | 44 | ####################################### 45 | # Start generating Features for DESCRIPTION columns 46 | print("Start generating nChars Features for DESCRIPTION columns") 47 | for (n in 1:3) { 48 | print(n) 49 | df2 <- data.frame(t(mcmapply(getNcharsCount, dat$cleandesc_1, dat$cleandesc_2, n, USE.NAMES = FALSE, mc.cores = preprocessing_nthreads))) 50 | colnames(df2) <- c( 51 | paste("countOf_", n, "_Chars_description_min", sep = ""), 52 | paste("countOf_", n, "_Chars_description_max", sep = ""), 53 | paste("countOf_", n, "_Chars_description_sum", sep = ""), 54 | paste("countOf_", n, "_Chars_description_diff", sep = ""), 55 | 56 | paste("countOf_", n, "_Chars_cleandesc_1", sep = ""), 57 | paste("countOf_", n, "_Chars_cleandesc_2", sep = ""), 58 | paste("countOfUnique_", n, "_Chars_cleandesc_1", sep = ""), 59 | paste("countOfUnique_", n, "_Chars_cleandesc_2", sep = ""), 60 | paste("ratioOf_", n, "_Chars_cleandesc_1_cleandesc_2", sep = ""), 61 | paste("ratioOfUnique_", 
n, "_Chars_cleandesc_1_cleandesc_2", sep = ""), 62 | paste("ratioOfIntersect_", n, "_chars_cleandesc_1_in_cleandesc_2", sep = ""), 63 | paste("ratioOfIntersect_", n, "_chars_cleandesc_2_in_cleandesc_1", sep = "") 64 | ) 65 | if (nrow(df2) != nrow(dat)) { 66 | cat("Expecting", nrow(dat), "Got", nrow(df2), "\n", sep = " ") 67 | stop("mcmapply is behaving weird. Getting less results") 68 | } 69 | 70 | if (exists("df_master")) { 71 | df_master <- bind_cols(df_master, df2) 72 | } else { 73 | df_master <- df2 74 | } 75 | } 76 | 77 | names(df_master) <- paste("set1b", names(df_master), sep = "_") 78 | 79 | ######## Add Primary Columns ItemID1 and ItemID2 80 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master) 81 | print("Saving Description nchars features") 82 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1b_", "nchar_description.fthr", sep = "" )) 83 | rm(df_master, df2) 84 | gc() 85 | 86 | ####################################### 87 | # Start generating Features for TITLE columns 88 | print("Start generating nChars Features for TITLE columns") 89 | for (n in 1:3) { 90 | print(n) 91 | df2 <- data.frame(t(mcmapply(getNcharsCount, dat$cleantitle_1, dat$cleantitle_2, n, USE.NAMES = FALSE, mc.cores = preprocessing_nthreads))) 92 | colnames(df2) <- c( 93 | paste("countOf_", n, "_Chars_title_min", sep = ""), 94 | paste("countOf_", n, "_Chars_title_max", sep = ""), 95 | paste("countOf_", n, "_Chars_title_sum", sep = ""), 96 | paste("countOf_", n, "_Chars_title_diff", sep = ""), 97 | 98 | paste("countOf_", n, "_Chars_cleantitle_1", sep = ""), 99 | paste("countOf_", n, "_Chars_cleantitle_2", sep = ""), 100 | paste("countOfUnique_", n, "_Chars_cleantitle_1", sep = ""), 101 | paste("countOfUnique_", n, "_Chars_cleantitle_2", sep = ""), 102 | paste("ratioOf_", n, "_Chars_cleantitle_1_cleantitle_2", sep = ""), 103 | paste("ratioOfUnique_", n, "_Chars_cleantitle_1_cleantitle_2", sep = ""), 104 | paste("ratioOfIntersect_", n, "_chars_cleantitle_1_in_cleantitle_2", sep = ""), 105 | paste("ratioOfIntersect_", n, "_chars_cleantitle_2_in_cleantitle_1", sep = "") 106 | ) 107 | if (nrow(df2) != nrow(dat)) { 108 | cat("Expecting", nrow(dat), "Got", nrow(df2), "\n", sep = " ") 109 | stop("mcmapply is behaving weird. 
Getting less results") 110 | } 111 | 112 | if (exists("df_master")) { 113 | df_master <- bind_cols(df_master, df2) 114 | } else { 115 | df_master <- df2 116 | } 117 | } 118 | names(df_master) <- paste("set1b", names(df_master), sep = "_") 119 | 120 | ######## Add Primary Columns ItemID1 and ItemID2 121 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master) 122 | print("Saving Title nchars features") 123 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1b_", "nchar_title.fthr", sep = "" )) 124 | rm(df_master, df2) 125 | gc() 126 | 127 | #END 128 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/5_consolidate_features.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | ################################################################################################ 3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants 4 | #### Competition: Avito Duplicate Ad Detection 5 | # Filename : 5_consolidate_features.R 6 | # Description: This Rscript joins all generated feature sets into the final feature file 7 | # Usage: 8 | # Rscript ./code/5_consolidate_features.R train 9 | # Rscript ./code/5_consolidate_features.R test 10 | # Default argument is test 11 | ################################################################################################ 12 | ################################################################################################ 13 | 14 | # Source Config and functions.R file 15 | source("config.cfg") 16 | source("./code/functions.R") 17 | 18 | library(readr) 19 | library(dplyr) 20 | library(feather) 21 | 22 | 23 | # Read argument for train or test 24 | trainOrTest <- commandArgs(trailingOnly = TRUE) 25 | if (length(trainOrTest) > 1) { 26 | stop("ERROR: I need only 1 argument : train or test") 27 | } 28 | 29 | if (length(trainOrTest) == 0) { 30 | print("No Arguments passed, Assuming you mean test") 31 | trainOrTest <- "test" 32 | } 33 | 34 | #Load data 35 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "") 36 | cat("Reading file ", FILENAME, "\n", sep = " ") 37 | completeDate <- read_csv(FILENAME) 38 | if (trainOrTest == "train") { 39 | completeDate <- completeDate[, c("itemID_1", "itemID_2", "isDuplicate")] 40 | gc() 41 | } else { 42 | completeDate <- completeDate[, c("id", "itemID_1", "itemID_2")] 43 | gc() 44 | } 45 | 46 | ngram_title <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1a_ngram_title.fthr", sep = "" )) 47 | completeDate <- left_join(completeDate, ngram_title, by = c("itemID_1", "itemID_2")) 48 | rm(ngram_title) 49 | 50 | ngram_description <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1a_ngram_description.fthr", sep = "" )) 51 | completeDate <- left_join(completeDate, ngram_description, by = c("itemID_1", "itemID_2")) 52 | rm(ngram_description) 53 | 54 | nchar_title <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1b_nchar_title.fthr", sep = "" )) 55 | completeDate <- left_join(completeDate, nchar_title, by = c("itemID_1", "itemID_2")) 56 | rm(nchar_title) 57 | 58 | nchar_description <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1b_nchar_description.fthr", sep = "" )) 59 | completeDate <- left_join(completeDate, nchar_description, by = c("itemID_1", "itemID_2")) 60 | rm(nchar_description) 61 | 62 | misc <- 
read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1c_misc.fthr", sep = "" )) 63 | completeDate <- left_join(completeDate, misc, by = c("itemID_1", "itemID_2")) 64 | rm(misc) 65 | 66 | interaction <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1d_interaction.fthr", sep = "" )) 67 | completeDate <- left_join(completeDate, interaction, by = c("itemID_1", "itemID_2")) 68 | rm(interaction) 69 | 70 | attributes <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1e_attributes.fthr", sep = "" )) 71 | completeDate <- left_join(completeDate, attributes, by = c("itemID_1", "itemID_2")) 72 | rm(attributes) 73 | 74 | specialCounting <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1f_specialCounting.fthr", sep = "" )) 75 | completeDate <- left_join(completeDate, specialCounting, by = c("itemID_1", "itemID_2")) 76 | rm(specialCounting) 77 | 78 | capitalLetters <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1g_capitalLetters.fthr", sep = "" )) 79 | completeDate <- left_join(completeDate, capitalLetters, by = c("itemID_1", "itemID_2")) 80 | rm(capitalLetters) 81 | 82 | image <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1h_image.fthr", sep = "" )) 83 | completeDate <- left_join(completeDate, image, by = c("itemID_1", "itemID_2")) 84 | rm(image) 85 | 86 | imageSize <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1i_imageSize.fthr", sep = "" )) 87 | completeDate <- left_join(completeDate, imageSize, by = c("itemID_1", "itemID_2")) 88 | rm(imageSize) 89 | 90 | 91 | 92 | location_levenshtein <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set2a_location_levenshtein.fthr", sep = "" )) 93 | completeDate <- left_join(completeDate, location_levenshtein, by = c("itemID_1", "itemID_2")) 94 | rm(location_levenshtein) 95 | 96 | brisk <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set2b_brisk.fthr", sep = "" )) 97 | completeDate <- left_join(completeDate, brisk, by = c("itemID_1", "itemID_2")) 98 | rm(brisk) 99 | 100 | histogram <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set2c_histogram.fthr", sep = "" )) 101 | completeDate <- left_join(completeDate, histogram, by = c("itemID_1", "itemID_2")) 102 | rm(histogram) 103 | 104 | 105 | consolidated <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set3_consolidated.fthr", sep = "" )) 106 | completeDate <- left_join(completeDate, consolidated, by = c("itemID_1", "itemID_2")) 107 | rm(consolidated) 108 | 109 | 110 | fuzzy <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set4a_fuzzy.fthr", sep = "" )) 111 | completeDate <- left_join(completeDate, fuzzy, by = c("itemID_1", "itemID_2")) 112 | rm(fuzzy) 113 | 114 | fuzzy_clean <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set4b_fuzzy_clean.fthr", sep = "" )) 115 | completeDate <- left_join(completeDate, fuzzy_clean, by = c("itemID_1", "itemID_2")) 116 | rm(fuzzy_clean) 117 | 118 | alternate <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set4c_alternate.fthr", sep = "" )) 119 | completeDate <- left_join(completeDate, alternate, by = c("itemID_1", "itemID_2")) 120 | rm(alternate) 121 | 122 | similarity <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set4d_similarity.fthr", sep = "" )) 123 | completeDate <- left_join(completeDate, similarity, by = c("itemID_1", "itemID_2")) 124 | rm(similarity) 125 | gc() 126 | 127 | print("Saving Final Files") 128 | write_feather(completeDate, paste("cache/final_featureSet_", 
trainOrTest, ".fthr", sep = "" )) 129 | print("DONE") 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set4c_alternate.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Marios & Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 3_feature_set4b_fuzzy_clean.py 5 | # Creates various text similarity features 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import sys 10 | import jellyfish 11 | import feather 12 | import time 13 | import gc 14 | import re 15 | import math 16 | from collections import Counter 17 | from fuzzywuzzy import fuzz 18 | from multiprocessing import Pool 19 | 20 | import libavito as a 21 | 22 | WORD = re.compile(r'\w+') 23 | 24 | def get_cosine(text1, text2): 25 | vec1 = text_to_vector(text1) 26 | vec2 = text_to_vector(text2) 27 | intersection = set(vec1.keys()) & set(vec2.keys()) 28 | numerator = sum([vec1[x] * vec2[x] for x in intersection]) 29 | 30 | sum1 = sum([vec1[x]**2 for x in vec1.keys()]) 31 | sum2 = sum([vec2[x]**2 for x in vec2.keys()]) 32 | denominator = math.sqrt(sum1) * math.sqrt(sum2) 33 | 34 | if not denominator: 35 | return 0.0 36 | else: 37 | return float(numerator) / denominator 38 | 39 | def text_to_vector(text): 40 | words = WORD.findall(text) 41 | return Counter(words) 42 | 43 | 44 | def count_2words_together(words, text, ranges): 45 | count2 = 0 46 | if len(words) < 2 or len(text) < 2: 47 | return -1 48 | else: 49 | for m in range(0, len(words) - 1): 50 | words1 = words[m] 51 | for n in range(m + 1, len(words)): 52 | words2 = words[n] 53 | if words1 in text: 54 | ind = text.index(words1) 55 | try: 56 | words2 in text[ind + 1:ind + 1 + ranges] 57 | count2 += 1 58 | except: 59 | pass 60 | return count2 61 | 62 | def count_2words(words, text): 63 | # To count how many times of the search terms having two words at least showing in texts. 
64 | count2 = 0 65 | if len(words) < 2 or len(text) < 2: 66 | return -1 67 | else: 68 | for m in range(0, len(words) - 1): 69 | words1 = words[m] 70 | for n in range(m + 1, len(words)): 71 | words2 = words[n] 72 | if words1 in text and words2 in text: 73 | count2 += 1 74 | return count2 75 | 76 | def calculate_similarity_simple(str1, str2): 77 | count = 0 78 | if str1 in str2: 79 | count = 1 80 | return count 81 | 82 | def calculate_similarity_split(str1, str2): 83 | count = 0 84 | countabs = 0 85 | countper = 0 86 | split1 = str1.split(" ") 87 | split2 = str2.split(" ") 88 | for s1 in split1: 89 | for s2 in split2: 90 | if s1 in s2: 91 | count += 1 92 | if s1 == s2: 93 | countabs += 1 94 | countper += 1 95 | 96 | return count, countabs, countabs / (countper + 1) 97 | 98 | def process_row(row): 99 | 100 | title = 2 101 | desc = 4 102 | json = 6 103 | 104 | pairs = [[title, desc], [desc, title], [title, json], [json, title], [desc, json], [json, desc]] 105 | values = [] 106 | # string feature counts 107 | 108 | values.append(row[0]) 109 | values.append(row[1]) 110 | 111 | for d, s in pairs: 112 | st_1 = str(row[d]).replace(":", " ") 113 | st_2 = str(row[s + 1]).replace(":", " ") 114 | values.append(calculate_similarity_simple(st_1, st_2)) 115 | val1, val2, val3 = calculate_similarity_split(st_1, st_2) 116 | values.append(val1) 117 | values.append(val2) 118 | values.append(val3) 119 | st_1_array = st_1.split(" ") 120 | st_2_array = st_2.split(" ") 121 | values.append(count_2words(st_1_array, st_2_array)) 122 | values.append(get_cosine(st_1, st_2)) 123 | values.append(count_2words_together(st_1_array, st_2_array, 1)) 124 | values.append(count_2words_together(st_1_array, st_2_array, 5)) 125 | 126 | return values 127 | 128 | print(a.c.BOLD + 'Extracting set4c alternate text features ...' + a.c.END) 129 | 130 | # Get train/test mode from launch argument 131 | mode = a.get_mode(sys.argv, '3_feature_set4c_fuzzy_clean.py') 132 | 133 | ## Read settings required by script 134 | config = a.read_config() 135 | nthreads = config.preprocessing_nthreads 136 | cache_loc = config.cache_loc 137 | debug = config.debug 138 | if mode == 0: 139 | root = config.train_images_root 140 | df = feather.read_dataframe(cache_loc + 'train.fthr') 141 | if mode == 1: 142 | root = config.test_images_root 143 | df = feather.read_dataframe(cache_loc + 'test.fthr') 144 | 145 | df = df[['itemID_1', 'itemID_2', 'title_1', 'title_2', 'description_1', 'description_2', 'attrsJSON_1', 'attrsJSON_2']] 146 | 147 | ftrs = [] 148 | 149 | start = time.time() 150 | o = len(df.index) 151 | if nthreads == 1: 152 | print('Extracting features with 1 thread ...') 153 | k = 0 154 | # Iterate over files 155 | ftrs = [] 156 | for row in df.values: 157 | x = process_row(row) 158 | ftrs.append(x) 159 | k += 1 160 | if k % 100 == 0: 161 | a.print_progress(k, start, o) 162 | 163 | # Otherwise perform multi-threaded mapping 164 | else: 165 | print('Extracting features multi-threaded ... 
', end='', flush=True) 166 | pool = Pool(nthreads) 167 | ftrs = pool.map(process_row, df.values) 168 | pool.close() 169 | gc.collect() 170 | 171 | a.print_elapsed(start) 172 | 173 | ftrs = pd.DataFrame(ftrs) 174 | cols = ['itemID_1', 'itemID_2'] + ['set4c_X' + str(i) for i in range(1, len(ftrs.columns.tolist()) - 1)] 175 | print(cols) 176 | ftrs.columns = cols 177 | 178 | # Save updated dataset 179 | if mode == 0: 180 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set4c_alternate.fthr') 181 | if mode == 1: 182 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set4c_alternate.fthr') 183 | 184 | a.print_elapsed(start) 185 | print('set4c extraction complete!') 186 | 187 | # Write status to status file so master script knows whether to proceed. 188 | f = open(cache_loc + 'status.txt', 'a') 189 | f.write('feature_set4c_OK\n') 190 | f.close() 191 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/3_feature_set1c_misc.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | ################################################################################################ 3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants 4 | #### Competition: Avito Duplicate Ad Detection 5 | # Filename : 3_feature_set1c_misc.R 6 | # Description: This Rscript generates all ngram features 7 | # Usage: 8 | # Rscript ./code/3_feature_set1c_misc.R train 9 | # Rscript ./code/3_feature_set1c_misc.R test 10 | # Default argument is test 11 | ################################################################################################ 12 | ################################################################################################ 13 | 14 | args <- commandArgs(trailingOnly = F) 15 | BASE <- normalizePath(dirname(sub("^--file=", "", args[grep("^--file=", args)]))) 16 | 17 | # Source Config and functions.R file 18 | source(paste(BASE, "/../config.cfg", sep = "")) 19 | source(paste(BASE_DIR, "/code/functions.R", sep = "")) 20 | 21 | #Load any additional packages 22 | library(parallel) 23 | library(stylo) 24 | 25 | # Read argument for train or test 26 | trainOrTest <- commandArgs(trailingOnly = TRUE) 27 | if (length(trainOrTest) > 1) { 28 | stop("ERROR: I need only 1 argument : train or test") 29 | } 30 | 31 | if (length(trainOrTest) == 0) { 32 | print("No Arguments passed, Assuming you mean test") 33 | trainOrTest <- "test" 34 | } 35 | 36 | #Load data 37 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "") 38 | cat("Reading file ", FILENAME, "\n", sep = " ") 39 | dat <- read_csv(FILENAME) 40 | 41 | 42 | ######## IDs and Long and Lat Features 43 | print("Generating Binary features ") 44 | isMetroIdSame <- ifelse(dat$metroID_1 == dat$metroID_2, 1, 0) 45 | isLocationIDSame <- ifelse(dat$locationID_1 == dat$locationID_2, 1, 0) 46 | isRegionIDSame <- ifelse(dat$regionID_1 == dat$regionID_2, 1, 0) 47 | isLongitudeSame <- ifelse(round(dat$lon_1, 2) == round(dat$lon_2, 2), 1, 0) 48 | isLatitudeSame <- ifelse(round(dat$lat_1, 2) == round(dat$lat_2, 2), 1, 0) 49 | isTitleSame <- ifelse(tolower(dat$cleantitle_1) == tolower(dat$cleantitle_2), 1, 0) #isTitle Same 50 | isdescriptionSame <- ifelse(tolower(dat$cleandesc_1) == tolower(dat$cleandesc_2), 1, 0) #isdescription Same 51 | 52 | ######## PRICE Features 53 | print("Generating Price features ") 54 | 
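# The ratio features below are made order-invariant: ratioOfPrices (and, further down,
# ratioOfNumberOfImages) is inverted whenever it exceeds 1, so for positive values it
# always lies in (0, 1] and swapping item 1 with item 2 gives the same feature.
# For example, prices of 100 and 250 yield 100/250 = 0.4 in either order.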
priceDiff <- abs(dat$price_1 - dat$price_2) 55 | ratioOfPrices <- dat$price_1 / dat$price_2 56 | ratioOfPrices <- round(ifelse(ratioOfPrices > 1, 1/ratioOfPrices, ratioOfPrices), 3) 57 | both_price_na <- ifelse(is.na(dat$price_1) & is.na(dat$price_2), 1, 0) #Both Price NA 58 | one_price_na <- ifelse(is.na(dat$price_1) | is.na(dat$price_2), 1, 0) #One Price NA 59 | total_price <- (dat$price_1 + dat$price_2) #Total Price 60 | 61 | 62 | ######## IMAGE Features 63 | print("Generating Image features") 64 | library(stringr) 65 | imageCount_sum <- str_count(dat$images_array_1, '[0-9.]+') + str_count(dat$images_array_2, '[0-9.]+') 66 | imageCount_diff <- abs(str_count(dat$images_array_1, '[0-9.]+') - str_count(dat$images_array_2, '[0-9.]+')) 67 | imageCount_min <- pmin(str_count(dat$images_array_1, '[0-9.]+'), str_count(dat$images_array_2, '[0-9.]+'), na.rm = F) 68 | imageCount_max <- pmax(str_count(dat$images_array_1, '[0-9.]+'), str_count(dat$images_array_2, '[0-9.]+'), na.rm = F) 69 | ratioOfNumberOfImages <- str_count(dat$images_array_1, '[0-9.]+') / str_count(dat$images_array_2, '[0-9.]+') 70 | ratioOfNumberOfImages <- round(ifelse(ratioOfNumberOfImages > 1, 1/ratioOfNumberOfImages, ratioOfNumberOfImages), 3) 71 | 72 | ######## DISTANCE STRING Features 73 | library(stringdist) 74 | print("Generating Text Distance features for title") 75 | titleDistance_cosine <- round(mcmapply(stringdist, dat$cleantitle_1, dat$cleantitle_2, method = "cosine", USE.NAMES = F, mc.cores = preprocessing_nthreads), 3) 76 | titleDistance_hamming <- round(mcmapply(stringdist, dat$cleantitle_1, dat$cleantitle_2, method = "hamming", USE.NAMES = F, mc.cores = preprocessing_nthreads), 3) 77 | titleDistance_jaccard <- round(mcmapply(stringdist, dat$cleantitle_1, dat$cleantitle_2, method = "jaccard", USE.NAMES = F, mc.cores = preprocessing_nthreads), 3) 78 | 79 | print("Generating Text Distance features for description") 80 | descriptionDistance_cosine <- round(mcmapply(stringdist, dat$cleandesc_1, dat$cleandesc_2, method = "cosine", USE.NAMES = F, mc.cores = preprocessing_nthreads), 3) 81 | 82 | descriptionDistance_hamming <- round(mcmapply(stringdist, dat$cleandesc_1, dat$cleandesc_2, method = "hamming", USE.NAMES = F, mc.cores = preprocessing_nthreads), 3) 83 | 84 | descriptionDistance_jaccard <- round(mcmapply(stringdist, dat$cleandesc_1, dat$cleandesc_2, method = "jaccard", USE.NAMES = F, mc.cores = preprocessing_nthreads), 3) 85 | 86 | 87 | ######## DATA FRAME 88 | df_master <- data.frame( isMetroIdSame = isMetroIdSame, 89 | isLocationIDSame = isLocationIDSame, 90 | isRegionIDSame = isRegionIDSame, 91 | isLongitudeSame = isLongitudeSame, 92 | isLatitudeSame = isLatitudeSame, 93 | isTitleSame = isTitleSame, 94 | isdescriptionSame = isdescriptionSame, 95 | priceDiff = priceDiff, 96 | ratioOfPrices = ratioOfPrices, 97 | both_price_na = both_price_na, 98 | one_price_na = one_price_na, 99 | total_price = total_price, 100 | imageCount_sum = imageCount_sum, 101 | imageCount_diff = imageCount_diff, 102 | imageCount_min = imageCount_min, 103 | imageCount_max = imageCount_max, 104 | ratioOfNumberOfImages = ratioOfNumberOfImages, 105 | titleDistance_cosine = titleDistance_cosine, 106 | titleDistance_hamming = titleDistance_hamming, 107 | titleDistance_jaccard = titleDistance_jaccard, 108 | descriptionDistance_cosine = descriptionDistance_cosine, 109 | descriptionDistance_hamming = descriptionDistance_hamming, 110 | descriptionDistance_jaccard = descriptionDistance_jaccard 111 | ) 112 | 113 | set1d <- df_master #making a copy 
for geenrating interaction features. Need to do this before renaming columns 114 | 115 | names(df_master) <- paste("set1c", names(df_master), sep = "_") 116 | ######## Add Primary Columns ItemID1 and ItemID2 117 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master) 118 | print("Saving Misc features") 119 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1c_", "misc.fthr", sep = "" )) 120 | 121 | # Start Interaction feature script 122 | source("./code/3_feature_set1d_interaction.R") 123 | #END 124 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/1_data_preprocessing.py: -------------------------------------------------------------------------------- 1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants 2 | #### Author: Mikel 3 | #### Avito Duplicate Ad Detection 4 | # 1_data_preprocessing.py 5 | # Takes in input data, cleans text and merges itemIDs. 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import nltk 10 | import sklearn 11 | import json 12 | import math 13 | import feather #import pickle - feather used instead as it is compatible with R 14 | from pandas.io.json import json_normalize 15 | import unicodedata 16 | from stop_words import get_stop_words 17 | import time 18 | from multiprocessing import Pool 19 | import sys 20 | import gc 21 | from imp import load_source 22 | 23 | import libavito as a 24 | 25 | ######################### 26 | ##### SCRIPT CONFIG ##### 27 | ######################### 28 | 29 | # Define cleaning parameters 30 | stopwords = get_stop_words('ru') 31 | exclude_cats = set(['Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po', 'Sk', 'Sc', 'So', 'Co', 'Cf', 'Cc', 'Cs', 'Cn']) 32 | sno = nltk.stem.SnowballStemmer('russian') 33 | 34 | ######################### 35 | 36 | print(a.c.BOLD + 'Cleaning input data ...' + a.c.END) 37 | 38 | # Get train/test mode from launch argument 39 | mode = a.get_mode(sys.argv, '1_data_preprocessing.py') 40 | 41 | ## Read settings required by script 42 | config = a.read_config() 43 | nthreads = config.preprocessing_nthreads 44 | cache_loc = config.cache_loc 45 | category_loc = config.category_csv 46 | location_loc = config.location_csv 47 | debug = config.debug 48 | if mode == 0: 49 | data_loc = config.train_ItemInfo 50 | pairs_loc = config.train_ItemPairs 51 | if mode == 1: 52 | data_loc = config.test_ItemInfo 53 | pairs_loc = config.test_ItemPairs 54 | 55 | # Read file for processing into memory 56 | start = time.time() 57 | print('Reading input data ... 
', end='', flush=True) 58 | df = pd.read_csv(data_loc) 59 | a.print_elapsed(start) 60 | 61 | def get_clean_tokens(text): 62 | newtext = [] 63 | 64 | # lower text 65 | text = text.lower() 66 | 67 | # replace punctation 68 | text = ''.join(x if unicodedata.category(x) not in exclude_cats else ' ' for x in text) 69 | 70 | # replace some symbols 71 | text = ''.join(x if x not in ["'", '`', '>', '<', '=', '+'] else ' ' for x in text) 72 | 73 | # tokenize the text 74 | text0 = nltk.word_tokenize(text, 'russian') 75 | 76 | # word by word 77 | for y in text0: 78 | # remove stopwords and stemming 79 | if len(y) > 0 and y not in stopwords: 80 | newtext.append(sno.stem(y)) 81 | 82 | return newtext 83 | 84 | def process_line(i): 85 | # Lists to store tokens in 86 | tx = [] 87 | dx = [] 88 | resx = [] 89 | 90 | # Pluck initial strings from dataframe 91 | title = str(df.iloc[i]['title']) 92 | desc = str(df.iloc[i]['description']) 93 | jx = str(df.iloc[i]['attrsJSON']).lower() 94 | 95 | tx = get_clean_tokens(title) 96 | dx = get_clean_tokens(desc) 97 | 98 | # Process JSON 99 | try: 100 | resx = json.loads(jx) 101 | for key in resx.keys(): 102 | a = get_clean_tokens(resx[key]) 103 | resx[key] = " ".join(a) 104 | except: 105 | resx = [] 106 | if debug == 1: 107 | print('DEBUG: Failed to read JSON "' + json + '" at ' + str(i)) 108 | pass 109 | 110 | jxs = '' + json.dumps(resx, ensure_ascii=False) 111 | txs = ' '.join(tx) 112 | dxs = ' '.join(dx) 113 | 114 | del tx, resx, dx 115 | gc.collect() 116 | 117 | return [txs, dxs, jxs] 118 | 119 | # def process_line(i): 120 | # return ['empty', 'empty', 'empty'] 121 | 122 | newtitles = [] 123 | newdescs = [] 124 | newjson = [] 125 | ids = df['itemID'].values 126 | 127 | start = time.time() 128 | # If number of threads is equal to 1, output time remaining etc. 129 | o = len(df.index) 130 | if nthreads == 1: 131 | print('Cleaning text with 1 thread ...') 132 | k = 0 133 | # Iterate over lines 134 | for i in range(0, o): 135 | x = process_line(i) 136 | newtitles.append(x[0]) 137 | newdescs.append(x[1]) 138 | newjson.append(x[2]) 139 | k += 1 140 | if k % 100 == 0: 141 | a.print_progress(k, start, o) 142 | # Otherwise perform multi-threaded mapping 143 | else: 144 | print('Cleaning text multi-threaded ... ', end='', flush=True) 145 | pool = Pool(nthreads) 146 | newdata = pool.map(process_line, range(0, o)) 147 | pool.close() 148 | for x in newdata: 149 | newtitles.append(x[0]) 150 | newdescs.append(x[1]) 151 | newjson.append(x[2]) 152 | 153 | del newdata 154 | gc.collect() 155 | 156 | a.print_elapsed(start) 157 | 158 | ######################### 159 | 160 | print(a.c.BOLD + 'Joining input data ...' + a.c.END) 161 | 162 | # Joining cleaned data into original data 163 | df['cleandesc'] = newdescs 164 | df['cleantitle'] = newtitles 165 | df['cleanjson'] = newjson 166 | 167 | # Memory management 168 | del newdescs, newtitles, newjson 169 | gc.collect() 170 | 171 | start = time.time() 172 | print('Joining parentCategory ... ', end='', flush=True) 173 | category = pd.read_csv(category_loc) 174 | df = df.merge(category, on=['categoryID'], copy=False) 175 | a.print_elapsed(start) 176 | 177 | start = time.time() 178 | print('Joining regionID ... 
', end='', flush=True) 179 | location = pd.read_csv(location_loc) 180 | df = df.merge(location, on=['locationID'], copy=False) 181 | a.print_elapsed(start) 182 | 183 | start = time.time() 184 | print('Joining itemPairs ...', end='', flush=True) 185 | itemPairs = pd.read_csv(pairs_loc) 186 | df = pd.merge(pd.merge(itemPairs, df, how='inner', left_on='itemID_1', right_on='itemID'), df, how='inner', left_on='itemID_2', right_on='itemID') # , suffixes=('_1', '_2')) 187 | df.drop(['itemID_x', 'itemID_y'], axis=1, inplace=True) 188 | df.columns = [c.replace('_x', '_1').replace('_y', '_2') for c in df.columns] 189 | a.print_elapsed(start) 190 | 191 | start = time.time() 192 | print('Caching cleaned data ... ', end='', flush=True) 193 | 194 | # Save updated dataset 195 | if mode == 0: 196 | #pickle.dump(df, open(cache_loc + 'train.bin', 'wb'), protocol=4) 197 | feather.write_dataframe(df, cache_loc + 'train.fthr') 198 | df.to_csv(cache_loc + 'train.csv', index=False) 199 | if mode == 1: 200 | #pickle.dump(df, open(cache_loc + 'test.bin', 'wb'), protocol=4) 201 | feather.write_dataframe(df, cache_loc + 'test.fthr') 202 | df.to_csv(cache_loc + 'test.csv', index=False) 203 | 204 | a.print_elapsed(start) 205 | print('Data preprocessing complete!') 206 | 207 | # Write status to status file so master script knows whether to proceed. 208 | f = open(cache_loc + 'status.txt', 'a') 209 | f.write('data_preprocessing_OK\n') 210 | f.close() 211 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/runAll.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ################################################################################################ 3 | ################################################################################################ 4 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants 5 | #### Competition: Avito Duplicate Ad Detection 6 | # Filename : runAll.sh 7 | # Description: This bash script generates the submission files 8 | # Usage: 9 | # bash ./runAll.sh 10 | ################################################################################################ 11 | ################################################################################################ 12 | 13 | 14 | echo "`tput smso` Running Data Preprocessing`tput rmso`" 15 | python3 code/1_data_preprocessing.py --train 16 | python3 code/1_data_preprocessing.py --test 17 | 18 | echo "`tput smso` Running Image Processing`tput rmso`" 19 | python3 code/2_image_info.py 20 | 21 | echo "`tput smso` Extracting NGrams`tput rmso`" 22 | Rscript code/3_feature_set1a_ngram.R train 23 | Rscript code/3_feature_set1a_ngram.R test 24 | 25 | echo "`tput smso` Extracting NChars`tput rmso`" 26 | Rscript code/3_feature_set1b_nchar.R train 27 | Rscript code/3_feature_set1b_nchar.R test 28 | 29 | echo "`tput smso` Extracting Misc Features`tput rmso`" 30 | Rscript code/3_feature_set1c_misc.R train 31 | Rscript code/3_feature_set1c_misc.R test 32 | 33 | echo "`tput smso`Extracting Attributes `tput rmso`" 34 | Rscript code/3_feature_set1e_attribute.R train 35 | Rscript code/3_feature_set1e_attribute.R test 36 | 37 | echo "`tput smso`Extracting Special Counting Features `tput rmso`" 38 | Rscript code/3_feature_set1f_SpecialCounting.R train 39 | Rscript code/3_feature_set1f_SpecialCounting.R test 40 | 41 | echo "`tput smso` Extracting Capital Letters`tput rmso`" 42 | Rscript code/3_feature_set1g_capitalLetters.R train 43 | Rscript 
code/3_feature_set1g_capitalLetters.R test 44 | 45 | echo "`tput smso` Extracting hash features `tput rmso`" 46 | Rscript code/3_feature_set1h_images.R train 47 | Rscript code/3_feature_set1h_images.R test 48 | 49 | echo "`tput smso` Extracting Image Size Features `tput rmso`" 50 | Rscript code/3_feature_set1i_imagesSize.R train 51 | Rscript code/3_feature_set1i_imagesSize.R test 52 | 53 | echo "`tput smso` Extracting Location `tput rmso`" 54 | python3 code/3_feature_set2a_lev_loc.py --train 55 | python3 code/3_feature_set2a_lev_loc.py --test 56 | 57 | echo "`tput smso` Extracting BRISK`tput rmso`" 58 | python3 code/3_feature_set2b_brisk.py --train 59 | python3 code/3_feature_set2b_brisk.py --test 60 | 61 | echo "`tput smso`Extracting Histograms `tput rmso`" 62 | python3 code/3_feature_set2c_hist.py --train 63 | python3 code/3_feature_set2c_hist.py --test 64 | 65 | echo "`tput smso`Extracting Descriptions `tput rmso`" 66 | python3 code/3_feature_set3a_description.py --train 67 | python3 code/3_feature_set3a_description.py --test 68 | 69 | echo "`tput smso`Extracting Title `tput rmso`" 70 | python3 code/3_feature_set3b_title.py --train 71 | python3 code/3_feature_set3b_title.py --test 72 | 73 | echo "`tput smso` Extracting Json `tput rmso`" 74 | python3 code/3_feature_set3c_json.py --train 75 | python3 code/3_feature_set3c_json.py --test 76 | 77 | echo "`tput smso` Extracting Json part 2 `tput rmso`" 78 | python3 code/3_feature_set3d_json1.py --train 79 | python3 code/3_feature_set3d_json1.py --test 80 | 81 | echo "`tput smso`Extracting Hamming `tput rmso`" 82 | python3 code/3_feature_set3f_hamming.py --train 83 | python3 code/3_feature_set3f_hamming.py --test 84 | 85 | echo "`tput smso`Extracting Json to Cols `tput rmso`" 86 | python3 code/3_json_to_cols.py 87 | 88 | echo "`tput smso`Extracting WOE `tput rmso`" 89 | Rscript code/3_feature_set3g_json_to_cols_createWOE.R train 90 | Rscript code/3_feature_set3g_json_to_cols_createWOE.R test 91 | 92 | echo "`tput smso` Consolidating a few features `tput rmso`" 93 | Rscript code/3_feature_set3z_consolidate.R train 94 | Rscript code/3_feature_set3z_consolidate.R test 95 | 96 | echo "`tput smso` Extracting Fuzzy`tput rmso`" 97 | python3 code/3_feature_set4a_fuzzy.py --train 98 | python3 code/3_feature_set4a_fuzzy.py --test 99 | 100 | echo "`tput smso` Extracting Fuzzy Clean`tput rmso`" 101 | python3 code/3_feature_set4b_fuzzy_clean.py --train 102 | python3 code/3_feature_set4b_fuzzy_clean.py --test 103 | 104 | echo "`tput smso`Extracting Alternate `tput rmso`" 105 | python3 code/3_feature_set4c_alternate.py --train 106 | python3 code/3_feature_set4c_alternate.py --test 107 | 108 | echo "`tput smso` Extracting Similarity`tput rmso`" 109 | python3 code/3_feature_set4d_similarity_clean.py --train 110 | python3 code/3_feature_set4d_similarity_clean.py --test 111 | 112 | echo "`tput smso`Extracting BOW `tput rmso`" 113 | python3 code/4_bag_of_words.py 114 | 115 | 116 | 117 | ############################################################################################ 118 | ############################################################################################ 119 | #Consolidate All Features 120 | echo "`tput smso`CONSOLIDATING ALL FEATURES `tput rmso`" 121 | Rscript code/5_consolidate_features.R train 122 | Rscript code/5_consolidate_features.R test 123 | 124 | echo "`tput smso`Replacing all NaN and Inf`tput rmso`" 125 | python3 code/5_data_postprocessing.py --train 126 | python3 code/5_data_postprocessing.py --test 127 | 128 | echo "FEATURES DONE" 129 | 
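# Every feature script above appends a token such as 'data_preprocessing_OK' or
# 'feature_set3f_OK' to status.txt in the cache directory, so the completed stages can be
# verified before the consolidated feature files (final_featureSet_train.fthr and
# final_featureSet_test.fthr) are handed to the model scripts below.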
############################################################################################ 130 | echo "Running models" 131 | 132 | echo "`tput smso`Running logit_v2`tput rmso`" 133 | python2 code/models/marios_logit_v2.py 134 | 135 | echo "`tput smso`Running nn_v1`tput rmso`" 136 | python2 code/models/marios_nn_v1.py 137 | 138 | echo "`tput smso`Running nnnew_v2`tput rmso`" 139 | python2 code/models/marios_nnnew_v2.py 140 | 141 | echo "`tput smso`Running nnnew_v3`tput rmso`" 142 | python2 code/models/marios_nnnew_v3.py 143 | 144 | echo "`tput smso`Running nnnew_v4`tput rmso`" 145 | python2 code/models/marios_nnnew_v4.py 146 | 147 | echo "`tput smso`Running ridge_v2`tput rmso`" 148 | python2 code/models/marios_ridge_v2.py 149 | 150 | echo "`tput smso`Running sgd_v2`tput rmso`" 151 | python2 code/models/marios_sgd_v2.py 152 | 153 | echo "`tput smso`Running xg_v1`tput rmso`" 154 | python2 code/models/marios_xg_v1.py 155 | 156 | echo "`tput smso`Running xgrank_v2`tput rmso`" 157 | python2 code/models/marios_xgrank_v2.py 158 | 159 | echo "`tput smso`Running xgrank_v3`tput rmso`" 160 | python2 code/models/marios_xgrank_v3.py 161 | 162 | echo "`tput smso`Running xgregv3`tput rmso`" 163 | python2 code/models/marios_xgregv3.py 164 | 165 | echo "`tput smso`Running xgson_v2`tput rmso`" 166 | python2 code/models/marios_xgson_v2.py 167 | 168 | echo "`tput smso`Running xgson_v3`tput rmso`" 169 | python2 code/models/marios_xgson_v3.py 170 | 171 | echo "`tput smso`Running xgson_v4`tput rmso`" 172 | python2 code/models/marios_xgson_v4.py 173 | 174 | echo "`tput smso`Running xgson_v2_v5`tput rmso`" 175 | python2 code/models/marios_xgson_v2_v5.py 176 | 177 | echo "`tput smso`Running meta-model`tput rmso`" 178 | python2 code/models/meta_rf_v1.py 179 | 180 | echo "MODELS DONE" 181 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgregv3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.preprocessing import StandardScaler 3 | from sklearn.metrics import roc_auc_score 4 | import XGBoostClassifier as xg 5 | import os 6 | import libavito 7 | import feather 8 | 9 | # bagger for xgboost 10 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True): 11 | 12 | # create array object to hold predictions 13 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))] 14 | #loop for as many times as we want bags 15 | for n in range (0, estimators): 16 | #shuff;e first, aids in increasing variance and forces different results 17 | #X_t,y_c=shuffle(X,y, random_state=seed+n) 18 | 19 | if update_seed: # update seed if requested, to give a slightly different model 20 | model.set_params(random_state=seed + n) 21 | model.fit(X_t,y_c) # fit model0.0917411475506 22 | preds=model.predict(xt) # predict probabilities 23 | # update bag's array 24 | for j in range (0, (xt.shape[0])): 25 | baggedpred[j]+=preds[j] 26 | print("done bag %d " % (n)) 27 | # divide with number of bags to create an average estimate 28 | for j in range (0, len(baggedpred)): 29 | baggedpred[j]/=float(estimators) 30 | # return probabilities 31 | return np.array(baggedpred) 32 | 33 | 34 | 35 | 36 | def loadcolumn(filename,col=4, skip=1, floats=True): 37 | pred=[] 38 | op=open(filename,'r') 39 | if skip==1: 40 | op.readline() #header 41 | for line in op: 42 | line=line.replace('\n','') 43 | sps=line.split(',') 44 | #load always the last columns 45 | if floats: 46 | pred.append(float(sps[col])) 47 
| else : 48 | pred.append(str(sps[col])) 49 | op.close() 50 | return pred 51 | 52 | 53 | def printfilcsve(X, filename): 54 | 55 | np.savetxt(filename,X, fmt='%.5f') 56 | 57 | 58 | # read the train and test allclean.csv files. skip errors 59 | def readfile(name, index=0): 60 | dopen=open(name,"r") 61 | array=[] 62 | skip_firstrow=False 63 | if index!=0: 64 | skip_firstrow=True 65 | for i,line in enumerate(dopen): 66 | if i==0 and skip_firstrow: 67 | continue 68 | splits=line.replace("\n","").replace(" ","").split(",") 69 | ar=[] 70 | for k in splits: 71 | try: 72 | ar.append(float(k)) 73 | except: 74 | ar.append(0.0) 75 | print(" the string is %s ok?" % ((k))) 76 | array.append(ar)#[float(k)0.971474 if k!="0" else 0.0 for k in splits ]) 77 | if i%100000==0: 78 | print(" we are at " , str(i)) 79 | return np.array(array) 80 | 81 | 82 | def main(): 83 | 84 | config = libavito.get_config() 85 | cache_loc = config.cache_loc 86 | nthreads = config.nthreads 87 | 88 | Usecv=True # true will split the training data 66-33 and do cv 89 | SEED=15 90 | threads=nthreads # number of workers for parallelism 91 | 92 | ######### Load files ############ 93 | print("Loading input data") 94 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr') 95 | y = train['isDuplicate'].values 96 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values 97 | del train 98 | print(X.shape) 99 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr') 100 | ids = test['id'].values 101 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values 102 | del test 103 | print(X_test.shape) 104 | 105 | 106 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions 107 | if not os.path.exists(metafolder): #if it does not exists, we create it 108 | os.makedirs(metafolder) 109 | outset="marios_xgreg_v3" # predic of all files 110 | 111 | #model to use 112 | 113 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)] 114 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )] 115 | kfolder=[[idex1,idex2]] 116 | #Create Arrays for meta 117 | train_stacker=[ 0.0 for k in range (0,len(idex2)) ] 118 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))] 119 | # CHECK EVerything in five..it could be more efficient 120 | 121 | #create target variable 122 | mean_kapa = 0.0 123 | #kfolder=StratifiedKFold(y, n_folds=number_of_folds,shuffle=True, random_state=SEED) 124 | #number_of_folds=0 125 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time 126 | i=0 # iterator counter 127 | if Usecv: 128 | print ("starting cross validation") 129 | for train_index, test_index in kfolder: 130 | # creaning and validation sets 131 | X_train, X_cv = X[train_index], X[test_index] 132 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index] 133 | print (" train size: %d. 
test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) )) 134 | 135 | 136 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True) 137 | 138 | 139 | # compute Loglikelihood metric for this CV fold 140 | #scalepreds(preds) 141 | kapa = roc_auc_score(y_cv,preds) 142 | print "size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa) 143 | 144 | mean_kapa += kapa 145 | #save the results 146 | no=0 147 | for real_index in test_index: 148 | train_stacker[no]=(preds[no]) 149 | no+=1 150 | i+=1 151 | if Usecv: 152 | print (" Average AUC: %f" % (mean_kapa) ) 153 | print (" printing train datasets ") 154 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv") 155 | 156 | 157 | #preds=bagged_set(X, y,model, SEED, 1, X_test, update_seed=True) 158 | 159 | preds=bagged_set(X, y,model, SEED , 5, X_test, update_seed=True) 160 | 161 | 162 | for pr in range (0,len(preds)): 163 | test_stacker[pr]=(preds[pr]) 164 | 165 | preds=np.array(preds) 166 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv") 167 | 168 | 169 | print("Write results...") 170 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv" 171 | print("Writing submission to %s" % output_file) 172 | f = open(config.output_loc + output_file, "w") 173 | f.write("id,probability\n")# the header 174 | for g in range(0, len(preds)) : 175 | pr=preds[g] 176 | f.write("%d,%f\n" % (((ids[g]),pr ) ) ) 177 | f.close() 178 | print("Done.") 179 | 180 | 181 | 182 | 183 | 184 | 185 | if __name__=="__main__": 186 | main() 187 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgson_v4.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import numpy as np 4 | from sklearn.metrics import roc_auc_score 5 | import XGBoostClassifier as xg 6 | import os 7 | import libavito 8 | import feather 9 | 10 | #load a single column from file 11 | def loadcolumn(filename,col=4, skip=1, floats=True): 12 | pred=[] 13 | op=open(filename,'r') 14 | if skip==1: 15 | op.readline() #header 16 | for line in op: 17 | line=line.replace('\n','') 18 | sps=line.split(',') 19 | #load always the last columns 20 | if floats: 21 | pred.append(float(sps[col])) 22 | else : 23 | pred.append(str(sps[col])) 24 | op.close() 25 | return pred 26 | 27 | #export file in csv using numpy 28 | def printfilcsve(X, filename): 29 | 30 | np.savetxt(filename,X, fmt='%.5f') 31 | 32 | # read the train and test allclean.csv files. skip errors 33 | def readfile(name, index=0): 34 | dopen=open(name,"r") 35 | array=[] 36 | skip_firstrow=False 37 | if index!=0: 38 | skip_firstrow=True 39 | for i,line in enumerate(dopen): 40 | if i==0 and skip_firstrow: 41 | continue 42 | splits=line.replace("\n","").replace(" ","").split(",") 43 | ar=[] 44 | for k in splits: 45 | try: 46 | ar.append(float(k)) 47 | except: 48 | ar.append(0.0) 49 | print(" the string is %s ok?" 
% ((k))) 50 | array.append(ar)#[float(k) if k!="0" else 0.0 for k in splits ]) 51 | if i%100000==0: 52 | print(" we are at " , str(i)) 53 | return np.array(array) 54 | 55 | 56 | 57 | # bagger for xgboost 58 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True): 59 | 60 | # create array object to hold predictions 61 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))] 62 | #loop for as many times as we want bags 63 | for n in range (0, estimators): 64 | #shuff;e first, aids in increasing variance and forces different results 65 | #X_t,y_c=shuffle(X,y, random_state=seed+n) 66 | 67 | if update_seed: # update seed if requested, to give a slightly different model 68 | model.set_params(random_state=seed + n) 69 | model.fit(X_t,y_c) # fit model0.0917411475506 70 | preds=model.predict_proba(xt)[:,1] # predict probabilities 71 | # update bag's array 72 | for j in range (0, (xt.shape[0])): 73 | baggedpred[j]+=preds[j] 74 | print("done bag %d " % (n)) 75 | # divide with number of bags to create an average estimate 76 | for j in range (0, len(baggedpred)): 77 | baggedpred[j]/=float(estimators) 78 | # return probabilities 79 | return np.array(baggedpred) 80 | 81 | def main(): 82 | 83 | config = libavito.get_config() 84 | cache_loc = config.cache_loc 85 | nthreads = config.nthreads 86 | 87 | Usecv=True # true will split the training data 66-33 and do cv 88 | SEED=15 89 | threads=nthreads # number of workers for parallelism 90 | 91 | ######### Load files ############ 92 | print("Loading input data") 93 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr') 94 | y = train['isDuplicate'].values 95 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values 96 | del train 97 | print(X.shape) 98 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr') 99 | ids = test['id'].values 100 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values 101 | del test 102 | print(X_test.shape) 103 | 104 | 105 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions 106 | if not os.path.exists(metafolder): #if it does not exists, we create it 107 | os.makedirs(metafolder) 108 | outset="marios_xgson_v4" # predic of all files 109 | 110 | #model to use 111 | 112 | model=xg.XGBoostClassifier(num_round=1000 ,nthread=threads, eta=0.02, gamma=7.0,max_depth=20, min_child_weight=20, subsample=0.9, colsample_bytree=0.4,objective='binary:logistic',seed=1) 113 | 114 | #Create Arrays for meta 115 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)] # indices for trai 116 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )] #indices for test 117 | kfolder=[[idex1,idex2]] # create an object to put indices in 118 | 119 | #arrays to save predictions for validation and test for meta modelling (stacking) 120 | train_stacker=[ 0.0 for k in range (0,len(idex2)) ] 121 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))] 122 | 123 | #create target variable 124 | mean_kapa = 0.0 125 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time 126 | i=0 # iterator counter 127 | if Usecv: 128 | print ("starting cross validation" ) 129 | for train_index, test_index in kfolder: 130 | # creaning and validation sets 131 | X_train, X_cv = X[train_index], X[test_index] 132 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index] 133 | print (" train size: %d. 
test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) )) 134 | 135 | #use xgboost bagger 136 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True) 137 | 138 | # compute Loglikelihood metric for this CV fold 139 | #scalepreds(preds) 140 | kapa = roc_auc_score(y_cv,preds) 141 | print("size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa)) 142 | 143 | mean_kapa += kapa 144 | #save the results 145 | no=0 146 | for real_index in test_index: 147 | train_stacker[no]=(preds[no]) 148 | no+=1 149 | i+=1 150 | if (Usecv): 151 | #print the array of validation predictions for stacking later on inside the 'meta_folder' 152 | print (" Average AUC: %f" % (mean_kapa) ) 153 | print (" printing train datasets ") 154 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv") 155 | 156 | preds=bagged_set(X, y,model, SEED ,5, X_test, update_seed=True) 157 | 158 | 159 | for pr in range (0,len(preds)): 160 | test_stacker[pr]=(preds[pr]) 161 | #print prediction as numpy array for stacking later on 162 | preds=np.array(preds) 163 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv") 164 | 165 | #create submission file 166 | print("Write results...") 167 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv" 168 | print("Writing submission to %s" % output_file) 169 | f = open(config.output_loc + output_file, "w") 170 | f.write("id,probability\n")# the header 171 | for g in range(0, len(preds)) : 172 | pr=preds[g] 173 | f.write("%d,%f\n" % (((ids[g]),pr ) ) ) 174 | f.close() 175 | print("Done.") 176 | 177 | if __name__=="__main__": 178 | main() 179 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgsonv2_v5.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import numpy as np 4 | from sklearn.metrics import roc_auc_score 5 | import XGBoostClassifier as xg 6 | import os 7 | import libavito 8 | import feather 9 | 10 | #load a single column from file 11 | def loadcolumn(filename,col=4, skip=1, floats=True): 12 | pred=[] 13 | op=open(filename,'r') 14 | if skip==1: 15 | op.readline() #header 16 | for line in op: 17 | line=line.replace('\n','') 18 | sps=line.split(',') 19 | #load always the last columns 20 | if floats: 21 | pred.append(float(sps[col])) 22 | else : 23 | pred.append(str(sps[col])) 24 | op.close() 25 | return pred 26 | 27 | #export file in csv using numpy 28 | def printfilcsve(X, filename): 29 | 30 | np.savetxt(filename,X, fmt='%.5f') 31 | 32 | # read the train and test allclean.csv files. skip errors 33 | def readfile(name, index=0): 34 | dopen=open(name,"r") 35 | array=[] 36 | skip_firstrow=False 37 | if index!=0: 38 | skip_firstrow=True 39 | for i,line in enumerate(dopen): 40 | if i==0 and skip_firstrow: 41 | continue 42 | splits=line.replace("\n","").replace(" ","").split(",") 43 | ar=[] 44 | for k in splits: 45 | try: 46 | ar.append(float(k)) 47 | except: 48 | ar.append(0.0) 49 | print(" the string is %s ok?" 
% ((k))) 50 | array.append(ar)#[float(k) if k!="0" else 0.0 for k in splits ]) 51 | if i%100000==0: 52 | print(" we are at " , str(i)) 53 | return np.array(array) 54 | 55 | 56 | 57 | # bagger for xgboost 58 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True): 59 | 60 | # create array object to hold predictions 61 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))] 62 | #loop for as many times as we want bags 63 | for n in range (0, estimators): 64 | #shuff;e first, aids in increasing variance and forces different results 65 | #X_t,y_c=shuffle(X,y, random_state=seed+n) 66 | 67 | if update_seed: # update seed if requested, to give a slightly different model 68 | model.set_params(random_state=seed + n) 69 | model.fit(X_t,y_c) # fit model0.0917411475506 70 | preds=model.predict_proba(xt)[:,1] # predict probabilities 71 | # update bag's array 72 | for j in range (0, (xt.shape[0])): 73 | baggedpred[j]+=preds[j] 74 | print("done bag %d " % (n)) 75 | # divide with number of bags to create an average estimate 76 | for j in range (0, len(baggedpred)): 77 | baggedpred[j]/=float(estimators) 78 | # return probabilities 79 | return np.array(baggedpred) 80 | 81 | def main(): 82 | 83 | config = libavito.get_config() 84 | cache_loc = config.cache_loc 85 | nthreads = config.nthreads 86 | 87 | Usecv=True # true will split the training data 66-33 and do cv 88 | SEED=15 89 | threads=nthreads # number of workers for parallelism 90 | 91 | ######### Load files ############ 92 | print("Loading input data") 93 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr') 94 | y = train['isDuplicate'].values 95 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values 96 | del train 97 | print(X.shape) 98 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr') 99 | ids = test['id'].values 100 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values 101 | del test 102 | print(X_test.shape) 103 | 104 | 105 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions 106 | if not os.path.exists(metafolder): #if it does not exists, we create it 107 | os.makedirs(metafolder) 108 | outset="marios_xgsonv2_v5" # predic of all files 109 | 110 | #model to use 111 | 112 | model=xg.XGBoostClassifier(num_round=1000 ,nthread=threads, eta=0.1, gamma=0.0,max_depth=20, min_child_weight=1, subsample=1.0, 113 | colsample_bytree=0.9,objective='binary:logistic',silent=True, seed=1) 114 | 115 | #Create Arrays for meta 116 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)] # indices for trai 117 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )] #indices for test 118 | kfolder=[[idex1,idex2]] # create an object to put indices in 119 | 120 | #arrays to save predictions for validation and test for meta modelling (stacking) 121 | train_stacker=[ 0.0 for k in range (0,len(idex2)) ] 122 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))] 123 | 124 | #create target variable 125 | mean_kapa = 0.0 126 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time 127 | i=0 # iterator counter 128 | if Usecv: 129 | print ("starting cross validation" ) 130 | for train_index, test_index in kfolder: 131 | # creaning and validation sets 132 | X_train, X_cv = X[train_index], X[test_index] 133 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index] 134 | print (" train size: %d. 
test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) )) 135 | 136 | #use xgboost bagger 137 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True) 138 | 139 | # compute Loglikelihood metric for this CV fold 140 | #scalepreds(preds) 141 | kapa = roc_auc_score(y_cv,preds) 142 | print("size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa)) 143 | 144 | mean_kapa += kapa 145 | #save the results 146 | no=0 147 | for real_index in test_index: 148 | train_stacker[no]=(preds[no]) 149 | no+=1 150 | i+=1 151 | if (Usecv): 152 | #print the array of validation predictions for stacking later on inside the 'meta_folder' 153 | print (" Average AUC: %f" % (mean_kapa) ) 154 | print (" printing train datasets ") 155 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv") 156 | 157 | preds=bagged_set(X, y,model, SEED ,5, X_test, update_seed=True) 158 | 159 | 160 | for pr in range (0,len(preds)): 161 | test_stacker[pr]=(preds[pr]) 162 | #print prediction as numpy array for stacking later on 163 | preds=np.array(preds) 164 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv") 165 | 166 | #create submission file 167 | print("Write results...") 168 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv" 169 | print("Writing submission to %s" % output_file) 170 | f = open(config.output_loc + output_file, "w") 171 | f.write("id,probability\n")# the header 172 | for g in range(0, len(preds)) : 173 | pr=preds[g] 174 | f.write("%d,%f\n" % (((ids[g]),pr ) ) ) 175 | f.close() 176 | print("Done.") 177 | 178 | if __name__=="__main__": 179 | main() 180 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgrank_v2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.externals import joblib 3 | from sklearn.preprocessing import StandardScaler 4 | from sklearn.metrics import roc_auc_score 5 | import XGBoostClassifier as xg 6 | import os 7 | import libavito 8 | import feather 9 | 10 | # bagger for xgboost 11 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True): 12 | 13 | # create array object to hold predictions 14 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))] 15 | #loop for as many times as we want bags 16 | for n in range (0, estimators): 17 | #shuff;e first, aids in increasing variance and forces different results 18 | #X_t,y_c=shuffle(X,y, random_state=seed+n) 19 | 20 | if update_seed: # update seed if requested, to give a slightly different model 21 | model.set_params(random_state=seed + n) 22 | model.fit(X_t,y_c) # fit model0.0917411475506 23 | preds=model.predict(xt) # predict probabilities 24 | # update bag's array 25 | for j in range (0, (xt.shape[0])): 26 | baggedpred[j]+=preds[j] 27 | print("done bag %d " % (n)) 28 | # divide with number of bags to create an average estimate 29 | for j in range (0, len(baggedpred)): 30 | baggedpred[j]/=float(estimators) 31 | # return probabilities 32 | return np.array(baggedpred) 33 | 34 | 35 | 36 | 37 | def loadcolumn(filename,col=4, skip=1, floats=True): 38 | pred=[] 39 | op=open(filename,'r') 40 | if skip==1: 41 | op.readline() #header 42 | for line in op: 43 | line=line.replace('\n','') 44 | sps=line.split(',') 45 | #load always the last columns 46 | if floats: 47 | pred.append(float(sps[col])) 48 | else : 49 | pred.append(str(sps[col])) 50 | 
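# ---------------------------------------------------------------------------------
# Illustrative sketch (an aside, not code from this repository) of the validation
# scheme every model script here uses: a single 66/33 holdout by row index
# (idex1/idex2), with no shuffling because the data is ordered by time. Under
# Python 2 the `(n * 2) / 3` cut point is integer division; the standalone helper
# below reproduces that split explicitly.
def time_ordered_holdout(n_rows):
    cut = (n_rows * 2) // 3             # same cut as (n_rows * 2) / 3 under Python 2
    idex1 = list(range(0, cut))         # first two thirds -> training indices
    idex2 = list(range(cut, n_rows))    # last third -> validation / meta indices
    return [[idex1, idex2]]             # one "fold", shaped like the kfolder object
# ---------------------------------------------------------------------------------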
op.close() 51 | return pred 52 | 53 | def printfilcsve(X, filename): 54 | 55 | np.savetxt(filename,X, fmt='%.5f') 56 | 57 | 58 | # read the train and test allclean.csv files. skip errors 59 | def readfile(name, index=0): 60 | dopen=open(name,"r") 61 | array=[] 62 | skip_firstrow=False 63 | if index!=0: 64 | skip_firstrow=True 65 | for i,line in enumerate(dopen): 66 | if i==0 and skip_firstrow: 67 | continue 68 | splits=line.replace("\n","").replace(" ","").split(",") 69 | ar=[] 70 | for k in splits: 71 | try: 72 | ar.append(float(k)) 73 | except: 74 | ar.append(0.0) 75 | print(" the string is %s ok?" % ((k))) 76 | array.append(ar)#[float(k) if k!="0" else 0.0 for k in splits ]) 77 | if i%100000==0: 78 | print(" we are at " , str(i)) 79 | return np.array(array) 80 | 81 | 82 | def main(): 83 | 84 | config = libavito.get_config() 85 | cache_loc = config.cache_loc 86 | nthreads = config.nthreads 87 | 88 | Usecv=True # true will split the training data 66-33 and do cv 89 | SEED=15 90 | threads=nthreads # number of workers for parallelism 91 | 92 | ######### Load files ############ 93 | print("Loading input data") 94 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr') 95 | y = train['isDuplicate'].values 96 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values 97 | del train 98 | print(X.shape) 99 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr') 100 | ids = test['id'].values 101 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values 102 | del test 103 | print(X_test.shape) 104 | 105 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions 106 | 107 | model=xg.XGBoostClassifier(num_round=1000 ,nthread=threads, eta=0.02, gamma=7.0,max_depth=15, min_child_weight=20, subsample=0.9, 108 | colsample_bytree=0.4,objective='rank:pairwise',seed=1) 109 | 110 | #create meta folder to drop predictions for train and test 111 | if not os.path.exists(metafolder): #if it does not exist, we create it 112 | os.makedirs(metafolder) 113 | 114 | outset="marios_xgrank_v2" # prefix for all output files 115 | 116 | #model to use 117 | 118 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)] 119 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )] 120 | kfolder=[[idex1,idex2]] 121 | #Create Arrays for meta 122 | train_stacker=[ 0.0 for k in range (0,len(idex2)) ] 123 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))] 124 | # CHECK Everything in five..it could be more efficient 125 | 126 | #create target variable 127 | mean_kapa = 0.0 128 | #kfolder=StratifiedKFold(y, n_folds=number_of_folds,shuffle=True, random_state=SEED) 129 | #number_of_folds=0 130 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time 131 | i=0 # iterator counter 132 | if Usecv: 133 | print ("starting cross validation") 134 | for train_index, test_index in kfolder: 135 | # training and validation sets 136 | X_train, X_cv = X[train_index], X[test_index] 137 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index] 138 | print (" train size: %d. 
test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) )) 139 | 140 | 141 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True) 142 | 143 | 144 | # compute Loglikelihood metric for this CV fold 145 | #scalepreds(preds) 146 | kapa = roc_auc_score(y_cv,preds) 147 | print "size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa) 148 | 149 | mean_kapa += kapa 150 | #save the results 151 | no=0 152 | for real_index in test_index: 153 | train_stacker[no]=(preds[no]) 154 | no+=1 155 | i+=1 156 | if Usecv: 157 | print (" Average AUC: %f" % (mean_kapa) ) 158 | print (" printing train datasets ") 159 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv") 160 | 161 | 162 | #preds=bagged_set(X, y,model, SEED, 1, X_test, update_seed=True) 163 | 164 | preds=bagged_set(X, y,model, SEED , 5, X_test, update_seed=True) 165 | 166 | 167 | for pr in range (0,len(preds)): 168 | test_stacker[pr]=(preds[pr]) 169 | 170 | preds=np.array(preds) 171 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv") 172 | 173 | 174 | print("Write results...") 175 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv" 176 | print("Writing submission to %s" % output_file) 177 | f = open(config.output_loc + output_file, "w") 178 | f.write("id,probability\n")# the header 179 | for g in range(0, len(preds)) : 180 | pr=preds[g] 181 | f.write("%d,%f\n" % (((ids[g]),pr ) ) ) 182 | f.close() 183 | print("Done.") 184 | 185 | 186 | 187 | 188 | 189 | 190 | if __name__=="__main__": 191 | main() 192 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgson_v2.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import numpy as np 4 | from sklearn.externals import joblib 5 | from sklearn.metrics import roc_auc_score 6 | import XGBoostClassifier as xg 7 | import os 8 | import libavito 9 | import feather 10 | 11 | #load a single column from file 12 | def loadcolumn(filename,col=4, skip=1, floats=True): 13 | pred=[] 14 | op=open(filename,'r') 15 | if skip==1: 16 | op.readline() #header 17 | for line in op: 18 | line=line.replace('\n','') 19 | sps=line.split(',') 20 | #load always the last columns 21 | if floats: 22 | pred.append(float(sps[col])) 23 | else : 24 | pred.append(str(sps[col])) 25 | op.close() 26 | return pred 27 | 28 | 29 | #export file in csv using numpy 30 | def printfilcsve(X, filename): 31 | 32 | np.savetxt(filename,X, fmt='%.5f') 33 | 34 | # read the train and test allclean.csv files. skip errors 35 | def readfile(name, index=0): 36 | dopen=open(name,"r") 37 | array=[] 38 | skip_firstrow=False 39 | if index!=0: 40 | skip_firstrow=True 41 | for i,line in enumerate(dopen): 42 | if i==0 and skip_firstrow: 43 | continue 44 | splits=line.replace("\n","").replace(" ","").split(",") 45 | ar=[] 46 | for k in splits: 47 | try: 48 | ar.append(float(k)) 49 | except: 50 | ar.append(0.0) 51 | print(" the string is %s ok?" 
% ((k))) 52 | array.append(ar)#[float(k) if k!="0" else 0.0 for k in splits ]) 53 | if i%100000==0: 54 | print(" we are at " , str(i)) 55 | return np.array(array) 56 | 57 | 58 | # bagger for xgboost 59 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True): 60 | 61 | # create array object to hold predictions 62 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))] 63 | #loop for as many times as we want bags 64 | for n in range (0, estimators): 65 | #shuff;e first, aids in increasing variance and forces different results 66 | #X_t,y_c=shuffle(X,y, random_state=seed+n) 67 | 68 | if update_seed: # update seed if requested, to give a slightly different model 69 | model.set_params(random_state=seed + n) 70 | model.fit(X_t,y_c) # fit model0.0917411475506 71 | preds=model.predict_proba(xt)[:,1] # predict probabilities 72 | # update bag's array 73 | for j in range (0, (xt.shape[0])): 74 | baggedpred[j]+=preds[j] 75 | print("done bag %d " % (n)) 76 | # divide with number of bags to create an average estimate 77 | for j in range (0, len(baggedpred)): 78 | baggedpred[j]/=float(estimators) 79 | # return probabilities 80 | return np.array(baggedpred) 81 | 82 | 83 | def main(): 84 | config = libavito.get_config() 85 | cache_loc = config.cache_loc 86 | nthreads = config.nthreads 87 | 88 | Usecv=True # true will split the training data 66-33 and do cv 89 | SEED=15 90 | threads=nthreads # number of workers for parallelism 91 | 92 | ######### Load files ############ 93 | print("Loading input data") 94 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr') 95 | y = train['isDuplicate'].values 96 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values 97 | del train 98 | print(X.shape) 99 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr') 100 | ids = test['id'].values 101 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values 102 | del test 103 | print(X_test.shape) 104 | 105 | 106 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions 107 | if not os.path.exists(metafolder): #if it does not exists, we create it 108 | os.makedirs(metafolder) 109 | outset="marios_xgson_v2" # predic of all files 110 | 111 | #model to use 112 | 113 | model=xg.XGBoostClassifier(num_round=1000 ,nthread=threads, eta=0.02, gamma=7.0,max_depth=20, min_child_weight=20, subsample=0.9, 114 | colsample_bytree=0.4,objective='binary:logistic',seed=1) 115 | 116 | #Create Arrays for meta 117 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)] # indices for trai 118 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )] #indices for test 119 | kfolder=[[idex1,idex2]] # create an object to put indices in 120 | 121 | #arrays to save predictions for validation and test for meta modelling (stacking) 122 | train_stacker=[ 0.0 for k in range (0,(X.shape[0])) ] 123 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))] 124 | 125 | #create target variable 126 | mean_kapa = 0.0 127 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time 128 | i=0 # iterator counter 129 | if Usecv: 130 | print ("starting cross validation" ) 131 | for train_index, test_index in kfolder: 132 | # creaning and validation sets 133 | X_train, X_cv = X[train_index], X[test_index] 134 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index] 135 | print (" train size: %d. 
test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) )) 136 | 137 | #use xgboost bagger 138 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True) 139 | 140 | # compute Loglikelihood metric for this CV fold 141 | #scalepreds(preds) 142 | kapa = roc_auc_score(y_cv,preds) 143 | print "size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa) 144 | 145 | mean_kapa += kapa 146 | #save the results 147 | no=0 148 | for real_index in test_index: 149 | train_stacker[no]=(preds[no]) 150 | no+=1 151 | i+=1 152 | if (Usecv): 153 | #print the array of validation predictions for stacking later on inside the 'meta_folder' 154 | print (" Average AUC: %f" % (mean_kapa) ) 155 | print (" printing train datasets ") 156 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv") 157 | 158 | preds=bagged_set(X, y,model, SEED ,5, X_test, update_seed=True) 159 | 160 | 161 | for pr in range (0,len(preds)): 162 | test_stacker[pr]=(preds[pr]) 163 | #print prediction as numpy array for stacking later on 164 | preds=np.array(preds) 165 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv") 166 | 167 | #create submission file 168 | print("Write results...") 169 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv" 170 | print("Writing submission to %s" % output_file) 171 | f = open(config.output_loc + output_file, "w") 172 | f.write("id,probability\n")# the header 173 | for g in range(0, len(preds)) : 174 | pr=preds[g] 175 | f.write("%d,%f\n" % (((ids[g]),pr ) ) ) 176 | f.close() 177 | print("Done.") 178 | 179 | 180 | 181 | 182 | 183 | 184 | if __name__=="__main__": 185 | main() 186 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgson_v3.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import numpy as np 4 | from sklearn.externals import joblib 5 | from sklearn.metrics import roc_auc_score 6 | import XGBoostClassifier as xg 7 | import os 8 | import libavito 9 | import feather 10 | 11 | #load a single column from file 12 | def loadcolumn(filename,col=4, skip=1, floats=True): 13 | pred=[] 14 | op=open(filename,'r') 15 | if skip==1: 16 | op.readline() #header 17 | for line in op: 18 | line=line.replace('\n','') 19 | sps=line.split(',') 20 | #load always the last columns 21 | if floats: 22 | pred.append(float(sps[col])) 23 | else : 24 | pred.append(str(sps[col])) 25 | op.close() 26 | return pred 27 | 28 | 29 | 30 | #export file in csv using numpy 31 | def printfilcsve(X, filename): 32 | 33 | np.savetxt(filename,X, fmt='%.5f') 34 | 35 | # read the train and test allclean.csv files. skip errors 36 | def readfile(name, index=0): 37 | dopen=open(name,"r") 38 | array=[] 39 | skip_firstrow=False 40 | if index!=0: 41 | skip_firstrow=True 42 | for i,line in enumerate(dopen): 43 | if i==0 and skip_firstrow: 44 | continue 45 | splits=line.replace("\n","").replace(" ","").split(",") 46 | ar=[] 47 | for k in splits: 48 | try: 49 | ar.append(float(k)) 50 | except: 51 | ar.append(0.0) 52 | print(" the string is %s ok?" 
% ((k))) 53 | array.append(ar)#[float(k) if k!="0" else 0.0 for k in splits ]) 54 | if i%100000==0: 55 | print(" we are at " , str(i)) 56 | return np.array(array) 57 | 58 | 59 | # bagger for xgboost 60 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True): 61 | 62 | # create array object to hold predictions 63 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))] 64 | #loop for as many times as we want bags 65 | for n in range (0, estimators): 66 | #shuffle first, aids in increasing variance and forces different results 67 | #X_t,y_c=shuffle(X,y, random_state=seed+n) 68 | 69 | if update_seed: # update seed if requested, to give a slightly different model 70 | model.set_params(random_state=seed + n) 71 | model.fit(X_t,y_c) # fit model 72 | preds=model.predict_proba(xt)[:,1] # predict probabilities 73 | # update bag's array 74 | for j in range (0, (xt.shape[0])): 75 | baggedpred[j]+=preds[j] 76 | print("done bag %d " % (n)) 77 | # divide with number of bags to create an average estimate 78 | for j in range (0, len(baggedpred)): 79 | baggedpred[j]/=float(estimators) 80 | # return probabilities 81 | return np.array(baggedpred) 82 | 83 | 84 | def main(): 85 | 86 | config = libavito.get_config() 87 | cache_loc = config.cache_loc 88 | nthreads = config.nthreads 89 | 90 | Usecv=True # true will split the training data 66-33 and do cv 91 | SEED=15 92 | threads=nthreads # number of workers for parallelism 93 | 94 | ######### Load files ############ 95 | print("Loading input data") 96 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr') 97 | y = train['isDuplicate'].values 98 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values 99 | del train 100 | print(X.shape) 101 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr') 102 | ids = test['id'].values 103 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values 104 | del test 105 | print(X_test.shape) 106 | 107 | 108 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions 109 | if not os.path.exists(metafolder): #if it does not exist, we create it 110 | os.makedirs(metafolder) 111 | outset="marios_xgson_v3" # prefix for all output files 112 | 113 | #model to use 114 | 115 | model=xg.XGBoostClassifier(num_round=1000 ,nthread=threads, eta=0.02, gamma=7.0,max_depth=20, min_child_weight=20, subsample=0.9, 116 | colsample_bytree=0.4,objective='binary:logistic',seed=1) 117 | 118 | #Create Arrays for meta 119 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)] # indices for train 120 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )] #indices for test 121 | kfolder=[[idex1,idex2]] # create an object to put indices in 122 | 123 | #arrays to save predictions for validation and test for meta modelling (stacking) 124 | train_stacker=[ 0.0 for k in range (0,len(idex2)) ] 125 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))] 126 | 127 | #create target variable 128 | mean_kapa = 0.0 129 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time 130 | i=0 # iterator counter 131 | if Usecv: 132 | print ("starting cross validation" ) 133 | for train_index, test_index in kfolder: 134 | # training and validation sets 135 | X_train, X_cv = X[train_index], X[test_index] 136 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index] 137 | print (" train size: %d. 
test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) )) 138 | 139 | #use xgboost bagger 140 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True) 141 | 142 | # compute AUC metric for this CV fold 143 | #scalepreds(preds) 144 | kapa = roc_auc_score(y_cv,preds) 145 | print "size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa) 146 | 147 | mean_kapa += kapa 148 | #save the results 149 | no=0 150 | for real_index in test_index: 151 | train_stacker[no]=(preds[no]) 152 | no+=1 153 | i+=1 154 | if (Usecv): 155 | #print the array of validation predictions for stacking later on inside the 'meta_folder' 156 | print (" Average AUC: %f" % (mean_kapa) ) 157 | print (" printing train datasets ") 158 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv") 159 | 160 | preds=bagged_set(X, y,model, SEED ,5, X_test, update_seed=True) 161 | 162 | 163 | for pr in range (0,len(preds)): 164 | test_stacker[pr]=(preds[pr]) 165 | #print prediction as numpy array for stacking later on 166 | preds=np.array(preds) 167 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv") 168 | 169 | #create submission file 170 | print("Write results...") 171 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv" 172 | print("Writing submission to %s" % output_file) 173 | f = open(config.output_loc + output_file, "w") 174 | f.write("id,probability\n")# the header 175 | for g in range(0, len(preds)) : 176 | pr=preds[g] 177 | f.write("%d,%f\n" % (((ids[g]),pr ) ) ) 178 | f.close() 179 | print("Done.") 180 | 181 | 182 | 183 | 184 | 185 | 186 | if __name__=="__main__": 187 | main() 188 | -------------------------------------------------------------------------------- /Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgrank_v3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.preprocessing import StandardScaler 3 | from sklearn.metrics import roc_auc_score 4 | import XGBoostClassifier as xg 5 | import os 6 | import libavito 7 | import feather 8 | 9 | # bagger for xgboost 10 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True): 11 | 12 | # create array object to hold predictions 13 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))] 14 | #loop for as many times as we want bags 15 | for n in range (0, estimators): 16 | #shuffle first, aids in increasing variance and forces different results 17 | #X_t,y_c=shuffle(X,y, random_state=seed+n) 18 | 19 | if update_seed: # update seed if requested, to give a slightly different model 20 | model.set_params(random_state=seed + n) 21 | model.fit(X_t,y_c) # fit model 22 | preds=model.predict(xt) # predict probabilities 23 | # update bag's array 24 | for j in range (0, (xt.shape[0])): 25 | baggedpred[j]+=preds[j] 26 | print("done bag %d " % (n)) 27 | # divide with number of bags to create an average estimate 28 | for j in range (0, len(baggedpred)): 29 | baggedpred[j]/=float(estimators) 30 | # return probabilities 31 | return np.array(baggedpred) 32 | 33 | def loadcolumn(filename,col=4, skip=1, floats=True): 34 | pred=[] 35 | op=open(filename,'r') 36 | if skip==1: 37 | op.readline() #header 38 | for line in op: 39 | line=line.replace('\n','') 40 | sps=line.split(',') 41 | #load always the last columns 42 | if floats: 43 | pred.append(float(sps[col])) 44 | else : 45 | pred.append(str(sps[col])) 46 | op.close() 47 | return pred 48 | 49 
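# ---------------------------------------------------------------------------------
# Illustrative sketch (an aside, not code from this repository) of the two artefacts
# each model script writes: per-model meta files (<outset>train.csv / <outset>test.csv
# in cache_loc/meta_folder/, raw probabilities for the level-2 stacker) and a Kaggle
# submission with an "id,probability" header. The submission loop above can be
# isolated as a small helper; `ids` and `preds` are assumed to be the same arrays
# the scripts already build.
def write_submission(path, ids, preds):
    f = open(path, "w")
    f.write("id,probability\n")                     # header expected by the competition
    for item_id, p in zip(ids, preds):
        f.write("%d,%f\n" % (int(item_id), p))      # same "%d,%f" formatting as above
    f.close()
# ---------------------------------------------------------------------------------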
| 50 | def printfilcsve(X, filename): 51 | 52 | np.savetxt(filename,X, fmt='%.5f') 53 | 54 | 55 | # read the train and test allclean.csv files. skip errors 56 | def readfile(name, index=0): 57 | dopen=open(name,"r") 58 | array=[] 59 | skip_firstrow=False 60 | if index!=0: 61 | skip_firstrow=True 62 | for i,line in enumerate(dopen): 63 | if i==0 and skip_firstrow: 64 | continue 65 | splits=line.replace("\n","").replace(" ","").split(",") 66 | ar=[] 67 | for k in splits: 68 | try: 69 | ar.append(float(k)) 70 | except: 71 | ar.append(0.0) 72 | print(" the string is %s ok?" % ((k))) 73 | array.append(ar)#[float(k) if k!="0" else 0.0 for k in splits ]) 74 | if i%100000==0: 75 | print(" we are at " , str(i)) 76 | return np.array(array) 77 | 78 | 79 | def main(): 80 | config = libavito.get_config(); cache_loc = config.cache_loc; nthreads = config.nthreads # read cache location and thread count from the shared config 81 | Use_scale=True 82 | Usecv=True # true will split the training data 66-33 and do cv 83 | SEED=15 84 | threads=nthreads # number of workers for parallelism 85 | 86 | ######### Load files ############ 87 | print("Loading input data") 88 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr') 89 | y = train['isDuplicate'].values 90 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values 91 | del train 92 | print(X.shape) 93 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr') 94 | ids = test['id'].values 95 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values 96 | del test 97 | print(X_test.shape) 98 | 99 | 100 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions 101 | 102 | 103 | model=xg.XGBoostClassifier(num_round=1000 ,nthread=threads, eta=0.02, gamma=7.0,max_depth=15, min_child_weight=20, subsample=0.9, 104 | colsample_bytree=0.4,objective='rank:pairwise',seed=1) 105 | 106 | if not os.path.exists(metafolder): #if it does not exist, we create it 107 | os.makedirs(metafolder) 108 | 109 | outset="marios_xgrank_v3" # prefix for all output files 110 | 111 | #model to use 112 | 113 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)] 114 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )] 115 | kfolder=[[idex1,idex2]] 116 | #Create Arrays for meta 117 | train_stacker=[ 0.0 for k in range (0,len(idex2)) ] 118 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))] 119 | # CHECK Everything in five..it could be more efficient 120 | 121 | #create target variable 122 | mean_kapa = 0.0 123 | #kfolder=StratifiedKFold(y, n_folds=number_of_folds,shuffle=True, random_state=SEED) 124 | #number_of_folds=0 125 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time 126 | i=0 # iterator counter 127 | if Usecv: 128 | print ("starting cross validation") 129 | for train_index, test_index in kfolder: 130 | # training and validation sets 131 | X_train, X_cv = X[train_index], X[test_index] 132 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index] 133 | print (" train size: %d. 
test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) )) 134 | 135 | if Use_scale: 136 | stda=StandardScaler() 137 | X_train=stda.fit_transform(X_train) 138 | X_cv=stda.transform(X_cv) 139 | 140 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True) 141 | 142 | 143 | # compute Loglikelihood metric for this CV fold 144 | #scalepreds(preds) 145 | kapa = roc_auc_score(y_cv,preds) 146 | print "size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa) 147 | 148 | mean_kapa += kapa 149 | #save the results 150 | no=0 151 | for real_index in test_index: 152 | train_stacker[no]=(preds[no]) 153 | no+=1 154 | i+=1 155 | if Usecv: 156 | print (" Average AUC: %f" % (mean_kapa) ) 157 | print (" printing train datasets ") 158 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv") 159 | 160 | if Use_scale: 161 | stda=StandardScaler() 162 | X=stda.fit_transform(X) 163 | X_test=stda.transform(X_test) 164 | 165 | #preds=bagged_set(X, y,model, SEED, 1, X_test, update_seed=True) 166 | 167 | preds=bagged_set(X, y,model, SEED , 5, X_test, update_seed=True) 168 | 169 | 170 | for pr in range (0,len(preds)): 171 | test_stacker[pr]=(preds[pr]) 172 | 173 | preds=np.array(preds) 174 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv") 175 | 176 | 177 | print("Write results...") 178 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv" 179 | print("Writing submission to %s" % output_file) 180 | f = open(config.output_loc + output_file, "w") 181 | f.write("id,probability\n")# the header 182 | for g in range(0, len(preds)) : 183 | pr=preds[g] 184 | f.write("%d,%f\n" % (((ids[g]),pr ) ) ) 185 | f.close() 186 | print("Done.") 187 | 188 | 189 | 190 | 191 | 192 | 193 | if __name__=="__main__": 194 | main() 195 | --------------------------------------------------------------------------------
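marios_xgrank_v3 optionally standardises the feature matrices (Use_scale): the scaler is fitted on the training rows only and then re-applied to the validation and test rows, so no statistics leak from the held-out data. A minimal sketch of that pattern in isolation, assuming only scikit-learn; the helper name is illustrative:

from sklearn.preprocessing import StandardScaler

def scale_pair(X_fit, X_apply):
    stda = StandardScaler()
    X_fit_scaled = stda.fit_transform(X_fit)      # learn mean/std on the fitting split only
    X_apply_scaled = stda.transform(X_apply)      # reuse the same transform on other splits
    return X_fit_scaled, X_apply_scaled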
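The meta-model step in runAll.sh (code/models/meta_rf_v1.py, not included in this listing) consumes the single-column <outset>train.csv / <outset>test.csv files that each base model drops into cache_loc/meta_folder/. A minimal sketch of how those files could be assembled into level-2 feature matrices; the helper name and the model list in the usage comment are illustrative assumptions, not code from the repository:

import numpy as np

def load_meta_features(meta_folder, model_names):
    # each file holds one prediction per row, written above by np.savetxt(fmt='%.5f')
    train_cols = [np.loadtxt(meta_folder + m + "train.csv") for m in model_names]
    test_cols = [np.loadtxt(meta_folder + m + "test.csv") for m in model_names]
    return np.column_stack(train_cols), np.column_stack(test_cols)

# e.g. load_meta_features(cache_loc + "meta_folder/",
#                         ["marios_xgson_v2", "marios_xgson_v3", "marios_xgrank_v2"])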