├── AnalyticsVidhya
│   ├── readme.md
│   ├── WNS-analytics-wizard-2019
│   │   └── README.md
│   ├── amexpert-2019-machine-learning-hackathon
│   │   └── code
│   │       ├── README.md
│   │       ├── agg_feature_2_merge.R
│   │       ├── agg_feature.R
│   │       └── agg_feature_2.R
│   ├── Knocktober
│   │   └── readme.md
│   ├── Date-your-Data
│   │   ├── feature_df_all_CountOfApplications.R
│   │   ├── 11_Ensemble_Models.R
│   │   ├── feature_df_all_Match_Internship_Location_with_other_locations.R
│   │   ├── BUILD_FINAL_SUBMISSION.R
│   │   ├── README.md
│   │   ├── 4_feature_internship_SkillsCoding.R
│   │   ├── 3_feature_internship_Profile_Coding.R
│   │   ├── 6_feature_student_degreeCoding.R
│   │   ├── 7_feature_student_ExperienceCoding.R
│   │   ├── 2_feature_internship_Profile_WordCount.R
│   │   ├── 5_feature_student_StreamsCoding.R
│   │   ├── 9_model_XGB_1.R
│   │   ├── 10_model_XGB_1.R
│   │   └── 1_internship_WordCorrection.R
│   └── AVDatafest_XtremeML
│       ├── input
│       │   └── holiday.csv
│       └── README.md
├── Kaggle
│   ├── readme.md
│   └── Avito Duplicate Ad Detection
│       ├── input
│       │   └── README.txt
│       ├── output
│       │   └── README.txt
│       ├── cache
│       │   └── README.txt
│       ├── tokenizers
│       │   └── punkt
│       │       └── PY3
│       │           └── russian.pickle
│       ├── Documentation - TheQuants Team - Avito Contest.pdf
│       ├── code
│       │   ├── 5_data_postprocessing.py
│       │   ├── feature_verification.py
│       │   ├── 3_feature_set1d_interaction.R
│       │   ├── libavito.py
│       │   ├── models
│       │   │   ├── libavito.py
│       │   │   ├── marios_xgregv3.py
│       │   │   ├── marios_xgson_v4.py
│       │   │   ├── marios_xgsonv2_v5.py
│       │   │   ├── marios_xgrank_v2.py
│       │   │   ├── marios_xgson_v2.py
│       │   │   ├── marios_xgson_v3.py
│       │   │   └── marios_xgrank_v3.py
│       │   ├── 3_feature_set4a_fuzzy.py
│       │   ├── 3_feature_set1g_capitalLetters.R
│       │   ├── 3_feature_set4b_fuzzy_clean.py
│       │   ├── 3_feature_set1f_SpecialCounting.R
│       │   ├── legacy
│       │   │   └── 3_feature_set4e_count3way_clean.py
│       │   ├── 3_feature_set3d_json1.py
│       │   ├── 3_feature_set3c_json.py
│       │   ├── 3_feature_set3b_title.py
│       │   ├── 3_feature_set3a_description.py
│       │   ├── 3_feature_set1e_attribute.R
│       │   ├── 3_json_to_cols.py
│       │   ├── functions.R
│       │   ├── 3_feature_set3f_hamming.py
│       │   ├── 2_image_info.py
│       │   ├── 3_feature_set1a_ngram.R
│       │   ├── 3_feature_set1b_nchar.R
│       │   ├── 5_consolidate_features.R
│       │   ├── 3_feature_set4c_alternate.py
│       │   ├── 3_feature_set1c_misc.R
│       │   └── 1_data_preprocessing.py
│       ├── config.cfg
│       ├── README.md
│       └── runAll.sh
├── README.md
├── HackerEarth
│   ├── Predict Lanes from LIDAR data
│   │   ├── Rplot.png
│   │   ├── final_1_calculateHaversineDistance.R
│   │   ├── README.md
│   │   ├── final_2_buildData.R
│   │   └── final_3_buildModel.R
│   └── Loan Default ML Challenge
│       └── README.md
├── HackerRank
│   └── Walmart-Codesprint
│       └── readme.md
└── Microsoft
    └── Womens-Health-Risk-Assessment
        ├── README.md
        └── Predict.R
/AnalyticsVidhya/readme.md:
--------------------------------------------------------------------------------
1 | Analytics Vidhya Hackathons
2 |
--------------------------------------------------------------------------------
/Kaggle/readme.md:
--------------------------------------------------------------------------------
1 | Repository for all my Kaggle Competitions
2 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/input/README.txt:
--------------------------------------------------------------------------------
1 | This is the default location for input files
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Competitions
2 | Repository of the various competitions I participate in
3 |
4 | (c) Sonny Laskar
5 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/output/README.txt:
--------------------------------------------------------------------------------
1 | This is the default location for output files such as submission files
2 |
--------------------------------------------------------------------------------
/AnalyticsVidhya/WNS-analytics-wizard-2019/README.md:
--------------------------------------------------------------------------------
1 | # Repo for WNS Hackathon #
2 | https://github.com/sonnylaskar/wns-analytics-wizard-2019
3 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/cache/README.txt:
--------------------------------------------------------------------------------
1 | This is the default location for cache files such as models and cleaned data/features
2 |
--------------------------------------------------------------------------------
/HackerEarth/Predict Lanes from LIDAR data/Rplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sonnylaskar/Competitions/HEAD/HackerEarth/Predict Lanes from LIDAR data/Rplot.png
--------------------------------------------------------------------------------
/HackerRank/Walmart-Codesprint/readme.md:
--------------------------------------------------------------------------------
1 | # Walmart Codesprint
2 |
3 | ## Final Rank: 14 / 132
4 |
5 | https://www.hackerrank.com/contests/walmart-codesprint-ml/challenges
6 |
7 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/tokenizers/punkt/PY3/russian.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sonnylaskar/Competitions/HEAD/Kaggle/Avito Duplicate Ad Detection/tokenizers/punkt/PY3/russian.pickle
--------------------------------------------------------------------------------
/AnalyticsVidhya/amexpert-2019-machine-learning-hackathon/code/README.md:
--------------------------------------------------------------------------------
1 | American Express Coupon Conversion Hackathon
2 | https://datahack.analyticsvidhya.com/contest/amexpert-2019-machine-learning-hackathon/
3 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/Documentation - TheQuants Team - Avito Contest.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sonnylaskar/Competitions/HEAD/Kaggle/Avito Duplicate Ad Detection/Documentation - TheQuants Team - Avito Contest.pdf
--------------------------------------------------------------------------------
/AnalyticsVidhya/Knocktober/readme.md:
--------------------------------------------------------------------------------
1 | # Competition:
2 | https://datahack.analyticsvidhya.com/contest/knocktober-2016/
3 |
4 | # Problem Type:
5 | Binary Classification
6 |
7 | # Models:
8 |
9 | 2 Bags of XGBoost
10 |
11 | 2 Bags of GBM
12 |
13 | Equal Weighted Rank Average of the above models.
14 |
15 | # Score:
16 | Public LB: 0.8362 (Rank 4)
17 |
18 | Private LB: 0.7685 (Rank 3)
19 |
--------------------------------------------------------------------------------
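The bagging and ensembling code itself is not part of this folder, so here is a minimal sketch of the equal-weighted rank averaging described in the readme above, assuming two hypothetical prediction vectors `pred_xgb` and `pred_gbm` standing in for the averaged XGBoost and GBM bags:

```r
# Hypothetical example: equal-weighted rank average of two model outputs.
pred_xgb <- c(0.91, 0.12, 0.55, 0.78)   # e.g. mean of the 2 XGBoost bags
pred_gbm <- c(0.85, 0.20, 0.60, 0.70)   # e.g. mean of the 2 GBM bags

rank_xgb <- rank(pred_xgb) / length(pred_xgb)   # map scores to ranks in (0, 1]
rank_gbm <- rank(pred_gbm) / length(pred_gbm)

final_score <- (rank_xgb + rank_gbm) / 2        # equal weights for both models
final_score
```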
/AnalyticsVidhya/Date-your-Data/feature_df_all_CountOfApplications.R:
--------------------------------------------------------------------------------
1 | #Feature
2 | #Add a column of how many applications received for any Internship_ID
3 | Intern_Freq <- data.frame(table(df_all$Internship_ID))
4 | names(Intern_Freq) <- c("Internship_ID", "Internship_ApplicationCount")
5 | Intern_Freq$Internship_ID <- as.integer(as.character(Intern_Freq$Internship_ID))
6 | df_all <- left_join(df_all, Intern_Freq, by = "Internship_ID" )
7 | rm(Intern_Freq)
8 | #END
--------------------------------------------------------------------------------
/AnalyticsVidhya/Date-your-Data/11_Ensemble_Models.R:
--------------------------------------------------------------------------------
1 | library(readr)
2 |
3 | #Ensemble the 2 XGB models
4 | MODEL_1 <- read_csv("../Submissions/XGB_MODEL_S123_N526.csv")
5 | MODEL_2 <- read_csv("../Submissions/XGB_MODEL_S500_N710.csv")
6 |
7 | MEANSCORE <- (MODEL_1$Is_Shortlisted + MODEL_2$Is_Shortlisted) / 2
8 |
9 | #SAVE
10 | submission <- data.frame(Internship_ID = MODEL_1$Internship_ID,
11 | Student_ID = MODEL_1$Student_ID,
12 | Is_Shortlisted = MEANSCORE)
13 | write_csv(submission,"../Submissions/FINAL_SUBMISSION.csv")
14 |
--------------------------------------------------------------------------------
/AnalyticsVidhya/Date-your-Data/feature_df_all_Match_Internship_Location_with_other_locations.R:
--------------------------------------------------------------------------------
1 | #Feature
2 | #Add whether InternLocation matches hometownLocationCode,
3 | #whether it matches InstitudeLocationCode,
4 | #and whether it matches PreferredLocationCode
5 |
6 | df_all$isIntern_Loc_Match_HomeTown <- ifelse(df_all$LocationCode == df_all$hometownLocationCode, 1, 0)
7 | df_all$isIntern_Loc_Match_InstitudeLocationCode <- ifelse(df_all$LocationCode == df_all$InstitudeLocationCode, 1, 0)
8 | df_all$isIntern_Loc_Match_PreferredLocationCode <- ifelse(df_all$LocationCode == df_all$PreferredLocationCode, 1, 0)
9 |
--------------------------------------------------------------------------------
/HackerEarth/Loan Default ML Challenge/README.md:
--------------------------------------------------------------------------------
1 | Code for the HackerEarth competition on detecting loan defaulters
2 | https://www.hackerearth.com/challenge/competitive/machine-learning-challenge-one/
3 |
4 | # The code was built on the following platform:
5 | OS: Linux (CentOS)
6 | RAM: 16GB
7 | CPU Core: 8
8 |
9 | Software:
10 | R 3.3.2
11 |
12 | R Packages:
13 | readr, dplyr, caret, xgboost, gbm, data.table, lightgbm, tm, stringr, ModelMetrics
14 |
15 | To generate the final submission:
16 | 1) Create the folders: input, code, output
17 | 2) Put the data files in input folder
18 | 3) Save the R files in code folder
19 | 4) Execute the command below:
20 | Rscript final_model.R
21 | 5) It will take around 1 hour to complete
22 | 6) The final submission file will be created in the output folder
23 |
24 |
25 |
--------------------------------------------------------------------------------
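A minimal sketch of the directory setup described in the steps above; `final_model.R` itself is not included in this dump, so the run command is shown only as a comment:

```r
# Sketch only: create the folder layout from the README steps.
for (d in c("input", "code", "output")) {
  dir.create(d, showWarnings = FALSE)
}
# Place the competition data files in input/ and the R scripts in code/,
# then run the model script from the code/ directory, e.g.:
# setwd("code"); system("Rscript final_model.R")
```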
/AnalyticsVidhya/Date-your-Data/BUILD_FINAL_SUBMISSION.R:
--------------------------------------------------------------------------------
1 | #This will build the Final Solution
2 | #Will take some time
3 |
4 | source("1_internship_WordCorrection.R")
5 | source("2_feature_internship_Profile_WordCount.R")
6 | source("3_feature_internship_Profile_Coding.R")
7 | source("4_feature_internship_SkillsCoding.R")
8 | source("5_feature_student_StreamsCoding.R")
9 | source("6_feature_student_degreeCoding.R")
10 | source("7_feature_student_ExperienceCoding.R")
11 | source("8_preprocessing.R")
12 |
13 | print("Building First XGB model")
14 | source("9_model_XGB_1.R")
15 | print("Building Second XGB model")
16 | source("10_model_XGB_1.R")
17 |
18 | print("Calculating the Average of the 2 models")
19 | source("11_Ensemble_Models.R")
20 | print("Huh!!!, I am done!")
21 | print("Check out FINAL_SUBMISSION FILE in Submission FOlder")
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/5_data_postprocessing.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import libavito as a
4 | import feather as f
5 | import time
6 |
7 | cache_loc = a.get_config().cache_loc
8 |
9 | start = time.time()
10 | print('Transforming training data ... ', end='', flush=True)
11 | df = f.read_dataframe(cache_loc + 'final_featureSet_train.fthr')
12 | df.replace([np.nan, None], -1, inplace=True)
13 | df.replace([np.inf, -np.inf], 9999.99, inplace=True)
14 | f.write_dataframe(df, cache_loc + 'final_featureSet_train.fthr')
15 | del df
16 | a.print_elapsed(start)
17 |
18 | start = time.time()
19 | print('Transforming testing data ... ', end='', flush=True)
20 | df = f.read_dataframe(cache_loc + 'final_featureSet_test.fthr')
21 | df.replace([np.nan, None], -1, inplace=True)
22 | df.replace([np.inf, -np.inf], 9999.99, inplace=True)
23 | f.write_dataframe(df, cache_loc + 'final_featureSet_test.fthr')
24 | a.print_elapsed(start)
25 |
26 |
--------------------------------------------------------------------------------
/AnalyticsVidhya/amexpert-2019-machine-learning-hackathon/code/agg_feature_2_merge.R:
--------------------------------------------------------------------------------
1 | library(tidyverse)
2 | library(lubridate)
3 |
4 | createDf <- function(file_names) {
5 | df <- tibble()
6 | for (i in file_names) {
7 | tmp <- suppressMessages(read_csv(i))
8 | if (nrow(df) == 0) {
9 | df <- tmp
10 | } else {
11 | df <- left_join(df, tmp, by = c("CampaignDate", "customer_id", "item_id"))
12 | }
13 | rm(tmp)
14 | gc()
15 | }
16 | df
17 | }
18 |
19 | #[1] 26 27 28 29 30 1 2 3 4 5 6 7 8 9 10 11 12 13 16 17 18 19 20 21 22 23 24 25
20 | #assign("df_2", df_1)
21 |
22 | df <- tibble()
23 | for (i in 1:30) {
24 | print(i)
25 | tmp <- createDf(list.files(path = "../input/",
26 | pattern = paste0("agg_feat_",i,"_"),
27 | full.names = T))
28 | #assign(paste0("df_", i), df)
29 | if (nrow(df) == 0) {
30 | df <- tmp
31 | } else {
32 | df <- bind_rows(df, tmp)
33 | }
34 | rm(tmp)
35 | gc()
36 | }
37 |
38 | write_csv(df, "../input/agg_v2.csv")
39 |
40 |
--------------------------------------------------------------------------------
/Microsoft/Womens-Health-Risk-Assessment/README.md:
--------------------------------------------------------------------------------
1 | # Microsoft - Womens Health Risk Assessment Machine Learning Competition
2 | https://gallery.cortanaintelligence.com/Competition/Women-s-Health-Risk-Assessment-1
3 |
4 | # Problem
5 | ## Type of Problem:
6 | Supervised Multiclass Classification Problem
7 |
8 | ## Problem Description:
9 |
10 | The objective of this machine learning competition is to build machine learning models that assign a young woman (15-30 years old) in one of the 9 underdeveloped regions to a risk segment, and to a subgroup within that segment.
11 | Given accurate risk-segment and subgroup assignments in each region, a healthcare practitioner can deliver services that protect the subject from health risks, specifically sexual and reproductive health risks (such as HIV infection). The types of services are personalized, based on the risk segment and subgroup assignments.
12 |
13 | ## Evaluation:
14 | Accuracy
15 |
16 | ## Score:
17 | Public Leaderboard : _87.316611_ Rank: _7_ / 493
18 |
19 | Private Leaderboard : _87.144886_ Rank: _7_ / 493
20 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/config.cfg:
--------------------------------------------------------------------------------
1 | # All file locations must be unix locations - FOLDERS MUST END IN '/'
2 |
3 | ###########################
4 | ##### MACHINE CONFIG #####
5 | ##########################
6 | # When preprocessing_nthreads is set to 1, you will receive
7 | # more progress/speed updates.
8 | preprocessing_nthreads = 12
9 | model_nthreads = 12
10 |
11 | # Set to the folder where config.cfg resides
12 | BASE_DIR = '/path/to/directory/'
13 |
14 | # Location to store intermediate files (eg. models or processed features) - SSD suggested
15 | cache_loc = './cache/'
16 |
17 | # Location to put output files
18 | output_loc = './output/'
19 |
20 | #######################
21 | ##### INPUT FILES #####
22 | #######################
23 |
24 | train_ItemInfo = './input/ItemInfo_train.csv'
25 | train_ItemPairs = './input/ItemPairs_train.csv'
26 |
27 | test_ItemInfo = './input/ItemInfo_test.csv'
28 | test_ItemPairs = './input/ItemPairs_test.csv'
29 |
30 | category_csv = './input/Category.csv'
31 | location_csv = './input/Location.csv'
32 |
33 | images_root = '/path/to/images/'
34 |
--------------------------------------------------------------------------------
/HackerEarth/Predict Lanes from LIDAR data/final_1_calculateHaversineDistance.R:
--------------------------------------------------------------------------------
1 | library(tidyverse)
2 | library(stringr)
3 | library(geosphere)
4 |
5 | label <- read_csv("../input/labels.csv")
6 | label$roadCoordinates <- NULL
7 | train <- read_csv("../input/train.csv")
8 | test <- read_csv("../input/test.csv")
9 | df_all <- bind_rows(train, test)
10 |
11 | getDis <- function(x) {
12 | x <- as.data.frame(matrix(as.numeric(unlist(strsplit(unlist(strsplit(x, "\\|")), " "))), byrow = T, ncol = 2))
13 | x$V1 <- ifelse(x$V1 < -90, -90, x$V1)
14 | x$V2 <- ifelse(x$V2 < -90, -90, x$V2)
15 | x$V1 <- ifelse(x$V1 > 90, 90, x$V1)
16 | x$V2 <- ifelse(x$V2 > 90, 90, x$V2)
17 | x <- arrange(x, V1, V2)[c(1, nrow(x)), ]
18 | distHaversine(x[, 1], x[, 2])
19 | }
20 |
21 |
22 | getHaversineDistance <- function(id) {
23 | median(sapply(df_all$laneLineCoordinates[df_all$roadId == id], getDis, USE.NAMES = F) )
24 | }
25 |
26 | roads <- data_frame(roadId = unique(df_all$roadId))
27 | roads$haversineDistance <- (sapply(roads$roadId, getHaversineDistance))
28 |
29 | write_csv(roads, "../input/roadsDistance.csv")
30 |
31 |
32 |
--------------------------------------------------------------------------------
/AnalyticsVidhya/Date-your-Data/README.md:
--------------------------------------------------------------------------------
1 | # AnalyticsVidhya Date Your Data Contest
2 | This repository contains the code I used in the "Date Your Data Contest". It scored 0.7006 on the Private Leaderboard and secured 3rd position in the contest.
3 |
4 | https://www.analyticsvidhya.com/blog/2016/03/winning-solutions-dyd-competition-xgboost-ruled/
5 |
6 | A) Prerequisites
7 |
8 | Ensure that the following packages are installed:
9 | dplyr, tidyr, xgboost, tm, SnowballC, readr, qdap, stringr, stylo, caret
10 |
11 | B) Build Submission File
12 |
13 | 1) Ensure that all datasets, i.e. Student, Internships, train and test, are
14 | present in the "data" folder. Download them from the link in the description of this repository
15 | 2) Execute the Rscript "BUILD_FINAL_SUBMISSION.R"
16 | 3) Wait for the script to complete
17 | 4) Check the "Submissions" folder for the final submission file
18 |
19 | C) Improvements
20 |
21 | 1) I couldn't try other models for lack of time. Building other models might also be helpful.
22 | 2) I have built a long list of features but haven't removed the unnecessary ones. Feature selection (e.g. using the model's importance scores, sketched after this file) would help reduce the feature set.
23 |
--------------------------------------------------------------------------------
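The feature-selection idea in improvement C.2 above could look like the following sketch. It is not part of the original pipeline; it assumes a trained xgb.Booster `bst` and the processed `train` data frame from 9_model_XGB_1.R, with Is_Shortlisted as the first column:

```r
# Hypothetical feature-selection sketch based on XGBoost gain importance.
library(xgboost)

imp <- xgb.importance(model = bst)               # columns: Feature, Gain, Cover, Frequency
imp <- imp[order(-imp$Gain), ]                   # most important features first
keep <- imp$Feature[cumsum(imp$Gain) <= 0.99]    # features covering 99% of total gain

train_reduced <- train[, c("Is_Shortlisted", keep)]
```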
/Kaggle/Avito Duplicate Ad Detection/code/feature_verification.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import libavito as a
4 | from multiprocessing import Pool
5 |
6 | df1 = pd.read_csv('')
7 | df2 = pd.read_csv('')
8 |
9 | def find_best_feature(c):
10 | ftr = df1[c].values
11 |
12 | high_correl = 0
13 | high_ftr = ''
14 | num_995 = 0
15 | for c2 in df2.columns:
16 | cor = np.corrcoef(ftr, df2[c2])[0, 1]
17 | if cor > 0.995:
18 | num_995 += 1
19 | if cor > high_correl:
20 | high_correl = cor
21 | high_ftr = c2
22 |
23 | return high_correl, high_ftr, num_995
24 |
25 | for c in df1.columns:
26 | hc, hf, n995 = find_best_feature(c)
27 |
28 | if hc == 1:
29 | print(a.c.OKGREEN + (c + ' -> ' + hf).ljust(60) + ' | CORREL 1' + a.c.END)
30 | elif hc > 0.995:
31 | print(a.c.OKBLUE + (c + ' -> ' + hf).ljust(60) + ' | CORREL ' + str(hc) + a.c.END)
32 | elif hc > 0.95:
33 | print(a.c.WARNING + (c + ' -> ' + hf).ljust(60) + ' | CORREL ' + str(hc) + a.c.END)
34 | else:
35 |         print(a.c.FAIL + (c + ' -> ???? ').ljust(60) + ' | ' + str(hc) + ' ' + hf + a.c.END)
36 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/3_feature_set1d_interaction.R:
--------------------------------------------------------------------------------
1 | # This script is called by 3_feature_set1d_misc.R script
2 | # DO NOT CALL it Directly
3 | # Start Interaction feature script
4 | print("Starting Interaction feature script")
5 | featureList <- c("isMetroIdSame", "isLocationIDSame", "isRegionIDSame", "isLongitudeSame", "isLatitudeSame", "isTitleSame", "isdescriptionSame")
6 | featureList <- combn(featureList, 2)
7 |
8 | create_interaction <- function(x) {
9 | i <- x[1]
10 | j <- x[2]
11 | print(c(i, j))
12 | columnName <- paste("interaction", i, j, sep = "_")
13 | set1d[[columnName]] <<- ifelse(set1d[[i]] == 1 & set1d[[j]] == 1, 1, 0)
14 | return(NULL)
15 | }
16 | apply(featureList, 2, create_interaction)
17 |
18 | set1d <- set1d[, grep("interaction", names(set1d), value = T)] #Filter only interaction features
19 | names(set1d) <- paste("set1d", names(set1d), sep = "_")
20 |
21 |
22 | ######## Add Primary Columns ItemID1 and ItemID2
23 | set1d <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], set1d)
24 | print("Saving Interaction features features")
25 | write_feather(set1d, paste(cache_loc, "/", "features_", trainOrTest, "_set1d_", "interaction.fthr", sep = "" ))
26 |
27 | #END
28 |
29 |
--------------------------------------------------------------------------------
/AnalyticsVidhya/amexpert-2019-machine-learning-hackathon/code/agg_feature.R:
--------------------------------------------------------------------------------
1 | library(tidyverse)
2 | library(lubridate)
3 |
4 | campaign_data <- read_csv("../input/campaign_data.csv")
5 | campaign_data$start_date <- dmy(campaign_data$start_date)
6 | campaign_data$end_date <- dmy(campaign_data$end_date)
7 | campaign_data <- arrange(campaign_data, start_date)
8 |
9 |
10 | customer_transaction_data <- read_csv("../input/customer_transaction_data.csv")
11 |
12 |
13 | x <- unique(customer_transaction_data$date)[1]
14 | campaignDates <- campaign_data$start_date
15 | roundToNearestCampaignDate <- function(x) {
16 | campaignDates[campaignDates > x][1]
17 | }
18 |
19 | df_dates <- tibble(date = unique(customer_transaction_data$date))
20 | df_dates <- df_dates %>%
21 | rowwise() %>%
22 | mutate(nextCampaignDate = roundToNearestCampaignDate(date))
23 |
24 | customer_transaction_data <- left_join(customer_transaction_data, df_dates, by = "date")
25 |
26 | customer_transaction_df <- customer_transaction_data %>%
27 | #head(100000) %>%
28 | group_by(nextCampaignDate, customer_id, item_id) %>%
29 | summarise(quantity_sum = sum(quantity, na.rm = T),
30 | selling_price_sum = sum(selling_price, na.rm = T),
31 | other_discount_sum = sum(other_discount, na.rm = T),
32 | coupon_discount_sum = sum(coupon_discount, na.rm = T))
33 |
34 | write_csv(customer_transaction_df, "../input/customer_transaction_df.csv")
35 |
--------------------------------------------------------------------------------
/AnalyticsVidhya/Date-your-Data/4_feature_internship_SkillsCoding.R:
--------------------------------------------------------------------------------
1 | library(dplyr)
2 | library(tidyr)
3 | library(readr)
4 | library(tm)  # provides stopwords()
5 | #LOAD DATA
6 | internship <- read_csv("../data/Internship.csv", na = c("", "NA", "NULL"))
7 | NCOL <- ncol(internship)
8 |
9 | #With the code below we check what the words in the Skills_required column look like
10 | unlist(strsplit(unlist(strsplit(internship$Skills_required, " ")), ",")) %>%
11 | table() %>%
12 | data.frame() %>%
13 | arrange(-Freq) %>%
14 | mutate(perc.weight = percent_rank(Freq)) %>%
15 | filter(perc.weight > 0.95) -> aList
16 |
17 | aList$NCHAR <- nchar(as.character(aList$.))
18 | aList <- filter(aList, NCHAR > 1)
19 | StringsForSkills <- setdiff(as.character(aList$.), stopwords("english"))
20 |
21 | #Add a temporary coding column to the internship dataframe
22 | internship$Skills_requiredCode <- NA
23 |
24 | for (i in StringsForSkills) {
25 | print(i)
26 | internship$Skills_requiredCode[grep(i, internship$Skills_required, ignore.case = TRUE)] <- i
27 | }
28 |
29 | ##Dummy variables for Skills_requiredCode
30 | for (i in c("Skills_requiredCode")) {
31 | print(i)
32 | for(level in unique(internship[[i]])){
33 | internship[paste("dummy", i, level, sep = "_")] <- ifelse(internship[[i]] == level, 1, 0)
34 | }
35 | internship[[i]] <- NULL #Drop this column
36 | }
37 |
38 |
39 | #SAVE FILES
40 | write.csv(internship[, (NCOL+1):ncol(internship)], "../data/Features_internship_SkillsCode.csv", row.names = F)
41 |
42 |
--------------------------------------------------------------------------------
/AnalyticsVidhya/Date-your-Data/3_feature_internship_Profile_Coding.R:
--------------------------------------------------------------------------------
1 | library(dplyr)
2 | library(tidyr)
3 | library(readr)
4 | library(tm)
5 |
6 | #LOAD DATA
7 | internship <- read_csv("../data/Internship_Processed.csv", na = c("", "NA", "NULL"))
8 | NCOL <- ncol(internship)
9 |
10 | #With the code below we check what the words in the Internship_Profile column look like
11 | unlist(strsplit(unlist(strsplit(internship$Internship_Profile, " ")), ",")) %>%
12 | table() %>%
13 | data.frame() %>%
14 | arrange(-Freq) %>%
15 | mutate(perc.weight = percent_rank(Freq)) %>%
16 | filter(perc.weight > 0.95) -> aList
17 |
18 | aList$NCHAR <- nchar(as.character(aList$.))
19 | aList <- filter(aList, NCHAR > 1)
20 | StringsForProfile <- setdiff(as.character(aList$.), stopwords("english"))
21 |
22 | #Add a temporary coding column to the internship dataframe
23 | internship$InternshipProfile_Code <- NA
24 |
25 | for (i in StringsForProfile) {
26 | print(i)
27 | internship$InternshipProfile_Code[grep(i, internship$Internship_Profile, ignore.case = TRUE)] <- i
28 | }
29 |
30 | ##Dummy variables for InternshipProfile_Code
31 | for (i in c("InternshipProfile_Code")) {
32 | print(i)
33 | for(level in unique(internship[[i]])){
34 | internship[paste("dummy", i, level, sep = "_")] <- ifelse(internship[[i]] == level, 1, 0)
35 | }
36 | internship[[i]] <- NULL #Drop this column
37 | }
38 |
39 |
40 | #SAVE FILES
41 | write.csv(internship[, (NCOL+1):ncol(internship)], "../data/Features_internship_ProfileCode.csv", row.names = F)
42 |
43 |
--------------------------------------------------------------------------------
/AnalyticsVidhya/Date-your-Data/6_feature_student_degreeCoding.R:
--------------------------------------------------------------------------------
1 | library(dplyr)
2 | library(tidyr)
3 | library(readr)
4 |
5 | #LOAD DATA
6 | student <- read_csv("../data/Student.csv", na = c("", "NA", "NULL"))
7 |
8 | #With the code below we checked what the words in the Degree column look like
9 | #table(student$Degree) %>% data.frame() %>% arrange(-Freq) %>% View()
10 |
11 | #We will create 4 binary columns to identify the following:
12 | #1) IsUnderGraduate
13 | #2) IsPostGraduate
14 | #3) IsTechbackground
15 | #4) IsNonTechbackground
16 |
17 | StringsForUG <- c("BE|B.|Bachelor|Undergrad|BCA|UG|BBA|LLB")
18 |
19 | StringsForPG <- c("MBA|Management|M.|MCA|MBA|Post Graduate|Master|Ph.D")
20 |
21 | StringsForTech <- c("MCA|M.Tech|M. Tech|BCA|B.E.|B. E.|B.Tech|B. Tech|Science|Technology|Engineer|Software")
22 |
23 | StringsForNonTech <- c("MBA|Management|BBA|LLB|Business|Journalism|Mass|Arts|Pharma|Chartered|Dental|Social|English|Finance|Sports|Media|Fashion|Psychology")
24 |
25 | NCOL <- ncol(student)
26 | #Add 4 columns to Student dataframe
27 | student$IsUnderGraduate <- 0
28 | student$IsPostGraduate <- 0
29 | student$IsTechbackground <- 0
30 | student$IsNonTechbackground <- 0
31 |
32 | student$IsUnderGraduate[grep(StringsForUG, student$Degree, ignore.case = TRUE)] <- 1
33 | student$IsPostGraduate[grep(StringsForPG, student$Degree, ignore.case = TRUE)] <- 1
34 | student$IsTechbackground[grep(StringsForTech, student$Degree, ignore.case = TRUE)] <- 1
35 | student$IsNonTechbackground[grep(StringsForNonTech, student$Degree, ignore.case = TRUE)] <- 1
36 |
37 | #SAVE FILES
38 | write.csv(student[, (NCOL+1):ncol(student)], "../data/Features_student_DegreeCode.csv", row.names = F)
39 |
40 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/README.md:
--------------------------------------------------------------------------------
1 | # Kaggle Avito Duplicate Ad Detection Contest
2 | Winning Solution Blog : https://blog.kaggle.com/2016/08/31/avito-duplicate-ads-detection-winners-interview-2nd-place-team-the-quants-mikel-peter-marios-sonny/
3 |
4 | Contest Link: https://www.kaggle.com/c/avito-duplicate-ads-detection/
5 |
6 | Private Leaderboard Score - _0.95294_ ( Rank 2 / 548)
7 |
8 | Final solution of Avito Duplicate Ad Detection - TheQuants
9 |
10 | ## Prerequisites:
11 | **OS:** Any Linux Distribution (Ubuntu 14.04 Preferred)
12 | **RAM:** 128GB+ (64GB for feature extraction)
13 | **CPU:** 36 cores+ (Preferred)
14 | **GPU:** CUDA-compatible NVIDIA GPU with Compute Capability 3.5+ (TITAN X Preferred)
15 | **Storage:** 64GB+ (not including input data) - Images on SSD _highly recommended_
16 |
17 | **R Version:** 3.1+
18 | **R Packages:** data.table, dplyr, dummies, feather, Hmisc, igraph, jsonlite, parallel, raster, readr, reshape2, stringdist, stringr, stylo, textreuse, tidyr, tm, xgboost
19 |
20 | **Python Version:** 3.5.1
21 | **Python Packages:** scikit-learn, numpy, pandas, python-Levenshtein, codecs, OpenCV, feather-format, jellyfish, nltk, PIL, fuzzywuzzy, stop_words, haversine
22 |
23 | **Python Version:** 2.7.1
24 | **Python Packages:** scikit-learn, feather-format, numpy, pandas
25 | XGBoost (0.4.0)
26 | Keras (0.3.2)
27 | Theano (0.8.0rc1)
28 |
29 | ## How to Generate the Submission File
30 | 1) Update `config.cfg` and set all config parameters
31 | 2) Ensure all directories mentioned in config.cfg are write-able
32 | 3) Run `runAll.sh`
33 |
34 | _Note_: Generating the full submission, including all models, may take several weeks and requires at least 128GB of RAM
35 |
--------------------------------------------------------------------------------
/HackerEarth/Predict Lanes from LIDAR data/README.md:
--------------------------------------------------------------------------------
1 | # Approach for [HackerEarth India Hacks Machine Learning Competition - Semi Finals](https://www.hackerearth.com/challenge/test/indiahacks-2017-machine-learning-round-2/) - (12-13 August 2017, Bangalore, India)
2 | ## (c) [Sonny Laskar](https://github.com/sonnylaskar)
3 | ## Model scored #1 on Public Leaderboard and #2 on Private Leaderboard
4 |
5 | ## Pre-requisites:
6 | ```
7 | R 3.3+
8 | Packages: xgboost, tidyverse, feather, geosphere
9 | ```
10 | ## Approach
11 |
12 | ### Directory
13 | ```
14 | Create folders - code, input, output
15 | Copy all input files in input folder
16 | Copy all code files in code folder
17 | ```
18 |
19 | ### Scripts
20 | Execute *Rscript final_1_calculateHaversineDistance.R* to calculate the length of each line by finding the Haversine distance between the two extreme coordinates for each line
21 |
22 | Execute *Rscript final_2_buildData.R*
23 |
23 | This script builds all features and prepares the data for the final model
24 |
25 | ### Feature Engineering:
26 | ```
27 | sumOfDistanceFromLeft = Sum of all distances towards Left
28 | sumOfDistanceFromRight = Sum of all distances towards Right
29 | r_sumOfDistanceFromLR = Ratio of the above two
30 | int_distLR = Intersection between the distances in left and right
31 | latCounter = Unique Count of latitude after rounding off to 4 digits
32 | lonCounter = Unique Count of longitude after rounding off to 4 digits
33 | uniq_linesLeft = Unique lines on Left
34 | uniq_linesRight = Unique lines on Right
35 | totalLaneLinesMean = Mean of total Lane Lines
36 | haversineDistance = Haversine length of each line, averaged per road and then scaled by dividing by the mean lane-line length
37 | [Refer to the feature importance plot for relative importance]
38 | ```
39 | Execute *Rscript final_3_buildModel.R* to build the final model.
40 | XGBoost models with 10 different seeds are built and averaged.
41 | The final submission file will be written to the output folder
42 |
43 |
44 | Cheers :-)
45 |
--------------------------------------------------------------------------------
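As a small illustration of the Haversine feature described in the README above (not taken from the repository), geosphere's `distHaversine` returns the great-circle distance between two lon/lat points; the coordinates below are made up:

```r
# Toy example: Haversine (great-circle) distance between the two extreme
# coordinates of a lane line, using made-up lon/lat points.
library(geosphere)

p1 <- c(77.5946, 12.9716)   # lon, lat of one end
p2 <- c(77.5990, 12.9750)   # lon, lat of the other end
distHaversine(p1, p2)       # distance in metres
```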
/AnalyticsVidhya/Date-your-Data/7_feature_student_ExperienceCoding.R:
--------------------------------------------------------------------------------
1 | library(dplyr)
2 | library(tidyr)
3 | library(readr)
4 | library(tm)
5 |
6 | #LOAD DATA
7 | student <- read_csv("../data/Student.csv", na = c("", "NA", "NULL"))
8 | NCOL <- ncol(student)
9 |
10 | #Filter only the Experience-related columns
11 | student <- student[, c(1,15:19)]
12 |
13 | #####SECTION FOR EXPERIENCE ############
14 | #Create columns for each type of Experience and make a single row for each Student ID
15 | student$Experience_Type[is.na(student$Experience_Type)] <- "NoExperience"
16 | student %>%
17 | select(Student_ID, Experience_Type) %>%
18 | mutate(yesno = 1) %>%
19 | distinct %>%
20 | spread(Experience_Type, yesno, fill = 0) -> studentExperience
21 |
22 | #####SECTION FOR PROFILE ############
23 | unlist(strsplit(unlist(strsplit(student$Profile, " ")), ",")) %>%
24 | table() %>%
25 | data.frame() %>%
26 | arrange(-Freq) %>%
27 | mutate(perc.weight = percent_rank(Freq)) %>%
28 | filter(perc.weight > 0.98) -> aList
29 |
30 | aList$NCHAR <- nchar(as.character(aList$.))
31 | aList <- filter(aList, NCHAR > 1)
32 | aList <- unique(tolower(stemDocument(as.character(aList$.))))
33 | StringsForExperienceProfile <- setdiff(aList, stopwords("english"))
34 |
35 | student$Experience_Profile_Type <- NA
36 | for (i in StringsForExperienceProfile) {
37 | print(i)
38 | student$Experience_Profile_Type[grep(i, student$Profile, ignore.case = TRUE)] <- i
39 | }
40 |
41 | #Create columns for each type of Profile and make a single row for each Student ID
42 | student$Experience_Profile_Type[is.na(student$Experience_Profile_Type)] <- "NoProfile"
43 | student %>%
44 | select(Student_ID, Experience_Profile_Type) %>%
45 | mutate(yesno = 1) %>%
46 | distinct %>%
47 | spread(Experience_Profile_Type, yesno, fill = 0) -> studentExperienceProfile
48 |
49 | #JOIN
50 | studentExperience <- left_join(studentExperience, studentExperienceProfile, by = "Student_ID")
51 | #SAVE FILES
52 | write.csv(studentExperience, "../data/Features_student_Experience.csv", row.names = F)
53 |
54 |
--------------------------------------------------------------------------------
/AnalyticsVidhya/AVDatafest_XtremeML/input/holiday.csv:
--------------------------------------------------------------------------------
1 | Date,f_Holiday
2 | 2010-01-01,1.0
3 | 2010-01-06,1.0
4 | 2010-04-02,1.0
5 | 2010-05-01,1.0
6 | 2010-08-15,1.0
7 | 2010-09-11,1.0
8 | 2010-10-12,1.0
9 | 2010-11-01,1.0
10 | 2010-12-06,1.0
11 | 2010-12-08,1.0
12 | 2010-12-25,1.0
13 | 2010-12-27,1.0
14 | 2011-01-01,1.0
15 | 2011-01-06,1.0
16 | 2011-04-22,1.0
17 | 2011-05-01,1.0
18 | 2011-08-15,1.0
19 | 2011-09-11,1.0
20 | 2011-10-12,1.0
21 | 2011-11-01,1.0
22 | 2011-12-06,1.0
23 | 2011-12-08,1.0
24 | 2011-12-25,1.0
25 | 2011-12-06,1.0
26 | 2012-01-01,1.0
27 | 2012-01-06,1.0
28 | 2012-03-19,1.0
29 | 2012-04-06,1.0
30 | 2012-05-01,1.0
31 | 2012-05-06,1.0
32 | 2012-08-15,1.0
33 | 2012-09-11,1.0
34 | 2012-10-12,1.0
35 | 2012-11-01,1.0
36 | 2012-12-06,1.0
37 | 2012-12-08,1.0
38 | 2012-12-25,1.0
39 | 2013-01-01,1.0
40 | 2013-01-06,1.0
41 | 2013-03-19,1.0
42 | 2013-03-29,1.0
43 | 2013-05-01,1.0
44 | 2013-05-05,1.0
45 | 2013-06-24,1.0
46 | 2013-08-15,1.0
47 | 2013-09-11,1.0
48 | 2013-10-12,1.0
49 | 2013-11-01,1.0
50 | 2013-12-06,1.0
51 | 2013-12-08,1.0
52 | 2013-12-25,1.0
53 | 2014-01-01,1.0
54 | 2014-01-06,1.0
55 | 2014-03-19,1.0
56 | 2014-04-18,1.0
57 | 2014-04-21,1.0
58 | 2014-05-01,1.0
59 | 2014-05-04,1.0
60 | 2014-06-24,1.0
61 | 2014-08-15,1.0
62 | 2014-09-11,1.0
63 | 2014-10-12,1.0
64 | 2014-11-01,1.0
65 | 2014-12-06,1.0
66 | 2014-12-08,1.0
67 | 2014-12-25,1.0
68 | 2015-01-01,1.0
69 | 2015-01-06,1.0
70 | 2015-03-19,1.0
71 | 2015-04-03,1.0
72 | 2015-04-06,1.0
73 | 2015-05-01,1.0
74 | 2015-05-03,1.0
75 | 2015-06-24,1.0
76 | 2015-09-11,1.0
77 | 2015-10-12,1.0
78 | 2015-11-01,1.0
79 | 2015-12-06,1.0
80 | 2015-12-08,1.0
81 | 2015-12-25,1.0
82 | 2015-12-26,1.0
83 | 2016-01-01,1.0
84 | 2016-01-06,1.0
85 | 2016-03-19,1.0
86 | 2016-03-25,1.0
87 | 2016-03-28,1.0
88 | 2016-05-01,1.0
89 | 2016-05-16,1.0
90 | 2016-06-24,1.0
91 | 2016-08-15,1.0
92 | 2016-09-11,1.0
93 | 2016-10-12,1.0
94 | 2016-11-01,1.0
95 | 2016-12-06,1.0
96 | 2016-12-08,1.0
97 | 2016-12-25,1.0
98 | 2016-12-26,1.0
99 | 2017-01-01,1.0
100 | 2017-01-06,1.0
101 | 2017-03-19,1.0
102 | 2017-04-14,1.0
103 | 2017-04-17,1.0
104 |
--------------------------------------------------------------------------------
/AnalyticsVidhya/Date-your-Data/2_feature_internship_Profile_WordCount.R:
--------------------------------------------------------------------------------
1 | library(dplyr)
2 | library(tidyr)
3 | library(readr)
4 | library(stylo)
5 | library(stringr)
6 |
7 | #LOAD DATA
8 | internship <- read_csv("../data/Internship_Processed.csv", na = c("", "NA", "NULL"))
9 |
10 | ########
11 | getNGrams <- function(my.text, n = 1) {
12 | # which can be split into a vector of consecutive words:
13 |   #my.vector.of.words = txt.to.words(my.text) #Removed this since it would replace all numbers
14 | #my.vector.of.words <- unlist(strsplit(gsub("\\s+", " ", str_trim(my.text)), " "))
15 | my.vector.of.words <- unlist(strsplit(gsub("\\s+", " ", my.text), " "))
16 | #my.vector.of.words <- unlist(strsplit(my.text, " "))
17 | # now, we create a vector of word 2-grams:
18 | if (length(my.vector.of.words) >= n) {
19 | make.ngrams(my.vector.of.words, ngram.size = n)
20 | } else {
21 | return(NULL)
22 | }
23 | }
24 |
25 | ###################################
26 | getNgramsCount <- function(words, n) {
27 | #######################################
28 | # COUNTING NGRAMS FEATURES
29 | #######################################
30 | #Generate Ngrams
31 | NgramsProfile <- getNGrams(words, n)
32 |
33 | #Count of Ngrams
34 | countOfNgramsInProfile <- length(NgramsProfile)
35 |
36 | #Count of Unique NGrams
37 | countOfUniqueNgramsInProfile <- length(unique(NgramsProfile))
38 |
39 | return(c(countOfNgramsInProfile, countOfUniqueNgramsInProfile))
40 | }
41 |
42 | NCOL <- ncol(internship)
43 | for ( n in 1:2) {
44 | print(n)
45 | internship_words <- as.data.frame(t(mapply(getNgramsCount, internship$Internship_Profile, n)))
46 | colnames(internship_words) <- c(paste("countOf_", n, "_gramsInProfile", sep = ""),
47 | paste("countOfUnique_", n, "_gramsInProfile", sep = "")
48 | )
49 | row.names(internship_words) <- NULL
50 | internship <- cbind(internship, internship_words)
51 | }
52 |
53 |
54 | write.csv(internship[, (NCOL+1):ncol(internship)], "../data/Features_internship_Profile_WordCount.csv", row.names = F)
55 |
56 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/libavito.py:
--------------------------------------------------------------------------------
1 | #### Copyright 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants
2 | #### Avito Duplicate Ad Detection
3 | # Author: Mikel
4 | # This file contains various functions which are used in multiple scripts
5 |
6 | from imp import load_source
7 | from time import time
8 | import sys
9 |
10 | # Terminal output colours for use in scripts
11 | class c:
12 | HEADER = '\033[95m'
13 | OKBLUE = '\033[94m'
14 | OKGREEN = '\033[92m'
15 | WARNING = '\033[93m'
16 | FAIL = '\033[91m'
17 | END = '\033[0m'
18 | BOLD = '\033[1m'
19 | UNDERLINE = '\033[4m'
20 |
21 | # Function to read the config file
22 | def read_config():
23 | conf = load_source('config.cfg', 'config.cfg')
24 | conf.nthreads = conf.model_nthreads
25 | conf.debug = 0
26 | # except Exception as e:
27 | # #print(bcol.FAIL + 'Failed to parse config file:' + bcol.END)
28 | # print(e.message, e.args)
29 | # raise Exception(bcol.FAIL + 'Failed to parse config file:' + bcol.END)
30 | return conf
31 |
32 | # Just an alias
33 | def get_config():
34 | return read_config()
35 |
36 | # Function which reads '--train' or '--test' launch arguments
37 | def get_mode(argv, name='Script'):
38 | if len(argv) != 2:
39 | raise RuntimeError(name + ' must be called with either --train or --test')
40 | if argv[1] == '--train':
41 | mode = 0
42 | elif argv[1] == '--test':
43 | mode = 1
44 | else:
45 | raise RuntimeError(name + ' must be called with either --train or --test')
46 | assert mode == 0 or mode == 1
47 | return mode
48 |
49 | # Function which prints current status and time remaining:
50 | def print_progress(k, start, o):
51 | if k != 0:
52 | dur_per_k = (time() - start) / k
53 | rem_dur = dur_per_k * (o - k)
54 | rem_mins = int(rem_dur / 60)
55 | rem_secs = rem_dur % 60
56 | toprint = str(k) + " items processed - " + str(rem_mins) + "m" + str(int(rem_secs)) + "s remaining. "
57 | sys.stdout.write(toprint + '\r')
58 | sys.stdout.flush()
59 |
60 | def print_elapsed(start):
61 | print(str(round(time() - start, 1)) + 's elapsed')
62 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/models/libavito.py:
--------------------------------------------------------------------------------
1 | #### Copyright 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants
2 | #### Avito Duplicate Ad Detection
3 | # Author: Mikel
4 | # This file contains various functions which are used in multiple scripts
5 |
6 | from imp import load_source
7 | from time import time
8 | import sys
9 |
10 | # Terminal output colours for use in scripts
11 | class c:
12 | HEADER = '\033[95m'
13 | OKBLUE = '\033[94m'
14 | OKGREEN = '\033[92m'
15 | WARNING = '\033[93m'
16 | FAIL = '\033[91m'
17 | END = '\033[0m'
18 | BOLD = '\033[1m'
19 | UNDERLINE = '\033[4m'
20 |
21 | # Function to read the config file
22 | def read_config():
23 | conf = load_source('config.cfg', 'config.cfg')
24 | conf.nthreads = conf.model_nthreads
25 | conf.debug = 0
26 | # except Exception as e:
27 | # #print(bcol.FAIL + 'Failed to parse config file:' + bcol.END)
28 | # print(e.message, e.args)
29 | # raise Exception(bcol.FAIL + 'Failed to parse config file:' + bcol.END)
30 | return conf
31 |
32 | # Just an alias
33 | def get_config():
34 | return read_config()
35 |
36 | # Function which reads '--train' or '--test' launch arguments
37 | def get_mode(argv, name='Script'):
38 | if len(argv) != 2:
39 | raise RuntimeError(name + ' must be called with either --train or --test')
40 | if argv[1] == '--train':
41 | mode = 0
42 | elif argv[1] == '--test':
43 | mode = 1
44 | else:
45 | raise RuntimeError(name + ' must be called with either --train or --test')
46 | assert mode == 0 or mode == 1
47 | return mode
48 |
49 | # Function which prints current status and time remaining:
50 | def print_progress(k, start, o):
51 | if k != 0:
52 | dur_per_k = (time() - start) / k
53 | rem_dur = dur_per_k * (o - k)
54 | rem_mins = int(rem_dur / 60)
55 | rem_secs = rem_dur % 60
56 | toprint = str(k) + " items processed - " + str(rem_mins) + "m" + str(int(rem_secs)) + "s remaining. "
57 | sys.stdout.write(toprint + '\r')
58 | sys.stdout.flush()
59 |
60 | def print_elapsed(start):
61 | print(str(round(time() - start, 1)) + 's elapsed')
62 |
--------------------------------------------------------------------------------
/AnalyticsVidhya/Date-your-Data/5_feature_student_StreamsCoding.R:
--------------------------------------------------------------------------------
1 | library(dplyr)
2 | library(tidyr)
3 | library(readr)
4 |
5 | #LOAD DATA
6 | student <- read_csv("../data/Student.csv", na = c("", "NA", "NULL"))
7 |
8 | #With the code below we checked what the words in the Stream column look like
9 | #table(student$Stream) %>% data.frame() %>% arrange(-Freq) %>% View()
10 |
11 | NCOL <- ncol(student)
12 | #We will create binary columns for most popular streams
13 |
14 | #Add the Temporary Column
15 | student$StreamCode <- NA
16 |
17 | StringsForStreams <- c("Computer",
18 | "Electronics",
19 | "Mechanical",
20 | "Commerce",
21 | "Information",
22 | "Marketing",
23 | "Electrical",
24 | "Civil",
25 | "Finance",
26 | "Arts",
27 | "Science",
28 | "Economics",
29 | "Humanities",
30 | "Management",
31 | "English",
32 | "Human",
33 | "Software",
34 | "Bio",
35 | "Mass",
36 | "Operations",
37 | "Architecture",
38 | "Instrumentation",
39 | "Mathematics",
40 | "Physics",
41 | "Media",
42 | "Accounts",
43 | "Statistics",
44 | "Chemistry",
45 | "Political Science",
46 | "Psychology",
47 | "Fashion",
48 | "journalism"
49 | )
50 |
51 | for (i in StringsForStreams) {
52 | print(i)
53 | student$StreamCode[grep(i, student$Stream, ignore.case = TRUE)] <- i
54 | }
55 |
56 | ##Dummy Variables for StreamsCode
57 | for (i in c("StreamCode")) {
58 | print(i)
59 | for(level in unique(student[[i]])){
60 | student[paste("dummy", i, level, sep = "_")] <- ifelse(student[[i]] == level, 1, 0)
61 | }
62 | student[[i]] <- NULL #Drop this column
63 | }
64 |
65 | #SAVE FILES
66 | write.csv(student[, (NCOL+1):ncol(student)], "../data/Features_student_StreamCode.csv", row.names = F)
67 |
68 |
--------------------------------------------------------------------------------
/AnalyticsVidhya/Date-your-Data/9_model_XGB_1.R:
--------------------------------------------------------------------------------
1 | library(dplyr)
2 | library(tidyr)
3 | library(readr)
4 | library(xgboost)
5 | library(pROC)
6 | library(caret)
7 |
8 | #MODEL DESCRIPTION
9 | #XGBOOST MODEL SEED = 123 and NROUND = 526
10 | #LOAD DATA
11 | train <- read.csv("../data/train_processed.csv", header = TRUE, stringsAsFactors = FALSE)
12 | test <- read.csv("../data/test_processed.csv", header = TRUE, stringsAsFactors = FALSE)
13 |
14 | #DONT NEED THESE COLUMNS ANY MORE
15 | train$Earliest_Start_Date <- NULL
16 | train$Internship_deadline <- NULL
17 | train$Start_Date <- NULL
18 | train$End_Date <- NULL
19 | train$End.Date <- NULL
20 | train$Start.Date <- NULL
21 |
22 | test$Earliest_Start_Date <- NULL
23 | test$Internship_deadline <- NULL
24 | test$Start_Date <- NULL
25 | test$End_Date <- NULL
26 | test$End.Date <- NULL
27 | test$Start.Date <- NULL
28 |
29 | #Validation Set
30 | set.seed(123)
31 | inTrain <- createDataPartition(y = train$Is_Shortlisted, p = .70, list = FALSE)
32 | trainSet <- train[inTrain, ]
33 | validateSet <- train[-inTrain, ]
34 | #####
35 |
36 | dtrain <- xgb.DMatrix(data = data.matrix(train[, c(2:ncol(train))]),
37 | label = data.matrix(train$Is_Shortlisted),
38 | missing=NA)
39 | dvalidate <- xgb.DMatrix(data = data.matrix(validateSet[, c(2:ncol(validateSet))]),
40 | label = data.matrix(validateSet$Is_Shortlisted),
41 | missing=NA)
42 | watchlist <- list(train = dtrain, test = dvalidate)
43 | param <- list("objective" = "binary:logistic",
44 | "eval_metric" = "auc",
45 | "eta" = 0.1,
46 | "max_depth" = 10,
47 | "subsample" = 1,
48 | "min_child_weight" = 1,
49 | "colsample_bytree" = 0.2
50 | )
51 | cv.nround <- 526
52 |
53 | t <- Sys.time()
54 | set.seed(123)
55 | bst <- xgb.train(param = param,
56 | data = dtrain,
57 | nrounds = cv.nround,
58 | maximize = TRUE)
59 | print(Sys.time() - t)
60 |
61 |
62 | test_target_xgb <- predict(bst,
63 | data.matrix(test[, c(2:ncol(test))]),
64 | missing=NA)
65 | submission <- data.frame(Internship_ID = test$Internship_ID,
66 | Student_ID = test$Student_ID,
67 | Is_Shortlisted = test_target_xgb)
68 | write_csv(submission,"../Submissions/XGB_MODEL_S123_N526.csv")
69 |
--------------------------------------------------------------------------------
/AnalyticsVidhya/Date-your-Data/10_model_XGB_1.R:
--------------------------------------------------------------------------------
1 | library(dplyr)
2 | library(tidyr)
3 | library(readr)
4 | library(xgboost)
5 | library(pROC)
6 | library(caret)
7 |
8 | #MODEL DESCRIPTION
9 | #XGBOOST MODEL SEED = 500 and NROUND = 710
10 |
11 | #LOAD DATA
12 | train <- read.csv("../data/train_processes.csv", header = TRUE, stringsAsFactors = FALSE)
13 | test <- read.csv("../data/test_processes.csv", header = TRUE, stringsAsFactors = FALSE)
14 |
15 | #DONT NEED THESE COLUMNS ANY MORE
16 | train$Earliest_Start_Date <- NULL
17 | train$Internship_deadline <- NULL
18 | train$Start_Date <- NULL
19 | train$End_Date <- NULL
20 | train$End.Date <- NULL
21 | train$Start.Date <- NULL
22 |
23 | test$Earliest_Start_Date <- NULL
24 | test$Internship_deadline <- NULL
25 | test$Start_Date <- NULL
26 | test$End_Date <- NULL
27 | test$End.Date <- NULL
28 | test$Start.Date <- NULL
29 |
30 | #Validation Set
31 | set.seed(123)
32 | inTrain <- createDataPartition(y = train$Is_Shortlisted, p = .70, list = FALSE)
33 | trainSet <- train[inTrain, ]
34 | validateSet <- train[-inTrain, ]
35 | #####
36 |
37 | dtrain <- xgb.DMatrix(data = data.matrix(train[, c(2:ncol(train))]),
38 | label = data.matrix(train$Is_Shortlisted),
39 | missing=NA)
40 | dvalidate <- xgb.DMatrix(data = data.matrix(validateSet[, c(2:ncol(validateSet))]),
41 | label = data.matrix(validateSet$Is_Shortlisted),
42 | missing=NA)
43 | watchlist <- list(train = dtrain, test = dvalidate)
44 | param <- list("objective" = "binary:logistic",
45 | "eval_metric" = "auc",
46 | "eta" = 0.1,
47 | "max_depth" = 10,
48 | "subsample" = 1,
49 | "min_child_weight" = 1,
50 | "colsample_bytree" = 0.2
51 | )
52 | cv.nround <- 710
53 |
54 | t <- Sys.time()
55 | set.seed(500)
56 | bst <- xgb.train(param = param,
57 | data = dtrain,
58 | nrounds = cv.nround,
59 | maximize = TRUE)
60 | print(Sys.time() - t)
61 |
62 |
63 | test_target_xgb <- predict(bst,
64 | data.matrix(test[, c(2:ncol(test))]),
65 | missing=NA)
66 | submission <- data.frame(Internship_ID = test$Internship_ID,
67 | Student_ID = test$Student_ID,
68 | Is_Shortlisted = test_target_xgb)
69 | write_csv(submission,"../Submissions/XGB_MODEL_S500_N710.csv")
70 |
--------------------------------------------------------------------------------
/AnalyticsVidhya/AVDatafest_XtremeML/README.md:
--------------------------------------------------------------------------------
1 | # Winning Solution for Analytics Vidhya Machine Learning Competition - [Xtreme ML Hack](https://datahack.analyticsvidhya.com/contest/machine-learning-hackathon/)
2 |
3 | (c) [Sonny](https://github.com/sonnylaskar)
4 |
5 | This model scored 60.9 on the Public Leaderboard, 61.7 on the [Private Leaderboard]("https://datahack.analyticsvidhya.com/contest/machine-learning-hackathon/lb") and ranked #2.
6 |
7 | ## Prerequisites:
8 | 1. R version 3.3.3
9 | 2. R Packages: readr, lubridate, dplyr, tidyr, xgboost
10 |
11 | ## Problem Statement:
12 | The largest water supplier of Barcelona wants to leverage machine learning to predict, for each day, medium and department, how many contacts (tickets/enquiries) it would receive and how many resolutions it would make, so that it can size its team properly and improve customer satisfaction.
13 |
14 | ## Approach:
15 | While this looked like a time-series problem, I could not get good results with classical time-series techniques such as ARIMA, so I switched to a regression approach. The catch was that the test dataset lay entirely in the future, so essentially no information was available for that period. External data was allowed in this contest, and a holiday calendar seemed an obvious signal for such a problem.
16 |
17 | ### Feature Engineering:
18 | 1. Date features like weekday, quarter, etc.
19 | 2. Whether a day was a holiday in Spain
20 | 3. How many days had elapsed since the last holiday (expressed as rank_percent)
21 | 4. Lagged features of the number of contacts and resolutions at 75, 90 and 120 days (since predictions had to be made up to 75 days into the future, I did not include any lag shorter than 75 days); a minimal sketch of such lag features follows after this file
22 |
23 | ### Modeling:
24 | XGBoost is the first model I try whenever I have to solve such a problem. As always, it gave a strong score. For cross-validation, I used the last 4 months of data.
25 |
26 | ## Steps to reproduce the submission:
27 | 1. Copy all Train files in the folder _"input/Train"_
28 | 2. Copy all Test files in the folder _"input/Test"_
29 | 3. External data: I used the holiday list of Spain, taken from [here](http://www.officeholidays.com/countries/spain/regional.php?list_year=2010&list_region=catalonia "Calendar")
30 | 4. Ensure folder _"output"_ exists
31 | 5. Run the Rscript _final_model.R_ from the _code_ directory
32 | 6. The final files will be created in the _"output"_ folder
33 |
34 | Enjoy :smile:
35 |
36 |
37 | Regards
38 |
39 | Sonny
40 |
--------------------------------------------------------------------------------
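A minimal sketch of the 75/90/120-day lag features from point 4 of the feature engineering list above, using a hypothetical daily contacts table rather than the competition data:

```r
# Hypothetical illustration of the lagged features described above.
library(dplyr)

daily <- data.frame(
  Date = seq(as.Date("2010-01-01"), by = "day", length.out = 200),
  Contacts = rpois(200, lambda = 50)
)

daily <- daily %>%
  arrange(Date) %>%
  mutate(Contacts_lag_75  = lag(Contacts, 75),
         Contacts_lag_90  = lag(Contacts, 90),
         Contacts_lag_120 = lag(Contacts, 120))

head(daily[daily$Date >= as.Date("2010-05-01"), ])
```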
/HackerEarth/Predict Lanes from LIDAR data/final_2_buildData.R:
--------------------------------------------------------------------------------
1 | library(tidyverse)
2 | library(feather)
3 | library(stringr)
4 |
5 | label <- read_csv("../input/labels.csv")
6 | label$roadCoordinates <- NULL
7 | train <- read_csv("../input/train.csv")
8 | test <- read_csv("../input/test.csv")
9 | df_all <- bind_rows(train, test)
10 | roadsDistance <- read_csv("../input/roadsDistance.csv")
11 |
12 | getLatLong <- function(x, t = "lat") {
13 | a <- matrix(as.numeric(unlist(strsplit(unlist(strsplit(x, "\\|")), " "))), byrow = T, ncol = 2)
14 | if (t == "lon") {
15 | apply(a, 2, mean)[1]
16 | } else {
17 | apply(a, 2, mean)[2]
18 | }
19 | }
20 |
21 |
22 | df_all$meanLat <- sapply(df_all$laneLineCoordinates, getLatLong, t = "lat", USE.NAMES = F)
23 | df_all$meanLon <- sapply(df_all$laneLineCoordinates, getLatLong, t = "lon", USE.NAMES = F)
24 |
25 | df_all %>%
26 | group_by(roadId) %>%
27 | summarise(
28 | sumOfDistanceFromLeft = sum(distFromLaneLineOnLeft, na.rm = T),
29 | sumOfDistanceFromRight = sum(distFromLaneLineOnRight, na.rm = T),
30 | r_sumOfDistanceFromLR = sumOfDistanceFromLeft / sumOfDistanceFromRight,
31 | int_distLR = length(intersect(distFromLaneLineOnLeft, distFromLaneLineOnRight)),
32 |
33 | latCounter = length(unique(round(meanLat, 4))),
34 | lonCounter = length(unique(round(meanLon, 4))),
35 |
36 | int_TotalLinesLR = length(intersect(totalLinesOnLeft, totalLaneLinesOnRight)),
37 | uniq_linesLeft = length(unique(totalLinesOnLeft)),
38 | uniq_linesRight = length(unique(totalLaneLinesOnRight)),
39 | totalLaneLinesMean = mean(totalLaneLines),
40 | totalLinesOnLeftMax = max(totalLinesOnLeft),
41 |
42 | uniq_lineId = length(unique(laneLineId)) / length((laneLineId)),
43 | roadCategory = unique(roadCategory),
44 |
45 | r_lineToRoadLength = sum(laneLineLength / roadLength < 0.8),
46 | r_lineToRoadLength2 = sum(laneLineLength / roadLength >= 0.8),
47 | laneLineLengthMean = mean(laneLineLength),
48 |
49 | sum_interSectingLines = sum(noOfIntersectingLaneLinesLeft, noOfIntersectingLaneLinesRight),
50 | noOfIntersectingLaneLinesLeftMean = mean(noOfIntersectingLaneLinesLeft),
51 |
52 | sum_isIntersectingWithRoadGeometryTrue = sum(isIntersectingWithRoadGeometry == "true"),
53 | sum_isIntersectingWithRoadGeometryFalse = sum(isIntersectingWithRoadGeometry == "false")
54 | ) -> df2
55 |
56 |
57 |
58 | df2$data <- ifelse(df2$roadId %in% train$roadId, "train", "test")
59 | df2 <- left_join(df2, roadsDistance, by = "roadId")
60 | df2$haversineDistance <- df2$haversineDistance / df2$laneLineLengthMean
61 | df2 <- left_join(df2, label, by = "roadId")
62 |
63 | write_feather(df2, "../input/df_all.fthr")
64 |
65 |
--------------------------------------------------------------------------------
/HackerEarth/Predict Lanes from LIDAR data/final_3_buildModel.R:
--------------------------------------------------------------------------------
1 | library(tidyverse)
2 | library(feather)
3 | library(xgboost)
4 |
5 | nthread <- parallel::detectCores()
6 |
7 | df_all <- read_feather("../input/df_all.fthr")
8 | TARGET <- "noOfLanes"
9 | NAString <- NA
10 | model_features <- setdiff(names(df_all), c("roadId", TARGET, "data"))
11 |
12 | df_all_train <- df_all[df_all$data == "train", ]
13 | df_all_test <- df_all[df_all$data == "test", ]
14 | #rm(df_all)
15 | gc()
16 |
17 | ####### XGBOOST ############
18 | EARLY_STOPPING <- 100
19 | print.every.n <- 10
20 | df_all_train[[TARGET]] <- as.factor(df_all_train[[TARGET]] - 1)
21 | num_class <- length(levels(df_all_train[[TARGET]]))
22 |
23 | param <- list(
24 | objective = "multi:softprob",
25 | booster = "gbtree",
26 | eval_metric = "mlogloss",
27 | num_class = num_class,
28 | eta = 0.1,
29 | max_depth = 5,
30 | subsample = 0.9,
31 | min_child_weight = 1,
32 | colsample_bytree = 1.0,
33 | gamma = 0,
34 | nthread = nthread,
35 | num_parallel_tree = 2
36 | )
37 |
38 | if (param$eval_metric != "auc") {
39 | isMaximize <- F
40 | } else {
41 | isMaximize <- T
42 | }
43 | nrounds <- 100
44 | seed <- (1:10)*1000
45 |
46 | dtrain <- xgb.DMatrix( data = data.matrix(df_all_train[, model_features]),
47 | label = data.matrix(df_all_train[[TARGET]]),
48 | missing = NAString)
49 | watchlist <- list(train = dtrain)
50 |
51 | t <- Sys.time()
52 | print(param)
53 | test_xgb_model <- rep(0, nrow(df_all_test))
54 | for (s in seed) {
55 | cat("Generating XGB seed", s, "\n", sep = " ")
56 | set.seed(s)
57 | bst <- xgb.train( params = param,
58 | data = dtrain,
59 | nrounds = nrounds,
60 | verbose = 1,
61 | print_every_n = print.every.n,
62 | early_stopping_rounds = EARLY_STOPPING,
63 | watchlist = watchlist,
64 | maximize = isMaximize
65 | )
66 | print(format(Sys.time() - t, format = "%H:%M") )
67 | dtest <- xgb.DMatrix( data = data.matrix(df_all_test[, model_features]),
68 | missing = NAString)
69 | tmp <- predict(bst, dtest)
70 | tmp <- ifelse(tmp < 0, 0, tmp)
71 | test_xgb_model <- test_xgb_model + tmp
72 | }
73 | xgb_1 <- test_xgb_model / length(seed)
74 |
75 |
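# The averaged softprob output is row-major (num_class values per road): reshape it into a
# matrix and take the arg-max column per row. Since the target was shifted down by one before
# training, the 1-based which.max index maps back to the original lane count, assuming lane
# counts are consecutive integers starting at 1.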
76 | xgb_1 <- apply(matrix(xgb_1, byrow = T, ncol = num_class), 1, which.max)
77 | xgb_1 <- data.frame(roadId = df_all_test$roadId, noOfLanes = xgb_1)
78 | write_csv(xgb_1, "../output/finalSubmission.csv")
79 |
80 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/3_feature_set4a_fuzzy.py:
--------------------------------------------------------------------------------
1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants
2 | #### Author: Marios & Mikel
3 | #### Avito Duplicate Ad Detection
4 | # 3_feature_set4a_fuzzy.py
5 | # Creates text features using the fuzzywuzzy Python package
6 |
7 | import numpy as np
8 | import pandas as pd
9 | import sys
10 | import time
11 | import gc
12 | import feather
13 | from fuzzywuzzy import fuzz
14 | from multiprocessing import Pool
15 |
16 | import libavito as a
17 |
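# Each row handed to process_row holds (itemID_1, itemID_2, title_1, title_2, description_1,
# description_2, attrsJSON_1, attrsJSON_2), in the column order selected further below.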
18 | def process_row(row):
19 | values = []
20 | values.append(row[0])
21 | values.append(row[1])
22 |
23 | # Not black magic, iterate over title/description/json
24 | for d in [2, 4, 6]:
25 | st_1 = str(row[d])
26 | st_2 = str(row[d + 1])
27 | values.append(fuzz.partial_ratio(st_1, st_2))
28 | values.append(fuzz.token_set_ratio(st_1, st_2))
29 | values.append(fuzz.ratio(st_1, st_2))
30 | values.append(fuzz.token_sort_ratio(st_1, st_2))
31 | return values
32 |
33 | print(a.c.BOLD + 'Extracting set4a fuzzy text features ...' + a.c.END)
34 |
35 | # Get train/test mode from launch argument
36 | mode = a.get_mode(sys.argv, '3_feature_set4a_fuzzy.py')
37 |
38 | ## Read settings required by script
39 | config = a.read_config()
40 | nthreads = config.preprocessing_nthreads
41 | cache_loc = config.cache_loc
42 | debug = config.debug
43 | if mode == 0:
44 | root = config.train_images_root
45 | df = feather.read_dataframe(cache_loc + 'train.fthr')
46 | if mode == 1:
47 | root = config.test_images_root
48 | df = feather.read_dataframe(cache_loc + 'test.fthr')
49 |
50 | df = df[['itemID_1', 'itemID_2', 'title_1', 'title_2', 'description_1', 'description_2', 'attrsJSON_1', 'attrsJSON_2']]
51 |
52 | ftrs = []
53 |
54 | start = time.time()
55 | o = len(df.index)
56 | if nthreads == 1:
57 | print('Extracting features with 1 thread ...')
58 | k = 0
59 | # Iterate over files
60 | ftrs = []
61 | for row in df.values:
62 | x = process_row(row)
63 | ftrs.append(x)
64 | k += 1
65 | if k % 100 == 0:
66 | a.print_progress(k, start, o)
67 |
68 | # Otherwise perform multi-threaded mapping
69 | else:
70 | print('Extracting features multi-threaded ... ', end='', flush=True)
71 | pool = Pool(nthreads)
72 | ftrs = pool.map(process_row, df.values)
73 | pool.close()
74 | gc.collect()
75 |
76 | a.print_elapsed(start)
77 |
78 | ftrs = pd.DataFrame(ftrs)
79 | cols = ['itemID_1', 'itemID_2'] + ['set4a_X' + str(i) for i in range(1, len(ftrs.columns.tolist()) - 1)]
80 | print(cols)
81 | ftrs.columns = cols
82 |
83 | # Save updated dataset
84 | if mode == 0:
85 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set4a_fuzzy.fthr')
86 | if mode == 1:
87 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set4a_fuzzy.fthr')
88 |
89 | a.print_elapsed(start)
90 | print('set4a extraction complete!')
91 |
92 | # Write status to status file so master script knows whether to proceed.
93 | f = open(cache_loc + 'status.txt', 'a')
94 | f.write('feature_set4a_OK\n')
95 | f.close()
96 |
--------------------------------------------------------------------------------
/AnalyticsVidhya/amexpert-2019-machine-learning-hackathon/code/agg_feature_2.R:
--------------------------------------------------------------------------------
1 | library(tidyverse)
2 | library(lubridate)
3 |
4 | campaign_data <- read_csv("../input/campaign_data.csv")
5 | campaign_data$start_date <- dmy(campaign_data$start_date)
6 | campaign_data$end_date <- dmy(campaign_data$end_date)
7 | campaign_data <- arrange(campaign_data, start_date)
8 |
9 |
10 | customer_transaction_data <- read_csv("../input/customer_transaction_data.csv")
11 |
12 |
13 | #x <- unique(customer_transaction_data$date)
14 | #campaignDates <- campaign_data$start_date
15 | #roundToNearestCampaignDate <- function(x) {
16 | # campaignDates[campaignDates > x][1]
17 | #}
18 |
19 | #df_dates <- tibble(date = unique(customer_transaction_data$date))
20 | #df_dates <- df_dates %>%
21 | # rowwise() %>%
22 | # mutate(nextCampaignDate = roundToNearestCampaignDate(date))
23 |
24 | #customer_transaction_data <- left_join(customer_transaction_data, df_dates, by = "date")
25 |
26 | #customer_transaction_df <- customer_transaction_data %>%
27 | #head(100000) %>%
28 | # group_by(nextCampaignDate, customer_id, item_id) %>%
29 | # summarise(quantity_sum = sum(quantity, na.rm = T),
30 | # selling_price_sum = sum(selling_price, na.rm = T),
31 | # other_discount_sum = sum(other_discount, na.rm = T),
32 | # coupon_discount_sum = sum(coupon_discount, na.rm = T))
33 |
34 | #write_csv(customer_transaction_df, "../input/customer_transaction_df.csv")
35 |
36 | #df_dates <- tibble(campaignDates = campaignDates)
37 | #df_dates$date_1m <- df_dates$campaignDates - 30
38 | #df_dates$date_2m <- df_dates$campaignDates - 60
39 |
40 | for (i in unique(campaign_data$campaign_id)) {
41 | customer_transaction_data[[paste0("campaign_id_", i)]] <- campaign_data$start_date[campaign_data$campaign_id == i]
42 | }
43 |
44 | #[1] 26 27 28 29 30 1 2 3 4 5 6 7 8 9 10 11 12 13 16 17 18 19 20 21 22 23 24 25
45 |
46 | #customer_transaction_df <- tibble()
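# For every campaign and every 30-day lag window (30 to 360 days before the campaign start),
# aggregate each customer's item-level transactions and write one CSV per (campaign, lag) pair.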
47 | for (i in unique(campaign_data$campaign_id)) {
48 | for (lagDays in c(seq(30, 30*12, 30))) {
49 | print(paste(i, lagDays))
50 | customer_transaction_data$CampaignDate <- customer_transaction_data[[paste0("campaign_id_", i)]]
51 | tmp <- customer_transaction_data %>%
52 | filter(date >= CampaignDate - lagDays & date < CampaignDate) %>%
53 | group_by(CampaignDate, customer_id, item_id) %>%
54 | summarise(quantity_sum = sum(quantity, na.rm = T),
55 | selling_price_sum = sum(selling_price, na.rm = T),
56 | other_discount_sum = sum(other_discount, na.rm = T),
57 | coupon_discount_sum = sum(coupon_discount, na.rm = T),
58 | quantity_mean = mean(quantity, na.rm = T),
59 | selling_price_mean = mean(selling_price, na.rm = T),
60 | other_discount_mean = mean(other_discount, na.rm = T),
61 | coupon_discount_mean = mean(coupon_discount, na.rm = T))
62 |
63 |
64 | if (nrow(tmp) > 0) {
65 | names(tmp)[-(1:3)] <- paste(names(tmp)[-(1:3)], lagDays, sep = "_")
66 | #customer_transaction_df <- bind_rows(customer_transaction_df, tmp)
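                        # paste(..., sep = "_") yields file names of the form
                        # ../input/agg_feat_<campaign>_<lag>_.csv (trailing underscore before .csv).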
67 | write_csv(tmp, paste("../input/agg_feat",i, lagDays, ".csv", sep = "_"))
68 | rm(tmp)
69 | }
70 | gc()
71 | }
72 | }
73 |
74 | #write_csv(customer_transaction_df, "../input/agg_feat_2.csv")
75 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/3_feature_set1g_capitalLetters.R:
--------------------------------------------------------------------------------
1 | ################################################################################################
2 | ################################################################################################
3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants
4 | #### Competition: Avito Duplicate Ad Detection
5 | # Filename : 3_feature_set1g_capitalLetters.R
6 | # Description: This Rscript generates Capital Letters Features
7 | # Usage:
8 | # Rscript ./code/3_feature_set1g_capitalLetters.R train
9 | # Rscript ./code/3_feature_set1g_capitalLetters.R test
10 | # Default argument is test
11 | ################################################################################################
12 | ################################################################################################
13 |
14 | args <- commandArgs(trailingOnly = F)
15 | BASE <- normalizePath(dirname(sub("^--file=", "", args[grep("^--file=", args)])))
16 |
17 |
18 | # Source Config and functions.R file
19 | source(paste(BASE, "/../config.cfg", sep = ""))
20 | source(paste(BASE_DIR, "/code/functions.R", sep = ""))
21 |
22 | #Load any additional packages
23 | library(parallel)
24 |
25 | # Read argument for train or test
26 | trainOrTest <- commandArgs(trailingOnly = TRUE)
27 | if (length(trainOrTest) > 1) {
28 | stop("ERROR: I need only 1 argument : train or test")
29 | }
30 |
31 | if (length(trainOrTest) == 0) {
32 | print("No Arguments passed, Assuming you mean test")
33 | trainOrTest <- "test"
34 | }
35 |
36 | #Load data
37 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "")
38 | cat("Reading file ", FILENAME, "\n", sep = " ")
39 | dat <- read_csv(FILENAME)
40 |
41 |
42 | #Function to generate Capital Letter features
43 | getCapitalLetterFeatures <- function(x) {
44 | wordsWithCapitalLetters <- length(grep("[[:upper:]]", unlist(strsplit(x, " "))))
45 | countOfCapitalLetters <- length(grep("[[:upper:]]", unlist(strsplit(x, ""))))
46 | return(c(wordsWithCapitalLetters, countOfCapitalLetters))
47 | }
48 |
49 | df2 <- data.frame(ID = 1:nrow(dat)) #Else cbind will not work
50 | for (Field in c("title_1", "title_2", "description_1", "description_2")) {
51 | print(Field)
52 | df2_temp <- as.data.frame(t(mcmapply(getCapitalLetterFeatures, dat[[Field]], USE.NAMES = F)))
53 | names(df2_temp) <- c(paste("wordsWithCapitalLetters", Field, sep = "_"), paste("countOfCapitalLetters", Field, sep = "_"))
54 | df2 <- cbind(df2, df2_temp)
55 | }
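# Build pairwise ratios of the counts for ad 1 versus ad 2; taking the reciprocal whenever the
# ratio exceeds 1 makes each feature symmetric in the two ads and bounded in [0, 1].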
56 | for (i in c("title", "description")) {
57 | for (j in c("wordsWithCapitalLetters", "countOfCapitalLetters")) {
58 | #print(c(i,j))
59 | NewField1 <- paste(j, "_", i,"_1", sep = "")
60 | NewField2 <- paste(j, "_", i,"_2", sep = "")
61 | #print(c(NewField1,NewField2))
62 | NewFieldName <- paste("ratio", NewField1, NewField2, sep = "_")
63 | print(NewFieldName)
64 | df2[[NewFieldName]] <- df2[[NewField1]] / df2[[NewField2]]
65 | df2[[NewFieldName]] <- round(ifelse(df2[[NewFieldName]] > 1, 1/df2[[NewFieldName]], df2[[NewFieldName]]), 2)
66 | }
67 | }
68 |
69 | df2$ID <- NULL
70 | names(df2) <- paste("set1g", names(df2), sep = "_")
71 |
72 |
73 | ######## Add Primary Columns ItemID1 and ItemID2
74 | df2 <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df2)
75 | print("Saving Capital Letter features")
76 | write_feather(df2, paste(cache_loc, "/", "features_", trainOrTest, "_set1g_", "capitalLetters.fthr", sep = "" ))
77 |
78 | #END
79 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/3_feature_set4b_fuzzy_clean.py:
--------------------------------------------------------------------------------
1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants
2 | #### Author: Marios & Mikel
3 | #### Avito Duplicate Ad Detection
4 | # 3_feature_set4b_fuzzy_clean.py
5 | # Creates text features from the cleaned fields using the fuzzywuzzy and jellyfish Python packages
6 |
7 | import numpy as np
8 | import pandas as pd
9 | import sys
10 | import jellyfish
11 | import feather
12 | import time
13 | import gc
14 | from fuzzywuzzy import fuzz
15 | from multiprocessing import Pool
16 |
17 | import libavito as a
18 |
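# Same row-wise layout as set4a but on the cleaned title/description/JSON fields, with
# Levenshtein and Jaro scores from jellyfish added ahead of the four fuzzywuzzy ratios.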
19 | def process_row(row):
20 | values = []
21 | values.append(row[0])
22 | values.append(row[1])
23 |
24 | # iterate over cleaned title/descs/jsons
25 | for d in [2, 4, 6]:
26 | s1 = str(row[d])
27 | s2 = str(row[d + 1])
28 | values.append(jellyfish.levenshtein_distance(s1, s2))
29 | values.append(jellyfish.jaro_distance(s1, s2))
30 | #values.append(float(jellyfish.damerau_levenshtein_distance(s1,s2)) )
31 | values.append(fuzz.partial_ratio(s1, s2))
32 | values.append(fuzz.token_set_ratio(s1, s2))
33 | values.append(fuzz.ratio(s1, s2))
34 | values.append(fuzz.token_sort_ratio(s1, s2))
35 | return values
36 |
37 | print(a.c.BOLD + 'Extracting set4b fuzzy cleaned text features ...' + a.c.END)
38 |
39 | # Get train/test mode from launch argument
40 | mode = a.get_mode(sys.argv, '3_feature_set4b_fuzzy_clean.py')
41 |
42 | ## Read settings required by script
43 | config = a.read_config()
44 | nthreads = config.preprocessing_nthreads
45 | cache_loc = config.cache_loc
46 | debug = config.debug
47 | if mode == 0:
48 | root = config.train_images_root
49 | df = feather.read_dataframe(cache_loc + 'train.fthr')
50 | if mode == 1:
51 | root = config.test_images_root
52 | df = feather.read_dataframe(cache_loc + 'test.fthr')
53 |
54 | df = df[['itemID_1', 'itemID_2', 'cleantitle_1', 'cleantitle_2', 'cleandesc_1', 'cleandesc_2', 'cleanjson_1', 'cleanjson_2']]
55 |
56 | ftrs = []
57 |
58 | start = time.time()
59 | o = len(df.index)
60 | if nthreads == 1:
61 | print('Extracting features with 1 thread ...')
62 | k = 0
63 | # Iterate over files
64 | ftrs = []
65 | for row in df.values:
66 | x = process_row(row)
67 | ftrs.append(x)
68 | k += 1
69 | if k % 100 == 0:
70 | a.print_progress(k, start, o)
71 |
72 | # Otherwise perform multi-threaded mapping
73 | else:
74 | print('Extracting features multi-threaded ... ', end='', flush=True)
75 | pool = Pool(nthreads)
76 | ftrs = pool.map(process_row, df.values)
77 | pool.close()
78 | gc.collect()
79 |
80 | a.print_elapsed(start)
81 |
82 | ftrs = pd.DataFrame(ftrs)
83 | cols = ['itemID_1', 'itemID_2'] + ['set4b_X' + str(i) for i in range(1, len(ftrs.columns.tolist()) - 1)]
84 | print(cols)
85 | ftrs.columns = cols
86 |
87 | # Save updated dataset
88 | if mode == 0:
89 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set4b_fuzzy_clean.fthr')
90 | if mode == 1:
91 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set4b_fuzzy_clean.fthr')
92 |
93 | a.print_elapsed(start)
94 | print('set4b extraction complete!')
95 |
96 | # Write status to status file so master script knows whether to proceed.
97 | f = open(cache_loc + 'status.txt', 'a')
98 | f.write('feature_set4b_OK\n')
99 | f.close()
100 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/3_feature_set1f_SpecialCounting.R:
--------------------------------------------------------------------------------
1 | ################################################################################################
2 | ################################################################################################
3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants
4 | #### Competition: Avito Duplicate Ad Detection
5 | # Filename : 3_feature_set1f_SpecialCounting.R
6 | # Description: This Rscript generates all Special Character Counting Features
7 | # Usage:
8 | # Rscript ./code/3_feature_set1f_SpecialCounting.R train
9 | # Rscript ./code/3_feature_set1f_SpecialCounting.R test
10 | # Default argument is test
11 | ################################################################################################
12 | ################################################################################################
13 |
14 | args <- commandArgs(trailingOnly = F)
15 | BASE <- normalizePath(dirname(sub("^--file=", "", args[grep("^--file=", args)])))
16 |
17 |
18 | # Source Config and functions.R file
19 | source(paste(BASE, "/../config.cfg", sep = ""))
20 | source(paste(BASE_DIR, "/code/functions.R", sep = ""))
21 |
22 | #Load any additional packages
23 | library(parallel)
24 | library(stylo)
25 | library(stringr)
26 | library(tm)
27 |
28 | # Read argument for train or test
29 | trainOrTest <- commandArgs(trailingOnly = TRUE)
30 | if (length(trainOrTest) > 1) {
31 | stop("ERROR: I need only 1 argument : train or test")
32 | }
33 |
34 | if (length(trainOrTest) == 0) {
35 | print("No Arguments passed, Assuming you mean test")
36 | trainOrTest <- "test"
37 | }
38 |
39 | #Load data
40 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "")
41 | cat("Reading file ", FILENAME, "\n", sep = " ")
42 | dat <- read_csv(FILENAME)
43 |
44 |
45 |
46 | # Function to generate Features
47 | getFeatures <- function(x, type) {
48 | if (type == "digit") {
49 | lengths((regmatches(x, gregexpr("[[:digit:]]+",x))))
50 | } else if (type == "cntrl") {
51 | lengths((regmatches(x, gregexpr("[[:cntrl:]]+",x))))
52 | } else if (type == "graph") {
53 | lengths((regmatches(x, gregexpr("[[:graph:]]+",x))))
54 | } else if (type == "punct") {
55 | lengths((regmatches(x, gregexpr("[[:punct:]]+",x))))
56 | } else if (type == "xdigit") {
57 | lengths((regmatches(x, gregexpr("[[:xdigit:]]+",x))))
58 | } else {
59 | return(NA)
60 | }
61 | }
62 |
63 | print("Generating Count Features")
64 | for (i in c("digit", "graph", "punct", "xdigit")) {
65 | for (j in c("cleantitle_1", "cleantitle_2", "cleandesc_1", "cleandesc_2")) {
66 | print(c(i,j))
67 | assign(
68 | paste("countOf", i, "In", j , sep = "_"),
69 | sapply(dat[[j]], getFeatures, type = i, USE.NAMES = FALSE)
70 | )
71 | }
72 | }
73 |
74 | print("Generating Ratio Features")
75 | for (i in c("_digit", "_graph_", "_punct_", "_xdigit_")) {
76 | for (j in c("title", "desc")) {
77 | print(c(i, j))
78 | f_name <- grep(i, grep(j, ls(), value = T), value = T)
79 | ratio <- get(f_name[1]) / get(f_name[2])
80 | ratio <- ifelse(ratio > 1, 1/ratio, ratio)
81 | assign(
82 | paste("ratioOfcountOf", i, "In", j , sep = "_"),
83 | round(ratio, 2)
84 | )
85 | }
86 | }
87 |
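# grep("countOf", ls()) below also matches the ratioOfcountOf_* vectors created above, so the
# output frame contains both the raw counts and their ratios.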
88 | df_master <- as.data.frame(do.call(cbind, list(sapply(grep("countOf", ls(), value = T), get, USE.NAMES = T))))
89 | names(df_master) <- paste("set1f", names(df_master), sep = "_")
90 |
91 | ######## Add Primary Columns ItemID1 and ItemID2
92 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master)
93 | print("Saving Special Counting features")
94 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1f_", "specialCounting.fthr", sep = "" ))
95 |
96 | #END
97 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/legacy/3_feature_set4e_count3way_clean.py:
--------------------------------------------------------------------------------
1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants
2 | #### Author: Marios & Mikel
3 | #### Avito Duplicate Ad Detection
4 | # 3_feature_set4e_count3way_clean.py
5 | # Counts how many 3-word combinations from item1 also appear in item2
6 |
7 | import numpy as np
8 | import pandas as pd
9 | import sys
10 | import jellyfish
11 | import feather
12 | import time
13 | import gc
14 | import re
15 | import math
16 | from collections import Counter
17 | from fuzzywuzzy import fuzz
18 | from multiprocessing import Pool
19 |
20 | import libavito as a
21 |
22 | def count_3words(words, text):
23 |     # Count how many 3-word combinations (ordered triples) from 'words' appear, all three words, in 'text'.
24 | count3 = 0
25 | if len(words) < 3 or len(text) < 3:
26 | return -1
27 | else:
28 | for m in range(0, len(words) - 2):
29 | words1 = words[m]
30 | for n in range(m + 1, len(words) - 1):
31 | words2 = words[n]
32 | for z in range(m + 2, len(words)):
33 | words3 = words[z]
34 |                     if words1 in text and words2 in text and words3 in text:
35 | count3 += 1
36 | return count3
37 |
38 | def process_row(row):
39 |
40 | title = 2
41 | desc = 4
42 | json = 6
43 |
44 | values = []
45 |
46 | values.append(row[0])
47 | values.append(row[1])
48 |
49 | for d in [title, desc, json]:
50 | st_1 = str(row[d]).replace(":", " ").replace('"', ' ')
51 | st_2 = str(row[d + 1]).replace(":", " ").replace('"', ' ')
52 | values.append(count_3words(st_1.split(" "), st_2.split(" ")))
53 |
54 | return values
55 |
56 | print(a.c.BOLD + 'Extracting set4e 3-way word count features ...' + a.c.END)
57 |
58 | # Get train/test mode from launch argument
59 | mode = a.get_mode(sys.argv, '3_feature_set4e_count3way_clean.py')
60 |
61 | ## Read settings required by script
62 | config = a.read_config()
63 | nthreads = config.preprocessing_nthreads
64 | cache_loc = config.cache_loc
65 | debug = config.debug
66 | if mode == 0:
67 | root = config.train_images_root
68 | df = feather.read_dataframe(cache_loc + 'train.fthr')
69 | if mode == 1:
70 | root = config.test_images_root
71 | df = feather.read_dataframe(cache_loc + 'test.fthr')[:1000]
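    # Note: the [:1000] slice above restricts the test set to its first 1000 rows,
    # presumably a debugging shortcut left in this legacy version of the script.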
72 |
73 | df = df[['itemID_1', 'itemID_2', 'cleantitle_1', 'cleantitle_2', 'cleandesc_1', 'cleandesc_2', 'cleanjson_1', 'cleanjson_2']]
74 |
75 | ftrs = []
76 |
77 | start = time.time()
78 | o = len(df.index)
79 | if nthreads == 1:
80 | print('Extracting features with 1 thread ...')
81 | k = 0
82 | # Iterate over files
83 | ftrs = []
84 | for row in df.values:
85 | x = process_row(row)
86 | ftrs.append(x)
87 | k += 1
88 | if k % 1 == 0:
89 | a.print_progress(k, start, o)
90 |
91 | # Otherwise perform multi-threaded mapping
92 | else:
93 | print('Extracting features multi-threaded ... ', end='', flush=True)
94 | pool = Pool(nthreads)
95 | ftrs = pool.map(process_row, df.values)
96 | pool.close()
97 | gc.collect()
98 |
99 | a.print_elapsed(start)
100 |
101 | ftrs = pd.DataFrame(ftrs)
102 | cols = ['itemID_1', 'itemID_2'] + ['set4e_X' + str(i) for i in range(1, len(ftrs.columns.tolist()) - 1)]
103 | print(cols)
104 | ftrs.columns = cols
105 |
106 | # Save updated dataset
107 | if mode == 0:
108 | feather.write_dataframe(ftrs, cache_loc + 'feature_set4e_train.fthr')
109 | if mode == 1:
110 | feather.write_dataframe(ftrs, cache_loc + 'feature_set4e_test.fthr')
111 |
112 | a.print_elapsed(start)
113 | print('set4e extraction complete!')
114 |
115 | # Write status to status file so master script knows whether to proceed.
116 | f = open(cache_loc + 'status.txt', 'a')
117 | f.write('feature_set4e_OK\n')
118 | f.close()
119 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/3_feature_set3d_json1.py:
--------------------------------------------------------------------------------
1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants
2 | #### Author: Peter & Mikel
3 | #### Avito Duplicate Ad Detection
4 | # 3_feature_set3d_json1.py
5 | # Creates json jaccard similarity
6 |
7 | import numpy as np
8 | import pandas as pd
9 | import nltk
10 | import sklearn
11 | import json
12 | import sys
13 | import gc
14 | import feather
15 | from pandas.io.json import json_normalize
16 | import unicodedata
17 | from stop_words import get_stop_words
18 | import time
19 |
20 | import libavito as a
21 |
22 | stopwords = get_stop_words('ru')
23 | punctutation_cats = set(['Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po'])
24 | sno = nltk.stem.SnowballStemmer('russian')
25 |
26 | def get_clean_tokens(text):
27 | newtext = []
28 | text0 = nltk.word_tokenize(text, 'russian')
29 | for y in text0:
30 | y = ''.join(x for x in y
31 | if unicodedata.category(x) not in punctutation_cats)
32 | if len(y) > 0 and y not in stopwords:
33 | newtext.append(sno.stem(y))
34 | return newtext
35 |
36 | def jaccard_similarity(x, y):
37 | intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
38 | union_cardinality = len(set.union(*[set(x), set(y)]))
39 | if union_cardinality == 0:
40 | return -1.0
41 | else:
42 | return intersection_cardinality / float(union_cardinality)
43 |
44 | def ratio_of_matches(x, y):
45 | intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
46 | x_cardinality = len(x)
47 | if x_cardinality == 0:
48 | return -1.0
49 | else:
50 | return intersection_cardinality / float(x_cardinality)
51 |
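# jaccard_similarity is symmetric (shared elements over the union); ratio_of_matches(x, y) is
# directional: the number of distinct shared elements divided by len(x), with -1 flagging an
# empty first argument.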
52 | print(a.c.BOLD + 'Extracting set3d JSON features ...' + a.c.END)
53 |
54 | # Get train/test mode from launch argument
55 | mode = a.get_mode(sys.argv, '3_feature_set3d_json1.py')
56 |
57 | ## Read settings required by script
58 | config = a.read_config()
59 | nthreads = config.preprocessing_nthreads
60 | cache_loc = config.cache_loc
61 | debug = config.debug
62 | if mode == 0:
63 | root = config.train_images_root
64 | df = feather.read_dataframe(cache_loc + 'train.fthr')
65 | if mode == 1:
66 | root = config.test_images_root
67 | df = feather.read_dataframe(cache_loc + 'test.fthr')
68 |
69 | train = df[['itemID_1', 'itemID_2', 'attrsJSON_1', 'attrsJSON_2']]
70 | del df
71 | gc.collect()
72 |
73 | train = train.fillna('')
74 |
75 | ftrs = []
76 |
77 | print('Calculating features ...')
78 | t0 = time.time()
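# Rows whose attrsJSON fields fail to parse are silently dropped by the bare except below,
# so the resulting feature frame can contain fewer rows than the input pairs.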
79 | for i in range(0, len(train.index)):
80 | if i % 10000 == 0:
81 | a.print_progress(i, t0, len(train.index))
82 | try:
83 | jx = train.iloc[i]['attrsJSON_1'].lower()
84 | jy = train.iloc[i]['attrsJSON_2'].lower()
85 | resx = json.loads(jx)
86 | resy = json.loads(jy)
87 | similarkeys = jaccard_similarity(resx.keys(), resy.keys())
88 | similarvals = jaccard_similarity(resx.values(), resy.values())
89 | #out = str(train.iloc[i]['itemID_1']) + " " + str(train.iloc[i]['itemID_2']) + " " + str(similarkeys) + " " + str(similarvals)+ " " + str(len(resx)) + " " + str(len(resy)) + "\n"
90 | ftrs.append([train.iloc[i]['itemID_1'], train.iloc[i]['itemID_2'], similarkeys, similarvals, len(resx), len(resy)])
91 | except:
92 | pass
93 |
94 | start = time.time()
95 | print('Caching data to disk ... ', end='', flush=True)
96 | ftrs = pd.DataFrame(ftrs)
97 | ftrs.columns = ['itemID_1', 'itemID_2', 'similarkeys', 'similarvals', 'nkey1', 'nkey2']
98 |
99 | # Save updated dataset
100 | if mode == 0:
101 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set3d.fthr')
102 | if mode == 1:
103 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set3d.fthr')
104 |
105 | a.print_elapsed(start)
106 | print('set3d extraction complete!')
107 |
108 | # Write status to status file so master script knows whether to proceed.
109 | f = open(cache_loc + 'status.txt', 'a')
110 | f.write('feature_set3d_OK\n')
111 | f.close()
112 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/3_feature_set3c_json.py:
--------------------------------------------------------------------------------
1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants
2 | #### Author: Peter & Mikel
3 | #### Avito Duplicate Ad Detection
4 | # 3_feature_set3c_json.py
5 | # Creates some features from clean jsons
6 |
7 | import numpy as np
8 | import pandas as pd
9 | import nltk
10 | import sklearn
11 | import json
12 | import sys
13 | import gc
14 | import feather
15 | from pandas.io.json import json_normalize
16 | import unicodedata
17 | from stop_words import get_stop_words
18 | import time
19 | from multiprocessing import Pool
20 |
21 | import libavito as a
22 |
23 | stopwords = get_stop_words('ru')
24 | punctutation_cats = set(['Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po'])
25 | sno = nltk.stem.SnowballStemmer('russian')
26 |
27 | def get_clean_tokens(text):
28 | newtext = []
29 | text0 = nltk.word_tokenize(text, 'russian')
30 | for y in text0:
31 | y = ''.join(x for x in y
32 | if unicodedata.category(x) not in punctutation_cats)
33 | if len(y) > 0 and y not in stopwords:
34 | newtext.append(sno.stem(y))
35 | return newtext
36 |
37 | def jaccard_similarity(x, y):
38 | intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
39 | union_cardinality = len(set.union(*[set(x), set(y)]))
40 | if union_cardinality == 0:
41 | return -1.0
42 | else:
43 | return intersection_cardinality / float(union_cardinality)
44 |
45 | def ratio_of_matches(x, y):
46 | intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
47 | x_cardinality = len(x)
48 | if x_cardinality == 0:
49 | return -1.0
50 | else:
51 | return intersection_cardinality / float(x_cardinality)
52 |
53 | print(a.c.BOLD + 'Extracting set3c JSON features ...' + a.c.END)
54 |
55 | # Get train/test mode from launch argument
56 | mode = a.get_mode(sys.argv, '3_feature_set3c_json.py')
57 |
58 | ## Read settings required by script
59 | config = a.read_config()
60 | nthreads = config.preprocessing_nthreads
61 | cache_loc = config.cache_loc
62 | debug = config.debug
63 | if mode == 0:
64 | root = config.train_images_root
65 | df = feather.read_dataframe(cache_loc + 'train.fthr')
66 | if mode == 1:
67 | root = config.test_images_root
68 | df = feather.read_dataframe(cache_loc + 'test.fthr')
69 |
70 | train = df[['itemID_1', 'itemID_2', 'attrsJSON_1', 'attrsJSON_2']]
71 | del df
72 | gc.collect()
73 |
74 | train = train.fillna('')
75 |
76 | ftrs = []
77 |
78 | def process_row(i):
79 | jx = get_clean_tokens(train.iloc[i]['attrsJSON_1'])
80 | jy = get_clean_tokens(train.iloc[i]['attrsJSON_2'])
81 | sim_j = jaccard_similarity(jx, jy)
82 | mat1_j = ratio_of_matches(jx, jy)
83 | mat2_j = ratio_of_matches(jy, jx)
84 | return [train.iloc[i]['itemID_1'], train.iloc[i]['itemID_2'], sim_j, mat1_j, mat2_j]
85 |
86 | t0 = time.time()
87 | if nthreads == 1:
88 | print('Extracting features with 1 thread ...')
89 | for i in range(0, len(train.index)):
90 | if i % 10000 == 0:
91 | a.print_progress(i, t0, len(train.index))
92 | ftrs.append(process_row(i))
93 | else:
94 | print('Extracting features multi-threaded ... ', end='', flush=True)
95 | pool = Pool(nthreads)
96 | ftrs = pool.map(process_row, range(0, len(train.index)))
97 | pool.close()
98 | a.print_elapsed(t0)
99 |
100 | start = time.time()
101 | print('Caching data to disk ... ', end='', flush=True)
102 | ftrs = pd.DataFrame(ftrs)
103 | ftrs.columns = ['itemID_1', 'itemID_2', 'simjson', 'matjson1', 'matjson2']
104 |
105 | # Save updated dataset
106 | if mode == 0:
107 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set3c.fthr')
108 | if mode == 1:
109 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set3c.fthr')
110 |
111 | a.print_elapsed(start)
112 | print('set3c extraction complete!')
113 |
114 | # Write status to status file so master script knows whether to proceed.
115 | f = open(cache_loc + 'status.txt', 'a')
116 | f.write('feature_set3c_OK\n')
117 | f.close()
118 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/3_feature_set3b_title.py:
--------------------------------------------------------------------------------
1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants
2 | #### Author: Peter & Mikel
3 | #### Avito Duplicate Ad Detection
4 | # 3_feature_set3b_title.py
5 | # Creates some features from clean titles
6 |
7 | import numpy as np
8 | import pandas as pd
9 | import nltk
10 | import sklearn
11 | import json
12 | import sys
13 | import gc
14 | import feather
15 | from pandas.io.json import json_normalize
16 | import unicodedata
17 | from stop_words import get_stop_words
18 | import time
19 | from multiprocessing import Pool
20 |
21 | import libavito as a
22 |
23 | stopwords = get_stop_words('ru')
24 | punctutation_cats = set(['Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po'])
25 | sno = nltk.stem.SnowballStemmer('russian')
26 |
27 | def get_clean_tokens(text):
28 | newtext = []
29 | text0 = nltk.word_tokenize(text, 'russian')
30 | for y in text0:
31 | y = ''.join(x for x in y
32 | if unicodedata.category(x) not in punctutation_cats)
33 | if len(y) > 0 and y not in stopwords:
34 | newtext.append(sno.stem(y))
35 | return newtext
36 |
37 | def jaccard_similarity(x, y):
38 | intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
39 | union_cardinality = len(set.union(*[set(x), set(y)]))
40 | if union_cardinality == 0:
41 | return -1.0
42 | else:
43 | return intersection_cardinality / float(union_cardinality)
44 |
45 | def ratio_of_matches(x, y):
46 | intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
47 | x_cardinality = len(x)
48 | if x_cardinality == 0:
49 | return -1.0
50 | else:
51 | return intersection_cardinality / float(x_cardinality)
52 |
53 | print(a.c.BOLD + 'Extracting set3b title features ...' + a.c.END)
54 |
55 | # Get train/test mode from launch argument
56 | mode = a.get_mode(sys.argv, '3_feature_set3b_title.py')
57 |
58 | ## Read settings required by script
59 | config = a.read_config()
60 | nthreads = config.preprocessing_nthreads
61 | cache_loc = config.cache_loc
62 | debug = config.debug
63 | if mode == 0:
64 | root = config.train_images_root
65 | df = feather.read_dataframe(cache_loc + 'train.fthr')
66 | if mode == 1:
67 | root = config.test_images_root
68 | df = feather.read_dataframe(cache_loc + 'test.fthr')
69 |
70 | train = df[['itemID_1', 'itemID_2', 'cleantitle_1', 'cleantitle_2']]
71 | del df
72 | gc.collect()
73 |
74 | train = train.fillna('')
75 |
76 | ftrs = []
77 |
78 | def process_row(i):
79 | tx = train.iloc[i]['cleantitle_1'].split(' ')
80 | ty = train.iloc[i]['cleantitle_2'].split(' ')
81 | sim_t = jaccard_similarity(tx, ty)
82 | mat1_t = ratio_of_matches(tx, ty)
83 | mat2_t = ratio_of_matches(ty, tx)
84 | return [train.iloc[i]['itemID_1'], train.iloc[i]['itemID_2'], sim_t, mat1_t, mat2_t, len(tx), len(ty)]
85 |
86 | t0 = time.time()
87 | if nthreads == 1:
88 | print('Extracting features with 1 thread ...')
89 | for i in range(0, len(train.index)):
90 | if i % 10000 == 0:
91 | a.print_progress(i, t0, len(train.index))
92 | ftrs.append(process_row(i))
93 | else:
94 | print('Extracting features multi-threaded ... ', end='', flush=True)
95 | pool = Pool(nthreads)
96 | ftrs = pool.map(process_row, range(0, len(train.index)))
97 | pool.close()
98 | a.print_elapsed(t0)
99 |
100 | start = time.time()
101 | print('Caching data to disk ... ', end='', flush=True)
102 | ftrs = pd.DataFrame(ftrs)
103 | ftrs.columns = ['itemID_1', 'itemID_2', 'simtitle', 'mattitle1', 'mattitle2', 'nwords1', 'nwords2']
104 |
105 | # Save updated dataset
106 | if mode == 0:
107 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set3b.fthr')
108 | if mode == 1:
109 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set3b.fthr')
110 |
111 | a.print_elapsed(start)
112 | print('set3b extraction complete!')
113 |
114 | # Write status to status file so master script knows whether to proceed.
115 | f = open(cache_loc + 'status.txt', 'a')
116 | f.write('feature_set3b_OK\n')
117 | f.close()
118 |
--------------------------------------------------------------------------------
/AnalyticsVidhya/Date-your-Data/1_internship_WordCorrection.R:
--------------------------------------------------------------------------------
1 | library(qdap)
2 | library(dplyr)
3 | library(tidyr)
4 | library(readr)
5 | library(stringr)
6 | library(tm)
7 |
8 | #LOAD DATA
9 | internship <- read_csv("../data/Internship.csv", na = c("", "NA", "NULL"))
10 |
11 | SPELLINGERRORS <- check_spelling(internship$Internship_Profile,
12 | assume.first.correct = TRUE,
13 | n.suggests = 4)
14 | SPELLINGERRORS <- data.frame(lapply(SPELLINGERRORS, as.character),
15 | stringsAsFactors=FALSE) %>%
16 | select(not.found, suggestion)
17 | #Remove Duplicate rows
18 | SPELLINGERRORS <- SPELLINGERRORS[!duplicated(SPELLINGERRORS[1:2]), ]
19 |
20 | #Now check sort(SPELLINGERRORS$not.found) to see which entries are actual spelling mistakes and which are correct words that only need normalisation
21 | #Below are the ones I have observed:
22 | SPELL_MISTAKES <- c("activites", "ambassodor", "andoid", "andorid", "andriod", "anubhava","autid","bussiness","chemsitry",
23 | "coordinaing","cosnulting","develoment","developement","develpoment","enrolment","facilitation",
24 | "finanace","managemnt","managment","mangement","marekting","markting","notejs","nutritionist","oflline","optimaization",
25 | "optimization","optmization","pharmacovigilance","reasearch","recruiter","professonal","requirment","retreival","socia",
26 | "trbology","tution","varification","vertification","writitng")
27 |
28 | SPELLINGERRORS <- SPELLINGERRORS[(SPELLINGERRORS$not.found %in% SPELL_MISTAKES), ]
29 | SIMILAR_WORDS <- list(
30 | c("apps", "app"),
31 | c("Accounting", "Accountant"),
32 | c("back-end", "backend"),
33 | c("beckend", "backend"),
34 | c("back end", "backend"),
35 | c("blog", "blogger"),
36 | c("blogging", "blogger"),
37 | c("blogs", "blogger"),
38 | c("cataloguing" ,"catalogue"),
39 | c("curating", "curation"),
40 | c("desiging", "design"),
41 | c("desigining", "design"),
42 | c("designe", "design"),
43 | c("telecalling", "telecaller"),
44 | c("telecommunications", "telecom"),
45 | c("trbology" , "tribology"),
46 | c("oflline", "offline")
47 | )
48 | m <- matrix(unlist(SIMILAR_WORDS), byrow = TRUE, ncol = 2)
49 | colnames(m) <- c("not.found", "suggestion")
50 | SPELLINGERRORS <- rbind(SPELLINGERRORS, m)
51 |
52 | #Function to replace Spelling errors
53 | replaceSpellingErrors <- function(words) {
54 | b <- c()
55 | for (i in unlist(strsplit(words, " "))) {
56 | if (i %in% SPELLINGERRORS$not.found) {
57 | b <- append(b, SPELLINGERRORS$suggestion[SPELLINGERRORS$not.found == i])
58 | } else {
59 | b <- append(b, i)
60 | }
61 | }
62 | return(paste(b, collapse = " "))
63 | }
64 |
65 | #Function to remove all unwanted stuff
66 | cleanUpText <- function(words, stem = TRUE) {
67 |   #Replace all non-printable (non-graph) characters with a space
68 | words <- str_replace_all(words,"[^[:graph:]]", " ")
69 | words <- gsub("[^[:alpha:][:space:]]*", "", words)
70 | words <- tolower(words)
71 | #Remove Punctuation except Hyphen -
72 | words <- gsub("([-])|[[:punct:]]", '\\1', words)
73 | #Remove all extra whitespace
74 |   words <- gsub("\\s+", " ", str_trim(words))
75 | #Replace all spelling errors
76 | words <- replaceSpellingErrors(words)
77 | #Stemming if stem = TRUE
78 | stemList <- c()
79 | if (stem) {
80 | for (i in words) {
81 | i <- gsub("[[:punct:]]$", "", i) #Remove any trailing punctuation mark
82 | i <- gsub("^[[:punct:]]", "", i) #Remove any leading punctuation mark
83 | j <- paste(stemDocument(unlist(strsplit(i," "))), collapse = " ")
84 | stemList <- append(stemList, j)
85 | }
86 | return(stemList)
87 | } else {
88 | return(words)
89 | }
90 | }
91 |
92 | t <- Sys.time()
93 | for (i in c("Internship_Profile")) {
94 | print(i)
95 | #internship[[i]] <- cleanUpText(internship[[i]], stem = TRUE)
96 | internship[[i]] <- sapply(internship[[i]], cleanUpText)
97 | }
98 | print(Sys.time()-t)
99 |
100 | #Save file
101 | write.csv(internship, "../data/Internship_Processed.csv", row.names = FALSE)
102 |
103 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/3_feature_set3a_description.py:
--------------------------------------------------------------------------------
1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants
2 | #### Author: Peter & Mikel
3 | #### Avito Duplicate Ad Detection
4 | # 3_feature_set3a_description.py
5 | # Creates some features from clean descriptions
6 |
7 | import numpy as np
8 | import pandas as pd
9 | import nltk
10 | import sklearn
11 | import json
12 | import sys
13 | import gc
14 | import feather
15 | from pandas.io.json import json_normalize
16 | import unicodedata
17 | from stop_words import get_stop_words
18 | import time
19 | from multiprocessing import Pool
20 |
21 | import libavito as a
22 |
23 | stopwords = get_stop_words('ru')
24 | punctutation_cats = set(['Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po'])
25 | sno = nltk.stem.SnowballStemmer('russian')
26 |
27 | def get_clean_tokens(text):
28 | newtext = []
29 | text0 = nltk.word_tokenize(text, 'russian')
30 | for y in text0:
31 | y = ''.join(x for x in y
32 | if unicodedata.category(x) not in punctutation_cats)
33 | if len(y) > 0 and y not in stopwords:
34 | newtext.append(sno.stem(y))
35 | return newtext
36 |
37 | def jaccard_similarity(x, y):
38 | intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
39 | union_cardinality = len(set.union(*[set(x), set(y)]))
40 | if union_cardinality == 0:
41 | return -1.0
42 | else:
43 | return intersection_cardinality / float(union_cardinality)
44 |
45 | def ratio_of_matches(x, y):
46 | intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
47 | x_cardinality = len(x)
48 | if x_cardinality == 0:
49 | return -1.0
50 | else:
51 | return intersection_cardinality / float(x_cardinality)
52 |
53 | print(a.c.BOLD + 'Extracting set3a description features ...' + a.c.END)
54 |
55 | # Get train/test mode from launch argument
56 | mode = a.get_mode(sys.argv, '3_feature_set3a_description.py')
57 |
58 | ## Read settings required by script
59 | config = a.read_config()
60 | nthreads = config.preprocessing_nthreads
61 | cache_loc = config.cache_loc
62 | debug = config.debug
63 | if mode == 0:
64 | root = config.train_images_root
65 | df = feather.read_dataframe(cache_loc + 'train.fthr')
66 | if mode == 1:
67 | root = config.test_images_root
68 | df = feather.read_dataframe(cache_loc + 'test.fthr')
69 |
70 | train = df[['itemID_1', 'itemID_2', 'cleandesc_1', 'cleandesc_2']]
71 | del df
72 | gc.collect()
73 |
74 | train = train.fillna('')
75 |
76 | ftrs = []
77 |
78 | def process_row(i):
79 | dx = train.iloc[i]['cleandesc_1'].split(' ')
80 | dy = train.iloc[i]['cleandesc_2'].split(' ')
81 | sim_d = jaccard_similarity(dx, dy)
82 | mat1_d = ratio_of_matches(dx, dy)
83 | mat2_d = ratio_of_matches(dy, dx)
84 | return [train.iloc[i]['itemID_1'], train.iloc[i]['itemID_2'], sim_d, mat1_d, mat2_d, len(dx), len(dy)]
85 |
86 | # print('Calculating features ...')
87 | t0 = time.time()
88 | if nthreads == 1:
89 | print('Extracting features with 1 thread ...')
90 | for i in range(0, len(train.index)):
91 | if i % 10000 == 0:
92 | a.print_progress(i, t0, len(train.index))
93 | ftrs.append(process_row(i))
94 | else:
95 | print('Extracting features multi-threaded ... ', end='', flush=True)
96 | pool = Pool(nthreads)
97 | ftrs = pool.map(process_row, range(0, len(train.index)))
98 | pool.close()
99 | a.print_elapsed(t0)
100 |
101 | start = time.time()
102 | print('Caching data to disk ... ', end='', flush=True)
103 | ftrs = pd.DataFrame(ftrs)
104 | ftrs.columns = ['itemID_1', 'itemID_2', 'simdesc', 'mat1_d', 'mat2_d', 'nwords1', 'nwords2']
105 |
106 | # Save updated dataset
107 | if mode == 0:
108 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set3a.fthr')
109 | if mode == 1:
110 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set3a.fthr')
111 |
112 | a.print_elapsed(start)
113 | print('set3a extraction complete!')
114 |
115 | # Write status to status file so master script knows whether to proceed.
116 | f = open(cache_loc + 'status.txt', 'a')
117 | f.write('feature_set3a_OK\n')
118 | f.close()
119 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/3_feature_set1e_attribute.R:
--------------------------------------------------------------------------------
1 | ################################################################################################
2 | ################################################################################################
3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants
4 | #### Competition: Avito Duplicate Ad Detection
5 | # Filename : 3_feature_set1e_attribute.R
6 | # Description: This Rscript generates all Attribute (Json) features
7 | # Usage:
8 | # Rscript ./code/3_feature_set1e_attribute.R train
9 | # Rscript ./code/3_feature_set1e_attribute.R test
10 | # Default argument is test
11 | ################################################################################################
12 | ################################################################################################
13 | args <- commandArgs(trailingOnly = F)
14 | BASE <- normalizePath(dirname(sub("^--file=", "", args[grep("^--file=", args)])))
15 |
16 |
17 | # Source Config and functions.R file
18 | source(paste(BASE, "/../config.cfg", sep = ""))
19 | source(paste(BASE_DIR, "/code/functions.R", sep = ""))
20 |
21 | #Load any additional packages
22 | library(parallel)
23 | library(jsonlite)
24 |
25 | # Read argument for train or test
26 | trainOrTest <- commandArgs(trailingOnly = TRUE)
27 | if (length(trainOrTest) > 1) {
28 | stop("ERROR: I need only 1 argument : train or test")
29 | }
30 |
31 | if (length(trainOrTest) == 0) {
32 | print("No Arguments passed, Assuming you mean test")
33 | trainOrTest <- "test"
34 | }
35 |
36 | #Load data
37 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "")
38 | cat("Reading file ", FILENAME, "\n", sep = " ")
39 | dat <- read_csv(FILENAME)
40 |
41 |
42 |
43 | #Function to generate Attribute Features
44 | attribute_feature <- function(w) {
45 | x <- w[1]
46 | y <- w[2]
47 | if (is.na(x) | is.na(y) | x == "[]" | y == "[]") {
48 | return(rep(NA,8))
49 | }
50 | x <- paste("[", x, "]", sep = "")
51 | y <- paste("[", y, "]", sep = "")
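  # Wrapping the JSON object in [ ] makes fromJSON(simplifyDataFrame = TRUE) return a one-row
  # data.frame, so each attribute becomes a column and ncol() counts the attributes.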
52 | x.df <- fromJSON(x, simplifyDataFrame = TRUE)
53 | y.df <- fromJSON(y, simplifyDataFrame = TRUE)
54 | N_Attr_x <- ncol(x.df)
55 | N_Attr_y <- ncol(y.df)
56 | if (N_Attr_x == 0 | N_Attr_y == 0) {
57 | return(rep(NA,8))
58 | }
59 | L <- length(intersect(names(x.df), names(y.df)))
60 | ratioOfPercentageOfMatchingAttributesNames <- L / min(N_Attr_x, N_Attr_y)
61 | ratioOfPercentageOfMatchingAttributesValues <- NA
62 | c <- 0
63 | if (ratioOfPercentageOfMatchingAttributesNames > 0) {
64 | for (i in intersect(names(x.df), names(y.df))) {
65 | if (x.df[[i]] == y.df[[i]]) {
66 | c <- c + 1
67 | }
68 | }
69 | ratioOfPercentageOfMatchingAttributesValues <- c / L
70 | }
71 | numberOfAttributes_sum <- N_Attr_x + N_Attr_y
72 | numberOfAttributes_diff <- abs(N_Attr_x - N_Attr_y)
73 | numberOfAttributes_min <- min(N_Attr_x, N_Attr_y)
74 | numberOfAttributes_max <- max(N_Attr_x, N_Attr_y)
75 |
76 | return(c(
77 | numberOfAttributes_sum,
78 | numberOfAttributes_diff,
79 | numberOfAttributes_min,
80 | numberOfAttributes_max,
81 | L,
82 | ratioOfPercentageOfMatchingAttributesNames,
83 | c,
84 | ratioOfPercentageOfMatchingAttributesValues
85 | ))
86 |
87 |
88 | }
89 |
90 | print("Generating Features")
91 | #This could be made parallel; I didn't do that as of now
92 | df_master <- as.data.frame(t(apply(dat[, c("cleanjson_1", "cleanjson_2")], 1, attribute_feature)))
93 | names(df_master) <- c(
94 | "numberOfAttributes_sum",
95 | "numberOfAttributes_diff",
96 | "numberOfAttributes_min",
97 | "numberOfAttributes_max",
98 | "NoOfMatchingAttributesNames",
99 | "ratioOfPercentageOfMatchingAttributesNames",
100 | "NoOfMatchingAttributesValues",
101 | "ratioOfPercentageOfMatchingAttributesValues"
102 | )
103 |
104 | names(df_master) <- paste("set1e", names(df_master), sep = "_")
105 |
106 | ######## Add Primary Columns ItemID1 and ItemID2
107 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master)
108 | print("Saving Attributes features")
109 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1e_", "attributes.fthr", sep = "" ))
110 |
111 | #END
112 |
--------------------------------------------------------------------------------
/Microsoft/Womens-Health-Risk-Assessment/Predict.R:
--------------------------------------------------------------------------------
1 | #(c) Sonny Laskar (sonnylaskar at gmail Dot Com)
2 | #Create a zip file with all packages that are not available in the Microsoft Azure environment and upload the zip.
3 | #The zip file is available in "src" folder. My zip was named downloaded_packages.zip
4 | install.packages("src/downloaded_packages/stringi_1.1.1.zip", lib = ".", repos = NULL, verbose = TRUE)
5 | install.packages("src/downloaded_packages/magrittr_1.5.zip", lib = ".", repos = NULL, verbose = TRUE)
6 | install.packages("src/downloaded_packages/xgboost_0.4-4.zip", lib = ".", repos = NULL, verbose = TRUE)
7 |
8 | library(xgboost, lib.loc=".", verbose=TRUE)
9 | library(dplyr)
10 | library(gbm)
11 | library(randomForest)
12 | # Map 1-based optional input ports to variables
13 | dataset1 <- maml.mapInputPort(1) # class: data.frame
14 | dataset1$segment <- NULL
15 | dataset1$subgroup <- NULL
16 | cat("Original dim: ", dim(dataset1), "\n")
17 |
18 |
19 | encode_religion <- function(dat) {
20 | #Input: Character Vector for religion
21 | #Output: Numeric Vector
22 | dat <- ifelse(dat == "Buddhist", 1, dat)
23 | dat <- ifelse(dat == "Evangelical/Bo", 2, dat)
24 | dat <- ifelse(dat == "Hindu", 3, dat)
25 | dat <- ifelse(dat == "Jewish", 4, dat)
26 | dat <- ifelse(dat == "Muslim", 5, dat)
27 | dat <- ifelse(dat == "Other", 6, dat)
28 | dat <- ifelse(dat == "Other Christia", 7, dat)
29 | dat <- ifelse(dat == "Roman Catholic", 8, dat)
30 | dat <- ifelse(dat == "Russian/Easter", 9, dat)
31 | dat <- ifelse(dat == "Traditional/An", 10, dat)
32 | dat <- ifelse(dat == "", NA, dat)
33 | dat <- as.integer(dat)
34 | return(dat)
35 | }
36 |
37 | manual_encode_religion <- function(dat) {
38 | #Input: Character Vector for religion
39 | #Output: Numeric Vector
40 | RELIGION <- c("Hindu", "Evangelical/Bo", "Muslim", "Roman Catholic", "Other Christia", "Buddhist", "Russian/Easter", "Traditional/An", "Other", "Jewish")
41 | for (i in RELIGION) {
42 | c <- paste("religion", i, sep = ".")
43 | print(c)
44 | dat[[c]] <- ifelse(dat$religion == i, 1, 0)
45 | }
46 | dat$religion <- encode_religion(dat$religion)
47 | return(dat)
48 | }
49 |
50 | featureEngineering <- function(dat) {
51 | dat$INTNR <- NULL
52 | dat$geo <- as.integer(dat$geo)
53 | dat <- manual_encode_religion(dat)
54 | dat$segment <- NULL
55 | dat$subgroup <- NULL
56 | dat[is.na(dat)] <- -1
57 | dat$christian <- as.numeric(dat$christian) #Xgboost needs at least one column as numeric
58 | #Random Forest cannot handle / and space in colnames
59 | names(dat) <- gsub("/", "_", names(dat))
60 | names(dat) <- gsub(" ", "_", names(dat))
61 |
62 | return(dat)
63 | }
64 | dataset1 <- featureEngineering(dataset1)
65 | cat("New dim: ", dim(dataset1), "\n")
66 |
67 |
68 | sub <- data.frame(patientID = NULL, geo = NULL, class = NULL)
69 | for (GEO in 1:9) {
70 | print(GEO)
71 | dat <- dataset1[dataset1$geo == GEO, ]
72 | cat("New dim: ", dim(dat), "\n")
73 | if (nrow(dat) == 0) next
74 | patientID <- dat$patientID
75 | dat$patientID <- NULL
76 |
77 | if (GEO == 1) classes <- c("11","21","22")
78 | if (GEO == 2) classes <- c("11","12","21","22","31","41")
79 | if (GEO == 3) classes <- c("11","12","21","22")
80 | if (GEO == 4) classes <- c("11","12")
81 | if (GEO == 5) classes <- c("11","12","22","31","32")
82 | if (GEO == 6) classes <- c("11","12","21")
83 | if (GEO == 7) classes <- c("11","12","21","22","31")
84 | if (GEO == 8) classes <- c("11","21","31","41")
85 | if (GEO == 9) classes <- c("11","12","21","31","32")
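  # The admissible class labels differ per geo, so a separate pre-trained xgboost and random
  # forest model is loaded for each geo from src/downloaded_packages/ and blended below.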
86 | #LOAD XGB Model
87 | xgb_1000 <- readRDS(paste("src/downloaded_packages/xgb_geo_", GEO ,"_seed1000.model", sep = ""))
88 |
89 | xgb_test <- predict(xgb_1000, data.matrix(dat), missing=NA)
90 | xgb_test <- as.data.frame(matrix(xgb_test,
91 | nrow=nrow(dat),
92 | byrow = TRUE))
93 | colnames(xgb_test) <- classes
94 |
95 | #LOAD RF Model
96 | rf_1000 <- readRDS(paste("src/downloaded_packages/rf_geo_", GEO ,"_seed1000.model", sep = ""))
97 | rf_test <- as.data.frame(predict(rf_1000,
98 | dat,
99 | type= "prob"))
100 | colnames(rf_test) <- classes
101 |
102 | #Combined Weightage
103 | final <- (xgb_test*0.4 + rf_test*0.6)
104 | final$NEW <- apply(final, 1, function(x) {
105 | m <- which.max(x)
106 | names(final)[m]
107 | })
108 | sub <- rbind(sub, data.frame(patientID = patientID, geo = dat$geo, class = final$NEW))
109 | }
110 |
111 | data.set <- data.frame(patientID = sub$patientID,
112 | Geo_Pred = sub$geo,
113 | Segment_Pred = as.integer(substring(sub$class, 1, 1)),
114 | Subgroup_Pred = as.integer(substring(sub$class, 2, 2))
115 | )
116 |
117 | print(str(data.set))
118 | maml.mapOutputPort("data.set");
119 |
120 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/3_json_to_cols.py:
--------------------------------------------------------------------------------
1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants
2 | #### Author: Peter & Mikel
3 | #### Avito Duplicate Ad Detection
4 | # 3_json_to_cols.py
5 | # Encodes json key similarity into a sparse format for feature extraction
6 |
7 | import numpy as np
8 | import pandas as pd
9 | import sklearn
10 | import json
11 | from pandas.io.json import json_normalize
12 | import unicodedata
13 | import time
14 | import codecs
15 | import feather
16 |
17 | import libavito as a
18 |
19 | def jaccard_similarity(x, y):
20 | intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
21 | union_cardinality = len(set.union(*[set(x), set(y)]))
22 | if union_cardinality == 0:
23 | return -1.0
24 | else:
25 | return intersection_cardinality / float(union_cardinality)
26 |
27 | ## Read settings required by script
28 | config = a.read_config()
29 | nthreads = config.preprocessing_nthreads
30 | cache_loc = config.cache_loc
31 | debug = config.debug
32 | df_train = feather.read_dataframe(cache_loc + 'train.fthr')
33 | df_test = feather.read_dataframe(cache_loc + 'test.fthr')
34 |
35 | df_train = df_train[['itemID_1', 'itemID_2', 'cleanjson_1', 'cleanjson_2']]
36 | df_test = df_test[['itemID_1', 'itemID_2', 'cleanjson_1', 'cleanjson_2']]
37 |
38 | df = pd.concat([df_train, df_test])
39 |
40 | clean_jsons = df['cleanjson_1'].tolist() + df['cleanjson_2'].tolist()
41 |
42 | print('Creating key dict ... ')
43 | allkey = {}
44 | pa = 0
45 | t0 = time.time()
46 | for i in range(0, len(clean_jsons)):
47 | if i % 100000 == 0:
48 | a.print_progress(i, t0, len(clean_jsons))
49 | try:
50 | jx = clean_jsons[i].replace("'", "")
51 | resx = json.loads(jx)
52 | for x in resx.keys():
53 | if x in allkey:
54 | allkey[x] = allkey[x] + 1
55 | else:
56 | allkey[x] = 1
57 | except KeyboardInterrupt:
58 | raise
59 | except Exception as e:
60 | pa += 1
61 |
62 | t0 = time.time()
63 | print('Transforming key dict ... ', end='', flush=True)
64 | icount = 0
65 | keydict = {}
66 | for k, n in allkey.items():
67 | keydict[k] = icount
68 | icount += 1
69 | a.print_elapsed(t0)
70 |
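# For every key present in either ad's JSON, one long-format row is emitted:
# (itemID_1, itemID_2, keyID, Jaccard similarity of the two values), with -1 when only one ad
# carries the key; keyID comes from the dictionary built above.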
71 | ftrs_train = []
72 | print('Generating for train ... ')
73 | t0 = time.time()
74 | pa = 0
75 | for i in range(0, len(df_train.index)):
76 | if i % 10000 == 0:
77 | a.print_progress(i, t0, len(df_train.index))
78 | try:
79 | jx = df_train.iloc[i]['cleanjson_1'].replace("'", "")
80 | jy = df_train.iloc[i]['cleanjson_2'].replace("'", "")
81 | resx = json.loads(jx)
82 | resy = json.loads(jy)
83 | except KeyboardInterrupt:
84 | raise
85 | except:
86 | continue
87 |
88 | if resx != [] and resy != []:
89 | for key in set.union(*[set(resx.keys()), set(resy.keys())]):
90 | if key in resx.keys() and key in resy.keys():
91 | c = resx[key]
92 | b = resy[key]
93 | res = jaccard_similarity(c, b)
94 | else:
95 | res = -1
96 | ftrs_train.append([df_train.iloc[i]['itemID_1'], df_train.iloc[i]['itemID_2'], str(keydict[key]), str(res)])
97 | else:
98 | pa += 1
99 |
100 | ftrs_test = []
101 | print('Generating for test ... ')
102 | t0 = time.time()
103 | for i in range(0, len(df_test.index)):
104 | if i % 10000 == 0:
105 | a.print_progress(i, t0, len(df_test.index))
106 | try:
107 | jx = df_test.iloc[i]['cleanjson_1'].replace("'", '')
108 | jy = df_test.iloc[i]['cleanjson_2'].replace("'", '')
109 | resx = json.loads(jx)
110 | resy = json.loads(jy)
111 | except KeyboardInterrupt:
112 | raise
113 | except:
114 | continue
115 |
116 | if resx != [] and resy != []:
117 | for key in set.union(*[set(resx.keys()), set(resy.keys())]):
118 | if key in resx.keys() and key in resy.keys():
119 | c = resx[key]
120 | b = resy[key]
121 | res = jaccard_similarity(c, b)
122 | else:
123 | res = -1
124 | ftrs_test.append([df_test.iloc[i]['itemID_1'], df_test.iloc[i]['itemID_2'], str(keydict[key]), str(res)])
125 | else:
126 | pa += 1
127 |
128 | print("\nError rows: " + str(pa))
129 |
130 | print(len(ftrs_train))
131 | print(len(ftrs_test))
132 |
133 | print('Transforming features ... ', end='', flush=True)
134 | t0 = time.time()
135 | ftrs_train = pd.DataFrame(ftrs_train)
136 | ftrs_test = pd.DataFrame(ftrs_test)
137 | ftrs_train.columns = ['itemID_1', 'itemID_2', 'keyID', 'value']
138 | ftrs_test.columns = ['itemID_1', 'itemID_2', 'keyID', 'value']
139 | a.print_elapsed(t0)
140 |
141 | print('Caching data to disk ... ', end='', flush=True)
142 | t0 = time.time()
143 | feather.write_dataframe(ftrs_train, cache_loc + 'json_vals_train_v2.fthr')
144 | feather.write_dataframe(ftrs_test, cache_loc + 'json_vals_test_v2.fthr')
145 | a.print_elapsed(t0)
146 |
147 | print('json_to_cols Complete!')
148 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/functions.R:
--------------------------------------------------------------------------------
1 | #### Copyright 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants
2 | #### Avito Duplicate Ad Detection
3 | # functions.R
4 | # Helper functions shared by the feature scripts: word and character n-gram overlap counts
5 |
6 | #Load Basic packages needed by all R scripts
7 | library(readr)
8 | library(dplyr)
9 | library(tidyr)
10 | library(feather)
11 | library(stringr); library(tm); library(stylo) # used by the n-gram helpers below (str_trim, stemDocument, make.ngrams, txt.to.features)
12 | ######## GET NGRAMS FUNCTIONS
13 | getNGrams <- function(my.text, n = 1) {
14 |   # split the whitespace-normalized text into a vector of stemmed words:
15 | my.vector.of.words <- stemDocument(unlist(strsplit(gsub("\\s+", " ", str_trim(my.text)), " ")))
16 | # now, we create a vector of word n-grams:
17 | if (length(my.vector.of.words) >= n) {
18 | make.ngrams(my.vector.of.words, ngram.size = n)
19 | } else {
20 | return(NULL)
21 | }
22 | }
23 | ######## GET NCHARS FUNCTIONS
24 | getNGramsChars <- function(my.text, n = 1) {
25 |   # split the whitespace-normalized text into a vector of stemmed words:
26 |   my.vector.of.words <- stemDocument(unlist(strsplit(gsub("\\s+", " ", str_trim(my.text)), " ")))
27 |   # now, we create a vector of character n-grams:
28 | if (length(my.vector.of.words) >= n) {
29 | my.vector.of.chars = txt.to.features(my.vector.of.words, features = "c")
30 | make.ngrams(my.vector.of.chars, ngram.size = n)
31 | } else {
32 | return(NULL)
33 | }
34 | }
35 |
36 | ## NGRAMS
37 | getNgramsCount <- function(string1, string2, n = 1) {
38 | #######################################
39 | # COUNTING NGRAMS FEATURES
40 | #######################################
41 | #Generate Ngrams
42 | NgramsString1 <- getNGrams(tolower(string1), n)
43 | NgramsString2 <- getNGrams(tolower(string2), n)
44 |
45 | #Count of Ngrams
46 | countOfNgramsInString1 <- length(NgramsString1)
47 | countOfNgramsInString2 <- length(NgramsString2)
48 | ratioOfNgrams_String1_String2 <- round(countOfNgramsInString1 / countOfNgramsInString2, 3)
49 |
50 | #Count of Unique NGrams
51 | countOfUniqueNgramsInString1 <- length(unique(NgramsString1))
52 | countOfUniqueNgramsInString2 <- length(unique(NgramsString2))
53 | ratioOfUniqueNgrams_String1_String2 <- round(countOfUniqueNgramsInString1 / countOfUniqueNgramsInString2, 3)
54 |
55 | ratioOfIntersect_Ngrams_String1_in_String2 <- round(sum(NgramsString1 %in% NgramsString2) / countOfNgramsInString1, 3)
56 | ratioOfIntersect_Ngrams_String2_in_String1 <- round(sum(NgramsString2 %in% NgramsString1) / countOfNgramsInString2, 3)
57 |
58 | countOfNgramsInString_min <- min( countOfNgramsInString1, countOfNgramsInString2 )
59 | countOfNgramsInString_max <- max( countOfNgramsInString1, countOfNgramsInString2 )
60 | countOfNgramsInString_sum <- ( countOfNgramsInString1 + countOfNgramsInString2 )
61 | countOfNgramsInString_diff <- abs( countOfNgramsInString1 - countOfNgramsInString2 )
62 |
63 | return(c(
64 | countOfNgramsInString_min,
65 | countOfNgramsInString_max,
66 | countOfNgramsInString_sum,
67 | countOfNgramsInString_diff,
68 | countOfNgramsInString1,
69 | countOfNgramsInString2,
70 | countOfUniqueNgramsInString1,
71 | countOfUniqueNgramsInString2,
72 | ratioOfNgrams_String1_String2,
73 | ratioOfUniqueNgrams_String1_String2,
74 | ratioOfIntersect_Ngrams_String1_in_String2,
75 | ratioOfIntersect_Ngrams_String2_in_String1
76 | ))
77 | }
78 |
79 | ## NCHARS
80 | getNcharsCount <- function(string1, string2, n = 1) {
81 | #######################################
82 | # COUNTING Nchars FEATURES
83 | #######################################
84 | #Generate Nchars
85 | NcharsString1 <- getNGramsChars(tolower(string1), n)
86 | NcharsString2 <- getNGramsChars(tolower(string2), n)
87 |
88 | #Count of Nchars
89 | countOfNcharsInString1 <- length(NcharsString1)
90 | countOfNcharsInString2 <- length(NcharsString2)
91 | ratioOfNchars_String1_String2 <- round(countOfNcharsInString1 / countOfNcharsInString2, 3)
92 |
93 | #Count of Unique Nchars
94 | countOfUniqueNcharsInString1 <- length(unique(NcharsString1))
95 | countOfUniqueNcharsInString2 <- length(unique(NcharsString2))
96 | ratioOfUniqueNchars_String1_String2 <- round(countOfUniqueNcharsInString1 / countOfUniqueNcharsInString2, 3)
97 |
98 | ratioOfIntersect_Nchars_String1_in_String2 <- round(sum(NcharsString1 %in% NcharsString2) / countOfNcharsInString1, 3)
99 | ratioOfIntersect_Nchars_String2_in_String1 <- round(sum(NcharsString2 %in% NcharsString1) / countOfNcharsInString2, 3)
100 |
101 | countOfNcharsInString_min <- min( countOfNcharsInString1, countOfNcharsInString2 )
102 | countOfNcharsInString_max <- max( countOfNcharsInString1, countOfNcharsInString2 )
103 | countOfNcharsInString_sum <- ( countOfNcharsInString1 + countOfNcharsInString2 )
104 | countOfNcharsInString_diff <- abs(( countOfNcharsInString1 - countOfNcharsInString2 ))
105 |
106 | return(c(
107 | countOfNcharsInString_min,
108 | countOfNcharsInString_max,
109 | countOfNcharsInString_sum,
110 | countOfNcharsInString_diff,
111 | countOfNcharsInString1,
112 | countOfNcharsInString2,
113 | countOfUniqueNcharsInString1,
114 | countOfUniqueNcharsInString2,
115 | ratioOfNchars_String1_String2,
116 | ratioOfUniqueNchars_String1_String2,
117 | ratioOfIntersect_Nchars_String1_in_String2,
118 | ratioOfIntersect_Nchars_String2_in_String1
119 | ))
120 | }
121 |
122 |
123 |
--------------------------------------------------------------------------------
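getNgramsCount and getNcharsCount in functions.R both boil down to counting n-grams on each side of a pair and reporting min/max/sum/diff plus the two directional overlap ratios. A rough Python sketch of the word-n-gram case, without the stemming that the R helpers apply via stemDocument:

def word_ngrams(text, n=1):
    # Consecutive word n-grams of a lower-cased string (no stemming in this sketch)
    words = text.lower().split()
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]

def ngram_overlap_features(string_1, string_2, n=1):
    g1, g2 = word_ngrams(string_1, n), word_ngrams(string_2, n)
    c1, c2 = len(g1), len(g2)
    return {
        'count_min': min(c1, c2),
        'count_max': max(c1, c2),
        'count_sum': c1 + c2,
        'count_diff': abs(c1 - c2),
        'ratio_counts': round(c1 / c2, 3) if c2 else None,
        'ratio_unique_counts': round(len(set(g1)) / len(set(g2)), 3) if set(g2) else None,
        # share of each side's n-grams that also occur on the other side
        'ratio_1_in_2': round(sum(g in g2 for g in g1) / c1, 3) if c1 else None,
        'ratio_2_in_1': round(sum(g in g1 for g in g2) / c2, 3) if c2 else None,
    }

print(ngram_overlap_features('red metallic city bike', 'red city bike for sale', n=2))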
/Kaggle/Avito Duplicate Ad Detection/code/3_feature_set3f_hamming.py:
--------------------------------------------------------------------------------
1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants
2 | #### Author: Peter & Mikel
3 | #### Avito Duplicate Ad Detection
4 | # 3_feature_set3f_hamming.py
5 | # Creates features from image dHashes
6 |
7 | import pandas as pd
8 | import numpy as np
9 | import sys
10 | import feather
11 | import time
12 | import gc
13 | from multiprocessing import Pool
14 |
15 | import libavito as a
16 |
17 | def debug(s):
18 | print(str(s))
19 | time.sleep(1)
20 |
21 | print(a.c.BOLD + 'Extracting set3f image hamming features ...' + a.c.END)
22 |
23 | # Get train/test mode from launch argument
24 | mode = a.get_mode(sys.argv, '3_feature_set3f_hamming.py')
25 |
26 | ## Read settings required by script
27 | config = a.read_config()
28 | nthreads = config.preprocessing_nthreads
29 | cache_loc = config.cache_loc
30 | #debug = config.debug
31 | if mode == 0:
32 | df = feather.read_dataframe(cache_loc + 'train.fthr')
33 | if mode == 1:
34 | df = feather.read_dataframe(cache_loc + 'test.fthr')
35 |
36 | root = config.images_root
37 | image_db = feather.read_dataframe(cache_loc + 'image_database.fthr')
38 |
39 | df = df[['itemID_1', 'itemID_2', 'images_array_1', 'images_array_2']]
40 |
41 | start = time.time()
42 | print('Preparing imageDB ... ', end='', flush=True)
43 | image_db.index = image_db['image']
44 | nhash = image_db['FreqOfHash'].to_dict()
45 | ihash = image_db['imagehash'].to_dict()
46 | a.print_elapsed(start)
47 |
48 | def process_row(row):
49 | id1 = row[0]
50 | id2 = row[1]
51 | array_x = row[2]
52 | array_y = row[3]
53 |
54 | if array_x is not None:
55 | aux_x = array_x.replace(' ', '').split(',')
56 | else:
57 | aux_x = []
58 | if array_y is not None:
59 | aux_y = array_y.replace(' ', '').split(',')
60 | else:
61 | aux_y = []
62 |
63 | icount = []
64 | missing = 0
65 | minhamming = 99999
66 | minhamming30 = 99999
67 | minhamming50 = 99999
68 | minhamming100 = 99999
69 | #maxn = 0
70 | for k in range(0, 9):
71 | icount.append(0)
72 |
73 | # Find out if some images are repeated very often
74 | maxnx = 0
75 | maxny = 0
76 | for ix in aux_x:
77 | ix = int(ix)
78 | if ix in nhash:
79 | if maxnx < nhash[ix]:
80 | maxnx = nhash[ix]
81 |
82 | for iy in aux_y:
83 | iy = int(iy)
84 | if iy in nhash:
85 | if maxny < nhash[iy]:
86 | maxny = nhash[iy]
87 |
88 | for ix in aux_x:
89 | for iy in aux_y:
90 | if ix in ihash and iy in ihash:
91 | try:
92 |                         ha = int('0x' + ihash[ix], 16)  # local names avoid shadowing the libavito alias `a`
93 |                         hb = int('0x' + ihash[iy], 16)
94 |                         hamming = bin(ha ^ hb).count("1")
95 | if hamming < 9:
96 | icount[hamming] = icount[hamming] + 1
97 |
98 | if hamming < minhamming:
99 | minhamming = hamming
100 |
101 | if nhash[ix] < 100 and nhash[iy] < 100:
102 | if minhamming100 > hamming:
103 | minhamming100 = hamming
104 |
105 | if nhash[ix] < 30 and nhash[iy] < 30:
106 | if minhamming30 > hamming:
107 | minhamming30 = hamming
108 |
109 | if nhash[ix] < 50 and nhash[iy] < 50:
110 | if minhamming50 > hamming:
111 | minhamming50 = hamming
112 |
113 | except:
114 | pass
115 | #debug(['break', ix, iy])
116 | else:
117 | #debug(['missing', ix, iy])
118 | missing = missing + 1
119 |
120 | vals = [id1, id2] + icount + [missing, minhamming, maxnx, maxny, minhamming30, minhamming50, minhamming100]
121 | if min(len(aux_x), len(aux_y)) > 0:
122 | return vals
123 | else:
124 | return [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
125 |
126 | ftrs = []
127 |
128 | start = time.time()
129 | o = len(df.index)
130 | if nthreads == 1:
131 | print('Extracting features with 1 thread ...')
132 | k = 0
133 | # Iterate over files
134 | ftrs = []
135 | for row in df.values:
136 | x = process_row(row)
137 | ftrs.append(x)
138 | k += 1
139 | if k % 100 == 0:
140 | a.print_progress(k, start, o)
141 |
142 | # Otherwise perform multi-threaded mapping
143 | else:
144 | print('Extracting features multi-threaded ... ', end='', flush=True)
145 | pool = Pool(nthreads)
146 | ftrs = pool.map(process_row, df.values)
147 | pool.close()
148 | gc.collect()
149 |
150 | a.print_elapsed(start)
151 |
152 | ftrs = pd.DataFrame(ftrs)
153 | ftrs = ftrs.loc[ftrs[0] > 0]
154 | cols = ['itemID_1', 'itemID_2'] + [str(c) for c in ['ham' + str(i) for i in range(9)] + ['miss', 'minham', 'maxnx', 'maxny', 'minham30', 'minham50', 'minham100']]
155 | print(cols)
156 | ftrs.columns = cols
157 |
158 | # Save updated dataset
159 | if mode == 0:
160 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set3f.fthr')
161 | if mode == 1:
162 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set3f.fthr')
163 |
164 | a.print_elapsed(start)
165 | print('set3f extraction complete!')
166 |
167 | # Write status to status file so master script knows whether to proceed.
168 | f = open(cache_loc + 'status.txt', 'a')
169 | f.write('feature_set3f_OK\n')
170 | f.close()
171 |
--------------------------------------------------------------------------------
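At its core, 3_feature_set3f_hamming.py parses each stored hex dHash back to a 64-bit integer and takes the popcount of the XOR as the Hamming distance; the rest is bookkeeping of per-pair minima, a small histogram of distances, and frequency caps derived from FreqOfHash. The comparison itself, as a short sketch:

def dhash_hamming(hex_hash_1, hex_hash_2):
    """Number of differing bits between two 64-bit dHashes stored as hex strings."""
    return bin(int(hex_hash_1, 16) ^ int(hex_hash_2, 16)).count('1')

print(dhash_hamming('f0f0f0f0f0f0f0f0', 'f0f0f0f0f0f0f0ff'))  # 4 bits differ
print(dhash_hamming('f0f0f0f0f0f0f0f0', 'f0f0f0f0f0f0f0f0'))  # 0: identical hashes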
/Kaggle/Avito Duplicate Ad Detection/code/2_image_info.py:
--------------------------------------------------------------------------------
1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants
2 | #### Author: Mikel
3 | #### Avito Duplicate Ad Detection
4 | # 2_image_info.py
5 | # Creates a database of images and metadata about them, including dHash
6 |
7 | import numpy as np
8 | import pandas as pd
9 | import cv2
10 | import feather
11 | import glob
12 | import sys
13 | import time
14 | import os
15 | import gc
16 | from multiprocessing import Pool
17 | from PIL import Image
18 | from collections import Counter
19 |
20 | import libavito as a
21 |
22 | print(a.c.BOLD + 'Generating image info ...' + a.c.END)
23 |
24 | # Get train/test mode from launch argument
25 | mode = a.get_mode(sys.argv, '2_image_info.py')
26 |
27 | ## Read settings required by script
28 | config = a.read_config()
29 | nthreads = config.preprocessing_nthreads
30 | cache_loc = config.cache_loc
31 | debug = config.debug
32 | root = config.images_root
33 |
34 | # Function to compute difference hash of image
35 | def DifferenceHash(img):
36 | theImage = Image.fromarray(img)
37 | # Convert the image to 8-bit grayscale.
38 | theImage = theImage.convert("L") # 8-bit grayscale
39 | # Squeeze it down to an 8x8 image.
40 | theImage = theImage.resize((8, 8), Image.ANTIALIAS)
41 | # Go through the image pixel by pixel.
42 | # Return 1-bits when a pixel is equal to or brighter than the previous
43 | # pixel, and 0-bits when it's below.
44 | # Use the 64th pixel as the 0th pixel.
45 | previousPixel = theImage.getpixel((0, 7))
46 | differenceHash = 0
47 | for row in range(0, 8, 2):
48 | # Go left to right on odd rows.
49 | for col in range(8):
50 | differenceHash <<= 1
51 | pixel = theImage.getpixel((col, row))
52 | differenceHash |= 1 * (pixel >= previousPixel)
53 | previousPixel = pixel
54 | row += 1
55 | # Go right to left on even rows.
56 | for col in range(7, -1, -1):
57 | differenceHash <<= 1
58 | pixel = theImage.getpixel((col, row))
59 | differenceHash |= 1 * (pixel >= previousPixel)
60 | previousPixel = pixel
61 | return differenceHash
62 |
63 | def get_info(file_loc):
64 | try:
65 | # Get size of image
66 | size = os.path.getsize(file_loc)
67 |
68 | # Attempt to load image
69 | img = cv2.imread(file_loc)
70 | try:
71 | # Test if image is corrupt
72 | assert img.shape[0] * img.shape[1] > 0
73 | except:
74 | print('[WARNING] Image ' + file_loc + ' is corrupt, skipping.')
75 | raise
76 |
77 | # Get image metadata
78 | width = img.shape[1]
79 | height = img.shape[0]
80 |
81 | # Get ratio of image dimensions
82 | ratio = round(min(width, height) / max(width, height), 2)
83 |
84 | # Compute difference hash of image and convert to hex
85 | dhash = '%(hash)016x' % {"hash": DifferenceHash(img)}
86 |
87 | return [width, height, ratio, dhash, size]
88 |
89 | except KeyboardInterrupt:
90 | raise
91 | except:
92 | print('[WARNING] Image ' + file_loc + ' failed to process.')
93 | return [np.nan, np.nan, np.nan, np.nan, np.nan]
94 |
95 | def process_line(f):
96 | # Get image ID
97 | img_id = f.split('/')[-1].split('.')[0]
98 | # Retrieve info for image
99 | d = get_info(f)
100 | # Construct list and return
101 | info = []
102 | info.append(img_id)
103 | info.extend(d)
104 | return info
105 |
106 | # Recursively glob for jpeg files in the image root
107 | start = time.time()
108 | print('Looking for images in ' + root + ' ... ', end='', flush=True)
109 | files = glob.glob(root + '**/*.jpg', recursive=True)
110 | a.print_elapsed(start)
111 |
112 | print('Found ' + str(len(files)) + ' images.')
113 |
114 | l_id = []
115 | l_width = []
116 | l_height = []
117 | l_ratio = []
118 | l_hash = []
119 | l_size = []
120 | o = len(files)
121 | if nthreads == 1:
122 | print('Extracting image info with 1 thread ...')
123 | k = 0
124 | # Iterate over files
125 | for f in files:
126 | x = process_line(f)
127 | l_id.append(x[0])
128 | l_width.append(x[1])
129 | l_height.append(x[2])
130 | l_ratio.append(x[3])
131 | l_hash.append(x[4])
132 | l_size.append(x[5])
133 | k += 1
134 | if k % 1000 == 0:
135 | a.print_progress(k, start, o)
136 | # Otherwise perform multi-threaded mapping
137 | else:
138 | print('Extracting image info multi-threaded ... ', end='', flush=True)
139 | pool = Pool(nthreads)
140 | newdata = pool.map(process_line, files)
141 | pool.close()
142 | for x in newdata:
143 | l_id.append(x[0])
144 | l_width.append(x[1])
145 | l_height.append(x[2])
146 | l_ratio.append(x[3])
147 | l_hash.append(x[4])
148 | l_size.append(x[5])
149 | del newdata
150 | gc.collect()
151 |
152 | a.print_elapsed(start)
153 |
154 | print('Finding hash-counts ...', end='', flush=True)
155 | start = time.time()
156 | counttable = Counter(l_hash)
157 | l_hashcount = []
158 | for h in l_hash:
159 | l_hashcount.append(counttable[h])
160 | a.print_elapsed(start)
161 |
162 | # Bind lists to dataframe
163 | df = pd.DataFrame()
164 | df['image'] = l_id
165 | df['width'] = l_width
166 | df['height'] = l_height
167 | df['ratioOfDimension'] = l_ratio
168 | df['imagehash'] = l_hash
169 | df['FreqOfHash'] = l_hashcount
170 | df['imagesize'] = l_size
171 |
172 | start = time.time()
173 | print('Caching image data ... ', end='', flush=True)
174 |
175 | # Save updated dataset
176 | feather.write_dataframe(df, cache_loc + 'image_database.fthr')
177 | df.to_csv(cache_loc + 'image_database.csv', index=False)
178 |
179 | a.print_elapsed(start)
180 | print('Image info extraction complete!')
181 |
182 | # Write status to status file so master script knows whether to proceed.
183 | f = open(cache_loc + 'status.txt', 'a')
184 | f.write('image_info_OK\n')
185 | f.close()
186 |
--------------------------------------------------------------------------------
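DifferenceHash in 2_image_info.py is a serpentine 8x8 variant of the difference hash; the classic formulation resizes to a 9-wide, 8-high grayscale image and sets a bit wherever a pixel is at least as bright as its left neighbour, giving 64 bits per image. A minimal NumPy sketch of that row-wise version, assuming the grayscale resize has already been done:

import numpy as np

def dhash_from_gray(gray_8x9):
    """Row-wise dHash of an 8x9 grayscale array: one bit per left-neighbour comparison."""
    assert gray_8x9.shape == (8, 9)
    bits = (gray_8x9[:, 1:] >= gray_8x9[:, :-1]).flatten()  # 8 rows * 8 comparisons = 64 bits
    value = 0
    for bit in bits:
        value = (value << 1) | int(bit)
    return '%016x' % value  # same zero-padded hex format as the image database

rng = np.random.default_rng(0)
print(dhash_from_gray(rng.integers(0, 256, size=(8, 9))))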
/Kaggle/Avito Duplicate Ad Detection/code/3_feature_set1a_ngram.R:
--------------------------------------------------------------------------------
1 | ################################################################################################
2 | ################################################################################################
3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants
4 | #### Competition: Avito Duplicate Ad Detection
5 | # Filename : 3_feature_set1a_ngram.R
6 | # Description: This Rscript generates all ngram features
7 | # Usage:
8 | # Rscript ./code/3_feature_set1a_ngram.R train
9 | # Rscript ./code/3_feature_set1a_ngram.R test
10 | # Default argument is test
11 | ################################################################################################
12 | ################################################################################################
13 |
14 | args <- commandArgs(trailingOnly = F)
15 | BASE <- normalizePath(dirname(sub("^--file=", "", args[grep("^--file=", args)])))
16 |
17 | # Source Config and functions.R file
18 | source(paste(BASE, "/../config.cfg", sep = ""))
19 | source(paste(BASE_DIR, "/code/functions.R", sep = ""))
20 |
21 | #Load any additional packages
22 | library(parallel)
23 | library(stylo)
24 | library(stringr)
25 | library(tm)
26 |
27 | # Read argument for train or test
28 | trainOrTest <- commandArgs(trailingOnly = TRUE)
29 | if (length(trainOrTest) > 1) {
30 | stop("ERROR: I need only 1 argument : train or test")
31 | }
32 |
33 | if (length(trainOrTest) == 0) {
34 | print("No Arguments passed, Assuming you mean test")
35 | trainOrTest <- "test"
36 | }
37 |
38 | #Load data
39 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "")
40 | cat("Reading file ", FILENAME, "\n", sep = " ")
41 | dat <- read_csv(FILENAME)
42 |
43 | #######################################
44 | # Start generating Features for DESCRIPTION columns
45 | print("Start generating nGrams Features for DESCRIPTION columns")
46 | for (n in 1:3) {
47 | print(n)
48 | df2 <- data.frame(t(mcmapply(getNgramsCount, dat$cleandesc_1, dat$cleandesc_2, n, USE.NAMES = FALSE, mc.cores = preprocessing_nthreads)))
49 | colnames(df2) <- c(
50 | paste("countOf_", n, "_Grams_description_min", sep = ""),
51 | paste("countOf_", n, "_Grams_description_max", sep = ""),
52 | paste("countOf_", n, "_Grams_description_sum", sep = ""),
53 | paste("countOf_", n, "_Grams_description_diff", sep = ""),
54 |
55 | paste("countOf_", n, "_Grams_cleandesc_1", sep = ""),
56 | paste("countOf_", n, "_Grams_cleandesc_2", sep = ""),
57 | paste("countOfUnique_", n, "_Grams_cleandesc_1", sep = ""),
58 | paste("countOfUnique_", n, "_Grams_cleandesc_2", sep = ""),
59 | paste("ratioOf_", n, "_Grams_cleandesc_1_cleandesc_2", sep = ""),
60 | paste("ratioOfUnique_", n, "_Grams_cleandesc_1_cleandesc_2", sep = ""),
61 | paste("ratioOfIntersect_", n, "_Grams_cleandesc_1_in_cleandesc_2", sep = ""),
62 | paste("ratioOfIntersect_", n, "_Grams_cleandesc_2_in_cleandesc_1", sep = "")
63 | )
64 | if (nrow(df2) != nrow(dat)) {
65 | cat("Expecting", nrow(dat), "Got", nrow(df2), "\n", sep = " ")
66 |     stop("mcmapply is behaving weirdly. Getting fewer results than expected")
67 | }
68 |
69 | if (exists("df_master")) {
70 | df_master <- bind_cols(df_master, df2)
71 | } else {
72 | df_master <- df2
73 | }
74 | }
75 | names(df_master) <- paste("set1a", names(df_master), sep = "_")
76 |
77 | ######## Add Primary Columns ItemID1 and ItemID2
78 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master)
79 | print("Saving Description ngrams features")
80 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1a_", "ngram_description.fthr", sep = "" ))
81 | rm(df_master, df2)
82 | gc()
83 |
84 |
85 | #######################################
86 | # Start generating Features for TITLE columns
87 | print("Start generating nGrams Features for TITLE columns")
88 | for (n in 1:3) {
89 | print(n)
90 | df2 <- data.frame(t(mcmapply(getNgramsCount, dat$cleantitle_1, dat$cleantitle_2, n, USE.NAMES = FALSE, mc.cores = preprocessing_nthreads)))
91 | colnames(df2) <- c(
92 | paste("countOf_", n, "_Grams_title_min", sep = ""),
93 | paste("countOf_", n, "_Grams_title_max", sep = ""),
94 | paste("countOf_", n, "_Grams_title_sum", sep = ""),
95 | paste("countOf_", n, "_Grams_title_diff", sep = ""),
96 |
97 | paste("countOf_", n, "_Grams_cleantitle_1", sep = ""),
98 | paste("countOf_", n, "_Grams_cleantitle_2", sep = ""),
99 | paste("countOfUnique_", n, "_Grams_cleantitle_1", sep = ""),
100 | paste("countOfUnique_", n, "_Grams_cleantitle_2", sep = ""),
101 | paste("ratioOf_", n, "_Grams_cleantitle_1_cleantitle_2", sep = ""),
102 | paste("ratioOfUnique_", n, "_Grams_cleantitle_1_cleantitle_2", sep = ""),
103 | paste("ratioOfIntersect_", n, "_Grams_cleantitle_1_in_cleantitle_2", sep = ""),
104 | paste("ratioOfIntersect_", n, "_Grams_cleantitle_2_in_cleantitle_1", sep = "")
105 | )
106 |
107 | if (nrow(df2) != nrow(dat)) {
108 | cat("Expecting", nrow(dat), "Got", nrow(df2), "\n", sep = " ")
109 |     stop("mcmapply is behaving weirdly. Getting fewer results than expected")
110 | }
111 |
112 | if (exists("df_master")) {
113 | df_master <- bind_cols(df_master, df2)
114 | } else {
115 | df_master <- df2
116 | }
117 | }
118 | names(df_master) <- paste("set1a", names(df_master), sep = "_")
119 |
120 | ######## Add Primary Columns ItemID1 and ItemID2
121 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master)
122 | print("Saving Title ngrams features")
123 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1a_", "ngram_title.fthr", sep = "" ))
124 | rm(df_master, df2)
125 | gc()
126 |
127 | #END
128 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/3_feature_set1b_nchar.R:
--------------------------------------------------------------------------------
1 | ################################################################################################
2 | ################################################################################################
3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants
4 | #### Competition: Avito Duplicate Ad Detection
5 | # Filename : 3_feature_set1b_nchar.R
6 | # Description: This Rscript generates all nchar features
7 | # Usage:
8 | # Rscript ./code/3_feature_set1b_nchar.R train
9 | # Rscript ./code/3_feature_set1b_nchar.R test
10 | # Default argument is test
11 | ################################################################################################
12 | ################################################################################################
13 |
14 | args <- commandArgs(trailingOnly = F)
15 | BASE <- normalizePath(dirname(sub("^--file=", "", args[grep("^--file=", args)])))
16 |
17 | # Source Config and functions.R file
18 | source(paste(BASE, "/../config.cfg", sep = ""))
19 | source(paste(BASE_DIR, "/code/functions.R", sep = ""))
20 |
21 | #Load any additional packages
22 | library(parallel)
23 | library(stylo)
24 | library(stringr)
25 | library(tm)
26 |
27 | # Read argument for train or test
28 | trainOrTest <- commandArgs(trailingOnly = TRUE)
29 | if (length(trainOrTest) > 1) {
30 | stop("ERROR: I need only 1 argument : train or test")
31 | }
32 |
33 | if (length(trainOrTest) == 0) {
34 | print("No Arguments passed, Assuming you mean test")
35 | trainOrTest <- "test"
36 | }
37 |
38 | #Load data
39 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "")
40 | cat("Reading file ", FILENAME, "\n", sep = " ")
41 | dat <- read_csv(FILENAME)
42 |
43 |
44 | #######################################
45 | # Start generating Features for DESCRIPTION columns
46 | print("Start generating nChars Features for DESCRIPTION columns")
47 | for (n in 1:3) {
48 | print(n)
49 | df2 <- data.frame(t(mcmapply(getNcharsCount, dat$cleandesc_1, dat$cleandesc_2, n, USE.NAMES = FALSE, mc.cores = preprocessing_nthreads)))
50 | colnames(df2) <- c(
51 | paste("countOf_", n, "_Chars_description_min", sep = ""),
52 | paste("countOf_", n, "_Chars_description_max", sep = ""),
53 | paste("countOf_", n, "_Chars_description_sum", sep = ""),
54 | paste("countOf_", n, "_Chars_description_diff", sep = ""),
55 |
56 | paste("countOf_", n, "_Chars_cleandesc_1", sep = ""),
57 | paste("countOf_", n, "_Chars_cleandesc_2", sep = ""),
58 | paste("countOfUnique_", n, "_Chars_cleandesc_1", sep = ""),
59 | paste("countOfUnique_", n, "_Chars_cleandesc_2", sep = ""),
60 | paste("ratioOf_", n, "_Chars_cleandesc_1_cleandesc_2", sep = ""),
61 | paste("ratioOfUnique_", n, "_Chars_cleandesc_1_cleandesc_2", sep = ""),
62 | paste("ratioOfIntersect_", n, "_chars_cleandesc_1_in_cleandesc_2", sep = ""),
63 | paste("ratioOfIntersect_", n, "_chars_cleandesc_2_in_cleandesc_1", sep = "")
64 | )
65 | if (nrow(df2) != nrow(dat)) {
66 | cat("Expecting", nrow(dat), "Got", nrow(df2), "\n", sep = " ")
67 |     stop("mcmapply is behaving weirdly. Getting fewer results than expected")
68 | }
69 |
70 | if (exists("df_master")) {
71 | df_master <- bind_cols(df_master, df2)
72 | } else {
73 | df_master <- df2
74 | }
75 | }
76 |
77 | names(df_master) <- paste("set1b", names(df_master), sep = "_")
78 |
79 | ######## Add Primary Columns ItemID1 and ItemID2
80 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master)
81 | print("Saving Description nchars features")
82 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1b_", "nchar_description.fthr", sep = "" ))
83 | rm(df_master, df2)
84 | gc()
85 |
86 | #######################################
87 | # Start generating Features for TITLE columns
88 | print("Start generating nChars Features for TITLE columns")
89 | for (n in 1:3) {
90 | print(n)
91 | df2 <- data.frame(t(mcmapply(getNcharsCount, dat$cleantitle_1, dat$cleantitle_2, n, USE.NAMES = FALSE, mc.cores = preprocessing_nthreads)))
92 | colnames(df2) <- c(
93 | paste("countOf_", n, "_Chars_title_min", sep = ""),
94 | paste("countOf_", n, "_Chars_title_max", sep = ""),
95 | paste("countOf_", n, "_Chars_title_sum", sep = ""),
96 | paste("countOf_", n, "_Chars_title_diff", sep = ""),
97 |
98 | paste("countOf_", n, "_Chars_cleantitle_1", sep = ""),
99 | paste("countOf_", n, "_Chars_cleantitle_2", sep = ""),
100 | paste("countOfUnique_", n, "_Chars_cleantitle_1", sep = ""),
101 | paste("countOfUnique_", n, "_Chars_cleantitle_2", sep = ""),
102 | paste("ratioOf_", n, "_Chars_cleantitle_1_cleantitle_2", sep = ""),
103 | paste("ratioOfUnique_", n, "_Chars_cleantitle_1_cleantitle_2", sep = ""),
104 | paste("ratioOfIntersect_", n, "_chars_cleantitle_1_in_cleantitle_2", sep = ""),
105 | paste("ratioOfIntersect_", n, "_chars_cleantitle_2_in_cleantitle_1", sep = "")
106 | )
107 | if (nrow(df2) != nrow(dat)) {
108 | cat("Expecting", nrow(dat), "Got", nrow(df2), "\n", sep = " ")
109 |     stop("mcmapply is behaving weirdly. Getting fewer results than expected")
110 | }
111 |
112 | if (exists("df_master")) {
113 | df_master <- bind_cols(df_master, df2)
114 | } else {
115 | df_master <- df2
116 | }
117 | }
118 | names(df_master) <- paste("set1b", names(df_master), sep = "_")
119 |
120 | ######## Add Primary Columns ItemID1 and ItemID2
121 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master)
122 | print("Saving Title nchars features")
123 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1b_", "nchar_title.fthr", sep = "" ))
124 | rm(df_master, df2)
125 | gc()
126 |
127 | #END
128 |
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/5_consolidate_features.R:
--------------------------------------------------------------------------------
1 | ################################################################################################
2 | ################################################################################################
3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants
4 | #### Competition: Avito Duplicate Ad Detection
5 | # Filename : 5_consolidate_features.R
6 | # Description: This Rscript joins all cached feature sets into the final feature file
7 | # Usage:
8 |             Rscript ./code/5_consolidate_features.R train
9 |             Rscript ./code/5_consolidate_features.R test
10 | # Default argument is test
11 | ################################################################################################
12 | ################################################################################################
13 |
14 | # Source Config and functions.R file
15 | source("config.cfg")
16 | source("./code/functions.R")
17 |
18 | library(readr)
19 | library(dplyr)
20 | library(feather)
21 |
22 |
23 | # Read argument for train or test
24 | trainOrTest <- commandArgs(trailingOnly = TRUE)
25 | if (length(trainOrTest) > 1) {
26 | stop("ERROR: I need only 1 argument : train or test")
27 | }
28 |
29 | if (length(trainOrTest) == 0) {
30 | print("No Arguments passed, Assuming you mean test")
31 | trainOrTest <- "test"
32 | }
33 |
34 | #Load data
35 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "")
36 | cat("Reading file ", FILENAME, "\n", sep = " ")
37 | completeDate <- read_csv(FILENAME)
38 | if (trainOrTest == "train") {
39 | completeDate <- completeDate[, c("itemID_1", "itemID_2", "isDuplicate")]
40 | gc()
41 | } else {
42 | completeDate <- completeDate[, c("id", "itemID_1", "itemID_2")]
43 | gc()
44 | }
45 |
46 | ngram_title <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1a_ngram_title.fthr", sep = "" ))
47 | completeDate <- left_join(completeDate, ngram_title, by = c("itemID_1", "itemID_2"))
48 | rm(ngram_title)
49 |
50 | ngram_description <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1a_ngram_description.fthr", sep = "" ))
51 | completeDate <- left_join(completeDate, ngram_description, by = c("itemID_1", "itemID_2"))
52 | rm(ngram_description)
53 |
54 | nchar_title <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1b_nchar_title.fthr", sep = "" ))
55 | completeDate <- left_join(completeDate, nchar_title, by = c("itemID_1", "itemID_2"))
56 | rm(nchar_title)
57 |
58 | nchar_description <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1b_nchar_description.fthr", sep = "" ))
59 | completeDate <- left_join(completeDate, nchar_description, by = c("itemID_1", "itemID_2"))
60 | rm(nchar_description)
61 |
62 | misc <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1c_misc.fthr", sep = "" ))
63 | completeDate <- left_join(completeDate, misc, by = c("itemID_1", "itemID_2"))
64 | rm(misc)
65 |
66 | interaction <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1d_interaction.fthr", sep = "" ))
67 | completeDate <- left_join(completeDate, interaction, by = c("itemID_1", "itemID_2"))
68 | rm(interaction)
69 |
70 | attributes <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1e_attributes.fthr", sep = "" ))
71 | completeDate <- left_join(completeDate, attributes, by = c("itemID_1", "itemID_2"))
72 | rm(attributes)
73 |
74 | specialCounting <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1f_specialCounting.fthr", sep = "" ))
75 | completeDate <- left_join(completeDate, specialCounting, by = c("itemID_1", "itemID_2"))
76 | rm(specialCounting)
77 |
78 | capitalLetters <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1g_capitalLetters.fthr", sep = "" ))
79 | completeDate <- left_join(completeDate, capitalLetters, by = c("itemID_1", "itemID_2"))
80 | rm(capitalLetters)
81 |
82 | image <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1h_image.fthr", sep = "" ))
83 | completeDate <- left_join(completeDate, image, by = c("itemID_1", "itemID_2"))
84 | rm(image)
85 |
86 | imageSize <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set1i_imageSize.fthr", sep = "" ))
87 | completeDate <- left_join(completeDate, imageSize, by = c("itemID_1", "itemID_2"))
88 | rm(imageSize)
89 |
90 |
91 |
92 | location_levenshtein <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set2a_location_levenshtein.fthr", sep = "" ))
93 | completeDate <- left_join(completeDate, location_levenshtein, by = c("itemID_1", "itemID_2"))
94 | rm(location_levenshtein)
95 |
96 | brisk <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set2b_brisk.fthr", sep = "" ))
97 | completeDate <- left_join(completeDate, brisk, by = c("itemID_1", "itemID_2"))
98 | rm(brisk)
99 |
100 | histogram <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set2c_histogram.fthr", sep = "" ))
101 | completeDate <- left_join(completeDate, histogram, by = c("itemID_1", "itemID_2"))
102 | rm(histogram)
103 |
104 |
105 | consolidated <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set3_consolidated.fthr", sep = "" ))
106 | completeDate <- left_join(completeDate, consolidated, by = c("itemID_1", "itemID_2"))
107 | rm(consolidated)
108 |
109 |
110 | fuzzy <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set4a_fuzzy.fthr", sep = "" ))
111 | completeDate <- left_join(completeDate, fuzzy, by = c("itemID_1", "itemID_2"))
112 | rm(fuzzy)
113 |
114 | fuzzy_clean <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set4b_fuzzy_clean.fthr", sep = "" ))
115 | completeDate <- left_join(completeDate, fuzzy_clean, by = c("itemID_1", "itemID_2"))
116 | rm(fuzzy_clean)
117 |
118 | alternate <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set4c_alternate.fthr", sep = "" ))
119 | completeDate <- left_join(completeDate, alternate, by = c("itemID_1", "itemID_2"))
120 | rm(alternate)
121 |
122 | similarity <- read_feather(paste(cache_loc, "/features_",trainOrTest, "_set4d_similarity.fthr", sep = "" ))
123 | completeDate <- left_join(completeDate, similarity, by = c("itemID_1", "itemID_2"))
124 | rm(similarity)
125 | gc()
126 |
127 | print("Saving Final Files")
128 | write_feather(completeDate, paste("cache/final_featureSet_", trainOrTest, ".fthr", sep = "" ))
129 | print("DONE")
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
--------------------------------------------------------------------------------
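5_consolidate_features.R is a chain of left joins of every cached feature file onto the (itemID_1, itemID_2) key pairs (plus the label for train, the id for test). A toy pandas sketch of the same join pattern, with made-up frames standing in for the feather files:

import pandas as pd

# Stand-ins for the key/label frame and two cached feature sets
base = pd.DataFrame({'itemID_1': [1, 3], 'itemID_2': [2, 4], 'isDuplicate': [1, 0]})
set1a = pd.DataFrame({'itemID_1': [1], 'itemID_2': [2], 'set1a_countOf_1_Grams_title_min': [5]})
set1c = pd.DataFrame({'itemID_1': [3], 'itemID_2': [4], 'set1c_priceDiff': [1500.0]})

# Same pattern as the R script: keep the keys and left-join every feature set onto them
consolidated = base
for feature_set in (set1a, set1c):
    consolidated = consolidated.merge(feature_set, on=['itemID_1', 'itemID_2'], how='left')

print(consolidated)  # pairs missing from a feature set show up as NaN, cleaned later by 5_data_postprocessing.py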
/Kaggle/Avito Duplicate Ad Detection/code/3_feature_set4c_alternate.py:
--------------------------------------------------------------------------------
1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants
2 | #### Author: Marios & Mikel
3 | #### Avito Duplicate Ad Detection
4 | # 3_feature_set4c_alternate.py
5 | # Creates various text similarity features
6 |
7 | import numpy as np
8 | import pandas as pd
9 | import sys
10 | import jellyfish
11 | import feather
12 | import time
13 | import gc
14 | import re
15 | import math
16 | from collections import Counter
17 | from fuzzywuzzy import fuzz
18 | from multiprocessing import Pool
19 |
20 | import libavito as a
21 |
22 | WORD = re.compile(r'\w+')
23 |
24 | def get_cosine(text1, text2):
25 | vec1 = text_to_vector(text1)
26 | vec2 = text_to_vector(text2)
27 | intersection = set(vec1.keys()) & set(vec2.keys())
28 | numerator = sum([vec1[x] * vec2[x] for x in intersection])
29 |
30 | sum1 = sum([vec1[x]**2 for x in vec1.keys()])
31 | sum2 = sum([vec2[x]**2 for x in vec2.keys()])
32 | denominator = math.sqrt(sum1) * math.sqrt(sum2)
33 |
34 | if not denominator:
35 | return 0.0
36 | else:
37 | return float(numerator) / denominator
38 |
39 | def text_to_vector(text):
40 | words = WORD.findall(text)
41 | return Counter(words)
42 |
43 |
44 | def count_2words_together(words, text, ranges):
45 | count2 = 0
46 | if len(words) < 2 or len(text) < 2:
47 | return -1
48 | else:
49 | for m in range(0, len(words) - 1):
50 | words1 = words[m]
51 | for n in range(m + 1, len(words)):
52 | words2 = words[n]
53 | if words1 in text:
54 | ind = text.index(words1)
55 |                     # count the pair only when words2 appears within
56 |                     # `ranges` words after the position of words1
57 |                     if words2 in text[ind + 1:ind + 1 + ranges]:
58 |                         count2 += 1
59 |
60 | return count2
61 |
62 | def count_2words(words, text):
63 | # To count how many times of the search terms having two words at least showing in texts.
64 | count2 = 0
65 | if len(words) < 2 or len(text) < 2:
66 | return -1
67 | else:
68 | for m in range(0, len(words) - 1):
69 | words1 = words[m]
70 | for n in range(m + 1, len(words)):
71 | words2 = words[n]
72 | if words1 in text and words2 in text:
73 | count2 += 1
74 | return count2
75 |
76 | def calculate_similarity_simple(str1, str2):
77 | count = 0
78 | if str1 in str2:
79 | count = 1
80 | return count
81 |
82 | def calculate_similarity_split(str1, str2):
83 | count = 0
84 | countabs = 0
85 | countper = 0
86 | split1 = str1.split(" ")
87 | split2 = str2.split(" ")
88 | for s1 in split1:
89 | for s2 in split2:
90 | if s1 in s2:
91 | count += 1
92 | if s1 == s2:
93 | countabs += 1
94 | countper += 1
95 |
96 | return count, countabs, countabs / (countper + 1)
97 |
98 | def process_row(row):
99 |
100 | title = 2
101 | desc = 4
102 | json = 6
103 |
104 | pairs = [[title, desc], [desc, title], [title, json], [json, title], [desc, json], [json, desc]]
105 | values = []
106 | # string feature counts
107 |
108 | values.append(row[0])
109 | values.append(row[1])
110 |
111 | for d, s in pairs:
112 | st_1 = str(row[d]).replace(":", " ")
113 | st_2 = str(row[s + 1]).replace(":", " ")
114 | values.append(calculate_similarity_simple(st_1, st_2))
115 | val1, val2, val3 = calculate_similarity_split(st_1, st_2)
116 | values.append(val1)
117 | values.append(val2)
118 | values.append(val3)
119 | st_1_array = st_1.split(" ")
120 | st_2_array = st_2.split(" ")
121 | values.append(count_2words(st_1_array, st_2_array))
122 | values.append(get_cosine(st_1, st_2))
123 | values.append(count_2words_together(st_1_array, st_2_array, 1))
124 | values.append(count_2words_together(st_1_array, st_2_array, 5))
125 |
126 | return values
127 |
128 | print(a.c.BOLD + 'Extracting set4c alternate text features ...' + a.c.END)
129 |
130 | # Get train/test mode from launch argument
131 | mode = a.get_mode(sys.argv, '3_feature_set4c_alternate.py')
132 |
133 | ## Read settings required by script
134 | config = a.read_config()
135 | nthreads = config.preprocessing_nthreads
136 | cache_loc = config.cache_loc
137 | debug = config.debug
138 | if mode == 0:
139 | root = config.train_images_root
140 | df = feather.read_dataframe(cache_loc + 'train.fthr')
141 | if mode == 1:
142 | root = config.test_images_root
143 | df = feather.read_dataframe(cache_loc + 'test.fthr')
144 |
145 | df = df[['itemID_1', 'itemID_2', 'title_1', 'title_2', 'description_1', 'description_2', 'attrsJSON_1', 'attrsJSON_2']]
146 |
147 | ftrs = []
148 |
149 | start = time.time()
150 | o = len(df.index)
151 | if nthreads == 1:
152 | print('Extracting features with 1 thread ...')
153 | k = 0
154 | # Iterate over files
155 | ftrs = []
156 | for row in df.values:
157 | x = process_row(row)
158 | ftrs.append(x)
159 | k += 1
160 | if k % 100 == 0:
161 | a.print_progress(k, start, o)
162 |
163 | # Otherwise perform multi-threaded mapping
164 | else:
165 | print('Extracting features multi-threaded ... ', end='', flush=True)
166 | pool = Pool(nthreads)
167 | ftrs = pool.map(process_row, df.values)
168 | pool.close()
169 | gc.collect()
170 |
171 | a.print_elapsed(start)
172 |
173 | ftrs = pd.DataFrame(ftrs)
174 | cols = ['itemID_1', 'itemID_2'] + ['set4c_X' + str(i) for i in range(1, len(ftrs.columns.tolist()) - 1)]
175 | print(cols)
176 | ftrs.columns = cols
177 |
178 | # Save updated dataset
179 | if mode == 0:
180 | feather.write_dataframe(ftrs, cache_loc + 'features_train_set4c_alternate.fthr')
181 | if mode == 1:
182 | feather.write_dataframe(ftrs, cache_loc + 'features_test_set4c_alternate.fthr')
183 |
184 | a.print_elapsed(start)
185 | print('set4c extraction complete!')
186 |
187 | # Write status to status file so master script knows whether to proceed.
188 | f = open(cache_loc + 'status.txt', 'a')
189 | f.write('feature_set4c_OK\n')
190 | f.close()
191 |
--------------------------------------------------------------------------------
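get_cosine in 3_feature_set4c_alternate.py treats each string as a bag of word counts and returns the cosine of the two count vectors. A compact restatement of the same computation with a usage example:

import math
import re
from collections import Counter

WORD = re.compile(r'\w+')

def cosine_of_counts(text_1, text_2):
    # Cosine similarity between the term-frequency vectors of two strings
    v1, v2 = Counter(WORD.findall(text_1)), Counter(WORD.findall(text_2))
    numerator = sum(v1[w] * v2[w] for w in v1.keys() & v2.keys())
    denominator = math.sqrt(sum(c * c for c in v1.values())) * math.sqrt(sum(c * c for c in v2.values()))
    return numerator / denominator if denominator else 0.0

print(round(cosine_of_counts('red bike almost new', 'red bike like new'), 3))  # 0.75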
/Kaggle/Avito Duplicate Ad Detection/code/3_feature_set1c_misc.R:
--------------------------------------------------------------------------------
1 | ################################################################################################
2 | ################################################################################################
3 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants
4 | #### Competition: Avito Duplicate Ad Detection
5 | # Filename : 3_feature_set1c_misc.R
6 | # Description: This Rscript generates miscellaneous pair features (ID/location matches, price, image-count and string-distance features)
7 | # Usage:
8 | # Rscript ./code/3_feature_set1c_misc.R train
9 | # Rscript ./code/3_feature_set1c_misc.R test
10 | # Default argument is test
11 | ################################################################################################
12 | ################################################################################################
13 |
14 | args <- commandArgs(trailingOnly = F)
15 | BASE <- normalizePath(dirname(sub("^--file=", "", args[grep("^--file=", args)])))
16 |
17 | # Source Config and functions.R file
18 | source(paste(BASE, "/../config.cfg", sep = ""))
19 | source(paste(BASE_DIR, "/code/functions.R", sep = ""))
20 |
21 | #Load any additional packages
22 | library(parallel)
23 | library(stylo)
24 |
25 | # Read argument for train or test
26 | trainOrTest <- commandArgs(trailingOnly = TRUE)
27 | if (length(trainOrTest) > 1) {
28 | stop("ERROR: I need only 1 argument : train or test")
29 | }
30 |
31 | if (length(trainOrTest) == 0) {
32 | print("No Arguments passed, Assuming you mean test")
33 | trainOrTest <- "test"
34 | }
35 |
36 | #Load data
37 | FILENAME <- paste(cache_loc, "/", trainOrTest, ".csv", sep = "")
38 | cat("Reading file ", FILENAME, "\n", sep = " ")
39 | dat <- read_csv(FILENAME)
40 |
41 |
42 | ######## IDs and Long and Lat Features
43 | print("Generating Binary features ")
44 | isMetroIdSame <- ifelse(dat$metroID_1 == dat$metroID_2, 1, 0)
45 | isLocationIDSame <- ifelse(dat$locationID_1 == dat$locationID_2, 1, 0)
46 | isRegionIDSame <- ifelse(dat$regionID_1 == dat$regionID_2, 1, 0)
47 | isLongitudeSame <- ifelse(round(dat$lon_1, 2) == round(dat$lon_2, 2), 1, 0)
48 | isLatitudeSame <- ifelse(round(dat$lat_1, 2) == round(dat$lat_2, 2), 1, 0)
49 | isTitleSame <- ifelse(tolower(dat$cleantitle_1) == tolower(dat$cleantitle_2), 1, 0) #isTitle Same
50 | isdescriptionSame <- ifelse(tolower(dat$cleandesc_1) == tolower(dat$cleandesc_2), 1, 0) #isdescription Same
51 |
52 | ######## PRICE Features
53 | print("Generating Price features ")
54 | priceDiff <- abs(dat$price_1 - dat$price_2)
55 | ratioOfPrices <- dat$price_1 / dat$price_2
56 | ratioOfPrices <- round(ifelse(ratioOfPrices > 1, 1/ratioOfPrices, ratioOfPrices), 3)
57 | both_price_na <- ifelse(is.na(dat$price_1) & is.na(dat$price_2), 1, 0) #Both Price NA
58 | one_price_na <- ifelse(is.na(dat$price_1) | is.na(dat$price_2), 1, 0) #One Price NA
59 | total_price <- (dat$price_1 + dat$price_2) #Total Price
60 |
61 |
62 | ######## IMAGE Features
63 | print("Generating Image features")
64 | library(stringr)
65 | imageCount_sum <- str_count(dat$images_array_1, '[0-9.]+') + str_count(dat$images_array_2, '[0-9.]+')
66 | imageCount_diff <- abs(str_count(dat$images_array_1, '[0-9.]+') - str_count(dat$images_array_2, '[0-9.]+'))
67 | imageCount_min <- pmin(str_count(dat$images_array_1, '[0-9.]+'), str_count(dat$images_array_2, '[0-9.]+'), na.rm = F)
68 | imageCount_max <- pmax(str_count(dat$images_array_1, '[0-9.]+'), str_count(dat$images_array_2, '[0-9.]+'), na.rm = F)
69 | ratioOfNumberOfImages <- str_count(dat$images_array_1, '[0-9.]+') / str_count(dat$images_array_2, '[0-9.]+')
70 | ratioOfNumberOfImages <- round(ifelse(ratioOfNumberOfImages > 1, 1/ratioOfNumberOfImages, ratioOfNumberOfImages), 3)
71 |
72 | ######## DISTANCE STRING Features
73 | library(stringdist)
74 | print("Generating Text Distance features for title")
75 | titleDistance_cosine <- round(mcmapply(stringdist, dat$cleantitle_1, dat$cleantitle_2, method = "cosine", USE.NAMES = F, mc.cores = preprocessing_nthreads), 3)
76 | titleDistance_hamming <- round(mcmapply(stringdist, dat$cleantitle_1, dat$cleantitle_2, method = "hamming", USE.NAMES = F, mc.cores = preprocessing_nthreads), 3)
77 | titleDistance_jaccard <- round(mcmapply(stringdist, dat$cleantitle_1, dat$cleantitle_2, method = "jaccard", USE.NAMES = F, mc.cores = preprocessing_nthreads), 3)
78 |
79 | print("Generating Text Distance features for description")
80 | descriptionDistance_cosine <- round(mcmapply(stringdist, dat$cleandesc_1, dat$cleandesc_2, method = "cosine", USE.NAMES = F, mc.cores = preprocessing_nthreads), 3)
81 |
82 | descriptionDistance_hamming <- round(mcmapply(stringdist, dat$cleandesc_1, dat$cleandesc_2, method = "hamming", USE.NAMES = F, mc.cores = preprocessing_nthreads), 3)
83 |
84 | descriptionDistance_jaccard <- round(mcmapply(stringdist, dat$cleandesc_1, dat$cleandesc_2, method = "jaccard", USE.NAMES = F, mc.cores = preprocessing_nthreads), 3)
85 |
86 |
87 | ######## DATA FRAME
88 | df_master <- data.frame( isMetroIdSame = isMetroIdSame,
89 | isLocationIDSame = isLocationIDSame,
90 | isRegionIDSame = isRegionIDSame,
91 | isLongitudeSame = isLongitudeSame,
92 | isLatitudeSame = isLatitudeSame,
93 | isTitleSame = isTitleSame,
94 | isdescriptionSame = isdescriptionSame,
95 | priceDiff = priceDiff,
96 | ratioOfPrices = ratioOfPrices,
97 | both_price_na = both_price_na,
98 | one_price_na = one_price_na,
99 | total_price = total_price,
100 | imageCount_sum = imageCount_sum,
101 | imageCount_diff = imageCount_diff,
102 | imageCount_min = imageCount_min,
103 | imageCount_max = imageCount_max,
104 | ratioOfNumberOfImages = ratioOfNumberOfImages,
105 | titleDistance_cosine = titleDistance_cosine,
106 | titleDistance_hamming = titleDistance_hamming,
107 | titleDistance_jaccard = titleDistance_jaccard,
108 | descriptionDistance_cosine = descriptionDistance_cosine,
109 | descriptionDistance_hamming = descriptionDistance_hamming,
110 | descriptionDistance_jaccard = descriptionDistance_jaccard
111 | )
112 |
113 | set1d <- df_master #making a copy for generating interaction features. Need to do this before renaming columns
114 |
115 | names(df_master) <- paste("set1c", names(df_master), sep = "_")
116 | ######## Add Primary Columns ItemID1 and ItemID2
117 | df_master <- cbind(dat[, grep("itemID_", names(dat), value = TRUE)], df_master)
118 | print("Saving Misc features")
119 | write_feather(df_master, paste(cache_loc, "/", "features_", trainOrTest, "_set1c_", "misc.fthr", sep = "" ))
120 |
121 | # Start Interaction feature script
122 | source("./code/3_feature_set1d_interaction.R")
123 | #END
124 |
--------------------------------------------------------------------------------
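Two details of 3_feature_set1c_misc.R are easy to miss: ratio features are folded into (0, 1] by taking the reciprocal whenever the raw ratio exceeds 1, so they are symmetric in the two ads, and image counts come from counting numeric tokens inside the images_array string. A tiny illustrative Python sketch of both ideas:

import re

def symmetric_ratio(a, b, digits=3):
    """Ratio folded into (0, 1]: identical values give 1.0 and argument order does not matter."""
    if not a or not b:
        return None
    ratio = a / b
    return round(1 / ratio if ratio > 1 else ratio, digits)

def image_count(images_array):
    """Count image IDs in a comma-separated string such as '12, 345, 678'."""
    return len(re.findall(r'[0-9.]+', images_array or ''))

print(symmetric_ratio(1500, 2000))  # 0.75, same as symmetric_ratio(2000, 1500)
print(image_count('12, 345, 678'))  # 3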
/Kaggle/Avito Duplicate Ad Detection/code/1_data_preprocessing.py:
--------------------------------------------------------------------------------
1 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar, Peter Borrmann & Marios Michailidis // TheQuants
2 | #### Author: Mikel
3 | #### Avito Duplicate Ad Detection
4 | # 1_data_preprocessing.py
5 | # Takes in input data, cleans text and merges itemIDs.
6 |
7 | import numpy as np
8 | import pandas as pd
9 | import nltk
10 | import sklearn
11 | import json
12 | import math
13 | import feather #import pickle - feather used instead as it is compatible with R
14 | from pandas.io.json import json_normalize
15 | import unicodedata
16 | from stop_words import get_stop_words
17 | import time
18 | from multiprocessing import Pool
19 | import sys
20 | import gc
21 | from imp import load_source
22 |
23 | import libavito as a
24 |
25 | #########################
26 | ##### SCRIPT CONFIG #####
27 | #########################
28 |
29 | # Define cleaning parameters
30 | stopwords = get_stop_words('ru')
31 | exclude_cats = set(['Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po', 'Sk', 'Sc', 'So', 'Co', 'Cf', 'Cc', 'Cs', 'Cn'])
32 | sno = nltk.stem.SnowballStemmer('russian')
33 |
34 | #########################
35 |
36 | print(a.c.BOLD + 'Cleaning input data ...' + a.c.END)
37 |
38 | # Get train/test mode from launch argument
39 | mode = a.get_mode(sys.argv, '1_data_preprocessing.py')
40 |
41 | ## Read settings required by script
42 | config = a.read_config()
43 | nthreads = config.preprocessing_nthreads
44 | cache_loc = config.cache_loc
45 | category_loc = config.category_csv
46 | location_loc = config.location_csv
47 | debug = config.debug
48 | if mode == 0:
49 | data_loc = config.train_ItemInfo
50 | pairs_loc = config.train_ItemPairs
51 | if mode == 1:
52 | data_loc = config.test_ItemInfo
53 | pairs_loc = config.test_ItemPairs
54 |
55 | # Read file for processing into memory
56 | start = time.time()
57 | print('Reading input data ... ', end='', flush=True)
58 | df = pd.read_csv(data_loc)
59 | a.print_elapsed(start)
60 |
61 | def get_clean_tokens(text):
62 | newtext = []
63 |
64 | # lower text
65 | text = text.lower()
66 |
67 | # replace punctation
68 | text = ''.join(x if unicodedata.category(x) not in exclude_cats else ' ' for x in text)
69 |
70 | # replace some symbols
71 | text = ''.join(x if x not in ["'", '`', '>', '<', '=', '+'] else ' ' for x in text)
72 |
73 | # tokenize the text
74 | text0 = nltk.word_tokenize(text, 'russian')
75 |
76 | # word by word
77 | for y in text0:
78 | # remove stopwords and stemming
79 | if len(y) > 0 and y not in stopwords:
80 | newtext.append(sno.stem(y))
81 |
82 | return newtext
83 |
84 | def process_line(i):
85 | # Lists to store tokens in
86 | tx = []
87 | dx = []
88 | resx = []
89 |
90 | # Pluck initial strings from dataframe
91 | title = str(df.iloc[i]['title'])
92 | desc = str(df.iloc[i]['description'])
93 | jx = str(df.iloc[i]['attrsJSON']).lower()
94 |
95 | tx = get_clean_tokens(title)
96 | dx = get_clean_tokens(desc)
97 |
98 | # Process JSON
99 | try:
100 | resx = json.loads(jx)
101 | for key in resx.keys():
102 |             tokens = get_clean_tokens(resx[key])  # 'tokens' avoids shadowing the libavito alias `a`
103 |             resx[key] = " ".join(tokens)
104 | except:
105 | resx = []
106 | if debug == 1:
107 |             print('DEBUG: Failed to read JSON "' + jx + '" at ' + str(i))
108 | pass
109 |
110 | jxs = '' + json.dumps(resx, ensure_ascii=False)
111 | txs = ' '.join(tx)
112 | dxs = ' '.join(dx)
113 |
114 | del tx, resx, dx
115 | gc.collect()
116 |
117 | return [txs, dxs, jxs]
118 |
119 | # def process_line(i):
120 | # return ['empty', 'empty', 'empty']
121 |
122 | newtitles = []
123 | newdescs = []
124 | newjson = []
125 | ids = df['itemID'].values
126 |
127 | start = time.time()
128 | # If number of threads is equal to 1, output time remaining etc.
129 | o = len(df.index)
130 | if nthreads == 1:
131 | print('Cleaning text with 1 thread ...')
132 | k = 0
133 | # Iterate over lines
134 | for i in range(0, o):
135 | x = process_line(i)
136 | newtitles.append(x[0])
137 | newdescs.append(x[1])
138 | newjson.append(x[2])
139 | k += 1
140 | if k % 100 == 0:
141 | a.print_progress(k, start, o)
142 | # Otherwise perform multi-threaded mapping
143 | else:
144 | print('Cleaning text multi-threaded ... ', end='', flush=True)
145 | pool = Pool(nthreads)
146 | newdata = pool.map(process_line, range(0, o))
147 | pool.close()
148 | for x in newdata:
149 | newtitles.append(x[0])
150 | newdescs.append(x[1])
151 | newjson.append(x[2])
152 |
153 | del newdata
154 | gc.collect()
155 |
156 | a.print_elapsed(start)
157 |
158 | #########################
159 |
160 | print(a.c.BOLD + 'Joining input data ...' + a.c.END)
161 |
162 | # Joining cleaned data into original data
163 | df['cleandesc'] = newdescs
164 | df['cleantitle'] = newtitles
165 | df['cleanjson'] = newjson
166 |
167 | # Memory management
168 | del newdescs, newtitles, newjson
169 | gc.collect()
170 |
171 | start = time.time()
172 | print('Joining parentCategory ... ', end='', flush=True)
173 | category = pd.read_csv(category_loc)
174 | df = df.merge(category, on=['categoryID'], copy=False)
175 | a.print_elapsed(start)
176 |
177 | start = time.time()
178 | print('Joining regionID ... ', end='', flush=True)
179 | location = pd.read_csv(location_loc)
180 | df = df.merge(location, on=['locationID'], copy=False)
181 | a.print_elapsed(start)
182 |
183 | start = time.time()
184 | print('Joining itemPairs ...', end='', flush=True)
185 | itemPairs = pd.read_csv(pairs_loc)
186 | df = pd.merge(pd.merge(itemPairs, df, how='inner', left_on='itemID_1', right_on='itemID'), df, how='inner', left_on='itemID_2', right_on='itemID') # , suffixes=('_1', '_2'))
187 | df.drop(['itemID_x', 'itemID_y'], axis=1, inplace=True)
188 | df.columns = [c.replace('_x', '_1').replace('_y', '_2') for c in df.columns]
189 | a.print_elapsed(start)
190 |
191 | start = time.time()
192 | print('Caching cleaned data ... ', end='', flush=True)
193 |
194 | # Save updated dataset
195 | if mode == 0:
196 | #pickle.dump(df, open(cache_loc + 'train.bin', 'wb'), protocol=4)
197 | feather.write_dataframe(df, cache_loc + 'train.fthr')
198 | df.to_csv(cache_loc + 'train.csv', index=False)
199 | if mode == 1:
200 | #pickle.dump(df, open(cache_loc + 'test.bin', 'wb'), protocol=4)
201 | feather.write_dataframe(df, cache_loc + 'test.fthr')
202 | df.to_csv(cache_loc + 'test.csv', index=False)
203 |
204 | a.print_elapsed(start)
205 | print('Data preprocessing complete!')
206 |
207 | # Write status to status file so master script knows whether to proceed.
208 | f = open(cache_loc + 'status.txt', 'a')
209 | f.write('data_preprocessing_OK\n')
210 | f.close()
211 |
--------------------------------------------------------------------------------
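get_clean_tokens in 1_data_preprocessing.py lower-cases the text, blanks out punctuation and symbol characters by their Unicode category, tokenizes with NLTK's Russian tokenizer, drops Russian stop words and applies the Snowball stemmer. A simplified, self-contained sketch of the same steps, with a tiny inline stop-word list standing in for get_stop_words('ru'):

import unicodedata
import nltk  # needs NLTK's 'punkt' tokenizer data

EXCLUDE_CATS = {'Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po', 'Sk', 'Sc', 'So',
                'Co', 'Cf', 'Cc', 'Cs', 'Cn'}
STOPWORDS = {'и', 'в', 'на', 'с'}  # tiny stand-in for get_stop_words('ru')
stemmer = nltk.stem.SnowballStemmer('russian')

def clean_tokens(text):
    text = text.lower()
    # Replace punctuation / symbol characters with spaces, keyed on their Unicode category
    text = ''.join(ch if unicodedata.category(ch) not in EXCLUDE_CATS else ' ' for ch in text)
    tokens = nltk.word_tokenize(text, 'russian')
    return [stemmer.stem(t) for t in tokens if t and t not in STOPWORDS]

print(clean_tokens('Продаю велосипед, почти новый!'))  # "Selling a bicycle, almost new!"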
/Kaggle/Avito Duplicate Ad Detection/runAll.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ################################################################################################
3 | ################################################################################################
4 | #### Copyright (c) 2016 Mikel Bober-Irizar, Sonny Laskar & Peter Borrmann // TheQuants
5 | #### Competition: Avito Duplicate Ad Detection
6 | # Filename : runAll.sh
7 | # Description: This bash Script generates the Submission Files
8 | # Usage:
9 | # bash ./runAll.sh
10 | ################################################################################################
11 | ################################################################################################
12 |
13 |
14 | echo "`tput smso` Running Data Preprocessing`tput rmso`"
15 | python3 code/1_data_preprocessing.py --train
16 | python3 code/1_data_preprocessing.py --test
17 |
18 | echo "`tput smso` Running Image Processing`tput rmso`"
19 | python3 code/2_image_info.py
20 |
21 | echo "`tput smso` Extracting NGrams`tput rmso`"
22 | Rscript code/3_feature_set1a_ngram.R train
23 | Rscript code/3_feature_set1a_ngram.R test
24 |
25 | echo "`tput smso` Extracting NChars`tput rmso`"
26 | Rscript code/3_feature_set1b_nchar.R train
27 | Rscript code/3_feature_set1b_nchar.R test
28 |
29 | echo "`tput smso` Extracting Misc Features`tput rmso`"
30 | Rscript code/3_feature_set1c_misc.R train
31 | Rscript code/3_feature_set1c_misc.R test
32 |
33 | echo "`tput smso`Extracting Attributes `tput rmso`"
34 | Rscript code/3_feature_set1e_attribute.R train
35 | Rscript code/3_feature_set1e_attribute.R test
36 |
37 | echo "`tput smso`Extracting Special Counting Features `tput rmso`"
38 | Rscript code/3_feature_set1f_SpecialCounting.R train
39 | Rscript code/3_feature_set1f_SpecialCounting.R test
40 |
41 | echo "`tput smso` Extracting Capital Letters`tput rmso`"
42 | Rscript code/3_feature_set1g_capitalLetters.R train
43 | Rscript code/3_feature_set1g_capitalLetters.R test
44 |
45 | echo "`tput smso` Extracting hash features `tput rmso`"
46 | Rscript code/3_feature_set1h_images.R train
47 | Rscript code/3_feature_set1h_images.R test
48 |
49 | echo "`tput smso` Extracting Image Size Features `tput rmso`"
50 | Rscript code/3_feature_set1i_imagesSize.R train
51 | Rscript code/3_feature_set1i_imagesSize.R test
52 |
53 | echo "`tput smso` Extracting Location `tput rmso`"
54 | python3 code/3_feature_set2a_lev_loc.py --train
55 | python3 code/3_feature_set2a_lev_loc.py --test
56 |
57 | echo "`tput smso` Extracting BRISK`tput rmso`"
58 | python3 code/3_feature_set2b_brisk.py --train
59 | python3 code/3_feature_set2b_brisk.py --test
60 |
61 | echo "`tput smso`Extracting Histograms `tput rmso`"
62 | python3 code/3_feature_set2c_hist.py --train
63 | python3 code/3_feature_set2c_hist.py --test
64 |
65 | echo "`tput smso`Extracting Descriptions `tput rmso`"
66 | python3 code/3_feature_set3a_description.py --train
67 | python3 code/3_feature_set3a_description.py --test
68 |
69 | echo "`tput smso`Extracting Title `tput rmso`"
70 | python3 code/3_feature_set3b_title.py --train
71 | python3 code/3_feature_set3b_title.py --test
72 |
73 | echo "`tput smso` Extracting Json `tput rmso`"
74 | python3 code/3_feature_set3c_json.py --train
75 | python3 code/3_feature_set3c_json.py --test
76 |
77 | echo "`tput smso` Extracting Jsonpart2 `tput rmso`"
78 | python3 code/3_feature_set3d_json1.py --train
79 | python3 code/3_feature_set3d_json1.py --test
80 |
81 | echo "`tput smso`Extracting Hamming `tput rmso`"
82 | python3 code/3_feature_set3f_hamming.py --train
83 | python3 code/3_feature_set3f_hamming.py --test
84 |
85 | echo "`tput smso`Extracting Json to Cols `tput rmso`"
86 | python3 code/3_json_to_cols.py
87 |
88 | echo "`tput smso`Extracting WOE `tput rmso`"
89 | Rscript code/3_feature_set3g_json_to_cols_createWOE.R train
90 | Rscript code/3_feature_set3g_json_to_cols_createWOE.R test
91 |
92 | echo "`tput smso` Consolidating a few features `tput rmso`"
93 | Rscript code/3_feature_set3z_consolidate.R train
94 | Rscript code/3_feature_set3z_consolidate.R test
95 |
96 | echo "`tput smso` Extracting Fuzzy`tput rmso`"
97 | python3 code/3_feature_set4a_fuzzy.py --train
98 | python3 code/3_feature_set4a_fuzzy.py --test
99 |
100 | echo "`tput smso` Extracting fuzzy Clean`tput rmso`"
101 | python3 code/3_feature_set4b_fuzzy_clean.py --train
102 | python3 code/3_feature_set4b_fuzzy_clean.py --test
103 |
104 | echo "`tput smso`Extracting Alternate `tput rmso`"
105 | python3 code/3_feature_set4c_alternate.py --train
106 | python3 code/3_feature_set4c_alternate.py --test
107 |
108 | echo "`tput smso` Extracting Similarity`tput rmso`"
109 | python3 code/3_feature_set4d_similarity_clean.py --train
110 | python3 code/3_feature_set4d_similarity_clean.py --test
111 |
112 | echo "`tput smso`Extracting BOW `tput rmso`"
113 | python3 code/4_bag_of_words.py
114 |
115 |
116 |
117 | ############################################################################################
118 | ############################################################################################
119 | #Consolidate All Features
120 | echo "`tput smso`CONSOLIDATING ALL FEATURES `tput rmso`"
121 | Rscript code/5_consolidate_features.R train
122 | Rscript code/5_consolidate_features.R test
123 |
124 | echo "`tput smso`Replacing all NaN and Inf`tput rmso`"
125 | python3 code/5_data_postprocessing.py --train
126 | python3 code/5_data_postprocessing.py --test
127 |
128 | echo "FEATURES DONE"
129 | ############################################################################################
130 | echo "Running models"
131 |
132 | echo "`tput smso`Running logit_v2`tput rmso`"
133 | python2 code/models/marios_logit_v2.py
134 |
135 | echo "`tput smso`Running nn_v1`tput rmso`"
136 | python2 code/models/marios_nn_v1.py
137 |
138 | echo "`tput smso`Running nnnew_v2`tput rmso`"
139 | python2 code/models/marios_nnnew_v2.py
140 |
141 | echo "`tput smso`Running nnnew_v3`tput rmso`"
142 | python2 code/models/marios_nnnew_v3.py
143 |
144 | echo "`tput smso`Running nnnew_v4`tput rmso`"
145 | python2 code/models/marios_nnnew_v4.py
146 |
147 | echo "`tput smso`Running ridge_v2`tput rmso`"
148 | python2 code/models/marios_ridge_v2.py
149 |
150 | echo "`tput smso`Running sgd_v2`tput rmso`"
151 | python2 code/models/marios_sgd_v2.py
152 |
153 | echo "`tput smso`Running xg_v1`tput rmso`"
154 | python2 code/models/marios_xg_v1.py
155 |
156 | echo "`tput smso`Running xgrank_v2`tput rmso`"
157 | python2 code/models/marios_xgrank_v2.py
158 |
159 | echo "`tput smso`Running xgrank_v3`tput rmso`"
160 | python2 code/models/marios_xgrank_v3.py
161 |
162 | echo "`tput smso`Running xgregv3`tput rmso`"
163 | python2 code/models/marios_xgregv3.py
164 |
165 | echo "`tput smso`Running xgson_v2`tput rmso`"
166 | python2 code/models/marios_xgson_v2.py
167 |
168 | echo "`tput smso`Running xgson_v3`tput rmso`"
169 | python2 code/models/marios_xgson_v3.py
170 |
171 | echo "`tput smso`Running xgson_v4`tput rmso`"
172 | python2 code/models/marios_xgson_v4.py
173 |
174 | echo "`tput smso`Running xgson_v2_v5`tput rmso`"
175 | python2 code/models/marios_xgson_v2_v5.py
176 |
177 | echo "`tput smso`Running meta-model`tput rmso`"
178 | python2 code/models/meta_rf_v1.py
179 |
180 | echo "MODELS DONE"
181 |
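Each Python feature script above is run twice, once with `--train` and once with `--test`. Below is a minimal sketch of how such a switch can be parsed; the `argparse` usage is an assumption for illustration, the actual scripts may read the flag differently.

    import argparse

    # Parse the --train / --test switch used throughout runAll.sh (illustrative only).
    parser = argparse.ArgumentParser(description='Feature extraction step')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--train', action='store_true', help='process the training pairs')
    group.add_argument('--test', action='store_true', help='process the test pairs')
    args = parser.parse_args()

    mode = 'train' if args.train else 'test'
    print('Running in %s mode' % mode)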
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgregv3.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.preprocessing import StandardScaler
3 | from sklearn.metrics import roc_auc_score
4 | import XGBoostClassifier as xg
5 | import os
6 | import libavito
7 | import feather
8 |
9 | # bagger for xgboost
10 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True):
11 |
12 | # create array object to hold predictions
13 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))]
14 | #loop for as many times as we want bags
15 | for n in range (0, estimators):
16 | #shuffle first; this aids in increasing variance and forces different results
17 | #X_t,y_c=shuffle(X,y, random_state=seed+n)
18 |
19 | if update_seed: # update seed if requested, to give a slightly different model
20 | model.set_params(random_state=seed + n)
21 | model.fit(X_t,y_c) # fit model
22 | preds=model.predict(xt) # predict probabilities
23 | # update bag's array
24 | for j in range (0, (xt.shape[0])):
25 | baggedpred[j]+=preds[j]
26 | print("done bag %d " % (n))
27 | # divide with number of bags to create an average estimate
28 | for j in range (0, len(baggedpred)):
29 | baggedpred[j]/=float(estimators)
30 | # return probabilities
31 | return np.array(baggedpred)
32 |
33 |
34 |
35 |
36 | def loadcolumn(filename,col=4, skip=1, floats=True):
37 | pred=[]
38 | op=open(filename,'r')
39 | if skip==1:
40 | op.readline() #header
41 | for line in op:
42 | line=line.replace('\n','')
43 | sps=line.split(',')
44 | #load always the last columns
45 | if floats:
46 | pred.append(float(sps[col]))
47 | else :
48 | pred.append(str(sps[col]))
49 | op.close()
50 | return pred
51 |
52 |
53 | def printfilcsve(X, filename):
54 |
55 | np.savetxt(filename,X, fmt='%.5f')
56 |
57 |
58 | # read the train and test allclean.csv files. skip errors
59 | def readfile(name, index=0):
60 | dopen=open(name,"r")
61 | array=[]
62 | skip_firstrow=False
63 | if index!=0:
64 | skip_firstrow=True
65 | for i,line in enumerate(dopen):
66 | if i==0 and skip_firstrow:
67 | continue
68 | splits=line.replace("\n","").replace(" ","").split(",")
69 | ar=[]
70 | for k in splits:
71 | try:
72 | ar.append(float(k))
73 | except:
74 | ar.append(0.0)
75 | print(" the string is %s ok?" % ((k)))
76 | array.append(ar)#[float(k)0.971474 if k!="0" else 0.0 for k in splits ])
77 | if i%100000==0:
78 | print(" we are at " , str(i))
79 | return np.array(array)
80 |
81 |
82 | def main():
83 |
84 | config = libavito.get_config()
85 | cache_loc = config.cache_loc
86 | nthreads = config.nthreads
87 |
88 | Usecv=True # true will split the training data 66-33 and do cv
89 | SEED=15
90 | threads=nthreads # number of workers for parallelism
91 |
92 | ######### Load files ############
93 | print("Loading input data")
94 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr')
95 | y = train['isDuplicate'].values
96 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values
97 | del train
98 | print(X.shape)
99 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr')
100 | ids = test['id'].values
101 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values
102 | del test
103 | print(X_test.shape)
104 |
105 |
106 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions
107 | if not os.path.exists(metafolder): #if it does not exist, create it
108 | os.makedirs(metafolder)
109 | outset="marios_xgreg_v3" # prefix for this model's output files
110 |
111 | #model to use (the original script leaves this undefined; the configuration below is an assumption that mirrors the sibling xg* scripts, with a regression objective, so the bagged_set() calls have a model to fit)
112 | model=xg.XGBoostClassifier(num_round=1000 ,nthread=threads, eta=0.02, gamma=7.0,max_depth=20, min_child_weight=20, subsample=0.9, colsample_bytree=0.4,objective='reg:linear',seed=1)
113 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)]
114 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )]
115 | kfolder=[[idex1,idex2]]
116 | #Create Arrays for meta
117 | train_stacker=[ 0.0 for k in range (0,len(idex2)) ]
118 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))]
119 | # CHECK EVerything in five..it could be more efficient
120 |
121 | #create target variable
122 | mean_kapa = 0.0
123 | #kfolder=StratifiedKFold(y, n_folds=number_of_folds,shuffle=True, random_state=SEED)
124 | #number_of_folds=0
125 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time
126 | i=0 # iterator counter
127 | if Usecv:
128 | print ("starting cross validation")
129 | for train_index, test_index in kfolder:
130 | # create training and validation sets
131 | X_train, X_cv = X[train_index], X[test_index]
132 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index]
133 | print (" train size: %d. test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) ))
134 |
135 |
136 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True)
137 |
138 |
139 | # compute AUC for this CV fold
140 | #scalepreds(preds)
141 | kapa = roc_auc_score(y_cv,preds)
142 | print("size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa))
143 |
144 | mean_kapa += kapa
145 | #save the results
146 | no=0
147 | for real_index in test_index:
148 | train_stacker[no]=(preds[no])
149 | no+=1
150 | i+=1
151 | if Usecv:
152 | print (" Average AUC: %f" % (mean_kapa) )
153 | print (" printing train datasets ")
154 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv")
155 |
156 |
157 | #preds=bagged_set(X, y,model, SEED, 1, X_test, update_seed=True)
158 |
159 | preds=bagged_set(X, y,model, SEED , 5, X_test, update_seed=True)
160 |
161 |
162 | for pr in range (0,len(preds)):
163 | test_stacker[pr]=(preds[pr])
164 |
165 | preds=np.array(preds)
166 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv")
167 |
168 |
169 | print("Write results...")
170 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv"
171 | print("Writing submission to %s" % output_file)
172 | f = open(config.output_loc + output_file, "w")
173 | f.write("id,probability\n")# the header
174 | for g in range(0, len(preds)) :
175 | pr=preds[g]
176 | f.write("%d,%f\n" % (((ids[g]),pr ) ) )
177 | f.close()
178 | print("Done.")
179 |
180 |
181 |
182 |
183 |
184 |
185 | if __name__=="__main__":
186 | main()
187 |
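The `bagged_set` helper above averages predictions over several re-seeded fits. The same logic can be written with numpy accumulation instead of per-element Python loops; a minimal, model-agnostic sketch where the scikit-learn classifier is only a stand-in for the wrapped XGBoost model:

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import GradientBoostingClassifier  # stand-in model (assumption)

    def bagged_predict(model, X_train, y_train, X_eval, seed=15, n_bags=5):
        """Fit n_bags re-seeded copies of the model and average their probabilities."""
        avg = np.zeros(X_eval.shape[0])
        for n in range(n_bags):
            model.set_params(random_state=seed + n)   # re-seed to decorrelate the bags
            model.fit(X_train, y_train)
            avg += model.predict_proba(X_eval)[:, 1]  # accumulate positive-class probabilities
        return avg / n_bags

    X, y = make_classification(n_samples=200, random_state=0)
    print(bagged_predict(GradientBoostingClassifier(n_estimators=20), X[:150], y[:150], X[150:])[:5])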
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgson_v4.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | import numpy as np
4 | from sklearn.metrics import roc_auc_score
5 | import XGBoostClassifier as xg
6 | import os
7 | import libavito
8 | import feather
9 |
10 | #load a single column from file
11 | def loadcolumn(filename,col=4, skip=1, floats=True):
12 | pred=[]
13 | op=open(filename,'r')
14 | if skip==1:
15 | op.readline() #header
16 | for line in op:
17 | line=line.replace('\n','')
18 | sps=line.split(',')
19 | #load always the last columns
20 | if floats:
21 | pred.append(float(sps[col]))
22 | else :
23 | pred.append(str(sps[col]))
24 | op.close()
25 | return pred
26 |
27 | #export file in csv using numpy
28 | def printfilcsve(X, filename):
29 |
30 | np.savetxt(filename,X, fmt='%.5f')
31 |
32 | # read the train and test allclean.csv files. skip errors
33 | def readfile(name, index=0):
34 | dopen=open(name,"r")
35 | array=[]
36 | skip_firstrow=False
37 | if index!=0:
38 | skip_firstrow=True
39 | for i,line in enumerate(dopen):
40 | if i==0 and skip_firstrow:
41 | continue
42 | splits=line.replace("\n","").replace(" ","").split(",")
43 | ar=[]
44 | for k in splits:
45 | try:
46 | ar.append(float(k))
47 | except:
48 | ar.append(0.0)
49 | print(" the string is %s ok?" % ((k)))
50 | array.append(ar)#[float(k) if k!="0" else 0.0 for k in splits ])
51 | if i%100000==0:
52 | print(" we are at " , str(i))
53 | return np.array(array)
54 |
55 |
56 |
57 | # bagger for xgboost
58 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True):
59 |
60 | # create array object to hold predictions
61 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))]
62 | #loop for as many times as we want bags
63 | for n in range (0, estimators):
64 | #shuffle first; this aids in increasing variance and forces different results
65 | #X_t,y_c=shuffle(X,y, random_state=seed+n)
66 |
67 | if update_seed: # update seed if requested, to give a slightly different model
68 | model.set_params(random_state=seed + n)
69 | model.fit(X_t,y_c) # fit model
70 | preds=model.predict_proba(xt)[:,1] # predict probabilities
71 | # update bag's array
72 | for j in range (0, (xt.shape[0])):
73 | baggedpred[j]+=preds[j]
74 | print("done bag %d " % (n))
75 | # divide with number of bags to create an average estimate
76 | for j in range (0, len(baggedpred)):
77 | baggedpred[j]/=float(estimators)
78 | # return probabilities
79 | return np.array(baggedpred)
80 |
81 | def main():
82 |
83 | config = libavito.get_config()
84 | cache_loc = config.cache_loc
85 | nthreads = config.nthreads
86 |
87 | Usecv=True # true will split the training data 66-33 and do cv
88 | SEED=15
89 | threads=nthreads # number of workers for parallelism
90 |
91 | ######### Load files ############
92 | print("Loading input data")
93 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr')
94 | y = train['isDuplicate'].values
95 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values
96 | del train
97 | print(X.shape)
98 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr')
99 | ids = test['id'].values
100 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values
101 | del test
102 | print(X_test.shape)
103 |
104 |
105 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions
106 | if not os.path.exists(metafolder): #if it does not exist, create it
107 | os.makedirs(metafolder)
108 | outset="marios_xgson_v4" # prefix for this model's output files
109 |
110 | #model to use
111 |
112 | model=xg.XGBoostClassifier(num_round=1000 ,nthread=threads, eta=0.02, gamma=7.0,max_depth=20, min_child_weight=20, subsample=0.9, colsample_bytree=0.4,objective='binary:logistic',seed=1)
113 |
114 | #Create Arrays for meta
115 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)] # indices for train
116 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )] #indices for test
117 | kfolder=[[idex1,idex2]] # create an object to put indices in
118 |
119 | #arrays to save predictions for validation and test for meta modelling (stacking)
120 | train_stacker=[ 0.0 for k in range (0,len(idex2)) ]
121 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))]
122 |
123 | #create target variable
124 | mean_kapa = 0.0
125 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time
126 | i=0 # iterator counter
127 | if Usecv:
128 | print ("starting cross validation" )
129 | for train_index, test_index in kfolder:
130 | # create training and validation sets
131 | X_train, X_cv = X[train_index], X[test_index]
132 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index]
133 | print (" train size: %d. test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) ))
134 |
135 | #use xgboost bagger
136 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True)
137 |
138 | # compute AUC for this CV fold
139 | #scalepreds(preds)
140 | kapa = roc_auc_score(y_cv,preds)
141 | print("size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa))
142 |
143 | mean_kapa += kapa
144 | #save the results
145 | no=0
146 | for real_index in test_index:
147 | train_stacker[no]=(preds[no])
148 | no+=1
149 | i+=1
150 | if (Usecv):
151 | #print the array of validation predictions for stacking later on inside the 'meta_folder'
152 | print (" Average AUC: %f" % (mean_kapa) )
153 | print (" printing train datasets ")
154 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv")
155 |
156 | preds=bagged_set(X, y,model, SEED ,5, X_test, update_seed=True)
157 |
158 |
159 | for pr in range (0,len(preds)):
160 | test_stacker[pr]=(preds[pr])
161 | #print prediction as numpy array for stacking later on
162 | preds=np.array(preds)
163 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv")
164 |
165 | #create submission file
166 | print("Write results...")
167 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv"
168 | print("Writing submission to %s" % output_file)
169 | f = open(config.output_loc + output_file, "w")
170 | f.write("id,probability\n")# the header
171 | for g in range(0, len(preds)) :
172 | pr=preds[g]
173 | f.write("%d,%f\n" % (((ids[g]),pr ) ) )
174 | f.close()
175 | print("Done.")
176 |
177 | if __name__=="__main__":
178 | main()
179 |
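The index construction `(X.shape[0] * 2) / 3` in these models relies on Python 2 integer division (runAll.sh invokes them with `python2`); under Python 3 it yields a float and `range` fails. A short sketch of the same 66/33 holdout split written with explicit floor division (the row count is illustrative):

    # The 66/33 holdout split used for stacking, with floor division that works on Python 2 and 3.
    n_rows = 9                        # illustrative; in the scripts this is X.shape[0]
    cut = (n_rows * 2) // 3           # first two thirds fit the level-1 model
    idex1 = list(range(0, cut))
    idex2 = list(range(cut, n_rows))  # last third is held out; its predictions feed the stacker
    kfolder = [[idex1, idex2]]
    print(idex1, idex2)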
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgsonv2_v5.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | import numpy as np
4 | from sklearn.metrics import roc_auc_score
5 | import XGBoostClassifier as xg
6 | import os
7 | import libavito
8 | import feather
9 |
10 | #load a single column from file
11 | def loadcolumn(filename,col=4, skip=1, floats=True):
12 | pred=[]
13 | op=open(filename,'r')
14 | if skip==1:
15 | op.readline() #header
16 | for line in op:
17 | line=line.replace('\n','')
18 | sps=line.split(',')
19 | #load always the last columns
20 | if floats:
21 | pred.append(float(sps[col]))
22 | else :
23 | pred.append(str(sps[col]))
24 | op.close()
25 | return pred
26 |
27 | #export file in csv using numpy
28 | def printfilcsve(X, filename):
29 |
30 | np.savetxt(filename,X, fmt='%.5f')
31 |
32 | # read the train and test allclean.csv files. skip errors
33 | def readfile(name, index=0):
34 | dopen=open(name,"r")
35 | array=[]
36 | skip_firstrow=False
37 | if index!=0:
38 | skip_firstrow=True
39 | for i,line in enumerate(dopen):
40 | if i==0 and skip_firstrow:
41 | continue
42 | splits=line.replace("\n","").replace(" ","").split(",")
43 | ar=[]
44 | for k in splits:
45 | try:
46 | ar.append(float(k))
47 | except:
48 | ar.append(0.0)
49 | print(" the string is %s ok?" % ((k)))
50 | array.append(ar)#[float(k) if k!="0" else 0.0 for k in splits ])
51 | if i%100000==0:
52 | print(" we are at " , str(i))
53 | return np.array(array)
54 |
55 |
56 |
57 | # bagger for xgboost
58 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True):
59 |
60 | # create array object to hold predictions
61 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))]
62 | #loop for as many times as we want bags
63 | for n in range (0, estimators):
64 | #shuffle first; this aids in increasing variance and forces different results
65 | #X_t,y_c=shuffle(X,y, random_state=seed+n)
66 |
67 | if update_seed: # update seed if requested, to give a slightly different model
68 | model.set_params(random_state=seed + n)
69 | model.fit(X_t,y_c) # fit model
70 | preds=model.predict_proba(xt)[:,1] # predict probabilities
71 | # update bag's array
72 | for j in range (0, (xt.shape[0])):
73 | baggedpred[j]+=preds[j]
74 | print("done bag %d " % (n))
75 | # divide with number of bags to create an average estimate
76 | for j in range (0, len(baggedpred)):
77 | baggedpred[j]/=float(estimators)
78 | # return probabilities
79 | return np.array(baggedpred)
80 |
81 | def main():
82 |
83 | config = libavito.get_config()
84 | cache_loc = config.cache_loc
85 | nthreads = config.nthreads
86 |
87 | Usecv=True # true will split the training data 66-33 and do cv
88 | SEED=15
89 | threads=nthreads # number of workers for parallelism
90 |
91 | ######### Load files ############
92 | print("Loading input data")
93 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr')
94 | y = train['isDuplicate'].values
95 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values
96 | del train
97 | print(X.shape)
98 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr')
99 | ids = test['id'].values
100 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values
101 | del test
102 | print(X_test.shape)
103 |
104 |
105 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions
106 | if not os.path.exists(metafolder): #if it does not exist, create it
107 | os.makedirs(metafolder)
108 | outset="marios_xgsonv2_v5" # prefix for this model's output files
109 |
110 | #model to use
111 |
112 | model=xg.XGBoostClassifier(num_round=1000 ,nthread=threads, eta=0.1, gamma=0.0,max_depth=20, min_child_weight=1, subsample=1.0,
113 | colsample_bytree=0.9,objective='binary:logistic',silent=True, seed=1)
114 |
115 | #Create Arrays for meta
116 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)] # indices for train
117 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )] #indices for test
118 | kfolder=[[idex1,idex2]] # create an object to put indices in
119 |
120 | #arrays to save predictions for validation and test for meta modelling (stacking)
121 | train_stacker=[ 0.0 for k in range (0,len(idex2)) ]
122 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))]
123 |
124 | #create target variable
125 | mean_kapa = 0.0
126 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time
127 | i=0 # iterator counter
128 | if Usecv:
129 | print ("starting cross validation" )
130 | for train_index, test_index in kfolder:
131 | # create training and validation sets
132 | X_train, X_cv = X[train_index], X[test_index]
133 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index]
134 | print (" train size: %d. test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) ))
135 |
136 | #use xgboost bagger
137 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True)
138 |
139 | # compute AUC for this CV fold
140 | #scalepreds(preds)
141 | kapa = roc_auc_score(y_cv,preds)
142 | print("size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa))
143 |
144 | mean_kapa += kapa
145 | #save the results
146 | no=0
147 | for real_index in test_index:
148 | train_stacker[no]=(preds[no])
149 | no+=1
150 | i+=1
151 | if (Usecv):
152 | #print the array of validation predictions for stacking later on inside the 'meta_folder'
153 | print (" Average AUC: %f" % (mean_kapa) )
154 | print (" printing train datasets ")
155 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv")
156 |
157 | preds=bagged_set(X, y,model, SEED ,5, X_test, update_seed=True)
158 |
159 |
160 | for pr in range (0,len(preds)):
161 | test_stacker[pr]=(preds[pr])
162 | #print prediction as numpy array for stacking later on
163 | preds=np.array(preds)
164 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv")
165 |
166 | #create submission file
167 | print("Write results...")
168 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv"
169 | print("Writing submission to %s" % output_file)
170 | f = open(config.output_loc + output_file, "w")
171 | f.write("id,probability\n")# the header
172 | for g in range(0, len(preds)) :
173 | pr=preds[g]
174 | f.write("%d,%f\n" % (((ids[g]),pr ) ) )
175 | f.close()
176 | print("Done.")
177 |
178 | if __name__=="__main__":
179 | main()
180 |
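Each model drops a single-column `<outset>train.csv` / `<outset>test.csv` into `meta_folder` for second-level modelling. Below is a sketch of how those columns could be assembled into a meta-feature matrix; the file names, the `cache/meta_folder/` path and the numpy-based loading are assumptions, since the actual meta script (`meta_rf_v1.py`) is not shown here.

    import numpy as np

    model_names = ['marios_xgson_v4', 'marios_xgsonv2_v5']   # illustrative subset of base models
    meta_folder = 'cache/meta_folder/'                        # illustrative path

    def load_meta(split):
        """Load one prediction column per base model and stack them into an (n_rows, n_models) array."""
        cols = [np.loadtxt(meta_folder + name + split + '.csv') for name in model_names]
        return np.column_stack(cols)

    # meta_train = load_meta('train')   # level-2 features for the held-out third of the training data
    # meta_test = load_meta('test')     # level-2 features for the test pairs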
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgrank_v2.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.externals import joblib
3 | from sklearn.preprocessing import StandardScaler
4 | from sklearn.metrics import roc_auc_score
5 | import XGBoostClassifier as xg
6 | import os
7 | import libavito
8 | import feather
9 |
10 | # bagger for xgboost
11 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True):
12 |
13 | # create array object to hold predictions
14 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))]
15 | #loop for as many times as we want bags
16 | for n in range (0, estimators):
17 | #shuffle first; this aids in increasing variance and forces different results
18 | #X_t,y_c=shuffle(X,y, random_state=seed+n)
19 |
20 | if update_seed: # update seed if requested, to give a slightly different model
21 | model.set_params(random_state=seed + n)
22 | model.fit(X_t,y_c) # fit model
23 | preds=model.predict(xt) # predict probabilities
24 | # update bag's array
25 | for j in range (0, (xt.shape[0])):
26 | baggedpred[j]+=preds[j]
27 | print("done bag %d " % (n))
28 | # divide with number of bags to create an average estimate
29 | for j in range (0, len(baggedpred)):
30 | baggedpred[j]/=float(estimators)
31 | # return probabilities
32 | return np.array(baggedpred)
33 |
34 |
35 |
36 |
37 | def loadcolumn(filename,col=4, skip=1, floats=True):
38 | pred=[]
39 | op=open(filename,'r')
40 | if skip==1:
41 | op.readline() #header
42 | for line in op:
43 | line=line.replace('\n','')
44 | sps=line.split(',')
45 | #load always the last columns
46 | if floats:
47 | pred.append(float(sps[col]))
48 | else :
49 | pred.append(str(sps[col]))
50 | op.close()
51 | return pred
52 |
53 | def printfilcsve(X, filename):
54 |
55 | np.savetxt(filename,X, fmt='%.5f')
56 |
57 |
58 | # read the train and test allclean.csv files. skip errors
59 | def readfile(name, index=0):
60 | dopen=open(name,"r")
61 | array=[]
62 | skip_firstrow=False
63 | if index!=0:
64 | skip_firstrow=True
65 | for i,line in enumerate(dopen):
66 | if i==0 and skip_firstrow:
67 | continue
68 | splits=line.replace("\n","").replace(" ","").split(",")
69 | ar=[]
70 | for k in splits:
71 | try:
72 | ar.append(float(k))
73 | except:
74 | ar.append(0.0)
75 | print(" the string is %s ok?" % ((k)))
76 | array.append(ar)#[float(k)0.971474 if k!="0" else 0.0 for k in splits ])
77 | if i%100000==0:
78 | print(" we are at " , str(i))
79 | return np.array(array)
80 |
81 |
82 | def main():
83 |
84 | config = libavito.get_config()
85 | cache_loc = config.cache_loc
86 | nthreads = config.nthreads
87 |
88 | Usecv=True # true will split the training data 66-33 and do cv
89 | SEED=15
90 | threads=nthreads # number of workers for parallelism
91 |
92 | ######### Load files ############
93 | print("Loading input data")
94 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr')
95 | y = train['isDuplicate'].values
96 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values
97 | del train
98 | print(X.shape)
99 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr')
100 | ids = test['id'].values
101 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values
102 | del test
103 | print(X_test.shape)
104 |
105 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions
106 |
107 | model=xg.XGBoostClassifier(num_round=1000 ,nthread=threads, eta=0.02, gamma=7.0,max_depth=15, min_child_weight=20, subsample=0.9,
108 | colsample_bytree=0.4,objective='rank:pairwise',seed=1)
109 |
110 | #create meta folder to drop predictions for train and test
111 | if not os.path.exists(metafolder): #if it does not exist, create it
112 | os.makedirs(metafolder)
113 |
114 | outset="marios_xgrank_v2" # prefix for this model's output files
115 |
116 | #model to use
117 |
118 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)]
119 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )]
120 | kfolder=[[idex1,idex2]]
121 | #Create Arrays for meta
122 | train_stacker=[ 0.0 for k in range (0,len(idex2)) ] # idex2 is a plain list, so use len() rather than .shape
123 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))]
124 | # CHECK EVerything in five..it could be more efficient
125 |
126 | #create target variable
127 | mean_kapa = 0.0
128 | #kfolder=StratifiedKFold(y, n_folds=number_of_folds,shuffle=True, random_state=SEED)
129 | #number_of_folds=0
130 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time
131 | i=0 # iterator counter
132 | if Usecv:
133 | print ("starting cross validation")
134 | for train_index, test_index in kfolder:
135 | # create training and validation sets
136 | X_train, X_cv = X[train_index], X[test_index]
137 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index]
138 | print (" train size: %d. test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) ))
139 |
140 |
141 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True)
142 |
143 |
144 | # compute AUC for this CV fold
145 | #scalepreds(preds)
146 | kapa = roc_auc_score(y_cv,preds)
147 | print("size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa))
148 |
149 | mean_kapa += kapa
150 | #save the results
151 | no=0
152 | for real_index in test_index:
153 | train_stacker[no]=(preds[no])
154 | no+=1
155 | i+=1
156 | if Usecv:
157 | print (" Average AUC: %f" % (mean_kapa) )
158 | print (" printing train datasets ")
159 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv")
160 |
161 |
162 | #preds=bagged_set(X, y,model, SEED, 1, X_test, update_seed=True)
163 |
164 | preds=bagged_set(X, y,model, SEED , 5, X_test, update_seed=True)
165 |
166 |
167 | for pr in range (0,len(preds)):
168 | test_stacker[pr]=(preds[pr])
169 |
170 | preds=np.array(preds)
171 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv")
172 |
173 |
174 | print("Write results...")
175 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv"
176 | print("Writing submission to %s" % output_file)
177 | f = open(config.output_loc + output_file, "w")
178 | f.write("id,probability\n")# the header
179 | for g in range(0, len(preds)) :
180 | pr=preds[g]
181 | f.write("%d,%f\n" % (((ids[g]),pr ) ) )
182 | f.close()
183 | print("Done.")
184 |
185 |
186 |
187 |
188 |
189 |
190 | if __name__=="__main__":
191 | main()
192 |
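With `objective='rank:pairwise'` the bagged predictions are ranking scores rather than calibrated probabilities; since AUC depends only on the ordering, the script writes them as-is. If values in [0, 1] were ever needed, a simple order-preserving rescale could be applied, as in this sketch (not part of the original pipeline):

    import numpy as np

    def minmax_rescale(scores):
        """Map scores linearly onto [0, 1]; a constant input maps to 0.5."""
        scores = np.asarray(scores, dtype=float)
        span = scores.max() - scores.min()
        if span == 0:
            return np.full_like(scores, 0.5)
        return (scores - scores.min()) / span

    print(minmax_rescale([2.3, -0.7, 0.1, 5.0]))   # order (and hence AUC) is unchanged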
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgson_v2.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | import numpy as np
4 | from sklearn.externals import joblib
5 | from sklearn.metrics import roc_auc_score
6 | import XGBoostClassifier as xg
7 | import os
8 | import libavito
9 | import feather
10 |
11 | #load a single column from file
12 | def loadcolumn(filename,col=4, skip=1, floats=True):
13 | pred=[]
14 | op=open(filename,'r')
15 | if skip==1:
16 | op.readline() #header
17 | for line in op:
18 | line=line.replace('\n','')
19 | sps=line.split(',')
20 | #load always the last columns
21 | if floats:
22 | pred.append(float(sps[col]))
23 | else :
24 | pred.append(str(sps[col]))
25 | op.close()
26 | return pred
27 |
28 |
29 | #export file in csv using numpy
30 | def printfilcsve(X, filename):
31 |
32 | np.savetxt(filename,X, fmt='%.5f')
33 |
34 | # read the train and test allclean.csv files. skip errors
35 | def readfile(name, index=0):
36 | dopen=open(name,"r")
37 | array=[]
38 | skip_firstrow=False
39 | if index!=0:
40 | skip_firstrow=True
41 | for i,line in enumerate(dopen):
42 | if i==0 and skip_firstrow:
43 | continue
44 | splits=line.replace("\n","").replace(" ","").split(",")
45 | ar=[]
46 | for k in splits:
47 | try:
48 | ar.append(float(k))
49 | except:
50 | ar.append(0.0)
51 | print(" the string is %s ok?" % ((k)))
52 | array.append(ar)#[float(k) if k!="0" else 0.0 for k in splits ])
53 | if i%100000==0:
54 | print(" we are at " , str(i))
55 | return np.array(array)
56 |
57 |
58 | # bagger for xgboost
59 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True):
60 |
61 | # create array object to hold predictions
62 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))]
63 | #loop for as many times as we want bags
64 | for n in range (0, estimators):
66 | #shuffle first; this aids in increasing variance and forces different results
67 | #X_t,y_c=shuffle(X,y, random_state=seed+n)
68 |
69 | if update_seed: # update seed if requested, to give a slightly different model
70 | model.set_params(random_state=seed + n)
71 | model.fit(X_t,y_c) # fit model
71 | preds=model.predict_proba(xt)[:,1] # predict probabilities
72 | # update bag's array
73 | for j in range (0, (xt.shape[0])):
74 | baggedpred[j]+=preds[j]
75 | print("done bag %d " % (n))
76 | # divide with number of bags to create an average estimate
77 | for j in range (0, len(baggedpred)):
78 | baggedpred[j]/=float(estimators)
79 | # return probabilities
80 | return np.array(baggedpred)
81 |
82 |
83 | def main():
84 | config = libavito.get_config()
85 | cache_loc = config.cache_loc
86 | nthreads = config.nthreads
87 |
88 | Usecv=True # true will split the training data 66-33 and do cv
89 | SEED=15
90 | threads=nthreads # number of workers for parallelism
91 |
92 | ######### Load files ############
93 | print("Loading input data")
94 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr')
95 | y = train['isDuplicate'].values
96 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values
97 | del train
98 | print(X.shape)
99 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr')
100 | ids = test['id'].values
101 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values
102 | del test
103 | print(X_test.shape)
104 |
105 |
106 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions
107 | if not os.path.exists(metafolder): #if it does not exist, create it
108 | os.makedirs(metafolder)
109 | outset="marios_xgson_v2" # prefix for this model's output files
110 |
111 | #model to use
112 |
113 | model=xg.XGBoostClassifier(num_round=1000 ,nthread=threads, eta=0.02, gamma=7.0,max_depth=20, min_child_weight=20, subsample=0.9,
114 | colsample_bytree=0.4,objective='binary:logistic',seed=1)
115 |
116 | #Create Arrays for meta
117 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)] # indices for train
118 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )] #indices for test
119 | kfolder=[[idex1,idex2]] # create an object to put indices in
120 |
121 | #arrays to save predictions for validation and test for meta modelling (stacking)
122 | train_stacker=[ 0.0 for k in range (0,len(idex2)) ] # sized to the held-out fold, as in the other model scripts
123 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))]
124 |
125 | #create target variable
126 | mean_kapa = 0.0
127 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time
128 | i=0 # iterator counter
129 | if Usecv:
130 | print ("starting cross validation" )
131 | for train_index, test_index in kfolder:
132 | # create training and validation sets
133 | X_train, X_cv = X[train_index], X[test_index]
134 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index]
135 | print (" train size: %d. test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) ))
136 |
137 | #use xgboost bagger
138 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True)
139 |
140 | # compute AUC for this CV fold
141 | #scalepreds(preds)
142 | kapa = roc_auc_score(y_cv,preds)
143 | print("size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa))
144 |
145 | mean_kapa += kapa
146 | #save the results
147 | no=0
148 | for real_index in test_index:
149 | train_stacker[no]=(preds[no])
150 | no+=1
151 | i+=1
152 | if (Usecv):
153 | #print the array of validation predictions for stacking later on inside the 'meta_folder'
154 | print (" Average AUC: %f" % (mean_kapa) )
155 | print (" printing train datasets ")
156 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv")
157 |
158 | preds=bagged_set(X, y,model, SEED ,5, X_test, update_seed=True)
159 |
160 |
161 | for pr in range (0,len(preds)):
162 | test_stacker[pr]=(preds[pr])
163 | #print prediction as numpy array for stacking later on
164 | preds=np.array(preds)
165 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv")
166 |
167 | #create submission file
168 | print("Write results...")
169 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv"
170 | print("Writing submission to %s" % output_file)
171 | f = open(config.output_loc + output_file, "w")
172 | f.write("id,probability\n")# the header
173 | for g in range(0, len(preds)) :
174 | pr=preds[g]
175 | f.write("%d,%f\n" % (((ids[g]),pr ) ) )
176 | f.close()
177 | print("Done.")
178 |
179 |
180 |
181 |
182 |
183 |
184 | if __name__=="__main__":
185 | main()
186 |
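The per-element copy loops above (`for real_index in test_index: train_stacker[no] = preds[no]`) only move fold predictions into the stacker arrays; the same bookkeeping can be done with numpy slicing, as in this small sketch (sizes and values are illustrative):

    import numpy as np

    n_cv, n_test = 4, 6                        # illustrative sizes
    preds_cv = np.array([0.1, 0.9, 0.4, 0.7])  # holdout-fold predictions
    preds_test = np.linspace(0.0, 1.0, n_test) # test-set predictions

    train_stacker = np.zeros(n_cv)
    test_stacker = np.zeros(n_test)
    train_stacker[:] = preds_cv                # equivalent to the per-element copy loop
    test_stacker[:] = preds_test
    print(train_stacker, test_stacker)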
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgson_v3.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | import numpy as np
4 | from sklearn.externals import joblib
5 | from sklearn.metrics import roc_auc_score
6 | import XGBoostClassifier as xg
7 | import os
8 | import libavito
9 | import feather
10 |
11 | #load a single column from file
12 | def loadcolumn(filename,col=4, skip=1, floats=True):
13 | pred=[]
14 | op=open(filename,'r')
15 | if skip==1:
16 | op.readline() #header
17 | for line in op:
18 | line=line.replace('\n','')
19 | sps=line.split(',')
20 | #load always the last columns
21 | if floats:
22 | pred.append(float(sps[col]))
23 | else :
24 | pred.append(str(sps[col]))
25 | op.close()
26 | return pred
27 |
28 |
29 |
30 | #export file in csv using numpy
31 | def printfilcsve(X, filename):
32 |
33 | np.savetxt(filename,X, fmt='%.5f')
34 |
35 | # read the train and test allclean.csv files. skip errors
36 | def readfile(name, index=0):
37 | dopen=open(name,"r")
38 | array=[]
39 | skip_firstrow=False
40 | if index!=0:
41 | skip_firstrow=True
42 | for i,line in enumerate(dopen):
43 | if i==0 and skip_firstrow:
44 | continue
45 | splits=line.replace("\n","").replace(" ","").split(",")
46 | ar=[]
47 | for k in splits:
48 | try:
49 | ar.append(float(k))
50 | except:
51 | ar.append(0.0)
52 | print(" the string is %s ok?" % ((k)))
53 | array.append(ar)#[float(k) if k!="0" else 0.0 for k in splits ])
54 | if i%100000==0:
55 | print(" we are at " , str(i))
56 | return np.array(array)
57 |
58 |
59 | # bagger for xgboost
60 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True):
61 |
62 | # create array object to hold predictions
63 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))]
64 | #loop for as many times as we want bags
65 | for n in range (0, estimators):
66 | #shuffle first; this aids in increasing variance and forces different results
67 | #X_t,y_c=shuffle(X,y, random_state=seed+n)
68 |
69 | if update_seed: # update seed if requested, to give a slightly different model
70 | model.set_params(random_state=seed + n)
71 | model.fit(X_t,y_c) # fit model
72 | preds=model.predict_proba(xt)[:,1] # predict probabilities
73 | # update bag's array
74 | for j in range (0, (xt.shape[0])):
75 | baggedpred[j]+=preds[j]
76 | print("done bag %d " % (n))
77 | # divide with number of bags to create an average estimate
78 | for j in range (0, len(baggedpred)):
79 | baggedpred[j]/=float(estimators)
80 | # return probabilities
81 | return np.array(baggedpred)
82 |
83 |
84 | def main():
85 |
86 | config = libavito.get_config()
87 | cache_loc = config.cache_loc
88 | nthreads = config.nthreads
89 |
90 | Usecv=True # true will split the training data 66-33 and do cv
91 | SEED=15
92 | threads=nthreads # number of workers for parallelism
93 |
94 | ######### Load files ############
95 | print("Loading input data")
96 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr')
97 | y = train['isDuplicate'].values
98 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values
99 | del train
100 | print(X.shape)
101 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr')
102 | ids = test['id'].values
103 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values
104 | del test
105 | print(X_test.shape)
106 |
107 |
108 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions
109 | if not os.path.exists(metafolder): #if it does not exist, create it
110 | os.makedirs(metafolder)
111 | outset="marios_xgson_v3" # prefix for this model's output files
112 |
113 | #model to use
114 |
115 | model=xg.XGBoostClassifier(num_round=1000 ,nthread=threads, eta=0.02, gamma=7.0,max_depth=20, min_child_weight=20, subsample=0.9,
116 | colsample_bytree=0.4,objective='binary:logistic',seed=1)
117 |
118 | #Create Arrays for meta
119 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)] # indices for train
120 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )] #indices for test
121 | kfolder=[[idex1,idex2]] # create an object to put indices in
122 |
123 | #arrays to save predictions for validation and test for meta modelling (stacking)
124 | train_stacker=[ 0.0 for k in range (0,len(idex2)) ] # idex2 is a plain list, so use len() rather than .shape
125 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))]
126 |
127 | #create target variable
128 | mean_kapa = 0.0
129 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time
130 | i=0 # iterator counter
131 | if Usecv:
132 | print ("starting cross validation" )
133 | for train_index, test_index in kfolder:
134 | # create training and validation sets
135 | X_train, X_cv = X[train_index], X[test_index]
136 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index]
137 | print (" train size: %d. test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) ))
138 |
139 | #use xgboost bagger
140 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True)
141 |
142 | # compute AUC for this CV fold
143 | #scalepreds(preds)
144 | kapa = roc_auc_score(y_cv,preds)
145 | print("size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa))
146 |
147 | mean_kapa += kapa
148 | #save the results
149 | no=0
150 | for real_index in test_index:
151 | train_stacker[no]=(preds[no])
152 | no+=1
153 | i+=1
154 | if (Usecv):
155 | #print the array of validation predictions for stacking later on inside the 'meta_folder'
156 | print (" Average AUC: %f" % (mean_kapa) )
157 | print (" printing train datasets ")
158 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv")
159 |
160 | preds=bagged_set(X, y,model, SEED ,5, X_test, update_seed=True)
161 |
162 |
163 | for pr in range (0,len(preds)):
164 | test_stacker[pr]=(preds[pr])
165 | #print prediction as numpy array for stacking later on
166 | preds=np.array(preds)
167 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv")
168 |
169 | #create submission file
170 | print("Write results...")
171 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv"
172 | print("Writing submission to %s" % output_file)
173 | f = open(config.output_loc + output_file, "w")
174 | f.write("id,probability\n")# the header
175 | for g in range(0, len(preds)) :
176 | pr=preds[g]
177 | f.write("%d,%f\n" % (((ids[g]),pr ) ) )
178 | f.close()
179 | print("Done.")
180 |
181 |
182 |
183 |
184 |
185 |
186 | if __name__=="__main__":
187 | main()
188 |
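Because `kfolder` holds a single train/holdout pair, the `mean_kapa` printed as the average AUC is simply that one fold's score. With several folds the running sum would have to be divided by the fold count; a sketch of that pattern with a stand-in classifier (the logistic regression and the generated data are assumptions, not part of the pipeline):

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import KFold

    X, y = make_classification(n_samples=300, random_state=0)
    mean_auc, folds = 0.0, KFold(n_splits=3, shuffle=True, random_state=15)
    for train_idx, cv_idx in folds.split(X):
        clf = LogisticRegression(max_iter=1000).fit(X[train_idx], y[train_idx])
        mean_auc += roc_auc_score(y[cv_idx], clf.predict_proba(X[cv_idx])[:, 1])
    mean_auc /= folds.get_n_splits()           # divide the running sum by the number of folds
    print('Average AUC: %f' % mean_auc)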
--------------------------------------------------------------------------------
/Kaggle/Avito Duplicate Ad Detection/code/models/marios_xgrank_v3.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.preprocessing import StandardScaler
3 | from sklearn.metrics import roc_auc_score
4 | import XGBoostClassifier as xg
5 | import os
6 | import libavito
7 | import feather
8 |
9 | # bagger for xgboost
10 | def bagged_set(X_t,y_c,model, seed, estimators, xt, update_seed=True):
11 |
12 | # create array object to hold predictions
13 | baggedpred=[ 0.0 for d in range(0, (xt.shape[0]))]
14 | #loop for as many times as we want bags
15 | for n in range (0, estimators):
16 | #shuffle first; this aids in increasing variance and forces different results
17 | #X_t,y_c=shuffle(X,y, random_state=seed+n)
18 |
19 | if update_seed: # update seed if requested, to give a slightly different model
20 | model.set_params(random_state=seed + n)
21 | model.fit(X_t,y_c) # fit model
22 | preds=model.predict(xt) # predict probabilities
23 | # update bag's array
24 | for j in range (0, (xt.shape[0])):
25 | baggedpred[j]+=preds[j]
26 | print("done bag %d " % (n))
27 | # divide with number of bags to create an average estimate
28 | for j in range (0, len(baggedpred)):
29 | baggedpred[j]/=float(estimators)
30 | # return probabilities
31 | return np.array(baggedpred)
32 |
33 | def loadcolumn(filename,col=4, skip=1, floats=True):
34 | pred=[]
35 | op=open(filename,'r')
36 | if skip==1:
37 | op.readline() #header
38 | for line in op:
39 | line=line.replace('\n','')
40 | sps=line.split(',')
41 | #load always the last columns
42 | if floats:
43 | pred.append(float(sps[col]))
44 | else :
45 | pred.append(str(sps[col]))
46 | op.close()
47 | return pred
48 |
49 |
50 | def printfilcsve(X, filename):
51 |
52 | np.savetxt(filename,X, fmt='%.5f')
53 |
54 |
55 | # read the train and test allclean.csv files. skip errors
56 | def readfile(name, index=0):
57 | dopen=open(name,"r")
58 | array=[]
59 | skip_firstrow=False
60 | if index!=0:
61 | skip_firstrow=True
62 | for i,line in enumerate(dopen):
63 | if i==0 and skip_firstrow:
64 | continue
65 | splits=line.replace("\n","").replace(" ","").split(",")
66 | ar=[]
67 | for k in splits:
68 | try:
69 | ar.append(float(k))
70 | except:
71 | ar.append(0.0)
72 | print(" the string is %s ok?" % ((k)))
73 | array.append(ar)#[float(k)0.971474 if k!="0" else 0.0 for k in splits ])
74 | if i%100000==0:
75 | print(" we are at " , str(i))
76 | return np.array(array)
77 |
78 |
79 | def main():
80 | config = libavito.get_config(); cache_loc = config.cache_loc; nthreads = config.nthreads # load the shared config as in the other model scripts (cache_loc, nthreads and config are used below)
81 | Use_scale=True
82 | Usecv=True # true will split the training data 66-33 and do cv
83 | SEED=15
84 | threads=nthreads # number of workers for parallelism
85 |
86 | ######### Load files ############
87 | print("Loading input data")
88 | train = feather.read_dataframe(cache_loc + 'final_featureSet_train.fthr')
89 | y = train['isDuplicate'].values
90 | X = train.drop(['itemID_1', 'itemID_2', 'isDuplicate'], 1).values
91 | del train
92 | print(X.shape)
93 | test = feather.read_dataframe(cache_loc + 'final_featureSet_test.fthr')
94 | ids = test['id'].values
95 | X_test = test.drop(['itemID_1', 'itemID_2', 'id'], 1).values
96 | del test
97 | print(X_test.shape)
98 |
99 |
100 | metafolder=cache_loc + "meta_folder/" # folder to use to store for meta predictions
101 |
102 |
103 | model=xg.XGBoostClassifier(num_round=1000 ,nthread=threads, eta=0.02, gamma=7.0,max_depth=15, min_child_weight=20, subsample=0.9,
104 | colsample_bytree=0.4,objective='rank:pairwise',seed=1)
105 |
106 | if not os.path.exists(metafolder): #if it does not exist, create it
107 | os.makedirs(metafolder)
108 |
109 | outset="marios_xgrank_v3" # prefix for this model's output files
110 |
111 | #model to use
112 |
113 | idex1=[k for k in range( 0,(X.shape[0] * 2)/ 3)]
114 | idex2=[k for k in range( (X.shape[0] * 2)/ 3,X.shape[0] )]
115 | kfolder=[[idex1,idex2]]
116 | #Create Arrays for meta
117 | train_stacker=[ 0.0 for k in range (0,len(idex2)) ]
118 | test_stacker=[0.0 for k in range (0,(X_test.shape[0]))]
119 | # CHECK EVerything in five..it could be more efficient
120 |
121 | #create target variable
122 | mean_kapa = 0.0
123 | #kfolder=StratifiedKFold(y, n_folds=number_of_folds,shuffle=True, random_state=SEED)
124 | #number_of_folds=0
125 | #X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time
126 | i=0 # iterator counter
127 | if Usecv:
128 | print ("starting cross validation")
129 | for train_index, test_index in kfolder:
131 | # create training and validation sets
131 | X_train, X_cv = X[train_index], X[test_index]
132 | y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index]
133 | print (" train size: %d. test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) ))
134 |
135 | if Use_scale:
136 | stda=StandardScaler()
137 | X_train=stda.fit_transform(X_train)
138 | X_cv=stda.transform(X_cv)
139 |
140 | preds=bagged_set(X_train,y_train,model, SEED, 5, X_cv, update_seed=True)
141 |
142 |
143 | # compute AUC for this CV fold
144 | #scalepreds(preds)
145 | kapa = roc_auc_score(y_cv,preds)
146 | print("size train: %d size cv: %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, 1, kapa))
147 |
148 | mean_kapa += kapa
149 | #save the results
150 | no=0
151 | for real_index in test_index:
152 | train_stacker[no]=(preds[no])
153 | no+=1
154 | i+=1
155 | if Usecv:
156 | print (" Average AUC: %f" % (mean_kapa) )
157 | print (" printing train datasets ")
158 | printfilcsve(np.array(train_stacker), metafolder+ outset + "train.csv")
159 |
160 | if Use_scale:
161 | stda=StandardScaler()
162 | X=stda.fit_transform(X)
163 | X_test=stda.transform(X_test)
164 |
165 | #preds=bagged_set(X, y,model, SEED, 1, X_test, update_seed=True)
166 |
167 | preds=bagged_set(X, y,model, SEED , 5, X_test, update_seed=True)
168 |
169 |
170 | for pr in range (0,len(preds)):
171 | test_stacker[pr]=(preds[pr])
172 |
173 | preds=np.array(preds)
174 | printfilcsve(np.array(test_stacker), metafolder+ outset + "test.csv")
175 |
176 |
177 | print("Write results...")
178 | output_file = "submission_"+ outset +str( (mean_kapa ))+".csv"
179 | print("Writing submission to %s" % output_file)
180 | f = open(config.output_loc + output_file, "w")
181 | f.write("id,probability\n")# the header
182 | for g in range(0, len(preds)) :
183 | pr=preds[g]
184 | f.write("%d,%f\n" % (((ids[g]),pr ) ) )
185 | f.close()
186 | print("Done.")
187 |
188 |
189 |
190 |
191 |
192 |
193 | if __name__=="__main__":
194 | main()
195 |
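This variant optionally standardizes the features; note that the scaler is fit on the training rows only and then applied to the held-out or test rows, which avoids leaking their statistics into the fit. A compact sketch of the same pattern using a scikit-learn Pipeline (the logistic-regression stand-in and generated data are assumptions):

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    X, y = make_classification(n_samples=200, random_state=0)
    X_train, y_train, X_holdout = X[:150], y[:150], X[150:]

    pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
    pipe.fit(X_train, y_train)                   # scaler statistics come from X_train only
    print(pipe.predict_proba(X_holdout)[:5, 1])  # holdout rows are scaled with those statistics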
--------------------------------------------------------------------------------