├── .gitignore
├── bin
│   └── .gitignore
├── cache
│   ├── .gitignore
│   └── counts
│       └── .gitignore
├── logs
│   └── .gitignore
├── preds
│   └── .gitignore
├── subm
│   └── .gitignore
├── repack.sh
├── util
│   ├── sklearn_model.py
│   ├── helpers.h
│   ├── data.py
│   ├── meta.py
│   ├── nn-helpers.h
│   ├── xgb_model.py
│   ├── model-helpers.h
│   ├── generation.h
│   ├── keras_model.py
│   ├── __init__.py
│   ├── data.h
│   └── io.h
├── convert-doc-topics.py
├── ftrl-model.h
├── convert-six-hours-after.py
├── ffm-model.h
├── nn-model.h
├── ffm-nn-model.h
├── convert-trfsrc-doc-sources.py
├── convert-trfsrc-docs.py
├── Makefile
├── prepare-events.py
├── prepare-split.py
├── prepare-documents.py
├── train-ad-mean.py
├── train-l2-blend.py
├── feat_Uuid_Source_id.py
├── feat_Uuid_Doc_Srce_id.py
├── train-ffm.py
├── feat_disp_ad_doc_others.R
├── train-ctr-nn.py
├── prepare-rivals.cpp
├── train-vw.py
├── ffm.h
├── feat_Uuid_OneHour_Range.py
├── train-ffm-2.py
├── ftrl-model.cpp
├── export-vw-data.cpp
├── ffm-io.cpp
├── export-np-data-ctr1.py
├── export-ffm-data.cpp
├── prepare-counts.cpp
├── prepare-leak.cpp
├── prepare-similarity.cpp
├── parse-xgb.py
├── train-l2.py
├── ffm-model.cpp
├── nn-model.cpp
├── prepare-group-viewed-docs.cpp
├── prepare-viewed-docs.cpp
├── export-bin-data-p1.cpp
├── export-bin-data-f2.cpp
└── export-bin-data-f4.cpp
/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /bin/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /cache/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /logs/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /preds/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /subm/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /cache/counts/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /repack.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | for src in ../input/*.zip; do 4 | dst=${src/.zip/.gz} 5 | gunzip -c $src | gzip > $dst 6 | done 7 | -------------------------------------------------------------------------------- /util/sklearn_model.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class SklearnModel(object): 4 | 5 | def __init__(self, model): 6 | self.model = model 7 | 8 | def fit(self, train_X, train_y, train_g=None, eval_X=None, eval_y=None, eval_g=None): 9 | self.model.fit(train_X, train_y) 10 | return self 11 | 12 | def predict(self, test_X): 13 | return self.model.predict_proba(test_X)[:, 1] 14 | -------------------------------------------------------------------------------- /convert-doc-topics.py:
-------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def convert_id_list(doc_ids): 5 | return ' '.join(d.split(':')[1] for d in doc_ids.split(' ')) 6 | 7 | 8 | print "Loading uids..." 9 | uids = pd.read_csv('cache/events.csv.gz', usecols=['uuid', 'uid'], index_col='uuid')['uid'].drop_duplicates() 10 | 11 | print "Loading data..." 12 | dts = pd.read_csv('uuid_topc_ids.csv.zip') 13 | dts['uid'] = dts['uuid'].map(uids) 14 | dts['topic_ids'] = dts['document_topc_id'].map(convert_id_list) 15 | 16 | dts[['uid', 'topic_ids']].to_csv('cache/viewed_doc_topics.csv.gz', index=False, compression='gzip') 17 | 18 | print "Done." 19 | -------------------------------------------------------------------------------- /ftrl-model.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ffm.h" 4 | 5 | class ftrl_model { 6 | float * weights_z; 7 | float * weights_n; 8 | 9 | float alpha, beta, l1, l2; 10 | 11 | uint n_bits, n_weights, mask; 12 | public: 13 | ftrl_model(uint n_bits, float alpha, float beta, float l1, float l2); 14 | ~ftrl_model(); 15 | 16 | ffm_float predict(const ffm_feature * start, const ffm_feature * end, ffm_float norm, uint64_t * dropout_mask, float dropout_mult); 17 | void update(const ffm_feature * start, const ffm_feature * end, ffm_float norm, ffm_float kappa, uint64_t * dropout_mask, float dropout_mult); 18 | 19 | uint get_dropout_mask_size(const ffm_feature * start, const ffm_feature * end) { return 0; } 20 | }; 21 | -------------------------------------------------------------------------------- /convert-six-hours-after.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def convert_id_list(doc_ids): 5 | return ' '.join(d.split(':')[1] for d in doc_ids.split(' ')) 6 | 7 | 8 | print "Loading uids..." 9 | uids = pd.read_csv('cache/events.csv.gz', usecols=['uuid', 'uid'], index_col='uuid')['uid'].drop_duplicates() 10 | 11 | print "Loading data..." 12 | oha = pd.read_csv('uuid_siz_hours_after.csv.zip') 13 | oha.rename(columns={' timestamp': 'timestamp', ' document_id': 'document_id'}, inplace=True) 14 | oha['uid'] = oha['uuid'].map(uids) 15 | oha['doc_ids'] = oha['document_id'].map(convert_id_list) 16 | 17 | print "Saving..." 18 | oha[['uid', 'timestamp', 'doc_ids']].to_csv('cache/viewed_docs_six_hours_after.csv.gz', index=False, compression='gzip') 19 | 20 | print "Done." 
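Both converter scripts above rely on the same convert_id_list idiom: each viewed-documents cell holds space-separated 'prefix:id' pairs and only the part after the colon is kept (the meaning of the prefix is not stated in these scripts). A minimal sketch with made-up values:

def convert_id_list(doc_ids):
    return ' '.join(d.split(':')[1] for d in doc_ids.split(' '))

convert_id_list('3:1205 1:87 2:4411')  # -> '1205 87 4411'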
21 | -------------------------------------------------------------------------------- /ffm-model.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ffm.h" 4 | 5 | class ffm_model { 6 | float * ffm_weights; 7 | float * lin_weights; 8 | 9 | float bias_w; 10 | float bias_wg; 11 | 12 | float eta; 13 | float lambda; 14 | 15 | ffm_uint max_b_field; 16 | ffm_uint min_a_field; 17 | public: 18 | ffm_model(int seed, bool restricted, float eta, float lambda); 19 | ~ffm_model(); 20 | 21 | float predict(const ffm_feature * start, const ffm_feature * end, float norm, uint64_t * dropout_mask, float dropout_mult); 22 | void update(const ffm_feature * start, const ffm_feature * end, float norm, float kappa, uint64_t * dropout_mask, float dropout_mult); 23 | 24 | uint get_dropout_mask_size(const ffm_feature * start, const ffm_feature * end); 25 | }; 26 | -------------------------------------------------------------------------------- /util/helpers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | constexpr float pos_time_diff(int64_t td) { 6 | if (td < 0) 7 | return 0; 8 | 9 | return log(1 + td) / 100; 10 | } 11 | 12 | constexpr float time_diff(int64_t td) { 13 | if (td < 0) 14 | return - log(1 - td) / 100; 15 | 16 | return log(1 + td) / 100; 17 | } 18 | 19 | constexpr float logit(float p) { 20 | return log(p / (1-p)); 21 | } 22 | 23 | constexpr float base_ctr = 0.194f; 24 | constexpr float base_ctr_logit = logit(base_ctr); 25 | 26 | constexpr float ctr_logit(float views, float clicks, float reg = 50) { 27 | return logit((clicks + base_ctr * reg) / (views + reg)) - base_ctr_logit; 28 | } 29 | 30 | 31 | template 32 | inline T min(T a, T b) { 33 | return a < b ? 
a : b; 34 | } 35 | -------------------------------------------------------------------------------- /nn-model.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ffm.h" 4 | 5 | class nn_model { 6 | float * lin_w; 7 | float * lin_wg; 8 | 9 | float * l1_w; 10 | float * l1_wg; 11 | 12 | float * l2_w; 13 | float * l2_wg; 14 | 15 | float * l3_w; 16 | float * l3_wg; 17 | 18 | float eta; 19 | float lambda; 20 | 21 | uint max_b_field; 22 | uint min_a_field; 23 | public: 24 | nn_model(int seed, float eta, float lambda); 25 | ~nn_model(); 26 | 27 | float predict(const ffm_feature * start, const ffm_feature * end, float norm, uint64_t * dropout_mask, float dropout_mult); 28 | void update(const ffm_feature * start, const ffm_feature * end, float norm, float kappa, uint64_t * dropout_mask, float dropout_mult); 29 | 30 | uint get_dropout_mask_size(const ffm_feature * start, const ffm_feature * end); 31 | }; 32 | -------------------------------------------------------------------------------- /ffm-nn-model.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ffm.h" 4 | 5 | class ffm_nn_model { 6 | float * ffm_weights; 7 | float * lin_weights; 8 | 9 | float * l1_w; 10 | float * l1_wg; 11 | 12 | float * l2_w; 13 | float * l2_wg; 14 | 15 | float eta, ffm_lambda, nn_lambda; 16 | 17 | uint max_b_field, min_a_field; 18 | public: 19 | ffm_nn_model(int seed, bool restricted, float eta, float ffm_lambda, float nn_lambda); 20 | ~ffm_nn_model(); 21 | 22 | ffm_float predict(const ffm_feature * start, const ffm_feature * end, float norm, uint64_t * dropout_mask, float dropout_mult); 23 | void update(const ffm_feature * start, const ffm_feature * end, float norm, float kappa, uint64_t * dropout_mask, float dropout_mult); 24 | 25 | uint get_dropout_mask_size(const ffm_feature * start, const ffm_feature * end); 26 | }; 27 | -------------------------------------------------------------------------------- /util/data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import os 5 | 6 | 7 | from .meta import input_dir, cache_dir 8 | 9 | 10 | def read_events(): 11 | return pd.read_csv(os.path.join(cache_dir, 'events.csv.gz'), index_col='display_id', dtype={'document_id': np.uint32, 'platform': np.uint8, 'region': np.uint32}) 12 | 13 | 14 | def read_documents(): 15 | return pd.read_csv(os.path.join(cache_dir, 'documents.csv.gz'), index_col='document_id', dtype={'document_id': np.uint32, 'source_id': np.int32, 'publisher_id': np.int16}, parse_dates=['publish_time']) 16 | 17 | 18 | def read_ads(): 19 | df = pd.read_csv(os.path.join(input_dir, 'promoted_content.csv.gz'), index_col='ad_id', dtype={'ad_id': np.uint32, 'document_id': np.uint32, 'campaign_id': np.uint16, 'advertiser_id': np.uint16}) 20 | df.rename(columns={'document_id': 'ad_document_id'}, inplace=True) 21 | 22 | return df 23 | -------------------------------------------------------------------------------- /convert-trfsrc-doc-sources.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def convert_ids_filtered(field): 5 | def convert(doc_ids): 6 | triples = (d.split(':') for d in doc_ids.split(' ')) 7 | 8 | return ' '.join(t[1] for t in triples if int(t[0]) == field) 9 | 10 | return convert 11 | 12 | 13 | print "Loading uids..." 
14 | uids = pd.read_csv('cache/events.csv.gz', usecols=['uuid', 'uid'], index_col='uuid')['uid'].drop_duplicates() 15 | 16 | print "Loading data..." 17 | dts = pd.read_csv('uuid_docsrce_trfc_srce_ids.csv.zip') 18 | dts['uid'] = dts['uuid'].map(uids) 19 | dts['doc_src_int'] = dts['document_id'].map(convert_ids_filtered(32)) 20 | dts['doc_src_soc'] = dts['document_id'].map(convert_ids_filtered(33)) 21 | dts['doc_src_srh'] = dts['document_id'].map(convert_ids_filtered(34)) 22 | 23 | print "Saving..." 24 | dts[['uid', 'doc_src_int', 'doc_src_soc', 'doc_src_srh']].to_csv('cache/viewed_trfsrc_doc_sources.csv.gz', index=False, compression='gzip') 25 | 26 | print "Done." 27 | -------------------------------------------------------------------------------- /util/meta.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | input_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..', 'input') 4 | cache_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'cache') 5 | 6 | 7 | full_split = (os.path.join(input_dir, 'clicks_train.csv.gz'), os.path.join(input_dir, 'clicks_test.csv.gz')) 8 | 9 | cv1_split = (os.path.join(cache_dir, 'clicks_cv1_train.csv.gz'), os.path.join(cache_dir, 'clicks_cv1_test.csv.gz')) 10 | cv1_split_idx = (os.path.join(cache_dir, 'clicks_cv1_train_idx.csv.gz'), os.path.join(cache_dir, 'clicks_cv1_test_idx.csv.gz')) 11 | 12 | cv2_split = (os.path.join(cache_dir, 'clicks_cv2_train.csv.gz'), os.path.join(cache_dir, 'clicks_cv2_test.csv.gz')) 13 | 14 | cv1_split_time = 950400000 15 | test_split_time = 1123200000 16 | 17 | row_counts = { 18 | 'cv2_train': 14164401, 19 | 'cv2_test': 6484938, 20 | 'cv1_train': 62252998, 21 | 'cv1_test': 24888733, 22 | 'full_train': 87141731, 23 | 'full_test': 32225162 24 | } 25 | -------------------------------------------------------------------------------- /convert-trfsrc-docs.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def convert_ids_filtered(field): 5 | def convert(doc_ids): 6 | triples = (d.split(':') for d in doc_ids.split(' ')) 7 | 8 | return ' '.join(t[1] for t in triples if int(t[0]) == field) 9 | 10 | return convert 11 | 12 | 13 | print "Loading uids..." 14 | uids = pd.read_csv('cache/events.csv.gz', usecols=['uuid', 'uid'], index_col='uuid')['uid'].drop_duplicates() 15 | 16 | print "Loading data..." 17 | dts = pd.read_csv('uuid_doc_trfc_srce_ids.csv.gz') 18 | dts['uid'] = dts['uuid'].map(uids) 19 | dts['doc_src_int'] = dts['document_id_trfc_srce'].map(convert_ids_filtered(32)) 20 | dts['doc_src_soc'] = dts['document_id_trfc_srce'].map(convert_ids_filtered(33)) 21 | dts['doc_src_srh'] = dts['document_id_trfc_srce'].map(convert_ids_filtered(34)) 22 | 23 | print "Saving..." 24 | dts[['uid', 'doc_src_int', 'doc_src_soc', 'doc_src_srh']].to_csv('cache/viewed_trfsrc_docs.csv.gz', index=False, compression='gzip') 25 | 26 | print "Done." 
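The convert_ids_filtered helper shared by the two traffic-source converters above keeps only the ids whose prefix equals the requested traffic-source code (32, 33 and 34, which the scripts write to the doc_src_int, doc_src_soc and doc_src_srh columns). A small hedged example with invented ids:

convert_int = convert_ids_filtered(32)
convert_int('32:101 33:202 32:303 34:404')  # -> '101 303'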
27 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CXX = g++ 2 | CXXFLAGS = -Wall -O3 -std=c++14 -march=native -fopenmp 3 | 4 | TARGETS = bin/prepare-leak bin/prepare-similarity bin/prepare-counts bin/prepare-rivals 5 | TARGETS += bin/prepare-viewed-ads bin/prepare-viewed-docs bin/prepare-group-viewed-docs 6 | TARGETS += bin/export-vw-data bin/export-ffm-data bin/export-bin-data-p1 bin/export-bin-data-f1 bin/export-bin-data-f2 bin/export-bin-data-f3 bin/export-bin-data-f4 bin/export-bin-data-f5 7 | TARGETS += bin/ffm 8 | 9 | 10 | all: $(TARGETS) 11 | 12 | bin/%.o: %.cpp 13 | $(CXX) $(CXXFLAGS) $(DFLAG) -MMD -c -o $@ $< 14 | 15 | bin/%: bin/%.o 16 | $(CXX) $(CXXFLAGS) -o $@ $^ -lboost_iostreams -lboost_program_options 17 | 18 | 19 | bin/ffm: bin/ffm-io.o bin/ffm-model.o bin/ffm-nn-model.o bin/ftrl-model.o bin/nn-model.o 20 | bin/export-bin-data-p1: bin/ffm-io.o 21 | bin/export-bin-data-f1: bin/ffm-io.o 22 | bin/export-bin-data-f2: bin/ffm-io.o 23 | bin/export-bin-data-f3: bin/ffm-io.o 24 | bin/export-bin-data-f4: bin/ffm-io.o 25 | bin/export-bin-data-f5: bin/ffm-io.o 26 | 27 | -include bin/*.d 28 | 29 | .PHONY: clean 30 | clean: 31 | rm bin/* 32 | -------------------------------------------------------------------------------- /prepare-events.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import os 5 | 6 | from util.meta import cache_dir, input_dir 7 | 8 | 9 | def encode_feature(values): 10 | uniq = values.unique() 11 | mapping = dict(zip(uniq, range(1, len(uniq) + 1))) 12 | 13 | return values.map(mapping) 14 | 15 | 16 | df = pd.read_csv(os.path.join(input_dir, 'events.csv.zip'), index_col='display_id', dtype={'document_id': np.uint32, 'timestamp': np.uint32}) 17 | df['platform'] = df['platform'].replace({'\N': 0}).astype(np.uint8) 18 | 19 | location = df['geo_location'].fillna('').str.split('>') 20 | 21 | df['country'] = location.map(lambda loc: loc[0] if len(loc) > 0 else 'Z') 22 | df['state'] = location.map(lambda loc: loc[1] if len(loc) > 1 else 'Z') 23 | df['region'] = location.map(lambda loc: int(loc[2]) if len(loc) > 2 else -1) 24 | 25 | df["hour"] = (df["timestamp"] // (3600 * 1000)) % 24 26 | df["weekday"] = df["timestamp"] // (3600 * 24 * 1000) 27 | 28 | df["uid"] = encode_feature(df["uuid"]) 29 | 30 | df.to_csv(os.path.join(cache_dir, 'events.csv.gz'), compression='gzip') 31 | 32 | print "Done." 
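For orientation, a hedged walk-through of the derived event features above, using a made-up row (the 'country>state>region' layout of geo_location and the millisecond timestamps are inferred from the parsing code):

loc = 'US>CA>803'.split('>')              # country 'US', state 'CA', region 803
hour = (75600000 // (3600 * 1000)) % 24   # 75,600,000 ms since the dataset start -> hour 21
weekday = 75600000 // (3600 * 24 * 1000)  # -> weekday index 0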
33 | -------------------------------------------------------------------------------- /util/nn-helpers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "model-helpers.h" 4 | 5 | 6 | inline void backward_pass(uint input_size, float * input, float * input_grad, float * w, float * wg, float grad, float eta, float lambda) { 7 | __m256 ymm_eta = _mm256_set1_ps(eta); 8 | __m256 ymm_lambda = _mm256_set1_ps(lambda); 9 | __m256 ymm_grad = _mm256_set1_ps(grad); 10 | 11 | for (uint i = 0; i < input_size; i += 8) { 12 | __m256 ymm_w = _mm256_load_ps(w + i); 13 | 14 | __m256 ymm_g = ymm_lambda * ymm_w + ymm_grad * _mm256_load_ps(input + i); 15 | __m256 ymm_wg = _mm256_load_ps(wg + i) + ymm_g * ymm_g; 16 | 17 | _mm256_store_ps(input_grad + i, ymm_grad * ymm_w + _mm256_load_ps(input_grad + i)); 18 | 19 | _mm256_store_ps(w + i, ymm_w - ymm_eta * ymm_g * _mm256_rsqrt_ps(ymm_wg)); 20 | _mm256_store_ps(wg + i, ymm_wg); 21 | } 22 | } 23 | 24 | inline float forward_pass(uint input_size, float * input, float * w) { 25 | __m256 ymm_total = _mm256_set1_ps(0); 26 | 27 | for (uint i = 0; i < input_size; i += 8) 28 | ymm_total += _mm256_load_ps(input + i) * _mm256_load_ps(w + i); 29 | 30 | return sum(ymm_total); 31 | } 32 | -------------------------------------------------------------------------------- /prepare-split.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from util.meta import cv1_split, cv1_split_idx, cv1_split_time 5 | 6 | print "Loading train..." 7 | 8 | train = pd.read_csv("../input/clicks_train.csv.gz", dtype=np.int32) 9 | train.reset_index(inplace=True) 10 | train.rename(columns={'index': 'idx'}, inplace=True) 11 | 12 | print "Loading events..." 13 | 14 | events = pd.read_csv("../input/events.csv.gz", dtype=np.int32, index_col=0, usecols=[0, 3]) # Load events 15 | events = events.loc[events.index.intersection(train.index.unique())] # Take only train events 16 | 17 | print "Splitting events..." 18 | 19 | # Select display_ids for val - consists of time-based and sampled parts 20 | train_is_val = train['display_id'].isin(events[(events['timestamp'] >= cv1_split_time) | (events.index % 6 == 5)].index) 21 | 22 | del events 23 | 24 | print "Splitting clicks..." 25 | 26 | train.loc[~train_is_val, ['idx']].to_csv(cv1_split_idx[0], index=False, compression='gzip') 27 | train.loc[~train_is_val, ['display_id', 'ad_id', 'clicked']].to_csv(cv1_split[0], index=False, compression='gzip') 28 | 29 | train.loc[train_is_val, ['idx']].to_csv(cv1_split_idx[1], index=False, compression='gzip') 30 | train.loc[train_is_val, ['display_id', 'ad_id', 'clicked']].to_csv(cv1_split[1], index=False, compression='gzip') 31 | 32 | del train 33 | 34 | print "Done." 35 | -------------------------------------------------------------------------------- /util/xgb_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import xgboost as xgb 3 | 4 | from sklearn.metrics import log_loss 5 | 6 | from . 
import score_sorted 7 | 8 | 9 | def y_hash(y): 10 | return hash(tuple(np.where(y[:200])[0]) + tuple(np.where(y[-200:])[0])) 11 | 12 | 13 | class XgbModel(object): 14 | 15 | def __init__(self, n_iter, **params): 16 | self.n_iter = n_iter 17 | self.params = params 18 | 19 | def fit(self, train_X, train_y, train_g=None, eval_X=None, eval_y=None, eval_g=None): 20 | params = self.params.copy() 21 | 22 | dtrain = xgb.DMatrix(train_X, label=train_y) 23 | 24 | groups = {y_hash(train_y): train_g} 25 | 26 | if eval_X is None: 27 | watchlist = [(dtrain, 'train')] 28 | else: 29 | deval = xgb.DMatrix(eval_X, label=eval_y) 30 | watchlist = [(deval, 'eval'), (dtrain, 'train')] 31 | groups[y_hash(eval_y)] = eval_g 32 | 33 | def feval(y_pred, dtrain): 34 | y_true = dtrain.get_label() 35 | y_group = groups[y_hash(y_true)] 36 | 37 | return [ 38 | ('loss', log_loss(y_true, y_pred)), 39 | ('map', score_sorted(y_true, y_pred, y_group)), 40 | ] 41 | 42 | self.model = xgb.train(params, dtrain, self.n_iter, watchlist, feval=feval, verbose_eval=20) 43 | 44 | return self 45 | 46 | def predict(self, test_X): 47 | return self.model.predict(xgb.DMatrix(test_X)) 48 | -------------------------------------------------------------------------------- /prepare-documents.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import os 5 | 6 | from util.meta import cache_dir, input_dir 7 | 8 | 9 | def fix_date(d): 10 | if type(d) in [str, unicode] and d.startswith('30'): 11 | d = d.replace('30', '20', 1) 12 | elif type(d) in [str, unicode] and d.startswith('00'): 13 | d = d.replace('00', '20', 1) 14 | 15 | return d 16 | 17 | 18 | def encode_feature(values): 19 | uniq = values.unique() 20 | mapping = dict(zip(uniq, range(1, len(uniq) + 1))) 21 | 22 | return values.map(mapping) 23 | 24 | 25 | df = pd.read_csv(os.path.join(input_dir, 'documents_meta.csv.zip'), index_col='document_id', dtype={'document_id': np.uint32}) 26 | df['source_id'] = df['source_id'].fillna(-1).astype(np.int32) 27 | df['publisher_id'] = df['publisher_id'].fillna(-1).astype(np.int16) 28 | df['publish_time'] = pd.to_datetime(df['publish_time'].map(fix_date).replace('nan', np.nan), errors='coerce') 29 | df['publish_timestamp'] = (df['publish_time'].astype(np.int64) // 1000000 - 1465876799998).clip(lower=-1000000000000) 30 | 31 | df.to_csv(os.path.join(cache_dir, 'documents.csv.gz'), compression='gzip') 32 | 33 | ## Document entities 34 | 35 | df = pd.read_csv(os.path.join(input_dir, 'documents_entities.csv.zip'), index_col='document_id', dtype={'document_id': np.uint32}) 36 | df['entity_id'] = encode_feature(df['entity_id']) 37 | df.to_csv(os.path.join(cache_dir, 'documents_entities.csv.gz'), compression='gzip') 38 | 39 | print "Done." 
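Two hedged examples of the fix_date repair above (the input strings are hypothetical): years mistyped as '30xx' or '00xx' are rewritten to '20xx' before parsing, and publish_timestamp then becomes the parsed time in milliseconds minus 1465876799998 (the constant the script uses as its reference epoch), clipped from below.

fix_date('3016-06-20')  # -> '2016-06-20'
fix_date('0016-06-20')  # -> '2016-06-20'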
40 | -------------------------------------------------------------------------------- /train-ad-mean.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from util.meta import full_split, val_split 5 | from util import gen_prediction_name, gen_submission, score_prediction 6 | 7 | reg = 10.0 8 | 9 | 10 | def fit_predict(train_file, pred_file): 11 | train = pd.read_csv(train_file, dtype=np.int32) 12 | train_pos = train[train['clicked'] == 1] 13 | 14 | ad_cnt = train['ad_id'].value_counts() 15 | ad_pos_cnt = train_pos['ad_id'].value_counts() 16 | 17 | del train, train_pos 18 | 19 | test = pd.read_csv(pred_file, dtype=np.int32) 20 | test['pred'] = test['ad_id'].map(ad_pos_cnt).fillna(0) / (test['ad_id'].map(ad_cnt).fillna(0) + reg) 21 | 22 | return test 23 | 24 | ## Validating 25 | 26 | print "Running on validation split..." 27 | 28 | pred = fit_predict(val_split[0], val_split[1]) 29 | 30 | print "Scoring..." 31 | 32 | present_score, future_score, total_score = score_prediction(pred) 33 | name = gen_prediction_name('ad-mean', total_score) 34 | 35 | print " Present score: %.5f" % present_score 36 | print " Future score: %.5f" % future_score 37 | print " Total score: %.5f" % total_score 38 | 39 | pred[['display_id', 'ad_id', 'pred']].to_pickle('preds/%s-val.pickle' % name) 40 | 41 | ## Predicting 42 | 43 | print "Running on full split..." 44 | 45 | pred = fit_predict(full_split[0], full_split[1]) 46 | pred[['display_id', 'ad_id', 'pred']].to_pickle('preds/%s-test.pickle' % name) 47 | 48 | print " Generating submission..." 49 | subm = gen_submission(pred) 50 | subm.to_csv('subm/%s.csv.gz' % name, index=False, compression='gzip') 51 | 52 | print " File name: %s" % name 53 | print "Done." 54 | -------------------------------------------------------------------------------- /train-l2-blend.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from util.meta import full_split, val_split 4 | from util import gen_prediction_name, gen_submission, score_prediction 5 | 6 | preds = [ 7 | ('20170108-2008-ffm2-f1-0.68984', 1.3), 8 | 9 | ('20170107-2248-ffm2-p1-0.68876', 0.9), 10 | ('20170108-0345-ffm2-p2-0.68762', 0.7), 11 | 12 | ('20161230-1323-ffm-p1-0.68204', 0.2), 13 | ('20161230-1049-ffm-p2-0.68169', 0.2), 14 | 15 | ('20161231-0544-vw-p1-0.67309', 0.07), 16 | ('20161231-1927-vw-p2-0.66718', 0.03), 17 | 18 | ('20170106-1339-vw-p1-0.67829', 0.1), 19 | ] 20 | 21 | 22 | def fit_predict(split, split_name): 23 | pred = pd.read_csv(split[1]) 24 | pred['pred'] = sum(pd.read_pickle('preds/%s-%s.pickle' % (p, 'test' if split_name == 'full' else 'val'))['pred'] * w for p, w in preds) 25 | 26 | return pred 27 | 28 | 29 | ## Validation 30 | 31 | print "Validation split..." 32 | 33 | pred = fit_predict(val_split, 'val') 34 | 35 | print " Scoring..." 36 | 37 | present_score, future_score, score = score_prediction(pred) 38 | name = gen_prediction_name('l2-blend', score) 39 | 40 | print " Present score: %.5f" % present_score 41 | print " Future score: %.5f" % future_score 42 | print " Total score: %.5f" % score 43 | 44 | pred[['pred']].to_pickle('preds/%s-val.pickle' % name) 45 | 46 | del pred 47 | 48 | ## Prediction 49 | 50 | print "Full split..." 51 | 52 | pred = fit_predict(full_split, 'full') 53 | pred[['pred']].to_pickle('preds/%s-test.pickle' % name) 54 | 55 | print " Generating submission..." 
56 | subm = gen_submission(pred) 57 | subm.to_csv('subm/%s.csv.gz' % name, index=False, compression='gzip') 58 | 59 | del pred, subm 60 | 61 | print " File name: %s" % name 62 | print "Done." 63 | -------------------------------------------------------------------------------- /util/model-helpers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | // Define intrinsic missing in gcc 8 | #define _mm256_set_m128(v0, v1) _mm256_insertf128_ps(_mm256_castps128_ps256(v1), (v0), 1) 9 | 10 | 11 | 12 | constexpr ffm_uint align_bytes = 32; 13 | constexpr ffm_uint align_floats = align_bytes / sizeof(float); 14 | 15 | 16 | inline float sum(__m256 val) { 17 | __m128 s = _mm256_extractf128_ps(_mm256_add_ps(val, _mm256_permute2f128_ps(val, val, 1)), 0); 18 | 19 | s = _mm_hadd_ps(s, s); 20 | s = _mm_hadd_ps(s, s); 21 | 22 | float sum; 23 | _mm_store_ss(&sum, s); 24 | 25 | return sum; 26 | } 27 | 28 | constexpr uint aligned_float_array_size(uint cnt) { 29 | return ((cnt - 1) / align_floats + 1) * align_floats; 30 | } 31 | 32 | 33 | template 34 | inline T * malloc_aligned(size_t size) { 35 | void *ptr; 36 | 37 | int status = posix_memalign(&ptr, align_bytes, size*sizeof(T)); 38 | 39 | if(status != 0) 40 | throw std::bad_alloc(); 41 | 42 | return (T*) ptr; 43 | } 44 | 45 | 46 | template 47 | inline void fill_with_zero(T * weights, size_t n) { 48 | T * w = weights; 49 | 50 | for(size_t i = 0; i < n; i++) 51 | *w++ = T(0); 52 | } 53 | 54 | 55 | template 56 | static void fill_with_rand(ffm_float * weights, ffm_uint n, D gen, std::default_random_engine & rnd) { 57 | ffm_float * w = weights; 58 | 59 | for(ffm_uint i = 0; i < n; i++) { 60 | *w++ = gen(rnd); 61 | } 62 | } 63 | 64 | 65 | template 66 | inline void fill_with_ones(T * weights, size_t n) { 67 | T * w = weights; 68 | 69 | for(size_t i = 0; i < n; i++) 70 | *w++ = T(1); 71 | } 72 | 73 | 74 | inline uint test_mask_bit(uint64_t * mask, uint i) { 75 | return (mask[i >> 6] >> (i & 63)) & 1; 76 | } 77 | 78 | 79 | template 80 | inline T min(T a, T b) { 81 | return a < b ? a : b; 82 | } 83 | 84 | 85 | template 86 | inline int sgn(T val) { 87 | return (T(0) < val) - (val < T(0)); 88 | } 89 | 90 | 91 | inline float relu(float val) { 92 | return val > 0 ? val : 0; 93 | } 94 | 95 | inline bool isnan(float val) { 96 | return val != val; 97 | } 98 | -------------------------------------------------------------------------------- /feat_Uuid_Source_id.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | This script will find the source (found in documents_meta) of each document clicked by a user. 4 | This will be joined onto the training file with a key on the user. 5 | This will be good to capture the sparse documents which would have been excluded from 6 | other page_view features because the doucment was too sparse. 7 | The script shuld be executed with pypy for faster execution. 
8 | @author: darragh 9 | """ 10 | 11 | import csv, os, gzip 12 | 13 | input_dir = os.getenv('INPUT', '../input') 14 | 15 | uuid_ev = {} # a check for if the user exists in events 16 | uuid_uid = {} # Map of uuids to numeric ids 17 | doc_srce = {} # From documents_meta, the source of each document 18 | 19 | for c, row in enumerate(csv.DictReader(gzip.open(input_dir + '/documents_meta.csv.gz'))): 20 | if row['source_id'] != '': 21 | doc_srce[row['document_id']] = row['source_id'] 22 | 23 | for c, row in enumerate(csv.DictReader(gzip.open('cache/events.csv.gz'))): 24 | if row['uuid'] != '': 25 | uuid_ev[row['uuid']] = 1 26 | uuid_uid[row['uuid']] = row['uid'] 27 | 28 | count = 0 29 | outfile = "cache/viewed_doc_sources.csv.gz" 30 | filename = input_dir + '/page_views.csv.gz' 31 | 32 | # loop through the documents per user and get the source of the documents per user 33 | for c, row in enumerate(csv.DictReader(gzip.open(filename))): 34 | if c % 1000000 == 0: 35 | print (c, count) 36 | 37 | if row['document_id'] not in doc_srce: 38 | continue 39 | if row['uuid'] not in uuid_ev: 40 | continue 41 | 42 | if uuid_ev[row['uuid']] == 1: 43 | uuid_ev[row['uuid']] = set() 44 | 45 | lu = len(uuid_ev[row['uuid']]) 46 | uuid_ev[row['uuid']].add(doc_srce[row['document_id']]) 47 | 48 | if lu != len(uuid_ev[row['uuid']]): 49 | count += 1 50 | 51 | # Delete output file if it already exists 52 | try: 53 | os.remove(outfile) 54 | except OSError: 55 | pass 56 | 57 | # Open the file to write to 58 | fo = gzip.open(outfile, 'w') 59 | fo.write('uuid,source_id\n') 60 | for i in uuid_ev: 61 | if uuid_ev[i] != 1: 62 | tmp = list(uuid_ev[i]) 63 | fo.write('%s,%s\n' % (uuid_uid[i], ' '.join(tmp))) 64 | del tmp 65 | fo.close() 66 | -------------------------------------------------------------------------------- /util/generation.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "io.h" 4 | 5 | template 6 | void generate_files(const D & data, const std::vector, std::string>> & filesets) { 7 | using namespace std; 8 | using namespace boost::iostreams; 9 | 10 | for (auto it = filesets.begin(); it != filesets.end(); ++ it) { 11 | auto in_file_name = it->first[0]; 12 | auto out_file_name = it->second; 13 | 14 | W writer(out_file_name); 15 | 16 | cout << " Generating " << out_file_name << "... "; 17 | cout.flush(); 18 | 19 | time_t begin = time(nullptr); 20 | 21 | vector> in_files; 22 | 23 | for (auto in_it = it->first.begin(); in_it != it->first.end(); ++in_it) 24 | in_files.push_back(unique_ptr(new compressed_csv_file(*in_it))); 25 | 26 | for (int i = 0;; ++ i) { 27 | vector> rows; 28 | 29 | for (auto in_it = in_files.begin(); in_it != in_files.end(); ++in_it) 30 | rows.push_back((*in_it)->getrow()); 31 | 32 | if (rows[0].empty()) 33 | break; 34 | 35 | writer.write(data, rows); 36 | 37 | if (i > 0 && i % 5000000 == 0) { 38 | cout << (i / 1000000) << "M... 
"; 39 | cout.flush(); 40 | } 41 | } 42 | 43 | writer.finish(); 44 | 45 | cout << "done in " << (time(nullptr) - begin) << " seconds" << endl; 46 | }; 47 | } 48 | 49 | auto build_filesets(const std::vector> & files, const std::vector & features, const std::string & out_suffix) { 50 | using namespace std; 51 | 52 | vector, string>> filesets; 53 | 54 | for (auto fi = files.begin(); fi != files.end(); ++ fi) { 55 | vector inputs; 56 | 57 | inputs.push_back(fi->first); 58 | 59 | for (auto ffi = features.begin(); ffi != features.end(); ++ ffi) 60 | inputs.push_back(string("cache/") + (*ffi) + string("_") + fi->second + string(".csv.gz")); 61 | 62 | filesets.push_back(make_pair(inputs, string("cache/") + fi->second + out_suffix)); 63 | } 64 | 65 | return filesets; 66 | } 67 | -------------------------------------------------------------------------------- /util/keras_model.py: -------------------------------------------------------------------------------- 1 | from keras.models import Sequential 2 | from keras.layers import Dense, Dropout 3 | from keras.layers.advanced_activations import PReLU 4 | from keras.layers.normalization import BatchNormalization 5 | from keras import regularizers 6 | 7 | from sklearn.preprocessing import StandardScaler 8 | 9 | 10 | def regularizer(params): 11 | if 'l1' in params and 'l2' in params: 12 | return regularizers.l1l2(params['l1'], params['l2']) 13 | elif 'l1' in params: 14 | return regularizers.l1(params['l1']) 15 | elif 'l2' in params: 16 | return regularizers.l2(params['l2']) 17 | else: 18 | return None 19 | 20 | 21 | def nn_mlp_2(input_shape, params): 22 | model = Sequential() 23 | 24 | for i, layer_size in enumerate(params['layers']): 25 | reg = regularizer(params) 26 | 27 | if i == 0: 28 | model.add(Dense(layer_size, init='he_normal', W_regularizer=reg, input_shape=input_shape)) 29 | else: 30 | model.add(Dense(layer_size, init='he_normal', W_regularizer=reg)) 31 | 32 | model.add(PReLU()) 33 | 34 | if params.get('batch_norm', False): 35 | model.add(BatchNormalization()) 36 | 37 | if 'dropouts' in params: 38 | model.add(Dropout(params['dropouts'][i])) 39 | 40 | model.add(Dense(1, init='glorot_normal', activation='sigmoid')) 41 | 42 | return model 43 | 44 | 45 | class KerasModel(object): 46 | 47 | def __init__(self, **params): 48 | self.arch = nn_mlp_2 49 | self.params = params 50 | 51 | def fit(self, train_X, train_y, train_g=None, eval_X=None, eval_y=None, eval_g=None): 52 | self.model = self.arch((train_X.shape[1],), self.params) 53 | self.model.compile(optimizer='adadelta', loss='binary_crossentropy') 54 | 55 | self.scaler = StandardScaler() 56 | 57 | train_X = self.scaler.fit_transform(train_X) 58 | 59 | if eval_X is not None: 60 | eval_X = self.scaler.transform(eval_X) 61 | 62 | callbacks = [] 63 | 64 | self.model.fit( 65 | x=train_X, y=train_y, 66 | batch_size=self.params.get('batch_size', 32), nb_epoch=self.params['n_epoch'], 67 | validation_data=(None if eval_X is None else (eval_X, eval_y)), 68 | verbose=1, callbacks=callbacks) 69 | 70 | return self 71 | 72 | def predict(self, test_X): 73 | test_X = self.scaler.transform(test_X) 74 | 75 | return self.model.predict(test_X).flatten() 76 | -------------------------------------------------------------------------------- /feat_Uuid_Doc_Srce_id.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | This script will find all documents clicked historically by a user. 
4 | However for extra granularity we store separately whether the traffic source of the document 5 | was 'internal', 'social' or 'search'. For each, it is stored in a separate FFM namespace. 6 | We only store documents which occurred at least 100 times in the events file. 7 | The script should be executed with pypy for faster execution. 8 | Code leveraged from rcarson @ https://www.kaggle.com/jiweiliu/outbrain-click-prediction/extract-leak-in-30-mins-with-small-memory 9 | @author: darragh 10 | """ 11 | import csv, os, gzip 12 | 13 | input_dir = os.getenv('INPUT', '../input') 14 | 15 | # define dictionaries 16 | uuid_uid = {} # Map of uuids to numeric ids 17 | uuid_ev = {} # a check for whether the user exists in events 18 | ctdoc = {} # check if the document occurred at least 100 times. 19 | 20 | for c, row in enumerate(csv.DictReader(gzip.open('cache/events.csv.gz'))): 21 | if row['uuid'] != '': 22 | uuid_ev[row['uuid']] = 1 23 | uuid_uid[row['uuid']] = row['uid'] 24 | if row['document_id'] not in ctdoc: 25 | ctdoc[row['document_id']] = 1 26 | else: 27 | ctdoc[row['document_id']] += 1 28 | print('all docs : ' + str(len(ctdoc))) 29 | ctdoc = { key:value for key, value in ctdoc.items() if value > 100 } 30 | print('common docs > 100 : ' + str(len(ctdoc))) 31 | 32 | count = 0 33 | outfile = "cache/viewed_doc_trf_source.csv.gz" 34 | filename = input_dir + '/page_views.csv.gz' 35 | # filename = input_dir + '/page_views_sample.csv.gz' # comment this out locally 36 | 37 | for c, row in enumerate(csv.DictReader(gzip.open(filename))): 38 | if c % 1000000 == 0: 39 | print (c, count) 40 | if row['document_id'] not in ctdoc: 41 | continue 42 | if row['uuid'] not in uuid_ev: 43 | continue 44 | 45 | if uuid_ev[row['uuid']] == 1: 46 | uuid_ev[row['uuid']] = set() 47 | lu = len(uuid_ev[row['uuid']]) 48 | uuid_ev[row['uuid']].add(row['document_id']) 49 | if lu != len(uuid_ev[row['uuid']]): 50 | count += 1 51 | 52 | # Delete output file if it already exists 53 | try: 54 | os.remove(outfile) 55 | except OSError: 56 | pass 57 | 58 | # Open the file to write to 59 | fo = gzip.open(outfile, 'w') 60 | fo.write('uuid,doc_trf_ids\n') 61 | for i in uuid_ev: 62 | if uuid_ev[i] != 1: 63 | tmp = list(uuid_ev[i]) 64 | fo.write('%s,%s\n' % (uuid_uid[i], ' '.join(tmp))) 65 | del tmp 66 | fo.close() 67 | -------------------------------------------------------------------------------- /train-ffm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | import os 5 | import argparse 6 | 7 | from util.meta import full_split, val_split 8 | from util import gen_prediction_name, gen_submission, score_prediction, print_and_exec 9 | 10 | 11 | def fit_predict(profile, split, split_name): 12 | train_file = 'cache/%s_train_ffm.txt' % split_name 13 | pred_file = 'cache/%s_test_ffm.txt' % split_name 14 | 15 | print " Training..." 16 | 17 | opts = profile['options'] 18 | 19 | if split_name == "val": 20 | opts += " -p cache/val_test_ffm.txt" 21 | 22 | print_and_exec("ffm-train %s %s /tmp/ffm.model" % (opts, train_file)) 23 | 24 | print " Predicting..."
25 | 26 | print_and_exec("ffm-predict %s /tmp/ffm.model /tmp/ffm.preds" % pred_file) 27 | 28 | pred = pd.read_csv(split[1]) 29 | pred['pred'] = np.loadtxt('/tmp/ffm.preds') 30 | 31 | return pred 32 | 33 | 34 | profiles = { 35 | 'p1': { 36 | 'options': "-t 3 -s 4", 37 | }, 38 | 39 | 'p2': { 40 | 'options': "-l 0.00005 -r 0.1 -t 13 -s 9", 41 | }, 42 | } 43 | 44 | 45 | parser = argparse.ArgumentParser(description='Train FFM model') 46 | parser.add_argument('profile', type=str, help='Train profile') 47 | parser.add_argument('--rewrite-cache', action='store_true', help='Drop cache files prior to train') 48 | 49 | args = parser.parse_args() 50 | profile = profiles[args.profile] 51 | 52 | 53 | if not os.path.exists('cache/val_train_ffm.txt') or args.rewrite_cache: 54 | print "Generating data..." 55 | os.system("bin/export-ffm-data") 56 | 57 | 58 | ## Validation 59 | 60 | print "Validation split..." 61 | 62 | pred = fit_predict(profile, val_split, 'val') 63 | 64 | print " Scoring..." 65 | 66 | present_score, future_score, score = score_prediction(pred) 67 | name = gen_prediction_name('ffm-%s' % args.profile, score) 68 | 69 | print " Present score: %.5f" % present_score 70 | print " Future score: %.5f" % future_score 71 | print " Total score: %.5f" % score 72 | 73 | pred[['pred']].to_csv('preds/%s-val.csv.gz' % name, index=False, compression='gzip') 74 | 75 | del pred 76 | 77 | ## Prediction 78 | 79 | print "Full split..." 80 | 81 | pred = fit_predict(profile, full_split, 'full') 82 | pred[['pred']].to_csv('preds/%s-test.csv.gz' % name, index=False, compression='gzip') 83 | 84 | print " Generating submission..." 85 | subm = gen_submission(pred) 86 | subm.to_csv('subm/%s.csv.gz' % name, index=False, compression='gzip') 87 | 88 | del pred, subm 89 | 90 | print " File name: %s" % name 91 | print "Done." 92 | -------------------------------------------------------------------------------- /feat_disp_ad_doc_others.R: -------------------------------------------------------------------------------- 1 | # """ 2 | # Here we create a feature keyed on display_id, where for each display, we hash the interaction 3 | # of the display document, and each of the ads on the display. 4 | # We use this feature by joining it back to the clicks_train and clicks_test files, so that 5 | # each row has information on the ads it it is competing against. 
6 | # To run this script, execute the following command from the main folder: 7 | # Rscript final/feat_disp_ad_doc_others.R 8 | # """ 9 | 10 | 11 | cat("Set up packages") 12 | rm(list=ls()) 13 | if (!require("data.table")) install.packages("data.table") 14 | library(data.table) 15 | gc() 16 | 17 | ################################################################################################ 18 | # ad_id & document interaction per display 19 | ################################################################################################ 20 | 21 | D = 2^22 # Hash value 22 | 23 | input_dir = Sys.getenv("INPUT", "../input") 24 | 25 | cat("load and join the clicks and events data") 26 | ctrnraw = fread(paste0("gunzip -c ", input_dir, "/clicks_train.csv.gz"), select=c("display_id", "ad_id")) 27 | ctstraw = fread(paste0("gunzip -c ", input_dir, "/clicks_test.csv.gz"), select=c("display_id", "ad_id")) 28 | craw = rbind(ctrnraw, ctstraw) 29 | rm(ctrnraw, ctstraw); gc() 30 | event = fread(paste0("gunzip -c ", input_dir, "/events.csv.gz"), select=c("display_id", "document_id")) 31 | craw = merge(craw, event, all.x = T, by = "display_id") 32 | rm(event); gc() 33 | 34 | cat("Get the interaction of the ad and the display documents") 35 | # We add a '99' to avoid collisions; e.g. if we just paste them together, 36 | # the display_id '1' and ad_id '1000' would get the same value as display_id '1100' and ad_id '1' 37 | craw[,document_ad_id:=paste0(document_id, "99", ad_id, sep="")] 38 | craw[,`:=`(document_id=NULL, ad_id=NULL)] 39 | 40 | cat(" We do not want to see rare events, therefore we exclude any documents seen less than 100 times.") 41 | craw[,ct:=.N, by="document_ad_id"] 42 | craw = craw[ct>99] 43 | craw[,ct:=NULL] 44 | gc() 45 | 46 | # Because of how we created the interaction above, we have some very large numbers in the interaction.
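# As a hedged, worked example with invented ids: document_id 123456 and ad_id 7890 are
# pasted into the string "123456997890"; the modulo hash applied just below then folds
# such values into the range [0, 2^22):
#   as.numeric("123456997890") %% 2^22   # 1853954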
47 | # to reduce space, lets hash these 48 | craw[,document_ad_id:=as.numeric(document_ad_id)%%D] 49 | 50 | cat("Now we aggregate each interaction per display") 51 | setkeyv(craw, "display_id") 52 | craw = craw[,(paste0(document_ad_id, collapse = " ")), by=display_id] 53 | setnames(craw, c("display_id", "document_ad_id")) 54 | setkeyv(craw, "display_id") 55 | 56 | cat("Write out the file") 57 | write.csv(craw, gzfile("cache/doc_ad_others.csv.gz"), row.names = F, quote = F) 58 | gc() 59 | 60 | -------------------------------------------------------------------------------- /train-ctr-nn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | import argparse 5 | 6 | from util.meta import row_counts 7 | from util import train_model 8 | 9 | from keras.models import Sequential 10 | from keras.layers import Dense, Dropout 11 | from keras.layers.advanced_activations import PReLU 12 | from keras.layers.normalization import BatchNormalization 13 | from keras import regularizers 14 | 15 | 16 | input_size = 19 17 | 18 | 19 | def regularizer(params): 20 | if 'l1' in params and 'l2' in params: 21 | return regularizers.l1l2(params['l1'], params['l2']) 22 | elif 'l1' in params: 23 | return regularizers.l1(params['l1']) 24 | elif 'l2' in params: 25 | return regularizers.l2(params['l2']) 26 | else: 27 | return None 28 | 29 | 30 | def nn_mlp_2(input_shape, **params): 31 | model = Sequential() 32 | 33 | for i, layer_size in enumerate(params['layers']): 34 | reg = regularizer(params) 35 | 36 | if i == 0: 37 | model.add(Dense(layer_size, init='he_normal', W_regularizer=reg, input_shape=input_shape)) 38 | else: 39 | model.add(Dense(layer_size, init='he_normal', W_regularizer=reg)) 40 | 41 | model.add(PReLU()) 42 | 43 | if params.get('batch_norm', False): 44 | model.add(BatchNormalization()) 45 | 46 | if 'dropouts' in params: 47 | model.add(Dropout(params['dropouts'][i])) 48 | 49 | model.add(Dense(1, init='glorot_normal', activation='sigmoid')) 50 | 51 | return model 52 | 53 | 54 | def read_data(name): 55 | return np.memmap("cache/%s_np_ctr1.npy" % name, dtype='float32', mode='r', shape=(row_counts[name], input_size)) 56 | 57 | 58 | def fit_predict(profile, split, split_name): 59 | train_X = read_data(split_name + '_train') 60 | train_y = pd.read_csv(split[0], usecols=['clicked'])['clicked'].values 61 | 62 | if split_name == 'full': 63 | eval_X = None 64 | eval_y = None 65 | else: 66 | eval_X = read_data(split_name + '_test') 67 | eval_y = pd.read_csv(split[1], usecols=['clicked'])['clicked'].values 68 | 69 | model = nn_mlp_2((input_size,), layers=[20]) 70 | model.compile(optimizer='adadelta', loss='binary_crossentropy') 71 | model.fit( 72 | x=train_X, y=train_y, 73 | batch_size=256, nb_epoch=1, 74 | validation_data=(None if eval_X is None else (eval_X, eval_y)), 75 | verbose=1, callbacks=[]) 76 | 77 | pred_X = read_data(split_name + '_test') if eval_X is None else eval_X 78 | pred = model.predict(pred_X, batch_size=256) 79 | 80 | pred_df = pd.read_csv(split[1]) 81 | pred_df['pred'] = pred 82 | 83 | return pred_df 84 | 85 | 86 | profile_name = 'v1' 87 | profile = {} 88 | 89 | train_model(fit_predict, 'ctr-nn-%s' % profile_name, profile) 90 | -------------------------------------------------------------------------------- /prepare-rivals.cpp: -------------------------------------------------------------------------------- 1 | #include "util/io.h" 2 | 3 | std::vector> filesets { 4 | { "cache/clicks_cv1_train.csv.gz", "cv1_train" }, 
5 | { "cache/clicks_cv1_test.csv.gz", "cv1_test" }, 6 | { "cache/clicks_cv2_train.csv.gz", "cv2_train" }, 7 | { "cache/clicks_cv2_test.csv.gz", "cv2_test" }, 8 | { "../input/clicks_train.csv.gz", "full_train" }, 9 | { "../input/clicks_test.csv.gz", "full_test" }, 10 | }; 11 | 12 | 13 | void write_rivals(std::ostream & out, const std::vector rivals) { 14 | for (uint i = 0; i < rivals.size(); ++ i) { 15 | out << rivals.size() << ","; 16 | 17 | uint w = 0; 18 | for (uint j = 0; j < rivals.size(); ++ j) { 19 | if (i != j) { 20 | if (w > 0) 21 | out << " "; 22 | 23 | out << rivals[j]; 24 | 25 | w ++; 26 | } 27 | } 28 | 29 | out << "," << i << std::endl; 30 | } 31 | } 32 | 33 | 34 | int main() { 35 | using namespace std; 36 | 37 | cout << "Generating rivals features..." << endl; 38 | for (auto it = filesets.begin(); it != filesets.end(); ++ it) { 39 | string out_file_name = string("cache/rivals_") + it->second + string(".csv.gz"); 40 | 41 | cout << " Generating " << out_file_name << "... "; 42 | cout.flush(); 43 | 44 | clock_t begin = clock(); 45 | 46 | compressed_csv_file file(it->first); 47 | ofstream outfile(out_file_name, std::ios_base::out | std::ios_base::binary); 48 | 49 | streamsize buffer_size = 1024*1024; 50 | boost::iostreams::filtering_streambuf buf; 51 | buf.push(boost::iostreams::gzip_compressor(), buffer_size, buffer_size); 52 | buf.push(outfile, buffer_size, buffer_size); 53 | 54 | std::ostream out(&buf); 55 | 56 | out << "rival_count,rivals,rank" << endl; 57 | 58 | uint cur_group_id = 0; 59 | std::vector cur_rivals; 60 | 61 | for (int i = 0;; ++i) { 62 | auto row = file.getrow(); 63 | 64 | if (row.empty()) 65 | break; 66 | 67 | uint group_id = stoi(row[0]); 68 | uint ad_id = stoi(row[1]); 69 | 70 | if (cur_group_id != group_id) { 71 | write_rivals(out, cur_rivals); 72 | 73 | cur_group_id = group_id; 74 | cur_rivals.clear(); 75 | } 76 | 77 | cur_rivals.push_back(ad_id); 78 | 79 | if (i > 0 && i % 5000000 == 0) { 80 | cout << (i / 1000000) << "M... "; 81 | cout.flush(); 82 | } 83 | } 84 | 85 | write_rivals(out, cur_rivals); 86 | 87 | clock_t end = clock(); 88 | double elapsed = double(end - begin) / CLOCKS_PER_SEC; 89 | 90 | cout << "done in " << elapsed << " seconds" << endl; 91 | } 92 | 93 | cout << "Done." << endl; 94 | } 95 | -------------------------------------------------------------------------------- /train-vw.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | import os 5 | import argparse 6 | 7 | from scipy.special import expit 8 | 9 | from util.meta import full_split, val_split 10 | from util import gen_prediction_name, gen_submission, score_prediction, print_and_exec 11 | 12 | 13 | def fit_predict(profile, split, split_name): 14 | train_file = 'cache/%s_train_vw.txt' % split_name 15 | pred_file = 'cache/%s_test_vw.txt' % split_name 16 | 17 | print " Training..." 18 | 19 | if os.path.exists(train_file + '.cache'): 20 | os.remove(train_file + '.cache') 21 | 22 | interactions = ' '.join('-q %s' % i for i in profile['interactions'].split(' ')) 23 | 24 | print_and_exec("vw --cache -P 5000000 --loss_function logistic %s %s -f /tmp/vw.model %s " % (profile['options'], interactions, train_file)) 25 | 26 | print " Predicting..." 
27 | 28 | if os.path.exists(pred_file + '.cache'): 29 | os.remove(pred_file + '.cache') 30 | 31 | print_and_exec("vw -i /tmp/vw.model -p /tmp/vw.preds -P 5000000 -t %s" % pred_file) 32 | 33 | pred = pd.read_csv(split[1]) 34 | pred['pred'] = expit(np.loadtxt('/tmp/vw.preds')) 35 | 36 | return pred 37 | 38 | 39 | profiles = { 40 | 'p1': { 41 | 'interactions': 'aa al ld lp dp fe fa fd fl fp ff', 42 | 'options': "--passes 3 -b 22 --nn 10 --ignore u --ignore t", 43 | }, 44 | 45 | 'p2': { 46 | 'interactions': 'aa al ld lp dp ft fa fd fl fp ff tt ta tl td tp up', 47 | 'options': "--passes 4 -b 23 --nn 20", 48 | } 49 | } 50 | 51 | 52 | parser = argparse.ArgumentParser(description='Train VW model') 53 | parser.add_argument('profile', type=str, help='Train profile') 54 | parser.add_argument('--rewrite-cache', action='store_true', help='Drop cache files prior to train') 55 | 56 | args = parser.parse_args() 57 | profile = profiles[args.profile] 58 | 59 | 60 | if not os.path.exists('cache/val_train_vw.txt') or args.rewrite_cache: 61 | print "Generating data..." 62 | os.system("bin/export-vw-data") 63 | 64 | 65 | ## Validation 66 | 67 | print "Validation split..." 68 | 69 | pred = fit_predict(profile, val_split, 'val') 70 | 71 | print " Scoring..." 72 | 73 | present_score, future_score, score = score_prediction(pred) 74 | name = gen_prediction_name('vw-%s' % args.profile, score) 75 | 76 | print " Present score: %.5f" % present_score 77 | print " Future score: %.5f" % future_score 78 | print " Total score: %.5f" % score 79 | 80 | pred[['pred']].to_csv('preds/%s-val.csv.gz' % name, index=False, compression='gzip') 81 | 82 | del pred 83 | 84 | ## Prediction 85 | 86 | print "Full split..." 87 | 88 | pred = fit_predict(profile, full_split, 'full') 89 | pred[['pred']].to_csv('preds/%s-test.csv.gz' % name, index=False, compression='gzip') 90 | 91 | print " Generating submission..." 92 | subm = gen_submission(pred) 93 | subm.to_csv('subm/%s.csv.gz' % name, index=False, compression='gzip') 94 | 95 | del pred, subm 96 | 97 | print " File name: %s" % name 98 | print "Done." 
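To make the VW invocation above concrete, this is the command line the 'p1' profile assembles for the validation split (all options and paths are taken verbatim from the script; only the string substitution is expanded):

interactions = ' '.join('-q %s' % i for i in 'aa al ld lp dp fe fa fd fl fp ff'.split(' '))
# vw --cache -P 5000000 --loss_function logistic --passes 3 -b 22 --nn 10 --ignore u --ignore t
#    -q aa -q al -q ld -q lp -q dp -q fe -q fa -q fd -q fl -q fp -q ff -f /tmp/vw.model cache/val_train_vw.txt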
99 | -------------------------------------------------------------------------------- /ffm.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | typedef void * ffm_file; 8 | 9 | typedef uint32_t ffm_uint; 10 | typedef uint64_t ffm_ulong; 11 | typedef float ffm_float; 12 | typedef double ffm_double; 13 | 14 | 15 | const ffm_uint ffm_hash_bits = 20; 16 | const ffm_uint ffm_hash_mask = (1 << ffm_hash_bits) - 1; 17 | 18 | 19 | struct ffm_feature { 20 | ffm_uint index; 21 | ffm_float value; 22 | }; 23 | 24 | // Structure for fast access to data 25 | struct ffm_index { 26 | ffm_ulong size; // Number of examples; 27 | 28 | std::vector labels; // Target values of examples (size N) 29 | std::vector offsets; // Offsets of example data (size N +1) in number of features 30 | std::vector norms; // Squares of l2 norm of examples (size N) 31 | std::vector groups; // Group identifiers for MAP calculation 32 | }; 33 | 34 | // IO functions 35 | 36 | void ffm_write_index(const std::string & file_name, const ffm_index & index); 37 | 38 | ffm_index ffm_read_index(const std::string & file_name); 39 | 40 | std::vector ffm_read_batch(const std::string & file_name, ffm_ulong from, ffm_ulong to); 41 | void ffm_read_batch(const std::string & file_name, ffm_ulong from, ffm_ulong to, std::vector & features); 42 | 43 | 44 | // Writes data files in sequential order 45 | class ffm_stream_data_writer { 46 | ffm_file file; 47 | ffm_ulong offset; 48 | public: 49 | ffm_stream_data_writer(const std::string & file_name); 50 | ~ffm_stream_data_writer(); 51 | 52 | ffm_ulong write(const std::vector & features); 53 | }; 54 | 55 | // Feature builder helper 56 | 57 | 58 | class ffm_feature_vector_builder { 59 | std::vector vector; 60 | std::hash str_hash; 61 | 62 | uint32_t hash_offset, hash_size; 63 | public: 64 | ffm_feature_vector_builder(uint32_t hash_offset): hash_offset(hash_offset), hash_size((1 << ffm_hash_bits) - hash_offset) {} 65 | 66 | void raw(uint32_t field, uint32_t index, float value = 1.0) { 67 | ffm_feature f; 68 | f.index = (field << ffm_hash_bits) | (index & ffm_hash_mask); 69 | f.value = value; 70 | 71 | vector.push_back(f); 72 | } 73 | 74 | void hashed(uint32_t field, uint32_t category, float value = 1.0) { 75 | raw(field, h(category + field * 2654435761) % hash_size + hash_offset, value); 76 | } 77 | 78 | void hashed(uint32_t field, const std::string & category, float value = 1.0) { 79 | raw(field, h(str_hash(category) + field * 2654435761) % hash_size + hash_offset, value); 80 | } 81 | 82 | const std::vector & data() { 83 | return vector; 84 | } 85 | 86 | ffm_float norm() { 87 | ffm_float norm = 0.0; 88 | 89 | for (auto fi = vector.begin(); fi != vector.end(); ++ fi) 90 | norm += fi->value * fi->value; 91 | 92 | return norm; 93 | } 94 | private: 95 | uint32_t h(uint32_t a) { 96 | a = (a ^ 61) ^ (a >> 16); 97 | a = a + (a << 3); 98 | a = a ^ (a >> 4); 99 | a = a * 0x27d4eb2d; 100 | a = a ^ (a >> 15); 101 | return a; 102 | } 103 | }; 104 | -------------------------------------------------------------------------------- /feat_Uuid_OneHour_Range.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | This script will find the documents clicked for a user within one hour after an 4 | event has occurred. 
It works using the events file where we have the user and 5 | the timestamp and we go to page_views and check which clicks are within one 6 | hour of than timestamp for the user. 7 | This is a form of leak, and most likely not useful in production, as we use 8 | future information of the event to identify the ad picked. 9 | The script shuld be executed with pypy for faster execution. 10 | @author: darragh 11 | """ 12 | 13 | import csv, os, gzip 14 | 15 | input_dir = os.getenv('INPUT', '../input') 16 | 17 | uuid_uid = {} # Map of uuids to numeric ids 18 | uuidtstamps = {} # store the users along with the ties they were presented ads 19 | uuidhrafter = {} # store the folloing infomration from each event 20 | # key:{user, event timestamps} and value{all documents clicked within one hour} 21 | ctdoc = {} # the count of how often docs appear 22 | 23 | # Get timestamp user combination for each events 24 | for c, row in enumerate(csv.DictReader(gzip.open('cache/events.csv.gz'))): 25 | if row['uuid'] not in uuidtstamps: 26 | uuidtstamps[row['uuid']] = set() 27 | uuidtstamps[row['uuid']].add(row['timestamp']) 28 | uuidhrafter[row['uuid']+'_'+row['timestamp']] = 1 29 | uuid_uid[row['uuid']] = row['uid'] 30 | 31 | 32 | count = 0 33 | outfile = "cache/viewed_docs_one_hour_after.csv.gz" 34 | filename = input_dir + '/page_views.csv.gz' 35 | # filename = input_dir + '/page_views_sample.csv.gz' # comment this out locally 36 | 37 | # Count documents which occured less than 80 times and exclude them 38 | for c, row in enumerate(csv.DictReader(gzip.open(filename))): 39 | if row['uuid'] not in uuidtstamps: 40 | continue 41 | if row['document_id'] not in ctdoc: 42 | ctdoc[row['document_id']] = 1 43 | else: 44 | ctdoc[row['document_id']] += 1 45 | print('all docs : ' + str(len(ctdoc))) 46 | ctdoc = { key:value for key, value in ctdoc.items() if value > 80 } 47 | print('common docs > 80 : ' + str(len(ctdoc))) 48 | 49 | # for each page_views row where we get a uuid match, and the document occurs over 50 | # the required 80 count, we loop through the users click timestamps to find if 51 | # is within one hour of any of the event timestamps. 
52 | for c, row in enumerate(csv.DictReader(gzip.open(filename))): 53 | if c % 1000000 == 0: 54 | print (c, count) 55 | if row['document_id'] not in ctdoc: 56 | continue 57 | if row['uuid'] not in uuidtstamps: 58 | continue 59 | 60 | for time in uuidtstamps[row['uuid']]: 61 | diff = int(row['timestamp']) - int(time) 62 | if abs(diff) < 3600*1000: 63 | if diff > 0: 64 | if uuidhrafter[row['uuid'] + '_' + time] == 1: 65 | uuidhrafter[row['uuid'] + '_' + time] = set() 66 | uuidhrafter[row['uuid'] + '_' + time].add(row['document_id']) 67 | del diff 68 | 69 | # Delete output file if it already exists 70 | try: 71 | os.remove(outfile) 72 | except OSError: 73 | pass 74 | 75 | # Open the file to write to 76 | fo = gzip.open(outfile, 'w') 77 | fo.write('uuid,timestamp,doc_ids\n') 78 | for i in uuidhrafter: 79 | if uuidhrafter[i] != 1: 80 | tmp = list(uuidhrafter[i]) 81 | utime = i.split('_') 82 | fo.write('%s,%s,%s\n' % (uuid_uid[utime[0]], utime[1], ' '.join(tmp))) 83 | del tmp, utime 84 | fo.close() 85 | -------------------------------------------------------------------------------- /train-ffm-2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | import os 5 | import argparse 6 | 7 | from util import print_and_exec, train_model 8 | 9 | 10 | def fit_predict(profile, split, split_name): 11 | train_file = 'cache/%s_train_bin_%s' % (split_name, profile['dataset']) 12 | pred_file = 'cache/%s_test_bin_%s' % (split_name, profile['dataset']) 13 | 14 | n_bags = profile.get('bags', 1) 15 | 16 | pred = None 17 | for i in xrange(n_bags): 18 | opts = profile.get('options', '') 19 | opts += " --seed %d --epochs %d" % (profile.get('seed', np.random.randint(1e6)) + i * 3407, profile['epochs']) 20 | 21 | if split_name != "full": 22 | opts += " --val %s" % pred_file 23 | 24 | print_and_exec("bin/ffm %s --train %s --test %s --pred /tmp/ffm2.preds" % (opts, train_file, pred_file)) 25 | 26 | if pred is None: 27 | pred = np.loadtxt('/tmp/ffm2.preds') 28 | else: 29 | pred += np.loadtxt('/tmp/ffm2.preds') 30 | 31 | pred_df = pd.read_csv(split[1]) 32 | pred_df['pred'] = pred / n_bags 33 | 34 | return pred_df 35 | 36 | 37 | profiles = { 38 | 'ffm2-p1': { 39 | 'epochs': 4, 40 | 'seed': 2017, 41 | 'dataset': "p1", 42 | }, 43 | 44 | 'ffm2-p1r': { 45 | 'epochs': 4, 46 | 'seed': 123, 47 | 'options': "--restricted", 48 | 'dataset': "p1", 49 | }, 50 | 51 | 'ffm2-p1b': { 52 | 'epochs': 4, 53 | 'bags': 3, 54 | 'dataset': "p1", 55 | }, 56 | 57 | 'ffm2-f1': { 58 | 'epochs': 4, 59 | 'seed': 42, 60 | 'dataset': "f1", 61 | }, 62 | 63 | 'ffm2-f1b': { 64 | 'bags': 2, 65 | 'epochs': 5, 66 | 'dataset': "f1", 67 | }, 68 | 69 | 'ffm2-f1r': { 70 | 'epochs': 4, 71 | 'seed': 71, 72 | 'options': "--restricted", 73 | 'dataset': "f1", 74 | }, 75 | 76 | 'ffm2-f2': { 77 | 'epochs': 4, 78 | 'seed': 456, 79 | 'dataset': "f2", 80 | }, 81 | 82 | 'ffm2-f2r': { 83 | 'epochs': 4, 84 | 'seed': 879, 85 | 'options': "--restricted", 86 | 'dataset': "f2", 87 | }, 88 | 89 | 'ffm2-f3b': { 90 | 'bags': 2, 91 | 'epochs': 7, 92 | 'dataset': "f3", 93 | }, 94 | 95 | 'ffm2-f4b': { 96 | 'bags': 2, 97 | 'epochs': 7, 98 | 'dataset': "f4", 99 | }, 100 | 101 | 'ffm2-f5b': { 102 | 'bags': 2, 103 | 'epochs': 7, 104 | 'dataset': "f5", 105 | }, 106 | 107 | 'nn-f3b': { 108 | 'bags': 2, 109 | 'epochs': 3, 110 | 'options': "--model nn --lambda 0.00001", 111 | 'dataset': "f3", 112 | }, 113 | 114 | 'nn-f4': { 115 | 'epochs': 4, 116 | 'options': "--model nn --lambda 0.0001", 117 | 'dataset': 
"f4", 118 | }, 119 | 120 | 'ffm-nn-f4b': { 121 | 'bags': 2, 122 | 'epochs': 4, 123 | 'options': "--model ffm-nn --lambda 0.00001", 124 | 'dataset': "f4", 125 | }, 126 | 127 | 'nn-p1': { 128 | 'bags': 2, 129 | 'epochs': 5, 130 | 'options': "--model nn --dropout-log 2", 131 | 'dataset': "p1", 132 | }, 133 | 134 | 'ffm2-nn-f3b': { 135 | 'bags': 2, 136 | 'epochs': 3, 137 | 'options': "--model ffm-nn --dropout-log 1", 138 | 'dataset': "f3", 139 | }, 140 | } 141 | 142 | 143 | parser = argparse.ArgumentParser(description='Train FFM2 model') 144 | parser.add_argument('profile', type=str, help='Train profile') 145 | parser.add_argument('--rewrite-cache', action='store_true', help='Drop cache files prior to train') 146 | parser.add_argument('--continue-train', type=str, help='Continue training of interrupted model') 147 | 148 | args = parser.parse_args() 149 | 150 | profile_name = args.profile 151 | profile = profiles[profile_name] 152 | 153 | 154 | if not os.path.exists('cache/full_train_bin_%s.index' % profile['dataset']) or args.rewrite_cache: 155 | print "Generating data..." 156 | os.system("bin/export-bin-data-%s" % profile['dataset']) 157 | 158 | 159 | train_model(fit_predict, profile_name, profile, name=args.continue_train) 160 | -------------------------------------------------------------------------------- /ftrl-model.cpp: -------------------------------------------------------------------------------- 1 | #include "ftrl-model.h" 2 | #include "util/model-helpers.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | 10 | constexpr uint feature_buffer_size = 100000; 11 | 12 | 13 | uint max_b_field = 29; 14 | uint min_a_field = 10; 15 | 16 | 17 | class feature_buffer { 18 | public: 19 | uint size; 20 | 21 | uint * indices; 22 | float * values; 23 | float * weights; 24 | public: 25 | feature_buffer() { 26 | size = 0; 27 | indices = malloc_aligned(feature_buffer_size); 28 | values = malloc_aligned(feature_buffer_size); 29 | weights = malloc_aligned(feature_buffer_size); 30 | } 31 | 32 | ~feature_buffer() { 33 | free(indices); 34 | free(values); 35 | free(weights); 36 | } 37 | 38 | void clear() { 39 | size = 0; 40 | } 41 | 42 | void add(uint index, float value) { 43 | indices[size] = index; 44 | values[size] = value; 45 | size ++; 46 | } 47 | }; 48 | 49 | 50 | static thread_local feature_buffer local_feature_buffer; 51 | 52 | 53 | ftrl_model::ftrl_model(uint n_bits, float alpha, float beta, float l1, float l2) { 54 | this->alpha = alpha; 55 | this->beta = beta; 56 | this->l1 = l1; 57 | this->l2 = l2; 58 | this->n_bits = n_bits; 59 | 60 | n_weights = 1 << n_bits; 61 | mask = n_weights - 1; 62 | 63 | weights_z = malloc_aligned(n_weights); 64 | weights_n = malloc_aligned(n_weights); 65 | 66 | fill_with_zero(weights_z, n_weights); 67 | fill_with_zero(weights_n, n_weights); 68 | } 69 | 70 | ftrl_model::~ftrl_model() { 71 | free(weights_z); 72 | free(weights_n); 73 | } 74 | 75 | 76 | 77 | float ftrl_model::predict(const ffm_feature * start, const ffm_feature * end, ffm_float norm, uint64_t * dropout_mask, float dropout_mult) { 78 | auto & feature_buf = local_feature_buffer; 79 | 80 | feature_buf.clear(); 81 | feature_buf.add(0, 1.0); 82 | 83 | //int i = 0; 84 | for (const ffm_feature * fa = start; fa != end; ++ fa) { 85 | feature_buf.add(fa->index & mask, fa->value); 86 | /* 87 | if ((fa->index >> ffm_hash_bits) < min_a_field) 88 | continue; 89 | 90 | for (const ffm_feature * fb = start; fb != fa; ++ fb, ++ i) { 91 | if ((fb->index >> ffm_hash_bits) > max_b_field) 92 | break; 93 | 94 | if 
(test_mask_bit(dropout_mask, i) == 0) 95 | continue; 96 | 97 | feature_buf.add((fa->index + fb->index * 2654435761) & mask, fa->value * fb->value); 98 | }*/ 99 | } 100 | 101 | uint feature_count = feature_buf.size; 102 | uint * feature_indices = feature_buf.indices; 103 | float * feature_values = feature_buf.values; 104 | float * feature_weights = feature_buf.weights; 105 | 106 | float total = 0; 107 | 108 | for (uint i = 0; i < feature_count; ++ i) { 109 | uint feature_index = feature_indices[i]; 110 | 111 | float zi = weights_z[feature_index]; 112 | float zsi = sgn(zi); 113 | 114 | if (zsi * zi < l1) { 115 | feature_weights[i] = 0; 116 | } else { 117 | float wi = (zsi * l1 - zi) * feature_values[i] / ((beta + sqrt(weights_n[feature_index])) / alpha + l2); 118 | 119 | feature_weights[i] = wi; 120 | 121 | total += wi; 122 | } 123 | } 124 | 125 | return total; 126 | } 127 | 128 | 129 | void ftrl_model::update(const ffm_feature * start, const ffm_feature * end, ffm_float norm, ffm_float grad, uint64_t * dropout_mask, float dropout_mult) { 130 | auto & feature_buf = local_feature_buffer; 131 | 132 | uint feature_count = feature_buf.size; 133 | 134 | uint * fi = feature_buf.indices; 135 | float * fv = feature_buf.values; 136 | float * fw = feature_buf.weights; 137 | 138 | float * n = weights_n; 139 | 140 | __m256 ymm_alpha = _mm256_set1_ps(alpha); 141 | __m256 ymm_grad = _mm256_set1_ps(grad); 142 | 143 | for (uint i = 0; i < feature_count; i += 8) { 144 | __m256 ymm_n = _mm256_set_ps(n[fi[i + 7]], n[fi[i + 6]], n[fi[i + 5]], n[fi[i + 4]], n[fi[i + 3]], n[fi[i + 2]], n[fi[i + 1]], n[fi[i]]); 145 | 146 | __m256 ymm_fg = _mm256_load_ps(fv + i) * ymm_grad; 147 | __m256 ymm_fg_sqr = ymm_fg * ymm_fg; 148 | 149 | __m256 ymm_sigma = (_mm256_sqrt_ps(ymm_n + ymm_fg_sqr) - _mm256_sqrt_ps(ymm_n)) / ymm_alpha; 150 | 151 | __m256 ymm_za = ymm_fg - ymm_sigma * _mm256_load_ps(fw + i); 152 | 153 | float * za = (float *)(&ymm_za); 154 | float * gs = (float *)(&ymm_fg_sqr); 155 | 156 | uint fl = min(8u, feature_count - i); 157 | 158 | for (uint j = 0; j < fl; ++ j) { 159 | weights_z[fi[i + j]] += za[j]; 160 | weights_n[fi[i + j]] += gs[j]; 161 | } 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /export-vw-data.cpp: -------------------------------------------------------------------------------- 1 | #include "util/io.h" 2 | #include "util/data.h" 3 | #include "util/generation.h" 4 | 5 | std::vector> files = { 6 | { "cache/clicks_cv1_train.csv.gz", "cv1_train" }, 7 | { "cache/clicks_cv1_test.csv.gz", "cv1_test" }, 8 | { "cache/clicks_cv2_train.csv.gz", "cv2_train" }, 9 | { "cache/clicks_cv2_test.csv.gz", "cv2_test" }, 10 | { "../input/clicks_train.csv.gz", "full_train" }, 11 | { "../input/clicks_test.csv.gz", "full_test" }, 12 | }; 13 | 14 | std::vector features = { 15 | "leak", "similarity", 16 | "viewed_docs", 17 | "viewed_ads", "viewed_ad_srcs", 18 | }; 19 | 20 | class writer { 21 | std::ofstream out; 22 | public: 23 | writer(const std::string & file_name) : out(file_name) {} 24 | 25 | void write(const reference_data & data, const std::vector> & rows); 26 | void finish() {} 27 | }; 28 | 29 | 30 | void writer::write(const reference_data & data, const std::vector> & rows) { 31 | int event_id = stoi(rows[0][0]); 32 | int ad_id = stoi(rows[0][1]); 33 | int label = rows[0].size() == 3 ? 
stoi(rows[0][2]) : -1; 34 | 35 | int leak_viewed = stoi(rows[1][0]); 36 | int leak_not_viewed = stoi(rows[1][1]); 37 | 38 | // 39 | 40 | auto ad = data.ads[ad_id]; 41 | auto event = data.events[event_id]; 42 | 43 | auto ad_doc = data.documents.at(ad.document_id); 44 | auto ad_doc_categories = data.document_categories.equal_range(ad.document_id); 45 | auto ad_doc_topics = data.document_topics.equal_range(ad.document_id); 46 | auto ad_doc_entities = data.document_entities.equal_range(ad.document_id); 47 | 48 | auto ev_doc = data.documents.at(event.document_id); 49 | auto ev_doc_categories = data.document_categories.equal_range(event.document_id); 50 | auto ev_doc_topics = data.document_topics.equal_range(event.document_id); 51 | auto ev_doc_entities = data.document_entities.equal_range(event.document_id); 52 | 53 | std::stringstream line; 54 | 55 | if (label >= 0) 56 | line << (label * 2 - 1) << " "; 57 | 58 | line << "|a ad_" << ad_id << " ac_" << ad.campaign_id << " aa_" << ad.advertiser_id; 59 | line << "|l c_" << event.country << " s_" << event.state << " p_" << event.platform; 60 | line << "|t h_" << event.hour << " w_" << event.weekday; 61 | line << "|u u_" << event.uid; 62 | 63 | // Document info 64 | line << "|d ed_" << event.document_id << " eds_" << ev_doc.source_id << " edp_" << ev_doc.publisher_id; 65 | 66 | for (auto it = ev_doc_categories.first; it != ev_doc_categories.second; ++ it) 67 | line << " edc_" << it->second.first << ":" << it->second.second; 68 | 69 | for (auto it = ev_doc_topics.first; it != ev_doc_topics.second; ++ it) 70 | line << " edt_" << it->second.first << ":" << it->second.second; 71 | 72 | for (auto it = ev_doc_entities.first; it != ev_doc_entities.second; ++ it) 73 | line << " ede_" << it->second.first << ":" << it->second.second; 74 | 75 | // Promoted document info 76 | line << "|p ad_" << ad.document_id << " ads_" << ad_doc.source_id << " adp_" << ad_doc.publisher_id; 77 | 78 | for (auto it = ad_doc_categories.first; it != ad_doc_categories.second; ++ it) 79 | line << " adc_" << it->second.first << ":" << it->second.second; 80 | 81 | for (auto it = ad_doc_topics.first; it != ad_doc_topics.second; ++ it) 82 | line << " adt_" << it->second.first << ":" << it->second.second; 83 | 84 | for (auto it = ad_doc_entities.first; it != ad_doc_entities.second; ++ it) 85 | line << " ade_" << it->second.first << ":" << it->second.second; 86 | 87 | // Manual features 88 | line << "|f"; 89 | 90 | if (ad_doc.publisher_id == ev_doc.publisher_id) 91 | line << " sp"; // Same publisher 92 | 93 | if (ad_doc.source_id == ev_doc.source_id) 94 | line << " ss"; // Same source 95 | 96 | // Leak features 97 | if (leak_viewed > 0) 98 | line << " v"; // Same source 99 | 100 | if (leak_not_viewed > 0) 101 | line << " nv"; // Same source 102 | 103 | // Similarity features 104 | for (uint i = 0; i < rows[2].size(); ++ i) 105 | line << " s_" << i << ':' << rows[2][i]; 106 | 107 | // Ad views features 108 | for (uint ri = 3; ri <= 5; ++ ri) 109 | for (uint ci = 0; ci < rows[ri].size(); ++ ci) 110 | if (stoi(rows[ri][ci]) > 0) 111 | line << " av_" << ri << "_" << ci; 112 | 113 | line << std::endl; 114 | 115 | out << line.str(); 116 | } 117 | 118 | int main() { 119 | using namespace std; 120 | 121 | cout << "Loading reference data..." << endl; 122 | auto data = load_reference_data(); 123 | 124 | cout << "Generating files..." << endl; 125 | generate_files(data, build_filesets(files, features, "_vw.txt")); 126 | 127 | cout << "Done." 
<< endl; 128 | } 129 | -------------------------------------------------------------------------------- /ffm-io.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include "ffm.h" 6 | 7 | 8 | // index file reading / writing 9 | 10 | 11 | void ffm_write_index(const std::string & file_name, const ffm_index & index) { 12 | using namespace std; 13 | 14 | if (index.labels.size() != index.size) 15 | throw runtime_error("Invalid index labels size"); 16 | 17 | if (index.offsets.size() != index.size + 1) 18 | throw runtime_error("Invalid index offsets size"); 19 | 20 | if (index.norms.size() != index.size) 21 | throw runtime_error("Invalid index norms size"); 22 | 23 | if (index.groups.size() != index.size) 24 | throw runtime_error("Invalid index groups size"); 25 | 26 | FILE * file = fopen(file_name.c_str(), "wb"); 27 | 28 | if(file == nullptr) 29 | throw runtime_error(string("Can't open index file ") + file_name); 30 | 31 | if (fwrite(&index.size, sizeof(ffm_ulong), 1, file) != 1) 32 | throw runtime_error("Error writing example count"); 33 | 34 | if (fwrite(index.labels.data(), sizeof(ffm_float), index.labels.size(), file) != index.labels.size()) 35 | throw runtime_error("Error writing labels"); 36 | 37 | if (fwrite(index.offsets.data(), sizeof(ffm_ulong), index.offsets.size(), file) != index.offsets.size()) 38 | throw runtime_error("Error writing offsets"); 39 | 40 | if (fwrite(index.norms.data(), sizeof(ffm_float), index.norms.size(), file) != index.norms.size()) 41 | throw runtime_error("Error writing norms"); 42 | 43 | if (fwrite(index.groups.data(), sizeof(ffm_uint), index.groups.size(), file) != index.groups.size()) 44 | throw runtime_error("Error writing groups"); 45 | 46 | fclose(file); 47 | } 48 | 49 | ffm_index ffm_read_index(const std::string & file_name) { 50 | using namespace std; 51 | 52 | ffm_index index; 53 | FILE * file = fopen(file_name.c_str(), "rb"); 54 | 55 | if(file == nullptr) 56 | throw runtime_error(string("Can't open index file ") + file_name); 57 | 58 | if (fread(&index.size, sizeof(ffm_ulong), 1, file) != 1) 59 | throw runtime_error("Error reading example count"); 60 | 61 | // Reserve space for y and offsets 62 | index.labels.resize(index.size, 0); 63 | index.offsets.resize(index.size + 1, 0); 64 | index.norms.resize(index.size, 0); 65 | index.groups.resize(index.size, 0); 66 | 67 | if (fread(index.labels.data(), sizeof(ffm_float), index.labels.size(), file) != index.labels.size()) 68 | throw runtime_error("Error reading labels"); 69 | 70 | if (fread(index.offsets.data(), sizeof(ffm_ulong), index.offsets.size(), file) != index.offsets.size()) 71 | throw runtime_error("Error reading offsets"); 72 | 73 | if (fread(index.norms.data(), sizeof(ffm_float), index.norms.size(), file) != index.norms.size()) 74 | throw runtime_error("Error reading norms"); 75 | 76 | if (fread(index.groups.data(), sizeof(ffm_uint), index.groups.size(), file) != index.groups.size()) 77 | throw runtime_error("Error reading groups"); 78 | 79 | fclose(file); 80 | 81 | return index; 82 | } 83 | 84 | // batch data reading 85 | 86 | std::vector ffm_read_batch(const std::string & file_name, ffm_ulong from, ffm_ulong to) { 87 | std::vector features(to - from); 88 | ffm_read_batch(file_name, from, to, features); 89 | return features; 90 | }; 91 | 92 | void ffm_read_batch(const std::string & file_name, ffm_ulong from, ffm_ulong to, std::vector & features) { 93 | using namespace std; 94 | 95 | if (to < from) 96 | throw 
runtime_error("Wrong range"); 97 | 98 | features.resize(to - from); 99 | 100 | // Empty range, no need to read 101 | if (to == from) 102 | return; 103 | 104 | FILE * file = fopen(file_name.c_str(), "rb"); 105 | 106 | if (file == nullptr) 107 | throw runtime_error(string("Can't open data file ") + file_name); 108 | 109 | if (fseek((FILE *)file, from * sizeof(ffm_feature), SEEK_SET) != 0) 110 | throw new runtime_error("Can't set file pos"); 111 | 112 | if (fread(features.data(), sizeof(ffm_feature), features.size(), (FILE *)file) != features.size()) 113 | throw new runtime_error("Can't read data"); 114 | 115 | fclose(file); 116 | } 117 | 118 | // stream data writing 119 | 120 | ffm_stream_data_writer::ffm_stream_data_writer(const std::string & file_name): offset(0) { 121 | using namespace std; 122 | 123 | file = fopen(file_name.c_str(), "wb"); 124 | 125 | if(file == nullptr) 126 | throw runtime_error(string("Can't open data file ") + file_name); 127 | } 128 | 129 | ffm_stream_data_writer::~ffm_stream_data_writer() { 130 | fclose((FILE *)file); 131 | } 132 | 133 | ffm_ulong ffm_stream_data_writer::write(const std::vector & features) { 134 | if (fwrite(features.data(), sizeof(ffm_feature), features.size(), (FILE *)file) != features.size()) 135 | throw std::runtime_error("Error writing example count"); 136 | 137 | offset += features.size(); 138 | 139 | return offset; 140 | } 141 | -------------------------------------------------------------------------------- /export-np-data-ctr1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from itertools import izip 5 | from tqdm import tqdm 6 | 7 | from scipy.special import logit 8 | 9 | from util.meta import row_counts 10 | 11 | 12 | chunk_size = 100000 13 | ctr_smooth = 10 14 | 15 | 16 | def ctr_logit(views, clicks): 17 | return logit((clicks + 0.194 * ctr_smooth) / (views + ctr_smooth)) 18 | 19 | 20 | def export_data(clicks_file_name, out_name): 21 | n_rows = row_counts[out_name] 22 | res = np.memmap("cache/%s_np_ctr1.npy" % out_name, dtype='float32', mode='w+', shape=(n_rows, 19)) 23 | 24 | click_stream = pd.read_csv(clicks_file_name, dtype=np.uint32, chunksize=chunk_size) 25 | leak_stream = pd.read_csv("cache/leak_%s.csv.gz" % out_name, dtype=np.uint16, chunksize=chunk_size) 26 | rival_stream = pd.read_csv("cache/rivals_%s.csv.gz" % out_name, usecols=['rival_count'], dtype=np.uint16, chunksize=chunk_size) 27 | 28 | uid_viewed_ads_stream = pd.read_csv("cache/uid_viewed_ads_%s.csv.gz" % out_name, dtype=np.uint16, chunksize=chunk_size) 29 | uid_viewed_ad_srcs_stream = pd.read_csv("cache/uid_viewed_ad_srcs_%s.csv.gz" % out_name, dtype=np.uint16, chunksize=chunk_size) 30 | uid_viewed_ad_cats_stream = pd.read_csv("cache/uid_viewed_ad_cats_%s.csv.gz" % out_name, dtype=np.float32, chunksize=chunk_size) 31 | uid_viewed_ad_tops_stream = pd.read_csv("cache/uid_viewed_ad_tops_%s.csv.gz" % out_name, dtype=np.float32, chunksize=chunk_size) 32 | 33 | g2_viewed_ads_stream = pd.read_csv("cache/g2_viewed_ads_%s.csv.gz" % out_name, dtype=np.uint16, chunksize=chunk_size) 34 | g2_viewed_ad_srcs_stream = pd.read_csv("cache/g2_viewed_ad_srcs_%s.csv.gz" % out_name, dtype=np.uint16, chunksize=chunk_size) 35 | g2_viewed_ad_cats_stream = pd.read_csv("cache/g2_viewed_ad_cats_%s.csv.gz" % out_name, dtype=np.float32, chunksize=chunk_size) 36 | #g2_viewed_ad_tops_stream = pd.read_csv("cache/g2_viewed_ad_tops_%s.csv.gz" % out_name, dtype=np.float32, chunksize=chunk_size) 37 | 38 | 
zipped_stream = izip( 39 | click_stream, leak_stream, rival_stream, 40 | uid_viewed_ads_stream, uid_viewed_ad_srcs_stream, uid_viewed_ad_cats_stream, uid_viewed_ad_tops_stream, 41 | g2_viewed_ads_stream, g2_viewed_ad_srcs_stream, g2_viewed_ad_cats_stream 42 | ) 43 | 44 | chunk_start = 0 45 | with tqdm(total=n_rows, desc=' Exporting %s' % clicks_file_name, unit='rows') as pbar: 46 | for clk, leak, riv, uv_ad, uv_ad_src, uv_ad_cat, uv_ad_top, g2_ad, g2_ad_src, g2_ad_cat in zipped_stream: 47 | chunk_end = chunk_start + clk.shape[0] 48 | 49 | res[chunk_start:chunk_end, 0] = (leak['viewed'] > 0).astype(np.float32) 50 | res[chunk_start:chunk_end, 1] = (leak['not_viewed'] > 0).astype(np.float32) 51 | 52 | res[chunk_start:chunk_end, 2] = ctr_logit(uv_ad['ad_doc_past_views'], uv_ad['ad_doc_past_clicks']) 53 | res[chunk_start:chunk_end, 3] = ctr_logit(uv_ad['ad_doc_future_views'], uv_ad['ad_doc_future_clicks']) 54 | 55 | res[chunk_start:chunk_end, 4] = ctr_logit(uv_ad_src['src_past_views'], uv_ad_src['src_past_clicks']) 56 | res[chunk_start:chunk_end, 5] = ctr_logit(uv_ad_src['src_future_views'], uv_ad_src['src_future_clicks']) 57 | 58 | res[chunk_start:chunk_end, 6] = ctr_logit(uv_ad_top['top_past_views'], uv_ad_top['top_past_clicks']) 59 | res[chunk_start:chunk_end, 7] = ctr_logit(uv_ad_top['top_future_views'], uv_ad_top['top_future_clicks']) 60 | 61 | res[chunk_start:chunk_end, 8] = ctr_logit(uv_ad_cat['cat_past_views'], uv_ad_cat['cat_past_clicks']) 62 | res[chunk_start:chunk_end, 9] = ctr_logit(uv_ad_cat['cat_future_views'], uv_ad_cat['cat_future_clicks']) 63 | 64 | res[chunk_start:chunk_end, 10] = ctr_logit(g2_ad['ad_doc_past_views'], g2_ad['ad_doc_past_clicks']) 65 | res[chunk_start:chunk_end, 11] = ctr_logit(g2_ad['ad_doc_future_views'], g2_ad['ad_doc_future_clicks']) 66 | 67 | res[chunk_start:chunk_end, 12] = ctr_logit(g2_ad_src['src_past_views'], g2_ad_src['src_past_clicks']) 68 | res[chunk_start:chunk_end, 13] = ctr_logit(g2_ad_src['src_future_views'], g2_ad_src['src_future_clicks']) 69 | 70 | res[chunk_start:chunk_end, 14] = 0#ctr_logit(g2_ad_top['top_past_views'], g2_ad_top['top_past_clicks']) 71 | res[chunk_start:chunk_end, 15] = 0#ctr_logit(g2_ad_top['top_future_views'], g2_ad_top['top_future_clicks']) 72 | 73 | res[chunk_start:chunk_end, 16] = ctr_logit(g2_ad_cat['cat_past_views'], g2_ad_cat['cat_past_clicks']) 74 | res[chunk_start:chunk_end, 17] = ctr_logit(g2_ad_cat['cat_future_views'], g2_ad_cat['cat_future_clicks']) 75 | 76 | res[chunk_start:chunk_end, 18] = logit(1.0 / riv['rival_count']) 77 | 78 | chunk_start += clk.shape[0] 79 | pbar.update(clk.shape[0]) 80 | 81 | 82 | export_data('cache/clicks_cv2_train.csv.gz', 'cv2_train') 83 | export_data('cache/clicks_cv2_test.csv.gz', 'cv2_test') 84 | 85 | export_data('cache/clicks_cv1_train.csv.gz', 'cv1_train') 86 | export_data('cache/clicks_cv1_test.csv.gz', 'cv1_test') 87 | 88 | export_data('../input/clicks_train.csv.gz', 'full_train') 89 | export_data('../input/clicks_test.csv.gz', 'full_test') 90 | 91 | print "Done." 
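# Note on ctr_logit above: it smooths each (views, clicks) pair toward a prior click rate of 0.194 using ctr_smooth pseudo-observations and then applies the logit, so the memmap holds log-odds-scale CTR features; for example, with ctr_smooth = 10, 30 clicks out of 90 views gives logit((30 + 1.94) / (90 + 10)) = logit(0.3194), roughly -0.76. The last column is logit(1 / rival_count).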
92 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import datetime 5 | import os 6 | 7 | from numba import jit 8 | 9 | from .meta import cv1_split_time, full_split, cv1_split, cv2_split 10 | 11 | 12 | def gen_prediction_name(model_name, score): 13 | return "%s-%s-%.5f" % (datetime.datetime.now().strftime('%Y%m%d-%H%M'), model_name, score) 14 | 15 | 16 | def gen_submission(pred): 17 | pred = pred.sort_values(['display_id', 'pred'], ascending=[True, False])[['display_id', 'ad_id']] 18 | 19 | res_idx = [] 20 | res_ads = [] 21 | 22 | for t in pred.itertuples(): 23 | if len(res_idx) > 0 and res_idx[-1] == t.display_id: 24 | if len(res_ads[-1]) < 12: 25 | res_ads[-1].append(str(t.ad_id)) 26 | else: 27 | res_idx.append(t.display_id) 28 | res_ads.append([str(t.ad_id)]) 29 | 30 | return pd.DataFrame({'display_id': res_idx, 'ad_id': [' '.join(a) for a in res_ads]}) 31 | 32 | 33 | def score_prediction(pred): 34 | pred = pred.sort_values(['display_id', 'pred'], ascending=[True, False])[['display_id', 'clicked']] 35 | pred = pd.merge(pred, pd.read_csv("cache/events.csv.gz", dtype=np.int32, index_col=0, usecols=[0, 3]), left_on='display_id', right_index=True) 36 | 37 | cur_idx = None 38 | cur_rank = None 39 | 40 | future_score_sum = 0.0 41 | future_score_cnt = 0 42 | 43 | present_score_sum = 0.0 44 | present_score_cnt = 0 45 | 46 | for t in pred.itertuples(): 47 | if cur_idx == t.display_id: 48 | cur_rank += 1 49 | else: 50 | if t.timestamp >= cv1_split_time: 51 | future_score_cnt += 1 52 | else: 53 | present_score_cnt += 1 54 | 55 | cur_idx = t.display_id 56 | cur_rank = 1 57 | 58 | if t.clicked == 1: 59 | if t.timestamp >= cv1_split_time: 60 | future_score_sum += 1.0 / cur_rank 61 | else: 62 | present_score_sum += 1.0 / cur_rank 63 | 64 | present_score = present_score_sum / present_score_cnt 65 | future_score = future_score_sum / future_score_cnt 66 | total_score = (present_score_sum + future_score_sum) / (present_score_cnt + future_score_cnt) 67 | 68 | return present_score, future_score, total_score 69 | 70 | 71 | def print_and_exec(cmd): 72 | print cmd 73 | os.system(cmd) 74 | 75 | 76 | @jit 77 | def score_sorted(y_true, y_pred, y_group): 78 | cur_group = -1 79 | 80 | start_idx = -1 81 | true_idx = -1 82 | 83 | score_sum = 0.0 84 | score_cnt = 0 85 | 86 | for i in xrange(len(y_true)): 87 | if y_group[i] > cur_group: 88 | if cur_group >= 0: 89 | rank = 0 90 | 91 | for j in xrange(start_idx, i): 92 | if y_pred[j] >= y_pred[true_idx]: 93 | rank += 1 94 | 95 | if rank > 0 and rank <= 12: 96 | score_sum += 1.0 / rank 97 | 98 | score_cnt += 1 99 | 100 | start_idx = i 101 | true_idx = -1 102 | cur_group = y_group[i] 103 | 104 | if y_true[i]: 105 | true_idx = i 106 | 107 | return score_sum / score_cnt 108 | 109 | 110 | def train_model(fit_predict, model_name, profile, name=None): 111 | 112 | ## Validation on CV2 113 | if name is not None and os.path.exists('preds/%s-cv2.csv.gz' % name): 114 | print "CV2 results already exist, skipping..." 115 | else: 116 | print "CV2 split..." 117 | 118 | pred = fit_predict(profile, cv2_split, 'cv2') 119 | 120 | print " Scoring..." 
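# score_prediction returns (present, future, total): the mean reciprocal rank of the clicked ad per display, split at cv1_split_time into events before ("present") and at or after ("future") the validation cut.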
121 | 122 | cv2_present_score, cv2_future_score, cv2_score = score_prediction(pred) 123 | 124 | if name is None: 125 | name = gen_prediction_name(model_name, cv2_score) 126 | 127 | print " Present score: %.5f" % cv2_present_score 128 | print " Future score: %.5f" % cv2_future_score 129 | print " Total score: %.5f" % cv2_score 130 | 131 | pred[['pred']].to_csv('preds/%s-cv2.csv.gz' % name, index=False, compression='gzip') 132 | 133 | del pred 134 | 135 | ## Validation on CV1 136 | if os.path.exists('preds/%s-cv1.csv.gz' % name): 137 | print "CV1 results already exist, skipping..." 138 | else: 139 | print "CV1 split..." 140 | 141 | pred = fit_predict(profile, cv1_split, 'cv1') 142 | 143 | print " Scoring..." 144 | 145 | cv1_present_score, cv1_future_score, cv1_score = score_prediction(pred) 146 | 147 | print " Present score: %.5f" % cv1_present_score 148 | print " Future score: %.5f" % cv1_future_score 149 | print " Total score: %.5f" % cv1_score 150 | 151 | pred[['pred']].to_csv('preds/%s-cv1.csv.gz' % name, index=False, compression='gzip') 152 | 153 | del pred 154 | 155 | ## Prediction 156 | if os.path.exists('preds/%s-test.csv.gz' % name): 157 | print "Full results already exist, skipping..." 158 | else: 159 | print "Full split..." 160 | 161 | pred = fit_predict(profile, full_split, 'full') 162 | pred[['pred']].to_csv('preds/%s-test.csv.gz' % name, index=False, compression='gzip') 163 | 164 | print " Generating submission..." 165 | subm = gen_submission(pred) 166 | subm.to_csv('subm/%s.csv.gz' % name, index=False, compression='gzip') 167 | 168 | del pred, subm 169 | 170 | print " File name: %s" % name 171 | 172 | print "Done." 173 | -------------------------------------------------------------------------------- /export-ffm-data.cpp: -------------------------------------------------------------------------------- 1 | #include "util/io.h" 2 | #include "util/data.h" 3 | #include "util/generation.h" 4 | 5 | #include 6 | #include 7 | 8 | std::vector, std::string>> filesets = { 9 | { { "cache/clicks_cv2_train.csv.gz", "cache/leak_cv2_train.csv.gz" }, "cache/cv2_train_ffm.txt" }, 10 | { { "cache/clicks_cv2_test.csv.gz", "cache/leak_cv2_test.csv.gz" }, "cache/cv2_test_ffm.txt" }, 11 | { { "cache/clicks_cv1_train.csv.gz", "cache/leak_val_train.csv.gz" }, "cache/cv1_train_ffm.txt" }, 12 | { { "cache/clicks_cv1_test.csv.gz", "cache/leak_val_test.csv.gz" }, "cache/cv1_test_ffm.txt" }, 13 | { { "../input/clicks_train.csv.gz", "cache/leak_full_train.csv.gz" }, "cache/full_train_ffm.txt" }, 14 | { { "../input/clicks_test.csv.gz", "cache/leak_full_test.csv.gz" }, "cache/full_test_ffm.txt" }, 15 | }; 16 | 17 | std::hash str_hash; 18 | 19 | uint32_t hash_offset = 100; 20 | uint32_t hash_base = 1 << 19; 21 | 22 | uint32_t h(uint32_t a, uint32_t f) { 23 | a = a + f * 2654435761; 24 | a = (a ^ 61) ^ (a >> 16); 25 | a = a + (a << 3); 26 | a = a ^ (a >> 4); 27 | a = a * 0x27d4eb2d; 28 | a = a ^ (a >> 15); 29 | return (a % hash_base) + hash_offset; 30 | } 31 | 32 | uint32_t h(const std::string & a, uint32_t f) { 33 | return (((str_hash(a) + f) * 2654435761) % hash_base) + hash_offset; 34 | } 35 | 36 | class line_builder { 37 | public: 38 | std::stringstream stream; 39 | 40 | line_builder(int label) { 41 | stream << label; 42 | } 43 | 44 | void feature(uint32_t field, uint32_t category) { 45 | stream << ' ' << field << ':' << h(category, field) << ":1"; 46 | } 47 | 48 | void feature(uint32_t field, uint32_t category, float value) { 49 | stream << ' ' << field << ':' << h(category, field) << ':' << value; 50 | } 
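// The feature() overloads emit "field:index:value" triples in libffm-style text format; h() hashes the raw category into [hash_offset, hash_offset + hash_base) and mixes the field in (via the 2654435761 multiplier) so collisions fall differently in different fields.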
51 | 52 | void feature(uint32_t field, const std::string & category) { 53 | stream << ' ' << field << ':' << h(category, field) << ":1"; 54 | } 55 | 56 | void feature(uint32_t field, const std::string & category, float value) { 57 | stream << ' ' << field << ':' << h(category, field) << ':' << value; 58 | } 59 | 60 | void append(const char * str) { 61 | stream << str; 62 | } 63 | 64 | std::string str() { 65 | return stream.str(); 66 | } 67 | }; 68 | 69 | 70 | inline float pos_time_diff(int64_t td) { 71 | if (td < 0) 72 | return 0; 73 | 74 | return log(1 + td) / 100; 75 | } 76 | 77 | inline float time_diff(int64_t td) { 78 | if (td < 0) 79 | return - log(1 - td) / 100; 80 | 81 | return log(1 + td) / 100; 82 | } 83 | 84 | 85 | class writer { 86 | std::ofstream out; 87 | public: 88 | writer(const std::string & file_name) : out(file_name) {} 89 | 90 | void write(const reference_data & data, const std::vector> & rows); 91 | void finish() {} 92 | }; 93 | 94 | 95 | void writer::write(const reference_data & data, const std::vector> & rows) { 96 | int event_id = stoi(rows[0][0]); 97 | int ad_id = stoi(rows[0][1]); 98 | int label = rows[0].size() == 3 ? stoi(rows[0][2]) : -1; 99 | 100 | int leak_viewed = stoi(rows[1][0]); 101 | int leak_not_viewed = stoi(rows[1][1]); 102 | 103 | // 104 | 105 | auto ad = data.ads[ad_id]; 106 | auto event = data.events[event_id]; 107 | 108 | auto ad_doc = data.documents.at(ad.document_id); 109 | auto ad_doc_categories = data.document_categories.equal_range(ad.document_id); 110 | //auto ad_doc_topics = data.document_topics.equal_range(ad.document_id); 111 | //auto ad_doc_entities = data.document_entities.equal_range(ad.document_id); 112 | 113 | auto ev_doc = data.documents.at(event.document_id); 114 | auto ev_doc_categories = data.document_categories.equal_range(event.document_id); 115 | //auto ev_doc_topics = data.document_topics.equal_range(event.document_id); 116 | //auto ev_doc_entities = data.document_entities.equal_range(event.document_id); 117 | 118 | // Start building line 119 | line_builder line(label); 120 | 121 | line.feature(0, ad_id); 122 | line.feature(1, ad.campaign_id); 123 | line.feature(2, ad.advertiser_id); 124 | 125 | line.feature(3, event.platform); 126 | line.feature(4, event.country); 127 | line.feature(5, event.state); 128 | 129 | // Document info 130 | line.feature(6, event.document_id); 131 | line.feature(7, ev_doc.source_id); 132 | line.feature(8, ev_doc.publisher_id); 133 | 134 | for (auto it = ev_doc_categories.first; it != ev_doc_categories.second; ++ it) 135 | line.feature(12, it->second.first, it->second.second); 136 | 137 | // Promoted document info 138 | line.feature(9, ad.document_id); 139 | line.feature(10, ad_doc.source_id); 140 | line.feature(11, ad_doc.publisher_id); 141 | 142 | for (auto it = ad_doc_categories.first; it != ad_doc_categories.second; ++ it) 143 | line.feature(13, it->second.first, it->second.second); 144 | 145 | // Manual features 146 | 147 | if (ad_doc.publisher_id == ev_doc.publisher_id) 148 | line.append(" 18:0:1"); // Same publisher 149 | 150 | if (ad_doc.source_id == ev_doc.source_id) 151 | line.append(" 19:1:1"); // Same source 152 | 153 | if (leak_viewed > 0) 154 | line.append(" 20:2:1"); // Same source 155 | 156 | if (leak_not_viewed > 0) 157 | line.append(" 21:3:1"); // Same source 158 | 159 | line.stream << " 22:" << (event.weekday + 50) << ":1 23:" << (event.hour + 70) << ":1"; 160 | 161 | line.stream << " 24:4:" << pos_time_diff(event.timestamp - ad_doc.publish_timestamp); 162 | line.stream << " 25:5:" 
<< time_diff(ev_doc.publish_timestamp - ad_doc.publish_timestamp); 163 | line.append("\n"); 164 | 165 | out << line.str(); 166 | } 167 | 168 | int main() { 169 | using namespace std; 170 | 171 | cout << "Loading reference data..." << endl; 172 | auto data = load_reference_data(); 173 | 174 | cout << "Generating files..." << endl; 175 | generate_files(data, filesets); 176 | 177 | cout << "Done." << endl; 178 | } 179 | -------------------------------------------------------------------------------- /prepare-counts.cpp: -------------------------------------------------------------------------------- 1 | #include "util/io.h" 2 | #include "util/data.h" 3 | 4 | std::vector>> filesets { 5 | { "cv1", { "cache/clicks_cv1_train.csv.gz", "cache/clicks_cv1_test.csv.gz" } }, 6 | { "cv2", { "cache/clicks_cv2_train.csv.gz", "cache/clicks_cv2_test.csv.gz" } }, 7 | { "full", { "../input/clicks_train.csv.gz", "../input/clicks_test.csv.gz" } } 8 | }; 9 | 10 | 11 | struct cnt { 12 | uint32_t train_count; 13 | uint32_t test_count; 14 | 15 | uint32_t & operator[](uint i) { 16 | switch (i) { 17 | case 0: return train_count; 18 | case 1: return test_count; 19 | default: throw std::logic_error("Invalid field index"); 20 | } 21 | } 22 | }; 23 | 24 | 25 | template 26 | void write_counts(const std::unordered_map & map, const std::string & file_name) { 27 | using namespace std; 28 | 29 | cout << "Writing " << file_name << "... " << endl; 30 | 31 | ofstream outfile(file_name, std::ios_base::out | std::ios_base::binary); 32 | 33 | streamsize buffer_size = 1024*1024; 34 | boost::iostreams::filtering_streambuf buf; 35 | buf.push(boost::iostreams::gzip_compressor(), buffer_size, buffer_size); 36 | buf.push(outfile, buffer_size, buffer_size); 37 | 38 | std::ostream out(&buf); 39 | 40 | out << "id,train_count,test_count" << endl; 41 | 42 | for (auto it = map.begin(); it != map.end(); ++ it) 43 | out << it->first << "," << it->second.train_count << "," << it->second.test_count << endl; 44 | } 45 | 46 | 47 | int main() { 48 | using namespace std; 49 | 50 | cout << "Loading reference data..." << endl; 51 | auto data = load_reference_data(); 52 | 53 | for (auto it = filesets.begin(); it != filesets.end(); ++ it) { 54 | unordered_map ad_counts; 55 | unordered_map ad_campaign_counts; 56 | unordered_map ad_advertiser_counts; 57 | unordered_map ad_doc_counts; 58 | unordered_map ad_doc_source_counts; 59 | unordered_map ad_doc_publisher_counts; 60 | unordered_map ev_doc_counts; 61 | unordered_map ev_doc_source_counts; 62 | unordered_map ev_doc_publisher_counts; 63 | unordered_map uid_counts; 64 | 65 | unordered_map ev_country_counts; 66 | unordered_map ev_state_counts; 67 | unordered_map ev_region_counts; 68 | 69 | 70 | cout << "Processing " << it->first << "... " << endl; 71 | 72 | for (uint fi = 0; fi < it->second.size(); ++ fi) { 73 | auto file_name = it->second[fi]; 74 | clock_t begin = clock(); 75 | 76 | cout << " Loading " << file_name << "... "; 77 | cout.flush(); 78 | 79 | compressed_csv_file file(file_name); 80 | 81 | for (int ri = 0;; ++ri) { 82 | auto row = file.getrow(); 83 | 84 | if (row.empty()) 85 | break; 86 | 87 | if (ri > 0 && ri % 5000000 == 0) { 88 | cout << (ri / 1000000) << "M... 
"; 89 | cout.flush(); 90 | } 91 | 92 | // Extract fields 93 | int ev_id = stoi(row[0]); 94 | int ad_id = stoi(row[1]); 95 | 96 | auto ad = data.ads[ad_id]; 97 | auto ev = data.events[ev_id]; 98 | 99 | auto ad_doc = data.documents.at(ad.document_id); 100 | auto ev_doc = data.documents.at(ev.document_id); 101 | 102 | // Increment counters 103 | ++ ad_counts[ad_id][fi]; 104 | ++ ad_campaign_counts[ad.campaign_id][fi]; 105 | ++ ad_advertiser_counts[ad.advertiser_id][fi]; 106 | 107 | ++ ad_doc_counts[ad.document_id][fi]; 108 | ++ ad_doc_source_counts[ad_doc.source_id][fi]; 109 | ++ ad_doc_publisher_counts[ad_doc.publisher_id][fi]; 110 | 111 | ++ ev_doc_counts[ev.document_id][fi]; 112 | ++ ev_doc_source_counts[ev_doc.source_id][fi]; 113 | ++ ev_doc_publisher_counts[ev_doc.publisher_id][fi]; 114 | 115 | ++ ev_country_counts[ev.country][fi]; 116 | ++ ev_state_counts[ev.state][fi]; 117 | ++ ev_region_counts[ev.region][fi]; 118 | 119 | ++ uid_counts[ev.uid][fi]; 120 | } 121 | 122 | clock_t end = clock(); 123 | double elapsed = double(end - begin) / CLOCKS_PER_SEC; 124 | 125 | cout << "done in " << elapsed << " seconds" << endl; 126 | } 127 | 128 | write_counts(ad_counts, string("cache/counts/ads_") + it->first + string(".csv.gz")); 129 | write_counts(ad_campaign_counts, string("cache/counts/ad_campaigns_") + it->first + string(".csv.gz")); 130 | write_counts(ad_advertiser_counts, string("cache/counts/ad_advertisers_") + it->first + string(".csv.gz")); 131 | 132 | write_counts(ad_doc_counts, string("cache/counts/ad_docs_") + it->first + string(".csv.gz")); 133 | write_counts(ad_doc_source_counts, string("cache/counts/ad_doc_sources_") + it->first + string(".csv.gz")); 134 | write_counts(ad_doc_publisher_counts, string("cache/counts/ad_doc_publishers_") + it->first + string(".csv.gz")); 135 | 136 | write_counts(ev_doc_counts, string("cache/counts/ev_docs_") + it->first + string(".csv.gz")); 137 | write_counts(ev_doc_source_counts, string("cache/counts/ev_doc_sources_") + it->first + string(".csv.gz")); 138 | write_counts(ev_doc_publisher_counts, string("cache/counts/ev_doc_publishers_") + it->first + string(".csv.gz")); 139 | 140 | write_counts(ev_country_counts, string("cache/counts/ev_countries_") + it->first + string(".csv.gz")); 141 | write_counts(ev_state_counts, string("cache/counts/ev_states_") + it->first + string(".csv.gz")); 142 | write_counts(ev_region_counts, string("cache/counts/ev_regions_") + it->first + string(".csv.gz")); 143 | 144 | write_counts(uid_counts, string("cache/counts/uids_") + it->first + string(".csv.gz")); 145 | } 146 | 147 | cout << "Done." 
<< endl; 148 | } 149 | -------------------------------------------------------------------------------- /prepare-leak.cpp: -------------------------------------------------------------------------------- 1 | #include "util/io.h" 2 | #include "util/data.h" 3 | 4 | 5 | std::vector> filesets { 6 | std::make_pair("cache/clicks_cv1_train.csv.gz", "cache/leak_cv1_train.csv.gz"), 7 | std::make_pair("cache/clicks_cv1_test.csv.gz", "cache/leak_cv1_test.csv.gz"), 8 | std::make_pair("cache/clicks_cv2_train.csv.gz", "cache/leak_cv2_train.csv.gz"), 9 | std::make_pair("cache/clicks_cv2_test.csv.gz", "cache/leak_cv2_test.csv.gz"), 10 | std::make_pair("../input/clicks_train.csv.gz", "cache/leak_full_train.csv.gz"), 11 | std::make_pair("../input/clicks_test.csv.gz", "cache/leak_full_test.csv.gz"), 12 | }; 13 | 14 | struct event_info { 15 | int uid; 16 | int timestamp; 17 | }; 18 | 19 | 20 | std::unordered_map uuid_map; 21 | std::unordered_map, std::vector> doc_views_map; 22 | std::unordered_map documents_map; 23 | 24 | 25 | std::pair read_event_info(const std::vector & row) { 26 | auto id = stoi(row[0]); 27 | auto uuid = row[1]; 28 | 29 | event_info res; 30 | res.timestamp = stoi(row[3]); 31 | 32 | auto it = uuid_map.find(uuid); 33 | 34 | if (it == uuid_map.end()) { 35 | res.uid = uuid_map.size(); 36 | uuid_map.insert(std::make_pair(uuid, res.uid)); 37 | } else { 38 | res.uid = it->second; 39 | } 40 | 41 | return std::make_pair(id, res); 42 | } 43 | 44 | std::pair read_ad_document(const std::vector & row) { 45 | int ad_document_id = stoi(row[1]); 46 | 47 | if (documents_map.count(ad_document_id) == 0) 48 | documents_map.insert(std::make_pair(ad_document_id, 0)); 49 | 50 | return std::make_pair(stoi(row[0]), ad_document_id); 51 | } 52 | 53 | 54 | int main() { 55 | using namespace std; 56 | 57 | cout << "Loading reference data..." << endl; 58 | auto events = read_vector("cache/events.csv.gz", read_event_info, 23120127); 59 | auto ad_documents = read_vector("../input/promoted_content.csv.gz", read_ad_document, 573099); 60 | 61 | 62 | cout << "Loading click data..." << endl; 63 | for (auto it = filesets.begin(); it != filesets.end(); ++ it) { 64 | cout << " Loading " << it->first << "... "; 65 | cout.flush(); 66 | 67 | clock_t begin = clock(); 68 | 69 | compressed_csv_file file(it->first); 70 | 71 | for (int i = 0;; ++i) { 72 | auto row = file.getrow(); 73 | 74 | if (row.empty()) 75 | break; 76 | 77 | auto ev = events.at(stoi(row[0])); 78 | auto document_id = ad_documents.at(stoi(row[1])); 79 | 80 | doc_views_map[make_pair(document_id, ev.uid)] = std::vector(); 81 | 82 | if (i > 0 && i % 5000000 == 0) { 83 | cout << (i / 1000000) << "M... "; 84 | cout.flush(); 85 | } 86 | } 87 | 88 | clock_t end = clock(); 89 | double elapsed = double(end - begin) / CLOCKS_PER_SEC; 90 | 91 | cout << "done in " << elapsed << " seconds" << endl; 92 | } 93 | 94 | { 95 | cout << "Processing leak data... 
"; 96 | cout.flush(); 97 | 98 | clock_t begin = clock(); 99 | 100 | compressed_csv_file file("../input/page_views.csv.gz"); 101 | int found = 0; 102 | 103 | for (int i = 0;; ++i) { 104 | auto row = file.getrow(); 105 | 106 | if (row.empty()) 107 | break; 108 | 109 | auto uuid = row[0]; 110 | auto document_id = stoi(row[1]); 111 | 112 | // Register view 113 | auto uid_it = uuid_map.find(uuid); 114 | if (uid_it != uuid_map.end()) { 115 | auto uid = uid_it->second; 116 | 117 | auto dv_it = doc_views_map.find(make_pair(document_id, uid)); 118 | if (dv_it != doc_views_map.end()) { 119 | dv_it->second.push_back(stoi(row[2])); 120 | found ++; 121 | } 122 | } 123 | 124 | // Register document view 125 | auto doc_it = documents_map.find(document_id); 126 | if (doc_it != documents_map.end()) { 127 | doc_it->second ++; 128 | } 129 | 130 | if (i > 0 && i % 5000000 == 0) { 131 | cout << (i / 1000000) << "M... "; 132 | cout.flush(); 133 | } 134 | } 135 | 136 | clock_t end = clock(); 137 | double elapsed = double(end - begin) / CLOCKS_PER_SEC; 138 | 139 | cout << "done in " << elapsed << " seconds, found " << found << " entries" << endl; 140 | } 141 | 142 | cout << "Generating leak features..." << endl; 143 | for (auto it = filesets.begin(); it != filesets.end(); ++ it) { 144 | cout << " Generating " << it->second << "... "; 145 | cout.flush(); 146 | 147 | clock_t begin = clock(); 148 | 149 | compressed_csv_file file(it->first); 150 | ofstream outfile(it->second, std::ios_base::out | std::ios_base::binary); 151 | 152 | streamsize buffer_size = 1024*1024; 153 | boost::iostreams::filtering_streambuf buf; 154 | buf.push(boost::iostreams::gzip_compressor(), buffer_size, buffer_size); 155 | buf.push(outfile, buffer_size, buffer_size); 156 | 157 | std::ostream out(&buf); 158 | 159 | out << "viewed,not_viewed" << endl; 160 | 161 | for (int i = 0;; ++i) { 162 | auto row = file.getrow(); 163 | 164 | if (row.empty()) 165 | break; 166 | 167 | auto ev = events.at(stoi(row[0])); 168 | auto document_id = ad_documents.at(stoi(row[1])); 169 | 170 | auto doc_view_times = doc_views_map.at(make_pair(document_id, ev.uid)); 171 | auto doc_views = documents_map.at(document_id); 172 | 173 | out << doc_view_times.size() << "," 174 | << int(doc_view_times.size() == 0 && doc_views > 0) << endl; 175 | 176 | if (i > 0 && i % 5000000 == 0) { 177 | cout << (i / 1000000) << "M... "; 178 | cout.flush(); 179 | } 180 | } 181 | 182 | clock_t end = clock(); 183 | double elapsed = double(end - begin) / CLOCKS_PER_SEC; 184 | 185 | cout << "done in " << elapsed << " seconds" << endl; 186 | } 187 | 188 | cout << "Done." 
<< endl; 189 | } 190 | -------------------------------------------------------------------------------- /prepare-similarity.cpp: -------------------------------------------------------------------------------- 1 | #include "util/io.h" 2 | 3 | std::vector> filesets { 4 | std::make_pair("cache/clicks_cv1_train.csv.gz", "cache/similarity_cv1_train.csv.gz"), 5 | std::make_pair("cache/clicks_cv1_test.csv.gz", "cache/similarity_cv1_test.csv.gz"), 6 | std::make_pair("cache/clicks_cv2_train.csv.gz", "cache/similarity_cv2_train.csv.gz"), 7 | std::make_pair("cache/clicks_cv2_test.csv.gz", "cache/similarity_cv2_test.csv.gz"), 8 | std::make_pair("../input/clicks_train.csv.gz", "cache/similarity_full_train.csv.gz"), 9 | std::make_pair("../input/clicks_test.csv.gz", "cache/similarity_full_test.csv.gz"), 10 | }; 11 | 12 | struct event_info { 13 | int uid; 14 | int document_id; 15 | }; 16 | 17 | 18 | std::vector event_documents; 19 | std::vector ad_documents; 20 | 21 | typedef std::pair annotation; 22 | 23 | std::unordered_map> document_categories_map; 24 | std::unordered_map> document_topics_map; 25 | std::unordered_map> document_entities_map; 26 | 27 | void create_document_entries(int document_id) { 28 | if (document_categories_map.count(document_id) == 0) 29 | document_categories_map.insert(std::make_pair(document_id, std::vector())); 30 | 31 | if (document_topics_map.count(document_id) == 0) 32 | document_topics_map.insert(std::make_pair(document_id, std::vector())); 33 | 34 | if (document_entities_map.count(document_id) == 0) 35 | document_entities_map.insert(std::make_pair(document_id, std::vector())); 36 | } 37 | 38 | std::pair read_ad_document(const std::vector & row) { 39 | int document_id = stoi(row[1]); 40 | 41 | create_document_entries(document_id); 42 | 43 | return std::make_pair(stoi(row[0]), document_id); 44 | } 45 | 46 | std::pair read_event_document(const std::vector & row) { 47 | int document_id = stoi(row[2]); 48 | 49 | create_document_entries(document_id); 50 | 51 | return std::make_pair(stoi(row[0]), document_id); 52 | } 53 | 54 | std::pair read_document_annotation(const std::vector & row) { 55 | return std::make_pair(stoi(row[0]), std::make_pair(stoi(row[1]), stof(row[2]))); 56 | } 57 | 58 | struct similarity_values { 59 | float product; 60 | float jaccard; 61 | }; 62 | 63 | 64 | /** 65 | Compute different similarity measures between two vectors, sorted by key 66 | */ 67 | similarity_values similarity(const std::vector & va, const std::vector & vb) { 68 | auto ia = va.begin(); 69 | auto ib = vb.begin(); 70 | 71 | float product = 0; 72 | 73 | int intersection_size = 0; 74 | int union_size = 0; 75 | 76 | while (ia != va.end() && ib != vb.end()) { 77 | if (ia->first < ib->first) { 78 | union_size ++; 79 | ia ++; 80 | } else if (ia->first > ib->first) { 81 | union_size ++; 82 | ib ++; 83 | } else { 84 | product += ia->second * ib->second; 85 | union_size ++; 86 | intersection_size ++; 87 | ia ++; 88 | ib ++; 89 | } 90 | } 91 | 92 | while (ia != va.end()) { 93 | union_size ++; 94 | ia ++; 95 | } 96 | 97 | while (ib != vb.end()) { 98 | union_size ++; 99 | ib ++; 100 | } 101 | 102 | similarity_values res; 103 | res.product = product; 104 | res.jaccard = union_size > 0 ? float(intersection_size) / union_size : 0; 105 | 106 | return res; 107 | } 108 | 109 | 110 | int main() { 111 | using namespace std; 112 | 113 | cout << "Loading reference data..." 
<< endl; 114 | read_vector(event_documents, "cache/events.csv.gz", read_event_document, 23120127); 115 | read_vector(ad_documents, "../input/promoted_content.csv.gz", read_ad_document, 573099); 116 | 117 | read_sorted_vector_map(document_categories_map, "../input/documents_categories.csv.gz", read_document_annotation); 118 | read_sorted_vector_map(document_topics_map, "../input/documents_topics.csv.gz", read_document_annotation); 119 | read_sorted_vector_map(document_entities_map, "cache/documents_entities.csv.gz", read_document_annotation); 120 | 121 | 122 | cout << "Generating similarity features..." << endl; 123 | for (auto it = filesets.begin(); it != filesets.end(); ++ it) { 124 | cout << " Generating " << it->second << "... "; 125 | cout.flush(); 126 | 127 | clock_t begin = clock(); 128 | 129 | compressed_csv_file file(it->first); 130 | ofstream outfile(it->second, std::ios_base::out | std::ios_base::binary); 131 | 132 | streamsize buffer_size = 1024*1024; 133 | boost::iostreams::filtering_streambuf buf; 134 | buf.push(boost::iostreams::gzip_compressor(), buffer_size, buffer_size); 135 | buf.push(outfile, buffer_size, buffer_size); 136 | 137 | std::ostream out(&buf); 138 | 139 | out << "cat_sim_product,cat_sim_jaccard,top_sim_product,top_sim_jaccard,ent_sim_product,ent_sim_jaccard" << endl; 140 | 141 | for (int i = 0;; ++i) { 142 | auto row = file.getrow(); 143 | 144 | if (row.empty()) 145 | break; 146 | 147 | auto ev_doc_id = event_documents.at(stoi(row[0])); 148 | auto ad_doc_id = ad_documents.at(stoi(row[1])); 149 | 150 | // Compute document similarity metrics 151 | auto cat_sim = similarity(document_categories_map.at(ad_doc_id), document_categories_map.at(ev_doc_id)); 152 | auto top_sim = similarity(document_topics_map.at(ad_doc_id), document_topics_map.at(ev_doc_id)); 153 | auto ent_sim = similarity(document_entities_map.at(ad_doc_id), document_entities_map.at(ev_doc_id)); 154 | 155 | // Write similarity metrics 156 | out << cat_sim.product << "," << cat_sim.jaccard << ","; 157 | out << top_sim.product << "," << top_sim.jaccard << ","; 158 | out << ent_sim.product << "," << ent_sim.jaccard << endl; 159 | 160 | if (i > 0 && i % 5000000 == 0) { 161 | cout << (i / 1000000) << "M... "; 162 | cout.flush(); 163 | } 164 | } 165 | 166 | clock_t end = clock(); 167 | double elapsed = double(end - begin) / CLOCKS_PER_SEC; 168 | 169 | cout << "done in " << elapsed << " seconds" << endl; 170 | } 171 | 172 | cout << "Done." 
<< endl; 173 | } 174 | -------------------------------------------------------------------------------- /util/data.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "io.h" 8 | 9 | struct event { 10 | uint32_t document_id; 11 | uint32_t uid; 12 | int64_t timestamp; 13 | uint16_t platform; 14 | uint16_t weekday; 15 | uint16_t hour; 16 | std::string country; 17 | std::string state; 18 | std::string region; 19 | std::string location; 20 | }; 21 | 22 | struct ad { 23 | uint32_t document_id; 24 | uint32_t campaign_id; 25 | uint32_t advertiser_id; 26 | }; 27 | 28 | struct document { 29 | uint32_t source_id; 30 | uint32_t publisher_id; 31 | int64_t publish_timestamp; 32 | }; 33 | 34 | 35 | struct traffic_source_id_list { 36 | std::vector internal, social, search; 37 | }; 38 | 39 | 40 | 41 | // Small util 42 | 43 | 44 | namespace std { 45 | template 46 | struct hash> { 47 | std::size_t operator()(const std::pair& k) const { 48 | return std::hash()(k.first) ^ (std::hash()(k.second) >> 1); 49 | } 50 | }; 51 | } 52 | 53 | 54 | 55 | // Functions to read data types 56 | 57 | 58 | std::pair read_event(const std::vector & row) { 59 | event e; 60 | 61 | e.document_id = stoi(row[2]); 62 | e.timestamp = stoll(row[3]); 63 | 64 | try { 65 | e.platform = stoi(row[4]); 66 | } catch (std::invalid_argument) { 67 | e.platform = 0; 68 | } 69 | 70 | e.location = row[5]; 71 | 72 | e.country = row[6]; 73 | e.state = row[7]; 74 | e.region = row[8]; 75 | 76 | e.hour = stoi(row[9]); 77 | e.weekday = stoi(row[10]); 78 | e.uid = stoi(row[11]); 79 | 80 | return std::make_pair(stoi(row[0]), e); 81 | } 82 | 83 | std::pair read_ad(const std::vector & row) { 84 | ad a; 85 | 86 | a.document_id = stoi(row[1]); 87 | a.campaign_id = stoi(row[2]); 88 | a.advertiser_id = stoi(row[3]); 89 | 90 | return std::make_pair(stoi(row[0]), a); 91 | } 92 | 93 | std::pair read_document(const std::vector & row) { 94 | document d; 95 | 96 | d.source_id = stoi(row[1]); 97 | d.publisher_id = stoi(row[2]); 98 | d.publish_timestamp = stoll(row[4]); 99 | 100 | return std::make_pair(stoi(row[0]), d); 101 | } 102 | 103 | std::pair> read_document_annotation(const std::vector & row) { 104 | return std::make_pair(stoi(row[0]), std::make_pair(stoi(row[1]), stof(row[2]))); 105 | } 106 | 107 | std::pair read_count(const std::vector & row) { 108 | return std::make_pair(stoi(row[0]), stoi(row[1])); 109 | } 110 | 111 | std::vector parse_id_list(const std::string & field) { 112 | using namespace std; 113 | 114 | vector values; 115 | 116 | stringstream ss; 117 | ss.str(field); 118 | 119 | string item; 120 | while (getline(ss, item, ' ')) { 121 | values.push_back(stoi(item)); 122 | } 123 | 124 | return values; 125 | } 126 | 127 | std::pair, std::vector> read_display_indexed_id_list(const std::vector & row) { 128 | using namespace std; 129 | 130 | auto key = make_pair(stoi(row[0]), stoi(row[1])); 131 | auto val = parse_id_list(row[2]); 132 | 133 | return make_pair(key, val); 134 | } 135 | 136 | 137 | std::pair> read_uid_indexed_id_list(const std::vector & row) { 138 | using namespace std; 139 | 140 | return make_pair(stoi(row[0]), parse_id_list(row[1])); 141 | } 142 | 143 | 144 | std::pair read_uid_indexed_trfsrc_id_list(const std::vector & row) { 145 | using namespace std; 146 | 147 | traffic_source_id_list trf; 148 | trf.internal = parse_id_list(row[1]); 149 | trf.social = parse_id_list(row[2]); 150 | 151 | if (row.size() > 3) 152 | trf.search = 
parse_id_list(row[3]); 153 | 154 | return make_pair(stoi(row[0]), move(trf)); 155 | } 156 | 157 | std::vector read_events() { 158 | return read_vector("cache/events.csv.gz", read_event, 23120127); 159 | } 160 | 161 | std::vector read_ads() { 162 | return read_vector("../input/promoted_content.csv.gz", read_ad, 573099); 163 | } 164 | 165 | 166 | 167 | // All data 168 | 169 | 170 | struct reference_data { 171 | std::vector events; 172 | std::vector ads; 173 | std::unordered_map documents; 174 | std::unordered_multimap> document_categories; 175 | std::unordered_multimap> document_topics; 176 | std::unordered_multimap> document_entities; 177 | 178 | std::unordered_map, std::vector> viewed_docs_one_hour_after; 179 | //std::unordered_map, std::vector> viewed_docs_six_hours_after; 180 | 181 | std::unordered_map> doc_ad_others; 182 | std::unordered_map> viewed_doc_trf_source; 183 | std::unordered_map> viewed_doc_sources; 184 | 185 | std::unordered_map viewed_trfsrc_doc_sources; 186 | std::unordered_map viewed_trfsrc_docs; 187 | }; 188 | 189 | reference_data load_reference_data() { 190 | reference_data res; 191 | res.viewed_trfsrc_doc_sources = read_map("cache/viewed_trfsrc_doc_sources.csv.gz", read_uid_indexed_trfsrc_id_list); 192 | res.viewed_trfsrc_docs = read_map("cache/viewed_trfsrc_docs.csv.gz", read_uid_indexed_trfsrc_id_list); 193 | 194 | res.events = read_events(); 195 | res.ads = read_ads(); 196 | res.documents = read_map("cache/documents.csv.gz", read_document); 197 | res.document_categories = read_multi_map("../input/documents_categories.csv.gz", read_document_annotation); 198 | res.document_topics = read_multi_map("../input/documents_topics.csv.gz", read_document_annotation); 199 | res.document_entities = read_multi_map("cache/documents_entities.csv.gz", read_document_annotation); 200 | 201 | res.viewed_docs_one_hour_after = read_map("cache/viewed_docs_one_hour_after.csv.gz", read_display_indexed_id_list); 202 | //res.viewed_docs_six_hours_after = read_map("cache/viewed_docs_six_hours_after.csv.gz", read_display_indexed_id_list); 203 | 204 | res.doc_ad_others = read_map("cache/doc_ad_others.csv.gz", read_uid_indexed_id_list); 205 | res.viewed_doc_trf_source = read_map("cache/viewed_doc_trf_source.csv.gz", read_uid_indexed_id_list); 206 | res.viewed_doc_sources = read_map("cache/viewed_doc_sources.csv.gz", read_uid_indexed_id_list); 207 | 208 | 209 | return res; 210 | } 211 | -------------------------------------------------------------------------------- /util/io.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | 17 | 18 | void split(const std::string &s, char delim, std::vector &elems) { 19 | std::stringstream ss; 20 | ss.str(s); 21 | std::string item; 22 | while (std::getline(ss, item, delim)) { 23 | elems.push_back(item); 24 | } 25 | } 26 | 27 | std::vector split(const std::string &s, char delim) { 28 | std::vector elems; 29 | split(s, delim, elems); 30 | return elems; 31 | } 32 | 33 | 34 | class compressed_csv_file { 35 | public: 36 | boost::iostreams::filtering_istream in; 37 | std::vector header; 38 | public: 39 | compressed_csv_file(const std::string & name) { 40 | std::streamsize buffer_size = 1024*1024; 41 | 42 | in.push(boost::iostreams::gzip_decompressor(), buffer_size, buffer_size); 43 | in.push(boost::iostreams::file_source(name, std::ios_base::in | 
std::ios_base::binary), buffer_size, buffer_size); 44 | 45 | header = getrow(); 46 | 47 | if (!in) 48 | throw std::runtime_error(std::string("Can't open file ") + name); 49 | } 50 | 51 | std::string getline() { 52 | std::string line; 53 | std::getline(in, line); 54 | return line; 55 | } 56 | 57 | std::vector getrow() { 58 | return split(getline(), ','); 59 | } 60 | 61 | operator bool() { 62 | return !in.eof(); 63 | } 64 | }; 65 | 66 | 67 | template 68 | std::unordered_map read_map(const std::string & file_name, std::pair read_entry(const std::vector &)) { 69 | using namespace std; 70 | 71 | time_t begin = time(nullptr); 72 | 73 | cout << " Loading " << boost::typeindex::type_id().pretty_name() << "s from " << file_name << "... "; 74 | cout.flush(); 75 | 76 | compressed_csv_file file(file_name); 77 | unordered_map res; 78 | 79 | for (int i = 0;; ++i) { 80 | vector row = file.getrow(); 81 | 82 | if (row.empty()) 83 | break; 84 | 85 | res.insert(read_entry(row)); 86 | 87 | if (i > 0 && i % 5000000 == 0) { 88 | cout << (i / 1000000) << "M... "; 89 | cout.flush(); 90 | } 91 | } 92 | 93 | cout << "done in " << (time(nullptr) - begin) << " seconds, " << res.size() << " records." << endl; 94 | 95 | return res; 96 | } 97 | 98 | 99 | template 100 | void read_vector(std::vector & res, const std::string & file_name, std::pair read_entry(const std::vector &), size_t size) { 101 | using namespace std; 102 | 103 | time_t begin = time(nullptr); 104 | 105 | cout << " Loading " << boost::typeindex::type_id().pretty_name() << "s from " << file_name << "... "; 106 | cout.flush(); 107 | 108 | compressed_csv_file file(file_name); 109 | 110 | // Resize vector to contain all elements 111 | res.resize(size); 112 | 113 | for (int i = 0;; ++i) { 114 | vector row = file.getrow(); 115 | 116 | if (row.empty()) 117 | break; 118 | 119 | auto entry = read_entry(row); 120 | 121 | res[entry.first] = move(entry.second); 122 | 123 | if (i > 0 && i % 5000000 == 0) { 124 | cout << (i / 1000000) << "M... "; 125 | cout.flush(); 126 | } 127 | } 128 | 129 | cout << "done in " << (time(nullptr) - begin) << " seconds." << endl; 130 | } 131 | 132 | 133 | template 134 | std::vector read_vector(const std::string & file_name, std::pair read_entry(const std::vector &), size_t size) { 135 | std::vector res; 136 | read_vector(res, file_name, read_entry, size); 137 | return res; 138 | } 139 | 140 | 141 | template 142 | std::unordered_multimap read_multi_map(const std::string & file_name, std::pair read_entry(const std::vector &)) { 143 | using namespace std; 144 | 145 | time_t begin = time(nullptr); 146 | 147 | cout << " Loading " << boost::typeindex::type_id().pretty_name() << "s from " << file_name << "... "; 148 | cout.flush(); 149 | 150 | compressed_csv_file file(file_name); 151 | unordered_multimap res; 152 | 153 | for (int i = 0;; ++i) { 154 | vector row = file.getrow(); 155 | 156 | if (row.empty()) 157 | break; 158 | 159 | res.insert(read_entry(row)); 160 | 161 | if (i > 0 && i % 5000000 == 0) { 162 | cout << (i / 1000000) << "M... "; 163 | cout.flush(); 164 | } 165 | } 166 | 167 | cout << "done in " << (time(nullptr) - begin) << " seconds, " << res.size() << " records." 
<< endl; 168 | 169 | return res; 170 | } 171 | 172 | 173 | template 174 | void read_sorted_vector_map(std::unordered_map> & res, const std::string & file_name, std::pair read_entry(const std::vector &)) { 175 | using namespace std; 176 | 177 | time_t begin = time(nullptr); 178 | 179 | cout << " Loading " << boost::typeindex::type_id().pretty_name() << "s from " << file_name << "... "; 180 | cout.flush(); 181 | 182 | compressed_csv_file file(file_name); 183 | 184 | for (int i = 0;; ++i) { 185 | vector row = file.getrow(); 186 | 187 | if (row.empty()) 188 | break; 189 | 190 | auto e = read_entry(row); 191 | 192 | if (res.count(e.first) == 0) 193 | res.insert(std::make_pair(e.first, vector())); 194 | 195 | res.at(e.first).push_back(move(e.second)); 196 | 197 | if (i > 0 && i % 5000000 == 0) { 198 | cout << (i / 1000000) << "M... "; 199 | cout.flush(); 200 | } 201 | } 202 | 203 | for (auto it = res.begin(); it != res.end(); ++ it) 204 | std::sort(it->second.begin(), it->second.end()); 205 | 206 | cout << "done in " << (time(nullptr) - begin) << " seconds, " << res.size() << " records." << endl; 207 | } 208 | 209 | template 210 | std::unordered_map> read_sorted_vector_map(const std::string & file_name, std::pair read_entry(const std::vector &)) { 211 | std::unordered_map> res; 212 | read_sorted_vector_map(res, file_name, read_entry); 213 | return res; 214 | } 215 | -------------------------------------------------------------------------------- /parse-xgb.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import seaborn as sns 5 | 6 | import re 7 | 8 | node_regex = re.compile("(\d+):\[(.*)<(.+)\]\syes=(.*),no=(.*),missing=.*,gain=(.*),cover=(.*)") 9 | leaf_regex = re.compile("(\d+):leaf=(.*),cover=(.*)") 10 | 11 | 12 | def merge_feature_importances_inplace(to_imp, from_imp): 13 | for f in from_imp: 14 | if f in to_imp: 15 | to_imp[f] += from_imp[f] 16 | else: 17 | to_imp[f] = from_imp[f] 18 | 19 | 20 | class FeatureImportance(object): 21 | def __init__(self, expected_fscore, expected_gain): 22 | self.expected_fscore = expected_fscore 23 | self.expected_gain = expected_gain 24 | 25 | def __add__(self, other): 26 | return FeatureImportance( 27 | expected_fscore=self.expected_fscore + other.expected_fscore, 28 | expected_gain=self.expected_gain + other.expected_gain 29 | ) 30 | 31 | def __repr__(self): 32 | return "[expected_fscore=%.5f, expected_gain=%.5f]" % (self.expected_fscore, self.expected_gain) 33 | 34 | 35 | class XgbLeaf(object): 36 | def __init__(self, index, value, cover): 37 | self.index = index 38 | self.value = value 39 | self.cover = cover 40 | 41 | def collect_feature_importances(self, importances, path_probability): 42 | pass 43 | 44 | def collect_split_values(self, feature, values): 45 | pass 46 | 47 | def constrain(self, feature, value): 48 | return self 49 | 50 | 51 | class XgbTree(object): 52 | def __init__(self, index, feature, split_value, gain, cover): 53 | self.index = index 54 | self.feature = feature 55 | self.split_value = split_value 56 | self.gain = gain 57 | self.cover = cover 58 | 59 | def collect_feature_importances(self, importances, path_probability): 60 | importance = FeatureImportance(expected_fscore=path_probability, expected_gain=path_probability * self.gain) 61 | 62 | if self.feature in importances: 63 | importances[self.feature] += importance 64 | else: 65 | importances[self.feature] = importance 66 | 67 | self.left.collect_feature_importances(importances, 
path_probability=path_probability * self.left.cover / self.cover) 68 | self.right.collect_feature_importances(importances, path_probability=path_probability * self.right.cover / self.cover) 69 | 70 | def collect_split_values(self, feature, values): 71 | if feature == self.feature: 72 | if self.split_value in values: 73 | values[self.split_value] += 1 74 | else: 75 | values[self.split_value] = 1 76 | 77 | self.left.collect_split_values(feature, values) 78 | self.right.collect_split_values(feature, values) 79 | 80 | def constrain(self, feature, value): 81 | if feature == self.feature: 82 | if value < self.split_value: 83 | return self.left.constrain(feature, value) 84 | else: 85 | return self.right.constrain(feature, value) 86 | 87 | tree = XgbTree(index=self.index, feature=self.feature, split_value=self.split_value, gain=self.gain, cover=self.cover) 88 | tree.left = self.left.constrain(feature, value) 89 | tree.right = self.right.constrain(feature, value) 90 | 91 | return tree 92 | 93 | 94 | class XgbModel(object): 95 | def __init__(self, trees): 96 | self.trees = trees 97 | 98 | def get_feature_importances(self): 99 | importances = {} 100 | 101 | for tree in self.trees: 102 | tree.collect_feature_importances(importances, path_probability=1.0) 103 | 104 | return importances 105 | 106 | def get_split_values(self, feature): 107 | values = {} 108 | 109 | for tree in self.trees: 110 | tree.collect_split_values(feature, values) 111 | 112 | return values 113 | 114 | def constrain(self, feature, value): 115 | return XgbModel([tree.constrain(feature, value) for tree in self.trees]) 116 | 117 | 118 | 119 | def parse_node(f): 120 | line = f.readline().strip() 121 | 122 | if 'leaf' in line: 123 | m = leaf_regex.match(line) 124 | 125 | return XgbLeaf( 126 | index=int(m.group(1)), 127 | value=float(m.group(2)), 128 | cover=float(m.group(3)) 129 | ) 130 | else: 131 | m = node_regex.match(line) 132 | 133 | tree = XgbTree( 134 | index=int(m.group(1)), 135 | feature=m.group(2), 136 | split_value=float(m.group(3)), 137 | gain=float(m.group(6)), 138 | cover=float(m.group(7)) 139 | ) 140 | 141 | left_index = int(m.group(4)) 142 | right_index = int(m.group(5)) 143 | 144 | first = parse_node(f) 145 | second = parse_node(f) 146 | 147 | if first.index == left_index and second.index == right_index: 148 | tree.left = first 149 | tree.right = second 150 | elif first.index == right_index and second.index == left_index: 151 | tree.left = second 152 | tree.right = first 153 | else: 154 | raise RuntimeError("Mismatching tree indices") 155 | 156 | return tree 157 | 158 | 159 | def parse_model_dump(file_name): 160 | with open(file_name) as f: 161 | trees = [] 162 | 163 | while True: 164 | line = f.readline().strip() 165 | 166 | if not line: 167 | break 168 | elif 'booster' in line: 169 | trees.append(parse_node(f)) 170 | else: 171 | raise RuntimeError("Can't parse line: '%s'" % line) 172 | 173 | return XgbModel(trees) 174 | 175 | 176 | model = parse_model_dump('xg.v10.dump') 177 | 178 | time_split_values = model.get_split_values('time').keys() 179 | 180 | min_time_value = min(time_split_values) - 1.0 181 | max_time_value = max(time_split_values) + 1.0 182 | 183 | time_values = np.linspace(min_time_value, max_time_value, 200) 184 | 185 | gain_records = [] 186 | fscore_records = [] 187 | 188 | for time_value in time_values: 189 | imps = model.constrain('time', time_value).get_feature_importances() 190 | 191 | gain_records.append({f: imps[f].expected_gain for f in imps}) 192 | fscore_records.append({f: imps[f].expected_fscore 
for f in imps}) 193 | 194 | 195 | df_gain = pd.DataFrame.from_records(gain_records, index=map(int, time_values)) 196 | df_gain.index.rename('time', inplace=True) 197 | df_gain.to_csv('xg.v10.gain.csv') 198 | 199 | df_fscore = pd.DataFrame.from_records(fscore_records, index=map(int, time_values)) 200 | df_fscore.index.rename('time', inplace=True) 201 | df_fscore.to_csv('xg.v10.fscore.csv') 202 | -------------------------------------------------------------------------------- /train-l2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import sys 5 | 6 | from util.meta import full_split, cv1_split, cv1_split_time, test_split_time 7 | from util import gen_prediction_name, gen_submission, score_sorted 8 | from util.sklearn_model import SklearnModel 9 | from util.keras_model import KerasModel 10 | from util.xgb_model import XgbModel 11 | 12 | from sklearn.model_selection import GroupKFold 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.metrics import log_loss 15 | 16 | from scipy.special import logit 17 | 18 | 19 | preds = [ 20 | #'20170114-2122-ffm2-f1b-0.68827', 21 | #'20170114-0106-ffm2-f1b-0.68775', 22 | 23 | #'20170113-1506-ffm2-f1-0.68447', 24 | #'20170113-1213-ffm2-p1-0.68392', 25 | 26 | '20170110-0230-ffm2-f1-0.69220', 27 | '20170110-1055-ffm2-f1-2-0.69214', 28 | 29 | '20170110-0124-ffm2-f1-0.69175', 30 | 31 | '20170109-1354-ffm2-f1-0.69148', 32 | 33 | '20170108-2008-ffm2-f1-0.68984', 34 | 35 | '20170107-2248-ffm2-p1-0.68876', 36 | '20170108-0345-ffm2-p2-0.68762', 37 | 38 | '20170106-2000-ffm2-p1-0.68754', 39 | '20170106-2050-ffm2-p2-0.68656', 40 | 41 | '20170105-2113-ffm2-p1-0.68684', 42 | 43 | '20161230-1323-ffm-p1-0.68204', 44 | '20161230-1049-ffm-p2-0.68169', 45 | 46 | '20161231-0544-vw-p1-0.67309', 47 | '20161231-1927-vw-p2-0.66718', 48 | 49 | '20170106-1339-vw-p1-0.67829', 50 | '20170109-1239-vw-p2-0.67148', 51 | ] 52 | 53 | models = { 54 | 'lr': lambda: SklearnModel(LogisticRegression(C=0.01)), 55 | 'nn': lambda: KerasModel(batch_size=128, layers=[40, 10], dropouts=[0.3, 0.1], n_epoch=1), 56 | 'xgb': lambda: XgbModel(n_iter=1500, silent=1, objective='binary:logistic', eval_metric='logloss', seed=144, max_depth=4, colsample_bytree=0.5, subsample=0.25, tree_method='exact', eta=0.05) 57 | } 58 | 59 | model_name = sys.argv[1] 60 | model_factory = models[model_name] 61 | 62 | 63 | def y_hash(y): 64 | return hash(tuple(np.where(y[:200])[0]) + tuple(np.where(y[-200:])[0])) 65 | 66 | 67 | def fit_present_model(events, train_X, train_y, train_event): 68 | print "Training present model..." 
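# Level-2 stage in brief: the stacked features are the binary leak flags plus the logit of every
# level-1 prediction listed in `preds` (see load_x below), and two separate stackers are trained
# because the test displays appear to split into two time regimes: "present" displays whose
# timestamp lies before cv1_split_time, and "future" displays after it (fit_future_model below).
# The present part is validated with a 3-fold GroupKFold keyed on display_id, so all ads of one
# display stay in the same fold, and quality is reported both as logloss and as the
# display-grouped MAP-style score returned by score_sorted.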
69 | 70 | train_is_present = train_event.isin(events[events['timestamp'] < cv1_split_time].index).values 71 | 72 | present_train_X = train_X[train_is_present].values 73 | present_train_y = train_y[train_is_present].values 74 | present_train_g = train_event[train_is_present].values 75 | 76 | folds = list(GroupKFold(3).split(present_train_X, present_train_y, present_train_g)) 77 | ll_scores = [] 78 | map_scores = [] 79 | 80 | for k, (idx_train, idx_test) in enumerate(folds): 81 | fold_train_X = present_train_X[idx_train] 82 | fold_train_y = present_train_y[idx_train] 83 | fold_train_g = present_train_g[idx_train] 84 | 85 | fold_val_X = present_train_X[idx_test] 86 | fold_val_y = present_train_y[idx_test] 87 | fold_val_g = present_train_g[idx_test] 88 | 89 | model = model_factory() 90 | model.fit(fold_train_X, fold_train_y, fold_train_g, fold_val_X, fold_val_y, fold_val_g) 91 | 92 | pred = model.predict(fold_val_X) 93 | 94 | ll_scores.append(log_loss(fold_val_y, pred, eps=1e-7)) 95 | map_scores.append(score_sorted(fold_val_y, pred, fold_val_g)) 96 | 97 | print " Fold %d logloss: %.7f, map score: %.7f" % (k+1, ll_scores[-1], map_scores[-1]) 98 | 99 | print " Present map score: %.7f +- %.7f" % (np.mean(map_scores), np.std(map_scores)) 100 | 101 | return model_factory().fit(present_train_X, present_train_y, fold_train_g), np.mean(map_scores) 102 | 103 | 104 | def fit_future_model(events, train_X, train_y, train_event): 105 | print "Training future model..." 106 | 107 | val2_split_time = 1078667779 108 | 109 | train_is_future_all = train_event.isin(events[events['timestamp'] >= cv1_split_time].index.values) 110 | train_is_future_train = train_event.isin(events[(events['timestamp'] >= cv1_split_time) & (events['timestamp'] < val2_split_time)].index.values) 111 | train_is_future_val = train_event.isin(events[(events['timestamp'] >= val2_split_time) & (events['timestamp'] < test_split_time)].index.values) 112 | 113 | future_train_X = train_X[train_is_future_train].values 114 | future_train_y = train_y[train_is_future_train].values 115 | future_train_g = train_event[train_is_future_train].values 116 | 117 | future_val_X = train_X[train_is_future_val].values 118 | future_val_y = train_y[train_is_future_val].values 119 | future_val_g = train_event[train_is_future_val].values 120 | 121 | model = model_factory() 122 | model.fit(future_train_X, future_train_y, future_train_g, future_val_X, future_val_y, future_val_g) 123 | 124 | pred = model.predict(future_val_X) 125 | 126 | ll_score = log_loss(future_val_y, pred, eps=1e-7) 127 | map_score = score_sorted(future_val_y, pred, future_val_g) 128 | 129 | print " Future logloss: %.7f, map score: %.7f" % (ll_score, map_score) 130 | 131 | future_all_X = train_X[train_is_future_all].values 132 | future_all_y = train_y[train_is_future_all].values 133 | future_all_g = train_event[train_is_future_all].values 134 | 135 | return model_factory().fit(future_all_X, future_all_y, future_all_g), map_score 136 | 137 | 138 | def load_x(ds): 139 | if ds == 'train': 140 | feature_ds = 'cv1_test' 141 | pred_ds = 'cv1' 142 | elif ds == 'test': 143 | feature_ds = 'full_test' 144 | pred_ds = 'test' 145 | else: 146 | raise ValueError() 147 | 148 | X = [] 149 | X.append((pd.read_csv('cache/leak_%s.csv.gz' % feature_ds, dtype=np.uint8) > 0).astype(np.uint8)) 150 | 151 | for pi, p in enumerate(preds): 152 | X.append(logit(pd.read_csv('preds/%s-%s.csv.gz' % (p, pred_ds), dtype=np.float32)[['pred']].rename(columns={'pred': 'p%d' % pi}).clip(lower=1e-7, upper=1-1e-7))) 153 | 154 | return 
pd.concat(X, axis=1) 155 | 156 | 157 | def load_train_data(): 158 | print "Loading train data..." 159 | 160 | d = pd.read_csv(cv1_split[1], dtype=np.uint32, usecols=['display_id', 'clicked']) 161 | 162 | return load_x('train'), d['clicked'], d['display_id'] 163 | 164 | 165 | ## Main part 166 | 167 | 168 | print "Loading events..." 169 | 170 | events = pd.read_csv("../input/events.csv.gz", dtype=np.int32, index_col=0, usecols=[0, 3]) # Load events 171 | 172 | 173 | ## Training models 174 | 175 | train_data = load_train_data() 176 | 177 | present_model, present_score = fit_present_model(events, *train_data) 178 | future_model, future_score = fit_future_model(events, *train_data) 179 | 180 | score = present_score * 0.47671335657020786 + future_score * 0.5232866434297921 181 | 182 | print "Estimated score: %.7f" % score 183 | 184 | del train_data 185 | 186 | 187 | ## Predicting 188 | 189 | print "Predicting on test..." 190 | print " Loading data..." 191 | 192 | test_X = load_x('test').values 193 | 194 | test_p = pd.read_csv(full_split[1], dtype=np.uint32) 195 | test_p['pred'] = np.nan 196 | 197 | test_is_present = test_p['display_id'].isin(events[events['timestamp'] < test_split_time].index).values 198 | test_is_future = test_p['display_id'].isin(events[events['timestamp'] >= test_split_time].index).values 199 | 200 | del events 201 | 202 | print " Predicting..." 203 | 204 | name = gen_prediction_name('l2-%s' % model_name, score) 205 | 206 | test_p.loc[test_is_present, 'pred'] = present_model.predict(test_X[test_is_present]) 207 | test_p.loc[test_is_future, 'pred'] = future_model.predict(test_X[test_is_future]) 208 | 209 | test_p[['pred']].to_csv('preds/%s-test.csv.gz' % name, index=False, compression='gzip') 210 | 211 | del test_X, test_is_future, test_is_present 212 | 213 | print " Generating submission..." 214 | subm = gen_submission(test_p) 215 | subm.to_csv('subm/%s.csv.gz' % name, index=False, compression='gzip') 216 | 217 | print " File name: %s" % name 218 | print "Done." 
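# A note on the two constants in the score estimate above: 0.47671... and 0.52328... sum to 1 and
# look like the shares of test displays that fall before and after test_split_time, i.e. the
# weights of the "present" and "future" models in the blended estimate. If that reading is right,
# a sketch of how they could be recomputed from the events table read earlier in this script
# (before it is deleted) would be:
#
#   test_ids = pd.read_csv(full_split[1], dtype=np.uint32, usecols=['display_id'])['display_id']
#   w_present = events.loc[test_ids, 'timestamp'].lt(test_split_time).mean()
#   score = present_score * w_present + future_score * (1.0 - w_present)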
219 | -------------------------------------------------------------------------------- /ffm-model.cpp: -------------------------------------------------------------------------------- 1 | #include "ffm-model.h" 2 | #include "util/model-helpers.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | constexpr ffm_ulong n_fields = 40; 10 | constexpr ffm_ulong n_features = 1 << ffm_hash_bits; 11 | 12 | constexpr ffm_ulong n_dim = 14; 13 | constexpr ffm_ulong n_dim_aligned = ((n_dim - 1) / align_floats + 1) * align_floats; 14 | 15 | constexpr ffm_ulong index_stride = n_fields * n_dim_aligned * 2; 16 | constexpr ffm_ulong field_stride = n_dim_aligned * 2; 17 | 18 | constexpr uint prefetch_depth = 1; 19 | 20 | 21 | inline void prefetch_interaction_weights(float * addr) { 22 | for (uint i = 0, sz = field_stride * sizeof(float); i < sz; i += 64) 23 | _mm_prefetch(((char *)addr) + i, _MM_HINT_T1); 24 | } 25 | 26 | 27 | template 28 | static void init_ffm_weights(ffm_float * weights, ffm_uint n, D gen, std::default_random_engine & rnd) { 29 | ffm_float * w = weights; 30 | 31 | for(ffm_uint i = 0; i < n; i++) { 32 | for (ffm_uint d = 0; d < n_dim; d++, w++) 33 | *w = gen(rnd); 34 | 35 | for (ffm_uint d = n_dim; d < n_dim_aligned; d++, w++) 36 | *w = 0; 37 | 38 | for (ffm_uint d = n_dim_aligned; d < 2*n_dim_aligned; d++, w++) 39 | *w = 1; 40 | } 41 | } 42 | 43 | 44 | static void init_lin_weights(ffm_float * weights, ffm_uint n) { 45 | ffm_float * w = weights; 46 | 47 | for(ffm_uint i = 0; i < n; i++) { 48 | *w++ = 0; 49 | *w++ = 1; 50 | } 51 | } 52 | 53 | ffm_model::ffm_model(int seed, bool restricted, float eta, float lambda) { 54 | this->eta = eta; 55 | this->lambda = lambda; 56 | 57 | if (restricted) { 58 | max_b_field = 29; 59 | min_a_field = 10; 60 | } else { 61 | max_b_field = n_fields; 62 | min_a_field = 0; 63 | } 64 | 65 | std::default_random_engine rnd(seed); 66 | 67 | bias_w = 0; 68 | bias_wg = 1; 69 | 70 | ffm_weights = malloc_aligned(n_features * n_fields * n_dim_aligned * 2); 71 | lin_weights = malloc_aligned(n_features * 2); 72 | 73 | init_ffm_weights(ffm_weights, n_features * n_fields, std::uniform_real_distribution(0.0, 1.0/sqrt(n_dim)), rnd); 74 | init_lin_weights(lin_weights, n_features); 75 | } 76 | 77 | ffm_model::~ffm_model() { 78 | free(ffm_weights); 79 | free(lin_weights); 80 | } 81 | 82 | 83 | uint ffm_model::get_dropout_mask_size(const ffm_feature * start, const ffm_feature * end) { 84 | uint feature_count = end - start; 85 | uint interaction_count = feature_count * (feature_count + 1) / 2; 86 | 87 | return interaction_count; 88 | } 89 | 90 | 91 | ffm_float ffm_model::predict(const ffm_feature * start, const ffm_feature * end, ffm_float norm, uint64_t * dropout_mask, float dropout_mult) { 92 | ffm_float linear_total = bias_w; 93 | ffm_float linear_norm = end - start; 94 | 95 | __m256 xmm_total = _mm256_set1_ps(0); 96 | 97 | ffm_uint i = 0; 98 | 99 | for (const ffm_feature * fa = start; fa != end; ++ fa) { 100 | ffm_uint index_a = fa->index & ffm_hash_mask; 101 | ffm_uint field_a = fa->index >> ffm_hash_bits; 102 | ffm_float value_a = fa->value; 103 | 104 | linear_total += value_a * lin_weights[index_a*2] / linear_norm; 105 | 106 | if (field_a < min_a_field) 107 | continue; 108 | 109 | for (const ffm_feature * fb = start; fb != fa; ++ fb, ++ i) { 110 | ffm_uint index_b = fb->index & ffm_hash_mask; 111 | ffm_uint field_b = fb->index >> ffm_hash_bits; 112 | ffm_float value_b = fb->value; 113 | 114 | if (field_b > max_b_field) 115 | break; 116 | 117 | if (fb + 
prefetch_depth < fa && test_mask_bit(dropout_mask, i + prefetch_depth)) { // Prefetch row only if no dropout 118 | ffm_uint index_p = fb[prefetch_depth].index & ffm_hash_mask; 119 | ffm_uint field_p = fb[prefetch_depth].index >> ffm_hash_bits; 120 | 121 | prefetch_interaction_weights(ffm_weights + index_p * index_stride + field_a * field_stride); 122 | prefetch_interaction_weights(ffm_weights + index_a * index_stride + field_p * field_stride); 123 | } 124 | 125 | if (test_mask_bit(dropout_mask, i) == 0) 126 | continue; 127 | 128 | //if (field_a == field_b) 129 | // continue; 130 | 131 | ffm_float * wa = ffm_weights + index_a * index_stride + field_b * field_stride; 132 | ffm_float * wb = ffm_weights + index_b * index_stride + field_a * field_stride; 133 | 134 | __m256 xmm_val = _mm256_set1_ps(dropout_mult * value_a * value_b / norm); 135 | 136 | for(ffm_uint d = 0; d < n_dim; d += 8) { 137 | __m256 xmm_wa = _mm256_load_ps(wa + d); 138 | __m256 xmm_wb = _mm256_load_ps(wb + d); 139 | 140 | xmm_total = _mm256_add_ps(xmm_total, _mm256_mul_ps(_mm256_mul_ps(xmm_wa, xmm_wb), xmm_val)); 141 | } 142 | } 143 | } 144 | 145 | return sum(xmm_total) + linear_total; 146 | } 147 | 148 | 149 | void ffm_model::update(const ffm_feature * start, const ffm_feature * end, ffm_float norm, ffm_float kappa, uint64_t * dropout_mask, float dropout_mult) { 150 | ffm_float linear_norm = end - start; 151 | 152 | __m256 xmm_eta = _mm256_set1_ps(eta); 153 | __m256 xmm_lambda = _mm256_set1_ps(lambda); 154 | 155 | ffm_uint i = 0; 156 | 157 | for (const ffm_feature * fa = start; fa != end; ++ fa) { 158 | ffm_uint index_a = fa->index & ffm_hash_mask; 159 | ffm_uint field_a = fa->index >> ffm_hash_bits; 160 | ffm_float value_a = fa->value; 161 | 162 | ffm_float g = lambda * lin_weights[index_a*2] + kappa * value_a / linear_norm; 163 | ffm_float wg = lin_weights[index_a*2 + 1] + g*g; 164 | 165 | lin_weights[index_a*2] -= eta * g / sqrt(wg); 166 | lin_weights[index_a*2 + 1] = wg; 167 | 168 | if (field_a < min_a_field) 169 | continue; 170 | 171 | for (const ffm_feature * fb = start; fb != fa; ++ fb, ++ i) { 172 | ffm_uint index_b = fb->index & ffm_hash_mask; 173 | ffm_uint field_b = fb->index >> ffm_hash_bits; 174 | ffm_float value_b = fb->value; 175 | 176 | if (field_b > max_b_field) 177 | break; 178 | 179 | if (fb + prefetch_depth < fa && test_mask_bit(dropout_mask, i + prefetch_depth)) { // Prefetch row only if no dropout 180 | ffm_uint index_p = fb[prefetch_depth].index & ffm_hash_mask; 181 | ffm_uint field_p = fb[prefetch_depth].index >> ffm_hash_bits; 182 | 183 | prefetch_interaction_weights(ffm_weights + index_p * index_stride + field_a * field_stride); 184 | prefetch_interaction_weights(ffm_weights + index_a * index_stride + field_p * field_stride); 185 | } 186 | 187 | if (test_mask_bit(dropout_mask, i) == 0) 188 | continue; 189 | 190 | //if (field_a == field_b) 191 | // continue; 192 | 193 | ffm_float * wa = ffm_weights + index_a * index_stride + field_b * field_stride; 194 | ffm_float * wb = ffm_weights + index_b * index_stride + field_a * field_stride; 195 | 196 | ffm_float * wga = wa + n_dim_aligned; 197 | ffm_float * wgb = wb + n_dim_aligned; 198 | 199 | __m256 xmm_kappa_val = _mm256_set1_ps(kappa * dropout_mult * value_a * value_b / norm); 200 | 201 | for(ffm_uint d = 0; d < n_dim; d += 8) { 202 | // Load weights 203 | __m256 xmm_wa = _mm256_load_ps(wa + d); 204 | __m256 xmm_wb = _mm256_load_ps(wb + d); 205 | 206 | __m256 xmm_wga = _mm256_load_ps(wga + d); 207 | __m256 xmm_wgb = _mm256_load_ps(wgb + d); 208 | 209 | 
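// The block below is an AdaGrad-style step on the paired interaction vectors: each gradient
// mixes L2 regularization with the shared scalar kappa_val computed above,
//   g_a = lambda * w_a + kappa_val * w_b,   g_b = lambda * w_b + kappa_val * w_a,
// the squared-gradient accumulators wga / wgb grow by g * g, and each weight moves by
//   w -= eta * g / sqrt(wg), with the square root taken via the approximate _mm256_rsqrt_ps.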
// Compute gradient values 210 | __m256 xmm_ga = _mm256_add_ps(_mm256_mul_ps(xmm_lambda, xmm_wa), _mm256_mul_ps(xmm_kappa_val, xmm_wb)); 211 | __m256 xmm_gb = _mm256_add_ps(_mm256_mul_ps(xmm_lambda, xmm_wb), _mm256_mul_ps(xmm_kappa_val, xmm_wa)); 212 | 213 | // Update weights 214 | xmm_wga = _mm256_add_ps(xmm_wga, _mm256_mul_ps(xmm_ga, xmm_ga)); 215 | xmm_wgb = _mm256_add_ps(xmm_wgb, _mm256_mul_ps(xmm_gb, xmm_gb)); 216 | 217 | xmm_wa = _mm256_sub_ps(xmm_wa, _mm256_mul_ps(xmm_eta, _mm256_mul_ps(_mm256_rsqrt_ps(xmm_wga), xmm_ga))); 218 | xmm_wb = _mm256_sub_ps(xmm_wb, _mm256_mul_ps(xmm_eta, _mm256_mul_ps(_mm256_rsqrt_ps(xmm_wgb), xmm_gb))); 219 | 220 | // Store weights 221 | _mm256_store_ps(wa + d, xmm_wa); 222 | _mm256_store_ps(wb + d, xmm_wb); 223 | 224 | _mm256_store_ps(wga + d, xmm_wga); 225 | _mm256_store_ps(wgb + d, xmm_wgb); 226 | } 227 | } 228 | } 229 | 230 | // Update bias 231 | bias_wg += kappa; 232 | bias_w -= eta * kappa / sqrt(bias_wg); 233 | } 234 | -------------------------------------------------------------------------------- /nn-model.cpp: -------------------------------------------------------------------------------- 1 | #include "nn-model.h" 2 | 3 | #include "util/model-helpers.h" 4 | #include "util/nn-helpers.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | 12 | constexpr uint n_features = 1 << ffm_hash_bits; 13 | 14 | constexpr uint l0_output_size = aligned_float_array_size(96); 15 | constexpr uint l1_output_size = aligned_float_array_size(64); 16 | constexpr uint l2_output_size = aligned_float_array_size(48); 17 | 18 | constexpr uint l1_layer_size = l0_output_size * (l1_output_size - 1); 19 | constexpr uint l2_layer_size = l1_output_size * (l2_output_size - 1); 20 | constexpr uint l3_layer_size = l2_output_size; 21 | 22 | 23 | uint rehash(uint feature_index) { 24 | return feature_index & ffm_hash_mask; 25 | } 26 | 27 | 28 | class state_buffer { 29 | public: 30 | float * l0_output; 31 | float * l0_output_grad; 32 | float * l0_dropout_mask; 33 | 34 | float * l1_output; 35 | float * l1_output_grad; 36 | float * l1_dropout_mask; 37 | 38 | float * l2_output; 39 | float * l2_output_grad; 40 | float * l2_dropout_mask; 41 | 42 | std::default_random_engine gen; 43 | public: 44 | state_buffer() { 45 | l0_output = malloc_aligned(l0_output_size); 46 | l0_output_grad = malloc_aligned(l0_output_size); 47 | l0_dropout_mask = malloc_aligned(l0_output_size); 48 | 49 | l1_output = malloc_aligned(l1_output_size); 50 | l1_output_grad = malloc_aligned(l1_output_size); 51 | l1_dropout_mask = malloc_aligned(l1_output_size); 52 | 53 | l2_output = malloc_aligned(l2_output_size); 54 | l2_output_grad = malloc_aligned(l2_output_size); 55 | l2_dropout_mask = malloc_aligned(l2_output_size); 56 | } 57 | 58 | ~state_buffer() { 59 | free(l0_output); 60 | free(l0_output_grad); 61 | free(l0_dropout_mask); 62 | 63 | free(l1_output); 64 | free(l1_output_grad); 65 | free(l1_dropout_mask); 66 | 67 | free(l2_output); 68 | free(l2_output_grad); 69 | free(l2_dropout_mask); 70 | } 71 | }; 72 | 73 | static thread_local state_buffer local_state_buffer; 74 | 75 | 76 | nn_model::nn_model(int seed, float eta, float lambda) { 77 | this->eta = eta; 78 | this->lambda = lambda; 79 | 80 | std::default_random_engine rnd(seed); 81 | 82 | lin_w = malloc_aligned(n_features * l0_output_size); 83 | lin_wg = malloc_aligned(n_features * l0_output_size); 84 | 85 | l1_w = malloc_aligned(l1_layer_size); 86 | l1_wg = malloc_aligned(l1_layer_size); 87 | 88 | l2_w = malloc_aligned(l2_layer_size); 89 | l2_wg = 
malloc_aligned(l2_layer_size); 90 | 91 | l3_w = malloc_aligned(l3_layer_size); 92 | l3_wg = malloc_aligned(l3_layer_size); 93 | 94 | fill_with_rand(lin_w, n_features * l0_output_size, std::uniform_real_distribution(-0.1, 0.1), rnd); 95 | fill_with_ones(lin_wg, n_features * l0_output_size); 96 | 97 | fill_with_rand(l1_w, l1_layer_size, std::normal_distribution(0, 2/sqrt(l0_output_size)), rnd); 98 | fill_with_ones(l1_wg, l1_layer_size); 99 | 100 | fill_with_rand(l2_w, l2_layer_size, std::normal_distribution(0, 2/sqrt(l1_output_size)), rnd); 101 | fill_with_ones(l2_wg, l2_layer_size); 102 | 103 | fill_with_rand(l3_w, l3_layer_size, std::normal_distribution(0, 2/sqrt(l2_output_size)), rnd); 104 | fill_with_ones(l3_wg, l3_layer_size); 105 | } 106 | 107 | 108 | nn_model::~nn_model() { 109 | free(lin_w); 110 | free(lin_wg); 111 | 112 | free(l1_w); 113 | free(l1_wg); 114 | 115 | free(l2_w); 116 | free(l2_wg); free(l3_w); free(l3_wg); 117 | } 118 | 119 | 120 | uint nn_model::get_dropout_mask_size(const ffm_feature * start, const ffm_feature * end) { 121 | return 0; 122 | } 123 | 124 | 125 | float nn_model::predict(const ffm_feature * start, const ffm_feature * end, float norm, uint64_t * _dropout_mask, float dropout_mult) { 126 | float linear_norm = end - start; 127 | state_buffer & buf = local_state_buffer; 128 | 129 | float * l0_output = buf.l0_output; 130 | float * l0_dropout_mask = buf.l0_dropout_mask; 131 | 132 | float * l1_output = buf.l1_output; 133 | float * l1_dropout_mask = buf.l1_dropout_mask; 134 | 135 | float * l2_output = buf.l2_output; 136 | float * l2_dropout_mask = buf.l2_dropout_mask; 137 | 138 | auto & gen = buf.gen; 139 | 140 | std::uniform_real_distribution dropout_distr(0, 1); 141 | 142 | if (dropout_mult > 1) { // Apply dropout only in train 143 | float l0_dropout_prob = 0;//0.02; 144 | float l1_dropout_prob = 0;//0.02; 145 | float l2_dropout_prob = 0;//0.02; 146 | 147 | float l0_dropout_scale = 1 / (1 - l0_dropout_prob); 148 | float l1_dropout_scale = 1 / (1 - l1_dropout_prob); 149 | float l2_dropout_scale = 1 / (1 - l2_dropout_prob); 150 | 151 | // Prepare dropout masks 152 | l0_dropout_mask[0] = 1.0; // No dropout on bias 153 | for (uint j = 1; j < l0_output_size; ++ j) 154 | l0_dropout_mask[j] = (dropout_distr(gen) >= l0_dropout_prob) * l0_dropout_scale; 155 | 156 | l1_dropout_mask[0] = 1.0; // No dropout on bias 157 | for (uint j = 1; j < l1_output_size; ++ j) 158 | l1_dropout_mask[j] = (dropout_distr(gen) >= l1_dropout_prob) * l1_dropout_scale; 159 | 160 | l2_dropout_mask[0] = 1.0; // No dropout on bias 161 | for (uint j = 1; j < l2_output_size; ++ j) 162 | l2_dropout_mask[j] = (dropout_distr(gen) >= l2_dropout_prob) * l2_dropout_scale; 163 | } else { 164 | fill_with_ones(l0_dropout_mask, l0_output_size); 165 | fill_with_ones(l1_dropout_mask, l1_output_size); 166 | fill_with_ones(l2_dropout_mask, l2_output_size); 167 | } 168 | 169 | // Compute activations 170 | 171 | fill_with_zero(l0_output, l0_output_size); 172 | fill_with_zero(l1_output, l1_output_size); 173 | fill_with_zero(l2_output, l2_output_size); 174 | 175 | for (const ffm_feature * fa = start; fa != end; ++ fa) { 176 | uint index = rehash(fa->index); 177 | float value = fa->value; 178 | 179 | float * wl = lin_w + index * l0_output_size; 180 | 181 | __m256 ymm_val = _mm256_set1_ps(value / linear_norm); 182 | for(ffm_uint d = 0; d < l0_output_size; d += 8) { 183 | _mm256_store_ps(l0_output + d, _mm256_load_ps(l0_output + d) + _mm256_load_ps(wl + d) * ymm_val); 184 | } 185 | } 186 | 187 | l0_output[0] = 1.0; // Layer 0 bias, here we
rewritre some computation results, but who cares 188 | l1_output[0] = 1.0; // Layer 1 bias 189 | l2_output[0] = 1.0; // Layer 2 bias 190 | 191 | // Layer 0 relu 192 | for (uint j = 1; j < l0_output_size; ++ j) 193 | l0_output[j] = relu(l0_output[j]) * l0_dropout_mask[j]; 194 | 195 | // Layer 1 forward pass 196 | for (uint j = 1; j < l1_output_size; ++ j) 197 | l1_output[j] = relu(forward_pass(l0_output_size, l0_output, l1_w + (j - 1) * l0_output_size)) * l1_dropout_mask[j]; 198 | 199 | // Layer 2 forward pass 200 | for (uint j = 1; j < l2_output_size; ++ j) 201 | l2_output[j] = relu(forward_pass(l1_output_size, l1_output, l2_w + (j - 1) * l1_output_size)) * l2_dropout_mask[j]; 202 | 203 | // Layer 3 forward pass 204 | return forward_pass(l2_output_size, l2_output, l3_w); 205 | } 206 | 207 | 208 | void nn_model::update(const ffm_feature * start, const ffm_feature * end, float norm, float kappa, uint64_t * _dropout_mask, float _dropout_mult) { 209 | float linear_norm = end - start; 210 | state_buffer & buf = local_state_buffer; 211 | 212 | float * l0_output = buf.l0_output; 213 | float * l0_output_grad = buf.l0_output_grad; 214 | float * l0_dropout_mask = buf.l0_dropout_mask; 215 | 216 | float * l1_output = buf.l1_output; 217 | float * l1_output_grad = buf.l1_output_grad; 218 | float * l1_dropout_mask = buf.l1_dropout_mask; 219 | 220 | float * l2_output = buf.l2_output; 221 | float * l2_output_grad = buf.l2_output_grad; 222 | float * l2_dropout_mask = buf.l2_dropout_mask; 223 | 224 | fill_with_zero(l0_output_grad, l0_output_size); 225 | fill_with_zero(l1_output_grad, l1_output_size); 226 | fill_with_zero(l2_output_grad, l2_output_size); 227 | 228 | backward_pass(l2_output_size, l2_output, l2_output_grad, l3_w, l3_wg, kappa, eta, lambda); 229 | 230 | // Backprop layer 2 231 | for (uint j = 1, ofs = 0; j < l2_output_size; ++ j, ofs += l1_output_size) { 232 | float l2_grad = l2_output_grad[j] * l2_dropout_mask[j]; 233 | 234 | if (l2_output[j] <= 0) // Relu activation: grad in negative part is zero 235 | l2_grad = 0; 236 | 237 | backward_pass(l1_output_size, l1_output, l1_output_grad, l2_w + ofs, l2_wg + ofs, l2_grad, eta, lambda); 238 | } 239 | 240 | // Backprop layer 1 241 | for (uint j = 1, ofs = 0; j < l1_output_size; ++ j, ofs += l0_output_size) { 242 | float l1_grad = l1_output_grad[j] * l1_dropout_mask[j]; 243 | 244 | if (l1_output[j] <= 0) // Relu activation: grad in negative part is zero 245 | l1_grad = 0; 246 | 247 | backward_pass(l0_output_size, l0_output, l0_output_grad, l1_w + ofs, l1_wg + ofs, l1_grad, eta, lambda); 248 | } 249 | 250 | // Backprop layer 0 251 | l0_output_grad[0] = 0; 252 | for (uint j = 1; j < l0_output_size; ++ j) { 253 | float l0_grad = l0_output_grad[j] * l0_dropout_mask[j]; 254 | 255 | if (l0_output[j] <= 0) // Relu activation: grad in negative part is zero 256 | l0_grad = 0; 257 | 258 | l0_output_grad[j] = l0_grad; 259 | } 260 | 261 | // Update linear and interaction weights 262 | __m256 ymm_eta = _mm256_set1_ps(eta); 263 | __m256 ymm_lambda = _mm256_set1_ps(lambda); 264 | 265 | for (const ffm_feature * fa = start; fa != end; ++ fa) { 266 | uint index = rehash(fa->index); 267 | float value = fa->value; 268 | 269 | float * wl = lin_w + index * l0_output_size; 270 | float * wgl = lin_wg + index * l0_output_size; 271 | 272 | __m256 ymm_val = _mm256_set1_ps(value / linear_norm); 273 | 274 | for (uint d = 0; d < l0_output_size; d += 8) { 275 | __m256 ymm_kappa_val = _mm256_load_ps(l0_output_grad + d) * ymm_val; 276 | 277 | // Load weights 278 | __m256 ymm_wl = 
_mm256_load_ps(wl + d); 279 | __m256 ymm_wgl = _mm256_load_ps(wgl + d); 280 | 281 | // Compute gradient values 282 | __m256 ymm_g = ymm_lambda * ymm_wl + ymm_kappa_val; 283 | 284 | // Update weights 285 | ymm_wgl = ymm_wgl + ymm_g * ymm_g; 286 | ymm_wl = ymm_wl - ymm_eta * ymm_g * _mm256_rsqrt_ps(ymm_wgl); 287 | 288 | // Store weights 289 | _mm256_store_ps(wl + d, ymm_wl); 290 | _mm256_store_ps(wgl + d, ymm_wgl); 291 | } 292 | } 293 | } 294 | -------------------------------------------------------------------------------- /prepare-group-viewed-docs.cpp: -------------------------------------------------------------------------------- 1 | #include "util/io.h" 2 | #include "util/data.h" 3 | 4 | 5 | std::vector> filesets { 6 | { "cache/clicks_cv1_train.csv.gz", "cv1_train" }, 7 | { "cache/clicks_cv1_test.csv.gz", "cv1_test" }, 8 | { "cache/clicks_cv2_train.csv.gz", "cv2_train" }, 9 | { "cache/clicks_cv2_test.csv.gz", "cv2_test" }, 10 | { "../input/clicks_train.csv.gz", "full_train" }, 11 | { "../input/clicks_test.csv.gz", "full_test" }, 12 | }; 13 | 14 | 15 | std::unordered_map group_id_map; 16 | 17 | 18 | std::string extract_group(const std::string & platform, const std::string & geo) { 19 | return platform + geo; 20 | } 21 | 22 | 23 | std::pair read_event_group(const std::vector & row) { 24 | auto id = stoi(row[0]); 25 | 26 | auto group = extract_group(row[4], row[5]); // platform + geo 27 | int group_id = group_id_map.size(); 28 | 29 | auto it = group_id_map.find(group); 30 | if (it == group_id_map.end()) { 31 | group_id_map.insert(std::make_pair(group, group_id)); 32 | } else { 33 | group_id = it->second; 34 | } 35 | 36 | return std::make_pair(id, group_id); 37 | } 38 | 39 | std::pair read_ad_document(const std::vector & row) { 40 | return std::make_pair(stoi(row[0]), stoi(row[1])); 41 | } 42 | 43 | 44 | std::vector event_groups; 45 | std::vector ad_doc_ids; 46 | std::unordered_map documents; 47 | std::unordered_multimap> document_categories; 48 | std::unordered_multimap> document_topics; 49 | 50 | std::streamsize buffer_size = 1024*1024; 51 | 52 | 53 | class doc_source_writer { 54 | std::unordered_map, int> publisher_views_map; 55 | std::unordered_map, int> source_views_map; 56 | public: 57 | std::string get_header() { 58 | return "publisher_view_count,source_view_count"; 59 | } 60 | 61 | void prepare(int group_id, int document_id) { 62 | using namespace std; 63 | 64 | auto document = documents.at(document_id); 65 | 66 | if (document.publisher_id > 0) 67 | publisher_views_map[make_pair(document.publisher_id, group_id)] = 0; 68 | 69 | if (document.source_id > 0) 70 | source_views_map[make_pair(document.source_id, group_id)] = 0; 71 | } 72 | 73 | void update(int group_id, int document_id) { 74 | using namespace std; 75 | 76 | auto document = documents.at(document_id); 77 | 78 | auto pv_it = publisher_views_map.find(make_pair(document.publisher_id, group_id)); 79 | if (pv_it != publisher_views_map.end()) 80 | pv_it->second ++; 81 | 82 | auto sv_it = source_views_map.find(make_pair(document.source_id, group_id)); 83 | if (sv_it != source_views_map.end()) 84 | sv_it->second ++; 85 | } 86 | 87 | void write(std::ostream & out, int group_id, int document_id) { 88 | using namespace std; 89 | 90 | auto document = documents.at(document_id); 91 | 92 | auto publisher_view_times = document.publisher_id > 0 ? publisher_views_map[make_pair(document.publisher_id, group_id)] : -1; 93 | auto source_view_times = document.source_id > 0 ? 
source_views_map[make_pair(document.source_id, group_id)] : -1; 94 | 95 | out << publisher_view_times << "," 96 | << source_view_times << endl; 97 | } 98 | }; 99 | 100 | 101 | class doc_category_writer { 102 | std::unordered_map, float> category_views_map; 103 | public: 104 | std::string get_header() { 105 | return "category_view_weight"; 106 | } 107 | 108 | void prepare(int group_id, int document_id) { 109 | auto doc_categories = document_categories.equal_range(document_id); 110 | 111 | for (auto it = doc_categories.first; it != doc_categories.second; ++ it) 112 | category_views_map[std::make_pair(it->second.first, group_id)] = 0; 113 | } 114 | 115 | void update(int group_id, int document_id) { 116 | auto doc_categories = document_categories.equal_range(document_id); 117 | 118 | for (auto it = doc_categories.first; it != doc_categories.second; ++ it) { 119 | auto cv_it = category_views_map.find(std::make_pair(it->second.first, group_id)); 120 | if (cv_it != category_views_map.end()) 121 | cv_it->second += it->second.second; 122 | } 123 | } 124 | 125 | void write(std::ostream & out, int group_id, int document_id) { 126 | auto doc_categories = document_categories.equal_range(document_id); 127 | 128 | float category_view_weight = 0; 129 | 130 | for (auto it = doc_categories.first; it != doc_categories.second; ++ it) 131 | category_view_weight += category_views_map[std::make_pair(it->second.first, group_id)]; 132 | 133 | out << category_view_weight << std::endl; 134 | } 135 | }; 136 | 137 | 138 | class doc_topic_writer { 139 | std::unordered_map, float> topic_views_map; 140 | public: 141 | std::string get_header() { 142 | return "topic_view_weight"; 143 | } 144 | 145 | void prepare(int group_id, int document_id) { 146 | auto doc_topics = document_topics.equal_range(document_id); 147 | 148 | for (auto it = doc_topics.first; it != doc_topics.second; ++ it) 149 | topic_views_map[std::make_pair(it->second.first, group_id)] = 0; 150 | } 151 | 152 | void update(int group_id, int document_id) { 153 | auto doc_topics = document_topics.equal_range(document_id); 154 | 155 | for (auto it = doc_topics.first; it != doc_topics.second; ++ it) { 156 | auto cv_it = topic_views_map.find(std::make_pair(it->second.first, group_id)); 157 | if (cv_it != topic_views_map.end()) 158 | cv_it->second += it->second.second; 159 | } 160 | } 161 | 162 | void write(std::ostream & out, int group_id, int document_id) { 163 | auto doc_topics = document_topics.equal_range(document_id); 164 | 165 | float topic_view_weight = 0; 166 | 167 | for (auto it = doc_topics.first; it != doc_topics.second; ++ it) 168 | topic_view_weight += topic_views_map[std::make_pair(it->second.first, group_id)]; 169 | 170 | out << topic_view_weight << std::endl; 171 | } 172 | }; 173 | 174 | 175 | template 176 | void generate(const std::string & file_name_prefix) { 177 | using namespace std; 178 | 179 | cout << "Generating " << file_name_prefix << "..." << endl; 180 | 181 | W w; 182 | 183 | cout << " Loading click data..." << endl; 184 | for (auto it = filesets.begin(); it != filesets.end(); ++ it) { 185 | cout << " Loading " << it->first << "... 
"; 186 | cout.flush(); 187 | 188 | clock_t begin = clock(); 189 | 190 | compressed_csv_file file(it->first); 191 | 192 | for (int i = 0;; ++i) { 193 | auto row = file.getrow(); 194 | 195 | if (row.empty()) 196 | break; 197 | 198 | auto group_id = event_groups.at(stoi(row[0])); 199 | auto document_id = ad_doc_ids.at(stoi(row[1])); 200 | 201 | w.prepare(group_id, document_id); 202 | 203 | if (i > 0 && i % 5000000 == 0) { 204 | cout << (i / 1000000) << "M... "; 205 | cout.flush(); 206 | } 207 | } 208 | 209 | clock_t end = clock(); 210 | double elapsed = double(end - begin) / CLOCKS_PER_SEC; 211 | 212 | cout << "done in " << elapsed << " seconds" << endl; 213 | } 214 | 215 | { 216 | cout << " Processing page views data... "; 217 | cout.flush(); 218 | 219 | clock_t begin = clock(); 220 | 221 | compressed_csv_file file("../input/page_views.csv.gz"); 222 | int found = 0; 223 | 224 | for (int i = 0;; ++i) { 225 | auto row = file.getrow(); 226 | 227 | if (row.empty()) 228 | break; 229 | 230 | auto document_id = stoi(row[1]); 231 | auto group = extract_group(row[3], row[4]); // platform + geo 232 | 233 | // Register view 234 | auto group_it = group_id_map.find(group); 235 | if (group_it != group_id_map.end()) { 236 | w.update(group_it->second, document_id); 237 | } 238 | 239 | if (i > 0 && i % 5000000 == 0) { 240 | cout << (i / 1000000) << "M... "; 241 | cout.flush(); 242 | } 243 | } 244 | 245 | clock_t end = clock(); 246 | double elapsed = double(end - begin) / CLOCKS_PER_SEC; 247 | 248 | cout << "done in " << elapsed << " seconds, found " << found << " entries" << endl; 249 | } 250 | 251 | cout << " Generating viewed docs features..." << endl; 252 | for (auto it = filesets.begin(); it != filesets.end(); ++ it) { 253 | auto out_file_name = string("cache/") + file_name_prefix + string("_") + it->second + string(".csv.gz"); 254 | 255 | cout << " Generating " << out_file_name << "... "; 256 | cout.flush(); 257 | 258 | clock_t begin = clock(); 259 | 260 | compressed_csv_file file(it->first); 261 | 262 | boost::iostreams::filtering_ostream out; 263 | out.push(boost::iostreams::gzip_compressor(), buffer_size, buffer_size); 264 | out.push(boost::iostreams::file_sink(out_file_name, std::ios_base::out | std::ios_base::binary), buffer_size, buffer_size); 265 | 266 | out << w.get_header() << endl; 267 | 268 | for (int i = 0;; ++i) { 269 | auto row = file.getrow(); 270 | 271 | if (row.empty()) 272 | break; 273 | 274 | auto group_id = event_groups.at(stoi(row[0])); 275 | auto document_id = ad_doc_ids.at(stoi(row[1])); 276 | 277 | w.write(out, group_id, document_id); 278 | 279 | if (i > 0 && i % 5000000 == 0) { 280 | cout << (i / 1000000) << "M... "; 281 | cout.flush(); 282 | } 283 | } 284 | 285 | clock_t end = clock(); 286 | double elapsed = double(end - begin) / CLOCKS_PER_SEC; 287 | 288 | cout << "done in " << elapsed << " seconds" << endl; 289 | } 290 | } 291 | 292 | 293 | int main() { 294 | using namespace std; 295 | 296 | cout << "Loading reference data..." 
<< endl; 297 | event_groups = read_vector("cache/events.csv.gz", read_event_group, 23120127); 298 | ad_doc_ids = read_vector("../input/promoted_content.csv.gz", read_ad_document, 573099); 299 | documents = read_map("cache/documents.csv.gz", read_document); 300 | document_categories = read_multi_map("../input/documents_categories.csv.gz", read_document_annotation); 301 | document_topics = read_multi_map("../input/documents_topics.csv.gz", read_document_annotation); 302 | 303 | generate("g1_viewed_docs"); 304 | generate("g1_viewed_categories"); 305 | generate("g1_viewed_topics"); 306 | 307 | cout << "Done." << endl; 308 | } 309 | -------------------------------------------------------------------------------- /prepare-viewed-docs.cpp: -------------------------------------------------------------------------------- 1 | #include "util/io.h" 2 | #include "util/data.h" 3 | 4 | 5 | std::vector> filesets { 6 | { "cache/clicks_cv1_train.csv.gz", "cv1_train" }, 7 | { "cache/clicks_cv1_test.csv.gz", "cv1_test" }, 8 | { "cache/clicks_cv2_train.csv.gz", "cv2_train" }, 9 | { "cache/clicks_cv2_test.csv.gz", "cv2_test" }, 10 | { "../input/clicks_train.csv.gz", "full_train" }, 11 | { "../input/clicks_test.csv.gz", "full_test" }, 12 | }; 13 | 14 | std::vector max_timestamps = { 15 | 1123200000, 16 | 1123200000, 17 | std::numeric_limits::max() 18 | }; 19 | 20 | struct event_info { 21 | int uid; 22 | int timestamp; 23 | }; 24 | 25 | 26 | std::unordered_map uuid_map; 27 | 28 | 29 | std::pair read_event_info(const std::vector & row) { 30 | auto id = stoi(row[0]); 31 | auto uuid = row[1]; 32 | 33 | event_info res; 34 | res.timestamp = stoi(row[3]); 35 | 36 | auto it = uuid_map.find(uuid); 37 | 38 | if (it == uuid_map.end()) { 39 | res.uid = uuid_map.size(); 40 | uuid_map.insert(std::make_pair(uuid, res.uid)); 41 | } else { 42 | res.uid = it->second; 43 | } 44 | 45 | return std::make_pair(id, res); 46 | } 47 | 48 | std::pair read_ad_document(const std::vector & row) { 49 | return std::make_pair(stoi(row[0]), stoi(row[1])); 50 | } 51 | 52 | 53 | std::vector events; 54 | std::vector ad_doc_ids; 55 | std::unordered_map documents; 56 | std::unordered_multimap> document_categories; 57 | std::unordered_multimap> document_topics; 58 | 59 | std::streamsize buffer_size = 1024*1024; 60 | 61 | 62 | class doc_source_writer { 63 | std::unordered_map, int> publisher_views_map; 64 | std::unordered_map, int> source_views_map; 65 | public: 66 | std::string get_header() { 67 | return "publisher_view_count,source_view_count"; 68 | } 69 | 70 | void prepare(int uid, int document_id, int timestamp) { 71 | using namespace std; 72 | 73 | auto document = documents.at(document_id); 74 | 75 | if (document.publisher_id > 0) 76 | publisher_views_map[make_pair(document.publisher_id, uid)] = 0; 77 | 78 | if (document.source_id > 0) 79 | source_views_map[make_pair(document.source_id, uid)] = 0; 80 | } 81 | 82 | void update(int uid, int document_id, int timestamp) { 83 | using namespace std; 84 | 85 | auto document = documents.at(document_id); 86 | 87 | auto pv_it = publisher_views_map.find(make_pair(document.publisher_id, uid)); 88 | if (pv_it != publisher_views_map.end()) 89 | pv_it->second ++; 90 | 91 | auto sv_it = source_views_map.find(make_pair(document.source_id, uid)); 92 | if (sv_it != source_views_map.end()) 93 | sv_it->second ++; 94 | } 95 | 96 | void write(std::ostream & out, int uid, int document_id, int timestamp) { 97 | using namespace std; 98 | 99 | auto document = documents.at(document_id); 100 | 101 | auto publisher_view_times 
= document.publisher_id > 0 ? publisher_views_map[make_pair(document.publisher_id, uid)] : -1; 102 | auto source_view_times = document.source_id > 0 ? source_views_map[make_pair(document.source_id, uid)] : -1; 103 | 104 | out << publisher_view_times << "," 105 | << source_view_times << endl; 106 | } 107 | }; 108 | 109 | 110 | class doc_category_writer { 111 | std::unordered_map, float> category_views_map; 112 | public: 113 | std::string get_header() { 114 | return "category_view_weight"; 115 | } 116 | 117 | void prepare(int uid, int document_id, int timestamp) { 118 | auto doc_categories = document_categories.equal_range(document_id); 119 | 120 | for (auto it = doc_categories.first; it != doc_categories.second; ++ it) 121 | category_views_map[std::make_pair(it->second.first, uid)] = 0; 122 | } 123 | 124 | void update(int uid, int document_id, int timestamp) { 125 | auto doc_categories = document_categories.equal_range(document_id); 126 | 127 | for (auto it = doc_categories.first; it != doc_categories.second; ++ it) { 128 | auto cv_it = category_views_map.find(std::make_pair(it->second.first, uid)); 129 | if (cv_it != category_views_map.end()) 130 | cv_it->second += it->second.second; 131 | } 132 | } 133 | 134 | void write(std::ostream & out, int uid, int document_id, int timestamp) { 135 | auto doc_categories = document_categories.equal_range(document_id); 136 | 137 | float category_view_weight = 0; 138 | 139 | for (auto it = doc_categories.first; it != doc_categories.second; ++ it) 140 | category_view_weight += category_views_map[std::make_pair(it->second.first, uid)]; 141 | 142 | out << category_view_weight << std::endl; 143 | } 144 | }; 145 | 146 | 147 | class doc_topic_writer { 148 | std::unordered_map, float> topic_views_map; 149 | public: 150 | std::string get_header() { 151 | return "topic_view_weight"; 152 | } 153 | 154 | void prepare(int uid, int document_id, int timestamp) { 155 | auto doc_topics = document_topics.equal_range(document_id); 156 | 157 | for (auto it = doc_topics.first; it != doc_topics.second; ++ it) 158 | topic_views_map[std::make_pair(it->second.first, uid)] = 0; 159 | } 160 | 161 | void update(int uid, int document_id, int timestamp) { 162 | auto doc_topics = document_topics.equal_range(document_id); 163 | 164 | for (auto it = doc_topics.first; it != doc_topics.second; ++ it) { 165 | auto cv_it = topic_views_map.find(std::make_pair(it->second.first, uid)); 166 | if (cv_it != topic_views_map.end()) 167 | cv_it->second += it->second.second; 168 | } 169 | } 170 | 171 | void write(std::ostream & out, int uid, int document_id, int timestamp) { 172 | auto doc_topics = document_topics.equal_range(document_id); 173 | 174 | float topic_view_weight = 0; 175 | 176 | for (auto it = doc_topics.first; it != doc_topics.second; ++ it) 177 | topic_view_weight += topic_views_map[std::make_pair(it->second.first, uid)]; 178 | 179 | out << topic_view_weight << std::endl; 180 | } 181 | }; 182 | 183 | 184 | template 185 | void generate(const std::string & file_name_prefix, uint ofs) { 186 | using namespace std; 187 | 188 | cout << "Generating " << file_name_prefix << "..." << endl; 189 | 190 | W w; 191 | 192 | cout << " Loading click data..." << endl; 193 | for (uint fi = ofs; fi < ofs + 2; ++ fi) { 194 | auto in_file_name = filesets[fi].first; 195 | 196 | cout << " Loading " << in_file_name << "... 
"; 197 | cout.flush(); 198 | 199 | time_t begin = time(nullptr); 200 | 201 | compressed_csv_file file(in_file_name); 202 | 203 | for (int i = 0;; ++i) { 204 | auto row = file.getrow(); 205 | 206 | if (row.empty()) 207 | break; 208 | 209 | auto ev = events.at(stoi(row[0])); 210 | auto document_id = ad_doc_ids.at(stoi(row[1])); 211 | 212 | w.prepare(ev.uid, document_id, ev.timestamp); 213 | 214 | if (i > 0 && i % 5000000 == 0) { 215 | cout << (i / 1000000) << "M... "; 216 | cout.flush(); 217 | } 218 | } 219 | 220 | cout << "done in " << (time(nullptr) - begin) << " seconds" << endl; 221 | } 222 | 223 | { 224 | cout << " Processing page views data... "; 225 | cout.flush(); 226 | 227 | time_t begin = time(nullptr); 228 | 229 | compressed_csv_file file("../input/page_views.csv.gz"); 230 | int found = 0; 231 | int max_timestamp = max_timestamps[ofs / 2]; 232 | 233 | for (int i = 0;; ++i) { 234 | auto row = file.getrow(); 235 | 236 | if (row.empty()) 237 | break; 238 | 239 | auto uuid = row[0]; 240 | auto document_id = stoi(row[1]); 241 | auto timestamp = stoi(row[2]); 242 | 243 | if (timestamp <= max_timestamp) { 244 | // Register view 245 | auto uid_it = uuid_map.find(uuid); 246 | if (uid_it != uuid_map.end()) { 247 | w.update(uid_it->second, document_id, timestamp); 248 | } 249 | } 250 | 251 | if (i > 0 && i % 5000000 == 0) { 252 | cout << (i / 1000000) << "M... "; 253 | cout.flush(); 254 | } 255 | } 256 | 257 | cout << "done in " << (time(nullptr) - begin) << " seconds, found " << found << " entries" << endl; 258 | } 259 | 260 | cout << " Generating viewed docs features..." << endl; 261 | for (uint fi = ofs; fi < ofs + 2; ++ fi) { 262 | auto out_file_name = string("cache/") + file_name_prefix + string("_") + filesets[fi].second + string(".csv.gz"); 263 | 264 | cout << " Generating " << out_file_name << "... "; 265 | cout.flush(); 266 | 267 | time_t begin = time(nullptr); 268 | 269 | compressed_csv_file file(filesets[fi].first); 270 | 271 | boost::iostreams::filtering_ostream out; 272 | out.push(boost::iostreams::gzip_compressor(), buffer_size, buffer_size); 273 | out.push(boost::iostreams::file_sink(out_file_name, std::ios_base::out | std::ios_base::binary), buffer_size, buffer_size); 274 | 275 | out << w.get_header() << endl; 276 | 277 | for (int i = 0;; ++i) { 278 | auto row = file.getrow(); 279 | 280 | if (row.empty()) 281 | break; 282 | 283 | auto ev = events.at(stoi(row[0])); 284 | auto document_id = ad_doc_ids.at(stoi(row[1])); 285 | 286 | w.write(out, ev.uid, document_id, ev.timestamp); 287 | 288 | if (i > 0 && i % 5000000 == 0) { 289 | cout << (i / 1000000) << "M... "; 290 | cout.flush(); 291 | } 292 | } 293 | 294 | cout << "done in " << (time(nullptr) - begin) << " seconds" << endl; 295 | } 296 | } 297 | 298 | 299 | int main() { 300 | using namespace std; 301 | 302 | cout << "Loading reference data..." << endl; 303 | events = read_vector("cache/events.csv.gz", read_event_info, 23120127); 304 | ad_doc_ids = read_vector("../input/promoted_content.csv.gz", read_ad_document, 573099); 305 | documents = read_map("cache/documents.csv.gz", read_document); 306 | document_categories = read_multi_map("../input/documents_categories.csv.gz", read_document_annotation); 307 | document_topics = read_multi_map("../input/documents_topics.csv.gz", read_document_annotation); 308 | 309 | for (uint ofs = 0; ofs < filesets.size(); ofs += 2) { 310 | generate("viewed_docs", ofs); 311 | generate("viewed_categories", ofs); 312 | generate("viewed_topics", ofs); 313 | } 314 | 315 | cout << "Done." 
<< endl; 316 | } 317 | -------------------------------------------------------------------------------- /export-bin-data-p1.cpp: -------------------------------------------------------------------------------- 1 | #include "util/io.h" 2 | #include "util/data.h" 3 | #include "util/generation.h" 4 | #include "util/helpers.h" 5 | 6 | #include "ffm.h" 7 | 8 | std::vector> files = { 9 | { "cache/clicks_cv2_train.csv.gz", "cv2_train" }, 10 | { "cache/clicks_cv2_test.csv.gz", "cv2_test" }, 11 | { "cache/clicks_cv1_train.csv.gz", "cv1_train" }, 12 | { "cache/clicks_cv1_test.csv.gz", "cv1_test" }, 13 | { "../input/clicks_train.csv.gz", "full_train" }, 14 | { "../input/clicks_test.csv.gz", "full_test" }, 15 | }; 16 | 17 | std::vector features = { 18 | "leak", 19 | "viewed_docs", "viewed_categories", "viewed_topics", 20 | "uid_viewed_ads", "uid_viewed_ad_srcs", "uid_viewed_ad_cats", "uid_viewed_ad_tops", 21 | "rivals" 22 | }; 23 | 24 | std::string cur_dataset; 25 | 26 | std::unordered_map ad_counts; 27 | std::unordered_map ad_campaign_counts; 28 | std::unordered_map ad_advertiser_counts; 29 | std::unordered_map ad_doc_counts; 30 | std::unordered_map ad_doc_source_counts; 31 | std::unordered_map ad_doc_publisher_counts; 32 | std::unordered_map ev_doc_counts; 33 | std::unordered_map ev_doc_source_counts; 34 | std::unordered_map ev_doc_publisher_counts; 35 | std::unordered_map uid_counts; 36 | 37 | 38 | void load_dataset_data(const std::string & dataset) { 39 | if (cur_dataset == dataset) 40 | return; 41 | 42 | cur_dataset = dataset; 43 | std::cout << "Loading " << dataset << " data..." << std::endl; 44 | 45 | ad_counts = read_map(std::string("cache/ad_counts_") + dataset + std::string(".csv.gz"), read_count); 46 | ad_campaign_counts = read_map(std::string("cache/ad_campaign_counts_") + dataset + std::string(".csv.gz"), read_count); 47 | ad_advertiser_counts = read_map(std::string("cache/ad_advertiser_counts_") + dataset + std::string(".csv.gz"), read_count); 48 | 49 | ad_doc_counts = read_map(std::string("cache/ad_doc_counts_") + dataset + std::string(".csv.gz"), read_count); 50 | ad_doc_source_counts = read_map(std::string("cache/ad_doc_source_counts_") + dataset + std::string(".csv.gz"), read_count); 51 | ad_doc_publisher_counts = read_map(std::string("cache/ad_doc_publisher_counts_") + dataset + std::string(".csv.gz"), read_count); 52 | 53 | ev_doc_counts = read_map(std::string("cache/ev_doc_counts_") + dataset + std::string(".csv.gz"), read_count); 54 | ev_doc_source_counts = read_map(std::string("cache/ev_doc_source_counts_") + dataset + std::string(".csv.gz"), read_count); 55 | ev_doc_publisher_counts = read_map(std::string("cache/ev_doc_publisher_counts_") + dataset + std::string(".csv.gz"), read_count); 56 | uid_counts = read_map(std::string("cache/uid_counts_") + dataset + std::string(".csv.gz"), read_count); 57 | } 58 | 59 | 60 | class writer { 61 | std::string file_name; 62 | 63 | ffm_stream_data_writer data_out; 64 | ffm_index index; 65 | public: 66 | writer(const std::string & file_name): file_name(file_name), data_out(file_name + ".data") { 67 | index.size = 0; 68 | index.offsets.push_back(0); 69 | 70 | load_dataset_data(file_name.substr(6, file_name.find("_") - 6)); 71 | } 72 | 73 | void write(const reference_data & data, const std::vector> & rows); 74 | void finish(); 75 | }; 76 | 77 | 78 | void writer::write(const reference_data & data, const std::vector> & rows) { 79 | int event_id = stoi(rows[0][0]); 80 | int ad_id = stoi(rows[0][1]); 81 | 82 | // 83 | 84 | auto ad = data.ads[ad_id]; 
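// The rest of writer::write turns this (event_id, ad_id) click row into a hashed FFM example:
// the viewing event and the two documents (the promoted one and the one being viewed) are looked
// up next, the per-dataset frequency counts loaded by load_dataset_data are fetched, and then
// features are emitted field by field through ffm_feature_vector_builder below.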
85 | auto event = data.events[event_id]; 86 | 87 | auto ad_doc = data.documents.at(ad.document_id); 88 | auto ad_doc_categories = data.document_categories.equal_range(ad.document_id); 89 | //auto ad_doc_topics = data.document_topics.equal_range(ad.document_id); 90 | //auto ad_doc_entities = data.document_entities.equal_range(ad.document_id); 91 | 92 | auto ev_doc = data.documents.at(event.document_id); 93 | auto ev_doc_categories = data.document_categories.equal_range(event.document_id); 94 | //auto ev_doc_topics = data.document_topics.equal_range(event.document_id); 95 | //auto ev_doc_entities = data.document_entities.equal_range(event.document_id); 96 | 97 | // Get counts 98 | auto ad_count = ad_counts.at(ad_id); 99 | auto ad_campaign_count = ad_campaign_counts.at(ad.campaign_id); 100 | auto ad_advertiser_count = ad_advertiser_counts.at(ad.advertiser_id); 101 | 102 | auto ad_doc_count = ad_doc_counts.at(ad.document_id); 103 | auto ad_doc_source_count = ad_doc_source_counts.at(ad_doc.source_id); 104 | auto ad_doc_publisher_count = ad_doc_publisher_counts.at(ad_doc.publisher_id); 105 | 106 | auto ev_doc_count = ev_doc_counts.at(event.document_id); 107 | auto ev_doc_source_count = ev_doc_source_counts.at(ev_doc.source_id); 108 | auto ev_doc_publisher_count = ev_doc_publisher_counts.at(ev_doc.publisher_id); 109 | 110 | auto uid_count = uid_counts.at(event.uid); 111 | 112 | // Start building line 113 | ffm_feature_vector_builder features(200); 114 | 115 | // Event features 116 | features.hashed(0, event.platform); 117 | features.hashed(1, event.country); 118 | features.hashed(2, event.state); 119 | //features.hashed(, event.region); 120 | features.hashed(3, uid_count < 50 ? uid_count : event.uid + 100); 121 | 122 | // Document info 123 | features.hashed(4, ev_doc_count < 50 ? ev_doc_count : event.document_id + 100); 124 | features.hashed(5, ev_doc_source_count < 10 ? ev_doc_source_count : ev_doc.source_id + 10); 125 | features.hashed(6, ev_doc_publisher_count < 10 ? 
ev_doc_publisher_count : ev_doc.publisher_id + 10); 126 | 127 | for (auto it = ev_doc_categories.first; it != ev_doc_categories.second; ++ it) 128 | features.hashed(7, it->second.first, it->second.second); 129 | /* 130 | for (auto it = ev_doc_topics.first; it != ev_doc_topics.second; ++ it) 131 | features.hashed(14, it->second.first, it->second.second); 132 | 133 | for (auto it = ev_doc_entities.first; it != ev_doc_entities.second; ++ it) 134 | features.hashed(16, it->second.first, it->second.second); 135 | */ 136 | 137 | // Common features 138 | 139 | // Same feature markers 140 | 141 | if (ad_doc.publisher_id == ev_doc.publisher_id) 142 | features.raw(10, 0); // Same publisher 143 | 144 | if (ad_doc.source_id == ev_doc.source_id) 145 | features.raw(10, 1); // Same source 146 | 147 | // Document view features (including leak) 148 | 149 | if (stoi(rows[1][0]) > 0) 150 | features.raw(11, 2); // Viewed ad document (leak) 151 | 152 | if (stoi(rows[1][1]) > 0) 153 | features.raw(11, 3); // Not viewed ad document (leak) 154 | 155 | if (stoi(rows[2][0]) > 0) 156 | features.raw(11, 4); // Viewed documents of same publisher 157 | 158 | if (stoi(rows[2][1]) > 0) 159 | features.raw(11, 5); // Viewed documents of same source 160 | 161 | if (stof(rows[3][0]) > 0) 162 | features.raw(11, 6); // Viewed documents of the similar category 163 | 164 | if (stof(rows[4][0]) > 0) 165 | features.raw(11, 7); // Viewed documents of the similar topic 166 | 167 | // Ad view/click features 168 | 169 | auto & v_ad_row = rows[5]; 170 | auto & v_ad_src_row = rows[6]; 171 | auto & v_ad_cat_row = rows[7]; 172 | auto & v_ad_top_row = rows[8]; 173 | 174 | if (stoi(v_ad_row[2]) > 0) 175 | features.raw(12, 20); // Viewed this ad earlier 176 | 177 | if (stoi(v_ad_row[1]) > 0) 178 | features.raw(12, 21); // Clicked this ad earlier 179 | 180 | if (stoi(v_ad_row[5]) > 0) 181 | features.raw(12, 22); // Viewed this ad doc earlier 182 | 183 | if (stoi(v_ad_row[4]) > 0) 184 | features.raw(12, 23); // Clicked this ad doc earlier 185 | 186 | 187 | if (stoi(v_ad_src_row[2]) > 0) 188 | features.raw(12, 24); // Viewed ad of the same publisher earlier 189 | 190 | if (stoi(v_ad_src_row[1]) > 0) 191 | features.raw(12, 25); // Clicked ad of the same publisher earlier 192 | 193 | if (stoi(v_ad_src_row[5]) > 0) 194 | features.raw(12, 26); // Viewed ad of the same source earlier 195 | 196 | if (stoi(v_ad_src_row[4]) > 0) 197 | features.raw(12, 27); // Clicked ad of the same source earlier 198 | 199 | 200 | if (stof(v_ad_cat_row[2]) > 0) 201 | features.raw(12, 28); // Viewed ad of the similar category 202 | 203 | if (stof(v_ad_cat_row[1]) > 0) 204 | features.raw(12, 29); // Clicked ad of the similar category 205 | 206 | 207 | if (stof(v_ad_top_row[2]) > 0) 208 | features.raw(12, 30); // Viewed ad of the similar topic 209 | 210 | if (stof(v_ad_top_row[1]) > 0) 211 | features.raw(12, 32); // Clicked ad of the similar topic 212 | 213 | 214 | features.raw(13, event.weekday + 50); 215 | features.raw(13, event.hour + 70); 216 | 217 | features.raw(14, 80, pos_time_diff(event.timestamp - ad_doc.publish_timestamp)); 218 | features.raw(14, 81, time_diff(ev_doc.publish_timestamp - ad_doc.publish_timestamp)); 219 | 220 | features.raw(18, stoi(rows[9][0]) + 180); // Rival count 221 | 222 | // Rival ids 223 | auto rival_ids = split(rows[9][1], ' '); 224 | for (uint ri = 0; ri < rival_ids.size(); ++ ri) { 225 | auto rival_id = stoi(rival_ids[ri]); 226 | 227 | if (rival_id != ad_id) 228 | features.hashed(20, rival_id); 229 | } 230 | 231 | auto doc_ad_others_it = 
data.doc_ad_others.find(event_id); 232 | if (doc_ad_others_it != data.doc_ad_others.end()) { 233 | auto ids = doc_ad_others_it->second; 234 | 235 | for (uint i = 0; i < ids.size(); ++ i) 236 | features.hashed(21, ids[i]); 237 | } 238 | 239 | // Similarity features 240 | /* 241 | for (uint i = 0; i < rows[2].size(); ++ i) 242 | if (stof(rows[2][i]) > 0) 243 | features.raw(26 + i, 6 + i, stof(rows[2][i])); 244 | */ 245 | 246 | // Ad features 247 | features.hashed(30, ad_count < 50 ? ad_count : ad_id + 100); 248 | features.hashed(31, ad_campaign_count < 50 ? ad_campaign_count : ad.campaign_id + 100); 249 | features.hashed(32, ad_advertiser_count < 50 ? ad_advertiser_count : ad.advertiser_id + 100); 250 | 251 | // Promoted document info 252 | features.hashed(33, ad_doc_count < 50 ? ad_doc_count : ad.document_id + 100); 253 | features.hashed(34, ad_doc_source_count < 10 ? ad_doc_source_count : ad_doc.source_id + 10); 254 | features.hashed(35, ad_doc_publisher_count < 10 ? ad_doc_publisher_count : ad_doc.publisher_id + 10); 255 | 256 | for (auto it = ad_doc_categories.first; it != ad_doc_categories.second; ++ it) 257 | features.hashed(36, it->second.first, it->second.second); 258 | /* 259 | for (auto it = ad_doc_topics.first; it != ad_doc_topics.second; ++ it) 260 | features.hashed(15, it->second.first, it->second.second); 261 | 262 | for (auto it = ad_doc_entities.first; it != ad_doc_entities.second; ++ it) 263 | features.hashed(17, it->second.first, it->second.second); 264 | */ 265 | 266 | // Write data 267 | auto offset = data_out.write(features.data()); 268 | 269 | // Update index 270 | index.size ++; 271 | index.labels.push_back(rows[0].size() == 3 ? stof(rows[0][2]) * 2 - 1 : 0); 272 | index.offsets.push_back(offset); 273 | index.norms.push_back(features.norm()); 274 | index.groups.push_back(event_id); 275 | } 276 | 277 | 278 | void writer::finish() { 279 | ffm_write_index(file_name + ".index", index); 280 | } 281 | 282 | int main() { 283 | using namespace std; 284 | 285 | cout << "Loading reference data..." << endl; 286 | auto data = load_reference_data(); 287 | 288 | cout << "Generating files..." << endl; 289 | generate_files(data, build_filesets(files, features, "_bin_p1")); 290 | 291 | cout << "Done." 
<< endl;
292 | }
293 |
--------------------------------------------------------------------------------
/export-bin-data-f2.cpp:
--------------------------------------------------------------------------------
1 | #include "util/io.h"
2 | #include "util/data.h"
3 | #include "util/generation.h"
4 | #include "util/helpers.h"
5 |
6 | #include "ffm.h"
7 |
8 | std::vector<std::pair<std::string, std::string>> files = {
9 | { "cache/clicks_cv2_train.csv.gz", "cv2_train" },
10 | { "cache/clicks_cv2_test.csv.gz", "cv2_test" },
11 | { "cache/clicks_cv1_train.csv.gz", "cv1_train" },
12 | { "cache/clicks_cv1_test.csv.gz", "cv1_test" },
13 | { "../input/clicks_train.csv.gz", "full_train" },
14 | { "../input/clicks_test.csv.gz", "full_test" },
15 | };
16 |
17 | std::vector<std::string> features = {
18 | "leak",
19 | "viewed_docs", "viewed_categories", "viewed_topics",
20 | "uid_viewed_ads", "uid_viewed_ad_cmps", "uid_viewed_ad_srcs", "uid_viewed_ad_cats", "uid_viewed_ad_tops"
21 | };
22 |
23 | std::string cur_dataset;
24 |
25 | std::unordered_map<int, uint32_t> ad_counts;
26 | std::unordered_map<int, uint32_t> ad_campaign_counts;
27 | std::unordered_map<int, uint32_t> ad_advertiser_counts;
28 | std::unordered_map<int, uint32_t> ad_doc_counts;
29 | std::unordered_map<int, uint32_t> ad_doc_source_counts;
30 | std::unordered_map<int, uint32_t> ad_doc_publisher_counts;
31 | std::unordered_map<int, uint32_t> ev_doc_counts;
32 | std::unordered_map<int, uint32_t> ev_doc_source_counts;
33 | std::unordered_map<int, uint32_t> ev_doc_publisher_counts;
34 | std::unordered_map<int, uint32_t> uid_counts;
35 |
36 |
37 | void load_dataset_data(const std::string & dataset) {
38 | if (cur_dataset == dataset)
39 | return;
40 |
41 | cur_dataset = dataset;
42 | std::cout << "Loading " << dataset << " data..." << std::endl;
43 |
44 | ad_counts = read_map(std::string("cache/ad_counts_") + dataset + std::string(".csv.gz"), read_count);
45 | ad_campaign_counts = read_map(std::string("cache/ad_campaign_counts_") + dataset + std::string(".csv.gz"), read_count);
46 | ad_advertiser_counts = read_map(std::string("cache/ad_advertiser_counts_") + dataset + std::string(".csv.gz"), read_count);
47 |
48 | ad_doc_counts = read_map(std::string("cache/ad_doc_counts_") + dataset + std::string(".csv.gz"), read_count);
49 | ad_doc_source_counts = read_map(std::string("cache/ad_doc_source_counts_") + dataset + std::string(".csv.gz"), read_count);
50 | ad_doc_publisher_counts = read_map(std::string("cache/ad_doc_publisher_counts_") + dataset + std::string(".csv.gz"), read_count);
51 |
52 | ev_doc_counts = read_map(std::string("cache/ev_doc_counts_") + dataset + std::string(".csv.gz"), read_count);
53 | ev_doc_source_counts = read_map(std::string("cache/ev_doc_source_counts_") + dataset + std::string(".csv.gz"), read_count);
54 | ev_doc_publisher_counts = read_map(std::string("cache/ev_doc_publisher_counts_") + dataset + std::string(".csv.gz"), read_count);
55 | uid_counts = read_map(std::string("cache/uid_counts_") + dataset + std::string(".csv.gz"), read_count);
56 | }
57 |
58 |
59 | class writer {
60 | std::string file_name;
61 |
62 | ffm_stream_data_writer data_out;
63 | ffm_index index;
64 | public:
65 | writer(const std::string & file_name): file_name(file_name), data_out(file_name + ".data") {
66 | index.size = 0;
67 | index.offsets.push_back(0);
68 |
69 | load_dataset_data(file_name.substr(6, file_name.find("_") - 6));
70 | }
71 |
72 | void write(const reference_data & data, const std::vector<std::vector<std::string>> & rows);
73 | void finish();
74 | };
75 |
76 |
77 | void writer::write(const reference_data & data, const std::vector<std::vector<std::string>> & rows) {
78 | int event_id = stoi(rows[0][0]);
79 | int ad_id = stoi(rows[0][1]);
80 |
81 | //
82 |
83 | auto ad =
data.ads[ad_id]; 84 | auto event = data.events[event_id]; 85 | 86 | auto ad_doc = data.documents.at(ad.document_id); 87 | auto ad_doc_categories = data.document_categories.equal_range(ad.document_id); 88 | auto ad_doc_topics = data.document_topics.equal_range(ad.document_id); 89 | //auto ad_doc_entities = data.document_entities.equal_range(ad.document_id); 90 | 91 | auto ev_doc = data.documents.at(event.document_id); 92 | auto ev_doc_categories = data.document_categories.equal_range(event.document_id); 93 | auto ev_doc_topics = data.document_topics.equal_range(event.document_id); 94 | //auto ev_doc_entities = data.document_entities.equal_range(event.document_id); 95 | 96 | // Get counts 97 | auto ad_count = ad_counts.at(ad_id); 98 | auto ad_campaign_count = ad_campaign_counts.at(ad.campaign_id); 99 | auto ad_advertiser_count = ad_advertiser_counts.at(ad.advertiser_id); 100 | 101 | auto ad_doc_count = ad_doc_counts.at(ad.document_id); 102 | auto ad_doc_source_count = ad_doc_source_counts.at(ad_doc.source_id); 103 | auto ad_doc_publisher_count = ad_doc_publisher_counts.at(ad_doc.publisher_id); 104 | 105 | auto ev_doc_count = ev_doc_counts.at(event.document_id); 106 | auto ev_doc_source_count = ev_doc_source_counts.at(ev_doc.source_id); 107 | auto ev_doc_publisher_count = ev_doc_publisher_counts.at(ev_doc.publisher_id); 108 | 109 | auto uid_count = uid_counts.at(event.uid); 110 | 111 | // Start building line 112 | ffm_feature_vector_builder features(200); 113 | 114 | // Event features 115 | features.hashed(0, event.platform); 116 | features.hashed(1, event.country); 117 | features.hashed(2, event.country + event.state); 118 | //features.hashed(, event.region); 119 | features.hashed(3, uid_count < 50 ? uid_count : event.uid + 100); 120 | 121 | // Document info 122 | features.hashed(4, ev_doc_count < 50 ? ev_doc_count : event.document_id + 100); 123 | features.hashed(5, ev_doc_source_count < 10 ? ev_doc_source_count : ev_doc.source_id + 10); 124 | features.hashed(6, ev_doc_publisher_count < 10 ? 
ev_doc_publisher_count : ev_doc.publisher_id + 10); 125 | 126 | for (auto it = ev_doc_categories.first; it != ev_doc_categories.second; ++ it) 127 | features.hashed(7, it->second.first, it->second.second); 128 | 129 | for (auto it = ev_doc_topics.first; it != ev_doc_topics.second; ++ it) 130 | features.hashed(8, it->second.first, it->second.second); 131 | /* 132 | for (auto it = ev_doc_entities.first; it != ev_doc_entities.second; ++ it) 133 | features.hashed(16, it->second.first, it->second.second); 134 | */ 135 | 136 | // Common features 137 | 138 | // Same feature markers 139 | 140 | if (ad_doc.publisher_id == ev_doc.publisher_id) 141 | features.raw(10, 0); // Same publisher 142 | 143 | if (ad_doc.source_id == ev_doc.source_id) 144 | features.raw(10, 1); // Same source 145 | 146 | // Document view features (including leak) 147 | 148 | if (stoi(rows[1][0]) > 0) 149 | features.raw(11, 2); // Viewed ad document (leak) 150 | 151 | if (stoi(rows[1][1]) > 0) 152 | features.raw(11, 3); // Not viewed ad document (leak) 153 | 154 | if (stoi(rows[2][0]) > 0) 155 | features.raw(11, 4); // Viewed documents of same publisher 156 | 157 | if (stoi(rows[2][1]) > 0) 158 | features.raw(11, 5); // Viewed documents of same source 159 | 160 | if (stof(rows[3][0]) > 0) 161 | features.raw(11, 6); // Viewed documents of the similar category 162 | 163 | if (stof(rows[4][0]) > 0) 164 | features.raw(11, 7); // Viewed documents of the similar topic 165 | 166 | // Ad view/click features 167 | 168 | auto & v_ad_row = rows[5]; 169 | auto & v_ad_cmp_row = rows[6]; 170 | auto & v_ad_src_row = rows[7]; 171 | auto & v_ad_cat_row = rows[8]; 172 | auto & v_ad_top_row = rows[9]; 173 | 174 | if (stoi(v_ad_row[2]) > 0) 175 | features.raw(12, 20); // Viewed this ad earlier 176 | 177 | if (stoi(v_ad_row[1]) > 0) 178 | features.raw(12, 21); // Clicked this ad earlier 179 | 180 | if (stoi(v_ad_row[5]) > 0) 181 | features.raw(12, 22); // Viewed this ad doc earlier 182 | 183 | if (stoi(v_ad_row[4]) > 0) 184 | features.raw(12, 23); // Clicked this ad doc earlier 185 | 186 | 187 | // if (stoi(v_ad_cmp_row[1]) > 0) 188 | // features.raw(12, 33); // Clicked ad of the same campaign earlier 189 | 190 | 191 | if (stoi(v_ad_src_row[2]) > 0) 192 | features.raw(12, 24); // Viewed ad of the same publisher earlier 193 | 194 | if (stoi(v_ad_src_row[1]) > 0) 195 | features.raw(12, 25); // Clicked ad of the same publisher earlier 196 | 197 | if (stoi(v_ad_src_row[5]) > 0) 198 | features.raw(12, 26); // Viewed ad of the same source earlier 199 | 200 | if (stoi(v_ad_src_row[4]) > 0) 201 | features.raw(12, 27); // Clicked ad of the same source earlier 202 | 203 | 204 | if (stof(v_ad_cat_row[2]) > 0) 205 | features.raw(12, 28); // Viewed ad of the similar category 206 | 207 | if (stof(v_ad_cat_row[1]) > 0) 208 | features.raw(12, 29); // Clicked ad of the similar category 209 | 210 | 211 | if (stof(v_ad_top_row[2]) > 0) 212 | features.raw(12, 30); // Viewed ad of the similar topic 213 | 214 | if (stof(v_ad_top_row[1]) > 0) 215 | features.raw(12, 32); // Clicked ad of the similar topic 216 | 217 | // Ad view features from future 218 | 219 | if (stoi(v_ad_row[8]) > 0) 220 | features.raw(13, 40); // Viewed this ad later 221 | 222 | if (stoi(v_ad_row[11]) > 0) 223 | features.raw(13, 41); // Viewed this ad doc later 224 | 225 | 226 | if (stoi(v_ad_row[8]) == 0 && stoi(v_ad_cmp_row[8]) > 0) 227 | features.raw(14, 42); // Not viewed this ad doc later however viewed this campaign 228 | 229 | // CTR features 230 | 231 | features.raw(17, 43, 
ctr_logit(stoi(v_ad_src_row[3]) + stoi(v_ad_src_row[9]), stoi(v_ad_src_row[4]) + stoi(v_ad_src_row[10]))); // CTR logit of past and future source clicks 232 | 233 | // Other features 234 | 235 | features.raw(15, event.weekday + 50); 236 | features.raw(15, event.hour + 70); 237 | 238 | features.raw(16, 80, pos_time_diff(event.timestamp - ad_doc.publish_timestamp)); 239 | features.raw(16, 81, time_diff(ev_doc.publish_timestamp - ad_doc.publish_timestamp)); 240 | 241 | // Similarity features 242 | /* 243 | for (uint i = 0; i < rows[2].size(); ++ i) 244 | if (stof(rows[2][i]) > 0) 245 | features.raw(26 + i, 6 + i, stof(rows[2][i])); 246 | */ 247 | 248 | // Ad features 249 | features.hashed(30, ad_count < 50 ? ad_count : ad_id + 100); 250 | features.hashed(31, ad_campaign_count < 50 ? ad_campaign_count : ad.campaign_id + 100); 251 | features.hashed(32, ad_advertiser_count < 50 ? ad_advertiser_count : ad.advertiser_id + 100); 252 | 253 | // Promoted document info 254 | features.hashed(33, ad_doc_count < 50 ? ad_doc_count : ad.document_id + 100); 255 | features.hashed(34, ad_doc_source_count < 10 ? ad_doc_source_count : ad_doc.source_id + 10); 256 | features.hashed(35, ad_doc_publisher_count < 10 ? ad_doc_publisher_count : ad_doc.publisher_id + 10); 257 | 258 | for (auto it = ad_doc_categories.first; it != ad_doc_categories.second; ++ it) 259 | features.hashed(36, it->second.first, it->second.second); 260 | 261 | for (auto it = ad_doc_topics.first; it != ad_doc_topics.second; ++ it) 262 | features.hashed(37, it->second.first, it->second.second); 263 | /* 264 | for (auto it = ad_doc_entities.first; it != ad_doc_entities.second; ++ it) 265 | features.hashed(17, it->second.first, it->second.second); 266 | */ 267 | 268 | // Write data 269 | auto offset = data_out.write(features.data()); 270 | 271 | // Update index 272 | index.size ++; 273 | index.labels.push_back(rows[0].size() == 3 ? stof(rows[0][2]) * 2 - 1 : 0); 274 | index.offsets.push_back(offset); 275 | index.norms.push_back(features.norm()); 276 | index.groups.push_back(event_id); 277 | } 278 | 279 | 280 | void writer::finish() { 281 | ffm_write_index(file_name + ".index", index); 282 | } 283 | 284 | int main() { 285 | using namespace std; 286 | 287 | cout << "Loading reference data..." << endl; 288 | auto data = load_reference_data(); 289 | 290 | cout << "Generating files..." << endl; 291 | generate_files(data, build_filesets(files, features, "_bin_f2")); 292 | 293 | cout << "Done." 
<< endl;
294 | }
295 |
--------------------------------------------------------------------------------
/export-bin-data-f4.cpp:
--------------------------------------------------------------------------------
1 | #include "util/io.h"
2 | #include "util/data.h"
3 | #include "util/generation.h"
4 | #include "util/helpers.h"
5 |
6 | #include "ffm.h"
7 |
8 | std::vector<std::pair<std::string, std::string>> files = {
9 | { "cache/clicks_cv2_train.csv.gz", "cv2_train" },
10 | { "cache/clicks_cv2_test.csv.gz", "cv2_test" },
11 | { "cache/clicks_cv1_train.csv.gz", "cv1_train" },
12 | { "cache/clicks_cv1_test.csv.gz", "cv1_test" },
13 | { "../input/clicks_train.csv.gz", "full_train" },
14 | { "../input/clicks_test.csv.gz", "full_test" },
15 | };
16 |
17 | std::vector<std::string> features = {
18 | "leak",
19 | "viewed_docs", "viewed_categories", "viewed_topics",
20 | "uid_viewed_ads", "uid_viewed_ad_cmps", "uid_viewed_ad_srcs", "uid_viewed_ad_cats", "uid_viewed_ad_tops",
21 | "rivals"//, "similarity"
22 | };
23 |
24 | std::string cur_dataset;
25 |
26 | struct cnt {
27 | uint32_t min_count;
28 | uint32_t sum_count;
29 | };
30 |
31 | std::pair<int, cnt> read_cnt(const std::vector<std::string> & row) {
32 | int id = stoi(row[0]);
33 | uint32_t train_count = stoi(row[1]);
34 | uint32_t test_count = stoi(row[2]);
35 |
36 | cnt c;
37 | c.min_count = min(train_count, test_count);
38 | c.sum_count = train_count + test_count;
39 |
40 | return std::make_pair(id, c);
41 | }
42 |
43 | std::unordered_map<int, cnt> ad_counts;
44 | std::unordered_map<int, cnt> ad_campaign_counts;
45 | std::unordered_map<int, cnt> ad_advertiser_counts;
46 | std::unordered_map<int, cnt> ad_doc_counts;
47 | std::unordered_map<int, cnt> ad_doc_source_counts;
48 | std::unordered_map<int, cnt> ad_doc_publisher_counts;
49 | std::unordered_map<int, cnt> ev_doc_counts;
50 | std::unordered_map<int, cnt> ev_doc_source_counts;
51 | std::unordered_map<int, cnt> ev_doc_publisher_counts;
52 | std::unordered_map<int, cnt> uid_counts;
53 |
54 |
55 | void load_dataset_data(const std::string & dataset) {
56 | if (cur_dataset == dataset)
57 | return;
58 |
59 | cur_dataset = dataset;
60 | std::cout << "Loading " << dataset << " data..."
<< std::endl;
61 |
62 | ad_counts = read_map(std::string("cache/counts/ads_") + dataset + std::string(".csv.gz"), read_cnt);
63 | ad_campaign_counts = read_map(std::string("cache/counts/ad_campaigns_") + dataset + std::string(".csv.gz"), read_cnt);
64 | ad_advertiser_counts = read_map(std::string("cache/counts/ad_advertisers_") + dataset + std::string(".csv.gz"), read_cnt);
65 |
66 | ad_doc_counts = read_map(std::string("cache/counts/ad_docs_") + dataset + std::string(".csv.gz"), read_cnt);
67 | ad_doc_source_counts = read_map(std::string("cache/counts/ad_doc_sources_") + dataset + std::string(".csv.gz"), read_cnt);
68 | ad_doc_publisher_counts = read_map(std::string("cache/counts/ad_doc_publishers_") + dataset + std::string(".csv.gz"), read_cnt);
69 |
70 | ev_doc_counts = read_map(std::string("cache/counts/ev_docs_") + dataset + std::string(".csv.gz"), read_cnt);
71 | ev_doc_source_counts = read_map(std::string("cache/counts/ev_doc_sources_") + dataset + std::string(".csv.gz"), read_cnt);
72 | ev_doc_publisher_counts = read_map(std::string("cache/counts/ev_doc_publishers_") + dataset + std::string(".csv.gz"), read_cnt);
73 | uid_counts = read_map(std::string("cache/counts/uids_") + dataset + std::string(".csv.gz"), read_cnt);
74 | }
75 |
76 |
77 | class writer {
78 | std::string file_name;
79 |
80 | ffm_stream_data_writer data_out;
81 | ffm_index index;
82 | public:
83 | writer(const std::string & file_name): file_name(file_name), data_out(file_name + ".data") {
84 | index.size = 0;
85 | index.offsets.push_back(0);
86 |
87 | load_dataset_data(file_name.substr(6, file_name.find("_") - 6));
88 | }
89 |
90 | void write(const reference_data & data, const std::vector<std::vector<std::string>> & rows);
91 | void finish();
92 | };
93 |
94 |
95 | void writer::write(const reference_data & data, const std::vector<std::vector<std::string>> & rows) {
96 | int event_id = stoi(rows[0][0]);
97 | int ad_id = stoi(rows[0][1]);
98 |
99 | //
100 |
101 | auto ad = data.ads[ad_id];
102 | auto event = data.events[event_id];
103 |
104 | auto ad_doc = data.documents.at(ad.document_id);
105 | auto ad_doc_categories = data.document_categories.equal_range(ad.document_id);
106 | //auto ad_doc_topics = data.document_topics.equal_range(ad.document_id);
107 | //auto ad_doc_entities = data.document_entities.equal_range(ad.document_id);
108 |
109 | auto ev_doc = data.documents.at(event.document_id);
110 | auto ev_doc_categories = data.document_categories.equal_range(event.document_id);
111 | //auto ev_doc_topics = data.document_topics.equal_range(event.document_id);
112 | //auto ev_doc_entities = data.document_entities.equal_range(event.document_id);
113 |
114 | // Get counts
115 | auto ad_count = ad_counts.at(ad_id);
116 | auto ad_campaign_count = ad_campaign_counts.at(ad.campaign_id);
117 | auto ad_advertiser_count = ad_advertiser_counts.at(ad.advertiser_id);
118 |
119 | auto ad_doc_count = ad_doc_counts.at(ad.document_id);
120 | auto ad_doc_source_count = ad_doc_source_counts.at(ad_doc.source_id);
121 | auto ad_doc_publisher_count = ad_doc_publisher_counts.at(ad_doc.publisher_id);
122 |
123 | auto ev_doc_count = ev_doc_counts.at(event.document_id);
124 | auto ev_doc_source_count = ev_doc_source_counts.at(ev_doc.source_id);
125 | auto ev_doc_publisher_count = ev_doc_publisher_counts.at(ev_doc.publisher_id);
126 |
127 | auto uid_count = uid_counts.at(event.uid);
128 |
129 | // Start building line
130 | ffm_feature_vector_builder features(700);
131 |
132 | // Event features
133 | features.hashed(0, event.platform);
134 | features.hashed(1, event.country);
135 |
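// The hashed id features below use a frequency-encoding fallback (an inference
// from the expressions themselves): when an id is rare, i.e. its min_count over
// the train/test splits is under the threshold, the feature value is its total
// occurrence count (sum_count), so rare ids are grouped by how often they occur,
// while frequent ids keep their own value, offset so it cannot collide with the
// small count values.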
features.hashed(2, event.state); 136 | //features.hashed(, event.region); 137 | features.hashed(3, uid_count.min_count < 50 ? uid_count.sum_count : event.uid + 100); 138 | 139 | // Document info 140 | features.hashed(4, ev_doc_count.min_count < 5 ? ev_doc_count.sum_count : event.document_id + 100); 141 | features.hashed(5, ev_doc_source_count.min_count < 5 ? ev_doc_source_count.sum_count : ev_doc.source_id + 10); 142 | features.hashed(6, ev_doc_publisher_count.min_count < 5 ? ev_doc_publisher_count.sum_count : ev_doc.publisher_id + 10); 143 | 144 | for (auto it = ev_doc_categories.first; it != ev_doc_categories.second; ++ it) 145 | features.hashed(7, it->second.first, it->second.second); 146 | 147 | // Common features 148 | 149 | // Same feature markers 150 | 151 | if (ad_doc.publisher_id == ev_doc.publisher_id) 152 | features.raw(10, 0); // Same publisher 153 | 154 | if (ad_doc.source_id == ev_doc.source_id) 155 | features.raw(10, 1); // Same source 156 | 157 | // Document view features (including leak) 158 | 159 | if (stoi(rows[1][0]) > 0) 160 | features.raw(11, 2); // Viewed ad document (leak) 161 | 162 | if (stoi(rows[1][1]) > 0) 163 | features.raw(11, 3); // Not viewed ad document (leak) 164 | 165 | if (stoi(rows[2][0]) > 0) 166 | features.raw(11, 4); // Viewed documents of same publisher 167 | 168 | if (stoi(rows[2][1]) > 0) 169 | features.raw(11, 5); // Viewed documents of same source 170 | 171 | if (stof(rows[3][0]) > 0) 172 | features.raw(11, 6); // Viewed documents of the similar category 173 | 174 | if (stof(rows[4][0]) > 0) 175 | features.raw(11, 7); // Viewed documents of the similar topic 176 | 177 | // Ad view/click features 178 | 179 | auto & v_ad_row = rows[5]; 180 | auto & v_ad_cmp_row = rows[6]; 181 | auto & v_ad_src_row = rows[7]; 182 | auto & v_ad_cat_row = rows[8]; 183 | auto & v_ad_top_row = rows[9]; 184 | 185 | if (stoi(v_ad_row[2]) > 0) 186 | features.raw(12, 20); // Viewed this ad earlier 187 | 188 | if (stoi(v_ad_row[1]) > 0) 189 | features.raw(12, 21); // Clicked this ad earlier 190 | 191 | if (stoi(v_ad_row[5]) > 0) 192 | features.raw(12, 22); // Viewed this ad doc earlier 193 | 194 | if (stoi(v_ad_row[4]) > 0) 195 | features.raw(12, 23); // Clicked this ad doc earlier 196 | 197 | 198 | // if (stoi(v_ad_cmp_row[1]) > 0) 199 | // features.raw(12, 33); // Clicked ad of the same campaign earlier 200 | 201 | 202 | if (stoi(v_ad_src_row[2]) > 0) 203 | features.raw(12, 24); // Viewed ad of the same publisher earlier 204 | 205 | if (stoi(v_ad_src_row[1]) > 0) 206 | features.raw(12, 25); // Clicked ad of the same publisher earlier 207 | 208 | if (stoi(v_ad_src_row[5]) > 0) 209 | features.raw(12, 26); // Viewed ad of the same source earlier 210 | 211 | if (stoi(v_ad_src_row[4]) > 0) 212 | features.raw(12, 27); // Clicked ad of the same source earlier 213 | 214 | 215 | if (stof(v_ad_cat_row[2]) > 0) 216 | features.raw(12, 28); // Viewed ad of the similar category 217 | 218 | if (stof(v_ad_cat_row[1]) > 0) 219 | features.raw(12, 29); // Clicked ad of the similar category 220 | 221 | 222 | if (stof(v_ad_top_row[2]) > 0) 223 | features.raw(12, 30); // Viewed ad of the similar topic 224 | 225 | if (stof(v_ad_top_row[1]) > 0) 226 | features.raw(12, 32); // Clicked ad of the similar topic 227 | 228 | // Ad view features from future 229 | 230 | if (stoi(v_ad_row[8]) > 0) 231 | features.raw(13, 40); // Viewed this ad later 232 | 233 | if (stoi(v_ad_row[11]) > 0) 234 | features.raw(13, 41); // Viewed this ad doc later 235 | 236 | 237 | if (stoi(v_ad_row[8]) == 0 && 
stoi(v_ad_cmp_row[8]) > 0) 238 | features.raw(14, 42); // Not viewed this ad doc later however viewed this campaign 239 | 240 | // CTR features 241 | 242 | features.raw(17, 43, ctr_logit(stoi(v_ad_src_row[3]) + stoi(v_ad_src_row[9]), stoi(v_ad_src_row[4]) + stoi(v_ad_src_row[10]))); // CTR logit of past and future source clicks 243 | 244 | // Other features 245 | 246 | features.raw(15, event.weekday + 70); 247 | features.raw(15, event.hour + 50); 248 | 249 | features.raw(16, 80, pos_time_diff(event.timestamp - ad_doc.publish_timestamp)); 250 | features.raw(16, 81, time_diff(ev_doc.publish_timestamp - ad_doc.publish_timestamp)); 251 | 252 | features.raw(18, stoi(rows[10][0]) + 180); // Rival count 253 | 254 | // Rival ids 255 | auto rival_ids = split(rows[10][1], ' '); 256 | for (uint ri = 0; ri < rival_ids.size(); ++ ri) { 257 | auto rival_id = stoi(rival_ids[ri]); 258 | 259 | if (rival_id != ad_id) 260 | features.hashed(20, rival_id); 261 | } 262 | 263 | auto vd_oha_it = data.viewed_docs_one_hour_after.find(std::make_pair(event.uid, event.timestamp)); 264 | if (vd_oha_it != data.viewed_docs_one_hour_after.end()) { 265 | auto doc_ids = vd_oha_it->second; 266 | 267 | for (uint i = 0; i < doc_ids.size(); ++ i) 268 | features.hashed(19, doc_ids[i]); 269 | } 270 | 271 | auto doc_ad_others_it = data.doc_ad_others.find(event_id); 272 | if (doc_ad_others_it != data.doc_ad_others.end()) { 273 | auto ids = doc_ad_others_it->second; 274 | 275 | for (uint i = 0; i < ids.size(); ++ i) 276 | features.hashed(21, ids[i]); 277 | } 278 | 279 | auto doc_trf_it = data.viewed_doc_trf_source.find(event.uid); 280 | if (doc_trf_it != data.viewed_doc_trf_source.end()) { 281 | auto ids = doc_trf_it->second; 282 | 283 | for (uint i = 0; i < ids.size(); ++ i) 284 | features.hashed(22, ids[i]); 285 | } 286 | 287 | auto doc_src_it = data.viewed_doc_sources.find(event.uid); 288 | if (doc_src_it != data.viewed_doc_sources.end()) { 289 | auto ids = doc_src_it->second; 290 | 291 | for (uint i = 0; i < ids.size(); ++ i) 292 | features.hashed(23, ids[i]); 293 | } 294 | 295 | // Ad features 296 | features.hashed(30, ad_count.min_count < 5 ? ad_count.sum_count : ad_id + 100); 297 | features.hashed(31, ad_campaign_count.min_count < 5 ? ad_campaign_count.sum_count : ad.campaign_id + 100); 298 | features.hashed(32, ad_advertiser_count.min_count < 5 ? ad_advertiser_count.sum_count : ad.advertiser_id + 100); 299 | 300 | // Promoted document info 301 | features.hashed(33, ad_doc_count.min_count < 5 ? ad_doc_count.sum_count : ad.document_id + 100); 302 | features.hashed(34, ad_doc_source_count.min_count < 5 ? ad_doc_source_count.sum_count : ad_doc.source_id + 10); 303 | features.hashed(35, ad_doc_publisher_count.min_count < 5 ? ad_doc_publisher_count.sum_count : ad_doc.publisher_id + 10); 304 | 305 | for (auto it = ad_doc_categories.first; it != ad_doc_categories.second; ++ it) 306 | features.hashed(36, it->second.first, it->second.second); 307 | 308 | // Write data 309 | auto offset = data_out.write(features.data()); 310 | 311 | // Update index 312 | index.size ++; 313 | index.labels.push_back(rows[0].size() == 3 ? stof(rows[0][2]) * 2 - 1 : 0); 314 | index.offsets.push_back(offset); 315 | index.norms.push_back(features.norm()); 316 | index.groups.push_back(event_id); 317 | } 318 | 319 | 320 | void writer::finish() { 321 | ffm_write_index(file_name + ".index", index); 322 | } 323 | 324 | int main() { 325 | using namespace std; 326 | 327 | cout << "Loading reference data..." 
<< endl; 328 | auto data = load_reference_data(); 329 | 330 | cout << "Generating files..." << endl; 331 | generate_files(data, build_filesets(files, features, "_bin_f4")); 332 | 333 | cout << "Done." << endl; 334 | } 335 | --------------------------------------------------------------------------------
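All three exporters share the same rare-id bucketing idiom for hashed id features (`count < threshold ? count : id + offset` in export-bin-data-p1/f2, and the `min_count`/`sum_count` variant in export-bin-data-f4). A minimal standalone sketch of the f4 variant, with hypothetical ids, counts and thresholds, just to make the resulting feature values concrete:

#include <cstdint>
#include <iostream>

// Mirrors the cnt struct in export-bin-data-f4.cpp: per-id occurrence counts.
struct cnt {
    uint32_t min_count;  // min of train and test occurrences
    uint32_t sum_count;  // train + test occurrences
};

// f4-style encoding: ids that are rare in either split are represented by their
// total count (so rare ids share buckets by frequency), while frequent ids keep
// their own value, shifted by an offset so it cannot collide with small counts.
uint32_t encode_id(uint32_t id, const cnt & c, uint32_t threshold, uint32_t offset) {
    return c.min_count < threshold ? c.sum_count : id + offset;
}

int main() {
    cnt rare   {2, 5};     // hypothetical id seen 2 and 3 times in train/test
    cnt common {40, 1000}; // hypothetical id seen frequently in both splits

    std::cout << encode_id(1234, rare,   5, 100) << "\n"; // 5: collapses into its frequency bucket
    std::cout << encode_id(777,  common, 5, 100) << "\n"; // 877: keeps its (offset) identity
}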