├── images
    ├── dataset.png
    ├── cpu_load.local.png
    ├── log_loss.local.png
    ├── roc_auc.local.png
    ├── time_vs_cores.png
    ├── log_loss.cluster.png
    ├── roc_auc.cluster.png
    ├── train_time.local.png
    ├── train_time.cluster.png
    ├── log_loss.lr_hash_size.png
    ├── log_loss.why_optimize.png
    ├── maximum_memory.local.png
    ├── roc_auc.lr_hash_size.png
    ├── roc_auc.why_optimize.png
    ├── log_loss.cluster_selection.png
    ├── log_loss.local_and_cluster.png
    ├── roc_auc.cluster_selection.png
    ├── roc_auc.local_and_cluster.png
    ├── train_time.cluster_selection.png
    └── train_time.local_and_cluster.png
├── scripts
    ├── running
    │   ├── vw.conf
    │   ├── xgb.conf
    │   ├── build_plots.sh
    │   ├── run.sh
    │   ├── xgb.sh
    │   ├── vw.sh
    │   ├── xgb.ooc.sh
    │   ├── measure.py
    │   └── plots.py
    └── conversion
    │   ├── criteoToLibsvm.scala
    │   ├── libsvmToVw.scala
    │   └── sampleLibsvm.scala
├── results
    ├── metrics.old.tsv
    ├── metrics.lr_hash_size.tsv
    ├── metrics.cluster.tsv
    ├── metrics.selection.tsv
    ├── vw_xgb.tsv
    └── metrics.tsv
├── README.md
└── notebooks
    ├── experiment_spark_rf.ipynb
    ├── experiment_spark_lr.ipynb
    └── experiment_local.ipynb


/images/dataset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rambler-digital-solutions/criteo-1tb-benchmark/HEAD/images/dataset.png


--------------------------------------------------------------------------------
/images/cpu_load.local.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rambler-digital-solutions/criteo-1tb-benchmark/HEAD/images/cpu_load.local.png


--------------------------------------------------------------------------------
/images/log_loss.local.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rambler-digital-solutions/criteo-1tb-benchmark/HEAD/images/log_loss.local.png


--------------------------------------------------------------------------------
/images/roc_auc.local.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rambler-digital-solutions/criteo-1tb-benchmark/HEAD/images/roc_auc.local.png


--------------------------------------------------------------------------------
/images/time_vs_cores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rambler-digital-solutions/criteo-1tb-benchmark/HEAD/images/time_vs_cores.png


--------------------------------------------------------------------------------
/images/log_loss.cluster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rambler-digital-solutions/criteo-1tb-benchmark/HEAD/images/log_loss.cluster.png


--------------------------------------------------------------------------------
/images/roc_auc.cluster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rambler-digital-solutions/criteo-1tb-benchmark/HEAD/images/roc_auc.cluster.png


--------------------------------------------------------------------------------
/images/train_time.local.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rambler-digital-solutions/criteo-1tb-benchmark/HEAD/images/train_time.local.png


--------------------------------------------------------------------------------
/images/train_time.cluster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rambler-digital-solutions/criteo-1tb-benchmark/HEAD/images/train_time.cluster.png


--------------------------------------------------------------------------------
/images/log_loss.lr_hash_size.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rambler-digital-solutions/criteo-1tb-benchmark/HEAD/images/log_loss.lr_hash_size.png


--------------------------------------------------------------------------------
/images/log_loss.why_optimize.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rambler-digital-solutions/criteo-1tb-benchmark/HEAD/images/log_loss.why_optimize.png


--------------------------------------------------------------------------------
/images/maximum_memory.local.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rambler-digital-solutions/criteo-1tb-benchmark/HEAD/images/maximum_memory.local.png


--------------------------------------------------------------------------------
/images/roc_auc.lr_hash_size.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rambler-digital-solutions/criteo-1tb-benchmark/HEAD/images/roc_auc.lr_hash_size.png


--------------------------------------------------------------------------------
/images/roc_auc.why_optimize.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rambler-digital-solutions/criteo-1tb-benchmark/HEAD/images/roc_auc.why_optimize.png


--------------------------------------------------------------------------------
/images/log_loss.cluster_selection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rambler-digital-solutions/criteo-1tb-benchmark/HEAD/images/log_loss.cluster_selection.png


--------------------------------------------------------------------------------
/images/log_loss.local_and_cluster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rambler-digital-solutions/criteo-1tb-benchmark/HEAD/images/log_loss.local_and_cluster.png


--------------------------------------------------------------------------------
/images/roc_auc.cluster_selection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rambler-digital-solutions/criteo-1tb-benchmark/HEAD/images/roc_auc.cluster_selection.png


--------------------------------------------------------------------------------
/images/roc_auc.local_and_cluster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rambler-digital-solutions/criteo-1tb-benchmark/HEAD/images/roc_auc.local_and_cluster.png


--------------------------------------------------------------------------------
/images/train_time.cluster_selection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rambler-digital-solutions/criteo-1tb-benchmark/HEAD/images/train_time.cluster_selection.png


--------------------------------------------------------------------------------
/images/train_time.local_and_cluster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rambler-digital-solutions/criteo-1tb-benchmark/HEAD/images/train_time.local_and_cluster.png


--------------------------------------------------------------------------------
/scripts/running/vw.conf:
--------------------------------------------------------------------------------
1 | -b          29
2 | -l          0.3
3 | --initial_t 1
4 | --decay_learning_rate   0.5
5 | --power_t   0.5
6 | --l1        1e-15
7 | --l2        0
8 | 


--------------------------------------------------------------------------------
/scripts/running/xgb.conf:
--------------------------------------------------------------------------------
 1 | booster = gbtree
 2 | objective = binary:logistic
 3 | nthread = 12
 4 | eval_metric = logloss
 5 | 
 6 | max_depth =  7
 7 | num_round =  200
 8 | eta =  0.2
 9 | gamma =  0.4
10 | 
11 | subsample =  0.8
12 | colsample_bytree =  0.8
13 | min_child_weight =  20
14 | 
15 | alpha =  3
16 | lambda =  100
17 | 


--------------------------------------------------------------------------------
/scripts/running/build_plots.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | python plots.py -i metrics.old.tsv -n 'why_optimize' -c 'b,k,y'
4 | python plots.py -i metrics.selection.tsv -n 'cluster_selection' -c 'k,r,g,b'
5 | python plots.py -i vw_xgb.tsv -n 'local' -p -c 'b,k,y'
6 | python plots.py -i metrics.lr_hash_size.tsv -n 'lr_hash_size' -c 'r,g,b,k,y'
7 | python plots.py -i metrics.cluster.tsv -n 'cluster' -c 'r,g,b,k,y'
8 | python plots.py -i metrics.tsv -n 'local_and_cluster' -c 'r,g,b,k,y' -l '0.13,0.15'
9 | 


--------------------------------------------------------------------------------
/results/metrics.old.tsv:
--------------------------------------------------------------------------------
 1 | "Engine"	"Train size"	"ROC AUC"	"Log loss"	"Train time"
 2 | "vw"	10000	0.609	0.1501	1
 3 | "vw"	30000	0.632	0.1492	2
 4 | "vw"	100000	0.653	0.1461	3
 5 | "vw"	300000	0.679	0.1429	4
 6 | "vw"	1000000	0.692	0.1413	5
 7 | "vw"	3000000	0.707	0.1387	6
 8 | "vw"	10000000	0.72	0.1369	7
 9 | "xgb"	10000	0.65	0.1472	8
10 | "xgb"	30000	0.67	0.1451	9
11 | "xgb"	100000	0.683	0.1442	10
12 | "xgb"	300000	0.694	0.14355	11
13 | "xgb"	1000000	0.695	0.14335	12
14 | "xgb"	3000000	0.696	0.14325	13
15 | "xgb"	10000000	0.6965	0.14323	14
16 | "xgb.ooc"	10000	0.642	0.1476	15
17 | "xgb.ooc"	30000	0.672	0.1446	16
18 | "xgb.ooc"	100000	0.685	0.144	17
19 | "xgb.ooc"	300000	0.694	0.1435	18
20 | "xgb.ooc"	1000000	0.695	0.1433	19
21 | "xgb.ooc"	3000000	0.696	0.1433	20
22 | "xgb.ooc"	10000000	0.6964	0.143231	21
23 | 


--------------------------------------------------------------------------------
/scripts/running/run.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | 
 4 | DATA_PREFIX="./data"
 5 | 
 6 | test_vw="${DATA_PREFIX}/data.test.1kk.vw"
 7 | test_xgb="${DATA_PREFIX}/data.test.1kk.libsvm"
 8 | 
 9 | for train_num in 10k 30k 100k 300k 1kk 3kk 10kk 30kk 100kk 300kk 1kkk 3kkk; do
10 |     echo " * * * Train size ${train_num} lines * * *"
11 | 
12 |     train_vw="${DATA_PREFIX}/data.train.${train_num}.vw"
13 |     train_xgb="${DATA_PREFIX}/data.train.${train_num}.libsvm"
14 | 
15 |     echo "Running VW with train ${train_vw} and ${test_vw}"
16 |     ./vw.sh "${train_vw}" "${test_vw}"
17 | 
18 |     echo "Running XGBoost with train ${train_xgb} and ${test_xgb}"
19 |     ./xgb.sh "${train_xgb}" "${test_xgb}"
20 | 
21 |     echo "Running XGBoost (out-of-core) with train ${train_xgb} and ${test_xgb}"
22 |     ./xgb.ooc.sh "${train_xgb}" "${test_xgb}"
23 | done
24 | 


--------------------------------------------------------------------------------
/scripts/running/xgb.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | 
 4 | TRAIN="${1}"
 5 | TEST="${2}"
 6 | 
 7 | if [[ "${TRAIN}" == "" || "${TEST}" == "" ]]; then
 8 |     echo "Usage: $0 train test"
 9 |     exit 1
10 | fi
11 | 
12 | TIME="${TRAIN}.time"
13 | MODEL="${TRAIN}.model"
14 | PREDICTIONS="${TEST}.predictions"
15 | 
16 | 
17 | /usr/local/bin/time -v --output="${TIME}" \
18 |     xgboost xgb.conf data="${TRAIN}" model_out="${MODEL}"
19 | 
20 | xgboost xgb.conf task=pred test:data="${TEST}" model_in="${MODEL}" name_pred="${PREDICTIONS}"
21 | 
22 | 
23 | METRICS="metrics.tsv"
24 | if ! [[ -e ${METRICS} ]]; then
25 |     echo -e "Engine\tTrain size\tROC AUC\tLog loss\tTrain time\tMaximum memory\tCPU load" | tee "${METRICS}"
26 | fi
27 | 
28 | python measure.py "xgb" "${TRAIN}" "${TEST}" | tee -a "${METRICS}"
29 | 
30 | rm "${TIME}" "${PREDICTIONS}" "${TRAIN}.buffer" "${TEST}.buffer"
31 | 


--------------------------------------------------------------------------------
/scripts/running/vw.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | 
 4 | TRAIN="${1}"
 5 | TEST="${2}"
 6 | 
 7 | if [[ "${TRAIN}" == "" || "${TEST}" == "" ]]; then
 8 |     echo "Usage: $0 train test"
 9 |     exit 1
10 | fi
11 | 
12 | TIME="${TRAIN}.time"
13 | MODEL="${TRAIN}.model"
14 | PREDICTIONS="${TEST}.predictions"
15 | 
16 | 
17 | VW_OPTS=($(cat vw.conf))
18 | echo VW_OPTS = "${VW_OPTS[@]}"
19 | 
20 | /usr/local/bin/time -v --output="${TIME}" \
21 |     vw83 --link=logistic --loss_function=logistic -d "${TRAIN}" -f "${MODEL}" "${VW_OPTS[@]}"
22 | 
23 | vw83 -i "${MODEL}" --loss_function=logistic -t -d "${TEST}" -p "${PREDICTIONS}"
24 | 
25 | 
26 | METRICS="metrics.tsv"
27 | if ! [[ -e ${METRICS} ]]; then
28 |     echo -e "Engine\tTrain size\tROC AUC\tLog loss\tTrain time\tMaximum memory\tCPU load" | tee "${METRICS}"
29 | fi
30 | 
31 | python measure.py "vw" "${TRAIN}" "${TEST}" | tee -a "${METRICS}"
32 | 
33 | rm "${TIME}" "${PREDICTIONS}"
34 | 


--------------------------------------------------------------------------------
/scripts/running/xgb.ooc.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | 
 4 | TRAIN="${1}"
 5 | TEST="${2}"
 6 | 
 7 | TRAIN_OOC="${TRAIN}#${TRAIN}.cache"
 8 | 
 9 | if [[ "${TRAIN}" == "" || "${TEST}" == "" ]]; then
10 |     echo "Usage: $0 train test"
11 |     exit 1
12 | fi
13 | 
14 | TIME="${TRAIN}.time"
15 | MODEL="${TRAIN}.ooc.model"
16 | PREDICTIONS="${TEST}.predictions"
17 | 
18 | 
19 | /usr/local/bin/time -v --output="${TIME}" \
20 |     xgboost xgb.conf data="${TRAIN_OOC}" model_out="${MODEL}"
21 | 
22 | xgboost xgb.conf task=pred test:data="${TEST}" model_in="${MODEL}" name_pred="${PREDICTIONS}"
23 | 
24 | 
25 | METRICS="metrics.tsv"
26 | if ! [[ -e ${METRICS} ]]; then
27 |     echo -e "Engine\tTrain size\tROC AUC\tLog loss\tTrain time\tMaximum memory\tCPU load" | tee "${METRICS}"
28 | fi
29 | 
30 | python measure.py "xgb.ooc" "${TRAIN}" "${TEST}" | tee -a "${METRICS}"
31 | 
32 | rm "${TIME}" "${PREDICTIONS}" "${TRAIN}.cache"* "${TEST}.buffer"
33 | 


--------------------------------------------------------------------------------
/scripts/conversion/criteoToLibsvm.scala:
--------------------------------------------------------------------------------
 1 | def rowToLibsvm(row: org.apache.spark.sql.Row): String = {
 2 |   0 until row.length flatMap {
 3 |     case 0 => Some(row(0).toString)
 4 |     case i if row(i) == null => None
 5 |     case i => Some(i.toString + ':' + (if (i < 14) row(i) else java.lang.Long.parseLong(row(i).toString, 16)).toString)
 6 |   } mkString " "
 7 | }
 8 | 
 9 | def readDataFrame(path: String): org.apache.spark.sql.DataFrame = {
10 |   spark.read.option("header", "false").option("inferSchema", "true").option("delimiter", "\t").csv(path)
11 | }
12 | 
13 | def writeDataFrame(df: org.apache.spark.sql.DataFrame, path: String): Unit = {
14 |   df.rdd.map(rowToLibsvm).saveAsTextFile(path)
15 | }
16 | 
17 | def processDay(day: Int): Unit = {
18 |   println(s"Processing of the day $day started")
19 |   val inputPath = s"criteo_1tb/plain/day_$day"
20 |   println(s"Loading data from $inputPath")
21 |   val df = readDataFrame(inputPath)
22 |   val outputPath = s"criteo_1tb/libsvm/day_$day.libsvm"
23 |   println(s"Saving data to $outputPath")
24 |   writeDataFrame(df, outputPath)
25 |   println(s"Processing of the day $day finished")
26 | }
27 | 
28 | 
29 | println("Do '0 to 23 foreach processDay' to convert all data to LibSVM format.")
30 | 


--------------------------------------------------------------------------------
/results/metrics.lr_hash_size.tsv:
--------------------------------------------------------------------------------
 1 | "Engine"	"Train size"	"ROC AUC"	"Log loss"	"Train time"
 2 | "lr, 100k hashes"	10000	0.577664202812	0.191408441626	50.9623498917
 3 | "lr, 100k hashes"	30000	0.593790859746	0.181353099861	87.1837468147
 4 | "lr, 100k hashes"	100000	0.612247160316	0.169781474994	148.152549982
 5 | "lr, 100k hashes"	300000	0.644818188231	0.155259096662	138.703317881
 6 | "lr, 100k hashes"	1000000	0.684101056031	0.142522071064	154.500109911
 7 | "lr, 100k hashes"	3000000	0.720519424179	0.135748217546	140.207423925
 8 | "lr, 100k hashes"	10000000	0.742150224051	0.133061793307	175.111939907
 9 | "lr, 100k hashes"	30000000	0.750851634783	0.132163191494	205.059295177
10 | "lr, 100k hashes"	100000000	0.754105416714	0.131909898252	429.173339128
11 | "lr, 30k hashes"	10000	0.575027699783	0.188237385947	34.5243330002
12 | "lr, 30k hashes"	30000	0.594786944726	0.174772368249	47.6537959576
13 | "lr, 30k hashes"	100000	0.625299136967	0.15797820297	94.8742370605
14 | "lr, 30k hashes"	300000	0.669467206714	0.144246165797	106.976841927
15 | "lr, 30k hashes"	1000000	0.71062957893	0.136591896049	106.445657969
16 | "lr, 30k hashes"	3000000	0.732331332028	0.134246693916	117.792547941
17 | "lr, 30k hashes"	10000000	0.742112278546	0.133347719357	139.843713999
18 | "lr, 30k hashes"	30000000	0.744923892422	0.133133669898	171.719541073
19 | "lr, 30k hashes"	100000000	0.7463477576	0.133017218565	478.496304035
20 | 


--------------------------------------------------------------------------------
/results/metrics.cluster.tsv:
--------------------------------------------------------------------------------
 1 | "Engine"	"Train size"	"ROC AUC"	"Log loss"	"Train time"
 2 | "lr"	10000	0.577664202812	0.191408441626	50.9623498917
 3 | "lr"	30000	0.593790859746	0.181353099861	87.1837468147
 4 | "lr"	100000	0.612247160316	0.169781474994	148.152549982
 5 | "lr"	300000	0.644818188231	0.155259096662	138.703317881
 6 | "lr"	1000000	0.684101056031	0.142522071064	154.500109911
 7 | "lr"	3000000	0.720519424179	0.135748217546	140.207423925
 8 | "lr"	10000000	0.742150224051	0.133061793307	175.111939907
 9 | "lr"	30000000	0.750851634783	0.132163191494	205.059295177
10 | "lr"	100000000	0.754105416714	0.131909898252	429.173339128
11 | "lr"	300000000	0.755291615978	0.131809908334	655.645493984
12 | "lr"	1000000000	0.755555957222	0.131795697268	5133.28028798
13 | "lr"	3000000000	0.755575083657	0.131792462522	5726.66465712
14 | "rf"	10000	0.666209405818	0.141072504942	162.25733614
15 | "rf"	30000	0.686325476035	0.139292662004	123.259371996
16 | "rf"	100000	0.698154160624	0.138366116883	67.557528019
17 | "rf"	300000	0.704939649921	0.137753153698	78.4612021446
18 | "rf"	1000000	0.7072363872	0.137494568389	105.36420989
19 | "rf"	3000000	0.707892500228	0.13742745787	154.24793601
20 | "rf"	10000000	0.708613003835	0.13734362384	518.322438955
21 | "rf"	30000000	0.708447321414	0.137352474673	665.707078934
22 | "rf"	100000000	0.708270614391	0.137371388594	1821.9798851
23 | "rf"	300000000	0.708382095174	0.137347952598	3362.20391607
24 | 


--------------------------------------------------------------------------------
/results/metrics.selection.tsv:
--------------------------------------------------------------------------------
 1 | "Engine"	"Train size"	"ROC AUC"	"Log loss"	"Train time"
 2 | lr	10000	0.578177884844	0.178971884565	29.5085098743
 3 | rf	10000	0.621504103494	0.144342437741	76.4821279049
 4 | tree	10000	0.600422662988	0.357273201261	11.1679940224
 5 | bayes	10000	0.509993205115	0.818047776891	1.48214101791
 6 | lr	30000	0.606712654428	0.16236564437	35.3462071419
 7 | rf	30000	0.643115035038	0.142662388067	92.7048139572
 8 | tree	30000	0.617018579741	0.165522218545	11.9768800735
 9 | bayes	30000	0.565664291131	0.473385707556	0.820657014847
10 | lr	100000	0.652313535925	0.145698453913	40.4066979885
11 | rf	100000	0.663411563927	0.14115005143	144.407087088
12 | tree	100000	0.638114905876	0.149601338341	12.5886089802
13 | bayes	100000	0.637296917738	0.373915545595	0.909769058228
14 | lr	300000	0.694017701134	0.138406933493	41.6715488434
15 | rf	300000	0.674058380306	0.140104620859	244.973959923
16 | tree	300000	0.641005555466	0.142781786127	16.7563710213
17 | bayes	300000	0.672807318246	0.418186728992	1.29778695107
18 | lr	1000000	0.719469466249	0.135623586024	46.4208109379
19 | rf	1000000	0.677311170308	0.139673520936	586.515081882
20 | tree	1000000	0.64388298082	0.141953441838	29.7467141151
21 | bayes	1000000	0.690009509821	0.437564216119	1.74085712433
22 | lr	3000000	0.729501417959	0.134854378505	52.4596869946
23 | rf	3000000	0.680087853763	0.13939451653	1414.78307986
24 | tree	3000000	0.642813675941	0.141866601496	65.7208981514
25 | bayes	3000000	0.697140532878	0.450650398396	2.11705684662
26 | 


--------------------------------------------------------------------------------
/scripts/running/measure.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from sklearn.metrics import (
 3 |     auc,
 4 |     log_loss,
 5 |     roc_curve,
 6 | )
 7 | 
 8 | 
 9 | engine = sys.argv[1]
10 | train_file = sys.argv[2]
11 | test_file = sys.argv[3]
12 | 
13 | scores_file = test_file + '.predictions'
14 | time_file = train_file + '.time'
15 | 
16 | 
17 | def get_last_in_line(s):
18 |     return s.rstrip().split( )[-1]
19 | 
20 | def parse_elapsed_time(s):
21 |     return reduce(lambda a, b: a * 60 + b, map(float, get_last_in_line(s).split(':')))
22 | 
23 | def parse_max_memory(s):
24 |     return int(get_last_in_line(s)) * 1024
25 | 
26 | def parse_cpu(s):
27 |     return float(get_last_in_line(s).rstrip('%')) / 100
28 | 
29 | 
30 | elapsed = -1
31 | memory = -1
32 | cpu = -1
33 | 
34 | with open(time_file, 'rb') as f:
35 |     for line in f:
36 |         if 'Elapsed (wall clock) time' in line:
37 |             elapsed = parse_elapsed_time(line)
38 |         elif 'Maximum resident set size' in line:
39 |             memory = parse_max_memory(line)
40 |         elif 'Percent of CPU' in line:
41 |             cpu = parse_cpu(line)
42 | 
43 | 
44 | with open(test_file, 'rb') as f:
45 |     labels = [line.rstrip().split(' ')[0] == '1' for line in f]
46 | 
47 | with open(scores_file, 'rb') as f:
48 |     scores = [float(line.rstrip().split(' ')[0]) for line in f]
49 | 
50 | fpr, tpr, _ = roc_curve(labels, scores)
51 | roc_auc = auc(fpr, tpr)
52 | ll = log_loss(labels, scores)
53 | 
54 | 
55 | try:
56 |     train_size = int(train_file.split('/')[-1].split('.')[2].replace('k', '000'))
57 | except:
58 |     train_size = 0
59 | 
60 | print '\t'.join(map(str, [engine, train_size, roc_auc, ll, elapsed, memory, cpu]))
61 | 


--------------------------------------------------------------------------------
/scripts/conversion/libsvmToVw.scala:
--------------------------------------------------------------------------------
 1 | import java.security.MessageDigest
 2 | 
 3 | import org.apache.hadoop.fs.{FileSystem, Path => HadoopPath}
 4 | import org.apache.spark.rdd.RDD
 5 | 
 6 | 
 7 | def convertLine(row: String): String = {
 8 |   val elements = row.split(' ')
 9 |   val target  = elements.head.toInt * 2 - 1
10 |   (target.toString + " |") +: elements.tail.map(e => {
11 |     val es = e.split(':')
12 |     val index = es(0).toInt
13 |     if (index < 14) e
14 |     else es.mkString("_")
15 |   }) mkString " "
16 | }
17 | 
18 | def md5[T](s: T) = {
19 |   MessageDigest
20 |     .getInstance("MD5")
21 |     .digest(s.toString.getBytes)
22 |     .map("%02x".format(_))
23 |     .mkString
24 | }
25 | 
26 | def convertFile(srcPath: String, dstPath: String) = {
27 |   sc
28 |     .textFile(srcPath)
29 |     .map(convertLine)
30 |     .zipWithIndex
31 |     .sortBy(z => md5(z._2))
32 |     .map(_._1)
33 |     .saveAsTextFile(dstPath)
34 | }
35 | 
36 | val names = List("test", "train")
37 | 
38 | def powTenAndTriple(n: Int): List[Long] = { val v = scala.math.pow(10, n).longValue; List(v, 3 * v) }
39 | val nums = (4 to 9 flatMap powTenAndTriple).toList
40 | def numToString(num: Long): String = num.toString.reverse.replaceAll("000", "k").reverse
41 | 
42 | val fs = FileSystem.get(sc.hadoopConfiguration)
43 | def fileExists(path: String): Boolean = fs.exists(new HadoopPath(path))
44 | def removeFile(path: String): Unit = fs.delete(new HadoopPath(path))
45 | 
46 | def doDataConversion = {
47 |   for {
48 |     num <- nums
49 |     name <- names
50 |     numName = numToString(num)
51 |     srcPath = s"criteo/libsvm/$name/$numName"
52 |     dstPath = s"criteo/vw/$name/$numName"
53 |     if fileExists(srcPath)
54 |     if !fileExists(dstPath + "/_SUCCESS")
55 |   } {
56 |     println(s"$srcPath -> $dstPath")
57 |     removeFile(dstPath)
58 |     convertFile(srcPath, dstPath)
59 |   }
60 | }
61 | 
62 | 
63 | println("Use 'doDataConversion' to start data conversion.")
64 | 


--------------------------------------------------------------------------------
/results/vw_xgb.tsv:
--------------------------------------------------------------------------------
 1 | "Engine"	"Train size"	"ROC AUC"	"Log loss"	"Train time"	"Maximum memory"	"CPU load"
 2 | "vw"	10000	0.617455161515	0.144524706901	15.17	8615268352	1.0
 3 | "vw"	30000	0.647287001387	0.142233353612	13.9	8615268352	0.99
 4 | "vw"	100000	0.683247769366	0.139338887152	16.62	8615727104	1.01
 5 | "vw"	300000	0.70142316924	0.13745122263	17.75	8616218624	1.06
 6 | "vw"	1000000	0.716287545509	0.13588863014	23.83	8618024960	1.2
 7 | "vw"	3000000	0.727931938776	0.134788994538	40.1	8615903232	1.34
 8 | "vw"	10000000	0.739548558371	0.133521239905	91.08	8617746432	1.48
 9 | "vw"	30000000	0.746322264127	0.132553436306	242.17	8616017920	1.57
10 | "vw"	100000000	0.752882908682	0.131795464547	718.71	8617304064	1.62
11 | "vw"	300000000	0.754807741333	0.131569260518	1989.99	8616005632	1.67
12 | "vw"	1000000000	0.757005674498	0.131394975718	6431.0	8617570304	1.71
13 | "vw"	3000000000	0.756123547995	0.132255694498	17798.0	8617238528	1.78
14 | "xgb"	10000	0.663509743693	0.142349739278	1.56	18403328	8.4
15 | "xgb"	30000	0.682316321881	0.140907061928	4.58	33456128	10.63
16 | "xgb"	100000	0.696964467698	0.139122124164	14.86	71958528	12.06
17 | "xgb"	300000	0.714743656593	0.136567639058	45.52	193576960	13.07
18 | "xgb"	1000000	0.733393270406	0.134198458139	179.49	617226240	13.2
19 | "xgb"	3000000	0.744253560697	0.13280500935	610.78	1828356096	14.27
20 | "xgb"	10000000	0.750706783469	0.131929939622	2541.02	6056378368	14.12
21 | "xgb"	30000000	0.753289508009	0.131529827949	9065	18139652096	13.51
22 | "xgb"	100000000	0.755224491108	0.131270032458	37105	60420395008	10.78
23 | "xgb.ooc"	10000	0.663936721734	0.142204298072	6.52	31752192	3.71
24 | "xgb.ooc"	30000	0.684182538527	0.140580030251	24.2	48721920	3.68
25 | "xgb.ooc"	100000	0.697765852364	0.138864711732	87.48	109694976	3.72
26 | "xgb.ooc"	300000	0.714633362901	0.136556600616	264.64	279904256	4.09
27 | "xgb.ooc"	1000000	0.731054714952	0.134485312106	944.91	852369408	4.98
28 | "xgb.ooc"	3000000	0.742823599115	0.132987550283	2517.86	1456574464	5.79
29 | "xgb.ooc"	10000000	0.750953801983	0.131909674184	5874	1921560576	8.2
30 | "xgb.ooc"	30000000	0.753881106383	0.131430669756	14520	2562379776	9.95
31 | "xgb.ooc"	100000000	0.755746234321	0.131237777868	44875	4807467008	8.79
32 | 


--------------------------------------------------------------------------------
/scripts/conversion/sampleLibsvm.scala:
--------------------------------------------------------------------------------
 1 | type Data = (org.apache.spark.rdd.RDD[String], Long)
 2 | 
 3 | def sample(data: Data, n: Long): Data = {
 4 |   val rdd = data._1
 5 |   val count = data._2
 6 |   val ratio = n.toDouble / count
 7 |   val sampledRdd = rdd.sample(false, scala.math.min(1.0, ratio * 1.01), scala.util.Random.nextLong)
 8 |   val exactSampledRdd = sampledRdd.zipWithIndex.filter { case (_, i) => i < n } map (_._1)
 9 |   val exactCount = exactSampledRdd.count
10 |   (exactSampledRdd, exactCount)
11 | }
12 | 
13 | def load(path: String): (Data, Data) = {
14 |   val filesAndStats = 0 to 23 map {
15 |     case i => {
16 |       val dayPath = s"$path/day_${i}.*"
17 |       val rdd = sc.textFile(dayPath)
18 |       val count = rdd.count
19 |       (rdd, count)
20 |     }
21 |   }
22 |   val test = filesAndStats.last
23 |   val train = filesAndStats.init.reduce((a, b) => (a._1 union b._1, a._2 + b._2))
24 |   (train, test)
25 | }
26 | 
27 | val fs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)
28 | 
29 | val numberOfParts = 1024
30 | 
31 | def writeSamples(data: Data, samples: List[Long], path: String, ext: String): Unit = samples foreach {
32 |   case n =>
33 |     val name = n.toString.reverse.replaceAll("000", "k").reverse
34 |     val writePath = s"$path/$name.$ext"
35 |     val hadoopSuccessPath = new org.apache.hadoop.fs.Path(writePath + "/_SUCCESS")
36 |     if (fs.exists(hadoopSuccessPath)) {
37 |       println(s"Data was already successfully written to $writePath, skipping.")
38 |     } else {
39 |       val hadoopPath = new org.apache.hadoop.fs.Path(writePath)
40 |       println(s"Removing $writePath.")
41 |       fs.delete(hadoopPath)
42 |       println("Sampling data")
43 |       val sampledData = sample(data, n)
44 |       println(s"Writing ${sampledData._2} lines to $writePath.")
45 |       sampledData._1.coalesce(numberOfParts).saveAsTextFile(writePath)
46 |     }
47 | }
48 | 
49 | def powTenAndTriple(n: Int): List[Long] = { val v = scala.math.pow(10, n).longValue; List(v, 3 * v) }
50 | 
51 | val testSamples = List(1000000l)
52 | val trainSamples = (4 to 9 flatMap powTenAndTriple).toList
53 | 
54 | def processDataPersist(what: String): Unit = {
55 |   println(s"Working with $what.")
56 | 
57 |   val dataPath = s"criteo_1tb/$what"
58 |   println(s"Loading data from $dataPath.")
59 |   val (train, test) = load(dataPath)
60 |   println("Data loaded.")
61 | 
62 |   def processDataSet(name: String, data: Data, samples: List[Long]): Unit = {
63 |     println(s"Sampling $name to ${samples.mkString("[", ", ", "]")} lines.")
64 |     writeSamples(data, samples, s"$dataPath/$name", what)
65 |   }
66 | 
67 |   test._1.persist
68 |   processDataSet("test", test, testSamples)
69 |   test._1.unpersist(true)
70 | 
71 |   train._1.persist
72 |   processDataSet("train", train, trainSamples)
73 |   train._1.unpersist(true)
74 | 
75 |   println(s"Done with $what.")
76 | }
77 | 
78 | def doDataPreparationLibSVM = {
79 |   processDataPersist("libsvm")
80 | }
81 | 
82 | println("Use 'doDataPreparationLibSVM' to start data preparation.")
83 | 


--------------------------------------------------------------------------------
/results/metrics.tsv:
--------------------------------------------------------------------------------
 1 | "Engine"	"Train size"	"ROC AUC"	"Log loss"	"Train time"
 2 | "vw"	10000	0.617455161515	0.144524706901	15.17
 3 | "vw"	30000	0.647287001387	0.142233353612	13.9
 4 | "vw"	100000	0.683247769366	0.139338887152	16.62
 5 | "vw"	300000	0.70142316924	0.13745122263	17.75
 6 | "vw"	1000000	0.716287545509	0.13588863014	23.83
 7 | "vw"	3000000	0.727931938776	0.134788994538	40.1
 8 | "vw"	10000000	0.739548558371	0.133521239905	91.08
 9 | "vw"	30000000	0.746322264127	0.132553436306	242.17
10 | "vw"	100000000	0.752882908682	0.131795464547	718.71
11 | "vw"	300000000	0.754807741333	0.131569260518	1989.99
12 | "vw"	1000000000	0.757005674498	0.131394975718	6431.0
13 | "vw"	3000000000	0.756123547995	0.132255694498	17798.0
14 | "xgb"	10000	0.663509743693	0.142349739278	1.56
15 | "xgb"	30000	0.682316321881	0.140907061928	4.58
16 | "xgb"	100000	0.696964467698	0.139122124164	14.86
17 | "xgb"	300000	0.714743656593	0.136567639058	45.52
18 | "xgb"	1000000	0.733393270406	0.134198458139	179.49
19 | "xgb"	3000000	0.744253560697	0.13280500935	610.78
20 | "xgb"	10000000	0.750706783469	0.131929939622	2541.02
21 | "xgb"	30000000	0.753289508009	0.131529827949	9065
22 | "xgb"	100000000	0.755224491108	0.131270032458	37105
23 | "xgb.ooc"	10000	0.663936721734	0.142204298072	6.52
24 | "xgb.ooc"	30000	0.684182538527	0.140580030251	24.2
25 | "xgb.ooc"	100000	0.697765852364	0.138864711732	87.48
26 | "xgb.ooc"	300000	0.714633362901	0.136556600616	264.64
27 | "xgb.ooc"	1000000	0.731054714952	0.134485312106	944.91
28 | "xgb.ooc"	3000000	0.742823599115	0.132987550283	2517.86
29 | "xgb.ooc"	10000000	0.750953801983	0.131909674184	5874
30 | "xgb.ooc"	30000000	0.753881106383	0.131430669756	14520
31 | "xgb.ooc"	100000000	0.755746234321	0.131237777868	44875
32 | "lr"	10000	0.577664202812	0.191408441626	50.9623498917
33 | "lr"	30000	0.593790859746	0.181353099861	87.1837468147
34 | "lr"	100000	0.612247160316	0.169781474994	148.152549982
35 | "lr"	300000	0.644818188231	0.155259096662	138.703317881
36 | "lr"	1000000	0.684101056031	0.142522071064	154.500109911
37 | "lr"	3000000	0.720519424179	0.135748217546	140.207423925
38 | "lr"	10000000	0.742150224051	0.133061793307	175.111939907
39 | "lr"	30000000	0.750851634783	0.132163191494	205.059295177
40 | "lr"	100000000	0.754105416714	0.131909898252	429.173339128
41 | "lr"	300000000	0.755291615978	0.131809908334	655.645493984
42 | "lr"	1000000000	0.755555957222	0.131795697268	5133.28028798
43 | "lr"	3000000000	0.755575083657	0.131792462522	5726.66465712
44 | "rf"	10000	0.666209405818	0.141072504942	162.25733614
45 | "rf"	30000	0.686325476035	0.139292662004	123.259371996
46 | "rf"	100000	0.698154160624	0.138366116883	67.557528019
47 | "rf"	300000	0.704939649921	0.137753153698	78.4612021446
48 | "rf"	1000000	0.7072363872	0.137494568389	105.36420989
49 | "rf"	3000000	0.707892500228	0.13742745787	154.24793601
50 | "rf"	10000000	0.708613003835	0.13734362384	518.322438955
51 | "rf"	30000000	0.708447321414	0.137352474673	665.707078934
52 | "rf"	100000000	0.708270614391	0.137371388594	1821.9798851
53 | "rf"	300000000	0.708382095174	0.137347952598	3362.20391607
54 | 


--------------------------------------------------------------------------------
/scripts/running/plots.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | 
  3 | import argparse
  4 | import re
  5 | 
  6 | import cycler
  7 | import pandas
  8 | 
  9 | from matplotlib import pyplot
 10 | 
 11 | 
 12 | def extract_data_for_plotting(df, what):
 13 |     return reduce(
 14 |         lambda left, right: pandas.merge(
 15 |             left,
 16 |             right,
 17 |             how='outer',
 18 |             on='Train size',
 19 |         ),
 20 |         map(
 21 |             lambda name: (
 22 |                 df[df.Engine == name][['Train size', what]]
 23 |                 .rename(columns={what: name})
 24 |             ),
 25 |             df.Engine.unique(),
 26 |         ),
 27 |     )
 28 | 
 29 | 
 30 | def plot_stuff(df, what, ylabel=None, **kwargs):
 31 |     data = extract_data_for_plotting(df, what).set_index('Train size')
 32 |     ax = data.plot(
 33 |         figsize=(6, 6),
 34 |         title=what,
 35 |         grid=True,
 36 |         linewidth=2.0,
 37 |         marker='o',
 38 |         **kwargs
 39 |     )
 40 |     ax.legend(loc='best')
 41 |     if ylabel is not None:
 42 |         ax.set_ylabel(ylabel)
 43 |     ax.grid(which='major', linestyle='-')
 44 |     ax.grid(which='minor', linestyle=':')
 45 | 
 46 |     what_normalized = re.sub(r'\s', '_', what).lower()
 47 | 
 48 |     if experiment_name is not None:
 49 |         ax.get_figure().savefig(what_normalized + '.' + experiment_name + '.png')
 50 |     else:
 51 |         ax.get_figure().savefig(what_normalized + '.png')
 52 | 
 53 | 
 54 | parser = argparse.ArgumentParser()
 55 | 
 56 | parser.add_argument('-i', '--input', type=str, default='metrics.tsv',
 57 |                     help='input file')
 58 | 
 59 | parser.add_argument('-n', '--name', type=str,
 60 |                     help='experiment name')
 61 | 
 62 | parser.add_argument('-p', '--perf', action='store_true',
 63 |                     help='build perf graphs')
 64 | 
 65 | parser.add_argument('-c', '--colors', type=str,
 66 |                     help='color cycle')
 67 | 
 68 | parser.add_argument('-l', '--logloss',
 69 |                     help='log loss scale')
 70 | 
 71 | args = parser.parse_args()
 72 | 
 73 | 
 74 | metrics_file = args.input
 75 | experiment_name = args.name
 76 | perf_graphs = args.perf
 77 | 
 78 | if args.colors is not None:
 79 |     color_cycle = args.colors.split(',')
 80 |     pyplot.rc('axes', prop_cycle=(cycler.cycler('color', color_cycle)))
 81 | 
 82 | df = (
 83 |     pandas
 84 |     .read_csv(metrics_file, sep='\t')
 85 |     .sort_values(by=['Engine', 'Train size'])
 86 | )
 87 | 
 88 | plot_stuff(df, 'ROC AUC', logx=True)
 89 | 
 90 | if args.logloss is not None:
 91 |     ll_from, ll_to = map(float, args.logloss.split(','))
 92 |     plot_stuff(df, 'Log loss', logx=True, ylim=(ll_from, ll_to))
 93 | else:
 94 |     plot_stuff(df, 'Log loss', logx=True)
 95 | 
 96 | plot_stuff(df, 'Train time', loglog=True, ylabel='s')
 97 | 
 98 | if perf_graphs:
 99 |     plot_stuff(df, 'Maximum memory', loglog=True, ylabel='bytes')
100 |     plot_stuff(df, 'CPU load', logx=True)
101 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Criteo 1 TiB benchmark
  2 | 
  3 | 
  4 | 
  5 | ## Table of contents
  6 | 
  7 | * [Introduction](#introduction)
  8 | * [Task and data](#task-and-data)
  9 | * [Algorithms](#algorithms)
 10 | * [Setup](#setup)
 11 | * [Experiment layout](#experiment-layout)
 12 |   * [Hyperparameter optimization](#hyperparameter-optimization)
 13 |   * [Data format for Spark.ML](#data-format-for-sparkml)
 14 |   * [Code](#code)
 15 | * [Results](#results)
 16 |   * [Local training - Vowpal Wabbit & XGBoost](#local-training---vowpal-wabbit--xgboost)
 17 |   * [Distributed training - Spark.ML](#distributed-training---sparkml)
 18 |   * [Distributed training - Time vs. Cores](#distributed-training---time-vs-cores)
 19 |   * [Comparison of local vs. remote](#comparison-of-local-vs-remote)
 20 | * [Conclusion](#conclusion)
 21 | * [Resources](#resources)
 22 | 
 23 | 
 24 | ## Introduction
 25 | [_(back to toc)_](#table-of-contents)
 26 | 
 27 | This project is a minimal benchmark of applicability of several implementations of machine learning algorithms to training on big data. Our main focus is [Spark.ML](http://spark.apache.org/mllib/) and how it compares to commonly used single-node machine learning tools Vowpal Wabbit and XGBoost in terms of scaling to terabyte (billions of lines) train data. Quick web search shows that many people tested Spark but not on tasks requiring a cluster so they are mostly single-node tests.
 28 | 
 29 | This project is inspired by https://github.com/szilard/benchm-ml but is focused on training models on billions of lines of train data, including Spark.ML in multinode cluster environment.
 30 | 
 31 | 
 32 | 
 33 | ## Task and data
 34 | [_(back to toc)_](#table-of-contents)
 35 | 
 36 | Our target application is prediction of click-through ratio (CTR) of banners in online advertising. [Criteo released](http://labs.criteo.com/2015/03/criteo-releases-its-new-dataset/) an industry-standard open dataset which represents banner impressions in online advertising during the timespan of 24 days. It is more than 1 terabyte in size and consists of more than 4 billion lines of data. Each line represents a banner impression and contains 40 columns separated by tabulation:
 37 | 
 38 | - the first column is a label - {0, 1} - 1 meaning the banner was clicked and 0 otherwise;
 39 | - 13 numeric columns;
 40 | - 26 categorical columns with categories being 32-bit hashes.
 41 | 
 42 | This is how it looks like:
 43 | 
 44 | ![Dataset schema](images/dataset.png)
 45 | 
 46 | All the data except the last day was concatenated and sampled into training sets of 10ⁿ and 3×10ⁿ lines with `n ∈ {4, 5, ..., 9}` (i.e. train samples' sizes are 10k, 30k, 100k, ..., 1kkk, 3kkk lines). The last day was used for testing - a sample of one million lines was taken from it. All samples were converted to
 47 | 
 48 | - LibSVM format for training XGBoost models and as a source for the transformation to Spark.ML DataFrame;
 49 | - Vowpal Wabbit data format.
 50 | 
 51 | Data for Spark.ML models was processed on-the-fly from LibSVM format:
 52 | 
 53 | - into a dataset of tuples of "label" (integer) and "features" (SparseVector of size 10⁵ using [hashing trick](https://en.wikipedia.org/wiki/Feature_hashing#Feature_vectorization_using_the_hashing_trick) for all features) for Spark.ML LogisticRegression;
 54 | - into a dataset of tuples of "label" (integer) and "features" (SparseVector of size 39 taken as-is from corresponding columns, see below) for Spark.ML RandomForestClassifier.
 55 | 
 56 | 
 57 | 
 58 | ## Algorithms
 59 | [_(back to toc)_](#table-of-contents)
 60 | 
 61 | Historically, we make use of [Vowpal Wabbit](https://github.com/JohnLangford/vowpal_wabbit) and [XGBoost](https://github.com/dmlc/xgboost) exploiting "Local train + Distributed apply" scenario. Our task was to run a performance test of our currently used approach and Spark.ML library algorithms.
 62 | 
 63 | We used the following non-distributed algorithms:
 64 | 
 65 | - Vowpal Wabbit - it implements logistic regression with a hashing trick and reads the data only once never keeping more than one sample in memory (it is an out-of-core implementation);
 66 | - in-memory XGBoost - gradient-boosted trees implementation that (by default) loads the whole data into memory (which is faster than multiple reads from disk, but we are limited in size by machine memory);
 67 | - out-of-core XGBoost - a variant of XGBoost training which uses an on-disk cache; this is slower (compared to the in-memory variant) but potentially we can train on the data limited in size only by the size of HDD.
 68 | 
 69 | Spark.ML contains following classification algorithms:
 70 | 
 71 | - [LogisticRegression](http://spark.apache.org/docs/latest/ml-classification-regression.html#logistic-regression),
 72 | - [RandomForestClassifier](http://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-classifier),
 73 | - [NaiveBayes](http://spark.apache.org/docs/latest/ml-classification-regression.html#naive-bayes),
 74 | - [DecisionTreeClassifier](http://spark.apache.org/docs/latest/ml-classification-regression.html#decision-tree-classifier),
 75 | - [GBTClassifier](http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-classifier),
 76 | - [MultilayerPerceptronClassifier](http://spark.apache.org/docs/latest/ml-classification-regression.html#multilayer-perceptron-classifier).
 77 | 
 78 | Our preliminary research shows that four of the algorithms are not well-suited for our task of CTR prediction:
 79 | 
 80 | - NaiveBayes provides significantly worse logistic loss (which is an essential metric of CTR models' quality) than all other models;
 81 | - DecisionTreeClassifier suffers in quality in comparison to the RandomForestClassifier but still requires roughly the same amount of time to train;
 82 | - GBTClassifier (Spark.ML implementation of gradient-boosted trees) and MultilayerPerceptronClassifier do not support prediction of probabilities that are required by the task (these two models are not shown on graphs above).
 83 | 
 84 | ![ROC AUC](images/roc_auc.cluster_selection.png) ![Log loss](images/log_loss.cluster_selection.png) ![Training time](images/train_time.cluster_selection.png)
 85 | 
 86 | Thus we use only LogisticRegression and RandomForestClassifier for our testing purposes.
 87 | 
 88 | 
 89 | 
 90 | ## Setup
 91 | [_(back to toc)_](#table-of-contents)
 92 | 
 93 | Local models were trained on a 12-core (24-thread) machine with 128 GiB of memory. Distributed training was performed on our production cluster (total capacity is approximately 2000 cores and 10 TiB of memory); for the experiment a small part of resources has been allocated - 256 cores and 1 TiB of memory for training on datasets upto 300 million of lines and 512 cores and 2 TiB of memory for training on one billion and 3 billion lines of train data. 4 cores and 16 TiB of memory per Spark executor was used.
 94 | 
 95 | For the experiment we used Vowpal Wabbit 8.3.0, XGBoost 0.4 and Spark 2.1.0 running on a Hadoop 2.6 cluster (using YARN as a cluster manager).
 96 | 
 97 | 
 98 | 
 99 | ## Experiment layout
100 | 
101 | ### Hyperparameter optimization
102 | [_(back to toc)_](#table-of-contents)
103 | 
104 | Our first idea was to skip models' hyperparameters optimization completely, but unfortunately XGBoost's default hyperparameters are not good enough for training even on million lines of data - the default number of trees is only 10, and it hits the ceiling quite soon:
105 | 
106 | ![ROC AUC](images/roc_auc.why_optimize.png) ![Log loss](images/log_loss.why_optimize.png)
107 | 
108 | These figures reminded us that production usage of any machine learning model is associated with optimization of its hyperparameters, and in our experiment we should do the same. For optimization of models' hyperparameters (including Spark.ML ones) we used the million-line sample of train data and 5-fold cross validation for metric (log loss) averaging.
109 | 
110 | 
111 | 
112 | ### Data format for Spark.ML
113 | [_(back to toc)_](#table-of-contents)
114 | 
115 | We tried to use [one-hot-encoding](https://www.quora.com/What-is-one-hot-encoding-and-when-is-it-used-in-data-science) of categorical features, but due to very large number of unique values it turned out to be very time and memory consuming, so for Spark.ML we decided to try the hashing trick. Spark.ML LogisticRegression was trained using this approach. We sticked to hashing space of 10⁵ hashes as it turned out to give about the same quality as VW on large samples. Taking less hashes usually leads to better quality on smaller data (because of less overfitting) and worse quality on bigger data (because some patterns in data are consumed by collisions in hashing space):
116 | 
117 | ![ROC AUC](images/roc_auc.lr_hash_size.png) ![Log loss](images/log_loss.lr_hash_size.png)
118 | 
119 | RandomForestClassifier was very slow to train even with a thousand hashes, so we used "as-is" format for it:
120 | 
121 | - all numeric features were converted to elements of SparseVector as-is;
122 | - all categorical features were converted to elements of SparseVector by interpreting the hashes as 32 bit numbers.
123 | 
124 | 
125 | 
126 | ### Code
127 | [_(back to toc)_](#table-of-contents)
128 | 
129 | All work was performed in Jupyter notebooks in Python. Notebooks:
130 | 
131 | - [experiment_local.ipynb](notebooks/experiment_local.ipynb) was used for preparing the data and training of the local models;
132 | - [experiment_spark_lr.ipynb](notebooks/experiment_spark_lr.ipynb) and [experiment_spark_rf.ipynb](notebooks/experiment_spark_rf.ipynb) for training Spark.ML LogisticRegression and RandomForestClassifier accordingly.
133 | 
134 | 
135 | 
136 | ## Results
137 | 
138 | ### Local training - Vowpal Wabbit & XGBoost
139 | [_(back to toc)_](#table-of-contents)
140 | 
141 | ![ROC AUC](images/roc_auc.local.png) ![Log loss](images/log_loss.local.png) ![Train time](images/train_time.local.png) ![Maximum memory](images/maximum_memory.local.png) ![CPU load](images/cpu_load.local.png)
142 | 
143 | Some observations:
144 | 
145 | - our main concern about an out-of-core training of XGBoost was that it would not produce the same quality as its in-memory variant due to the approximate splitting algorithm; however, in-memory XGBoost and out-of-core XGBoost turned out to provide about the same level of quality, but out-of-core variant is about an order of magnitude slower;
146 | - in-memory XGBoost is about an order of magnitude slower than Vowpal Wabbit on the same amount of train data;
147 | - Vowpal Wabbit was able to give about the same quality as XGBoost trained on an order of magnitude smaller sample.
148 | 
149 | 
150 | 
151 | ### Distributed training - Spark.ML
152 | [_(back to toc)_](#table-of-contents)
153 | 
154 | ![ROC AUC](images/roc_auc.cluster.png) ![Log loss](images/log_loss.cluster.png) ![Train time](images/train_time.cluster.png)
155 | 
156 | We made the following conclusions:
157 | 
158 | - RandomForestClassifier is quite slow, and it is even slower when the data consists of large vectors;
159 | - LogisticRegression is hard to set up for smaller samples and for bigger samples at the same time - either it overfits on small data or it cannot extract patterns due to more aggressive hashing trick.
160 | 
161 | 
162 | 
163 | ### Distributed training - Time vs. Cores
164 | [_(back to toc)_](#table-of-contents)
165 | 
166 | To check how model training scales to multi-core setup we made a quick test where we increased the number of cores and measured training time for every step. To make it fast we used a 10⁷ sample of train data and checked training time for a number of cores from 5 to 50 in steps of 5. In order to eliminate the uncertainty brought forth by running the test in parallel with production tasks, for this test we created a standalone Spark cluster running on three machines with a total of ≈50 cores and ≈200 GiB of memory.
167 | 
168 | ![Time vs. cores](images/time_vs_cores.png)
169 | 
170 | Training time dropped quite fast when we went from 5 to 15 cores but slowed down afterwards and completely ceased to improve by the mark of 40 cores (even growing a little on transition from 40 to 45 cores). The main idea we have extracted from this figure is that one should not increase amount of resources beyond minimum required, so that work distribution and aggregation overhead would be cheaper than potential improvement of speed by parallelization.
171 | 
172 | 
173 | 
174 | ### Comparison of local vs. remote
175 | [_(back to toc)_](#table-of-contents)
176 | 
177 | ![ROC AUC](images/roc_auc.local_and_cluster.png) ![Log loss](images/log_loss.local_and_cluster.png) ![Train time](images/train_time.local_and_cluster.png)
178 | 
179 | We can see that:
180 | 
181 | - on large datasets (100 million of lines and more) Spark.ML is faster than both Vowpal Wabbit and XGBoost; maybe it is possible to make it even faster by finding the best cluster setup for each size of training sample (we had not done this work);
182 | - however it is slow when working with large vectors - steps should be taken in order to find a balance between quality and speed;
183 | - for small tasks Spark introduces overhead that can be more expensive than it is possible to gain by computing the task in parallel (well, this is true for parallel computing in general).
184 | 
185 | 
186 | 
187 | ## Conclusion
188 | [_(back to toc)_](#table-of-contents)
189 | 
190 | The best quality measured by logarithmic loss (which is a metric of choice for CTR prediction) was achived using XGBoost - no matter in-memory or out-of-core, as they both seem to be equal in quality - on a sample smaller in size than other models required for the same level of quality.
191 | However XGBoost is very slow on big samples in out-of-core setup thus it was not rational to test it on a 300kk sample and above (training the in-memory setup on large samples was also not possible due to memory restrictions).
192 | The highest ROC AUC was reached by Vowpal Wabbit on one-billion-line train sample, strangely decreasing in quality by three-billion-line sample.
193 | Spark.ML LogisticRegression is quite close in quality to Vowpal Wabbit, and maybe it can be made even better by increasing the feature space size (which is 100k hashes in current setup).
194 | Spark.ML LogisticRegression appeared to be considerably faster than VW on billion-line samples and maybe it can be made even faster by optimizing the allocated resources.
195 | Spark.ML RandomForestClassifier stopped increasing in quality quite early and it is also quite slow.
196 | 
197 | 
198 | 
199 | ## Resources
200 | [_(back to toc)_](#table-of-contents)
201 | 
202 | Results in table format can be found [here](results). Scala scripts used for faster conversion and sampling can be found [here](scripts/conversion) - these scripts can be used from [spark-shell](http://spark.apache.org/docs/latest/quick-start.html#basics) using `:load` command. Scripts for running VW & XGBoost and plotting outside of the Jupyter notebooks can be found [here](scripts/running).
203 | 


--------------------------------------------------------------------------------
/notebooks/experiment_spark_rf.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Criteo 1 TiB benchmark - Spark.ML random forest\n",
  8 |     "\n",
  9 |     "Specialization of the experimental notebook for Spark.ML random forest."
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "# Table of contents\n",
 17 |     "\n",
 18 |     "* [Configuration](#Configuration)\n",
 19 |     "* [Distributed training](#Distributed-training)\n",
 20 |     "* [End](#End)"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": null,
 26 |    "metadata": {
 27 |     "collapsed": true
 28 |    },
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "%load_ext autotime\n",
 32 |     "%matplotlib inline\n",
 33 |     "\n",
 34 |     "from __future__ import print_function"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "## Configuration\n",
 42 |     "[_(back to toc)_](#Table-of-contents)"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "markdown",
 47 |    "metadata": {},
 48 |    "source": [
 49 |     "Paths:"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "metadata": {
 56 |     "collapsed": true
 57 |    },
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "libsvm_data_remote_path = 'criteo/libsvm'\n",
 61 |     "local_runtime_path = 'criteo/runtime'"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": null,
 67 |    "metadata": {
 68 |     "collapsed": true
 69 |    },
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "import os\n",
 73 |     "\n",
 74 |     "\n",
 75 |     "libsvm_train_template = os.path.join(libsvm_data_remote_path, 'train', '{}')\n",
 76 |     "libsvm_test_template = os.path.join(libsvm_data_remote_path, 'test', '{}')"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": null,
 82 |    "metadata": {
 83 |     "collapsed": true
 84 |    },
 85 |    "outputs": [],
 86 |    "source": [
 87 |     "def ensure_directory_exists(path):\n",
 88 |     "    if not os.path.exists(path):\n",
 89 |     "        os.makedirs(path)"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "markdown",
 94 |    "metadata": {},
 95 |    "source": [
 96 |     "Samples to take:"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": null,
102 |    "metadata": {
103 |     "collapsed": true
104 |    },
105 |    "outputs": [],
106 |    "source": [
107 |     "train_samples = [\n",
108 |     "    10000, 30000,  # tens of thousands\n",
109 |     "    100000, 300000,  # hundreds of thousands\n",
110 |     "    1000000, 3000000,  # millions\n",
111 |     "    10000000, 30000000,  # tens of millions\n",
112 |     "    100000000, 300000000,  # hundreds of millions\n",
113 |     "    1000000000, 3000000000,  # billions\n",
114 |     "]\n",
115 |     "\n",
116 |     "test_samples = [1000000]"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "markdown",
121 |    "metadata": {},
122 |    "source": [
123 |     "Spark configuration and initialization:"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": null,
129 |    "metadata": {
130 |     "collapsed": true
131 |    },
132 |    "outputs": [],
133 |    "source": [
134 |     "executor_instances = 64\n",
135 |     "executor_cores = 4\n",
136 |     "memory_per_core = 4"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": null,
142 |    "metadata": {
143 |     "collapsed": true
144 |    },
145 |    "outputs": [],
146 |    "source": [
147 |     "app_name = 'Criteo experiment'\n",
148 |     "\n",
149 |     "master = 'yarn'\n",
150 |     "\n",
151 |     "settings = {\n",
152 |     "    'spark.network.timeout': '600',\n",
153 |     "    \n",
154 |     "    'spark.driver.cores': '16',\n",
155 |     "    'spark.driver.maxResultSize': '16G',\n",
156 |     "    'spark.driver.memory': '32G',\n",
157 |     "    \n",
158 |     "    'spark.executor.cores': str(executor_cores),\n",
159 |     "    'spark.executor.instances': str(executor_instances),\n",
160 |     "    'spark.executor.memory': str(memory_per_core * executor_cores) + 'G',\n",
161 |     "    \n",
162 |     "    'spark.speculation': 'true',\n",
163 |     "    \n",
164 |     "    'spark.yarn.queue': 'root.HungerGames',\n",
165 |     "}"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": null,
171 |    "metadata": {
172 |     "collapsed": true,
173 |     "scrolled": false
174 |    },
175 |    "outputs": [],
176 |    "source": [
177 |     "from pyspark.sql import SparkSession\n",
178 |     "\n",
179 |     "\n",
180 |     "builder = SparkSession.builder\n",
181 |     "\n",
182 |     "builder.appName(app_name)\n",
183 |     "builder.master(master)\n",
184 |     "for k, v in settings.items():\n",
185 |     "    builder.config(k, v)\n",
186 |     "\n",
187 |     "spark = builder.getOrCreate()\n",
188 |     "sc = spark.sparkContext\n",
189 |     "\n",
190 |     "sc.setLogLevel('ERROR')"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "markdown",
195 |    "metadata": {},
196 |    "source": [
197 |     "Logging:"
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "code",
202 |    "execution_count": null,
203 |    "metadata": {
204 |     "collapsed": true
205 |    },
206 |    "outputs": [],
207 |    "source": [
208 |     "import sys\n",
209 |     "import logging\n",
210 |     "reload(logging)\n",
211 |     "\n",
212 |     "\n",
213 |     "handler = logging.StreamHandler(stream=sys.stdout)\n",
214 |     "formatter = logging.Formatter('[%(asctime)s] %(message)s')\n",
215 |     "handler.setFormatter(formatter)\n",
216 |     "\n",
217 |     "ensure_directory_exists(local_runtime_path)\n",
218 |     "file_handler = logging.FileHandler(filename=os.path.join(local_runtime_path, 'mylog.log'), mode='a')\n",
219 |     "file_handler.setFormatter(formatter)\n",
220 |     "\n",
221 |     "logger = logging.getLogger()\n",
222 |     "logger.addHandler(handler)\n",
223 |     "logger.addHandler(file_handler)\n",
224 |     "logger.setLevel(logging.DEBUG)"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "code",
229 |    "execution_count": null,
230 |    "metadata": {
231 |     "collapsed": true
232 |    },
233 |    "outputs": [],
234 |    "source": [
235 |     "logger.info('Spark version: %s.', spark.version)"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "markdown",
240 |    "metadata": {},
241 |    "source": [
242 |     "Plot measurements:"
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "code",
247 |    "execution_count": null,
248 |    "metadata": {
249 |     "collapsed": true,
250 |     "scrolled": false
251 |    },
252 |    "outputs": [],
253 |    "source": [
254 |     "import pandas\n",
255 |     "\n",
256 |     "\n",
257 |     "def extract_data_for_plotting(df, what):\n",
258 |     "    return reduce(\n",
259 |     "        lambda left, right: pandas.merge(left, right, how='outer', on='Train size'),\n",
260 |     "        map(\n",
261 |     "            lambda name: df[df.Engine == name][['Train size', what]].rename(columns={what: name}),\n",
262 |     "            df.Engine.unique(),\n",
263 |     "        ),\n",
264 |     "    )   \n",
265 |     "\n",
266 |     "def plot_stuff(df, what, ylabel=None, **kwargs):\n",
267 |     "    data = extract_data_for_plotting(df, what).set_index('Train size')\n",
268 |     "    ax = data.plot(marker='o', figsize=(6, 6), title=what, grid=True, linewidth=2.0, **kwargs)  # xlim=(1e4, 4e9)\n",
269 |     "    if ylabel is not None:\n",
270 |     "        ax.set_ylabel(ylabel)"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "markdown",
275 |    "metadata": {},
276 |    "source": [
277 |     "Let's name samples as their shortened \"engineering\" notation - 1e5 is 100k etc.:"
278 |    ]
279 |   },
280 |   {
281 |    "cell_type": "code",
282 |    "execution_count": null,
283 |    "metadata": {
284 |     "collapsed": true
285 |    },
286 |    "outputs": [],
287 |    "source": [
288 |     "def sample_name(sample):\n",
289 |     "    return str(sample)[::-1].replace('000', 'k')[::-1]"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "markdown",
294 |    "metadata": {},
295 |    "source": [
296 |     "## Distributed training\n",
297 |     "[_(back to toc)_](#Table-of-contents)"
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "markdown",
302 |    "metadata": {},
303 |    "source": [
304 |     "Loading of LibSVM data as Spark.ML dataset:"
305 |    ]
306 |   },
307 |   {
308 |    "cell_type": "code",
309 |    "execution_count": null,
310 |    "metadata": {
311 |     "collapsed": true
312 |    },
313 |    "outputs": [],
314 |    "source": [
315 |     "from pyspark.ml.linalg import SparseVector\n",
316 |     "\n",
317 |     "\n",
318 |     "def parse_libsvm_line_for_rf(line):\n",
319 |     "    parts = line.split(' ')\n",
320 |     "    label = int(parts[0])\n",
321 |     "    indices, values = zip(*map(lambda s: s.split(':'), parts[1:]))\n",
322 |     "    return (label, SparseVector(40, map(int, indices), map(int, values)))"
323 |    ]
324 |   },
325 |   {
326 |    "cell_type": "code",
327 |    "execution_count": null,
328 |    "metadata": {
329 |     "collapsed": true
330 |    },
331 |    "outputs": [],
332 |    "source": [
333 |     "task_splitting = 1  # tasks per core\n",
334 |     "\n",
335 |     "def load_ml_data_for_rf(template, sample):\n",
336 |     "    path = template.format(sample_name(sample))\n",
337 |     "    return sc.textFile(path).map(parse_libsvm_line_for_rf).toDF(['label', 'features']).repartition(executor_cores * executor_instances * task_splitting)"
338 |    ]
339 |   },
340 |   {
341 |    "cell_type": "markdown",
342 |    "metadata": {},
343 |    "source": [
344 |     "Evaluating a model:"
345 |    ]
346 |   },
347 |   {
348 |    "cell_type": "code",
349 |    "execution_count": null,
350 |    "metadata": {
351 |     "collapsed": true
352 |    },
353 |    "outputs": [],
354 |    "source": [
355 |     "from matplotlib import pyplot\n",
356 |     "from sklearn.metrics import auc, log_loss, roc_curve\n",
357 |     "\n",
358 |     "\n",
359 |     "def calculate_roc(predictions):\n",
360 |     "    labels, scores = zip(*predictions.rdd.map(lambda row: (row.label, row.probability[1])).collect())\n",
361 |     "    fpr, tpr, _ = roc_curve(labels, scores)\n",
362 |     "    roc_auc = auc(fpr, tpr)\n",
363 |     "    ll = log_loss(labels, scores)\n",
364 |     "    return fpr, tpr, roc_auc, ll\n",
365 |     "\n",
366 |     "def evaluate_model(name, model, test, train=None):\n",
367 |     "    metrics = dict()\n",
368 |     "    \n",
369 |     "    figure = pyplot.figure(figsize=(6, 6))\n",
370 |     "    ax = figure.gca()\n",
371 |     "    ax.set_title('ROC - ' + name)\n",
372 |     "    \n",
373 |     "    if train is not None:\n",
374 |     "        train_predictions = model.transform(train)\n",
375 |     "        train_fpr, train_tpr, train_roc_auc, train_log_loss = calculate_roc(train_predictions)\n",
376 |     "        \n",
377 |     "        metrics['train_roc_auc'] = train_roc_auc\n",
378 |     "        metrics['train_log_loss'] = train_log_loss\n",
379 |     "        \n",
380 |     "        ax.plot(train_fpr, train_tpr, linewidth=2.0, label='train (auc = {:.3f})'.format(train_roc_auc))\n",
381 |     "    \n",
382 |     "    test_predictions = model.transform(test)\n",
383 |     "    test_fpr, test_tpr, test_roc_auc, test_log_loss = calculate_roc(test_predictions)\n",
384 |     "    \n",
385 |     "    metrics['test_roc_auc'] = test_roc_auc\n",
386 |     "    metrics['test_log_loss'] = test_log_loss\n",
387 |     "    \n",
388 |     "    ax.plot(test_fpr, test_tpr, linewidth=2.0, label='test (auc = {:.3f})'.format(test_roc_auc))\n",
389 |     "    \n",
390 |     "    ax.plot([0.0, 1.0], [0.0, 1.0], linestyle='--', c='gray')\n",
391 |     "    ax.legend()\n",
392 |     "    pyplot.show()\n",
393 |     "    \n",
394 |     "    return metrics"
395 |    ]
396 |   },
397 |   {
398 |    "cell_type": "markdown",
399 |    "metadata": {},
400 |    "source": [
401 |     "Models to work on:"
402 |    ]
403 |   },
404 |   {
405 |    "cell_type": "code",
406 |    "execution_count": null,
407 |    "metadata": {
408 |     "collapsed": true
409 |    },
410 |    "outputs": [],
411 |    "source": [
412 |     "from pyspark.ml.classification import (\n",
413 |     "    RandomForestClassifier, \n",
414 |     ")"
415 |    ]
416 |   },
417 |   {
418 |    "cell_type": "code",
419 |    "execution_count": null,
420 |    "metadata": {
421 |     "collapsed": true
422 |    },
423 |    "outputs": [],
424 |    "source": [
425 |     "classifiers = {\n",
426 |     "    'rf': RandomForestClassifier(featureSubsetStrategy='sqrt', impurity='entropy', minInstancesPerNode=3, maxBins=64, maxDepth=10, numTrees=160),\n",
427 |     "}"
428 |    ]
429 |   },
430 |   {
431 |    "cell_type": "markdown",
432 |    "metadata": {},
433 |    "source": [
434 |     "Monkey-patch RDDs and DataFrames for context persistence:"
435 |    ]
436 |   },
437 |   {
438 |    "cell_type": "code",
439 |    "execution_count": null,
440 |    "metadata": {
441 |     "collapsed": true
442 |    },
443 |    "outputs": [],
444 |    "source": [
445 |     "import pyspark\n",
446 |     "\n",
447 |     "\n",
448 |     "def enter_method(self):\n",
449 |     "    self.persist()\n",
450 |     "\n",
451 |     "def exit_method(self,exc_type, exc, traceback):\n",
452 |     "    self.unpersist()\n",
453 |     "\n",
454 |     "\n",
455 |     "pyspark.sql.dataframe.DataFrame.__enter__ = enter_method\n",
456 |     "pyspark.sql.dataframe.DataFrame.__exit__ = exit_method"
457 |    ]
458 |   },
459 |   {
460 |    "cell_type": "markdown",
461 |    "metadata": {},
462 |    "source": [
463 |     "Do distributed training:"
464 |    ]
465 |   },
466 |   {
467 |    "cell_type": "code",
468 |    "execution_count": null,
469 |    "metadata": {
470 |     "collapsed": true,
471 |     "scrolled": false
472 |    },
473 |    "outputs": [],
474 |    "source": [
475 |     "import time\n",
476 |     "\n",
477 |     "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n",
478 |     "\n",
479 |     "\n",
480 |     "test_sample = test_samples[-1]\n",
481 |     "\n",
482 |     "evaluator = BinaryClassificationEvaluator(labelCol='label', rawPredictionCol='probability', metricName='areaUnderROC')\n",
483 |     "\n",
484 |     "new_quality_data = []\n",
485 |     "new_data_rows = []\n",
486 |     "\n",
487 |     "logger.info('Loading \"%s\" test samples for rf.', test_sample)\n",
488 |     "test_df = load_ml_data_for_rf(libsvm_test_template, test_sample)\n",
489 |     "with test_df:\n",
490 |     "    logger.info('Loaded \"%s\" lines.', test_df.count())\n",
491 |     "\n",
492 |     "    for train_sample in train_samples:\n",
493 |     "\n",
494 |     "        logger.info('Working on \"%s\" train sample.', train_sample)\n",
495 |     "\n",
496 |     "        logger.info('Loading \"%s\" train samples for rf.', train_sample)\n",
497 |     "        train_df = load_ml_data_for_rf(libsvm_train_template, train_sample)\n",
498 |     "        with train_df:\n",
499 |     "            logger.info('Loaded \"%s\" lines.', train_df.count())\n",
500 |     "\n",
501 |     "            for classifier_name, classifier in classifiers.items():\n",
502 |     "\n",
503 |     "                logger.info('Training a model \"%s\" on sample \"%s\".', classifier_name, train_sample)\n",
504 |     "\n",
505 |     "                start = time.time()\n",
506 |     "                model = classifier.fit(train_df)\n",
507 |     "                duration = time.time() - start\n",
508 |     "\n",
509 |     "                logger.info('Training a model \"%s\" on sample \"%s\" took \"%g\" seconds.', classifier_name, train_sample, duration)\n",
510 |     "\n",
511 |     "                logger.info('Evaluating the model \"%s\" trained on sample \"%s\".', classifier_name, train_sample)\n",
512 |     "                metrics = evaluate_model(classifier_name + ' - ' + sample_name(train_sample), model, test_df, train=(train_df if train_sample <= 1000000 else None))\n",
513 |     "\n",
514 |     "                test_predictions = model.transform(test_df)\n",
515 |     "                ml_metric_value = evaluator.evaluate(test_predictions)\n",
516 |     "\n",
517 |     "                logger.info(\n",
518 |     "                    'For the model \"%s\" trained on sample \"%s\" metrics are: \"%s\"; ROC AUC calculated by Spark is \"%s\".',\n",
519 |     "                    classifier_name,\n",
520 |     "                    train_sample,\n",
521 |     "                    metrics,\n",
522 |     "                    ml_metric_value,\n",
523 |     "                )\n",
524 |     "\n",
525 |     "                data_row = {\n",
526 |     "                    'Train size': train_sample,\n",
527 |     "                    'ROC AUC': metrics['test_roc_auc'],\n",
528 |     "                    'Log loss': metrics['test_log_loss'],\n",
529 |     "                    'Duration': duration,\n",
530 |     "                    'Engine': classifier_name,\n",
531 |     "                }\n",
532 |     "                new_quality_data.append(data_row)\n",
533 |     "                data_row_string = '\\t'.join(str(data_row[field]) for field in ['Engine', 'Train size', 'ROC AUC', 'Log loss', 'Duration'])\n",
534 |     "                new_data_rows.append(data_row_string)\n",
535 |     "                logger.info('Data row: \"%s\".', data_row_string)"
536 |    ]
537 |   },
538 |   {
539 |    "cell_type": "markdown",
540 |    "metadata": {},
541 |    "source": [
542 |     "Plot metrics:"
543 |    ]
544 |   },
545 |   {
546 |    "cell_type": "code",
547 |    "execution_count": null,
548 |    "metadata": {
549 |     "collapsed": true,
550 |     "scrolled": false
551 |    },
552 |    "outputs": [],
553 |    "source": [
554 |     "measurements_df = pandas.DataFrame(new_quality_data).sort_values(by=['Train size'])\n",
555 |     "plot_stuff(measurements_df, 'ROC AUC', logx=True)\n",
556 |     "plot_stuff(measurements_df, 'Log loss', logx=True, ylim=(0.135, 0.145))\n",
557 |     "plot_stuff(measurements_df, 'Duration', loglog=True, ylabel='s')"
558 |    ]
559 |   },
560 |   {
561 |    "cell_type": "code",
562 |    "execution_count": null,
563 |    "metadata": {
564 |     "collapsed": true
565 |    },
566 |    "outputs": [],
567 |    "source": [
568 |     "for row in new_data_rows:\n",
569 |     "    print(row)"
570 |    ]
571 |   },
572 |   {
573 |    "cell_type": "markdown",
574 |    "metadata": {},
575 |    "source": [
576 |     "## End\n",
577 |     "[_(back to toc)_](#Table-of-contents)"
578 |    ]
579 |   },
580 |   {
581 |    "cell_type": "markdown",
582 |    "metadata": {},
583 |    "source": [
584 |     "Work done, stop Spark:"
585 |    ]
586 |   },
587 |   {
588 |    "cell_type": "code",
589 |    "execution_count": null,
590 |    "metadata": {
591 |     "collapsed": true
592 |    },
593 |    "outputs": [],
594 |    "source": [
595 |     "spark.stop()"
596 |    ]
597 |   }
598 |  ],
599 |  "metadata": {
600 |   "kernelspec": {
601 |    "display_name": "Python 3",
602 |    "language": "python",
603 |    "name": "python3"
604 |   },
605 |   "language_info": {
606 |    "codemirror_mode": {
607 |     "name": "ipython",
608 |     "version": 3
609 |    },
610 |    "file_extension": ".py",
611 |    "mimetype": "text/x-python",
612 |    "name": "python",
613 |    "nbconvert_exporter": "python",
614 |    "pygments_lexer": "ipython3",
615 |    "version": "3.6.1"
616 |   }
617 |  },
618 |  "nbformat": 4,
619 |  "nbformat_minor": 2
620 | }
621 | 


--------------------------------------------------------------------------------
/notebooks/experiment_spark_lr.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "# Criteo 1 TiB benchmark - Spark.ML logistic regression\n",
 10 |     "\n",
 11 |     "Specialization of the experiment notebook for Spark.ML logistic regression."
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "markdown",
 16 |    "metadata": {},
 17 |    "source": [
 18 |     "# Table of contents\n",
 19 |     "\n",
 20 |     "* [Configuration](#Configuration)\n",
 21 |     "* [Distributed training](#Distributed-training)\n",
 22 |     "* [End](#End)"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": null,
 28 |    "metadata": {
 29 |     "collapsed": true
 30 |    },
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "%load_ext autotime\n",
 34 |     "%matplotlib inline\n",
 35 |     "\n",
 36 |     "from __future__ import print_function"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "markdown",
 41 |    "metadata": {},
 42 |    "source": [
 43 |     "## Configuration\n",
 44 |     "[_(back to toc)_](#Table-of-contents)"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "markdown",
 49 |    "metadata": {},
 50 |    "source": [
 51 |     "Paths:"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {
 58 |     "collapsed": true
 59 |    },
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "libsvm_data_remote_path = 'criteo/libsvm'\n",
 63 |     "local_runtime_path = 'criteo/runtime'"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": null,
 69 |    "metadata": {
 70 |     "collapsed": true
 71 |    },
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "import os\n",
 75 |     "\n",
 76 |     "\n",
 77 |     "libsvm_train_template = os.path.join(libsvm_data_remote_path, 'train', '{}')\n",
 78 |     "libsvm_test_template = os.path.join(libsvm_data_remote_path, 'test', '{}')"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "code",
 83 |    "execution_count": null,
 84 |    "metadata": {
 85 |     "collapsed": true
 86 |    },
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "def ensure_directory_exists(path):\n",
 90 |     "    if not os.path.exists(path):\n",
 91 |     "        os.makedirs(path)"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "markdown",
 96 |    "metadata": {},
 97 |    "source": [
 98 |     "Samples to take:"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": null,
104 |    "metadata": {
105 |     "collapsed": true
106 |    },
107 |    "outputs": [],
108 |    "source": [
109 |     "train_samples = [\n",
110 |     "    10000, 30000,  # tens of thousands\n",
111 |     "    100000, 300000,  # hundreds of thousands\n",
112 |     "    1000000, 3000000,  # millions\n",
113 |     "    10000000, 30000000,  # tens of millions\n",
114 |     "    100000000, 300000000,  # hundreds of millions\n",
115 |     "    1000000000, 3000000000,  # billions\n",
116 |     "]\n",
117 |     "\n",
118 |     "test_samples = [1000000]"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "markdown",
123 |    "metadata": {},
124 |    "source": [
125 |     "Spark configuration and initialization:"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": null,
131 |    "metadata": {
132 |     "collapsed": true
133 |    },
134 |    "outputs": [],
135 |    "source": [
136 |     "total_cores = 256"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": null,
142 |    "metadata": {
143 |     "collapsed": true
144 |    },
145 |    "outputs": [],
146 |    "source": [
147 |     "executor_cores = 4\n",
148 |     "executor_instances = total_cores / executor_cores\n",
149 |     "memory_per_core = 2"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": null,
155 |    "metadata": {
156 |     "collapsed": true
157 |    },
158 |    "outputs": [],
159 |    "source": [
160 |     "app_name = 'Criteo experiment - LR on 128 cores'\n",
161 |     "\n",
162 |     "master = 'yarn'\n",
163 |     "\n",
164 |     "settings = {\n",
165 |     "    'spark.network.timeout': '600',\n",
166 |     "    \n",
167 |     "    'spark.driver.cores': '16',\n",
168 |     "    'spark.driver.maxResultSize': '16G',\n",
169 |     "    'spark.driver.memory': '32G',\n",
170 |     "    \n",
171 |     "    'spark.executor.cores': str(executor_cores),\n",
172 |     "    'spark.executor.instances': str(executor_instances),\n",
173 |     "    'spark.executor.memory': str(memory_per_core * executor_cores) + 'G',\n",
174 |     "    \n",
175 |     "    'spark.speculation': 'true',\n",
176 |     "    \n",
177 |     "    'spark.yarn.queue': 'root.HungerGames',\n",
178 |     "}"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "code",
183 |    "execution_count": null,
184 |    "metadata": {
185 |     "collapsed": true,
186 |     "scrolled": false
187 |    },
188 |    "outputs": [],
189 |    "source": [
190 |     "from pyspark.sql import SparkSession\n",
191 |     "\n",
192 |     "\n",
193 |     "builder = SparkSession.builder\n",
194 |     "\n",
195 |     "builder.appName(app_name)\n",
196 |     "builder.master(master)\n",
197 |     "for k, v in settings.items():\n",
198 |     "    builder.config(k, v)\n",
199 |     "\n",
200 |     "spark = builder.getOrCreate()\n",
201 |     "sc = spark.sparkContext\n",
202 |     "\n",
203 |     "sc.setLogLevel('ERROR')"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "markdown",
208 |    "metadata": {},
209 |    "source": [
210 |     "Logging:"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "code",
215 |    "execution_count": null,
216 |    "metadata": {
217 |     "collapsed": true
218 |    },
219 |    "outputs": [],
220 |    "source": [
221 |     "import sys\n",
222 |     "import logging\n",
223 |     "reload(logging)\n",
224 |     "\n",
225 |     "\n",
226 |     "handler = logging.StreamHandler(stream=sys.stdout)\n",
227 |     "formatter = logging.Formatter('[%(asctime)s] %(message)s')\n",
228 |     "handler.setFormatter(formatter)\n",
229 |     "\n",
230 |     "ensure_directory_exists(local_runtime_path)\n",
231 |     "file_handler = logging.FileHandler(filename=os.path.join(local_runtime_path, 'mylog.log'), mode='a')\n",
232 |     "file_handler.setFormatter(formatter)\n",
233 |     "\n",
234 |     "logger = logging.getLogger()\n",
235 |     "logger.addHandler(handler)\n",
236 |     "logger.addHandler(file_handler)\n",
237 |     "logger.setLevel(logging.DEBUG)"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": null,
243 |    "metadata": {
244 |     "collapsed": true
245 |    },
246 |    "outputs": [],
247 |    "source": [
248 |     "logger.info('Spark version: %s.', spark.version)"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "markdown",
253 |    "metadata": {},
254 |    "source": [
255 |     "Plot measurements:"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": null,
261 |    "metadata": {
262 |     "collapsed": true,
263 |     "scrolled": false
264 |    },
265 |    "outputs": [],
266 |    "source": [
267 |     "import pandas\n",
268 |     "\n",
269 |     "\n",
270 |     "def extract_data_for_plotting(df, what):\n",
271 |     "    return reduce(\n",
272 |     "        lambda left, right: pandas.merge(left, right, how='outer', on='Train size'),\n",
273 |     "        map(\n",
274 |     "            lambda name: df[df.Engine == name][['Train size', what]].rename(columns={what: name}),\n",
275 |     "            df.Engine.unique(),\n",
276 |     "        ),\n",
277 |     "    )   \n",
278 |     "\n",
279 |     "def plot_stuff(df, what, ylabel=None, **kwargs):\n",
280 |     "    data = extract_data_for_plotting(df, what).set_index('Train size')\n",
281 |     "    ax = data.plot(marker='o', figsize=(6, 6), title=what, grid=True, linewidth=2.0, **kwargs)  # xlim=(1e4, 4e9)\n",
282 |     "    if ylabel is not None:\n",
283 |     "        ax.set_ylabel(ylabel)"
284 |    ]
285 |   },
286 |   {
287 |    "cell_type": "markdown",
288 |    "metadata": {},
289 |    "source": [
290 |     "Let's name samples as their shortened \"engineering\" notation - 1e5 is 100k etc.:"
291 |    ]
292 |   },
293 |   {
294 |    "cell_type": "code",
295 |    "execution_count": null,
296 |    "metadata": {
297 |     "collapsed": true
298 |    },
299 |    "outputs": [],
300 |    "source": [
301 |     "def sample_name(sample):\n",
302 |     "    return str(sample)[::-1].replace('000', 'k')[::-1]"
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "markdown",
307 |    "metadata": {},
308 |    "source": [
309 |     "## Distributed training\n",
310 |     "[_(back to toc)_](#Table-of-contents)"
311 |    ]
312 |   },
313 |   {
314 |    "cell_type": "markdown",
315 |    "metadata": {},
316 |    "source": [
317 |     "Loading of LibSVM data as Spark.ML dataset:"
318 |    ]
319 |   },
320 |   {
321 |    "cell_type": "code",
322 |    "execution_count": null,
323 |    "metadata": {
324 |     "collapsed": true
325 |    },
326 |    "outputs": [],
327 |    "source": [
328 |     "import hashlib\n",
329 |     "import math\n",
330 |     "import struct\n",
331 |     "from pyspark.ml.linalg import SparseVector\n",
332 |     "\n",
333 |     "\n",
334 |     "total_features = 100000\n",
335 |     "\n",
336 |     "\n",
337 |     "def hash_fun(x):\n",
338 |     "    return int(struct.unpack('L', hashlib.md5(x).digest()[:8])[0] % total_features)\n",
339 |     "\n",
340 |     "def parse_kv_pair(kv):\n",
341 |     "    k, _, v = kv.partition(':')\n",
342 |     "    return int(k), int(v)\n",
343 |     "\n",
344 |     "def parse_libsvm_line(line):\n",
345 |     "    parts = line.split(' ')\n",
346 |     "    \n",
347 |     "    label = int(parts[0])\n",
348 |     "    pairs = map(parse_kv_pair, parts[1:])\n",
349 |     "    \n",
350 |     "    for i in range(len(pairs)):\n",
351 |     "        k, v = pairs[i]\n",
352 |     "        if k < 14:\n",
353 |     "            if v > 2:\n",
354 |     "                pairs[i] = (k, int(math.log(v) ** 2))\n",
355 |     "        else:\n",
356 |     "            pairs[i] = (k, '{:08x}'.format(v))\n",
357 |     "    \n",
358 |     "    indices = sorted({hash_fun('{}_{}'.format(k, v)) for k, v in pairs})\n",
359 |     "    values = [1.0] * len(indices)\n",
360 |     "    \n",
361 |     "    return (label, SparseVector(total_features, indices, values))"
362 |    ]
363 |   },
364 |   {
365 |    "cell_type": "code",
366 |    "execution_count": null,
367 |    "metadata": {
368 |     "collapsed": true
369 |    },
370 |    "outputs": [],
371 |    "source": [
372 |     "task_splitting = 1  # tasks per core\n",
373 |     "\n",
374 |     "def load_ml_data(template, sample):\n",
375 |     "    path = template.format(sample_name(sample))\n",
376 |     "    return sc.textFile(path).map(parse_libsvm_line).toDF(['label', 'features']).repartition(executor_cores * executor_instances * task_splitting)"
377 |    ]
378 |   },
379 |   {
380 |    "cell_type": "markdown",
381 |    "metadata": {},
382 |    "source": [
383 |     "Evaluating a model:"
384 |    ]
385 |   },
386 |   {
387 |    "cell_type": "code",
388 |    "execution_count": null,
389 |    "metadata": {
390 |     "collapsed": true
391 |    },
392 |    "outputs": [],
393 |    "source": [
394 |     "from matplotlib import pyplot\n",
395 |     "from sklearn.metrics import auc, log_loss, roc_curve\n",
396 |     "\n",
397 |     "\n",
398 |     "def calculate_roc(predictions):\n",
399 |     "    labels, scores = zip(*predictions.rdd.map(lambda row: (row.label, row.probability[1])).collect())\n",
400 |     "    fpr, tpr, _ = roc_curve(labels, scores)\n",
401 |     "    roc_auc = auc(fpr, tpr)\n",
402 |     "    ll = log_loss(labels, scores)\n",
403 |     "    return fpr, tpr, roc_auc, ll\n",
404 |     "\n",
405 |     "def evaluate_model(name, model, test, train=None):\n",
406 |     "    metrics = dict()\n",
407 |     "    \n",
408 |     "    figure = pyplot.figure(figsize=(6, 6))\n",
409 |     "    ax = figure.gca()\n",
410 |     "    ax.set_title('ROC - ' + name)\n",
411 |     "    \n",
412 |     "    if train is not None:\n",
413 |     "        train_predictions = model.transform(train)\n",
414 |     "        train_fpr, train_tpr, train_roc_auc, train_log_loss = calculate_roc(train_predictions)\n",
415 |     "        \n",
416 |     "        metrics['train_roc_auc'] = train_roc_auc\n",
417 |     "        metrics['train_log_loss'] = train_log_loss\n",
418 |     "        \n",
419 |     "        ax.plot(train_fpr, train_tpr, linewidth=2.0, label='train (auc = {:.3f})'.format(train_roc_auc))\n",
420 |     "    \n",
421 |     "    test_predictions = model.transform(test)\n",
422 |     "    test_fpr, test_tpr, test_roc_auc, test_log_loss = calculate_roc(test_predictions)\n",
423 |     "    \n",
424 |     "    metrics['test_roc_auc'] = test_roc_auc\n",
425 |     "    metrics['test_log_loss'] = test_log_loss\n",
426 |     "    \n",
427 |     "    ax.plot(test_fpr, test_tpr, linewidth=2.0, label='test (auc = {:.3f})'.format(test_roc_auc))\n",
428 |     "    \n",
429 |     "    ax.plot([0.0, 1.0], [0.0, 1.0], linestyle='--', c='gray')\n",
430 |     "    ax.legend()\n",
431 |     "    pyplot.show()\n",
432 |     "    \n",
433 |     "    return metrics"
434 |    ]
435 |   },
436 |   {
437 |    "cell_type": "markdown",
438 |    "metadata": {},
439 |    "source": [
440 |     "Models to work on:"
441 |    ]
442 |   },
443 |   {
444 |    "cell_type": "code",
445 |    "execution_count": null,
446 |    "metadata": {
447 |     "collapsed": true
448 |    },
449 |    "outputs": [],
450 |    "source": [
451 |     "from pyspark.ml.classification import (\n",
452 |     "    LogisticRegression,\n",
453 |     ")"
454 |    ]
455 |   },
456 |   {
457 |    "cell_type": "code",
458 |    "execution_count": null,
459 |    "metadata": {
460 |     "collapsed": true
461 |    },
462 |    "outputs": [],
463 |    "source": [
464 |     "classifiers = {\n",
465 |     "    'lr': LogisticRegression(regParam=0.03),\n",
466 |     "}"
467 |    ]
468 |   },
469 |   {
470 |    "cell_type": "markdown",
471 |    "metadata": {},
472 |    "source": [
473 |     "Monkey-patch RDDs and DataFrames for context persistence:"
474 |    ]
475 |   },
476 |   {
477 |    "cell_type": "code",
478 |    "execution_count": null,
479 |    "metadata": {
480 |     "collapsed": true
481 |    },
482 |    "outputs": [],
483 |    "source": [
484 |     "import pyspark\n",
485 |     "\n",
486 |     "\n",
487 |     "def enter_method(self):\n",
488 |     "    self.persist()\n",
489 |     "\n",
490 |     "def exit_method(self,exc_type, exc, traceback):\n",
491 |     "    self.unpersist()\n",
492 |     "\n",
493 |     "\n",
494 |     "pyspark.sql.dataframe.DataFrame.__enter__ = enter_method\n",
495 |     "pyspark.sql.dataframe.DataFrame.__exit__ = exit_method"
496 |    ]
497 |   },
498 |   {
499 |    "cell_type": "markdown",
500 |    "metadata": {},
501 |    "source": [
502 |     "Do distributed training:"
503 |    ]
504 |   },
505 |   {
506 |    "cell_type": "code",
507 |    "execution_count": null,
508 |    "metadata": {
509 |     "collapsed": true,
510 |     "scrolled": false
511 |    },
512 |    "outputs": [],
513 |    "source": [
514 |     "import time\n",
515 |     "\n",
516 |     "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n",
517 |     "\n",
518 |     "\n",
519 |     "test_sample = test_samples[-1]\n",
520 |     "\n",
521 |     "evaluator = BinaryClassificationEvaluator(labelCol='label', rawPredictionCol='probability', metricName='areaUnderROC')\n",
522 |     "\n",
523 |     "new_quality_data = []\n",
524 |     "new_data_rows = []\n",
525 |     "\n",
526 |     "test_dfs = dict()\n",
527 |     "\n",
528 |     "logger.info('Loading \"%s\" test samples.', test_sample)\n",
529 |     "test_df = load_ml_data(libsvm_test_template, test_sample)\n",
530 |     "with test_df:\n",
531 |     "    logger.info('Loaded \"%s\" lines.', test_df.count())\n",
532 |     "\n",
533 |     "    for train_sample in train_samples:\n",
534 |     "\n",
535 |     "        logger.info('Working on \"%s\" train sample.', train_sample)\n",
536 |     "\n",
537 |     "        for classifier_name, classifier in classifiers.items():\n",
538 |     "\n",
539 |     "            logger.info('Training a model \"%s\" on sample \"%s\".', classifier_name, train_sample)\n",
540 |     "\n",
541 |     "            logger.info('Loading \"%s\" train samples.', train_sample)\n",
542 |     "            train_df = load_ml_data(libsvm_train_template, train_sample)\n",
543 |     "            with train_df:\n",
544 |     "                logger.info('Loaded \"%s\" lines.', train_df.count())\n",
545 |     "                \n",
546 |     "                logger.info('Training a model \"%s\" on sample \"%s\".', classifier_name, train_sample)\n",
547 |     "\n",
548 |     "                start = time.time()\n",
549 |     "                model = classifier.fit(train_df)\n",
550 |     "                duration = time.time() - start\n",
551 |     "\n",
552 |     "                logger.info('Training a model \"%s\" on sample \"%s\" took \"%g\" seconds.', classifier_name, train_sample, duration)\n",
553 |     "\n",
554 |     "                logger.info('Evaluating the model \"%s\" trained on sample \"%s\".', classifier_name, train_sample)\n",
555 |     "                metrics = evaluate_model(classifier_name + ' - ' + sample_name(train_sample), model, test_df, train=(train_df if train_sample <= 1000000 else None))\n",
556 |     "\n",
557 |     "                test_predictions = model.transform(test_df)\n",
558 |     "                ml_metric_value = evaluator.evaluate(test_predictions)\n",
559 |     "\n",
560 |     "                logger.info(\n",
561 |     "                    'For the model \"%s\" trained on sample \"%s\" metrics are: \"%s\"; ROC AUC calculated by Spark is \"%s\".',\n",
562 |     "                    classifier_name,\n",
563 |     "                    train_sample,\n",
564 |     "                    metrics,\n",
565 |     "                    ml_metric_value,\n",
566 |     "                )\n",
567 |     "\n",
568 |     "                data_row = {\n",
569 |     "                    'Train size': train_sample,\n",
570 |     "                    'ROC AUC': metrics['test_roc_auc'],\n",
571 |     "                    'Log loss': metrics['test_log_loss'],\n",
572 |     "                    'Duration': duration,\n",
573 |     "                    'Engine': classifier_name,\n",
574 |     "                }\n",
575 |     "                new_quality_data.append(data_row)\n",
576 |     "                data_row_string = '\\t'.join(str(data_row[field]) for field in ['Engine', 'Train size', 'ROC AUC', 'Log loss', 'Duration'])\n",
577 |     "                new_data_rows.append(data_row_string)\n",
578 |     "                logger.info('Data row: \"%s\".', data_row_string)"
579 |    ]
580 |   },
581 |   {
582 |    "cell_type": "code",
583 |    "execution_count": null,
584 |    "metadata": {
585 |     "collapsed": true,
586 |     "scrolled": false
587 |    },
588 |    "outputs": [],
589 |    "source": [
590 |     "measurements_df = pandas.DataFrame(new_quality_data).sort_values(by=['Train size'])\n",
591 |     "plot_stuff(measurements_df, 'ROC AUC', logx=True)\n",
592 |     "plot_stuff(measurements_df, 'Log loss', logx=True, ylim=(0.13, 0.18))\n",
593 |     "plot_stuff(measurements_df, 'Duration', loglog=True, ylabel='s')"
594 |    ]
595 |   },
596 |   {
597 |    "cell_type": "code",
598 |    "execution_count": null,
599 |    "metadata": {
600 |     "collapsed": true
601 |    },
602 |    "outputs": [],
603 |    "source": [
604 |     "for row in new_data_rows:\n",
605 |     "    print(row)"
606 |    ]
607 |   },
608 |   {
609 |    "cell_type": "markdown",
610 |    "metadata": {},
611 |    "source": [
612 |     "## End\n",
613 |     "[_(back to toc)_](#Table-of-contents)"
614 |    ]
615 |   },
616 |   {
617 |    "cell_type": "markdown",
618 |    "metadata": {},
619 |    "source": [
620 |     "Work done, stop Spark:"
621 |    ]
622 |   },
623 |   {
624 |    "cell_type": "code",
625 |    "execution_count": null,
626 |    "metadata": {
627 |     "collapsed": true
628 |    },
629 |    "outputs": [],
630 |    "source": [
631 |     "spark.stop()"
632 |    ]
633 |   }
634 |  ],
635 |  "metadata": {
636 |   "kernelspec": {
637 |    "display_name": "Python 3",
638 |    "language": "python",
639 |    "name": "python3"
640 |   },
641 |   "language_info": {
642 |    "codemirror_mode": {
643 |     "name": "ipython",
644 |     "version": 3
645 |    },
646 |    "file_extension": ".py",
647 |    "mimetype": "text/x-python",
648 |    "name": "python",
649 |    "nbconvert_exporter": "python",
650 |    "pygments_lexer": "ipython3",
651 |    "version": "3.6.1"
652 |   }
653 |  },
654 |  "nbformat": 4,
655 |  "nbformat_minor": 2
656 | }
657 | 


--------------------------------------------------------------------------------
/notebooks/experiment_local.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "# Criteo 1 TiB benchmark\n",
   8 |     "\n",
   9 |     "In this experiment we will evalutate a number of machine learning tools on a varying size of train data to determine how fast they learn, how much memory they consume etc.\n",
  10 |     "\n",
  11 |     "We will assess Vowpal Wabbit and XGBoost in local mode, and Spark.ML models in cluster mode.\n",
  12 |     "\n",
  13 |     "We will use terabyte click logs released by Criteo and sample needed amount of data from them.\n",
  14 |     "\n",
  15 |     "This instance of experiment notebook focuses on data preparation and training VW & XGBoost locally.\n",
  16 |     "\n",
  17 |     "Let's go!"
  18 |    ]
  19 |   },
  20 |   {
  21 |    "cell_type": "markdown",
  22 |    "metadata": {},
  23 |    "source": [
  24 |     "# Table of contents\n",
  25 |     "\n",
  26 |     "* [Configuration](#Configuration)\n",
  27 |     "* [Data preparation](#Data-preparation)\n",
  28 |     "  * [Criteo → LibSVM](#Criteo-→-LibSVM)\n",
  29 |     "  * [LibSVM → Train and test (sampling)](#LibSVM-→-Train-and-test-(sampling%29)\n",
  30 |     "  * [LibSVM train and test → VW train and test](#LibSVM-train-and-test-→-VW-train-and-test)\n",
  31 |     "  * [Local data](#Local-data)\n",
  32 |     "* [Local training](#Local-training)"
  33 |    ]
  34 |   },
  35 |   {
  36 |    "cell_type": "code",
  37 |    "execution_count": null,
  38 |    "metadata": {
  39 |     "collapsed": true
  40 |    },
  41 |    "outputs": [],
  42 |    "source": [
  43 |     "%load_ext autotime\n",
  44 |     "%matplotlib inline\n",
  45 |     "\n",
  46 |     "from __future__ import print_function"
  47 |    ]
  48 |   },
  49 |   {
  50 |    "cell_type": "markdown",
  51 |    "metadata": {},
  52 |    "source": [
  53 |     "## Configuration\n",
  54 |     "[_(back to toc)_](#Table-of-contents)"
  55 |    ]
  56 |   },
  57 |   {
  58 |    "cell_type": "markdown",
  59 |    "metadata": {},
  60 |    "source": [
  61 |     "Paths:"
  62 |    ]
  63 |   },
  64 |   {
  65 |    "cell_type": "code",
  66 |    "execution_count": null,
  67 |    "metadata": {},
  68 |    "outputs": [],
  69 |    "source": [
  70 |     "criteo_data_remote_path = 'criteo/plain'\n",
  71 |     "libsvm_data_remote_path = 'criteo/libsvm'\n",
  72 |     "vw_data_remote_path = 'criteo/vw'\n",
  73 |     "\n",
  74 |     "local_data_path = 'criteo/data'\n",
  75 |     "local_results_path = 'criteo/results'\n",
  76 |     "local_runtime_path = 'criteo/runtime'"
  77 |    ]
  78 |   },
  79 |   {
  80 |    "cell_type": "code",
  81 |    "execution_count": null,
  82 |    "metadata": {},
  83 |    "outputs": [],
  84 |    "source": [
  85 |     "import os\n",
  86 |     "\n",
  87 |     "\n",
  88 |     "criteo_day_template = os.path.join(criteo_data_remote_path, 'day_{}')\n",
  89 |     "libsvm_day_template = os.path.join(libsvm_data_remote_path, 'day_{}')\n",
  90 |     "vw_day_template = os.path.join(vw_data_remote_path, 'day_{}')\n",
  91 |     "\n",
  92 |     "libsvm_train_template = os.path.join(libsvm_data_remote_path, 'train', '{}')\n",
  93 |     "libsvm_test_template = os.path.join(libsvm_data_remote_path, 'test', '{}')\n",
  94 |     "vw_train_template = os.path.join(vw_data_remote_path, 'train', '{}')\n",
  95 |     "vw_test_template = os.path.join(vw_data_remote_path, 'test', '{}')\n",
  96 |     "\n",
  97 |     "local_libsvm_test_template = os.path.join(local_data_path, 'data.test.{}.libsvm')\n",
  98 |     "local_libsvm_train_template = os.path.join(local_data_path, 'data.train.{}.libsvm')\n",
  99 |     "local_vw_test_template = os.path.join(local_data_path, 'data.test.{}.vw')\n",
 100 |     "local_vw_train_template = os.path.join(local_data_path, 'data.train.{}.vw')"
 101 |    ]
 102 |   },
 103 |   {
 104 |    "cell_type": "code",
 105 |    "execution_count": null,
 106 |    "metadata": {},
 107 |    "outputs": [],
 108 |    "source": [
 109 |     "def ensure_directory_exists(path):\n",
 110 |     "    if not os.path.exists(path):\n",
 111 |     "        os.makedirs(path)"
 112 |    ]
 113 |   },
 114 |   {
 115 |    "cell_type": "markdown",
 116 |    "metadata": {},
 117 |    "source": [
 118 |     "Days to work on:"
 119 |    ]
 120 |   },
 121 |   {
 122 |    "cell_type": "code",
 123 |    "execution_count": null,
 124 |    "metadata": {},
 125 |    "outputs": [],
 126 |    "source": [
 127 |     "days = list(range(0, 23 + 1))"
 128 |    ]
 129 |   },
 130 |   {
 131 |    "cell_type": "markdown",
 132 |    "metadata": {},
 133 |    "source": [
 134 |     "Samples to take:"
 135 |    ]
 136 |   },
 137 |   {
 138 |    "cell_type": "code",
 139 |    "execution_count": null,
 140 |    "metadata": {},
 141 |    "outputs": [],
 142 |    "source": [
 143 |     "train_samples = [\n",
 144 |     "    10000, 30000,  # tens of thousands\n",
 145 |     "    100000, 300000,  # hundreds of thousands\n",
 146 |     "    1000000, 3000000,  # millions\n",
 147 |     "    10000000, 30000000,  # tens of millions\n",
 148 |     "    100000000, 300000000,  # hundreds of millions\n",
 149 |     "    1000000000, 3000000000,  # billions\n",
 150 |     "]\n",
 151 |     "test_samples = [1000000]"
 152 |    ]
 153 |   },
 154 |   {
 155 |    "cell_type": "markdown",
 156 |    "metadata": {},
 157 |    "source": [
 158 |     "Spark configuration and initialization:"
 159 |    ]
 160 |   },
 161 |   {
 162 |    "cell_type": "code",
 163 |    "execution_count": null,
 164 |    "metadata": {},
 165 |    "outputs": [],
 166 |    "source": [
 167 |     "total_cores = 256"
 168 |    ]
 169 |   },
 170 |   {
 171 |    "cell_type": "code",
 172 |    "execution_count": null,
 173 |    "metadata": {},
 174 |    "outputs": [],
 175 |    "source": [
 176 |     "executor_cores = 4\n",
 177 |     "executor_instances = total_cores / executor_cores\n",
 178 |     "memory_per_core = 4"
 179 |    ]
 180 |   },
 181 |   {
 182 |    "cell_type": "code",
 183 |    "execution_count": null,
 184 |    "metadata": {},
 185 |    "outputs": [],
 186 |    "source": [
 187 |     "app_name = 'Criteo experiment'\n",
 188 |     "\n",
 189 |     "master = 'yarn'\n",
 190 |     "\n",
 191 |     "settings = {\n",
 192 |     "    'spark.network.timeout': '600',\n",
 193 |     "    \n",
 194 |     "    'spark.driver.cores': '16',\n",
 195 |     "    'spark.driver.maxResultSize': '16G',\n",
 196 |     "    'spark.driver.memory': '32G',\n",
 197 |     "    \n",
 198 |     "    'spark.executor.cores': str(executor_cores),\n",
 199 |     "    'spark.executor.instances': str(executor_instances),\n",
 200 |     "    'spark.executor.memory': str(memory_per_core * executor_cores) + 'G',\n",
 201 |     "    \n",
 202 |     "    'spark.speculation': 'true',\n",
 203 |     "    'spark.yarn.queue': 'root.HungerGames',\n",
 204 |     "}"
 205 |    ]
 206 |   },
 207 |   {
 208 |    "cell_type": "code",
 209 |    "execution_count": null,
 210 |    "metadata": {
 211 |     "scrolled": true
 212 |    },
 213 |    "outputs": [],
 214 |    "source": [
 215 |     "from pyspark.sql import SparkSession\n",
 216 |     "\n",
 217 |     "\n",
 218 |     "builder = SparkSession.builder\n",
 219 |     "\n",
 220 |     "builder.appName(app_name)\n",
 221 |     "builder.master(master)\n",
 222 |     "for k, v in settings.items():\n",
 223 |     "    builder.config(k, v)\n",
 224 |     "\n",
 225 |     "spark = builder.getOrCreate()\n",
 226 |     "sc = spark.sparkContext\n",
 227 |     "\n",
 228 |     "sc.setLogLevel('ERROR')"
 229 |    ]
 230 |   },
 231 |   {
 232 |    "cell_type": "markdown",
 233 |    "metadata": {},
 234 |    "source": [
 235 |     "Logging:"
 236 |    ]
 237 |   },
 238 |   {
 239 |    "cell_type": "code",
 240 |    "execution_count": null,
 241 |    "metadata": {},
 242 |    "outputs": [],
 243 |    "source": [
 244 |     "import sys\n",
 245 |     "import logging\n",
 246 |     "reload(logging)\n",
 247 |     "\n",
 248 |     "\n",
 249 |     "handler = logging.StreamHandler(stream=sys.stdout)\n",
 250 |     "formatter = logging.Formatter('[%(asctime)s] %(message)s')\n",
 251 |     "handler.setFormatter(formatter)\n",
 252 |     "\n",
 253 |     "ensure_directory_exists(local_runtime_path)\n",
 254 |     "file_handler = logging.FileHandler(filename=os.path.join(local_runtime_path, 'mylog.log'), mode='a')\n",
 255 |     "file_handler.setFormatter(formatter)\n",
 256 |     "\n",
 257 |     "logger = logging.getLogger()\n",
 258 |     "logger.addHandler(handler)\n",
 259 |     "logger.addHandler(file_handler)\n",
 260 |     "logger.setLevel(logging.INFO)"
 261 |    ]
 262 |   },
 263 |   {
 264 |    "cell_type": "code",
 265 |    "execution_count": null,
 266 |    "metadata": {},
 267 |    "outputs": [],
 268 |    "source": [
 269 |     "logger.info('Spark version: %s.', spark.version)"
 270 |    ]
 271 |   },
 272 |   {
 273 |    "cell_type": "markdown",
 274 |    "metadata": {},
 275 |    "source": [
 276 |     "## Data preparation\n",
 277 |     "[_(back to toc)_](#Table-of-contents)"
 278 |    ]
 279 |   },
 280 |   {
 281 |    "cell_type": "markdown",
 282 |    "metadata": {},
 283 |    "source": [
 284 |     "Poor man's HDFS API:"
 285 |    ]
 286 |   },
 287 |   {
 288 |    "cell_type": "code",
 289 |    "execution_count": null,
 290 |    "metadata": {},
 291 |    "outputs": [],
 292 |    "source": [
 293 |     "def hdfs_exists(path):\n",
 294 |     "    l = !hadoop fs -ls $path 2>/dev/null\n",
 295 |     "    return len(l) != 0\n",
 296 |     "\n",
 297 |     "def hdfs_success(path):\n",
 298 |     "    return hdfs_exists(os.path.join(path, '_SUCCESS'))\n",
 299 |     "\n",
 300 |     "def hdfs_delete(path, recurse=False):\n",
 301 |     "    if recurse:\n",
 302 |     "        _ = !hadoop fs -rm -r $path\n",
 303 |     "    else:\n",
 304 |     "        _ = !hadoop fs -rm $path\n",
 305 |     "\n",
 306 |     "def hdfs_get(remote_path, local_path):\n",
 307 |     "    remote_path_glob = os.path.join(remote_path, 'part-*')\n",
 308 |     "    _ = !hadoop fs -cat $remote_path_glob >$local_path"
 309 |    ]
 310 |   },
 311 |   {
 312 |    "cell_type": "markdown",
 313 |    "metadata": {},
 314 |    "source": [
 315 |     "Load RDDs from one place and save them to another converted:"
 316 |    ]
 317 |   },
 318 |   {
 319 |    "cell_type": "code",
 320 |    "execution_count": null,
 321 |    "metadata": {},
 322 |    "outputs": [],
 323 |    "source": [
 324 |     "def convert_chunked_data(input_path_template, output_path_template, chunks, load_rdd, convert_row, transform_rdd=None):\n",
 325 |     "    for chunk in chunks:\n",
 326 |     "        input_path = input_path_template.format(chunk)\n",
 327 |     "        output_path = output_path_template.format(chunk)\n",
 328 |     "\n",
 329 |     "        if hdfs_success(output_path):\n",
 330 |     "            logger.info('Chunk \"%s\" is already converted and saved to \"%s\", skipping.', chunk, output_path)\n",
 331 |     "            continue\n",
 332 |     "\n",
 333 |     "        logger.info('Reading chunk \"%s\" data from \"%s\".', chunk, input_path)\n",
 334 |     "        rdd = load_rdd(input_path)\n",
 335 |     "\n",
 336 |     "        if hdfs_exists(output_path):\n",
 337 |     "            logger.info('Cleaning \"%s\".', output_path)\n",
 338 |     "            hdfs_delete(output_path, recurse=True)\n",
 339 |     "\n",
 340 |     "        logger.info('Processing and saving to \"%s\".', output_path)\n",
 341 |     "        rdd = rdd.map(convert_row)\n",
 342 |     "        \n",
 343 |     "        if transform_rdd is not None:\n",
 344 |     "            rdd = transform_rdd(rdd)\n",
 345 |     "        \n",
 346 |     "        rdd.saveAsTextFile(output_path)\n",
 347 |     "\n",
 348 |     "        logger.info('Done with chunk \"%s\".', chunk)"
 349 |    ]
 350 |   },
 351 |   {
 352 |    "cell_type": "markdown",
 353 |    "metadata": {},
 354 |    "source": [
 355 |     "### Criteo → LibSVM\n",
 356 |     "[_(back to toc)_](#Table-of-contents)"
 357 |    ]
 358 |   },
 359 |   {
 360 |    "cell_type": "markdown",
 361 |    "metadata": {},
 362 |    "source": [
 363 |     "Criteo RDD is actually a DataFrame:"
 364 |    ]
 365 |   },
 366 |   {
 367 |    "cell_type": "code",
 368 |    "execution_count": null,
 369 |    "metadata": {},
 370 |    "outputs": [],
 371 |    "source": [
 372 |     "def load_criteo_rdd(path):\n",
 373 |     "    return (\n",
 374 |     "        spark\n",
 375 |     "        .read\n",
 376 |     "        .option('header', 'false')\n",
 377 |     "        .option('inferSchema', 'true')\n",
 378 |     "        .option('delimiter', '\\t')\n",
 379 |     "        .csv(path)\n",
 380 |     "        .rdd\n",
 381 |     "    )"
 382 |    ]
 383 |   },
 384 |   {
 385 |    "cell_type": "markdown",
 386 |    "metadata": {},
 387 |    "source": [
 388 |     "Simply add an index to each existing column except the first one which is a target:"
 389 |    ]
 390 |   },
 391 |   {
 392 |    "cell_type": "code",
 393 |    "execution_count": null,
 394 |    "metadata": {},
 395 |    "outputs": [],
 396 |    "source": [
 397 |     "def criteo_to_libsvm(row):\n",
 398 |     "    return (\n",
 399 |     "        str(row[0])\n",
 400 |     "        + ' '\n",
 401 |     "        + ' '.join(\n",
 402 |     "            [\n",
 403 |     "                # integer features\n",
 404 |     "                str(i) + ':' + str(row[i])\n",
 405 |     "                for i in range(1, 13 + 1)\n",
 406 |     "                if row[i] is not None\n",
 407 |     "            ] + [\n",
 408 |     "                # string features converted from hex to int\n",
 409 |     "                str(i) + ':' + str(int(row[i], 16))\n",
 410 |     "                for i in range(14, 39 + 1)\n",
 411 |     "                if row[i] is not None\n",
 412 |     "            ]\n",
 413 |     "        )\n",
 414 |     "    )"
 415 |    ]
 416 |   },
 417 |   {
 418 |    "cell_type": "markdown",
 419 |    "metadata": {},
 420 |    "source": [
 421 |     "Do it for all days:"
 422 |    ]
 423 |   },
 424 |   {
 425 |    "cell_type": "code",
 426 |    "execution_count": null,
 427 |    "metadata": {
 428 |     "collapsed": true
 429 |    },
 430 |    "outputs": [],
 431 |    "source": [
 432 |     "convert_chunked_data(criteo_day_template, libsvm_day_template, days, load_criteo_rdd, criteo_to_libsvm)"
 433 |    ]
 434 |   },
 435 |   {
 436 |    "cell_type": "markdown",
 437 |    "metadata": {},
 438 |    "source": [
 439 |     "### LibSVM → Train and test (sampling)\n",
 440 |     "[_(back to toc)_](#Table-of-contents)"
 441 |    ]
 442 |   },
 443 |   {
 444 |    "cell_type": "markdown",
 445 |    "metadata": {},
 446 |    "source": [
 447 |     "Let's name samples as their shortened \"engineering\" notation - e.g. 1e5 is 100k etc.:"
 448 |    ]
 449 |   },
 450 |   {
 451 |    "cell_type": "code",
 452 |    "execution_count": null,
 453 |    "metadata": {},
 454 |    "outputs": [],
 455 |    "source": [
 456 |     "def sample_name(sample):\n",
 457 |     "    return str(sample)[::-1].replace('000', 'k')[::-1]"
 458 |    ]
 459 |   },
 460 |   {
 461 |    "cell_type": "markdown",
 462 |    "metadata": {},
 463 |    "source": [
 464 |     "Load data, sample a bit more than needed and cut at exact desired number of lines by zipping with index and filtering upto required index:"
 465 |    ]
 466 |   },
 467 |   {
 468 |    "cell_type": "code",
 469 |    "execution_count": null,
 470 |    "metadata": {},
 471 |    "outputs": [],
 472 |    "source": [
 473 |     "oversample = 1.03\n",
 474 |     "sampled_partitions = 256\n",
 475 |     "\n",
 476 |     "\n",
 477 |     "def sample_and_save(input_path_template, output_path_template, days, samples):\n",
 478 |     "    union = None\n",
 479 |     "    union_count = None\n",
 480 |     "    \n",
 481 |     "    for sample in samples:\n",
 482 |     "        name = sample_name(sample)\n",
 483 |     "        output_path = output_path_template.format(name)\n",
 484 |     "        \n",
 485 |     "        if hdfs_success(output_path):\n",
 486 |     "            logger.info('Sample \"%s\" is already written to \"%s\", skipping.', sample, output_path)\n",
 487 |     "            continue\n",
 488 |     "            \n",
 489 |     "        logger.info('Preparing to write sample to \"%s\".', output_path)\n",
 490 |     "        \n",
 491 |     "        if union is None:\n",
 492 |     "            rdds = map(lambda day: sc.textFile(input_path_template.format(day)), days)\n",
 493 |     "            union = reduce(lambda left, right: left.union(right), rdds)\n",
 494 |     "\n",
 495 |     "            union_count = union.count()\n",
 496 |     "            logger.info('Total number of lines for days \"%s\" is \"%s\".', days, union_count)\n",
 497 |     "            \n",
 498 |     "        ratio = float(sample) / union_count\n",
 499 |     "        \n",
 500 |     "        sampled_union = (\n",
 501 |     "            union\n",
 502 |     "            .sample(False, min(1.0, oversample * ratio))\n",
 503 |     "            .zipWithIndex()\n",
 504 |     "            .filter(lambda z: z[1] < sample)\n",
 505 |     "            .map(lambda z: z[0])\n",
 506 |     "        )\n",
 507 |     "        \n",
 508 |     "        if hdfs_exists(output_path):\n",
 509 |     "            logger.info('Cleaning \"%s\".', output_path)\n",
 510 |     "            hdfs_delete(output_path, recurse=True)\n",
 511 |     "            \n",
 512 |     "        logger.info('Writing sample \"%s\" to \"%s\".', sample, output_path)\n",
 513 |     "        sampled_union.coalesce(sampled_partitions).saveAsTextFile(output_path)\n",
 514 |     "        \n",
 515 |     "        logger.info('Saved \"%s\" lines to \"%s\".', sc.textFile(output_path).count(), output_path)"
 516 |    ]
 517 |   },
 518 |   {
 519 |    "cell_type": "markdown",
 520 |    "metadata": {},
 521 |    "source": [
 522 |     "Sample all LibSVM data:"
 523 |    ]
 524 |   },
 525 |   {
 526 |    "cell_type": "code",
 527 |    "execution_count": null,
 528 |    "metadata": {},
 529 |    "outputs": [],
 530 |    "source": [
 531 |     "sample_and_save(libsvm_day_template, libsvm_test_template, days[-1:], test_samples)"
 532 |    ]
 533 |   },
 534 |   {
 535 |    "cell_type": "code",
 536 |    "execution_count": null,
 537 |    "metadata": {},
 538 |    "outputs": [],
 539 |    "source": [
 540 |     "sample_and_save(libsvm_day_template, libsvm_train_template, days[:-1], train_samples)"
 541 |    ]
 542 |   },
 543 |   {
 544 |    "cell_type": "markdown",
 545 |    "metadata": {},
 546 |    "source": [
 547 |     "### LibSVM train and test → VW train and test\n",
 548 |     "[_(back to toc)_](#Table-of-contents)"
 549 |    ]
 550 |   },
 551 |   {
 552 |    "cell_type": "markdown",
 553 |    "metadata": {},
 554 |    "source": [
 555 |     "LibSVM RDD is a text file:"
 556 |    ]
 557 |   },
 558 |   {
 559 |    "cell_type": "code",
 560 |    "execution_count": null,
 561 |    "metadata": {},
 562 |    "outputs": [],
 563 |    "source": [
 564 |     "def load_libsvm_rdd(path):\n",
 565 |     "    return sc.textFile(path)"
 566 |    ]
 567 |   },
 568 |   {
 569 |    "cell_type": "markdown",
 570 |    "metadata": {},
 571 |    "source": [
 572 |     "Conversion is trivial - we only have to map target to {-1, 1} and convert categorical features to VW feature names as a whole:"
 573 |    ]
 574 |   },
 575 |   {
 576 |    "cell_type": "code",
 577 |    "execution_count": null,
 578 |    "metadata": {},
 579 |    "outputs": [],
 580 |    "source": [
 581 |     "def libsvm_to_vw(line):\n",
 582 |     "    parts = line.split(' ')\n",
 583 |     "    parts[0] = '1 |' if parts[0] == '1' else '-1 |'\n",
 584 |     "    for i in range(1, len(parts)):\n",
 585 |     "        index, _, value = parts[i].partition(':')\n",
 586 |     "        if int(index) >= 14:\n",
 587 |     "            parts[i] = index + '_' + value\n",
 588 |     "    return ' '.join(parts)"
 589 |    ]
 590 |   },
 591 |   {
 592 |    "cell_type": "markdown",
 593 |    "metadata": {},
 594 |    "source": [
 595 |     "Also, data for VW should be well shuffled:"
 596 |    ]
 597 |   },
 598 |   {
 599 |    "cell_type": "code",
 600 |    "execution_count": null,
 601 |    "metadata": {},
 602 |    "outputs": [],
 603 |    "source": [
 604 |     "import hashlib\n",
 605 |     "\n",
 606 |     "\n",
 607 |     "def calculate_hash(something):\n",
 608 |     "    m = hashlib.md5()\n",
 609 |     "    m.update(str(something))\n",
 610 |     "    return m.hexdigest()\n",
 611 |     "\n",
 612 |     "def random_sort(rdd):\n",
 613 |     "    return (\n",
 614 |     "        rdd\n",
 615 |     "        .zipWithIndex()\n",
 616 |     "        .sortBy(lambda z: calculate_hash(z[1]))\n",
 617 |     "        .map(lambda z: z[0])\n",
 618 |     "    )"
 619 |    ]
 620 |   },
 621 |   {
 622 |    "cell_type": "markdown",
 623 |    "metadata": {},
 624 |    "source": [
 625 |     "Convert all LibSVM samples:"
 626 |    ]
 627 |   },
 628 |   {
 629 |    "cell_type": "code",
 630 |    "execution_count": null,
 631 |    "metadata": {},
 632 |    "outputs": [],
 633 |    "source": [
 634 |     "convert_chunked_data(libsvm_test_template, vw_test_template, [sample_name(sample) for sample in test_samples], load_libsvm_rdd, libsvm_to_vw, transform_rdd=random_sort)"
 635 |    ]
 636 |   },
 637 |   {
 638 |    "cell_type": "code",
 639 |    "execution_count": null,
 640 |    "metadata": {
 641 |     "scrolled": false
 642 |    },
 643 |    "outputs": [],
 644 |    "source": [
 645 |     "convert_chunked_data(libsvm_train_template, vw_train_template, [sample_name(sample) for sample in train_samples], load_libsvm_rdd, libsvm_to_vw, transform_rdd=random_sort)"
 646 |    ]
 647 |   },
 648 |   {
 649 |    "cell_type": "markdown",
 650 |    "metadata": {},
 651 |    "source": [
 652 |     "Spark is no longer needed:"
 653 |    ]
 654 |   },
 655 |   {
 656 |    "cell_type": "code",
 657 |    "execution_count": null,
 658 |    "metadata": {
 659 |     "collapsed": true
 660 |    },
 661 |    "outputs": [],
 662 |    "source": [
 663 |     "spark.stop()"
 664 |    ]
 665 |   },
 666 |   {
 667 |    "cell_type": "markdown",
 668 |    "metadata": {},
 669 |    "source": [
 670 |     "### Local data\n",
 671 |     "[_(back to toc)_](#Table-of-contents)"
 672 |    ]
 673 |   },
 674 |   {
 675 |    "cell_type": "markdown",
 676 |    "metadata": {},
 677 |    "source": [
 678 |     "Download all sampled data to local directory:"
 679 |    ]
 680 |   },
 681 |   {
 682 |    "cell_type": "code",
 683 |    "execution_count": null,
 684 |    "metadata": {},
 685 |    "outputs": [],
 686 |    "source": [
 687 |     "ensure_directory_exists(local_data_path)"
 688 |    ]
 689 |   },
 690 |   {
 691 |    "cell_type": "code",
 692 |    "execution_count": null,
 693 |    "metadata": {},
 694 |    "outputs": [],
 695 |    "source": [
 696 |     "def count_lines(path):\n",
 697 |     "    with open(path) as f:\n",
 698 |     "        for i, _ in enumerate(f):\n",
 699 |     "            pass\n",
 700 |     "    return i + 1\n",
 701 |     "\n",
 702 |     "def download_data(remote_template, local_template, samples):\n",
 703 |     "    for sample in samples:\n",
 704 |     "        name = sample_name(sample)\n",
 705 |     "        remote_path = remote_template.format(name)\n",
 706 |     "        local_path = local_template.format(name)\n",
 707 |     "        if os.path.exists(local_path):\n",
 708 |     "            count = count_lines(local_path)\n",
 709 |     "            if count == sample:\n",
 710 |     "                logger.info('File \"%s\" is already loaded, skipping.', local_path)\n",
 711 |     "                continue\n",
 712 |     "            else:\n",
 713 |     "                logger.info('File \"%s\" already exists but number of lines \"%s\" is wrong (must be \"%s\"), reloading.', local_path, count, sample)\n",
 714 |     "        logger.info('Loading file \"%s\" as local file \"%s\".', remote_path, local_path)\n",
 715 |     "        hdfs_get(remote_path, local_path)\n",
 716 |     "        count = count_lines(local_path)\n",
 717 |     "        logger.info('File loaded to \"%s\", number of lines is \"%s\".', local_path, count)\n",
 718 |     "        assert count == sample, 'File \"{}\" contains wrong number of lines \"{}\" (must be \"{}\").'.format(local_path, count, sample)"
 719 |    ]
 720 |   },
 721 |   {
 722 |    "cell_type": "code",
 723 |    "execution_count": null,
 724 |    "metadata": {},
 725 |    "outputs": [],
 726 |    "source": [
 727 |     "download_data(libsvm_test_template, local_libsvm_test_template, test_samples)"
 728 |    ]
 729 |   },
 730 |   {
 731 |    "cell_type": "code",
 732 |    "execution_count": null,
 733 |    "metadata": {},
 734 |    "outputs": [],
 735 |    "source": [
 736 |     "download_data(libsvm_train_template, local_libsvm_train_template, train_samples)"
 737 |    ]
 738 |   },
 739 |   {
 740 |    "cell_type": "code",
 741 |    "execution_count": null,
 742 |    "metadata": {},
 743 |    "outputs": [],
 744 |    "source": [
 745 |     "download_data(vw_test_template, local_vw_test_template, test_samples)"
 746 |    ]
 747 |   },
 748 |   {
 749 |    "cell_type": "code",
 750 |    "execution_count": null,
 751 |    "metadata": {},
 752 |    "outputs": [],
 753 |    "source": [
 754 |     "download_data(vw_train_template, local_vw_train_template, train_samples)"
 755 |    ]
 756 |   },
 757 |   {
 758 |    "cell_type": "markdown",
 759 |    "metadata": {},
 760 |    "source": [
 761 |     "## Local training\n",
 762 |     "[_(back to toc)_](#Table-of-contents)"
 763 |    ]
 764 |   },
 765 |   {
 766 |    "cell_type": "markdown",
 767 |    "metadata": {},
 768 |    "source": [
 769 |     "Measuring model quality and ML engine technical metrics:"
 770 |    ]
 771 |   },
 772 |   {
 773 |    "cell_type": "code",
 774 |    "execution_count": null,
 775 |    "metadata": {},
 776 |    "outputs": [],
 777 |    "source": [
 778 |     "import sys \n",
 779 |     "from matplotlib import pyplot\n",
 780 |     "from sklearn.metrics import (\n",
 781 |     "    auc,\n",
 782 |     "    log_loss,\n",
 783 |     "    roc_curve,\n",
 784 |     ")\n",
 785 |     "\n",
 786 |     "\n",
 787 |     "def measure(engine, sample, test_file, time_file, predictions_file):\n",
 788 |     "    \n",
 789 |     "    def get_last_in_line(s):\n",
 790 |     "        return s.rstrip().split( )[-1]\n",
 791 |     "\n",
 792 |     "    def parse_elapsed_time(s):\n",
 793 |     "        return reduce(lambda a, b: a * 60 + b, map(float, get_last_in_line(s).split(':')))\n",
 794 |     "\n",
 795 |     "    def parse_max_memory(s):\n",
 796 |     "        return int(get_last_in_line(s)) * 1024\n",
 797 |     "\n",
 798 |     "    def parse_cpu(s):\n",
 799 |     "        return float(get_last_in_line(s).rstrip('%')) / 100 \n",
 800 |     "\n",
 801 |     "\n",
 802 |     "    elapsed = -1\n",
 803 |     "    memory = -1\n",
 804 |     "    cpu = -1\n",
 805 |     "\n",
 806 |     "    with open(time_file, 'rb') as f:\n",
 807 |     "        for line in f:\n",
 808 |     "            if 'Elapsed (wall clock) time' in line:\n",
 809 |     "                elapsed = parse_elapsed_time(line)\n",
 810 |     "            elif 'Maximum resident set size' in line:\n",
 811 |     "                memory = parse_max_memory(line)\n",
 812 |     "            elif 'Percent of CPU' in line:\n",
 813 |     "                cpu = parse_cpu(line)\n",
 814 |     "\n",
 815 |     "    with open(test_file, 'rb') as f:\n",
 816 |     "        labels = [line.rstrip().split(' ')[0] == '1' for line in f]\n",
 817 |     "\n",
 818 |     "    with open(predictions_file, 'rb') as f:\n",
 819 |     "        scores = [float(line.rstrip().split(' ')[0]) for line in f]\n",
 820 |     "\n",
 821 |     "    fpr, tpr, _ = roc_curve(labels, scores)\n",
 822 |     "    roc_auc = auc(fpr, tpr)\n",
 823 |     "    ll = log_loss(labels, scores)\n",
 824 |     "    \n",
 825 |     "    figure = pyplot.figure(figsize=(6, 6))\n",
 826 |     "    pyplot.plot(fpr, tpr, linewidth=2.0)\n",
 827 |     "    pyplot.plot([0, 1], [0, 1], 'k--')\n",
 828 |     "    pyplot.xlabel('FPR')\n",
 829 |     "    pyplot.ylabel('TPR')\n",
 830 |     "    pyplot.title('{} {} - {:.3f} ROC AUC'.format(engine, sample_name(sample), roc_auc))\n",
 831 |     "    pyplot.show()\n",
 832 |     "\n",
 833 |     "    return {\n",
 834 |     "        'Engine': engine,\n",
 835 |     "        'Train size': sample,\n",
 836 |     "        'ROC AUC': roc_auc,\n",
 837 |     "        'Log loss': ll,\n",
 838 |     "        'Train time': elapsed,\n",
 839 |     "        'Maximum memory': memory,\n",
 840 |     "        'CPU load': cpu,\n",
 841 |     "    }"
 842 |    ]
 843 |   },
 844 |   {
 845 |    "cell_type": "markdown",
 846 |    "metadata": {},
 847 |    "source": [
 848 |     "Settings for VW & XGBoost and how to run them; I use (a little bit patched for correctness sake) GNU Time to measure running time, CPU load and memory consumption; configurations for VW & XGBoost are obtained via Hyperopt:"
 849 |    ]
 850 |   },
 851 |   {
 852 |    "cell_type": "code",
 853 |    "execution_count": null,
 854 |    "metadata": {},
 855 |    "outputs": [],
 856 |    "source": [
 857 |     "def get_time_command_and_file(train_file):\n",
 858 |     "    time_file = train_file + '.time'\n",
 859 |     "    return [\n",
 860 |     "        '/usr/local/bin/time',\n",
 861 |     "        '-v',\n",
 862 |     "        '--output=' + time_file,\n",
 863 |     "    ], time_file\n",
 864 |     "\n",
 865 |     "def get_vw_commands_and_predictions_file(train_file, test_file):\n",
 866 |     "    model_file = train_file + '.model'\n",
 867 |     "    predictions_file = test_file + '.predictions'\n",
 868 |     "    return [\n",
 869 |     "        'vw83',\n",
 870 |     "        '--link=logistic',\n",
 871 |     "        '--loss_function=logistic',\n",
 872 |     "        '-b', '29',\n",
 873 |     "        '-l', '0.3',\n",
 874 |     "        '--initial_t', '1',\n",
 875 |     "        '--decay_learning_rate', '0.5',\n",
 876 |     "        '--power_t', '0.5',\n",
 877 |     "        '--l1', '1e-15',\n",
 878 |     "        '--l2', '0',\n",
 879 |     "        '-d', train_file,\n",
 880 |     "        '-f', model_file,\n",
 881 |     "    ], [\n",
 882 |     "        'vw83',\n",
 883 |     "        '--loss_function=logistic',\n",
 884 |     "        '-t',\n",
 885 |     "        '-i', model_file,\n",
 886 |     "        '-d', test_file,\n",
 887 |     "        '-p', predictions_file,\n",
 888 |     "    ], predictions_file\n",
 889 |     "\n",
 890 |     "\n",
 891 |     "xgboost_conf = [\n",
 892 |     "    'booster = gbtree',\n",
 893 |     "    'objective = binary:logistic',\n",
 894 |     "    'nthread = 24',\n",
 895 |     "    'eval_metric = logloss',\n",
 896 |     "    'max_depth = 7',\n",
 897 |     "    'num_round = 200',\n",
 898 |     "    'eta = 0.2',\n",
 899 |     "    'gamma = 0.4',\n",
 900 |     "    'subsample = 0.8',\n",
 901 |     "    'colsample_bytree = 0.8',\n",
 902 |     "    'min_child_weight = 20',\n",
 903 |     "    'alpha = 3',\n",
 904 |     "    'lambda = 100',\n",
 905 |     "]\n",
 906 |     "\n",
 907 |     "\n",
 908 |     "def get_xgboost_commands_and_predictions_file(train_file, test_file, cache=False):\n",
 909 |     "    config_file = os.path.join(local_runtime_path, 'xgb.conf')\n",
 910 |     "    ensure_directory_exists(local_runtime_path)\n",
 911 |     "    with open(config_file, 'wb') as f:\n",
 912 |     "        for line in xgboost_conf:\n",
 913 |     "            print(line, file=f)\n",
 914 |     "    model_file = train_file + '.model'\n",
 915 |     "    predictions_file = test_file + '.predictions'\n",
 916 |     "    if cache:\n",
 917 |     "        train_file = train_file + '#' + train_file + '.cache'\n",
 918 |     "    return [\n",
 919 |     "        'xgboost',\n",
 920 |     "        config_file,\n",
 921 |     "        'data=' + train_file,\n",
 922 |     "        'model_out=' + model_file,\n",
 923 |     "    ], [\n",
 924 |     "        'xgboost',\n",
 925 |     "        config_file,\n",
 926 |     "        'task=pred',\n",
 927 |     "        'test:data=' + test_file,\n",
 928 |     "        'model_in=' + model_file,\n",
 929 |     "        'name_pred=' + predictions_file,\n",
 930 |     "    ], predictions_file\n",
 931 |     "\n",
 932 |     "def get_xgboost_ooc_commands_and_predictions_file(train_file, test_file):\n",
 933 |     "    return get_xgboost_commands_and_predictions_file(train_file, test_file, cache=True)"
 934 |    ]
 935 |   },
 936 |   {
 937 |    "cell_type": "code",
 938 |    "execution_count": null,
 939 |    "metadata": {},
 940 |    "outputs": [],
 941 |    "source": [
 942 |     "engines = {\n",
 943 |     "    'vw': (get_vw_commands_and_predictions_file, local_vw_train_template, local_vw_test_template),\n",
 944 |     "    'xgb': (get_xgboost_commands_and_predictions_file, local_libsvm_train_template, local_libsvm_test_template),\n",
 945 |     "    'xgb.ooc': (get_xgboost_ooc_commands_and_predictions_file, local_libsvm_train_template, local_libsvm_test_template),\n",
 946 |     "}"
 947 |    ]
 948 |   },
 949 |   {
 950 |    "cell_type": "markdown",
 951 |    "metadata": {},
 952 |    "source": [
 953 |     "Train & test everything:"
 954 |    ]
 955 |   },
 956 |   {
 957 |    "cell_type": "code",
 958 |    "execution_count": null,
 959 |    "metadata": {
 960 |     "scrolled": false
 961 |    },
 962 |    "outputs": [],
 963 |    "source": [
 964 |     "import subprocess\n",
 965 |     "\n",
 966 |     "\n",
 967 |     "measurements = []\n",
 968 |     "\n",
 969 |     "for sample in train_samples:\n",
 970 |     "    for engine in engines:\n",
 971 |     "        logger.info('Training \"%s\" on \"%s\" lines of data.', engine, sample)\n",
 972 |     "        \n",
 973 |     "        get_commands_and_predictions_file, train_template, test_template = engines[engine]\n",
 974 |     "\n",
 975 |     "        train_file = train_template.format(sample_name(sample))\n",
 976 |     "        test_file = test_template.format(sample_name(test_samples[0]))\n",
 977 |     "        logger.info('Will train on \"%s\" and test on \"%s\".', train_file, test_file)\n",
 978 |     "\n",
 979 |     "        command_time, time_file = get_time_command_and_file(train_file)\n",
 980 |     "        command_engine_train, command_engine_test, predictions_file = get_commands_and_predictions_file(train_file, test_file)\n",
 981 |     "\n",
 982 |     "        logger.info('Performing train.')\n",
 983 |     "        subprocess.call(command_time + command_engine_train)\n",
 984 |     "\n",
 985 |     "        logger.info('Performing test.')\n",
 986 |     "        subprocess.call(command_engine_test)\n",
 987 |     "\n",
 988 |     "        logger.info('Measuring results.')\n",
 989 |     "        measurement = measure(engine, sample, test_file, time_file, predictions_file)\n",
 990 |     "        logger.info(measurement)\n",
 991 |     "        measurements.append(measurement)"
 992 |    ]
 993 |   },
 994 |   {
 995 |    "cell_type": "markdown",
 996 |    "metadata": {},
 997 |    "source": [
 998 |     "Load measurements:"
 999 |    ]
1000 |   },
1001 |   {
1002 |    "cell_type": "code",
1003 |    "execution_count": null,
1004 |    "metadata": {},
1005 |    "outputs": [],
1006 |    "source": [
1007 |     "import pandas\n",
1008 |     "\n",
1009 |     "\n",
1010 |     "measurements_df = pandas.DataFrame(measurements).sort_values(by=['Engine', 'Train size'])\n",
1011 |     "measurements_df"
1012 |    ]
1013 |   },
1014 |   {
1015 |    "cell_type": "markdown",
1016 |    "metadata": {},
1017 |    "source": [
1018 |     "Plot measurements:"
1019 |    ]
1020 |   },
1021 |   {
1022 |    "cell_type": "code",
1023 |    "execution_count": null,
1024 |    "metadata": {
1025 |     "scrolled": false
1026 |    },
1027 |    "outputs": [],
1028 |    "source": [
1029 |     "def extract_data_for_plotting(df, what):\n",
1030 |     "    return reduce(\n",
1031 |     "        lambda left, right: pandas.merge(left, right, how='outer', on='Train size'),\n",
1032 |     "        map(\n",
1033 |     "            lambda name: df[df.Engine == name][['Train size', what]].rename(columns={what: name}),\n",
1034 |     "            df.Engine.unique(),\n",
1035 |     "        ),\n",
1036 |     "    )   \n",
1037 |     "\n",
1038 |     "def plot_stuff(df, what, ylabel=None, **kwargs):\n",
1039 |     "    data = extract_data_for_plotting(df, what).set_index('Train size')\n",
1040 |     "    ax = data.plot(marker='o', figsize=(6, 6), title=what, grid=True, linewidth=2.0, **kwargs)  # xlim=(1e4, 4e9)\n",
1041 |     "    if ylabel is not None:\n",
1042 |     "        ax.set_ylabel(ylabel)\n",
1043 |     "\n",
1044 |     "\n",
1045 |     "plot_stuff(measurements_df, 'ROC AUC', logx=True)\n",
1046 |     "plot_stuff(measurements_df, 'Log loss', logx=True)\n",
1047 |     "plot_stuff(measurements_df, 'Train time', loglog=True, ylabel='s')\n",
1048 |     "plot_stuff(measurements_df, 'Maximum memory', loglog=True, ylabel='bytes')\n",
1049 |     "plot_stuff(measurements_df, 'CPU load', logx=True)"
1050 |    ]
1051 |   },
1052 |   {
1053 |    "cell_type": "code",
1054 |    "execution_count": null,
1055 |    "metadata": {
1056 |     "collapsed": true
1057 |    },
1058 |    "outputs": [],
1059 |    "source": []
1060 |   }
1061 |  ],
1062 |  "metadata": {
1063 |   "kernelspec": {
1064 |    "display_name": "Python 2",
1065 |    "language": "python",
1066 |    "name": "python2"
1067 |   },
1068 |   "language_info": {
1069 |    "codemirror_mode": {
1070 |     "name": "ipython",
1071 |     "version": 2
1072 |    },
1073 |    "file_extension": ".py",
1074 |    "mimetype": "text/x-python",
1075 |    "name": "python",
1076 |    "nbconvert_exporter": "python",
1077 |    "pygments_lexer": "ipython2",
1078 |    "version": "2.7.9"
1079 |   }
1080 |  },
1081 |  "nbformat": 4,
1082 |  "nbformat_minor": 2
1083 | }
1084 | 


--------------------------------------------------------------------------------