├── python
    ├── figcrop_single.sh
    ├── bid_strategy.py
    ├── mcpc_bid.py
    ├── extract_click_mp.py
    ├── model.py
    ├── integrate_click.py
    ├── make_output.py
    ├── opt_bid.py
    ├── interval_landscape.py
    ├── config.py
    ├── rr_model.py
    ├── statistics.py
    ├── sqlr_model.py
    ├── bid_landscape.py
    ├── integrate_performance.py
    ├── eu_model.py
    ├── tool.py
    ├── test_lr.py
    ├── test_sqlr.py
    ├── test_lrlin.py
    ├── test_rr.py
    ├── test_eu.py
    ├── dataset.py
    ├── replay.py
    ├── test_em.py
    ├── lr_model.py
    └── em_model.py
├── scripts
    ├── pinyou_stage2.sh
    ├── run-sqlr.sh
    ├── run-lr.sh
    ├── run-eu.sh
    ├── replay.sh
    ├── run-rr.sh
    ├── pinyou_stage2_lrlin.sh
    ├── integrate_performance.sh
    ├── run-em.sh
    └── pinyou_stage1.sh
└── README.md


/python/figcrop_single.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Crop a PDF in place: pdfcrop reads FILE and writes the result back to FILE.
3 | # Usage: figcrop_single.sh FILE
4 | file=${1:?usage: figcrop_single.sh FILE}
5 | # Quoted so that paths containing spaces or glob characters stay one argument.
6 | pdfcrop "$file" "$file"
7 | 


--------------------------------------------------------------------------------
/scripts/pinyou_stage2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Start one background run-em.sh job per value below; each job's stdout goes
3 | # to ../output/<value>.log. The script does not wait for the jobs to finish.
4 | 
5 | for budget in 128 64 32 16 8 4 2 1; do
6 | 	sh run-em.sh "$budget" > "../output/${budget}.log" &
7 | done


--------------------------------------------------------------------------------
/scripts/run-sqlr.sh:
--------------------------------------------------------------------------------
1 | camps=$1
2 | steps="1 10 20 30 40 50 60"
3 | 
4 | for camp in $camps; do
5 | 	for step in $steps; do
6 | 		echo $camp $step
7 | 		python ../python/test_sqlr.py $camp $step
8 | 	done
9 | done


--------------------------------------------------------------------------------
/scripts/run-lr.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Run test_lr.py for every campaign id in $1 (one space-separated argument)
 3 | # combined with every value of the fixed grid below, which is handed to
 4 | # test_lr.py as its second argument.
 5 | echo "run"
 6 | camps=$1
 7 | steps="1E-4 1E-5 1E-3 5E-3 5E-5 5E-4 1E-2 5E-2 1E-1"
 8 | # $camps and $steps are left unquoted on purpose: each is a list to split.
 9 | for camp in $camps; do
10 | 	for step in $steps; do
11 | 		# Log both parameters so the output that follows can be tied to one run.
12 | 		echo "$camp" "$step"
13 | 		python ../python/test_lr.py "$camp" "$step"
14 | 	done
15 | done
16 | 


--------------------------------------------------------------------------------
/python/bid_strategy.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | class BidStrategy:  # base class; McpcBid and OptBid derive from it
 4 | 	'''Bid strategy with CTR estimation module and bid function module.'''
 5 | 
 6 | 	def __init__(self, parameters):  # `parameters` is accepted but not stored
 7 | 		pass
 8 | 
 9 | 	def bid(self):  # placeholder that returns None; the subclasses define bid(self, ctr)
10 | 		pass
11 | 
12 | def main():  # stub entry point: prints a fixed message
13 | 	print "main method."
14 | 
15 | if __name__ == '__main__':
16 | 	main()


--------------------------------------------------------------------------------
/scripts/run-eu.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Grid sweep for test_eu.py: every campaign id in $1 (one space-separated
 3 | # argument) is combined with each laplace, scale and ratio value below.
 4 | laplace_grid="3"
 5 | scale_grid="10 20 30 40 60 80 90"
 6 | ratio_grid="0 0.1 0.5"
 7 | 
 8 | # The list variables are left unquoted on purpose so that they split.
 9 | for campaign in $1; do
10 | 	for l in $laplace_grid; do
11 | 		for s in $scale_grid; do
12 | 			for r in $ratio_grid; do
13 | 				echo "$campaign" "$l" "$s" "$r"
14 | 				python ../python/test_eu.py "$campaign" "$l" "$s" "$r"
15 | 			done
16 | 		done
17 | 	done
18 | done


--------------------------------------------------------------------------------
/scripts/replay.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Run replay.py once per budget proportion for a single model.
 3 | # Usage: replay.sh NAME MODEL TEST_FILE OUTPUT_FOLDER
 4 | #   NAME only appears in the output file name; MODEL and TEST_FILE are
 5 | #   handed to replay.py unchanged. Relative paths are resolved after the cd.
 6 | # OPT_CTR_BIDDING_HOME overrides the default checkout location.
 7 | project_folder="${OPT_CTR_BIDDING_HOME:-/home/rk/Code/optimal-ctr-bidding}"
 8 | # Stop if the checkout is missing; otherwise replay.py would be looked up
 9 | # in whatever directory the caller started from.
10 | cd "$project_folder/python" || exit 1
11 | budget_props="1 4 8 16 32"
12 | 
13 | name=$1
14 | model=$2
15 | test_file=$3
16 | output_folder=$4
17 | 
18 | echo "$name"
19 | echo "$model"
20 | echo "$test_file"
21 | echo "$output_folder"
22 | 
23 | # $budget_props is left unquoted on purpose: it is a list to split.
24 | for prop in $budget_props; do
25 | 	echo "$prop"
26 | 	python replay.py -1 "$prop" "$model" "$test_file" "${output_folder}/${prop}_${name}_perf.csv"
27 | done
28 | 


--------------------------------------------------------------------------------
/python/mcpc_bid.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | from bid_strategy import BidStrategy
 3 | 
 4 | class McpcBid(BidStrategy):  # bid grows linearly with the estimated CTR
 5 | 	def __init__(self, camp_v):  # only stores camp_v; BidStrategy.__init__ is not called
 6 | 		self.camp_v = camp_v
 7 | 
 8 | 	def set_camp_value(self, v):  # replace the multiplier used by bid()
 9 | 		self.camp_v = v
10 | 
11 | 	def bid(self, ctr):  # returns camp_v * ctr * 1000, truncated by int()
12 | 		bid_price = int(self.camp_v * ctr * 1E3)
13 | # 		print "bid price \t" + `bid_price`
14 | 		return bid_price
15 | 
16 | def main():  # stub entry point: prints a fixed message
17 | 	print "main method."
18 | 
19 | if __name__ == '__main__':
20 | 	main()


--------------------------------------------------------------------------------
/scripts/run-rr.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Grid sweep for test_rr.py: every campaign id in $1 (one space-separated
 3 | # argument) is combined with each laplace, scale and ratio value below.
 4 | laplace_grid="3 10"
 5 | scale_grid="0.005 0.01 0.02 0.03 0.04 0.06 0.08 0.1 0.2 0.5"  # earlier grid: "0.005 0.01 0.012 0.016 0.032 0.05 0.064 0.128 0.256 0.512 1.0"
 6 | ratio_grid="0.1 0.01 0.5"  # earlier grid: "0"
 7 | 
 8 | # The list variables are left unquoted on purpose so that they split.
 9 | for campaign in $1; do
10 | 	for l in $laplace_grid; do
11 | 		for s in $scale_grid; do
12 | 			for r in $ratio_grid; do
13 | 				echo "$campaign" "$l" "$s" "$r"
14 | 				python ../python/test_rr.py "$campaign" "$l" "$s" "$r"
15 | 			done
16 | 		done
17 | 	done
18 | done
19 | 


--------------------------------------------------------------------------------
/python/extract_click_mp.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import random
 3 | import math
 4 | import operator
 5 | import tool
 6 | 
 7 | if len(sys.argv) < 3:  # two arguments are needed: input file and output file
 8 | 	print "python extract_click_mp.py test.yzx.txt camp_click.txt"
 9 | 	exit(-1)
10 | 
11 | test_file = open(sys.argv[1], 'r')
12 | out_file = open(sys.argv[2], 'w')
13 | 
14 | for line in test_file:
15 | 	li = tool.ints(line.replace(':1','').split())  # remove every ':1' substring, then parse all fields as ints
16 | 	clk = li[0]  # first field of the record
17 | 	mp = li[1]  # second field of the record
18 | 	out_file.write(`clk` + "\t" + `mp` + "\n")  # one "<clk>\t<mp>" row per input line
19 | 
20 | test_file.close()
21 | out_file.close()


--------------------------------------------------------------------------------
/python/model.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | class Model:  # base class that holds a train and a test dataset; the hooks below are stubs
 4 | 	def __init__(self, train_data, test_data):  # stores both datasets through the setters
 5 | 		self.set_train_data(train_data)
 6 | 		self.set_test_data(test_data)
 7 | 
 8 | 	def set_train_data(self, train_data):
 9 | 		self.train_data = train_data
10 | 
11 | 	def set_test_data(self, test_data):
12 | 		self.test_data = test_data
13 | 
14 | 	def train(self):  # stub: does nothing, returns None
15 | 		pass
16 | 
17 | 	def converged(self):  # stub: always returns False
18 | 		pass
19 | 		return False
20 | 
21 | 	def test(self):  # stub: does nothing, returns None
22 | 		pass
23 | 
24 | 	def calc_performance(self, dataset):  # stub: `dataset` is ignored
25 | 		pass
26 | 
27 | def main():  # stub entry point: prints a fixed message
28 | 	print "main method."
29 | 
30 | if __name__ == '__main__':
31 | 	main()


--------------------------------------------------------------------------------
/python/integrate_click.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import random
 3 | import math
 4 | import operator
 5 | import tool
 6 | 
 7 | if len(sys.argv) < 3:
 8 | 	print "python integrate_click.py bid_mp_file click_file output_file"
 9 | 	exit(-1)
10 | 
11 | bid_mp_file = open(sys.argv[1], 'r')
12 | click_file = open(sys.argv[2], 'r')
13 | out_file = open(sys.argv[3], 'w')
14 | 
15 | bi_list = zip(bid_mp_file, click_file)
16 | 
17 | for (line_a, line_b) in bi_list:
18 | 	li_a = tool.ints(line_a.split())
19 | 	li_b = tool.ints(line_b.split())
20 | 	clk = li_b[0]
21 | 	bi = li_a[0]
22 | 	mp = li_a[1]
23 | 	out_file.write(`bi` + "\t" + `mp` + "\t" + `clk` + "\n")
24 | 
25 | bid_mp_file.close()
26 | click_file.close()
27 | out_file.close()


--------------------------------------------------------------------------------
/scripts/pinyou_stage2_lrlin.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | python ../python/test_lrlin.py 1458 0.001 > ../output/lr_1458_0.001.log &
 4 | python ../python/test_lrlin.py 2259 0.1 > ../output/lr_2259_0.1.log &
 5 | python ../python/test_lrlin.py 2261 0.001 > ../output/lr_2261_0.001.log &
 6 | python ../python/test_lrlin.py 2821 0.1 > ../output/lr_2821_0.1.log &
 7 | python ../python/test_lrlin.py 2997 0.0005 > ../output/lr_2997_0.0005.log &
 8 | python ../python/test_lrlin.py 3358 0.01 > ../output/lr_3358_0.01.log &
 9 | python ../python/test_lrlin.py 3386 0.001 > ../output/lr_3386_0.001.log &
10 | python ../python/test_lrlin.py 3427 0.005 > ../output/lr_3427_0.005.log &
11 | python ../python/test_lrlin.py 3476 0.005 > ../output/lr_3476_0.005.log &
12 | 


--------------------------------------------------------------------------------
/python/make_output.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os
 3 | import sys
 4 | 
 5 | if len(sys.argv) < 2:
 6 | 	print "Usage: python yoyi_make_output.py lr/yoyi(/)"
 7 | 	exit(-1)
 8 | 
 9 | folder = sys.argv[1]
10 | files = os.listdir(folder)
11 | 
12 | fo = open(folder + '/integration.txt', 'w')
13 | header = "camp_id\tmodel\tdataset\trevenue\tctr\tcpc\tauc\trmse\tcpm\tbids\timps\tclks\tlaplace\tinterval\tscale\tds_ratio\tbudget_prop"
14 | fo.write(header + '\n')
15 | 
16 | for f in files:
17 | 	if not f.endswith('.csv'):
18 | 		continue
19 | 	file_path = os.path.join(folder, f)
20 | 	fi = open(file_path)
21 | 	lines = fi.read().split('\n')
22 | 	if len(lines) < 2:
23 | 		continue
24 | 	fo.write(lines[1] + '\n')
25 | 	fi.close()
26 | 
27 | fo.close()


--------------------------------------------------------------------------------
/scripts/integrate_performance.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | python ../python/integrate_performance.py ../output/selected/ revenue 1458
 3 | python ../python/integrate_performance.py ../output/selected/ revenue 2259
 4 | python ../python/integrate_performance.py ../output/selected/ revenue 2261
 5 | python ../python/integrate_performance.py ../output/selected/ revenue 2821
 6 | python ../python/integrate_performance.py ../output/selected/ revenue 2997
 7 | python ../python/integrate_performance.py ../output/selected/ revenue 3358
 8 | python ../python/integrate_performance.py ../output/selected/ revenue 3386
 9 | python ../python/integrate_performance.py ../output/selected/ revenue 3427
10 | python ../python/integrate_performance.py ../output/selected/ revenue 3476


--------------------------------------------------------------------------------
/python/opt_bid.py:
--------------------------------------------------------------------------------
 1 | from bid_strategy import BidStrategy
 2 | import config
 3 | 
 4 | class OptBid(BidStrategy):
 5 | 	def __init__(self, camp_v, mu):
 6 | 		self.mu = mu
 7 | 		self.phi = 1.0 / (1.0 + self.mu)
 8 | 		self.camp_v = camp_v
 9 | 
10 | 	def set_camp_value(self, v):
11 | 		self.camp_v = v
12 | 
13 | 	def set_mu(self, mu):
14 | 		self.mu = mu
15 | 		self.phi = 1.0 / (1.0 + self.mu)
16 | 
17 | 	def calibrate(self, ctr):
18 | 		ctr_calib = ctr / (ctr + (1 - ctr) / config.ds_ratio)
19 | 		return ctr_calib
20 | 
21 | 	def bid_calib(self, camp_v, mu, ctr):
22 | 		bid_price = int(camp_v * self.calibrate(ctr) / (1.0 + mu) * 1E3)
23 | 		return bid_price
24 | 
25 | 	# b = 1.0 / (1.0 + mu) * ctr
26 | 	def bid(self, ctr):
27 | 		bid_price = int(1.0 / (1.0 + self.mu) * self.camp_v * ctr * 1E3)
28 | 		return bid_price


--------------------------------------------------------------------------------
/python/interval_landscape.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | from bid_landscape import BidLandscape
 3 | 
 4 | class IntervalLandscape(BidLandscape):
 5 | 	'''The interval style bid landscape.'''
 6 | 
 7 | 	def __init__(self, dataset, campaign_id, laplace=1, interval=2):
 8 | 		BidLandscape.__init__(self, dataset, campaign_id, laplace)  # builds self.max_price and self.distribution
 9 | 		self.interval = interval if not (interval > self.max_price) else (self.max_price + 1)  # cap the bucket width at max_price + 1
10 | 
11 | 	#TODO test the interval mode
12 | 	def get_probability(self, price):  # sum of the distribution over the bucket that contains `price`
13 | 		if price > self.max_price:  # prices above the maximum are answered as if they were max_price
14 | 			return self.get_probability(self.max_price)
15 | 		left_index = int(price / self.interval) * self.interval  # start of the bucket
16 | 		idx = left_index + self.interval  # exclusive end of the bucket before clipping
17 | 		right_index = len(self.distribution) if len(self.distribution) < idx else idx  # clip the end to the distribution length
18 | 		probability = 0.0
19 | 		for p in range(left_index, right_index):
20 | 			probability += self.distribution[p]
21 | 		return probability
22 | 
23 | def main():  # stub entry point: prints a fixed message
24 | 	print "main method."
25 | 
26 | if __name__ == '__main__':
27 | 	main()


--------------------------------------------------------------------------------
/scripts/run-em.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | prop=$1  # forwarded as the last argument of every test_em.py call below
 4 | # pinyou_stage2.sh calls this script once per value of "128 64 32 16 8 4 2 1"
 5 | 
 6 | python ../python/test_em.py 1458 rr 3 0.512 0 $prop  # runs are sequential: two per campaign, one "rr" and one "eu"
 7 | python ../python/test_em.py 1458 eu 3 90 0.1 $prop
 8 | 
 9 | python ../python/test_em.py 2259 eu 3 80 0.5 $prop
10 | python ../python/test_em.py 2259 rr 3 2E-1 0.5 $prop
11 | 
12 | python ../python/test_em.py 2261 rr 10 0.01 0.5 $prop
13 | python ../python/test_em.py 2261 eu 3 20 0.5 $prop
14 | 
15 | python ../python/test_em.py 2821 rr 3 0.128 0 $prop
16 | python ../python/test_em.py 2821 eu 3 40 0.5 $prop
17 | 
18 | python ../python/test_em.py 2997 eu 3 80 0.5 $prop
19 | python ../python/test_em.py 2997 rr 10 0.256 0 $prop
20 | 
21 | python ../python/test_em.py 3358 rr 10 0.016 0 $prop
22 | python ../python/test_em.py 3358 eu 3 90 0.1 $prop
23 | 
24 | python ../python/test_em.py 3386 rr 10 0.04 0.5 $prop
25 | python ../python/test_em.py 3386 eu 3 80 0.1 $prop
26 | 
27 | python ../python/test_em.py 3427 rr 3 0.5 0.5 $prop
28 | python ../python/test_em.py 3427 eu 3 90 0.1 $prop
29 | 
30 | python ../python/test_em.py 3476 rr 3 0.032 0 $prop
31 | python ../python/test_em.py 3476 eu 3 40 0.1 $prop
32 | 


--------------------------------------------------------------------------------
/scripts/pinyou_stage1.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | cd /home/rk/Code/optimal-ctr-bidding/scripts/ || exit 1  # the run-*.sh names and ../output paths below are relative to scripts/
 4 | pwd
 5 | 
 6 | echo "run"
 7 | sh run-lr.sh "1458 2261 2821" > ../output/log_parallel/lr_1458_2261_2821.log &  # every job runs in the background with its own log
 8 | sh run-rr.sh "1458 2261 2821" > ../output/log_parallel/rr_1458_2261_2821.log &
 9 | sh run-eu.sh "1458 2261 2821" > ../output/log_parallel/eu_1458_2261_2821.log &
10 | sh run-lr.sh "2997 3358 3386" > ../output/log_parallel/lr_2997_3358_3386.log &
11 | sh run-rr.sh "2997 3358 3386" > ../output/log_parallel/rr_2997_3358_3386.log &
12 | sh run-eu.sh "2997 3358 3386" > ../output/log_parallel/eu_2997_3358_3386.log &
13 | sh run-lr.sh "3427 3476 2259" > ../output/log_parallel/lr_3427_3476_2259.log &
14 | sh run-rr.sh "3427 3476 2259" > ../output/log_parallel/rr_3427_3476_2259.log &
15 | sh run-eu.sh "3427 3476 2259" > ../output/log_parallel/eu_3427_3476_2259.log &
16 | #sqlr
17 | sh run-sqlr.sh "1458 2261 2821" > ../output/log_parallel/sqlr_1458_2261_2821.log &
18 | sh run-sqlr.sh "2997 3358 3386" > ../output/log_parallel/sqlr_2997_3358_3386.log &
19 | sh run-sqlr.sh "3427 3476 2259" > ../output/log_parallel/sqlr_3427_3476_2259.log &
20 | 
21 | echo "done"  # printed as soon as the jobs are launched; the script does not wait for them


--------------------------------------------------------------------------------
/python/config.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | INTVL = False
 4 | 
 5 | #--- data folder ---#
 6 | data_folder = "../../make-ipinyou-data/"  # statistics.py builds paths as data_folder + <camp_id> + postfix
 7 | train_postfix = "/train.yzx.txt"
 8 | test_postfix = "/test.yzx.txt"
 9 | 
10 | output_folder = "../output/"
11 | 
12 | campaign_list = [1458, 2259, 2261, 2821, 2997, 3358, 3386, 3427, 3476]  # statistics.py replaces this with a single id when one is given on the command line
13 | campaign = 100000
14 | 
15 | #--- training hyper parameter ---#
16 | model_list = ['rr', 'eu']
17 | model_name = ''
18 | laplace = 3
19 | interval = 5
20 | budget_prop = 1  # eu_model.py divides the test cost_sum by this to get its budget
21 | 
22 | lr_train_round = 20
23 | lr_alpha = 5E-3  # multiplies the per-record weight update in the train() loops
24 | lr_lambda = 1E-4
25 | eu_lambda = 1E-2
26 | 
27 | eu_train_round = 30
28 | eu_scale = 10  # extra factor in the eu/rr update term
29 | ds_ratio = 0  # eu_model.py reads 'ori_ecpc' only when this is > 0; opt_bid.calibrate divides by it
30 | mu_range = np.arange(-0.99, 0.99, 0.01)
31 | #np.arange(-0.8, 0.1, 0.1).extend(np.arange(-0.1, 0.1, 0.01).extend(np.arange(0.1, 0.9, 0.1)))
32 | 
33 | em_scale = 1E-3
34 | em_round = 30
35 | 
36 | #--- debug parameter ---#
37 | math_err_num = 0
38 | 
39 | 
40 | #--- replay parameter ---#
41 | test_progress_unit = 0.1
42 | train_progress_unit = 0.25
43 | budget_props = [128, 64, 32, 16, 8, 4, 2, 1]
44 | 
45 | #--- draw parameter ---#
46 | colors = {'lr':'cx--', 'rr':'or-', 'eu':'kp-', 'sqlr':'*b--'}  # looked up per model as the format argument of pl.plot in integrate_performance.draw
47 | 


--------------------------------------------------------------------------------
/python/rr_model.py:
--------------------------------------------------------------------------------
 1 | from eu_model import EuModel
 2 | from bid_landscape import BidLandscape
 3 | from bid_strategy import BidStrategy
 4 | from dataset import Dataset
 5 | import math
 6 | import random
 7 | import tool
 8 | import config
 9 | 
10 | class RrModel(EuModel):  # reuses EuModel and overrides only train()
11 |     def __init__(self, train_data, test_data):
12 |         EuModel.__init__(self, train_data, test_data)
13 | 
14 |     def train(self):  # one pass over train_data, updating self.weight after every record
15 |         random.seed(10)  # fixed seed for the random module
16 |         train_data = self.train_data
17 |         progress = 0.0  # only referenced by the commented-out progress check below
18 |         iter_id = train_data.init_index()
19 |         while not train_data.reached_tail(iter_id):
20 |             data = train_data.get_next_data(iter_id)
21 |             y = data[0]  # first field of the record
22 |             feature = data[2:len(data)]  # everything after the first two fields
23 |             ctr = tool.estimate_ctr(self.weight, feature, train_flag=True)  # module-level helper, not the estimate_ctr method defined on EuModel
24 |             phi = 1.0 / (1.0 + self.mu)
25 |             bp = self.bid_strategy.bid(ctr)
26 |             pz = self.train_data.landscape.get_probability(bp)  # landscape probability at the bid price
27 |             # print `bp` + '\t' + `pz`
28 |             scale_x = (phi * ctr - y) * phi * math.pow(self.camp_v, 2) * pz * config.eu_scale  # unlike EuModel.train there is no ctr * (1 - ctr) factor
29 |             for idx in feature:  # the same step is applied to every feature index of the record
30 |                 self.weight[idx] = self.weight[idx] * self.reg_update_param - config.lr_alpha * scale_x
31 |             # prg = train_data.get_progress(iter_id)
32 |             # if prg < 0.9 and prg > (progress + config.train_progress_unit - 1E-3):
33 |             #     self.test()
34 |             #     progress += config.train_progress_unit
34 |             #     progress += config.train_progress_unit
35 | 


--------------------------------------------------------------------------------
/python/statistics.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | from dataset import Dataset
 3 | import config
 4 | import sys
 5 | 
 6 | if len(sys.argv) > 1:  # an optional first argument restricts the run to one campaign id
 7 |     config.campaign_list = [int(sys.argv[1])]
 8 | 
 9 | fo = open("../output/statistics.csv", 'w')  # the path is relative to the current working directory
10 | header = "camp_id\tdataset\tmax_price\tctr\tecpc\tecpm\tclk_sum\tcost_sum\tsize\n"
11 | 
12 | fo.write(header)
13 | 
14 | for camp_id in config.campaign_list:  # writes one "train" row and one "test" row per campaign
15 |     train_dataset = Dataset(config.data_folder+`camp_id`+config.train_postfix, camp_id)
16 |     tr_stat = train_dataset.get_statistics()
17 |     
18 |     test_dataset = Dataset(config.data_folder+`camp_id`+config.test_postfix, camp_id)
19 |     te_stat = test_dataset.get_statistics()
20 | 
21 |     tr_line = "%d\t%s\t%d\t%f\t%d\t%d\t%d\t%d\t%d\n" % (camp_id, "train", tr_stat['max_price'], tr_stat['ctr'], 
22 |                                                     tr_stat['ecpc'], tr_stat['ecpm'], tr_stat['clk_sum'], 
23 |                                                     tr_stat['cost_sum'], tr_stat['size'])  # only ctr is written with %f; every other number uses %d
24 |     te_line = "%d\t%s\t%d\t%f\t%d\t%d\t%d\t%d\t%d\n" % (camp_id, "test", te_stat['max_price'], te_stat['ctr'], 
25 |                                                     te_stat['ecpc'], te_stat['ecpm'], te_stat['clk_sum'], 
26 |                                                     te_stat['cost_sum'], te_stat['size'])
27 | 
28 |     fo.write(tr_line)
29 |     fo.write(te_line)
30 | 
31 |     del train_dataset  # drop both references before the next campaign is loaded
32 |     del test_dataset
33 | 
34 |     print "Deleted " + `camp_id`
35 | fo.close()
36 | 
37 | print "done"


--------------------------------------------------------------------------------
/python/sqlr_model.py:
--------------------------------------------------------------------------------
 1 | from lr_model import LrModel
 2 | from bid_landscape import BidLandscape
 3 | from opt_bid import OptBid
 4 | from dataset import Dataset
 5 | import math
 6 | import random
 7 | import tool
 8 | import config
 9 | 
10 | class SqlrModel(LrModel):  # reuses LrModel and overrides train() and estimate_ctr()
11 | 	def __init__(self, train_data, test_data):
12 | 		LrModel.__init__(self, train_data, test_data)
13 | 
14 | 	def train(self): # train with one traversal of the full train_data
15 | 		random.seed(10)  # fixed seed for the random module
16 | 		train_data = self.train_data
17 | # 		print "Train data \t" + `train_data` + "\tsize \t" + `train_data.get_size()`
18 | 		progress = 0.0  # only referenced by the commented-out progress check below
19 | 		iter_id = train_data.init_index()
20 | 		while not train_data.reached_tail(iter_id):
21 | 			data = train_data.get_next_data(iter_id)
22 | 			y = data[0]  # first field of the record
23 | 			feature = data[2:len(data)]  # everything after the first two fields
24 | 			ctr = self.estimate_ctr(self.weight, feature, train_flag=True, ctr_avg=train_data.get_statistics()['ctr'])  # NOTE(review): get_statistics() is called once per record - confirm it is cheap or cached
25 | 			for idx in feature: # update
26 | 				self.weight[idx] = self.weight[idx] * self.reg_update_param - config.lr_alpha * (ctr - y) * ctr * (1-ctr)
27 | 			# prg = train_data.get_progress(iter_id)
28 | 			# if prg < 0.9 and prg > (progress + config.train_progress_unit - 1E-3):
29 | 			# 	self.test()
30 | 			# 	progress += config.train_progress_unit
31 | 	
32 | 	def estimate_ctr(self, weight, feature, train_flag = False, ctr_avg=0.125):  # sigmoid of the summed weights of the known feature indices
33 | 		value = 0.0
34 | 		for idx in feature:
35 | 			if idx in weight:
36 | 				value += weight[idx]
37 | 			elif train_flag:  # unseen index during training: create its weight; it is not added to `value` in this call
38 | 				if idx == 0:
39 | 					weight[idx] = - math.log(1.0 / (ctr_avg) - 1.0)  # index 0 starts at the logit of ctr_avg
40 | 				else:
41 | 					weight[idx] = tool.next_init_weight()
42 | 		ctr = tool.sigmoid(value)
43 | 	#   print "Estimated CTR \t" + `ctr`
44 | 		return ctr


--------------------------------------------------------------------------------
/python/bid_landscape.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | class BidLandscape:
 4 | 	'''The landscape making and storage class.'''
 5 | 
 6 | 	def __init__(self, dataset, camp_id, laplace=1):
 7 | 		self.dataset = dataset
 8 | 		self.dataset.init_landscape(self)  # hands this object to the dataset before the distribution exists
 9 | 		self.camp_id = camp_id
10 | 		self.laplace = laplace if laplace>1 else 1  # smoothing count, never below 1
11 | 		self.init_distribution()
12 | 		self.make_distribution()
13 | 		print "Inited Bid Landscape."
14 | 
15 | 	def get_campaign_id(self):
16 | 		return self.camp_id
17 | 
18 | 	def init_distribution(self):  # allocate one slot per integer price in 0..max_price
19 | 		self.max_price = self.dataset.get_max_price()
20 | 		self.distribution = [0.0*i for i in range(0, self.max_price+1)]  # a list of max_price + 1 zeros
21 | 
22 | 	def get_distribution(self):
23 | 		return self.distribution
24 | 
25 | 	def make_distribution(self): # make the original distribution with laplace smoothing
26 | 		mp_dict = {}  # counts records by their second field (data[1])
27 | 		iter_id = self.dataset.init_index()
28 | 		while not self.dataset.reached_tail(iter_id):
29 | 			data = self.dataset.get_next_data(iter_id)
30 | 			mp = data[1]
31 | 			if mp in mp_dict:
32 | 				mp_dict[mp] = mp_dict[mp] + 1
33 | 			else:
34 | 				mp_dict[mp] = 1
35 | 		total_num = self.dataset.get_size() + (self.max_price + 1) * self.laplace  # record count plus `laplace` pseudo-counts per slot
36 | 		for p in range(0, self.max_price+1):
37 | 			if p not in mp_dict:
38 | 				self.distribution[p] = 1.0 * self.laplace / total_num
39 | 			else:
40 | 				self.distribution[p] = 1.0 * (mp_dict[p] + self.laplace) / total_num
41 | 		print "Landscape made."
42 | 
43 | 	def get_probability(self, price): # get the probability of the given price in the landscape
44 | 		price = int(price)
45 | 		probability = 0.0
46 | 		if price > self.max_price:  # above the range: use the last slot
47 | 			probability = self.distribution[self.max_price]
48 | 		elif price < 0:  # below the range: use the first slot
49 | 			probability = self.distribution[0]
50 | 		else:
51 | 			probability = self.distribution[price]
52 | 		return probability
53 | 
54 | 
55 | 
56 | 
57 | def main():  # stub entry point: prints a fixed message
58 | 	print "main method."
59 | 
60 | if __name__ == '__main__':
61 | 	main()


--------------------------------------------------------------------------------
/python/integrate_performance.py:
--------------------------------------------------------------------------------
 1 | import tool
 2 | import config
 3 | import sys
 4 | import os
 5 | import matplotlib.pyplot as pl
 6 | 
 7 | def draw(camp, metric, performances, folder):  # plot one curve per model and save it as <folder>/<camp>_<metric>.pdf
 8 | 	print camp
 9 | 	pl.figure(figsize=(5, 5))
10 | 	legend = []
11 | 	min_y = 1E10  # running minimum and maximum over all curves, used for the y-limits
12 | 	max_y = 0
13 | 	for model in performances:  # `performances` maps a model name to a list of values
14 | 		perf = performances[model]
15 | 		legend.append(model.upper())
16 | 		pl.plot(range(0, len(perf), 1), perf, config.colors[model])  # x is the position in the list; the model must be a key of config.colors
17 | 		mi = min(perf)
18 | 		ma = max(perf)
19 | 		min_y = mi if mi < min_y else min_y
20 | 		max_y = ma if ma > max_y else max_y
21 | 	pl.xlabel("Training Rounds")
22 | 	pl.ylabel(metric)
23 | 	min_y = min_y - abs(int(0.05*min_y))  # widen the limits by 5%, truncated to an integer
24 | 	max_y = max_y + abs(int(0.05*max_y))
25 | 	pl.ylim([min_y, max_y])
26 | 	pl.title('Learning curve in Camp. ' + camp)  # `camp` must be a string for this concatenation
27 | 	path = os.path.join(folder, camp+"_"+metric+".pdf")
28 | 	pl.grid(True)
29 | 	pl.legend(legend, loc = 'lower right')
30 | 	# pl.show()
31 | 	pl.savefig(path, dpi=300)
32 | 	pl.close()
33 | 
34 | def read_values(file_path, metric):
35 | 	if metric == 'revenue':
36 | 		metric = 'Test'
37 | 	fi = open(file_path, 'r')
38 | 	lines = fi.read().split('\n')
39 | 	fi.close()
40 | 	count_flag = False
41 | 	metric_index = 1 # defaultly count on revenue
42 | 	perf_list = []
43 | 	for line in lines:
44 | 		args = line.split('\t')
45 | 		if args[0] == 'Round':
46 | 			count_flag = True
47 | 			try:
48 | 				metric_index = args.index(metric)
49 | 			except ValueError:
50 | 				print "No such metric name."
51 | 				print ValueError
52 | 				exit(-1)
53 | 			continue
54 | 		if count_flag and not args[0] == '':
55 | 			perf_list.append(float(args[metric_index]))
56 | 	return perf_list
57 | 
58 | def main():  # collect the per-model value lists of one campaign and print them
59 | 	if len(sys.argv) < 4:
60 | 		print "Usage: python draw_camp_perf.py ../output revenue 1458"
61 | 		exit(-1)
62 | 
63 | 	folder = sys.argv[1] # '../output/'
64 | 	metric = sys.argv[2] # 'revenue'
65 | 	camp = sys.argv[3] # '1458'
66 | 	print camp
67 | 	files = os.listdir(folder)
68 | 
69 | 	performances = {}  # model name -> list returned by read_values
70 | 
71 | 	for f in files:
72 | 		if not f.endswith('.csv'):
73 | 			continue
74 | 		params = f.split('_')  # file names are read as <camp>_<model>_...
75 | 		camp_id = params[0]
76 | 		if not camp_id == camp:
77 | 			continue
78 | 		model = params[1]
79 | 		perf_list = read_values(os.path.join(folder, f), metric)
80 | 		performances[model] = perf_list  # a later file of the same model overwrites an earlier one
81 | 
82 | 	print performances
83 | 	# folder = "./"
84 | 	# camp = "1458"
85 | 	# metric = "revenue"
86 | 	# performances = {'eu': [1,3,5,7,9],
87 | 	# 				'rr': [2,2,6,8,19,13],
88 | 	# 				'lr': [1,2,5,5],
89 | 	# 				'sqlr': [0,3,4,5,8]}
90 | 	# print performances
91 | 	# draw(camp, metric, performances, folder)
92 | 
93 | if __name__ == '__main__':
94 | 	main()


--------------------------------------------------------------------------------
/python/eu_model.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Dec 14, 2015
 3 | 
 4 | @author: rk
 5 | '''
 6 | from lr_model import LrModel
 7 | from bid_landscape import BidLandscape
 8 | from opt_bid import OptBid
 9 | from dataset import Dataset
10 | import math
11 | import random
12 | import tool
13 | import config
14 | 
15 | class EuModel(LrModel):  # reuses LrModel; RrModel derives from this class
16 |     def __init__(self, train_data, test_data):
17 |         LrModel.__init__(self, train_data, test_data)
18 |         
19 |     def init_parameters(self):  # derive camp_v, budget and the initial mu from the dataset statistics
20 |         self.camp_v = self.train_data.get_statistics()['ecpc']
21 |         if config.ds_ratio > 0:  # 'ori_ecpc' is read only when ds_ratio is positive
22 |             self.ori_camp_v = self.train_data.get_statistics()['ori_ecpc']
23 |         self.budget = int(self.test_data.get_statistics()['cost_sum'] / config.budget_prop)
24 |         self.mu = 0.0
25 |     
26 |     def init_bid_strategy(self):
27 |         self.bid_strategy = OptBid(self.camp_v, self.mu)
28 | 
29 |     def train(self):  # one pass over train_data, updating self.weight after every record
30 |         random.seed(10)  # fixed seed for the random module
31 |         train_data = self.train_data
32 |         progress = 0.0  # only referenced by the commented-out progress check below
33 |         iter_id = train_data.init_index()
34 |         while not train_data.reached_tail(iter_id):
35 |             data = train_data.get_next_data(iter_id)
36 |             y = data[0]  # first field of the record
37 |             feature = data[2:len(data)]  # everything after the first two fields
38 |             ctr = self.estimate_ctr(self.weight, feature, train_flag=True, ctr_avg=train_data.get_statistics()['ctr'])  # NOTE(review): get_statistics() is called once per record - confirm it is cheap or cached
39 |             # tool.estimate_ctr(self.weight, feature, train_flag=True)
40 |             phi = 1.0 / (1.0 + self.mu)
41 |             bp = self.bid_strategy.bid(ctr)
42 |             pz = self.train_data.landscape.get_probability(bp)  # landscape probability at the bid price
43 |             # print `bp` + '\t' + `pz`
44 |             scale_x = (phi * ctr - y) * phi * math.pow(self.camp_v, 2) * pz * ctr * (1 - ctr) * config.eu_scale
45 |             for idx in feature:  # the same step is applied to every feature index of the record
46 |                 self.weight[idx] = self.weight[idx] * self.reg_update_param - config.lr_alpha * scale_x
47 |             # prg = train_data.get_progress(iter_id)
48 |             # if prg < 0.9 and prg > (progress + config.train_progress_unit - 1E-3):
49 |             #     self.test()
50 |             #     progress += config.train_progress_unit
51 | 
52 |     def estimate_ctr(self, weight, feature, train_flag = False, ctr_avg=0.125):  # sigmoid of the summed weights of the known feature indices
53 |         value = 0.0
54 |         for idx in feature:
55 |             if idx in weight:
56 |                 value += weight[idx]
57 |             elif train_flag:  # unseen index during training: create its weight; it is not added to `value` in this call
58 |                 if idx == 0:
59 |                     weight[idx] = - math.log(1.0 / (ctr_avg) - 1.0)  # index 0 starts at the logit of ctr_avg
60 |                 else:
61 |                     weight[idx] = tool.next_init_weight()
62 |         ctr = tool.sigmoid(value)
63 |     #   print "Estimated CTR \t" + `ctr`
64 |         return ctr


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Optimal CTR bidding
 2 | An experimental framework to support experiments in CIKM 2016 paper "User Response Learning for Directly Optimizing Campaign Performance in Display Advertising". [PDF](http://apex.sjtu.edu.cn/public/files/papers/20160817/opt-ctr-bid.pdf)
 3 | 
 4 | If you have any problems, please [send an E-mail](mailto:kren@apex.sjtu.edu.cn) to [Kan Ren](http://apex.sjtu.edu.cn/members/kren).
 5 | 
 6 | ## Datasets
 7 | * `iPinYou` is described on [this page](https://github.com/wnzhang/make-ipinyou-data).
 8 | * `YOYI` is the newly published dataset in our CIKM paper. The detail of this dataset is [here](http://apex.sjtu.edu.cn/datasets/7).
 9 | 
10 | ## Format of data
11 | We use `yzx` data structure to formalize bidding logs.
12 | Each record contains
13 | * `y`: true label of user response (1 for positive and 0 otherwise).
14 | * `z`: the market price of this sample.
15 | * `x`: pre-processed features of the bid request.
16 | 
17 | Other details of the `yzx` data can be found in [this benchmarking paper](http://arxiv.org/abs/1407.7073).
18 | 
19 | ## Prepare the dataset
20 | * Clone and prepare the `iPinYou` dataset as described [here](https://github.com/wnzhang/make-ipinyou-data). Note: put the `make-ipinyou-data` folder in the same parent folder as the `optimal-ctr-bidding` project.
21 | ```
22 | |-- code-folder
23 | ----|-- make-ipinyou-data
24 | --------|-- yoyi-data
25 | --------|-- 1458
26 | --------|-- 2259
27 | --------...
28 | ----|-- optimal-ctr-bidding
29 | --------|-- python
30 | --------|-- scripts
31 | --------|-- README.md
32 | ```
33 | * (optional) Download `YOYI` dataset and put the folder in `make-ipinyou-data`.
34 | 
35 | ## Run the code
36 | * Go to the `scripts` folder and execute the `run-MODEL.sh` scripts, where `MODEL` is a placeholder for one of the model names "lr", "sqlr", "eu" and "rr". Details of the models can be found in our paper.
37 | * Example: ```sh run-lr.sh "1458 2261 2821"```
38 | 
39 | ## Citation
40 | ```
41 | @inproceedings{ren2016user,
42 |   title={User response learning for directly optimizing campaign performance in display advertising},
43 |   author={Ren, Kan and Zhang, Weinan and Rong, Yifei and Zhang, Haifeng and Yu, Yong and Wang, Jun},
44 |   booktitle={Proceedings of the 25th ACM International on Conference on Information and Knowledge Management},
45 |   pages={679--688},
46 |   year={2016},
47 |   organization={ACM}
48 | }
49 | ```
50 | 
51 | ```
52 | @article{ren2018bidding,
53 |   title={Bidding Machine: Learning to Bid for Directly Optimizing Profits in Display Advertising},
54 |   author={Ren, Kan and Zhang, Weinan and Chang, Ke and Rong, Yifei and Yu, Yong and Wang, Jun},
55 |   journal={IEEE Transactions on Knowledge and Data Engineering},
56 |   volume={30},
57 |   number={4},
58 |   pages={645--659},
59 |   year={2018},
60 |   publisher={IEEE}
61 | }
62 | ```
63 | 


--------------------------------------------------------------------------------
/python/tool.py:
--------------------------------------------------------------------------------
  1 | #--- tool package ---#
  2 | import os
  3 | import random
  4 | import math
  5 | import config
  6 | 
  7 | init_weight = 0.05
  8 | # random.seed(10)
  9 | 
 10 | def next_init_weight():
 11 | 	return (random.random() - 0.5) * init_weight
 12 | 
 13 | # convert string list to integer array [yzx]
 14 | def ints(data):
 15 | 	int_array = []
 16 | 	for d in data:
 17 | 		int_array.append(int(d))
 18 | 	return int_array
 19 | 
 20 | # convert to string list
 21 | def strings(data):
 22 | 	str_array = []
 23 | 	for d in data:
 24 | 		str_array.append(`d`)
 25 | 	return str_array
 26 | 
 27 | # sigmoid function
 28 | def sigmoid(z):
 29 | 	value = 0.5
 30 | 	try:
 31 | 		value = 1.0 / (1.0 + math.exp(-z))
 32 | 	except:
 33 | 		# print "Math Out of Range. " + `z`
 34 | 		value = 1E-9
 35 | 	return value
 36 | 
 37 | def estimate_ctr(weight, feature, train_flag = False):
 38 | 	value = 0.0
 39 | 	for idx in feature:
 40 | 		if idx in weight:
 41 | 			value += weight[idx]
 42 | 		elif train_flag:
 43 | 			weight[idx] = next_init_weight()
 44 | 	ctr = sigmoid(value)
 45 | # 	print "Estimated CTR \t" + `ctr`
 46 | 	return ctr
 47 | 
 48 | def calibrate_ctr(pctr, ds_ratio):
 49 | 	cal_pctr = pctr / (pctr + (1 - pctr) / ds_ratio)
 50 | 	return cal_pctr
 51 | 
 52 | def gen_performance_line(log):
 53 | 	performance = log['performance']
 54 | 	line = `performance['revenue']` + "\t" \
 55 | 			+ `performance['roi']` + "\t" \
 56 | 			+ `performance['ctr']` + "\t" \
 57 | 			+ `performance['cpc']` + "\t" \
 58 | 			+ `performance['auc']` + "\t" \
 59 | 			+ `performance['rmse']` + "\t" \
 60 | 			+ `performance['cpm']` + "\t" \
 61 | 			+ `performance['bids']` + "\t" \
 62 | 			+ `performance['imps']` + "\t" \
 63 |  			+ `performance['clks']`
 64 |  	return line
 65 | 
 66 | def judge_stop(logs):
 67 | 	stop = False
 68 | 	# step = int(1/config.train_progress_unit)
 69 | 	step = 1
 70 | 	curr_loop = len(logs) - 1 # the latest record id
 71 | 	if curr_loop >= 2*step:
 72 | 		current_r = logs[curr_loop]['performance']['revenue']
 73 | 		last_r = logs[curr_loop - step]['performance']['revenue']
 74 | 		last_2_r = logs[curr_loop - 2*step]['performance']['revenue']
 75 | 		# print "Curr:last:last_2 = " + `current_r` + ":" + `last_r` + ":" + `last_2_r`
 76 | 		if current_r < last_r and last_r < last_2_r:
 77 | 			stop = True
 78 | 	return stop
 79 | 
 80 | def extend_judge_stop(logs):
 81 | 	stop = False
 82 | 	if len(logs) < 10:
 83 | 		stop = False
 84 | 	else:
 85 | 		stop = judge_stop(logs)
 86 | 	return stop
 87 | 
 88 | def get_last_log(logs):
 89 | 	return logs[len(logs)-1]
 90 | 
 91 | #--- no use below ---#
 92 | 
 93 | # load data from file as [[yzx]]
 94 | def load_data(file_path):
 95 | 	dataset = []
 96 | 	if not os.path.isfile(file_path):
 97 | 		print "ERROR: file not exist. " + file_path
 98 | 	else:
 99 | 		fi = open(file_path, 'r')
100 | 		for line in fi:
101 | 			li = ints(line.replace(':1','').split())
102 | 			dataset.append(li)
103 | 		fi.close()
104 | 	return dataset
105 | 


--------------------------------------------------------------------------------
/python/test_lr.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import config
 3 | import tool
 4 | from dataset import Dataset
 5 | from bid_landscape import BidLandscape
 6 | from interval_landscape import IntervalLandscape
 7 | from lr_model import LrModel
 8 | import sys
 9 | 
10 | 
def main():
	"""Train the LR model for one campaign and write its results.

	Usage: python test_lr.py campaign_id learn_rate (budget_prop)

	Trains for at most config.lr_train_round rounds, testing after every
	round and stopping early when tool.judge_stop() says so. A tab-separated
	report and the best weights are written under ../output/.
	"""
	if len(sys.argv) < 3:
		print("Usage python test_lr.py campaign_id learn_rate (budget_prop)")
		exit(-1)
	data_folder = "../../make-ipinyou-data/"
	config.campaign_id = int(sys.argv[1])
	config.lr_alpha = float(sys.argv[2])
	if len(sys.argv) == 4:
		config.budget_prop = int(sys.argv[3])
	camp = repr(config.campaign_id)
	alpha = repr(config.lr_alpha)
	budget = repr(config.budget_prop)
	train_path = data_folder + camp + "/train.yzx.txt"
	test_path = data_folder + camp + "/test.yzx.txt"

	train_data = Dataset(train_path, config.campaign_id)
	train_data.shuffle() # make train data shuffled
	test_data = Dataset(test_path, config.campaign_id)
	print("Load done.")

	lr_model = LrModel(train_data, test_data)
	print("campaign v = " + repr(lr_model.camp_v))
	print("learn_rate = " + alpha)
	print("budget = " + repr(lr_model.budget))

	if config.ds_ratio > 0:
		print("Need calibration.")
	else:
		print("No calibration.")

	print("Begin training ...")
	for i in range(0, config.lr_train_round):
		lr_model.train()
		lr_model.test()
		print("Round " + repr(i+1) + "\t" + repr(tool.get_last_log(lr_model.test_log)['performance']))
		if tool.judge_stop(lr_model.test_log):
			break
	print("Train done.")

	log_file = camp + "_lr_" + alpha + "_" + budget + ".csv"
	# columns of the per-round table, in output order
	round_keys = ['revenue', 'ctr', 'cpc', 'auc', 'rmse', 'cpm', 'clks', 'imps', 'bids']
	# 'with' guarantees the report is closed even if writing fails
	with open("../output/"+log_file, 'w') as fo:
		print("Begin log ...")
		# 'roi' is listed because tool.gen_performance_line() emits it right
		# after revenue; without it every later column is shifted by one.
		header = "camp_id\tmodel\tdataset\trevenue\troi\tctr\tcpc\tauc\trmse\tcpm\tbids\timps\tclks\tlaplace\tinterval\tlearn_rate\tnds_ratio\tbudget_prop"
		best_test_log = lr_model.get_best_test_log()
		best_test_line = "\t".join([camp, "LR", "test",
									tool.gen_performance_line(best_test_log),
									"None", "None", alpha, "None", budget])
		fo.write(header+"\n")
		fo.write(best_test_line+"\n")

		fo.write("\n")

		fo.write("Round\tTest\tctr\tcpc\tauc\trmse\tcpm\tclks\timps\tbids\n")
		for i, test_log in enumerate(lr_model.test_log):
			perf = test_log['performance']
			fields = [repr(i+1)] + [repr(perf[key]) for key in round_keys]
			fo.write("\t".join(fields) + "\n")
	print("Log done.")

	weight_path = camp + "_" + "lr_best_weight" \
				+ "_" + alpha + "_" + budget \
				+ ".weight"
	lr_model.output_weight(best_test_log['weight'], "../output/" + weight_path)

if __name__ == '__main__':
	main()


--------------------------------------------------------------------------------
/python/test_sqlr.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import config
 3 | import tool
 4 | from dataset import Dataset
 5 | from bid_landscape import BidLandscape
 6 | from interval_landscape import IntervalLandscape
 7 | from sqlr_model import SqlrModel
 8 | import sys
 9 | 
10 | 
def main():
	"""Train the SQLR model for one campaign and write its results.

	Usage: python test_sqlr.py campaign_id learn_rate (budget_prop)

	Trains for at most config.lr_train_round rounds, testing after every
	round and stopping early when tool.judge_stop() says so. A tab-separated
	report and the best weights are written under ../output/.
	"""
	if len(sys.argv) < 3:
		print("Usage python test_sqlr.py campaign_id learn_rate (budget_prop)")
		exit(-1)
	data_folder = "../../make-ipinyou-data/"
	config.campaign_id = int(sys.argv[1])
	config.lr_alpha = float(sys.argv[2])
	if len(sys.argv) == 4:
		config.budget_prop = int(sys.argv[3])
	camp = repr(config.campaign_id)
	alpha = repr(config.lr_alpha)
	train_path = data_folder + camp + "/train.yzx.txt"
	test_path = data_folder + camp + "/test.yzx.txt"
	print("Camp_id\tlearn_alpha")
	print(camp + "\t" + alpha)

	train_data = Dataset(train_path, config.campaign_id)
	train_data.shuffle()
	test_data = Dataset(test_path, config.campaign_id)
	print("Load done.")

	lr_model = SqlrModel(train_data, test_data)
	print("campaign v = " + repr(lr_model.camp_v))
	print("budget = " + repr(lr_model.budget))

	log_file = camp + "_sqlr_" + alpha + "_" + repr(config.budget_prop) + ".csv"
	# columns of the per-round table, in output order
	round_keys = ['revenue', 'ctr', 'cpc', 'auc', 'rmse', 'cpm', 'clks', 'imps', 'bids']
	# The report is opened before training, as before; 'with' makes sure it
	# is closed even when training or writing fails.
	with open("../output/"+log_file, 'w') as fo:
		print("Begin training ...")
		for i in range(0, config.lr_train_round):
			lr_model.train()
			lr_model.test()
			print("Round " + repr(i+1) + "\t" + repr(tool.get_last_log(lr_model.test_log)['performance']))
			if tool.judge_stop(lr_model.test_log):
				break
		print("Train done.")

		print("Begin log ...")
		# 'roi' is listed because tool.gen_performance_line() emits it right
		# after revenue; without it every later column is shifted by one.
		header = "camp_id\tmodel\tdataset\trevenue\troi\tctr\tcpc\tauc\trmse\tcpm\tbids\timps\tclks\tlaplace\tinterval\tlearn_rate\tnds_ratio"
		best_test_log = lr_model.get_best_test_log()
		best_test_line = "\t".join([camp, "SQ", "test",
									tool.gen_performance_line(best_test_log),
									repr(config.laplace), "None", alpha, "None"])
		fo.write(header+"\n")
		fo.write(best_test_line+"\n")

		fo.write("\n")

		fo.write("Round\tTest\tctr\tcpc\tauc\trmse\tcpm\tclks\timps\tbids\n")
		for i, test_log in enumerate(lr_model.test_log):
			perf = test_log['performance']
			fields = [repr(i+1)] + [repr(perf[key]) for key in round_keys]
			fo.write("\t".join(fields) + "\n")
	print("Log done.")

	# NOTE(review): the weight file name is built from laplace/eu_scale/ds_ratio
	# and not from lr_alpha, so runs that only differ in learn_rate write to
	# the same file. Kept as is because other scripts may read this name.
	weight_path = camp + "_" + "sqlr_best_weight" \
				+ "_" + repr(config.laplace) \
				+ "_" + repr(config.eu_scale) \
				+ "_" + repr(config.ds_ratio) \
				+ ".weight"
	lr_model.output_weight(best_test_log['weight'], "../output/" + weight_path)


if __name__ == '__main__':
	main()
87 | 


--------------------------------------------------------------------------------
/python/test_lrlin.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | import config
  3 | import tool
  4 | from dataset import Dataset
  5 | from bid_landscape import BidLandscape
  6 | from interval_landscape import IntervalLandscape
  7 | from lr_model import LrModel
  8 | import sys
  9 | 
 10 | 
 11 | def main():
 12 | 	if len(sys.argv) < 3:
 13 | 		print "Usage python test_lr.py campaign_id learn_rate (budget_prop)"
 14 | 		exit(-1)
 15 | 	data_folder = "../../make-ipinyou-data/"
 16 | 	config.campaign_id = int(sys.argv[1])
 17 | 	# print config.campaign
 18 | 	# print config.campaign_id
 19 | 	# exit(-1)
 20 | 	config.lr_alpha = float(sys.argv[2])
 21 | 	if len(sys.argv) == 4:
 22 | 		config.budget_prop = int(sys.argv[3])
 23 | 	train_path = data_folder + `config.campaign_id` + "/train.yzx.txt"
 24 | 	test_path = data_folder + `config.campaign_id` + "/test.yzx.txt"
 25 | 	
 26 | 	train_data = Dataset(train_path, config.campaign_id)
 27 | 	train_data.shuffle() # make train data shuffled
 28 | 	test_data = Dataset(test_path, config.campaign_id)
 29 | 	print "Load done."
 30 | 	
 31 | 	lr_model = LrModel(train_data, test_data)
 32 | 	print "campaign v = " + `lr_model.camp_v`
 33 | 	print "learn_rate = " + `config.lr_alpha`
 34 | 	print "budget = " + `lr_model.budget`
 35 | 
 36 | 	if config.ds_ratio > 0:
 37 | 		print "Need calibration."
 38 | 	else:
 39 | 		print "No calibration."
 40 | 
 41 | 	print "Begin training ..."
 42 | 	for i in range(0, config.lr_train_round):
 43 | 		lr_model.train()
 44 | 		lr_model.test()
 45 | 		print "Round " + `i+1` + "\t" + `tool.get_last_log(lr_model.test_log)['performance']`
 46 | 		if tool.judge_stop(lr_model.test_log):
 47 | 			break;
 48 | 	print "Train done."
 49 | 
 50 | 	log_file = `config.campaign_id` + "_lrlin_" + `config.lr_alpha` + "_" + `config.budget_prop` + ".csv"
 51 | 	fo = open("../output/"+log_file, 'w')
 52 | 	
 53 | 	print "Begin log ..."
 54 | 	header = "camp_id\tmodel\tdataset\trevenue\tctr\tcpc\tauc\trmse\tcpm\tbids\timps\tclks\tlaplace\tinterval\tlearn_rate\tnds_ratio\tbudget_prop"
 55 | 	best_test_log = lr_model.get_best_test_log()
 56 | 	best_test_line = `config.campaign_id` + "\t" + "LR\ttest\t" \
 57 | 						+ tool.gen_performance_line(best_test_log) + "\t" \
 58 | 						+ 'None' + "\t" + "None" + "\t" + `config.lr_alpha` + "\t" \
 59 | 						+ "None" + "\t" + `config.budget_prop`
 60 | 	fo.write(header+"\n")
 61 | 	fo.write(best_test_line+"\n")
 62 | 
 63 | 	# search for best linear parameter
 64 | 	opt_param = lr_model.lin_bid(best_test_log['weight'])
 65 | 	fo.write("prop\trevenue\troi\tctr\tcpc\tauc\trmse\tcpm\timps\tclks\tlin_param\n")
 66 | 	for prop in config.budget_props:
 67 | 		performance = lr_model.replay(best_test_log['weight'], lr_model.test_data, prop)
 68 | 		fo.write(`prop`); fo.write("\t")
 69 | 		fo.write(`performance['revenue']`); fo.write("\t")
 70 | 		fo.write(`performance['roi']`); fo.write("\t")
 71 | 		fo.write(`performance['ctr']`); fo.write("\t")
 72 | 		fo.write(`performance['cpc']`); fo.write("\t")
 73 | 		fo.write(`performance['auc']`); fo.write("\t")
 74 | 		fo.write(`performance['rmse']`); fo.write("\t")
 75 | 		fo.write(`performance['cpm']`); fo.write("\t")
 76 | 		fo.write(`performance['imps']`); fo.write("\t")
 77 | 		fo.write(`performance['clks']`); fo.write("\t")
 78 | 		fo.write(`opt_param`)
 79 | 		fo.write("\n")
 80 | 
 81 | 
 82 | 	fo.write("\n")
 83 | 
 84 | 	fo.write("Round\tTest\tctr\tcpc\tauc\trmse\tcpm\tclks\timps\tbids\n")
 85 | 	for i in range(0, len(lr_model.test_log)):
 86 | 		test_log = lr_model.test_log[i]
 87 | 		line = `i+1` + "\t" + `test_log['performance']['revenue']` \
 88 | 				+ "\t" + `test_log['performance']['ctr']` \
 89 | 				+ "\t" + `test_log['performance']['cpc']` \
 90 | 				+ "\t" + `test_log['performance']['auc']` \
 91 | 				+ "\t" + `test_log['performance']['rmse']` \
 92 | 				+ "\t" + `test_log['performance']['cpm']` \
 93 | 				+ "\t" + `test_log['performance']['clks']` \
 94 | 				+ "\t" + `test_log['performance']['imps']` \
 95 | 				+ "\t" + `test_log['performance']['bids']`
 96 | 		fo.write(line + "\n")
 97 | 	fo.close()
 98 | 	print "Log done."
 99 | 
100 | 	weight_path = `config.campaign_id` + "_" + "lrlin_best_weight" \
101 | 				+ "_" + `config.lr_alpha` + "_" + `config.budget_prop` \
102 | 				+ ".weight"
103 | 	lr_model.output_weight(best_test_log['weight'], "../output/" + weight_path)
104 | 
105 | if __name__ == '__main__':
106 | 	main()


--------------------------------------------------------------------------------
/python/test_rr.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | from dataset import Dataset
 3 | from bid_landscape import BidLandscape
 4 | from rr_model import RrModel
 5 | import sys
 6 | import config
 7 | import tool
 8 | 
 9 | def main():
10 |     if len(sys.argv) < 5:
11 |         print "Usage: python test_eu.py campaign_id laplace eu_scale ds_ratio"
12 |         exit(-1)
13 |     
14 |     config.campaign_id = int(sys.argv[1]) if int(sys.argv[1]) in config.campaign_list else config.campaign_id
15 |     config.laplace = int(sys.argv[2]) if int(sys.argv[2])>0 else config.laplace
16 |     config.eu_scale = float(sys.argv[3]) if float(sys.argv[3])>0 else config.eu_scale
17 |     config.ds_ratio = float(sys.argv[4]) if float(sys.argv[4])>0 else 0
18 |     print "camp_id\tlaplace\tscale\tds_ratio"
19 |     print `config.campaign_id` + "\t" + `config.laplace` + "\t" + `config.eu_scale` + "\t" + `config.ds_ratio`
20 | 
21 |     train_path = config.data_folder + `config.campaign_id` + "/train.yzx.txt"
22 |     test_path = config.data_folder + `config.campaign_id` + "/test.yzx.txt"
23 |     train_data = Dataset(train_path, config.campaign_id)
24 |     train_data.shuffle() # make train data shuffled
25 |     test_data = Dataset(test_path, config.campaign_id)
26 |     if config.INTVL:
27 |         IntervalLandscape(train_data, train_data.get_camp_id(), config.laplace, 3)
28 |         IntervalLandscape(test_data, test_data.get_camp_id(), config.laplace, 3)
29 |     else:
30 |         BidLandscape(train_data, train_data.get_camp_id(), config.laplace)
31 |         BidLandscape(test_data, test_data.get_camp_id(), config.laplace)
32 |     print "Load done."
33 | 
34 |     # downsampling
35 |     train_data_ds = train_data.down_sampling(config.ds_ratio) if config.ds_ratio>0 else train_data
36 |     print train_data_ds.get_statistics()
37 |     print "Down sampled."
38 | 
39 |     rr_model = RrModel(train_data_ds, test_data)
40 |     print "campaign v = " + `rr_model.camp_v`
41 | 
42 |     # train
43 |     print "Begin training ..."
44 |     for i in range(0, config.eu_train_round):
45 |         rr_model.train()
46 |         rr_model.test()
47 |         print "Round " + `i+1` + "\t" + `tool.get_last_log(rr_model.test_log)['performance']`
48 |         if tool.judge_stop(rr_model.test_log):
49 |             break;
50 |     print "Train done."
51 | 
52 |     # rr_2997_3_0.1_0.05.csv
53 |     log_file = `config.campaign_id` + "_rr" \
54 |                 + "_" + `config.laplace` \
55 |                 + "_" + `config.eu_scale` \
56 |                 + "_" + `config.ds_ratio` \
57 |                 + ".csv"
58 |     fo = open("../output/"+log_file, 'w')
59 |     
60 |     print "Being log ..."
61 |     header = "camp_id\tmodel\tdataset\trevenue\tctr\tcpc\tauc\trmse\tcpm\tbids\timps\tclks\tlaplace\tinterval\tscale\tds_ratio"
62 |     best_test_log = rr_model.get_best_test_log()
63 |     best_test_line = `config.campaign_id` + "\t" + "RR\ttest\t" \
64 |                         + tool.gen_performance_line(best_test_log) + "\t" \
65 |                         + `config.laplace` + "\t" + "None" + "\t" + `config.eu_scale` + "\t" + (`config.ds_ratio` if config.ds_ratio>0 else "None")
66 |     fo.write(header+"\n")
67 |     fo.write(best_test_line+"\n")
68 | 
69 |     fo.write("\n")
70 | 
71 |     fo.write("Round\tTest\tctr\tcpc\tauc\trmse\tcpm\tclks\timps\tbids\n")
72 |     for i in range(0, len(rr_model.test_log)):
73 |         test_log = rr_model.test_log[i]
74 |         line = `i+1` + "\t" + `test_log['performance']['revenue']` \
75 |                 + "\t" + `test_log['performance']['ctr']` \
76 |                 + "\t" + `test_log['performance']['cpc']` \
77 |                 + "\t" + `test_log['performance']['auc']` \
78 |                 + "\t" + `test_log['performance']['rmse']` \
79 |                 + "\t" + `test_log['performance']['cpm']` \
80 |                 + "\t" + `test_log['performance']['clks']` \
81 |                 + "\t" + `test_log['performance']['imps']` \
82 |                 + "\t" + `test_log['performance']['bids']`
83 |         fo.write(line + "\n")
84 |     fo.close()
85 |     print "Log done."
86 | 
87 |     weight_path = `config.campaign_id` + "_" + "rr_best_weight" \
88 |                 + "_" + `config.laplace` \
89 |                 + "_" + `config.eu_scale` \
90 |                 + "_" + `config.ds_ratio` \
91 |                  + ".weight"
92 |     rr_model.output_weight(best_test_log['weight'], "../output/" + weight_path)
93 | 
94 |     
95 | 
96 | if __name__ == '__main__':
97 |     main()
98 | 


--------------------------------------------------------------------------------
/python/test_eu.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | from dataset import Dataset
 3 | from bid_landscape import BidLandscape
 4 | from eu_model import EuModel
 5 | import sys
 6 | import config
 7 | import tool
 8 | 
 9 | def main():
10 |     if len(sys.argv) < 5:
11 |         print "Usage: python test_eu.py campaign_id laplace eu_scale ds_ratio"
12 |         exit(-1)
13 |     
14 |     config.campaign_id = int(sys.argv[1]) if int(sys.argv[1]) in config.campaign_list else config.campaign_id
15 |     config.laplace = int(sys.argv[2]) if int(sys.argv[2])>0 else config.laplace
16 |     config.eu_scale = float(sys.argv[3]) if float(sys.argv[3])>0 else config.eu_scale
17 |     config.ds_ratio = float(sys.argv[4]) if float(sys.argv[4])>0 else 0
18 |     print "camp_id\tlaplace\tscale\tds_ratio"
19 |     print `config.campaign_id` + "\t" + `config.laplace` + "\t" + `config.eu_scale` + "\t" + `config.ds_ratio`
20 | 
21 |     train_path = config.data_folder + `config.campaign_id` + "/train.yzx.txt"
22 |     test_path = config.data_folder + `config.campaign_id` + "/test.yzx.txt"
23 |     train_data = Dataset(train_path, config.campaign_id)
24 |     train_data.shuffle() # make train data shuffled
25 |     test_data = Dataset(test_path, config.campaign_id)
26 |     if config.INTVL:
27 |         IntervalLandscape(train_data, train_data.get_camp_id(), config.laplace, 3)
28 |         IntervalLandscape(test_data, test_data.get_camp_id(), config.laplace, 3)
29 |     else:
30 |         BidLandscape(train_data, train_data.get_camp_id(), config.laplace)
31 |         BidLandscape(test_data, test_data.get_camp_id(), config.laplace)
32 |     print "Load done."
33 | 
34 |     # downsampling
35 |     if config.ds_ratio>0:
36 |         train_data_ds = train_data.down_sampling(config.ds_ratio)
37 |     else:
38 |         train_data_ds = train_data
39 |     print "Down sampled."
40 |     print train_data_ds.get_statistics()
41 | 
42 |     eu_model = EuModel(train_data_ds, test_data)
43 |     print "campaign v = " + `eu_model.camp_v`
44 | 
45 |     # train
46 |     print "Begin training ..."
47 |     for i in range(0, config.eu_train_round):
48 |         eu_model.train()
49 |         eu_model.test()
50 |         print "Round " + `i+1` + "\t" + `tool.get_last_log(eu_model.test_log)['performance']`
51 |         if tool.judge_stop(eu_model.test_log):
52 |             break;
53 |     print "Train done."
54 | 
55 |     # eu_2997_3_0.1_0.05.csv
56 |     log_file = `config.campaign_id` + "_eu" \
57 |                 + "_" + `config.laplace` \
58 |                 + "_" + `config.eu_scale` \
59 |                 + "_" + `config.ds_ratio` \
60 |                 + ".csv"
61 |     fo = open("../output/"+log_file, 'w')
62 |     
63 |     print "Begin log ..."
64 |     header = "camp_id\tmodel\tdataset\trevenue\tctr\tcpc\tauc\trmse\tcpm\tbids\timps\tclks\tlaplace\tinterval\teu_scale\tnds_ratio"
65 |     best_test_log = eu_model.get_best_test_log()
66 |     best_test_line = `config.campaign_id` + "\t" + "EU\ttest\t" \
67 |                         + tool.gen_performance_line(best_test_log) + "\t" \
68 |                         + `config.laplace` + "\t" + "None" + "\t" + `config.eu_scale` + "\t" + `config.ds_ratio`
69 |     fo.write(header+"\n")
70 |     fo.write(best_test_line+"\n")
71 | 
72 |     fo.write("\n")
73 | 
74 |     fo.write("Round\tTest\tctr\tcpc\tauc\trmse\tcpm\tclks\timps\tbids\n")
75 |     for i in range(0, len(eu_model.test_log)):
76 |         test_log = eu_model.test_log[i]
77 |         line = `i+1` + "\t" + `test_log['performance']['revenue']` \
78 |                 + "\t" + `test_log['performance']['ctr']` \
79 |                 + "\t" + `test_log['performance']['cpc']` \
80 |                 + "\t" + `test_log['performance']['auc']` \
81 |                 + "\t" + `test_log['performance']['rmse']` \
82 |                 + "\t" + `test_log['performance']['cpm']` \
83 |                 + "\t" + `test_log['performance']['clks']` \
84 |                 + "\t" + `test_log['performance']['imps']` \
85 |                 + "\t" + `test_log['performance']['bids']`
86 |         fo.write(line + "\n")
87 |     fo.close()
88 |     print "Log done."
89 | 
90 |     weight_path = `config.campaign_id` + "_" + "eu_best_weight" \
91 |                 + "_" + `config.laplace` \
92 |                 + "_" + `config.eu_scale` \
93 |                 + "_" + `config.ds_ratio` \
94 |                  + ".weight"
95 |     eu_model.output_weight(best_test_log['weight'], "../output/" + weight_path)
96 | 
97 | 
98 | if __name__ == '__main__':
99 |     main()


--------------------------------------------------------------------------------
/python/dataset.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | import os
  3 | import tool
  4 | import copy
  5 | import random
  6 | 
  7 | class Dataset:
  8 | 	'''The class for data loading and storage.'''
  9 | 
 10 | 	def __init__(self, file_path, camp_id):
 11 | 		self.file_path = file_path
 12 | 		self.camp_id = camp_id
 13 | 		self.init_statistics()
 14 | 		self.load()
 15 | 		self.iterators = []
 16 | 
 17 | 	def load(self): # load data from the specified file path
 18 | 		print "Loading data ..."
 19 | 		self.dataset = []
 20 | 		if not os.path.isfile(self.file_path):
 21 | 			print "ERROR: file not exist. " + self.file_path
 22 | 			exit(-1)
 23 | 		size = 0
 24 | 		cost_sum = 0
 25 | 		clk_sum = 0
 26 | 		max_price = -1
 27 | 		fi = open(self.file_path, 'r')
 28 | 		for line in fi:
 29 | 			li = tool.ints(line.replace(':1','').split())
 30 | 			if self.camp_id < 0:
 31 | 				li.append(-1)
 32 | 			self.dataset.append(li)
 33 | 			y = li[0]
 34 | 			mp = li[1]
 35 | 			size += 1
 36 | 			cost_sum += mp
 37 | 			max_price = mp if mp > max_price else max_price
 38 | 			clk_sum += y
 39 | 		fi.close()
 40 | 		self.statistics['size'] = size
 41 | 		self.statistics['cost_sum'] = cost_sum
 42 | 		self.statistics['clk_sum'] = clk_sum
 43 | 		self.statistics['ecpm'] = 1.0 * cost_sum / size
 44 | 		self.statistics['ecpc'] = int(cost_sum / clk_sum * 1E-3)
 45 | 		self.statistics['ctr'] = 1.0 * clk_sum / size
 46 | 		self.statistics['max_price'] = max_price
 47 | 		print "Loaded."
 48 | 		print self.get_statistics()		
 49 | 
 50 | 	def shuffle(self):
 51 | 		random.seed(200)
 52 | 		random.shuffle(self.dataset)
 53 | 
 54 | 	def init_statistics(self): # init all the statistic elements
 55 | 		self.statistics = {'size':0, 'cost_sum':0, 'clk_sum':0, 
 56 | 							'ecpm':0, 'ecpc':0, 'ctr':0.0, 'max_price':0}
 57 | 	
 58 | 	def update_statistics(self):
 59 | # 		print "update statistics \t" + `self`
 60 | 		size = 0
 61 | 		cost_sum = 0
 62 | 		clk_sum = 0
 63 | 		max_price = -1
 64 | 		for data in self.dataset:
 65 | 			y = data[0]
 66 | 			mp = data[1]
 67 | 			size += 1
 68 | 			cost_sum += mp
 69 | 			max_price = mp if mp > max_price else max_price
 70 | 			clk_sum += y
 71 | 		self.statistics['size'] = size
 72 | 		self.statistics['cost_sum'] = cost_sum
 73 | 		self.statistics['clk_sum'] = clk_sum
 74 | 		self.statistics['ecpm'] = 1.0 * cost_sum / size
 75 | 		self.statistics['ori_ecpc'] = self.statistics['ecpc']
 76 | 		self.statistics['ecpc'] = cost_sum / clk_sum * 1E-3
 77 | 		self.statistics['ctr'] = 1.0 * clk_sum / size
 78 | 		self.statistics['max_price'] = max_price
 79 | 
 80 | 	def init_landscape(self, landscape): # record the bid landscape into the dataset instance
 81 | 		self.landscape = landscape
 82 | 
 83 | 	def down_sampling(self, ratio):
 84 | # 		print "original dataset \t " + `self`
 85 | 		ds_dataset = copy.deepcopy(self)
 86 | # 		print "downsampled dataset \t" + `ds_dataset`
 87 | 		ds_dataset.self_down_sampling(ratio)
 88 | 		return ds_dataset
 89 | 	
 90 | 	def self_down_sampling(self, ratio):
 91 | 		random.seed(20)
 92 | 		ds_dataset = []
 93 | 		neg_dataset = []
 94 | 		pos_num = self.statistics['clk_sum']
 95 | 		neg_num = self.get_size() - pos_num
 96 | 		desired_neg_num = int(neg_num * ratio) # desired_neg_num if desired_neg_num < neg_num else neg_num 
 97 | 		for data in self.dataset:
 98 | 			y = data[0]
 99 | 			if y == 1:
100 | 				ds_dataset.append(copy.deepcopy(data))
101 | 			else:
102 | 				neg_dataset.append(copy.deepcopy(data))
103 | 		ds_dataset += random.sample(neg_dataset, desired_neg_num)
104 | 		#TODO update statistics, e.g. size
105 | 		self.dataset = ds_dataset
106 | 		self.update_statistics()
107 | 		random.shuffle(ds_dataset)
108 | 		self.init_all_iterators()
109 | 
110 | 	def get_camp_id(self):
111 | 		return self.camp_id
112 | 
113 | 	def get_statistics(self):
114 | 		return self.statistics
115 | 
116 | 	def get_landscape(self):
117 | 		if self.landscape == None:
118 | 			print "ERROR: Please init landscape first. [Dataset.init_landscape(landscape)]"
119 | 		return self.landscape
120 | 
121 | 	def get_dataset(self):
122 | 		return self.dataset
123 | 
124 | 	def init_index(self): # initialize an iterator and store it
125 | 		self.iterators.append(0)
126 | 		iter_id = len(self.iterators) - 1
127 | 		return iter_id
128 | 	
129 | 	def init_all_iterators(self):
130 | 		iter_num = len(self.iterators)
131 | 		if iter_num > 0:
132 | 			for idx in range(0, iter_num):
133 | 				self.iterators[idx] = 0
134 | 
135 | 	def get_next_data(self, iter_id): # get the next data in the dataset
136 | 		if self.iterators[iter_id] >= self.get_size():
137 | 			self.iterators[iter_id] = 0
138 | 		data = self.dataset[self.iterators[iter_id]]
139 | 		self.iterators[iter_id] = self.iterators[iter_id] + 1
140 | 		return data
141 | 
142 | 	def get_progress(self, iter_id):
143 | 		progress = 1.0 * self.iterators[iter_id] / self.get_size()
144 | 		return progress
145 | 
146 | 	def get_size(self): # get the volume size of the dataset
147 | 		return self.statistics['size']
148 | 
149 | 	def get_max_price(self):
150 | 		return self.statistics['max_price']
151 | 
152 | 	def reached_tail(self, iter_id): # judge whether the last data have been reached
153 | 		flag = (self.iterators[iter_id] >= self.get_size())
154 | 		return flag
155 | 
156 | 
def main():
	"""Placeholder entry point; only prints a marker line."""
	message = "main method."
	print(message)

if __name__ == '__main__':
	main()
162 | 


--------------------------------------------------------------------------------
/python/replay.py:
--------------------------------------------------------------------------------
  1 | import config
  2 | from dataset import Dataset
  3 | from opt_bid import OptBid
  4 | import sys
  5 | import os
  6 | import tool
  7 | from sklearn.metrics import roc_auc_score
  8 | from sklearn.metrics import mean_squared_error
  9 | import math
 10 | 
 11 | nds_ratio = 0.01
 12 | camp_v = 36000
 13 | 
 14 | # header_dataset = "camp_id\tnds_ratio\tcamp_v\tsize\tcost_sum\tclk_sum\tecpc\tcpm\tctr\tmax_price\n"
 15 | def make_dataset_record(dataset, camp_id):
 16 | 	stat = dataset.get_statistics()
 17 | 	line = `camp_id` + "\t" + `nds_ratio` + "\t" + \
 18 | 			`camp_v` + "\t" + `stat['size']` + "\t" + \
 19 | 			`stat['cost_sum']` + "\t" + `stat['clk_sum']` + "\t" + \
 20 | 			`stat['ecpc']` + "\t" + `stat['ecpm']` + "\t" + \
 21 | 			`stat['ctr']` + "\t" + `stat['max_price']` + "\n"
 22 | 	return line
 23 | 
 24 | # header_log = "progress\trevenue\tctr\twin_rate\tauc\trmse\tecpc\tcpm\tclk_sum\timp_sum\tbid_sum\n"
 25 | def make_log_record(log):
 26 | 	line = `log['progress']` + "\t" + `log['revenue']` + "\t" + \
 27 | 			`log['ctr']` + "\t" + `log['win_rate']` + "\t" + \
 28 | 			`log['auc']` + "\t" + `log['rmse']` + "\t" + \
 29 | 			`log['cpc']` + "\t" + `log['cpm']` + "\t" + \
 30 | 			`log['clks']` + "\t" + `log['imps']` + "\t" + \
 31 | 			`log['bids']` + "\n"
 32 | 	return line
 33 | 
 34 | def make_performance(progress, bid_sum, cost_sum, imp_sum, clk_sum, revenue_sum, labels, p_labels):
 35 | 	log = {}
 36 | 	log['progress'] = progress
 37 | 	log['bids'] = bid_sum
 38 | 	log['imps'] = imp_sum
 39 | 	log['clks'] = clk_sum
 40 | 	log['revenue'] = revenue_sum
 41 | 	log['auc'] = roc_auc_score(labels, p_labels)
 42 | 	log['rmse'] = math.sqrt(mean_squared_error(labels, p_labels))
 43 | 	log['cpc'] = 0.0 if clk_sum == 0 else 1.0 * cost_sum / clk_sum * 1E-3
 44 | 	log['cpm'] = 0.0 if imp_sum == 0 else 1.0 * cost_sum / imp_sum
 45 | 	log['ctr'] = 0.0 if imp_sum == 0 else 1.0 * clk_sum / imp_sum
 46 | 	log['win_rate'] = 0.0 if bid_sum == 0 else 1.0 * imp_sum / bid_sum
 47 | 	print log
 48 | 	return log
 49 | 
 50 | def calibrate_ctr(pctr):
 51 | 	cal_pctr = pctr / (pctr + (1 - pctr) / nds_ratio)
 52 | 	return cal_pctr
 53 | 
 54 | def bid_cal(ctr):
 55 | 	cal_ctr = calibrate_ctr(ctr)
 56 | 	bid_price = int(camp_v * cal_ctr * 1E3)
 57 | 	return bid_price
 58 | 
 59 | def bid(ctr):
 60 | 	bid_price = int(camp_v * ctr * 1E3)
 61 | 	print camp_v
 62 | 	return bid_price
 63 | 
 64 | def check_file(path):
 65 | 	if not os.path.isfile(path):
 66 | 		print "ERROR: file not exist. " + path
 67 | 		exit(-1)
 68 | 
 69 | def read_weight(path):
 70 | 	weight = {}
 71 | 	check_file(path)
 72 | 	fi = open(path, 'r')
 73 | 	for line in fi:
 74 | 		k_v = line.split()
 75 | 		key = int(k_v[0])
 76 | 		value = float(k_v[1])
 77 | 		weight[key] = value
 78 | 	return weight
 79 | 
 80 | def main():
 81 | 	if len(sys.argv) < 6:
 82 | 		print "Usage: python replay.py camp_id(yoyi=-1) budget_prop weight.txt test.yzx.txt log.csv (calib)"
 83 | 		exit(-1)
 84 | 
 85 | 	camp_id = int(sys.argv[1])
 86 | 	print "Campaign ID = " + `camp_id`
 87 | 	budget_prop = int(sys.argv[2])
 88 | 	weight_path = sys.argv[3]
 89 | 	data_path = sys.argv[4]
 90 | 	log_path = sys.argv[5]
 91 | 	global nds_ratio
 92 | 	global camp_v
 93 | 	if len(sys.argv) == 7:
 94 | 		nds_ratio = float(sys.argv[6])
 95 | 		if not nds_ratio > 0:
 96 | 			print "No calibration."
 97 | 
 98 | 	dataset = Dataset(data_path, camp_id)
 99 | 	weight = read_weight(weight_path)
100 | 	budget = int(dataset.get_statistics()['cost_sum'] / budget_prop)
101 | 	if camp_id > 0:
102 | 		camp_v = dataset.get_statistics()['ecpc']
103 | 
104 | 	# init the metrics
105 | 	logs = []
106 | 	labels = []
107 | 	p_labels = []
108 | 	bid_sum = 0
109 | 	cost_sum = 0
110 | 	imp_sum = 0
111 | 	clk_sum = 0
112 | 	revenue_sum = 0
113 | 
114 | 	detail_fo = open("../detail/" + `camp_id` + ".txt", 'w')
115 | 
116 | 	# replay
117 | 	progress = 0.0
118 | 	total_num = dataset.get_statistics()['size']
119 | 	iter_id = dataset.init_index()
120 | 	while not dataset.reached_tail(iter_id):
121 | 		data = dataset.get_next_data(iter_id)
122 | 		bid_sum += 1
123 | 		y = data[0]
124 | 		mp = data[1]
125 | 		feature = data[2:len(data)]
126 | 		ctr = tool.estimate_ctr(weight, feature, train_flag=False)
127 | 		labels.append(y)
128 | 		p_labels.append(ctr)
129 | 		if camp_id < 0 or nds_ratio > 0:
130 | 			bid_price = bid_cal(ctr)
131 | 		else:
132 | 			bid_price = bid(ctr)
133 | 		if bid_price > mp:
134 | 			cost_sum += mp
135 | 			imp_sum += 1
136 | 			clk_sum += y
137 | 			revenue_sum = revenue_sum - mp + int(camp_v * y * 1E3)
138 | 		detail_fo.write(`bid_price` + "\t" + `mp`+"\n")
139 | 		# prg = 1.0 * bid_sum / total_num
140 | 		# if prg > (progress + config.test_progress_unit - 1E-5):
141 | 		# 	progress += config.test_progress_unit
142 | 		# 	performance = make_performance(prg, bid_sum, cost_sum, imp_sum, clk_sum, revenue_sum, labels, p_labels)
143 | 		# 	logs.append(performance)
144 | 		if cost_sum > budget:
145 | 			performance = make_performance(prg, bid_sum, cost_sum, imp_sum, clk_sum, revenue_sum, labels, p_labels)
146 | 			logs.append(performance)
147 | 			break
148 | 	performance = make_performance(1.0, bid_sum, cost_sum, imp_sum, clk_sum, revenue_sum, labels, p_labels)
149 | 	logs.append(performance)
150 | 
151 | 	detail_fo.close()
152 | 
153 | 	# make record
154 | 	log_file = open(log_path, 'w')
155 | 	header_dataset = "camp_id\tnds_ratio\tcamp_v\tsize\tcost_sum\tclk_sum\tecpc\tcpm\tctr\tmax_price\n"
156 | 	header_log = "progress\trevenue\tctr\twin_rate\tauc\trmse\tecpc\tcpm\tclk_sum\timp_sum\tbid_sum\n"
157 | 	log_file.write(header_dataset)
158 | 	log_file.write(make_dataset_record(dataset, camp_id))
159 | 	log_file.write(header_log)
160 | 	for log in logs:
161 | 		log_file.write(make_log_record(log))
162 | 
163 | 
164 | if __name__ == '__main__':
165 | 	main()
166 | 


--------------------------------------------------------------------------------
/python/test_em.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | from dataset import Dataset
  3 | from bid_landscape import BidLandscape
  4 | from em_model import EmModel
  5 | import sys
  6 | import config
  7 | import tool
  8 | 
  9 | def main():
 10 |     if len(sys.argv) < 7:
 11 |         print "Usage: python test_em.py camp_id model_name laplace x_scale ds_ratio budget_prop"
 12 |         exit(-1)
 13 | 
 14 |     config.campaign_id = int(sys.argv[1])
 15 |     model_name = sys.argv[2]
 16 |     if not model_name in config.model_list:
 17 |         print "Wrong model name."
 18 |         exit(-1)
 19 |     config.model_name = model_name
 20 |     config.laplace = int(sys.argv[3])
 21 |     config.em_scale = float(sys.argv[4])
 22 |     config.ds_ratio = float(sys.argv[5]) if float(sys.argv[5]) > 0 else 0
 23 |     config.budget_prop = int(sys.argv[6])
 24 |     print "camp_id\tmodel\tlaplace\tscale\tds_ratio\tbudget_prop"
 25 |     print `config.campaign_id` + "\t" + `model_name` \
 26 |             + "\t" + `config.laplace` + "\t" + `config.em_scale` \
 27 |             + "\t" + `config.ds_ratio` + "\t" + `config.budget_prop`
 28 | 
 29 |     train_path = config.data_folder + `config.campaign_id` + "/train.yzx.txt"
 30 |     test_path = config.data_folder + `config.campaign_id` + "/test.yzx.txt"
 31 |     train_data = Dataset(train_path, config.campaign_id)
 32 |     train_data.shuffle() # make train data shuffled
 33 |     test_data = Dataset(test_path, config.campaign_id)
 34 |     
 35 |     # no interval setting
 36 |     BidLandscape(train_data, train_data.get_camp_id(), config.laplace)
 37 |     BidLandscape(test_data, test_data.get_camp_id(), config.laplace)
 38 |     print "Load done."
 39 | 
 40 |     # downsampling
 41 |     train_data_ds = train_data.down_sampling(config.ds_ratio) if config.ds_ratio>0 else train_data
 42 |     print train_data_ds.get_statistics()
 43 |     print "Down sampled."
 44 | 
 45 |     em_model = EmModel(train_data_ds, test_data, model_name)
 46 |     print "campaign v = " + `em_model.camp_v`
 47 | 
 48 |     # train
 49 |     print "Begin training ..."
 50 |     for i in range(0, config.em_round):
 51 |         em_model.train()
 52 |         print "EM Round " + `i+1` + "\t" + `tool.get_last_log(em_model.em_log)['performance']`
 53 |         if tool.judge_stop(em_model.em_log):
 54 |             break;
 55 |     print "Train done."
 56 | 
 57 |     # em_rr_2997_3_0.1_0.csv
 58 |     log_file = "em_" + model_name \
 59 |                 + "_" + `config.campaign_id` \
 60 |                 + "_" + `config.budget_prop` \
 61 |                 + "_" + `config.laplace` \
 62 |                 + "_" + `config.em_scale` \
 63 |                 + "_" + `config.ds_ratio` \
 64 |                 + ".csv"
 65 |     fo = open("../output/" + log_file, 'w')
 66 | 
 67 |     print "Begin log ..."
 68 |     header = "camp_id\tmodel\tdataset\trevenue\troi\tctr\tcpc\tauc\trmse\tcpm\tbids\timps\tclks\tlaplace\tinterval\tscale\tds_ratio\tbudget_prop\tem_round\tmu"
 69 |     best_em_log = em_model.get_best_log(em_model.em_log)
 70 |     best_em_line = `config.campaign_id` + "\t" + "em"+model_name + "\ttest\t" \
 71 |                     + tool.gen_performance_line(best_em_log) + "\t" \
 72 |                     + `config.laplace` + "\t" + "None" + "\t" + `config.em_scale` + "\t" \
 73 |                     + (`config.ds_ratio` if config.ds_ratio>0 else "None") + "\t" \
 74 |                     + `config.budget_prop` +"\t" \
 75 |                     + `len(em_model.em_log)` + "\t" + `best_em_log['mu']`
 76 | 
 77 |     fo.write(header + "\n")
 78 |     fo.write(best_em_line + "\n")
 79 | 
 80 |     fo.write("Test with Budget Constraints\n")
 81 | 
 82 |     # # reset mu
 83 |     # em_model.mu = best_em_log['mu']
 84 |     # em_model.bid_strategy.set_mu(em_model.mu)
 85 |     # # replay
 86 |     # fo.write("prop\trevenue\troi\tctr\tcpc\tauc\trmse\tcpm\timps\tclks\n")
 87 |     # for prop in config.budget_props:
 88 |     #     performance = em_model.replay(best_em_log['weight'], em_model.test_data, prop)
 89 |     #     fo.write(`prop`); fo.write("\t")
 90 |     #     fo.write(`performance['revenue']`); fo.write("\t")
 91 |     #     fo.write(`performance['roi']`); fo.write("\t")
 92 |     #     fo.write(`performance['ctr']`); fo.write("\t")
 93 |     #     fo.write(`performance['cpc']`); fo.write("\t")
 94 |     #     fo.write(`performance['auc']`); fo.write("\t")
 95 |     #     fo.write(`performance['rmse']`); fo.write("\t")
 96 |     #     fo.write(`performance['cpm']`); fo.write("\t")
 97 |     #     fo.write(`performance['imps']`); fo.write("\t")
 98 |     #     fo.write(`performance['clks']`); fo.write("\t")
 99 |     #     fo.write("\n")
100 | 
101 | 
102 |     fo.write("\n")
103 | 
104 |     fo.write("Round\trevenue\troi\tcpc\tctr\tauc\trmse\timps\ttruncate\tmu\n")
105 |     for i in range(0, len(em_model.em_log)):
106 |         em_log = em_model.em_log[i]
107 |         line = `i+1` + "\t" + `em_log['performance']['revenue']` + "\t" \
108 |                 + `em_log['performance']['roi']` + "\t" \
109 |                 + `em_log['performance']['cpc']` + "\t" \
110 |                 + `em_log['performance']['ctr']` + "\t" \
111 |                 + `em_log['performance']['auc']` + "\t" \
112 |                 + `em_log['performance']['rmse']` + "\t" \
113 |                 + `em_log['performance']['imps']` + "\t" \
114 |                 + `em_log['weight'][0]` + "\t" \
115 | 		+ `em_log['mu']`
116 |         fo.write(line + "\n")
117 |     fo.write("\n")
118 |     for i in range(0, len(em_model.test_log)):
119 |         test_log = em_model.test_log[i]
120 |         line = `i+1` + "\t" + `test_log['performance']['revenue']` + "\t" \
121 |                 + `test_log['performance']['roi']` + "\t" \
122 |                 + `test_log['performance']['cpc']` + "\t" \
123 |                 + `test_log['performance']['ctr']` + "\t" \
124 |                 + `test_log['performance']['auc']` + "\t" \
125 |                 + `test_log['performance']['rmse']` + "\t" \
126 |                 + `test_log['performance']['imps']` + "\t" \
127 |                 + `test_log['weight'][0]`
128 |         if 'm' in test_log:
129 |             line = line + "\tm"
130 |         fo.write(line + "\n")
131 | 
132 |     fo.close()
133 |     print "Log done."
134 | 
135 | if __name__ == '__main__':
136 |     main()
137 | 


--------------------------------------------------------------------------------
/python/lr_model.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | import copy
  3 | import random
  4 | from model import Model
  5 | from mcpc_bid import McpcBid
  6 | import tool
  7 | import config
  8 | from sklearn.metrics import roc_auc_score
  9 | from sklearn.metrics import mean_squared_error
 10 | import math
 11 | 
 12 | class LrModel(Model):
 13 | 	def __init__(self, train_data, test_data):
 14 | 		Model.__init__(self, train_data, test_data)
 15 | 		self.init_parameters()
 16 | 		self.init_weight()
 17 | 		self.init_bid_strategy()
 18 | 		self.reg_update_param = 1 - config.lr_alpha * config.lr_lambda
 19 | 		# print self.reg_update_param
 20 | 		self.train_log = []
 21 | 		self.test_log = []
 22 | 
 23 | 	def init_weight(self):
 24 | 		self.weight = {}
 25 | 		self.best_weight = {}
 26 | 
 27 | 	def init_bid_strategy(self):
 28 | 		self.bid_strategy = McpcBid(self.camp_v)
 29 | 
 30 | 	def init_parameters(self):
 31 | 		self.camp_v = self.train_data.get_statistics()['ecpc']
 32 | 		self.mu = 0.0
 33 | 		self.budget = int(self.test_data.get_statistics()['cost_sum'] / config.budget_prop)
 34 | 		# print "camp_v \t " + `self.camp_v`
 35 | 
 36 | 	def train(self): # train with one traversal of the full train_data
 37 | 		random.seed(10)
 38 | 		train_data = self.train_data
 39 | # 		print "Train data \t" + `train_data` + "\tsize \t" + `train_data.get_size()`
 40 | 		progress = 0.0
 41 | 		iter_id = train_data.init_index()
 42 | 		while not train_data.reached_tail(iter_id):
 43 | 			data = train_data.get_next_data(iter_id)
 44 | 			y = data[0]
 45 | 			feature = data[2:len(data)]
 46 | 			ctr = tool.estimate_ctr(self.weight, feature, train_flag=True)
 47 | 			for idx in feature: # update
 48 | 				self.weight[idx] = self.weight[idx] * self.reg_update_param - config.lr_alpha * (ctr - y)
 49 | 			# prg = train_data.get_progress(iter_id)
 50 | 			# if prg < 0.9 and prg > (progress + config.train_progress_unit - 1E-3):
 51 | 			# 	self.test()
 52 | 			# 	progress += config.train_progress_unit
 53 | 
 54 | 	def test(self):
 55 | 		parameters = {'weight':self.weight}
 56 | 		performance = self.calc_performance(self.test_data, parameters)
 57 | 		# record performance
 58 | 		log = self.make_log(self.weight, performance)
 59 | 		self.test_log.append(log)
 60 | 
 61 | 	def make_log(self, weight, performance):
 62 | 		log = {}
 63 | 		log['weight'] = copy.deepcopy(weight)
 64 | 		log['performance'] = copy.deepcopy(performance)
 65 | 		log['mu'] = self.mu
 66 | 		return log
 67 | 
 68 | 	def calc_performance(self, dataset, parameters): # calculate the performance w.r.t. the given dataset and parameters
 69 | 		weight = parameters['weight']
 70 | 		# budget = parameters['budget']
 71 | 		bid_sum = 0
 72 | 		cost_sum = 0
 73 | 		imp_sum = 0
 74 | 		clk_sum = 0
 75 | 		revenue_sum = 0
 76 | 		labels = []
 77 | 		p_labels = []
 78 | 		iter_id = dataset.init_index()
 79 | 		while not dataset.reached_tail(iter_id): #TODO no budget set
 80 | 			bid_sum += 1
 81 | 			data = dataset.get_next_data(iter_id)
 82 | 			y = data[0]
 83 | 			market_price = data[1]
 84 | 			feature = data[2:len(data)]
 85 | 			ctr = tool.estimate_ctr(weight, feature, train_flag=False)
 86 | 			labels.append(y)
 87 | 			p_labels.append(ctr)
 88 | 			if config.ds_ratio > 0: # down sampled, needs to calibrate
 89 | 				bid_price = self.bid_strategy.bid_calib(self.ori_camp_v, self.mu, ctr)
 90 | 			else:
 91 | 				bid_price = self.bid_strategy.bid(ctr)
 92 | 			if bid_price > market_price:
 93 | 				cost_sum += market_price
 94 | 				imp_sum += 1
 95 | 				clk_sum += y
 96 | 				if config.ds_ratio > 0:
 97 | 					revenue_sum = int(revenue_sum - market_price + y * self.ori_camp_v * 1E3)
 98 | 				else:
 99 | 					revenue_sum = int(revenue_sum - market_price + y * self.camp_v * 1E3)
100 | 			if cost_sum >= self.budget:
101 | 				break
102 | 		cpc = 0.0 if clk_sum == 0 else 1.0 * cost_sum / clk_sum * 1E-3
103 | 		cpm = 0.0 if imp_sum == 0 else 1.0 * cost_sum / imp_sum
104 | 		ctr = 0.0 if imp_sum == 0 else 1.0 * clk_sum / imp_sum
105 | 		roi = 0.0 if cost_sum == 0 else 1.0 * (revenue_sum) / cost_sum
106 | 		auc = roc_auc_score(labels, p_labels)
107 | 		rmse = math.sqrt(mean_squared_error(labels, p_labels))
108 | 		performance = {'bids':bid_sum, 'cpc':cpc, 'cpm':cpm, 
109 | 						'ctr': ctr, 'revenue':revenue_sum, 
110 | 						'imps':imp_sum, 'clks':clk_sum,
111 | 						'auc': auc, 'rmse': rmse,
112 | 						'roi': roi}
113 | 		return performance
114 | 
115 | 	def get_best_train_log(self):
116 | 		return self.get_best_log(self.train_log)
117 | 	
118 | 	def get_best_test_log(self):
119 | 		return self.get_best_log(self.test_log)
120 | 
121 | 	def get_best_log(self, logs):
122 | 		best_log = {}
123 | 		if len(logs) == 0:
124 | 			print "ERROR: no record in the log."
125 | 		else:
126 | 			best_revenue = -1E10
127 | 			for log in logs:
128 | 				revenue = log['performance']['revenue']
129 | 				if revenue > best_revenue:
130 | 					best_revenue = revenue
131 | 					best_log = log
132 | 		return best_log
133 | 
134 | 	def output_weight(self, weight, path):
135 | 		fo = open(path, 'w')
136 | 		for idx in weight:
137 | 			fo.write(`idx` + '\t' + `weight[idx]` + '\n')
138 | 		fo.close()
139 | 
140 | 	def lin_bid(self, weight):
141 | 		params = range(30, 100, 5) + range(100, 400, 10) + range(400, 800, 50)
142 | 		base_ctr = self.train_data.get_statistics()['ctr']
143 | 		dataset = self.test_data
144 | 		opt_param = 3000
145 | 		opt_revenue = -1E10
146 | 		for param in params:
147 | 			bid_sum = 0
148 | 			cost_sum = 0
149 | 			imp_sum = 0
150 | 			clk_sum = 0
151 | 			revenue_sum = 0
152 | 			labels = []
153 | 			p_labels = []
154 | 			iter_id = dataset.init_index()
155 | 			while not dataset.reached_tail(iter_id): #TODO no budget set
156 | 				bid_sum += 1
157 | 				data = dataset.get_next_data(iter_id)
158 | 				y = data[0]
159 | 				market_price = data[1]
160 | 				feature = data[2:len(data)]
161 | 				ctr = tool.estimate_ctr(weight, feature, train_flag=False)
162 | 				labels.append(y)
163 | 				p_labels.append(ctr)
164 | 				bid_price = int(param * ctr / base_ctr)
165 | 				if bid_price > market_price:
166 | 					cost_sum += market_price
167 | 					imp_sum += 1
168 | 					clk_sum += y
169 | 					revenue_sum = int(revenue_sum - market_price + y * self.camp_v * 1E3)
170 | 				if cost_sum >= self.budget:
171 | 					break
172 | 			cpc = 0.0 if clk_sum == 0 else 1.0 * cost_sum / clk_sum * 1E-3
173 | 			cpm = 0.0 if imp_sum == 0 else 1.0 * cost_sum / imp_sum
174 | 			ctr = 0.0 if imp_sum == 0 else 1.0 * clk_sum / imp_sum
175 | 			roi = 0.0 if cost_sum == 0 else 1.0 * (revenue_sum) / cost_sum
176 | 			auc = roc_auc_score(labels, p_labels)
177 | 			rmse = math.sqrt(mean_squared_error(labels, p_labels))
178 | 			performance = {'bids':bid_sum, 'cpc':cpc, 'cpm':cpm, 
179 | 							'ctr': ctr, 'revenue':revenue_sum, 
180 | 							'imps':imp_sum, 'clks':clk_sum,
181 | 							'auc': auc, 'rmse': rmse,
182 | 							'roi': roi}
183 | 			if performance['revenue'] > opt_revenue:
184 | 				opt_revenue = performance['revenue']
185 | 				opt_param = param
186 | 		self.opt_param = opt_param
187 | 		return opt_param
188 | 
189 | 	def replay(self, weight, test_data, budget_prop):
190 | 		budget = int(1.0 * test_data.get_statistics()['cost_sum'] / budget_prop)
191 | 		base_ctr = self.train_data.get_statistics()['ctr']
192 | 		label = []
193 | 		p_labels = []
194 | 		bid_sum = 0
195 | 		cost_sum = 0
196 | 		imp_sum = 0
197 | 		clk_sum = 0
198 | 		revenue_sum = 0
199 | 		labels = []
200 | 		p_labels = []
201 | 		iter_id = test_data.init_index()
202 | 		while not test_data.reached_tail(iter_id):
203 | 			data = test_data.get_next_data(iter_id)
204 | 			bid_sum += 1
205 | 			y = data[0]
206 | 			mp = data[1]
207 | 			feature = data[2:len(data)]
208 | 			ctr = tool.estimate_ctr(weight, feature, train_flag=False)
209 | 			labels.append(y)
210 | 			p_labels.append(ctr)
211 | 			bp = int(self.opt_param * ctr / base_ctr)
212 | 			# bp = self.bid_strategy.bid(ctr)
213 | 			if bp > mp:
214 | 				cost_sum += mp
215 | 				imp_sum += 1
216 | 				clk_sum += y
217 | 				revenue_sum = int(revenue_sum - mp + y * self.camp_v * 1E3)
218 | 			if cost_sum >= budget:
219 | 				break
220 | 		cpc = 0.0 if clk_sum == 0 else 1.0 * cost_sum / clk_sum * 1E-3
221 | 		cpm = 0.0 if imp_sum == 0 else 1.0 * cost_sum / imp_sum
222 | 		ctr = 0.0 if imp_sum == 0 else 1.0 * clk_sum / imp_sum
223 | 		roi = 0.0 if cost_sum == 0 else 1.0 * revenue_sum / cost_sum
224 | 		auc = roc_auc_score(labels, p_labels)
225 | 		rmse = math.sqrt(mean_squared_error(labels, p_labels))
226 | 		performance = {'bids':bid_sum, 'cpc':cpc, 'cpm':cpm, 
227 | 					'auc': auc, 'rmse': rmse,
228 | 					'ctr': ctr, 'revenue':revenue_sum, 
229 | 					'imps':imp_sum, 'clks':clk_sum,
230 | 					'roi': roi}
231 | 		return performance
232 | 


--------------------------------------------------------------------------------
/python/em_model.py:
--------------------------------------------------------------------------------
  1 | from lr_model import LrModel
  2 | from opt_bid import OptBid
  3 | from dataset import Dataset
  4 | import math
  5 | import random
  6 | import copy
  7 | import tool
  8 | import config
  9 | from sklearn.metrics import roc_auc_score
 10 | from sklearn.metrics import mean_squared_error
 11 | 
 12 | 
 13 | class EmModel(LrModel):
 14 |     def __init__(self, train_data, test_data, model):
 15 |         LrModel.__init__(self, train_data, test_data)
 16 |         if not model in config.model_list:
 17 |             print "Wrong model name when initializing EM model."
 18 |             exit(-1)
 19 |         self.model = model
 20 |         self.em_log = []
 21 | 
 22 |     def init_parameters(self):
 23 |         self.camp_v = self.train_data.get_statistics()['ecpc']
 24 |         if config.ds_ratio > 0:
 25 |             self.ori_camp_v = self.train_data.get_statistics()['ori_ecpc']
 26 |         self.mu = 0.0
 27 |         self.budget = int(self.test_data.get_statistics()['cost_sum'] / config.budget_prop)
 28 |         # budget is only used in test phase or M-step
 29 | 
 30 |     def init_bid_strategy(self):
 31 |         self.bid_strategy = OptBid(self.camp_v, self.mu)
 32 | 
 33 |     def train(self):
 34 |         e_stop = False
 35 |         loop = 0
 36 |         while not e_stop:
 37 |             self.e_step()
 38 |             self.test()
 39 |             print "E step loop " + `loop+1` + "\t" + `self.test_log[len(self.test_log)-1]['performance']`
 40 |             e_stop = tool.judge_stop(self.test_log)
 41 |             loop += 1
 42 |         # self.test_log.pop(); self.test_log.pop() # delete the last two points
 43 |         best_log = self.get_best_e_log(self.test_log)
 44 |         print "Changed weight to the best one. the best revenue in last E phase is " + `best_log['performance']['revenue']`
 45 |         self.weight = best_log['weight']
 46 |         print "E step done."
 47 |         self.m_step()
 48 |         print "Optimal mu = " + `self.mu`
 49 |         print "M step done."
 50 | 
 51 |     def e_step(self):
 52 |         random.seed(10)
 53 |         train_data = self.train_data
 54 |         progress = 0.0
 55 |         iter_id = train_data.init_index()
 56 |         while not train_data.reached_tail(iter_id):
 57 |             data = train_data.get_next_data(iter_id)
 58 |             y = data[0]
 59 |             feature = data[2:len(data)]
 60 |             ctr = tool.estimate_ctr(self.weight, feature, train_flag=True)
 61 |             phi = 1.0 / (1.0 + self.mu)
 62 |             bp = self.bid_strategy.bid(ctr)
 63 |             pz = self.train_data.landscape.get_probability(bp)
 64 |             scale_x = (phi * ctr - y) * phi * math.pow(self.camp_v, 2) * pz * config.em_scale
 65 |             if config.model_name == 'eu':
 66 |                 scale_x = ctr * (1 - ctr) * scale_x
 67 |             for idx in feature:
 68 |                 self.weight[idx] = self.weight[idx] * self.reg_update_param - config.lr_alpha * scale_x
 69 |             # prg = train_data.get_progress(iter_id)
 70 |             # if prg < 0.9 and prg > (progress + config.train_progress_unit - 1E-3):
 71 |             #     self.test()
 72 |             #     progress += config.train_progress_unit
 73 | 
 74 |     def m_step(self):
 75 |         opt_mu = self.mu
 76 |         opt_revenue = -1E10
 77 |         opt_performance = {}
 78 |         test_data = self.test_data
 79 |         for mu in config.mu_range:
 80 |             bid_sum = 0
 81 |             cost_sum = 0
 82 |             imp_sum = 0
 83 |             clk_sum = 0
 84 |             revenue_sum = 0
 85 |             labels = []
 86 |             p_labels = []
 87 |             self.bid_strategy.set_mu(mu)
 88 |             iter_id = test_data.init_index()
 89 |             while not test_data.reached_tail(iter_id):
 90 |                 data = test_data.get_next_data(iter_id)
 91 |                 bid_sum += 1
 92 |                 y = data[0]
 93 |                 mp = data[1]
 94 |                 feature = data[2:len(data)]
 95 |                 ctr = tool.estimate_ctr(self.weight, feature, train_flag=False)
 96 |                 labels.append(y)
 97 |                 p_labels.append(ctr)
 98 |                 if config.ds_ratio > 0: # down sampled, needs to calibrate
 99 |                     bp = self.bid_strategy.bid_calib(self.ori_camp_v, mu, ctr)
100 |                 else:
101 |                     bp = self.bid_strategy.bid(ctr)
102 |                 # bp = self.bid_strategy.bid(ctr)
103 |                 if bp > mp:
104 |                     cost_sum += mp
105 |                     imp_sum += 1
106 |                     clk_sum += y
107 |                     # revenue_sum = int(revenue_sum - mp + y * self.camp_v * 1E3)\
108 |                     if config.ds_ratio > 0:
109 |                         revenue_sum = int(revenue_sum - mp + y * self.ori_camp_v * 1E3)
110 |                     else:
111 |                         revenue_sum = int(revenue_sum - mp + y * self.camp_v * 1E3)
112 |                 if cost_sum >= self.budget:
113 |                 	break
114 |             cpc = 0.0 if clk_sum == 0 else 1.0 * cost_sum / clk_sum * 1E-3
115 |             cpm = 0.0 if imp_sum == 0 else 1.0 * cost_sum / imp_sum
116 |             ctr = 0.0 if imp_sum == 0 else 1.0 * clk_sum / imp_sum
117 |             roi = 0.0 if cost_sum == 0 else 1.0 * revenue_sum / cost_sum
118 |             auc = roc_auc_score(labels, p_labels)
119 |             rmse = math.sqrt(mean_squared_error(labels, p_labels))
120 |             performance = {'bids':bid_sum, 'cpc':cpc, 'cpm':cpm, 
121 |                         'auc': auc, 'rmse': rmse,
122 |                         'ctr': ctr, 'revenue':revenue_sum, 
123 |                         'imps':imp_sum, 'clks':clk_sum,
124 |                         'roi': roi}
125 |             print "current mu = " + `mu` + "\t" + `performance`
126 |             if performance['revenue'] > opt_revenue:
127 |                 opt_revenue = performance['revenue']
128 |                 opt_performance = performance
129 |                 opt_mu = mu
130 |         # reset the value of mu in both bidding function and model inner parameter
131 |         self.bid_strategy.set_mu(opt_mu)
132 |         self.mu = opt_mu
133 |         log = self.make_log(self.weight, opt_performance)
134 |         log['m'] = True
135 |         self.test_log.append(log)
136 |         self.em_log.append(log)
137 | 
138 |     def make_log(self, weight, performance):
139 |         log = {}
140 |         log['weight'] = copy.deepcopy(weight)
141 |         log['performance'] = copy.deepcopy(performance)
142 |         log['mu'] = self.mu
143 |         return log
144 | 
145 |     def get_best_e_log(self, logs):
146 |         best_log = {}
147 |         if len(logs) == 0:
148 |             print "ERROR: no record in the log."
149 |         else:
150 |             best_revenue = -1E10
151 |             idx = len(logs)-1
152 |             while idx>=0 and not 'm' in logs[idx]:
153 |                 log = logs[idx]
154 |                 revenue = log['performance']['revenue']
155 |                 if revenue > best_revenue:
156 |                     best_revenue = revenue
157 |                     best_log = log
158 |                 idx -= 1
159 |         return best_log
160 | 
161 |     # def replay(self, weight, test_data, budget_prop):
162 |     #     budget = int(1.0 * test_data.get_statistics()['cost_sum'] / budget_prop)
163 |     #     mu = self.mu
164 |     #     label = []
165 |     #     p_labels = []
166 |     #     bid_sum = 0
167 |     #     cost_sum = 0
168 |     #     imp_sum = 0
169 |     #     clk_sum = 0
170 |     #     revenue_sum = 0
171 |     #     labels = []
172 |     #     p_labels = []
173 |     #     iter_id = test_data.init_index()
174 |     #     while not test_data.reached_tail(iter_id):
175 |     #         data = test_data.get_next_data(iter_id)
176 |     #         bid_sum += 1
177 |     #         y = data[0]
178 |     #         mp = data[1]
179 |     #         feature = data[2:len(data)]
180 |     #         ctr = tool.estimate_ctr(weight, feature, train_flag=False)
181 |     #         labels.append(y)
182 |     #         p_labels.append(ctr)
183 |     #         if config.ds_ratio > 0: # down sampled, needs to calibrate
184 |     #             bp = self.bid_strategy.bid_calib(self.ori_camp_v, mu, ctr)
185 |     #         else:
186 |     #             bp = self.bid_strategy.bid(ctr)
187 |     #         # bp = self.bid_strategy.bid(ctr)
188 |     #         if bp > mp:
189 |     #             cost_sum += mp
190 |     #             imp_sum += 1
191 |     #             clk_sum += y
192 |     #             # revenue_sum = int(revenue_sum - mp + y * self.camp_v * 1E3)\
193 |     #             if config.ds_ratio > 0:
194 |     #                 revenue_sum = int(revenue_sum - mp + y * self.ori_camp_v * 1E3)
195 |     #             else:
196 |     #                 revenue_sum = int(revenue_sum - mp + y * self.camp_v * 1E3)
197 |     #         if cost_sum >= budget:
198 |     #             break
199 |     #     cpc = 0.0 if clk_sum == 0 else 1.0 * cost_sum / clk_sum * 1E-3
200 |     #     cpm = 0.0 if imp_sum == 0 else 1.0 * cost_sum / imp_sum
201 |     #     ctr = 0.0 if imp_sum == 0 else 1.0 * clk_sum / imp_sum
202 |     #     roi = 0.0 if cost_sum == 0 else 1.0 * revenue_sum / cost_sum
203 |     #     auc = roc_auc_score(labels, p_labels)
204 |     #     rmse = math.sqrt(mean_squared_error(labels, p_labels))
205 |     #     performance = {'bids':bid_sum, 'cpc':cpc, 'cpm':cpm, 
206 |     #                 'auc': auc, 'rmse': rmse,
207 |     #                 'ctr': ctr, 'revenue':revenue_sum, 
208 |     #                 'imps':imp_sum, 'clks':clk_sum,
209 |     #                 'roi': roi}
210 |     #     return performance
211 | 


--------------------------------------------------------------------------------