├── users-11.dat ├── ch6_supporting_content.xlsx ├── errata ├── images │ ├── right_eq_1.png │ ├── right_eq_2.png │ ├── wrong_eq_1.png │ └── wrong_eq_2.png └── errata.htm ├── README.md ├── ratings-11.dat ├── movies-11.dat ├── fraud_data_3.csv ├── ch5.py ├── A1.py ├── ch5-prepare.sh ├── A1.data ├── ch5-criteo-process.py ├── requirements.txt ├── ch2.py ├── ch4.py ├── ch6.py ├── ch7.py └── ch3.py /users-11.dat: -------------------------------------------------------------------------------- 1 | 0::F::1::10::48067 2 | 1::M::56::16::70072 3 | 2::M::25::15::55117 4 | -------------------------------------------------------------------------------- /ch6_supporting_content.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dougmcilwraith/aiw-second-edition/HEAD/ch6_supporting_content.xlsx -------------------------------------------------------------------------------- /errata/images/right_eq_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dougmcilwraith/aiw-second-edition/HEAD/errata/images/right_eq_1.png -------------------------------------------------------------------------------- /errata/images/right_eq_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dougmcilwraith/aiw-second-edition/HEAD/errata/images/right_eq_2.png -------------------------------------------------------------------------------- /errata/images/wrong_eq_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dougmcilwraith/aiw-second-edition/HEAD/errata/images/wrong_eq_1.png -------------------------------------------------------------------------------- /errata/images/wrong_eq_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dougmcilwraith/aiw-second-edition/HEAD/errata/images/wrong_eq_2.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # aiw-second-edition 2 | Code required for the examples in Algorithms of the Intelligent Web, 2nd Edition 3 | 4 | Errata can be found under './errata' 5 | -------------------------------------------------------------------------------- /ratings-11.dat: -------------------------------------------------------------------------------- 1 | 0::0::5::978300760 2 | 0::1::4::978302109 3 | 0::2::5::978301968 4 | 0::3::4::978300275 5 | 0::4::5::978824291 6 | 0::5::4::978302268 7 | 0::6::5::978302039 8 | 1::0::5::978300719 9 | 1::2::5::978302268 10 | 1::4::4::978301368 11 | 1::5::5::978824268 12 | 1::7::5::978301752 13 | 1::8::4::978302281 14 | 1::9::5::978302124 15 | 2::0::1::978301753 16 | 2::2::2::978302188 17 | 2::3::2::978824268 18 | 2::6::3::978301777 19 | 2::9::2::978301713 20 | 2::10::1::978302039 21 | -------------------------------------------------------------------------------- /movies-11.dat: -------------------------------------------------------------------------------- 1 | 0::Toy Story (1995)::Animation|Children's|Comedy 2 | 1::Jumanji (1995)::Adventure|Children's|Fantasy 3 | 2::Grumpier Old Men (1995)::Comedy|Romance 4 | 3::Waiting to Exhale (1995)::Comedy|Drama 5 | 4::Father of the Bride Part II (1995)::Comedy 6 | 5::Heat (1995)::Action|Crime|Thriller 7 | 6::Sabrina (1995)::Comedy|Romance 8 | 7::Tom and Huck 
(1995)::Adventure|Children's 9 | 8::Sudden Death (1995)::Action 10 | 9::GoldenEye (1995)::Action|Adventure|Thriller 11 | 10::American President, The (1995)::Comedy|Drama|Romance 12 | -------------------------------------------------------------------------------- /fraud_data_3.csv: -------------------------------------------------------------------------------- 1 | IsFraud,Amount,Country,TimeOfTransaction,BusinessType,NumberOfTransactionsAtThisShop,DayOfWeek 0,10,DK,13,2,0,1 0,100,LT,18,1,3,4 0,49.99,AT,11,4,3,0 0,12,AUS,9,6,4,3 0,250,UK,12,1,5,6 0,149.99,UK,17,2,2,5 0,10,UK,16,8,1,4 0,49.99,DK,12,9,4,3 0,18,UK,14,2,3,6 0,27,DK,10,1,5,5 0,40,DK,11,1,6,2 0,2,UK,10,2,7,4 0,34.99,UK,9,4,8,3 0,2,UK,8,9,9,0 1,18000,LT,13,9,0,0 1,20000,LT,14,9,0,0 1,19000,LT,13,9,0,0 1,6000,LT,13,9,1,4 1,9000,LT,12,9,0,4 1,5000,LT,12,9,0,4 1,20000,LT,12,9,0,0 1,10000,LT,12,9,0,6 1,20000.01,UK,13,1,0,0 1,21000,LT,13,9,0,0 1,11000,LT,13,9,1,6 1,210000,UK,14,1,0,0 1,22000,UK,12,1,1,0 1,280000,LT,12,9,0,0 1,15000,LT,12,9,0,6 -------------------------------------------------------------------------------- /ch5.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pylab as pl 3 | from sklearn import svm, datasets 4 | from sklearn.utils import shuffle 5 | from sklearn.metrics import roc_curve, auc 6 | 7 | #Listing 5.6 8 | ground_truth_file_name = './ground_truth.dat' 9 | probability_file_name = './probabilities.out' 10 | 11 | ground_truth_file = open(ground_truth_file_name,'r') 12 | probability_file = open(probability_file_name,'r') 13 | 14 | ground_truth = np.array(map(int,ground_truth_file)) 15 | probabilities = np.array(map(float,probability_file)) 16 | 17 | ground_truth_file.close() 18 | probability_file.close() 19 | 20 | #from: http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html 21 | fpr, tpr, thresholds = roc_curve(ground_truth, probabilities) 22 | roc_auc = auc(fpr, tpr) 23 | print "Area under the ROC curve : %f" % roc_auc 24 | 25 | pl.clf() 26 | pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) 27 | pl.plot([0, 1], [0, 1], 'k--') 28 | pl.xlim([0.0, 1.0]) 29 | pl.ylim([0.0, 1.0]) 30 | pl.xlabel('False Positive Rate') 31 | pl.ylabel('True Positive Rate') 32 | pl.title('Receiver operating characteristic') 33 | pl.legend(loc="lower right") 34 | pl.show() 35 | -------------------------------------------------------------------------------- /A1.py: -------------------------------------------------------------------------------- 1 | #Code for Appendix 1. Algorithms of the Intelligent Web 2nd Edition. 
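#Note: these listings assume a Kafka broker reachable on localhost:9092 and kafka-python 0.9.5 (see requirements.txt).
#They are intended to be run listing by listing rather than as one script: the consumer loop in Listing A.1.2 keeps
#waiting for new messages, so anything after it would not be reached if the file were executed top to bottom.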
2 | 3 | #Listing A.1.1 4 | from kafka import KafkaClient, SimpleProducer 5 | 6 | kafka = KafkaClient("localhost:9092") 7 | 8 | producer = SimpleProducer(kafka) 9 | producer.send_messages("test", "Hello World!") 10 | producer.send_messages("test","This is my second message") 11 | producer.send_messages("test","And this is my third!") 12 | 13 | #Listing A.1.2 14 | from kafka import KafkaClient, SimpleConsumer 15 | 16 | kafka = KafkaClient("localhost:9092") 17 | consumer = SimpleConsumer(kafka,"mygroup","test") 18 | 19 | for message in consumer: 20 | print(message) 21 | 22 | #Listing A.1.5 23 | from kafka import KafkaClient, SimpleProducer 24 | 25 | kafka = KafkaClient("localhost:9092") 26 | producer = SimpleProducer(kafka,async=False, 27 | req_acks=SimpleProducer.ACK_AFTER_CLUSTER_COMMIT, 28 | ack_timeout=2000) 29 | 30 | producer.send_messages("test-replicated-topic", "Hello Kafka Cluster!") 31 | producer.send_messages("test-replicated-topic","Message to be replicated.") 32 | producer.send_messages("test-replicated-topic","And so is this!") 33 | 34 | #Listing A.1.8 35 | from kafka import KafkaClient 36 | from kafka.common import ProduceRequest 37 | from kafka.protocol import KafkaProtocol,create_message 38 | 39 | kafka = KafkaClient("localhost:9092") 40 | 41 | f = open('A1.data','r') 42 | 43 | for line in f: 44 | s = line.split("\t")[0] 45 | part = abs(hash(s)) % 3 46 | req = ProduceRequest(topic="click-streams",partition=part,messages=[create_message(s)]) 47 | resps = kafka.send_produce_request(payloads=[req], fail_on_error=True) 48 | -------------------------------------------------------------------------------- /ch5-prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #This file contains the code for listing 5.1 - 5.5. 4 | #This code can be run as is, if it placed in the same directory as the 5 | #train_vw_file and the location of your vowpal_wabbit checkout is correct. 6 | #In order to obtain the train_vw_file you must run ch5-criteo-process.py 7 | #over the train.txt file obtained from the criteo display challenge dataset (full). 
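#For example (an illustrative invocation; the paths shown are the defaults used by ch5-criteo-process.py):
#  python ch5-criteo-process.py ./train.txt ./train_vw_file ./test_vw_file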
8 | 9 | #!5.1 10 | #echo 'Running Listing 5.1' 11 | wc -l train_vw_file 12 | grep -c '^-1' train_vw_file 13 | grep -c '^1' train_vw_file 14 | 15 | #!5.2 16 | #echo 'Running Listing 5.2' 17 | grep '^-1' train_vw_file | sort -R > negative_examples.dat 18 | grep '^1' train_vw_file | sort -R > positive_examples.dat 19 | awk 'NR % 3 == 0' negative_examples.dat > negative_examples_downsampled.dat 20 | 21 | cat negative_examples_downsampled.dat > all_examples.dat 22 | cat positive_examples.dat >> all_examples.dat 23 | 24 | cat all_examples.dat | sort -R > all_examples_shuffled.dat 25 | awk 'NR % 10 == 0' all_examples_shuffled.dat > all_examples_shuffled_down.dat 26 | 27 | #!5.3 28 | echo 'Running Listing 5.3' 29 | vw all_examples_shuffled_down.dat --loss_function=logistic -c -b 22 --passes=3 -f model.vw 30 | vw all_examples_shuffled_down.dat -t -i model.vw --invert_hash readable.model 31 | cat readable.model | awk 'NR > 9 {print}' | sort -r -g -k 3 -t : | head -1000 > readable_model_sorted_top 32 | 33 | #!5.4 34 | #Output only 35 | 36 | #!5.5 37 | echo 'Running Listing 5.5' 38 | vw -d test_vw_file -t -i model.vw --loss_function=logistic -r predictions.out 39 | ~/dev/vowpal_wabbit/utl/logistic -0 predictions.out > probabilities.out 40 | cut -d ' ' -f 1 test_vw_file | sed -e 's/^-1/0/' > ground_truth.dat 41 | 42 | 43 | -------------------------------------------------------------------------------- /A1.data: -------------------------------------------------------------------------------- 1 | 381094c0-9e45-424f-90b6-ad64b06cc184 2014-01-02 17:33:07 click 2 | 6615087e-ea19-492c-869c-28fc1fa77588 2014-01-02 17:33:20 view 3 | d4889090-942b-4790-9831-362051b0847b 2014-01-02 17:34:01 adview 4 | e6688db5-d31f-4ede-985a-115fb51836fb 2014-01-02 17:35:06 adview 5 | e6688db5-d31f-4ede-985a-115fb51836fb 2014-01-02 17:35:30 click 6 | e6688db5-d31f-4ede-985a-115fb51836fb 2014-01-02 17:37:01 click 7 | e6688db5-d31f-4ede-985a-115fb51836fb 2014-01-02 17:39:00 convert 8 | ae6ae5c9-acb2-479b-adcb-0c29623d921b 2014-01-02 17:40:00 adview 9 | 1ac384c1-1b2d-4ed0-b467-7c90b7ac42d8 2014-01-02 17:40:01 adview 10 | 280bfa16-07ac-49ed-a1a5-9ab50a754027 2014-01-02 17:40:03 click 11 | dda0e95d-9c30-4f60-bb6a-febf05526b83 2014-01-02 17:40:05 adview 12 | 8a1204f1-5076-4d4c-8b23-84c77ad541d8 2014-01-02 17:40:10 adview 13 | 3bdb8f17-11cc-49cb-94cf-75676be909b7 2014-01-02 17:40:11 adview 14 | 69b61156-6c31-4317-aec5-bd48908b4973 2014-01-02 17:40:13 adview 15 | 69722471-0532-4f29-b2b4-2f9007604e4f 2014-01-02 17:40:14 adview 16 | 00e5edf6-a483-48fa-82ed-fbfac8a6b1e6 2014-01-02 17:40:15 adview 17 | 9398d369-6382-4be0-97bc-182b3713745f 2014-01-02 17:40:17 convert 18 | f40c1588-e4e1-4f7d-8ef5-5f76046886fb 2014-01-02 17:40:18 adview 19 | 54823527-fe62-4a81-8551-6282309b0a3f 2014-01-02 17:40:20 click 20 | 46d6f178-7c11-48c1-a1d7-f7152e7b2f1c 2014-01-02 17:40:26 adview 21 | 4c4e545b-d194-4531-962f-66e9d3b6116d 2014-01-02 17:41:00 convert 22 | 42b311f5-ba84-4666-a901-03063f7504a9 2014-01-02 17:41:01 adview 23 | bfa28923-c358-4741-bcbf-ff99b640ee14 2014-01-02 17:42:06 adview 24 | 54c29b39-5640-49b8-b610-6f2e6dc6bd1b 2014-01-02 17:42:10 convert 25 | edf6c5d2-1373-4dbb-8528-925d525b4a42 2014-01-02 17:43:03 click 26 | f7f6752f-03bf-43f1-927c-8acdafd235e2 2014-01-02 17:43:11 adview 27 | f4b7c0a6-b209-4cc4-b4e7-395489e0e724 2014-01-02 17:43:19 click 28 | -------------------------------------------------------------------------------- /ch5-criteo-process.py: -------------------------------------------------------------------------------- 1 | import 
numpy 2 | import random 3 | import sys 4 | 5 | def process_file (file_name, vw_file_name_train, vw_file_name_test, train_pct): 6 | file = open(file_name) 7 | vw_file_train = open(vw_file_name_train,'w') 8 | vw_file_test = open(vw_file_name_test,'w') 9 | 10 | continuous_set = [int(x) for x in numpy.linspace(1,13,13)] 11 | categorical_set =[int(x) for x in numpy.linspace(1,26,26)] 12 | 13 | print continuous_set 14 | print categorical_set 15 | 16 | first_line_headers = ["Class"] 17 | for i in continuous_set: 18 | first_line_headers.append("i"+str(i)) 19 | 20 | for c in categorical_set: 21 | first_line_headers.append("c"+str(c)) 22 | 23 | print first_line_headers 24 | 25 | for line in file: 26 | line_split = line.split('\t') 27 | target_click = -1 28 | if int(line_split[0])>0: 29 | target_click=1 30 | 31 | #Essentially now manually build up the training string 32 | vw_line = ""+str(target_click)+" " 33 | 34 | for feature_index in continuous_set: 35 | if line_split[feature_index]!="": 36 | vw_line+="|"+first_line_headers[feature_index] +" c:"+ line_split[feature_index] + " " 37 | 38 | for feature_index in [x+len(continuous_set) for x in categorical_set]: #Index doesn't start from 0 39 | if line_split[feature_index]!="": 40 | vw_line+="|"+first_line_headers[feature_index] + " " + line_split[feature_index] + " " 41 | 42 | if(random.random()<=train_pct): 43 | vw_file_train.write(vw_line.replace('\n', '')+"\n") #Get rid of any unwanted line breaks 44 | else: 45 | vw_file_test.write(vw_line.replace('\n', '')+"\n") #Get rid of any unwanted line breaks 46 | 47 | file.close() 48 | vw_file_train.close() 49 | vw_file_test.close() 50 | 51 | if __name__ == '__main__': 52 | filename='./train.txt' 53 | vw_file_name_train = './train_vw_file' 54 | vw_file_name_test = './test_vw_file' 55 | 56 | filename = sys.argv[1] if len(sys.argv) >=2 else filename 57 | vw_file_name_train = sys.argv[2] if len(sys.argv) >=3 else vw_file_name_train 58 | vw_file_name_test = sys.argv[3] if len(sys.argv) >=4 else vw_file_name_test 59 | 60 | process_file(filename,vw_file_name_train,vw_file_name_test,0.7) 61 | 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | --System: 2 | Python 2.7.10 3 | Ubuntu 14.04.2 4 | VirtualEnv 13.1.2 5 | Scala version 2.10.4 6 | 7 | --System prerequisties: 8 | sudo apt-get install python-dev libblas-dev liblapack-dev libatlas-base-dev gfortran libpng12-dev libfreetype6-dev libjpeg-dev tcl-dev tk-dev python-tk libboost-program-options-dev libboost-python-dev libsnappy-dev 9 | 10 | --Python prerequisites: 11 | pip install numpy scipy scikit-learn matplotlib csc-pysparse networkx divisi2 12 | 13 | --Installing Recsys: 14 | git clone https://github.com/ocelma/python-recsys.git 15 | cd python-recsys 16 | sudo python setup.py install 17 | --[under virtualenv ./venv/bin/python setup.py install] 18 | 19 | --Installing Movielens Dataset: 20 | wget http://files.grouplens.org/datasets/movielens/ml-1m.zip 21 | unzip ml-1m.zip 22 | 23 | --Installing VW (vw-8.1.1): 24 | git clone git://github.com/JohnLangford/vowpal_wabbit.git 25 | make 26 | make test 27 | sudo make install 28 | 29 | --Installing PyBrain (PyBrain==0.3.3) 30 | git clone git://github.com/pybrain/pybrain.git pybrain 31 | cd pybrain 32 | sudo python setup.py install 33 | --[Under virtualenv: ./venv/bin/python setup.py install] 34 | 35 | --Installing KafkaPython : 36 | git clone 
https://github.com/dpkp/kafka-python.git 37 | git checkout v0.9.5 38 | cd kafka-python 39 | sudo python setup.py install 40 | --[Under virtualenv: ./venv/bin/python setup.py install] 41 | 42 | ---This is a list of the versions of the requirements of the various packages installed above. 43 | ....numpy-1.10.1 44 | ....scipy-0.16.1 45 | ....sklearn-0.17 46 | ....matplotlib==1.5.0 47 | ....csc-pysparse-1.1.1.4 48 | ....csc-utils-0.6.7 49 | ....divisi2-2.2.5 50 | ....vowpal-wabbit-8.1.1 51 | 52 | ---This is a complete snapshot of my python environment after installing the above to a clean virtualenv environment 53 | ['backports-abc==0.4’, 54 | 'backports.ssl-match-hostname==3.4.0.2’, 55 | 'certifi==2015.11.20.1’, 56 | 'csc-pysparse==1.1.1.4’, 57 | 'csc-utils==0.6.7’, 58 | 'cycler==0.9.0’, 59 | 'decorator==4.0.4’, 60 | 'divisi2==2.2.5’, 61 | 'functools32==3.2.3.post2’, 62 | 'ipython-genutils==0.1.0’, 63 | 'ipython==4.0.1’, 64 | 'jinja2==2.8’, 65 | 'jsonschema==2.5.1’, 66 | 'kafka-python==0.9.5’, 67 | 'markupsafe==0.23’, 68 | 'matplotlib==1.5.0’, 69 | 'networkx==1.10’, 70 | 'numpy==1.10.1’, 71 | 'pandas==0.17.1’, 72 | 'path.py==8.1.2’, 73 | 'pexpect==4.0.1’, 74 | 'pickleshare==0.5’, 75 | 'pillow==3.0.0’, 76 | 'pip==7.1.2’, 77 | 'ptyprocess==0.5’, 78 | 'pybrain==0.3.3’, 79 | 'pyparsing==2.0.6’, 80 | 'python-dateutil==2.4.2’, 81 | 'python-recsys==0.2’, 82 | 'pytz==2015.7’, 83 | 'pyzmq==15.1.0’, 84 | 'scikit-image==0.11.3’, 85 | 'scikit-learn==0.17’, 86 | 'scipy==0.16.1’, 87 | 'seaborn==0.6.0’, 88 | 'setuptools==18.2’, 89 | 'simplegeneric==0.8.1’, 90 | 'singledispatch==3.4.0.3’, 91 | 'six==1.10.0’, 92 | 'sklearn==0.0’, 93 | 'sympy==0.7.6.1’, 94 | 'tornado==4.3’, 95 | 'traitlets==4.0.0’, 96 | 'wheel==0.24.0'] 97 | 98 | 99 | -------------------------------------------------------------------------------- /ch2.py: -------------------------------------------------------------------------------- 1 | #2.1 2 | import numpy as np 3 | from sklearn import datasets 4 | 5 | iris = datasets.load_iris() 6 | np.array(zip(iris.data,iris.target))[0:10] 7 | 8 | #2.2 9 | print(iris.DESCR) 10 | iris.target_names 11 | 12 | #2.3 13 | #Psuedocode 14 | 15 | #2.4 16 | from sklearn.cluster import KMeans 17 | from sklearn import datasets 18 | 19 | iris = datasets.load_iris() 20 | X = iris.data 21 | km = KMeans(n_clusters=3) 22 | km.fit(X) 23 | 24 | print(km.labels_) 25 | 26 | #2.5 27 | from sklearn.cluster import KMeans 28 | from sklearn import datasets 29 | from itertools import cycle, combinations 30 | import matplotlib.pyplot as pl 31 | 32 | iris = datasets.load_iris() 33 | km = KMeans(n_clusters=3) 34 | km.fit(iris.data) 35 | 36 | predictions = km.predict(iris.data) 37 | 38 | colors = cycle('rgb') 39 | markers = cycle('^+o') 40 | labels = ["Cluster 1","Cluster 2","Cluster 3"] 41 | targets = range(len(labels)) 42 | 43 | feature_index=range(len(iris.feature_names)) 44 | feature_names=iris.feature_names 45 | combs=combinations(feature_index,2) 46 | 47 | f,axarr=pl.subplots(3,2) 48 | axarr_flat=axarr.flat 49 | 50 | for comb, axflat in zip(combs,axarr_flat): 51 | for target, color, label, marker in zip(targets,colors,labels,markers): 52 | feature_index_x=comb[0] 53 | feature_index_y=comb[1] 54 | axflat.scatter(iris.data[predictions==target,feature_index_x], 55 | iris.data[predictions==target,feature_index_y],c=color,label=label,marker=marker) 56 | axflat.set_xlabel(feature_names[feature_index_x]) 57 | axflat.set_ylabel(feature_names[feature_index_y]) 58 | 59 | f.tight_layout() 60 | pl.show() 61 | 62 | #2.6 63 | from 
sklearn.mixture import GMM 64 | from sklearn import datasets 65 | from itertools import cycle, combinations 66 | import matplotlib as mpl 67 | import matplotlib.pyplot as pl 68 | import numpy as np 69 | 70 | # make_ellipses method taken from: http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_classifier.html#example-mixture-plot-gmm-classifier-py 71 | # Author: Ron Weiss , Gael Varoquaux 72 | # License: BSD 3 clause 73 | 74 | def make_ellipses(gmm, ax, x, y): 75 | for n, color in enumerate('rgb'): 76 | row_idx = np.array([x,y]) 77 | col_idx = np.array([x,y]) 78 | v, w = np.linalg.eigh(gmm._get_covars()[n][row_idx[:,None],col_idx]) 79 | u = w[0] / np.linalg.norm(w[0]) 80 | angle = np.arctan2(u[1], u[0]) 81 | angle = 180 * angle / np.pi # convert to degrees 82 | v *= 9 83 | ell = mpl.patches.Ellipse(gmm.means_[n, [x,y]], v[0], v[1], 84 | 180 + angle, color=color) 85 | ell.set_clip_box(ax.bbox) 86 | ell.set_alpha(0.2) 87 | ax.add_artist(ell) 88 | 89 | iris = datasets.load_iris() 90 | 91 | gmm = GMM(n_components=3,covariance_type='full', n_iter=20) 92 | gmm.fit(iris.data) 93 | 94 | predictions = gmm.predict(iris.data) 95 | 96 | colors = cycle('rgb') 97 | markers = cycle('^+o') 98 | labels = ["Cluster 1","Cluster 2","Cluster 3"] 99 | targets = range(len(labels)) 100 | 101 | feature_index=range(len(iris.feature_names)) 102 | feature_names=iris.feature_names 103 | combs=combinations(feature_index,2) 104 | 105 | f,axarr=pl.subplots(3,2) 106 | axarr_flat=axarr.flat 107 | 108 | for comb, axflat in zip(combs,axarr_flat): 109 | for target, color, label,marker in zip(targets,colors,labels,markers): 110 | feature_index_x=comb[0] 111 | feature_index_y=comb[1] 112 | axflat.scatter(iris.data[predictions==target,feature_index_x], 113 | iris.data[predictions==target,feature_index_y],c=color,label=label,marker=marker) 114 | axflat.set_xlabel(feature_names[feature_index_x]) 115 | axflat.set_ylabel(feature_names[feature_index_y]) 116 | make_ellipses(gmm,axflat,feature_index_x,feature_index_y) 117 | 118 | pl.tight_layout() 119 | pl.show() 120 | 121 | #2.7 122 | import numpy as np 123 | import matplotlib.pyplot as pl 124 | 125 | from sklearn import decomposition 126 | from sklearn import datasets 127 | from itertools import cycle 128 | 129 | iris = datasets.load_iris() 130 | X = iris.data 131 | Y = iris.target 132 | 133 | targets = range(len(iris.target_names)) 134 | colors = cycle('rgb') 135 | markers = cycle('^+o') 136 | 137 | pca = decomposition.PCA(n_components=2) 138 | pca.fit(X) 139 | 140 | X = pca.transform(X) 141 | 142 | for target,color,marker in zip(targets,colors,markers): 143 | pl.scatter(X[Y==target,0],X[Y==target,1],label=iris.target_names[target],c=color,marker=marker) 144 | 145 | pl.legend() 146 | pl.show() 147 | 148 | -------------------------------------------------------------------------------- /ch4.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | from sklearn.preprocessing import OneHotEncoder, LabelEncoder 4 | from sklearn import linear_model, datasets, cross_validation 5 | import matplotlib.pyplot as plt 6 | 7 | #4.1 8 | dataset = [] 9 | f = open('./fraud_data_3.csv', 'rU') 10 | try: 11 | reader = csv.reader(f,delimiter=',') 12 | next(reader, None) 13 | for row in reader: 14 | dataset.append(row) 15 | finally: 16 | f.close() 17 | 18 | #4.2 19 | target = np.array([x[0] for x in dataset]) 20 | data = np.array([x[1:] for x in dataset]) 21 | # Amount, Country, TimeOfTransaction, BusinessType, 
NumberOfTransactionsAtThisShop, DayOfWeek 22 | categorical_mask = [False,True,True,True,False,True] 23 | enc = LabelEncoder() 24 | 25 | for i in range(0,data.shape[1]): 26 | if(categorical_mask[i]): 27 | label_encoder = enc.fit(data[:,i]) 28 | print "Categorical classes:", label_encoder.classes_ 29 | integer_classes = label_encoder.transform(label_encoder.classes_) 30 | print "Integer classes:", integer_classes 31 | t = label_encoder.transform(data[:, i]) 32 | data[:, i] = t 33 | 34 | #4.3: 35 | mask = np.ones(data.shape, dtype=bool) 36 | 37 | for i in range(0,data.shape[1]): 38 | if(categorical_mask[i]): 39 | mask[:,i]=False 40 | 41 | data_non_categoricals = data[:, np.all(mask, axis=0)] #keep only the true, non categoricals 42 | data_categoricals = data[:,~np.all(mask,axis=0)] 43 | 44 | hotenc = OneHotEncoder() 45 | hot_encoder = hotenc.fit(data_categoricals) 46 | encoded_hot = hot_encoder.transform(data_categoricals) 47 | 48 | #4.4: 49 | new_data=data_non_categoricals 50 | new_data=new_data.astype(np.float) 51 | 52 | X_train, X_test, y_train, y_test = cross_validation.train_test_split(new_data, target, test_size=0.4, random_state=0,dtype=float) 53 | 54 | logreg = linear_model.LogisticRegression(tol=1e-10) 55 | logreg.fit(X_train,y_train) 56 | log_output = logreg.predict_log_proba(X_test) 57 | 58 | print("Odds: "+ str(np.exp(logreg.coef_))) 59 | print("Odds intercept" + str(np.exp(logreg.intercept_))) 60 | print("Likelihood Intercept:" + str(np.exp(logreg.intercept_)/(1+np.exp(logreg.intercept_)))) 61 | 62 | f, (ax1, ax2) = plt.subplots(1, 2, sharey=True) 63 | plt.setp((ax1,ax2),xticks=[]) 64 | 65 | ax1.scatter(range(0,len(log_output[:,1]),1),log_output[:,1],s=100,label='Log Prob.',color='Blue',alpha=0.5) 66 | ax1.scatter(range(0,len(y_test),1),y_test,label='Labels',s=250,color='Green',alpha=0.5) 67 | ax1.legend(bbox_to_anchor=(0., 1.02, 1., 0.102), ncol=2, loc=3, mode="expand", borderaxespad=0.) 68 | ax1.set_xlabel('Test Instances') 69 | ax1.set_ylabel('Binary Ground Truth Labels / Model Log. Prob.') 70 | 71 | prob_output = [np.exp(x) for x in log_output[:,1]] 72 | ax2.scatter(range(0,len(prob_output),1),prob_output,s=100,label='Prob.', color='Blue',alpha=0.5) 73 | ax2.scatter(range(0,len(y_test),1),y_test,label='Labels',s=250,color='Green',alpha=0.5) 74 | ax2.legend(bbox_to_anchor=(0., 1.02, 1., 0.102), ncol=2, loc=3, mode="expand", borderaxespad=0.) 75 | ax2.set_xlabel('Test Instances') 76 | ax2.set_ylabel('Binary Ground Truth Labels / Model Prob.') 77 | 78 | plt.show() 79 | 80 | #4.5: 81 | new_data = np.append(data_non_categoricals,encoded_hot.todense(),1) 82 | new_data=new_data.astype(np.float) 83 | 84 | X_train, X_test, y_train, y_test = cross_validation.train_test_split(new_data, target, test_size=0.4, random_state=0,dtype=float) 85 | 86 | logreg = linear_model.LogisticRegression(tol=1e-10) 87 | logreg.fit(X_train,y_train) 88 | log_output = logreg.predict_log_proba(X_test) 89 | 90 | print("Odds: "+ str(np.exp(logreg.coef_))) 91 | print("Odds intercept" + str(np.exp(logreg.intercept_))) 92 | print("Likelihood Intercept:" + str(np.exp(logreg.intercept_)/(1+np.exp(logreg.intercept_)))) 93 | 94 | f, (ax1, ax2) = plt.subplots(1, 2, sharey=True) 95 | plt.setp((ax1,ax2),xticks=[]) 96 | 97 | ax1.scatter(range(0,len(log_output[:,1]),1),log_output[:,1],s=100,label='Log Prob.',color='Blue',alpha=0.5) 98 | ax1.scatter(range(0,len(y_test),1),y_test,label='Labels',s=250,color='Green',alpha=0.5) 99 | ax1.legend(bbox_to_anchor=(0., 1.02, 1., 0.102), ncol=2, loc=3, mode="expand", borderaxespad=0.) 
100 | ax1.set_xlabel('Test Instances') 101 | ax1.set_ylabel('Binary Ground Truth Labels / Model Log. Prob.') 102 | 103 | prob_output = [np.exp(x) for x in log_output[:,1]] 104 | ax2.scatter(range(0,len(prob_output),1),prob_output,s=100,label='Prob.', color='Blue',alpha=0.5) 105 | ax2.scatter(range(0,len(y_test),1),y_test,label='Labels',s=250,color='Green',alpha=0.5) 106 | ax2.legend(bbox_to_anchor=(0., 1.02, 1., 0.102), ncol=2, loc=3, mode="expand", borderaxespad=0.) 107 | ax2.set_xlabel('Test Instances') 108 | ax2.set_ylabel('Binary Ground Truth Labels / Model Prob.') 109 | 110 | plt.show() 111 | 112 | 113 | -------------------------------------------------------------------------------- /errata/errata.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Manning Publications 8 | 9 | 10 | 52 | 53 | 54 | 55 | 56 |
57 | Algorithms of the Intelligent Web, 2nd Edition: Errata 58 |

59 | Any errors found in Algorithms of the Intelligent Web, 2nd Ed. should be reported using the online forum. I'll collect all errors from there and compile them in this errata for subsequent publications. Thanks for taking the time to contribute. 60 |

61 | 62 | 130 | 131 |

132 | -------------------------------------------------------------------------------- /ch6.py: -------------------------------------------------------------------------------- 1 | #6.3 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import random 5 | from sklearn.linear_model import perceptron 6 | 7 | #Let's set up our data and our target 8 | data = np.array([[0,1],[0,0],[1,0],[1,1]]) 9 | target = np.array([0,0,0,1]) 10 | 11 | #6.4 12 | p = perceptron.Perceptron(n_iter=100) 13 | p_out = p.fit(data,target) 14 | print p_out 15 | msg = ("Coefficients: %s, Intercept: %s") 16 | print msg % (str(p.coef_),str(p.intercept_)) 17 | 18 | #6.5 19 | colors = np.array(['k','r']) 20 | markers = np.array(['*','o']) 21 | for data,target in zip(data,target): 22 | plt.scatter(data[0],data[1],s=100,c=colors[target],marker=markers[target]) 23 | 24 | #Need to calculate a hyperplane the straight line as it intersects with z=0 25 | #Recall that our optimisation is solving z=m1x + m2y + c 26 | #If we want to understand the straight line created at the intersection with the viewing plane of x and y (where z=0) 27 | #0=m1x + m2y +c 28 | #m2y -m1x -c 29 | #y = -m1/m2x - c/m2 30 | 31 | grad = -p.coef_[0][0]/p.coef_[0][1] 32 | intercept = -p.intercept_/p.coef_[0][1] 33 | 34 | x_vals = np.linspace(0,1) 35 | y_vals = grad*x_vals + intercept 36 | plt.plot(x_vals,y_vals) 37 | plt.show() 38 | 39 | #6.X A HANDBUILT multilayer perceptron (NOT REPRODUCED IN THE BOOK). 40 | data = np.array([[0,1],[0,0],[1,0],[1,1]]) 41 | target = np.array([1,0,1,0]) 42 | 43 | colors = np.array(['k','r']) 44 | markers = np.array(['*','o']) 45 | for _data,_target in zip(data,target): 46 | plt.scatter(_data[0],_data[1],s=100,c=colors[_target],marker=markers[_target]) 47 | 48 | plt.xlabel('x_1') 49 | plt.ylabel('x_2') 50 | 51 | #Let's plot the hand built boolean classifier 52 | grad = -1 53 | intercept = 0.5 54 | 55 | x_vals = np.linspace(0,1) 56 | y_vals = grad*x_vals + intercept 57 | #plt.scatter(data[:,0],data[:,1],c=colors[target]) 58 | plt.plot(x_vals,y_vals,'b') 59 | 60 | grad = -1 61 | intercept = 1.5 62 | x_vals = np.linspace(0,1) 63 | y_vals = grad*x_vals + intercept 64 | plt.plot(x_vals,y_vals,'r') 65 | 66 | plt.xlabel('x_1') 67 | plt.ylabel('x_2') 68 | 69 | plt.text(0.8,-0.7,"x_2 = -x_1 + 0.5") 70 | plt.text(0,1.65,"x_2 = -x_1 + 1.5") 71 | plt.text(0.4,0.5,"y_1 = 1") 72 | plt.text(0.8,1.5,"y_1 = 0") 73 | plt.text(0,-0.5,"y_1 = 0") 74 | plt.show() 75 | 76 | #6.6 77 | from pybrain.structure import LinearLayer, SigmoidLayer 78 | from pybrain.datasets import SupervisedDataSet 79 | from pybrain.supervised.trainers import BackpropTrainer 80 | from pybrain.structure import FeedForwardNetwork 81 | from pybrain.structure import FullConnection 82 | from pybrain.structure.modules import BiasUnit 83 | 84 | import random 85 | 86 | #Create network modules 87 | net = FeedForwardNetwork() 88 | inl = LinearLayer(2) 89 | hidl = SigmoidLayer(2) 90 | outl = LinearLayer(1) 91 | b = BiasUnit() 92 | 93 | #6.7 94 | #Create connections 95 | in_to_h = FullConnection(inl, hidl) 96 | h_to_out = FullConnection(hidl, outl) 97 | bias_to_h = FullConnection(b,hidl) 98 | bias_to_out = FullConnection(b,outl) 99 | 100 | #Add modules to net 101 | net.addInputModule(inl) 102 | net.addModule(hidl); 103 | net.addModule(b) 104 | net.addOutputModule(outl) 105 | 106 | #Add connections to net and sort 107 | net.addConnection(in_to_h) 108 | net.addConnection(h_to_out) 109 | net.addConnection(bias_to_h) 110 | net.addConnection(bias_to_out) 111 | net.sortModules() 112 
| 113 | #6.8 114 | #input data 115 | d = [(0,0), 116 | (0,1), 117 | (1,0), 118 | (1,1)] 119 | 120 | #target class 121 | c = [0,1,1,0] 122 | 123 | data_set = SupervisedDataSet(2, 1) # 2 inputs, 1 output 124 | 125 | random.seed() 126 | for i in xrange(1000): 127 | r = random.randint(0,3) 128 | data_set.addSample(d[r],c[r]) 129 | 130 | backprop_trainer \ 131 | = BackpropTrainer(net, data_set, learningrate=0.1) 132 | 133 | for i in xrange(50): 134 | err = backprop_trainer.train() 135 | print "Iter. %d, err.: %.5f" % (i, err) 136 | 137 | #6.9 138 | print "[w(x_1,j=1),w(x_2,j=1),w(x_1,j=2),w(x_2,j=2)]: " + str(in_to_h.params) 139 | print "[w(j=1,j=3),w(j=2,j=3)]: "+str(h_to_out.params) 140 | print "[w(x_b,j=1),w(x_b,j=2)]: "+str(bias_to_h.params) 141 | print "[w(x_b,j=3)]:" +str(bias_to_out.params) 142 | 143 | #6.10 144 | print "Activating 0,0. Output: " + str(net.activate([0,0])) 145 | print "Activating 0,1. Output: " + str(net.activate([0,1])) 146 | print "Activating 1,0. Output: " + str(net.activate([1,0])) 147 | print "Activating 1,1. Output: " + str(net.activate([1,1])) 148 | 149 | 150 | ########### 151 | # From here onwards:RBMS 152 | 153 | # Original Authors: Yann N. Dauphin, Vlad Niculae, Gabriel Synnaeve 154 | # License: BSD 155 | 156 | import numpy as np 157 | import matplotlib.pyplot as plt 158 | 159 | from scipy.ndimage import convolve 160 | from sklearn import linear_model, datasets, metrics 161 | from sklearn.cross_validation import train_test_split 162 | from sklearn.neural_network import BernoulliRBM 163 | from sklearn.pipeline import Pipeline 164 | 165 | #6.12 166 | def nudge_dataset(X, Y): 167 | """ 168 | This produces a dataset 5 times bigger than the original one, 169 | by moving the 8x8 images in X around by 1px to left, right, down, up 170 | """ 171 | direction_vectors = [[[0, 1, 0],[0, 0, 0],[0, 0, 0]], 172 | [[0, 0, 0],[1, 0, 0],[0, 0, 0]], 173 | [[0, 0, 0],[0, 0, 1],[0, 0, 0]], 174 | [[0, 0, 0],[0, 0, 0],[0, 1, 0]]] 175 | shift = lambda x, w: convolve(x.reshape((8, 8)), mode='constant', weights=w).ravel() 176 | X = np.concatenate([X] + [np.apply_along_axis(shift, 1, X, vector) for vector in direction_vectors]) 177 | Y = np.concatenate([Y for _ in range(5)], axis=0) 178 | return X, Y 179 | 180 | #6.11 181 | digits = datasets.load_digits() 182 | X = np.asarray(digits.data, 'float32') 183 | X, Y = nudge_dataset(X, digits.target) 184 | X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001) # 0-1 scaling 185 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.2,random_state=0) 186 | 187 | #6.13 188 | # Models we will use 189 | logistic = linear_model.LogisticRegression() 190 | rbm = BernoulliRBM(random_state=0, verbose=True) 191 | 192 | classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)]) 193 | 194 | ############################################################################### 195 | # Training 196 | 197 | # Hyper-parameters. These were set by cross-validation, 198 | # using a GridSearchCV. Here we are not performing cross-validation to 199 | # save time. 
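# (A sketch of such a search, not part of the original listing and shown only for illustration,
#  could look like this with the sklearn 0.17 API used elsewhere in this repository:
#      from sklearn.grid_search import GridSearchCV
#      search = GridSearchCV(classifier, {'rbm__learning_rate': [0.01, 0.06, 0.1],
#                                         'rbm__n_components': [50, 100, 200],
#                                         'logistic__C': [1000.0, 6000.0, 10000.0]})
#      search.fit(X_train, Y_train)
#  The hand-tuned values below are used directly instead.)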
200 | rbm.learning_rate = 0.06 201 | rbm.n_iter = 20 202 | # More components tend to give better prediction performance, but larger 203 | # fitting time 204 | rbm.n_components = 100 205 | logistic.C = 6000.0 206 | 207 | # Training RBM-Logistic Pipeline 208 | classifier.fit(X_train, Y_train) 209 | 210 | # Training Logistic regression 211 | logistic_classifier = linear_model.LogisticRegression(C=100.0) 212 | logistic_classifier.fit(X_train, Y_train) 213 | 214 | #6.14 215 | # Evaluation 216 | print("Logistic regression using RBM features:\n%s\n" % ( 217 | metrics.classification_report( 218 | Y_test, 219 | classifier.predict(X_test)))) 220 | 221 | print("Logistic regression using raw pixel features:\n%s\n" % ( 222 | metrics.classification_report( 223 | Y_test, 224 | logistic_classifier.predict(X_test)))) 225 | 226 | 227 | #6.15 228 | plt.figure(figsize=(4.2, 4)) 229 | for i, comp in enumerate(rbm.components_): 230 | #print(i) 231 | #print(comp) 232 | plt.subplot(10, 10, i + 1) 233 | plt.imshow(comp.reshape((8, 8)), cmap=plt.cm.gray_r,interpolation='nearest') 234 | plt.xticks(()) 235 | plt.yticks(()) 236 | 237 | plt.suptitle('100 components extracted by RBM', fontsize=16) 238 | plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23) 239 | plt.show() 240 | -------------------------------------------------------------------------------- /ch7.py: -------------------------------------------------------------------------------- 1 | #7.1 2 | import math 3 | import random 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | import itertools 7 | 8 | #Assume 5000 samples per group 9 | n_experiment = 10000 10 | n_control = 10000 11 | 12 | p_experiment= 0.002 13 | p_control = 0.001 14 | 15 | se_experiment_sq = p_experiment*(1-p_experiment) / n_experiment 16 | se_control_sq = p_control*(1-p_control) / n_control 17 | 18 | Z = (p_experiment-p_control)/math.sqrt(se_experiment_sq+se_control_sq) 19 | 20 | print Z 21 | 22 | #Not a listing, but mentioned in the text and is required for production. 23 | def get_z(n_experiment, n_control, p_experiment, p_control): 24 | var_experiment = p_experiment*(1-p_experiment) / n_experiment 25 | var_control = p_control*(1-p_control) / n_control 26 | Z = (p_experiment-p_control)/math.sqrt(var_experiment+var_control) 27 | return Z 28 | 29 | experiment_array = np.linspace(100, 20000, 100) 30 | control_array = np.linspace(100,20000,100) 31 | experiment_probability_array = np.empty(100); experiment_probability_array.fill(0.002) 32 | control_probability_array = np.empty(100); control_probability_array.fill(0.001) 33 | data = zip(experiment_array,control_array,experiment_probability_array,control_probability_array) 34 | #Need to create associated parameters and zip these together. 
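#Compute the z value for each pair of group sizes, holding the conversion rates fixed at 0.002 and 0.001.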
35 | z_results = [get_z(k[0],k[1],k[2],k[3]) for k in data] 36 | 37 | plt.plot(experiment_array,z_results) 38 | #95 % confidence interval 39 | x_values = [0,20000] 40 | y_values = [1.96,1.96] 41 | plt.plot(x_values,y_values) 42 | plt.text(x_values[0],y_values[0]+0.01,"95%") 43 | 44 | #90% confidence interval 45 | x_values = [0,20000] 46 | y_values = [1.645,1.645] 47 | plt.plot(x_values,y_values) 48 | plt.text(x_values[0],y_values[0]+0.01,"90%") 49 | 50 | #80% confidence interval 1.28 51 | x_values = [0,20000] 52 | y_values = [1.28,1.28] 53 | plt.plot(x_values,y_values) 54 | plt.text(x_values[0],y_values[0]+0.01,"80%") 55 | 56 | #70% confidence interval 1.04 57 | x_values = [0,20000] 58 | y_values = [1.04,1.04] 59 | plt.plot(x_values,y_values) 60 | plt.text(x_values[0],y_values[0]+0.01,"70%") 61 | 62 | plt.xlabel("Number of users in each A/B group") 63 | plt.ylabel("z value") 64 | 65 | plt.title("Graph of number of users against z value for a fixed conversion rate \n (0.001/0.002 A/B respectively)") 66 | plt.show() 67 | 68 | ##Bayesian Bandit 69 | #7.6 70 | from scipy.stats import beta 71 | import numpy as np 72 | import matplotlib.pyplot as plt 73 | import random 74 | from operator import sub, div, add 75 | 76 | class Bandit: 77 | def __init__(self,probability): 78 | self.probability=probability 79 | 80 | def pull_handle(self): 81 | if random.random()1: 45 | idat['Title'] = p[1] 46 | if len(p)>2: 47 | idat['Genres'] = p[2] 48 | item.add_data(idat) 49 | itemdict[p[0]] = item 50 | return itemdict 51 | 52 | def items_reviewed(user_id, userdict): 53 | return [x[0] for x in userdict[user_id].get_items()] 54 | 55 | def get_score_item_reviewed(user_id,item_id, userdict): 56 | #return the first score in the list that matches the id 57 | return [x[1] for x in userdict[user_id].get_items() if x[0]==int(item_id)][0] 58 | 59 | def get_name_item_reviewed(user_id,userdict,itemdict): 60 | l = [(x[0],itemdict[str(x[0])].get_data()['Title'],itemdict[str(x[0])].get_data()['Genres'],x[1]) for x in userdict[user_id].get_items()] 61 | return sorted(l,key=lambda x: x[3], reverse=True) 62 | 63 | #Pass a list of (id, title, filmcategory,rating) and a filter_value (0,5] 64 | #Returns a list of [category,#count of ratings>filter_value], 65 | def movies_by_category(movie_list,filter_value): 66 | d = {} 67 | for x in movie_list: 68 | if x[3]>filter_value: 69 | if str(x[2]) in d: 70 | d[str(x[2])]+=1 71 | else: 72 | d[str(x[2])]=1 73 | dictlist = [] 74 | for key, value in d.iteritems(): 75 | temp = [key,value] 76 | dictlist.append(temp) 77 | 78 | return dictlist 79 | 80 | #End Helpers 81 | 82 | #3.2 83 | def similarity(user_id_a,user_id_b,sim_type=0): 84 | user_a_tuple_list = userdict[user_id_a].get_items() 85 | user_b_tuple_list = userdict[user_id_b].get_items() 86 | common_items=0 87 | sim = 0.0 88 | for t1 in user_a_tuple_list: 89 | for t2 in user_b_tuple_list: 90 | if (t1[0] == t2[0]): 91 | common_items += 1 92 | sim += math.pow(t1[1]-t2[1],2) 93 | if common_items>0: 94 | sim = math.sqrt(sim/common_items) 95 | sim = 1.0 - math.tanh(sim) 96 | if sim_type==1: 97 | max_common = min(len(user_a_tuple_list),len(user_b_tuple_list)) 98 | sim = sim * common_items / max_common 99 | print "User Similarity between",names[user_id_a],"and",names[user_id_b],"is", sim 100 | return sim #If no common items, returns zero 101 | 102 | #3.1 103 | # load movielens data 104 | dat_file = 'ratings-11.dat' 105 | item_file = 'movies-11.dat' 106 | 107 | names = ['Frank','Constantine','Catherine'] 108 | 109 | userdict = 
read_user_data_from_ratings(dat_file) #Build a userdict with users and ratings 110 | itemdict = read_item_data(item_file) #Build an item, info dict 111 | 112 | similarity(0,1,sim_type=0) 113 | similarity(0,1,sim_type=1) 114 | similarity(0,2,sim_type=0) 115 | similarity(1,2,sim_type=0) 116 | similarity(2,1,sim_type=0) 117 | similarity(0,0,sim_type=0) 118 | similarity(0,0,sim_type=1) 119 | 120 | #3.7 121 | class RatingCountMatrix: 122 | user_id_a = None 123 | user_id_b = None 124 | matrix = None 125 | 126 | #Instantiate with two users and the total possible number of ratings (eg 5 in the movielens case) 127 | def __init__(self, user_id_a, user_id_b): 128 | num_rating_values = max([x[0] for x in data]) 129 | self.user_id_a = user_id_a 130 | self.user_id_b = user_id_b 131 | self.matrix = np.empty((num_rating_values,num_rating_values,)) 132 | self.matrix[:] = 0 133 | self.calculate_matrix(user_id_a,user_id_b) 134 | 135 | def get_shape(self): 136 | a = self.matrix.shape 137 | return a 138 | 139 | def get_matrix(self): 140 | return self.matrix 141 | 142 | def calculate_matrix(self,user_id_a, user_id_b): 143 | for item in items_reviewed(user_id_a, userdict): 144 | if int(item) in items_reviewed(user_id_b, userdict): 145 | i = get_score_item_reviewed(user_id_a,item, userdict)-1 #Need to subtract 1 as indexes are 0 to 4 (5 items) 146 | j = get_score_item_reviewed(user_id_b,item, userdict)-1 147 | self.matrix[i][j] +=1 148 | 149 | 150 | #Total number of items that the users have both ranked 151 | def get_total_count(self): 152 | return self.matrix.sum() 153 | 154 | #Total number of items that they both agree on 155 | def get_agreement_count(self): 156 | return np.trace(self.matrix) #sum across the diagonal 157 | 158 | #3.6 159 | class SimilarityMatrix: 160 | 161 | similarity_matrix = None 162 | 163 | def __init__(self): 164 | self.build() 165 | 166 | def build(self): 167 | self.similarity_matrix = np.empty((len(userdict),len(userdict),)) 168 | 169 | for u in range(0,len(userdict)): 170 | for v in range(u+1,len(userdict)): 171 | rcm = RatingCountMatrix(int(u),int(v)) 172 | if(rcm.get_agreement_count()>0): 173 | self.similarity_matrix[u][v] = rcm.get_agreement_count()/rcm.get_total_count() 174 | else: 175 | self.similarity_matrix[u][v] = 0 176 | self.similarity_matrix[u][u]=1 177 | 178 | def get_user_similarity(self,user_id1, user_id2): 179 | return self.similarity_matrix[min(user_id1,user_id2),max(user_id1,user_id2)] # Due to upper traingular form 180 | 181 | # 3.5: 182 | def predict_rating(user_id, item_id): 183 | estimated_rating = None; 184 | similarity_sum = 0; 185 | weighted_rating_sum = 0; 186 | 187 | if (int(item_id) in items_reviewed(user_id,userdict)): 188 | return get_score_item_reviewed(user_id,item_id,userdict) 189 | else: 190 | for u in userdict.keys(): 191 | if (int(item_id) in items_reviewed(u,userdict)): 192 | item_rating = get_score_item_reviewed(u,item_id,userdict) 193 | user_similarity = similarity_matrix.get_user_similarity(user_id,u) 194 | weighted_rating = user_similarity * item_rating 195 | weighted_rating_sum += weighted_rating 196 | similarity_sum += user_similarity 197 | 198 | if (similarity_sum > 0.0): 199 | estimated_rating = weighted_rating_sum / similarity_sum 200 | 201 | return estimated_rating 202 | 203 | # 3.4: 204 | def recommend(user_id, top_n): 205 | #[(item,value),(item1, value1)...] 
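#Rank every item the user has not yet rated by its predicted rating and return the top_n highest.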
206 | recommendations = [] 207 | for i in itemdict.keys(): 208 | if (int(i) not in items_reviewed(int(user_id),userdict)): 209 | recommendations.append((i,predict_rating(user_id, i))) #only get those not predicted. 210 | recommendations.sort(key=lambda t: t[1], reverse=True) 211 | return recommendations[:top_n] 212 | 213 | #3.3: 214 | data = Data() 215 | format = {'col':0, 'row':1, 'value':2, 'ids': 'int'} 216 | # About format parameter: 217 | # 'row': 1 -> Rows in matrix come from column 1 in ratings.dat file 218 | # 'value': 2 -> Values (Mij) in matrix come from column 2 in ratings.dat file 219 | # 'ids': int -> Ids (row and col ids) are integers (not strings) 220 | data.load(dat_file, sep='::', format=format) 221 | 222 | similarity_matrix = SimilarityMatrix() 223 | recommend(0,10) 224 | recommend(1,10) 225 | recommend(2,10) 226 | 227 | ################## 228 | #Now we do SVD 229 | ################## 230 | 231 | #3.8 232 | svd = SVD() 233 | recsys.algorithm.VERBOSE = True 234 | 235 | dat_file = './ml-1m/ratings.dat' 236 | item_file = './ml-1m/movies.dat' 237 | 238 | data = Data() 239 | data.load(dat_file, sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int}) 240 | 241 | items_full = read_item_data(item_file) 242 | user_full = read_user_data_from_ratings(dat_file) 243 | 244 | svd.set_data(data) 245 | 246 | #3.9 247 | k = 100 248 | svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True) 249 | films = svd.recommend(10,only_unknowns=True, is_row=False) #movies that user 10 should see (That they haven't rated) 250 | 251 | #3.10 252 | [items_full[str(x[0])].get_data() for x in films] 253 | 254 | #3.11 255 | get_name_item_reviewed(10,user_full,items_full) 256 | 257 | #3.12 258 | items_full[str(2628)].get_data() 259 | users_for_star_wars = svd.recommend(2628,only_unknowns=True) 260 | users_for_star_wars 261 | 262 | #3.13 263 | movies_reviewed_by_sw_rec =[get_name_item_reviewed(x[0],user_full,items_full) for x in users_for_star_wars] 264 | movies_flatten = [movie for movie_list in movies_reviewed_by_sw_rec for movie in movie_list] 265 | movie_aggregate = movies_by_category(movies_flatten, 3) 266 | movies_sort = sorted(movie_aggregate,key=lambda x: x[1], reverse=True) 267 | movies_sort 268 | 269 | #3.14 270 | from recsys.evaluation.prediction import RMSE 271 | err = RMSE() 272 | for rating, item_id, user_id in data.get(): 273 | try: 274 | prediction = svd.predict(item_id, user_id) 275 | err.add(rating, prediction) 276 | except KeyError, k: 277 | continue 278 | 279 | print 'RMSE is ' + str(err.compute()) 280 | --------------------------------------------------------------------------------