├── users-11.dat
├── ch6_supporting_content.xlsx
├── errata
│   ├── images
│   │   ├── right_eq_1.png
│   │   ├── right_eq_2.png
│   │   ├── wrong_eq_1.png
│   │   └── wrong_eq_2.png
│   └── errata.htm
├── README.md
├── ratings-11.dat
├── movies-11.dat
├── fraud_data_3.csv
├── ch5.py
├── A1.py
├── ch5-prepare.sh
├── A1.data
├── ch5-criteo-process.py
├── requirements.txt
├── ch2.py
├── ch4.py
├── ch6.py
├── ch7.py
└── ch3.py
/users-11.dat:
--------------------------------------------------------------------------------
1 | 0::F::1::10::48067
2 | 1::M::56::16::70072
3 | 2::M::25::15::55117
4 |
--------------------------------------------------------------------------------
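A note on the format (not part of the repository): users-11.dat appears to follow the MovieLens 1M users.dat layout, UserID::Gender::Age::Occupation::Zip-code, with the three users renumbered from 0. A minimal parsing sketch, assuming that layout and the Python 2.7 environment pinned in requirements.txt:

#Minimal sketch, assuming the MovieLens 1M users.dat layout:
#UserID::Gender::Age::Occupation::Zip-code (Python 2.7, per requirements.txt).
def read_users(path='users-11.dat'):
    users = {}
    for line in open(path):
        line = line.strip()
        if not line:
            continue
        user_id, gender, age, occupation, zip_code = line.split('::')
        users[int(user_id)] = {'Gender': gender, 'Age': int(age),
                               'Occupation': int(occupation), 'Zip': zip_code}
    return users

print read_users()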
/ch6_supporting_content.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dougmcilwraith/aiw-second-edition/HEAD/ch6_supporting_content.xlsx
--------------------------------------------------------------------------------
/errata/images/right_eq_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dougmcilwraith/aiw-second-edition/HEAD/errata/images/right_eq_1.png
--------------------------------------------------------------------------------
/errata/images/right_eq_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dougmcilwraith/aiw-second-edition/HEAD/errata/images/right_eq_2.png
--------------------------------------------------------------------------------
/errata/images/wrong_eq_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dougmcilwraith/aiw-second-edition/HEAD/errata/images/wrong_eq_1.png
--------------------------------------------------------------------------------
/errata/images/wrong_eq_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dougmcilwraith/aiw-second-edition/HEAD/errata/images/wrong_eq_2.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # aiw-second-edition
2 | Code required for the examples in *Algorithms of the Intelligent Web, 2nd Edition*.
3 |
4 | Errata can be found under `./errata`.
5 |
--------------------------------------------------------------------------------
/ratings-11.dat:
--------------------------------------------------------------------------------
1 | 0::0::5::978300760
2 | 0::1::4::978302109
3 | 0::2::5::978301968
4 | 0::3::4::978300275
5 | 0::4::5::978824291
6 | 0::5::4::978302268
7 | 0::6::5::978302039
8 | 1::0::5::978300719
9 | 1::2::5::978302268
10 | 1::4::4::978301368
11 | 1::5::5::978824268
12 | 1::7::5::978301752
13 | 1::8::4::978302281
14 | 1::9::5::978302124
15 | 2::0::1::978301753
16 | 2::2::2::978302188
17 | 2::3::2::978824268
18 | 2::6::3::978301777
19 | 2::9::2::978301713
20 | 2::10::1::978302039
21 |
--------------------------------------------------------------------------------
/movies-11.dat:
--------------------------------------------------------------------------------
1 | 0::Toy Story (1995)::Animation|Children's|Comedy
2 | 1::Jumanji (1995)::Adventure|Children's|Fantasy
3 | 2::Grumpier Old Men (1995)::Comedy|Romance
4 | 3::Waiting to Exhale (1995)::Comedy|Drama
5 | 4::Father of the Bride Part II (1995)::Comedy
6 | 5::Heat (1995)::Action|Crime|Thriller
7 | 6::Sabrina (1995)::Comedy|Romance
8 | 7::Tom and Huck (1995)::Adventure|Children's
9 | 8::Sudden Death (1995)::Action
10 | 9::GoldenEye (1995)::Action|Adventure|Thriller
11 | 10::American President, The (1995)::Comedy|Drama|Romance
12 |
--------------------------------------------------------------------------------
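For reference (an assumption based on the rows above and on how ch3.py consumes these files): ratings-11.dat is UserID::MovieID::Rating::Timestamp and movies-11.dat is MovieID::Title::Genres, with genres pipe-separated. ch3.py builds richer user/item dictionaries with its own helper classes; the sketch below only illustrates the raw file layouts.

#Standalone sketch of the assumed file layouts; ch3.py does the real loading.
def load_ratings(path='ratings-11.dat'):
    ratings = []
    for line in open(path):
        line = line.strip()
        if not line:
            continue
        user_id, movie_id, rating, ts = line.split('::')
        ratings.append((int(user_id), int(movie_id), int(rating), int(ts)))
    return ratings

def load_movies(path='movies-11.dat'):
    movies = {}
    for line in open(path):
        line = line.strip()
        if not line:
            continue
        movie_id, title, genres = line.split('::')
        movies[int(movie_id)] = (title, genres.split('|'))
    return movies

print load_ratings()[:3]
print load_movies()[0]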
/fraud_data_3.csv:
--------------------------------------------------------------------------------
1 | IsFraud,Amount,Country,TimeOfTransaction,BusinessType,NumberOfTransactionsAtThisShop,DayOfWeek
2 | 0,10,DK,13,2,0,1
3 | 0,100,LT,18,1,3,4
4 | 0,49.99,AT,11,4,3,0
5 | 0,12,AUS,9,6,4,3
6 | 0,250,UK,12,1,5,6
7 | 0,149.99,UK,17,2,2,5
8 | 0,10,UK,16,8,1,4
9 | 0,49.99,DK,12,9,4,3
10 | 0,18,UK,14,2,3,6
11 | 0,27,DK,10,1,5,5
12 | 0,40,DK,11,1,6,2
13 | 0,2,UK,10,2,7,4
14 | 0,34.99,UK,9,4,8,3
15 | 0,2,UK,8,9,9,0
16 | 1,18000,LT,13,9,0,0
17 | 1,20000,LT,14,9,0,0
18 | 1,19000,LT,13,9,0,0
19 | 1,6000,LT,13,9,1,4
20 | 1,9000,LT,12,9,0,4
21 | 1,5000,LT,12,9,0,4
22 | 1,20000,LT,12,9,0,0
23 | 1,10000,LT,12,9,0,6
24 | 1,20000.01,UK,13,1,0,0
25 | 1,21000,LT,13,9,0,0
26 | 1,11000,LT,13,9,1,6
27 | 1,210000,UK,14,1,0,0
28 | 1,22000,UK,12,1,1,0
29 | 1,280000,LT,12,9,0,0
30 | 1,15000,LT,12,9,0,6
--------------------------------------------------------------------------------
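fraud_data_3.csv is the toy transaction set loaded by ch4.py (listings 4.1 onwards); the first column, IsFraud, is the class label. A quick, illustrative check of the class balance (not one of the book's listings):

#Illustration only; ch4.py performs the real loading and encoding (4.1-4.3).
import csv

counts = {'0': 0, '1': 0}
f = open('./fraud_data_3.csv', 'rU')
try:
    reader = csv.reader(f, delimiter=',')
    header = next(reader, None)
    for row in reader:
        counts[row[0]] += 1
finally:
    f.close()

print header
print "Legitimate:", counts['0'], "Fraudulent:", counts['1']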
/ch5.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pylab as pl
3 | from sklearn import svm, datasets
4 | from sklearn.utils import shuffle
5 | from sklearn.metrics import roc_curve, auc
6 |
7 | #Listing 5.6
8 | ground_truth_file_name = './ground_truth.dat'
9 | probability_file_name = './probabilities.out'
10 |
11 | ground_truth_file = open(ground_truth_file_name,'r')
12 | probability_file = open(probability_file_name,'r')
13 |
14 | ground_truth = np.array(map(int,ground_truth_file))
15 | probabilities = np.array(map(float,probability_file))
16 |
17 | ground_truth_file.close()
18 | probability_file.close()
19 |
20 | #from: http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
21 | fpr, tpr, thresholds = roc_curve(ground_truth, probabilities)
22 | roc_auc = auc(fpr, tpr)
23 | print "Area under the ROC curve : %f" % roc_auc
24 |
25 | pl.clf()
26 | pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
27 | pl.plot([0, 1], [0, 1], 'k--')
28 | pl.xlim([0.0, 1.0])
29 | pl.ylim([0.0, 1.0])
30 | pl.xlabel('False Positive Rate')
31 | pl.ylabel('True Positive Rate')
32 | pl.title('Receiver operating characteristic')
33 | pl.legend(loc="lower right")
34 | pl.show()
35 |
--------------------------------------------------------------------------------
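ch5.py expects two input files that ch5-prepare.sh writes from the Criteo data: ground_truth.dat (one 0/1 label per line) and probabilities.out (one probability per line, in the same order). If you only want to exercise the ROC plot, a synthetic pair can be generated with a sketch like the one below; the file formats are an assumption inferred from ch5.py and ch5-prepare.sh, and this is not a book listing.

#Sketch: synthetic ground_truth.dat / probabilities.out for exercising ch5.py.
import random

random.seed(0)
gt = open('ground_truth.dat', 'w')
pr = open('probabilities.out', 'w')
for _ in range(1000):
    label = 1 if random.random() < 0.3 else 0
    #Noisy scores that are higher, on average, for positive labels.
    score = min(1.0, max(0.0, random.gauss(0.65 if label else 0.35, 0.2)))
    gt.write("%d\n" % label)
    pr.write("%f\n" % score)
gt.close()
pr.close()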
/A1.py:
--------------------------------------------------------------------------------
1 | #Code for Appendix 1. Algorithms of the Intelligent Web 2nd Edition.
2 |
3 | #Listing A.1.1
4 | from kafka import KafkaClient, SimpleProducer
5 |
6 | kafka = KafkaClient("localhost:9092")
7 |
8 | producer = SimpleProducer(kafka)
9 | producer.send_messages("test", "Hello World!")
10 | producer.send_messages("test","This is my second message")
11 | producer.send_messages("test","And this is my third!")
12 |
13 | #Listing A.1.2
14 | from kafka import KafkaClient, SimpleConsumer
15 |
16 | kafka = KafkaClient("localhost:9092")
17 | consumer = SimpleConsumer(kafka,"mygroup","test")
18 |
19 | for message in consumer:
20 | print(message)
21 |
22 | #Listing A.1.5
23 | from kafka import KafkaClient, SimpleProducer
24 |
25 | kafka = KafkaClient("localhost:9092")
26 | producer = SimpleProducer(kafka,async=False,
27 | req_acks=SimpleProducer.ACK_AFTER_CLUSTER_COMMIT,
28 | ack_timeout=2000)
29 |
30 | producer.send_messages("test-replicated-topic", "Hello Kafka Cluster!")
31 | producer.send_messages("test-replicated-topic","Message to be replicated.")
32 | producer.send_messages("test-replicated-topic","And so is this!")
33 |
34 | #Listing A.1.8
35 | from kafka import KafkaClient
36 | from kafka.common import ProduceRequest
37 | from kafka.protocol import KafkaProtocol,create_message
38 |
39 | kafka = KafkaClient("localhost:9092")
40 |
41 | f = open('A1.data','r')
42 |
43 | for line in f:
44 | s = line.split("\t")[0]
45 | part = abs(hash(s)) % 3
46 | req = ProduceRequest(topic="click-streams",partition=part,messages=[create_message(s)])
47 | resps = kafka.send_produce_request(payloads=[req], fail_on_error=True)
48 |
--------------------------------------------------------------------------------
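All of the snippets in A1.py assume a Kafka broker reachable on localhost:9092 (with the referenced topics already created) and kafka-python 0.9.5, as pinned in requirements.txt. Listing A.1.8 hash-partitions the click-stream events across three partitions of the "click-streams" topic; the sketch below reads them back by reusing the SimpleConsumer pattern from Listing A.1.2 (an illustration, not a book listing):

#Sketch: consume the "click-streams" topic written by Listing A.1.8,
#reusing the SimpleConsumer pattern from Listing A.1.2 (kafka-python 0.9.5).
from kafka import KafkaClient, SimpleConsumer

kafka = KafkaClient("localhost:9092")
consumer = SimpleConsumer(kafka, "click-stream-group", "click-streams")

for message in consumer:
    print(message)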
/ch5-prepare.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #This file contains the code for listing 5.1 - 5.5.
4 | #This code can be run as is, if it is placed in the same directory as the
5 | #train_vw_file and the location of your vowpal_wabbit checkout is correct.
6 | #In order to obtain the train_vw_file you must run ch5-criteo-process.py
7 | #over the train.txt file obtained from the criteo display challenge dataset (full).
8 |
9 | #!5.1
10 | #echo 'Running Listing 5.1'
11 | wc -l train_vw_file
12 | grep -c '^-1' train_vw_file
13 | grep -c '^1' train_vw_file
14 |
15 | #!5.2
16 | #echo 'Running Listing 5.2'
17 | grep '^-1' train_vw_file | sort -R > negative_examples.dat
18 | grep '^1' train_vw_file | sort -R > positive_examples.dat
19 | awk 'NR % 3 == 0' negative_examples.dat > negative_examples_downsampled.dat
20 |
21 | cat negative_examples_downsampled.dat > all_examples.dat
22 | cat positive_examples.dat >> all_examples.dat
23 |
24 | cat all_examples.dat | sort -R > all_examples_shuffled.dat
25 | awk 'NR % 10 == 0' all_examples_shuffled.dat > all_examples_shuffled_down.dat
26 |
27 | #!5.3
28 | echo 'Running Listing 5.3'
29 | vw all_examples_shuffled_down.dat --loss_function=logistic -c -b 22 --passes=3 -f model.vw
30 | vw all_examples_shuffled_down.dat -t -i model.vw --invert_hash readable.model
31 | cat readable.model | awk 'NR > 9 {print}' | sort -r -g -k 3 -t : | head -1000 > readable_model_sorted_top
32 |
33 | #!5.4
34 | #Output only
35 |
36 | #!5.5
37 | echo 'Running Listing 5.5'
38 | vw -d test_vw_file -t -i model.vw --loss_function=logistic -r predictions.out
39 | ~/dev/vowpal_wabbit/utl/logistic -0 predictions.out > probabilities.out
40 | cut -d ' ' -f 1 test_vw_file | sed -e 's/^-1/0/' > ground_truth.dat
41 |
42 |
43 |
--------------------------------------------------------------------------------
/A1.data:
--------------------------------------------------------------------------------
1 | 381094c0-9e45-424f-90b6-ad64b06cc184 2014-01-02 17:33:07 click
2 | 6615087e-ea19-492c-869c-28fc1fa77588 2014-01-02 17:33:20 view
3 | d4889090-942b-4790-9831-362051b0847b 2014-01-02 17:34:01 adview
4 | e6688db5-d31f-4ede-985a-115fb51836fb 2014-01-02 17:35:06 adview
5 | e6688db5-d31f-4ede-985a-115fb51836fb 2014-01-02 17:35:30 click
6 | e6688db5-d31f-4ede-985a-115fb51836fb 2014-01-02 17:37:01 click
7 | e6688db5-d31f-4ede-985a-115fb51836fb 2014-01-02 17:39:00 convert
8 | ae6ae5c9-acb2-479b-adcb-0c29623d921b 2014-01-02 17:40:00 adview
9 | 1ac384c1-1b2d-4ed0-b467-7c90b7ac42d8 2014-01-02 17:40:01 adview
10 | 280bfa16-07ac-49ed-a1a5-9ab50a754027 2014-01-02 17:40:03 click
11 | dda0e95d-9c30-4f60-bb6a-febf05526b83 2014-01-02 17:40:05 adview
12 | 8a1204f1-5076-4d4c-8b23-84c77ad541d8 2014-01-02 17:40:10 adview
13 | 3bdb8f17-11cc-49cb-94cf-75676be909b7 2014-01-02 17:40:11 adview
14 | 69b61156-6c31-4317-aec5-bd48908b4973 2014-01-02 17:40:13 adview
15 | 69722471-0532-4f29-b2b4-2f9007604e4f 2014-01-02 17:40:14 adview
16 | 00e5edf6-a483-48fa-82ed-fbfac8a6b1e6 2014-01-02 17:40:15 adview
17 | 9398d369-6382-4be0-97bc-182b3713745f 2014-01-02 17:40:17 convert
18 | f40c1588-e4e1-4f7d-8ef5-5f76046886fb 2014-01-02 17:40:18 adview
19 | 54823527-fe62-4a81-8551-6282309b0a3f 2014-01-02 17:40:20 click
20 | 46d6f178-7c11-48c1-a1d7-f7152e7b2f1c 2014-01-02 17:40:26 adview
21 | 4c4e545b-d194-4531-962f-66e9d3b6116d 2014-01-02 17:41:00 convert
22 | 42b311f5-ba84-4666-a901-03063f7504a9 2014-01-02 17:41:01 adview
23 | bfa28923-c358-4741-bcbf-ff99b640ee14 2014-01-02 17:42:06 adview
24 | 54c29b39-5640-49b8-b610-6f2e6dc6bd1b 2014-01-02 17:42:10 convert
25 | edf6c5d2-1373-4dbb-8528-925d525b4a42 2014-01-02 17:43:03 click
26 | f7f6752f-03bf-43f1-927c-8acdafd235e2 2014-01-02 17:43:11 adview
27 | f4b7c0a6-b209-4cc4-b4e7-395489e0e724 2014-01-02 17:43:19 click
28 |
--------------------------------------------------------------------------------
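A1.data is the click-stream sample consumed by Listing A.1.8 in A1.py: each tab-separated line carries a UUID, a timestamp and an event type (view, adview, click, convert), and the listing routes each line to partition abs(hash(uuid)) % 3. The sketch below reproduces that partition assignment locally; the three-field tab layout is an assumption based on Listing A.1.8's split on "\t".

#Sketch: how Listing A.1.8's partitioning (abs(hash(uuid)) % 3) spreads A1.data.
from collections import Counter

partition_counts = Counter()
event_counts = Counter()
for line in open('A1.data'):
    fields = line.rstrip('\n').split('\t')
    if len(fields) < 3:
        continue
    partition_counts[abs(hash(fields[0])) % 3] += 1
    event_counts[fields[-1]] += 1

print "Events per partition:", dict(partition_counts)
print "Events per type:", dict(event_counts)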
/ch5-criteo-process.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | import random
3 | import sys
4 |
5 | def process_file (file_name, vw_file_name_train, vw_file_name_test, train_pct):
6 | file = open(file_name)
7 | vw_file_train = open(vw_file_name_train,'w')
8 | vw_file_test = open(vw_file_name_test,'w')
9 |
10 | continuous_set = [int(x) for x in numpy.linspace(1,13,13)]
11 | categorical_set =[int(x) for x in numpy.linspace(1,26,26)]
12 |
13 | print continuous_set
14 | print categorical_set
15 |
16 | first_line_headers = ["Class"]
17 | for i in continuous_set:
18 | first_line_headers.append("i"+str(i))
19 |
20 | for c in categorical_set:
21 | first_line_headers.append("c"+str(c))
22 |
23 | print first_line_headers
24 |
25 | for line in file:
26 | line_split = line.split('\t')
27 | target_click = -1
28 | if int(line_split[0])>0:
29 | target_click=1
30 |
31 | #Essentially now manually build up the training string
32 | vw_line = ""+str(target_click)+" "
33 |
34 | for feature_index in continuous_set:
35 | if line_split[feature_index]!="":
36 | vw_line+="|"+first_line_headers[feature_index] +" c:"+ line_split[feature_index] + " "
37 |
38 | for feature_index in [x+len(continuous_set) for x in categorical_set]: #Index doesn't start from 0
39 | if line_split[feature_index]!="":
40 | vw_line+="|"+first_line_headers[feature_index] + " " + line_split[feature_index] + " "
41 |
42 | if(random.random()<=train_pct):
43 | vw_file_train.write(vw_line.replace('\n', '')+"\n") #Get rid of any unwanted line breaks
44 | else:
45 | vw_file_test.write(vw_line.replace('\n', '')+"\n") #Get rid of any unwanted line breaks
46 |
47 | file.close()
48 | vw_file_train.close()
49 | vw_file_test.close()
50 |
51 | if __name__ == '__main__':
52 | filename='./train.txt'
53 | vw_file_name_train = './train_vw_file'
54 | vw_file_name_test = './test_vw_file'
55 |
56 | filename = sys.argv[1] if len(sys.argv) >=2 else filename
57 | vw_file_name_train = sys.argv[2] if len(sys.argv) >=3 else vw_file_name_train
58 | vw_file_name_test = sys.argv[3] if len(sys.argv) >=4 else vw_file_name_test
59 |
60 | process_file(filename,vw_file_name_train,vw_file_name_test,0.7)
61 |
62 |
63 |
64 |
65 |
66 |
--------------------------------------------------------------------------------
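The script takes up to three positional arguments: the Criteo train.txt input, the VW-format training output and the VW-format test output, defaulting to ./train.txt, ./train_vw_file and ./test_vw_file with a 70/30 split. ch5-prepare.sh expects those default output names. A sketch of the expected invocation (the shell equivalent is simply python ch5-criteo-process.py ./train.txt ./train_vw_file ./test_vw_file):

#Sketch: invoke ch5-criteo-process.py with its default paths made explicit.
import subprocess

subprocess.check_call(["python", "ch5-criteo-process.py",
                       "./train.txt", "./train_vw_file", "./test_vw_file"])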
/requirements.txt:
--------------------------------------------------------------------------------
1 | --System:
2 | Python 2.7.10
3 | Ubuntu 14.04.2
4 | VirtualEnv 13.1.2
5 | Scala version 2.10.4
6 |
7 | --System prerequisites:
8 | sudo apt-get install python-dev libblas-dev liblapack-dev libatlas-base-dev gfortran libpng12-dev libfreetype6-dev libjpeg-dev tcl-dev tk-dev python-tk libboost-program-options-dev libboost-python-dev libsnappy-dev
9 |
10 | --Python prerequisites:
11 | pip install numpy scipy scikit-learn matplotlib csc-pysparse networkx divisi2
12 |
13 | --Installing Recsys:
14 | git clone https://github.com/ocelma/python-recsys.git
15 | cd python-recsys
16 | sudo python setup.py install
17 | --[under virtualenv ./venv/bin/python setup.py install]
18 |
19 | --Installing Movielens Dataset:
20 | wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
21 | unzip ml-1m.zip
22 |
23 | --Installing VW (vw-8.1.1):
24 | git clone git://github.com/JohnLangford/vowpal_wabbit.git && cd vowpal_wabbit
25 | make
26 | make test
27 | sudo make install
28 |
29 | --Installing PyBrain (PyBrain==0.3.3):
30 | git clone git://github.com/pybrain/pybrain.git pybrain
31 | cd pybrain
32 | sudo python setup.py install
33 | --[Under virtualenv: ./venv/bin/python setup.py install]
34 |
35 | --Installing KafkaPython:
36 | git clone https://github.com/dpkp/kafka-python.git
37 | cd kafka-python
38 | git checkout v0.9.5
39 | sudo python setup.py install
40 | --[Under virtualenv: ./venv/bin/python setup.py install]
41 |
42 | ---This is a list of the versions of the packages installed above (and their dependencies).
43 | ....numpy-1.10.1
44 | ....scipy-0.16.1
45 | ....sklearn-0.17
46 | ....matplotlib==1.5.0
47 | ....csc-pysparse-1.1.1.4
48 | ....csc-utils-0.6.7
49 | ....divisi2-2.2.5
50 | ....vowpal-wabbit-8.1.1
51 |
52 | ---This is a complete snapshot of my Python environment after installing the above into a clean virtualenv:
53 | ['backports-abc==0.4',
54 | 'backports.ssl-match-hostname==3.4.0.2',
55 | 'certifi==2015.11.20.1',
56 | 'csc-pysparse==1.1.1.4',
57 | 'csc-utils==0.6.7',
58 | 'cycler==0.9.0',
59 | 'decorator==4.0.4',
60 | 'divisi2==2.2.5',
61 | 'functools32==3.2.3.post2',
62 | 'ipython-genutils==0.1.0',
63 | 'ipython==4.0.1',
64 | 'jinja2==2.8',
65 | 'jsonschema==2.5.1',
66 | 'kafka-python==0.9.5',
67 | 'markupsafe==0.23',
68 | 'matplotlib==1.5.0',
69 | 'networkx==1.10',
70 | 'numpy==1.10.1',
71 | 'pandas==0.17.1',
72 | 'path.py==8.1.2',
73 | 'pexpect==4.0.1',
74 | 'pickleshare==0.5',
75 | 'pillow==3.0.0',
76 | 'pip==7.1.2',
77 | 'ptyprocess==0.5',
78 | 'pybrain==0.3.3',
79 | 'pyparsing==2.0.6',
80 | 'python-dateutil==2.4.2',
81 | 'python-recsys==0.2',
82 | 'pytz==2015.7',
83 | 'pyzmq==15.1.0',
84 | 'scikit-image==0.11.3',
85 | 'scikit-learn==0.17',
86 | 'scipy==0.16.1',
87 | 'seaborn==0.6.0',
88 | 'setuptools==18.2',
89 | 'simplegeneric==0.8.1',
90 | 'singledispatch==3.4.0.3',
91 | 'six==1.10.0',
92 | 'sklearn==0.0',
93 | 'sympy==0.7.6.1',
94 | 'tornado==4.3',
95 | 'traitlets==4.0.0',
96 | 'wheel==0.24.0']
97 |
98 |
99 |
--------------------------------------------------------------------------------
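A quick way to confirm that an environment built from the steps above picked up the pinned versions is to compare __version__ strings. A minimal check (Python 2.7, per the --System section; expected values are taken from the list above):

#Minimal version check against the pins listed above (illustration only).
import numpy, scipy, sklearn, matplotlib, networkx

expected = {
    'numpy': '1.10.1',
    'scipy': '0.16.1',
    'sklearn': '0.17',
    'matplotlib': '1.5.0',
    'networkx': '1.10',
}

for module in (numpy, scipy, sklearn, matplotlib, networkx):
    name = module.__name__
    status = 'OK' if module.__version__ == expected[name] else 'MISMATCH'
    print name, module.__version__, "(expected %s: %s)" % (expected[name], status)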
/ch2.py:
--------------------------------------------------------------------------------
1 | #2.1
2 | import numpy as np
3 | from sklearn import datasets
4 |
5 | iris = datasets.load_iris()
6 | np.array(zip(iris.data,iris.target))[0:10]
7 |
8 | #2.2
9 | print(iris.DESCR)
10 | iris.target_names
11 |
12 | #2.3
13 | #Pseudocode
14 |
15 | #2.4
16 | from sklearn.cluster import KMeans
17 | from sklearn import datasets
18 |
19 | iris = datasets.load_iris()
20 | X = iris.data
21 | km = KMeans(n_clusters=3)
22 | km.fit(X)
23 |
24 | print(km.labels_)
25 |
26 | #2.5
27 | from sklearn.cluster import KMeans
28 | from sklearn import datasets
29 | from itertools import cycle, combinations
30 | import matplotlib.pyplot as pl
31 |
32 | iris = datasets.load_iris()
33 | km = KMeans(n_clusters=3)
34 | km.fit(iris.data)
35 |
36 | predictions = km.predict(iris.data)
37 |
38 | colors = cycle('rgb')
39 | markers = cycle('^+o')
40 | labels = ["Cluster 1","Cluster 2","Cluster 3"]
41 | targets = range(len(labels))
42 |
43 | feature_index=range(len(iris.feature_names))
44 | feature_names=iris.feature_names
45 | combs=combinations(feature_index,2)
46 |
47 | f,axarr=pl.subplots(3,2)
48 | axarr_flat=axarr.flat
49 |
50 | for comb, axflat in zip(combs,axarr_flat):
51 | for target, color, label, marker in zip(targets,colors,labels,markers):
52 | feature_index_x=comb[0]
53 | feature_index_y=comb[1]
54 | axflat.scatter(iris.data[predictions==target,feature_index_x],
55 | iris.data[predictions==target,feature_index_y],c=color,label=label,marker=marker)
56 | axflat.set_xlabel(feature_names[feature_index_x])
57 | axflat.set_ylabel(feature_names[feature_index_y])
58 |
59 | f.tight_layout()
60 | pl.show()
61 |
62 | #2.6
63 | from sklearn.mixture import GMM
64 | from sklearn import datasets
65 | from itertools import cycle, combinations
66 | import matplotlib as mpl
67 | import matplotlib.pyplot as pl
68 | import numpy as np
69 |
70 | # make_ellipses method taken from: http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_classifier.html#example-mixture-plot-gmm-classifier-py
71 | # Author: Ron Weiss , Gael Varoquaux
72 | # License: BSD 3 clause
73 |
74 | def make_ellipses(gmm, ax, x, y):
75 | for n, color in enumerate('rgb'):
76 | row_idx = np.array([x,y])
77 | col_idx = np.array([x,y])
78 | v, w = np.linalg.eigh(gmm._get_covars()[n][row_idx[:,None],col_idx])
79 | u = w[0] / np.linalg.norm(w[0])
80 | angle = np.arctan2(u[1], u[0])
81 | angle = 180 * angle / np.pi # convert to degrees
82 | v *= 9
83 | ell = mpl.patches.Ellipse(gmm.means_[n, [x,y]], v[0], v[1],
84 | 180 + angle, color=color)
85 | ell.set_clip_box(ax.bbox)
86 | ell.set_alpha(0.2)
87 | ax.add_artist(ell)
88 |
89 | iris = datasets.load_iris()
90 |
91 | gmm = GMM(n_components=3,covariance_type='full', n_iter=20)
92 | gmm.fit(iris.data)
93 |
94 | predictions = gmm.predict(iris.data)
95 |
96 | colors = cycle('rgb')
97 | markers = cycle('^+o')
98 | labels = ["Cluster 1","Cluster 2","Cluster 3"]
99 | targets = range(len(labels))
100 |
101 | feature_index=range(len(iris.feature_names))
102 | feature_names=iris.feature_names
103 | combs=combinations(feature_index,2)
104 |
105 | f,axarr=pl.subplots(3,2)
106 | axarr_flat=axarr.flat
107 |
108 | for comb, axflat in zip(combs,axarr_flat):
109 | for target, color, label,marker in zip(targets,colors,labels,markers):
110 | feature_index_x=comb[0]
111 | feature_index_y=comb[1]
112 | axflat.scatter(iris.data[predictions==target,feature_index_x],
113 | iris.data[predictions==target,feature_index_y],c=color,label=label,marker=marker)
114 | axflat.set_xlabel(feature_names[feature_index_x])
115 | axflat.set_ylabel(feature_names[feature_index_y])
116 | make_ellipses(gmm,axflat,feature_index_x,feature_index_y)
117 |
118 | pl.tight_layout()
119 | pl.show()
120 |
121 | #2.7
122 | import numpy as np
123 | import matplotlib.pyplot as pl
124 |
125 | from sklearn import decomposition
126 | from sklearn import datasets
127 | from itertools import cycle
128 |
129 | iris = datasets.load_iris()
130 | X = iris.data
131 | Y = iris.target
132 |
133 | targets = range(len(iris.target_names))
134 | colors = cycle('rgb')
135 | markers = cycle('^+o')
136 |
137 | pca = decomposition.PCA(n_components=2)
138 | pca.fit(X)
139 |
140 | X = pca.transform(X)
141 |
142 | for target,color,marker in zip(targets,colors,markers):
143 | pl.scatter(X[Y==target,0],X[Y==target,1],label=iris.target_names[target],c=color,marker=marker)
144 |
145 | pl.legend()
146 | pl.show()
147 |
148 |
--------------------------------------------------------------------------------
/ch4.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import numpy as np
3 | from sklearn.preprocessing import OneHotEncoder, LabelEncoder
4 | from sklearn import linear_model, datasets, cross_validation
5 | import matplotlib.pyplot as plt
6 |
7 | #4.1
8 | dataset = []
9 | f = open('./fraud_data_3.csv', 'rU')
10 | try:
11 | reader = csv.reader(f,delimiter=',')
12 | next(reader, None)
13 | for row in reader:
14 | dataset.append(row)
15 | finally:
16 | f.close()
17 |
18 | #4.2
19 | target = np.array([x[0] for x in dataset])
20 | data = np.array([x[1:] for x in dataset])
21 | # Amount, Country, TimeOfTransaction, BusinessType, NumberOfTransactionsAtThisShop, DayOfWeek
22 | categorical_mask = [False,True,True,True,False,True]
23 | enc = LabelEncoder()
24 |
25 | for i in range(0,data.shape[1]):
26 | if(categorical_mask[i]):
27 | label_encoder = enc.fit(data[:,i])
28 | print "Categorical classes:", label_encoder.classes_
29 | integer_classes = label_encoder.transform(label_encoder.classes_)
30 | print "Integer classes:", integer_classes
31 | t = label_encoder.transform(data[:, i])
32 | data[:, i] = t
33 |
34 | #4.3:
35 | mask = np.ones(data.shape, dtype=bool)
36 |
37 | for i in range(0,data.shape[1]):
38 | if(categorical_mask[i]):
39 | mask[:,i]=False
40 |
41 | data_non_categoricals = data[:, np.all(mask, axis=0)] #keep only the true, non categoricals
42 | data_categoricals = data[:,~np.all(mask,axis=0)]
43 |
44 | hotenc = OneHotEncoder()
45 | hot_encoder = hotenc.fit(data_categoricals)
46 | encoded_hot = hot_encoder.transform(data_categoricals)
47 |
48 | #4.4:
49 | new_data=data_non_categoricals
50 | new_data=new_data.astype(np.float)
51 |
52 | X_train, X_test, y_train, y_test = cross_validation.train_test_split(new_data, target, test_size=0.4, random_state=0)
53 |
54 | logreg = linear_model.LogisticRegression(tol=1e-10)
55 | logreg.fit(X_train,y_train)
56 | log_output = logreg.predict_log_proba(X_test)
57 |
58 | print("Odds: "+ str(np.exp(logreg.coef_)))
59 | print("Odds intercept" + str(np.exp(logreg.intercept_)))
60 | print("Likelihood Intercept:" + str(np.exp(logreg.intercept_)/(1+np.exp(logreg.intercept_))))
61 |
62 | f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
63 | plt.setp((ax1,ax2),xticks=[])
64 |
65 | ax1.scatter(range(0,len(log_output[:,1]),1),log_output[:,1],s=100,label='Log Prob.',color='Blue',alpha=0.5)
66 | ax1.scatter(range(0,len(y_test),1),y_test,label='Labels',s=250,color='Green',alpha=0.5)
67 | ax1.legend(bbox_to_anchor=(0., 1.02, 1., 0.102), ncol=2, loc=3, mode="expand", borderaxespad=0.)
68 | ax1.set_xlabel('Test Instances')
69 | ax1.set_ylabel('Binary Ground Truth Labels / Model Log. Prob.')
70 |
71 | prob_output = [np.exp(x) for x in log_output[:,1]]
72 | ax2.scatter(range(0,len(prob_output),1),prob_output,s=100,label='Prob.', color='Blue',alpha=0.5)
73 | ax2.scatter(range(0,len(y_test),1),y_test,label='Labels',s=250,color='Green',alpha=0.5)
74 | ax2.legend(bbox_to_anchor=(0., 1.02, 1., 0.102), ncol=2, loc=3, mode="expand", borderaxespad=0.)
75 | ax2.set_xlabel('Test Instances')
76 | ax2.set_ylabel('Binary Ground Truth Labels / Model Prob.')
77 |
78 | plt.show()
79 |
80 | #4.5:
81 | new_data = np.append(data_non_categoricals,encoded_hot.todense(),1)
82 | new_data=new_data.astype(np.float)
83 |
84 | X_train, X_test, y_train, y_test = cross_validation.train_test_split(new_data, target, test_size=0.4, random_state=0)
85 |
86 | logreg = linear_model.LogisticRegression(tol=1e-10)
87 | logreg.fit(X_train,y_train)
88 | log_output = logreg.predict_log_proba(X_test)
89 |
90 | print("Odds: "+ str(np.exp(logreg.coef_)))
91 | print("Odds intercept" + str(np.exp(logreg.intercept_)))
92 | print("Likelihood Intercept:" + str(np.exp(logreg.intercept_)/(1+np.exp(logreg.intercept_))))
93 |
94 | f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
95 | plt.setp((ax1,ax2),xticks=[])
96 |
97 | ax1.scatter(range(0,len(log_output[:,1]),1),log_output[:,1],s=100,label='Log Prob.',color='Blue',alpha=0.5)
98 | ax1.scatter(range(0,len(y_test),1),y_test,label='Labels',s=250,color='Green',alpha=0.5)
99 | ax1.legend(bbox_to_anchor=(0., 1.02, 1., 0.102), ncol=2, loc=3, mode="expand", borderaxespad=0.)
100 | ax1.set_xlabel('Test Instances')
101 | ax1.set_ylabel('Binary Ground Truth Labels / Model Log. Prob.')
102 |
103 | prob_output = [np.exp(x) for x in log_output[:,1]]
104 | ax2.scatter(range(0,len(prob_output),1),prob_output,s=100,label='Prob.', color='Blue',alpha=0.5)
105 | ax2.scatter(range(0,len(y_test),1),y_test,label='Labels',s=250,color='Green',alpha=0.5)
106 | ax2.legend(bbox_to_anchor=(0., 1.02, 1., 0.102), ncol=2, loc=3, mode="expand", borderaxespad=0.)
107 | ax2.set_xlabel('Test Instances')
108 | ax2.set_ylabel('Binary Ground Truth Labels / Model Prob.')
109 |
110 | plt.show()
111 |
112 |
113 |
--------------------------------------------------------------------------------
/errata/errata.htm:
--------------------------------------------------------------------------------
[HTML errata page; the markup was stripped in this dump and only the page title "Manning Publications" survives.]
--------------------------------------------------------------------------------
/ch6.py:
--------------------------------------------------------------------------------
1 | #6.3
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import random
5 | from sklearn.linear_model import perceptron
6 |
7 | #Let's set up our data and our target
8 | data = np.array([[0,1],[0,0],[1,0],[1,1]])
9 | target = np.array([0,0,0,1])
10 |
11 | #6.4
12 | p = perceptron.Perceptron(n_iter=100)
13 | p_out = p.fit(data,target)
14 | print p_out
15 | msg = ("Coefficients: %s, Intercept: %s")
16 | print msg % (str(p.coef_),str(p.intercept_))
17 |
18 | #6.5
19 | colors = np.array(['k','r'])
20 | markers = np.array(['*','o'])
21 | for data,target in zip(data,target):
22 | plt.scatter(data[0],data[1],s=100,c=colors[target],marker=markers[target])
23 |
24 | #Need to calculate the straight line where the decision hyperplane intersects the plane z=0
25 | #Recall that our optimisation is solving z = m1*x + m2*y + c
26 | #The straight line created at the intersection with the viewing plane of x and y (where z=0) is:
27 | #0 = m1*x + m2*y + c
28 | #m2*y = -m1*x - c
29 | #y = -(m1/m2)*x - c/m2
30 |
31 | grad = -p.coef_[0][0]/p.coef_[0][1]
32 | intercept = -p.intercept_/p.coef_[0][1]
33 |
34 | x_vals = np.linspace(0,1)
35 | y_vals = grad*x_vals + intercept
36 | plt.plot(x_vals,y_vals)
37 | plt.show()
38 |
39 | #6.X A HANDBUILT multilayer perceptron (NOT REPRODUCED IN THE BOOK).
40 | data = np.array([[0,1],[0,0],[1,0],[1,1]])
41 | target = np.array([1,0,1,0])
42 |
43 | colors = np.array(['k','r'])
44 | markers = np.array(['*','o'])
45 | for _data,_target in zip(data,target):
46 | plt.scatter(_data[0],_data[1],s=100,c=colors[_target],marker=markers[_target])
47 |
48 | plt.xlabel('x_1')
49 | plt.ylabel('x_2')
50 |
51 | #Let's plot the hand built boolean classifier
52 | grad = -1
53 | intercept = 0.5
54 |
55 | x_vals = np.linspace(0,1)
56 | y_vals = grad*x_vals + intercept
57 | #plt.scatter(data[:,0],data[:,1],c=colors[target])
58 | plt.plot(x_vals,y_vals,'b')
59 |
60 | grad = -1
61 | intercept = 1.5
62 | x_vals = np.linspace(0,1)
63 | y_vals = grad*x_vals + intercept
64 | plt.plot(x_vals,y_vals,'r')
65 |
66 | plt.xlabel('x_1')
67 | plt.ylabel('x_2')
68 |
69 | plt.text(0.8,-0.7,"x_2 = -x_1 + 0.5")
70 | plt.text(0,1.65,"x_2 = -x_1 + 1.5")
71 | plt.text(0.4,0.5,"y_1 = 1")
72 | plt.text(0.8,1.5,"y_1 = 0")
73 | plt.text(0,-0.5,"y_1 = 0")
74 | plt.show()
75 |
76 | #6.6
77 | from pybrain.structure import LinearLayer, SigmoidLayer
78 | from pybrain.datasets import SupervisedDataSet
79 | from pybrain.supervised.trainers import BackpropTrainer
80 | from pybrain.structure import FeedForwardNetwork
81 | from pybrain.structure import FullConnection
82 | from pybrain.structure.modules import BiasUnit
83 |
84 | import random
85 |
86 | #Create network modules
87 | net = FeedForwardNetwork()
88 | inl = LinearLayer(2)
89 | hidl = SigmoidLayer(2)
90 | outl = LinearLayer(1)
91 | b = BiasUnit()
92 |
93 | #6.7
94 | #Create connections
95 | in_to_h = FullConnection(inl, hidl)
96 | h_to_out = FullConnection(hidl, outl)
97 | bias_to_h = FullConnection(b,hidl)
98 | bias_to_out = FullConnection(b,outl)
99 |
100 | #Add modules to net
101 | net.addInputModule(inl)
102 | net.addModule(hidl);
103 | net.addModule(b)
104 | net.addOutputModule(outl)
105 |
106 | #Add connections to net and sort
107 | net.addConnection(in_to_h)
108 | net.addConnection(h_to_out)
109 | net.addConnection(bias_to_h)
110 | net.addConnection(bias_to_out)
111 | net.sortModules()
112 |
113 | #6.8
114 | #input data
115 | d = [(0,0),
116 | (0,1),
117 | (1,0),
118 | (1,1)]
119 |
120 | #target class
121 | c = [0,1,1,0]
122 |
123 | data_set = SupervisedDataSet(2, 1) # 2 inputs, 1 output
124 |
125 | random.seed()
126 | for i in xrange(1000):
127 | r = random.randint(0,3)
128 | data_set.addSample(d[r],c[r])
129 |
130 | backprop_trainer \
131 | = BackpropTrainer(net, data_set, learningrate=0.1)
132 |
133 | for i in xrange(50):
134 | err = backprop_trainer.train()
135 | print "Iter. %d, err.: %.5f" % (i, err)
136 |
137 | #6.9
138 | print "[w(x_1,j=1),w(x_2,j=1),w(x_1,j=2),w(x_2,j=2)]: " + str(in_to_h.params)
139 | print "[w(j=1,j=3),w(j=2,j=3)]: "+str(h_to_out.params)
140 | print "[w(x_b,j=1),w(x_b,j=2)]: "+str(bias_to_h.params)
141 | print "[w(x_b,j=3)]:" +str(bias_to_out.params)
142 |
143 | #6.10
144 | print "Activating 0,0. Output: " + str(net.activate([0,0]))
145 | print "Activating 0,1. Output: " + str(net.activate([0,1]))
146 | print "Activating 1,0. Output: " + str(net.activate([1,0]))
147 | print "Activating 1,1. Output: " + str(net.activate([1,1]))
148 |
149 |
150 | ###########
151 | # From here onwards: RBMs
152 |
153 | # Original Authors: Yann N. Dauphin, Vlad Niculae, Gabriel Synnaeve
154 | # License: BSD
155 |
156 | import numpy as np
157 | import matplotlib.pyplot as plt
158 |
159 | from scipy.ndimage import convolve
160 | from sklearn import linear_model, datasets, metrics
161 | from sklearn.cross_validation import train_test_split
162 | from sklearn.neural_network import BernoulliRBM
163 | from sklearn.pipeline import Pipeline
164 |
165 | #6.12
166 | def nudge_dataset(X, Y):
167 | """
168 | This produces a dataset 5 times bigger than the original one,
169 | by moving the 8x8 images in X around by 1px to left, right, down, up
170 | """
171 | direction_vectors = [[[0, 1, 0],[0, 0, 0],[0, 0, 0]],
172 | [[0, 0, 0],[1, 0, 0],[0, 0, 0]],
173 | [[0, 0, 0],[0, 0, 1],[0, 0, 0]],
174 | [[0, 0, 0],[0, 0, 0],[0, 1, 0]]]
175 | shift = lambda x, w: convolve(x.reshape((8, 8)), mode='constant', weights=w).ravel()
176 | X = np.concatenate([X] + [np.apply_along_axis(shift, 1, X, vector) for vector in direction_vectors])
177 | Y = np.concatenate([Y for _ in range(5)], axis=0)
178 | return X, Y
179 |
180 | #6.11
181 | digits = datasets.load_digits()
182 | X = np.asarray(digits.data, 'float32')
183 | X, Y = nudge_dataset(X, digits.target)
184 | X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001) # 0-1 scaling
185 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.2,random_state=0)
186 |
187 | #6.13
188 | # Models we will use
189 | logistic = linear_model.LogisticRegression()
190 | rbm = BernoulliRBM(random_state=0, verbose=True)
191 |
192 | classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
193 |
194 | ###############################################################################
195 | # Training
196 |
197 | # Hyper-parameters. These were set by cross-validation,
198 | # using a GridSearchCV. Here we are not performing cross-validation to
199 | # save time.
200 | rbm.learning_rate = 0.06
201 | rbm.n_iter = 20
202 | # More components tend to give better prediction performance, but larger
203 | # fitting time
204 | rbm.n_components = 100
205 | logistic.C = 6000.0
206 |
207 | # Training RBM-Logistic Pipeline
208 | classifier.fit(X_train, Y_train)
209 |
210 | # Training Logistic regression
211 | logistic_classifier = linear_model.LogisticRegression(C=100.0)
212 | logistic_classifier.fit(X_train, Y_train)
213 |
214 | #6.14
215 | # Evaluation
216 | print("Logistic regression using RBM features:\n%s\n" % (
217 | metrics.classification_report(
218 | Y_test,
219 | classifier.predict(X_test))))
220 |
221 | print("Logistic regression using raw pixel features:\n%s\n" % (
222 | metrics.classification_report(
223 | Y_test,
224 | logistic_classifier.predict(X_test))))
225 |
226 |
227 | #6.15
228 | plt.figure(figsize=(4.2, 4))
229 | for i, comp in enumerate(rbm.components_):
230 | #print(i)
231 | #print(comp)
232 | plt.subplot(10, 10, i + 1)
233 | plt.imshow(comp.reshape((8, 8)), cmap=plt.cm.gray_r,interpolation='nearest')
234 | plt.xticks(())
235 | plt.yticks(())
236 |
237 | plt.suptitle('100 components extracted by RBM', fontsize=16)
238 | plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)
239 | plt.show()
240 |
--------------------------------------------------------------------------------
/ch7.py:
--------------------------------------------------------------------------------
1 | #7.1
2 | import math
3 | import random
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 | import itertools
7 |
8 | #Assume 10,000 samples per group
9 | n_experiment = 10000
10 | n_control = 10000
11 |
12 | p_experiment= 0.002
13 | p_control = 0.001
14 |
15 | se_experiment_sq = p_experiment*(1-p_experiment) / n_experiment
16 | se_control_sq = p_control*(1-p_control) / n_control
17 |
18 | Z = (p_experiment-p_control)/math.sqrt(se_experiment_sq+se_control_sq)
19 |
20 | print Z
21 |
22 | #Not a listing, but mentioned in the text and is required for production.
23 | def get_z(n_experiment, n_control, p_experiment, p_control):
24 | var_experiment = p_experiment*(1-p_experiment) / n_experiment
25 | var_control = p_control*(1-p_control) / n_control
26 | Z = (p_experiment-p_control)/math.sqrt(var_experiment+var_control)
27 | return Z
28 |
29 | experiment_array = np.linspace(100, 20000, 100)
30 | control_array = np.linspace(100,20000,100)
31 | experiment_probability_array = np.empty(100); experiment_probability_array.fill(0.002)
32 | control_probability_array = np.empty(100); control_probability_array.fill(0.001)
33 | data = zip(experiment_array,control_array,experiment_probability_array,control_probability_array)
34 | #Need to create associated parameters and zip these together.
35 | z_results = [get_z(k[0],k[1],k[2],k[3]) for k in data]
36 |
37 | plt.plot(experiment_array,z_results)
38 | #95 % confidence interval
39 | x_values = [0,20000]
40 | y_values = [1.96,1.96]
41 | plt.plot(x_values,y_values)
42 | plt.text(x_values[0],y_values[0]+0.01,"95%")
43 |
44 | #90% confidence interval
45 | x_values = [0,20000]
46 | y_values = [1.645,1.645]
47 | plt.plot(x_values,y_values)
48 | plt.text(x_values[0],y_values[0]+0.01,"90%")
49 |
50 | #80% confidence interval 1.28
51 | x_values = [0,20000]
52 | y_values = [1.28,1.28]
53 | plt.plot(x_values,y_values)
54 | plt.text(x_values[0],y_values[0]+0.01,"80%")
55 |
56 | #70% confidence interval 1.04
57 | x_values = [0,20000]
58 | y_values = [1.04,1.04]
59 | plt.plot(x_values,y_values)
60 | plt.text(x_values[0],y_values[0]+0.01,"70%")
61 |
62 | plt.xlabel("Number of users in each A/B group")
63 | plt.ylabel("z value")
64 |
65 | plt.title("Graph of number of users against z value for a fixed conversion rate \n (0.001/0.002 A/B respectively)")
66 | plt.show()
67 |
68 | ##Bayesian Bandit
69 | #7.6
70 | from scipy.stats import beta
71 | import numpy as np
72 | import matplotlib.pyplot as plt
73 | import random
74 | from operator import sub, div, add
75 |
76 | class Bandit:
77 | def __init__(self,probability):
78 | self.probability=probability
79 |
80 | def pull_handle(self):
81 | if random.random() < self.probability:
--------------------------------------------------------------------------------
/ch3.py:
--------------------------------------------------------------------------------
44 | if len(p)>1:
45 | idat['Title'] = p[1]
46 | if len(p)>2:
47 | idat['Genres'] = p[2]
48 | item.add_data(idat)
49 | itemdict[p[0]] = item
50 | return itemdict
51 |
52 | def items_reviewed(user_id, userdict):
53 | return [x[0] for x in userdict[user_id].get_items()]
54 |
55 | def get_score_item_reviewed(user_id,item_id, userdict):
56 | #return the first score in the list that matches the id
57 | return [x[1] for x in userdict[user_id].get_items() if x[0]==int(item_id)][0]
58 |
59 | def get_name_item_reviewed(user_id,userdict,itemdict):
60 | l = [(x[0],itemdict[str(x[0])].get_data()['Title'],itemdict[str(x[0])].get_data()['Genres'],x[1]) for x in userdict[user_id].get_items()]
61 | return sorted(l,key=lambda x: x[3], reverse=True)
62 |
63 | #Pass a list of (id, title, filmcategory,rating) and a filter_value (0,5]
64 | #Returns a list of [category,#count of ratings>filter_value],
65 | def movies_by_category(movie_list,filter_value):
66 | d = {}
67 | for x in movie_list:
68 | if x[3]>filter_value:
69 | if str(x[2]) in d:
70 | d[str(x[2])]+=1
71 | else:
72 | d[str(x[2])]=1
73 | dictlist = []
74 | for key, value in d.iteritems():
75 | temp = [key,value]
76 | dictlist.append(temp)
77 |
78 | return dictlist
79 |
80 | #End Helpers
81 |
82 | #3.2
83 | def similarity(user_id_a,user_id_b,sim_type=0):
84 | user_a_tuple_list = userdict[user_id_a].get_items()
85 | user_b_tuple_list = userdict[user_id_b].get_items()
86 | common_items=0
87 | sim = 0.0
88 | for t1 in user_a_tuple_list:
89 | for t2 in user_b_tuple_list:
90 | if (t1[0] == t2[0]):
91 | common_items += 1
92 | sim += math.pow(t1[1]-t2[1],2)
93 | if common_items>0:
94 | sim = math.sqrt(sim/common_items)
95 | sim = 1.0 - math.tanh(sim)
96 | if sim_type==1:
97 | max_common = min(len(user_a_tuple_list),len(user_b_tuple_list))
98 | sim = sim * common_items / max_common
99 | print "User Similarity between",names[user_id_a],"and",names[user_id_b],"is", sim
100 | return sim #If no common items, returns zero
101 |
102 | #3.1
103 | # load movielens data
104 | dat_file = 'ratings-11.dat'
105 | item_file = 'movies-11.dat'
106 |
107 | names = ['Frank','Constantine','Catherine']
108 |
109 | userdict = read_user_data_from_ratings(dat_file) #Build a userdict with users and ratings
110 | itemdict = read_item_data(item_file) #Build an item, info dict
111 |
112 | similarity(0,1,sim_type=0)
113 | similarity(0,1,sim_type=1)
114 | similarity(0,2,sim_type=0)
115 | similarity(1,2,sim_type=0)
116 | similarity(2,1,sim_type=0)
117 | similarity(0,0,sim_type=0)
118 | similarity(0,0,sim_type=1)
119 |
120 | #3.7
121 | class RatingCountMatrix:
122 | user_id_a = None
123 | user_id_b = None
124 | matrix = None
125 |
126 | #Instantiate with two users; the rating scale size (e.g. 5 in the movielens case) is derived from the data
127 | def __init__(self, user_id_a, user_id_b):
128 | num_rating_values = max([x[0] for x in data])
129 | self.user_id_a = user_id_a
130 | self.user_id_b = user_id_b
131 | self.matrix = np.empty((num_rating_values,num_rating_values,))
132 | self.matrix[:] = 0
133 | self.calculate_matrix(user_id_a,user_id_b)
134 |
135 | def get_shape(self):
136 | a = self.matrix.shape
137 | return a
138 |
139 | def get_matrix(self):
140 | return self.matrix
141 |
142 | def calculate_matrix(self,user_id_a, user_id_b):
143 | for item in items_reviewed(user_id_a, userdict):
144 | if int(item) in items_reviewed(user_id_b, userdict):
145 | i = get_score_item_reviewed(user_id_a,item, userdict)-1 #Need to subtract 1 as indexes are 0 to 4 (5 items)
146 | j = get_score_item_reviewed(user_id_b,item, userdict)-1
147 | self.matrix[i][j] +=1
148 |
149 |
150 | #Total number of items that the users have both ranked
151 | def get_total_count(self):
152 | return self.matrix.sum()
153 |
154 | #Total number of items that they both agree on
155 | def get_agreement_count(self):
156 | return np.trace(self.matrix) #sum across the diagonal
157 |
158 | #3.6
159 | class SimilarityMatrix:
160 |
161 | similarity_matrix = None
162 |
163 | def __init__(self):
164 | self.build()
165 |
166 | def build(self):
167 | self.similarity_matrix = np.empty((len(userdict),len(userdict),))
168 |
169 | for u in range(0,len(userdict)):
170 | for v in range(u+1,len(userdict)):
171 | rcm = RatingCountMatrix(int(u),int(v))
172 | if(rcm.get_agreement_count()>0):
173 | self.similarity_matrix[u][v] = rcm.get_agreement_count()/rcm.get_total_count()
174 | else:
175 | self.similarity_matrix[u][v] = 0
176 | self.similarity_matrix[u][u]=1
177 |
178 | def get_user_similarity(self,user_id1, user_id2):
179 | return self.similarity_matrix[min(user_id1,user_id2),max(user_id1,user_id2)] # Due to upper triangular form
180 |
181 | # 3.5:
182 | def predict_rating(user_id, item_id):
183 | estimated_rating = None;
184 | similarity_sum = 0;
185 | weighted_rating_sum = 0;
186 |
187 | if (int(item_id) in items_reviewed(user_id,userdict)):
188 | return get_score_item_reviewed(user_id,item_id,userdict)
189 | else:
190 | for u in userdict.keys():
191 | if (int(item_id) in items_reviewed(u,userdict)):
192 | item_rating = get_score_item_reviewed(u,item_id,userdict)
193 | user_similarity = similarity_matrix.get_user_similarity(user_id,u)
194 | weighted_rating = user_similarity * item_rating
195 | weighted_rating_sum += weighted_rating
196 | similarity_sum += user_similarity
197 |
198 | if (similarity_sum > 0.0):
199 | estimated_rating = weighted_rating_sum / similarity_sum
200 |
201 | return estimated_rating
202 |
203 | # 3.4:
204 | def recommend(user_id, top_n):
205 | #[(item,value),(item1, value1)...]
206 | recommendations = []
207 | for i in itemdict.keys():
208 | if (int(i) not in items_reviewed(int(user_id),userdict)):
209 | recommendations.append((i,predict_rating(user_id, i))) #only predict items the user hasn't rated
210 | recommendations.sort(key=lambda t: t[1], reverse=True)
211 | return recommendations[:top_n]
212 |
213 | #3.3:
214 | data = Data()
215 | format = {'col':0, 'row':1, 'value':2, 'ids': 'int'}
216 | # About format parameter:
217 | # 'row': 1 -> Rows in matrix come from column 1 in ratings.dat file
218 | # 'value': 2 -> Values (Mij) in matrix come from column 2 in ratings.dat file
219 | # 'ids': int -> Ids (row and col ids) are integers (not strings)
220 | data.load(dat_file, sep='::', format=format)
221 |
222 | similarity_matrix = SimilarityMatrix()
223 | recommend(0,10)
224 | recommend(1,10)
225 | recommend(2,10)
226 |
227 | ##################
228 | #Now we do SVD
229 | ##################
230 |
231 | #3.8
232 | svd = SVD()
233 | recsys.algorithm.VERBOSE = True
234 |
235 | dat_file = './ml-1m/ratings.dat'
236 | item_file = './ml-1m/movies.dat'
237 |
238 | data = Data()
239 | data.load(dat_file, sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int})
240 |
241 | items_full = read_item_data(item_file)
242 | user_full = read_user_data_from_ratings(dat_file)
243 |
244 | svd.set_data(data)
245 |
246 | #3.9
247 | k = 100
248 | svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True)
249 | films = svd.recommend(10,only_unknowns=True, is_row=False) #movies that user 10 should see (That they haven't rated)
250 |
251 | #3.10
252 | [items_full[str(x[0])].get_data() for x in films]
253 |
254 | #3.11
255 | get_name_item_reviewed(10,user_full,items_full)
256 |
257 | #3.12
258 | items_full[str(2628)].get_data()
259 | users_for_star_wars = svd.recommend(2628,only_unknowns=True)
260 | users_for_star_wars
261 |
262 | #3.13
263 | movies_reviewed_by_sw_rec =[get_name_item_reviewed(x[0],user_full,items_full) for x in users_for_star_wars]
264 | movies_flatten = [movie for movie_list in movies_reviewed_by_sw_rec for movie in movie_list]
265 | movie_aggregate = movies_by_category(movies_flatten, 3)
266 | movies_sort = sorted(movie_aggregate,key=lambda x: x[1], reverse=True)
267 | movies_sort
268 |
269 | #3.14
270 | from recsys.evaluation.prediction import RMSE
271 | err = RMSE()
272 | for rating, item_id, user_id in data.get():
273 | try:
274 | prediction = svd.predict(item_id, user_id)
275 | err.add(rating, prediction)
276 | except KeyError, k:
277 | continue
278 |
279 | print 'RMSE is ' + str(err.compute())
280 |
--------------------------------------------------------------------------------