├── users-11.dat ├── ch6_supporting_content.xlsx ├── errata ├── images │ ├── right_eq_1.png │ ├── right_eq_2.png │ ├── wrong_eq_1.png │ └── wrong_eq_2.png └── errata.htm ├── README.md ├── ratings-11.dat ├── movies-11.dat ├── fraud_data_3.csv ├── ch5.py ├── A1.py ├── ch5-prepare.sh ├── A1.data ├── ch5-criteo-process.py ├── requirements.txt ├── ch2.py ├── ch4.py ├── ch6.py ├── ch7.py └── ch3.py /users-11.dat: -------------------------------------------------------------------------------- 1 | 0::F::1::10::48067 2 | 1::M::56::16::70072 3 | 2::M::25::15::55117 4 | -------------------------------------------------------------------------------- /ch6_supporting_content.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dougmcilwraith/aiw-second-edition/HEAD/ch6_supporting_content.xlsx -------------------------------------------------------------------------------- /errata/images/right_eq_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dougmcilwraith/aiw-second-edition/HEAD/errata/images/right_eq_1.png -------------------------------------------------------------------------------- /errata/images/right_eq_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dougmcilwraith/aiw-second-edition/HEAD/errata/images/right_eq_2.png -------------------------------------------------------------------------------- /errata/images/wrong_eq_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dougmcilwraith/aiw-second-edition/HEAD/errata/images/wrong_eq_1.png -------------------------------------------------------------------------------- /errata/images/wrong_eq_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dougmcilwraith/aiw-second-edition/HEAD/errata/images/wrong_eq_2.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # aiw-second-edition 2 | Code required for the examples in Algorithms of the Intelligent Web, 2nd Edition 3 | 4 | Errata can be found under './errata' 5 | -------------------------------------------------------------------------------- /ratings-11.dat: -------------------------------------------------------------------------------- 1 | 0::0::5::978300760 2 | 0::1::4::978302109 3 | 0::2::5::978301968 4 | 0::3::4::978300275 5 | 0::4::5::978824291 6 | 0::5::4::978302268 7 | 0::6::5::978302039 8 | 1::0::5::978300719 9 | 1::2::5::978302268 10 | 1::4::4::978301368 11 | 1::5::5::978824268 12 | 1::7::5::978301752 13 | 1::8::4::978302281 14 | 1::9::5::978302124 15 | 2::0::1::978301753 16 | 2::2::2::978302188 17 | 2::3::2::978824268 18 | 2::6::3::978301777 19 | 2::9::2::978301713 20 | 2::10::1::978302039 21 | -------------------------------------------------------------------------------- /movies-11.dat: -------------------------------------------------------------------------------- 1 | 0::Toy Story (1995)::Animation|Children's|Comedy 2 | 1::Jumanji (1995)::Adventure|Children's|Fantasy 3 | 2::Grumpier Old Men (1995)::Comedy|Romance 4 | 3::Waiting to Exhale (1995)::Comedy|Drama 5 | 4::Father of the Bride Part II (1995)::Comedy 6 | 5::Heat (1995)::Action|Crime|Thriller 7 | 6::Sabrina (1995)::Comedy|Romance 8 | 7::Tom and Huck 
(1995)::Adventure|Children's 9 | 8::Sudden Death (1995)::Action 10 | 9::GoldenEye (1995)::Action|Adventure|Thriller 11 | 10::American President, The (1995)::Comedy|Drama|Romance 12 | -------------------------------------------------------------------------------- /fraud_data_3.csv: -------------------------------------------------------------------------------- 1 | IsFraud,Amount,Country,TimeOfTransaction,BusinessType,NumberOfTransactionsAtThisShop,DayOfWeek 0,10,DK,13,2,0,1 0,100,LT,18,1,3,4 0,49.99,AT,11,4,3,0 0,12,AUS,9,6,4,3 0,250,UK,12,1,5,6 0,149.99,UK,17,2,2,5 0,10,UK,16,8,1,4 0,49.99,DK,12,9,4,3 0,18,UK,14,2,3,6 0,27,DK,10,1,5,5 0,40,DK,11,1,6,2 0,2,UK,10,2,7,4 0,34.99,UK,9,4,8,3 0,2,UK,8,9,9,0 1,18000,LT,13,9,0,0 1,20000,LT,14,9,0,0 1,19000,LT,13,9,0,0 1,6000,LT,13,9,1,4 1,9000,LT,12,9,0,4 1,5000,LT,12,9,0,4 1,20000,LT,12,9,0,0 1,10000,LT,12,9,0,6 1,20000.01,UK,13,1,0,0 1,21000,LT,13,9,0,0 1,11000,LT,13,9,1,6 1,210000,UK,14,1,0,0 1,22000,UK,12,1,1,0 1,280000,LT,12,9,0,0 1,15000,LT,12,9,0,6 -------------------------------------------------------------------------------- /ch5.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pylab as pl 3 | from sklearn import svm, datasets 4 | from sklearn.utils import shuffle 5 | from sklearn.metrics import roc_curve, auc 6 | 7 | #Listing 5.6 8 | ground_truth_file_name = './ground_truth.dat' 9 | probability_file_name = './probabilities.out' 10 | 11 | ground_truth_file = open(ground_truth_file_name,'r') 12 | probability_file = open(probability_file_name,'r') 13 | 14 | ground_truth = np.array(map(int,ground_truth_file)) 15 | probabilities = np.array(map(float,probability_file)) 16 | 17 | ground_truth_file.close() 18 | probability_file.close() 19 | 20 | #from: http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html 21 | fpr, tpr, thresholds = roc_curve(ground_truth, probabilities) 22 | roc_auc = auc(fpr, tpr) 23 | print "Area under the ROC curve : %f" % roc_auc 24 | 25 | pl.clf() 26 | pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) 27 | pl.plot([0, 1], [0, 1], 'k--') 28 | pl.xlim([0.0, 1.0]) 29 | pl.ylim([0.0, 1.0]) 30 | pl.xlabel('False Positive Rate') 31 | pl.ylabel('True Positive Rate') 32 | pl.title('Receiver operating characteristic') 33 | pl.legend(loc="lower right") 34 | pl.show() 35 | -------------------------------------------------------------------------------- /A1.py: -------------------------------------------------------------------------------- 1 | #Code for Appendix 1. Algorithms of the Intelligent Web 2nd Edition. 
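#Note: these listings assume a Kafka broker reachable on localhost:9092 and kafka-python 0.9.5 (see requirements.txt).
#They are intended to be run listing by listing rather than as one script: the consumer loop in Listing A.1.2 keeps
#waiting for new messages, so anything after it would not be reached if the file were executed top to bottom.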
2 | 3 | #Listing A.1.1 4 | from kafka import KafkaClient, SimpleProducer 5 | 6 | kafka = KafkaClient("localhost:9092") 7 | 8 | producer = SimpleProducer(kafka) 9 | producer.send_messages("test", "Hello World!") 10 | producer.send_messages("test","This is my second message") 11 | producer.send_messages("test","And this is my third!") 12 | 13 | #Listing A.1.2 14 | from kafka import KafkaClient, SimpleConsumer 15 | 16 | kafka = KafkaClient("localhost:9092") 17 | consumer = SimpleConsumer(kafka,"mygroup","test") 18 | 19 | for message in consumer: 20 | print(message) 21 | 22 | #Listing A.1.5 23 | from kafka import KafkaClient, SimpleProducer 24 | 25 | kafka = KafkaClient("localhost:9092") 26 | producer = SimpleProducer(kafka,async=False, 27 | req_acks=SimpleProducer.ACK_AFTER_CLUSTER_COMMIT, 28 | ack_timeout=2000) 29 | 30 | producer.send_messages("test-replicated-topic", "Hello Kafka Cluster!") 31 | producer.send_messages("test-replicated-topic","Message to be replicated.") 32 | producer.send_messages("test-replicated-topic","And so is this!") 33 | 34 | #Listing A.1.8 35 | from kafka import KafkaClient 36 | from kafka.common import ProduceRequest 37 | from kafka.protocol import KafkaProtocol,create_message 38 | 39 | kafka = KafkaClient("localhost:9092") 40 | 41 | f = open('A1.data','r') 42 | 43 | for line in f: 44 | s = line.split("\t")[0] 45 | part = abs(hash(s)) % 3 46 | req = ProduceRequest(topic="click-streams",partition=part,messages=[create_message(s)]) 47 | resps = kafka.send_produce_request(payloads=[req], fail_on_error=True) 48 | -------------------------------------------------------------------------------- /ch5-prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #This file contains the code for listing 5.1 - 5.5. 4 | #This code can be run as is, if it placed in the same directory as the 5 | #train_vw_file and the location of your vowpal_wabbit checkout is correct. 6 | #In order to obtain the train_vw_file you must run ch5-criteo-process.py 7 | #over the train.txt file obtained from the criteo display challenge dataset (full). 
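#For example (an illustrative invocation; the paths shown are the defaults used by ch5-criteo-process.py):
#  python ch5-criteo-process.py ./train.txt ./train_vw_file ./test_vw_file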
8 | 9 | #!5.1 10 | #echo 'Running Listing 5.1' 11 | wc -l train_vw_file 12 | grep -c '^-1' train_vw_file 13 | grep -c '^1' train_vw_file 14 | 15 | #!5.2 16 | #echo 'Running Listing 5.2' 17 | grep '^-1' train_vw_file | sort -R > negative_examples.dat 18 | grep '^1' train_vw_file | sort -R > positive_examples.dat 19 | awk 'NR % 3 == 0' negative_examples.dat > negative_examples_downsampled.dat 20 | 21 | cat negative_examples_downsampled.dat > all_examples.dat 22 | cat positive_examples.dat >> all_examples.dat 23 | 24 | cat all_examples.dat | sort -R > all_examples_shuffled.dat 25 | awk 'NR % 10 == 0' all_examples_shuffled.dat > all_examples_shuffled_down.dat 26 | 27 | #!5.3 28 | echo 'Running Listing 5.3' 29 | vw all_examples_shuffled_down.dat --loss_function=logistic -c -b 22 --passes=3 -f model.vw 30 | vw all_examples_shuffled_down.dat -t -i model.vw --invert_hash readable.model 31 | cat readable.model | awk 'NR > 9 {print}' | sort -r -g -k 3 -t : | head -1000 > readable_model_sorted_top 32 | 33 | #!5.4 34 | #Output only 35 | 36 | #!5.5 37 | echo 'Running Listing 5.5' 38 | vw -d test_vw_file -t -i model.vw --loss_function=logistic -r predictions.out 39 | ~/dev/vowpal_wabbit/utl/logistic -0 predictions.out > probabilities.out 40 | cut -d ' ' -f 1 test_vw_file | sed -e 's/^-1/0/' > ground_truth.dat 41 | 42 | 43 | -------------------------------------------------------------------------------- /A1.data: -------------------------------------------------------------------------------- 1 | 381094c0-9e45-424f-90b6-ad64b06cc184 2014-01-02 17:33:07 click 2 | 6615087e-ea19-492c-869c-28fc1fa77588 2014-01-02 17:33:20 view 3 | d4889090-942b-4790-9831-362051b0847b 2014-01-02 17:34:01 adview 4 | e6688db5-d31f-4ede-985a-115fb51836fb 2014-01-02 17:35:06 adview 5 | e6688db5-d31f-4ede-985a-115fb51836fb 2014-01-02 17:35:30 click 6 | e6688db5-d31f-4ede-985a-115fb51836fb 2014-01-02 17:37:01 click 7 | e6688db5-d31f-4ede-985a-115fb51836fb 2014-01-02 17:39:00 convert 8 | ae6ae5c9-acb2-479b-adcb-0c29623d921b 2014-01-02 17:40:00 adview 9 | 1ac384c1-1b2d-4ed0-b467-7c90b7ac42d8 2014-01-02 17:40:01 adview 10 | 280bfa16-07ac-49ed-a1a5-9ab50a754027 2014-01-02 17:40:03 click 11 | dda0e95d-9c30-4f60-bb6a-febf05526b83 2014-01-02 17:40:05 adview 12 | 8a1204f1-5076-4d4c-8b23-84c77ad541d8 2014-01-02 17:40:10 adview 13 | 3bdb8f17-11cc-49cb-94cf-75676be909b7 2014-01-02 17:40:11 adview 14 | 69b61156-6c31-4317-aec5-bd48908b4973 2014-01-02 17:40:13 adview 15 | 69722471-0532-4f29-b2b4-2f9007604e4f 2014-01-02 17:40:14 adview 16 | 00e5edf6-a483-48fa-82ed-fbfac8a6b1e6 2014-01-02 17:40:15 adview 17 | 9398d369-6382-4be0-97bc-182b3713745f 2014-01-02 17:40:17 convert 18 | f40c1588-e4e1-4f7d-8ef5-5f76046886fb 2014-01-02 17:40:18 adview 19 | 54823527-fe62-4a81-8551-6282309b0a3f 2014-01-02 17:40:20 click 20 | 46d6f178-7c11-48c1-a1d7-f7152e7b2f1c 2014-01-02 17:40:26 adview 21 | 4c4e545b-d194-4531-962f-66e9d3b6116d 2014-01-02 17:41:00 convert 22 | 42b311f5-ba84-4666-a901-03063f7504a9 2014-01-02 17:41:01 adview 23 | bfa28923-c358-4741-bcbf-ff99b640ee14 2014-01-02 17:42:06 adview 24 | 54c29b39-5640-49b8-b610-6f2e6dc6bd1b 2014-01-02 17:42:10 convert 25 | edf6c5d2-1373-4dbb-8528-925d525b4a42 2014-01-02 17:43:03 click 26 | f7f6752f-03bf-43f1-927c-8acdafd235e2 2014-01-02 17:43:11 adview 27 | f4b7c0a6-b209-4cc4-b4e7-395489e0e724 2014-01-02 17:43:19 click 28 | -------------------------------------------------------------------------------- /ch5-criteo-process.py: -------------------------------------------------------------------------------- 1 | import 
numpy 2 | import random 3 | import sys 4 | 5 | def process_file (file_name, vw_file_name_train, vw_file_name_test, train_pct): 6 | file = open(file_name) 7 | vw_file_train = open(vw_file_name_train,'w') 8 | vw_file_test = open(vw_file_name_test,'w') 9 | 10 | continuous_set = [int(x) for x in numpy.linspace(1,13,13)] 11 | categorical_set =[int(x) for x in numpy.linspace(1,26,26)] 12 | 13 | print continuous_set 14 | print categorical_set 15 | 16 | first_line_headers = ["Class"] 17 | for i in continuous_set: 18 | first_line_headers.append("i"+str(i)) 19 | 20 | for c in categorical_set: 21 | first_line_headers.append("c"+str(c)) 22 | 23 | print first_line_headers 24 | 25 | for line in file: 26 | line_split = line.split('\t') 27 | target_click = -1 28 | if int(line_split[0])>0: 29 | target_click=1 30 | 31 | #Essentially now manually build up the training string 32 | vw_line = ""+str(target_click)+" " 33 | 34 | for feature_index in continuous_set: 35 | if line_split[feature_index]!="": 36 | vw_line+="|"+first_line_headers[feature_index] +" c:"+ line_split[feature_index] + " " 37 | 38 | for feature_index in [x+len(continuous_set) for x in categorical_set]: #Index doesn't start from 0 39 | if line_split[feature_index]!="": 40 | vw_line+="|"+first_line_headers[feature_index] + " " + line_split[feature_index] + " " 41 | 42 | if(random.random()<=train_pct): 43 | vw_file_train.write(vw_line.replace('\n', '')+"\n") #Get rid of any unwanted line breaks 44 | else: 45 | vw_file_test.write(vw_line.replace('\n', '')+"\n") #Get rid of any unwanted line breaks 46 | 47 | file.close() 48 | vw_file_train.close() 49 | vw_file_test.close() 50 | 51 | if __name__ == '__main__': 52 | filename='./train.txt' 53 | vw_file_name_train = './train_vw_file' 54 | vw_file_name_test = './test_vw_file' 55 | 56 | filename = sys.argv[1] if len(sys.argv) >=2 else filename 57 | vw_file_name_train = sys.argv[2] if len(sys.argv) >=3 else vw_file_name_train 58 | vw_file_name_test = sys.argv[3] if len(sys.argv) >=4 else vw_file_name_test 59 | 60 | process_file(filename,vw_file_name_train,vw_file_name_test,0.7) 61 | 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | --System: 2 | Python 2.7.10 3 | Ubuntu 14.04.2 4 | VirtualEnv 13.1.2 5 | Scala version 2.10.4 6 | 7 | --System prerequisties: 8 | sudo apt-get install python-dev libblas-dev liblapack-dev libatlas-base-dev gfortran libpng12-dev libfreetype6-dev libjpeg-dev tcl-dev tk-dev python-tk libboost-program-options-dev libboost-python-dev libsnappy-dev 9 | 10 | --Python prerequisites: 11 | pip install numpy scipy scikit-learn matplotlib csc-pysparse networkx divisi2 12 | 13 | --Installing Recsys: 14 | git clone https://github.com/ocelma/python-recsys.git 15 | cd python-recsys 16 | sudo python setup.py install 17 | --[under virtualenv ./venv/bin/python setup.py install] 18 | 19 | --Installing Movielens Dataset: 20 | wget http://files.grouplens.org/datasets/movielens/ml-1m.zip 21 | unzip ml-1m.zip 22 | 23 | --Installing VW (vw-8.1.1): 24 | git clone git://github.com/JohnLangford/vowpal_wabbit.git 25 | make 26 | make test 27 | sudo make install 28 | 29 | --Installing PyBrain (PyBrain==0.3.3) 30 | git clone git://github.com/pybrain/pybrain.git pybrain 31 | cd pybrain 32 | sudo python setup.py install 33 | --[Under virtualenv: ./venv/bin/python setup.py install] 34 | 35 | --Installing KafkaPython : 36 | git clone 
https://github.com/dpkp/kafka-python.git 37 | git checkout v0.9.5 38 | cd kafka-python 39 | sudo python setup.py install 40 | --[Under virtualenv: ./venv/bin/python setup.py install] 41 | 42 | ---This is a list of the versions of the requirements of the various packages installed above. 43 | ....numpy-1.10.1 44 | ....scipy-0.16.1 45 | ....sklearn-0.17 46 | ....matplotlib==1.5.0 47 | ....csc-pysparse-1.1.1.4 48 | ....csc-utils-0.6.7 49 | ....divisi2-2.2.5 50 | ....vowpal-wabbit-8.1.1 51 | 52 | ---This is a complete snapshot of my python environment after installing the above to a clean virtualenv environment 53 | ['backports-abc==0.4’, 54 | 'backports.ssl-match-hostname==3.4.0.2’, 55 | 'certifi==2015.11.20.1’, 56 | 'csc-pysparse==1.1.1.4’, 57 | 'csc-utils==0.6.7’, 58 | 'cycler==0.9.0’, 59 | 'decorator==4.0.4’, 60 | 'divisi2==2.2.5’, 61 | 'functools32==3.2.3.post2’, 62 | 'ipython-genutils==0.1.0’, 63 | 'ipython==4.0.1’, 64 | 'jinja2==2.8’, 65 | 'jsonschema==2.5.1’, 66 | 'kafka-python==0.9.5’, 67 | 'markupsafe==0.23’, 68 | 'matplotlib==1.5.0’, 69 | 'networkx==1.10’, 70 | 'numpy==1.10.1’, 71 | 'pandas==0.17.1’, 72 | 'path.py==8.1.2’, 73 | 'pexpect==4.0.1’, 74 | 'pickleshare==0.5’, 75 | 'pillow==3.0.0’, 76 | 'pip==7.1.2’, 77 | 'ptyprocess==0.5’, 78 | 'pybrain==0.3.3’, 79 | 'pyparsing==2.0.6’, 80 | 'python-dateutil==2.4.2’, 81 | 'python-recsys==0.2’, 82 | 'pytz==2015.7’, 83 | 'pyzmq==15.1.0’, 84 | 'scikit-image==0.11.3’, 85 | 'scikit-learn==0.17’, 86 | 'scipy==0.16.1’, 87 | 'seaborn==0.6.0’, 88 | 'setuptools==18.2’, 89 | 'simplegeneric==0.8.1’, 90 | 'singledispatch==3.4.0.3’, 91 | 'six==1.10.0’, 92 | 'sklearn==0.0’, 93 | 'sympy==0.7.6.1’, 94 | 'tornado==4.3’, 95 | 'traitlets==4.0.0’, 96 | 'wheel==0.24.0'] 97 | 98 | 99 | -------------------------------------------------------------------------------- /ch2.py: -------------------------------------------------------------------------------- 1 | #2.1 2 | import numpy as np 3 | from sklearn import datasets 4 | 5 | iris = datasets.load_iris() 6 | np.array(zip(iris.data,iris.target))[0:10] 7 | 8 | #2.2 9 | print(iris.DESCR) 10 | iris.target_names 11 | 12 | #2.3 13 | #Psuedocode 14 | 15 | #2.4 16 | from sklearn.cluster import KMeans 17 | from sklearn import datasets 18 | 19 | iris = datasets.load_iris() 20 | X = iris.data 21 | km = KMeans(n_clusters=3) 22 | km.fit(X) 23 | 24 | print(km.labels_) 25 | 26 | #2.5 27 | from sklearn.cluster import KMeans 28 | from sklearn import datasets 29 | from itertools import cycle, combinations 30 | import matplotlib.pyplot as pl 31 | 32 | iris = datasets.load_iris() 33 | km = KMeans(n_clusters=3) 34 | km.fit(iris.data) 35 | 36 | predictions = km.predict(iris.data) 37 | 38 | colors = cycle('rgb') 39 | markers = cycle('^+o') 40 | labels = ["Cluster 1","Cluster 2","Cluster 3"] 41 | targets = range(len(labels)) 42 | 43 | feature_index=range(len(iris.feature_names)) 44 | feature_names=iris.feature_names 45 | combs=combinations(feature_index,2) 46 | 47 | f,axarr=pl.subplots(3,2) 48 | axarr_flat=axarr.flat 49 | 50 | for comb, axflat in zip(combs,axarr_flat): 51 | for target, color, label, marker in zip(targets,colors,labels,markers): 52 | feature_index_x=comb[0] 53 | feature_index_y=comb[1] 54 | axflat.scatter(iris.data[predictions==target,feature_index_x], 55 | iris.data[predictions==target,feature_index_y],c=color,label=label,marker=marker) 56 | axflat.set_xlabel(feature_names[feature_index_x]) 57 | axflat.set_ylabel(feature_names[feature_index_y]) 58 | 59 | f.tight_layout() 60 | pl.show() 61 | 62 | #2.6 63 | from 
sklearn.mixture import GMM 64 | from sklearn import datasets 65 | from itertools import cycle, combinations 66 | import matplotlib as mpl 67 | import matplotlib.pyplot as pl 68 | import numpy as np 69 | 70 | # make_ellipses method taken from: http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_classifier.html#example-mixture-plot-gmm-classifier-py 71 | # Author: Ron Weiss , Gael Varoquaux 72 | # License: BSD 3 clause 73 | 74 | def make_ellipses(gmm, ax, x, y): 75 | for n, color in enumerate('rgb'): 76 | row_idx = np.array([x,y]) 77 | col_idx = np.array([x,y]) 78 | v, w = np.linalg.eigh(gmm._get_covars()[n][row_idx[:,None],col_idx]) 79 | u = w[0] / np.linalg.norm(w[0]) 80 | angle = np.arctan2(u[1], u[0]) 81 | angle = 180 * angle / np.pi # convert to degrees 82 | v *= 9 83 | ell = mpl.patches.Ellipse(gmm.means_[n, [x,y]], v[0], v[1], 84 | 180 + angle, color=color) 85 | ell.set_clip_box(ax.bbox) 86 | ell.set_alpha(0.2) 87 | ax.add_artist(ell) 88 | 89 | iris = datasets.load_iris() 90 | 91 | gmm = GMM(n_components=3,covariance_type='full', n_iter=20) 92 | gmm.fit(iris.data) 93 | 94 | predictions = gmm.predict(iris.data) 95 | 96 | colors = cycle('rgb') 97 | markers = cycle('^+o') 98 | labels = ["Cluster 1","Cluster 2","Cluster 3"] 99 | targets = range(len(labels)) 100 | 101 | feature_index=range(len(iris.feature_names)) 102 | feature_names=iris.feature_names 103 | combs=combinations(feature_index,2) 104 | 105 | f,axarr=pl.subplots(3,2) 106 | axarr_flat=axarr.flat 107 | 108 | for comb, axflat in zip(combs,axarr_flat): 109 | for target, color, label,marker in zip(targets,colors,labels,markers): 110 | feature_index_x=comb[0] 111 | feature_index_y=comb[1] 112 | axflat.scatter(iris.data[predictions==target,feature_index_x], 113 | iris.data[predictions==target,feature_index_y],c=color,label=label,marker=marker) 114 | axflat.set_xlabel(feature_names[feature_index_x]) 115 | axflat.set_ylabel(feature_names[feature_index_y]) 116 | make_ellipses(gmm,axflat,feature_index_x,feature_index_y) 117 | 118 | pl.tight_layout() 119 | pl.show() 120 | 121 | #2.7 122 | import numpy as np 123 | import matplotlib.pyplot as pl 124 | 125 | from sklearn import decomposition 126 | from sklearn import datasets 127 | from itertools import cycle 128 | 129 | iris = datasets.load_iris() 130 | X = iris.data 131 | Y = iris.target 132 | 133 | targets = range(len(iris.target_names)) 134 | colors = cycle('rgb') 135 | markers = cycle('^+o') 136 | 137 | pca = decomposition.PCA(n_components=2) 138 | pca.fit(X) 139 | 140 | X = pca.transform(X) 141 | 142 | for target,color,marker in zip(targets,colors,markers): 143 | pl.scatter(X[Y==target,0],X[Y==target,1],label=iris.target_names[target],c=color,marker=marker) 144 | 145 | pl.legend() 146 | pl.show() 147 | 148 | -------------------------------------------------------------------------------- /ch4.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | from sklearn.preprocessing import OneHotEncoder, LabelEncoder 4 | from sklearn import linear_model, datasets, cross_validation 5 | import matplotlib.pyplot as plt 6 | 7 | #4.1 8 | dataset = [] 9 | f = open('./fraud_data_3.csv', 'rU') 10 | try: 11 | reader = csv.reader(f,delimiter=',') 12 | next(reader, None) 13 | for row in reader: 14 | dataset.append(row) 15 | finally: 16 | f.close() 17 | 18 | #4.2 19 | target = np.array([x[0] for x in dataset]) 20 | data = np.array([x[1:] for x in dataset]) 21 | # Amount, Country, TimeOfTransaction, BusinessType, 
NumberOfTransactionsAtThisShop, DayOfWeek 22 | categorical_mask = [False,True,True,True,False,True] 23 | enc = LabelEncoder() 24 | 25 | for i in range(0,data.shape[1]): 26 | if(categorical_mask[i]): 27 | label_encoder = enc.fit(data[:,i]) 28 | print "Categorical classes:", label_encoder.classes_ 29 | integer_classes = label_encoder.transform(label_encoder.classes_) 30 | print "Integer classes:", integer_classes 31 | t = label_encoder.transform(data[:, i]) 32 | data[:, i] = t 33 | 34 | #4.3: 35 | mask = np.ones(data.shape, dtype=bool) 36 | 37 | for i in range(0,data.shape[1]): 38 | if(categorical_mask[i]): 39 | mask[:,i]=False 40 | 41 | data_non_categoricals = data[:, np.all(mask, axis=0)] #keep only the true, non categoricals 42 | data_categoricals = data[:,~np.all(mask,axis=0)] 43 | 44 | hotenc = OneHotEncoder() 45 | hot_encoder = hotenc.fit(data_categoricals) 46 | encoded_hot = hot_encoder.transform(data_categoricals) 47 | 48 | #4.4: 49 | new_data=data_non_categoricals 50 | new_data=new_data.astype(np.float) 51 | 52 | X_train, X_test, y_train, y_test = cross_validation.train_test_split(new_data, target, test_size=0.4, random_state=0,dtype=float) 53 | 54 | logreg = linear_model.LogisticRegression(tol=1e-10) 55 | logreg.fit(X_train,y_train) 56 | log_output = logreg.predict_log_proba(X_test) 57 | 58 | print("Odds: "+ str(np.exp(logreg.coef_))) 59 | print("Odds intercept" + str(np.exp(logreg.intercept_))) 60 | print("Likelihood Intercept:" + str(np.exp(logreg.intercept_)/(1+np.exp(logreg.intercept_)))) 61 | 62 | f, (ax1, ax2) = plt.subplots(1, 2, sharey=True) 63 | plt.setp((ax1,ax2),xticks=[]) 64 | 65 | ax1.scatter(range(0,len(log_output[:,1]),1),log_output[:,1],s=100,label='Log Prob.',color='Blue',alpha=0.5) 66 | ax1.scatter(range(0,len(y_test),1),y_test,label='Labels',s=250,color='Green',alpha=0.5) 67 | ax1.legend(bbox_to_anchor=(0., 1.02, 1., 0.102), ncol=2, loc=3, mode="expand", borderaxespad=0.) 68 | ax1.set_xlabel('Test Instances') 69 | ax1.set_ylabel('Binary Ground Truth Labels / Model Log. Prob.') 70 | 71 | prob_output = [np.exp(x) for x in log_output[:,1]] 72 | ax2.scatter(range(0,len(prob_output),1),prob_output,s=100,label='Prob.', color='Blue',alpha=0.5) 73 | ax2.scatter(range(0,len(y_test),1),y_test,label='Labels',s=250,color='Green',alpha=0.5) 74 | ax2.legend(bbox_to_anchor=(0., 1.02, 1., 0.102), ncol=2, loc=3, mode="expand", borderaxespad=0.) 75 | ax2.set_xlabel('Test Instances') 76 | ax2.set_ylabel('Binary Ground Truth Labels / Model Prob.') 77 | 78 | plt.show() 79 | 80 | #4.5: 81 | new_data = np.append(data_non_categoricals,encoded_hot.todense(),1) 82 | new_data=new_data.astype(np.float) 83 | 84 | X_train, X_test, y_train, y_test = cross_validation.train_test_split(new_data, target, test_size=0.4, random_state=0,dtype=float) 85 | 86 | logreg = linear_model.LogisticRegression(tol=1e-10) 87 | logreg.fit(X_train,y_train) 88 | log_output = logreg.predict_log_proba(X_test) 89 | 90 | print("Odds: "+ str(np.exp(logreg.coef_))) 91 | print("Odds intercept" + str(np.exp(logreg.intercept_))) 92 | print("Likelihood Intercept:" + str(np.exp(logreg.intercept_)/(1+np.exp(logreg.intercept_)))) 93 | 94 | f, (ax1, ax2) = plt.subplots(1, 2, sharey=True) 95 | plt.setp((ax1,ax2),xticks=[]) 96 | 97 | ax1.scatter(range(0,len(log_output[:,1]),1),log_output[:,1],s=100,label='Log Prob.',color='Blue',alpha=0.5) 98 | ax1.scatter(range(0,len(y_test),1),y_test,label='Labels',s=250,color='Green',alpha=0.5) 99 | ax1.legend(bbox_to_anchor=(0., 1.02, 1., 0.102), ncol=2, loc=3, mode="expand", borderaxespad=0.) 
100 | ax1.set_xlabel('Test Instances') 101 | ax1.set_ylabel('Binary Ground Truth Labels / Model Log. Prob.') 102 | 103 | prob_output = [np.exp(x) for x in log_output[:,1]] 104 | ax2.scatter(range(0,len(prob_output),1),prob_output,s=100,label='Prob.', color='Blue',alpha=0.5) 105 | ax2.scatter(range(0,len(y_test),1),y_test,label='Labels',s=250,color='Green',alpha=0.5) 106 | ax2.legend(bbox_to_anchor=(0., 1.02, 1., 0.102), ncol=2, loc=3, mode="expand", borderaxespad=0.) 107 | ax2.set_xlabel('Test Instances') 108 | ax2.set_ylabel('Binary Ground Truth Labels / Model Prob.') 109 | 110 | plt.show() 111 | 112 | 113 | -------------------------------------------------------------------------------- /errata/errata.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Manning Publications 8 | 9 | 10 | 52 | 53 | 54 | 55 | 56 |
57 | Algorithms of the Intelligent Web, 2nd Edition: Errata 58 |

59 | Any errors found in Algorithms of the Intelligent Web, 2nd Ed. should be reported using the online forum. I'll collect all errors from there and compile them in this errata for subsequent publications. Thanks for taking the time to contribute. 60 |

61 | 62 | 130 | 131 |

132 | -------------------------------------------------------------------------------- /ch6.py: -------------------------------------------------------------------------------- 1 | #6.3 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import random 5 | from sklearn.linear_model import perceptron 6 | 7 | #Let's set up our data and our target 8 | data = np.array([[0,1],[0,0],[1,0],[1,1]]) 9 | target = np.array([0,0,0,1]) 10 | 11 | #6.4 12 | p = perceptron.Perceptron(n_iter=100) 13 | p_out = p.fit(data,target) 14 | print p_out 15 | msg = ("Coefficients: %s, Intercept: %s") 16 | print msg % (str(p.coef_),str(p.intercept_)) 17 | 18 | #6.5 19 | colors = np.array(['k','r']) 20 | markers = np.array(['*','o']) 21 | for data,target in zip(data,target): 22 | plt.scatter(data[0],data[1],s=100,c=colors[target],marker=markers[target]) 23 | 24 | #Need to calculate a hyperplane the straight line as it intersects with z=0 25 | #Recall that our optimisation is solving z=m1x + m2y + c 26 | #If we want to understand the straight line created at the intersection with the viewing plane of x and y (where z=0) 27 | #0=m1x + m2y +c 28 | #m2y -m1x -c 29 | #y = -m1/m2x - c/m2 30 | 31 | grad = -p.coef_[0][0]/p.coef_[0][1] 32 | intercept = -p.intercept_/p.coef_[0][1] 33 | 34 | x_vals = np.linspace(0,1) 35 | y_vals = grad*x_vals + intercept 36 | plt.plot(x_vals,y_vals) 37 | plt.show() 38 | 39 | #6.X A HANDBUILT multilayer perceptron (NOT REPRODUCED IN THE BOOK). 40 | data = np.array([[0,1],[0,0],[1,0],[1,1]]) 41 | target = np.array([1,0,1,0]) 42 | 43 | colors = np.array(['k','r']) 44 | markers = np.array(['*','o']) 45 | for _data,_target in zip(data,target): 46 | plt.scatter(_data[0],_data[1],s=100,c=colors[_target],marker=markers[_target]) 47 | 48 | plt.xlabel('x_1') 49 | plt.ylabel('x_2') 50 | 51 | #Let's plot the hand built boolean classifier 52 | grad = -1 53 | intercept = 0.5 54 | 55 | x_vals = np.linspace(0,1) 56 | y_vals = grad*x_vals + intercept 57 | #plt.scatter(data[:,0],data[:,1],c=colors[target]) 58 | plt.plot(x_vals,y_vals,'b') 59 | 60 | grad = -1 61 | intercept = 1.5 62 | x_vals = np.linspace(0,1) 63 | y_vals = grad*x_vals + intercept 64 | plt.plot(x_vals,y_vals,'r') 65 | 66 | plt.xlabel('x_1') 67 | plt.ylabel('x_2') 68 | 69 | plt.text(0.8,-0.7,"x_2 = -x_1 + 0.5") 70 | plt.text(0,1.65,"x_2 = -x_1 + 1.5") 71 | plt.text(0.4,0.5,"y_1 = 1") 72 | plt.text(0.8,1.5,"y_1 = 0") 73 | plt.text(0,-0.5,"y_1 = 0") 74 | plt.show() 75 | 76 | #6.6 77 | from pybrain.structure import LinearLayer, SigmoidLayer 78 | from pybrain.datasets import SupervisedDataSet 79 | from pybrain.supervised.trainers import BackpropTrainer 80 | from pybrain.structure import FeedForwardNetwork 81 | from pybrain.structure import FullConnection 82 | from pybrain.structure.modules import BiasUnit 83 | 84 | import random 85 | 86 | #Create network modules 87 | net = FeedForwardNetwork() 88 | inl = LinearLayer(2) 89 | hidl = SigmoidLayer(2) 90 | outl = LinearLayer(1) 91 | b = BiasUnit() 92 | 93 | #6.7 94 | #Create connections 95 | in_to_h = FullConnection(inl, hidl) 96 | h_to_out = FullConnection(hidl, outl) 97 | bias_to_h = FullConnection(b,hidl) 98 | bias_to_out = FullConnection(b,outl) 99 | 100 | #Add modules to net 101 | net.addInputModule(inl) 102 | net.addModule(hidl); 103 | net.addModule(b) 104 | net.addOutputModule(outl) 105 | 106 | #Add connections to net and sort 107 | net.addConnection(in_to_h) 108 | net.addConnection(h_to_out) 109 | net.addConnection(bias_to_h) 110 | net.addConnection(bias_to_out) 111 | net.sortModules() 112 
| 113 | #6.8 114 | #input data 115 | d = [(0,0), 116 | (0,1), 117 | (1,0), 118 | (1,1)] 119 | 120 | #target class 121 | c = [0,1,1,0] 122 | 123 | data_set = SupervisedDataSet(2, 1) # 2 inputs, 1 output 124 | 125 | random.seed() 126 | for i in xrange(1000): 127 | r = random.randint(0,3) 128 | data_set.addSample(d[r],c[r]) 129 | 130 | backprop_trainer \ 131 | = BackpropTrainer(net, data_set, learningrate=0.1) 132 | 133 | for i in xrange(50): 134 | err = backprop_trainer.train() 135 | print "Iter. %d, err.: %.5f" % (i, err) 136 | 137 | #6.9 138 | print "[w(x_1,j=1),w(x_2,j=1),w(x_1,j=2),w(x_2,j=2)]: " + str(in_to_h.params) 139 | print "[w(j=1,j=3),w(j=2,j=3)]: "+str(h_to_out.params) 140 | print "[w(x_b,j=1),w(x_b,j=2)]: "+str(bias_to_h.params) 141 | print "[w(x_b,j=3)]:" +str(bias_to_out.params) 142 | 143 | #6.10 144 | print "Activating 0,0. Output: " + str(net.activate([0,0])) 145 | print "Activating 0,1. Output: " + str(net.activate([0,1])) 146 | print "Activating 1,0. Output: " + str(net.activate([1,0])) 147 | print "Activating 1,1. Output: " + str(net.activate([1,1])) 148 | 149 | 150 | ########### 151 | # From here onwards:RBMS 152 | 153 | # Original Authors: Yann N. Dauphin, Vlad Niculae, Gabriel Synnaeve 154 | # License: BSD 155 | 156 | import numpy as np 157 | import matplotlib.pyplot as plt 158 | 159 | from scipy.ndimage import convolve 160 | from sklearn import linear_model, datasets, metrics 161 | from sklearn.cross_validation import train_test_split 162 | from sklearn.neural_network import BernoulliRBM 163 | from sklearn.pipeline import Pipeline 164 | 165 | #6.12 166 | def nudge_dataset(X, Y): 167 | """ 168 | This produces a dataset 5 times bigger than the original one, 169 | by moving the 8x8 images in X around by 1px to left, right, down, up 170 | """ 171 | direction_vectors = [[[0, 1, 0],[0, 0, 0],[0, 0, 0]], 172 | [[0, 0, 0],[1, 0, 0],[0, 0, 0]], 173 | [[0, 0, 0],[0, 0, 1],[0, 0, 0]], 174 | [[0, 0, 0],[0, 0, 0],[0, 1, 0]]] 175 | shift = lambda x, w: convolve(x.reshape((8, 8)), mode='constant', weights=w).ravel() 176 | X = np.concatenate([X] + [np.apply_along_axis(shift, 1, X, vector) for vector in direction_vectors]) 177 | Y = np.concatenate([Y for _ in range(5)], axis=0) 178 | return X, Y 179 | 180 | #6.11 181 | digits = datasets.load_digits() 182 | X = np.asarray(digits.data, 'float32') 183 | X, Y = nudge_dataset(X, digits.target) 184 | X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001) # 0-1 scaling 185 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.2,random_state=0) 186 | 187 | #6.13 188 | # Models we will use 189 | logistic = linear_model.LogisticRegression() 190 | rbm = BernoulliRBM(random_state=0, verbose=True) 191 | 192 | classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)]) 193 | 194 | ############################################################################### 195 | # Training 196 | 197 | # Hyper-parameters. These were set by cross-validation, 198 | # using a GridSearchCV. Here we are not performing cross-validation to 199 | # save time. 
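# (A sketch of such a search, not part of the original listing and shown only for illustration,
#  could look like this with the sklearn 0.17 API used elsewhere in this repository:
#      from sklearn.grid_search import GridSearchCV
#      search = GridSearchCV(classifier, {'rbm__learning_rate': [0.01, 0.06, 0.1],
#                                         'rbm__n_components': [50, 100, 200],
#                                         'logistic__C': [1000.0, 6000.0, 10000.0]})
#      search.fit(X_train, Y_train)
#  The hand-tuned values below are used directly instead.)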
200 | rbm.learning_rate = 0.06 201 | rbm.n_iter = 20 202 | # More components tend to give better prediction performance, but larger 203 | # fitting time 204 | rbm.n_components = 100 205 | logistic.C = 6000.0 206 | 207 | # Training RBM-Logistic Pipeline 208 | classifier.fit(X_train, Y_train) 209 | 210 | # Training Logistic regression 211 | logistic_classifier = linear_model.LogisticRegression(C=100.0) 212 | logistic_classifier.fit(X_train, Y_train) 213 | 214 | #6.14 215 | # Evaluation 216 | print("Logistic regression using RBM features:\n%s\n" % ( 217 | metrics.classification_report( 218 | Y_test, 219 | classifier.predict(X_test)))) 220 | 221 | print("Logistic regression using raw pixel features:\n%s\n" % ( 222 | metrics.classification_report( 223 | Y_test, 224 | logistic_classifier.predict(X_test)))) 225 | 226 | 227 | #6.15 228 | plt.figure(figsize=(4.2, 4)) 229 | for i, comp in enumerate(rbm.components_): 230 | #print(i) 231 | #print(comp) 232 | plt.subplot(10, 10, i + 1) 233 | plt.imshow(comp.reshape((8, 8)), cmap=plt.cm.gray_r,interpolation='nearest') 234 | plt.xticks(()) 235 | plt.yticks(()) 236 | 237 | plt.suptitle('100 components extracted by RBM', fontsize=16) 238 | plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23) 239 | plt.show() 240 | -------------------------------------------------------------------------------- /ch7.py: -------------------------------------------------------------------------------- 1 | #7.1 2 | import math 3 | import random 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | import itertools 7 | 8 | #Assume 5000 samples per group 9 | n_experiment = 10000 10 | n_control = 10000 11 | 12 | p_experiment= 0.002 13 | p_control = 0.001 14 | 15 | se_experiment_sq = p_experiment*(1-p_experiment) / n_experiment 16 | se_control_sq = p_control*(1-p_control) / n_control 17 | 18 | Z = (p_experiment-p_control)/math.sqrt(se_experiment_sq+se_control_sq) 19 | 20 | print Z 21 | 22 | #Not a listing, but mentioned in the text and is required for production. 23 | def get_z(n_experiment, n_control, p_experiment, p_control): 24 | var_experiment = p_experiment*(1-p_experiment) / n_experiment 25 | var_control = p_control*(1-p_control) / n_control 26 | Z = (p_experiment-p_control)/math.sqrt(var_experiment+var_control) 27 | return Z 28 | 29 | experiment_array = np.linspace(100, 20000, 100) 30 | control_array = np.linspace(100,20000,100) 31 | experiment_probability_array = np.empty(100); experiment_probability_array.fill(0.002) 32 | control_probability_array = np.empty(100); control_probability_array.fill(0.001) 33 | data = zip(experiment_array,control_array,experiment_probability_array,control_probability_array) 34 | #Need to create associated parameters and zip these together. 
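#Compute the z value for each pair of group sizes, holding the conversion rates fixed at 0.002 and 0.001.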
35 | z_results = [get_z(k[0],k[1],k[2],k[3]) for k in data] 36 | 37 | plt.plot(experiment_array,z_results) 38 | #95 % confidence interval 39 | x_values = [0,20000] 40 | y_values = [1.96,1.96] 41 | plt.plot(x_values,y_values) 42 | plt.text(x_values[0],y_values[0]+0.01,"95%") 43 | 44 | #90% confidence interval 45 | x_values = [0,20000] 46 | y_values = [1.645,1.645] 47 | plt.plot(x_values,y_values) 48 | plt.text(x_values[0],y_values[0]+0.01,"90%") 49 | 50 | #80% confidence interval 1.28 51 | x_values = [0,20000] 52 | y_values = [1.28,1.28] 53 | plt.plot(x_values,y_values) 54 | plt.text(x_values[0],y_values[0]+0.01,"80%") 55 | 56 | #70% confidence interval 1.04 57 | x_values = [0,20000] 58 | y_values = [1.04,1.04] 59 | plt.plot(x_values,y_values) 60 | plt.text(x_values[0],y_values[0]+0.01,"70%") 61 | 62 | plt.xlabel("Number of users in each A/B group") 63 | plt.ylabel("z value") 64 | 65 | plt.title("Graph of number of users against z value for a fixed conversion rate \n (0.001/0.002 A/B respectively)") 66 | plt.show() 67 | 68 | ##Bayesian Bandit 69 | #7.6 70 | from scipy.stats import beta 71 | import numpy as np 72 | import matplotlib.pyplot as plt 73 | import random 74 | from operator import sub, div, add 75 | 76 | class Bandit: 77 | def __init__(self,probability): 78 | self.probability=probability 79 | 80 | def pull_handle(self): 81 | if random.random()1: 45 | idat['Title'] = p[1] 46 | if len(p)>2: 47 | idat['Genres'] = p[2] 48 | item.add_data(idat) 49 | itemdict[p[0]] = item 50 | return itemdict 51 | 52 | def items_reviewed(user_id, userdict): 53 | return [x[0] for x in userdict[user_id].get_items()] 54 | 55 | def get_score_item_reviewed(user_id,item_id, userdict): 56 | #return the first score in the list that matches the id 57 | return [x[1] for x in userdict[user_id].get_items() if x[0]==int(item_id)][0] 58 | 59 | def get_name_item_reviewed(user_id,userdict,itemdict): 60 | l = [(x[0],itemdict[str(x[0])].get_data()['Title'],itemdict[str(x[0])].get_data()['Genres'],x[1]) for x in userdict[user_id].get_items()] 61 | return sorted(l,key=lambda x: x[3], reverse=True) 62 | 63 | #Pass a list of (id, title, filmcategory,rating) and a filter_value (0,5] 64 | #Returns a list of [category,#count of ratings>filter_value], 65 | def movies_by_category(movie_list,filter_value): 66 | d = {} 67 | for x in movie_list: 68 | if x[3]>filter_value: 69 | if str(x[2]) in d: 70 | d[str(x[2])]+=1 71 | else: 72 | d[str(x[2])]=1 73 | dictlist = [] 74 | for key, value in d.iteritems(): 75 | temp = [key,value] 76 | dictlist.append(temp) 77 | 78 | return dictlist 79 | 80 | #End Helpers 81 | 82 | #3.2 83 | def similarity(user_id_a,user_id_b,sim_type=0): 84 | user_a_tuple_list = userdict[user_id_a].get_items() 85 | user_b_tuple_list = userdict[user_id_b].get_items() 86 | common_items=0 87 | sim = 0.0 88 | for t1 in user_a_tuple_list: 89 | for t2 in user_b_tuple_list: 90 | if (t1[0] == t2[0]): 91 | common_items += 1 92 | sim += math.pow(t1[1]-t2[1],2) 93 | if common_items>0: 94 | sim = math.sqrt(sim/common_items) 95 | sim = 1.0 - math.tanh(sim) 96 | if sim_type==1: 97 | max_common = min(len(user_a_tuple_list),len(user_b_tuple_list)) 98 | sim = sim * common_items / max_common 99 | print "User Similarity between",names[user_id_a],"and",names[user_id_b],"is", sim 100 | return sim #If no common items, returns zero 101 | 102 | #3.1 103 | # load movielens data 104 | dat_file = 'ratings-11.dat' 105 | item_file = 'movies-11.dat' 106 | 107 | names = ['Frank','Constantine','Catherine'] 108 | 109 | userdict = 
read_user_data_from_ratings(dat_file) #Build a userdict with users and ratings 110 | itemdict = read_item_data(item_file) #Build an item, info dict 111 | 112 | similarity(0,1,sim_type=0) 113 | similarity(0,1,sim_type=1) 114 | similarity(0,2,sim_type=0) 115 | similarity(1,2,sim_type=0) 116 | similarity(2,1,sim_type=0) 117 | similarity(0,0,sim_type=0) 118 | similarity(0,0,sim_type=1) 119 | 120 | #3.7 121 | class RatingCountMatrix: 122 | user_id_a = None 123 | user_id_b = None 124 | matrix = None 125 | 126 | #Instantiate with two users and the total possible number of ratings (eg 5 in the movielens case) 127 | def __init__(self, user_id_a, user_id_b): 128 | num_rating_values = max([x[0] for x in data]) 129 | self.user_id_a = user_id_a 130 | self.user_id_b = user_id_b 131 | self.matrix = np.empty((num_rating_values,num_rating_values,)) 132 | self.matrix[:] = 0 133 | self.calculate_matrix(user_id_a,user_id_b) 134 | 135 | def get_shape(self): 136 | a = self.matrix.shape 137 | return a 138 | 139 | def get_matrix(self): 140 | return self.matrix 141 | 142 | def calculate_matrix(self,user_id_a, user_id_b): 143 | for item in items_reviewed(user_id_a, userdict): 144 | if int(item) in items_reviewed(user_id_b, userdict): 145 | i = get_score_item_reviewed(user_id_a,item, userdict)-1 #Need to subtract 1 as indexes are 0 to 4 (5 items) 146 | j = get_score_item_reviewed(user_id_b,item, userdict)-1 147 | self.matrix[i][j] +=1 148 | 149 | 150 | #Total number of items that the users have both ranked 151 | def get_total_count(self): 152 | return self.matrix.sum() 153 | 154 | #Total number of items that they both agree on 155 | def get_agreement_count(self): 156 | return np.trace(self.matrix) #sum across the diagonal 157 | 158 | #3.6 159 | class SimilarityMatrix: 160 | 161 | similarity_matrix = None 162 | 163 | def __init__(self): 164 | self.build() 165 | 166 | def build(self): 167 | self.similarity_matrix = np.empty((len(userdict),len(userdict),)) 168 | 169 | for u in range(0,len(userdict)): 170 | for v in range(u+1,len(userdict)): 171 | rcm = RatingCountMatrix(int(u),int(v)) 172 | if(rcm.get_agreement_count()>0): 173 | self.similarity_matrix[u][v] = rcm.get_agreement_count()/rcm.get_total_count() 174 | else: 175 | self.similarity_matrix[u][v] = 0 176 | self.similarity_matrix[u][u]=1 177 | 178 | def get_user_similarity(self,user_id1, user_id2): 179 | return self.similarity_matrix[min(user_id1,user_id2),max(user_id1,user_id2)] # Due to upper traingular form 180 | 181 | # 3.5: 182 | def predict_rating(user_id, item_id): 183 | estimated_rating = None; 184 | similarity_sum = 0; 185 | weighted_rating_sum = 0; 186 | 187 | if (int(item_id) in items_reviewed(user_id,userdict)): 188 | return get_score_item_reviewed(user_id,item_id,userdict) 189 | else: 190 | for u in userdict.keys(): 191 | if (int(item_id) in items_reviewed(u,userdict)): 192 | item_rating = get_score_item_reviewed(u,item_id,userdict) 193 | user_similarity = similarity_matrix.get_user_similarity(user_id,u) 194 | weighted_rating = user_similarity * item_rating 195 | weighted_rating_sum += weighted_rating 196 | similarity_sum += user_similarity 197 | 198 | if (similarity_sum > 0.0): 199 | estimated_rating = weighted_rating_sum / similarity_sum 200 | 201 | return estimated_rating 202 | 203 | # 3.4: 204 | def recommend(user_id, top_n): 205 | #[(item,value),(item1, value1)...] 
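#Rank every item the user has not yet rated by its predicted rating and return the top_n highest.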
206 | recommendations = [] 207 | for i in itemdict.keys(): 208 | if (int(i) not in items_reviewed(int(user_id),userdict)): 209 | recommendations.append((i,predict_rating(user_id, i))) #only get those not predicted. 210 | recommendations.sort(key=lambda t: t[1], reverse=True) 211 | return recommendations[:top_n] 212 | 213 | #3.3: 214 | data = Data() 215 | format = {'col':0, 'row':1, 'value':2, 'ids': 'int'} 216 | # About format parameter: 217 | # 'row': 1 -> Rows in matrix come from column 1 in ratings.dat file 218 | # 'value': 2 -> Values (Mij) in matrix come from column 2 in ratings.dat file 219 | # 'ids': int -> Ids (row and col ids) are integers (not strings) 220 | data.load(dat_file, sep='::', format=format) 221 | 222 | similarity_matrix = SimilarityMatrix() 223 | recommend(0,10) 224 | recommend(1,10) 225 | recommend(2,10) 226 | 227 | ################## 228 | #Now we do SVD 229 | ################## 230 | 231 | #3.8 232 | svd = SVD() 233 | recsys.algorithm.VERBOSE = True 234 | 235 | dat_file = './ml-1m/ratings.dat' 236 | item_file = './ml-1m/movies.dat' 237 | 238 | data = Data() 239 | data.load(dat_file, sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int}) 240 | 241 | items_full = read_item_data(item_file) 242 | user_full = read_user_data_from_ratings(dat_file) 243 | 244 | svd.set_data(data) 245 | 246 | #3.9 247 | k = 100 248 | svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True) 249 | films = svd.recommend(10,only_unknowns=True, is_row=False) #movies that user 10 should see (That they haven't rated) 250 | 251 | #3.10 252 | [items_full[str(x[0])].get_data() for x in films] 253 | 254 | #3.11 255 | get_name_item_reviewed(10,user_full,items_full) 256 | 257 | #3.12 258 | items_full[str(2628)].get_data() 259 | users_for_star_wars = svd.recommend(2628,only_unknowns=True) 260 | users_for_star_wars 261 | 262 | #3.13 263 | movies_reviewed_by_sw_rec =[get_name_item_reviewed(x[0],user_full,items_full) for x in users_for_star_wars] 264 | movies_flatten = [movie for movie_list in movies_reviewed_by_sw_rec for movie in movie_list] 265 | movie_aggregate = movies_by_category(movies_flatten, 3) 266 | movies_sort = sorted(movie_aggregate,key=lambda x: x[1], reverse=True) 267 | movies_sort 268 | 269 | #3.14 270 | from recsys.evaluation.prediction import RMSE 271 | err = RMSE() 272 | for rating, item_id, user_id in data.get(): 273 | try: 274 | prediction = svd.predict(item_id, user_id) 275 | err.add(rating, prediction) 276 | except KeyError, k: 277 | continue 278 | 279 | print 'RMSE is ' + str(err.compute()) 280 | --------------------------------------------------------------------------------