├── .gitattributes ├── LICENSE ├── README.md ├── score.py └── score_streamsvm.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | *.sln merge=union 7 | *.csproj merge=union 8 | *.vbproj merge=union 9 | *.fsproj merge=union 10 | *.dbproj merge=union 11 | 12 | # Standard to msysgit 13 | *.doc diff=astextplain 14 | *.DOC diff=astextplain 15 | *.docx diff=astextplain 16 | *.DOCX diff=astextplain 17 | *.dot diff=astextplain 18 | *.DOT diff=astextplain 19 | *.pdf diff=astextplain 20 | *.PDF diff=astextplain 21 | *.rtf diff=astextplain 22 | *.RTF diff=astextplain 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Zygmunt Zając 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Large scale linear learners 2 | =========================== 3 | 4 | Code for scoring predictions from VW, Liblinear and StreamSVM on _webspam_. 5 | 6 | score.py - compute accuracy, AUC and confusion matrix for VW and Liblinear predictions 7 | score_streamsvm.py - the same thing for StreamSVM 8 | 9 | See: 10 | * [http://fastml.com/comparing-large-scale-linear-learners/](http://fastml.com/comparing-large-scale-linear-learners/) 11 | * [https://www.indiegogo.com/projects/large-scale-linear-learners-compared](https://www.indiegogo.com/projects/large-scale-linear-learners-compared) 12 | * [http://fastml.com/vowpal-wabbit-liblinear-sbm-and-streamsvm-compared/](http://fastml.com/vowpal-wabbit-liblinear-sbm-and-streamsvm-compared/) 13 | 14 | -------------------------------------------------------------------------------- /score.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 'compute metrics for libsvm test file and VW/Liblinear predictions file' 4 | 5 | import sys 6 | import numpy as np 7 | from sklearn.metrics import accuracy_score as accuracy 8 | from sklearn.metrics import roc_auc_score as AUC 9 | from sklearn.metrics import confusion_matrix 10 | 11 | y_file = sys.argv[1] 12 | p_file = sys.argv[2] 13 | 14 | print "loading p..." 15 | 16 | p = np.loadtxt( p_file ) 17 | 18 | y_predicted = np.ones(( p.shape[0] )) 19 | y_predicted[p < 0] = -1 20 | 21 | print "loading y..." 
22 | 23 | y = np.loadtxt( y_file, usecols= [0] ) 24 | 25 | print "accuracy:", accuracy( y, y_predicted ) 26 | print "AUC:", AUC( y, p ) 27 | 28 | print 29 | print "confusion matrix:" 30 | print confusion_matrix( y, y_predicted ) 31 | 32 | 33 | """ 34 | run score.py data/test_v.txt vw/p_v_logistic.txt 35 | 36 | accuracy: 0.994675826535 37 | 38 | confusion matrix: 39 | [[27444 136] 40 | [ 236 42054]] 41 | 42 | AUC: 0.998418419401 43 | """ 44 | 45 | """ 46 | p_v_hinge.txt 47 | 48 | accuracy: 0.993502218406 49 | 50 | confusion matrix: 51 | [[27310 270] 52 | [ 184 42106]] 53 | 54 | AUC: 0.99632599445 55 | """ 56 | 57 | """ 58 | cdblock 59 | 60 | accuracy: 0.993244597109 61 | AUC: 0.993511427279 62 | 63 | confusion matrix: 64 | [[27436 144] 65 | [ 328 41962]] 66 | """ 67 | 68 | """ 69 | cdblock -s 7 (logistic regression) 70 | accuracy: 0.985201087734 71 | AUC: 0.985763288671 72 | 73 | confusion matrix: 74 | [[27261 319] 75 | [ 715 41575]] 76 | """ 77 | 78 | """ 79 | score_streamsvm.py (hinge) 80 | 81 | accuracy: 0.990596822671 82 | AUC: 0.991292619197 83 | 84 | confusion matrix: 85 | [[27431 149] 86 | [ 508 41782]] 87 | 88 | 89 | (ui) 90 | accuracy: 0.990596822671 91 | AUC: 0.998972438313 92 | 93 | confusion matrix: 94 | [[27431 149] 95 | [ 508 41782]] 96 | 97 | """ -------------------------------------------------------------------------------- /score_streamsvm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 'compute metrics for libsvm test file and StreamSVM predictions file' 4 | 5 | import sys 6 | import numpy as np 7 | from sklearn.metrics import accuracy_score as accuracy 8 | from sklearn.metrics import roc_auc_score as AUC 9 | from sklearn.metrics import confusion_matrix 10 | 11 | y_file = sys.argv[1] 12 | p_file = sys.argv[2] 13 | 14 | print "loading p..." 
# Scores come from the second whitespace-separated column (index 1) of the
# predictions file; the other columns are not read.
p = np.loadtxt(p_file, usecols=[1])

# Hard labels: start from +1 everywhere, then flip to -1 where the score is
# negative (a score of exactly 0 stays +1).
y_predicted = np.ones(p.shape[0])
y_predicted[p < 0] = -1

# Single-argument print(...) parses on both Python 2 and Python 3 and writes
# the same text the former print statements did.
print("loading y...")

# Only the first column of the test file is read and used as the true labels.
y = np.loadtxt(y_file, usecols=[0])

# str() + concatenation reproduces the "label value" output of the
# two-operand print statement without relying on print-statement syntax.
print("accuracy: " + str(accuracy(y, y_predicted)))
# AUC is computed from the raw scores, not the thresholded labels.
print("AUC: " + str(AUC(y, p)))

# print("") rather than print(): under Python 2 the latter would print "()".
print("")
print("confusion matrix:")
print(confusion_matrix(y, y_predicted))