├── .gitignore
├── OWLQN.cpp
├── OWLQN.h
├── README.md
├── TerminationCriterion.cpp
├── TerminationCriterion.h
├── leastSquares.cpp
├── leastSquares.h
├── logreg.cpp
├── logreg.h
├── main.cpp
└── pics
    ├── data_flow.jpg
    ├── lr.jpg
    ├── lr_formular.jpg
    ├── lr_owlqn.jpg
    └── owlqn.jpg

/.gitignore:
--------------------------------------------------------------------------------
*.tlog
*.obj
Debug/owlqnProj.exe.intermediate.manifest
*.lastbuildstate
Debug/owlqnProj.log
*.cache
Debug/vc100.idb
*.pdb
owlqnProj.vcxproj
*.filters
owlqnProj.vcxproj.user
--------------------------------------------------------------------------------
/OWLQN.cpp:
--------------------------------------------------------------------------------
#include "OWLQN.h"

#include "TerminationCriterion.h"

#include <deque>
#include <iostream>
#include <iomanip>
#include <sstream>
#include <cmath>

using namespace std;

// Vector helpers
double OptimizerState::dotProduct(const DblVec& a, const DblVec& b) {
    double result = 0;
    for (size_t i=0; i<a.size(); i++) {
        result += a[i] * b[i];
    }
    return result;
}

// a += b * c
void OptimizerState::addMult(DblVec& a, const DblVec& b, double c) {
    for (size_t i=0; i<a.size(); i++) {
        a[i] += b[i] * c;
    }
}

// a += b
void OptimizerState::add(DblVec& a, const DblVec& b) {
    for (size_t i=0; i<a.size(); i++) {
        a[i] += b[i];
    }
}

// a = b + c * d
void OptimizerState::addMultInto(DblVec& a, const DblVec& b, const DblVec& c, double d) {
    for (size_t i=0; i<a.size(); i++) {
        a[i] = b[i] + c[i] * d;
    }
}

// a *= b
void OptimizerState::scale(DblVec& a, double b) {
    for (size_t i=0; i<a.size(); i++) {
        a[i] *= b;
    }
}

// a = b * c
void OptimizerState::scaleInto(DblVec& a, const DblVec& b, double c) {
    for (size_t i=0; i<a.size(); i++) {
        a[i] = b[i] * c;
    }
}

// OWLQN
// Compute the descent direction dir from first-order information:
// the negative of the pseudo-gradient
void OptimizerState::MakeSteepestDescDir() {

    if (l1weight == 0) {
        // When the L1 weight is 0, the search direction dir is simply
        // the negative gradient of the loss
        scaleInto(dir, grad, -1);
    } else {
        // Otherwise determine the search direction from the loss gradient
        // and the L1 weight
        for (size_t i=0; i<dim; i++) {
            if (x[i] < 0) {
                // xi < 0: |xi| = -xi, so the L1 term's derivative is -l1weight;
                // descend along the negative gradient
                dir[i] = -grad[i] + l1weight;
            } else if (x[i] > 0) {
                // xi > 0: |xi| = xi, so the L1 term's derivative is l1weight;
                // descend along the negative gradient
                dir[i] = -grad[i] - l1weight;
            } else { // xi == 0
                if (grad[i] < -l1weight) {
                    // xi == 0 and the right derivative grad[i] + l1weight < 0:
                    // the pseudo-gradient takes the right derivative, so
                    // dir[i] > 0 and we step into the positive orthant
                    dir[i] = -grad[i] - l1weight;
                } else if (grad[i] > l1weight) {
                    // xi == 0 and the left derivative grad[i] - l1weight > 0:
                    // the pseudo-gradient takes the left derivative, so
                    // dir[i] < 0 and we step into the negative orthant
                    dir[i] = -grad[i] + l1weight;
                } else {
                    // xi == 0 and neither one-sided derivative gives descent:
                    // the pseudo-gradient is 0, so the descent direction is 0
                    dir[i] = 0;
                }
            }
        }
    }

    // Record the current steepest descent direction
    steepestDescDir = dir;
}

// L-BFGS
// Refine the descent direction dir with approximate second-order information.
// The L-BFGS two-loop recursion: use the last m updates to approximate
// multiplication by the inverse Hessian, yielding the current descent direction
void OptimizerState::MapDirByInverseHessian() {
    int count = (int)sList.size(); // number of remembered updates (at most m)

    if (count != 0) {
        // first loop
        for (int i = count - 1; i >= 0; i--) {
            // Unlike the paper, rho is stored without taking the reciprocal,
            // hence the division; alpha is also stored with a negated sign
            alphas[i] = -dotProduct(*sList[i], dir) / roList[i];
            addMult(dir, *yList[i], alphas[i]);
        }

        // Scale by gamma = s^T y / y^T y, the paper's initial Hessian scaling;
        // roList[count - 1] already holds s^T y, which simplifies the computation
        const DblVec& lastY = *yList[count - 1];
        double yDotY = dotProduct(lastY, lastY);
        double scalar = roList[count - 1] / yDotY;
        scale(dir, scalar);

        // second loop
        for (int i = 0; i < count; i++) {
            // Again rho is stored without the reciprocal, hence the division
            double beta = dotProduct(*yList[i], dir) / roList[i];
            addMult(dir, *sList[i], -alphas[i] - beta);
        }
    }
}
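// (Annotation, not part of the original source.) For reference, the textbook
// two-loop recursion (Nocedal & Wright, Numerical Optimization) computes
//   q = -(pseudo-)gradient
//   for i = k-1 .. k-m:  alpha_i = rho_i * s_i^T q;  q -= alpha_i * y_i
//   q *= gamma_k, with gamma_k = s_{k-1}^T y_{k-1} / (y_{k-1}^T y_{k-1})
//   for i = k-m .. k-1:  beta = rho_i * y_i^T q;     q += (alpha_i - beta) * s_i
// where rho_i = 1 / (s_i^T y_i). Since roList[i] here stores s_i^T y_i
// directly, every rho_i product above becomes a division, and since alphas[i]
// is stored negated, the final update uses (-alphas[i] - beta) in place of
// (alpha_i - beta). The result is identical.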
// Project the direction: zero out any component whose sign disagrees with the
// steepest descent direction, so the step stays inside the chosen orthant
void OptimizerState::FixDirSigns() {
    // Only needed when the L1 regularizer is active
    if (l1weight > 0) {
        // dim is the number of parameter (feature) dimensions
        for (size_t i = 0; i < dim; i++) {
            if (dir[i] * steepestDescDir[i] <= 0) {
                dir[i] = 0;
            }
        }
    }
}

// Directional derivative of the L1-regularized objective along dir,
// used by the line search
double OptimizerState::DirDeriv() const {
    if (l1weight == 0) {
        return dotProduct(dir, grad);
    } else {
        double val = 0.0;
        for (size_t i = 0; i < dim; i++) {
            if (dir[i] != 0) {
                if (x[i] < 0) {
                    val += dir[i] * (grad[i] - l1weight);
                } else if (x[i] > 0) {
                    val += dir[i] * (grad[i] + l1weight);
                } else if (dir[i] < 0) {
                    val += dir[i] * (grad[i] - l1weight);
                } else if (dir[i] > 0) {
                    val += dir[i] * (grad[i] + l1weight);
                }
            }
        }

        return val;
    }
}

// Compute the new search point newX from x, dir, and the step size alpha
void OptimizerState::GetNextPoint(double alpha) {
    addMultInto(newX, x, dir, alpha);
    if (l1weight > 0) {
        // If a coordinate crossed zero (changed orthant), project it back to 0
        for (size_t i=0; i<dim; i++) {
            if (x[i] * newX[i] < 0.0) {
                newX[i] = 0.0;
            }
        }
    }
}

// Evaluate the L1-regularized objective at newX,
// filling newGrad with the gradient of the smooth part
double OptimizerState::EvalL1() {
    double val = func.Eval(newX, newGrad);
    if (l1weight > 0) {
        for (size_t i=0; i<dim; i++) {
            val += fabs(newX[i]) * l1weight;
        }
    }

    return val;
}

// Backtracking line search for the step size
void OptimizerState::BackTrackingLineSearch() {
    // Directional derivative at the current point along dir;
    // it must be negative for dir to be a descent direction
    double origDirDeriv = DirDeriv();
    if (origDirDeriv >= 0) {
        cerr << "L-BFGS chose a non-descent direction: check your gradient!" << endl;
        exit(1);
    }

    double alpha = 1.0;
    double backoff = 0.5;

    // On the first iteration
    if (iter == 1) {
        //alpha = 0.1;
        //backoff = 0.5;
        // Compute the Euclidean norm of dir
        double normDir = sqrt(dotProduct(dir, dir));
        // and use special initial values for alpha and backoff
        alpha = (1 / normDir);
        backoff = 0.1;
    }

    const double c1 = 1e-4;
    double oldValue = value; // remember the previous objective value

    while (true) {
        // Compute the new search point newX from x, dir, and alpha
        GetNextPoint(alpha);
        // Evaluate the new gradient newGrad and the new objective value at newX
        value = EvalL1();

        // Sufficient-decrease (Armijo) condition that stops the line search
        if (value <= oldValue + c1 * origDirDeriv * alpha) break;

        if (!quiet) cout << "." << flush;

        // Otherwise back off the step size, i.e. alpha = beta^n
        alpha *= backoff;
    }

    if (!quiet) cout << endl;
}

// State transition: update the two L-BFGS memory lists
void OptimizerState::Shift() {
    DblVec *nextS = NULL, *nextY = NULL;

    // current number of remembered updates
    int listSize = (int)sList.size();

    // Early on there are fewer than m entries, so allocate new ones
    if (listSize < m) {
        try {
            nextS = new vector<double>(dim);
            nextY = new vector<double>(dim);
        } catch (bad_alloc&) {
            // On allocation failure, shrink m to the number of entries we already have
            m = listSize;
            if (nextS != NULL) { // if S was allocated but Y failed, free S
                delete nextS;
                nextS = NULL;
            }
        }
    }

    // If nothing new was allocated (the lists are already at capacity m),
    // recycle the oldest entries
    if (nextS == NULL) {
        nextS = sList.front();
        sList.pop_front();  // drop the oldest s
        nextY = yList.front();
        yList.pop_front();  // drop the oldest y
        roList.pop_front(); // drop the oldest rho
    }

    // Store the parameter difference and the gradient difference
    // into *nextS and *nextY
    addMultInto(*nextS, newX, x, -1);
    addMultInto(*nextY, newGrad, grad, -1);

    // Compute the new rho; unlike the paper, the reciprocal is not taken
    double ro = dotProduct(*nextS, *nextY);

    // Save the new entries
    sList.push_back(nextS);
    yList.push_back(nextY);
    roList.push_back(ro);

    // Make the new point and gradient current
    x.swap(newX);
    grad.swap(newGrad);

    // Increment the iteration counter
    iter++;
}

// The minimization driver.
// Arguments, in order: the objective, the initial parameters, the parameters
// at convergence (the output), the L1 regularization weight, the convergence
// tolerance, and the number of past steps the limited memory remembers
void OWLQN::Minimize(DifferentiableFunction& function, const DblVec& initial, DblVec& minimum, double l1weight, double tol, int m) const {
    // The state carries the objective, the initial parameters, the memory
    // size m, the L1 weight, and the quiet flag
    OptimizerState state(function, initial, m, l1weight, quiet);

    if (!quiet) {
        cout << setprecision(4) << scientific << right;
        cout << endl << "Optimizing function of " << state.dim << " variables with OWL-QN parameters:" << endl;
        cout << "   l1 regularization weight: " << l1weight << "." << endl;
        cout << "   L-BFGS memory parameter (m): " << m << endl;
        cout << "   Convergence tolerance: " << tol << endl;
        cout << endl;
        cout << "Iter    n:  new_value    (conv_crit)   line_search" << endl << flush;
        cout << "Iter    0:  " << setw(10) << state.value << "  (***********) " << flush;
    }

    // Seed the termination criterion with the initial objective value
    ostringstream str;
    termCrit->GetValue(state, str);

    while (true) {
        // Update the search direction
        state.UpdateDir();
        // Find the step size
        state.BackTrackingLineSearch();

        // Check the termination criterion
        ostringstream str;
        // the recent decrease in loss, relative to the current loss
        double termCritVal = termCrit->GetValue(state, str);
        if (!quiet) {
            cout << "Iter " << setw(4) << state.iter << ": " << setw(10) << state.value;
            cout << str.str() << flush;
        }
        // Stop iterating once that ratio falls below the tolerance
        if (termCritVal < tol) break;

        // Shift the state
        state.Shift();
    }

    if (!quiet) cout << endl;

    // Copy the final parameters into the output variable
    minimum = state.newX;
}
--------------------------------------------------------------------------------
/OWLQN.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strint/LogisticRegression_OWLQN_Notes/af82f1f4fb818f19600efa5000a378950b720ce7/OWLQN.h
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Logistic Regression and OWL-QN: Algorithm and Implementation
## Logistic Regression
### The logistic regression model

![](pics/lr.jpg)

### Formulas and computation flow of the logistic regression code

![](pics/lr_formular.jpg)
![](pics/lr_owlqn.jpg)


## OWLQN
* BFGS, LBFGS: [Anders Skajaa. Limited Memory BFGS for Nonsmooth Optimization. 2010](http://www.cs.nyu.edu/overton/mstheses/skajaa/msthesis.pdf)
* LBFGS: [J. Nocedal and S. Wright. Numerical Optimization. Springer, 2nd edition, 2006.](http://home.agh.edu.pl/~pba/pdfdoc/Numerical_Optimization.pdf)
* OWLQN: [Galen Andrew and Jianfeng Gao. Scalable Training of L1-Regularized Log-Linear Models. 2007](http://research.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf)

![](pics/owlqn.jpg)
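For quick reference, the pseudo-gradient that OWL-QN substitutes for the gradient of the non-differentiable objective $f(x) = \ell(x) + C\|x\|_1$ is (this formula is quoted from the Andrew & Gao paper above, not from the original notes):

$$
\diamond_i f(x) =
\begin{cases}
\partial_i^- f(x), & \text{if } \partial_i^- f(x) > 0 \\
\partial_i^+ f(x), & \text{if } \partial_i^+ f(x) < 0 \\
0, & \text{otherwise,}
\end{cases}
\qquad
\partial_i^{\pm} f(x) = \frac{\partial \ell(x)}{\partial x_i} +
\begin{cases}
C \operatorname{sign}(x_i), & x_i \neq 0 \\
\pm C, & x_i = 0,
\end{cases}
$$

where $\ell$ is the smooth loss and $C$ is `l1weight` in the code. The annotated `MakeSteepestDescDir` shown in the Code section below computes `dir[i]` $= -\diamond_i f(x)$ case by case.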

## Data flow and data dependencies in logistic regression and OWL-QN
![](pics/data_flow.jpg)

## Code
The code included here is the implementation released with [Galen Andrew and Jianfeng Gao. 2007](http://research.microsoft.com/en-us/downloads/b1eb1016-1738-4bd5-83a9-370c9d498a03/default.aspx). Fairly detailed comments have been added to it; see the other files in this directory for the fully annotated code. The annotated pseudo-gradient computation is:

```cpp
// OWLQN
// Compute the descent direction dir from first-order information:
// the negative of the pseudo-gradient
void OptimizerState::MakeSteepestDescDir() {

	if (l1weight == 0) {
		// When the L1 weight is 0, the search direction dir is simply
		// the negative gradient of the loss
		scaleInto(dir, grad, -1);
	} else {
		// Otherwise determine the search direction from the loss gradient
		// and the L1 weight
		for (size_t i=0; i<dim; i++) {
			if (x[i] < 0) {
				// xi < 0: |xi| = -xi, so the L1 term's derivative is -l1weight;
				// descend along the negative gradient
				dir[i] = -grad[i] + l1weight;
			} else if (x[i] > 0) {
				// xi > 0: |xi| = xi, so the L1 term's derivative is l1weight;
				// descend along the negative gradient
				dir[i] = -grad[i] - l1weight;
			} else { // xi == 0
				if (grad[i] < -l1weight) {
					// right derivative grad[i] + l1weight < 0: the pseudo-gradient
					// takes the right derivative; dir[i] > 0, toward the positive orthant
					dir[i] = -grad[i] - l1weight;
				} else if (grad[i] > l1weight) {
					// left derivative grad[i] - l1weight > 0: the pseudo-gradient
					// takes the left derivative; dir[i] < 0, toward the negative orthant
					dir[i] = -grad[i] + l1weight;
				} else {
					// neither one-sided derivative gives descent: direction is 0
					dir[i] = 0;
				}
			}
		}
	}

	// Record the current steepest descent direction
	steepestDescDir = dir;
}
```
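For orientation, here is a minimal, hypothetical caller; it is not part of the original sources. It assumes only what `OWLQN.h` declares and what `main.cpp` already relies on: the `DblVec` typedef (a `std::vector<double>`), the `DifferentiableFunction` interface with a single `Eval` method, and `OWLQN::Minimize`:

```cpp
#include "OWLQN.h"

// Hypothetical toy objective: f(x) = 0.5 * ||x - c||^2, with gradient x - c.
struct ShiftedQuadratic : public DifferentiableFunction {
	DblVec c;
	explicit ShiftedQuadratic(const DblVec& center) : c(center) { }

	double Eval(const DblVec& x, DblVec& grad) {
		double val = 0;
		for (size_t i = 0; i < x.size(); i++) {
			grad[i] = x[i] - c[i];           // gradient of the smooth part
			val += 0.5 * grad[i] * grad[i];  // 0.5 * (x_i - c_i)^2
		}
		return val;
	}
};

int main() {
	DblVec center(3, 1.0), init(3, 0.0), ans(3);
	ShiftedQuadratic obj(center);

	OWLQN opt(false);  // quiet = false, as in main.cpp
	// objective, initial point, result, l1weight, tolerance, memory size m
	opt.Minimize(obj, init, ans, 0.5, 1e-4, 10);
	return 0;
}
```

With `l1weight = 0.5` and `c = (1, 1, 1)`, the minimizer is the soft-thresholded point `(0.5, 0.5, 0.5)`, which makes a handy sanity check.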

## Related projects
[Parallel logistic regression](https://github.com/xswang/DML/tree/master/logistic_regression)


## References
* [Galen Andrew and Jianfeng Gao. 2007](http://research.microsoft.com/en-us/downloads/b1eb1016-1738-4bd5-83a9-370c9d498a03/default.aspx)
--------------------------------------------------------------------------------
/TerminationCriterion.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strint/LogisticRegression_OWLQN_Notes/af82f1f4fb818f19600efa5000a378950b720ce7/TerminationCriterion.cpp
--------------------------------------------------------------------------------
/TerminationCriterion.h:
--------------------------------------------------------------------------------
#pragma once

#include <deque>
#include <ostream>
#include <vector>

class OptimizerState;

struct TerminationCriterion {
    virtual double GetValue(const OptimizerState& state, std::ostream& message) = 0;
    virtual ~TerminationCriterion() { }
};

class RelativeMeanImprovementCriterion : public TerminationCriterion {
    const int numItersToAvg;
    std::deque<double> prevVals;

public:
    RelativeMeanImprovementCriterion(int numItersToAvg = 5) : numItersToAvg(numItersToAvg) {}

    double GetValue(const OptimizerState& state, std::ostream& message);
};
--------------------------------------------------------------------------------
/leastSquares.cpp:
--------------------------------------------------------------------------------
#include "leastSquares.h"

#include <iostream>
#include <sstream>
#include <string>

using namespace std;

LeastSquaresProblem::LeastSquaresProblem(const char* matFilename, const char* bFilename) {
    ifstream matfile(matFilename);
    if (!matfile.good()) {
        cerr << "error opening matrix file " << matFilename << endl;
        exit(1);
    }

    string s;
    getline(matfile, s);
    if (!s.compare("%%MatrixMarket matrix array real general")) {
        skipEmptyAndComment(matfile, s);
        stringstream st(s);
        st >> m >> n;
        Amat.resize(m * n);

        // Array format is column-major: read feature j for all m instances
        for (size_t j=0; j<n; j++) {
            for (size_t i=0; i<m; i++) {
                double val;
                matfile >> val;
                A(i, j) = val;
            }
        }

        matfile.close();
    } else {
        matfile.close();
        cerr << "Unsupported matrix format \"" << s << "\" in " << matFilename << endl;
        exit(1);
    }

    ifstream bFile(bFilename);
    if (!bFile.good()) {
        cerr << "error opening y-value file " << bFilename << endl;
        exit(1);
    }
    getline(bFile, s);
    if (s.compare("%%MatrixMarket matrix array real general")) {
        bFile.close();
        cerr << "unsupported y-value file format \"" << s << "\" in " << bFilename << endl;
        exit(1);
    }

    skipEmptyAndComment(bFile, s);
    stringstream bst(s);
    size_t bNum, bCol;
    bst >> bNum >> bCol;
    if (bNum != m) {
        cerr << "number of y-values doesn't match number of instances in " << bFilename << endl;
        exit(1);
    } else if (bCol != 1) {
        cerr << "y-value matrix may not have more than one column" << endl;
        exit(1);
    }

    b.resize(m);
    for (size_t i=0; i<m; i++) {
        double val;
        bFile >> val;
        b[i] = val;
    }
    bFile.close();
}
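// (Annotation, not part of the original source.) Eval below computes the
// L2-regularized least-squares objective
//     f(x) = 0.5 * ( ||A x - b||^2 + l2weight * ||x||^2 )
// and writes its gradient
//     grad = A^T (A x - b) + l2weight * x
// into `gradient`. The L1 term is applied inside OWL-QN itself (see
// OptimizerState::EvalL1), never inside the objective.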

double LeastSquaresObjective::Eval(const DblVec& input, DblVec& gradient) {
    static DblVec temp(problem.m);

    if (input.size() != problem.n) {
        cerr << "Error: input is not the correct size." << endl;
        exit(1);
    }

    // temp = A * input - b (the residual vector)
    for (size_t i=0; i<problem.m; i++) {
        temp[i] = -problem.b[i];
        for (size_t j=0; j<problem.n; j++) {
            temp[i] += problem.A(i, j) * input[j];
        }
    }

    double value = 0.0;
    // L2 regularization term and its gradient
    for (size_t j=0; j<problem.n; j++) {
        value += l2weight * input[j] * input[j];
        gradient[j] = l2weight * input[j];
    }

    // Squared residual, and the gradient contribution A^T (A * input - b)
    for (size_t i=0; i<problem.m; i++) {
        value += temp[i] * temp[i];
        for (size_t j=0; j<problem.n; j++) {
            gradient[j] += problem.A(i, j) * temp[i];
        }
    }

    return 0.5 * value;
}
--------------------------------------------------------------------------------
/leastSquares.h:
--------------------------------------------------------------------------------
#pragma once

#include <fstream>
#include <string>

#include "OWLQN.h"

struct LeastSquaresObjective;

class LeastSquaresProblem {
    std::vector<float> Amat;
    std::vector<float> b;
    size_t m, n;

    void skipEmptyAndComment(std::ifstream& file, std::string& s) {
        do {
            std::getline(file, s);
        } while (s.size() == 0 || s[0] == '%');
    }

    friend struct LeastSquaresObjective;

public:
    LeastSquaresProblem(size_t m, size_t n) : Amat(m * n), b(m), m(m), n(n) { }

    LeastSquaresProblem(const char* matfile, const char* bFile);

    float A(size_t i, size_t j) const {
        return Amat[i + m * j];
    }

    float& A(size_t i, size_t j) {
        return Amat[i + m * j];
    }

    size_t NumFeats() const { return n; }
    size_t NumInstances() const { return m; }
};

struct LeastSquaresObjective : public DifferentiableFunction {
    const LeastSquaresProblem& problem;
    const double l2weight;

    LeastSquaresObjective(const LeastSquaresProblem& p, double l2weight = 0) : problem(p), l2weight(l2weight) { }

    double Eval(const DblVec& input, DblVec& gradient);
};
--------------------------------------------------------------------------------
/logreg.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strint/LogisticRegression_OWLQN_Notes/af82f1f4fb818f19600efa5000a378950b720ce7/logreg.cpp
--------------------------------------------------------------------------------
/logreg.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strint/LogisticRegression_OWLQN_Notes/af82f1f4fb818f19600efa5000a378950b720ce7/logreg.h
--------------------------------------------------------------------------------
/main.cpp:
--------------------------------------------------------------------------------
#include <cstdlib>
#include <cstring>
#include <iostream>

#include "OWLQN.h"
#include "leastSquares.h"
#include "logreg.h"

using namespace std;

void printUsageAndExit() {
    cout << "Orthant-Wise Limited-memory Quasi-Newton trainer" << endl;
    cout << "trains L1-regularized logistic regression or least-squares models" << endl << endl;
    cout << "usage: feature_file label_file regWeight output_file [options]" << endl;
    cout << "  feature_file   input feature matrix in Matrix Market format (mxn real coordinate or array)" << endl;
    cout << "                 rows represent features for each instance" << endl;
    cout << "  label_file     input instance labels in Matrix Market format (mx1 real array)" << endl;
    cout << "                 rows contain single real value" << endl;
    cout << "                 for logistic regression problems, value must be 1 or -1" << endl;
    cout << "  regWeight      coefficient of l1 regularizer" << endl;
    cout << "  output_file    output weight vector in Matrix Market format (1xm real array)" << endl << endl;
    cout << "options:" << endl;
    cout << "  -ls            use least squares formulation (logistic regression is default)" << endl;
    cout << "  -q             quiet.  Suppress all output" << endl;
    cout << "  -tol <value>   sets convergence tolerance (default is 1e-4)" << endl;
    cout << "  -m <value>     sets L-BFGS memory parameter (default is 10)" << endl;
    cout << "  -l2weight <value>" << endl;
    cout << "                 sets L2 regularization weight (default is 0)" << endl;
    cout << endl;
    system("pause");
    exit(0);
}
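// (Annotation, not part of the original source.) A minimal example of the
// expected inputs in Matrix Market "array" format, for 2 instances with
// 2 features. Array format is column-major: the first m values are feature 1
// for all m instances, the next m values are feature 2, and so on (see how
// LeastSquaresProblem fills A(i, j) in leastSquares.cpp).
//
// feature_file:
//   %%MatrixMarket matrix array real general
//   2 2
//   1.0
//   0.5
//   0.0
//   2.0
//
// label_file (labels must be 1 or -1 for logistic regression):
//   %%MatrixMarket matrix array real general
//   2 1
//   1
//   -1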

void printVector(const DblVec &vec, const char* filename) {
    ofstream outfile(filename);
    if (!outfile.good()) {
        cerr << "error opening matrix file " << filename << endl;
        exit(1);
    }
    outfile << "%%MatrixMarket matrix array real general" << endl;
    outfile << "1 " << vec.size() << endl;
    for (size_t i=0; i<vec.size(); i++) {
        outfile << vec[i] << endl;
    }
    outfile.close();
}

int main(int argc, char** argv) {
    if (argc < 5) printUsageAndExit();

    const char* feature_file = argv[1];
    const char* label_file = argv[2];
    double regweight = atof(argv[3]);
    const char* output_file = argv[4];

    if (regweight < 0) {
        cout << "L1 regularization weight must be non-negative." << endl;
        exit(1);
    }

    bool leastSquares = false, quiet = false;
    double tol = 1e-4, l2weight = 0;
    int m = 10;

    for (int i=5; i<argc; i++) {
        if (!strcmp(argv[i], "-ls")) {
            // use the least-squares formulation
            leastSquares = true;
        } else if (!strcmp(argv[i], "-q")) {
            // suppress all output
            quiet = true;
        } else if (!strcmp(argv[i], "-tol")) {
            // read the convergence tolerance
            ++i;
            if (i >= argc || (tol = atof(argv[i])) <= 0) {
                cout << "-tol (convergence tolerance) flag requires 1 positive real argument." << endl;
                exit(1);
            }
        } else if (!strcmp(argv[i], "-l2weight")) {
            // read the weight of the L2 regularizer
            ++i;
            if (i >= argc || (l2weight = atof(argv[i])) < 0) {
                cout << "-l2weight flag requires 1 non-negative real argument." << endl;
                exit(1);
            }
        } else if (!strcmp(argv[i], "-m")) {
            // read the number of L-BFGS memory entries
            ++i;
            if (i >= argc || (m = atoi(argv[i])) == 0) {
                cout << "-m (L-BFGS memory param) flag requires 1 positive int argument." << endl;
                exit(1);
            }
        } else {
            cerr << "unrecognized argument: " << argv[i] << endl;
            exit(1);
        }
    }

    if (!quiet) {
        cout << argv[0] << " called with arguments " << endl << "   ";
        for (int i=1; i<argc; i++) {
            cout << argv[i] << " ";
        }
        cout << endl;
    }

    DifferentiableFunction *obj = NULL;
    size_t size = 0;

    if (leastSquares) {
        // Load the data into a least-squares problem
        LeastSquaresProblem *prob = new LeastSquaresProblem(feature_file, label_file);
        obj = new LeastSquaresObjective(*prob, l2weight);
        size = prob->NumFeats();
    } else {
        // Load the data into a logistic regression problem
        LogisticRegressionProblem *prob = new LogisticRegressionProblem(feature_file, label_file);
        obj = new LogisticRegressionObjective(*prob, l2weight);
        size = prob->NumFeats();
    }

    // size is the feature dimension; init is the initial parameter vector,
    // ans the resulting parameter vector
    DblVec init(size), ans(size);

    OWLQN opt(quiet);
    // Arguments, in order: the objective (holding the data, the L2 weight, and
    // the loss function), the initial parameters, the output parameters, the
    // L1 weight, the convergence tolerance, and the L-BFGS memory size
    opt.Minimize(*obj, init, ans, regweight, tol, m);

    // Count the non-zero entries of the learned weight vector
    int nonZero = 0;
    for (size_t i = 0; i < ans.size(); i++) {
        if (ans[i] != 0) nonZero++;
    }

    if (!quiet) cout << "Finished with optimization.  " << nonZero << "/" << size << " non-zero weights." << endl;

    printVector(ans, output_file);

    return 0;
}
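// (Annotation, not part of the original source.) Example invocation, assuming
// the built binary is named `owlqn` (the actual name depends on the build):
//
//   owlqn features.mtx labels.mtx 1.0 weights.mtx -tol 1e-5 -m 10
//
// This trains L1-regularized logistic regression with regWeight = 1.0 and
// writes the learned weight vector to weights.mtx in Matrix Market format.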