(p, n_bregman_iterations);
342 | problem = p;
343 | firstCall = true;
344 | this->gradient_method = gradient_method;
345 | // Optimization is done in the log-domain (to keep positive values)
346 | exp_weight = true;
347 | // Exportation of pdfs
348 | scaleDictionary = scale_dictionary;
349 | exportAtoms = export_atoms;
350 | exportFittings = export_fittings;
351 | exportOnlyFinalSolution = export_only_final_solution;
352 | warmRestart = warm_restart;
353 | exportEveryMIter = 1;
354 | lbfgs_parameter_init(&lbfgs_param);
355 | }
356 |
// Destructor: deletes the object pointed to by bary_computation.
// NOTE(review): presumably that object is allocated with `new` in the
// constructor (whose start is not visible in this section) -- confirm.
// NOTE(review): no copy constructor / copy assignment is visible here; if the
// class can be copied, two instances would delete the same pointer -- verify.
~WassersteinRegression() {
    delete bary_computation;
}
360 |
361 |
362 | void regress_both(double* solution) {
363 | double residual;
364 | firstCall = true;
365 | lbfgs_param.linesearch = LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE; // LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE; // LBFGS_LINESEARCH_BACKTRACKING_WOLFE; // LBFGS_LINESEARCH_BACKTRACKING_ARMIJO;// LBFGS_LINESEARCH_MORETHUENTE;
366 | lbfgs_param.epsilon = 1E-50; // Convergence test on accuracy
367 | lbfgs_param.max_linesearch = 20; // Max number of trials for the line search
368 | lbfgs_param.delta = 1E-50; // Convergence test on minimum rate of decrease
369 | if(warmRestart)
370 | lbfgs_param.max_iterations = 10; // Number of iterations when doing multiple small runs of LBFGS
371 | else
372 | lbfgs_param.max_iterations = wrTotalIteration;
373 |
374 | iteration = 0;
375 |
376 | int P = problem->num_pdfs;
377 | int n = K*P + K*N;
378 | double scaleDic = scaleDictionary; // Scale between dictionary and weight sizes
379 |
380 | if(warmRestart)
381 | {
382 | b_storage.resize(K*N*P); // Initialize the storage vector for warm restart
383 | std::fill(b_storage.begin(), b_storage.begin()+K*N*P, 1.0);
384 | b_temp.resize(K*N*P);
385 | b.resize(K*N*(this->bary_computation->Niters+1)); // b initialization
386 | std::fill(b.begin(), b.begin()+K*N, 1.0);
387 | }
388 |
389 | // Transfer the data to the log domain
390 | if (exp_weight) {
391 | for (int i=0; iwarmRestart)
419 | {
420 | // Store the last computed scalings
421 | std::copy(this->b_temp.begin(), this->b_temp.end(),this->b_storage.begin());
422 | }
423 | }
424 | std::vector solution2(solution,solution+P*K+K*N);
425 |
426 | // Transfer the data back to the original domain
427 | if (exp_weight) {
428 | for (int i=0; i base_result(N);
467 | for (int i=0; i gradient(K*P+N*K); // dummy
487 | // Force exporting
488 | int M = exportEveryMIter;
489 | bool final = exportOnlyFinalSolution;
490 | exportOnlyFinalSolution = false;
491 | exportEveryMIter = 1;
492 | // Compute barycenter (and gradients but not used)
493 | evaluate_both(this,&solution2[0],&gradient[0],K*P+N*K,0);
494 | // Revert changes made to force exporting
495 | exportOnlyFinalSolution = final;
496 | exportEveryMIter = M;
497 | std::string dir = exporter.mOutputFolderPath + "/";
498 | std::ostringstream ss;
499 | ss << "i-fitting_" << std::setfill('0') << std::setw(3) << iteration;
500 | std::string oldname(ss.str());
501 | std::vector files = getFileListByPattern(dir + oldname + "*");
502 | for (int i=0; iNiters), std::min(iteration*1, bary_computation->Niters));
546 | const int n_iter_gradients = bary_computation->Niters;
547 |
548 | std::vector > a(K*N), u(K*N), convu(K*N);
549 | //std::vector > barycenter(N);
550 |
551 | if(!warmRestart)
552 | {
553 | b.resize(K*N*(n_iter_gradients+1));
554 | std::fill(b.begin(), b.begin()+K*N, 1.0);
555 | }
556 |
557 | conv_b.resize(K*N*n_iter_gradients);
558 | phi.resize(K*N*(n_iter_gradients+1));
559 | g.resize(N);
560 | r.resize(K*N); memset(&r[0], 0, K*N*sizeof(r[0]));
561 |
562 | problem->normalize_values();
563 |
564 | // Bregman Projections
565 | for (int iter=1; iter<=n_iter_gradients; iter++) {
566 |
567 | //#pragma omp parallel for firstprivate(a)
568 | problem->kernel.convolveAdjoint(&b[(iter-1)*K*N], &conv_b[(size_t)((iter-1)*K*N)], K);
569 | for (int i=0; ipdfs[i][0*N + j] / conv_b[offset+j];
573 | }
574 | }
575 | problem->kernel.convolve(&a[0], &phi[iter*K*N], K);
576 |
577 | memset(&barycenter[0], 0, N*sizeof(barycenter[0]));
578 | for (int i=0; ilambdas[i];
580 | for (int j=0; j > n(N), v(K*N,0.0), tmp(K*N), c(K*N), sumv(N);
597 | loss.gradient(&barycenter[0], &problem->observed_pdf[id*N], N, &g[0]);
598 |
599 | // gradient w.r.t dictionary
600 | memset(resultDic, 0, K*N*sizeof(resultDic[0]));
601 |
602 | /*if (iteration%2==0)*/ {
603 | memcpy(&n[0], &g[0], N*sizeof(double));
604 | for (int sub_iter = n_iter_gradients; sub_iter>=1; sub_iter--) {
605 | memset(&sumv[0], 0, N*sizeof(sumv[0]));
606 | for (int i=0; ilambdas[i]*n[j]-v[i*N+j]) * b[sub_iter*K*N + i*N+j];
609 | }
610 | }
611 | problem->kernel.convolve(&tmp[0], &c[0], K);
612 | for (int i=0; ipdfs[i][0*N + j]*c[i*N+j]/sqr(conv_b[(sub_iter-1)*K*N+i*N+j]);
616 | }
617 | }
618 | if(sub_iter==1) break;
619 | problem->kernel.convolveAdjoint(&tmp[0], &v[0], K);
620 | for (int i=0; i=1; sub_iter--) {
644 |
645 | //#pragma omp parallel for firstprivate(u, convu)
646 | for (int i=0; ilambdas[i]*g[j] - r[i*N+j])/phi[sub_iter*K*N + i*N+j];
651 | }
652 | resultLa[i] += dotp;
653 | }
654 | if (sub_iter!=1) {
655 | problem->kernel.convolve(&u[0], &convu[0], K);
656 | for (int i=0; ipdfs[i][0*N +j] / sqr(conv_b[(sub_iter-1)*K*N + i*N + j]);
659 | }
660 | }
661 | problem->kernel.convolveAdjoint(&convu[0], &r[0], K);
662 | for (int i=0; iNiters;
681 |
682 | std::vector > a(K*N), u(K*N), convu(K*N);
683 |
684 | if(!warmRestart)
685 | {
686 | b.resize(K*N*(n_iter_gradients+1));
687 | std::fill(b.begin(), b.begin()+K*N, 0.0);
688 | }
689 |
690 | conv_b.resize(K*N*n_iter_gradients);
691 | phi.resize(K*N*(n_iter_gradients+1));
692 | g.resize(N);
693 | r.resize(K*N); memset(&r[0], 0, K*N*sizeof(r[0]));
694 |
695 | problem->normalize_values();
696 |
697 | // Forward log, Backward log + sign array
698 |
699 | for (int iter=1; iter<=n_iter_gradients; iter++) {
700 |
701 | //#pragma omp parallel for firstprivate(a)
702 | problem->kernel.log_convolveAdjoint(&b[(iter-1)*K*N], &conv_b[(size_t)((iter-1)*K*N)], K);
703 | for (int i=0; ipdfs[i][0*N + j]) - conv_b[offset+j];
707 | }
708 | }
709 | problem->kernel.log_convolve(&a[0], &phi[iter*K*N], K);
710 |
711 | memset(&barycenter[0], 0, N*sizeof(barycenter[0]));
712 | for (int i=0; ilambdas[i];
714 | for (int j=0; j > n(N), v(K*N,0.0), tmp(K*N), c(K*N), sumv(N);
731 |
732 | loss.gradient(&barycenter[0], &problem->observed_pdf[id*N], N, &g[0]);
733 |
734 | // gradient w.r.t dictionary
735 |
736 | memset(resultDic, 0, K*N*sizeof(resultDic[0]));
737 | memcpy(&n[0], &g[0], N*sizeof(double));
738 |
739 | unsigned char * signArray = new unsigned char[(K*N+7)/8];
740 |
741 |
742 | for (int sub_iter = n_iter_gradients; sub_iter>=1; sub_iter--) {
743 | memset(&sumv[0], 0, N*sizeof(sumv[0]));
744 | for (int i=0; ilambdas[i]*n[j] + v[i*N+j], signArray, i*N+j) + b[sub_iter*K*N + i*N+j];
747 | }
748 | }
749 | problem->kernel.log_convolve_signArray(&tmp[0], signArray, &c[0], K);
750 | for (int i=0; ipdfs[i][0*N + j]) + c[i*N+j] - 2.0*(conv_b[(sub_iter-1)*K*N+i*N+j]);
756 | }
757 | }
758 | if(sub_iter==1) break;
759 | problem->kernel.log_convolve_signArrayAdjoint(&tmp[0], signArray, &v[0], K);
760 | for (int i=0; i=1; sub_iter--) {
781 |
782 | //#pragma omp parallel for firstprivate(u, convu)
783 | for (int i=0; ilambdas[i]*g[j] + r[i*N+j], signArray, i*N+j) - phi[sub_iter*K*N + i*N+j];
788 | }
789 | resultLa[i] += dotp;
790 | }
791 |
792 | if (sub_iter!=1) {
793 | problem->kernel.log_convolve_signArray(&u[0], signArray, &convu[0], K);
794 | for (int i=0; ipdfs[i][0*N +j]) - 2.0*(conv_b[(sub_iter-1)*K*N + i*N + j]);
798 | }
799 | }
800 | problem->kernel.log_convolve_signArrayAdjoint(&convu[0], signArray, &r[0], K);
801 | for (int i=0; iNiters;
820 |
821 | std::vector > c_b, c_conv_b, c_phi, c_g, c_r, c_bary;
822 | // std::vector > c_b, c_conv_b, c_phi, c_g, c_r;
823 |
824 | std::vector > a(K*N), u(K*N), convu(K*N);
825 | // std::fill(b.begin(), b.begin()+K*N, 0.0);
826 |
827 | g.resize(N);
828 | c_b.resize(K*N*(bary_computation->Niters+1));
829 | std::fill(c_b.begin(), c_b.begin()+K*N, 0.0);
830 | c_conv_b.resize(K*N*bary_computation->Niters);
831 | c_phi.resize(K*N*(bary_computation->Niters+1));
832 | c_g.resize(N);
833 | c_r.resize(K*N); memset(&c_r[0], 0, K*N*sizeof(c_r[0]));
834 | c_bary.resize(N);
835 | float logepsilon = (float)log(EPSILON);
836 |
837 | problem->normalize_values();
838 |
839 | // Bregman Projections
840 | for (int iter=1; iter<=n_iter_gradients; iter++) {
841 |
842 | //#pragma omp parallel for firstprivate(a)
843 | problem->kernel.log_convolveAdjoint(&c_b[(iter-1)*K*N], &c_conv_b[(size_t)((iter-1)*K*N)], K);
844 | for (int i=0; ipdfs[i][0*N + j]) - c_conv_b[offset+j];
848 | // a[i*N + j] = problem->pdfs[i][0*N + j] / conv_b[offset+j];
849 | }
850 | }
851 | problem->kernel.log_convolve(&a[0], &c_phi[iter*K*N], K);
852 | // problem->kernel.convolve(&a[0], &phi[iter*K*N], K);
853 |
854 | memset(&c_bary[0], 0, N*sizeof(c_bary[0]));
855 | for (int i=0; ilambdas[i];
857 | for (int j=0; j > n(N), v(K*N,0.0), tmp(K*N), c(K*N), sumv(N);
882 | // std::vector > n(N);
883 |
884 | loss.gradient(&barycenter[0], &problem->observed_pdf[id*N], N, &g[0]);
885 |
886 | for (int j=0; j=1; sub_iter--) {
897 | memset(&sumv[0], 0, N*sizeof(sumv[0]));
898 | for (int i=0; ilambdas[i]*n[j] - v[i*N+j]) + c_b[sub_iter*K*N + i*N+j];
901 | // tmp[i*N+j] = (problem->lambdas[i]*n[j]-v[i*N+j]) * b[sub_iter*K*N + i*N+j];
902 | }
903 | }
904 |
905 | problem->kernel.log_convolve(&tmp[0], &c[0], K);
906 | // problem->kernel.convolve(&tmp[0], &c[0], K);
907 | for (int i=0; ipdfs[i][0*N + j])+c[i*N+j] - 2.0f*(c_conv_b[(sub_iter-1)*K*N+i*N+j]);
912 | // tmp[i*N+j] = -problem->pdfs[i][0*N + j]*c[i*N+j]/sqr(conv_b[(sub_iter-1)*K*N+i*N+j]);
913 | }
914 | }
915 | if(sub_iter==1) break;
916 | problem->kernel.log_convolveAdjoint(&tmp[0], &v[0], K);
917 | // problem->kernel.convolveAdjoint(&tmp[0], &v[0], K);
918 |
919 | ComplexDataType minus = std::log((ComplexDataType)(-1.0f));
920 | for (int i=0; i=1; sub_iter--) {
942 |
943 | //#pragma omp parallel for firstprivate(u, convu)
944 | for (int i=0; ilambdas[i]*c_g[j] - c_r[i*N+j]) - c_phi[sub_iter*K*N + i*N+j];
950 | // u[i*N+j] = (problem->lambdas[i]*g[j] - r[i*N+j])/phi[sub_iter*K*N + i*N+j];
951 | }
952 | resultLa[i] += std::real(dotp);
953 | }
954 |
955 | if (sub_iter!=1) {
956 | problem->kernel.log_convolve(&u[0], &convu[0], K);
957 | // problem->kernel.convolve(&u[0], &convu[0], K);
958 | for (int i=0; ipdfs[i][0*N +j]) - 2.0f*(c_conv_b[(sub_iter-1)*K*N + i*N + j]);
961 | // convu[i*N+j] *= -problem->pdfs[i][0*N +j] / sqr(conv_b[(sub_iter-1)*K*N + i*N + j]);
962 | }
963 | }
964 | problem->kernel.log_convolveAdjoint(&convu[0], &c_r[0], K);
965 | // problem->kernel.convolveAdjoint(&convu[0], &r[0], K);
966 |
967 | ComplexDataType minus = std::log((ComplexDataType)(-1.0f));
968 | for (int i=0; i* regression = (WassersteinRegression*)(instance);
989 | Problem* prob = regression->problem;
990 | WassersteinBarycenter* bary = regression->bary_computation;
991 | int N = prob->N;
992 | int K = prob->K;
993 | int P = prob->num_pdfs;
994 | double scaleDic = regression->scaleDictionary;
995 | assert(n==P*K+K*N);
996 |
997 | // Transfer the data back to the original domain
998 | double* variables2 = new double[n];
999 | memcpy(variables2, variables, n*sizeof(double));
1000 | if (regression->exp_weight) {
1001 | for (int i=0; i s(P, 0);
1014 | for (int i=0; i
s2(K, 0.);
1024 | for (int i=0; ipdfs[i][j] = variables2[P*K+i*N+j];
1037 | }
1038 | }
1039 |
1040 | std::vector barycenter(N); // To store one barycenter
1041 | std::vector curGradientDic(K*N); // To store the gradient on dictionary
1042 | memset(gradient, 0, (K*P+N*K) * sizeof(gradient[0])); // Set all values of gradient to 0
1043 | lbfgsfloatval_t lossVal = 0;
1044 |
1045 | // Containers for histogram and filename batches
1046 | std::vector > barycentersBatch(P);
1047 | std::vector filenamesBatch(P);
1048 | // Check condition for exporting the histograms
1049 | bool exportHists = regression->exportFittings && !regression->exportOnlyFinalSolution && (regression->iteration % regression->exportEveryMIter == 0);
1050 |
1051 | // For each histogram in the input dataset
1052 | for (int id =0; idset_weights(&variables2[id*K]); // Store weights
1054 | //prob->normalize_values();
1055 | std::cout<<"|"<warmRestart)
1058 | {
1059 | // Use last scaling for initialization (warm restart)
1060 | std::copy(regression->b_storage.begin()+K*N*id, regression->b_storage.begin()+K*N*(id+1),regression->b.begin());
1061 | }
1062 |
1063 | // Compute the barycenter and both gradients
1064 | #ifndef COMPUTE_BARYCENTER_LOG
1065 | regression->compute_gradient_both(&gradient[id*K], &curGradientDic[0], &barycenter[0], id);
1066 | #else
1067 | regression->compute_gradient_both_log_signArray(&gradient[id*K], &curGradientDic[0], &barycenter[0], id);
1068 | #endif
1069 |
1070 | // Save data (barycenter and filenames) for exporting in batch later
1071 | if(exportHists)
1072 | {
1073 | barycentersBatch[id] = barycenter;
1074 | std::ostringstream ss;
1075 | ss << "i-fitting_" << std::setfill('0');
1076 | ss << std::setw(3) << regression->iteration << "_";
1077 | ss << std::setw(3) << id;
1078 | filenamesBatch[id] = ss.str();
1079 | }
1080 |
1081 | if(regression->warmRestart)
1082 | {
1083 | // Save last scaling b (for warm restart)
1084 | std::copy(regression->b.begin()+K*N*bary->Niters, regression->b.end(),regression->b_temp.begin()+K*N*id);
1085 | }
1086 |
1087 | // Compute the loss between the computed barycenter and the input histogram
1088 | // Total loss is the sum over all the input histograms
1089 | double currentLoss = regression->loss.loss(&barycenter[0], &prob->observed_pdf[id*N], prob->N);
1090 | // std::cout<<" "<exporter.exportHistogramsBatch(barycentersBatch,filenamesBatch);
1119 | }
1120 |
1121 | std::cout<exp_weight) {
1126 | for (int i=0; icompute_gradient_numeric(&test[0]);
1140 | std::cout<<"new iter-----------------"<* regression = (WassersteinRegression*)instance;
1151 | regression->iteration++;
1152 | if(regression->warmRestart)
1153 | printf("LBFGS Iteration %d, total iterations %d :\n", k,regression->iteration);
1154 | else
1155 | printf("Iteration %d:\n", k);
1156 | printf("time elapsed: %f (s)\n", regression->chrono.GetDiffMs()*0.001);
1157 | int K = regression->K;
1158 | int P = regression->problem->num_pdfs;
1159 | int N = regression->N;
1160 |
1161 | double scaleDic = regression->scaleDictionary;
1162 |
1163 | // Display fitting variables
1164 | std::cout<<"weights + 10 first values of first atom:"<exp_weight) {
1166 | for (int i=0; iexportAtoms && !(regression->exportOnlyFinalSolution) && (k % regression->exportEveryMIter == 0))
1188 | {
1189 | std::vector base_result(N);
1190 | for (int i=0; iiteration << "_";
1202 | ss << std::setw(3) << i;
1203 | std::string filename(ss.str());
1204 | regression->exporter.exportHistogram(base_result,filename);
1205 | }
1206 | }
1207 | return 0;
1208 | }
1209 |
1210 | // Storage for warm restart
1211 | std::vector b_storage;
1212 | std::vector b_temp;
1213 | // Storage for intermediate results
1214 | std::vector > b, conv_b, phi, g, r;
1215 |
1216 | WassersteinBarycenter* bary_computation;
1217 | Problem* problem;
1218 | size_t K, N;
1219 | std::vector prev_solution;
1220 | gradient_type gradient_method;
1221 | PerfChrono chrono;
1222 | const BaseLoss &loss;
1223 | lbfgs_parameter_t lbfgs_param;
1224 | bool firstCall;
1225 | bool exp_weight; // Holds whether the weights are in log-domain
1226 | int iteration;
1227 | int exportEveryMIter;
1228 | // For histogram exportation
1229 | const ExportHistogramBase &exporter;
1230 | // For dual regression (regress_both)
1231 | double scaleDictionary;
1232 | bool exportAtoms;
1233 | bool exportFittings;
1234 | bool exportOnlyFinalSolution; // works for atoms and fittings
1235 | // For warm restart
1236 | bool warmRestart;
1237 | int wrTotalIteration;
1238 | };
1239 |
--------------------------------------------------------------------------------
/cpp/lbfgs/arithmetic_ansi.h:
--------------------------------------------------------------------------------
1 | /*
2 | * ANSI C implementation of vector operations.
3 | *
4 | * Copyright (c) 2007-2010 Naoaki Okazaki
5 | * All rights reserved.
6 | *
7 | * Permission is hereby granted, free of charge, to any person obtaining a copy
8 | * of this software and associated documentation files (the "Software"), to deal
9 | * in the Software without restriction, including without limitation the rights
10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | * copies of the Software, and to permit persons to whom the Software is
12 | * furnished to do so, subject to the following conditions:
13 | *
14 | * The above copyright notice and this permission notice shall be included in
15 | * all copies or substantial portions of the Software.
16 | *
17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 | * THE SOFTWARE.
24 | */
25 |
26 | /* $Id$ */
27 |
28 | #include
29 | #include
30 |
/*
 * fsigndiff(x, y): evaluates to a nonzero value when the numbers pointed to
 * by x and y have different signs.
 *
 * - 32-bit IEEE build: XORs the raw 32-bit patterns of *x and *y and keeps
 *   only bit 31 (the mask 0x80000000U).
 * - Otherwise: multiplies *x by (*y / fabs(*y)) and tests whether the product
 *   is negative.
 *   NOTE(review): when *y is zero the quotient is 0/0; under IEEE arithmetic
 *   that is NaN and the comparison is false -- confirm callers never rely on
 *   a result for a zero *y.
 *
 * Both arguments are pointers; each is evaluated once per expansion.
 */
#if LBFGS_FLOAT == 32 && LBFGS_IEEE_FLOAT
#define fsigndiff(x, y) (((*(uint32_t*)(x)) ^ (*(uint32_t*)(y))) & 0x80000000U)
#else
#define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.)
#endif/*LBFGS_IEEE_FLOAT*/
36 |
/*
 * Allocate `size` bytes of zero-initialised memory.
 *
 * Returns the new block, or NULL when the allocation fails. The block is
 * released with vecfree().
 */
inline static void* vecalloc(size_t size)
{
    /* calloc hands back memory that is already cleared, or NULL on failure. */
    return calloc(1, size);
}
45 |
/*
 * Release a block previously returned by this header's vecalloc().
 * Simply forwards to free(), matching the malloc() used by vecalloc().
 */
inline static void vecfree(void *memblock)
{
    free(memblock);
}
50 |
51 | inline static void vecset(lbfgsfloatval_t *x, const lbfgsfloatval_t c, const int n)
52 | {
53 | int i;
54 |
55 | for (i = 0;i < n;++i) {
56 | x[i] = c;
57 | }
58 | }
59 |
60 | inline static void veccpy(lbfgsfloatval_t *y, const lbfgsfloatval_t *x, const int n)
61 | {
62 | int i;
63 |
64 | for (i = 0;i < n;++i) {
65 | y[i] = x[i];
66 | }
67 | }
68 |
69 | inline static void vecncpy(lbfgsfloatval_t *y, const lbfgsfloatval_t *x, const int n)
70 | {
71 | int i;
72 |
73 | for (i = 0;i < n;++i) {
74 | y[i] = -x[i];
75 | }
76 | }
77 |
78 | inline static void vecadd(lbfgsfloatval_t *y, const lbfgsfloatval_t *x, const lbfgsfloatval_t c, const int n)
79 | {
80 | int i;
81 |
82 | for (i = 0;i < n;++i) {
83 | y[i] += c * x[i];
84 | }
85 | }
86 |
87 | inline static void vecdiff(lbfgsfloatval_t *z, const lbfgsfloatval_t *x, const lbfgsfloatval_t *y, const int n)
88 | {
89 | int i;
90 |
91 | for (i = 0;i < n;++i) {
92 | z[i] = x[i] - y[i];
93 | }
94 | }
95 |
96 | inline static void vecscale(lbfgsfloatval_t *y, const lbfgsfloatval_t c, const int n)
97 | {
98 | int i;
99 |
100 | for (i = 0;i < n;++i) {
101 | y[i] *= c;
102 | }
103 | }
104 |
105 | inline static void vecmul(lbfgsfloatval_t *y, const lbfgsfloatval_t *x, const int n)
106 | {
107 | int i;
108 |
109 | for (i = 0;i < n;++i) {
110 | y[i] *= x[i];
111 | }
112 | }
113 |
114 | inline static void vecdot(lbfgsfloatval_t* s, const lbfgsfloatval_t *x, const lbfgsfloatval_t *y, const int n)
115 | {
116 | int i;
117 | *s = 0.;
118 | for (i = 0;i < n;++i) {
119 | *s += x[i] * y[i];
120 | }
121 | }
122 |
/*
 * Euclidean (L2) norm of the first n elements of x, written to *s.
 * Computed as sqrt(x . x): vecdot() leaves the sum of squares in *s, which is
 * then replaced by its square root.
 */
inline static void vec2norm(lbfgsfloatval_t* s, const lbfgsfloatval_t *x, const int n)
{
    vecdot(s, x, x, n);
    *s = (lbfgsfloatval_t)sqrt(*s);
}
128 |
/*
 * Reciprocal of the Euclidean norm of the first n elements of x, written to *s.
 * vec2norm() stores the norm in *s, which is then replaced by 1.0 / norm.
 * No check is made for a zero norm before dividing.
 */
inline static void vec2norminv(lbfgsfloatval_t* s, const lbfgsfloatval_t *x, const int n)
{
    vec2norm(s, x, n);
    *s = (lbfgsfloatval_t)(1.0 / *s);
}
134 |
--------------------------------------------------------------------------------
/cpp/lbfgs/arithmetic_sse_double.h:
--------------------------------------------------------------------------------
1 | /*
2 | * SSE2 implementation of vector operations (64bit double).
3 | *
4 | * Copyright (c) 2007-2010 Naoaki Okazaki
5 | * All rights reserved.
6 | *
7 | * Permission is hereby granted, free of charge, to any person obtaining a copy
8 | * of this software and associated documentation files (the "Software"), to deal
9 | * in the Software without restriction, including without limitation the rights
10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | * copies of the Software, and to permit persons to whom the Software is
12 | * furnished to do so, subject to the following conditions:
13 | *
14 | * The above copyright notice and this permission notice shall be included in
15 | * all copies or substantial portions of the Software.
16 | *
17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 | * THE SOFTWARE.
24 | */
25 |
26 | /* $Id$ */
27 |
28 | #include
29 | #ifndef __APPLE__
30 | #include
31 | #endif
32 | #include
33 |
34 | #if 1400 <= _MSC_VER
35 | #include
36 | #endif/*1400 <= _MSC_VER*/
37 |
38 | #if HAVE_EMMINTRIN_H
39 | #include
40 | #endif/*HAVE_EMMINTRIN_H*/
41 |
/*
 * Allocate `size` bytes of zero-filled memory for use with the SSE2 macros
 * in this header.
 *
 * The allocator depends on the platform:
 *   - MSVC:   _aligned_malloc with 16-byte alignment
 *   - Apple:  plain malloc (OS X always aligns on 16-byte boundaries)
 *   - other:  posix_memalign with 16-byte alignment
 *
 * Returns the block, or NULL when the allocation fails.
 */
inline static void* vecalloc(size_t size)
{
    void *block = NULL;

#if defined(_MSC_VER)
    block = _aligned_malloc(size, 16);
#elif defined(__APPLE__) /* OS X always aligns on 16-byte boundaries */
    block = malloc(size);
#else
    /* posix_memalign reports failure through a nonzero return value. */
    if (posix_memalign(&block, 16, size) != 0) {
        block = NULL;
    }
#endif

    if (block == NULL) {
        return NULL;
    }
    memset(block, 0, size);
    return block;
}
59 |
/*
 * Release a block returned by this header's vecalloc().
 * Uses _aligned_free under MSVC (where vecalloc uses _aligned_malloc) and
 * free() everywhere else (where vecalloc uses malloc / posix_memalign).
 */
inline static void vecfree(void *memblock)
{
#ifdef _MSC_VER
    _aligned_free(memblock);
#else
    free(memblock);
#endif
}
68 |
/*
 * fsigndiff(x, y): nonzero when the doubles pointed to by x and y have
 * different sign bits.
 *
 * _mm_movemask_pd packs the two sign bits into a 2-bit value (0..3). Adding 1
 * and masking with 0x002 yields 2 for the mixed-sign cases (1 and 2) and 0
 * when both signs agree (0 and 3).
 */
#define fsigndiff(x, y) \
    ((_mm_movemask_pd(_mm_set_pd(*(x), *(y))) + 1) & 0x002)
71 |
/*
 * vecset(x, c, n): store the constant c into x[0..].
 *
 * Each iteration writes 8 doubles with four aligned 128-bit stores, and the
 * loop advances by 8 while i < n. There is no tail handling: when n is not a
 * multiple of 8, elements up to the next multiple of 8 are written as well.
 * _mm_store_pd needs a 16-byte aligned address.
 * NOTE(review): presumably callers pad n and allocate x via vecalloc -- confirm.
 *
 * The macro declares its own `int i`, so argument expressions must not refer
 * to a caller variable named i.
 */
#define vecset(x, c, n) \
{ \
    int i; \
    __m128d XMM0 = _mm_set1_pd(c); \
    for (i = 0;i < (n);i += 8) { \
        _mm_store_pd((x)+i  , XMM0); \
        _mm_store_pd((x)+i+2, XMM0); \
        _mm_store_pd((x)+i+4, XMM0); \
        _mm_store_pd((x)+i+6, XMM0); \
    } \
}
83 |
/*
 * veccpy(y, x, n): copy x into y, 8 doubles per iteration.
 *
 * Uses aligned 128-bit loads and stores; the loop steps by 8 while i < n with
 * no tail handling, so both buffers are accessed up to the next multiple of 8.
 * NOTE(review): presumably callers guarantee 16-byte alignment and padding --
 * confirm.
 */
#define veccpy(y, x, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 8) { \
        __m128d XMM0 = _mm_load_pd((x)+i  ); \
        __m128d XMM1 = _mm_load_pd((x)+i+2); \
        __m128d XMM2 = _mm_load_pd((x)+i+4); \
        __m128d XMM3 = _mm_load_pd((x)+i+6); \
        _mm_store_pd((y)+i  , XMM0); \
        _mm_store_pd((y)+i+2, XMM1); \
        _mm_store_pd((y)+i+4, XMM2); \
        _mm_store_pd((y)+i+6, XMM3); \
    } \
}
98 |
/*
 * vecncpy(y, x, n): negated copy, y[k] = 0 - x[k], 8 doubles per iteration.
 *
 * The negation is done by subtracting each loaded value from a zeroed
 * register. Aligned loads/stores, step of 8, no tail handling (buffers are
 * accessed up to the next multiple of 8).
 * NOTE(review): presumably callers guarantee 16-byte alignment and padding --
 * confirm.
 */
#define vecncpy(y, x, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 8) { \
        __m128d XMM0 = _mm_setzero_pd(); \
        __m128d XMM1 = _mm_setzero_pd(); \
        __m128d XMM2 = _mm_setzero_pd(); \
        __m128d XMM3 = _mm_setzero_pd(); \
        __m128d XMM4 = _mm_load_pd((x)+i  ); \
        __m128d XMM5 = _mm_load_pd((x)+i+2); \
        __m128d XMM6 = _mm_load_pd((x)+i+4); \
        __m128d XMM7 = _mm_load_pd((x)+i+6); \
        XMM0 = _mm_sub_pd(XMM0, XMM4); \
        XMM1 = _mm_sub_pd(XMM1, XMM5); \
        XMM2 = _mm_sub_pd(XMM2, XMM6); \
        XMM3 = _mm_sub_pd(XMM3, XMM7); \
        _mm_store_pd((y)+i  , XMM0); \
        _mm_store_pd((y)+i+2, XMM1); \
        _mm_store_pd((y)+i+4, XMM2); \
        _mm_store_pd((y)+i+6, XMM3); \
    } \
}
121 |
/*
 * vecadd(y, x, c, n): scaled accumulate, y[k] += c * x[k].
 *
 * Processes 4 doubles per iteration (note: step of 4, unlike the step of 8 in
 * vecset/veccpy/vecncpy). Aligned loads/stores, no tail handling: buffers are
 * accessed up to the next multiple of 4.
 * NOTE(review): presumably callers guarantee 16-byte alignment and padding --
 * confirm.
 */
#define vecadd(y, x, c, n) \
{ \
    int i; \
    __m128d XMM7 = _mm_set1_pd(c); \
    for (i = 0;i < (n);i += 4) { \
        __m128d XMM0 = _mm_load_pd((x)+i  ); \
        __m128d XMM1 = _mm_load_pd((x)+i+2); \
        __m128d XMM2 = _mm_load_pd((y)+i  ); \
        __m128d XMM3 = _mm_load_pd((y)+i+2); \
        XMM0 = _mm_mul_pd(XMM0, XMM7); \
        XMM1 = _mm_mul_pd(XMM1, XMM7); \
        XMM2 = _mm_add_pd(XMM2, XMM0); \
        XMM3 = _mm_add_pd(XMM3, XMM1); \
        _mm_store_pd((y)+i  , XMM2); \
        _mm_store_pd((y)+i+2, XMM3); \
    } \
}
139 |
/*
 * vecdiff(z, x, y, n): element-wise difference, z[k] = x[k] - y[k].
 *
 * 8 doubles per iteration with aligned loads/stores; the loop steps by 8
 * while i < n and has no tail handling, so all three buffers are accessed up
 * to the next multiple of 8.
 * NOTE(review): presumably callers guarantee 16-byte alignment and padding --
 * confirm.
 */
#define vecdiff(z, x, y, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 8) { \
        __m128d XMM0 = _mm_load_pd((x)+i  ); \
        __m128d XMM1 = _mm_load_pd((x)+i+2); \
        __m128d XMM2 = _mm_load_pd((x)+i+4); \
        __m128d XMM3 = _mm_load_pd((x)+i+6); \
        __m128d XMM4 = _mm_load_pd((y)+i  ); \
        __m128d XMM5 = _mm_load_pd((y)+i+2); \
        __m128d XMM6 = _mm_load_pd((y)+i+4); \
        __m128d XMM7 = _mm_load_pd((y)+i+6); \
        XMM0 = _mm_sub_pd(XMM0, XMM4); \
        XMM1 = _mm_sub_pd(XMM1, XMM5); \
        XMM2 = _mm_sub_pd(XMM2, XMM6); \
        XMM3 = _mm_sub_pd(XMM3, XMM7); \
        _mm_store_pd((z)+i  , XMM0); \
        _mm_store_pd((z)+i+2, XMM1); \
        _mm_store_pd((z)+i+4, XMM2); \
        _mm_store_pd((z)+i+6, XMM3); \
    } \
}
162 |
/*
 * vecscale(y, c, n): scale in place, y[k] *= c.
 *
 * 4 doubles per iteration with aligned loads/stores; no tail handling, so y
 * is accessed up to the next multiple of 4.
 * NOTE(review): presumably callers guarantee 16-byte alignment and padding --
 * confirm.
 */
#define vecscale(y, c, n) \
{ \
    int i; \
    __m128d XMM7 = _mm_set1_pd(c); \
    for (i = 0;i < (n);i += 4) { \
        __m128d XMM0 = _mm_load_pd((y)+i  ); \
        __m128d XMM1 = _mm_load_pd((y)+i+2); \
        XMM0 = _mm_mul_pd(XMM0, XMM7); \
        XMM1 = _mm_mul_pd(XMM1, XMM7); \
        _mm_store_pd((y)+i  , XMM0); \
        _mm_store_pd((y)+i+2, XMM1); \
    } \
}
176 |
/*
 * vecmul(y, x, n): element-wise product in place, y[k] *= x[k].
 *
 * 8 doubles per iteration with aligned loads/stores; the loop steps by 8
 * while i < n and has no tail handling, so both buffers are accessed up to
 * the next multiple of 8.
 * NOTE(review): presumably callers guarantee 16-byte alignment and padding --
 * confirm.
 */
#define vecmul(y, x, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 8) { \
        __m128d XMM0 = _mm_load_pd((x)+i  ); \
        __m128d XMM1 = _mm_load_pd((x)+i+2); \
        __m128d XMM2 = _mm_load_pd((x)+i+4); \
        __m128d XMM3 = _mm_load_pd((x)+i+6); \
        __m128d XMM4 = _mm_load_pd((y)+i  ); \
        __m128d XMM5 = _mm_load_pd((y)+i+2); \
        __m128d XMM6 = _mm_load_pd((y)+i+4); \
        __m128d XMM7 = _mm_load_pd((y)+i+6); \
        XMM4 = _mm_mul_pd(XMM4, XMM0); \
        XMM5 = _mm_mul_pd(XMM5, XMM1); \
        XMM6 = _mm_mul_pd(XMM6, XMM2); \
        XMM7 = _mm_mul_pd(XMM7, XMM3); \
        _mm_store_pd((y)+i  , XMM4); \
        _mm_store_pd((y)+i+2, XMM5); \
        _mm_store_pd((y)+i+4, XMM6); \
        _mm_store_pd((y)+i+6, XMM7); \
    } \
}
199 |
200 |
201 |
/*
 * __horizontal_sum(r, rw): horizontal add of a register, using
 * single-precision (_ps) intrinsics in both variants.
 *
 * NOTE(review): none of the double-precision macros visible in this header
 * expand this helper, and it operates on float lanes rather than double
 * lanes -- it looks like a leftover from the float header; confirm before
 * relying on it here.
 * NOTE(review): the expansion is two or more bare statements (no enclosing
 * braces), so it is not safe as the body of an unbraced if/for.
 */
#if 3 <= __SSE__ || defined(__SSE3__)
/*
    Horizontal add with haddps SSE3 instruction. The work register (rw)
    is unused.
 */
#define __horizontal_sum(r, rw) \
    r = _mm_hadd_ps(r, r); \
    r = _mm_hadd_ps(r, r);

#else
/*
    Horizontal add with SSE instruction. The work register (rw) is used.
 */
#define __horizontal_sum(r, rw) \
    rw = r; \
    r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(1, 0, 3, 2)); \
    r = _mm_add_ps(r, rw); \
    rw = r; \
    r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(2, 3, 0, 1)); \
    r = _mm_add_ps(r, rw);

#endif
224 |
/*
 * vecdot(s, x, y, n): dot product of x and y, stored through the pointer s.
 *
 * Two independent accumulators (XMM0, XMM1) each collect products of 2
 * doubles per iteration (4 doubles per step). After the loop the accumulators
 * are added together, the high lane is folded onto the low lane via a
 * shuffle + add, and the low lane is written to *s.
 *
 * Aligned loads, step of 4 while i < n, no tail handling: x and y are read up
 * to the next multiple of 4, so any padding elements contribute to the sum.
 * NOTE(review): presumably callers zero-pad and 16-byte align the buffers --
 * confirm.
 */
#define vecdot(s, x, y, n) \
{ \
    int i; \
    __m128d XMM0 = _mm_setzero_pd(); \
    __m128d XMM1 = _mm_setzero_pd(); \
    __m128d XMM2, XMM3, XMM4, XMM5; \
    for (i = 0;i < (n);i += 4) { \
        XMM2 = _mm_load_pd((x)+i  ); \
        XMM3 = _mm_load_pd((x)+i+2); \
        XMM4 = _mm_load_pd((y)+i  ); \
        XMM5 = _mm_load_pd((y)+i+2); \
        XMM2 = _mm_mul_pd(XMM2, XMM4); \
        XMM3 = _mm_mul_pd(XMM3, XMM5); \
        XMM0 = _mm_add_pd(XMM0, XMM2); \
        XMM1 = _mm_add_pd(XMM1, XMM3); \
    } \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    XMM1 = _mm_shuffle_pd(XMM0, XMM0, _MM_SHUFFLE2(1, 1)); \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    _mm_store_sd((s), XMM0); \
}
246 |
/*
 * vec2norm(s, x, n): Euclidean norm of x, stored through the pointer s.
 *
 * Same accumulation scheme as vecdot with both operands taken from x: squares
 * are summed in two accumulators (4 doubles per step), the lanes are folded
 * together, the square root is taken, and the low lane is written to *s.
 *
 * Aligned loads, step of 4 while i < n, no tail handling: x is read up to the
 * next multiple of 4, so any padding elements contribute to the sum.
 * NOTE(review): presumably callers zero-pad and 16-byte align x -- confirm.
 */
#define vec2norm(s, x, n) \
{ \
    int i; \
    __m128d XMM0 = _mm_setzero_pd(); \
    __m128d XMM1 = _mm_setzero_pd(); \
    __m128d XMM2, XMM3, XMM4, XMM5; \
    for (i = 0;i < (n);i += 4) { \
        XMM2 = _mm_load_pd((x)+i  ); \
        XMM3 = _mm_load_pd((x)+i+2); \
        XMM4 = XMM2; \
        XMM5 = XMM3; \
        XMM2 = _mm_mul_pd(XMM2, XMM4); \
        XMM3 = _mm_mul_pd(XMM3, XMM5); \
        XMM0 = _mm_add_pd(XMM0, XMM2); \
        XMM1 = _mm_add_pd(XMM1, XMM3); \
    } \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    XMM1 = _mm_shuffle_pd(XMM0, XMM0, _MM_SHUFFLE2(1, 1)); \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    XMM0 = _mm_sqrt_pd(XMM0); \
    _mm_store_sd((s), XMM0); \
}
269 |
270 |
/*
 * vec2norminv(s, x, n): reciprocal of the Euclidean norm of x, stored through
 * the pointer s.
 *
 * Sums the squares of x exactly as vec2norm does, takes the square root, then
 * divides a register holding 1.0 by it and writes the low lane to *s. There
 * is no check for a zero norm before the division.
 *
 * Aligned loads, step of 4 while i < n, no tail handling: x is read up to the
 * next multiple of 4, so any padding elements contribute to the sum.
 * NOTE(review): presumably callers zero-pad and 16-byte align x -- confirm.
 */
#define vec2norminv(s, x, n) \
{ \
    int i; \
    __m128d XMM0 = _mm_setzero_pd(); \
    __m128d XMM1 = _mm_setzero_pd(); \
    __m128d XMM2, XMM3, XMM4, XMM5; \
    for (i = 0;i < (n);i += 4) { \
        XMM2 = _mm_load_pd((x)+i  ); \
        XMM3 = _mm_load_pd((x)+i+2); \
        XMM4 = XMM2; \
        XMM5 = XMM3; \
        XMM2 = _mm_mul_pd(XMM2, XMM4); \
        XMM3 = _mm_mul_pd(XMM3, XMM5); \
        XMM0 = _mm_add_pd(XMM0, XMM2); \
        XMM1 = _mm_add_pd(XMM1, XMM3); \
    } \
    XMM2 = _mm_set1_pd(1.0); \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    XMM1 = _mm_shuffle_pd(XMM0, XMM0, _MM_SHUFFLE2(1, 1)); \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    XMM0 = _mm_sqrt_pd(XMM0); \
    XMM2 = _mm_div_pd(XMM2, XMM0); \
    _mm_store_sd((s), XMM2); \
}
295 |
--------------------------------------------------------------------------------
/cpp/lbfgs/arithmetic_sse_float.h:
--------------------------------------------------------------------------------
1 | /*
2 | * SSE/SSE3 implementation of vector operations (32bit float).
3 | *
4 | * Copyright (c) 2007-2010 Naoaki Okazaki
5 | * All rights reserved.
6 | *
7 | * Permission is hereby granted, free of charge, to any person obtaining a copy
8 | * of this software and associated documentation files (the "Software"), to deal
9 | * in the Software without restriction, including without limitation the rights
10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | * copies of the Software, and to permit persons to whom the Software is
12 | * furnished to do so, subject to the following conditions:
13 | *
14 | * The above copyright notice and this permission notice shall be included in
15 | * all copies or substantial portions of the Software.
16 | *
17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 | * THE SOFTWARE.
24 | */
25 |
26 | /* $Id$ */
27 |
28 | #include
29 | #ifndef __APPLE__
30 | #include
31 | #endif
32 | #include
33 |
34 | #if 1400 <= _MSC_VER
35 | #include
36 | #endif/*_MSC_VER*/
37 |
38 | #if HAVE_XMMINTRIN_H
39 | #include
40 | #endif/*HAVE_XMMINTRIN_H*/
41 |
/*
 * fsigndiff(x, y): evaluates to a nonzero value when the numbers pointed to
 * by x and y have different signs.
 *
 * - 32-bit IEEE build: XORs the raw 32-bit patterns of *x and *y and keeps
 *   only bit 31 (the mask 0x80000000U).
 * - Otherwise: multiplies *x by (*y / fabs(*y)) and tests whether the product
 *   is negative.
 *   NOTE(review): when *y is zero the quotient is 0/0; under IEEE arithmetic
 *   that is NaN and the comparison is false -- confirm callers never rely on
 *   a result for a zero *y.
 */
#if LBFGS_FLOAT == 32 && LBFGS_IEEE_FLOAT
#define fsigndiff(x, y) (((*(uint32_t*)(x)) ^ (*(uint32_t*)(y))) & 0x80000000U)
#else
#define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.)
#endif/*LBFGS_IEEE_FLOAT*/
47 |
/*
 * Allocate `size` bytes of zero-filled memory for use with the SSE macros in
 * this header.
 *
 * The allocator depends on the platform:
 *   - MSVC:   _aligned_malloc with 16-byte alignment
 *   - Apple:  plain malloc (OS X always aligns on 16-byte boundaries)
 *   - other:  posix_memalign with 16-byte alignment
 *
 * Returns the block, or NULL when the allocation fails.
 */
inline static void* vecalloc(size_t size)
{
    void *block = NULL;

#if defined(_MSC_VER)
    block = _aligned_malloc(size, 16);
#elif defined(__APPLE__) /* OS X always aligns on 16-byte boundaries */
    block = malloc(size);
#else
    /* posix_memalign reports failure through a nonzero return value. */
    if (posix_memalign(&block, 16, size) != 0) {
        block = NULL;
    }
#endif

    if (block == NULL) {
        return NULL;
    }
    memset(block, 0, size);
    return block;
}
65 |
/*
 * Release a block returned by this header's vecalloc().
 *
 * The deallocator must match the allocator vecalloc() picked:
 *   - MSVC:            _aligned_malloc  -> _aligned_free
 *   - everything else: malloc / posix_memalign -> free
 *
 * Calling _aligned_free unconditionally does not build outside MSVC and
 * would not match the non-MSVC allocators, so the call is selected with the
 * same _MSC_VER test that vecalloc() uses.
 */
inline static void vecfree(void *memblock)
{
#ifdef _MSC_VER
    _aligned_free(memblock);
#else
    free(memblock);
#endif
}
70 |
/*
 * vecset(x, c, n): store the constant c into x[0..].
 *
 * Each iteration writes 16 floats with four aligned 128-bit stores, and the
 * loop advances by 16 while i < n. There is no tail handling: when n is not a
 * multiple of 16, elements up to the next multiple of 16 are written as well.
 * _mm_store_ps needs a 16-byte aligned address.
 * NOTE(review): presumably callers pad n and allocate x via vecalloc -- confirm.
 *
 * The macro declares its own `int i`, so argument expressions must not refer
 * to a caller variable named i.
 */
#define vecset(x, c, n) \
{ \
    int i; \
    __m128 XMM0 = _mm_set_ps1(c); \
    for (i = 0;i < (n);i += 16) { \
        _mm_store_ps((x)+i   , XMM0); \
        _mm_store_ps((x)+i+ 4, XMM0); \
        _mm_store_ps((x)+i+ 8, XMM0); \
        _mm_store_ps((x)+i+12, XMM0); \
    } \
}
82 |
83 | #define veccpy(y, x, n) /* y[i] = x[i] for i in [0, n); aligned loads from x, aligned stores to y */ \
84 | { \
85 | int i; \
86 | for (i = 0;i < (n);i += 16) { /* 16 floats per pass */ \
87 | __m128 XMM0 = _mm_load_ps((x)+i ); \
88 | __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
89 | __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
90 | __m128 XMM3 = _mm_load_ps((x)+i+12); \
91 | _mm_store_ps((y)+i , XMM0); \
92 | _mm_store_ps((y)+i+ 4, XMM1); \
93 | _mm_store_ps((y)+i+ 8, XMM2); \
94 | _mm_store_ps((y)+i+12, XMM3); \
95 | } \
96 | }
97 |
98 | #define vecncpy(y, x, n) /* y[i] = -x[i] for i in [0, n), negating by flipping the sign bit */ \
99 | { \
100 | int i; \
101 | const uint32_t mask = 0x80000000; /* bit pattern with only the float sign bit set */ \
102 | __m128 XMM4 = _mm_load_ps1((float*)&mask); /* sign mask replicated into all four lanes */ \
103 | for (i = 0;i < (n);i += 16) { /* 16 floats per pass */ \
104 | __m128 XMM0 = _mm_load_ps((x)+i ); \
105 | __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
106 | __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
107 | __m128 XMM3 = _mm_load_ps((x)+i+12); \
108 | XMM0 = _mm_xor_ps(XMM0, XMM4); /* XOR with the mask toggles each sign bit */ \
109 | XMM1 = _mm_xor_ps(XMM1, XMM4); \
110 | XMM2 = _mm_xor_ps(XMM2, XMM4); \
111 | XMM3 = _mm_xor_ps(XMM3, XMM4); \
112 | _mm_store_ps((y)+i , XMM0); \
113 | _mm_store_ps((y)+i+ 4, XMM1); \
114 | _mm_store_ps((y)+i+ 8, XMM2); \
115 | _mm_store_ps((y)+i+12, XMM3); \
116 | } \
117 | }
118 |
119 | #define vecadd(y, x, c, n) /* y[i] += c * x[i] for i in [0, n) */ \
120 | { \
121 | int i; \
122 | __m128 XMM7 = _mm_set_ps1(c); /* scale factor c in all four lanes */ \
123 | for (i = 0;i < (n);i += 8) { /* 8 floats per pass */ \
124 | __m128 XMM0 = _mm_load_ps((x)+i ); \
125 | __m128 XMM1 = _mm_load_ps((x)+i+4); \
126 | __m128 XMM2 = _mm_load_ps((y)+i ); \
127 | __m128 XMM3 = _mm_load_ps((y)+i+4); \
128 | XMM0 = _mm_mul_ps(XMM0, XMM7); /* c * x */ \
129 | XMM1 = _mm_mul_ps(XMM1, XMM7); \
130 | XMM2 = _mm_add_ps(XMM2, XMM0); /* y + c * x */ \
131 | XMM3 = _mm_add_ps(XMM3, XMM1); \
132 | _mm_store_ps((y)+i , XMM2); \
133 | _mm_store_ps((y)+i+4, XMM3); \
134 | } \
135 | }
136 |
137 | #define vecdiff(z, x, y, n) /* z[i] = x[i] - y[i] for i in [0, n) */ \
138 | { \
139 | int i; \
140 | for (i = 0;i < (n);i += 16) { /* 16 floats per pass */ \
141 | __m128 XMM0 = _mm_load_ps((x)+i ); \
142 | __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
143 | __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
144 | __m128 XMM3 = _mm_load_ps((x)+i+12); \
145 | __m128 XMM4 = _mm_load_ps((y)+i ); \
146 | __m128 XMM5 = _mm_load_ps((y)+i+ 4); \
147 | __m128 XMM6 = _mm_load_ps((y)+i+ 8); \
148 | __m128 XMM7 = _mm_load_ps((y)+i+12); \
149 | XMM0 = _mm_sub_ps(XMM0, XMM4); /* x - y, lane by lane */ \
150 | XMM1 = _mm_sub_ps(XMM1, XMM5); \
151 | XMM2 = _mm_sub_ps(XMM2, XMM6); \
152 | XMM3 = _mm_sub_ps(XMM3, XMM7); \
153 | _mm_store_ps((z)+i , XMM0); \
154 | _mm_store_ps((z)+i+ 4, XMM1); \
155 | _mm_store_ps((z)+i+ 8, XMM2); \
156 | _mm_store_ps((z)+i+12, XMM3); \
157 | } \
158 | }
159 |
160 | #define vecscale(y, c, n) /* y[i] *= c for i in [0, n) */ \
161 | { \
162 | int i; \
163 | __m128 XMM7 = _mm_set_ps1(c); /* scale factor c in all four lanes */ \
164 | for (i = 0;i < (n);i += 8) { /* 8 floats per pass */ \
165 | __m128 XMM0 = _mm_load_ps((y)+i ); \
166 | __m128 XMM1 = _mm_load_ps((y)+i+4); \
167 | XMM0 = _mm_mul_ps(XMM0, XMM7); \
168 | XMM1 = _mm_mul_ps(XMM1, XMM7); \
169 | _mm_store_ps((y)+i , XMM0); \
170 | _mm_store_ps((y)+i+4, XMM1); \
171 | } \
172 | }
173 |
174 | #define vecmul(y, x, n) /* y[i] *= x[i] for i in [0, n) (element-wise product) */ \
175 | { \
176 | int i; \
177 | for (i = 0;i < (n);i += 16) { /* 16 floats per pass */ \
178 | __m128 XMM0 = _mm_load_ps((x)+i ); \
179 | __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
180 | __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
181 | __m128 XMM3 = _mm_load_ps((x)+i+12); \
182 | __m128 XMM4 = _mm_load_ps((y)+i ); \
183 | __m128 XMM5 = _mm_load_ps((y)+i+ 4); \
184 | __m128 XMM6 = _mm_load_ps((y)+i+ 8); \
185 | __m128 XMM7 = _mm_load_ps((y)+i+12); \
186 | XMM4 = _mm_mul_ps(XMM4, XMM0); /* y * x, lane by lane */ \
187 | XMM5 = _mm_mul_ps(XMM5, XMM1); \
188 | XMM6 = _mm_mul_ps(XMM6, XMM2); \
189 | XMM7 = _mm_mul_ps(XMM7, XMM3); \
190 | _mm_store_ps((y)+i , XMM4); \
191 | _mm_store_ps((y)+i+ 4, XMM5); \
192 | _mm_store_ps((y)+i+ 8, XMM6); \
193 | _mm_store_ps((y)+i+12, XMM7); \
194 | } \
195 | }
196 |
197 |
198 |
199 | #if 3 <= __SSE__ || defined(__SSE3__)
200 | /*
201 | Horizontal add with the SSE3 haddps instruction: afterwards every lane of r
202 | holds the sum of its four original lanes. The work register (rw) is unused.
203 | */
204 | #define __horizontal_sum(r, rw) \
205 | r = _mm_hadd_ps(r, r); \
206 | r = _mm_hadd_ps(r, r);
207 |
208 | #else
209 | /*
210 | Horizontal add with plain SSE shuffles (same result in r); rw is overwritten.
211 | */
212 | #define __horizontal_sum(r, rw) \
213 | rw = r; \
214 | r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(1, 0, 3, 2)); /* swap the upper and lower lane pairs */ \
215 | r = _mm_add_ps(r, rw); \
216 | rw = r; \
217 | r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(2, 3, 0, 1)); /* swap the lanes within each pair */ \
218 | r = _mm_add_ps(r, rw);
219 |
220 | #endif
221 |
222 | #define vecdot(s, x, y, n) /* *s = sum of x[i] * y[i] for i in [0, n) */ \
223 | { \
224 | int i; \
225 | __m128 XMM0 = _mm_setzero_ps(); /* two independent partial-sum accumulators */ \
226 | __m128 XMM1 = _mm_setzero_ps(); \
227 | __m128 XMM2, XMM3, XMM4, XMM5; \
228 | for (i = 0;i < (n);i += 8) { /* 8 floats per pass */ \
229 | XMM2 = _mm_load_ps((x)+i ); \
230 | XMM3 = _mm_load_ps((x)+i+4); \
231 | XMM4 = _mm_load_ps((y)+i ); \
232 | XMM5 = _mm_load_ps((y)+i+4); \
233 | XMM2 = _mm_mul_ps(XMM2, XMM4); \
234 | XMM3 = _mm_mul_ps(XMM3, XMM5); \
235 | XMM0 = _mm_add_ps(XMM0, XMM2); \
236 | XMM1 = _mm_add_ps(XMM1, XMM3); \
237 | } \
238 | XMM0 = _mm_add_ps(XMM0, XMM1); /* merge the two accumulators */ \
239 | __horizontal_sum(XMM0, XMM1); /* reduce the four lanes to one total */ \
240 | _mm_store_ss((s), XMM0); /* write the lowest lane to *s */ \
241 | }
242 |
243 | #define vec2norm(s, x, n) /* *s = sqrt(sum of x[i]^2), via an rsqrt estimate plus one Newton-Raphson step */ \
244 | { \
245 | int i; \
246 | __m128 XMM0 = _mm_setzero_ps(); /* two independent partial-sum accumulators */ \
247 | __m128 XMM1 = _mm_setzero_ps(); \
248 | __m128 XMM2, XMM3; \
249 | for (i = 0;i < (n);i += 8) { /* 8 floats per pass */ \
250 | XMM2 = _mm_load_ps((x)+i ); \
251 | XMM3 = _mm_load_ps((x)+i+4); \
252 | XMM2 = _mm_mul_ps(XMM2, XMM2); \
253 | XMM3 = _mm_mul_ps(XMM3, XMM3); \
254 | XMM0 = _mm_add_ps(XMM0, XMM2); \
255 | XMM1 = _mm_add_ps(XMM1, XMM3); \
256 | } \
257 | XMM0 = _mm_add_ps(XMM0, XMM1); \
258 | __horizontal_sum(XMM0, XMM1); /* XMM0 = q, the sum of squares */ \
259 | XMM2 = XMM0; /* keep q for the final multiply */ \
260 | XMM1 = _mm_rsqrt_ss(XMM0); /* r = approximate 1/sqrt(q) */ \
261 | XMM3 = XMM1; \
262 | XMM1 = _mm_mul_ss(XMM1, XMM1); \
263 | XMM1 = _mm_mul_ss(XMM1, XMM3); \
264 | XMM1 = _mm_mul_ss(XMM1, XMM0); \
265 | XMM1 = _mm_mul_ss(XMM1, _mm_set_ss(-0.5f)); /* -0.5 * q * r^3 */ \
266 | XMM3 = _mm_mul_ss(XMM3, _mm_set_ss(1.5f)); \
267 | XMM3 = _mm_add_ss(XMM3, XMM1); /* refined r = 1.5 * r - 0.5 * q * r^3 */ \
268 | XMM3 = _mm_mul_ss(XMM3, XMM2); /* q * (1/sqrt(q)) = sqrt(q) */ \
269 | _mm_store_ss((s), XMM3); \
270 | }
271 |
272 | #define vec2norminv(s, x, n) /* *s = 1 / sqrt(sum of x[i]^2), via an rsqrt estimate plus one Newton-Raphson step */ \
273 | { \
274 | int i; \
275 | __m128 XMM0 = _mm_setzero_ps(); /* two independent partial-sum accumulators */ \
276 | __m128 XMM1 = _mm_setzero_ps(); \
277 | __m128 XMM2, XMM3; \
278 | for (i = 0;i < (n);i += 8) { /* each pass consumes 8 floats, so the stride must be 8 to visit every element */ \
279 | XMM2 = _mm_load_ps((x)+i ); \
280 | XMM3 = _mm_load_ps((x)+i+4); \
281 | XMM2 = _mm_mul_ps(XMM2, XMM2); \
282 | XMM3 = _mm_mul_ps(XMM3, XMM3); \
283 | XMM0 = _mm_add_ps(XMM0, XMM2); \
284 | XMM1 = _mm_add_ps(XMM1, XMM3); \
285 | } \
286 | XMM0 = _mm_add_ps(XMM0, XMM1); \
287 | __horizontal_sum(XMM0, XMM1); /* XMM0 = q, the sum of squares */ \
288 | XMM2 = XMM0; \
289 | XMM1 = _mm_rsqrt_ss(XMM0); /* r = approximate 1/sqrt(q) */ \
290 | XMM3 = XMM1; \
291 | XMM1 = _mm_mul_ss(XMM1, XMM1); \
292 | XMM1 = _mm_mul_ss(XMM1, XMM3); \
293 | XMM1 = _mm_mul_ss(XMM1, XMM0); \
294 | XMM1 = _mm_mul_ss(XMM1, _mm_set_ss(-0.5f)); /* -0.5 * q * r^3 */ \
295 | XMM3 = _mm_mul_ss(XMM3, _mm_set_ss(1.5f)); \
296 | XMM3 = _mm_add_ss(XMM3, XMM1); /* refined r = 1.5 * r - 0.5 * q * r^3 */ \
297 | _mm_store_ss((s), XMM3); \
298 | }
299 |
--------------------------------------------------------------------------------
/cpp/lbfgs/lbfgs.h:
--------------------------------------------------------------------------------
1 | /*
2 | * C library of Limited memory BFGS (L-BFGS).
3 | *
4 | * Copyright (c) 1990, Jorge Nocedal
5 | * Copyright (c) 2007-2010 Naoaki Okazaki
6 | * All rights reserved.
7 | *
8 | * Permission is hereby granted, free of charge, to any person obtaining a copy
9 | * of this software and associated documentation files (the "Software"), to deal
10 | * in the Software without restriction, including without limitation the rights
11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 | * copies of the Software, and to permit persons to whom the Software is
13 | * furnished to do so, subject to the following conditions:
14 | *
15 | * The above copyright notice and this permission notice shall be included in
16 | * all copies or substantial portions of the Software.
17 | *
18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 | * THE SOFTWARE.
25 | */
26 |
27 | /* $Id$ */
28 |
29 | #ifndef __LBFGS_H__
30 | #define __LBFGS_H__
31 |
32 | #ifdef __cplusplus
33 | extern "C" {
34 | #endif/*__cplusplus*/
35 |
36 | /*
37 | * Precision of floating-point values in bits: 32 (float) or 64 (double, the default).
38 | */
39 | #ifndef LBFGS_FLOAT
40 | #define LBFGS_FLOAT 64
41 | #endif/*LBFGS_FLOAT*/
42 |
43 | /*
44 | * Activate optimization routines for IEEE 754 floating-point values (on unless predefined).
45 | */
46 | #ifndef LBFGS_IEEE_FLOAT
47 | #define LBFGS_IEEE_FLOAT 1
48 | #endif/*LBFGS_IEEE_FLOAT*/
49 |
50 | #if LBFGS_FLOAT == 32
51 | typedef float lbfgsfloatval_t; /* single precision */
52 |
53 | #elif LBFGS_FLOAT == 64
54 | typedef double lbfgsfloatval_t; /* double precision */
55 |
56 | #else /* any other LBFGS_FLOAT value is rejected at compile time */
57 | #error "libLBFGS supports single (float; LBFGS_FLOAT = 32) or double (double; LBFGS_FLOAT=64) precision only."
58 |
59 | #endif
60 |
61 |
62 | /**
63 | * \addtogroup liblbfgs_api libLBFGS API
64 | * @{
65 | *
66 | * The libLBFGS API.
67 | */
68 |
69 | /**
70 | * Return values of lbfgs().
71 | *
72 | * Roughly speaking, a negative value indicates an error.
73 | */
74 | enum {
75 | /** L-BFGS reaches convergence. */
76 | LBFGS_SUCCESS = 0,
77 | LBFGS_CONVERGENCE = 0, /**< Same value as ::LBFGS_SUCCESS. */
78 | LBFGS_STOP, /**< Non-negative status (value 1); its exact stop condition is not documented in this header. */
79 | /** The initial variables already minimize the objective function. */
80 | LBFGS_ALREADY_MINIMIZED,
81 |
82 | /** Unknown error. */
83 | LBFGSERR_UNKNOWNERROR = -1024,
84 | /** Logic error. */
85 | LBFGSERR_LOGICERROR,
86 | /** Insufficient memory. */
87 | LBFGSERR_OUTOFMEMORY,
88 | /** The minimization process has been canceled. */
89 | LBFGSERR_CANCELED,
90 | /** Invalid number of variables specified. */
91 | LBFGSERR_INVALID_N,
92 | /** Invalid number of variables (for SSE) specified. */
93 | LBFGSERR_INVALID_N_SSE,
94 | /** The array x must be aligned to 16 (for SSE). */
95 | LBFGSERR_INVALID_X_SSE,
96 | /** Invalid parameter lbfgs_parameter_t::epsilon specified. */
97 | LBFGSERR_INVALID_EPSILON,
98 | /** Invalid parameter lbfgs_parameter_t::past specified. */
99 | LBFGSERR_INVALID_TESTPERIOD,
100 | /** Invalid parameter lbfgs_parameter_t::delta specified. */
101 | LBFGSERR_INVALID_DELTA,
102 | /** Invalid parameter lbfgs_parameter_t::linesearch specified. */
103 | LBFGSERR_INVALID_LINESEARCH,
104 | /** Invalid parameter lbfgs_parameter_t::min_step specified. */
105 | LBFGSERR_INVALID_MINSTEP,
106 | /** Invalid parameter lbfgs_parameter_t::max_step specified. */
107 | LBFGSERR_INVALID_MAXSTEP,
108 | /** Invalid parameter lbfgs_parameter_t::ftol specified. */
109 | LBFGSERR_INVALID_FTOL,
110 | /** Invalid parameter lbfgs_parameter_t::wolfe specified. */
111 | LBFGSERR_INVALID_WOLFE,
112 | /** Invalid parameter lbfgs_parameter_t::gtol specified. */
113 | LBFGSERR_INVALID_GTOL,
114 | /** Invalid parameter lbfgs_parameter_t::xtol specified. */
115 | LBFGSERR_INVALID_XTOL,
116 | /** Invalid parameter lbfgs_parameter_t::max_linesearch specified. */
117 | LBFGSERR_INVALID_MAXLINESEARCH,
118 | /** Invalid parameter lbfgs_parameter_t::orthantwise_c specified. */
119 | LBFGSERR_INVALID_ORTHANTWISE,
120 | /** Invalid parameter lbfgs_parameter_t::orthantwise_start specified. */
121 | LBFGSERR_INVALID_ORTHANTWISE_START,
122 | /** Invalid parameter lbfgs_parameter_t::orthantwise_end specified. */
123 | LBFGSERR_INVALID_ORTHANTWISE_END,
124 | /** The line-search step went out of the interval of uncertainty. */
125 | LBFGSERR_OUTOFINTERVAL,
126 | /** A logic error occurred; alternatively, the interval of uncertainty
127 | became too small. */
128 | LBFGSERR_INCORRECT_TMINMAX,
129 | /** A rounding error occurred; alternatively, no line-search step
130 | satisfies the sufficient decrease and curvature conditions. */
131 | LBFGSERR_ROUNDING_ERROR,
132 | /** The line-search step became smaller than lbfgs_parameter_t::min_step. */
133 | LBFGSERR_MINIMUMSTEP,
134 | /** The line-search step became larger than lbfgs_parameter_t::max_step. */
135 | LBFGSERR_MAXIMUMSTEP,
136 | /** The line-search routine reaches the maximum number of evaluations. */
137 | LBFGSERR_MAXIMUMLINESEARCH,
138 | /** The algorithm routine reaches the maximum number of iterations. */
139 | LBFGSERR_MAXIMUMITERATION,
140 | /** Relative width of the interval of uncertainty is at most
141 | lbfgs_parameter_t::xtol. */
142 | LBFGSERR_WIDTHTOOSMALL,
143 | /** A logic error (negative line-search step) occurred. */
144 | LBFGSERR_INVALIDPARAMETERS,
145 | /** The current search direction increases the objective function value. */
146 | LBFGSERR_INCREASEGRADIENT,
147 | };
148 |
149 | /**
150 | * Line search algorithms.
151 | */
152 | enum {
153 | /** The default algorithm (More-Thuente method). */
154 | LBFGS_LINESEARCH_DEFAULT = 0,
155 | /** More-Thuente method proposed by More and Thuente. */
156 | LBFGS_LINESEARCH_MORETHUENTE = 0,
157 | /**
158 | * Backtracking method with the Armijo condition.
159 | * The backtracking method finds the step length such that it satisfies
160 | * the sufficient decrease (Armijo) condition,
161 | * - f(x + a * d) <= f(x) + lbfgs_parameter_t::ftol * a * g(x)^T d,
162 | *
163 | * where x is the current point, d is the current search direction, and
164 | * a is the step length.
165 | */
166 | LBFGS_LINESEARCH_BACKTRACKING_ARMIJO = 1,
167 | /** The backtracking method with the default (regular Wolfe) condition. */
168 | LBFGS_LINESEARCH_BACKTRACKING = 2,
169 | /**
170 | * Backtracking method with regular Wolfe condition.
171 | * The backtracking method finds the step length such that it satisfies
172 | * both the Armijo condition (LBFGS_LINESEARCH_BACKTRACKING_ARMIJO)
173 | * and the curvature condition,
174 | * - g(x + a * d)^T d >= lbfgs_parameter_t::wolfe * g(x)^T d,
175 | *
176 | * where x is the current point, d is the current search direction, and
177 | * a is the step length.
178 | */
179 | LBFGS_LINESEARCH_BACKTRACKING_WOLFE = 2,
180 | /**
181 | * Backtracking method with strong Wolfe condition.
182 | * The backtracking method finds the step length such that it satisfies
183 | * both the Armijo condition (LBFGS_LINESEARCH_BACKTRACKING_ARMIJO)
184 | * and the following condition,
185 | * - |g(x + a * d)^T d| <= lbfgs_parameter_t::wolfe * |g(x)^T d|,
186 | *
187 | * where x is the current point, d is the current search direction, and
188 | * a is the step length.
189 | */
190 | LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 3,
191 | };
192 |
193 | /**
194 | * L-BFGS optimization parameters.
195 | * Call lbfgs_parameter_init() function to initialize parameters to the
196 | * default values.
197 | */
198 | typedef struct {
199 | /**
200 | * The number of corrections to approximate the inverse hessian matrix.
201 | * The L-BFGS routine stores the computation results of previous \ref m
202 | * iterations to approximate the inverse hessian matrix of the current
203 | * iteration. This parameter controls the size of the limited memories
204 | * (corrections). The default value is \c 6. Values less than \c 3 are
205 | * not recommended. Large values will result in excessive computing time.
206 | */
207 | int m;
208 |
209 | /**
210 | * Epsilon for convergence test.
211 | * This parameter determines the accuracy with which the solution is to
212 | * be found. A minimization terminates when
213 | * ||g|| < \ref epsilon * max(1, ||x||),
214 | * where ||.|| denotes the Euclidean (L2) norm. The default value is
215 | * \c 1e-5.
216 | */
217 | lbfgsfloatval_t epsilon;
218 |
219 | /**
220 | * Distance for delta-based convergence test.
221 | * This parameter determines the distance, in iterations, to compute
222 | * the rate of decrease of the objective function. If the value of this
223 | * parameter is zero, the library does not perform the delta-based
224 | * convergence test. The default value is \c 0.
225 | */
226 | int past;
227 |
228 | /**
229 | * Delta for convergence test.
230 | * This parameter determines the minimum rate of decrease of the
231 | * objective function. The library stops iterations when the
232 | * following condition is met:
233 | * (f' - f) / f < \ref delta,
234 | * where f' is the objective value of \ref past iterations ago, and f is
235 | * the objective value of the current iteration.
236 | * The default value is \c 0.
237 | */
238 | lbfgsfloatval_t delta;
239 |
240 | /**
241 | * The maximum number of iterations.
242 | * The lbfgs() function terminates an optimization process with
243 | * ::LBFGSERR_MAXIMUMITERATION status code when the iteration count
244 | * exceeds this parameter. Setting this parameter to zero continues an
245 | * optimization process until a convergence or error. The default value
246 | * is \c 0.
247 | */
248 | int max_iterations;
249 |
250 | /**
251 | * The line search algorithm.
252 | * This parameter specifies a line search algorithm to be used by the
253 | * L-BFGS routine.
254 | */
255 | int linesearch;
256 |
257 | /**
258 | * The maximum number of trials for the line search.
259 | * This parameter controls the number of function and gradients evaluations
260 | * per iteration for the line search routine. The default value is \c 20.
261 | */
262 | int max_linesearch;
263 |
264 | /**
265 | * The minimum step of the line search routine.
266 | * The default value is \c 1e-20. This value need not be modified unless
267 | * the exponents are too large for the machine being used, or unless the
268 | * problem is extremely badly scaled (in which case the exponents should
269 | * be increased).
270 | */
271 | lbfgsfloatval_t min_step;
272 |
273 | /**
274 | * The maximum step of the line search.
275 | * The default value is \c 1e+20. This value need not be modified unless
276 | * the exponents are too large for the machine being used, or unless the
277 | * problem is extremely badly scaled (in which case the exponents should
278 | * be increased).
279 | */
280 | lbfgsfloatval_t max_step;
281 |
282 | /**
283 | * A parameter to control the accuracy of the line search routine.
284 | * The default value is \c 1e-4. This parameter should be greater
285 | * than zero and smaller than \c 0.5.
286 | */
287 | lbfgsfloatval_t ftol;
288 |
289 | /**
290 | * A coefficient for the Wolfe condition.
291 | * This parameter is valid only when the backtracking line-search
292 | * algorithm is used with the Wolfe condition,
293 | * ::LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE or
294 | * ::LBFGS_LINESEARCH_BACKTRACKING_WOLFE .
295 | * The default value is \c 0.9. This parameter should be greater
296 | * than the \ref ftol parameter and smaller than \c 1.0.
297 | */
298 | lbfgsfloatval_t wolfe;
299 |
300 | /**
301 | * A parameter to control the accuracy of the line search routine.
302 | * The default value is \c 0.9. If the function and gradient
303 | * evaluations are inexpensive with respect to the cost of the
304 | * iteration (which is sometimes the case when solving very large
305 | * problems) it may be advantageous to set this parameter to a small
306 | * value. A typical small value is \c 0.1. This parameter should be
307 | * greater than the \ref ftol parameter (\c 1e-4) and smaller than
308 | * \c 1.0.
309 | */
310 | lbfgsfloatval_t gtol;
311 |
312 | /**
313 | * The machine precision for floating-point values.
314 | * This parameter must be a positive value set by a client program to
315 | * estimate the machine precision. The line search routine will terminate
316 | * with the status code (::LBFGSERR_ROUNDING_ERROR) if the relative width
317 | * of the interval of uncertainty is less than this parameter.
318 | */
319 | lbfgsfloatval_t xtol;
320 |
321 | /**
322 | * Coefficient for the L1 norm of variables.
323 | * This parameter should be set to zero for standard minimization
324 | * problems. Setting this parameter to a positive value activates
325 | * Orthant-Wise Limited-memory Quasi-Newton (OWL-QN) method, which
326 | * minimizes the objective function F(x) combined with the L1 norm |x|
327 | * of the variables, {F(x) + C |x|}. This parameter is the coefficient
328 | * for the |x|, i.e., C. As the L1 norm |x| is not differentiable at
329 | * zero, the library modifies function and gradient evaluations from
330 | * a client program suitably; a client program thus only has to return
331 | * the function value F(x) and gradients G(x) as usual. The default value
332 | * is zero.
333 | */
334 | lbfgsfloatval_t orthantwise_c;
335 |
336 | /**
337 | * Start index for computing L1 norm of the variables.
338 | * This parameter is valid only for OWL-QN method
339 | * (i.e., \ref orthantwise_c != 0). This parameter b (0 <= b < N)
340 | * specifies the index number from which the library computes the
341 | * L1 norm of the variables x,
342 | * |x| := |x_{b}| + |x_{b+1}| + ... + |x_{N}| .
343 | * In other words, variables x_1, ..., x_{b-1} are not used for
344 | * computing the L1 norm. Setting b (0 < b < N), one can protect
345 | * variables, x_1, ..., x_{b-1} (e.g., a bias term of logistic
346 | * regression) from being regularized. The default value is zero.
347 | */
348 | int orthantwise_start;
349 |
350 | /**
351 | * End index for computing L1 norm of the variables.
352 | * This parameter is valid only for OWL-QN method
353 | * (i.e., \ref orthantwise_c != 0). This parameter e (0 < e <= N)
354 | * specifies the index number at which the library stops computing the
355 | * L1 norm of the variables x.
356 | */
357 | int orthantwise_end;
358 | } lbfgs_parameter_t;
359 |
360 |
361 | /**
362 | * Callback interface to provide objective function and gradient evaluations.
363 | *
364 | * The lbfgs() function calls this function to obtain the values of the objective
365 | * function and its gradients when needed. A client program must implement
366 | * this function to evaluate the values of the objective function and its
367 | * gradients, given the current values of the variables.
368 | *
369 | * @param instance The user data sent for lbfgs() function by the client.
370 | * @param x The current values of variables.
371 | * @param g The gradient vector. The callback function must compute
372 | * the gradient values for the current variables.
373 | * @param n The number of variables.
374 | * @param step The current step of the line search routine.
375 | * @retval lbfgsfloatval_t The value of the objective function for the current
376 | * variables.
377 | */
378 | typedef lbfgsfloatval_t (*lbfgs_evaluate_t)(
379 | void *instance,
380 | const lbfgsfloatval_t *x,
381 | lbfgsfloatval_t *g,
382 | const int n,
383 | const lbfgsfloatval_t step
384 | );
385 |
386 | /**
387 | * Callback interface to receive the progress of the optimization process.
388 | *
389 | * The lbfgs() function calls this function for each iteration. By implementing
390 | * this function, a client program can store or display the current progress
391 | * of the optimization process.
392 | *
393 | * @param instance The user data sent for lbfgs() function by the client.
394 | * @param x The current values of variables.
395 | * @param g The current gradient values of variables.
396 | * @param fx The current value of the objective function.
397 | * @param xnorm The Euclidean norm of the variables.
398 | * @param gnorm The Euclidean norm of the gradients.
399 | * @param step The line-search step used for this iteration.
400 | * @param n The number of variables.
401 | * @param k The iteration count.
402 | * @param ls The number of evaluations called for this iteration.
403 | * @retval int Zero to continue the optimization process. Returning a
404 | * non-zero value will cancel the optimization process.
405 | */
406 | typedef int (*lbfgs_progress_t)(
407 | void *instance,
408 | const lbfgsfloatval_t *x,
409 | const lbfgsfloatval_t *g,
410 | const lbfgsfloatval_t fx,
411 | const lbfgsfloatval_t xnorm,
412 | const lbfgsfloatval_t gnorm,
413 | const lbfgsfloatval_t step,
414 | int n,
415 | int k,
416 | int ls
417 | );
418 |
419 | /*
420 | A user must implement a function compatible with ::lbfgs_evaluate_t (evaluation
421 | callback) and pass the pointer to the callback function to lbfgs() arguments.
422 | Similarly, a user can implement a function compatible with ::lbfgs_progress_t
423 | (progress callback) to obtain the current progress (e.g., variables, function
424 | value, ||G||, etc) and to cancel the iteration process if necessary.
425 | Implementation of a progress callback is optional: a user can pass \c NULL if
426 | progress notification is not necessary.
427 |
428 | In addition, a user must preserve two requirements:
429 | - The number of variables must be a multiple of 16 (not 4).
430 | - The memory block of variable array ::x must be aligned to 16.
431 |
432 | This algorithm terminates an optimization
433 | when:
434 |
435 | ||G|| < \epsilon \cdot \max(1, ||x||) .
436 |
437 | In this formula, ||.|| denotes the Euclidean norm.
438 | */
439 |
440 | /**
441 | * Start a L-BFGS optimization.
442 | *
443 | * @param n The number of variables.
444 | * @param x The array of variables. A client program can set
445 | * default values for the optimization and receive the
446 | * optimization result through this array. This array
447 | * must be allocated by ::lbfgs_malloc function
448 | * for libLBFGS built with SSE/SSE2 optimization routine
449 | * enabled. The library built without SSE/SSE2
450 | * optimization does not have such a requirement.
451 | * @param ptr_fx The pointer to the variable that receives the final
452 | * value of the objective function for the variables.
453 | * This argument can be set to \c NULL if the final
454 | * value of the objective function is unnecessary.
455 | * @param proc_evaluate The callback function to provide function and
456 | * gradient evaluations given the current values of
457 | * variables. A client program must implement a
458 | * callback function compatible with \ref
459 | * lbfgs_evaluate_t and pass the pointer to the
460 | * callback function.
461 | * @param proc_progress The callback function to receive the progress
462 | * (the number of iterations, the current value of
463 | * the objective function) of the minimization
464 | * process. This argument can be set to \c NULL if
465 | * a progress report is unnecessary.
466 | * @param instance A user data for the client program. The callback
467 | * functions will receive the value of this argument.
468 | * @param param The pointer to a structure representing parameters for
469 | * L-BFGS optimization. A client program can set this
470 | * parameter to \c NULL to use the default parameters.
471 | * Call lbfgs_parameter_init() function to fill a
472 | * structure with the default values.
473 | * @retval int The status code. This function returns zero if the
474 | * minimization process terminates without an error.
475 | * Roughly speaking, a negative value indicates an error.
476 | */
477 | int lbfgs(
478 | int n,
479 | lbfgsfloatval_t *x,
480 | lbfgsfloatval_t *ptr_fx,
481 | lbfgs_evaluate_t proc_evaluate,
482 | lbfgs_progress_t proc_progress,
483 | void *instance,
484 | lbfgs_parameter_t *param
485 | );
486 |
487 | /**
488 | * Initialize L-BFGS parameters to the default values.
489 | *
490 | * Call this function to fill a parameter structure with the default values,
491 | * then overwrite individual parameter values if necessary.
492 | *
493 | * @param param The pointer to the parameter structure.
494 | */
495 | void lbfgs_parameter_init(lbfgs_parameter_t *param);
496 |
497 | /**
498 | * Allocate an array for variables.
499 | *
500 | * This function allocates an array of variables for the convenience of
501 | * ::lbfgs function; the function has a requirement for a variable array
502 | * when libLBFGS is built with SSE/SSE2 optimization routines. A user does
503 | * not have to use this function for libLBFGS built without SSE/SSE2
504 | * optimization.
505 | *
506 | * @param n The number of variables.
507 | */
508 | lbfgsfloatval_t* lbfgs_malloc(int n);
509 |
510 | /**
511 | * Free an array of variables obtained from ::lbfgs_malloc.
512 | *
513 | * @param x The array of variables allocated by the ::lbfgs_malloc
514 | * function.
515 | */
516 | void lbfgs_free(lbfgsfloatval_t *x);
517 |
518 |
519 | /**
520 | * Get a string description of an lbfgs() return code.
521 | *
522 | * @param err A value returned by lbfgs().
523 | */
524 | const char* lbfgs_strerror(int err);
525 |
526 | /** @} */
527 |
528 | #ifdef __cplusplus
529 | }
530 | #endif/*__cplusplus*/
531 |
532 |
533 |
534 | /**
535 | @mainpage libLBFGS: a library of Limited-memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS)
536 |
537 | @section intro Introduction
538 |
539 | This library is a C port of the implementation of Limited-memory
540 | Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method written by Jorge Nocedal.
541 | The original FORTRAN source code is available at:
542 | http://www.ece.northwestern.edu/~nocedal/lbfgs.html
543 |
544 | The L-BFGS method solves the unconstrained minimization problem,
545 |
546 |
547 | minimize F(x), x = (x1, x2, ..., xN),
548 |
549 |
550 | only if the objective function F(x) and its gradient G(x) are computable. The
551 | well-known Newton's method requires computation of the inverse of the hessian
552 | matrix of the objective function. However, the computational cost for the
553 | inverse hessian matrix is expensive especially when the objective function
554 | takes a large number of variables. The L-BFGS method iteratively finds a
555 | minimizer by approximating the inverse hessian matrix by information from last
556 | m iterations. This innovation saves the memory storage and computational time
557 | drastically for large-scaled problems.
558 |
559 | Among the various ports of L-BFGS, this library provides several features:
560 | - Optimization with L1-norm (Orthant-Wise Limited-memory Quasi-Newton
561 | (OWL-QN) method):
562 | In addition to standard minimization problems, the library can minimize
563 | a function F(x) combined with L1-norm |x| of the variables,
564 | {F(x) + C |x|}, where C is a constant scalar parameter. This feature is
565 | useful for estimating parameters of sparse log-linear models (e.g.,
566 | logistic regression and maximum entropy) with L1-regularization (or
567 | Laplacian prior).
568 | - Clean C code:
569 | Unlike C codes generated automatically by f2c (Fortran 77 into C converter),
570 | this port includes changes based on my interpretations, improvements,
571 | optimizations, and clean-ups so that the ported code would be well-suited
572 | for a C code. In addition to comments inherited from the original code,
573 | a number of comments were added through my interpretations.
574 | - Callback interface:
575 | The library receives function and gradient values via a callback interface.
576 | The library also notifies the progress of the optimization by invoking a
577 | callback function. In the original implementation, a user had to set
578 | function and gradient values every time the function returns for obtaining
579 | updated values.
580 | - Thread safe:
581 | The library is thread-safe, which is the secondary gain from the callback
582 | interface.
583 | - Cross platform. The source code can be compiled on Microsoft Visual
584 | Studio 2010, GNU C Compiler (gcc), etc.
585 | - Configurable precision: A user can choose single-precision (float)
586 | or double-precision (double) accuracy by changing ::LBFGS_FLOAT macro.
587 | - SSE/SSE2 optimization:
588 | This library includes SSE/SSE2 optimization (written in compiler intrinsics)
589 | for vector arithmetic operations on Intel/AMD processors. The library uses
590 | SSE for float values and SSE2 for double values. The SSE/SSE2 optimization
591 | routine is disabled by default.
592 |
593 | This library is used by:
594 | - CRFsuite: A fast implementation of Conditional Random Fields (CRFs)
595 | - Classias: A collection of machine-learning algorithms for classification
596 | - mlegp: an R package for maximum likelihood estimates for Gaussian processes
597 | - imaging2: the imaging2 class library
598 | - Algorithm::LBFGS - Perl extension for L-BFGS
599 | - YAP-LBFGS (an interface to call libLBFGS from YAP Prolog)
600 |
601 | @section download Download
602 |
603 | - Source code
604 | - GitHub repository
605 |
606 | libLBFGS is distributed under the terms of the
607 | MIT license.
608 |
609 | @section changelog History
610 | - Version 1.10 (2010-12-22):
611 | - Fixed compiling errors on Mac OS X; this patch was kindly submitted by
612 | Nic Schraudolph.
613 | - Reduced compiling warnings on Mac OS X; this patch was kindly submitted
614 | by Tamas Nepusz.
615 | - Replaced memalign() with posix_memalign().
616 | - Updated solution and project files for Microsoft Visual Studio 2010.
617 | - Version 1.9 (2010-01-29):
618 | - Fixed a mistake in checking the validity of the parameters "ftol" and
619 | "wolfe"; this was discovered by Kevin S. Van Horn.
620 | - Version 1.8 (2009-07-13):
621 | - Accepted the patch submitted by Takashi Imamichi;
622 | the backtracking method now has three criteria for choosing the step
623 | length:
624 | - ::LBFGS_LINESEARCH_BACKTRACKING_ARMIJO: sufficient decrease (Armijo)
625 | condition only
626 | - ::LBFGS_LINESEARCH_BACKTRACKING_WOLFE: regular Wolfe condition
627 | (sufficient decrease condition + curvature condition)
628 | - ::LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE: strong Wolfe condition
629 | - Updated the documentation to explain the above three criteria.
630 | - Version 1.7 (2009-02-28):
631 | - Improved OWL-QN routines for stability.
632 | - Removed the support of OWL-QN method in MoreThuente algorithm because
633 | it accidentally fails in early stages of iterations for some objectives.
634 | Because of this change, the OWL-QN method must be used with the
635 | backtracking algorithm (::LBFGS_LINESEARCH_BACKTRACKING), or the
636 | library returns ::LBFGSERR_INVALID_LINESEARCH.
637 | - Renamed line search algorithms as follows:
638 | - ::LBFGS_LINESEARCH_BACKTRACKING: regular Wolfe condition.
639 | - ::LBFGS_LINESEARCH_BACKTRACKING_LOOSE: regular Wolfe condition.
640 | - ::LBFGS_LINESEARCH_BACKTRACKING_STRONG: strong Wolfe condition.
641 | - Source code clean-up.
642 | - Version 1.6 (2008-11-02):
643 | - Improved line-search algorithm with strong Wolfe condition, which was
644 | contributed by Takashi Imamichi. This routine is now default for
645 | ::LBFGS_LINESEARCH_BACKTRACKING. The previous line search algorithm
646 | with regular Wolfe condition is still available as
647 | ::LBFGS_LINESEARCH_BACKTRACKING_LOOSE.
648 | - Configurable stop index for L1-norm computation. A member variable
649 | ::lbfgs_parameter_t::orthantwise_end was added to specify the index
650 | number at which the library stops computing the L1 norm of the
651 | variables. This is useful to prevent some variables from being
652 | regularized by the OWL-QN method.
653 | - A sample program written in C++ (sample/sample.cpp).
654 | - Version 1.5 (2008-07-10):
655 | - Configurable starting index for L1-norm computation. A member variable
656 | ::lbfgs_parameter_t::orthantwise_start was added to specify the index
657 | number from which the library computes the L1 norm of the variables.
658 | This is useful to prevent some variables from being regularized by the
659 | OWL-QN method.
660 | - Fixed a zero-division error when the initial variables are already
661 | a minimizer (reported by Takashi Imamichi). In this case, the
662 | library returns ::LBFGS_ALREADY_MINIMIZED status code.
663 | - Defined ::LBFGS_SUCCESS status code as zero; removed unused constants,
664 | LBFGSFALSE and LBFGSTRUE.
665 | - Fixed a compile error in an implicit down-cast.
666 | - Version 1.4 (2008-04-25):
667 | - Configurable line search algorithms. A member variable
668 | ::lbfgs_parameter_t::linesearch was added to choose either MoreThuente
669 | method (::LBFGS_LINESEARCH_MORETHUENTE) or backtracking algorithm
670 | (::LBFGS_LINESEARCH_BACKTRACKING).
671 | - Fixed a bug: the previous version did not compute pseudo-gradients
672 | properly in the line search routines for OWL-QN. This bug might quit
673 | an iteration process too early when the OWL-QN routine was activated
674 | (0 < ::lbfgs_parameter_t::orthantwise_c).
675 | - Configure script for POSIX environments.
676 | - SSE/SSE2 optimizations with GCC.
677 | - New functions ::lbfgs_malloc and ::lbfgs_free to use SSE/SSE2 routines
678 | transparently. It is unnecessary to use these functions for libLBFGS built
679 | without SSE/SSE2 routines; you can still use any memory allocators if
680 | SSE/SSE2 routines are disabled in libLBFGS.
681 | - Version 1.3 (2007-12-16):
682 | - An API change. An argument was added to lbfgs() function to receive the
683 | final value of the objective function. This argument can be set to
684 | \c NULL if the final value is unnecessary.
685 | - Fixed a null-pointer bug in the sample code (reported by Takashi Imamichi).
686 | - Added build scripts for Microsoft Visual Studio 2005 and GCC.
687 | - Added README file.
688 | - Version 1.2 (2007-12-13):
689 | - Fixed a serious bug in orthant-wise L-BFGS.
690 | An important variable was used without initialization.
691 | - Version 1.1 (2007-12-01):
692 | - Implemented orthant-wise L-BFGS.
693 | - Implemented lbfgs_parameter_init() function.
694 | - Fixed several bugs.
695 | - API documentation.
696 | - Version 1.0 (2007-09-20):
697 | - Initial release.
698 |
699 | @section api Documentation
700 |
701 | - @ref liblbfgs_api "libLBFGS API"
702 |
703 | @section sample Sample code
704 |
705 | @include sample.c
706 |
707 | @section ack Acknowledgements
708 |
709 | The L-BFGS algorithm is described in:
710 | - Jorge Nocedal.
711 | Updating Quasi-Newton Matrices with Limited Storage.
712 | Mathematics of Computation, Vol. 35, No. 151, pp. 773--782, 1980.
713 | - Dong C. Liu and Jorge Nocedal.
714 | On the limited memory BFGS method for large scale optimization.
715 | Mathematical Programming B, Vol. 45, No. 3, pp. 503-528, 1989.
716 |
717 | The line search algorithms used in this implementation are described in:
718 | - John E. Dennis and Robert B. Schnabel.
719 | Numerical Methods for Unconstrained Optimization and Nonlinear
720 | Equations, Englewood Cliffs, 1983.
721 | - Jorge J. More and David J. Thuente.
722 | Line search algorithm with guaranteed sufficient decrease.
723 | ACM Transactions on Mathematical Software (TOMS), Vol. 20, No. 3,
724 | pp. 286-307, 1994.
725 |
726 | This library also implements Orthant-Wise Limited-memory Quasi-Newton (OWL-QN)
727 | method presented in:
728 | - Galen Andrew and Jianfeng Gao.
729 | Scalable training of L1-regularized log-linear models.
730 | In Proceedings of the 24th International Conference on Machine
731 | Learning (ICML 2007), pp. 33-40, 2007.
732 |
733 | Special thanks go to:
734 | - Yoshimasa Tsuruoka and Daisuke Okanohara for technical information about
735 | OWL-QN
736 | - Takashi Imamichi for the useful enhancements of the backtracking method
737 | - Kevin S. Van Horn, Nic Schraudolph, and Tamas Nepusz for bug fixes
738 |
739 | Finally, I would like to thank the original author, Jorge Nocedal, who has been
740 | distributing the efficient and explanatory implementation under an open source
741 | license.
742 |
743 | @section reference Reference
744 |
745 | - L-BFGS by Jorge Nocedal.
746 | - Orthant-Wise Limited-memory Quasi-Newton Optimizer for L1-regularized Objectives by Galen Andrew.
747 | - C port (via f2c) by Taku Kudo.
748 | - C#/C++/Delphi/VisualBasic6 port in ALGLIB.
749 | - Computational Crystallography Toolbox includes
750 | scitbx::lbfgs.
751 | */
752 |
753 | #endif/*__LBFGS_H__*/
754 |
--------------------------------------------------------------------------------
/cpp/loss.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | template T safe_log(const T x) {
4 | return log(std::max(EPSILON, x)); // clamp x from below to EPSILON (defined outside this header) before taking the log
5 | }
6 |
7 | // Abstract parent class for all losses: declares the loss()/gradient() pair that the concrete losses below implement.
8 | template
9 | class BaseLoss {
10 | public:
11 | BaseLoss() {}
12 | BaseLoss(Kernel &ker) {} // the kernel argument is accepted but not used by the base class
13 |
14 | virtual double loss(const double* v1, const double* v2, int N) const = 0; // pure virtual; presumably v1 and v2 each hold N values -- TODO confirm
15 | virtual void gradient(const double* v1, const double* v2, int N, double* res) const = 0; // pure virtual; res is the only non-const pointer, presumably the output buffer -- TODO confirm
16 | };
17 |
18 |
19 | template
20 | class QuadraticLoss : public BaseLoss {
21 | public:
22 | QuadraticLoss() {}
23 | QuadraticLoss(Kernel &ker) {}
24 |
25 | double loss(const double* v1, const double* v2, int N) const {
26 | double r = 0;
27 | for (int i=0; i
44 | class TVLoss : public BaseLoss {
45 | public:
46 | TVLoss() {}
47 | TVLoss(Kernel &ker) {}
48 |
49 | double loss(const double* v1, const double* v2, int N) const {
50 | double r = 0;
51 | for (int i=0; i0?0.5:-0.5);
63 | }
64 |
65 | }
66 | };
67 |
68 | template
69 | class KLLoss : public BaseLoss {
70 | public:
71 | KLLoss() {}
72 | KLLoss(Kernel &ker) {}
73 |
74 | double loss(const double* v1, const double* v2, int N) const {
75 | double r = 0;
76 | for (int i=0; i
93 | class WassersteinLoss : public BaseLoss {
94 | public:
95 | WassersteinLoss(Kernel &ker, int n_iter = 50) : kernel(&ker), num_iter(n_iter) {}
96 |
97 | Kernel * kernel;
98 | int num_iter;
99 |
100 | double loss(const double* v1, const double* v2, int N) const {return myWloss(v1,v2,N,num_iter);}
101 |
102 | double myWloss(const double* v1, const double* v2, int N, int n_iter) const {
103 |
104 | int Niters = n_iter;
105 | std::vector a(N, 1.), b(N, 1.), convolution(N);
106 |
107 | // Bregman Projections
108 | for (int iter=0; iterconvolveAdjoint(&b[0], &convolution[0], 1);
111 | for (int j=0; jconvolve(&a[0], &convolution[0], 1);
115 | for (int j=0; jconvolveAdjoint(&b[0], &convolution[0], 1);
122 | for (int j=0; jgamma*l;
127 | }
128 |
129 | void gradient(const double* v1, const double* v2, int N, double* res) const {myWgradient(v1,v2,N,res,num_iter);}
130 |
131 | void myWgradient(const double* v1, const double* v2, int N, double* res, int n_iter) const {
132 |
133 | int Niters = n_iter;
134 | std::vector a(N, 1.), b(N, 1.), convolution(N);
135 |
136 | // Bregman Projections
137 | for (int iter=0; iterconvolveAdjoint(&b[0], &convolution[0]);
140 | for (int j=0; jconvolve(&a[0], &convolution[0]);
144 | for (int j=0; jgamma*safe_log(a[i]);
151 | }
152 | }
153 |
154 | };
155 |
--------------------------------------------------------------------------------
/cpp/main_dictionary_learning.cpp:
--------------------------------------------------------------------------------
1 | // Uncomment for log-domain stabilization
2 | //#define COMPUTE_BARYCENTER_LOG
3 |
4 | #include "inverseWasserstein.h"
5 | #include "histogramIO.h"
6 | #include "cimg/CImg.h"
7 | #include "chrono.h"
8 |
9 | #include