├── LICENSE ├── Makefile ├── README.md ├── corpus.cpp ├── corpus.h ├── ctr ├── ctr.cpp ├── ctr.h ├── ctr.submit.sh ├── data.cpp ├── data.h ├── main.cpp ├── opt.cpp ├── opt.h ├── qsub.sh ├── run.sh ├── script.sh ├── utils.cpp └── utils.h /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 
32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. 
The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 
97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 
128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. 
However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. 
Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. 
Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. 
For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 
279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 
319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 
340 | 341 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CC = g++ -Wall 2 | #CC = g++ -ansi -Wall -pedantic 3 | #CFLAGS = -g -Wall -O3 -ffast-math -DHAVE_INLINE -DGSL_RANGE_CHECK_OFF 4 | # CFLAGS = -g -Wall 5 | LDFLAGS = -lgsl -lm -lgslcblas 6 | 7 | GSL_INCLUDE = /home/chongw/include 8 | GSL_LIB = /home/chongw/lib 9 | 10 | GSL_INCLUDE_MAC = /usr/local/include/ 11 | GSL_LIB_MAC = /usr/local/lib/ 12 | 13 | LSOURCE = main.cpp utils.cpp corpus.cpp ctr.cpp data.cpp opt.cpp 14 | LHEADER = utils.h corpus.h ctr.h data.h opt.h 15 | 16 | mac: $(LSOURCE) $(LHEADER) 17 | $(CC) -I$(GSL_INCLUDE_MAC) -L$(GSL_LIB_MAC) $(LSOURCE) -o ctr $(LDFLAGS) 18 | 19 | mac-d: $(LSOURCE) $(LHEADER) 20 | $(CC) -g -I$(GSL_INCLUDE_MAC) -L$(GSL_LIB_MAC) $(LSOURCE) -o ctr $(LDFLAGS) 21 | 22 | linux: $(LSOURCE) $(LHEADER) 23 | $(CC) -I$(GSL_INCLUDE) -L$(GSL_LIB) $(LSOURCE) -o ctr-condor $(LDFLAGS) 24 | 25 | linux-d: $(LSOURCE) $(LHEADER) 26 | $(CC) -g -I$(GSL_INCLUDE) -L$(GSL_LIB) $(LSOURCE) -o ctr-condor $(LDFLAGS) 27 | 28 | clean: 29 | -rm -f ctr ctr-condor 30 | clean-d: 31 | -rm -f ctr ctr-condor 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ctr 2 | 3 | Collaborative modeling for recommendation. Implements variational inference for collaborative topic models. These models recommend items to users based on item content and other users' ratings. Written by [Chong Wang](http://www.cs.princeton.edu/~chongw/index.html). Fun fact: this inspired the New York Times' engine ([blog post](http://open.blogs.nytimes.com/2015/08/11/building-the-next-new-york-times-recommendation-engine/))!
4 | 5 | ## Demo 6 | 7 | http://www.cs.cmu.edu/~chongw/data/citeulike/ 8 | 9 | ## Sample data 10 | 11 | http://www.cs.cmu.edu/~chongw/data/citeulike/ 12 | 13 | ## Reference 14 | 15 | Chong Wang and David M. Blei. Collaborative topic modeling for recommending scientific articles. In KDD 2011. [PDF](http://www.cs.princeton.edu/~chongw/papers/WangBlei2011.pdf) 16 | -------------------------------------------------------------------------------- /corpus.cpp: -------------------------------------------------------------------------------- 1 | #include "corpus.h" 2 | #include 3 | #include 4 | 5 | c_corpus::c_corpus() { 6 | m_num_docs = 0; 7 | m_size_vocab = 0; 8 | m_num_total_words = 0; 9 | } 10 | 11 | c_corpus::~c_corpus() { 12 | for (int i = 0; i < m_num_docs; i ++) { 13 | c_document * doc = m_docs[i]; 14 | delete doc; 15 | } 16 | m_docs.clear(); 17 | 18 | m_num_docs = 0; 19 | m_size_vocab = 0; 20 | m_num_total_words = 0; 21 | } 22 | 23 | void c_corpus::read_data(const char * data_filename, int OFFSET) { 24 | 25 | int length = 0, count = 0, word = 0, n = 0, nd = 0, nw = 0; 26 | 27 | FILE * fileptr; 28 | fileptr = fopen(data_filename, "r"); 29 | nd = 0; 30 | nw = 0; 31 | 32 | printf("reading data from %s\n", data_filename); 33 | while ((fscanf(fileptr, "%10d", &length) != EOF)) { 34 | c_document * doc = new c_document(length); 35 | for (n = 0; n < length; n++) { 36 | fscanf(fileptr, "%10d:%10d", &word, &count); 37 | word = word - OFFSET; 38 | doc->m_words[n] = word; 39 | doc->m_counts[n] = count; 40 | doc->m_total += count; 41 | if (word >= nw) 42 | nw = word + 1; 43 | } 44 | m_num_total_words += doc->m_total; 45 | m_docs.push_back(doc); 46 | nd++; 47 | } 48 | fclose(fileptr); 49 | m_num_docs = nd; 50 | m_size_vocab = nw; 51 | printf("number of docs : %d\n", nd); 52 | printf("number of terms : %d\n", nw); 53 | printf("number of total words : %d\n", m_num_total_words); 54 | } 55 | 56 | int c_corpus::max_corpus_length() const { 57 | int max_length = 0; 58 | 59 | for (int d 
= 0; d < m_num_docs; d++) { 60 | if (m_docs[d]->m_length > max_length) 61 | max_length = m_docs[d]->m_length; 62 | } 63 | return max_length; 64 | } 65 | 66 | -------------------------------------------------------------------------------- /corpus.h: -------------------------------------------------------------------------------- 1 | // class for lda-c format 2 | // 3 | #ifndef CORPUS_H 4 | #define CORPUS_H 5 | 6 | #include 7 | #include 8 | 9 | using namespace std; 10 | 11 | class c_document { 12 | public: 13 | /* for document itself */ 14 | int * m_words; 15 | int * m_counts; 16 | int m_length; 17 | int m_total; 18 | 19 | public: 20 | c_document() { 21 | m_words = NULL; 22 | m_counts = NULL; 23 | m_length = 0; 24 | m_total = 0; 25 | } 26 | 27 | c_document(int len) { 28 | m_length = len; 29 | m_words = new int [len]; 30 | m_counts = new int [len]; 31 | m_total = 0; 32 | } 33 | 34 | ~c_document() { 35 | if (m_words != NULL) { 36 | delete [] m_words; 37 | delete [] m_counts; 38 | m_length = 0; 39 | m_total = 0; 40 | } 41 | } 42 | }; 43 | 44 | class c_corpus { 45 | public: 46 | c_corpus(); 47 | ~c_corpus(); 48 | void read_data(const char * data_filename, int OFFSET=0); 49 | int max_corpus_length() const; 50 | public: 51 | int m_num_docs; 52 | int m_size_vocab; 53 | int m_num_total_words; 54 | vector m_docs; 55 | }; 56 | 57 | #endif // CORPUS_H 58 | -------------------------------------------------------------------------------- /ctr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/ctr/a21d0a81e451b9c1ccf05143f6915bb11bf66545/ctr -------------------------------------------------------------------------------- /ctr.cpp: -------------------------------------------------------------------------------- 1 | #include "ctr.h" 2 | #include "opt.h" 3 | 4 | extern gsl_rng * RANDOM_NUMBER; 5 | int min_iter = 15; 6 | double beta_smooth = 0.01; 7 | 8 | c_ctr::c_ctr() { 9 | m_beta = NULL; 10 | m_theta = NULL; 
11 | m_U = NULL; 12 | m_V = NULL; 13 | 14 | m_num_factors = 0; // m_num_topics 15 | m_num_items = 0; // m_num_docs 16 | m_num_users = 0; // num of users 17 | } 18 | 19 | c_ctr::~c_ctr() { 20 | // free memory 21 | if (m_beta != NULL) gsl_matrix_free(m_beta); 22 | if (m_theta != NULL) gsl_matrix_free(m_theta); 23 | if (m_U != NULL) gsl_matrix_free(m_U); 24 | if (m_V != NULL) gsl_matrix_free(m_V); 25 | } 26 | 27 | void c_ctr::read_init_information(const char* theta_init_path, 28 | const char* beta_init_path, 29 | const c_corpus* c, 30 | double alpha_smooth) { 31 | int num_topics = m_num_factors; 32 | m_theta = gsl_matrix_alloc(c->m_num_docs, num_topics); 33 | printf("\nreading theta initialization from %s\n", theta_init_path); 34 | FILE * f = fopen(theta_init_path, "r"); 35 | mtx_fscanf(f, m_theta); 36 | fclose(f); 37 | 38 | //smoothing 39 | gsl_matrix_add_constant(m_theta, alpha_smooth); 40 | 41 | //normalize m_theta, in case it's not 42 | for (size_t j = 0; j < m_theta->size1; j ++) { 43 | gsl_vector_view theta_v = gsl_matrix_row(m_theta, j); 44 | vnormalize(&theta_v.vector); 45 | } 46 | 47 | m_beta = gsl_matrix_alloc(num_topics, c->m_size_vocab); 48 | printf("reading beta initialization from %s\n", beta_init_path); 49 | f = fopen(beta_init_path, "r"); 50 | mtx_fscanf(f, m_beta); 51 | fclose(f); 52 | 53 | // exponentiate if it's not 54 | if (mget(m_beta, 0, 0) < 0) { 55 | mtx_exp(m_beta); 56 | } 57 | else { 58 | gsl_matrix_add_constant(m_beta, beta_smooth); 59 | for (size_t j = 0; j < m_beta->size1; j ++) { 60 | gsl_vector_view beta_v = gsl_matrix_row(m_beta, j); 61 | vnormalize(&beta_v.vector); 62 | } 63 | } 64 | } 65 | 66 | void c_ctr::set_model_parameters(int num_factors, 67 | int num_users, 68 | int num_items) { 69 | m_num_factors = num_factors; 70 | m_num_users = num_users; 71 | m_num_items = num_items; 72 | } 73 | 74 | void c_ctr::init_model(int ctr_run) { 75 | 76 | m_U = gsl_matrix_calloc(m_num_users, m_num_factors); 77 | m_V = gsl_matrix_calloc(m_num_items, 
m_num_factors); 78 | 79 | if (ctr_run) { 80 | gsl_matrix_memcpy(m_V, m_theta); 81 | } 82 | else { 83 | // this is for convenience, so that updates are similar. 84 | m_theta = gsl_matrix_calloc(m_num_items, m_num_factors); 85 | 86 | for (size_t i = 0; i < m_V->size1; i ++) 87 | for (size_t j = 0; j < m_V->size2; j ++) 88 | mset(m_V, i, j, runiform()); 89 | } 90 | } 91 | 92 | 93 | void c_ctr::stochastic_learn_map_estimate(const c_data* users, const c_data* items, 94 | const c_corpus* c, const ctr_hyperparameter* param, 95 | const char* directory) { 96 | // init model parameters 97 | printf("\nrunning stochastic learning ...\n"); 98 | printf("initializing the model ...\n"); 99 | init_model(param->ctr_run); 100 | 101 | // filename 102 | char name[500]; 103 | 104 | // start time 105 | time_t start, current; 106 | time(&start); 107 | int elapsed = 0; 108 | 109 | int iter = 0; 110 | double likelihood = -exp(50), likelihood_old; 111 | double converge = 1.0; 112 | double learning_rate = param->learning_rate; 113 | 114 | /// create the state log file 115 | sprintf(name, "%s/state.log", directory); 116 | FILE* file = fopen(name, "w"); 117 | fprintf(file, "iter time likelihood converge\n"); 118 | 119 | /* alloc auxiliary variables */ 120 | gsl_vector* x = gsl_vector_alloc(m_num_factors); 121 | 122 | gsl_matrix* phi = NULL; 123 | gsl_matrix* word_ss = NULL; 124 | gsl_matrix* log_beta = NULL; 125 | gsl_vector* gamma = NULL; 126 | 127 | if (param->ctr_run && param->theta_opt) { 128 | int max_len = c->max_corpus_length(); 129 | phi = gsl_matrix_calloc(max_len, m_num_factors); 130 | word_ss = gsl_matrix_calloc(m_num_factors, c->m_size_vocab); 131 | log_beta = gsl_matrix_calloc(m_num_factors, c->m_size_vocab); 132 | gsl_matrix_memcpy(log_beta, m_beta); 133 | mtx_log(log_beta); 134 | gamma = gsl_vector_alloc(m_num_factors); 135 | } 136 | 137 | /* tmp variables for indexes */ 138 | int i, j, m, n, l, k, ll, jj; 139 | int* item_ids; 140 | bool positive = true; 141 | 142 | double 
result, inner; 143 | int active_num_items = 0; 144 | for (j = 0; j < m_num_items; ++j) { 145 | if (items->m_vec_len[j] > 0) 146 | active_num_items++; 147 | } 148 | 149 | int* idx_base = new int[active_num_items]; 150 | l = 0; 151 | for (j = 0; j < m_num_items; ++j) { 152 | if (items->m_vec_len[j] > 0) { 153 | idx_base[l] = j; 154 | ++l; 155 | } 156 | } 157 | int* sel = new int[active_num_items]; 158 | 159 | while (iter < param->max_iter) { 160 | likelihood_old = likelihood; 161 | likelihood = 0.0; 162 | 163 | for (i = 0; i < m_num_users; ++i) { 164 | item_ids = users->m_vec_data[i]; 165 | n = users->m_vec_len[i]; 166 | if (n > 0) { 167 | double lambda_u = param->lambda_u / (2*n); 168 | gsl_vector_view u = gsl_matrix_row(m_U, i); 169 | // this user has rated some articles 170 | // Randomly choose 2*n negative examples 171 | sample_k_from_n(n, active_num_items, sel, idx_base); 172 | qsort(sel, n, sizeof(int), compare); 173 | l = 0; ll = 0; 174 | while (true) { 175 | if (l < n) { 176 | j = item_ids[l]; // positive 177 | } else { 178 | j = -1; 179 | } 180 | 181 | if (ll < n) { 182 | jj = sel[ll]; //negative 183 | while (ll < n-1 && jj == sel[ll+1]) ++ll; // skip same values 184 | } else { 185 | jj = -1; 186 | } 187 | 188 | if (j == -1) { 189 | if (jj == -1) break; 190 | else { 191 | positive = false; // jj is a negative example 192 | ++ll; 193 | } 194 | } else { 195 | if (j < jj) { 196 | positive = true; // j is a positive example 197 | ++l; 198 | } else if (j == jj) { 199 | positive = true; // j is a positive example 200 | ++l; 201 | ++ll; 202 | } else { // j > jj 203 | if (jj == -1) { 204 | positive = true; // j is a positive example 205 | ++l; 206 | } else { 207 | positive = false; 208 | ++ll; // jj is a negative example 209 | } 210 | } 211 | } 212 | gsl_vector_view v; 213 | gsl_vector_view theta_v; 214 | double lambda_v = 0.0; 215 | if (positive) { 216 | // j is a positive example 217 | lambda_v = param->lambda_v / (2 * items->m_vec_len[j]); 218 | v = 
gsl_matrix_row(m_V, j); 219 | theta_v = gsl_matrix_row(m_theta, j); 220 | // second-order 221 | // u 222 | gsl_vector_scale(&u.vector, 1 - learning_rate); 223 | gsl_blas_ddot(&v.vector, &v.vector, &inner); 224 | gsl_blas_daxpy(learning_rate / (lambda_u + inner), &v.vector, &u.vector); 225 | // v 226 | if (!param->lda_regression) { 227 | gsl_vector_scale(&v.vector, 1 - learning_rate); 228 | gsl_blas_daxpy(learning_rate, &theta_v.vector, &v.vector); 229 | gsl_blas_ddot(&u.vector, &u.vector, &inner); 230 | gsl_blas_ddot(&u.vector, &theta_v.vector, &result); 231 | gsl_blas_daxpy(learning_rate * (1.0 - result) / (lambda_v + inner), &u.vector, &v.vector); 232 | } 233 | 234 | gsl_blas_ddot(&u.vector, &v.vector, &result); 235 | likelihood += -0.5 * (1 - result) * (1 - result); 236 | // gsl_blas_ddot(&u.vector, &v.vector, &result); 237 | // result -= 1.0; 238 | } else { 239 | // jj is a negative example 240 | lambda_v = param->lambda_v / (2 * items->m_vec_len[jj]); 241 | v = gsl_matrix_row(m_V, jj); 242 | theta_v = gsl_matrix_row(m_theta, jj); 243 | // second order 244 | // u 245 | gsl_vector_scale(&u.vector, 1 - learning_rate); 246 | 247 | // v 248 | if (!param->lda_regression) { 249 | gsl_vector_scale(&v.vector, 1 - learning_rate); 250 | gsl_blas_daxpy(learning_rate, &theta_v.vector, &v.vector); 251 | gsl_blas_ddot(&u.vector, &u.vector, &inner); 252 | gsl_blas_ddot(&u.vector, &theta_v.vector, &result); 253 | gsl_blas_daxpy(-learning_rate * result / (lambda_v + inner), &u.vector, &v.vector); 254 | } 255 | 256 | gsl_blas_ddot(&u.vector, &v.vector, &result); 257 | likelihood += -0.5 * result * result; 258 | // gsl_blas_ddot(&u.vector, &v.vector, &result); 259 | } 260 | // update u 261 | // first-order 262 | // gsl_vector_scale(&u.vector, 1 - param->learning_rate * lambda_u); 263 | // gsl_blas_daxpy(-result * param->learning_rate, &v.vector, &u.vector); 264 | // second order 265 | 266 | // update v 267 | // gsl_vector_scale(&v.vector, 1 - param->learning_rate * lambda_v); 268 
| // gsl_blas_daxpy(-result * param->learning_rate, &u.vector, &v.vector); 269 | // gsl_blas_daxpy(param->learning_rate * lambda_v, &theta_v.vector, &v.vector); 270 | } 271 | assert(n == l && n == l); 272 | //printf("n=%d, l=%d, ll=%d, j=%d, jj=%d\n", n, l, ll, j, jj); 273 | 274 | // update the likelihood 275 | gsl_blas_ddot(&u.vector, &u.vector, &result); 276 | likelihood += -0.5 * param->lambda_u * result; 277 | } 278 | } 279 | 280 | for (j = 0; j < m_num_items; ++j) { 281 | gsl_vector_view v = gsl_matrix_row(m_V, j); 282 | gsl_vector_view theta_v = gsl_matrix_row(m_theta, j); 283 | gsl_vector_memcpy(x, &v.vector); 284 | gsl_vector_sub(x, &theta_v.vector); 285 | gsl_blas_ddot(x, x, &result); 286 | likelihood += -0.5 * param->lambda_v * result; 287 | } 288 | 289 | // update theta 290 | if (param->ctr_run && param->theta_opt) { 291 | gsl_matrix_set_zero(word_ss); 292 | for (j = 0; j < m_num_items; j ++) { 293 | gsl_vector_view v = gsl_matrix_row(m_V, j); 294 | gsl_vector_view theta_v = gsl_matrix_row(m_theta, j); 295 | m = items->m_vec_len[j]; 296 | if (m>0) { 297 | // m > 0, some users have rated this article 298 | const c_document* doc = c->m_docs[j]; 299 | likelihood += doc_inference(doc, &theta_v.vector, log_beta, phi, gamma, word_ss, true); 300 | optimize_simplex(gamma, &v.vector, param->lambda_v, &theta_v.vector); 301 | } 302 | else { 303 | // m=0, this article has never been rated 304 | const c_document* doc = c->m_docs[j]; 305 | doc_inference(doc, &theta_v.vector, log_beta, phi, gamma, word_ss, false); 306 | vnormalize(gamma); 307 | gsl_vector_memcpy(&theta_v.vector, gamma); 308 | } 309 | } 310 | gsl_matrix_memcpy(m_beta, word_ss); 311 | for (k = 0; k < m_num_factors; k ++) { 312 | gsl_vector_view row = gsl_matrix_row(m_beta, k); 313 | vnormalize(&row.vector); 314 | } 315 | gsl_matrix_memcpy(log_beta, m_beta); 316 | mtx_log(log_beta); 317 | } 318 | 319 | time(¤t); 320 | elapsed = (int)difftime(current, start); 321 | 322 | iter++; 323 | if (iter > 50 && 
learning_rate > 0.001) learning_rate /= 2.0; 324 | converge = fabs((likelihood-likelihood_old)/likelihood_old); 325 | 326 | fprintf(file, "%04d %06d %10.5f %.10f\n", iter, elapsed, likelihood, converge); 327 | fflush(file); 328 | printf("iter=%04d, time=%06d, likelihood=%.5f, converge=%.10f\n", iter, elapsed, likelihood, converge); 329 | 330 | // save intermediate results 331 | if (iter % param->save_lag == 0) { 332 | 333 | sprintf(name, "%s/%04d-U.dat", directory, iter); 334 | FILE * file_U = fopen(name, "w"); 335 | gsl_matrix_fwrite(file_U, m_U); 336 | fclose(file_U); 337 | 338 | sprintf(name, "%s/%04d-V.dat", directory, iter); 339 | FILE * file_V = fopen(name, "w"); 340 | gsl_matrix_fwrite(file_V, m_V); 341 | fclose(file_V); 342 | 343 | if (param->ctr_run && param->theta_opt) { 344 | sprintf(name, "%s/%04d-theta.dat", directory, iter); 345 | FILE * file_theta = fopen(name, "w"); 346 | gsl_matrix_fwrite(file_theta, m_theta); 347 | fclose(file_theta); 348 | 349 | sprintf(name, "%s/%04d-beta.dat", directory, iter); 350 | FILE * file_beta = fopen(name, "w"); 351 | gsl_matrix_fwrite(file_beta, m_beta); 352 | fclose(file_beta); 353 | } 354 | } 355 | } 356 | 357 | // save final results 358 | sprintf(name, "%s/final-U.dat", directory); 359 | FILE * file_U = fopen(name, "w"); 360 | gsl_matrix_fwrite(file_U, m_U); 361 | fclose(file_U); 362 | 363 | sprintf(name, "%s/final-V.dat", directory); 364 | FILE * file_V = fopen(name, "w"); 365 | gsl_matrix_fwrite(file_V, m_V); 366 | fclose(file_V); 367 | 368 | if (param->ctr_run && param->theta_opt) { 369 | sprintf(name, "%s/final-theta.dat", directory); 370 | FILE * file_theta = fopen(name, "w"); 371 | gsl_matrix_fwrite(file_theta, m_theta); 372 | fclose(file_theta); 373 | 374 | sprintf(name, "%s/final-beta.dat", directory); 375 | FILE * file_beta = fopen(name, "w"); 376 | gsl_matrix_fwrite(file_beta, m_beta); 377 | fclose(file_beta); 378 | } 379 | 380 | // free memory 381 | gsl_vector_free(x); 382 | delete [] idx_base; 383 | 
delete [] sel; 384 | 385 | if (param->ctr_run && param->theta_opt) { 386 | gsl_matrix_free(phi); 387 | gsl_matrix_free(log_beta); 388 | gsl_matrix_free(word_ss); 389 | gsl_vector_free(gamma); 390 | } 391 | } 392 | 393 | void c_ctr::learn_map_estimate(const c_data* users, const c_data* items, 394 | const c_corpus* c, const ctr_hyperparameter* param, 395 | const char* directory) { 396 | // init model parameters 397 | printf("\ninitializing the model ...\n"); 398 | init_model(param->ctr_run); 399 | 400 | // filename 401 | char name[500]; 402 | 403 | // start time 404 | time_t start, current; 405 | time(&start); 406 | int elapsed = 0; 407 | 408 | int iter = 0; 409 | double likelihood = -exp(50), likelihood_old; 410 | double converge = 1.0; 411 | 412 | /// create the state log file 413 | sprintf(name, "%s/state.log", directory); 414 | FILE* file = fopen(name, "w"); 415 | fprintf(file, "iter time likelihood converge\n"); 416 | 417 | 418 | /* alloc auxiliary variables */ 419 | gsl_matrix* XX = gsl_matrix_alloc(m_num_factors, m_num_factors); 420 | gsl_matrix* A = gsl_matrix_alloc(m_num_factors, m_num_factors); 421 | gsl_matrix* B = gsl_matrix_alloc(m_num_factors, m_num_factors); 422 | gsl_vector* x = gsl_vector_alloc(m_num_factors); 423 | 424 | gsl_matrix* phi = NULL; 425 | gsl_matrix* word_ss = NULL; 426 | gsl_matrix* log_beta = NULL; 427 | gsl_vector* gamma = NULL; 428 | 429 | if (param->ctr_run && param->theta_opt) { 430 | int max_len = c->max_corpus_length(); 431 | phi = gsl_matrix_calloc(max_len, m_num_factors); 432 | word_ss = gsl_matrix_calloc(m_num_factors, c->m_size_vocab); 433 | log_beta = gsl_matrix_calloc(m_num_factors, c->m_size_vocab); 434 | gsl_matrix_memcpy(log_beta, m_beta); 435 | mtx_log(log_beta); 436 | gamma = gsl_vector_alloc(m_num_factors); 437 | } 438 | 439 | /* tmp variables for indexes */ 440 | int i, j, m, n, l, k; 441 | int* item_ids; 442 | int* user_ids; 443 | 444 | double result; 445 | 446 | /// confidence parameters 447 | double a_minus_b = 
param->a - param->b; 448 | 449 | while ((iter < param->max_iter and converge > 1e-4 ) or iter < min_iter) { 450 | 451 | likelihood_old = likelihood; 452 | likelihood = 0.0; 453 | 454 | // update U 455 | gsl_matrix_set_zero(XX); 456 | for (j = 0; j < m_num_items; j ++) { 457 | m = items->m_vec_len[j]; 458 | if (m>0) { 459 | gsl_vector_const_view v = gsl_matrix_const_row(m_V, j); 460 | gsl_blas_dger(1.0, &v.vector, &v.vector, XX); 461 | } 462 | } 463 | gsl_matrix_scale(XX, param->b); 464 | // this is only for U 465 | gsl_matrix_add_diagonal(XX, param->lambda_u); 466 | 467 | for (i = 0; i < m_num_users; i ++) { 468 | item_ids = users->m_vec_data[i]; 469 | n = users->m_vec_len[i]; 470 | if (n > 0) { 471 | // this user has rated some articles 472 | gsl_matrix_memcpy(A, XX); 473 | gsl_vector_set_zero(x); 474 | for (l=0; l < n; l ++) { 475 | j = item_ids[l]; 476 | gsl_vector_const_view v = gsl_matrix_const_row(m_V, j); 477 | gsl_blas_dger(a_minus_b, &v.vector, &v.vector, A); 478 | gsl_blas_daxpy(param->a, &v.vector, x); 479 | } 480 | 481 | gsl_vector_view u = gsl_matrix_row(m_U, i); 482 | matrix_vector_solve(A, x, &(u.vector)); 483 | 484 | // update the likelihood 485 | gsl_blas_ddot(&u.vector, &u.vector, &result); 486 | likelihood += -0.5 * param->lambda_u * result; 487 | } 488 | } 489 | 490 | if (param->lda_regression) break; // one iteration is enough for lda-regression 491 | 492 | // update V 493 | if (param->ctr_run && param->theta_opt) gsl_matrix_set_zero(word_ss); 494 | 495 | gsl_matrix_set_zero(XX); 496 | for (i = 0; i < m_num_users; i ++) { 497 | n = users->m_vec_len[i]; 498 | if (n>0) { 499 | gsl_vector_const_view u = gsl_matrix_const_row(m_U, i); 500 | gsl_blas_dger(1.0, &u.vector, &u.vector, XX); 501 | } 502 | } 503 | gsl_matrix_scale(XX, param->b); 504 | 505 | for (j = 0; j < m_num_items; j ++) { 506 | gsl_vector_view v = gsl_matrix_row(m_V, j); 507 | gsl_vector_view theta_v = gsl_matrix_row(m_theta, j); 508 | 509 | user_ids = items->m_vec_data[j]; 510 | m = 
items->m_vec_len[j]; 511 | if (m>0) { 512 | // m > 0, some users have rated this article 513 | gsl_matrix_memcpy(A, XX); 514 | gsl_vector_set_zero(x); 515 | for (l = 0; l < m; l ++) { 516 | i = user_ids[l]; 517 | gsl_vector_const_view u = gsl_matrix_const_row(m_U, i); 518 | gsl_blas_dger(a_minus_b, &u.vector, &u.vector, A); 519 | gsl_blas_daxpy(param->a, &u.vector, x); 520 | } 521 | 522 | // adding the topic vector 523 | // even when ctr_run=0, m_theta=0 524 | gsl_blas_daxpy(param->lambda_v, &theta_v.vector, x); 525 | 526 | gsl_matrix_memcpy(B, A); // save for computing likelihood 527 | 528 | // here different from U update 529 | gsl_matrix_add_diagonal(A, param->lambda_v); 530 | matrix_vector_solve(A, x, &v.vector); 531 | 532 | // update the likelihood for the relevant part 533 | likelihood += -0.5 * m * param->a; 534 | for (l = 0; l < m; l ++) { 535 | i = user_ids[l]; 536 | gsl_vector_const_view u = gsl_matrix_const_row(m_U, i); 537 | gsl_blas_ddot(&u.vector, &v.vector, &result); 538 | likelihood += param->a * result; 539 | } 540 | likelihood += -0.5 * mahalanobis_prod(B, &v.vector, &v.vector); 541 | // likelihood part of theta, even when theta=0, which is a 542 | // special case 543 | gsl_vector_memcpy(x, &v.vector); 544 | gsl_vector_sub(x, &theta_v.vector); 545 | gsl_blas_ddot(x, x, &result); 546 | likelihood += -0.5 * param->lambda_v * result; 547 | 548 | if (param->ctr_run && param->theta_opt) { 549 | const c_document* doc = c->m_docs[j]; 550 | likelihood += doc_inference(doc, &theta_v.vector, log_beta, phi, gamma, word_ss, true); 551 | optimize_simplex(gamma, &v.vector, param->lambda_v, &theta_v.vector); 552 | } 553 | } 554 | else { 555 | // m=0, this article has never been rated 556 | if (param->ctr_run && param->theta_opt) { 557 | const c_document* doc = c->m_docs[j]; 558 | doc_inference(doc, &theta_v.vector, log_beta, phi, gamma, word_ss, false); 559 | vnormalize(gamma); 560 | gsl_vector_memcpy(&theta_v.vector, gamma); 561 | } 562 | } 563 | } 564 | 565 | 
// update beta if needed 566 | if (param->ctr_run && param->theta_opt) { 567 | gsl_matrix_memcpy(m_beta, word_ss); 568 | for (k = 0; k < m_num_factors; k ++) { 569 | gsl_vector_view row = gsl_matrix_row(m_beta, k); 570 | vnormalize(&row.vector); 571 | } 572 | gsl_matrix_memcpy(log_beta, m_beta); 573 | mtx_log(log_beta); 574 | } 575 | 576 | time(¤t); 577 | elapsed = (int)difftime(current, start); 578 | 579 | iter++; 580 | converge = fabs((likelihood-likelihood_old)/likelihood_old); 581 | 582 | if (likelihood < likelihood_old) printf("likelihood is decreasing!\n"); 583 | 584 | fprintf(file, "%04d %06d %10.5f %.10f\n", iter, elapsed, likelihood, converge); 585 | fflush(file); 586 | printf("iter=%04d, time=%06d, likelihood=%.5f, converge=%.10f\n", iter, elapsed, likelihood, converge); 587 | 588 | // save intermediate results 589 | if (iter % param->save_lag == 0) { 590 | 591 | sprintf(name, "%s/%04d-U.dat", directory, iter); 592 | FILE * file_U = fopen(name, "w"); 593 | mtx_fprintf(file_U, m_U); 594 | fclose(file_U); 595 | 596 | sprintf(name, "%s/%04d-V.dat", directory, iter); 597 | FILE * file_V = fopen(name, "w"); 598 | mtx_fprintf(file_V, m_V); 599 | fclose(file_V); 600 | 601 | if (param->ctr_run) { 602 | sprintf(name, "%s/%04d-theta.dat", directory, iter); 603 | FILE * file_theta = fopen(name, "w"); 604 | mtx_fprintf(file_theta, m_theta); 605 | fclose(file_theta); 606 | 607 | sprintf(name, "%s/%04d-beta.dat", directory, iter); 608 | FILE * file_beta = fopen(name, "w"); 609 | mtx_fprintf(file_beta, m_beta); 610 | fclose(file_beta); 611 | } 612 | } 613 | } 614 | 615 | // save final results 616 | sprintf(name, "%s/final-U.dat", directory); 617 | FILE * file_U = fopen(name, "w"); 618 | mtx_fprintf(file_U, m_U); 619 | fclose(file_U); 620 | 621 | sprintf(name, "%s/final-V.dat", directory); 622 | FILE * file_V = fopen(name, "w"); 623 | mtx_fprintf(file_V, m_V); 624 | fclose(file_V); 625 | 626 | if (param->ctr_run) { 627 | sprintf(name, "%s/final-theta.dat", directory); 
628 | FILE * file_theta = fopen(name, "w"); 629 | mtx_fprintf(file_theta, m_theta); 630 | fclose(file_theta); 631 | 632 | sprintf(name, "%s/final-beta.dat", directory); 633 | FILE * file_beta = fopen(name, "w"); 634 | mtx_fprintf(file_beta, m_beta); 635 | fclose(file_beta); 636 | } 637 | 638 | // free memory 639 | gsl_matrix_free(XX); 640 | gsl_matrix_free(A); 641 | gsl_matrix_free(B); 642 | gsl_vector_free(x); 643 | 644 | if (param->ctr_run && param->theta_opt) { 645 | gsl_matrix_free(phi); 646 | gsl_matrix_free(log_beta); 647 | gsl_matrix_free(word_ss); 648 | gsl_vector_free(gamma); 649 | } 650 | } 651 | 652 | double c_ctr::doc_inference(const c_document* doc, const gsl_vector* theta_v, 653 | const gsl_matrix* log_beta, gsl_matrix* phi, 654 | gsl_vector* gamma, gsl_matrix* word_ss, 655 | bool update_word_ss) { 656 | 657 | double pseudo_count = 1.0; 658 | double likelihood = 0; 659 | gsl_vector* log_theta_v = gsl_vector_alloc(theta_v->size); 660 | gsl_vector_memcpy(log_theta_v, theta_v); 661 | vct_log(log_theta_v); 662 | 663 | int n, k, w; 664 | double x; 665 | for (n = 0; n < doc->m_length; n ++) { 666 | w = doc->m_words[n]; 667 | for (k = 0; k < m_num_factors; k ++) 668 | mset(phi, n, k, vget(theta_v, k) * mget(m_beta, k, w)); 669 | 670 | gsl_vector_view row = gsl_matrix_row(phi, n); 671 | vnormalize(&row.vector); 672 | 673 | for (k = 0; k < m_num_factors; k ++) { 674 | x = mget(phi, n, k); 675 | if (x > 0) 676 | likelihood += x*(vget(log_theta_v, k) + mget(log_beta, k, w) - log(x)); 677 | } 678 | } 679 | 680 | if (pseudo_count > 0) { 681 | likelihood += pseudo_count * vsum(log_theta_v); 682 | } 683 | 684 | gsl_vector_set_all(gamma, pseudo_count); // smoothing with small pseudo counts 685 | for (n = 0; n < doc->m_length; n ++) { 686 | for (k = 0; k < m_num_factors; k ++) { 687 | x = doc->m_counts[n] * mget(phi, n, k); 688 | vinc(gamma, k, x); 689 | if (update_word_ss) minc(word_ss, k, doc->m_words[n], x); 690 | } 691 | } 692 | 693 | gsl_vector_free(log_theta_v); 
  return likelihood;
}
-------------------------------------------------------------------------------- /ctr.h: --------------------------------------------------------------------------------
// class for ctr
//
#ifndef CTR_H
#define CTR_H

#include "utils.h"
#include "corpus.h"
#include "data.h"

// Hyper-parameters and run configuration for CTR training.
// Field meanings follow the command-line usage text in main.cpp.
struct ctr_hyperparameter {
  double a;              // confidence weight for observed (positive) entries
  double b;              // confidence weight for unobserved entries (b < a)
  double lambda_u;       // L2 regularizer on user vectors
  double lambda_v;       // regularizer pulling item vectors toward theta
  double learning_rate;  // > 0 selects the stochastic learner; <= 0 the batch one
  double alpha_smooth;   // smoothing added to initial topic proportions
  int random_seed;
  int max_iter;          // maximum number of training iterations
  int save_lag;          // save every save_lag iterations; -1 = no intermediate saves
  int theta_opt;         // 1 = optimize theta during training
  int ctr_run;           // 1 = CTR (topics available), 0 = plain matrix factorization
  int lda_regression;    // 1 = run lda regression mode

  // Set all fields at once; parameter order mirrors the field declarations.
  void set(double aa, double bb,
           double lu, double lv,
           double lr, double as,
           int rs, int mi, int sl,
           int to, int cr, int lda_r) {
    a = aa; b = bb;
    lambda_u = lu; lambda_v = lv;
    learning_rate = lr;
    alpha_smooth = as;
    random_seed = rs; max_iter = mi;
    save_lag = sl; theta_opt = to;
    ctr_run = cr; lda_regression = lda_r;
  }

  // Write the settings to a text file (overwrites any existing file).
  // NOTE(review): fopen result is unchecked -- a bad path crashes on fprintf.
  void save(char* filename) {
    FILE * file = fopen(filename, "w");
    fprintf(file, "a = %.4f\n", a);
    fprintf(file, "b = %.4f\n", b);
    fprintf(file, "lambda_u = %.4f\n", lambda_u);
    fprintf(file, "lambda_v = %.4f\n", lambda_v);
    fprintf(file, "learning_rate = %.6f\n", learning_rate);
    fprintf(file, "alpha_smooth = %.6f\n", alpha_smooth);
    fprintf(file, "random seed = %d\n", (int)random_seed);
    fprintf(file, "max iter = %d\n", max_iter);
    fprintf(file, "save lag = %d\n", save_lag);
    fprintf(file, "theta opt = %d\n", theta_opt);
    fprintf(file, "ctr run = %d\n", ctr_run);
    fprintf(file, "lda_regression = %d\n", lda_regression);
    fclose(file);
  }
};

// Collaborative topic regression model: topic matrices (beta, theta) plus
// latent factor matrices (U for users, V for items).
class c_ctr {
public:
  c_ctr();
  ~c_ctr();
  // Load initial theta/beta from LDA output files, smoothing theta with
  // alpha_smooth.
  void read_init_information(const char* theta_init_path,
                             const char* beta_init_path,
                             const c_corpus* c, double alpha_smooth);

  void set_model_parameters(int
num_factors, 65 | int num_users, 66 | int num_items); 67 | 68 | void learn_map_estimate(const c_data* users, const c_data* items, 69 | const c_corpus* c, const ctr_hyperparameter* param, 70 | const char* directory); 71 | 72 | void stochastic_learn_map_estimate(const c_data* users, const c_data* items, 73 | const c_corpus* c, const ctr_hyperparameter* param, 74 | const char* directory); 75 | 76 | void init_model(int ctr_run); 77 | 78 | double doc_inference(const c_document* doc, const gsl_vector* theta_v, 79 | const gsl_matrix* log_beta, gsl_matrix* phi, 80 | gsl_vector* gamma, gsl_matrix* word_ss, 81 | bool update_word_ss); 82 | public: 83 | gsl_matrix* m_beta; 84 | gsl_matrix* m_theta; 85 | 86 | gsl_matrix* m_U; 87 | gsl_matrix* m_V; 88 | 89 | int m_num_factors; // m_num_topics 90 | int m_num_items; // m_num_docs 91 | int m_num_users; // num of users 92 | }; 93 | 94 | #endif // CTR_H 95 | -------------------------------------------------------------------------------- /ctr.submit.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | #root_path=../data/arxiv/cv 3 | #num_factors=100 4 | 5 | #for i in `seq 1 5` 6 | #do 7 | ## ./qsub.sh ./ctr --directory $root_path/cv-cf-$i --user $root_path/cf-train-$i-users.dat --item \ 8 | ## $root_path/cf-train-$i-items.dat --a 1 --b 0.01 --lambda_u 0.01 --lambda_v 0.01 \ 9 | ## --random_seed 33333 --num_factors $num_factors --save_lag 20 10 | # 11 | # for type in ofm cf 12 | # do 13 | # ./qsub.sh ./ctr --directory $root_path/cv-ctr-$i-$type --user $root_path/$type-train-$i-users.dat --item \ 14 | # $root_path/$type-train-$i-items.dat --a 1 --b 0.01 --lambda_u 0.01 --lambda_v 100 \ 15 | # --mult $root_path/mult.dat --theta_init $root_path/theta-vector.dat \ 16 | # --beta_init $root_path/final.beta --num_factors $num_factors --save_lag 20 --theta_opt 17 | # done 18 | # 19 | #done 20 | 21 | #for i in 0 1 2 3 4 22 | #do 23 | # for K in 200 24 | # do 25 | # for lambda in 0.01 0.1 1 10 100 1000 5000 26 | # do 27 | # ./qsub.sh ./ctr --directory ../data/citeulike/data/cv-in-matrix/cf-fold-$i-K-$K-lambda-$lambda \ 28 | # --user ../data/citeulike/data/cv-in-matrix/fold-$i-users.train \ 29 | # --item ../data/citeulike/data/cv-in-matrix/fold-$i-items.train \ 30 | # --lambda_u 0.01 --lambda_v $lambda --num_factors $K --save_lag 50 \ 31 | # --max_iter 100 32 | # 33 | # ./qsub.sh ./ctr --directory ../data/citeulike/data/cv-in-matrix/ctr-fold-$i-K-$K-lambda-$lambda \ 34 | # --user ../data/citeulike/data/cv-in-matrix/fold-$i-users.train \ 35 | # --item ../data/citeulike/data/cv-in-matrix/fold-$i-items.train \ 36 | # --lambda_u 0.01 --lambda_v $lambda --num_factors $K \ 37 | # --mult ../data/citeulike/data/mult.dat \ 38 | # --theta_init ../data/citeulike/data/lda-$K/final.doc.states \ 39 | # --beta_init ../data/citeulike/data/lda-$K/final.topics --num_factors $K \ 40 | # --save_lag 50 --alpha_smooth 1 --max_iter 100 --theta_opt 41 | # 42 | # ./qsub.sh ./ctr --directory ../data/citeulike/data/cv-out-of-matrix/ctr-fold-$i-K-$K-lambda-$lambda \ 43 | # --user 
../data/citeulike/data/cv-out-of-matrix/fold-$i-users.train \ 44 | # --item ../data/citeulike/data/cv-out-of-matrix/fold-$i-items.train \ 45 | # --lambda_u 0.01 --lambda_v $lambda --num_factors $K \ 46 | # --mult ../data/citeulike/data/mult.dat \ 47 | # --theta_init ../data/citeulike/data/lda-$K/final.doc.states \ 48 | # --beta_init ../data/citeulike/data/lda-$K/final.topics --num_factors $K \ 49 | # --save_lag 50 --alpha_smooth 1 --max_iter 100 --theta_opt 50 | # 51 | # if [ "$lambda" == 10 ]; then 52 | # ./qsub.sh ./ctr --directory ../data/citeulike/data/cv-in-matrix/lda-fold-$i-K-$K-lambda-$lambda \ 53 | # --user ../data/citeulike/data/cv-in-matrix/fold-$i-users.train \ 54 | # --item ../data/citeulike/data/cv-in-matrix/fold-$i-items.train \ 55 | # --lambda_u 0.01 --lambda_v $lambda --num_factors $K \ 56 | # --mult ../data/citeulike/data/mult.dat \ 57 | # --theta_init ../data/citeulike/data/lda-$K/final.doc.states \ 58 | # --beta_init ../data/citeulike/data/lda-$K/final.topics --num_factors $K \ 59 | # --save_lag 50 --alpha_smooth 1 --max_iter 100 --lda_regression 60 | # 61 | # ./qsub.sh ./ctr --directory ../data/citeulike/data/cv-out-of-matrix/lda-fold-$i-K-$K-lambda-$lambda \ 62 | # --user ../data/citeulike/data/cv-out-of-matrix/fold-$i-users.train \ 63 | # --item ../data/citeulike/data/cv-out-of-matrix/fold-$i-items.train \ 64 | # --lambda_u 0.01 --lambda_v $lambda --num_factors $K \ 65 | # --mult ../data/citeulike/data/mult.dat \ 66 | # --theta_init ../data/citeulike/data/lda-$K/final.doc.states \ 67 | # --beta_init ../data/citeulike/data/lda-$K/final.topics --num_factors $K \ 68 | # --save_lag 50 --alpha_smooth 1 --max_iter 100 --lda_regression 69 | # fi 70 | # done 71 | # done 72 | #done 73 | 74 | rootpath=../data/mendeley/ 75 | for i in 0 1 2 3 4 76 | do 77 | for K in 500 78 | do 79 | for lambda in 10000 20000 50000 80 | do 81 | condor_run "./ctr-condor --directory $rootpath/condor-result/cv-in-matrix/ctr-fold-$i-K-$K-lambda-$lambda \ 82 | --user 
$rootpath/cv-in-matrix/fold-$i-users.train \ 83 | --item $rootpath/cv-in-matrix/fold-$i-items.train \ 84 | --lambda_u 0.01 --lambda_v $lambda --num_factors $K \ 85 | --mult $rootpath/mult.dat \ 86 | --theta_init $rootpath/lda-$K/final.doc.states \ 87 | --beta_init $rootpath/lda-$K/final.topics --num_factors $K --save_lag 2000 \ 88 | --learning_rate 0.002 --random_seed 939384 --max_iter 400 --alpha_smooth 0.1" >> logs/ctr-in-$i-${lambda}.out & 89 | 90 | condor_run "./ctr-condor --directory $rootpath/condor-result/cv-out-of-matrix/ctr-fold-$i-K-$K-lambda-$lambda \ 91 | --user $rootpath/cv-out-of-matrix/fold-$i-users.train \ 92 | --item $rootpath/cv-out-of-matrix/fold-$i-items.train \ 93 | --lambda_u 0.01 --lambda_v $lambda --num_factors $K \ 94 | --mult $rootpath/mult.dat \ 95 | --theta_init $rootpath/lda-$K/final.doc.states \ 96 | --beta_init $rootpath/lda-$K/final.topics --num_factors $K --save_lag 2000 \ 97 | --learning_rate 0.002 --random_seed 939384 --max_iter 400 --alpha_smooth 0.1" >> logs/ctr-out-$i-${lambda}.out & 98 | done 99 | done 100 | done 101 | -------------------------------------------------------------------------------- /data.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "data.h" 4 | 5 | c_data::c_data() { 6 | } 7 | 8 | c_data::~c_data() { 9 | for (size_t i = 0; i < m_vec_data.size(); i ++) { 10 | int* ids = m_vec_data[i]; 11 | if (ids != NULL) delete [] ids; 12 | } 13 | m_vec_data.clear(); 14 | m_vec_len.clear(); 15 | } 16 | 17 | void c_data::read_data(const char * data_filename, int OFFSET) { 18 | 19 | int length = 0, n = 0, id = 0, total = 0; 20 | 21 | FILE * fileptr; 22 | fileptr = fopen(data_filename, "r"); 23 | 24 | while ((fscanf(fileptr, "%10d", &length) != EOF)) { 25 | int * ids = NULL; 26 | if (length > 0) { 27 | ids = new int[length]; 28 | for (n = 0; n < length; n++) { 29 | fscanf(fileptr, "%10d", &id); 30 | ids[n] = id - OFFSET; 31 | } 32 | } 33 | 
m_vec_data.push_back(ids); 34 | m_vec_len.push_back(length); 35 | total += length; 36 | } 37 | fclose(fileptr); 38 | printf("read %d vectors with %d entries ...\n", (int)m_vec_len.size(), total); 39 | } 40 | 41 | -------------------------------------------------------------------------------- /data.h: -------------------------------------------------------------------------------- 1 | // class for reading the sparse matrix data 2 | // for both user matrix and item matrix 3 | // user matrix: 4 | // number_of_items item1 item2 ... 5 | // item matrix: 6 | // number_of_users user1 user2 ... 7 | 8 | #ifndef DATA_H 9 | #define DATA_H 10 | 11 | #include 12 | 13 | using namespace std; 14 | 15 | class c_data { 16 | public: 17 | c_data(); 18 | ~c_data(); 19 | void read_data(const char * data_filename, int OFFSET=0); 20 | public: 21 | vector m_vec_data; 22 | vector m_vec_len; 23 | }; 24 | 25 | #endif // DATA_H 26 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "ctr.h" 5 | 6 | gsl_rng * RANDOM_NUMBER = NULL; 7 | 8 | void print_usage_and_exit() { 9 | // print usage information 10 | printf("*********************************collaborative topic models for recommendations************************\n"); 11 | printf("Authors: Chong Wang, chongw@cs.princeton.edu, Computer Science Department, Princeton University.\n"); 12 | printf("usage:\n"); 13 | printf(" ctr [options]\n"); 14 | printf(" --help: print help information\n"); 15 | 16 | printf("\n"); 17 | printf(" --directory: save directory, required\n"); 18 | 19 | printf("\n"); 20 | printf(" --user: user file, required\n"); 21 | printf(" --item: item file, required\n"); 22 | printf(" --a: positive item weight, default 1\n"); 23 | printf(" --b: negative item weight, default 0.01 (b < a)\n"); 24 | printf(" --lambda_u: user vector regularizer, default 0.01\n"); 25 
| printf(" --lambda_v: item vector regularizer, default 100\n"); 26 | printf(" --learning_rate: stochastic version for large datasets, default -1. Stochastic learning will be called when > 0.\n"); 27 | printf(" --alpha_smooth: alpha smooth, default [0.0]\n"); 28 | printf("\n"); 29 | 30 | printf(" --random_seed: the random seed, default from the current time\n"); 31 | printf(" --save_lag: the saving lag, default 20 (-1 means no savings for intermediate results)\n"); 32 | printf(" --max_iter: the max number of iterations, default 200\n"); 33 | printf("\n"); 34 | 35 | printf(" --num_factors: the number of factors, default 200\n"); 36 | printf(" --mult: mult file, in lda-c format, optional, if not provided, it's the matrix factorization\n"); 37 | printf(" --theta_init: topic proportions file from lda, optional (required if mult file is provided)\n"); 38 | printf(" --beta_init: topic distributions file from lda, optional (required if mult file is provided)\n"); 39 | printf(" --theta_opt: optimize theta or not, optional, default not\n"); 40 | printf(" --lda_regression: run lda regression, default not\n"); 41 | 42 | printf("*******************************************************************************************************\n"); 43 | 44 | exit(0); 45 | } 46 | 47 | int main(int argc, char* argv[]) { 48 | if (argc < 2) print_usage_and_exit(); 49 | 50 | char filename[500]; 51 | int theta_opt = 0; 52 | int lda_regression = 0; 53 | 54 | const char* const short_options = "hd:x:i:a:b:u:v:r:s:m:k:t:e:y:z:w:"; 55 | const struct option long_options[] = { 56 | {"help", no_argument, NULL, 'h'}, 57 | {"directory", required_argument, NULL, 'd'}, 58 | {"user", required_argument, NULL, 'x'}, 59 | {"item", required_argument, NULL, 'i'}, 60 | {"a", required_argument, NULL, 'a'}, 61 | {"b", required_argument, NULL, 'b'}, 62 | {"lambda_u", required_argument, NULL, 'u'}, 63 | {"lambda_v", required_argument, NULL, 'v'}, 64 | {"random_seed", required_argument, NULL, 'r'}, 65 | {"save_lag", 
required_argument, NULL, 's'}, 66 | {"max_iter", required_argument, NULL, 'm'}, 67 | {"num_factors", required_argument, NULL, 'k'}, 68 | {"mult", required_argument, NULL, 't'}, 69 | {"theta_init", required_argument, NULL, 'e'}, 70 | {"beta_init", required_argument, NULL, 'y'}, 71 | {"learning_rate", required_argument, NULL, 'z'}, 72 | {"alpha_smooth", required_argument, NULL, 'w'}, 73 | {"theta_opt", no_argument, &theta_opt, 1}, 74 | {"lda_regression",no_argument, &lda_regression, 1}, 75 | {NULL, 0, NULL, 0}}; 76 | 77 | char* directory = NULL; 78 | 79 | char* user_path = NULL; 80 | char* item_path = NULL; 81 | double a = 1.0; 82 | double b = 0.01; 83 | double lambda_u = 0.01; 84 | double lambda_v = 100; 85 | double learning_rate = -1; 86 | double alpha_smooth = 0.0; 87 | 88 | time_t t; time(&t); 89 | long random_seed = (long) t; 90 | int save_lag = 20; 91 | int max_iter = 200; 92 | 93 | int num_factors = 200; 94 | char* mult_path = NULL; 95 | char* theta_init_path = NULL; 96 | char* beta_init_path = NULL; 97 | 98 | int cc = 0; 99 | while(true) { 100 | cc = getopt_long(argc, argv, short_options, long_options, NULL); 101 | switch(cc) { 102 | case 'h': 103 | print_usage_and_exit(); 104 | break; 105 | case 'd': 106 | directory = optarg; 107 | break; 108 | case 'x': 109 | user_path = optarg; 110 | break; 111 | case 'i': 112 | item_path = optarg; 113 | break; 114 | case 'a': 115 | a = atof(optarg); 116 | break; 117 | case 'b': 118 | b = atof(optarg); 119 | break; 120 | case 'u': 121 | lambda_u = atof(optarg); 122 | break; 123 | case 'v': 124 | lambda_v = atof(optarg); 125 | break; 126 | case 'z': 127 | learning_rate = atof(optarg); 128 | break; 129 | case 'w': 130 | alpha_smooth = atof(optarg); 131 | break; 132 | case 'r': 133 | random_seed = atoi(optarg); 134 | break; 135 | case 's': 136 | save_lag = atoi(optarg); 137 | break; 138 | case 'm': 139 | max_iter = atoi(optarg); 140 | break; 141 | case 'k': 142 | num_factors = atoi(optarg); 143 | break; 144 | case 't': 145 | 
mult_path = optarg; 146 | break; 147 | case 'e': 148 | theta_init_path = optarg; 149 | break; 150 | case 'y': 151 | beta_init_path = optarg; 152 | break; 153 | case -1: 154 | break; 155 | case '?': 156 | print_usage_and_exit(); 157 | break; 158 | default: 159 | break; 160 | } 161 | if (cc == -1) 162 | break; 163 | } 164 | 165 | /// print information 166 | printf("\n************************************************************************************************\n"); 167 | 168 | if (!dir_exists(directory)) make_directory(directory); 169 | printf("result directory: %s\n", directory); 170 | 171 | if (!file_exists(user_path)) { 172 | printf("user file %s doesn't exist! quit ...\n", user_path); 173 | exit(-1); 174 | } 175 | printf("user file: %s\n", user_path); 176 | 177 | if (!file_exists(item_path)) { 178 | printf("item file %s doesn't exist! quit ...\n", item_path); 179 | exit(-1); 180 | } 181 | printf("item file: %s\n", item_path); 182 | 183 | printf("a: %.4f\n", a); 184 | printf("b: %.4f\n", b); 185 | printf("lambda_u: %.4f\n", lambda_u); 186 | printf("lambda_v: %.4f\n", lambda_v); 187 | printf("learning_rate: %.5f\n", learning_rate); 188 | printf("alpha_smooth: %.5f\n", alpha_smooth); 189 | printf("random seed: %d\n", (int)random_seed); 190 | printf("save lag: %d\n", save_lag); 191 | printf("max iter: %d\n", max_iter); 192 | printf("number of factors: %d\n", num_factors); 193 | 194 | if (mult_path != NULL) { 195 | if (!file_exists(item_path)) { 196 | printf("mult file %s doesn't exist! quit ...\n", mult_path); 197 | exit(-1); 198 | } 199 | printf("mult file: %s\n", mult_path); 200 | 201 | if (theta_init_path == NULL) { 202 | printf("topic proportions file must be provided ...\n"); 203 | exit(-1); 204 | } 205 | if (!file_exists(theta_init_path)) { 206 | printf("topic proportions file %s doesn't exist! 
quit ...\n", theta_init_path); 207 | exit(-1); 208 | } 209 | printf("topic proportions file: %s\n", theta_init_path); 210 | 211 | if (beta_init_path == NULL) { 212 | printf("topic distributions file must be provided ...\n"); 213 | exit(-1); 214 | } 215 | if (!file_exists(beta_init_path)) { 216 | printf("topic distributions file %s doesn't exist! quit ...\n", beta_init_path); 217 | exit(-1); 218 | } 219 | printf("topic distributions file: %s\n", beta_init_path); 220 | if (theta_opt) printf("theta optimization: True\n"); 221 | else printf("theta optimization: false\n"); 222 | } 223 | else if (theta_opt) { 224 | printf("theta optimization: false"); 225 | printf("(theta_opt has no effect, back to default value: false)\n"); 226 | theta_opt = 0; 227 | } 228 | 229 | printf("\n"); 230 | 231 | /// save the settings 232 | int ctr_run = 1; 233 | if (mult_path == NULL) ctr_run = 0; 234 | ctr_hyperparameter ctr_param; 235 | ctr_param.set(a, b, lambda_u, lambda_v, learning_rate, alpha_smooth, 236 | random_seed, max_iter, save_lag, theta_opt, ctr_run, lda_regression); 237 | sprintf(filename, "%s/settings.txt", directory); 238 | ctr_param.save(filename); 239 | 240 | /// init random numbe generator 241 | RANDOM_NUMBER = new_random_number_generator(random_seed); 242 | 243 | // read users 244 | printf("reading user matrix from %s ...\n", user_path); 245 | c_data* users = new c_data(); 246 | users->read_data(user_path); 247 | int num_users = (int)users->m_vec_data.size(); 248 | 249 | // read items 250 | printf("reading item matrix from %s ...\n", item_path); 251 | c_data* items = new c_data(); 252 | items->read_data(item_path); 253 | int num_items = (int)items->m_vec_data.size(); 254 | 255 | // create model instance 256 | c_ctr* ctr = new c_ctr(); 257 | ctr->set_model_parameters(num_factors, num_users, num_items); 258 | 259 | c_corpus* c = NULL; 260 | if (mult_path != NULL) { 261 | // read word data 262 | c = new c_corpus(); 263 | c->read_data(mult_path); 264 | 
ctr->read_init_information(theta_init_path, beta_init_path, c, alpha_smooth); 265 | } 266 | 267 | if (learning_rate <= 0) { 268 | ctr->learn_map_estimate(users, items, c, &ctr_param, directory); 269 | } else { 270 | ctr->stochastic_learn_map_estimate(users, items, c, &ctr_param, directory); 271 | } 272 | 273 | free_random_number_generator(RANDOM_NUMBER); 274 | if (c != NULL) delete c; 275 | 276 | delete ctr; 277 | delete users; 278 | delete items; 279 | return 0; 280 | } 281 | -------------------------------------------------------------------------------- /opt.cpp: -------------------------------------------------------------------------------- 1 | #include "opt.h" 2 | #include "utils.h" 3 | #include 4 | 5 | // projection gradient algorithm 6 | void optimize_simplex(const gsl_vector* gamma, const gsl_vector* v, double lambda, gsl_vector* opt_x) { 7 | size_t size = opt_x->size; 8 | gsl_vector* g = gsl_vector_alloc(size); 9 | gsl_vector* x_bar = gsl_vector_alloc(size); 10 | gsl_vector* opt_x_old = gsl_vector_alloc(size); 11 | gsl_vector_memcpy(opt_x_old, opt_x); // save the old value 12 | double f_old = f_simplex(gamma, v, lambda, opt_x); 13 | //printf("f_old: %0.10f -> ", f_old); 14 | 15 | df_simplex(gamma, v, lambda, opt_x, g); 16 | double ab_sum = gsl_blas_dasum(g); 17 | if (ab_sum > 1.0) gsl_vector_scale(g, 1.0/ab_sum); // rescale the gradient 18 | 19 | gsl_blas_daxpy(-1, g, opt_x); 20 | simplex_projection(opt_x, x_bar); 21 | gsl_vector_sub(x_bar, opt_x_old); 22 | double r = 0; 23 | gsl_blas_ddot(g, x_bar, &r); 24 | r *= 0.5; 25 | 26 | double beta = 0.5; 27 | double f_new; 28 | double t = beta; 29 | int iter = 0; 30 | while(++iter < 100) { 31 | gsl_vector_memcpy(opt_x, opt_x_old); 32 | gsl_blas_daxpy(t, x_bar, opt_x); 33 | 34 | f_new = f_simplex(gamma, v, lambda, opt_x); 35 | if (f_new > f_old + r * t) t = t * beta; 36 | else break; 37 | } 38 | //printf("f_new %0.10f\n", f_new); 39 | 40 | if (!is_feasible(opt_x)) printf("sth is wrong, not feasible. 
you've got to check it ...\n"); 41 | 42 | gsl_vector_free(g); 43 | gsl_vector_free(opt_x_old); 44 | gsl_vector_free(x_bar); 45 | } 46 | 47 | double f_simplex(const gsl_vector* gamma, const gsl_vector* v, 48 | double lambda, const gsl_vector* opt_x) { 49 | double f = 0.0, val; 50 | 51 | gsl_vector* y = gsl_vector_alloc(opt_x->size); 52 | gsl_vector_memcpy(y, opt_x); 53 | vct_log(y); 54 | 55 | gsl_blas_ddot(y, gamma, &f); 56 | 57 | gsl_vector_memcpy(y, v); 58 | gsl_vector_sub(y, opt_x); 59 | gsl_blas_ddot(y, y, &val); 60 | f -= 0.5 * lambda * val; 61 | gsl_vector_free(y); 62 | 63 | return -f; 64 | } 65 | 66 | void df_simplex(const gsl_vector* gamma, const gsl_vector* v, 67 | double lambda, const gsl_vector* opt_x, 68 | gsl_vector* g) { 69 | gsl_vector_memcpy(g, opt_x); 70 | gsl_vector_sub(g, v); 71 | gsl_vector_scale(g, -lambda); 72 | 73 | gsl_vector* y = gsl_vector_alloc(opt_x->size); 74 | gsl_vector_memcpy(y, gamma); 75 | gsl_vector_div(y, opt_x); 76 | gsl_vector_add(g, y); 77 | gsl_vector_scale(g, -1.0); 78 | gsl_vector_free(y); 79 | } 80 | 81 | bool is_feasible(const gsl_vector* x) { 82 | double val; 83 | double sum = 0; 84 | for (size_t i = 0; i < x->size-1; i ++) { 85 | val = vget(x, i); 86 | if (val < 0 || val >1) return false; 87 | sum += val; 88 | if (sum > 1) return false; 89 | } 90 | return true; 91 | } 92 | 93 | // project x on to simplex (using // http://www.cs.berkeley.edu/~jduchi/projects/DuchiShSiCh08.pdf) 94 | void simplex_projection(const gsl_vector* x, gsl_vector* x_proj, double z) { 95 | gsl_vector_memcpy(x_proj, x); 96 | gsl_sort_vector(x_proj); 97 | double cumsum = -z, u; 98 | int j = 0; 99 | int i; // this has to be int, not size_t 100 | for (i = (int)x->size-1; i >= 0; i --) { 101 | u = vget(x_proj, i); 102 | cumsum += u; 103 | if (u > cumsum/(j+1)) j++; 104 | else break; 105 | } 106 | double theta = cumsum/j; 107 | for (i = 0; i < (int)x->size; i ++) { 108 | u = vget(x, i)-theta; 109 | if (u <= 0) u = 0.0; 110 | vset(x_proj, i, u); 111 | } 
112 | vnormalize(x_proj); // fix the normaliztion issue due to numerical errors 113 | } 114 | -------------------------------------------------------------------------------- /opt.h: -------------------------------------------------------------------------------- 1 | #ifndef OPT_H 2 | #define OPT_H 3 | #include 4 | 5 | void optimize_simplex(const gsl_vector* gamma, const gsl_vector* v, double lambda, gsl_vector* opt_x); 6 | double f_simplex(const gsl_vector* gamma, const gsl_vector* v, double lambda, const gsl_vector* opt_x); 7 | void df_simplex(const gsl_vector* gamma, const gsl_vector* v, double lambda, const gsl_vector* opt_x, gsl_vector* g); 8 | bool is_feasible(const gsl_vector* x); 9 | void simplex_projection(const gsl_vector* x, gsl_vector* x_proj, double z=1.0); 10 | 11 | #endif // OPT_H 12 | 13 | -------------------------------------------------------------------------------- /qsub.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | echo cd `pwd` \; "$@" | qsub -h -l mem=4gb,walltime=48:00:00 3 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | root_path=../cv-c-code 3 | num_factors=200 4 | #root_path=../data/arxiv/cv 5 | #num_factors=100 6 | 7 | for i in `seq 1 5` 8 | do 9 | # ./qsub.sh ./ctr --directory $root_path/cv-cf-$i --user $root_path/cf-train-$i-users.dat --item \ 10 | # $root_path/cf-train-$i-items.dat --a 1 --b 0.01 --lambda_u 0.01 --lambda_v 0.01 \ 11 | # --random_seed 33333 --num_factors $num_factors --save_lag 20 12 | 13 | for type in ofm cf 14 | do 15 | ./qsub.sh ./ctr --directory $root_path/cv-ctr-$i-$type --user $root_path/$type-train-$i-users.dat --item \ 16 | $root_path/$type-train-$i-items.dat --a 1 --b 0.01 --lambda_u 0.01 --lambda_v 100 \ 17 | --mult $root_path/mult.dat --theta_init $root_path/theta-vector.dat \ 18 | --beta_init $root_path/final.beta --num_factors $num_factors --save_lag 20 --theta_opt 19 | done 20 | 21 | done 22 | -------------------------------------------------------------------------------- /script.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | #root_path=../data/arxiv/cv 3 | #num_factors=100 4 | 5 | #for i in `seq 1 5` 6 | #do 7 | ## ./qsub.sh ./ctr --directory $root_path/cv-cf-$i --user $root_path/cf-train-$i-users.dat --item \ 8 | ## $root_path/cf-train-$i-items.dat --a 1 --b 0.01 --lambda_u 0.01 --lambda_v 0.01 \ 9 | ## --random_seed 33333 --num_factors $num_factors --save_lag 20 10 | # 11 | # for type in ofm cf 12 | # do 13 | # ./qsub.sh ./ctr --directory $root_path/cv-ctr-$i-$type --user $root_path/$type-train-$i-users.dat --item \ 14 | # $root_path/$type-train-$i-items.dat --a 1 --b 0.01 --lambda_u 0.01 --lambda_v 100 \ 15 | # --mult $root_path/mult.dat --theta_init $root_path/theta-vector.dat \ 16 | # --beta_init $root_path/final.beta --num_factors $num_factors --save_lag 20 --theta_opt 17 | # done 18 | # 19 | #done 20 | 21 | #for i in 0 1 2 3 4 22 | #do 23 | # for K in 200 24 | # do 25 | # for lambda in 0.01 0.1 1 10 100 1000 5000 26 | # do 27 | # ./qsub.sh ./ctr --directory ../data/citeulike/data/cv-in-matrix/cf-fold-$i-K-$K-lambda-$lambda \ 28 | # --user ../data/citeulike/data/cv-in-matrix/fold-$i-users.train \ 29 | # --item ../data/citeulike/data/cv-in-matrix/fold-$i-items.train \ 30 | # --lambda_u 0.01 --lambda_v $lambda --num_factors $K --save_lag 50 \ 31 | # --max_iter 100 32 | # 33 | # ./qsub.sh ./ctr --directory ../data/citeulike/data/cv-in-matrix/ctr-fold-$i-K-$K-lambda-$lambda \ 34 | # --user ../data/citeulike/data/cv-in-matrix/fold-$i-users.train \ 35 | # --item ../data/citeulike/data/cv-in-matrix/fold-$i-items.train \ 36 | # --lambda_u 0.01 --lambda_v $lambda --num_factors $K \ 37 | # --mult ../data/citeulike/data/mult.dat \ 38 | # --theta_init ../data/citeulike/data/lda-$K/final.doc.states \ 39 | # --beta_init ../data/citeulike/data/lda-$K/final.topics --num_factors $K \ 40 | # --save_lag 50 --alpha_smooth 1 --max_iter 100 --theta_opt 41 | # 42 | # ./qsub.sh ./ctr --directory ../data/citeulike/data/cv-out-of-matrix/ctr-fold-$i-K-$K-lambda-$lambda \ 43 | # --user 
../data/citeulike/data/cv-out-of-matrix/fold-$i-users.train \ 44 | # --item ../data/citeulike/data/cv-out-of-matrix/fold-$i-items.train \ 45 | # --lambda_u 0.01 --lambda_v $lambda --num_factors $K \ 46 | # --mult ../data/citeulike/data/mult.dat \ 47 | # --theta_init ../data/citeulike/data/lda-$K/final.doc.states \ 48 | # --beta_init ../data/citeulike/data/lda-$K/final.topics --num_factors $K \ 49 | # --save_lag 50 --alpha_smooth 1 --max_iter 100 --theta_opt 50 | # 51 | # if [ "$lambda" == 10 ]; then 52 | # ./qsub.sh ./ctr --directory ../data/citeulike/data/cv-in-matrix/lda-fold-$i-K-$K-lambda-$lambda \ 53 | # --user ../data/citeulike/data/cv-in-matrix/fold-$i-users.train \ 54 | # --item ../data/citeulike/data/cv-in-matrix/fold-$i-items.train \ 55 | # --lambda_u 0.01 --lambda_v $lambda --num_factors $K \ 56 | # --mult ../data/citeulike/data/mult.dat \ 57 | # --theta_init ../data/citeulike/data/lda-$K/final.doc.states \ 58 | # --beta_init ../data/citeulike/data/lda-$K/final.topics --num_factors $K \ 59 | # --save_lag 50 --alpha_smooth 1 --max_iter 100 --lda_regression 60 | # 61 | # ./qsub.sh ./ctr --directory ../data/citeulike/data/cv-out-of-matrix/lda-fold-$i-K-$K-lambda-$lambda \ 62 | # --user ../data/citeulike/data/cv-out-of-matrix/fold-$i-users.train \ 63 | # --item ../data/citeulike/data/cv-out-of-matrix/fold-$i-items.train \ 64 | # --lambda_u 0.01 --lambda_v $lambda --num_factors $K \ 65 | # --mult ../data/citeulike/data/mult.dat \ 66 | # --theta_init ../data/citeulike/data/lda-$K/final.doc.states \ 67 | # --beta_init ../data/citeulike/data/lda-$K/final.topics --num_factors $K \ 68 | # --save_lag 50 --alpha_smooth 1 --max_iter 100 --lda_regression 69 | # fi 70 | # done 71 | # done 72 | #done 73 | 74 | rootpath=../data/mendeley/ 75 | for i in 0 1 2 3 4 76 | do 77 | for K in 500 78 | do 79 | for lambda in 10000 20000 50000 80 | do 81 | if [ "$lambda" == 100 ]; then 82 | ./qsub.sh ./ctr --directory $rootpath/cv-in-matrix/lda-fold-$i-K-$K-lambda-$lambda \ 83 | 
--user $rootpath/cv-in-matrix/fold-$i-users.train \ 84 | --item $rootpath/cv-in-matrix/fold-$i-items.train \ 85 | --lambda_u 0.01 --lambda_v $lambda --num_factors $K \ 86 | --mult $rootpath/mult.dat \ 87 | --theta_init $rootpath/lda-$K/final.doc.states \ 88 | --beta_init $rootpath/lda-$K/final.topics --num_factors $K --save_lag 2000 \ 89 | --learning_rate 0.002 --random_seed 939384 --max_iter 1000 --lda_regression --alpha_smooth 0.1 90 | 91 | ./qsub.sh ./ctr --directory $rootpath/cv-out-of-matrix/lda-fold-$i-K-$K-lambda-$lambda \ 92 | --user $rootpath/cv-out-of-matrix/fold-$i-users.train \ 93 | --item $rootpath/cv-out-of-matrix/fold-$i-items.train \ 94 | --lambda_u 0.01 --lambda_v $lambda --num_factors $K \ 95 | --mult $rootpath/mult.dat \ 96 | --theta_init $rootpath/lda-$K/final.doc.states \ 97 | --beta_init $rootpath/lda-$K/final.topics --num_factors $K --save_lag 2000 \ 98 | --learning_rate 0.002 --random_seed 939384 --max_iter 200 --lda_regression --alpha_smooth 0.1 99 | fi 100 | 101 | ./qsub.sh ./ctr --directory $rootpath/cv-in-matrix/ctr-fold-$i-K-$K-lambda-$lambda \ 102 | --user $rootpath/cv-in-matrix/fold-$i-users.train \ 103 | --item $rootpath/cv-in-matrix/fold-$i-items.train \ 104 | --lambda_u 0.01 --lambda_v $lambda --num_factors $K \ 105 | --mult $rootpath/mult.dat \ 106 | --theta_init $rootpath/lda-$K/final.doc.states \ 107 | --beta_init $rootpath/lda-$K/final.topics --num_factors $K --save_lag 2000 \ 108 | --learning_rate 0.002 --random_seed 939384 --max_iter 250 --alpha_smooth 0.1 109 | 110 | ./qsub.sh ./ctr --directory $rootpath/cv-out-of-matrix/ctr-fold-$i-K-$K-lambda-$lambda \ 111 | --user $rootpath/cv-out-of-matrix/fold-$i-users.train \ 112 | --item $rootpath/cv-out-of-matrix/fold-$i-items.train \ 113 | --lambda_u 0.01 --lambda_v $lambda --num_factors $K \ 114 | --mult $rootpath/mult.dat \ 115 | --theta_init $rootpath/lda-$K/final.doc.states \ 116 | --beta_init $rootpath/lda-$K/final.topics --num_factors $K --save_lag 2000 \ 117 | --learning_rate 
0.002 --random_seed 939384 --max_iter 250 --alpha_smooth 0.1 118 | done 119 | done 120 | done 121 | -------------------------------------------------------------------------------- /utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | extern gsl_rng * RANDOM_NUMBER; 4 | 5 | /* 6 | * compare two ints 7 | * */ 8 | 9 | int compare (const void * a, const void * b) { 10 | return ( *(int*)a - *(int*)b ); 11 | } 12 | 13 | /* 14 | * given log(a) and log(b), return log(a+b) 15 | * 16 | */ 17 | 18 | double log_sum(double log_a, double log_b) { 19 | double v; 20 | 21 | if (log_a == -1) return(log_b); 22 | 23 | if (log_a < log_b) { 24 | v = log_b+log(1 + exp(log_a-log_b)); 25 | } 26 | else { 27 | v = log_a+log(1 + exp(log_b-log_a)); 28 | } 29 | return(v); 30 | } 31 | 32 | /* 33 | void vinc(gsl_vector* v, int i, double x) 34 | { 35 | vset(v, i, vget(v, i) + x); 36 | } 37 | */ 38 | 39 | void minc(gsl_matrix* m, int i, int j, double x) { 40 | mset(m, i, j, mget(m, i, j) + x); 41 | } 42 | 43 | /* 44 | * compute the row sums of a matrix 45 | * 46 | */ 47 | 48 | void row_sum(const gsl_matrix* m, gsl_vector* val) { 49 | size_t i, j; 50 | gsl_vector_set_zero(val); 51 | 52 | for (i = 0; i < m->size1; i++) 53 | for (j = 0; j < m->size2; j++) 54 | vinc(val, i, mget(m, i, j)); 55 | } 56 | 57 | /* 58 | * compute the column sums of a matrix 59 | * 60 | */ 61 | 62 | void col_sum(const gsl_matrix* m, gsl_vector* val) { 63 | size_t i, j; 64 | gsl_vector_set_zero(val); 65 | 66 | for (i = 0; i < m->size1; i++) 67 | for (j = 0; j < m->size2; j++) 68 | vinc(val, j, mget(m, i, j)); 69 | } 70 | 71 | 72 | /* 73 | * print a vector to standard out 74 | * 75 | */ 76 | 77 | void vct_fprintf(FILE * file, const gsl_vector * v) { 78 | size_t i; 79 | for (i = 0; i < v->size; i++) 80 | fprintf(file, "%10.15e ", vget(v, i)); 81 | fprintf(file, "\n"); 82 | } 83 | 84 | 85 | /* 86 | * print a matrix to standard out 87 | * 88 | */ 89 | 90 | void 
mtx_fprintf(FILE * file, const gsl_matrix * m) { 91 | size_t i, j; 92 | for (i = 0; i < m->size1; i++) { 93 | for (j = 0; j < m->size2; j++) 94 | fprintf(file, "%10.15e ", mget(m, i, j)); 95 | fprintf(file, "\n"); 96 | } 97 | } 98 | 99 | /* read a size1 x size2 matrix of doubles from a whitespace-separated text stream */ void mtx_fscanf(FILE* file, gsl_matrix* m) { 100 | size_t i, j; 101 | double x; 102 | for (i = 0; i < m->size1; i++) { 103 | for (j = 0; j < m->size2; j++) { 104 | if (fscanf(file, "%lf", &x) != 1) x = 0.0; /* fixed: return value was ignored; short or malformed input left x uninitialized (UB) */ 105 | mset(m, i, j, x); 106 | } 107 | } 108 | } 109 | 110 | /* 111 | * matrix vector solve using blas 112 | * 113 | */ 114 | 115 | void matrix_vector_solve(const gsl_matrix* m, const gsl_vector* b, gsl_vector* v) { 116 | gsl_matrix *lu; 117 | gsl_permutation* p; 118 | int signum; 119 | 120 | p = gsl_permutation_alloc(m->size1); 121 | lu = gsl_matrix_alloc(m->size1, m->size2); 122 | 123 | gsl_matrix_memcpy(lu, m); 124 | gsl_linalg_LU_decomp(lu, p, &signum); 125 | gsl_linalg_LU_solve(lu, p, b, v); 126 | 127 | gsl_matrix_free(lu); 128 | gsl_permutation_free(p); 129 | } 130 | 131 | /* 132 | * matrix inversion using blas 133 | * 134 | */ 135 | 136 | void matrix_inverse(const gsl_matrix* m, gsl_matrix* inverse) { 137 | gsl_matrix *lu; 138 | gsl_permutation* p; 139 | int signum; 140 | 141 | p = gsl_permutation_alloc(m->size1); 142 | lu = gsl_matrix_alloc(m->size1, m->size2); 143 | 144 | gsl_matrix_memcpy(lu, m); 145 | gsl_linalg_LU_decomp(lu, p, &signum); 146 | gsl_linalg_LU_invert(lu, p, inverse); 147 | 148 | gsl_matrix_free(lu); 149 | gsl_permutation_free(p); 150 | } 151 | 152 | /* 153 | * log determinant using blas 154 | * 155 | */ 156 | 157 | double log_det(const gsl_matrix* m) { 158 | gsl_matrix* lu; 159 | gsl_permutation* p; 160 | double result; 161 | int signum; 162 | 163 | p = gsl_permutation_alloc(m->size1); 164 | lu = gsl_matrix_alloc(m->size1, m->size2); 165 | 166 | gsl_matrix_memcpy(lu, m); 167 | gsl_linalg_LU_decomp(lu, p, &signum); 168 | result = gsl_linalg_LU_lndet(lu); 169 | 170 | gsl_matrix_free(lu); 171 | gsl_permutation_free(p); 172 | 
173 | return(result); 174 | } 175 | 176 | 177 | /* 178 | * eigenvalues of a symmetric matrix using blas 179 | * 180 | */ 181 | 182 | void sym_eigen(gsl_matrix* m, gsl_vector* vals, gsl_matrix* vects) { 183 | gsl_eigen_symmv_workspace* wk; 184 | gsl_matrix* mcpy; 185 | int r; 186 | 187 | mcpy = gsl_matrix_alloc(m->size1, m->size2); 188 | wk = gsl_eigen_symmv_alloc(m->size1); 189 | gsl_matrix_memcpy(mcpy, m); 190 | r = gsl_eigen_symmv(mcpy, vals, vects, wk); 191 | gsl_eigen_symmv_free(wk); 192 | gsl_matrix_free(mcpy); 193 | } 194 | 195 | 196 | /* 197 | * sum of a vector 198 | * 199 | */ 200 | /* 201 | double sum(const gsl_vector* v) { 202 | double val = 0; 203 | int i, size = v->size; 204 | for (i = 0; i < size; i++) 205 | val += vget(v, i); 206 | return(val); 207 | } 208 | */ 209 | 210 | /* 211 | * apply a function to each element of a gsl vector. 212 | * 213 | */ 214 | void gsl_vector_apply(gsl_vector* x, double(*fun)(double)) { 215 | size_t i; 216 | for(i = 0; i < x->size; i ++) 217 | vset(x, i, fun(vget(x, i))); 218 | } 219 | 220 | 221 | /* 222 | * take log of each element for a vector 223 | * 224 | */ 225 | void vct_log(gsl_vector* v) { 226 | int i, size = v->size; 227 | for (i = 0; i < size; i++) 228 | vset(v, i, safe_log(vget(v, i))); 229 | } 230 | 231 | 232 | /* 233 | * take log of each element for a matrix 234 | * 235 | */ 236 | void mtx_log(gsl_matrix* x) { 237 | size_t i, j; 238 | for (i = 0; i < x->size1; i++) 239 | for (j = 0; j < x->size2; j++) 240 | mset(x, i, j, safe_log(mget(x, i, j))); 241 | } 242 | 243 | 244 | 245 | /* 246 | * l2 norm of a vector 247 | * 248 | */ 249 | 250 | double vnorm(const gsl_vector *v) { 251 | return gsl_blas_dnrm2(v); 252 | } 253 | 254 | 255 | /* 256 | * normalize a vector in log space 257 | * 258 | * x_i = log(a_i) 259 | * v = log(a_1 + ... 
+ a_k) 260 | * x_i = x_i - v 261 | * 262 | */ 263 | 264 | double log_normalize(gsl_vector* x) { 265 | double v = vget(x, 0); 266 | size_t i; 267 | 268 | for (i = 1; i < x->size; i++) 269 | v = log_sum(v, vget(x, i)); 270 | 271 | for (i = 0; i < x->size; i++) 272 | vset(x, i, vget(x,i)-v); 273 | 274 | return v; 275 | } 276 | 277 | 278 | /* 279 | * normalize a positive vector; returns the pre-normalization sum 280 | * 281 | */ 282 | 283 | double vnormalize(gsl_vector* x) { 284 | double v = vsum(x); 285 | if (v != 0) /* idiom fix: was (v > 0 || v < 0); same behavior (both false for NaN), guards divide-by-zero */ 286 | gsl_vector_scale(x, 1/v); 287 | return v; 288 | } 289 | 290 | 291 | /* 292 | * exponentiate a vector 293 | * 294 | */ 295 | 296 | void vct_exp(gsl_vector* x) { 297 | for (size_t i = 0; i < x->size; i++) 298 | vset(x, i, exp(vget(x, i))); 299 | } 300 | 301 | /* 302 | * exponentiate a matrix 303 | * 304 | */ 305 | void mtx_exp(gsl_matrix* x) { 306 | size_t i, j; 307 | for (i = 0; i < x->size1; i++) 308 | for (j = 0; j < x->size2; j++) 309 | mset(x, i, j, exp(mget(x, i, j))); 310 | } 311 | 312 | /* (u-v)' m (u-v) */ double mahalanobis_distance(const gsl_matrix* m, 313 | const gsl_vector* u, 314 | const gsl_vector* v) { 315 | double val = 0; 316 | gsl_vector* x = gsl_vector_alloc(u->size); 317 | gsl_vector_memcpy(x, u); 318 | gsl_vector_sub(x, v); 319 | val = mahalanobis_prod(m, x, x); 320 | gsl_vector_free(x); 321 | return val; 322 | } 323 | 324 | // blasified: u' m v 325 | double mahalanobis_prod(const gsl_matrix* m, 326 | const gsl_vector* u, 327 | const gsl_vector* v) { 328 | gsl_vector* x = gsl_vector_alloc(u->size); 329 | gsl_blas_dgemv(CblasNoTrans, 1.0, m, v, 0.0, x); 330 | double val = 0; 331 | gsl_blas_ddot(u, x, &val); 332 | gsl_vector_free(x); 333 | return val; 334 | } 335 | 336 | double matrix_dot_prod(const gsl_matrix* m1, const gsl_matrix* m2) { 337 | double val = 0, result; 338 | for (size_t i = 0; i < m1->size1; i ++) { 339 | gsl_vector_const_view v1 = gsl_matrix_const_row(m1, i); 340 | gsl_vector_const_view v2 = gsl_matrix_const_row(m2, i); 341 | gsl_blas_ddot(&v1.vector, &v2.vector, 
&result); 342 | val += result; 343 | } 344 | return val; 345 | } 346 | 347 | 348 | /** 349 | * 350 | * check if file exisits 351 | */ 352 | bool file_exists(const char * filename) { 353 | if ( 0 == access(filename, R_OK)) 354 | return true; 355 | return false; 356 | } 357 | 358 | 359 | 360 | /* 361 | * check if a directory exists 362 | * 363 | * !!! shouldn't be here 364 | */ 365 | 366 | int dir_exists(const char *dname) { 367 | struct stat st; 368 | int ret; 369 | 370 | if (stat(dname,&st) != 0) { 371 | return 0; 372 | } 373 | 374 | ret = S_ISDIR(st.st_mode); 375 | 376 | if(!ret) { 377 | errno = ENOTDIR; 378 | } 379 | 380 | return ret; 381 | } 382 | 383 | void make_directory(const char* name) { 384 | mkdir(name, S_IRUSR|S_IWUSR|S_IXUSR); 385 | } 386 | 387 | 388 | /* 389 | * new random number generator 390 | * 391 | */ 392 | gsl_rng * new_random_number_generator(long seed) { 393 | gsl_rng * random_number_generator = gsl_rng_alloc(gsl_rng_taus); 394 | gsl_rng_set(random_number_generator, (long) seed); // init the seed 395 | 396 | return random_number_generator; 397 | } 398 | 399 | /* 400 | * free random number generator 401 | * */ 402 | 403 | void free_random_number_generator(gsl_rng * random_number_generator) { 404 | gsl_rng_free(random_number_generator); 405 | } 406 | 407 | void choose_k_from_n(int k, int n, int* result, int* src) { 408 | gsl_ran_choose (RANDOM_NUMBER, (void *) result, k, (void *) src, n, sizeof(int)); 409 | } 410 | 411 | void sample_k_from_n(int k, int n, int* result, int* src) { 412 | gsl_ran_sample (RANDOM_NUMBER, (void *) result, k, (void *) src, n, sizeof(int)); 413 | } 414 | 415 | double digamma(double x) { 416 | return gsl_sf_psi(x); 417 | } 418 | 419 | unsigned int rmultinomial(const gsl_vector* v) { 420 | size_t i; 421 | 422 | double sum = vsum(v); 423 | 424 | double u = runiform() * sum; 425 | double cum_sum = 0.0; 426 | for (i = 0; i < v->size; i ++) { 427 | cum_sum += vget(v, i); 428 | if (u < cum_sum) break; 429 | } 430 | return i; 
431 | } 432 | 433 | double rgamma(double a, double b) { 434 | return gsl_ran_gamma_mt(RANDOM_NUMBER, a, b); 435 | } 436 | 437 | double rbeta(double a, double b) { 438 | return gsl_ran_beta(RANDOM_NUMBER, a, b); 439 | } 440 | 441 | unsigned int rbernoulli(double p) { 442 | return gsl_ran_bernoulli(RANDOM_NUMBER, p); 443 | } 444 | 445 | double runiform() { 446 | return gsl_rng_uniform_pos(RANDOM_NUMBER); 447 | } 448 | 449 | void rshuffle(void* base, size_t n, size_t size) { 450 | gsl_ran_shuffle(RANDOM_NUMBER, base, n, size); 451 | } 452 | 453 | unsigned long int runiform_int(unsigned long int n) { 454 | return gsl_rng_uniform_int(RANDOM_NUMBER, n); 455 | } 456 | 457 | -------------------------------------------------------------------------------- /utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H 2 | #define UTILS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | 25 | #define outlog(format, args...) \ 26 | fprintf(stderr, format, args); \ 27 | fprintf(stderr, "\n"); 28 | 29 | int compare (const void * a, const void * b); 30 | 31 | inline double safe_log(double x) { 32 | if (x <= 0) 33 | return(-10000); 34 | else 35 | return(log(x)); 36 | } 37 | double log_sum(double, double); 38 | 39 | inline double vget(const gsl_vector* v, int i) { return(gsl_vector_get(v, i)); } 40 | 41 | inline void vset(gsl_vector* v, int i, double x) { gsl_vector_set(v, i, x); } 42 | 43 | // Increment a vector element by a double. 
44 | inline void vinc(gsl_vector* v, int i, double x) { 45 | vset(v, i, vget(v, i) + x); 46 | } 47 | 48 | inline double mget(const gsl_matrix* m, int i, int j) 49 | { return(gsl_matrix_get(m, i, j)); } 50 | 51 | inline void mset(gsl_matrix* m, int i, int j, double x) 52 | { gsl_matrix_set(m, i, j, x); } 53 | 54 | // Increment a matrix element by a double. 55 | void minc(gsl_matrix*, int, int, double); 56 | 57 | void col_sum(const gsl_matrix*, gsl_vector*); 58 | void row_sum(const gsl_matrix*, gsl_vector*); 59 | 60 | void vct_fprintf(FILE* file, const gsl_vector* v); 61 | void mtx_fprintf(FILE* file, const gsl_matrix* m); 62 | void mtx_fscanf(FILE* file, gsl_matrix* m); 63 | 64 | // Check that a matrix is symmetric; prints a message and returns false on the first mismatch. inline bool check_sym(const gsl_matrix *m) { 65 | for (size_t i = 0; i + 1 < m->size1; i ++) /* fixed: was i < m->size1-1, which underflows (size_t) to SIZE_MAX when size1 == 0 */ 66 | for (size_t j=i; j < m->size2; j ++) 67 | if (mget(m, i, j) != mget(m, j, i)) { 68 | printf("not sym\n"); 69 | return false; 70 | } 71 | return true; 72 | } 73 | 74 | double log_det(const gsl_matrix*); 75 | 76 | void matrix_inverse(const gsl_matrix*, gsl_matrix*); 77 | void matrix_vector_solve(const gsl_matrix* m, const gsl_vector* b, gsl_vector* v); 78 | 79 | void sym_eigen(gsl_matrix*, gsl_vector*, gsl_matrix*); 80 | 81 | inline double vsum(const gsl_vector* v) { 82 | double val = 0; 83 | size_t i, size = v->size; /* fixed: int truncated v->size and drew signed/unsigned comparison warnings */ 84 | for (i = 0; i < size; i++) 85 | val += vget(v, i); 86 | return(val); 87 | } 88 | 89 | double vnorm(const gsl_vector * v); 90 | 91 | void gsl_vector_apply(gsl_vector* x, double(*fun)(double)); 92 | void vct_log(gsl_vector* v); 93 | void mtx_log(gsl_matrix* x); 94 | void vct_exp(gsl_vector* x); 95 | void mtx_exp(gsl_matrix* x); 96 | 97 | double mahalanobis_distance(const gsl_matrix * m, const gsl_vector* u, const gsl_vector* v); 98 | double mahalanobis_prod(const gsl_matrix * m, const gsl_vector* u, const gsl_vector* v); 99 | double matrix_dot_prod(const gsl_matrix * m1, const gsl_matrix* m2); 100 | 101 | void choose_k_from_n(int k, int n, int* result, int* src); 102 | void 
sample_k_from_n(int k, int n, int* result, int* src); 103 | 104 | double log_normalize(gsl_vector* x); 105 | double vnormalize(gsl_vector* x); 106 | 107 | int dir_exists(const char *dname); 108 | bool file_exists(const char * filename); 109 | void make_directory(const char* name); 110 | 111 | double digamma(double x); 112 | unsigned int rmultinomial(const gsl_vector* v); 113 | double rgamma(double a, double b); 114 | double rbeta(double a, double b); 115 | unsigned int rbernoulli(double p); 116 | double runiform(); 117 | void rshuffle (void* base, size_t n, size_t size); 118 | unsigned long int runiform_int(unsigned long int n); 119 | 120 | // new and free random number generator 121 | gsl_rng* new_random_number_generator(long seed); 122 | void free_random_number_generator(gsl_rng * random_number_generator); 123 | 124 | #endif 125 | --------------------------------------------------------------------------------