├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── data ├── chembl1868.csv ├── small.csr ├── train_data.csr ├── train_data.txt └── train_labels.txt ├── dune-project ├── files ├── install_matrix.r └── install_xgboost.r ├── gblinear_test.r ├── orxgboost.opam ├── read_sparse_matrix.r ├── rtest.sh ├── src ├── dune ├── gblinear.ml ├── gbtree.ml ├── gbtree.mli ├── gnuplot.ml ├── model.ml ├── result.ml ├── test.ml └── utls.ml └── test.r /.gitignore: -------------------------------------------------------------------------------- 1 | src/.merlin 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The Library is distributed under the terms of the GNU Lesser General 2 | Public License version 2.1 (included below). 3 | 4 | As a special exception to the GNU Library General Public License, you 5 | may link, statically or dynamically, a "work that uses the Library" 6 | with a publicly distributed version of the Library to produce an 7 | executable file containing portions of the Library, and distribute 8 | that executable file under terms of your choice, without any of the 9 | additional requirements listed in clause 6 of the GNU Library General 10 | Public License. By "a publicly distributed version of the Library", we 11 | mean either the unmodified Library as distributed, or a modified 12 | version of the Library that is distributed under the conditions 13 | defined in clause 3 of the GNU Library General Public License. This 14 | exception does not however invalidate any other reasons why the 15 | executable file might be covered by the GNU Library General Public 16 | License. 17 | 18 | ----------------------------------------------------------------------- 19 | 20 | GNU LESSER GENERAL PUBLIC LICENSE 21 | Version 2.1, February 1999 22 | 23 | Copyright (C) 1991, 1999 Free Software Foundation, Inc. 24 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 25 | Everyone is permitted to copy and distribute verbatim copies 26 | of this license document, but changing it is not allowed. 27 | 28 | [This is the first released version of the Lesser GPL. It also counts 29 | as the successor of the GNU Library Public License, version 2, hence 30 | the version number 2.1.] 31 | 32 | Preamble 33 | 34 | The licenses for most software are designed to take away your 35 | freedom to share and change it. By contrast, the GNU General Public 36 | Licenses are intended to guarantee your freedom to share and change 37 | free software--to make sure the software is free for all its users. 38 | 39 | This license, the Lesser General Public License, applies to some 40 | specially designated software packages--typically libraries--of the 41 | Free Software Foundation and other authors who decide to use it. You 42 | can use it too, but we suggest you first think carefully about whether 43 | this license or the ordinary General Public License is the better 44 | strategy to use in any particular case, based on the explanations below. 45 | 46 | When we speak of free software, we are referring to freedom of use, 47 | not price. Our General Public Licenses are designed to make sure that 48 | you have the freedom to distribute copies of free software (and charge 49 | for this service if you wish); that you receive source code or can get 50 | it if you want it; that you can change the software and use pieces of 51 | it in new free programs; and that you are informed that you can do 52 | these things. 53 | 54 | To protect your rights, we need to make restrictions that forbid 55 | distributors to deny you these rights or to ask you to surrender these 56 | rights. These restrictions translate to certain responsibilities for 57 | you if you distribute copies of the library or if you modify it. 58 | 59 | For example, if you distribute copies of the library, whether gratis 60 | or for a fee, you must give the recipients all the rights that we gave 61 | you. You must make sure that they, too, receive or can get the source 62 | code. If you link other code with the library, you must provide 63 | complete object files to the recipients, so that they can relink them 64 | with the library after making changes to the library and recompiling 65 | it. And you must show them these terms so they know their rights. 66 | 67 | We protect your rights with a two-step method: (1) we copyright the 68 | library, and (2) we offer you this license, which gives you legal 69 | permission to copy, distribute and/or modify the library. 70 | 71 | To protect each distributor, we want to make it very clear that 72 | there is no warranty for the free library. Also, if the library is 73 | modified by someone else and passed on, the recipients should know 74 | that what they have is not the original version, so that the original 75 | author's reputation will not be affected by problems that might be 76 | introduced by others. 77 | 78 | Finally, software patents pose a constant threat to the existence of 79 | any free program. We wish to make sure that a company cannot 80 | effectively restrict the users of a free program by obtaining a 81 | restrictive license from a patent holder. Therefore, we insist that 82 | any patent license obtained for a version of the library must be 83 | consistent with the full freedom of use specified in this license. 84 | 85 | Most GNU software, including some libraries, is covered by the 86 | ordinary GNU General Public License. This license, the GNU Lesser 87 | General Public License, applies to certain designated libraries, and 88 | is quite different from the ordinary General Public License. We use 89 | this license for certain libraries in order to permit linking those 90 | libraries into non-free programs. 91 | 92 | When a program is linked with a library, whether statically or using 93 | a shared library, the combination of the two is legally speaking a 94 | combined work, a derivative of the original library. The ordinary 95 | General Public License therefore permits such linking only if the 96 | entire combination fits its criteria of freedom. The Lesser General 97 | Public License permits more lax criteria for linking other code with 98 | the library. 99 | 100 | We call this license the "Lesser" General Public License because it 101 | does Less to protect the user's freedom than the ordinary General 102 | Public License. It also provides other free software developers Less 103 | of an advantage over competing non-free programs. These disadvantages 104 | are the reason we use the ordinary General Public License for many 105 | libraries. However, the Lesser license provides advantages in certain 106 | special circumstances. 107 | 108 | For example, on rare occasions, there may be a special need to 109 | encourage the widest possible use of a certain library, so that it becomes 110 | a de-facto standard. To achieve this, non-free programs must be 111 | allowed to use the library. A more frequent case is that a free 112 | library does the same job as widely used non-free libraries. In this 113 | case, there is little to gain by limiting the free library to free 114 | software only, so we use the Lesser General Public License. 115 | 116 | In other cases, permission to use a particular library in non-free 117 | programs enables a greater number of people to use a large body of 118 | free software. For example, permission to use the GNU C Library in 119 | non-free programs enables many more people to use the whole GNU 120 | operating system, as well as its variant, the GNU/Linux operating 121 | system. 122 | 123 | Although the Lesser General Public License is Less protective of the 124 | users' freedom, it does ensure that the user of a program that is 125 | linked with the Library has the freedom and the wherewithal to run 126 | that program using a modified version of the Library. 127 | 128 | The precise terms and conditions for copying, distribution and 129 | modification follow. Pay close attention to the difference between a 130 | "work based on the library" and a "work that uses the library". The 131 | former contains code derived from the library, whereas the latter must 132 | be combined with the library in order to run. 133 | 134 | GNU LESSER GENERAL PUBLIC LICENSE 135 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 136 | 137 | 0. This License Agreement applies to any software library or other 138 | program which contains a notice placed by the copyright holder or 139 | other authorized party saying it may be distributed under the terms of 140 | this Lesser General Public License (also called "this License"). 141 | Each licensee is addressed as "you". 142 | 143 | A "library" means a collection of software functions and/or data 144 | prepared so as to be conveniently linked with application programs 145 | (which use some of those functions and data) to form executables. 146 | 147 | The "Library", below, refers to any such software library or work 148 | which has been distributed under these terms. A "work based on the 149 | Library" means either the Library or any derivative work under 150 | copyright law: that is to say, a work containing the Library or a 151 | portion of it, either verbatim or with modifications and/or translated 152 | straightforwardly into another language. (Hereinafter, translation is 153 | included without limitation in the term "modification".) 154 | 155 | "Source code" for a work means the preferred form of the work for 156 | making modifications to it. For a library, complete source code means 157 | all the source code for all modules it contains, plus any associated 158 | interface definition files, plus the scripts used to control compilation 159 | and installation of the library. 160 | 161 | Activities other than copying, distribution and modification are not 162 | covered by this License; they are outside its scope. The act of 163 | running a program using the Library is not restricted, and output from 164 | such a program is covered only if its contents constitute a work based 165 | on the Library (independent of the use of the Library in a tool for 166 | writing it). Whether that is true depends on what the Library does 167 | and what the program that uses the Library does. 168 | 169 | 1. You may copy and distribute verbatim copies of the Library's 170 | complete source code as you receive it, in any medium, provided that 171 | you conspicuously and appropriately publish on each copy an 172 | appropriate copyright notice and disclaimer of warranty; keep intact 173 | all the notices that refer to this License and to the absence of any 174 | warranty; and distribute a copy of this License along with the 175 | Library. 176 | 177 | You may charge a fee for the physical act of transferring a copy, 178 | and you may at your option offer warranty protection in exchange for a 179 | fee. 180 | 181 | 2. You may modify your copy or copies of the Library or any portion 182 | of it, thus forming a work based on the Library, and copy and 183 | distribute such modifications or work under the terms of Section 1 184 | above, provided that you also meet all of these conditions: 185 | 186 | a) The modified work must itself be a software library. 187 | 188 | b) You must cause the files modified to carry prominent notices 189 | stating that you changed the files and the date of any change. 190 | 191 | c) You must cause the whole of the work to be licensed at no 192 | charge to all third parties under the terms of this License. 193 | 194 | d) If a facility in the modified Library refers to a function or a 195 | table of data to be supplied by an application program that uses 196 | the facility, other than as an argument passed when the facility 197 | is invoked, then you must make a good faith effort to ensure that, 198 | in the event an application does not supply such function or 199 | table, the facility still operates, and performs whatever part of 200 | its purpose remains meaningful. 201 | 202 | (For example, a function in a library to compute square roots has 203 | a purpose that is entirely well-defined independent of the 204 | application. Therefore, Subsection 2d requires that any 205 | application-supplied function or table used by this function must 206 | be optional: if the application does not supply it, the square 207 | root function must still compute square roots.) 208 | 209 | These requirements apply to the modified work as a whole. If 210 | identifiable sections of that work are not derived from the Library, 211 | and can be reasonably considered independent and separate works in 212 | themselves, then this License, and its terms, do not apply to those 213 | sections when you distribute them as separate works. But when you 214 | distribute the same sections as part of a whole which is a work based 215 | on the Library, the distribution of the whole must be on the terms of 216 | this License, whose permissions for other licensees extend to the 217 | entire whole, and thus to each and every part regardless of who wrote 218 | it. 219 | 220 | Thus, it is not the intent of this section to claim rights or contest 221 | your rights to work written entirely by you; rather, the intent is to 222 | exercise the right to control the distribution of derivative or 223 | collective works based on the Library. 224 | 225 | In addition, mere aggregation of another work not based on the Library 226 | with the Library (or with a work based on the Library) on a volume of 227 | a storage or distribution medium does not bring the other work under 228 | the scope of this License. 229 | 230 | 3. You may opt to apply the terms of the ordinary GNU General Public 231 | License instead of this License to a given copy of the Library. To do 232 | this, you must alter all the notices that refer to this License, so 233 | that they refer to the ordinary GNU General Public License, version 2, 234 | instead of to this License. (If a newer version than version 2 of the 235 | ordinary GNU General Public License has appeared, then you can specify 236 | that version instead if you wish.) Do not make any other change in 237 | these notices. 238 | 239 | Once this change is made in a given copy, it is irreversible for 240 | that copy, so the ordinary GNU General Public License applies to all 241 | subsequent copies and derivative works made from that copy. 242 | 243 | This option is useful when you wish to copy part of the code of 244 | the Library into a program that is not a library. 245 | 246 | 4. You may copy and distribute the Library (or a portion or 247 | derivative of it, under Section 2) in object code or executable form 248 | under the terms of Sections 1 and 2 above provided that you accompany 249 | it with the complete corresponding machine-readable source code, which 250 | must be distributed under the terms of Sections 1 and 2 above on a 251 | medium customarily used for software interchange. 252 | 253 | If distribution of object code is made by offering access to copy 254 | from a designated place, then offering equivalent access to copy the 255 | source code from the same place satisfies the requirement to 256 | distribute the source code, even though third parties are not 257 | compelled to copy the source along with the object code. 258 | 259 | 5. A program that contains no derivative of any portion of the 260 | Library, but is designed to work with the Library by being compiled or 261 | linked with it, is called a "work that uses the Library". Such a 262 | work, in isolation, is not a derivative work of the Library, and 263 | therefore falls outside the scope of this License. 264 | 265 | However, linking a "work that uses the Library" with the Library 266 | creates an executable that is a derivative of the Library (because it 267 | contains portions of the Library), rather than a "work that uses the 268 | library". The executable is therefore covered by this License. 269 | Section 6 states terms for distribution of such executables. 270 | 271 | When a "work that uses the Library" uses material from a header file 272 | that is part of the Library, the object code for the work may be a 273 | derivative work of the Library even though the source code is not. 274 | Whether this is true is especially significant if the work can be 275 | linked without the Library, or if the work is itself a library. The 276 | threshold for this to be true is not precisely defined by law. 277 | 278 | If such an object file uses only numerical parameters, data 279 | structure layouts and accessors, and small macros and small inline 280 | functions (ten lines or less in length), then the use of the object 281 | file is unrestricted, regardless of whether it is legally a derivative 282 | work. (Executables containing this object code plus portions of the 283 | Library will still fall under Section 6.) 284 | 285 | Otherwise, if the work is a derivative of the Library, you may 286 | distribute the object code for the work under the terms of Section 6. 287 | Any executables containing that work also fall under Section 6, 288 | whether or not they are linked directly with the Library itself. 289 | 290 | 6. As an exception to the Sections above, you may also combine or 291 | link a "work that uses the Library" with the Library to produce a 292 | work containing portions of the Library, and distribute that work 293 | under terms of your choice, provided that the terms permit 294 | modification of the work for the customer's own use and reverse 295 | engineering for debugging such modifications. 296 | 297 | You must give prominent notice with each copy of the work that the 298 | Library is used in it and that the Library and its use are covered by 299 | this License. You must supply a copy of this License. If the work 300 | during execution displays copyright notices, you must include the 301 | copyright notice for the Library among them, as well as a reference 302 | directing the user to the copy of this License. Also, you must do one 303 | of these things: 304 | 305 | a) Accompany the work with the complete corresponding 306 | machine-readable source code for the Library including whatever 307 | changes were used in the work (which must be distributed under 308 | Sections 1 and 2 above); and, if the work is an executable linked 309 | with the Library, with the complete machine-readable "work that 310 | uses the Library", as object code and/or source code, so that the 311 | user can modify the Library and then relink to produce a modified 312 | executable containing the modified Library. (It is understood 313 | that the user who changes the contents of definitions files in the 314 | Library will not necessarily be able to recompile the application 315 | to use the modified definitions.) 316 | 317 | b) Use a suitable shared library mechanism for linking with the 318 | Library. A suitable mechanism is one that (1) uses at run time a 319 | copy of the library already present on the user's computer system, 320 | rather than copying library functions into the executable, and (2) 321 | will operate properly with a modified version of the library, if 322 | the user installs one, as long as the modified version is 323 | interface-compatible with the version that the work was made with. 324 | 325 | c) Accompany the work with a written offer, valid for at 326 | least three years, to give the same user the materials 327 | specified in Subsection 6a, above, for a charge no more 328 | than the cost of performing this distribution. 329 | 330 | d) If distribution of the work is made by offering access to copy 331 | from a designated place, offer equivalent access to copy the above 332 | specified materials from the same place. 333 | 334 | e) Verify that the user has already received a copy of these 335 | materials or that you have already sent this user a copy. 336 | 337 | For an executable, the required form of the "work that uses the 338 | Library" must include any data and utility programs needed for 339 | reproducing the executable from it. However, as a special exception, 340 | the materials to be distributed need not include anything that is 341 | normally distributed (in either source or binary form) with the major 342 | components (compiler, kernel, and so on) of the operating system on 343 | which the executable runs, unless that component itself accompanies 344 | the executable. 345 | 346 | It may happen that this requirement contradicts the license 347 | restrictions of other proprietary libraries that do not normally 348 | accompany the operating system. Such a contradiction means you cannot 349 | use both them and the Library together in an executable that you 350 | distribute. 351 | 352 | 7. You may place library facilities that are a work based on the 353 | Library side-by-side in a single library together with other library 354 | facilities not covered by this License, and distribute such a combined 355 | library, provided that the separate distribution of the work based on 356 | the Library and of the other library facilities is otherwise 357 | permitted, and provided that you do these two things: 358 | 359 | a) Accompany the combined library with a copy of the same work 360 | based on the Library, uncombined with any other library 361 | facilities. This must be distributed under the terms of the 362 | Sections above. 363 | 364 | b) Give prominent notice with the combined library of the fact 365 | that part of it is a work based on the Library, and explaining 366 | where to find the accompanying uncombined form of the same work. 367 | 368 | 8. You may not copy, modify, sublicense, link with, or distribute 369 | the Library except as expressly provided under this License. Any 370 | attempt otherwise to copy, modify, sublicense, link with, or 371 | distribute the Library is void, and will automatically terminate your 372 | rights under this License. However, parties who have received copies, 373 | or rights, from you under this License will not have their licenses 374 | terminated so long as such parties remain in full compliance. 375 | 376 | 9. You are not required to accept this License, since you have not 377 | signed it. However, nothing else grants you permission to modify or 378 | distribute the Library or its derivative works. These actions are 379 | prohibited by law if you do not accept this License. Therefore, by 380 | modifying or distributing the Library (or any work based on the 381 | Library), you indicate your acceptance of this License to do so, and 382 | all its terms and conditions for copying, distributing or modifying 383 | the Library or works based on it. 384 | 385 | 10. Each time you redistribute the Library (or any work based on the 386 | Library), the recipient automatically receives a license from the 387 | original licensor to copy, distribute, link with or modify the Library 388 | subject to these terms and conditions. You may not impose any further 389 | restrictions on the recipients' exercise of the rights granted herein. 390 | You are not responsible for enforcing compliance by third parties with 391 | this License. 392 | 393 | 11. If, as a consequence of a court judgment or allegation of patent 394 | infringement or for any other reason (not limited to patent issues), 395 | conditions are imposed on you (whether by court order, agreement or 396 | otherwise) that contradict the conditions of this License, they do not 397 | excuse you from the conditions of this License. If you cannot 398 | distribute so as to satisfy simultaneously your obligations under this 399 | License and any other pertinent obligations, then as a consequence you 400 | may not distribute the Library at all. For example, if a patent 401 | license would not permit royalty-free redistribution of the Library by 402 | all those who receive copies directly or indirectly through you, then 403 | the only way you could satisfy both it and this License would be to 404 | refrain entirely from distribution of the Library. 405 | 406 | If any portion of this section is held invalid or unenforceable under any 407 | particular circumstance, the balance of the section is intended to apply, 408 | and the section as a whole is intended to apply in other circumstances. 409 | 410 | It is not the purpose of this section to induce you to infringe any 411 | patents or other property right claims or to contest validity of any 412 | such claims; this section has the sole purpose of protecting the 413 | integrity of the free software distribution system which is 414 | implemented by public license practices. Many people have made 415 | generous contributions to the wide range of software distributed 416 | through that system in reliance on consistent application of that 417 | system; it is up to the author/donor to decide if he or she is willing 418 | to distribute software through any other system and a licensee cannot 419 | impose that choice. 420 | 421 | This section is intended to make thoroughly clear what is believed to 422 | be a consequence of the rest of this License. 423 | 424 | 12. If the distribution and/or use of the Library is restricted in 425 | certain countries either by patents or by copyrighted interfaces, the 426 | original copyright holder who places the Library under this License may add 427 | an explicit geographical distribution limitation excluding those countries, 428 | so that distribution is permitted only in or among countries not thus 429 | excluded. In such case, this License incorporates the limitation as if 430 | written in the body of this License. 431 | 432 | 13. The Free Software Foundation may publish revised and/or new 433 | versions of the Lesser General Public License from time to time. 434 | Such new versions will be similar in spirit to the present version, 435 | but may differ in detail to address new problems or concerns. 436 | 437 | Each version is given a distinguishing version number. If the Library 438 | specifies a version number of this License which applies to it and 439 | "any later version", you have the option of following the terms and 440 | conditions either of that version or of any later version published by 441 | the Free Software Foundation. If the Library does not specify a 442 | license version number, you may choose any version ever published by 443 | the Free Software Foundation. 444 | 445 | 14. If you wish to incorporate parts of the Library into other free 446 | programs whose distribution conditions are incompatible with these, 447 | write to the author to ask for permission. For software which is 448 | copyrighted by the Free Software Foundation, write to the Free 449 | Software Foundation; we sometimes make exceptions for this. Our 450 | decision will be guided by the two goals of preserving the free status 451 | of all derivatives of our free software and of promoting the sharing 452 | and reuse of software generally. 453 | 454 | NO WARRANTY 455 | 456 | 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO 457 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 458 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR 459 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY 460 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE 461 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 462 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE 463 | LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME 464 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 465 | 466 | 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN 467 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY 468 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU 469 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR 470 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE 471 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING 472 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A 473 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF 474 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 475 | DAMAGES. 476 | 477 | END OF TERMS AND CONDITIONS 478 | 479 | How to Apply These Terms to Your New Libraries 480 | 481 | If you develop a new library, and you want it to be of the greatest 482 | possible use to the public, we recommend making it free software that 483 | everyone can redistribute and change. You can do so by permitting 484 | redistribution under these terms (or, alternatively, under the terms of the 485 | ordinary General Public License). 486 | 487 | To apply these terms, attach the following notices to the library. It is 488 | safest to attach them to the start of each source file to most effectively 489 | convey the exclusion of warranty; and each file should have at least the 490 | "copyright" line and a pointer to where the full notice is found. 491 | 492 | 493 | Copyright (C) 494 | 495 | This library is free software; you can redistribute it and/or 496 | modify it under the terms of the GNU Lesser General Public 497 | License as published by the Free Software Foundation; either 498 | version 2.1 of the License, or (at your option) any later version. 499 | 500 | This library is distributed in the hope that it will be useful, 501 | but WITHOUT ANY WARRANTY; without even the implied warranty of 502 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 503 | Lesser General Public License for more details. 504 | 505 | You should have received a copy of the GNU Lesser General Public 506 | License along with this library; if not, write to the Free Software 507 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 508 | 509 | Also add information on how to contact you by electronic and paper mail. 510 | 511 | You should also get your employer (if you work as a programmer) or your 512 | school, if any, to sign a "copyright disclaimer" for the library, if 513 | necessary. Here is a sample; alter the names: 514 | 515 | Yoyodyne, Inc., hereby disclaims all copyright interest in the 516 | library `Frob' (a library for tweaking knobs) written by James Random Hacker. 517 | 518 | , 1 April 1990 519 | Ty Coon, President of Vice 520 | 521 | That's all there is to it! 522 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: install uninstall reinstall test 2 | 3 | build: 4 | dune build @install 5 | 6 | clean: 7 | dune clean 8 | 9 | edit: 10 | emacs src/*.ml & 11 | 12 | install: 13 | dune build @install 14 | dune install 15 | 16 | test: 17 | dune build src/test.exe 18 | _build/default/src/test.exe -np `getconf _NPROCESSORS_ONLN` 19 | 20 | uninstall: 21 | dune uninstall 22 | 23 | reinstall: uninstall install 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # orxgboost 2 | Gradient boosting for OCaml using the R xgboost package under the carpet 3 | 4 | https://cran.r-project.org/web/packages/xgboost/index.html 5 | 6 | https://cran.r-project.org/web/packages/xgboost/xgboost.pdf 7 | 8 | Please look into the R documentation for the corresponding package for 9 | details: 10 | 11 | ``` 12 | R 13 | install.packages('xgboost', dependencies = TRUE, repos='http://cran.r-project.org') 14 | library(xgboost) 15 | help(xgboost) 16 | ```` 17 | -------------------------------------------------------------------------------- /data/small.csr: -------------------------------------------------------------------------------- 1 | 1:9 2:1 3:2 4:1 10:2 2 | 1:9 4:2 5:2 6:1 9:1 10:1 3 | 1:7 4:2 6:2 9:2 10:1 4 | -------------------------------------------------------------------------------- /data/train_data.csr: -------------------------------------------------------------------------------- 1 | 1:9 2:1 3:2 4:1 10:2 19:1 21:1 24:1 39:1 44:1 45:1 68:1 69:1 80:1 2 | 1:9 4:2 5:2 6:1 9:1 10:1 17:1 245:1 493:1 3 | 1:7 4:2 6:2 9:2 10:1 12:1 17:1 18:1 25:1 69:1 70:1 71:1 72:1 74:1 78:1 149:1 4 | 1:7 2:1 4:1 5:1 8:2 11:1 12:1 15:1 16:1 17:1 25:1 34:1 58:1 67:1 86:1 159:1 5 | 1:3 2:3 4:3 5:1 7:1 11:1 12:4 19:1 34:1 53:2 60:1 62:1 83:1 92:1 110:1 129:1 156:1 163:1 6 | 1:6 4:4 6:2 10:2 12:2 69:2 103:2 149:2 7 | 1:7 4:2 5:1 6:1 9:1 10:1 17:1 28:1 31:1 33:1 44:1 45:1 69:1 149:1 8 | 1:10 4:2 6:2 9:2 15:2 18:2 69:2 149:1 9 | 1:4 2:1 4:2 5:3 12:1 13:1 20:2 24:1 28:1 34:1 38:1 43:1 125:1 145:1 151:1 174:1 201:1 10 | 1:10 3:2 4:1 10:2 25:1 39:1 69:1 100:1 202:1 11 | 1:10 4:3 6:1 9:1 12:1 15:2 18:1 69:2 149:1 12 | 1:7 2:1 4:1 5:1 8:2 11:1 12:1 15:1 16:1 17:1 25:1 34:1 58:1 67:1 86:1 159:1 13 | 1:9 2:1 4:1 5:1 8:2 15:1 16:1 17:1 34:1 43:1 53:1 62:1 79:1 147:1 185:1 14 | 1:6 4:4 6:2 10:2 12:2 69:2 103:2 149:2 15 | 1:2 3:2 4:1 6:1 9:1 10:1 12:1 17:1 31:1 33:1 87:1 89:1 95:1 124:1 148:1 149:1 16 | 1:4 2:1 4:2 5:3 12:1 13:1 20:2 24:1 28:1 34:1 38:1 43:1 125:1 145:1 151:1 174:1 201:1 17 | 1:6 4:4 6:2 10:2 12:2 69:2 103:2 149:2 18 | 1:10 4:2 6:2 9:2 15:2 18:2 69:2 149:1 19 | 1:10 4:2 6:2 9:2 15:2 18:2 69:2 149:1 20 | 1:8 4:2 5:1 6:1 9:1 18:1 24:2 31:1 33:1 49:1 94:1 107:1 139:1 21 | 1:8 2:1 3:2 11:1 13:2 23:1 25:1 29:1 44:1 45:1 100:1 138:1 202:1 314:1 341:1 22 | 1:7 2:1 4:1 5:1 8:2 11:1 12:1 15:1 16:1 17:1 25:1 34:1 58:1 67:1 86:1 159:1 23 | 1:8 4:2 5:1 6:1 9:1 18:1 24:2 31:1 33:1 49:1 94:1 107:1 139:1 24 | 1:8 3:2 4:3 5:1 6:1 9:1 10:1 12:2 17:1 149:1 169:1 25 | 1:10 4:2 6:2 9:2 15:2 18:2 69:2 149:1 26 | 1:8 4:2 5:3 10:1 14:2 31:1 33:1 35:1 130:1 134:1 149:1 169:1 171:1 27 | 1:5 4:3 5:1 6:3 9:3 12:2 14:2 17:1 35:1 46:1 69:1 123:1 128:1 28 | 1:9 3:1 5:2 6:1 9:1 10:1 15:2 17:3 41:1 133:1 149:1 29 | 1:3 11:1 31:2 33:2 116:1 170:1 277:2 352:1 665:1 30 | 1:8 4:2 5:1 6:1 9:1 18:1 24:2 31:1 33:1 49:1 94:1 107:1 139:1 31 | 1:7 4:1 8:2 12:1 15:1 16:1 17:1 25:1 26:1 31:1 33:1 67:1 78:1 100:1 102:1 159:1 32 | 1:9 2:1 3:2 4:1 10:2 19:1 21:1 24:1 39:1 44:1 45:1 68:1 69:1 80:1 33 | 1:8 4:1 6:1 9:1 10:1 17:3 31:2 33:2 41:1 69:1 149:1 34 | 1:10 4:2 6:2 9:2 15:2 18:2 69:2 149:1 35 | 1:7 4:2 6:2 9:2 10:1 12:1 17:1 18:1 25:1 69:1 70:1 71:1 72:1 74:1 78:1 149:1 36 | 1:7 4:2 6:2 9:2 10:1 12:1 17:1 18:1 25:1 69:1 70:1 71:1 72:1 74:1 78:1 149:1 37 | 1:9 4:2 5:2 6:1 9:1 10:1 17:1 245:1 493:1 38 | 1:13 4:2 6:3 9:1 18:1 69:2 149:1 181:1 39 | 1:3 2:1 4:1 5:2 7:1 11:1 12:2 20:2 21:1 36:1 43:1 53:1 56:1 81:1 87:1 157:1 188:1 216:1 263:1 40 | 1:10 4:3 6:1 9:1 12:1 15:2 18:1 69:2 149:1 41 | 1:8 4:2 5:1 6:1 9:1 12:1 20:2 37:1 43:1 61:1 66:1 79:1 90:1 106:1 125:1 137:1 138:1 42 | 1:8 4:1 6:1 9:1 10:1 17:3 31:2 33:2 41:1 69:1 149:1 43 | 1:8 5:1 6:1 8:2 9:1 15:1 16:1 17:1 25:1 67:1 97:1 102:1 159:1 44 | 1:3 2:1 3:2 4:1 5:2 6:1 7:1 9:1 12:1 17:1 24:1 31:1 33:1 41:1 68:1 74:1 156:1 45 | 1:7 2:2 5:1 6:2 9:1 11:1 15:2 17:1 18:1 29:1 34:1 38:1 58:1 59:2 113:2 145:1 166:1 46 | 1:3 2:3 4:1 5:2 7:1 24:1 31:1 33:1 34:1 44:1 45:1 57:1 62:1 68:1 80:1 101:1 126:1 47 | 1:9 2:2 7:1 15:2 18:1 19:1 20:2 21:1 32:1 34:1 43:1 53:1 62:1 126:1 147:1 48 | 1:11 2:1 11:1 14:2 23:1 24:1 25:1 31:1 33:1 36:1 49:1 96:1 99:1 127:1 218:1 49 | 1:8 2:1 3:1 4:1 10:2 11:1 14:2 23:1 24:1 31:1 33:1 35:1 39:1 46:1 54:1 68:1 69:1 82:2 50 | 1:4 2:1 3:3 5:1 8:2 10:3 11:1 16:3 17:1 34:1 37:1 39:1 52:1 58:1 66:1 67:1 97:1 51 | 1:8 4:1 6:2 8:1 9:2 14:2 15:1 16:1 18:2 25:1 30:1 40:1 153:1 154:1 177:1 217:1 52 | 1:11 2:1 6:1 8:1 9:1 11:1 15:2 16:2 18:1 23:1 38:1 64:1 84:1 108:1 116:1 117:1 53 | 1:7 3:1 4:2 8:1 10:3 12:1 17:1 22:1 32:1 39:2 69:2 78:1 133:1 54 | 1:10 2:2 3:1 4:1 5:3 7:2 16:2 20:1 24:1 29:1 38:1 44:1 45:1 49:1 60:1 64:1 76:1 85:1 109:1 173:1 204:1 55 | 1:9 2:2 6:1 9:1 11:2 18:1 20:2 23:1 24:1 38:1 43:1 49:1 53:1 83:1 151:1 201:1 56 | 3:6 4:1 5:2 8:1 27:1 30:2 37:1 40:1 42:6 51:2 66:1 67:1 112:1 172:1 192:1 215:2 278:1 791:1 57 | 1:7 3:5 4:1 5:1 10:1 20:1 36:1 39:1 55:1 69:1 76:1 79:1 85:2 96:1 195:1 235:1 58 | 1:4 4:1 8:1 12:1 16:1 20:2 22:1 36:1 52:1 55:1 98:1 116:1 150:1 175:1 252:1 259:1 59 | 1:1 2:1 3:2 5:2 7:1 8:2 12:2 32:2 51:1 56:1 108:1 200:1 220:1 241:1 343:1 60 | 1:4 5:2 8:2 9:1 10:2 16:2 17:1 26:2 39:1 48:1 51:1 67:1 82:1 164:1 61 | 1:5 2:1 4:1 5:2 7:1 17:1 18:1 20:2 24:1 37:1 41:1 43:1 53:1 61:1 66:1 68:1 79:1 185:1 62 | 1:1 3:2 5:1 8:2 15:1 17:1 26:1 28:1 36:1 41:1 67:1 74:1 86:1 96:1 100:1 136:1 63 | 1:5 2:1 3:4 5:1 8:1 10:4 15:1 17:1 34:1 39:2 50:1 52:1 62:1 69:1 92:1 110:1 133:1 151:1 64 | 1:10 2:1 4:2 6:2 7:1 8:1 11:1 22:2 108:1 116:1 117:1 181:1 260:1 65 | 1:13 2:2 6:2 11:2 19:1 21:1 23:1 58:1 103:1 166:1 66 | 1:4 2:2 6:1 9:1 11:2 12:2 18:1 23:2 38:1 43:1 60:2 114:1 132:2 167:1 67 | 1:6 2:1 4:1 6:1 23:1 24:1 28:1 38:1 47:2 64:1 84:1 103:1 118:1 121:1 68 | 3:10 5:1 10:2 17:2 39:1 41:1 51:3 74:1 78:1 133:1 146:1 69 | 1:4 3:2 8:1 15:1 26:2 38:1 40:1 44:1 45:1 48:1 90:1 137:1 243:1 316:1 70 | 1:4 2:2 3:3 5:1 7:1 8:1 13:3 16:2 24:1 29:1 37:1 66:1 80:1 81:1 118:1 241:1 71 | 1:7 6:2 8:1 9:2 11:1 16:2 18:2 100:1 102:1 116:1 119:1 252:1 259:1 72 | 1:7 2:1 4:1 6:3 8:2 9:3 11:1 12:1 18:3 19:1 21:1 22:1 23:1 32:1 100:1 102:1 154:1 73 | 1:4 3:5 4:2 5:1 8:1 15:1 16:1 20:2 28:1 30:1 40:1 43:1 61:1 91:1 122:1 191:1 194:1 74 | 1:8 4:2 8:1 12:1 15:1 16:1 25:2 26:1 28:1 67:1 97:1 381:1 75 | 1:7 2:2 3:1 8:1 11:2 15:1 16:1 23:2 28:1 30:1 37:1 40:1 44:1 45:1 66:1 79:1 132:2 76 | 1:5 2:2 4:1 7:1 18:1 24:1 27:1 28:1 34:1 47:2 62:1 118:1 121:1 168:1 245:1 77 | 1:6 2:1 4:1 6:3 9:3 12:1 18:2 19:2 21:2 42:2 80:1 298:1 78 | 1:6 2:1 4:2 6:2 9:2 11:1 12:2 15:1 17:2 20:2 38:1 40:1 43:1 53:1 83:1 201:1 79 | 3:5 8:2 9:1 17:1 51:1 67:1 97:2 102:1 164:1 203:1 265:1 681:1 80 | 1:3 2:2 3:2 4:3 5:2 12:2 16:1 20:1 22:1 24:1 27:1 28:1 32:1 65:1 73:1 77:1 131:1 139:1 143:1 183:1 425:1 81 | 1:6 2:1 6:2 9:2 11:1 16:1 20:1 25:1 31:1 33:1 56:1 77:1 83:1 85:2 88:1 128:1 152:1 172:1 222:1 82 | 1:10 2:1 6:2 7:1 9:2 14:2 15:2 18:2 19:1 21:1 25:1 153:1 177:1 178:1 217:1 83 | 1:9 2:2 7:2 11:1 13:1 14:2 21:1 27:1 35:1 50:1 81:1 119:1 209:1 237:1 279:1 294:1 687:1 84 | 1:7 2:1 7:1 19:1 20:2 21:1 31:2 33:2 36:1 37:1 38:1 43:1 53:1 55:1 60:1 61:1 64:1 79:1 96:1 115:1 185:1 85 | 1:5 2:1 4:1 5:1 7:1 12:1 20:1 21:1 28:1 32:1 67:1 68:1 70:1 71:1 72:1 76:1 81:1 109:1 86 | 1:5 2:1 8:1 12:1 16:2 19:1 21:1 24:1 37:1 38:1 49:1 60:1 63:1 64:1 79:1 80:1 87 | 1:2 10:2 14:2 17:2 36:1 41:1 46:1 89:1 218:1 333:1 631:1 821:1 88 | 1:4 2:1 3:2 4:1 6:1 8:1 11:2 23:1 28:2 30:1 31:1 32:1 33:1 40:1 103:1 108:1 116:1 117:1 89 | -------------------------------------------------------------------------------- /data/train_labels.txt: -------------------------------------------------------------------------------- 1 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 2 | -------------------------------------------------------------------------------- /dune-project: -------------------------------------------------------------------------------- 1 | (lang dune 1.0) 2 | (name orxgboost) 3 | -------------------------------------------------------------------------------- /files/install_matrix.r: -------------------------------------------------------------------------------- 1 | pkg <- 'Matrix' 2 | if (!require(pkg, character.only = TRUE)) { 3 | install.packages(pkg, dependencies = TRUE, 4 | repos='http://cran.r-project.org') 5 | } 6 | -------------------------------------------------------------------------------- /files/install_xgboost.r: -------------------------------------------------------------------------------- 1 | pkg <- 'xgboost' 2 | if (!require(pkg, character.only = TRUE)) { 3 | install.packages(pkg, dependencies = TRUE, 4 | repos='http://cran.r-project.org') 5 | } 6 | -------------------------------------------------------------------------------- /gblinear_test.r: -------------------------------------------------------------------------------- 1 | 2 | library(xgboost) 3 | 4 | train_fn <- 'data/chembl1868.csv' 5 | 6 | training_set <- as.matrix(read.table(train_fn, colClasses = 'numeric', 7 | header = TRUE)) 8 | cols_count = dim(training_set)[2] 9 | 10 | x <- training_set[, 2:cols_count] # all lines, all columns except 1st 11 | y <- training_set[, 1:1] # all lines, only 1st column (resp. var) 12 | 13 | # check number of rows 14 | stopifnot(cols_count == length(y)) 15 | 16 | # train 17 | gbtree <- xgboost(data = x, label = y, booster='gblinear', eta = 0.2, objective = 'reg:squarederror', eval_metric = 'rmse', nrounds = 50, lambda = 0.0, alpha = 0.0) 18 | 19 | xgb.save(gbtree, 'r_gbtree_model.bin') 20 | 21 | xgb.load('r_gbtree_model.bin') 22 | 23 | # stupid test on training data; don't do this at home !!! 24 | values <- predict(gbtree, x) 25 | 26 | write.table(values, file = 'data/predictions.txt', sep = '\n', 27 | row.names = F, col.names = F) 28 | quit() 29 | -------------------------------------------------------------------------------- /orxgboost.opam: -------------------------------------------------------------------------------- 1 | opam-version: "2.0" 2 | maintainer: "unixjunkie@sdf.org" 3 | authors: ["Francois Berenger"] 4 | homepage: "https://github.com/UnixJunkie/orxgboost" 5 | bug-reports: "https://github.com/UnixJunkie/orxgboost/issues" 6 | dev-repo: "git+https://github.com/UnixJunkie/orxgboost.git" 7 | license: "LGPL-2.1-only with OCaml-LGPL-linking-exception" 8 | build: [ 9 | ["R" "CMD" "BATCH" "install_matrix.r"] 10 | ["R" "CMD" "BATCH" "install_xgboost.r"] 11 | ["dune" "build" "-p" name "-j" jobs] 12 | ] 13 | depends: [ 14 | "ocaml" 15 | "dune" 16 | "conf-r" 17 | "dolog" {>= "4.0.0"} 18 | "batteries" 19 | "cpm" 20 | "parany" {>= "11.0.2"} 21 | ] 22 | post-messages: [ 23 | "Please interact with R to install needed things in user-space: 24 | R 25 | install.packages('Marix', repos='http://cran.r-project.org') 26 | install.packages('xgboost', repos='http://cran.r-project.org')" {failure} 27 | ] 28 | synopsis: "Gradient boosting for OCaml using the R xgboost package" 29 | description: """ 30 | This package really fires up and talks to an R interpreter. 31 | Data are exchanged via text files. 32 | It can handle dense or sparse (in CSR format) data matrices. 33 | For details, cf. Chen, Tianqi, and Carlos Guestrin. 34 | "Xgboost: A scalable tree boosting system." 35 | Proceedings of KDD'16. ACM, 2016. 36 | DOI: 10.1145/2939672.2939785. 37 | https://xgboost.readthedocs.io/en/latest/""" 38 | extra-files: [ 39 | ["install_xgboost.r" "md5=0cc15f242946933f2d335a18b833fff1"] 40 | ["install_matrix.r" "md5=ab1c0ae726388159b1315bc9fe61a013"] 41 | ] 42 | # url { 43 | # src: "https://github.com/UnixJunkie/orxgboost/archive/vXXX.tar.gz" 44 | # checksum: "md5=YYY" 45 | # } 46 | -------------------------------------------------------------------------------- /read_sparse_matrix.r: -------------------------------------------------------------------------------- 1 | library(Matrix) 2 | read_csr_file <- function(file, ncol = NULL) 3 | { 4 | lines <- readLines(file) 5 | nrow <- length(lines) 6 | res <- Matrix(0, nrow, ncol) 7 | i <- 1 8 | for (line in lines) { 9 | cols = strsplit(line, '[ ]+') 10 | for (col in cols[[1]]) { 11 | s <- strsplit(col, ':') 12 | j <- as.integer(s[[1]][1]) 13 | k <- as.numeric(s[[1]][2]) 14 | res[i, j] <- k 15 | } 16 | i <- i + 1 17 | } 18 | res 19 | } 20 | 21 | file='data/small.csr' 22 | z <- read_csr_file(file, 10) 23 | z 24 | -------------------------------------------------------------------------------- /rtest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | R --vanilla --slave < test.r 2>&1 > /dev/null 4 | -------------------------------------------------------------------------------- /src/dune: -------------------------------------------------------------------------------- 1 | (library 2 | (name orxgboost) 3 | (public_name orxgboost) 4 | (modules result utls gbtree gblinear gnuplot) 5 | (libraries batteries dolog)) 6 | 7 | (executables 8 | (names test) 9 | (modules test) 10 | (libraries dolog batteries cpm orxgboost)) 11 | 12 | (executables 13 | (names model) 14 | (public_names orxgboost_model) 15 | (modules model) 16 | (libraries minicli dolog batteries cpm orxgboost parany)) 17 | -------------------------------------------------------------------------------- /src/gblinear.ml: -------------------------------------------------------------------------------- 1 | 2 | open Printf 3 | 4 | (* gradient boosted regressor using Gblinear *) 5 | 6 | module L = BatList 7 | module Log = Dolog.Log 8 | 9 | type filename = string 10 | 11 | type params = { eta: float; (* learning rate in [0.0:1.0] *) 12 | lambda: float; (* L2 regularization term on weights *) 13 | alpha: float; (* L1 regularization term on weights *) 14 | nrounds: int (* number of training rounds *) } 15 | 16 | let make_params eta lambda alpha nrounds = 17 | { eta; lambda; alpha; nrounds } 18 | 19 | (* train model and return the filename it was saved to upon success *) 20 | let train 21 | ?debug:(debug = false) 22 | (params: params) 23 | (data_fn: filename): Result.t = 24 | let model_fn: filename = Filename.temp_file "gblinear_model_" ".bin" in 25 | (* create R script and store it in a temp file *) 26 | let r_script_fn = Filename.temp_file "gblinear_train_" ".r" in 27 | Utls.with_out_file r_script_fn (fun out -> 28 | fprintf out 29 | "library(xgboost)\n\ 30 | training_set <- as.matrix(read.table('%s', colClasses = 'numeric', \ 31 | header = TRUE))\n\ 32 | cols_count = dim(training_set)[2]\n\ 33 | x <- training_set[, 2:cols_count]\n\ 34 | y <- training_set[, 1:1]\n\ 35 | gbtree <- xgboost(data = x, label = y, booster='gblinear', eta = %f,\n\ 36 | objective = 'reg:squarederror',\n\ 37 | eval_metric = 'rmse',\n\ 38 | nrounds = %d, lambda = %f, alpha = %f)\n\ 39 | xgb.save(gbtree, '%s')\n\ 40 | quit()\n" 41 | data_fn params.eta params.nrounds params.lambda params.alpha model_fn 42 | ); 43 | let r_log_fn = Filename.temp_file "gblinear_train_" ".log" in 44 | (* execute it *) 45 | let cmd = sprintf "R --vanilla --slave < %s 2>&1 > %s" r_script_fn r_log_fn in 46 | if debug then Log.debug "%s" cmd; 47 | if Sys.command cmd <> 0 then 48 | Utls.collect_script_and_log debug r_script_fn r_log_fn model_fn 49 | else 50 | Utls.ignore_fst 51 | (if not debug then L.iter Sys.remove [r_script_fn; r_log_fn]) 52 | (Result.Ok model_fn) 53 | 54 | (* use model in 'model_fn' to predict decision values for test data in 'data_fn' 55 | and return the filename containing values upon success *) 56 | let predict ?debug:(debug = false) (maybe_model_fn: Result.t) (data_fn: filename) 57 | : float list = 58 | match maybe_model_fn with 59 | | Error err -> failwith ("Gblinear.predict: model error: " ^ err) 60 | | Ok model_fn -> 61 | let predictions_fn = Filename.temp_file "gblinear_predictions_" ".txt" in 62 | (* create R script in temp file *) 63 | let r_script_fn = Filename.temp_file "gblinear_predict_" ".r" in 64 | Utls.with_out_file r_script_fn (fun out -> 65 | fprintf out 66 | "library(xgboost)\n\ 67 | test_set <- as.matrix(read.table('%s', colClasses = 'numeric',\n\ 68 | header = TRUE))\n\ 69 | lines_count = dim(test_set)[1]\n\ 70 | cols_count = dim(test_set)[2]\n\ 71 | x <- test_set[, 2:cols_count]\n\ 72 | gbtree <- xgb.load('%s')\n\ 73 | values <- predict(gbtree, x)\n\ 74 | stopifnot(lines_count == length(values))\n\ 75 | write.table(values, file = '%s', sep = '\n',\n\ 76 | row.names = F, col.names = F)\n\ 77 | quit()\n" 78 | data_fn model_fn predictions_fn 79 | ); 80 | (* execute it *) 81 | let r_log_fn = Filename.temp_file "gblinear_predict_" ".log" in 82 | let cmd = sprintf "R --vanilla --slave < %s 2>&1 > %s" r_script_fn r_log_fn in 83 | if debug then Log.debug "%s" cmd; 84 | if Sys.command cmd <> 0 then 85 | match Utls.collect_script_and_log 86 | debug r_script_fn r_log_fn predictions_fn with 87 | | Ok _ -> assert(false) 88 | | Error err -> failwith ("Gblinear.predict: R error: " ^ err) 89 | else 90 | let preds = Utls.read_predictions debug (Result.Ok predictions_fn) in 91 | (if not debug then 92 | L.iter Sys.remove [r_script_fn; r_log_fn]); 93 | preds 94 | -------------------------------------------------------------------------------- /src/gbtree.ml: -------------------------------------------------------------------------------- 1 | open Printf 2 | 3 | module L = List 4 | module Log = Dolog.Log 5 | 6 | type filename = string 7 | 8 | type gbtree_params = { eta: float; (* learning rate *) 9 | gamma: float; (* minimum loss reduction *) 10 | max_depth: int; (* max depth of tree *) 11 | min_child_weight: float; (* minimum sum of 12 | instance weight *) 13 | subsample: float; (* subsample ratio of 14 | training instances *) 15 | colsample_bytree: float; (* subsample ratio of columns *) 16 | num_parallel_tree: int; (* number of trees to grow 17 | per round *) 18 | (* We wont support this last one: 19 | monotone_constraints: int array; *) } 20 | 21 | type linear_params = { lambda: float; (* L2 regularization term on weights *) 22 | lambda_bias: float; (* L2 regularization term on bias *) 23 | alpha: float } (* L1 regularization term on weights *) 24 | 25 | type booster = 26 | | Gbtree of gbtree_params 27 | | Gblinear of linear_params 28 | 29 | let default_linear_params () = 30 | Gblinear { lambda = 0.0; lambda_bias = 0.0; alpha = 0.0 } 31 | 32 | let default_gbtree_params () = 33 | Gbtree { eta = 0.3; 34 | gamma = 1.0; 35 | max_depth = 6; 36 | min_child_weight = 1.0; 37 | subsample = 1.0; 38 | colsample_bytree = 1.0; 39 | num_parallel_tree = 1 } 40 | 41 | let string_of_params = function 42 | | Gbtree { eta; gamma; max_depth; min_child_weight; subsample; 43 | colsample_bytree; num_parallel_tree } -> 44 | sprintf "booster = 'gbtree', \ 45 | eta = %f, gamma = %f, max_depth = %d, \ 46 | min_child_weight = %f, subsample = %f, \ 47 | colsample_bytree = %f, num_parallel_tree = %d" 48 | eta gamma max_depth min_child_weight subsample 49 | colsample_bytree num_parallel_tree 50 | | Gblinear { lambda; lambda_bias; alpha } -> 51 | sprintf "booster = 'gblinear', lambda = %f, lambda_bias = %f, alpha = %f" 52 | lambda lambda_bias alpha 53 | 54 | let string_of_debug = function 55 | | true -> "verbose = 1" (* makes xgboost verbose *) 56 | | false -> "verbose = 0" (* makes xgboost silent *) 57 | 58 | (* capture everything in case of error *) 59 | let collect_script_and_log = 60 | Utls.collect_script_and_log 61 | 62 | type nb_columns = int 63 | type sparsity = Dense 64 | | Sparse of nb_columns 65 | 66 | let read_csr_file = 67 | "read_csr_file <- function(file, ncol = NULL)\n\ 68 | {\n\ 69 | lines <- readLines(file)\n\ 70 | nrow <- length(lines)\n\ 71 | res <- Matrix(0, nrow, ncol)\n\ 72 | i <- 1\n\ 73 | for (line in lines) {\n\ 74 | cols = strsplit(line, '[ ]+')\n\ 75 | for (col in cols[[1]]) {\n\ 76 | s <- strsplit(col, ':')\n\ 77 | j <- as.integer(s[[1]][1])\n\ 78 | k <- as.numeric(s[[1]][2])\n\ 79 | res[i, j] <- k\n\ 80 | }\n\ 81 | i <- i + 1\n\ 82 | }\n\ 83 | res\n\ 84 | }" 85 | 86 | let read_matrix_str maybe_sparse data_fn = 87 | match maybe_sparse with 88 | | Dense -> 89 | sprintf "as.matrix(read.table('%s', colClasses = 'numeric'))" data_fn 90 | | Sparse ncol -> 91 | sprintf "read_csr_file('%s', ncol = %d)" data_fn ncol 92 | 93 | (* train model and return the filename it was saved to upon success *) 94 | let train 95 | ?debug:(debug = false) 96 | (sparse: sparsity) 97 | (nrounds: int) 98 | (params: booster) 99 | (data_fn: filename) 100 | (labels_fn: filename): Result.t = 101 | let model_fn: filename = Filename.temp_file "orxgboost_model_" ".bin" in 102 | (* create R script and store it in a temp file *) 103 | let r_script_fn = Filename.temp_file "orxgboost_train_" ".r" in 104 | let read_x_str = read_matrix_str sparse data_fn in 105 | let params_str = string_of_params params in 106 | let verbose_str = string_of_debug debug in 107 | Utls.with_out_file r_script_fn (fun out -> 108 | fprintf out 109 | "library(xgboost)\n\ 110 | library(Matrix)\n\ 111 | %s\n\ 112 | x <- %s\n\ 113 | y <- as.vector(read.table('%s'), mode = 'numeric')\n\ 114 | lut <- data.frame(old = c(-1.0, 1.0), new = c(0.0, 1.0))\n\ 115 | label <- lut$new[match(y, lut$old)]\n\ 116 | stopifnot(nrow(x) == length(label))\n\ 117 | tree <- xgboost(data = x, label, %s, nrounds = %d, \ 118 | objective = 'binary:logitraw', \ 119 | eval_metric = 'auc', %s)\n\ 120 | xgb.save(tree, '%s')\n\ 121 | quit()\n" 122 | read_csr_file 123 | read_x_str labels_fn verbose_str nrounds params_str model_fn 124 | ); 125 | let r_log_fn = Filename.temp_file "orxgboost_train_" ".log" in 126 | (* execute it *) 127 | let cmd = sprintf "R --vanilla --slave < %s 2>&1 > %s" r_script_fn r_log_fn in 128 | if debug then Log.debug "%s" cmd; 129 | if Sys.command cmd <> 0 then 130 | collect_script_and_log debug r_script_fn r_log_fn model_fn 131 | else 132 | Utls.ignore_fst 133 | (if not debug then L.iter Sys.remove [r_script_fn; r_log_fn]) 134 | (Result.Ok model_fn) 135 | 136 | (* use model in 'model_fn' to predict decision values for test data in 'data_fn' 137 | and return the filename containing values upon success *) 138 | let predict ?debug:(debug = false) 139 | (sparse: sparsity) (maybe_model_fn: Result.t) (data_fn: filename): Result.t = 140 | match maybe_model_fn with 141 | | Error err -> Error err 142 | | Ok model_fn -> 143 | let predictions_fn = Filename.temp_file "orxgboost_predictions_" ".txt" in 144 | (* create R script in temp file *) 145 | let r_script_fn = Filename.temp_file "orxgboost_predict_" ".r" in 146 | let read_x_str = read_matrix_str sparse data_fn in 147 | Utls.with_out_file r_script_fn (fun out -> 148 | fprintf out 149 | "library(xgboost)\n\ 150 | library(Matrix)\n\ 151 | %s\n\ 152 | newdata <- %s\n\ 153 | tree <- xgb.load('%s')\n\ 154 | values <- predict(tree, newdata)\n\ 155 | stopifnot(nrow(newdata) == length(values))\n\ 156 | write.table(values, file = '%s', sep = '\\n', \ 157 | row.names = FALSE, col.names = FALSE)\n\ 158 | quit()\n" 159 | read_csr_file read_x_str model_fn predictions_fn 160 | ); 161 | (* execute it *) 162 | let r_log_fn = Filename.temp_file "orxgboost_predict_" ".log" in 163 | let cmd = sprintf "R --vanilla --slave < %s 2>&1 > %s" r_script_fn r_log_fn in 164 | if debug then Log.debug "%s" cmd; 165 | if Sys.command cmd <> 0 then 166 | collect_script_and_log debug r_script_fn r_log_fn predictions_fn 167 | else 168 | Utls.ignore_fst 169 | (if not debug then L.iter Sys.remove [r_script_fn; r_log_fn]) 170 | (Result.Ok predictions_fn) 171 | 172 | (* read predicted decision values *) 173 | let read_predictions ?debug:(debug = false) = 174 | Utls.read_predictions debug 175 | -------------------------------------------------------------------------------- /src/gbtree.mli: -------------------------------------------------------------------------------- 1 | 2 | type filename = string 3 | 4 | type gbtree_params = { eta: float; (* learning rate *) 5 | gamma: float; (* minimum loss reduction *) 6 | max_depth: int; (* max depth of tree *) 7 | min_child_weight: float; (* minimum sum of 8 | instance weight *) 9 | subsample: float; (* subsample ratio of 10 | training instances *) 11 | colsample_bytree: float; (* subsample ratio of columns *) 12 | num_parallel_tree: int } (* number of trees to grow 13 | per round *) 14 | 15 | type linear_params = { lambda: float; (* L2 regularization term on weights *) 16 | lambda_bias: float; (* L2 regularization term on bias *) 17 | alpha: float } (* L1 regularization term on weights *) 18 | 19 | type booster = 20 | | Gbtree of gbtree_params 21 | | Gblinear of linear_params 22 | 23 | val default_linear_params: unit -> booster 24 | 25 | val default_gbtree_params: unit -> booster 26 | 27 | type nb_columns = int 28 | type sparsity = Dense 29 | | Sparse of nb_columns 30 | 31 | (** [train sparsity nrounds params data_fn labels_fn] 32 | will train a gradient-boosted tree using given parameters 33 | on the data in [data_fn] with labels in [labels_fn]. 34 | [data_fn] is a numerical matrix dumped in a tab-separated text file 35 | without any format header. 36 | Dense or sparse matrices (in CSR format) are supported. 37 | Rows are observations, columns are features. 38 | [labels_fn] is a vector of tab-separated "1" or "-1" integer labels 39 | in a text file, without any format header. 40 | Column [i] in [labels_fn] is the corresponding label of line [i] 41 | in [data_fn]. *) 42 | val train: ?debug:bool -> 43 | sparsity -> int -> booster -> filename -> filename -> Result.t 44 | 45 | (** [predict sparsity model data_fn] will run the previously trained 46 | gradient-boosted tree on the data stored in [data_fn]. 47 | [data_fn] must use the same format than the file that was used 48 | during training. 49 | On success, a filename is returned. 50 | This text file contains the predicted decision values, 51 | one per line of [data_fn]. *) 52 | val predict: ?debug:bool -> sparsity -> Result.t -> filename -> Result.t 53 | 54 | (** [read_predictions ?debug result] will decode predicted decision values 55 | in [result], or crash if the previous call to [predict] 56 | was not successful. 57 | Upon success and if [not debug], the file containing the 58 | predicted decision values is removed. *) 59 | val read_predictions: ?debug:bool -> Result.t -> float list 60 | -------------------------------------------------------------------------------- /src/gnuplot.ml: -------------------------------------------------------------------------------- 1 | 2 | open Printf 3 | 4 | module L = BatList 5 | 6 | let regr_plot title actual preds = 7 | let x_min, x_max = L.min_max ~cmp:BatFloat.compare actual in 8 | let y_min, y_max = L.min_max ~cmp:BatFloat.compare preds in 9 | let xy_min = min x_min y_min in 10 | let xy_max = max x_max y_max in 11 | let data_fn = Filename.temp_file "RFR_regr_data_" ".txt" in 12 | Utls.with_out_file data_fn (fun out -> 13 | L.iter (fun (x, y) -> 14 | fprintf out "%f %f\n" x y 15 | ) (L.combine actual preds) 16 | ); 17 | let plot_fn = Filename.temp_file "RFR_regr_plot_" ".gpl" in 18 | Utls.lines_to_file plot_fn 19 | ["set xlabel 'actual'"; 20 | "set ylabel 'predicted'"; 21 | "set xtics out nomirror"; 22 | "set ytics out nomirror"; 23 | sprintf "set xrange [%f:%f]" xy_min xy_max; 24 | sprintf "set yrange [%f:%f]" xy_min xy_max; 25 | "set key left"; 26 | "set size square"; 27 | sprintf "set title '%s'" title; 28 | "g(x) = x"; 29 | "f(x) = a*x + b"; 30 | sprintf "fit f(x) '%s' u 1:2 via a, b" data_fn; 31 | "plot g(x) t 'perfect' lc rgb 'black', \\"; 32 | sprintf "'%s' using 1:2 not, \\" data_fn; 33 | "f(x) t 'fit'"]; 34 | (* sprintf "'%s' using 1:2:($2-$3):($2+$3) w errorbars \ 35 | * t 'n=%d r2=%.2f', \\" data_fn nb_trees r2; *) 36 | ignore(Sys.command (sprintf "gnuplot --persist %s" plot_fn)) 37 | -------------------------------------------------------------------------------- /src/model.ml: -------------------------------------------------------------------------------- 1 | 2 | module CLI = Minicli.CLI 3 | module Gblinear = Orxgboost.Gblinear 4 | module Fn = Filename 5 | module Gnuplot = Orxgboost.Gnuplot 6 | module L = BatList 7 | module Utls = Orxgboost.Utls 8 | module Log = Dolog.Log 9 | module Parmap = Parany.Parmap 10 | module Result = Orxgboost.Result 11 | 12 | open Printf 13 | 14 | let extract_values verbose fn = 15 | let actual_fn = Fn.temp_file "orxgboost_test_" ".txt" in 16 | (* NR > 1: skip CSV header line *) 17 | let cmd = sprintf "awk '(NR > 1){print $1}' %s > %s" fn actual_fn in 18 | Utls.run_command verbose cmd; 19 | let actual = Utls.float_list_of_file actual_fn in 20 | (* filesystem cleanup *) 21 | (if not verbose then Sys.remove actual_fn); 22 | actual 23 | 24 | let train_test_dump csv_header train test = 25 | let train_fn = Fn.temp_file "orxgboost_train_" ".csv" in 26 | let test_fn = Fn.temp_file "orxgboost_test_" ".csv" in 27 | Utls.lines_to_file train_fn (csv_header :: train); 28 | Utls.lines_to_file test_fn (csv_header :: test); 29 | (train_fn, test_fn) 30 | 31 | let shuffle_then_cut seed p train_fn = 32 | match Utls.lines_of_file train_fn with 33 | | [] | [_] -> assert(false) (* no lines or header line only?! *) 34 | | (csv_header :: csv_payload) -> 35 | let rng = BatRandom.State.make [|seed|] in 36 | let rand_lines = L.shuffle ~state:rng csv_payload in 37 | let train, test = Cpm.Utls.train_test_split p rand_lines in 38 | train_test_dump csv_header train test 39 | 40 | let shuffle_then_nfolds seed n train_fn = 41 | match Utls.lines_of_file train_fn with 42 | | [] | [_] -> assert(false) (* no lines or header line only?! *) 43 | | (csv_header :: csv_payload) -> 44 | let rng = BatRandom.State.make [|seed|] in 45 | let rand_lines = L.shuffle ~state:rng csv_payload in 46 | let train_tests = Cpm.Utls.cv_folds n rand_lines in 47 | L.rev_map (fun (x, y) -> train_test_dump csv_header x y) train_tests 48 | 49 | (* what to do with the trained model *) 50 | type mode = Load of string 51 | | Save of string 52 | | Discard 53 | 54 | let trained_model_fn_from_mode = function 55 | | Discard -> failwith "Model.trained_model_fn_from_mode: discard" 56 | | Save _ -> failwith "Model.trained_model_fn_from_mode: save" 57 | | Load fn -> fn 58 | 59 | let train verbose save_or_load config train_fn = 60 | match save_or_load with 61 | | Load trained_model_fn -> 62 | (Log.info "loading model from %s" trained_model_fn; 63 | trained_model_fn) 64 | | _ -> 65 | let model_fn = match save_or_load with 66 | | Load _ -> assert(false) 67 | | Save fn -> fn 68 | | Discard -> Fn.temp_file "orxgboost_model_" ".bin" in 69 | match Gblinear.train ~debug:verbose config train_fn with 70 | | Result.Error err -> failwith ("Model.train: " ^ err) 71 | | Result.Ok trained_model_fn -> 72 | begin 73 | Utls.run_command 74 | verbose (sprintf "mv %s %s" trained_model_fn model_fn); 75 | Log.debug "saving model to %s" model_fn; 76 | model_fn 77 | end 78 | 79 | let test verbose model_fn test_fn = 80 | Gblinear.predict ~debug:verbose (Result.Ok model_fn) test_fn 81 | 82 | let train_test_raw verbose save_or_load config train_fn test_fn = 83 | let model_fn = train verbose save_or_load config train_fn in 84 | let actual = extract_values verbose test_fn in 85 | let preds = test verbose model_fn test_fn in 86 | (model_fn, actual, preds) 87 | 88 | let r2_plot no_plot actual preds = 89 | let test_R2 = Cpm.RegrStats.r2 actual preds in 90 | (if not no_plot then 91 | let title = sprintf "DNN model fit; R2=%.2f" test_R2 in 92 | Gnuplot.regr_plot title actual preds 93 | ); 94 | Log.debug "R2_te: %.3f" test_R2; 95 | test_R2 96 | 97 | let train_test verbose save_or_load no_plot config train_fn test_fn = 98 | let _model_fn, actual, preds = 99 | train_test_raw verbose save_or_load config train_fn test_fn in 100 | r2_plot no_plot actual preds 101 | 102 | let decode_float_range (range_str: string): float list = 103 | L.map float_of_string 104 | (BatString.split_on_char ';' range_str) 105 | 106 | let decode_int_range (range_str: string): int list = 107 | L.map int_of_string 108 | (BatString.split_on_char ';' range_str) 109 | 110 | let main () = 111 | Log.(set_log_level INFO); 112 | Log.color_on (); 113 | let argc, args = CLI.init () in 114 | let train_portion_def = 0.8 in 115 | let show_help = CLI.get_set_bool ["-h";"--help"] args in 116 | if argc = 1 || show_help then 117 | begin 118 | eprintf "usage:\n\ 119 | %s\n \ 120 | [--train ]: training set\n \ 121 | [-p ]: train portion; default=%f\n \ 122 | [--seed ]: RNG seed\n \ 123 | [--test ]: test set\n \ 124 | [--scan]: toggle scan of hyper params\n \ 125 | [--eta ]: learning rate in ]0.0:1.0]\n \ 126 | [--eta-scan [float;float;...]]: eta range\n \ 127 | [--lambda ]: L2 regularization in [0.0:100.0]\n \ 128 | [--lambda-scan \"float;float;...\": lambda range\n \ 129 | [--alpha ]: L1 regularization in [0.0:100.0]\n \ 130 | [--alpha-scan \"float;float;...\": alpha range\n \ 131 | [--rounds ]: number of training rounds >= 1\n \ 132 | [--rounds-scan \"int;int;...\": rounds range\n \ 133 | [-np ]: max CPU cores\n \ 134 | [--NxCV ]: number of folds of cross validation\n \ 135 | [-s ]: save trained model to file\n \ 136 | [-l ]: restore trained model from file\n \ 137 | [-o ]: predictions output file\n \ 138 | [--no-plot]: don't call gnuplot\n \ 139 | [-v]: verbose/debug mode\n \ 140 | [-h|--help]: show this message\n" 141 | Sys.argv.(0) train_portion_def; 142 | exit 1 143 | end; 144 | let verbose = CLI.get_set_bool ["-v"] args in 145 | let must_scan = CLI.get_set_bool ["--scan"] args in 146 | let ncores = CLI.get_int_def ["-np"] args 1 in 147 | let seed = match CLI.get_int_opt ["--seed"] args with 148 | | Some s -> s (* reproducible *) 149 | | None -> (* random *) 150 | let () = Random.self_init () in 151 | Random.int 0x3FFFFFFF (* 0x3FFFFFFF = 2^30 - 1 *) in 152 | let no_plot = CLI.get_set_bool ["--no-plot"] args in 153 | let maybe_train_fn = CLI.get_string_opt ["--train"] args in 154 | let maybe_test_fn = CLI.get_string_opt ["--test"] args in 155 | let nrounds = CLI.get_int_def ["--rounds"] args 20 in 156 | Utls.enforce (nrounds >= 1) "nrounds < 1"; 157 | (* eta = learning rate *) 158 | let eta = CLI.get_float_def ["--eta"] args 0.3 in 159 | Utls.enforce (0.0 < eta && eta <= 1.0) "eta not in ]0:1]"; 160 | (* lambda = L2 regularization *) 161 | let lambda = CLI.get_float_def ["--lambda"] args 0.0 in 162 | Utls.enforce (lambda >= 0.0 && eta <= 100.0) "lambda not in [0.0:100]"; 163 | (* alpha = L1 regularization *) 164 | let alpha = CLI.get_float_def ["--alpha"] args 0.0 in 165 | Utls.enforce (alpha >= 0.0 && alpha <= 100.0) "alpha not in [0.0:100]"; 166 | (* all those default ranges are somewhat arbitrary *) 167 | let eta_range = match CLI.get_string_opt ["--eta-scan"] args with 168 | | Some s -> decode_float_range s 169 | | None -> [0.01; 0.02; 0.03; 0.05; 0.1; 0.2; 0.3; 0.5] in 170 | let lambda_range = match CLI.get_string_opt ["--lambda-scan"] args with 171 | | Some s -> decode_float_range s 172 | | None -> [0.01; 0.02; 0.03; 0.05; 0.1; 0.2; 0.3; 0.5; 1.0] in 173 | let alpha_range = match CLI.get_string_opt ["--alpha-scan"] args with 174 | | Some s -> decode_float_range s 175 | | None -> [0.01; 0.02; 0.03; 0.05; 0.1; 0.2; 0.3; 0.5; 1.0] in 176 | let rounds_range = match CLI.get_string_opt ["--rounds-scan"] args with 177 | | Some s -> decode_int_range s 178 | | None -> [10; 20; 30; 50; 100; 200; 300; 500; 1000; 2000; 3000; 5000] in 179 | let nfolds = CLI.get_int_def ["--NxCV"] args 1 in 180 | let train_portion = CLI.get_float_def ["-p"] args 0.8 in 181 | let scores_fn = match CLI.get_string_opt ["-o"] args with 182 | | None -> Fn.temp_file "orxgboost_preds_" ".txt" 183 | | Some fn -> fn in 184 | let save_or_load = 185 | match (CLI.get_string_opt ["-l"] args, CLI.get_string_opt ["-s"] args) with 186 | | (Some fn, None) -> Load fn 187 | | (None, Some fn) -> Save fn 188 | | (None, None) -> Discard 189 | | (Some _, Some _) -> failwith "Model: both -l and -s" in 190 | CLI.finalize (); 191 | let config = Gblinear.make_params eta lambda alpha nrounds in 192 | match maybe_train_fn, maybe_test_fn with 193 | | (None, None) -> failwith "provide --train and/or --test" 194 | | (None, Some test_fn) -> 195 | (* trained model production use *) 196 | let model_fn = trained_model_fn_from_mode save_or_load in 197 | let preds = test verbose model_fn test_fn in 198 | Utls.float_list_to_file scores_fn preds 199 | | (Some train_fn, Some test_fn) -> 200 | ignore(train_test verbose save_or_load no_plot config train_fn test_fn) 201 | | (Some train_fn', None) -> 202 | if nfolds > 1 then 203 | begin (* cross validation *) 204 | Log.info "shuffle -> %dxCV" nfolds; 205 | let train_test_fns = shuffle_then_nfolds seed nfolds train_fn' in 206 | let actual_preds = 207 | Parmap.parmap ncores (fun (train_fn, test_fn) -> 208 | train_test_raw 209 | verbose save_or_load config train_fn test_fn 210 | ) train_test_fns in 211 | let actual = L.concat (L.map Utls.snd3 actual_preds) in 212 | let preds = L.concat (L.map Utls.trd3 actual_preds) in 213 | ignore(r2_plot no_plot actual preds) 214 | end 215 | else 216 | begin (* no cross validation *) 217 | (* train/test split *) 218 | Log.info "shuffle -> train/test split (p=%.2f)" train_portion; 219 | let train_fn, test_fn = 220 | shuffle_then_cut seed train_portion train_fn' in 221 | if not must_scan then 222 | let r2 = 223 | train_test verbose save_or_load no_plot config train_fn test_fn in 224 | Log.info "R2: %.3f" r2 225 | else 226 | let configs = ref [] in 227 | L.iter (fun e -> 228 | L.iter (fun l -> 229 | L.iter (fun a -> 230 | L.iter (fun n -> 231 | configs := (e, l, a, n) :: !configs 232 | ) rounds_range 233 | ) alpha_range 234 | ) lambda_range 235 | ) eta_range; 236 | (* randomize them so that the parameter space exploration is not 237 | sequential/boring *) 238 | configs := L.shuffle !configs; 239 | Log.info "configs: %d" (L.length !configs); 240 | let (best_e, best_l, best_a, best_n, bets_r2) = 241 | Parany.Parmap.parfold ncores 242 | (fun (e, l, a, n) -> 243 | let conf = Gblinear.make_params e l a n in 244 | let r2 = 245 | train_test verbose save_or_load no_plot conf train_fn test_fn in 246 | (e, l, a, n, r2) 247 | ) 248 | (fun (e, l, a, n, r2) (e', l', a', n', r2') -> 249 | if r2' > r2 then 250 | (Log.info "(e,l,a,n):r2 (%.2f, %.2f, %.2f, %d):%.3f" 251 | e' l' a' n' r2'; 252 | (e', l', a', n', r2')) 253 | else 254 | (Log.warn "(e,l,a,n):r2 (%.2f, %.2f, %.2f, %d):%.3f" 255 | e' l' a' n' r2'; 256 | (e, l, a, n, r2)) 257 | ) (0.0, 0.0, 0.0, 0, 0.0) !configs in 258 | Log.info "BEST: (e,l,a,n):r2 (%.2f, %.2f, %.2f, %d):%.3f" 259 | best_e best_l best_a best_n bets_r2 260 | end 261 | 262 | let () = main () 263 | -------------------------------------------------------------------------------- /src/result.ml: -------------------------------------------------------------------------------- 1 | type filename = string 2 | type error_message = string 3 | 4 | type t = Ok of filename 5 | | Error of error_message 6 | -------------------------------------------------------------------------------- /src/test.ml: -------------------------------------------------------------------------------- 1 | open Printf 2 | 3 | module Gbtree = Orxgboost.Gbtree 4 | module L = BatList 5 | module Log = Dolog.Log 6 | module Utls = Orxgboost.Utls 7 | 8 | module Score_label = struct 9 | type t = bool * float (* (label, pred_score) *) 10 | let get_label (l, _) = l 11 | let get_score (_, s) = s 12 | end 13 | 14 | module ROC = Cpm.MakeROC.Make(Score_label) 15 | 16 | let main () = 17 | Log.set_log_level Log.DEBUG; 18 | Log.color_on (); 19 | let data_fn = "data/train_data.txt" in 20 | let sparse_data_fn = "data/train_data.csr" in 21 | let labels_fn = "data/train_labels.txt" in 22 | let preds = 23 | let params = Gbtree.default_gbtree_params () in 24 | let model = 25 | Gbtree.train 26 | ~debug:true 27 | Dense 28 | 10 29 | params 30 | data_fn 31 | labels_fn in 32 | Gbtree.read_predictions 33 | (Gbtree.predict ~debug:true Dense model data_fn) in 34 | let sparse_preds = 35 | let params = Gbtree.default_gbtree_params () in 36 | let sparsity = Gbtree.Sparse 1831 in 37 | let model = 38 | Gbtree.train 39 | ~debug:true 40 | sparsity 41 | 10 42 | params 43 | sparse_data_fn 44 | labels_fn in 45 | Gbtree.read_predictions 46 | (Gbtree.predict ~debug:true sparsity model sparse_data_fn) in 47 | let lin_preds = 48 | let params = Gbtree.default_linear_params () in 49 | let model = 50 | Gbtree.train 51 | ~debug:true 52 | Dense 53 | 10 54 | params 55 | data_fn 56 | labels_fn in 57 | let preds_fn = Gbtree.predict ~debug:true Dense model data_fn in 58 | Gbtree.read_predictions preds_fn in 59 | assert(List.length preds = 88); 60 | assert(List.length sparse_preds = 88); 61 | assert(List.length lin_preds = 88); 62 | (* List.iter (printf "%f\n") predictions *) 63 | let labels = 64 | let labels_line = Utls.with_in_file labels_fn input_line in 65 | let label_strings = BatString.split_on_char '\t' labels_line in 66 | L.map (function 67 | | "1" -> true 68 | | "-1" -> false 69 | | other -> failwith other 70 | ) label_strings in 71 | let auc = ROC.auc (List.combine labels preds) in 72 | let sparse_auc = ROC.auc (List.combine labels sparse_preds) in 73 | let lin_auc = ROC.auc (List.combine labels lin_preds) in 74 | printf "AUC: %.3f\n" auc; 75 | printf "sparse AUC: %.3f\n" sparse_auc; 76 | printf "lin AUC: %.3f\n" lin_auc 77 | 78 | let () = main () 79 | -------------------------------------------------------------------------------- /src/utls.ml: -------------------------------------------------------------------------------- 1 | 2 | open Printf 3 | 4 | module L = BatList 5 | module Log = Dolog.Log 6 | 7 | let with_in_file fn f = 8 | let input = open_in_bin fn in 9 | let res = f input in 10 | close_in input; 11 | res 12 | 13 | let with_out_file fn f = 14 | let output = open_out_bin fn in 15 | let res = f output in 16 | close_out output; 17 | res 18 | 19 | let lines_of_file fn = 20 | with_in_file fn (fun input -> 21 | let res, exn = L.unfold_exc (fun () -> input_line input) in 22 | if exn <> End_of_file then 23 | raise exn 24 | else res 25 | ) 26 | 27 | let lines_to_file fn lines = 28 | with_out_file fn (fun out -> 29 | L.iter (fprintf out "%s\n") lines 30 | ) 31 | 32 | (* call f on lines of file *) 33 | let iter_on_lines_of_file fn f = 34 | let input = open_in_bin fn in 35 | try 36 | while true do 37 | f (input_line input) 38 | done 39 | with End_of_file -> close_in input 40 | 41 | let count_lines (fn: string): int = 42 | let count = ref 0 in 43 | iter_on_lines_of_file fn (fun _line -> 44 | incr count 45 | ); 46 | !count 47 | 48 | let append_file_to_buffer buff fn = 49 | with_in_file fn (fun input -> 50 | let len = in_channel_length input in 51 | Buffer.add_channel buff input len 52 | ) 53 | 54 | let ignore_fst _fst snd = 55 | snd 56 | 57 | let fold_on_lines_of_file fn f acc = 58 | with_in_file fn (fun input -> 59 | let acc' = ref acc in 60 | try 61 | while true do 62 | acc' := f !acc' (input_line input) 63 | done; 64 | assert(false) 65 | with End_of_file -> !acc' 66 | ) 67 | 68 | let float_list_of_file fn = 69 | let res = 70 | fold_on_lines_of_file fn (fun acc line -> 71 | let pred = 72 | try Scanf.sscanf line "%f" (fun x -> x) 73 | with Scanf.Scan_failure msg -> 74 | (* percolate a NaN rather than crashing *) 75 | (Log.error "%s: %s" msg line; 76 | nan) in 77 | pred :: acc 78 | ) [] in 79 | L.rev res 80 | 81 | let float_list_to_file fn l = 82 | with_out_file fn (fun out -> 83 | L.iter (fprintf out "%f\n") l 84 | ) 85 | 86 | type filename = string 87 | 88 | (* capture everything in case of error *) 89 | let collect_script_and_log 90 | (debug: bool) 91 | (r_script_fn: filename) (r_log_fn: filename) (model_fn: filename) 92 | : Result.t = 93 | let buff = Buffer.create 4096 in 94 | bprintf buff "--- %s ---\n" r_script_fn; 95 | append_file_to_buffer buff r_script_fn; 96 | bprintf buff "--- %s ---\n" r_log_fn; 97 | append_file_to_buffer buff r_log_fn; 98 | let err_msg = Buffer.contents buff in 99 | if not debug then L.iter Sys.remove [r_script_fn; r_log_fn; model_fn]; 100 | Error err_msg 101 | 102 | let read_predictions (debug: bool) (maybe_predictions_fn: Result.t): float list = 103 | match maybe_predictions_fn with 104 | | Error err -> failwith err (* should have been handled by user before *) 105 | | Ok predictions_fn -> 106 | if debug then Log.debug "%s" predictions_fn; 107 | let res = float_list_of_file predictions_fn in 108 | if not debug then Sys.remove predictions_fn; 109 | res 110 | 111 | let run_command verbose cmd = 112 | if verbose then Log.info "cmd: %s" cmd; 113 | ignore(Sys.command cmd) 114 | 115 | let fst3 (a, _, _) = a 116 | let snd3 (_, b, _) = b 117 | let trd3 (_, _, c) = c 118 | 119 | (* abort if condition is not met *) 120 | let enforce (condition: bool) (err_msg: string): unit = 121 | if not condition then 122 | failwith err_msg 123 | -------------------------------------------------------------------------------- /test.r: -------------------------------------------------------------------------------- 1 | library('xgboost') 2 | 3 | # matrix with n rows (observations) and p columns (features) 4 | x <- as.matrix(read.table("data/train_data.txt", colClasses = "numeric")) 5 | 6 | # vector of size n and values +1 or -1 only 7 | y <- as.vector(read.table("data/train_labels.txt"), mode = "numeric") 8 | 9 | # transform [-1,1] to [0,1] 10 | lut <- data.frame(old = c(-1.0,1.0), new = c(0.0,1.0)) 11 | labels <- lut$new[match(y, lut$old)] 12 | 13 | # check number of rows 14 | stopifnot(nrow(x) == length(labels)) 15 | 16 | # train 17 | gbtree <- xgboost(data = x, label = labels, nrounds = 300, objective = "binary:logitraw", eval_metric = "auc", eta = 0.01, subsample = 0.5) 18 | 19 | xgb.save(gbtree, "r_gbtree_model.bin") 20 | 21 | xgb.load("r_gbtree_model.bin") 22 | 23 | # stupid test on training data; don't do this at home !!! 24 | values <- predict(gbtree, x) 25 | 26 | write.table(values, file = "data/predictions.txt", sep = "\n", row.names = F, col.names = F) 27 | 28 | quit() 29 | --------------------------------------------------------------------------------