├── .gitignore ├── CHANGES.txt ├── LICENSE ├── LICENSE.txt ├── MANIFEST ├── MANIFEST.in ├── README.md ├── demo ├── ftrl_fm_cython.py └── mf_qe_nn_clf.py ├── doc ├── Makefile ├── conf.py ├── index.rst ├── kaggler.metrics.rst ├── kaggler.model.rst ├── kaggler.online_model.rst ├── kaggler.preprocessing.rst ├── kaggler.rst ├── kaggler.test.rst └── modules.rst ├── kaggler ├── __init__.py ├── const.py ├── data_io.py ├── metrics │ ├── __init__.py │ ├── classification.py │ └── regression.py ├── model │ ├── __init__.py │ └── nn.py ├── online_model │ ├── DecisionTree │ │ ├── OnlineClassificationTree.py │ │ ├── _tree.pyx │ │ ├── test.py │ │ └── utils.pyx │ ├── __init__.py │ ├── fm.c │ ├── fm.pyx │ ├── ftrl.c │ ├── ftrl.pyx │ ├── ftrl_dropout.pyx │ ├── ftrl_fm.c │ ├── ftrl_fm.pyx │ ├── nn.c │ ├── nn.pyx │ ├── nn_h2.c │ ├── nn_h2.pyx │ ├── sgd.c │ └── sgd.pyx ├── preprocessing │ ├── __init__.py │ └── data.py ├── test │ ├── __init__.py │ └── test_sgd.py ├── util.c ├── util.pxd └── util.pyx ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | _build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | 44 | # Translations 45 | *.mo 46 | *.pot 47 | 48 | # Django stuff: 49 | *.log 50 | 51 | # Sphinx documentation 52 | docs/_build/ 53 | 54 | # PyBuilder 55 | target/ 56 | -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- 1 | 0.3.4, 2015-02-11 -- Add README.md to MANIFEST.in 2 | 0.1.1, 2014-09-24 -- Fix wrong dependencies 3 | 0.1.0, 2014-07-22 -- Initial release. 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. 
Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. 
Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. 
Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. 
A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 
163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 
196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 
229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | 635 | Copyright (C) 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see . 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | Copyright (C) 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | . 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | . 675 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. 
By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. 
For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 
83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 
117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 
146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 
216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 
246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. 
If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 
309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 
336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 
360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. 
If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 
428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | 635 | Copyright (C) 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see . 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | Copyright (C) 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | . 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | . 
675 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | CHANGES.txt 3 | LICENSE.txt 4 | README.txt 5 | setup.py 6 | kaggler/__init__.py 7 | kaggler/const.py 8 | kaggler/logger.py 9 | kaggler/nn_auc.py 10 | kaggler/util.py 11 | kaggler/test/__init__.py 12 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt README.md 2 | recursive-include docs *.txt 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kaggler 2 | Kaggler is a Python package for Kaggle data science competitions and distributed under the version 3 of the GNU General Public License. 3 | 4 | It provides online learning algorithms for classification - inspired by Kaggle user [tinrtgu's code](http://goo.gl/K8hQBx). It uses the sparse input format that handles large sparse data efficiently. Core code is optimized for speed by using Cython. 
5 | 6 | # Algorithms 7 | Currently algorithms available are as follows: 8 | 9 | ## Online learning algorithms 10 | * Stochastic Gradient Descent (SGD) 11 | * Follow-the-Regularized-Leader (FTRL) 12 | * Follow-the-Regularized-Leader with Factorization Machine (FTRL_FM) 13 | * Factorization Machine (FM) 14 | * Neural Networks (NN) - with a single (NN) or two (NN_H2) ReLU hidden layers 15 | * Decision Tree 16 | 17 | ## Batch learning algorithm 18 | * Neural Networks (NN) - with a single hidden layer and L-BFGS optimization 19 | 20 | # Install 21 | ## Using pip 22 | Python package is available at PyPi for pip installation: 23 | ``` 24 | sudo pip install -U Kaggler 25 | ``` 26 | 27 | ## From source code 28 | If you want to install it from source code: 29 | ``` 30 | python setup.py build_ext --inplace 31 | sudo python setup.py install 32 | ``` 33 | 34 | # Input Format 35 | libsvm style sparse file format is used. 36 | ``` 37 | 1 1:1 4:1 5:0.5 38 | 0 2:1 5:1 39 | ``` 40 | 41 | # Example 42 | ``` 43 | from kaggler.online_model import SGD, FTRL, FM, NN 44 | 45 | # SGD 46 | clf = SGD(a=.01, # learning rate 47 | l1=1e-6, # L1 regularization parameter 48 | l2=1e-6, # L2 regularization parameter 49 | n=2**20, # number of hashed features 50 | epoch=10, # number of epochs 51 | interaction=True) # use feature interaction or not 52 | 53 | # FTRL 54 | clf = FTRL(a=.1, # alpha in the per-coordinate rate 55 | b=1, # beta in the per-coordinate rate 56 | l1=1., # L1 regularization parameter 57 | l2=1., # L2 regularization parameter 58 | n=2**20, # number of hashed features 59 | epoch=1, # number of epochs 60 | interaction=True) # use feature interaction or not 61 | 62 | # FM 63 | clf = FM(n=1e5, # number of features 64 | epoch=100, # number of epochs 65 | dim=4, # size of factors for interactions 66 | a=.01) # learning rate 67 | 68 | # NN 69 | clf = NN(n=1e5, # number of features 70 | epoch=10, # number of epochs 71 | h=16, # number of hidden units 72 | a=.1, # learning rate 73 | 
l2=1e-6) # L2 regularization parameter 74 | 75 | # online training and prediction directly with a libsvm file 76 | for x, y in clf.read_sparse('train.sparse'): 77 | p = clf.predict_one(x) # predict for an input 78 | clf.update_one(x, p - y) # update the model with the target using error 79 | 80 | for x, _ in clf.read_sparse('test.sparse'): 81 | p = clf.predict_one(x) 82 | 83 | # online training and prediction with a scipy sparse matrix 84 | from sklearn.datasets import load_svmlight_file 85 | 86 | X, y = load_svmlight_file('train.sparse') 87 | 88 | clf.fit(X, y) 89 | p = clf.predict(X) 90 | ``` 91 | 92 | # Package Documentation 93 | Package documentation is available at [here](http://pythonhosted.org//Kaggler). 94 | -------------------------------------------------------------------------------- /demo/ftrl_fm_cython.py: -------------------------------------------------------------------------------- 1 | # time pypy-2.4 -u runmodel.py | tee output_0.txt 2 | from kaggler.online_model.ftrl_fm import FTRL_FM 3 | import random 4 | from math import log 5 | import numpy as np 6 | from datetime import datetime 7 | import pandas as pd 8 | from sklearn.cross_validation import KFold 9 | from sklearn.metrics import roc_auc_score 10 | #### RANDOM SEED #### 11 | seed = 1024 12 | np.random.seed(seed) 13 | ##################### 14 | 15 | #################### 16 | #### PARAMETERS #### 17 | #################### 18 | 19 | reportFrequency = 1000 20 | path = "E:\\Redhat\\" 21 | trainingFile = "E:\\Redhat\\train_le_date.csv" 22 | testingFile = "E:\\Redhat\\test_le_date.csv" 23 | # train = pd.read_csv(trainingFile) 24 | # test = pd.read_csv(testingFile) 25 | # y = train['outcome'].values 26 | # skf = KFold(len(y), n_folds=4, shuffle=False, random_state=seed) 27 | # for ind_tr, ind_te in skf: 28 | # X_train = train.iloc[ind_tr] 29 | # X_test = train.iloc[ind_te] 30 | # break 31 | 32 | # X_train.to_csv(path+'X_train.csv',index=False) 33 | # X_test.to_csv(path+'X_test.csv',index=False) 34 | 
35 | fm_dim = 4 36 | fm_initDev = .01 37 | 38 | alpha = 0.1 39 | beta = 1. 40 | 41 | alpha_fm = .01 42 | beta_fm = 1. 43 | 44 | p_D = 22 45 | D = 2 ** p_D 46 | 47 | L1 = 0.1 48 | L2 = 1.0 49 | L1_fm = 0.1 50 | L2_fm = 1.0 51 | 52 | n_epochs = 3 53 | 54 | #### 55 | start = datetime.now() 56 | 57 | # initialize a FM learner 58 | learner = FTRL_FM(fm_dim, fm_initDev, L1, L2, L1_fm, L2_fm, D, alpha, beta, alpha_fm = alpha_fm, beta_fm = beta_fm) 59 | 60 | learner.fit(trainingFile=open(path+'X_train.csv'),n_epochs=5,validationFile=open(path+'X_test.csv'),eval_metric=roc_auc_score,reportFrequency=reportFrequency) 61 | 62 | # save the weights 63 | # w_outfile = path+"param.w.txt" 64 | # w_fm_outfile = path+"param.w_fm.txt" 65 | # learner.write_w(w_outfile) 66 | # learner.write_w_fm(w_fm_outfile) 67 | pd.to_pickle(learner,path+'ftrl_fm.pkl') 68 | 69 | 70 | test = pd.read_csv(path+'test_le_date.csv') 71 | activity_id = test['activity_id'] 72 | print('Make submission') 73 | # X_t = [X_t[:,i] for i in range(X_t.shape[1])] 74 | y_preds = learner.predict(testingFile=open(testingFile),n_epochs=5) 75 | submission = pd.DataFrame() 76 | submission['activity_id'] = activity_id 77 | submission['outcome'] = outcome 78 | submission.to_csv('submission_ftrl_fm_%s.csv'%dim,index=False) 79 | -------------------------------------------------------------------------------- /demo/mf_qe_nn_clf.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from scipy import sparse as ssp 4 | from sklearn.preprocessing import LabelEncoder,LabelBinarizer,MinMaxScaler,OneHotEncoder,StandardScaler,Normalizer 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances 7 | from sklearn.feature_selection import SelectFromModel 8 | from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer 9 | from sklearn.datasets import 
dump_svmlight_file,load_svmlight_file 10 | from sklearn.svm import LinearSVC 11 | from sklearn.ensemble import RandomForestClassifier 12 | from sklearn.cross_validation import KFold,StratifiedKFold 13 | from sklearn.metrics import roc_auc_score,accuracy_score 14 | from keras.preprocessing import sequence 15 | from keras.callbacks import ModelCheckpoint 16 | from keras import backend as K 17 | from keras.layers import Input, Embedding, LSTM, Dense,Flatten, Dropout, merge,Convolution1D,MaxPooling1D,Lambda 18 | from keras.layers.normalization import BatchNormalization 19 | from keras.optimizers import SGD,Nadam 20 | from keras.layers.advanced_activations import PReLU,LeakyReLU,ELU,SReLU 21 | from keras.models import Model 22 | from keras.utils.visualize_util import plot 23 | import distance 24 | import xgboost as xgb 25 | 26 | seed = 1024 27 | np.random.seed(seed) 28 | 29 | path = "../" 30 | 31 | 32 | def str_jaccard(str1, str2): 33 | res = distance.jaccard(str1, str2) 34 | return res 35 | 36 | 37 | question_numeric = ['char_4_q','char_5_q','char_6_q'] 38 | 39 | train = pd.read_csv(path+'invited_info_train.txt',dtype={"expert_id":str,'question_id':str}) 40 | expert_id = train['expert_id'].values 41 | expert_id = LabelEncoder().fit_transform(expert_id) 42 | 43 | test = pd.read_csv(path+'validate_nolabel.txt',dtype={"expert_id":str,'question_id':str}).fillna(-1) 44 | test.columns = ['question_id','expert_id','label'] 45 | len_train = train.shape[0] 46 | 47 | 48 | train = pd.concat([train,test]) 49 | 50 | expert = pd.read_csv(path+'user_info.txt',dtype={"expert_id":str}) 51 | question = pd.read_csv(path+'question_info.txt',dtype={"question_id":str}).fillna(-1) 52 | question['char_3_q'] = question['char_3_q'].astype(str) 53 | 54 | expert['char_1'] = expert['char_1'].apply(lambda x: x.replace('/',' ')) 55 | expert['char_2'] = expert['char_2'].apply(lambda x: x.replace('/',' ')) 56 | expert['char_3'] = expert['char_3'].apply(lambda x: x.replace('/',' ')) 57 | 58 | 
question['char_2_q'] = question['char_2_q'].apply(lambda x: x.replace('/',' ')) 59 | question['char_3_q'] = question['char_3_q'].apply(lambda x: x.replace('/',' ')) 60 | 61 | count_char_1 = CountVectorizer(ngram_range=(1,3)) 62 | tfidf_char_2 = TfidfVectorizer(ngram_range=(1,3)) 63 | tfidf_char_3 = TfidfVectorizer(ngram_range=(1,3)) 64 | 65 | count_char_1.fit(expert['char_1'].values) 66 | tfidf_char_2.fit(expert['char_2'].values.tolist()+question['char_2_q'].values.tolist()) 67 | tfidf_char_3.fit(expert['char_3'].values.tolist()+question['char_3_q'].values.tolist()) 68 | 69 | lb_char_1_q = LabelBinarizer(sparse_output=True) 70 | lb_char_1_q.fit(question['char_1_q'].values) 71 | 72 | 73 | train = pd.merge(train,expert,on='expert_id',how='left')#.fillna(' ') 74 | train = pd.merge(train,question,on='question_id',how='left') 75 | 76 | 77 | le = LabelEncoder() 78 | train['question_id'] = le.fit_transform(train['question_id'].values) 79 | train['expert_id'] = le.fit_transform(train['expert_id'].values) 80 | 81 | y = train['label'].values 82 | features = [ 83 | 'question_id', 84 | 'expert_id', 85 | ] 86 | 87 | X = train[features].values 88 | # X = OneHotEncoder().fit_transform(X).tocsr() 89 | # X_char_1 = count_char_1.transform(train['char_1'].values) 90 | # X_char_2 = tfidf_char_2.transform(train['char_2'].values) 91 | # X_char_3 = tfidf_char_3.transform(train['char_3'].values) 92 | 93 | 94 | # X_char_1_q = lb_char_1_q.fit_transform(train['char_1_q'].values) 95 | # X_char_2_q = tfidf_char_2.transform(train['char_2_q'].values) 96 | # X_char_3_q = tfidf_char_3.transform(train['char_3_q'].values) 97 | 98 | # stand_char_4_5_6_q = StandardScaler() 99 | # stand_char_4_5_6_q.fit(train[question_numeric].values) 100 | # X_char_4_5_6_q = stand_char_4_5_6_q.transform(train[question_numeric].values) 101 | 102 | 103 | print ('X raw',X.shape) 104 | 105 | # sim_char_2 = [] 106 | # for expert_char_2,question_char_2 in zip(X_char_2,X_char_2_q): 107 | # cos_sim_2 = 
pairwise_distances(expert_char_2, question_char_2, metric='cosine')[0][0] 108 | # sim_char_2.append(cos_sim_2) 109 | # sim_char_2 = np.array(sim_char_2) 110 | # sim_char_2 = np.expand_dims(sim_char_2,1) 111 | 112 | # sim_char_3 = [] 113 | # for expert_char_3,question_char_3 in zip(X_char_3,X_char_3_q): 114 | # cos_sim_3 = pairwise_distances(expert_char_3, question_char_3, metric='cosine')[0][0] 115 | # sim_char_3.append(cos_sim_3) 116 | # sim_char_3 = np.array(sim_char_3) 117 | # sim_char_3 = np.expand_dims(sim_char_3,1) 118 | 119 | # X = ssp.hstack([ 120 | # X, 121 | # # X_char_1, 122 | # # X_char_2, 123 | # # X_char_3, 124 | # # X_char_1_q, 125 | # # X_char_2_q, 126 | # # X_char_3_q, 127 | # # X_char_4_5_6_q, 128 | # # sim_char_2, 129 | # # sim_char_3, 130 | # ]).tocsr() 131 | 132 | # dump_svmlight_file(X,y,path+'data.svm') 133 | 134 | # data,y_all = load_svmlight_file(path+'data.svm') 135 | y_all = y 136 | data = X 137 | num_q = len(np.unique(data[:,0])) 138 | num_e = len(np.unique(data[:,1])) 139 | del X 140 | del y 141 | 142 | X = data[:len_train] 143 | y = y_all[:len_train] 144 | X_t= data[len_train:] 145 | del data 146 | del y_all 147 | 148 | def make_mf_lr(X ,y, clf, X_test, n_round=3): 149 | n = X.shape[0] 150 | ''' 151 | Fit metafeature by @clf and get prediction for test. 
Assumed that @clf -- regressor 152 | ''' 153 | print clf 154 | mf_tr = np.zeros(X.shape[0]) 155 | mf_te = np.zeros(X_test.shape[0]) 156 | for i in range(n_round): 157 | skf = StratifiedKFold(y, n_folds=2, shuffle=True, random_state=42+i*1000) 158 | for ind_tr, ind_te in skf: 159 | X_tr = X[ind_tr] 160 | X_te = X[ind_te] 161 | 162 | # print('X_tr shape',X_tr.shape) 163 | # print('X_te shape',X_te.shape) 164 | 165 | y_tr = y[ind_tr] 166 | y_te = y[ind_te] 167 | 168 | clf.fit(X_tr, y_tr) 169 | mf_tr[ind_te] += clf.predict_proba(X_te)[:,1] 170 | mf_te += clf.predict_proba(X_test)[:,1]*0.5 171 | y_pred = clf.predict_proba(X_te)[:,1] 172 | score = roc_auc_score(y_te, y_pred) 173 | print 'pred[{}] score:{}'.format(i, score) 174 | return (mf_tr / n_round, mf_te / n_round) 175 | 176 | 177 | def make_mf_lsvc(X ,y, clf, X_test, n_round=3): 178 | n = X.shape[0] 179 | ''' 180 | Fit metafeature by @clf and get prediction for test. Assumed that @clf -- regressor 181 | ''' 182 | print clf 183 | mf_tr = np.zeros(X.shape[0]) 184 | mf_te = np.zeros(X_test.shape[0]) 185 | for i in range(n_round): 186 | skf = StratifiedKFold(y, n_folds=2, shuffle=True, random_state=42+i*1000) 187 | for ind_tr, ind_te in skf: 188 | X_tr = X[ind_tr] 189 | X_te = X[ind_te] 190 | 191 | # print('X_tr shape',X_tr.shape) 192 | # print('X_te shape',X_te.shape) 193 | 194 | y_tr = y[ind_tr] 195 | y_te = y[ind_te] 196 | 197 | clf.fit(X_tr, y_tr) 198 | mf_tr[ind_te] += clf.decision_function(X_te) 199 | mf_te += clf.decision_function(X_test)*0.5 200 | y_pred = clf.decision_function(X_te) 201 | score = roc_auc_score(y_te, y_pred) 202 | print 'pred[{}] score:{}'.format(i, score) 203 | return (mf_tr / n_round, mf_te / n_round) 204 | 205 | def make_mf_nn(X ,y, X_test, n_round=3): 206 | n = X.shape[0] 207 | ''' 208 | Fit metafeature by @clf and get prediction for test. 
Assumed that @clf -- regressor 209 | ''' 210 | from kaggler.online_model.ftrl import FTRL 211 | mf_tr = np.zeros(X.shape[0]) 212 | mf_te = np.zeros(X_test.shape[0]) 213 | for i in range(n_round): 214 | skf = StratifiedKFold(y, n_folds=2, shuffle=True, random_state=42+i*1000) 215 | for ind_tr, ind_te in skf: 216 | clf = build_model(X) 217 | X_tr = [X[:,0][ind_tr],X[:,1][ind_tr]] 218 | X_te = [X[:,0][ind_te],X[:,1][ind_te]] 219 | 220 | # print('X_tr shape',X_tr.shape) 221 | # print('X_te shape',X_te.shape) 222 | 223 | y_tr = y[ind_tr] 224 | y_te = y[ind_te] 225 | 226 | clf.fit(X_tr, y_tr,nb_epoch=2,batch_size=128,validation_data=[X_te,y_te]) 227 | mf_tr[ind_te] += clf.predict(X_te).ravel() 228 | mf_te += clf.predict([X_test[:,0],X_test[:,1]]).ravel()*0.5 229 | y_pred = clf.predict(X_te).ravel() 230 | score = roc_auc_score(y_te, y_pred) 231 | print 'pred[{}] score:{}'.format(i, score) 232 | return (mf_tr / n_round, mf_te / n_round) 233 | 234 | def build_model(X,dim=128): 235 | 236 | inputs_p = Input(shape=(1,), dtype='int32') 237 | 238 | embed_p = Embedding( 239 | num_q, 240 | dim, 241 | dropout=0.2, 242 | input_length=1 243 | )(inputs_p) 244 | 245 | inputs_d = Input(shape=(1,), dtype='int32') 246 | 247 | embed_d = Embedding( 248 | num_e, 249 | dim, 250 | dropout=0.2, 251 | input_length=1 252 | )(inputs_d) 253 | 254 | 255 | flatten_p= Flatten()(embed_p) 256 | 257 | flatten_d= Flatten()(embed_d) 258 | 259 | flatten = merge([ 260 | flatten_p, 261 | flatten_d, 262 | ],mode='concat') 263 | 264 | fc1 = Dense(512)(flatten) 265 | fc1 = SReLU()(fc1) 266 | dp1 = Dropout(0.7)(fc1) 267 | 268 | outputs = Dense(1,activation='sigmoid',name='outputs')(dp1) 269 | 270 | inputs = [ 271 | inputs_p, 272 | inputs_d, 273 | ] 274 | 275 | 276 | 277 | model = Model(input=inputs, output=outputs) 278 | nadam = Nadam() 279 | sgd = SGD(lr=1e-3, decay=1e-6, momentum=0.9, nesterov=True) 280 | model.compile( 281 | optimizer=nadam, 282 | loss= 'binary_crossentropy' 283 | ) 284 | 285 | return model 
286 | 287 | mf_nn_clf = make_mf_nn(X ,y, X_t, n_round=10) 288 | pd.to_pickle(mf_nn_clf,path+'mf_nn_clf.pkl') 289 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 
61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Kaggler.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Kaggler.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/Kaggler" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Kaggler" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 
112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 
163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Kaggler documentation build configuration file, created by 4 | # sphinx-quickstart on Tue Feb 10 04:55:59 2015. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | #sys.path.insert(0, os.path.abspath('.')) 22 | sys.path.insert(0, os.path.abspath("../..")) 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | #needs_sphinx = '1.0' 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 
32 | extensions = [ 33 | 'sphinx.ext.autodoc', 34 | 'sphinxcontrib.napoleon', 35 | 'sphinx.ext.doctest', 36 | 'sphinx.ext.intersphinx', 37 | 'sphinx.ext.todo', 38 | 'sphinx.ext.coverage', 39 | 'sphinx.ext.mathjax', 40 | 'sphinx.ext.viewcode', 41 | ] 42 | 43 | # Add any paths that contain templates here, relative to this directory. 44 | templates_path = ['_templates'] 45 | 46 | # The suffix of source filenames. 47 | source_suffix = '.rst' 48 | 49 | # The encoding of source files. 50 | #source_encoding = 'utf-8-sig' 51 | 52 | # The master toctree document. 53 | master_doc = 'index' 54 | 55 | # General information about the project. 56 | project = u'Kaggler' 57 | copyright = u'2015, Jeong-Yoon Lee' 58 | 59 | # The version info for the project you're documenting, acts as replacement for 60 | # |version| and |release|, also used in various other places throughout the 61 | # built documents. 62 | # 63 | # The short X.Y version. 64 | version = '0.4' 65 | # The full version, including alpha/beta/rc tags. 66 | release = '0.4.1' 67 | 68 | # The language for content autogenerated by Sphinx. Refer to documentation 69 | # for a list of supported languages. 70 | #language = None 71 | 72 | # There are two options for replacing |today|: either, you set today to some 73 | # non-false value, then it is used: 74 | #today = '' 75 | # Else, today_fmt is used as the format for a strftime call. 76 | #today_fmt = '%B %d, %Y' 77 | 78 | # List of patterns, relative to source directory, that match files and 79 | # directories to ignore when looking for source files. 80 | exclude_patterns = ['_build'] 81 | 82 | # The reST default role (used for this markup: `text`) to use for all 83 | # documents. 84 | #default_role = None 85 | 86 | # If true, '()' will be appended to :func: etc. cross-reference text. 87 | #add_function_parentheses = True 88 | 89 | # If true, the current module name will be prepended to all description 90 | # unit titles (such as .. function::). 
91 | #add_module_names = True 92 | 93 | # If true, sectionauthor and moduleauthor directives will be shown in the 94 | # output. They are ignored by default. 95 | #show_authors = False 96 | 97 | # The name of the Pygments (syntax highlighting) style to use. 98 | pygments_style = 'sphinx' 99 | 100 | # A list of ignored prefixes for module index sorting. 101 | #modindex_common_prefix = [] 102 | 103 | # If true, keep warnings as "system message" paragraphs in the built documents. 104 | #keep_warnings = False 105 | 106 | 107 | # -- Options for HTML output ---------------------------------------------- 108 | 109 | # The theme to use for HTML and HTML Help pages. See the documentation for 110 | # a list of builtin themes. 111 | html_theme = 'default' 112 | 113 | # Theme options are theme-specific and customize the look and feel of a theme 114 | # further. For a list of options available for each theme, see the 115 | # documentation. 116 | #html_theme_options = {} 117 | 118 | # Add any paths that contain custom themes here, relative to this directory. 119 | #html_theme_path = [] 120 | 121 | # The name for this set of Sphinx documents. If None, it defaults to 122 | # " v documentation". 123 | #html_title = None 124 | 125 | # A shorter title for the navigation bar. Default is the same as html_title. 126 | #html_short_title = None 127 | 128 | # The name of an image file (relative to this directory) to place at the top 129 | # of the sidebar. 130 | #html_logo = None 131 | 132 | # The name of an image file (within the static path) to use as favicon of the 133 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 134 | # pixels large. 135 | #html_favicon = None 136 | 137 | # Add any paths that contain custom static files (such as style sheets) here, 138 | # relative to this directory. They are copied after the builtin static files, 139 | # so a file named "default.css" will overwrite the builtin "default.css". 
140 | html_static_path = ['_static'] 141 | 142 | # Add any extra paths that contain custom files (such as robots.txt or 143 | # .htaccess) here, relative to this directory. These files are copied 144 | # directly to the root of the documentation. 145 | #html_extra_path = [] 146 | 147 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 148 | # using the given strftime format. 149 | #html_last_updated_fmt = '%b %d, %Y' 150 | 151 | # If true, SmartyPants will be used to convert quotes and dashes to 152 | # typographically correct entities. 153 | #html_use_smartypants = True 154 | 155 | # Custom sidebar templates, maps document names to template names. 156 | #html_sidebars = {} 157 | 158 | # Additional templates that should be rendered to pages, maps page names to 159 | # template names. 160 | #html_additional_pages = {} 161 | 162 | # If false, no module index is generated. 163 | #html_domain_indices = True 164 | 165 | # If false, no index is generated. 166 | #html_use_index = True 167 | 168 | # If true, the index is split into individual pages for each letter. 169 | #html_split_index = False 170 | 171 | # If true, links to the reST sources are added to the pages. 172 | #html_show_sourcelink = True 173 | 174 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 175 | #html_show_sphinx = True 176 | 177 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 178 | #html_show_copyright = True 179 | 180 | # If true, an OpenSearch description file will be output, and all pages will 181 | # contain a tag referring to it. The value of this option must be the 182 | # base URL from which the finished HTML is served. 183 | #html_use_opensearch = '' 184 | 185 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 186 | #html_file_suffix = None 187 | 188 | # Output file base name for HTML help builder. 
189 | htmlhelp_basename = 'Kagglerdoc' 190 | 191 | 192 | # -- Options for LaTeX output --------------------------------------------- 193 | 194 | latex_elements = { 195 | # The paper size ('letterpaper' or 'a4paper'). 196 | #'papersize': 'letterpaper', 197 | 198 | # The font size ('10pt', '11pt' or '12pt'). 199 | #'pointsize': '10pt', 200 | 201 | # Additional stuff for the LaTeX preamble. 202 | #'preamble': '', 203 | } 204 | 205 | # Grouping the document tree into LaTeX files. List of tuples 206 | # (source start file, target name, title, 207 | # author, documentclass [howto, manual, or own class]). 208 | latex_documents = [ 209 | ('index', 'Kaggler.tex', u'Kaggler Documentation', 210 | u'Jeong-Yoon Lee', 'manual'), 211 | ] 212 | 213 | # The name of an image file (relative to this directory) to place at the top of 214 | # the title page. 215 | #latex_logo = None 216 | 217 | # For "manual" documents, if this is true, then toplevel headings are parts, 218 | # not chapters. 219 | #latex_use_parts = False 220 | 221 | # If true, show page references after internal links. 222 | #latex_show_pagerefs = False 223 | 224 | # If true, show URL addresses after external links. 225 | #latex_show_urls = False 226 | 227 | # Documents to append as an appendix to all manuals. 228 | #latex_appendices = [] 229 | 230 | # If false, no module index is generated. 231 | #latex_domain_indices = True 232 | 233 | 234 | # -- Options for manual page output --------------------------------------- 235 | 236 | # One entry per manual page. List of tuples 237 | # (source start file, name, description, authors, manual section). 238 | man_pages = [ 239 | ('index', 'kaggler', u'Kaggler Documentation', 240 | [u'Jeong-Yoon Lee'], 1) 241 | ] 242 | 243 | # If true, show URL addresses after external links. 244 | #man_show_urls = False 245 | 246 | 247 | # -- Options for Texinfo output ------------------------------------------- 248 | 249 | # Grouping the document tree into Texinfo files. 
List of tuples 250 | # (source start file, target name, title, author, 251 | # dir menu entry, description, category) 252 | texinfo_documents = [ 253 | ('index', 'Kaggler', u'Kaggler Documentation', 254 | u'Jeong-Yoon Lee', 'Kaggler', 'One line description of project.', 255 | 'Miscellaneous'), 256 | ] 257 | 258 | # Documents to append as an appendix to all manuals. 259 | #texinfo_appendices = [] 260 | 261 | # If false, no module index is generated. 262 | #texinfo_domain_indices = True 263 | 264 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 265 | #texinfo_show_urls = 'footnote' 266 | 267 | # If true, do not generate a @detailmenu in the "Top" node's menu. 268 | #texinfo_no_detailmenu = False 269 | 270 | 271 | # Example configuration for intersphinx: refer to the Python standard library. 272 | intersphinx_mapping = {'http://docs.python.org/': None} 273 | 274 | # Napoleon settings 275 | napoleon_google_docstring = True 276 | napoleon_numpy_docstring = True 277 | napoleon_include_private_with_doc = False 278 | napoleon_include_special_with_doc = True 279 | napoleon_use_admonition_for_examples = False 280 | napoleon_use_admonition_for_notes = False 281 | napoleon_use_admonition_for_references = False 282 | napoleon_use_ivar = False 283 | napoleon_use_param = True 284 | napoleon_use_rtype = True 285 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | kaggler package 2 | =============== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | kaggler.metrics 10 | kaggler.online_model 11 | kaggler.preprocessing 12 | kaggler.test 13 | 14 | Submodules 15 | ---------- 16 | 17 | kaggler.const module 18 | -------------------- 19 | 20 | .. automodule:: kaggler.const 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | kaggler.data_io module 26 | ----------------- 27 | 28 | .. 
automodule:: kaggler.data_io 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | 33 | kaggler.util module 34 | ------------------- 35 | 36 | .. automodule:: kaggler.util 37 | :members: 38 | :undoc-members: 39 | :show-inheritance: 40 | 41 | 42 | Module contents 43 | --------------- 44 | 45 | .. automodule:: kaggler 46 | :members: 47 | :undoc-members: 48 | :show-inheritance: 49 | -------------------------------------------------------------------------------- /doc/kaggler.metrics.rst: -------------------------------------------------------------------------------- 1 | kaggler.metrics package 2 | ============================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | kaggler.metrics.classification module 8 | ------------------------------ 9 | 10 | .. automodule:: kaggler.metrics.classification 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | kaggler.metrics.regression module 16 | ------------------------------ 17 | 18 | .. automodule:: kaggler.metrics.regression 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: kaggler.metrics 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /doc/kaggler.model.rst: -------------------------------------------------------------------------------- 1 | kaggler.model package 2 | ===================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | kaggler.model.nn module 8 | ----------------------- 9 | 10 | .. automodule:: kaggler.model.nn 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. 
automodule:: kaggler.model 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /doc/kaggler.online_model.rst: -------------------------------------------------------------------------------- 1 | kaggler.online_model package 2 | ============================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | kaggler.online_model.fm module 8 | ------------------------------ 9 | 10 | .. automodule:: kaggler.online_model.fm 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | kaggler.online_model.ftrl module 16 | -------------------------------- 17 | 18 | .. automodule:: kaggler.online_model.ftrl 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | kaggler.online_model.ftrl_dropout module 24 | ---------------------------------------- 25 | 26 | .. automodule:: kaggler.online_model.ftrl_dropout 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | kaggler.online_model.nn module 32 | ------------------------------ 33 | 34 | .. automodule:: kaggler.online_model.nn 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | kaggler.online_model.nn_h2 module 40 | --------------------------------- 41 | 42 | .. automodule:: kaggler.online_model.nn_h2 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | kaggler.online_model.sgd module 48 | ------------------------------- 49 | 50 | .. automodule:: kaggler.online_model.sgd 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | 56 | Module contents 57 | --------------- 58 | 59 | .. 
automodule:: kaggler.online_model 60 | :members: 61 | :undoc-members: 62 | :show-inheritance: 63 | -------------------------------------------------------------------------------- /doc/kaggler.preprocessing.rst: -------------------------------------------------------------------------------- 1 | kaggler.preprocessing package 2 | ============================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | kaggler.preprocessing.data module 8 | ------------------------------ 9 | 10 | .. automodule:: kaggler.preprocessing.data 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: kaggler.preprocessing 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /doc/kaggler.rst: -------------------------------------------------------------------------------- 1 | kaggler package 2 | =============== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | kaggler.metrics 10 | kaggler.online_model 11 | kaggler.preprocessing 12 | kaggler.test 13 | 14 | Submodules 15 | ---------- 16 | 17 | kaggler.const module 18 | -------------------- 19 | 20 | .. automodule:: kaggler.const 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | kaggler.data_io module 26 | ----------------- 27 | 28 | .. automodule:: kaggler.data_io 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | 33 | kaggler.util module 34 | ------------------- 35 | 36 | .. automodule:: kaggler.util 37 | :members: 38 | :undoc-members: 39 | :show-inheritance: 40 | 41 | 42 | Module contents 43 | --------------- 44 | 45 | .. 
def is_number(s):
    """Check if a string is a number or not.

    Args:
        s (str): a string to check.

    Returns:
        True if the string can be parsed as a float, False otherwise.
    """

    try:
        float(s)
        return True
    except ValueError:
        return False


def load_data(path, dense=False):
    """Load data from a CSV or libsvm format file.

    The format is auto-detected from the first line: a ':' means libsvm,
    a ',' means CSV with the target in the first column.

    Args:
        path (str): A path to the CSV or libsvm format file containing data.
        dense (boolean): An optional variable indicating if the return matrix
            should be dense.  By default, it is false.

    Returns:
        Feature matrix X and target vector y.

    Raises:
        NotImplementedError: if the file is neither CSV nor libsvm format.
    """

    # Peek at the first line only to detect the file format.
    with open(path, 'r') as f:
        line = f.readline().strip()

    if ':' in line:
        # libsvm format, e.g. "1 3:1 10:0.5".  Imported locally so the CSV
        # code path does not require scikit-learn.
        from sklearn.datasets import load_svmlight_file
        X, y = load_svmlight_file(path)
        X = X.astype(np.float32)
        if dense:
            X = X.todense()
    elif ',' in line:
        # CSV with the target in the first column.  Skip the first row when
        # its first field is not numeric (i.e. it looks like a header).
        X = np.loadtxt(path, delimiter=',',
                       skiprows=0 if is_number(line.split(',')[0]) else 1)
        y = X[:, 0]
        X = X[:, 1:]
    else:
        # NOTE: was the Python 2-only "raise E, msg" statement, which is a
        # syntax error under Python 3.
        raise NotImplementedError("Neither CSV nor LibSVM formatted file.")

    return X, y


def read_sps(path):
    """Iterate over a space-delimited file.

    Args:
        path (str): path to a file where each line is
            "<label> <feature> <feature> ...".

    Yields:
        tuple of (list of str, int): features and label of each line.
    """

    # 'with' closes the handle deterministically even if the consumer
    # abandons the generator.
    with open(path) as f:
        for line in f:
            # parse x
            xs = line.rstrip().split(' ')

            yield xs[1:], int(xs[0])


def shuf_file(f, shuf_win):
    """Shuffle lines of a file-like object within a sliding window.

    Lines are emitted ordered by their hash within a window of shuf_win
    lines, approximating a shuffle while keeping at most shuf_win lines
    in memory.

    Args:
        f: a file-like object (or any iterable of lines).
        shuf_win (int): the size of the shuffle window.

    Yields:
        lines of f in (approximately) shuffled order.
    """
    heap = []
    for line in f:
        key = hash(line)
        if len(heap) < shuf_win:
            heapq.heappush(heap, (key, line))
        else:
            _, out = heapq.heappushpop(heap, (key, line))
            yield out

    # Drain whatever remains in the window.
    while len(heap) > 0:
        _, out = heapq.heappop(heap)
        yield out
def rmse(y, p):
    """Root Mean Squared Error (RMSE).

    Args:
        y (numpy.array): target
        p (numpy.array): prediction

    Returns:
        e (numpy.float64): RMSE
    """

    # Targets and predictions must be aligned one-to-one.
    assert y.shape == p.shape

    squared_error = mean_squared_error(y, p)
    return np.sqrt(squared_error)


def gini(y, p):
    """Normalized Gini Coefficient.

    Args:
        y (numpy.array): target
        p (numpy.array): prediction

    Returns:
        e (numpy.float64): normalized Gini coefficient
    """

    # Targets and predictions must be aligned one-to-one.
    assert y.shape == p.shape

    n_samples = y.shape[0]

    # Targets reordered from the largest to the smallest, once by the
    # target itself and once by the prediction.
    desc_by_true = y[np.argsort(y)[::-1]]
    desc_by_pred = y[np.argsort(p)[::-1]]

    # Lorenz curves for the two orderings and the diagonal baseline.
    lorenz_true = np.cumsum(desc_by_true) / np.sum(desc_by_true)
    lorenz_pred = np.cumsum(desc_by_pred) / np.sum(desc_by_pred)
    lorenz_ones = np.linspace(1 / n_samples, 1, n_samples)

    # Gini coefficients: area between each curve and the baseline.
    g_true = np.sum(lorenz_ones - lorenz_true)
    g_pred = np.sum(lorenz_ones - lorenz_pred)

    # Normalize by the Gini coefficient of the perfect ordering.
    return g_pred / g_true
class NN(object):
    """Neural network with a single hidden (h) layer.

    The network is trained on sampled (negative, positive) example pairs
    with a pairwise squared-error loss, optimized with the quasi-Newton
    method (L-BFGS-B), so it effectively maximizes AUC.
    """

    def __init__(self, n=5, h=10, b=100000, l1=.0, l2=.0, random_state=None):
        """Initialize the NN class object.

        Args:
            n (int): number of epochs
            h (int): number of h nodes
            b (int): number of input examples to be processed together to find
                the second order gradient for back-propagation
            l1 (float): regularization parameter for weights between the input
                and hidden layers
            l2 (float): regularization parameter for weights between the hidden
                and output layers.
            random_state (int or None): random seed
        """

        np.random.seed(random_state)
        self.h = h
        self.b = b
        self.n = n
        self.l1 = l1
        self.l2 = l2
        self.n_opt = 0

    def fit(self, X, y, X_val=None, y_val=None):
        """Train a network with the quasi-Newton method.

        Args:
            X (np.array of float): feature matrix for training
            y (np.array of float): target values for training
            X_val (np.array of float): feature matrix for validation
            y_val (np.array of float): target values for validation
        """
        y = y.reshape((len(y), 1))

        if sparse.issparse(X):
            X = X.tocsr()

        if X_val is not None:
            n_val = len(y_val)
            y_val = y_val.reshape((n_val, 1))

        # Set initial weights randomly.
        self.i = X.shape[1]
        self.l1 = self.l1 / self.i
        self.w = (np.random.rand((self.i + 2) * self.h + 1) - .5) * 1e-6
        self.w_opt = self.w
        self.n_opt = 0

        logging.info('training ...')
        n_obs = X.shape[0]
        batch = self.b
        n_epoch = self.n
        # list() so that np.random.shuffle() works under Python 3, where
        # range() is an immutable lazy sequence.
        idx = list(range(n_obs))
        self.auc_opt = .5

        start = time.time()
        print('\tEPOCH TRAIN VALID BEST TIME (m)')
        print('\t--------------------------------------------')

        # Before training
        p = self.predict_raw(X)
        auc = roc_auc_score(y, p)
        auc_val = auc
        if X_val is not None:
            p_val = self.predict_raw(X_val)
            auc_val = roc_auc_score(y_val, p_val)

        print('\t{:3d}: {:.6f} {:.6f} {:.6f} {:.2f}'.format(
            0, auc, auc_val, self.auc_opt,
            (time.time() - start) / SEC_PER_MIN))

        # Use 'while' instead of 'for' to increase n_epoch if the validation
        # error keeps improving at the end of n_epoch
        epoch = 1
        while epoch <= n_epoch:
            # Shuffle inputs every epoch - it helps avoiding the local optimum
            # when batch < n_obs.
            np.random.shuffle(idx)

            # Find the optimal weights for batch input examples.
            # If batch == 1, it's the stochastic optimization, which is slow
            # but uses minimal memory.  If batch == n_obs, it's the batch
            # optimization, which is fast but uses maximum memory.
            # Otherwise, it's the mini-batch optimization, which balances the
            # speed and space trade-offs.
            for i in range(int(n_obs / batch) + 1):
                if (i + 1) * batch > n_obs:
                    sub_idx = idx[batch * i:n_obs]
                else:
                    sub_idx = idx[batch * i:batch * (i + 1)]

                # When n_obs is a multiple of batch, the last slice is empty;
                # optimizing on an empty input would crash.
                if not sub_idx:
                    continue

                x = X[sub_idx]
                neg_idx = [n_idx for n_idx, n_y in enumerate(y[sub_idx]) if n_y == 0.]
                pos_idx = [p_idx for p_idx, p_y in enumerate(y[sub_idx]) if p_y == 1.]

                # The pairwise cost needs at least one example of each class
                # in the mini-batch.
                if not neg_idx or not pos_idx:
                    continue

                x0 = x[neg_idx]
                x1 = x[pos_idx]
                # Update weights to minimize the cost function using the
                # quasi-Newton method (L-BFGS-B), where:
                #   func -- cost function
                #   jac -- jacobian (derivative of the cost function)
                #   maxiter -- number of iterations for L-BFGS-B
                ret = minimize(self.func,
                               self.w,
                               args=(x0, x1),
                               method='L-BFGS-B',
                               jac=self.fprime,
                               options={'maxiter': 5})
                self.w = ret.x

            p = self.predict_raw(X)
            auc = roc_auc_score(y, p)
            auc_val = auc

            if X_val is not None:
                p_val = self.predict_raw(X_val)
                auc_val = roc_auc_score(y_val, p_val)

                if auc_val > self.auc_opt:
                    self.auc_opt = auc_val
                    self.w_opt = self.w
                    self.n_opt = epoch

                    # If validation auc is still improving after n_epoch,
                    # try 5 more epochs.
                    if epoch == n_epoch:
                        n_epoch += 5

            print('\t{:3d}: {:.6f} {:.6f} {:.6f} {:.2f}'.format(
                epoch, auc, auc_val, self.auc_opt,
                (time.time() - start) / SEC_PER_MIN))

            epoch += 1

        if X_val is not None:
            print('Optimal epoch is {0} ({1:.6f})'.format(self.n_opt,
                                                          self.auc_opt))
            self.w = self.w_opt

        logging.info('done training')

    def predict(self, X):
        """Predict targets for a feature matrix.

        Args:
            X (np.array of float): feature matrix for prediction

        Returns:
            prediction (np.array of float): predicted probabilities in [0, 1]
        """
        logging.info('predicting ...')
        ps = self.predict_raw(X)

        return sigm(ps[:, 0])

    def predict_raw(self, X):
        """Predict raw (pre-sigmoid) scores for a feature matrix.

        Args:
            X (np.array of float): feature matrix for prediction

        Returns:
            raw prediction (np.array of float) of shape (n_obs, 1)
        """
        # b -- bias for the input and h layers
        b = np.ones((X.shape[0], 1))
        w2 = self.w[-(self.h + 1):].reshape(self.h + 1, 1)
        w1 = self.w[:-(self.h + 1)].reshape(self.i + 1, self.h)

        # Make X to have the same number of columns as self.i.
        # Because of the sparse matrix representation, X for prediction can
        # have a different number of columns.
        if X.shape[1] > self.i:
            # If X has more columns, cut extra columns.
            X = X[:, :self.i]
        elif X.shape[1] < self.i:
            # If X has less columns, cut the rows of the weight matrix between
            # the input and h layers instead of X itself because the SciPy
            # sparse matrix does not support .set_shape() yet.
            # list() so that .append() works under Python 3.
            idx = list(range(X.shape[1]))
            idx.append(self.i)      # Include the last row for the bias
            w1 = w1[idx, :]

        if sparse.issparse(X):
            return np.hstack((sigm(sparse.hstack((X, b)).dot(w1)), b)).dot(w2)
        else:
            return np.hstack((sigm(np.hstack((X, b)).dot(w1)), b)).dot(w2)

    def func(self, w, *args):
        """Return the cost of the network for sampled example pairs.

        Args:
            w (array of float): weight vectors such that:
                w[:-h1] -- weights between the input and h layers
                w[-h1:] -- weights between the h and output layers
            args: negative examples (args[0]) and positive examples (args[1])

        Returns:
            combined cost of the mean pairwise squared error and the
            regularization terms.

        NOTE(review): despite the names, both self.l1 and self.l2 penalize
        squared weights, i.e. both are L2-style penalties.
        """
        x0 = args[0]
        x1 = args[1]

        n0 = x0.shape[0]
        n1 = x1.shape[0]

        # n -- number of pairs to evaluate
        n = max(n0, n1) * 10
        idx0 = np.random.choice(range(n0), size=n)
        idx1 = np.random.choice(range(n1), size=n)

        # b -- bias for the input and h layers
        b0 = np.ones((n0, 1))
        b1 = np.ones((n1, 1))
        i1 = self.i + 1
        h = self.h
        h1 = h + 1

        # Predict for features -- cannot use predict_raw() because here
        # different weights can be used.
        if sparse.issparse(x0):
            p0 = np.hstack((sigm(sparse.hstack((x0, b0)).dot(w[:-h1].reshape(
                i1, h))), b0)).dot(w[-h1:].reshape(h1, 1))
            p1 = np.hstack((sigm(sparse.hstack((x1, b1)).dot(w[:-h1].reshape(
                i1, h))), b1)).dot(w[-h1:].reshape(h1, 1))
        else:
            p0 = np.hstack((sigm(np.hstack((x0, b0)).dot(w[:-h1].reshape(
                i1, h))), b0)).dot(w[-h1:].reshape(h1, 1))
            p1 = np.hstack((sigm(np.hstack((x1, b1)).dot(w[:-h1].reshape(
                i1, h))), b1)).dot(w[-h1:].reshape(h1, 1))

        p0 = p0[idx0]
        p1 = p1[idx1]

        # Return the cost that consists of the mean pairwise squared error +
        # regularization for weights between the input and h layers +
        # regularization for weights between the h and output layers.
        return .5 * (sum((1 - p1 + p0) ** 2) / n +
                     self.l1 * sum(w[:-h1] ** 2) / (i1 * h) +
                     self.l2 * sum(w[-h1:] ** 2) / h1)

    def fprime(self, w, *args):
        """Return the derivatives of the cost function for sampled pairs.

        Args:
            w (array of float): weight vectors such that:
                w[:-h1] -- weights between the input and h layers
                w[-h1:] -- weights between the h and output layers
            args: negative examples (args[0]) and positive examples (args[1])

        Returns:
            gradients of the cost function with respect to w
        """

        x0 = args[0]
        x1 = args[1]

        n0 = x0.shape[0]
        n1 = x1.shape[0]

        # n -- number of pairs to evaluate
        n = max(n0, n1) * 10
        idx0 = np.random.choice(range(n0), size=n)
        idx1 = np.random.choice(range(n1), size=n)

        # b -- bias for the input and h layers
        b = np.ones((n, 1))
        i1 = self.i + 1
        h = self.h
        h1 = h + 1

        w2 = w[-h1:].reshape(h1, 1)
        w1 = w[:-h1].reshape(i1, h)

        if sparse.issparse(x0):
            x0 = x0.tocsr()[idx0]
            x1 = x1.tocsr()[idx1]
            xb0 = sparse.hstack((x0, b))
            xb1 = sparse.hstack((x1, b))
        else:
            x0 = x0[idx0]
            x1 = x1[idx1]
            xb0 = np.hstack((x0, b))
            xb1 = np.hstack((x1, b))

        z0 = np.hstack((sigm(xb0.dot(w1)), b))
        z1 = np.hstack((sigm(xb1.dot(w1)), b))
        y0 = z0.dot(w2)
        y1 = z1.dot(w2)

        # Derivative of the .5 * (1 - p1 + p0)^2 pairwise error w.r.t. the
        # raw score difference.  The sigmoid variant is kept for reference:
        #   e = 1 - sigm(y1 - y0); dy = e * dsigm(y1 - y0)
        e = 1 - (y1 - y0)
        dy = e / n

        # Calculate the derivative of the cost function w.r.t. F and w2 where:
        # F -- weights between the input and h layers
        # w2 -- weights between the h and output layers
        dw1 = -(xb1.T.dot(dy.dot(w2[:-1].reshape(1, h)) * dsigm(xb1.dot(w1))) -
                xb0.T.dot(dy.dot(w2[:-1].reshape(1, h)) * dsigm(xb0.dot(w1)))
                ).reshape(i1 * h) + self.l1 * w[:-h1] / (i1 * h)
        dw2 = -(z1 - z0).T.dot(dy).reshape(h1) + self.l2 * w[-h1:] / h1

        return np.append(dw1, dw2)
320 | 321 | Args: 322 | x (np.array of float or float) 323 | 324 | Returns: 325 | value(s) of the sigmoid function for x. 326 | """ 327 | 328 | # Avoid numerical overflow by capping the input to the exponential 329 | # function - doesn't affect the return value. 330 | return 1 / (1 + np.exp(-np.maximum(x, -20))) 331 | 332 | 333 | def dsigm(x): 334 | """Return the value of derivative of sigmoid function w.r.t. x. 335 | Args: 336 | x (np.array of float or float) 337 | 338 | Returns: 339 | derivative(s) of the sigmoid function w.r.t. x. 340 | """ 341 | 342 | return sigm(x) * (1 - sigm(x)) 343 | -------------------------------------------------------------------------------- /kaggler/online_model/DecisionTree/OnlineClassificationTree.py: -------------------------------------------------------------------------------- 1 | from _tree import Tree 2 | from OnlineDecisionTree import * 3 | from utils import * 4 | import numpy as np 5 | import pandas as pd 6 | 7 | class ClassificationTree(Tree): 8 | 9 | def __init__( 10 | self, 11 | number_of_features, 12 | number_of_functions=10, 13 | min_sample_split=200, 14 | predict_initialize={ 15 | 'count_dict': {}, 16 | } 17 | ): 18 | # Constant values 19 | self.number_of_features = number_of_features 20 | self.number_of_functions = number_of_functions 21 | self.min_sample_split = min_sample_split 22 | self.predict_initialize = predict_initialize 23 | self.max_sample = 1000 24 | # Dynamic values 25 | self.left = None 26 | self.right = None 27 | self.randomly_selected_features = [] 28 | self._randomly_select() 29 | self.criterion = None 30 | 31 | 32 | def _calculate_split_score(self, split): 33 | """ 34 | calculate the score of the split: 35 | score = current_error - after_split_error 36 | """ 37 | left_error = gini(split['left']) 38 | right_error = gini(split['right']) 39 | error = gini(self.Y) 40 | # if the split is any good, the score should be greater than 0 41 | total = float(len(self.Y)) 42 | score = error - 1 / total * 
(len(split['left']) * left_error\ 43 | + len(split['right']) * right_error) 44 | return score 45 | 46 | def _apply_best_split(self): 47 | best_split, best_split_score = self._find_best_split() 48 | if best_split_score > 0: 49 | self.criterion = lambda x : x[best_split['feature']] \ 50 | > best_split['value'] 51 | # create the left child 52 | self.left = ClassificationTree( 53 | number_of_features=self.number_of_features, 54 | number_of_functions=self.number_of_functions, 55 | min_sample_split=self.min_sample_split, 56 | predict_initialize={ 57 | 'count_dict': count_dict(best_split['left']), 58 | } 59 | ) 60 | # create the right child 61 | self.right = ClassificationTree( 62 | number_of_features=self.number_of_features, 63 | number_of_functions=self.number_of_functions, 64 | min_sample_split=self.min_sample_split, 65 | predict_initialize={ 66 | 'count_dict': count_dict(best_split['right']), 67 | } 68 | ) 69 | # Collect garbage 70 | self.samples = {} 71 | self.Y = [] 72 | 73 | 74 | def predict(self, x): 75 | """ 76 | Make prediction recursively. Use both the samples inside the current 77 | node and the statistics inherited from parent. 
78 | """ 79 | if self._is_leaf(): 80 | d1 = self.predict_initialize['count_dict'] 81 | d2 = count_dict(self.Y) 82 | for key, value in d1.iteritems(): 83 | if key in d2: 84 | d2[key] += value 85 | else: 86 | d2[key] = value 87 | return argmax(d2) 88 | else: 89 | if self.criterion(x): 90 | return self.right.predict(x) 91 | else: 92 | return self.left.predict(x) 93 | -------------------------------------------------------------------------------- /kaggler/online_model/DecisionTree/_tree.pyx: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | cimport numpy as np 4 | 5 | from utils import * 6 | 7 | ctypedef np.int_t DTYPE_t 8 | 9 | cdef class Tree: 10 | 11 | def __cinit__( 12 | self, 13 | int number_of_features, 14 | int number_of_functions=10, 15 | int min_sample_split=20, 16 | dict predict_initialize={ 17 | 'mean':2.0, 18 | 'variance':1.0, 19 | 'num_samples':0 20 | } 21 | ): 22 | # Constant values 23 | self.number_of_features = number_of_features 24 | self.number_of_functions = number_of_functions 25 | self.min_sample_split = min_sample_split 26 | self.predict_initialize = predict_initialize 27 | self.max_sample = 100 28 | # Dynamic values 29 | self.left = None 30 | self.right = None 31 | self.randomly_selected_features = [] 32 | self._randomly_select() 33 | self.criterion = None 34 | 35 | 36 | def _randomly_select(self): 37 | # Check the number of randomly selected features 38 | if self.number_of_features < self.number_of_functions: 39 | raise Exception("The feature number is more than maximum") 40 | 41 | # Randomly select features into a set, and then transform to a list 42 | self.randomly_selected_features=set([]) 43 | while len(self.randomly_selected_features) < self.number_of_functions: 44 | self.randomly_selected_features.add(\ 45 | random.randint(0, self.number_of_features-1)) 46 | self.randomly_selected_features = list(self.randomly_selected_features) 47 | 48 | # Initialize the samples 
belong to the node 49 | self.samples = {} 50 | self.Y = [] 51 | for feature in self.randomly_selected_features: 52 | self.samples[feature] = [] 53 | 54 | def _is_leaf(self): 55 | return self.criterion == None 56 | 57 | cpdef update(self, np.ndarray x, y): 58 | """ 59 | Update the model according to a single (x, y) input. 60 | 61 | If the current node is a leaf, then update the samples of the 62 | current node. 63 | 64 | Else update its left or right node recursively according to the 65 | value of x. 66 | When the left and right child are created, they inherit mean and 67 | sample count information from the parent. 68 | """ 69 | cdef int N 70 | if self._is_leaf(): 71 | N = len(self.Y) 72 | if N <= self.max_sample: 73 | self._update_samples(x, y) 74 | if N == self.min_sample_split or N == 2 * self.min_sample_split: 75 | self._apply_best_split() 76 | 77 | else: 78 | if self.criterion(x): 79 | self.right.update(x, y) 80 | else: 81 | self.left.update(x, y) 82 | 83 | cpdef _update_samples(self, np.ndarray x, DTYPE_t y): 84 | cdef int feature 85 | for feature in self.randomly_selected_features: 86 | self.samples[feature].append((x[feature], y)) 87 | self.Y.append(y) 88 | 89 | cpdef tuple _find_best_split(self): 90 | cdef dict best_split = {} 91 | cdef double best_split_score = 0 92 | cdef int feature 93 | cdef double value 94 | cdef DTYPE_t prediction 95 | cdef list sample_feature 96 | cdef list left, right 97 | cdef dict split 98 | cdef double split_score 99 | # Try all the selected features and values combination, find the best 100 | for feature in self.randomly_selected_features: 101 | for (value, prediction) in self.samples[feature]: 102 | sample_feature = self.samples[feature] 103 | left, right = bin_split(sample_feature, value) 104 | 105 | split = { 106 | 'left': left, 107 | 'right': right, 108 | 'value': value, 109 | 'feature': feature, 110 | } 111 | 112 | split_score = self._calculate_split_score(split) 113 | if split_score > best_split_score: 114 | best_split = 
split 115 | best_split_score = split_score 116 | 117 | return best_split, best_split_score 118 | -------------------------------------------------------------------------------- /kaggler/online_model/DecisionTree/test.py: -------------------------------------------------------------------------------- 1 | import profile 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn import preprocessing 5 | from OnlineClassificationTree import * 6 | 7 | def test(): 8 | filename = "dataset.csv" 9 | df = pd.read_csv(filename, header = 0) 10 | data = df.values 11 | y = data[:, -1] 12 | lbl_enc = preprocessing.LabelEncoder() 13 | y = lbl_enc.fit_transform(y) 14 | data = data[:, 0:-1] 15 | train = data[0:50000] 16 | ytrain = y[0:50000] 17 | test = data[50000:] 18 | ytest = y[50000:] 19 | learner = ClassificationTree(number_of_features=93) 20 | 21 | for t, x in enumerate(train): 22 | learner.update(x, ytrain[t]) 23 | if t % 1000 == 0: 24 | print t 25 | correct_num = 0 26 | for t, x in enumerate(test): 27 | y_pred = learner.predict(x) 28 | if y_pred == ytest[t]: 29 | correct_num += 1 30 | if t % 1000 == 0: 31 | print t 32 | 33 | print correct_num 34 | 35 | if __name__ == '__main__': 36 | profile.run("test()") 37 | -------------------------------------------------------------------------------- /kaggler/online_model/DecisionTree/utils.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | cimport numpy as np 3 | cimport cython 4 | from libc.math cimport sqrt, abs 5 | 6 | ctypedef np.int_t DTYPE_t 7 | 8 | cpdef DTYPE_t argmax(dict d): 9 | cdef double max_count = 0 10 | cdef double total_count = 0 11 | cdef double value 12 | cdef DTYPE_t key 13 | cdef DTYPE_t max_class = 0 14 | for key, value in d.iteritems(): 15 | total_count += value 16 | if value > max_count: 17 | max_count = value 18 | max_class = key 19 | return max_class 20 | 21 | 22 | def predict_max(list a): 23 | return argmax(count_dict(a)) 24 | 25 | cpdef 
dict count_dict(list a): 26 | cdef DTYPE_t x 27 | cdef dict d = {} 28 | for x in a: 29 | d.setdefault(x, 0) 30 | d[x] += 1 31 | return d 32 | 33 | cpdef double mean_squared_error(list x): 34 | cdef np.ndarray xnp 35 | xnp = np.array(x) 36 | xnp = xnp - xnp.mean() 37 | return sqrt((xnp * xnp.T).mean()) 38 | 39 | cpdef double mean_absolute_error(list x): 40 | cdef np.ndarray xnp 41 | xnp = np.array(x) 42 | xnp = xnp - xnp.mean() 43 | return abs(xnp).mean() 44 | 45 | cpdef double gini(list x): 46 | cdef dict d = {} 47 | cdef double total 48 | cdef list to_square 49 | cdef np.ndarray to_square2 50 | cdef DTYPE_t y 51 | for y in x: 52 | d.setdefault(y, 0) 53 | d[y] += 1 54 | total = len(x) 55 | to_square = [] 56 | cdef double value 57 | cdef DTYPE_t key 58 | for key, value in d.iteritems(): 59 | to_square.append(value/total) 60 | to_square2 = np.array(to_square) 61 | return 1 - (to_square2 * to_square2.T).sum() 62 | 63 | cpdef tuple bin_split(list sample_feature, double feature_value): 64 | cdef list left, right 65 | cdef tuple x 66 | left = [x[1] for x in sample_feature if x[0]<=feature_value] 67 | right = [x[1] for x in sample_feature if x[0]>feature_value] 68 | return left, right 69 | -------------------------------------------------------------------------------- /kaggler/online_model/__init__.py: -------------------------------------------------------------------------------- 1 | from .ftrl import FTRL 2 | from .fm import FM 3 | from .nn import NN 4 | from .nn_h2 import NN_H2 5 | from .sgd import SGD 6 | -------------------------------------------------------------------------------- /kaggler/online_model/fm.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False 2 | # cython: wraparound=False 3 | # cython: cdivision=True 4 | from __future__ import division 5 | import numpy as np 6 | 7 | cimport cython 8 | from libc.math cimport sqrt, abs 9 | from ..util cimport sigm 10 | cimport numpy as np 11 | 12 | 
from cython.parallel import prange, parallel, threadid 13 | 14 | np.import_array() 15 | 16 | 17 | cdef class FM: 18 | """Factorization Machine online learner. 19 | 20 | Attributes: 21 | n (int): number of input features 22 | epoch (int): number of epochs 23 | k (int): size of factors for interactions 24 | a (double): initial learning rate 25 | w0 (double): weight for bias 26 | c0 (double): counters 27 | w (array of double): feature weights 28 | c (array of double): counters for weights 29 | V (array of double): feature weights for factors 30 | """ 31 | 32 | cdef unsigned int epoch 33 | cdef unsigned int n 34 | cdef unsigned int k 35 | cdef double a 36 | cdef double w0 37 | cdef double c0 38 | cdef double[:] w 39 | cdef double[:] c 40 | cdef double[:] V 41 | 42 | def __init__(self, 43 | unsigned int n, 44 | unsigned int epoch=100, 45 | unsigned int dim=4, 46 | double a=0.01, 47 | seed=0): 48 | """Initialize the FM class object. 49 | 50 | Args: 51 | n (int): number of input features 52 | epoch (int): number of epochs 53 | dim (int): size of factors for interactions 54 | a (double): initial learning rate 55 | seed (int): random seed 56 | """ 57 | cdef int i 58 | 59 | rng = np.random.RandomState(seed) 60 | 61 | self.n = n # # of features 62 | self.epoch = epoch # # of epochs 63 | self.k = dim # interaction dimension 64 | self.a = a # learning rate 65 | 66 | # initialize weights, factorized interactions, and counts 67 | self.w0 = 0. 68 | self.c0 = 0. 69 | self.w = np.zeros((self.n,), dtype=np.float64) 70 | self.c = np.zeros((self.n,), dtype=np.float64) 71 | self.V = (rng.rand(self.n * self.k) - .5) * 1e-6 72 | 73 | def __repr__(self): 74 | return ('FM(n={}, epoch={}, dim={}, a={})').format( 75 | self.n, self.epoch, self.dim, self.a 76 | ) 77 | 78 | def read_sparse(self, path): 79 | """Apply hashing trick to the libsvm format sparse file. 
80 | 81 | Args: 82 | path (str): a file path to the libsvm format sparse file 83 | 84 | Yields: 85 | idx (list of int): a list of index of non-zero features 86 | val (list of double): a list of values of non-zero features 87 | y (int): target value 88 | """ 89 | for line in open(path): 90 | xs = line.rstrip().split(' ') 91 | 92 | y = int(xs[0]) 93 | idx = [] 94 | val = [] 95 | for item in xs[1:]: 96 | i, v = item.split(':') 97 | idx.append(int(i)) 98 | val.append(float(v)) 99 | 100 | yield zip(idx, val), y 101 | 102 | def fit(self, X, y): 103 | """Update the model with a sparse input feature matrix and its targets. 104 | 105 | Args: 106 | X (scipy.sparse.csr_matrix): a list of (index, value) of non-zero features 107 | y (numpy.array): targets 108 | 109 | Returns: 110 | updated model weights and counts 111 | """ 112 | n = X.shape[0] 113 | for epoch in range(self.epoch): 114 | for row in range(n): 115 | x = zip(X[row].indices, X[row].data) 116 | self.update_one(x, self.predict_one(x) - y[row]) 117 | 118 | def predict(self, X): 119 | """Predict for a sparse matrix X. 120 | 121 | Args: 122 | X (scipy.sparse.csr_matrix): a sparse matrix for input features 123 | 124 | Returns: 125 | p (numpy.array): predictions for input features 126 | """ 127 | 128 | p = np.zeros((X.shape[0], ), dtype=np.float64) 129 | for row in range(X.shape[0]): 130 | p[row] = self.predict_one(zip(X[row].indices, X[row].data)) 131 | 132 | return p 133 | 134 | def predict_one(self, list x): 135 | """Predict for features. 136 | 137 | Args: 138 | x (list of tuple): a list of (index, value) of non-zero features 139 | 140 | Returns: 141 | p (double): a prediction for input features 142 | """ 143 | cdef int i 144 | cdef int k 145 | cdef double v 146 | cdef double p 147 | cdef double wx 148 | cdef double[:] vx 149 | cdef double[:] v2x2 150 | 151 | wx = 0. 
152 | vx = np.zeros((self.k,), dtype=np.float64) 153 | v2x2 = np.zeros((self.k,), dtype=np.float64) 154 | for i, v in x: 155 | wx += self.w[i] * v 156 | for k in range(self.k): 157 | vx[k] += self.V[i * self.k + k] * v 158 | v2x2[k] += (self.V[i * self.k + k] ** 2) * (v ** 2) 159 | 160 | p = self.w0 + wx 161 | for k in range(self.k): 162 | p += .5 * (vx[k] ** 2 - v2x2[k]) 163 | 164 | return sigm(p) 165 | 166 | def update_one(self, list x, double e): 167 | """Update the model. 168 | 169 | Args: 170 | idx (list of int): a list of index of non-zero features 171 | val (list of double): a list of values of non-zero features 172 | e (double): error between the prediction of the model and target 173 | 174 | Returns: 175 | updated model weights and counts 176 | """ 177 | cdef int i 178 | cdef int k 179 | cdef int f 180 | cdef double v 181 | cdef double g2 182 | cdef double dl_dw 183 | cdef double[:] vx 184 | 185 | # calculate v_f * x in advance 186 | vx = np.zeros((self.k,), dtype=np.float64) 187 | for i, v in x: 188 | for k in range(self.k): 189 | vx[k] += self.V[i * self.k + k] * v 190 | 191 | # update w0, w, V, c0, and c 192 | g2 = e * e 193 | 194 | self.w0 -= self.a / (sqrt(self.c0) + 1) * e 195 | for i, v in x: 196 | dl_dw = self.a / (sqrt(self.c[i]) + 1) * e * v 197 | self.w[i] -= dl_dw 198 | for f in range(self.k): 199 | self.V[i * self.k + f] -= dl_dw * (vx[f] - 200 | self.V[i * self.k + f] * v) 201 | 202 | self.c[i] += g2 203 | 204 | self.c0 += g2 205 | -------------------------------------------------------------------------------- /kaggler/online_model/ftrl.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False 2 | # cython: wraparound=False 3 | # cython: cdivision=True 4 | from __future__ import division 5 | import numpy as np 6 | 7 | cimport cython 8 | from libc.math cimport sqrt, abs 9 | from ..util cimport sigm 10 | cimport numpy as np 11 | 12 | 13 | np.import_array() 14 | 15 | 16 | cdef class 
FTRL: 17 | """FTRL online learner with the hasing trick using liblinear format data. 18 | 19 | inspired by Kaggle user tinrtgu's code at http://goo.gl/K8hQBx 20 | original FTRL paper is available at http://goo.gl/iqIaH0 21 | 22 | Attributes: 23 | n (int): number of features after hashing trick 24 | epoch (int): number of epochs 25 | a (double): alpha in the per-coordinate rate 26 | b (double): beta in the per-coordinate rate 27 | l1 (double): L1 regularization parameter 28 | l2 (double): L2 regularization parameter 29 | w (array of double): feature weights 30 | c (array of double): counters for weights 31 | z (array of double): lazy weights 32 | interaction (boolean): whether to use 2nd order interaction or not 33 | """ 34 | 35 | cdef double a # learning rate 36 | cdef double b 37 | cdef double l1 38 | cdef double l2 39 | cdef unsigned int epoch 40 | cdef unsigned int n 41 | cdef bint interaction 42 | cdef double[:] w 43 | cdef double[:] c 44 | cdef double[:] z 45 | 46 | def __init__(self, 47 | double a=0.01, 48 | double b=1., 49 | double l1=1., 50 | double l2=1., 51 | unsigned int n=2**20, 52 | unsigned int epoch=1, 53 | bint interaction=True): 54 | """Initialize the FTRL class object. 
55 | 56 | Args: 57 | a (double): alpha in the per-coordinate rate 58 | b (double): beta in the per-coordinate rate 59 | l1 (double): L1 regularization parameter 60 | l2 (double): L2 regularization parameter 61 | n (int): number of features after hashing trick 62 | epoch (int): number of epochs 63 | interaction (boolean): whether to use 2nd order interaction or not 64 | """ 65 | 66 | self.a = a 67 | self.b = b 68 | self.l1 = l1 69 | self.l2 = l2 70 | self.n = n 71 | self.epoch = epoch 72 | self.interaction = interaction 73 | 74 | # initialize weights and counts 75 | self.w = np.zeros((self.n + 1,), dtype=np.float64) 76 | self.c = np.zeros((self.n + 1,), dtype=np.float64) 77 | self.z = np.zeros((self.n + 1,), dtype=np.float64) 78 | 79 | def __repr__(self): 80 | return ('FTRL(a={}, b={}, l1={}, l2={}, n={}, epoch={}, interaction={})').format( 81 | self.a, self.b, self.l1, self.l2, self.n, self.epoch, self.interaction 82 | ) 83 | 84 | def _indices(self, list x): 85 | cdef unsigned int index 86 | cdef int l 87 | cdef int i 88 | cdef int j 89 | 90 | # return the index of the bias term 91 | yield self.n 92 | 93 | for index in x: 94 | yield abs(hash(index)) % self.n 95 | 96 | if self.interaction: 97 | l = len(x) 98 | x = sorted(x) 99 | for i in xrange(l): 100 | for j in xrange(i + 1, l): 101 | yield abs(hash('{}_{}'.format(x[i], x[j]))) % self.n 102 | 103 | def read_sparse(self, path): 104 | """Apply hashing trick to the libsvm format sparse file. 105 | 106 | Args: 107 | path (str): a file path to the libsvm format sparse file 108 | 109 | Yields: 110 | x (list of int): a list of index of non-zero features 111 | y (int): target value 112 | """ 113 | for line in open(path): 114 | xs = line.rstrip().split(' ') 115 | 116 | y = int(xs[0]) 117 | x = [] 118 | for item in xs[1:]: 119 | index, _ = item.split(':') 120 | x.append(index) 121 | 122 | yield x, y 123 | 124 | def fit(self, X, y): 125 | """Update the model with a sparse input feature matrix and its targets. 
126 | 127 | Args: 128 | X (scipy.sparse.csr_matrix): a list of (index, value) of non-zero features 129 | y (numpy.array): targets 130 | 131 | Returns: 132 | updated model weights and counts 133 | """ 134 | for epoch in range(self.epoch): 135 | for row in range(X.shape[0]): 136 | x = list(X[row].indices) 137 | self.update_one(x, self.predict_one(x) - y[row]) 138 | 139 | def predict(self, X): 140 | """Predict for a sparse matrix X. 141 | 142 | Args: 143 | X (scipy.sparse.csr_matrix): a sparse matrix for input features 144 | 145 | Returns: 146 | p (numpy.array): predictions for input features 147 | """ 148 | p = np.zeros((X.shape[0], ), dtype=np.float64) 149 | for row in range(X.shape[0]): 150 | p[row] = self.predict_one(list(X[row].indices)) 151 | 152 | return p 153 | 154 | def update_one(self, list x, double e): 155 | """Update the model. 156 | 157 | Args: 158 | x (list of int): a list of index of non-zero features 159 | e (double): error between prediction of the model and target 160 | 161 | Returns: 162 | updates model weights and counts 163 | """ 164 | cdef int i 165 | cdef double e2 166 | cdef double s 167 | 168 | e2 = e * e 169 | for i in self._indices(x): 170 | s = (sqrt(self.c[i] + e2) - sqrt(self.c[i])) / self.a 171 | self.w[i] += e - s * self.z[i] 172 | self.c[i] += e2 173 | 174 | def predict_one(self, list x): 175 | """Predict for features. 176 | 177 | Args: 178 | x (list of int): a list of index of non-zero features 179 | 180 | Returns: 181 | p (double): a prediction for input features 182 | """ 183 | cdef int i 184 | cdef double sign 185 | cdef double wTx 186 | 187 | wTx = 0. 188 | for i in self._indices(x): 189 | sign = -1. if self.w[i] < 0 else 1. 190 | if sign * self.w[i] <= self.l1: 191 | self.z[i] = 0. 
192 | else: 193 | self.z[i] = (sign * self.l1 - self.w[i]) / \ 194 | ((self.b + sqrt(self.c[i])) / self.a + self.l2) 195 | 196 | wTx += self.z[i] 197 | 198 | return sigm(wTx) 199 | -------------------------------------------------------------------------------- /kaggler/online_model/ftrl_dropout.pyx: -------------------------------------------------------------------------------- 1 | from csv import DictReader 2 | from math import exp, log, sqrt 3 | 4 | import cPickle as pickle 5 | import gzip 6 | import random 7 | 8 | 9 | class ftrl_proximal(object): 10 | ''' Our main algorithm: Follow the regularized leader - proximal 11 | 12 | In short, 13 | this is an adaptive-learning-rate sparse logistic-regression with 14 | efficient L1-L2-regularization 15 | 16 | Reference: 17 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 18 | ''' 19 | 20 | def __init__(self, alpha, beta, L1, L2, D, interaction=False, dropout=1.0): 21 | # parameters 22 | self.alpha = alpha 23 | self.beta = beta 24 | self.L1 = L1 25 | self.L2 = L2 26 | 27 | # feature related parameters 28 | self.D = D 29 | self.interaction = interaction 30 | self.dropout = dropout 31 | 32 | # model 33 | # n: squared sum of past gradients 34 | # z: weights 35 | # w: lazy weights 36 | self.n = [0.] * D 37 | self.z = [0.] * D 38 | 39 | self.w = [0.] * D # use this for execution speed up 40 | 41 | def _indices(self, x): 42 | ''' A helper generator that yields the indices in x 43 | 44 | The purpose of this generator is to make the following 45 | code a bit cleaner when doing feature interaction. 
46 | ''' 47 | 48 | for i in x: 49 | yield i 50 | 51 | if self.interaction: 52 | L = len(x) 53 | for i in xrange(1, L): # skip bias term, so we start at 1 54 | for j in xrange(i+1, L): 55 | # one-hot encode interactions with hash trick 56 | yield abs(hash(str(x[i]) + '_' + str(x[j]))) % self.D 57 | 58 | def predict(self, x, dropped = None): 59 | ''' Get probability estimation on x 60 | 61 | INPUT: 62 | x: features 63 | 64 | OUTPUT: 65 | probability of p(y = 1 | x; w) 66 | ''' 67 | # params 68 | dropout = self.dropout 69 | 70 | # model 71 | w = self.w 72 | 73 | # wTx is the inner product of w and x 74 | wTx = 0. 75 | for j, i in enumerate(self._indices(x)): 76 | 77 | if dropped != None and dropped[j]: 78 | continue 79 | 80 | wTx += w[i] 81 | 82 | if dropped != None: wTx /= dropout 83 | 84 | # bounded sigmoid function, this is the probability estimation 85 | return 1. / (1. + exp(-max(min(wTx, 35.), -35.))) 86 | 87 | def update(self, x, y): 88 | ''' Update model using x, p, y 89 | 90 | INPUT: 91 | x: feature, a list of indices 92 | p: click probability prediction of our model 93 | y: answer 94 | 95 | MODIFIES: 96 | self.n: increase by squared gradient 97 | self.z: weights 98 | ''' 99 | 100 | # parameters 101 | alpha = self.alpha 102 | beta = self.beta 103 | L1 = self.L1 104 | L2 = self.L2 105 | 106 | # model 107 | n = self.n 108 | z = self.z 109 | w = self.w # no need to change this, it won't gain anything 110 | dropout = self.dropout 111 | 112 | ind = [ i for i in self._indices(x)] 113 | 114 | if dropout == 1: 115 | dropped = None 116 | else: 117 | dropped = [random.random() > dropout for i in xrange(0,len(ind))] 118 | 119 | p = self.predict(x, dropped) 120 | 121 | # gradient under logloss 122 | g = p - y 123 | 124 | # update z and n 125 | for j, i in enumerate(ind): 126 | 127 | # implement dropout as overfitting prevention 128 | if dropped != None and dropped[j]: continue 129 | 130 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 131 | z[i] += g - sigma * w[i] 
132 | n[i] += g * g 133 | 134 | sign = -1. if z[i] < 0 else 1. # get sign of z[i] 135 | 136 | # build w on the fly using z and n, hence the name - lazy weights - 137 | if sign * z[i] <= L1: 138 | # w[i] vanishes due to L1 regularization 139 | w[i] = 0. 140 | else: 141 | # apply prediction time L1, L2 regularization to z and get w 142 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 143 | 144 | def read_csv(self, f_train): 145 | ''' GENERATOR: Apply hash-trick to the original csv row 146 | and for simplicity, we one-hot-encode everything 147 | 148 | INPUT: 149 | path: path to training or testing file 150 | 151 | YIELDS: 152 | ID: id of the instance, mainly useless 153 | x: a list of hashed and one-hot-encoded 'indices' 154 | we only need the index since all values are either 0 or 1 155 | y: y = 1 if we have a click, else we have y = 0 156 | ''' 157 | for t, row in enumerate(DictReader(f_train)): 158 | # process id 159 | ID = row['id'] 160 | del row['id'] 161 | 162 | # process clicks 163 | y = 0. 164 | if 'click' in row: 165 | if row['click'] == '1': 166 | y = 1. 
167 | del row['click'] 168 | 169 | # turn hour really into hour, it was originally YYMMDDHH 170 | 171 | date = row['hour'][0:6] 172 | row['hour'] = row['hour'][6:] 173 | 174 | # stderr.write("_%s_" % date) 175 | 176 | # extract date 177 | row['wd'] = str(int(date) % 7) 178 | row['wd_hour'] = "%s_%s" % (row['wd'], row['hour']) 179 | 180 | # build x 181 | x = [0] # 0 is the index of the bias term 182 | for key in row: 183 | value = row[key] 184 | 185 | # one-hot encode everything with hash trick 186 | index = abs(hash(key + '_' + value)) % self.D 187 | x.append(index) 188 | 189 | yield t, ID, x, y 190 | 191 | def write_model(self, model, model_save, args): 192 | with gzip.open(model_save, "wb") as model_file: 193 | pickle.dump((args, model), model_file) 194 | 195 | def load_model(self, model_save): 196 | with gzip.open(model_save, "rb") as model_file: 197 | (p, model) = pickle.load(model_file) 198 | 199 | return model 200 | -------------------------------------------------------------------------------- /kaggler/online_model/ftrl_fm.pyx: -------------------------------------------------------------------------------- 1 | ''' Based on Tinrtgu's FTRL code: http://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory 2 | ''' 3 | 4 | from csv import DictReader 5 | cimport cython 6 | from libc.math cimport exp, copysign, log, sqrt 7 | import numpy as np 8 | import copy 9 | cimport numpy as np 10 | np.import_array() 11 | from cython.parallel import parallel 12 | from datetime import datetime 13 | import random 14 | 15 | cdef class FTRL_FM: 16 | cdef double alpha # learning rate 17 | cdef double beta 18 | cdef double alpha_fm # learning rate 19 | cdef double beta_fm 20 | cdef double L1 21 | cdef double L2 22 | cdef double L1_fm 23 | cdef double L2_fm 24 | cdef double L1_fm_tmp 25 | cdef double L2_fm_tmp 26 | cdef unsigned int fm_dim 27 | cdef unsigned int D 28 | cdef double fm_initDev 29 | cdef double dropoutRate 30 | 31 | 32 | 
cdef unsigned int epoch 33 | # cdef unsigned int n 34 | cdef bint interaction 35 | cdef double[:] w 36 | cdef double[:] n 37 | cdef double[:] z 38 | cdef dict n_fm 39 | cdef dict z_fm 40 | cdef dict w_fm 41 | def __init__( 42 | self, 43 | unsigned int fm_dim=4, 44 | double fm_initDev=0.01, 45 | double L1=0.0, 46 | double L2=0.0, 47 | double L1_fm=0.0, 48 | double L2_fm=0.0, 49 | unsigned int D=2*22, 50 | double alpha=0.005, 51 | double beta=1.0, 52 | double alpha_fm = .1, 53 | double beta_fm = 1.0, 54 | double dropoutRate = 1.0 55 | ): 56 | ''' initialize the factorization machine.''' 57 | 58 | self.alpha = alpha # learning rate parameter alpha 59 | self.beta = beta # learning rate parameter beta 60 | self.L1 = L1 # L1 regularizer for first order terms 61 | self.L2 = L2 # L2 regularizer for first order terms 62 | self.alpha_fm = alpha_fm # learning rate parameter alpha for factorization machine 63 | self.beta_fm = beta_fm # learning rate parameter beta for factorization machine 64 | self.L1_fm = L1_fm # L1 regularizer for factorization machine weights. Only use L1 after one epoch of training, because small initializations are needed for gradient. 65 | self.L2_fm = L2_fm # L2 regularizer for factorization machine weights. 66 | self.fm_dim = fm_dim # dimension of factorization. 67 | self.fm_initDev = fm_initDev # standard deviation for random intitialization of factorization weights. 68 | self.dropoutRate = dropoutRate # dropout rate (which is actually the inclusion rate), i.e. dropoutRate = .8 indicates a probability of .2 of dropping out a feature. 69 | 70 | self.L1_fm_tmp = L1_fm # L1 regularizer for factorization machine weights. Only use L1 after one epoch of training, because small initializations are needed for gradient. 71 | self.L2_fm_tmp = L2_fm # L2 regularizer for factorization machine weights. 
72 | 73 | self.D = D 74 | 75 | # model 76 | # n: squared sum of past gradients 77 | # z: weights 78 | # w: lazy weights 79 | 80 | # let index 0 be bias term to avoid collisions. 81 | self.n = np.zeros(self.D + 1, dtype=np.float64) 82 | self.z = np.zeros(self.D + 1, dtype=np.float64) 83 | self.w = np.zeros(self.D + 1, dtype=np.float64) 84 | 85 | self.n_fm = {} 86 | self.z_fm = {} 87 | self.w_fm = {} 88 | 89 | 90 | def init_fm(self,unsigned int i): 91 | ''' initialize the factorization weight vector for variable i. 92 | ''' 93 | cdef unsigned int k 94 | if i not in self.n_fm: 95 | self.n_fm[i] = np.zeros(self.fm_dim, dtype=np.float64) 96 | self.w_fm[i] = np.zeros(self.fm_dim, dtype=np.float64) 97 | self.z_fm[i] = np.zeros(self.fm_dim, dtype=np.float64) 98 | 99 | for k in range(self.fm_dim): 100 | self.z_fm[i][k] = random.gauss(0., self.fm_initDev) 101 | 102 | def predict_raw(self, list x): 103 | ''' predict_one the raw score prior to logit transformation. 104 | ''' 105 | alpha = self.alpha 106 | beta = self.beta 107 | L1 = self.L1 108 | L2 = self.L2 109 | alpha_fm = self.alpha_fm 110 | beta_fm = self.beta_fm 111 | L1_fm = self.L1_fm 112 | L2_fm = self.L2_fm 113 | 114 | # first order weights model 115 | n = self.n 116 | z = self.z 117 | w = self.w 118 | 119 | # FM interaction model 120 | n_fm = self.n_fm 121 | z_fm = self.z_fm 122 | w_fm = self.w_fm 123 | 124 | cdef double raw_y = 0. 125 | cdef unsigned int i 126 | cdef double sign 127 | cdef unsigned int len_x 128 | cdef unsigned int k 129 | 130 | # calculate the bias contribution 131 | for i in [0]: 132 | # no regularization for bias 133 | self.w[i] = (- self.z[i]) / ((self.beta + sqrt(self.n[i])) / self.alpha) 134 | 135 | raw_y += self.w[i] 136 | 137 | # calculate the first order contribution. 138 | for i in x: 139 | sign = -1. if self.z[i] < 0. else 1. # get sign of z[i] 140 | 141 | if sign * self.z[i] <= self.L1: 142 | self.w[i] = 0. 
143 | else: 144 | self.w[i] = (sign * self.L1 - self.z[i]) / ((self.beta + sqrt(n[i])) / self.alpha + self.L2) 145 | 146 | raw_y += self.w[i] 147 | 148 | 149 | len_x = len(x) 150 | # calculate factorization machine contribution. 151 | for i in x: 152 | self.init_fm(i) 153 | for k in range(self.fm_dim): 154 | sign = -1. if self.z_fm[i][k] < 0. else 1. # get the sign of z_fm[i][k] 155 | 156 | if sign * self.z_fm[i][k] <= self.L1_fm: 157 | self.w_fm[i][k] = 0. 158 | else: 159 | self.w_fm[i][k] = (sign * self.L1_fm - self.z_fm[i][k]) / ((self.beta_fm + sqrt(self.n_fm[i][k])) / self.alpha_fm + self.L2_fm) 160 | 161 | for i in range(len_x): 162 | for j in range(i + 1, len_x): 163 | for k in range(self.fm_dim): 164 | raw_y += w_fm[x[i]][k] * w_fm[x[j]][k] 165 | 166 | return raw_y 167 | 168 | def predict_one(self, list x): 169 | ''' predict_one the logit 170 | ''' 171 | return 1. / (1. + exp(- max(min(self.predict_raw(x), 35.), -35.))) 172 | 173 | def dropout(self, list x): 174 | ''' dropout variables in list x 175 | ''' 176 | cdef unsigned int i 177 | cdef double var 178 | for i, var in enumerate(x): 179 | if random.random() > self.dropoutRate: 180 | del x[i] 181 | 182 | def dropoutThenPredict(self, list x): 183 | ''' first dropout some variables and then predict_one the logit using the dropped out data. 184 | ''' 185 | self.dropout(x) 186 | return self.predict_one(x) 187 | 188 | def predictWithDroppedOutModel(self, list x): 189 | ''' predict_one using all data, using a model trained with dropout. 190 | ''' 191 | return 1. / (1. 
+ exp(- max(min(self.predict_raw(x) * self.dropoutRate, 35.), -35.))) 192 | 193 | def update(self, list x, double p, double y): 194 | ''' Update the parameters using FTRL (Follow the Regularized Leader) 195 | ''' 196 | # alpha = self.alpha 197 | # alpha_fm = self.alpha_fm 198 | 199 | # # model 200 | # n = self.n 201 | # z = self.z 202 | # w = self.w 203 | 204 | # # FM model 205 | # n_fm = self.n_fm 206 | # z_fm = self.z_fm 207 | # w_fm = self.w_fm 208 | 209 | cdef double g 210 | # cost gradient with respect to raw prediction. 211 | g = p - y 212 | 213 | cdef int len_x 214 | cdef int i 215 | cdef int j 216 | cdef int k 217 | cdef double sigma 218 | cdef dict fm_sum 219 | # cdef np.ndarray fm_sum 220 | 221 | fm_sum = {} # sums for calculating gradients for FM. 222 | # fm_sum = np.zeros(len(x + [0])) 223 | # fm_sum = np.expand_dims(fm_sum,1) 224 | len_x = len(x) 225 | # with nogil, parallel(): 226 | for i in x + [0]: 227 | # update the first order weights. 228 | sigma = (sqrt(self.n[i] + g * g) - sqrt(self.n[i])) / self.alpha 229 | self.z[i] += g - sigma * self.w[i] 230 | self.n[i] += g * g 231 | 232 | # initialize the sum of the FM interaction weights. 233 | fm_sum[i] = np.zeros(self.fm_dim) 234 | 235 | # sum the gradients for FM interaction weights. 236 | for i in range(len_x): 237 | for j in range(len_x): 238 | if i != j: 239 | for k in range(self.fm_dim): 240 | fm_sum[x[i]][k] += self.w_fm[x[j]][k] 241 | 242 | for i in x: 243 | for k in range(self.fm_dim): 244 | g_fm = g * fm_sum[i][k] 245 | sigma = (sqrt(self.n_fm[i][k] + g_fm * g_fm) - sqrt(self.n_fm[i][k])) / self.alpha_fm 246 | self.z_fm[i][k] += g_fm - sigma * self.w_fm[i][k] 247 | self.n_fm[i][k] += g_fm * g_fm 248 | 249 | def write_w(self, filePath): 250 | ''' write out the first order weights w to a file. 
251 | ''' 252 | with open(filePath, "w") as f_out: 253 | for i, w in enumerate(self.w): 254 | f_out.write("%i,%f\n" % (i, w)) 255 | 256 | def write_w_fm(self, filePath): 257 | ''' write out the factorization machine weights to a file. 258 | ''' 259 | with open(filePath, "w") as f_out: 260 | for k, w_fm in self.w_fm.iteritems(): 261 | f_out.write("%i,%s\n" % (k, ",".join([str(w) for w in w_fm]))) 262 | 263 | 264 | def predict(self,testingFile,hashSalt='salt'): 265 | start = datetime.now() 266 | # initialize a FM learner 267 | learner = self 268 | cdef int e 269 | cdef double cvLoss = 0. 270 | cdef double cvCount = 0. 271 | cdef double progressiveLoss = 0. 272 | cdef double progressiveCount = 0. 273 | cdef list x 274 | cdef double y 275 | cdef unsigned int t 276 | cdef double p 277 | cdef double loss 278 | cdef list y_preds = [] 279 | for t, ID, x, y in data(testingFile, self.D, hashSalt,loop=False): 280 | p = learner.predict_one(x) 281 | y_preds.append(p) 282 | return y_preds 283 | 284 | 285 | def evaluate(self,validationFile,eval_metric,hashSalt='salt'): 286 | start = datetime.now() 287 | # initialize a FM learner 288 | learner = self 289 | cdef int e 290 | cdef double cvLoss = 0. 291 | cdef double cvCount = 0. 292 | cdef double progressiveLoss = 0. 293 | cdef double progressiveCount = 0. 294 | cdef list x 295 | cdef double y 296 | cdef unsigned int t 297 | cdef double p 298 | cdef double loss 299 | cdef list y_preds = [] 300 | cdef list y_test = [] 301 | for t, ID, x, y in data(validationFile, self.D, hashSalt,loop=False): 302 | p = learner.predict_one(x) 303 | y_preds.append(p) 304 | y_test.append(y) 305 | score = eval_metric(y_preds,y_preds) 306 | return score 307 | 308 | def fit(self,trainingFile,hashSalt='salt',n_epochs=5,reportFrequency=10000,validationFile=None,eval_metric=None): 309 | start = datetime.now() 310 | # initialize a FM learner 311 | learner = self 312 | cdef int e 313 | cdef double cvLoss = 0. 314 | cdef double cvCount = 0. 
315 | cdef double progressiveLoss = 0. 316 | cdef double progressiveCount = 0. 317 | cdef list x 318 | cdef double y 319 | cdef unsigned int t 320 | cdef double p 321 | cdef double loss 322 | print("Start Training:") 323 | for e in range(n_epochs): 324 | 325 | # if it is the first epoch, then don't use L1_fm or L2_fm 326 | if e == 0: 327 | learner.L1_fm = 0. 328 | learner.L2_fm = 0. 329 | else: 330 | learner.L1_fm = learner.L1_fm_tmp 331 | learner.L2_fm = learner.L1_fm_tmp 332 | 333 | 334 | for t, ID, x, y in data(trainingFile, self.D, hashSalt,loop=True): 335 | p = learner.predict_one(x) 336 | loss = logLoss(p, y) 337 | learner.update(x, p, y) 338 | progressiveLoss += loss 339 | progressiveCount += 1. 340 | if t % reportFrequency == 0: 341 | print("Epoch %d\tcount: %d\tProgressive Loss: %f" % (e, t, progressiveLoss / progressiveCount)) 342 | if validationFile!=None and eval_metric!=None: 343 | eval_score = self.evaluate(validationFile,eval_metric) 344 | print("Epoch %d\tcount: %d\tEvaludation score: %f" % (e, t, eval_score)) 345 | 346 | print("Epoch %d finished.\tvalidation loss: %f\telapsed time: %s" % (e, cvLoss / cvCount, str(datetime.now() - start))) 347 | if validationFile!=None and eval_metric!=None: 348 | eval_score = self.evaluate(validationFile,eval_metric) 349 | print("Epoch %d\finished: %d\tEvaludation score: %f" % (e, t, eval_score)) 350 | 351 | 352 | def logLoss(double p, double y): 353 | ''' 354 | calculate the log loss cost 355 | p: prediction [0, 1] 356 | y: actual value {0, 1} 357 | ''' 358 | p = max(min(p, 1. - 1e-15), 1e-15) 359 | return - log(p) if y == 1. else -log(1. 
def data(filePath, hashSize, hashSalt, loop=False):
    ''' generator for data using the hashing trick.

    The loop / no-loop variants previously duplicated the whole loop body;
    they are merged here into a single loop with a terminating check.

    INPUT:
        filePath: open csv file (or any iterable of csv lines) with a header.
        hashSize: size of the feature hash space.
        hashSalt: String with which to salt the hash function.
        loop: if True, cycle over the input forever.

    YIELDS:
        t (int): 0-based row counter within the current pass.
        ID: value of the activity_id column.
        x (list of int): hashed feature indices (0 is reserved for the bias).
        y (double): label from the outcome column (0. if absent).

    NOTE(review): if filePath is a file object, a second pass with loop=True
    yields nothing because the file is exhausted -- confirm callers reopen
    or seek before relying on loop=True.
    '''
    cdef unsigned int t
    cdef double y
    cdef list x
    cdef str value
    cdef unsigned int index
    cdef dict row

    while True:
        for t, row in enumerate(DictReader(filePath)):
            ID = row['activity_id']
            del row['activity_id']
            del row['outcome_isnull']

            y = 0.
            if 'outcome' in row:
                if row['outcome'] == '1':
                    y = 1.
                del row['outcome']

            x = []
            for key in row:
                value = row[key]

                # 1 is added to the hash index because 0 indicates the bias term.
                index = abs(hash(hashSalt + key + '_' + value)) % hashSize + 1
                x.append(index)

            yield t, ID, x, y

        if not loop:
            break
52 | 53 | Args: 54 | n (int): number of input units 55 | epoch (int): number of epochs 56 | h (int): number of the hidden units 57 | a (double): initial learning rate 58 | l2 (double): L2 regularization parameter 59 | seed (unsigned int): random seed 60 | """ 61 | 62 | cdef int i 63 | 64 | rng = np.random.RandomState(seed) 65 | 66 | self.epoch = epoch 67 | self.n = n 68 | self.h = h 69 | 70 | self.a = a 71 | self.l2 = l2 72 | 73 | self.w1 = (rng.rand(self.h + 1) - .5) * 1e-6 74 | self.w0 = (rng.rand((self.n + 1) * self.h) - .5) * 1e-6 75 | 76 | # hidden units in the hidden layer 77 | self.z = np.zeros((self.h,), dtype=np.float64) 78 | 79 | # counters for biases and inputs 80 | self.c = 0. 81 | self.c1 = np.zeros((self.h,), dtype=np.float64) 82 | self.c0 = np.zeros((self.n,), dtype=np.float64) 83 | 84 | def __repr__(self): 85 | return ('NN(n={}, epoch={}, h={}, a={}, l2={})').format( 86 | self.n, self.epoch, self.h, self.a, self.l2 87 | ) 88 | 89 | def read_sparse(self, path): 90 | """Read a libsvm format sparse file line by line. 91 | 92 | Args: 93 | path (str): a file path to the libsvm format sparse file 94 | 95 | Yields: 96 | idx (list of int): a list of index of non-zero features 97 | val (list of double): a list of values of non-zero features 98 | y (int): target value 99 | """ 100 | for line in open(path): 101 | xs = line.rstrip().split(' ') 102 | 103 | y = int(xs[0]) 104 | idx = [] 105 | val = [] 106 | for item in xs[1:]: 107 | i, v = item.split(':') 108 | idx.append(int(i) % self.n) 109 | val.append(float(v)) 110 | 111 | yield zip(idx, val), y 112 | 113 | def fit(self, X, y): 114 | """Update the model with a sparse input feature matrix and its targets. 
    def update_one(self, list x, double e):
        """Update the model with one observation.

        Performs one SGD step with per-weight adaptive learning rates: each
        weight's step is scaled by a / (sqrt(counter) + 1), where the counters
        accumulate squared gradients (see the updates of c0/c1/c at the end).

        NOTE(review): like NN_H2.update_one, this assumes predict_one() was
        called immediately before with the same x, so that self.z still holds
        the current hidden activations (fit() calls them in that order) --
        confirm for any other caller.

        Args:
            x (list of tuple): a list of (index, value) of non-zero features
            e (double): error between the prediction of the model and target

        Returns:
            updated model weights and counts
        """
        cdef int j
        cdef int i
        cdef double dl_dy
        cdef double dl_dz
        cdef double dl_dw1
        cdef double dl_dw0
        cdef double v

        dl_dy = e       # dl/dy * (initial learning rate)

        # starting with the bias in the hidden layer
        self.w1[self.h] -= (dl_dy + self.l2 * self.w1[self.h]) * self.a / (sqrt(self.c) + 1)
        for j in range(self.h):
            # update weights related to non-zero hidden units only: a ReLU
            # unit at zero has zero gradient for everything below it
            if self.z[j] == 0.:
                continue

            # update weights between the hidden units and output
            # dl/dw1 = dl/dy * dy/dw1 = dl/dy * z
            dl_dw1 = dl_dy * self.z[j]
            self.w1[j] -= (dl_dw1 + self.l2 * self.w1[j]) * self.a / (sqrt(self.c1[j]) + 1)

            # starting with the bias in the input layer
            # dl/dz = dl/dy * dy/dz = dl/dy * w1
            dl_dz = dl_dy * self.w1[j]
            self.w0[self.n * self.h + j] -= (dl_dz +
                self.l2 * self.w0[self.n * self.h + j]) * self.a / (sqrt(self.c1[j]) + 1)
            # update weights related to non-zero input units
            for i, v in x:
                # update weights between the hidden unit j and input i
                # dl/dw0 = dl/dz * dz/dw0 = dl/dz * v
                dl_dw0 = dl_dz * v
                self.w0[i * self.h + j] -= (dl_dw0 +
                    self.l2 * self.w0[i * self.h + j]) * self.a / (sqrt(self.c0[i]) + 1)

                # update counter for the input i
                self.c0[i] += dl_dw0 * dl_dw0

            # update counter for the hidden unit j
            self.c1[j] += dl_dw1 * dl_dw1

        # update overall counter
        self.c += dl_dy * dl_dy
cython: wraparound=False 3 | # cython: cdivision=True 4 | from __future__ import division 5 | import numpy as np 6 | 7 | cimport cython 8 | from libc.math cimport sqrt, abs 9 | from ..util cimport sigm 10 | cimport numpy as np 11 | 12 | 13 | np.import_array() 14 | 15 | 16 | cdef class NN_H2: 17 | """Neural Network with 2 ReLU hidden layers online learner. 18 | 19 | Attributes: 20 | n (int): number of input units 21 | epoch (int): number of epochs 22 | h1 (int): number of the 1st level hidden units 23 | h2 (int): number of the 2nd level hidden units 24 | a (double): initial learning rate 25 | l2 (double): L2 regularization parameter 26 | w0 (array of double): weights between the input and 1st hidden layers 27 | w1 (array of double): weights between the 1st and 2nd hidden layers 28 | w2 (array of double): weights between the 2nd hidden and output layers 29 | z1 (array of double): 1st level hidden units 30 | z2 (array of double): 2nd level hidden units 31 | c (double): counter 32 | c1 (array of double): counters for 1st level hidden units 33 | c2 (array of double): counters for 2nd level hidden units 34 | """ 35 | 36 | cdef unsigned int n # number of input units 37 | cdef unsigned int h1 # number of the 1st level hidden units 38 | cdef unsigned int h2 # number of the 2nd level hidden units 39 | cdef double a # learning rate 40 | cdef double l2 # L2 regularization parameter 41 | cdef double[:] w0 # weights between the input and 1st hidden layers 42 | cdef double[:] w1 # weights between the 1st and 2nd hidden layers 43 | cdef double[:] w2 # weights between the 2nd hidden and output layers 44 | cdef double[:] z1 # 1st level hidden units 45 | cdef double[:] z2 # 2nd level hidden units 46 | cdef double c # counter 47 | cdef double[:] c0 # counters for input units 48 | cdef double[:] c1 # counters for 1st level hidden units 49 | cdef double[:] c2 # counters for 2nd level hidden units 50 | 51 | def __init__(self, 52 | unsigned int n, 53 | unsigned int epoch=10, 54 | unsigned 
int h1=128, 55 | unsigned int h2=256, 56 | double a=0.01, 57 | double l2=0., 58 | unsigned int seed=0): 59 | """Initialize the NN class object. 60 | 61 | Args: 62 | n (int): number of input units 63 | epoch (int): number of epochs 64 | h1 (int): number of the 1st level hidden units 65 | h2 (int): number of the 2nd level hidden units 66 | a (double): initial learning rate 67 | l2 (double): L2 regularization parameter 68 | seed (unsigned int): random seed 69 | """ 70 | 71 | cdef int i 72 | 73 | rng = np.random.RandomState(seed) 74 | 75 | self.n = n 76 | self.epoch = epoch 77 | self.h1 = h1 78 | self.h2 = h2 79 | 80 | self.a = a 81 | self.l2 = l2 82 | 83 | # weights between the output and 2nd hidden layer 84 | self.w2 = (rng.rand(self.h2 + 1) - .5) * 1e-7 85 | 86 | # weights between the 2nd hidden layer and 1st hidden layer 87 | self.w1 = (rng.rand((self.h1 + 1) * self.h2) - .5) * 1e-7 88 | 89 | # weights between the 1st hidden layer and inputs 90 | self.w0 = (rng.rand((self.n + 1) * self.h1) - .5) * 1e-7 91 | 92 | # hidden units in the 2nd hidden layer 93 | self.z2 = np.zeros((self.h2,), dtype=np.float64) 94 | 95 | # hidden units in the 1st hidden layer 96 | self.z1 = np.zeros((self.h1,), dtype=np.float64) 97 | 98 | # counters for the hidden units and inputs 99 | self.c = 0. 100 | self.c2 = np.zeros((self.h2,), dtype=np.float64) 101 | self.c1 = np.zeros((self.h1,), dtype=np.float64) 102 | self.c0 = np.zeros((self.n,), dtype=np.float64) 103 | 104 | def __repr__(self): 105 | return ('NN_H2(n={}, epoch={}, h1={}, h2={}, a={}, l2={})').format( 106 | self.n, self.epoch, self.h1, self.h2, self.a, self.l2 107 | ) 108 | 109 | def read_sparse(self, path): 110 | """Read the libsvm format sparse file line by line. 
111 | 112 | Args: 113 | path (str): a file path to the libsvm format sparse file 114 | 115 | Yields: 116 | idx (list of int): a list of index of non-zero features 117 | val (list of double): a list of values of non-zero features 118 | y (int): target value 119 | """ 120 | for line in open(path): 121 | xs = line.rstrip().split(' ') 122 | 123 | y = int(xs[0]) 124 | idx = [] 125 | val = [] 126 | for item in xs[1:]: 127 | i, v = item.split(':') 128 | idx.append(abs(hash(i)) % self.n) 129 | val.append(float(v)) 130 | 131 | yield zip(idx, val), y 132 | 133 | def fit(self, X, y): 134 | """Update the model with a sparse input feature matrix and its targets. 135 | 136 | Args: 137 | X (scipy.sparse.csr_matrix): a list of (index, value) of non-zero features 138 | y (numpy.array): targets 139 | 140 | Returns: 141 | updated model weights and counts 142 | """ 143 | for epoch in range(self.epoch): 144 | for row in range(X.shape[0]): 145 | x = zip(X[row].indices, X[row].data) 146 | self.update_one(x, self.predict_one(x) - y[row]) 147 | 148 | def predict(self, X): 149 | """Predict for a sparse matrix X. 150 | 151 | Args: 152 | X (scipy.sparse.csr_matrix): a sparse matrix for input features 153 | 154 | Returns: 155 | p (numpy.array): predictions for input features 156 | """ 157 | 158 | p = np.zeros((X.shape[0], ), dtype=np.float64) 159 | for row in range(X.shape[0]): 160 | p[row] = self.predict_one(zip(X[row].indices, X[row].data)) 161 | 162 | return p 163 | 164 | def predict_one(self, list x): 165 | """Predict for features. 
166 | 167 | Args: 168 | x (list of tuple): a list of (index, value) of non-zero features 169 | 170 | Returns: 171 | p (double): a prediction for input features 172 | """ 173 | cdef double p 174 | cdef int k 175 | cdef int j 176 | cdef int i 177 | cdef double v 178 | 179 | # starting from the bias in the 2nd hidden layer 180 | p = self.w2[self.h2] 181 | 182 | # calculating and adding values of 2nd level hidden units 183 | for k in range(self.h2): 184 | # staring with the bias in the 1st hidden layer 185 | self.z2[k] = self.w1[self.h1 * self.h2 + k] 186 | 187 | # calculating and adding values of 1st level hidden units 188 | for j in range(self.h1): 189 | # starting with the bias in the input layer 190 | self.z1[j] = self.w0[self.n * self.h1 + j] 191 | 192 | # calculating and adding values of input units 193 | for i, v in x: 194 | self.z1[j] += self.w0[i * self.h1 + j] * v 195 | 196 | # apply the ReLU activation function to the first level hidden unit 197 | self.z1[j] = self.z1[j] if self.z1[j] > 0. else 0. 198 | 199 | self.z2[k] += self.w1[j * self.h2 + k] * self.z1[j] 200 | 201 | # apply the ReLU activation function to the 2nd level hidden unit 202 | self.z2[k] = self.z2[k] if self.z2[k] > 0. else 0. 203 | 204 | p += self.w2[k] * self.z2[k] 205 | 206 | # apply the sigmoid activation function to the output unit 207 | return sigm(p) 208 | 209 | def update_one(self, list x, double e): 210 | """Update the model. 211 | 212 | Args: 213 | x (list of tuple): a list of (index, value) of non-zero features 214 | e (double): error between the prediction of the model and target 215 | 216 | Returns: 217 | updated model weights and counts 218 | """ 219 | cdef int k 220 | cdef int j 221 | cdef int i 222 | cdef double dl_dy 223 | cdef double dl_dz1 224 | cdef double dl_dz2 225 | cdef double dl_dw0 226 | cdef double dl_dw1 227 | cdef double dl_dw2 228 | cdef double v 229 | 230 | # XXX: assuming predict() was called right before with the same idx and 231 | # val inputs. 
Otherwise self.z will be incorrect for updates. 232 | dl_dy = e # dl/dy * (initial learning rate) 233 | 234 | # starting with the bias in the 2nd hidden layer 235 | self.w2[self.h2] -= (dl_dy + self.l2 * self.w2[self.h2]) * self.a / (sqrt(self.c) + 1) 236 | for k in range(self.h2): 237 | # update weights related to non-zero 2nd level hidden units 238 | if self.z2[k] == 0.: 239 | continue 240 | 241 | # update weights between the 2nd hidden units and output 242 | # dl/dw2 = dl/dy * dy/dw2 = dl/dy * z2 243 | dl_dw2 = dl_dy * self.z2[k] 244 | self.w2[k] -= (dl_dw2 + self.l2 * self.w2[k]) * self.a / (sqrt(self.c2[k]) + 1) 245 | 246 | # starting with the bias in the 1st hidden layer 247 | # dl/dz2 = dl/dy * dy/dz2 = dl/dy * w2 248 | dl_dz2 = dl_dy * self.w2[k] 249 | self.w1[self.h1 * self.h2 + k] -= (dl_dz2 + 250 | self.l2 * self.w1[self.h1 * self.h2 + k]) * self.a / (sqrt(self.c2[k]) + 1) 251 | for j in range(self.h1): 252 | # update weights realted to non-zero hidden units 253 | if self.z1[j] == 0.: 254 | continue 255 | 256 | # update weights between the hidden units and output 257 | # dl/dw1 = dl/dz2 * dz2/dw1 = dl/dz2 * z1 258 | dl_dw1 = dl_dz2 * self.z1[j] 259 | self.w1[j * self.h2 + k] -= (dl_dw1 + self.l2 * self.w1[j]) * self.a / (sqrt(self.c1[j]) + 1) 260 | 261 | # starting with the bias in the input layer 262 | # dl/dz1 = dl/dz2 * dz2/dz1 = dl/dz2 * w1 263 | dl_dz1 = dl_dz2 * self.w1[j * self.h2 + k] 264 | self.w0[self.n * self.h1 + j] -= (dl_dz1 + 265 | self.l2 * self.w0[self.n * self.h1 + j]) * self.a / (sqrt(self.c1[j]) + 1) 266 | # update weights related to non-zero input units 267 | for i, v in x: 268 | # update weights between the hidden unit j and input i 269 | # dl/dw0 = dl/dz1 * dz/dw0 = dl/dz1 * v 270 | dl_dw0 = dl_dz1 * v 271 | self.w0[i * self.h1 + j] -= (dl_dw0 + 272 | self.l2 * self.w0[i * self.h1 + j]) * self.a / (sqrt(self.c0[i]) + 1) 273 | 274 | # update counter for the input i 275 | self.c0[i] += dl_dw0 * dl_dw0 276 | 277 | # update counter for 
the 1st level hidden unit j 278 | self.c1[j] += dl_dw1 * dl_dw1 279 | 280 | # update counter for the 2nd level hidden unit k 281 | self.c2[k] += dl_dw2 * dl_dw2 282 | 283 | # update overall counter 284 | self.c += dl_dy * dl_dy 285 | -------------------------------------------------------------------------------- /kaggler/online_model/sgd.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False 2 | # cython: wraparound=False 3 | # cython: cdivision=True 4 | from __future__ import division 5 | import numpy as np 6 | 7 | cimport cython 8 | from libc.math cimport sqrt, abs 9 | from ..util cimport sigm 10 | cimport numpy as np 11 | 12 | 13 | np.import_array() 14 | 15 | 16 | cdef class SGD: 17 | """Simple online learner using a hasing trick. 18 | 19 | Attributes: 20 | epoch (int): number of epochs 21 | n (int): number of features after hashing trick 22 | a (double): initial learning rate 23 | l1 (double): L1 regularization parameter 24 | l2 (double): L2 regularization parameter 25 | w (array of double): feature weights 26 | c (array of double): counters for weights 27 | interaction (boolean): whether to use 2nd order interaction or not 28 | """ 29 | cdef unsigned int epoch 30 | cdef unsigned int n 31 | cdef double a 32 | cdef double l1 33 | cdef double l2 34 | cdef double[:] w 35 | cdef double[:] c 36 | cdef bint interaction 37 | 38 | def __init__(self, 39 | double a=0.01, 40 | double l1=0.0, 41 | double l2=0.0, 42 | unsigned int n=2**20, 43 | unsigned int epoch=10, 44 | bint interaction=True): 45 | """Initialize the SGD class object. 
46 | 47 | Args: 48 | epoch (int): number of epochs 49 | n (int): number of features after hashing trick 50 | a (double): initial learning rate 51 | l1 (double): L1 regularization parameter 52 | l2 (double): L2 regularization parameter 53 | w (array of double): feature weights 54 | c (array of double): counters for weights 55 | interaction (boolean): whether to use 2nd order interaction or not 56 | """ 57 | 58 | self.epoch = epoch 59 | self.n = n # # of features 60 | self.a = a # learning rate 61 | self.l1 = l1 62 | self.l2 = l2 63 | 64 | # initialize weights and counts 65 | self.w = np.zeros((self.n + 1,), dtype=np.float64) 66 | self.c = np.zeros((self.n + 1,), dtype=np.float64) 67 | self.interaction = interaction 68 | 69 | def __repr__(self): 70 | return ('SGD(a={}, l1={}, l2={}, n={}, epoch={}, interaction={})').format( 71 | self.a, self.l1, self.l2, self.n, self.epoch, self.interaction 72 | ) 73 | 74 | def _indices(self, list x): 75 | cdef unsigned int index 76 | cdef int l 77 | cdef int i 78 | cdef int j 79 | 80 | yield self.n 81 | 82 | for index in x: 83 | yield abs(hash(index)) % self.n 84 | 85 | if self.interaction: 86 | l = len(x) 87 | x = sorted(x) 88 | for i in xrange(l): 89 | for j in xrange(i + 1, l): 90 | yield abs(hash('{}_{}'.format(x[i], x[j]))) % self.n 91 | 92 | def read_sparse(self, path): 93 | """Apply hashing trick to the libsvm format sparse file. 94 | 95 | Args: 96 | path (str): a file path to the libsvm format sparse file 97 | 98 | Yields: 99 | x (list of int): a list of index of non-zero features 100 | y (int): target value 101 | """ 102 | for line in open(path): 103 | xs = line.rstrip().split(' ') 104 | 105 | y = int(xs[0]) 106 | x = [] 107 | for item in xs[1:]: 108 | index, _ = item.split(':') 109 | x.append(abs(hash(index)) % self.n) 110 | 111 | yield x, y 112 | 113 | def fit(self, X, y): 114 | """Update the model with a sparse input feature matrix and its targets. 
115 | 116 | Args: 117 | X (scipy.sparse.csr_matrix): a list of (index, value) of non-zero features 118 | y (numpy.array): targets 119 | 120 | Returns: 121 | updated model weights and counts 122 | """ 123 | for epoch in range(self.epoch): 124 | for row in range(X.shape[0]): 125 | x = list(X[row].indices) 126 | self.update_one(x, self.predict_one(x) - y[row]) 127 | 128 | def predict(self, X): 129 | """Predict for a sparse matrix X. 130 | 131 | Args: 132 | X (scipy.sparse.csr_matrix): a sparse matrix for input features 133 | 134 | Returns: 135 | p (numpy.array): predictions for input features 136 | """ 137 | p = np.zeros((X.shape[0], ), dtype=np.float64) 138 | for row in range(X.shape[0]): 139 | p[row] = self.predict_one(list(X[row].indices)) 140 | 141 | return p 142 | 143 | def predict_one(self, list x): 144 | """Predict for features. 145 | 146 | Args: 147 | x (list of int): a list of index of non-zero features 148 | 149 | Returns: 150 | p (double): a prediction for input features 151 | """ 152 | cdef int i 153 | cdef double wTx 154 | 155 | wTx = 0. 156 | for i in self._indices(x): 157 | wTx += self.w[i] 158 | 159 | return sigm(wTx) 160 | 161 | def update_one(self, list x, double e): 162 | """Update the model. 163 | 164 | Args: 165 | x (list of int): a list of index of non-zero features 166 | e (double): error between the prediction of the model and target 167 | 168 | Returns: 169 | updates model weights and counts 170 | """ 171 | cdef int i 172 | cdef double g2 173 | 174 | g2 = e * e 175 | for i in self._indices(x): 176 | self.w[i] -= (e + 177 | (self.l1 if self.w[i] >= 0. 
else -self.l1) + 178 | self.l2 * self.w[i]) * self.a / (sqrt(self.c[i]) + 1) 179 | self.c[i] += g2 180 | -------------------------------------------------------------------------------- /kaggler/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from .data import OneHotEncoder 2 | from .data import LabelEncoder 3 | from .data import Normalizer 4 | -------------------------------------------------------------------------------- /kaggler/preprocessing/data.py: -------------------------------------------------------------------------------- 1 | from scipy import sparse 2 | from scipy.stats import norm 3 | from statsmodels.distributions.empirical_distribution import ECDF 4 | import logging 5 | import numpy as np 6 | import pandas as pd 7 | 8 | 9 | NAN_INT = 7535805 10 | 11 | 12 | class Normalizer(object): 13 | """Normalizer that transforms numerical columns into normal distribution. 14 | 15 | Attributes: 16 | ecdfs (list of empirical CDF): empirical CDFs for columns 17 | """ 18 | 19 | def fit(self, X, y=None): 20 | self.ecdfs = [None] * X.shape[1] 21 | 22 | for col in range(X.shape[1]): 23 | self.ecdfs[col] = ECDF(X[:, col]) 24 | 25 | def transform(self, X): 26 | """Normalize numerical columns. 27 | 28 | Args: 29 | X (numpy.array) : numerical columns to normalize 30 | 31 | Returns: 32 | X (numpy.array): normalized numerical columns 33 | """ 34 | 35 | for col in range(X.shape[1]): 36 | X[:, col] = self._transform_col(X[:, col], col) 37 | 38 | return X 39 | 40 | def fit_transform(self, X, y=None): 41 | """Normalize numerical columns. 
42 | 43 | Args: 44 | X (numpy.array) : numerical columns to normalize 45 | 46 | Returns: 47 | X (numpy.array): normalized numerical columns 48 | """ 49 | 50 | self.ecdfs = [None] * X.shape[1] 51 | 52 | for col in range(X.shape[1]): 53 | self.ecdfs[col] = ECDF(X[:, col]) 54 | X[:, col] = self._transform_col(X[:, col], col) 55 | 56 | return X 57 | 58 | def _transform_col(self, x, col): 59 | """Normalize one numerical column. 60 | 61 | Args: 62 | x (numpy.array): a numerical column to normalize 63 | col (int): column index 64 | 65 | Returns: 66 | A normalized feature vector. 67 | """ 68 | 69 | return norm.ppf(self.ecdfs[col](x) * .998 + .001) 70 | 71 | 72 | class LabelEncoder(object): 73 | """Label Encoder that groups infrequent values into one label. 74 | 75 | Attributes: 76 | min_obs (int): minimum number of observation to assign a label. 77 | label_encoders (list of dict): label encoders for columns 78 | label_maxes (list of int): maximum of labels for columns 79 | """ 80 | 81 | def __init__(self, min_obs=10): 82 | """Initialize the OneHotEncoder class object. 83 | 84 | Args: 85 | min_obs (int): minimum number of observation to assign a label. 86 | """ 87 | 88 | self.min_obs = min_obs 89 | 90 | def __repr__(self): 91 | return ('LabelEncoder(min_obs={})').format(self.min_obs) 92 | 93 | def _get_label_encoder_and_max(self, x): 94 | """Return a mapping from values and its maximum of a column to integer labels. 95 | 96 | Args: 97 | x (numpy.array): a categorical column to encode. 98 | 99 | Returns: 100 | label_encoder (dict): mapping from values of features to integers 101 | max_label (int): maximum label 102 | """ 103 | 104 | # NaN cannot be used as a key for dict. So replace it with a random integer. 
105 | x[pd.isnull(x)] = NAN_INT 106 | 107 | # count each unique value 108 | label_count = {} 109 | for label in x: 110 | try: 111 | label_count[label] += 1 112 | except KeyError: 113 | label_count[label] = 1 114 | 115 | # add unique values appearing more than min_obs to the encoder. 116 | label_encoder = {} 117 | label_index = 1 118 | labels_not_encoded = 0 119 | for label in label_count.keys(): 120 | if label_count[label] >= self.min_obs: 121 | label_encoder[label] = label_index 122 | label_index += 1 123 | else: 124 | labels_not_encoded += 1 125 | 126 | max_label = label_index - 1 127 | 128 | # if every label is encoded, then replace the maximum label with 0 so 129 | # that total number of labels encoded is (# of total labels - 1). 130 | if labels_not_encoded == 0: 131 | for label in label_encoder: 132 | # find the label with the maximum encoded value 133 | if label_encoder[label] == max_label: 134 | # set the value of the label to 0 and decrease the maximum 135 | # by 1. 136 | label_encoder[label] = 0 137 | max_label -= 1 138 | break 139 | 140 | return label_encoder, max_label 141 | 142 | def _transform_col(self, x, col): 143 | """Encode one categorical column into labels. 144 | 145 | Args: 146 | x (numpy.array): a categorical column to encode 147 | col (int): column index 148 | 149 | Returns: 150 | x (numpy.array): a column with labels. 
151 | """ 152 | 153 | label_encoder = self.label_encoders[col] 154 | 155 | # replace NaNs with the pre-defined random integer 156 | x[pd.isnull(x)] = NAN_INT 157 | 158 | labels = np.zeros((x.shape[0], )) 159 | for label in label_encoder: 160 | labels[x == label] = label_encoder[label] 161 | 162 | return labels 163 | 164 | def fit(self, X, y=None): 165 | self.label_encoders = [None] * X.shape[1] 166 | self.label_maxes = [None] * X.shape[1] 167 | 168 | for col in range(X.shape[1]): 169 | self.label_encoders[col], self.label_maxes[col] = \ 170 | self._get_label_encoder_and_max(X[:, col]) 171 | 172 | return self 173 | 174 | def transform(self, X): 175 | """Encode categorical columns into sparse matrix with one-hot-encoding. 176 | 177 | Args: 178 | X (numpy.array): categorical columns to encode 179 | 180 | Returns: 181 | X (numpy.array): label encoded columns 182 | """ 183 | 184 | for col in range(X.shape[1]): 185 | X[:, col] = self._transform_col(X[:, col], col) 186 | 187 | return X 188 | 189 | def fit_transform(self, X, y=None): 190 | """Encode categorical columns into label encoded columns 191 | 192 | Args: 193 | X (numpy.array): categorical columns to encode 194 | 195 | Returns: 196 | X (numpy.array): label encoded columns 197 | """ 198 | 199 | self.label_encoders = [None] * X.shape[1] 200 | self.label_maxes = [None] * X.shape[1] 201 | 202 | for col in range(X.shape[1]): 203 | self.label_encoders[col], self.label_maxes[col] = \ 204 | self._get_label_encoder_and_max(X[:, col]) 205 | 206 | X[:, col] = self._transform_col(X[:, col], col) 207 | 208 | return X 209 | 210 | 211 | class OneHotEncoder(object): 212 | """One-Hot-Encoder that groups infrequent values into one dummy variable. 
213 | 214 | Attributes: 215 | min_obs (int): minimum number of observation to create a dummy variable 216 | label_encoders (list of (dict, int)): label encoders and their maximums 217 | for columns 218 | """ 219 | 220 | def __init__(self, min_obs=10): 221 | """Initialize the OneHotEncoder class object. 222 | 223 | Args: 224 | min_obs (int): minimum number of observation to create a dummy variable 225 | label_encoder (LabelEncoder): LabelEncoder that transofrm 226 | """ 227 | 228 | self.min_obs = min_obs 229 | self.label_encoder = LabelEncoder(min_obs) 230 | 231 | def __repr__(self): 232 | return ('OneHotEncoder(min_obs={})').format(self.min_obs) 233 | 234 | def _transform_col(self, x, col): 235 | """Encode one categorical column into sparse matrix with one-hot-encoding. 236 | 237 | Args: 238 | x (numpy.array): a categorical column to encode 239 | col (int): column index 240 | 241 | Returns: 242 | X (scipy.sparse.coo_matrix): sparse matrix encoding a categorical 243 | variable into dummy variables 244 | """ 245 | 246 | labels = self.label_encoder._transform_col(x, col) 247 | label_max = self.label_encoder.label_maxes[col] 248 | 249 | # build row and column index for non-zero values of a sparse matrix 250 | index = np.array(range(len(labels))) 251 | i = index[labels > 0] 252 | j = labels[labels > 0] - 1 # column index starts from 0 253 | 254 | if len(i) > 0: 255 | return sparse.coo_matrix((np.ones_like(i), (i, j)), 256 | shape=(x.shape[0], label_max)) 257 | else: 258 | # if there is no non-zero value, return no matrix 259 | return None 260 | 261 | def fit(self, X, y=None): 262 | self.label_encoder.fit(X) 263 | 264 | return self 265 | 266 | def transform(self, X): 267 | """Encode categorical columns into sparse matrix with one-hot-encoding. 
268 | 269 | Args: 270 | X (numpy.array): categorical columns to encode 271 | 272 | Returns: 273 | X_new (scipy.sparse.coo_matrix): sparse matrix encoding categorical 274 | variables into dummy variables 275 | """ 276 | 277 | for col in range(X.shape[1]): 278 | X_col = self._transform_col(X[:, col], col) 279 | if X_col is not None: 280 | if col == 0: 281 | X_new = X_col 282 | else: 283 | X_new = sparse.hstack((X_new, X_col)) 284 | 285 | logging.debug('{} --> {} features'.format( 286 | col, self.label_encoder.label_maxes[col]) 287 | ) 288 | 289 | return X_new 290 | 291 | def fit_transform(self, X, y=None): 292 | """Encode categorical columns into sparse matrix with one-hot-encoding. 293 | 294 | Args: 295 | X (numpy.array): categorical columns to encode 296 | 297 | Returns: 298 | sparse matrix encoding categorical variables into dummy variables 299 | """ 300 | 301 | self.label_encoder.fit(X) 302 | 303 | return self.transform(X) 304 | -------------------------------------------------------------------------------- /kaggler/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qqgeogor/Kaggler/f53ab7f47eec731648fa03064ec3b7fc11f92396/kaggler/test/__init__.py -------------------------------------------------------------------------------- /kaggler/test/test_sgd.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from kaggler.online_model import SGD 5 | 6 | 7 | DUMMY_SPARSE_STR = """0 1:1 3:1 10:1 8 | 0 3:1 5:1 9 | 1 4:1 6:1 8:1 10:1""" 10 | 11 | DUMMY_Y = [0, 0, 1] 12 | DUMMY_LEN_X = [3, 2, 4] 13 | 14 | class TestSGD(unittest.TestCase): 15 | 16 | def setUp(self): 17 | self.model = SGD(n=2**10, a=0.1, l1=1, l2=1, interaction=True) 18 | self.sparse_file = '/tmp/dummy.sps' 19 | 20 | """Create dummpy sparse files.""" 21 | with open(self.sparse_file, 'w') as f: 22 | f.write(DUMMY_SPARSE_STR) 23 | 24 | def tearDown(self): 25 
| # If a dummy file exists, remove it. 26 | if os.path.isfile(self.sparse_file): 27 | os.remove(self.sparse_file) 28 | 29 | def test_read_sparse(self): 30 | len_xs = [] 31 | ys = [] 32 | for x, y in self.model.read_sparse(self.sparse_file): 33 | # check hash collision for feature index 34 | self.assertEqual(len(set(x)), len(x)) 35 | 36 | ys.append(y) 37 | len_xs.append(len(x)) 38 | 39 | # check if target values are correct 40 | self.assertEqual(ys, DUMMY_Y) 41 | 42 | # check if the number of feature index are correct 43 | self.assertEqual(len_xs, DUMMY_LEN_X) 44 | 45 | 46 | if __name__ == '__main__': 47 | unittest.main() 48 | 49 | -------------------------------------------------------------------------------- /kaggler/util.pxd: -------------------------------------------------------------------------------- 1 | cdef inline double fmax(double a, double b): return a if a >= b else b 2 | cdef inline double fmin(double a, double b): return a if a <= b else b 3 | 4 | cdef double sigm(double x) 5 | -------------------------------------------------------------------------------- /kaggler/util.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False 2 | # cython: wraparound=False 3 | # cython: cdivision=True 4 | from __future__ import division 5 | from scipy import sparse 6 | 7 | import logging 8 | import numpy as np 9 | 10 | cimport cython 11 | from libc.math cimport exp, log 12 | cimport numpy as np 13 | 14 | 15 | np.import_array() 16 | 17 | 18 | cdef double sigm(double x): 19 | """Bounded sigmoid function.""" 20 | return 1 / (1 + exp(-fmax(fmin(x, 20.0), -20.0))) 21 | 22 | 23 | def get_downsampled_index(n, rate=0.): 24 | """Return the index that downsamples a vector x by the rate.""" 25 | 26 | return np.random.choice(range(n), int(n * rate), replace=False) 27 | 28 | 29 | def get_downsampled_index0(x, rate=0., threshold=0.): 30 | """Return the index that downsamples 0s of a vector x by the rate.""" 31 | 32 | 
idx1 = np.where(x > threshold)[0] 33 | idx0 = np.where(x <= threshold)[0] 34 | idx0_down = np.random.choice(idx0, int(len(idx0) * rate), replace=False) 35 | 36 | idx = list(idx0_down) + list(idx1) 37 | np.random.shuffle(idx) 38 | 39 | return idx 40 | 41 | 42 | def set_column_width(X, n_col): 43 | """Set the column width of a matrix X to n_col.""" 44 | 45 | if X.shape[1] < n_col: 46 | if sparse.issparse(X): 47 | X = sparse.hstack((X, np.zeros((X.shape[0], n_col - X.shape[1])))) 48 | X = X.tocsr() 49 | else: 50 | X = np.hstack((X, np.zeros((X.shape[0], n_col - X.shape[1])))) 51 | 52 | elif X.shape[1] > n_col: 53 | if sparse.issparse(X): 54 | X = X.tocsc()[:, :-(X.shape[1] - n_col)] 55 | X = X.tocsr() 56 | else: 57 | X = X[:, :-(X.shape[1] - n_col)] 58 | 59 | return X 60 | 61 | 62 | def rank(x): 63 | """Rank a vector x. Ties will be averaged.""" 64 | 65 | unique, idx_inverse = np.unique(x, return_inverse=True) 66 | 67 | unique_rank_sum = np.zeros_like(unique) 68 | unique_rank_count = np.zeros_like(unique) 69 | 70 | np.add.at(unique_rank_sum, idx_inverse, x.argsort().argsort()) 71 | np.add.at(unique_rank_count, idx_inverse, 1) 72 | 73 | unique_rank_mean = unique_rank_sum.astype(np.float) / unique_rank_count 74 | 75 | return unique_rank_mean[idx_inverse] 76 | 77 | 78 | def set_min_max(x, lb, ub): 79 | x[x < lb] = lb 80 | x[x > ub] = ub 81 | 82 | return x 83 | 84 | 85 | def point(rank, n_team, n_teammate=1, t=0): 86 | """Calculate Kaggle points to earn after a competition. 87 | 88 | Args: 89 | rank (int): final ranking in the private leaderboard. 90 | n_team (int): the number of teams participated in the competition. 91 | n_teammate (int): the number of team members in my team. 92 | t (int): the number of days since the competition ends. 93 | 94 | Returns: 95 | returns Kaggle points to earn after a compeittion. 
96 | """ 97 | return (1e5 / np.sqrt(n_teammate) * (rank ** -.75) * 98 | np.log10(1 + np.log10(n_team)) * np.exp(-t / 500)) 99 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension 2 | from Cython.Distutils import build_ext 3 | 4 | import numpy as np 5 | 6 | try: 7 | from pypandoc import convert 8 | read_md = lambda f: convert(f, 'rst') 9 | except ImportError: 10 | print("warning: pypandoc module not found, could not convert Markdown to RST") 11 | read_md = lambda f: open(f, 'r').read() 12 | 13 | setup( 14 | name='Kaggler', 15 | version='0.4.1', 16 | 17 | author='Jeong-Yoon Lee', 18 | author_email='jeongyoon.lee1@gmail.com', 19 | 20 | packages=['kaggler', 21 | 'kaggler.model', 22 | 'kaggler.metrics', 23 | 'kaggler.online_model', 24 | 'kaggler.preprocessing', 25 | 'kaggler.test'], 26 | url='https://github.com/jeongyoonlee/Kaggler', 27 | license='LICENSE.txt', 28 | 29 | description='Code for Kaggle Data Science Competitions.', 30 | long_description=read_md('README.md'), 31 | 32 | install_requires=[ 33 | 'cython', 34 | 'numpy', 35 | 'scipy >= 0.14.0', 36 | 'scikit-learn >= 0.15.0', 37 | 'statsmodels >= 0.5.0', 38 | ], 39 | 40 | cmdclass={'build_ext': build_ext}, 41 | ext_modules=[Extension('kaggler.online_model.ftrl', 42 | ['kaggler/online_model/ftrl.pyx'], 43 | libraries=[], 44 | include_dirs=[np.get_include(), '.'], 45 | extra_compile_args=['-O3']), 46 | Extension('kaggler.online_model.sgd', 47 | ['kaggler/online_model/sgd.pyx'], 48 | libraries=[], 49 | include_dirs=[np.get_include(), '.'], 50 | extra_compile_args=['-O3']), 51 | Extension('kaggler.online_model.fm', 52 | 
['kaggler/online_model/fm.pyx'], 53 | libraries=[], 54 | include_dirs=[np.get_include(), '.'], 55 | extra_compile_args=['-O3']), 56 | Extension('kaggler.online_model.nn', 57 | ['kaggler/online_model/nn.pyx'], 58 | libraries=[], 59 | include_dirs=[np.get_include(), '.'], 60 | extra_compile_args=['-O3']), 61 | Extension('kaggler.online_model.nn_h2', 62 | ['kaggler/online_model/nn_h2.pyx'], 63 | libraries=[], 64 | include_dirs=[np.get_include(), '.'], 65 | extra_compile_args=['-O3']), 66 | Extension('kaggler.util', 67 | ['kaggler/util.pyx', 'kaggler/util.pxd'], 68 | libraries=[], 69 | include_dirs=[np.get_include(), '.'], 70 | extra_compile_args=['-O3']), 71 | Extension('kaggler.online_model.ftrl_fm', 72 | ['kaggler/online_model/ftrl_fm.pyx'], 73 | libraries=[], 74 | include_dirs=[np.get_include(), '.'], 75 | extra_compile_args=[ 76 | '-O3', 77 | # '-fopenmp', 78 | ], 79 | # extra_link_args=['-fopenmp'], 80 | ), 81 | 82 | ], 83 | ) 84 | --------------------------------------------------------------------------------