├── .gitignore ├── CHANGES.txt ├── LICENSE ├── LICENSE.txt ├── MANIFEST ├── MANIFEST.in ├── README.md ├── demo ├── ftrl_fm_cython.py └── mf_qe_nn_clf.py ├── doc ├── Makefile ├── conf.py ├── index.rst ├── kaggler.metrics.rst ├── kaggler.model.rst ├── kaggler.online_model.rst ├── kaggler.preprocessing.rst ├── kaggler.rst ├── kaggler.test.rst └── modules.rst ├── kaggler ├── __init__.py ├── const.py ├── data_io.py ├── metrics │ ├── __init__.py │ ├── classification.py │ └── regression.py ├── model │ ├── __init__.py │ └── nn.py ├── online_model │ ├── DecisionTree │ │ ├── OnlineClassificationTree.py │ │ ├── _tree.pyx │ │ ├── test.py │ │ └── utils.pyx │ ├── __init__.py │ ├── fm.c │ ├── fm.pyx │ ├── ftrl.c │ ├── ftrl.pyx │ ├── ftrl_dropout.pyx │ ├── ftrl_fm.c │ ├── ftrl_fm.pyx │ ├── nn.c │ ├── nn.pyx │ ├── nn_h2.c │ ├── nn_h2.pyx │ ├── sgd.c │ └── sgd.pyx ├── preprocessing │ ├── __init__.py │ └── data.py ├── test │ ├── __init__.py │ └── test_sgd.py ├── util.c ├── util.pxd └── util.pyx ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | _build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | 44 | # Translations 45 | *.mo 46 | *.pot 47 | 48 | # Django stuff: 49 | *.log 50 | 51 | # Sphinx documentation 52 | docs/_build/ 53 | 54 | # PyBuilder 55 | target/ 56 | -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- 1 | 0.3.4, 2015-02-11 -- Add README.md to MANIFEST.in 2 | 0.1.1, 2014-09-24 -- Fix wrong dependencies 3 | 0.1.0, 2014-07-22 -- Initial release. 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. 
Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. 
Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. 
Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. 
A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 
163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 
196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 
229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | 635 | Copyright (C) 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see . 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | Copyright (C) 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | . 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | . 675 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. 
By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. 
For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 
83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 
117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 
146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 
216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 
246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. 
If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 
309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 
336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 
360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. 
If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 
428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | 635 | Copyright (C) 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see . 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | Copyright (C) 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | . 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | . 
675 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | CHANGES.txt 3 | LICENSE.txt 4 | README.txt 5 | setup.py 6 | kaggler/__init__.py 7 | kaggler/const.py 8 | kaggler/logger.py 9 | kaggler/nn_auc.py 10 | kaggler/util.py 11 | kaggler/test/__init__.py 12 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt README.md 2 | recursive-include docs *.txt 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kaggler 2 | Kaggler is a Python package for Kaggle data science competitions and distributed under the version 3 of the GNU General Public License. 3 | 4 | It provides online learning algorithms for classification - inspired by Kaggle user [tinrtgu's code](http://goo.gl/K8hQBx). It uses the sparse input format that handles large sparse data efficiently. Core code is optimized for speed by using Cython. 
5 | 6 | # Algorithms 7 | Currently algorithms available are as follows: 8 | 9 | ## Online learning algorithms 10 | * Stochastic Gradient Descent (SGD) 11 | * Follow-the-Regularized-Leader (FTRL) 12 | * Follow-the-Regularized-Leader with Factorization Machine (FTRL_FM) 13 | * Factorization Machine (FM) 14 | * Neural Networks (NN) - with a single (NN) or two (NN_H2) ReLU hidden layers 15 | * Decision Tree 16 | 17 | ## Batch learning algorithm 18 | * Neural Networks (NN) - with a single hidden layer and L-BFGS optimization 19 | 20 | # Install 21 | ## Using pip 22 | Python package is available at PyPi for pip installation: 23 | ``` 24 | sudo pip install -U Kaggler 25 | ``` 26 | 27 | ## From source code 28 | If you want to install it from source code: 29 | ``` 30 | python setup.py build_ext --inplace 31 | sudo python setup.py install 32 | ``` 33 | 34 | # Input Format 35 | libsvm style sparse file format is used. 36 | ``` 37 | 1 1:1 4:1 5:0.5 38 | 0 2:1 5:1 39 | ``` 40 | 41 | # Example 42 | ``` 43 | from kaggler.online_model import SGD, FTRL, FM, NN 44 | 45 | # SGD 46 | clf = SGD(a=.01, # learning rate 47 | l1=1e-6, # L1 regularization parameter 48 | l2=1e-6, # L2 regularization parameter 49 | n=2**20, # number of hashed features 50 | epoch=10, # number of epochs 51 | interaction=True) # use feature interaction or not 52 | 53 | # FTRL 54 | clf = FTRL(a=.1, # alpha in the per-coordinate rate 55 | b=1, # beta in the per-coordinate rate 56 | l1=1., # L1 regularization parameter 57 | l2=1., # L2 regularization parameter 58 | n=2**20, # number of hashed features 59 | epoch=1, # number of epochs 60 | interaction=True) # use feature interaction or not 61 | 62 | # FM 63 | clf = FM(n=1e5, # number of features 64 | epoch=100, # number of epochs 65 | dim=4, # size of factors for interactions 66 | a=.01) # learning rate 67 | 68 | # NN 69 | clf = NN(n=1e5, # number of features 70 | epoch=10, # number of epochs 71 | h=16, # number of hidden units 72 | a=.1, # learning rate 73 | 
l2=1e-6) # L2 regularization parameter 74 | 75 | # online training and prediction directly with a libsvm file 76 | for x, y in clf.read_sparse('train.sparse'): 77 | p = clf.predict_one(x) # predict for an input 78 | clf.update_one(x, p - y) # update the model with the target using error 79 | 80 | for x, _ in clf.read_sparse('test.sparse'): 81 | p = clf.predict_one(x) 82 | 83 | # online training and prediction with a scipy sparse matrix 84 | from sklearn.datasets import load_svmlight_file 85 | 86 | X, y = load_svmlight_file('train.sparse') 87 | 88 | clf.fit(X, y) 89 | p = clf.predict(X) 90 | ``` 91 | 92 | # Package Documentation 93 | Package documentation is available at [here](http://pythonhosted.org//Kaggler). 94 | -------------------------------------------------------------------------------- /demo/ftrl_fm_cython.py: -------------------------------------------------------------------------------- 1 | # time pypy-2.4 -u runmodel.py | tee output_0.txt 2 | from kaggler.online_model.ftrl_fm import FTRL_FM 3 | import random 4 | from math import log 5 | import numpy as np 6 | from datetime import datetime 7 | import pandas as pd 8 | from sklearn.cross_validation import KFold 9 | from sklearn.metrics import roc_auc_score 10 | #### RANDOM SEED #### 11 | seed = 1024 12 | np.random.seed(seed) 13 | ##################### 14 | 15 | #################### 16 | #### PARAMETERS #### 17 | #################### 18 | 19 | reportFrequency = 1000 20 | path = "E:\\Redhat\\" 21 | trainingFile = "E:\\Redhat\\train_le_date.csv" 22 | testingFile = "E:\\Redhat\\test_le_date.csv" 23 | # train = pd.read_csv(trainingFile) 24 | # test = pd.read_csv(testingFile) 25 | # y = train['outcome'].values 26 | # skf = KFold(len(y), n_folds=4, shuffle=False, random_state=seed) 27 | # for ind_tr, ind_te in skf: 28 | # X_train = train.iloc[ind_tr] 29 | # X_test = train.iloc[ind_te] 30 | # break 31 | 32 | # X_train.to_csv(path+'X_train.csv',index=False) 33 | # X_test.to_csv(path+'X_test.csv',index=False) 34 | 
35 | fm_dim = 4 36 | fm_initDev = .01 37 | 38 | alpha = 0.1 39 | beta = 1. 40 | 41 | alpha_fm = .01 42 | beta_fm = 1. 43 | 44 | p_D = 22 45 | D = 2 ** p_D 46 | 47 | L1 = 0.1 48 | L2 = 1.0 49 | L1_fm = 0.1 50 | L2_fm = 1.0 51 | 52 | n_epochs = 3 53 | 54 | #### 55 | start = datetime.now() 56 | 57 | # initialize a FM learner 58 | learner = FTRL_FM(fm_dim, fm_initDev, L1, L2, L1_fm, L2_fm, D, alpha, beta, alpha_fm = alpha_fm, beta_fm = beta_fm) 59 | 60 | learner.fit(trainingFile=open(path+'X_train.csv'),n_epochs=5,validationFile=open(path+'X_test.csv'),eval_metric=roc_auc_score,reportFrequency=reportFrequency) 61 | 62 | # save the weights 63 | # w_outfile = path+"param.w.txt" 64 | # w_fm_outfile = path+"param.w_fm.txt" 65 | # learner.write_w(w_outfile) 66 | # learner.write_w_fm(w_fm_outfile) 67 | pd.to_pickle(learner,path+'ftrl_fm.pkl') 68 | 69 | 70 | test = pd.read_csv(path+'test_le_date.csv') 71 | activity_id = test['activity_id'] 72 | print('Make submission') 73 | # X_t = [X_t[:,i] for i in range(X_t.shape[1])] 74 | y_preds = learner.predict(testingFile=open(testingFile),n_epochs=5) 75 | submission = pd.DataFrame() 76 | submission['activity_id'] = activity_id 77 | submission['outcome'] = outcome 78 | submission.to_csv('submission_ftrl_fm_%s.csv'%dim,index=False) 79 | -------------------------------------------------------------------------------- /demo/mf_qe_nn_clf.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from scipy import sparse as ssp 4 | from sklearn.preprocessing import LabelEncoder,LabelBinarizer,MinMaxScaler,OneHotEncoder,StandardScaler,Normalizer 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances 7 | from sklearn.feature_selection import SelectFromModel 8 | from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer 9 | from sklearn.datasets import 
dump_svmlight_file,load_svmlight_file 10 | from sklearn.svm import LinearSVC 11 | from sklearn.ensemble import RandomForestClassifier 12 | from sklearn.cross_validation import KFold,StratifiedKFold 13 | from sklearn.metrics import roc_auc_score,accuracy_score 14 | from keras.preprocessing import sequence 15 | from keras.callbacks import ModelCheckpoint 16 | from keras import backend as K 17 | from keras.layers import Input, Embedding, LSTM, Dense,Flatten, Dropout, merge,Convolution1D,MaxPooling1D,Lambda 18 | from keras.layers.normalization import BatchNormalization 19 | from keras.optimizers import SGD,Nadam 20 | from keras.layers.advanced_activations import PReLU,LeakyReLU,ELU,SReLU 21 | from keras.models import Model 22 | from keras.utils.visualize_util import plot 23 | import distance 24 | import xgboost as xgb 25 | 26 | seed = 1024 27 | np.random.seed(seed) 28 | 29 | path = "../" 30 | 31 | 32 | def str_jaccard(str1, str2): 33 | res = distance.jaccard(str1, str2) 34 | return res 35 | 36 | 37 | question_numeric = ['char_4_q','char_5_q','char_6_q'] 38 | 39 | train = pd.read_csv(path+'invited_info_train.txt',dtype={"expert_id":str,'question_id':str}) 40 | expert_id = train['expert_id'].values 41 | expert_id = LabelEncoder().fit_transform(expert_id) 42 | 43 | test = pd.read_csv(path+'validate_nolabel.txt',dtype={"expert_id":str,'question_id':str}).fillna(-1) 44 | test.columns = ['question_id','expert_id','label'] 45 | len_train = train.shape[0] 46 | 47 | 48 | train = pd.concat([train,test]) 49 | 50 | expert = pd.read_csv(path+'user_info.txt',dtype={"expert_id":str}) 51 | question = pd.read_csv(path+'question_info.txt',dtype={"question_id":str}).fillna(-1) 52 | question['char_3_q'] = question['char_3_q'].astype(str) 53 | 54 | expert['char_1'] = expert['char_1'].apply(lambda x: x.replace('/',' ')) 55 | expert['char_2'] = expert['char_2'].apply(lambda x: x.replace('/',' ')) 56 | expert['char_3'] = expert['char_3'].apply(lambda x: x.replace('/',' ')) 57 | 58 | 
question['char_2_q'] = question['char_2_q'].apply(lambda x: x.replace('/',' ')) 59 | question['char_3_q'] = question['char_3_q'].apply(lambda x: x.replace('/',' ')) 60 | 61 | count_char_1 = CountVectorizer(ngram_range=(1,3)) 62 | tfidf_char_2 = TfidfVectorizer(ngram_range=(1,3)) 63 | tfidf_char_3 = TfidfVectorizer(ngram_range=(1,3)) 64 | 65 | count_char_1.fit(expert['char_1'].values) 66 | tfidf_char_2.fit(expert['char_2'].values.tolist()+question['char_2_q'].values.tolist()) 67 | tfidf_char_3.fit(expert['char_3'].values.tolist()+question['char_3_q'].values.tolist()) 68 | 69 | lb_char_1_q = LabelBinarizer(sparse_output=True) 70 | lb_char_1_q.fit(question['char_1_q'].values) 71 | 72 | 73 | train = pd.merge(train,expert,on='expert_id',how='left')#.fillna(' ') 74 | train = pd.merge(train,question,on='question_id',how='left') 75 | 76 | 77 | le = LabelEncoder() 78 | train['question_id'] = le.fit_transform(train['question_id'].values) 79 | train['expert_id'] = le.fit_transform(train['expert_id'].values) 80 | 81 | y = train['label'].values 82 | features = [ 83 | 'question_id', 84 | 'expert_id', 85 | ] 86 | 87 | X = train[features].values 88 | # X = OneHotEncoder().fit_transform(X).tocsr() 89 | # X_char_1 = count_char_1.transform(train['char_1'].values) 90 | # X_char_2 = tfidf_char_2.transform(train['char_2'].values) 91 | # X_char_3 = tfidf_char_3.transform(train['char_3'].values) 92 | 93 | 94 | # X_char_1_q = lb_char_1_q.fit_transform(train['char_1_q'].values) 95 | # X_char_2_q = tfidf_char_2.transform(train['char_2_q'].values) 96 | # X_char_3_q = tfidf_char_3.transform(train['char_3_q'].values) 97 | 98 | # stand_char_4_5_6_q = StandardScaler() 99 | # stand_char_4_5_6_q.fit(train[question_numeric].values) 100 | # X_char_4_5_6_q = stand_char_4_5_6_q.transform(train[question_numeric].values) 101 | 102 | 103 | print ('X raw',X.shape) 104 | 105 | # sim_char_2 = [] 106 | # for expert_char_2,question_char_2 in zip(X_char_2,X_char_2_q): 107 | # cos_sim_2 = 
pairwise_distances(expert_char_2, question_char_2, metric='cosine')[0][0] 108 | # sim_char_2.append(cos_sim_2) 109 | # sim_char_2 = np.array(sim_char_2) 110 | # sim_char_2 = np.expand_dims(sim_char_2,1) 111 | 112 | # sim_char_3 = [] 113 | # for expert_char_3,question_char_3 in zip(X_char_3,X_char_3_q): 114 | # cos_sim_3 = pairwise_distances(expert_char_3, question_char_3, metric='cosine')[0][0] 115 | # sim_char_3.append(cos_sim_3) 116 | # sim_char_3 = np.array(sim_char_3) 117 | # sim_char_3 = np.expand_dims(sim_char_3,1) 118 | 119 | # X = ssp.hstack([ 120 | # X, 121 | # # X_char_1, 122 | # # X_char_2, 123 | # # X_char_3, 124 | # # X_char_1_q, 125 | # # X_char_2_q, 126 | # # X_char_3_q, 127 | # # X_char_4_5_6_q, 128 | # # sim_char_2, 129 | # # sim_char_3, 130 | # ]).tocsr() 131 | 132 | # dump_svmlight_file(X,y,path+'data.svm') 133 | 134 | # data,y_all = load_svmlight_file(path+'data.svm') 135 | y_all = y 136 | data = X 137 | num_q = len(np.unique(data[:,0])) 138 | num_e = len(np.unique(data[:,1])) 139 | del X 140 | del y 141 | 142 | X = data[:len_train] 143 | y = y_all[:len_train] 144 | X_t= data[len_train:] 145 | del data 146 | del y_all 147 | 148 | def make_mf_lr(X ,y, clf, X_test, n_round=3): 149 | n = X.shape[0] 150 | ''' 151 | Fit metafeature by @clf and get prediction for test. 
Assumed that @clf -- regressor 152 | ''' 153 | print clf 154 | mf_tr = np.zeros(X.shape[0]) 155 | mf_te = np.zeros(X_test.shape[0]) 156 | for i in range(n_round): 157 | skf = StratifiedKFold(y, n_folds=2, shuffle=True, random_state=42+i*1000) 158 | for ind_tr, ind_te in skf: 159 | X_tr = X[ind_tr] 160 | X_te = X[ind_te] 161 | 162 | # print('X_tr shape',X_tr.shape) 163 | # print('X_te shape',X_te.shape) 164 | 165 | y_tr = y[ind_tr] 166 | y_te = y[ind_te] 167 | 168 | clf.fit(X_tr, y_tr) 169 | mf_tr[ind_te] += clf.predict_proba(X_te)[:,1] 170 | mf_te += clf.predict_proba(X_test)[:,1]*0.5 171 | y_pred = clf.predict_proba(X_te)[:,1] 172 | score = roc_auc_score(y_te, y_pred) 173 | print 'pred[{}] score:{}'.format(i, score) 174 | return (mf_tr / n_round, mf_te / n_round) 175 | 176 | 177 | def make_mf_lsvc(X ,y, clf, X_test, n_round=3): 178 | n = X.shape[0] 179 | ''' 180 | Fit metafeature by @clf and get prediction for test. Assumed that @clf -- regressor 181 | ''' 182 | print clf 183 | mf_tr = np.zeros(X.shape[0]) 184 | mf_te = np.zeros(X_test.shape[0]) 185 | for i in range(n_round): 186 | skf = StratifiedKFold(y, n_folds=2, shuffle=True, random_state=42+i*1000) 187 | for ind_tr, ind_te in skf: 188 | X_tr = X[ind_tr] 189 | X_te = X[ind_te] 190 | 191 | # print('X_tr shape',X_tr.shape) 192 | # print('X_te shape',X_te.shape) 193 | 194 | y_tr = y[ind_tr] 195 | y_te = y[ind_te] 196 | 197 | clf.fit(X_tr, y_tr) 198 | mf_tr[ind_te] += clf.decision_function(X_te) 199 | mf_te += clf.decision_function(X_test)*0.5 200 | y_pred = clf.decision_function(X_te) 201 | score = roc_auc_score(y_te, y_pred) 202 | print 'pred[{}] score:{}'.format(i, score) 203 | return (mf_tr / n_round, mf_te / n_round) 204 | 205 | def make_mf_nn(X ,y, X_test, n_round=3): 206 | n = X.shape[0] 207 | ''' 208 | Fit metafeature by @clf and get prediction for test. 
Assumed that @clf -- regressor 209 | ''' 210 | from kaggler.online_model.ftrl import FTRL 211 | mf_tr = np.zeros(X.shape[0]) 212 | mf_te = np.zeros(X_test.shape[0]) 213 | for i in range(n_round): 214 | skf = StratifiedKFold(y, n_folds=2, shuffle=True, random_state=42+i*1000) 215 | for ind_tr, ind_te in skf: 216 | clf = build_model(X) 217 | X_tr = [X[:,0][ind_tr],X[:,1][ind_tr]] 218 | X_te = [X[:,0][ind_te],X[:,1][ind_te]] 219 | 220 | # print('X_tr shape',X_tr.shape) 221 | # print('X_te shape',X_te.shape) 222 | 223 | y_tr = y[ind_tr] 224 | y_te = y[ind_te] 225 | 226 | clf.fit(X_tr, y_tr,nb_epoch=2,batch_size=128,validation_data=[X_te,y_te]) 227 | mf_tr[ind_te] += clf.predict(X_te).ravel() 228 | mf_te += clf.predict([X_test[:,0],X_test[:,1]]).ravel()*0.5 229 | y_pred = clf.predict(X_te).ravel() 230 | score = roc_auc_score(y_te, y_pred) 231 | print 'pred[{}] score:{}'.format(i, score) 232 | return (mf_tr / n_round, mf_te / n_round) 233 | 234 | def build_model(X,dim=128): 235 | 236 | inputs_p = Input(shape=(1,), dtype='int32') 237 | 238 | embed_p = Embedding( 239 | num_q, 240 | dim, 241 | dropout=0.2, 242 | input_length=1 243 | )(inputs_p) 244 | 245 | inputs_d = Input(shape=(1,), dtype='int32') 246 | 247 | embed_d = Embedding( 248 | num_e, 249 | dim, 250 | dropout=0.2, 251 | input_length=1 252 | )(inputs_d) 253 | 254 | 255 | flatten_p= Flatten()(embed_p) 256 | 257 | flatten_d= Flatten()(embed_d) 258 | 259 | flatten = merge([ 260 | flatten_p, 261 | flatten_d, 262 | ],mode='concat') 263 | 264 | fc1 = Dense(512)(flatten) 265 | fc1 = SReLU()(fc1) 266 | dp1 = Dropout(0.7)(fc1) 267 | 268 | outputs = Dense(1,activation='sigmoid',name='outputs')(dp1) 269 | 270 | inputs = [ 271 | inputs_p, 272 | inputs_d, 273 | ] 274 | 275 | 276 | 277 | model = Model(input=inputs, output=outputs) 278 | nadam = Nadam() 279 | sgd = SGD(lr=1e-3, decay=1e-6, momentum=0.9, nesterov=True) 280 | model.compile( 281 | optimizer=nadam, 282 | loss= 'binary_crossentropy' 283 | ) 284 | 285 | return model 
286 | 287 | mf_nn_clf = make_mf_nn(X ,y, X_t, n_round=10) 288 | pd.to_pickle(mf_nn_clf,path+'mf_nn_clf.pkl') 289 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 
61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Kaggler.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Kaggler.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/Kaggler" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Kaggler" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 
112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 
163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Kaggler documentation build configuration file, created by 4 | # sphinx-quickstart on Tue Feb 10 04:55:59 2015. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | #sys.path.insert(0, os.path.abspath('.')) 22 | sys.path.insert(0, os.path.abspath("../..")) 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | #needs_sphinx = '1.0' 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 
32 | extensions = [ 33 | 'sphinx.ext.autodoc', 34 | 'sphinxcontrib.napoleon', 35 | 'sphinx.ext.doctest', 36 | 'sphinx.ext.intersphinx', 37 | 'sphinx.ext.todo', 38 | 'sphinx.ext.coverage', 39 | 'sphinx.ext.mathjax', 40 | 'sphinx.ext.viewcode', 41 | ] 42 | 43 | # Add any paths that contain templates here, relative to this directory. 44 | templates_path = ['_templates'] 45 | 46 | # The suffix of source filenames. 47 | source_suffix = '.rst' 48 | 49 | # The encoding of source files. 50 | #source_encoding = 'utf-8-sig' 51 | 52 | # The master toctree document. 53 | master_doc = 'index' 54 | 55 | # General information about the project. 56 | project = u'Kaggler' 57 | copyright = u'2015, Jeong-Yoon Lee' 58 | 59 | # The version info for the project you're documenting, acts as replacement for 60 | # |version| and |release|, also used in various other places throughout the 61 | # built documents. 62 | # 63 | # The short X.Y version. 64 | version = '0.4' 65 | # The full version, including alpha/beta/rc tags. 66 | release = '0.4.1' 67 | 68 | # The language for content autogenerated by Sphinx. Refer to documentation 69 | # for a list of supported languages. 70 | #language = None 71 | 72 | # There are two options for replacing |today|: either, you set today to some 73 | # non-false value, then it is used: 74 | #today = '' 75 | # Else, today_fmt is used as the format for a strftime call. 76 | #today_fmt = '%B %d, %Y' 77 | 78 | # List of patterns, relative to source directory, that match files and 79 | # directories to ignore when looking for source files. 80 | exclude_patterns = ['_build'] 81 | 82 | # The reST default role (used for this markup: `text`) to use for all 83 | # documents. 84 | #default_role = None 85 | 86 | # If true, '()' will be appended to :func: etc. cross-reference text. 87 | #add_function_parentheses = True 88 | 89 | # If true, the current module name will be prepended to all description 90 | # unit titles (such as .. function::). 
91 | #add_module_names = True 92 | 93 | # If true, sectionauthor and moduleauthor directives will be shown in the 94 | # output. They are ignored by default. 95 | #show_authors = False 96 | 97 | # The name of the Pygments (syntax highlighting) style to use. 98 | pygments_style = 'sphinx' 99 | 100 | # A list of ignored prefixes for module index sorting. 101 | #modindex_common_prefix = [] 102 | 103 | # If true, keep warnings as "system message" paragraphs in the built documents. 104 | #keep_warnings = False 105 | 106 | 107 | # -- Options for HTML output ---------------------------------------------- 108 | 109 | # The theme to use for HTML and HTML Help pages. See the documentation for 110 | # a list of builtin themes. 111 | html_theme = 'default' 112 | 113 | # Theme options are theme-specific and customize the look and feel of a theme 114 | # further. For a list of options available for each theme, see the 115 | # documentation. 116 | #html_theme_options = {} 117 | 118 | # Add any paths that contain custom themes here, relative to this directory. 119 | #html_theme_path = [] 120 | 121 | # The name for this set of Sphinx documents. If None, it defaults to 122 | # " v documentation". 123 | #html_title = None 124 | 125 | # A shorter title for the navigation bar. Default is the same as html_title. 126 | #html_short_title = None 127 | 128 | # The name of an image file (relative to this directory) to place at the top 129 | # of the sidebar. 130 | #html_logo = None 131 | 132 | # The name of an image file (within the static path) to use as favicon of the 133 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 134 | # pixels large. 135 | #html_favicon = None 136 | 137 | # Add any paths that contain custom static files (such as style sheets) here, 138 | # relative to this directory. They are copied after the builtin static files, 139 | # so a file named "default.css" will overwrite the builtin "default.css". 
140 | html_static_path = ['_static'] 141 | 142 | # Add any extra paths that contain custom files (such as robots.txt or 143 | # .htaccess) here, relative to this directory. These files are copied 144 | # directly to the root of the documentation. 145 | #html_extra_path = [] 146 | 147 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 148 | # using the given strftime format. 149 | #html_last_updated_fmt = '%b %d, %Y' 150 | 151 | # If true, SmartyPants will be used to convert quotes and dashes to 152 | # typographically correct entities. 153 | #html_use_smartypants = True 154 | 155 | # Custom sidebar templates, maps document names to template names. 156 | #html_sidebars = {} 157 | 158 | # Additional templates that should be rendered to pages, maps page names to 159 | # template names. 160 | #html_additional_pages = {} 161 | 162 | # If false, no module index is generated. 163 | #html_domain_indices = True 164 | 165 | # If false, no index is generated. 166 | #html_use_index = True 167 | 168 | # If true, the index is split into individual pages for each letter. 169 | #html_split_index = False 170 | 171 | # If true, links to the reST sources are added to the pages. 172 | #html_show_sourcelink = True 173 | 174 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 175 | #html_show_sphinx = True 176 | 177 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 178 | #html_show_copyright = True 179 | 180 | # If true, an OpenSearch description file will be output, and all pages will 181 | # contain a tag referring to it. The value of this option must be the 182 | # base URL from which the finished HTML is served. 183 | #html_use_opensearch = '' 184 | 185 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 186 | #html_file_suffix = None 187 | 188 | # Output file base name for HTML help builder. 
189 | htmlhelp_basename = 'Kagglerdoc' 190 | 191 | 192 | # -- Options for LaTeX output --------------------------------------------- 193 | 194 | latex_elements = { 195 | # The paper size ('letterpaper' or 'a4paper'). 196 | #'papersize': 'letterpaper', 197 | 198 | # The font size ('10pt', '11pt' or '12pt'). 199 | #'pointsize': '10pt', 200 | 201 | # Additional stuff for the LaTeX preamble. 202 | #'preamble': '', 203 | } 204 | 205 | # Grouping the document tree into LaTeX files. List of tuples 206 | # (source start file, target name, title, 207 | # author, documentclass [howto, manual, or own class]). 208 | latex_documents = [ 209 | ('index', 'Kaggler.tex', u'Kaggler Documentation', 210 | u'Jeong-Yoon Lee', 'manual'), 211 | ] 212 | 213 | # The name of an image file (relative to this directory) to place at the top of 214 | # the title page. 215 | #latex_logo = None 216 | 217 | # For "manual" documents, if this is true, then toplevel headings are parts, 218 | # not chapters. 219 | #latex_use_parts = False 220 | 221 | # If true, show page references after internal links. 222 | #latex_show_pagerefs = False 223 | 224 | # If true, show URL addresses after external links. 225 | #latex_show_urls = False 226 | 227 | # Documents to append as an appendix to all manuals. 228 | #latex_appendices = [] 229 | 230 | # If false, no module index is generated. 231 | #latex_domain_indices = True 232 | 233 | 234 | # -- Options for manual page output --------------------------------------- 235 | 236 | # One entry per manual page. List of tuples 237 | # (source start file, name, description, authors, manual section). 238 | man_pages = [ 239 | ('index', 'kaggler', u'Kaggler Documentation', 240 | [u'Jeong-Yoon Lee'], 1) 241 | ] 242 | 243 | # If true, show URL addresses after external links. 244 | #man_show_urls = False 245 | 246 | 247 | # -- Options for Texinfo output ------------------------------------------- 248 | 249 | # Grouping the document tree into Texinfo files. 
List of tuples 250 | # (source start file, target name, title, author, 251 | # dir menu entry, description, category) 252 | texinfo_documents = [ 253 | ('index', 'Kaggler', u'Kaggler Documentation', 254 | u'Jeong-Yoon Lee', 'Kaggler', 'One line description of project.', 255 | 'Miscellaneous'), 256 | ] 257 | 258 | # Documents to append as an appendix to all manuals. 259 | #texinfo_appendices = [] 260 | 261 | # If false, no module index is generated. 262 | #texinfo_domain_indices = True 263 | 264 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 265 | #texinfo_show_urls = 'footnote' 266 | 267 | # If true, do not generate a @detailmenu in the "Top" node's menu. 268 | #texinfo_no_detailmenu = False 269 | 270 | 271 | # Example configuration for intersphinx: refer to the Python standard library. 272 | intersphinx_mapping = {'http://docs.python.org/': None} 273 | 274 | # Napoleon settings 275 | napoleon_google_docstring = True 276 | napoleon_numpy_docstring = True 277 | napoleon_include_private_with_doc = False 278 | napoleon_include_special_with_doc = True 279 | napoleon_use_admonition_for_examples = False 280 | napoleon_use_admonition_for_notes = False 281 | napoleon_use_admonition_for_references = False 282 | napoleon_use_ivar = False 283 | napoleon_use_param = True 284 | napoleon_use_rtype = True 285 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | kaggler package 2 | =============== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | kaggler.metrics 10 | kaggler.online_model 11 | kaggler.preprocessing 12 | kaggler.test 13 | 14 | Submodules 15 | ---------- 16 | 17 | kaggler.const module 18 | -------------------- 19 | 20 | .. automodule:: kaggler.const 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | kaggler.data_io module 26 | ----------------- 27 | 28 | .. 
automodule:: kaggler.data_io 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | 33 | kaggler.util module 34 | ------------------- 35 | 36 | .. automodule:: kaggler.util 37 | :members: 38 | :undoc-members: 39 | :show-inheritance: 40 | 41 | 42 | Module contents 43 | --------------- 44 | 45 | .. automodule:: kaggler 46 | :members: 47 | :undoc-members: 48 | :show-inheritance: 49 | -------------------------------------------------------------------------------- /doc/kaggler.metrics.rst: -------------------------------------------------------------------------------- 1 | kaggler.metrics package 2 | ============================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | kaggler.metrics.classification module 8 | ------------------------------ 9 | 10 | .. automodule:: kaggler.metrics.classification 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | kaggler.metrics.regression module 16 | ------------------------------ 17 | 18 | .. automodule:: kaggler.metrics.regression 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: kaggler.metrics 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /doc/kaggler.model.rst: -------------------------------------------------------------------------------- 1 | kaggler.model package 2 | ===================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | kaggler.model.nn module 8 | ----------------------- 9 | 10 | .. automodule:: kaggler.model.nn 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. 
automodule:: kaggler.model 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /doc/kaggler.online_model.rst: -------------------------------------------------------------------------------- 1 | kaggler.online_model package 2 | ============================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | kaggler.online_model.fm module 8 | ------------------------------ 9 | 10 | .. automodule:: kaggler.online_model.fm 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | kaggler.online_model.ftrl module 16 | -------------------------------- 17 | 18 | .. automodule:: kaggler.online_model.ftrl 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | kaggler.online_model.ftrl_dropout module 24 | ---------------------------------------- 25 | 26 | .. automodule:: kaggler.online_model.ftrl_dropout 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | kaggler.online_model.nn module 32 | ------------------------------ 33 | 34 | .. automodule:: kaggler.online_model.nn 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | kaggler.online_model.nn_h2 module 40 | --------------------------------- 41 | 42 | .. automodule:: kaggler.online_model.nn_h2 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | kaggler.online_model.sgd module 48 | ------------------------------- 49 | 50 | .. automodule:: kaggler.online_model.sgd 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | 56 | Module contents 57 | --------------- 58 | 59 | .. 
automodule:: kaggler.online_model 60 | :members: 61 | :undoc-members: 62 | :show-inheritance: 63 | -------------------------------------------------------------------------------- /doc/kaggler.preprocessing.rst: -------------------------------------------------------------------------------- 1 | kaggler.preprocessing package 2 | ============================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | kaggler.preprocessing.data module 8 | ------------------------------ 9 | 10 | .. automodule:: kaggler.preprocessing.data 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: kaggler.preprocessing 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /doc/kaggler.rst: -------------------------------------------------------------------------------- 1 | kaggler package 2 | =============== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | kaggler.metrics 10 | kaggler.online_model 11 | kaggler.preprocessing 12 | kaggler.test 13 | 14 | Submodules 15 | ---------- 16 | 17 | kaggler.const module 18 | -------------------- 19 | 20 | .. automodule:: kaggler.const 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | kaggler.data_io module 26 | ----------------- 27 | 28 | .. automodule:: kaggler.data_io 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | 33 | kaggler.util module 34 | ------------------- 35 | 36 | .. automodule:: kaggler.util 37 | :members: 38 | :undoc-members: 39 | :show-inheritance: 40 | 41 | 42 | Module contents 43 | --------------- 44 | 45 | .. 
def is_number(s):
    """Check if a string is a number or not.

    Args:
        s (str): a string to check.

    Returns:
        True if the string can be parsed as a float, False otherwise.
    """

    try:
        float(s)
        return True
    except ValueError:
        return False


def load_data(path, dense=False):
    """Load data from a CSV or libsvm format file.

    The format is auto-detected from the first line: a ':' means libsvm,
    a ',' means CSV with the target in the first column.

    Args:
        path (str): A path to the CSV or libsvm format file containing data.
        dense (boolean): An optional variable indicating if the return matrix
            should be dense.  By default, it is false.

    Returns:
        Feature matrix X and target vector y.

    Raises:
        NotImplementedError: if the file is neither CSV nor libsvm format.
    """

    # Peek at the first line only to detect the file format.
    with open(path, 'r') as f:
        line = f.readline().strip()

    if ':' in line:
        # libsvm format, e.g. "1 3:1 10:0.5".  Imported locally so the CSV
        # code path does not require scikit-learn.
        from sklearn.datasets import load_svmlight_file
        X, y = load_svmlight_file(path)
        X = X.astype(np.float32)
        if dense:
            X = X.todense()
    elif ',' in line:
        # CSV with the target in the first column.  Skip the first row when
        # its first field is not numeric (i.e. it looks like a header).
        X = np.loadtxt(path, delimiter=',',
                       skiprows=0 if is_number(line.split(',')[0]) else 1)
        y = X[:, 0]
        X = X[:, 1:]
    else:
        # NOTE: was the Python 2-only "raise E, msg" statement, which is a
        # syntax error under Python 3.
        raise NotImplementedError("Neither CSV nor LibSVM formatted file.")

    return X, y


def read_sps(path):
    """Iterate over a space-delimited file.

    Args:
        path (str): path to a file where each line is
            "<label> <feature> <feature> ...".

    Yields:
        tuple of (list of str, int): features and label of each line.
    """

    # 'with' closes the handle deterministically even if the consumer
    # abandons the generator.
    with open(path) as f:
        for line in f:
            # parse x
            xs = line.rstrip().split(' ')

            yield xs[1:], int(xs[0])


def shuf_file(f, shuf_win):
    """Shuffle lines of a file-like object within a sliding window.

    Lines are emitted ordered by their hash within a window of shuf_win
    lines, approximating a shuffle while keeping at most shuf_win lines
    in memory.

    Args:
        f: a file-like object (or any iterable of lines).
        shuf_win (int): the size of the shuffle window.

    Yields:
        lines of f in (approximately) shuffled order.
    """
    heap = []
    for line in f:
        key = hash(line)
        if len(heap) < shuf_win:
            heapq.heappush(heap, (key, line))
        else:
            _, out = heapq.heappushpop(heap, (key, line))
            yield out

    # Drain whatever remains in the window.
    while len(heap) > 0:
        _, out = heapq.heappop(heap)
        yield out
def rmse(y, p):
    """Root Mean Squared Error (RMSE).

    Args:
        y (numpy.array): target
        p (numpy.array): prediction

    Returns:
        e (numpy.float64): RMSE
    """

    # Targets and predictions must be aligned one-to-one.
    assert y.shape == p.shape

    squared_error = mean_squared_error(y, p)
    return np.sqrt(squared_error)


def gini(y, p):
    """Normalized Gini Coefficient.

    Args:
        y (numpy.array): target
        p (numpy.array): prediction

    Returns:
        e (numpy.float64): normalized Gini coefficient
    """

    # Targets and predictions must be aligned one-to-one.
    assert y.shape == p.shape

    n_samples = y.shape[0]

    # Targets reordered from the largest to the smallest, once by the
    # target itself and once by the prediction.
    desc_by_true = y[np.argsort(y)[::-1]]
    desc_by_pred = y[np.argsort(p)[::-1]]

    # Lorenz curves for the two orderings and the diagonal baseline.
    lorenz_true = np.cumsum(desc_by_true) / np.sum(desc_by_true)
    lorenz_pred = np.cumsum(desc_by_pred) / np.sum(desc_by_pred)
    lorenz_ones = np.linspace(1 / n_samples, 1, n_samples)

    # Gini coefficients: area between each curve and the baseline.
    g_true = np.sum(lorenz_ones - lorenz_true)
    g_pred = np.sum(lorenz_ones - lorenz_pred)

    # Normalize by the Gini coefficient of the perfect ordering.
    return g_pred / g_true
class NN(object):
    """Neural network with a single hidden (h) layer.

    The network is trained on sampled (negative, positive) example pairs
    with a pairwise squared-error loss, optimized with the quasi-Newton
    method (L-BFGS-B), so it effectively maximizes AUC.
    """

    def __init__(self, n=5, h=10, b=100000, l1=.0, l2=.0, random_state=None):
        """Initialize the NN class object.

        Args:
            n (int): number of epochs
            h (int): number of h nodes
            b (int): number of input examples to be processed together to find
                the second order gradient for back-propagation
            l1 (float): regularization parameter for weights between the input
                and hidden layers
            l2 (float): regularization parameter for weights between the hidden
                and output layers.
            random_state (int or None): random seed
        """

        np.random.seed(random_state)
        self.h = h
        self.b = b
        self.n = n
        self.l1 = l1
        self.l2 = l2
        self.n_opt = 0

    def fit(self, X, y, X_val=None, y_val=None):
        """Train a network with the quasi-Newton method.

        Args:
            X (np.array of float): feature matrix for training
            y (np.array of float): target values for training
            X_val (np.array of float): feature matrix for validation
            y_val (np.array of float): target values for validation
        """
        y = y.reshape((len(y), 1))

        if sparse.issparse(X):
            X = X.tocsr()

        if X_val is not None:
            n_val = len(y_val)
            y_val = y_val.reshape((n_val, 1))

        # Set initial weights randomly.
        self.i = X.shape[1]
        self.l1 = self.l1 / self.i
        self.w = (np.random.rand((self.i + 2) * self.h + 1) - .5) * 1e-6
        self.w_opt = self.w
        self.n_opt = 0

        logging.info('training ...')
        n_obs = X.shape[0]
        batch = self.b
        n_epoch = self.n
        # list() so that np.random.shuffle() works under Python 3, where
        # range() is an immutable lazy sequence.
        idx = list(range(n_obs))
        self.auc_opt = .5

        start = time.time()
        print('\tEPOCH TRAIN VALID BEST TIME (m)')
        print('\t--------------------------------------------')

        # Before training
        p = self.predict_raw(X)
        auc = roc_auc_score(y, p)
        auc_val = auc
        if X_val is not None:
            p_val = self.predict_raw(X_val)
            auc_val = roc_auc_score(y_val, p_val)

        print('\t{:3d}: {:.6f} {:.6f} {:.6f} {:.2f}'.format(
            0, auc, auc_val, self.auc_opt,
            (time.time() - start) / SEC_PER_MIN))

        # Use 'while' instead of 'for' to increase n_epoch if the validation
        # error keeps improving at the end of n_epoch
        epoch = 1
        while epoch <= n_epoch:
            # Shuffle inputs every epoch - it helps avoiding the local optimum
            # when batch < n_obs.
            np.random.shuffle(idx)

            # Find the optimal weights for batch input examples.
            # If batch == 1, it's the stochastic optimization, which is slow
            # but uses minimal memory.  If batch == n_obs, it's the batch
            # optimization, which is fast but uses maximum memory.
            # Otherwise, it's the mini-batch optimization, which balances the
            # speed and space trade-offs.
            for i in range(int(n_obs / batch) + 1):
                if (i + 1) * batch > n_obs:
                    sub_idx = idx[batch * i:n_obs]
                else:
                    sub_idx = idx[batch * i:batch * (i + 1)]

                # When n_obs is a multiple of batch, the last slice is empty;
                # optimizing on an empty input would crash.
                if not sub_idx:
                    continue

                x = X[sub_idx]
                neg_idx = [n_idx for n_idx, n_y in enumerate(y[sub_idx]) if n_y == 0.]
                pos_idx = [p_idx for p_idx, p_y in enumerate(y[sub_idx]) if p_y == 1.]

                # The pairwise cost needs at least one example of each class
                # in the mini-batch.
                if not neg_idx or not pos_idx:
                    continue

                x0 = x[neg_idx]
                x1 = x[pos_idx]
                # Update weights to minimize the cost function using the
                # quasi-Newton method (L-BFGS-B), where:
                #   func -- cost function
                #   jac -- jacobian (derivative of the cost function)
                #   maxiter -- number of iterations for L-BFGS-B
                ret = minimize(self.func,
                               self.w,
                               args=(x0, x1),
                               method='L-BFGS-B',
                               jac=self.fprime,
                               options={'maxiter': 5})
                self.w = ret.x

            p = self.predict_raw(X)
            auc = roc_auc_score(y, p)
            auc_val = auc

            if X_val is not None:
                p_val = self.predict_raw(X_val)
                auc_val = roc_auc_score(y_val, p_val)

                if auc_val > self.auc_opt:
                    self.auc_opt = auc_val
                    self.w_opt = self.w
                    self.n_opt = epoch

                    # If validation auc is still improving after n_epoch,
                    # try 5 more epochs.
                    if epoch == n_epoch:
                        n_epoch += 5

            print('\t{:3d}: {:.6f} {:.6f} {:.6f} {:.2f}'.format(
                epoch, auc, auc_val, self.auc_opt,
                (time.time() - start) / SEC_PER_MIN))

            epoch += 1

        if X_val is not None:
            print('Optimal epoch is {0} ({1:.6f})'.format(self.n_opt,
                                                          self.auc_opt))
            self.w = self.w_opt

        logging.info('done training')

    def predict(self, X):
        """Predict targets for a feature matrix.

        Args:
            X (np.array of float): feature matrix for prediction

        Returns:
            prediction (np.array of float): predicted probabilities in [0, 1]
        """
        logging.info('predicting ...')
        ps = self.predict_raw(X)

        return sigm(ps[:, 0])

    def predict_raw(self, X):
        """Predict raw (pre-sigmoid) scores for a feature matrix.

        Args:
            X (np.array of float): feature matrix for prediction

        Returns:
            raw prediction (np.array of float) of shape (n_obs, 1)
        """
        # b -- bias for the input and h layers
        b = np.ones((X.shape[0], 1))
        w2 = self.w[-(self.h + 1):].reshape(self.h + 1, 1)
        w1 = self.w[:-(self.h + 1)].reshape(self.i + 1, self.h)

        # Make X to have the same number of columns as self.i.
        # Because of the sparse matrix representation, X for prediction can
        # have a different number of columns.
        if X.shape[1] > self.i:
            # If X has more columns, cut extra columns.
            X = X[:, :self.i]
        elif X.shape[1] < self.i:
            # If X has less columns, cut the rows of the weight matrix between
            # the input and h layers instead of X itself because the SciPy
            # sparse matrix does not support .set_shape() yet.
            # list() so that .append() works under Python 3.
            idx = list(range(X.shape[1]))
            idx.append(self.i)      # Include the last row for the bias
            w1 = w1[idx, :]

        if sparse.issparse(X):
            return np.hstack((sigm(sparse.hstack((X, b)).dot(w1)), b)).dot(w2)
        else:
            return np.hstack((sigm(np.hstack((X, b)).dot(w1)), b)).dot(w2)

    def func(self, w, *args):
        """Return the cost of the network for sampled example pairs.

        Args:
            w (array of float): weight vectors such that:
                w[:-h1] -- weights between the input and h layers
                w[-h1:] -- weights between the h and output layers
            args: negative examples (args[0]) and positive examples (args[1])

        Returns:
            combined cost of the mean pairwise squared error and the
            regularization terms.

        NOTE(review): despite the names, both self.l1 and self.l2 penalize
        squared weights, i.e. both are L2-style penalties.
        """
        x0 = args[0]
        x1 = args[1]

        n0 = x0.shape[0]
        n1 = x1.shape[0]

        # n -- number of pairs to evaluate
        n = max(n0, n1) * 10
        idx0 = np.random.choice(range(n0), size=n)
        idx1 = np.random.choice(range(n1), size=n)

        # b -- bias for the input and h layers
        b0 = np.ones((n0, 1))
        b1 = np.ones((n1, 1))
        i1 = self.i + 1
        h = self.h
        h1 = h + 1

        # Predict for features -- cannot use predict_raw() because here
        # different weights can be used.
        if sparse.issparse(x0):
            p0 = np.hstack((sigm(sparse.hstack((x0, b0)).dot(w[:-h1].reshape(
                i1, h))), b0)).dot(w[-h1:].reshape(h1, 1))
            p1 = np.hstack((sigm(sparse.hstack((x1, b1)).dot(w[:-h1].reshape(
                i1, h))), b1)).dot(w[-h1:].reshape(h1, 1))
        else:
            p0 = np.hstack((sigm(np.hstack((x0, b0)).dot(w[:-h1].reshape(
                i1, h))), b0)).dot(w[-h1:].reshape(h1, 1))
            p1 = np.hstack((sigm(np.hstack((x1, b1)).dot(w[:-h1].reshape(
                i1, h))), b1)).dot(w[-h1:].reshape(h1, 1))

        p0 = p0[idx0]
        p1 = p1[idx1]

        # Return the cost that consists of the mean pairwise squared error +
        # regularization for weights between the input and h layers +
        # regularization for weights between the h and output layers.
        return .5 * (sum((1 - p1 + p0) ** 2) / n +
                     self.l1 * sum(w[:-h1] ** 2) / (i1 * h) +
                     self.l2 * sum(w[-h1:] ** 2) / h1)

    def fprime(self, w, *args):
        """Return the derivatives of the cost function for sampled pairs.

        Args:
            w (array of float): weight vectors such that:
                w[:-h1] -- weights between the input and h layers
                w[-h1:] -- weights between the h and output layers
            args: negative examples (args[0]) and positive examples (args[1])

        Returns:
            gradients of the cost function with respect to w
        """

        x0 = args[0]
        x1 = args[1]

        n0 = x0.shape[0]
        n1 = x1.shape[0]

        # n -- number of pairs to evaluate
        n = max(n0, n1) * 10
        idx0 = np.random.choice(range(n0), size=n)
        idx1 = np.random.choice(range(n1), size=n)

        # b -- bias for the input and h layers
        b = np.ones((n, 1))
        i1 = self.i + 1
        h = self.h
        h1 = h + 1

        w2 = w[-h1:].reshape(h1, 1)
        w1 = w[:-h1].reshape(i1, h)

        if sparse.issparse(x0):
            x0 = x0.tocsr()[idx0]
            x1 = x1.tocsr()[idx1]
            xb0 = sparse.hstack((x0, b))
            xb1 = sparse.hstack((x1, b))
        else:
            x0 = x0[idx0]
            x1 = x1[idx1]
            xb0 = np.hstack((x0, b))
            xb1 = np.hstack((x1, b))

        z0 = np.hstack((sigm(xb0.dot(w1)), b))
        z1 = np.hstack((sigm(xb1.dot(w1)), b))
        y0 = z0.dot(w2)
        y1 = z1.dot(w2)

        # Derivative of the .5 * (1 - p1 + p0)^2 pairwise error w.r.t. the
        # raw score difference.  The sigmoid variant is kept for reference:
        #   e = 1 - sigm(y1 - y0); dy = e * dsigm(y1 - y0)
        e = 1 - (y1 - y0)
        dy = e / n

        # Calculate the derivative of the cost function w.r.t. F and w2 where:
        # F -- weights between the input and h layers
        # w2 -- weights between the h and output layers
        dw1 = -(xb1.T.dot(dy.dot(w2[:-1].reshape(1, h)) * dsigm(xb1.dot(w1))) -
                xb0.T.dot(dy.dot(w2[:-1].reshape(1, h)) * dsigm(xb0.dot(w1)))
                ).reshape(i1 * h) + self.l1 * w[:-h1] / (i1 * h)
        dw2 = -(z1 - z0).T.dot(dy).reshape(h1) + self.l2 * w[-h1:] / h1

        return np.append(dw1, dw2)
320 | 321 | Args: 322 | x (np.array of float or float) 323 | 324 | Returns: 325 | value(s) of the sigmoid function for x. 326 | """ 327 | 328 | # Avoid numerical overflow by capping the input to the exponential 329 | # function - doesn't affect the return value. 330 | return 1 / (1 + np.exp(-np.maximum(x, -20))) 331 | 332 | 333 | def dsigm(x): 334 | """Return the value of derivative of sigmoid function w.r.t. x. 335 | Args: 336 | x (np.array of float or float) 337 | 338 | Returns: 339 | derivative(s) of the sigmoid function w.r.t. x. 340 | """ 341 | 342 | return sigm(x) * (1 - sigm(x)) 343 | -------------------------------------------------------------------------------- /kaggler/online_model/DecisionTree/OnlineClassificationTree.py: -------------------------------------------------------------------------------- 1 | from _tree import Tree 2 | from OnlineDecisionTree import * 3 | from utils import * 4 | import numpy as np 5 | import pandas as pd 6 | 7 | class ClassificationTree(Tree): 8 | 9 | def __init__( 10 | self, 11 | number_of_features, 12 | number_of_functions=10, 13 | min_sample_split=200, 14 | predict_initialize={ 15 | 'count_dict': {}, 16 | } 17 | ): 18 | # Constant values 19 | self.number_of_features = number_of_features 20 | self.number_of_functions = number_of_functions 21 | self.min_sample_split = min_sample_split 22 | self.predict_initialize = predict_initialize 23 | self.max_sample = 1000 24 | # Dynamic values 25 | self.left = None 26 | self.right = None 27 | self.randomly_selected_features = [] 28 | self._randomly_select() 29 | self.criterion = None 30 | 31 | 32 | def _calculate_split_score(self, split): 33 | """ 34 | calculate the score of the split: 35 | score = current_error - after_split_error 36 | """ 37 | left_error = gini(split['left']) 38 | right_error = gini(split['right']) 39 | error = gini(self.Y) 40 | # if the split is any good, the score should be greater than 0 41 | total = float(len(self.Y)) 42 | score = error - 1 / total * 
(len(split['left']) * left_error\ 43 | + len(split['right']) * right_error) 44 | return score 45 | 46 | def _apply_best_split(self): 47 | best_split, best_split_score = self._find_best_split() 48 | if best_split_score > 0: 49 | self.criterion = lambda x : x[best_split['feature']] \ 50 | > best_split['value'] 51 | # create the left child 52 | self.left = ClassificationTree( 53 | number_of_features=self.number_of_features, 54 | number_of_functions=self.number_of_functions, 55 | min_sample_split=self.min_sample_split, 56 | predict_initialize={ 57 | 'count_dict': count_dict(best_split['left']), 58 | } 59 | ) 60 | # create the right child 61 | self.right = ClassificationTree( 62 | number_of_features=self.number_of_features, 63 | number_of_functions=self.number_of_functions, 64 | min_sample_split=self.min_sample_split, 65 | predict_initialize={ 66 | 'count_dict': count_dict(best_split['right']), 67 | } 68 | ) 69 | # Collect garbage 70 | self.samples = {} 71 | self.Y = [] 72 | 73 | 74 | def predict(self, x): 75 | """ 76 | Make prediction recursively. Use both the samples inside the current 77 | node and the statistics inherited from parent. 
78 | """ 79 | if self._is_leaf(): 80 | d1 = self.predict_initialize['count_dict'] 81 | d2 = count_dict(self.Y) 82 | for key, value in d1.iteritems(): 83 | if key in d2: 84 | d2[key] += value 85 | else: 86 | d2[key] = value 87 | return argmax(d2) 88 | else: 89 | if self.criterion(x): 90 | return self.right.predict(x) 91 | else: 92 | return self.left.predict(x) 93 | -------------------------------------------------------------------------------- /kaggler/online_model/DecisionTree/_tree.pyx: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | cimport numpy as np 4 | 5 | from utils import * 6 | 7 | ctypedef np.int_t DTYPE_t 8 | 9 | cdef class Tree: 10 | 11 | def __cinit__( 12 | self, 13 | int number_of_features, 14 | int number_of_functions=10, 15 | int min_sample_split=20, 16 | dict predict_initialize={ 17 | 'mean':2.0, 18 | 'variance':1.0, 19 | 'num_samples':0 20 | } 21 | ): 22 | # Constant values 23 | self.number_of_features = number_of_features 24 | self.number_of_functions = number_of_functions 25 | self.min_sample_split = min_sample_split 26 | self.predict_initialize = predict_initialize 27 | self.max_sample = 100 28 | # Dynamic values 29 | self.left = None 30 | self.right = None 31 | self.randomly_selected_features = [] 32 | self._randomly_select() 33 | self.criterion = None 34 | 35 | 36 | def _randomly_select(self): 37 | # Check the number of randomly selected features 38 | if self.number_of_features < self.number_of_functions: 39 | raise Exception("The feature number is more than maximum") 40 | 41 | # Randomly select features into a set, and then transform to a list 42 | self.randomly_selected_features=set([]) 43 | while len(self.randomly_selected_features) < self.number_of_functions: 44 | self.randomly_selected_features.add(\ 45 | random.randint(0, self.number_of_features-1)) 46 | self.randomly_selected_features = list(self.randomly_selected_features) 47 | 48 | # Initialize the samples 
belong to the node 49 | self.samples = {} 50 | self.Y = [] 51 | for feature in self.randomly_selected_features: 52 | self.samples[feature] = [] 53 | 54 | def _is_leaf(self): 55 | return self.criterion == None 56 | 57 | cpdef update(self, np.ndarray x, y): 58 | """ 59 | Update the model according to a single (x, y) input. 60 | 61 | If the current node is a leaf, then update the samples of the 62 | current node. 63 | 64 | Else update its left or right node recursively according to the 65 | value of x. 66 | When the left and right child are created, they inherit mean and 67 | sample count information from the parent. 68 | """ 69 | cdef int N 70 | if self._is_leaf(): 71 | N = len(self.Y) 72 | if N <= self.max_sample: 73 | self._update_samples(x, y) 74 | if N == self.min_sample_split or N == 2 * self.min_sample_split: 75 | self._apply_best_split() 76 | 77 | else: 78 | if self.criterion(x): 79 | self.right.update(x, y) 80 | else: 81 | self.left.update(x, y) 82 | 83 | cpdef _update_samples(self, np.ndarray x, DTYPE_t y): 84 | cdef int feature 85 | for feature in self.randomly_selected_features: 86 | self.samples[feature].append((x[feature], y)) 87 | self.Y.append(y) 88 | 89 | cpdef tuple _find_best_split(self): 90 | cdef dict best_split = {} 91 | cdef double best_split_score = 0 92 | cdef int feature 93 | cdef double value 94 | cdef DTYPE_t prediction 95 | cdef list sample_feature 96 | cdef list left, right 97 | cdef dict split 98 | cdef double split_score 99 | # Try all the selected features and values combination, find the best 100 | for feature in self.randomly_selected_features: 101 | for (value, prediction) in self.samples[feature]: 102 | sample_feature = self.samples[feature] 103 | left, right = bin_split(sample_feature, value) 104 | 105 | split = { 106 | 'left': left, 107 | 'right': right, 108 | 'value': value, 109 | 'feature': feature, 110 | } 111 | 112 | split_score = self._calculate_split_score(split) 113 | if split_score > best_split_score: 114 | best_split = 
split 115 | best_split_score = split_score 116 | 117 | return best_split, best_split_score 118 | -------------------------------------------------------------------------------- /kaggler/online_model/DecisionTree/test.py: -------------------------------------------------------------------------------- 1 | import profile 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn import preprocessing 5 | from OnlineClassificationTree import * 6 | 7 | def test(): 8 | filename = "dataset.csv" 9 | df = pd.read_csv(filename, header = 0) 10 | data = df.values 11 | y = data[:, -1] 12 | lbl_enc = preprocessing.LabelEncoder() 13 | y = lbl_enc.fit_transform(y) 14 | data = data[:, 0:-1] 15 | train = data[0:50000] 16 | ytrain = y[0:50000] 17 | test = data[50000:] 18 | ytest = y[50000:] 19 | learner = ClassificationTree(number_of_features=93) 20 | 21 | for t, x in enumerate(train): 22 | learner.update(x, ytrain[t]) 23 | if t % 1000 == 0: 24 | print t 25 | correct_num = 0 26 | for t, x in enumerate(test): 27 | y_pred = learner.predict(x) 28 | if y_pred == ytest[t]: 29 | correct_num += 1 30 | if t % 1000 == 0: 31 | print t 32 | 33 | print correct_num 34 | 35 | if __name__ == '__main__': 36 | profile.run("test()") 37 | -------------------------------------------------------------------------------- /kaggler/online_model/DecisionTree/utils.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | cimport numpy as np 3 | cimport cython 4 | from libc.math cimport sqrt, abs 5 | 6 | ctypedef np.int_t DTYPE_t 7 | 8 | cpdef DTYPE_t argmax(dict d): 9 | cdef double max_count = 0 10 | cdef double total_count = 0 11 | cdef double value 12 | cdef DTYPE_t key 13 | cdef DTYPE_t max_class = 0 14 | for key, value in d.iteritems(): 15 | total_count += value 16 | if value > max_count: 17 | max_count = value 18 | max_class = key 19 | return max_class 20 | 21 | 22 | def predict_max(list a): 23 | return argmax(count_dict(a)) 24 | 25 | cpdef 
dict count_dict(list a): 26 | cdef DTYPE_t x 27 | cdef dict d = {} 28 | for x in a: 29 | d.setdefault(x, 0) 30 | d[x] += 1 31 | return d 32 | 33 | cpdef double mean_squared_error(list x): 34 | cdef np.ndarray xnp 35 | xnp = np.array(x) 36 | xnp = xnp - xnp.mean() 37 | return sqrt((xnp * xnp.T).mean()) 38 | 39 | cpdef double mean_absolute_error(list x): 40 | cdef np.ndarray xnp 41 | xnp = np.array(x) 42 | xnp = xnp - xnp.mean() 43 | return abs(xnp).mean() 44 | 45 | cpdef double gini(list x): 46 | cdef dict d = {} 47 | cdef double total 48 | cdef list to_square 49 | cdef np.ndarray to_square2 50 | cdef DTYPE_t y 51 | for y in x: 52 | d.setdefault(y, 0) 53 | d[y] += 1 54 | total = len(x) 55 | to_square = [] 56 | cdef double value 57 | cdef DTYPE_t key 58 | for key, value in d.iteritems(): 59 | to_square.append(value/total) 60 | to_square2 = np.array(to_square) 61 | return 1 - (to_square2 * to_square2.T).sum() 62 | 63 | cpdef tuple bin_split(list sample_feature, double feature_value): 64 | cdef list left, right 65 | cdef tuple x 66 | left = [x[1] for x in sample_feature if x[0]<=feature_value] 67 | right = [x[1] for x in sample_feature if x[0]>feature_value] 68 | return left, right 69 | -------------------------------------------------------------------------------- /kaggler/online_model/__init__.py: -------------------------------------------------------------------------------- 1 | from .ftrl import FTRL 2 | from .fm import FM 3 | from .nn import NN 4 | from .nn_h2 import NN_H2 5 | from .sgd import SGD 6 | -------------------------------------------------------------------------------- /kaggler/online_model/fm.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False 2 | # cython: wraparound=False 3 | # cython: cdivision=True 4 | from __future__ import division 5 | import numpy as np 6 | 7 | cimport cython 8 | from libc.math cimport sqrt, abs 9 | from ..util cimport sigm 10 | cimport numpy as np 11 | 12 | 
from cython.parallel import prange, parallel, threadid 13 | 14 | np.import_array() 15 | 16 | 17 | cdef class FM: 18 | """Factorization Machine online learner. 19 | 20 | Attributes: 21 | n (int): number of input features 22 | epoch (int): number of epochs 23 | k (int): size of factors for interactions 24 | a (double): initial learning rate 25 | w0 (double): weight for bias 26 | c0 (double): counters 27 | w (array of double): feature weights 28 | c (array of double): counters for weights 29 | V (array of double): feature weights for factors 30 | """ 31 | 32 | cdef unsigned int epoch 33 | cdef unsigned int n 34 | cdef unsigned int k 35 | cdef double a 36 | cdef double w0 37 | cdef double c0 38 | cdef double[:] w 39 | cdef double[:] c 40 | cdef double[:] V 41 | 42 | def __init__(self, 43 | unsigned int n, 44 | unsigned int epoch=100, 45 | unsigned int dim=4, 46 | double a=0.01, 47 | seed=0): 48 | """Initialize the FM class object. 49 | 50 | Args: 51 | n (int): number of input features 52 | epoch (int): number of epochs 53 | dim (int): size of factors for interactions 54 | a (double): initial learning rate 55 | seed (int): random seed 56 | """ 57 | cdef int i 58 | 59 | rng = np.random.RandomState(seed) 60 | 61 | self.n = n # # of features 62 | self.epoch = epoch # # of epochs 63 | self.k = dim # interaction dimension 64 | self.a = a # learning rate 65 | 66 | # initialize weights, factorized interactions, and counts 67 | self.w0 = 0. 68 | self.c0 = 0. 69 | self.w = np.zeros((self.n,), dtype=np.float64) 70 | self.c = np.zeros((self.n,), dtype=np.float64) 71 | self.V = (rng.rand(self.n * self.k) - .5) * 1e-6 72 | 73 | def __repr__(self): 74 | return ('FM(n={}, epoch={}, dim={}, a={})').format( 75 | self.n, self.epoch, self.dim, self.a 76 | ) 77 | 78 | def read_sparse(self, path): 79 | """Apply hashing trick to the libsvm format sparse file. 
80 | 81 | Args: 82 | path (str): a file path to the libsvm format sparse file 83 | 84 | Yields: 85 | idx (list of int): a list of index of non-zero features 86 | val (list of double): a list of values of non-zero features 87 | y (int): target value 88 | """ 89 | for line in open(path): 90 | xs = line.rstrip().split(' ') 91 | 92 | y = int(xs[0]) 93 | idx = [] 94 | val = [] 95 | for item in xs[1:]: 96 | i, v = item.split(':') 97 | idx.append(int(i)) 98 | val.append(float(v)) 99 | 100 | yield zip(idx, val), y 101 | 102 | def fit(self, X, y): 103 | """Update the model with a sparse input feature matrix and its targets. 104 | 105 | Args: 106 | X (scipy.sparse.csr_matrix): a list of (index, value) of non-zero features 107 | y (numpy.array): targets 108 | 109 | Returns: 110 | updated model weights and counts 111 | """ 112 | n = X.shape[0] 113 | for epoch in range(self.epoch): 114 | for row in range(n): 115 | x = zip(X[row].indices, X[row].data) 116 | self.update_one(x, self.predict_one(x) - y[row]) 117 | 118 | def predict(self, X): 119 | """Predict for a sparse matrix X. 120 | 121 | Args: 122 | X (scipy.sparse.csr_matrix): a sparse matrix for input features 123 | 124 | Returns: 125 | p (numpy.array): predictions for input features 126 | """ 127 | 128 | p = np.zeros((X.shape[0], ), dtype=np.float64) 129 | for row in range(X.shape[0]): 130 | p[row] = self.predict_one(zip(X[row].indices, X[row].data)) 131 | 132 | return p 133 | 134 | def predict_one(self, list x): 135 | """Predict for features. 136 | 137 | Args: 138 | x (list of tuple): a list of (index, value) of non-zero features 139 | 140 | Returns: 141 | p (double): a prediction for input features 142 | """ 143 | cdef int i 144 | cdef int k 145 | cdef double v 146 | cdef double p 147 | cdef double wx 148 | cdef double[:] vx 149 | cdef double[:] v2x2 150 | 151 | wx = 0. 
152 | vx = np.zeros((self.k,), dtype=np.float64) 153 | v2x2 = np.zeros((self.k,), dtype=np.float64) 154 | for i, v in x: 155 | wx += self.w[i] * v 156 | for k in range(self.k): 157 | vx[k] += self.V[i * self.k + k] * v 158 | v2x2[k] += (self.V[i * self.k + k] ** 2) * (v ** 2) 159 | 160 | p = self.w0 + wx 161 | for k in range(self.k): 162 | p += .5 * (vx[k] ** 2 - v2x2[k]) 163 | 164 | return sigm(p) 165 | 166 | def update_one(self, list x, double e): 167 | """Update the model. 168 | 169 | Args: 170 | idx (list of int): a list of index of non-zero features 171 | val (list of double): a list of values of non-zero features 172 | e (double): error between the prediction of the model and target 173 | 174 | Returns: 175 | updated model weights and counts 176 | """ 177 | cdef int i 178 | cdef int k 179 | cdef int f 180 | cdef double v 181 | cdef double g2 182 | cdef double dl_dw 183 | cdef double[:] vx 184 | 185 | # calculate v_f * x in advance 186 | vx = np.zeros((self.k,), dtype=np.float64) 187 | for i, v in x: 188 | for k in range(self.k): 189 | vx[k] += self.V[i * self.k + k] * v 190 | 191 | # update w0, w, V, c0, and c 192 | g2 = e * e 193 | 194 | self.w0 -= self.a / (sqrt(self.c0) + 1) * e 195 | for i, v in x: 196 | dl_dw = self.a / (sqrt(self.c[i]) + 1) * e * v 197 | self.w[i] -= dl_dw 198 | for f in range(self.k): 199 | self.V[i * self.k + f] -= dl_dw * (vx[f] - 200 | self.V[i * self.k + f] * v) 201 | 202 | self.c[i] += g2 203 | 204 | self.c0 += g2 205 | -------------------------------------------------------------------------------- /kaggler/online_model/ftrl.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False 2 | # cython: wraparound=False 3 | # cython: cdivision=True 4 | from __future__ import division 5 | import numpy as np 6 | 7 | cimport cython 8 | from libc.math cimport sqrt, abs 9 | from ..util cimport sigm 10 | cimport numpy as np 11 | 12 | 13 | np.import_array() 14 | 15 | 16 | cdef class 
FTRL: 17 | """FTRL online learner with the hasing trick using liblinear format data. 18 | 19 | inspired by Kaggle user tinrtgu's code at http://goo.gl/K8hQBx 20 | original FTRL paper is available at http://goo.gl/iqIaH0 21 | 22 | Attributes: 23 | n (int): number of features after hashing trick 24 | epoch (int): number of epochs 25 | a (double): alpha in the per-coordinate rate 26 | b (double): beta in the per-coordinate rate 27 | l1 (double): L1 regularization parameter 28 | l2 (double): L2 regularization parameter 29 | w (array of double): feature weights 30 | c (array of double): counters for weights 31 | z (array of double): lazy weights 32 | interaction (boolean): whether to use 2nd order interaction or not 33 | """ 34 | 35 | cdef double a # learning rate 36 | cdef double b 37 | cdef double l1 38 | cdef double l2 39 | cdef unsigned int epoch 40 | cdef unsigned int n 41 | cdef bint interaction 42 | cdef double[:] w 43 | cdef double[:] c 44 | cdef double[:] z 45 | 46 | def __init__(self, 47 | double a=0.01, 48 | double b=1., 49 | double l1=1., 50 | double l2=1., 51 | unsigned int n=2**20, 52 | unsigned int epoch=1, 53 | bint interaction=True): 54 | """Initialize the FTRL class object. 
55 | 56 | Args: 57 | a (double): alpha in the per-coordinate rate 58 | b (double): beta in the per-coordinate rate 59 | l1 (double): L1 regularization parameter 60 | l2 (double): L2 regularization parameter 61 | n (int): number of features after hashing trick 62 | epoch (int): number of epochs 63 | interaction (boolean): whether to use 2nd order interaction or not 64 | """ 65 | 66 | self.a = a 67 | self.b = b 68 | self.l1 = l1 69 | self.l2 = l2 70 | self.n = n 71 | self.epoch = epoch 72 | self.interaction = interaction 73 | 74 | # initialize weights and counts 75 | self.w = np.zeros((self.n + 1,), dtype=np.float64) 76 | self.c = np.zeros((self.n + 1,), dtype=np.float64) 77 | self.z = np.zeros((self.n + 1,), dtype=np.float64) 78 | 79 | def __repr__(self): 80 | return ('FTRL(a={}, b={}, l1={}, l2={}, n={}, epoch={}, interaction={})').format( 81 | self.a, self.b, self.l1, self.l2, self.n, self.epoch, self.interaction 82 | ) 83 | 84 | def _indices(self, list x): 85 | cdef unsigned int index 86 | cdef int l 87 | cdef int i 88 | cdef int j 89 | 90 | # return the index of the bias term 91 | yield self.n 92 | 93 | for index in x: 94 | yield abs(hash(index)) % self.n 95 | 96 | if self.interaction: 97 | l = len(x) 98 | x = sorted(x) 99 | for i in xrange(l): 100 | for j in xrange(i + 1, l): 101 | yield abs(hash('{}_{}'.format(x[i], x[j]))) % self.n 102 | 103 | def read_sparse(self, path): 104 | """Apply hashing trick to the libsvm format sparse file. 105 | 106 | Args: 107 | path (str): a file path to the libsvm format sparse file 108 | 109 | Yields: 110 | x (list of int): a list of index of non-zero features 111 | y (int): target value 112 | """ 113 | for line in open(path): 114 | xs = line.rstrip().split(' ') 115 | 116 | y = int(xs[0]) 117 | x = [] 118 | for item in xs[1:]: 119 | index, _ = item.split(':') 120 | x.append(index) 121 | 122 | yield x, y 123 | 124 | def fit(self, X, y): 125 | """Update the model with a sparse input feature matrix and its targets. 
126 | 127 | Args: 128 | X (scipy.sparse.csr_matrix): a list of (index, value) of non-zero features 129 | y (numpy.array): targets 130 | 131 | Returns: 132 | updated model weights and counts 133 | """ 134 | for epoch in range(self.epoch): 135 | for row in range(X.shape[0]): 136 | x = list(X[row].indices) 137 | self.update_one(x, self.predict_one(x) - y[row]) 138 | 139 | def predict(self, X): 140 | """Predict for a sparse matrix X. 141 | 142 | Args: 143 | X (scipy.sparse.csr_matrix): a sparse matrix for input features 144 | 145 | Returns: 146 | p (numpy.array): predictions for input features 147 | """ 148 | p = np.zeros((X.shape[0], ), dtype=np.float64) 149 | for row in range(X.shape[0]): 150 | p[row] = self.predict_one(list(X[row].indices)) 151 | 152 | return p 153 | 154 | def update_one(self, list x, double e): 155 | """Update the model. 156 | 157 | Args: 158 | x (list of int): a list of index of non-zero features 159 | e (double): error between prediction of the model and target 160 | 161 | Returns: 162 | updates model weights and counts 163 | """ 164 | cdef int i 165 | cdef double e2 166 | cdef double s 167 | 168 | e2 = e * e 169 | for i in self._indices(x): 170 | s = (sqrt(self.c[i] + e2) - sqrt(self.c[i])) / self.a 171 | self.w[i] += e - s * self.z[i] 172 | self.c[i] += e2 173 | 174 | def predict_one(self, list x): 175 | """Predict for features. 176 | 177 | Args: 178 | x (list of int): a list of index of non-zero features 179 | 180 | Returns: 181 | p (double): a prediction for input features 182 | """ 183 | cdef int i 184 | cdef double sign 185 | cdef double wTx 186 | 187 | wTx = 0. 188 | for i in self._indices(x): 189 | sign = -1. if self.w[i] < 0 else 1. 190 | if sign * self.w[i] <= self.l1: 191 | self.z[i] = 0. 
192 | else: 193 | self.z[i] = (sign * self.l1 - self.w[i]) / \ 194 | ((self.b + sqrt(self.c[i])) / self.a + self.l2) 195 | 196 | wTx += self.z[i] 197 | 198 | return sigm(wTx) 199 | -------------------------------------------------------------------------------- /kaggler/online_model/ftrl_dropout.pyx: -------------------------------------------------------------------------------- 1 | from csv import DictReader 2 | from math import exp, log, sqrt 3 | 4 | import cPickle as pickle 5 | import gzip 6 | import random 7 | 8 | 9 | class ftrl_proximal(object): 10 | ''' Our main algorithm: Follow the regularized leader - proximal 11 | 12 | In short, 13 | this is an adaptive-learning-rate sparse logistic-regression with 14 | efficient L1-L2-regularization 15 | 16 | Reference: 17 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 18 | ''' 19 | 20 | def __init__(self, alpha, beta, L1, L2, D, interaction=False, dropout=1.0): 21 | # parameters 22 | self.alpha = alpha 23 | self.beta = beta 24 | self.L1 = L1 25 | self.L2 = L2 26 | 27 | # feature related parameters 28 | self.D = D 29 | self.interaction = interaction 30 | self.dropout = dropout 31 | 32 | # model 33 | # n: squared sum of past gradients 34 | # z: weights 35 | # w: lazy weights 36 | self.n = [0.] * D 37 | self.z = [0.] * D 38 | 39 | self.w = [0.] * D # use this for execution speed up 40 | 41 | def _indices(self, x): 42 | ''' A helper generator that yields the indices in x 43 | 44 | The purpose of this generator is to make the following 45 | code a bit cleaner when doing feature interaction. 
46 | ''' 47 | 48 | for i in x: 49 | yield i 50 | 51 | if self.interaction: 52 | L = len(x) 53 | for i in xrange(1, L): # skip bias term, so we start at 1 54 | for j in xrange(i+1, L): 55 | # one-hot encode interactions with hash trick 56 | yield abs(hash(str(x[i]) + '_' + str(x[j]))) % self.D 57 | 58 | def predict(self, x, dropped = None): 59 | ''' Get probability estimation on x 60 | 61 | INPUT: 62 | x: features 63 | 64 | OUTPUT: 65 | probability of p(y = 1 | x; w) 66 | ''' 67 | # params 68 | dropout = self.dropout 69 | 70 | # model 71 | w = self.w 72 | 73 | # wTx is the inner product of w and x 74 | wTx = 0. 75 | for j, i in enumerate(self._indices(x)): 76 | 77 | if dropped != None and dropped[j]: 78 | continue 79 | 80 | wTx += w[i] 81 | 82 | if dropped != None: wTx /= dropout 83 | 84 | # bounded sigmoid function, this is the probability estimation 85 | return 1. / (1. + exp(-max(min(wTx, 35.), -35.))) 86 | 87 | def update(self, x, y): 88 | ''' Update model using x, p, y 89 | 90 | INPUT: 91 | x: feature, a list of indices 92 | p: click probability prediction of our model 93 | y: answer 94 | 95 | MODIFIES: 96 | self.n: increase by squared gradient 97 | self.z: weights 98 | ''' 99 | 100 | # parameters 101 | alpha = self.alpha 102 | beta = self.beta 103 | L1 = self.L1 104 | L2 = self.L2 105 | 106 | # model 107 | n = self.n 108 | z = self.z 109 | w = self.w # no need to change this, it won't gain anything 110 | dropout = self.dropout 111 | 112 | ind = [ i for i in self._indices(x)] 113 | 114 | if dropout == 1: 115 | dropped = None 116 | else: 117 | dropped = [random.random() > dropout for i in xrange(0,len(ind))] 118 | 119 | p = self.predict(x, dropped) 120 | 121 | # gradient under logloss 122 | g = p - y 123 | 124 | # update z and n 125 | for j, i in enumerate(ind): 126 | 127 | # implement dropout as overfitting prevention 128 | if dropped != None and dropped[j]: continue 129 | 130 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 131 | z[i] += g - sigma * w[i] 
132 | n[i] += g * g 133 | 134 | sign = -1. if z[i] < 0 else 1. # get sign of z[i] 135 | 136 | # build w on the fly using z and n, hence the name - lazy weights - 137 | if sign * z[i] <= L1: 138 | # w[i] vanishes due to L1 regularization 139 | w[i] = 0. 140 | else: 141 | # apply prediction time L1, L2 regularization to z and get w 142 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 143 | 144 | def read_csv(self, f_train): 145 | ''' GENERATOR: Apply hash-trick to the original csv row 146 | and for simplicity, we one-hot-encode everything 147 | 148 | INPUT: 149 | path: path to training or testing file 150 | 151 | YIELDS: 152 | ID: id of the instance, mainly useless 153 | x: a list of hashed and one-hot-encoded 'indices' 154 | we only need the index since all values are either 0 or 1 155 | y: y = 1 if we have a click, else we have y = 0 156 | ''' 157 | for t, row in enumerate(DictReader(f_train)): 158 | # process id 159 | ID = row['id'] 160 | del row['id'] 161 | 162 | # process clicks 163 | y = 0. 164 | if 'click' in row: 165 | if row['click'] == '1': 166 | y = 1. 
167 | del row['click'] 168 | 169 | # turn hour really into hour, it was originally YYMMDDHH 170 | 171 | date = row['hour'][0:6] 172 | row['hour'] = row['hour'][6:] 173 | 174 | # stderr.write("_%s_" % date) 175 | 176 | # extract date 177 | row['wd'] = str(int(date) % 7) 178 | row['wd_hour'] = "%s_%s" % (row['wd'], row['hour']) 179 | 180 | # build x 181 | x = [0] # 0 is the index of the bias term 182 | for key in row: 183 | value = row[key] 184 | 185 | # one-hot encode everything with hash trick 186 | index = abs(hash(key + '_' + value)) % self.D 187 | x.append(index) 188 | 189 | yield t, ID, x, y 190 | 191 | def write_model(self, model, model_save, args): 192 | with gzip.open(model_save, "wb") as model_file: 193 | pickle.dump((args, model), model_file) 194 | 195 | def load_model(self, model_save): 196 | with gzip.open(model_save, "rb") as model_file: 197 | (p, model) = pickle.load(model_file) 198 | 199 | return model 200 | -------------------------------------------------------------------------------- /kaggler/online_model/ftrl_fm.pyx: -------------------------------------------------------------------------------- 1 | ''' Based on Tinrtgu's FTRL code: http://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory 2 | ''' 3 | 4 | from csv import DictReader 5 | cimport cython 6 | from libc.math cimport exp, copysign, log, sqrt 7 | import numpy as np 8 | import copy 9 | cimport numpy as np 10 | np.import_array() 11 | from cython.parallel import parallel 12 | from datetime import datetime 13 | import random 14 | 15 | cdef class FTRL_FM: 16 | cdef double alpha # learning rate 17 | cdef double beta 18 | cdef double alpha_fm # learning rate 19 | cdef double beta_fm 20 | cdef double L1 21 | cdef double L2 22 | cdef double L1_fm 23 | cdef double L2_fm 24 | cdef double L1_fm_tmp 25 | cdef double L2_fm_tmp 26 | cdef unsigned int fm_dim 27 | cdef unsigned int D 28 | cdef double fm_initDev 29 | cdef double dropoutRate 30 | 31 | 32 | 
cdef unsigned int epoch 33 | # cdef unsigned int n 34 | cdef bint interaction 35 | cdef double[:] w 36 | cdef double[:] n 37 | cdef double[:] z 38 | cdef dict n_fm 39 | cdef dict z_fm 40 | cdef dict w_fm 41 | def __init__( 42 | self, 43 | unsigned int fm_dim=4, 44 | double fm_initDev=0.01, 45 | double L1=0.0, 46 | double L2=0.0, 47 | double L1_fm=0.0, 48 | double L2_fm=0.0, 49 | unsigned int D=2*22, 50 | double alpha=0.005, 51 | double beta=1.0, 52 | double alpha_fm = .1, 53 | double beta_fm = 1.0, 54 | double dropoutRate = 1.0 55 | ): 56 | ''' initialize the factorization machine.''' 57 | 58 | self.alpha = alpha # learning rate parameter alpha 59 | self.beta = beta # learning rate parameter beta 60 | self.L1 = L1 # L1 regularizer for first order terms 61 | self.L2 = L2 # L2 regularizer for first order terms 62 | self.alpha_fm = alpha_fm # learning rate parameter alpha for factorization machine 63 | self.beta_fm = beta_fm # learning rate parameter beta for factorization machine 64 | self.L1_fm = L1_fm # L1 regularizer for factorization machine weights. Only use L1 after one epoch of training, because small initializations are needed for gradient. 65 | self.L2_fm = L2_fm # L2 regularizer for factorization machine weights. 66 | self.fm_dim = fm_dim # dimension of factorization. 67 | self.fm_initDev = fm_initDev # standard deviation for random intitialization of factorization weights. 68 | self.dropoutRate = dropoutRate # dropout rate (which is actually the inclusion rate), i.e. dropoutRate = .8 indicates a probability of .2 of dropping out a feature. 69 | 70 | self.L1_fm_tmp = L1_fm # L1 regularizer for factorization machine weights. Only use L1 after one epoch of training, because small initializations are needed for gradient. 71 | self.L2_fm_tmp = L2_fm # L2 regularizer for factorization machine weights. 
72 | 73 | self.D = D 74 | 75 | # model 76 | # n: squared sum of past gradients 77 | # z: weights 78 | # w: lazy weights 79 | 80 | # let index 0 be bias term to avoid collisions. 81 | self.n = np.zeros(self.D + 1, dtype=np.float64) 82 | self.z = np.zeros(self.D + 1, dtype=np.float64) 83 | self.w = np.zeros(self.D + 1, dtype=np.float64) 84 | 85 | self.n_fm = {} 86 | self.z_fm = {} 87 | self.w_fm = {} 88 | 89 | 90 | def init_fm(self,unsigned int i): 91 | ''' initialize the factorization weight vector for variable i. 92 | ''' 93 | cdef unsigned int k 94 | if i not in self.n_fm: 95 | self.n_fm[i] = np.zeros(self.fm_dim, dtype=np.float64) 96 | self.w_fm[i] = np.zeros(self.fm_dim, dtype=np.float64) 97 | self.z_fm[i] = np.zeros(self.fm_dim, dtype=np.float64) 98 | 99 | for k in range(self.fm_dim): 100 | self.z_fm[i][k] = random.gauss(0., self.fm_initDev) 101 | 102 | def predict_raw(self, list x): 103 | ''' predict_one the raw score prior to logit transformation. 104 | ''' 105 | alpha = self.alpha 106 | beta = self.beta 107 | L1 = self.L1 108 | L2 = self.L2 109 | alpha_fm = self.alpha_fm 110 | beta_fm = self.beta_fm 111 | L1_fm = self.L1_fm 112 | L2_fm = self.L2_fm 113 | 114 | # first order weights model 115 | n = self.n 116 | z = self.z 117 | w = self.w 118 | 119 | # FM interaction model 120 | n_fm = self.n_fm 121 | z_fm = self.z_fm 122 | w_fm = self.w_fm 123 | 124 | cdef double raw_y = 0. 125 | cdef unsigned int i 126 | cdef double sign 127 | cdef unsigned int len_x 128 | cdef unsigned int k 129 | 130 | # calculate the bias contribution 131 | for i in [0]: 132 | # no regularization for bias 133 | self.w[i] = (- self.z[i]) / ((self.beta + sqrt(self.n[i])) / self.alpha) 134 | 135 | raw_y += self.w[i] 136 | 137 | # calculate the first order contribution. 138 | for i in x: 139 | sign = -1. if self.z[i] < 0. else 1. # get sign of z[i] 140 | 141 | if sign * self.z[i] <= self.L1: 142 | self.w[i] = 0. 
143 | else: 144 | self.w[i] = (sign * self.L1 - self.z[i]) / ((self.beta + sqrt(n[i])) / self.alpha + self.L2) 145 | 146 | raw_y += self.w[i] 147 | 148 | 149 | len_x = len(x) 150 | # calculate factorization machine contribution. 151 | for i in x: 152 | self.init_fm(i) 153 | for k in range(self.fm_dim): 154 | sign = -1. if self.z_fm[i][k] < 0. else 1. # get the sign of z_fm[i][k] 155 | 156 | if sign * self.z_fm[i][k] <= self.L1_fm: 157 | self.w_fm[i][k] = 0. 158 | else: 159 | self.w_fm[i][k] = (sign * self.L1_fm - self.z_fm[i][k]) / ((self.beta_fm + sqrt(self.n_fm[i][k])) / self.alpha_fm + self.L2_fm) 160 | 161 | for i in range(len_x): 162 | for j in range(i + 1, len_x): 163 | for k in range(self.fm_dim): 164 | raw_y += w_fm[x[i]][k] * w_fm[x[j]][k] 165 | 166 | return raw_y 167 | 168 | def predict_one(self, list x): 169 | ''' predict_one the logit 170 | ''' 171 | return 1. / (1. + exp(- max(min(self.predict_raw(x), 35.), -35.))) 172 | 173 | def dropout(self, list x): 174 | ''' dropout variables in list x 175 | ''' 176 | cdef unsigned int i 177 | cdef double var 178 | for i, var in enumerate(x): 179 | if random.random() > self.dropoutRate: 180 | del x[i] 181 | 182 | def dropoutThenPredict(self, list x): 183 | ''' first dropout some variables and then predict_one the logit using the dropped out data. 184 | ''' 185 | self.dropout(x) 186 | return self.predict_one(x) 187 | 188 | def predictWithDroppedOutModel(self, list x): 189 | ''' predict_one using all data, using a model trained with dropout. 190 | ''' 191 | return 1. / (1. 
+ exp(- max(min(self.predict_raw(x) * self.dropoutRate, 35.), -35.))) 192 | 193 | def update(self, list x, double p, double y): 194 | ''' Update the parameters using FTRL (Follow the Regularized Leader) 195 | ''' 196 | # alpha = self.alpha 197 | # alpha_fm = self.alpha_fm 198 | 199 | # # model 200 | # n = self.n 201 | # z = self.z 202 | # w = self.w 203 | 204 | # # FM model 205 | # n_fm = self.n_fm 206 | # z_fm = self.z_fm 207 | # w_fm = self.w_fm 208 | 209 | cdef double g 210 | # cost gradient with respect to raw prediction. 211 | g = p - y 212 | 213 | cdef int len_x 214 | cdef int i 215 | cdef int j 216 | cdef int k 217 | cdef double sigma 218 | cdef dict fm_sum 219 | # cdef np.ndarray fm_sum 220 | 221 | fm_sum = {} # sums for calculating gradients for FM. 222 | # fm_sum = np.zeros(len(x + [0])) 223 | # fm_sum = np.expand_dims(fm_sum,1) 224 | len_x = len(x) 225 | # with nogil, parallel(): 226 | for i in x + [0]: 227 | # update the first order weights. 228 | sigma = (sqrt(self.n[i] + g * g) - sqrt(self.n[i])) / self.alpha 229 | self.z[i] += g - sigma * self.w[i] 230 | self.n[i] += g * g 231 | 232 | # initialize the sum of the FM interaction weights. 233 | fm_sum[i] = np.zeros(self.fm_dim) 234 | 235 | # sum the gradients for FM interaction weights. 236 | for i in range(len_x): 237 | for j in range(len_x): 238 | if i != j: 239 | for k in range(self.fm_dim): 240 | fm_sum[x[i]][k] += self.w_fm[x[j]][k] 241 | 242 | for i in x: 243 | for k in range(self.fm_dim): 244 | g_fm = g * fm_sum[i][k] 245 | sigma = (sqrt(self.n_fm[i][k] + g_fm * g_fm) - sqrt(self.n_fm[i][k])) / self.alpha_fm 246 | self.z_fm[i][k] += g_fm - sigma * self.w_fm[i][k] 247 | self.n_fm[i][k] += g_fm * g_fm 248 | 249 | def write_w(self, filePath): 250 | ''' write out the first order weights w to a file. 
251 | ''' 252 | with open(filePath, "w") as f_out: 253 | for i, w in enumerate(self.w): 254 | f_out.write("%i,%f\n" % (i, w)) 255 | 256 | def write_w_fm(self, filePath): 257 | ''' write out the factorization machine weights to a file. 258 | ''' 259 | with open(filePath, "w") as f_out: 260 | for k, w_fm in self.w_fm.iteritems(): 261 | f_out.write("%i,%s\n" % (k, ",".join([str(w) for w in w_fm]))) 262 | 263 | 264 | def predict(self,testingFile,hashSalt='salt'): 265 | start = datetime.now() 266 | # initialize a FM learner 267 | learner = self 268 | cdef int e 269 | cdef double cvLoss = 0. 270 | cdef double cvCount = 0. 271 | cdef double progressiveLoss = 0. 272 | cdef double progressiveCount = 0. 273 | cdef list x 274 | cdef double y 275 | cdef unsigned int t 276 | cdef double p 277 | cdef double loss 278 | cdef list y_preds = [] 279 | for t, ID, x, y in data(testingFile, self.D, hashSalt,loop=False): 280 | p = learner.predict_one(x) 281 | y_preds.append(p) 282 | return y_preds 283 | 284 | 285 | def evaluate(self,validationFile,eval_metric,hashSalt='salt'): 286 | start = datetime.now() 287 | # initialize a FM learner 288 | learner = self 289 | cdef int e 290 | cdef double cvLoss = 0. 291 | cdef double cvCount = 0. 292 | cdef double progressiveLoss = 0. 293 | cdef double progressiveCount = 0. 294 | cdef list x 295 | cdef double y 296 | cdef unsigned int t 297 | cdef double p 298 | cdef double loss 299 | cdef list y_preds = [] 300 | cdef list y_test = [] 301 | for t, ID, x, y in data(validationFile, self.D, hashSalt,loop=False): 302 | p = learner.predict_one(x) 303 | y_preds.append(p) 304 | y_test.append(y) 305 | score = eval_metric(y_preds,y_preds) 306 | return score 307 | 308 | def fit(self,trainingFile,hashSalt='salt',n_epochs=5,reportFrequency=10000,validationFile=None,eval_metric=None): 309 | start = datetime.now() 310 | # initialize a FM learner 311 | learner = self 312 | cdef int e 313 | cdef double cvLoss = 0. 314 | cdef double cvCount = 0. 
315 | cdef double progressiveLoss = 0. 316 | cdef double progressiveCount = 0. 317 | cdef list x 318 | cdef double y 319 | cdef unsigned int t 320 | cdef double p 321 | cdef double loss 322 | print("Start Training:") 323 | for e in range(n_epochs): 324 | 325 | # if it is the first epoch, then don't use L1_fm or L2_fm 326 | if e == 0: 327 | learner.L1_fm = 0. 328 | learner.L2_fm = 0. 329 | else: 330 | learner.L1_fm = learner.L1_fm_tmp 331 | learner.L2_fm = learner.L1_fm_tmp 332 | 333 | 334 | for t, ID, x, y in data(trainingFile, self.D, hashSalt,loop=True): 335 | p = learner.predict_one(x) 336 | loss = logLoss(p, y) 337 | learner.update(x, p, y) 338 | progressiveLoss += loss 339 | progressiveCount += 1. 340 | if t % reportFrequency == 0: 341 | print("Epoch %d\tcount: %d\tProgressive Loss: %f" % (e, t, progressiveLoss / progressiveCount)) 342 | if validationFile!=None and eval_metric!=None: 343 | eval_score = self.evaluate(validationFile,eval_metric) 344 | print("Epoch %d\tcount: %d\tEvaludation score: %f" % (e, t, eval_score)) 345 | 346 | print("Epoch %d finished.\tvalidation loss: %f\telapsed time: %s" % (e, cvLoss / cvCount, str(datetime.now() - start))) 347 | if validationFile!=None and eval_metric!=None: 348 | eval_score = self.evaluate(validationFile,eval_metric) 349 | print("Epoch %d\finished: %d\tEvaludation score: %f" % (e, t, eval_score)) 350 | 351 | 352 | def logLoss(double p, double y): 353 | ''' 354 | calculate the log loss cost 355 | p: prediction [0, 1] 356 | y: actual value {0, 1} 357 | ''' 358 | p = max(min(p, 1. - 1e-15), 1e-15) 359 | return - log(p) if y == 1. else -log(1. 
def data(filePath, hashSize, hashSalt, loop=False):
    ''' generator for data using the hashing trick.

    The loop / no-loop variants previously duplicated the whole loop body;
    they are merged here into a single loop with a terminating check.

    INPUT:
        filePath: open csv file (or any iterable of csv lines) with a header.
        hashSize: size of the feature hash space.
        hashSalt: String with which to salt the hash function.
        loop: if True, cycle over the input forever.

    YIELDS:
        t (int): 0-based row counter within the current pass.
        ID: value of the activity_id column.
        x (list of int): hashed feature indices (0 is reserved for the bias).
        y (double): label from the outcome column (0. if absent).

    NOTE(review): if filePath is a file object, a second pass with loop=True
    yields nothing because the file is exhausted -- confirm callers reopen
    or seek before relying on loop=True.
    '''
    cdef unsigned int t
    cdef double y
    cdef list x
    cdef str value
    cdef unsigned int index
    cdef dict row

    while True:
        for t, row in enumerate(DictReader(filePath)):
            ID = row['activity_id']
            del row['activity_id']
            del row['outcome_isnull']

            y = 0.
            if 'outcome' in row:
                if row['outcome'] == '1':
                    y = 1.
                del row['outcome']

            x = []
            for key in row:
                value = row[key]

                # 1 is added to the hash index because 0 indicates the bias term.
                index = abs(hash(hashSalt + key + '_' + value)) % hashSize + 1
                x.append(index)

            yield t, ID, x, y

        if not loop:
            break
52 | 53 | Args: 54 | n (int): number of input units 55 | epoch (int): number of epochs 56 | h (int): number of the hidden units 57 | a (double): initial learning rate 58 | l2 (double): L2 regularization parameter 59 | seed (unsigned int): random seed 60 | """ 61 | 62 | cdef int i 63 | 64 | rng = np.random.RandomState(seed) 65 | 66 | self.epoch = epoch 67 | self.n = n 68 | self.h = h 69 | 70 | self.a = a 71 | self.l2 = l2 72 | 73 | self.w1 = (rng.rand(self.h + 1) - .5) * 1e-6 74 | self.w0 = (rng.rand((self.n + 1) * self.h) - .5) * 1e-6 75 | 76 | # hidden units in the hidden layer 77 | self.z = np.zeros((self.h,), dtype=np.float64) 78 | 79 | # counters for biases and inputs 80 | self.c = 0. 81 | self.c1 = np.zeros((self.h,), dtype=np.float64) 82 | self.c0 = np.zeros((self.n,), dtype=np.float64) 83 | 84 | def __repr__(self): 85 | return ('NN(n={}, epoch={}, h={}, a={}, l2={})').format( 86 | self.n, self.epoch, self.h, self.a, self.l2 87 | ) 88 | 89 | def read_sparse(self, path): 90 | """Read a libsvm format sparse file line by line. 91 | 92 | Args: 93 | path (str): a file path to the libsvm format sparse file 94 | 95 | Yields: 96 | idx (list of int): a list of index of non-zero features 97 | val (list of double): a list of values of non-zero features 98 | y (int): target value 99 | """ 100 | for line in open(path): 101 | xs = line.rstrip().split(' ') 102 | 103 | y = int(xs[0]) 104 | idx = [] 105 | val = [] 106 | for item in xs[1:]: 107 | i, v = item.split(':') 108 | idx.append(int(i) % self.n) 109 | val.append(float(v)) 110 | 111 | yield zip(idx, val), y 112 | 113 | def fit(self, X, y): 114 | """Update the model with a sparse input feature matrix and its targets. 
    def update_one(self, list x, double e):
        """Update the model with one observation.

        Performs one SGD step with per-weight adaptive learning rates: each
        weight's step is scaled by a / (sqrt(counter) + 1), where the counters
        accumulate squared gradients (see the updates of c0/c1/c at the end).

        NOTE(review): like NN_H2.update_one, this assumes predict_one() was
        called immediately before with the same x, so that self.z still holds
        the current hidden activations (fit() calls them in that order) --
        confirm for any other caller.

        Args:
            x (list of tuple): a list of (index, value) of non-zero features
            e (double): error between the prediction of the model and target

        Returns:
            updated model weights and counts
        """
        cdef int j
        cdef int i
        cdef double dl_dy
        cdef double dl_dz
        cdef double dl_dw1
        cdef double dl_dw0
        cdef double v

        dl_dy = e       # dl/dy * (initial learning rate)

        # starting with the bias in the hidden layer
        self.w1[self.h] -= (dl_dy + self.l2 * self.w1[self.h]) * self.a / (sqrt(self.c) + 1)
        for j in range(self.h):
            # update weights related to non-zero hidden units only: a ReLU
            # unit at zero has zero gradient for everything below it
            if self.z[j] == 0.:
                continue

            # update weights between the hidden units and output
            # dl/dw1 = dl/dy * dy/dw1 = dl/dy * z
            dl_dw1 = dl_dy * self.z[j]
            self.w1[j] -= (dl_dw1 + self.l2 * self.w1[j]) * self.a / (sqrt(self.c1[j]) + 1)

            # starting with the bias in the input layer
            # dl/dz = dl/dy * dy/dz = dl/dy * w1
            dl_dz = dl_dy * self.w1[j]
            self.w0[self.n * self.h + j] -= (dl_dz +
                self.l2 * self.w0[self.n * self.h + j]) * self.a / (sqrt(self.c1[j]) + 1)
            # update weights related to non-zero input units
            for i, v in x:
                # update weights between the hidden unit j and input i
                # dl/dw0 = dl/dz * dz/dw0 = dl/dz * v
                dl_dw0 = dl_dz * v
                self.w0[i * self.h + j] -= (dl_dw0 +
                    self.l2 * self.w0[i * self.h + j]) * self.a / (sqrt(self.c0[i]) + 1)

                # update counter for the input i
                self.c0[i] += dl_dw0 * dl_dw0

            # update counter for the hidden unit j
            self.c1[j] += dl_dw1 * dl_dw1

        # update overall counter
        self.c += dl_dy * dl_dy
cython: wraparound=False 3 | # cython: cdivision=True 4 | from __future__ import division 5 | import numpy as np 6 | 7 | cimport cython 8 | from libc.math cimport sqrt, abs 9 | from ..util cimport sigm 10 | cimport numpy as np 11 | 12 | 13 | np.import_array() 14 | 15 | 16 | cdef class NN_H2: 17 | """Neural Network with 2 ReLU hidden layers online learner. 18 | 19 | Attributes: 20 | n (int): number of input units 21 | epoch (int): number of epochs 22 | h1 (int): number of the 1st level hidden units 23 | h2 (int): number of the 2nd level hidden units 24 | a (double): initial learning rate 25 | l2 (double): L2 regularization parameter 26 | w0 (array of double): weights between the input and 1st hidden layers 27 | w1 (array of double): weights between the 1st and 2nd hidden layers 28 | w2 (array of double): weights between the 2nd hidden and output layers 29 | z1 (array of double): 1st level hidden units 30 | z2 (array of double): 2nd level hidden units 31 | c (double): counter 32 | c1 (array of double): counters for 1st level hidden units 33 | c2 (array of double): counters for 2nd level hidden units 34 | """ 35 | 36 | cdef unsigned int n # number of input units 37 | cdef unsigned int h1 # number of the 1st level hidden units 38 | cdef unsigned int h2 # number of the 2nd level hidden units 39 | cdef double a # learning rate 40 | cdef double l2 # L2 regularization parameter 41 | cdef double[:] w0 # weights between the input and 1st hidden layers 42 | cdef double[:] w1 # weights between the 1st and 2nd hidden layers 43 | cdef double[:] w2 # weights between the 2nd hidden and output layers 44 | cdef double[:] z1 # 1st level hidden units 45 | cdef double[:] z2 # 2nd level hidden units 46 | cdef double c # counter 47 | cdef double[:] c0 # counters for input units 48 | cdef double[:] c1 # counters for 1st level hidden units 49 | cdef double[:] c2 # counters for 2nd level hidden units 50 | 51 | def __init__(self, 52 | unsigned int n, 53 | unsigned int epoch=10, 54 | unsigned 
int h1=128, 55 | unsigned int h2=256, 56 | double a=0.01, 57 | double l2=0., 58 | unsigned int seed=0): 59 | """Initialize the NN class object. 60 | 61 | Args: 62 | n (int): number of input units 63 | epoch (int): number of epochs 64 | h1 (int): number of the 1st level hidden units 65 | h2 (int): number of the 2nd level hidden units 66 | a (double): initial learning rate 67 | l2 (double): L2 regularization parameter 68 | seed (unsigned int): random seed 69 | """ 70 | 71 | cdef int i 72 | 73 | rng = np.random.RandomState(seed) 74 | 75 | self.n = n 76 | self.epoch = epoch 77 | self.h1 = h1 78 | self.h2 = h2 79 | 80 | self.a = a 81 | self.l2 = l2 82 | 83 | # weights between the output and 2nd hidden layer 84 | self.w2 = (rng.rand(self.h2 + 1) - .5) * 1e-7 85 | 86 | # weights between the 2nd hidden layer and 1st hidden layer 87 | self.w1 = (rng.rand((self.h1 + 1) * self.h2) - .5) * 1e-7 88 | 89 | # weights between the 1st hidden layer and inputs 90 | self.w0 = (rng.rand((self.n + 1) * self.h1) - .5) * 1e-7 91 | 92 | # hidden units in the 2nd hidden layer 93 | self.z2 = np.zeros((self.h2,), dtype=np.float64) 94 | 95 | # hidden units in the 1st hidden layer 96 | self.z1 = np.zeros((self.h1,), dtype=np.float64) 97 | 98 | # counters for the hidden units and inputs 99 | self.c = 0. 100 | self.c2 = np.zeros((self.h2,), dtype=np.float64) 101 | self.c1 = np.zeros((self.h1,), dtype=np.float64) 102 | self.c0 = np.zeros((self.n,), dtype=np.float64) 103 | 104 | def __repr__(self): 105 | return ('NN_H2(n={}, epoch={}, h1={}, h2={}, a={}, l2={})').format( 106 | self.n, self.epoch, self.h1, self.h2, self.a, self.l2 107 | ) 108 | 109 | def read_sparse(self, path): 110 | """Read the libsvm format sparse file line by line. 
111 | 112 | Args: 113 | path (str): a file path to the libsvm format sparse file 114 | 115 | Yields: 116 | idx (list of int): a list of index of non-zero features 117 | val (list of double): a list of values of non-zero features 118 | y (int): target value 119 | """ 120 | for line in open(path): 121 | xs = line.rstrip().split(' ') 122 | 123 | y = int(xs[0]) 124 | idx = [] 125 | val = [] 126 | for item in xs[1:]: 127 | i, v = item.split(':') 128 | idx.append(abs(hash(i)) % self.n) 129 | val.append(float(v)) 130 | 131 | yield zip(idx, val), y 132 | 133 | def fit(self, X, y): 134 | """Update the model with a sparse input feature matrix and its targets. 135 | 136 | Args: 137 | X (scipy.sparse.csr_matrix): a list of (index, value) of non-zero features 138 | y (numpy.array): targets 139 | 140 | Returns: 141 | updated model weights and counts 142 | """ 143 | for epoch in range(self.epoch): 144 | for row in range(X.shape[0]): 145 | x = zip(X[row].indices, X[row].data) 146 | self.update_one(x, self.predict_one(x) - y[row]) 147 | 148 | def predict(self, X): 149 | """Predict for a sparse matrix X. 150 | 151 | Args: 152 | X (scipy.sparse.csr_matrix): a sparse matrix for input features 153 | 154 | Returns: 155 | p (numpy.array): predictions for input features 156 | """ 157 | 158 | p = np.zeros((X.shape[0], ), dtype=np.float64) 159 | for row in range(X.shape[0]): 160 | p[row] = self.predict_one(zip(X[row].indices, X[row].data)) 161 | 162 | return p 163 | 164 | def predict_one(self, list x): 165 | """Predict for features. 
166 | 167 | Args: 168 | x (list of tuple): a list of (index, value) of non-zero features 169 | 170 | Returns: 171 | p (double): a prediction for input features 172 | """ 173 | cdef double p 174 | cdef int k 175 | cdef int j 176 | cdef int i 177 | cdef double v 178 | 179 | # starting from the bias in the 2nd hidden layer 180 | p = self.w2[self.h2] 181 | 182 | # calculating and adding values of 2nd level hidden units 183 | for k in range(self.h2): 184 | # staring with the bias in the 1st hidden layer 185 | self.z2[k] = self.w1[self.h1 * self.h2 + k] 186 | 187 | # calculating and adding values of 1st level hidden units 188 | for j in range(self.h1): 189 | # starting with the bias in the input layer 190 | self.z1[j] = self.w0[self.n * self.h1 + j] 191 | 192 | # calculating and adding values of input units 193 | for i, v in x: 194 | self.z1[j] += self.w0[i * self.h1 + j] * v 195 | 196 | # apply the ReLU activation function to the first level hidden unit 197 | self.z1[j] = self.z1[j] if self.z1[j] > 0. else 0. 198 | 199 | self.z2[k] += self.w1[j * self.h2 + k] * self.z1[j] 200 | 201 | # apply the ReLU activation function to the 2nd level hidden unit 202 | self.z2[k] = self.z2[k] if self.z2[k] > 0. else 0. 203 | 204 | p += self.w2[k] * self.z2[k] 205 | 206 | # apply the sigmoid activation function to the output unit 207 | return sigm(p) 208 | 209 | def update_one(self, list x, double e): 210 | """Update the model. 211 | 212 | Args: 213 | x (list of tuple): a list of (index, value) of non-zero features 214 | e (double): error between the prediction of the model and target 215 | 216 | Returns: 217 | updated model weights and counts 218 | """ 219 | cdef int k 220 | cdef int j 221 | cdef int i 222 | cdef double dl_dy 223 | cdef double dl_dz1 224 | cdef double dl_dz2 225 | cdef double dl_dw0 226 | cdef double dl_dw1 227 | cdef double dl_dw2 228 | cdef double v 229 | 230 | # XXX: assuming predict() was called right before with the same idx and 231 | # val inputs. 
Otherwise self.z will be incorrect for updates. 232 | dl_dy = e # dl/dy * (initial learning rate) 233 | 234 | # starting with the bias in the 2nd hidden layer 235 | self.w2[self.h2] -= (dl_dy + self.l2 * self.w2[self.h2]) * self.a / (sqrt(self.c) + 1) 236 | for k in range(self.h2): 237 | # update weights related to non-zero 2nd level hidden units 238 | if self.z2[k] == 0.: 239 | continue 240 | 241 | # update weights between the 2nd hidden units and output 242 | # dl/dw2 = dl/dy * dy/dw2 = dl/dy * z2 243 | dl_dw2 = dl_dy * self.z2[k] 244 | self.w2[k] -= (dl_dw2 + self.l2 * self.w2[k]) * self.a / (sqrt(self.c2[k]) + 1) 245 | 246 | # starting with the bias in the 1st hidden layer 247 | # dl/dz2 = dl/dy * dy/dz2 = dl/dy * w2 248 | dl_dz2 = dl_dy * self.w2[k] 249 | self.w1[self.h1 * self.h2 + k] -= (dl_dz2 + 250 | self.l2 * self.w1[self.h1 * self.h2 + k]) * self.a / (sqrt(self.c2[k]) + 1) 251 | for j in range(self.h1): 252 | # update weights realted to non-zero hidden units 253 | if self.z1[j] == 0.: 254 | continue 255 | 256 | # update weights between the hidden units and output 257 | # dl/dw1 = dl/dz2 * dz2/dw1 = dl/dz2 * z1 258 | dl_dw1 = dl_dz2 * self.z1[j] 259 | self.w1[j * self.h2 + k] -= (dl_dw1 + self.l2 * self.w1[j]) * self.a / (sqrt(self.c1[j]) + 1) 260 | 261 | # starting with the bias in the input layer 262 | # dl/dz1 = dl/dz2 * dz2/dz1 = dl/dz2 * w1 263 | dl_dz1 = dl_dz2 * self.w1[j * self.h2 + k] 264 | self.w0[self.n * self.h1 + j] -= (dl_dz1 + 265 | self.l2 * self.w0[self.n * self.h1 + j]) * self.a / (sqrt(self.c1[j]) + 1) 266 | # update weights related to non-zero input units 267 | for i, v in x: 268 | # update weights between the hidden unit j and input i 269 | # dl/dw0 = dl/dz1 * dz/dw0 = dl/dz1 * v 270 | dl_dw0 = dl_dz1 * v 271 | self.w0[i * self.h1 + j] -= (dl_dw0 + 272 | self.l2 * self.w0[i * self.h1 + j]) * self.a / (sqrt(self.c0[i]) + 1) 273 | 274 | # update counter for the input i 275 | self.c0[i] += dl_dw0 * dl_dw0 276 | 277 | # update counter for 
the 1st level hidden unit j 278 | self.c1[j] += dl_dw1 * dl_dw1 279 | 280 | # update counter for the 2nd level hidden unit k 281 | self.c2[k] += dl_dw2 * dl_dw2 282 | 283 | # update overall counter 284 | self.c += dl_dy * dl_dy 285 | -------------------------------------------------------------------------------- /kaggler/online_model/sgd.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False 2 | # cython: wraparound=False 3 | # cython: cdivision=True 4 | from __future__ import division 5 | import numpy as np 6 | 7 | cimport cython 8 | from libc.math cimport sqrt, abs 9 | from ..util cimport sigm 10 | cimport numpy as np 11 | 12 | 13 | np.import_array() 14 | 15 | 16 | cdef class SGD: 17 | """Simple online learner using a hasing trick. 18 | 19 | Attributes: 20 | epoch (int): number of epochs 21 | n (int): number of features after hashing trick 22 | a (double): initial learning rate 23 | l1 (double): L1 regularization parameter 24 | l2 (double): L2 regularization parameter 25 | w (array of double): feature weights 26 | c (array of double): counters for weights 27 | interaction (boolean): whether to use 2nd order interaction or not 28 | """ 29 | cdef unsigned int epoch 30 | cdef unsigned int n 31 | cdef double a 32 | cdef double l1 33 | cdef double l2 34 | cdef double[:] w 35 | cdef double[:] c 36 | cdef bint interaction 37 | 38 | def __init__(self, 39 | double a=0.01, 40 | double l1=0.0, 41 | double l2=0.0, 42 | unsigned int n=2**20, 43 | unsigned int epoch=10, 44 | bint interaction=True): 45 | """Initialize the SGD class object. 
46 | 47 | Args: 48 | epoch (int): number of epochs 49 | n (int): number of features after hashing trick 50 | a (double): initial learning rate 51 | l1 (double): L1 regularization parameter 52 | l2 (double): L2 regularization parameter 53 | w (array of double): feature weights 54 | c (array of double): counters for weights 55 | interaction (boolean): whether to use 2nd order interaction or not 56 | """ 57 | 58 | self.epoch = epoch 59 | self.n = n # # of features 60 | self.a = a # learning rate 61 | self.l1 = l1 62 | self.l2 = l2 63 | 64 | # initialize weights and counts 65 | self.w = np.zeros((self.n + 1,), dtype=np.float64) 66 | self.c = np.zeros((self.n + 1,), dtype=np.float64) 67 | self.interaction = interaction 68 | 69 | def __repr__(self): 70 | return ('SGD(a={}, l1={}, l2={}, n={}, epoch={}, interaction={})').format( 71 | self.a, self.l1, self.l2, self.n, self.epoch, self.interaction 72 | ) 73 | 74 | def _indices(self, list x): 75 | cdef unsigned int index 76 | cdef int l 77 | cdef int i 78 | cdef int j 79 | 80 | yield self.n 81 | 82 | for index in x: 83 | yield abs(hash(index)) % self.n 84 | 85 | if self.interaction: 86 | l = len(x) 87 | x = sorted(x) 88 | for i in xrange(l): 89 | for j in xrange(i + 1, l): 90 | yield abs(hash('{}_{}'.format(x[i], x[j]))) % self.n 91 | 92 | def read_sparse(self, path): 93 | """Apply hashing trick to the libsvm format sparse file. 94 | 95 | Args: 96 | path (str): a file path to the libsvm format sparse file 97 | 98 | Yields: 99 | x (list of int): a list of index of non-zero features 100 | y (int): target value 101 | """ 102 | for line in open(path): 103 | xs = line.rstrip().split(' ') 104 | 105 | y = int(xs[0]) 106 | x = [] 107 | for item in xs[1:]: 108 | index, _ = item.split(':') 109 | x.append(abs(hash(index)) % self.n) 110 | 111 | yield x, y 112 | 113 | def fit(self, X, y): 114 | """Update the model with a sparse input feature matrix and its targets. 
115 | 116 | Args: 117 | X (scipy.sparse.csr_matrix): a list of (index, value) of non-zero features 118 | y (numpy.array): targets 119 | 120 | Returns: 121 | updated model weights and counts 122 | """ 123 | for epoch in range(self.epoch): 124 | for row in range(X.shape[0]): 125 | x = list(X[row].indices) 126 | self.update_one(x, self.predict_one(x) - y[row]) 127 | 128 | def predict(self, X): 129 | """Predict for a sparse matrix X. 130 | 131 | Args: 132 | X (scipy.sparse.csr_matrix): a sparse matrix for input features 133 | 134 | Returns: 135 | p (numpy.array): predictions for input features 136 | """ 137 | p = np.zeros((X.shape[0], ), dtype=np.float64) 138 | for row in range(X.shape[0]): 139 | p[row] = self.predict_one(list(X[row].indices)) 140 | 141 | return p 142 | 143 | def predict_one(self, list x): 144 | """Predict for features. 145 | 146 | Args: 147 | x (list of int): a list of index of non-zero features 148 | 149 | Returns: 150 | p (double): a prediction for input features 151 | """ 152 | cdef int i 153 | cdef double wTx 154 | 155 | wTx = 0. 156 | for i in self._indices(x): 157 | wTx += self.w[i] 158 | 159 | return sigm(wTx) 160 | 161 | def update_one(self, list x, double e): 162 | """Update the model. 163 | 164 | Args: 165 | x (list of int): a list of index of non-zero features 166 | e (double): error between the prediction of the model and target 167 | 168 | Returns: 169 | updates model weights and counts 170 | """ 171 | cdef int i 172 | cdef double g2 173 | 174 | g2 = e * e 175 | for i in self._indices(x): 176 | self.w[i] -= (e + 177 | (self.l1 if self.w[i] >= 0. 
else -self.l1) + 178 | self.l2 * self.w[i]) * self.a / (sqrt(self.c[i]) + 1) 179 | self.c[i] += g2 180 | -------------------------------------------------------------------------------- /kaggler/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from .data import OneHotEncoder 2 | from .data import LabelEncoder 3 | from .data import Normalizer 4 | -------------------------------------------------------------------------------- /kaggler/preprocessing/data.py: -------------------------------------------------------------------------------- 1 | from scipy import sparse 2 | from scipy.stats import norm 3 | from statsmodels.distributions.empirical_distribution import ECDF 4 | import logging 5 | import numpy as np 6 | import pandas as pd 7 | 8 | 9 | NAN_INT = 7535805 10 | 11 | 12 | class Normalizer(object): 13 | """Normalizer that transforms numerical columns into normal distribution. 14 | 15 | Attributes: 16 | ecdfs (list of empirical CDF): empirical CDFs for columns 17 | """ 18 | 19 | def fit(self, X, y=None): 20 | self.ecdfs = [None] * X.shape[1] 21 | 22 | for col in range(X.shape[1]): 23 | self.ecdfs[col] = ECDF(X[:, col]) 24 | 25 | def transform(self, X): 26 | """Normalize numerical columns. 27 | 28 | Args: 29 | X (numpy.array) : numerical columns to normalize 30 | 31 | Returns: 32 | X (numpy.array): normalized numerical columns 33 | """ 34 | 35 | for col in range(X.shape[1]): 36 | X[:, col] = self._transform_col(X[:, col], col) 37 | 38 | return X 39 | 40 | def fit_transform(self, X, y=None): 41 | """Normalize numerical columns. 
42 | 43 | Args: 44 | X (numpy.array) : numerical columns to normalize 45 | 46 | Returns: 47 | X (numpy.array): normalized numerical columns 48 | """ 49 | 50 | self.ecdfs = [None] * X.shape[1] 51 | 52 | for col in range(X.shape[1]): 53 | self.ecdfs[col] = ECDF(X[:, col]) 54 | X[:, col] = self._transform_col(X[:, col], col) 55 | 56 | return X 57 | 58 | def _transform_col(self, x, col): 59 | """Normalize one numerical column. 60 | 61 | Args: 62 | x (numpy.array): a numerical column to normalize 63 | col (int): column index 64 | 65 | Returns: 66 | A normalized feature vector. 67 | """ 68 | 69 | return norm.ppf(self.ecdfs[col](x) * .998 + .001) 70 | 71 | 72 | class LabelEncoder(object): 73 | """Label Encoder that groups infrequent values into one label. 74 | 75 | Attributes: 76 | min_obs (int): minimum number of observation to assign a label. 77 | label_encoders (list of dict): label encoders for columns 78 | label_maxes (list of int): maximum of labels for columns 79 | """ 80 | 81 | def __init__(self, min_obs=10): 82 | """Initialize the OneHotEncoder class object. 83 | 84 | Args: 85 | min_obs (int): minimum number of observation to assign a label. 86 | """ 87 | 88 | self.min_obs = min_obs 89 | 90 | def __repr__(self): 91 | return ('LabelEncoder(min_obs={})').format(self.min_obs) 92 | 93 | def _get_label_encoder_and_max(self, x): 94 | """Return a mapping from values and its maximum of a column to integer labels. 95 | 96 | Args: 97 | x (numpy.array): a categorical column to encode. 98 | 99 | Returns: 100 | label_encoder (dict): mapping from values of features to integers 101 | max_label (int): maximum label 102 | """ 103 | 104 | # NaN cannot be used as a key for dict. So replace it with a random integer. 
105 | x[pd.isnull(x)] = NAN_INT 106 | 107 | # count each unique value 108 | label_count = {} 109 | for label in x: 110 | try: 111 | label_count[label] += 1 112 | except KeyError: 113 | label_count[label] = 1 114 | 115 | # add unique values appearing more than min_obs to the encoder. 116 | label_encoder = {} 117 | label_index = 1 118 | labels_not_encoded = 0 119 | for label in label_count.keys(): 120 | if label_count[label] >= self.min_obs: 121 | label_encoder[label] = label_index 122 | label_index += 1 123 | else: 124 | labels_not_encoded += 1 125 | 126 | max_label = label_index - 1 127 | 128 | # if every label is encoded, then replace the maximum label with 0 so 129 | # that total number of labels encoded is (# of total labels - 1). 130 | if labels_not_encoded == 0: 131 | for label in label_encoder: 132 | # find the label with the maximum encoded value 133 | if label_encoder[label] == max_label: 134 | # set the value of the label to 0 and decrease the maximum 135 | # by 1. 136 | label_encoder[label] = 0 137 | max_label -= 1 138 | break 139 | 140 | return label_encoder, max_label 141 | 142 | def _transform_col(self, x, col): 143 | """Encode one categorical column into labels. 144 | 145 | Args: 146 | x (numpy.array): a categorical column to encode 147 | col (int): column index 148 | 149 | Returns: 150 | x (numpy.array): a column with labels. 
151 | """ 152 | 153 | label_encoder = self.label_encoders[col] 154 | 155 | # replace NaNs with the pre-defined random integer 156 | x[pd.isnull(x)] = NAN_INT 157 | 158 | labels = np.zeros((x.shape[0], )) 159 | for label in label_encoder: 160 | labels[x == label] = label_encoder[label] 161 | 162 | return labels 163 | 164 | def fit(self, X, y=None): 165 | self.label_encoders = [None] * X.shape[1] 166 | self.label_maxes = [None] * X.shape[1] 167 | 168 | for col in range(X.shape[1]): 169 | self.label_encoders[col], self.label_maxes[col] = \ 170 | self._get_label_encoder_and_max(X[:, col]) 171 | 172 | return self 173 | 174 | def transform(self, X): 175 | """Encode categorical columns into sparse matrix with one-hot-encoding. 176 | 177 | Args: 178 | X (numpy.array): categorical columns to encode 179 | 180 | Returns: 181 | X (numpy.array): label encoded columns 182 | """ 183 | 184 | for col in range(X.shape[1]): 185 | X[:, col] = self._transform_col(X[:, col], col) 186 | 187 | return X 188 | 189 | def fit_transform(self, X, y=None): 190 | """Encode categorical columns into label encoded columns 191 | 192 | Args: 193 | X (numpy.array): categorical columns to encode 194 | 195 | Returns: 196 | X (numpy.array): label encoded columns 197 | """ 198 | 199 | self.label_encoders = [None] * X.shape[1] 200 | self.label_maxes = [None] * X.shape[1] 201 | 202 | for col in range(X.shape[1]): 203 | self.label_encoders[col], self.label_maxes[col] = \ 204 | self._get_label_encoder_and_max(X[:, col]) 205 | 206 | X[:, col] = self._transform_col(X[:, col], col) 207 | 208 | return X 209 | 210 | 211 | class OneHotEncoder(object): 212 | """One-Hot-Encoder that groups infrequent values into one dummy variable. 
213 | 214 | Attributes: 215 | min_obs (int): minimum number of observation to create a dummy variable 216 | label_encoders (list of (dict, int)): label encoders and their maximums 217 | for columns 218 | """ 219 | 220 | def __init__(self, min_obs=10): 221 | """Initialize the OneHotEncoder class object. 222 | 223 | Args: 224 | min_obs (int): minimum number of observation to create a dummy variable 225 | label_encoder (LabelEncoder): LabelEncoder that transofrm 226 | """ 227 | 228 | self.min_obs = min_obs 229 | self.label_encoder = LabelEncoder(min_obs) 230 | 231 | def __repr__(self): 232 | return ('OneHotEncoder(min_obs={})').format(self.min_obs) 233 | 234 | def _transform_col(self, x, col): 235 | """Encode one categorical column into sparse matrix with one-hot-encoding. 236 | 237 | Args: 238 | x (numpy.array): a categorical column to encode 239 | col (int): column index 240 | 241 | Returns: 242 | X (scipy.sparse.coo_matrix): sparse matrix encoding a categorical 243 | variable into dummy variables 244 | """ 245 | 246 | labels = self.label_encoder._transform_col(x, col) 247 | label_max = self.label_encoder.label_maxes[col] 248 | 249 | # build row and column index for non-zero values of a sparse matrix 250 | index = np.array(range(len(labels))) 251 | i = index[labels > 0] 252 | j = labels[labels > 0] - 1 # column index starts from 0 253 | 254 | if len(i) > 0: 255 | return sparse.coo_matrix((np.ones_like(i), (i, j)), 256 | shape=(x.shape[0], label_max)) 257 | else: 258 | # if there is no non-zero value, return no matrix 259 | return None 260 | 261 | def fit(self, X, y=None): 262 | self.label_encoder.fit(X) 263 | 264 | return self 265 | 266 | def transform(self, X): 267 | """Encode categorical columns into sparse matrix with one-hot-encoding. 
268 | 269 | Args: 270 | X (numpy.array): categorical columns to encode 271 | 272 | Returns: 273 | X_new (scipy.sparse.coo_matrix): sparse matrix encoding categorical 274 | variables into dummy variables 275 | """ 276 | 277 | for col in range(X.shape[1]): 278 | X_col = self._transform_col(X[:, col], col) 279 | if X_col is not None: 280 | if col == 0: 281 | X_new = X_col 282 | else: 283 | X_new = sparse.hstack((X_new, X_col)) 284 | 285 | logging.debug('{} --> {} features'.format( 286 | col, self.label_encoder.label_maxes[col]) 287 | ) 288 | 289 | return X_new 290 | 291 | def fit_transform(self, X, y=None): 292 | """Encode categorical columns into sparse matrix with one-hot-encoding. 293 | 294 | Args: 295 | X (numpy.array): categorical columns to encode 296 | 297 | Returns: 298 | sparse matrix encoding categorical variables into dummy variables 299 | """ 300 | 301 | self.label_encoder.fit(X) 302 | 303 | return self.transform(X) 304 | -------------------------------------------------------------------------------- /kaggler/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qqgeogor/Kaggler/f53ab7f47eec731648fa03064ec3b7fc11f92396/kaggler/test/__init__.py -------------------------------------------------------------------------------- /kaggler/test/test_sgd.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from kaggler.online_model import SGD 5 | 6 | 7 | DUMMY_SPARSE_STR = """0 1:1 3:1 10:1 8 | 0 3:1 5:1 9 | 1 4:1 6:1 8:1 10:1""" 10 | 11 | DUMMY_Y = [0, 0, 1] 12 | DUMMY_LEN_X = [3, 2, 4] 13 | 14 | class TestSGD(unittest.TestCase): 15 | 16 | def setUp(self): 17 | self.model = SGD(n=2**10, a=0.1, l1=1, l2=1, interaction=True) 18 | self.sparse_file = '/tmp/dummy.sps' 19 | 20 | """Create dummpy sparse files.""" 21 | with open(self.sparse_file, 'w') as f: 22 | f.write(DUMMY_SPARSE_STR) 23 | 24 | def tearDown(self): 25 
| # If a dummy file exists, remove it. 26 | if os.path.isfile(self.sparse_file): 27 | os.remove(self.sparse_file) 28 | 29 | def test_read_sparse(self): 30 | len_xs = [] 31 | ys = [] 32 | for x, y in self.model.read_sparse(self.sparse_file): 33 | # check hash collision for feature index 34 | self.assertEqual(len(set(x)), len(x)) 35 | 36 | ys.append(y) 37 | len_xs.append(len(x)) 38 | 39 | # check if target values are correct 40 | self.assertEqual(ys, DUMMY_Y) 41 | 42 | # check if the number of feature index are correct 43 | self.assertEqual(len_xs, DUMMY_LEN_X) 44 | 45 | 46 | if __name__ == '__main__': 47 | unittest.main() 48 | 49 | -------------------------------------------------------------------------------- /kaggler/util.pxd: -------------------------------------------------------------------------------- 1 | cdef inline double fmax(double a, double b): return a if a >= b else b 2 | cdef inline double fmin(double a, double b): return a if a <= b else b 3 | 4 | cdef double sigm(double x) 5 | -------------------------------------------------------------------------------- /kaggler/util.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False 2 | # cython: wraparound=False 3 | # cython: cdivision=True 4 | from __future__ import division 5 | from scipy import sparse 6 | 7 | import logging 8 | import numpy as np 9 | 10 | cimport cython 11 | from libc.math cimport exp, log 12 | cimport numpy as np 13 | 14 | 15 | np.import_array() 16 | 17 | 18 | cdef double sigm(double x): 19 | """Bounded sigmoid function.""" 20 | return 1 / (1 + exp(-fmax(fmin(x, 20.0), -20.0))) 21 | 22 | 23 | def get_downsampled_index(n, rate=0.): 24 | """Return the index that downsamples a vector x by the rate.""" 25 | 26 | return np.random.choice(range(n), int(n * rate), replace=False) 27 | 28 | 29 | def get_downsampled_index0(x, rate=0., threshold=0.): 30 | """Return the index that downsamples 0s of a vector x by the rate.""" 31 | 32 | 
idx1 = np.where(x > threshold)[0] 33 | idx0 = np.where(x <= threshold)[0] 34 | idx0_down = np.random.choice(idx0, int(len(idx0) * rate), replace=False) 35 | 36 | idx = list(idx0_down) + list(idx1) 37 | np.random.shuffle(idx) 38 | 39 | return idx 40 | 41 | 42 | def set_column_width(X, n_col): 43 | """Set the column width of a matrix X to n_col.""" 44 | 45 | if X.shape[1] < n_col: 46 | if sparse.issparse(X): 47 | X = sparse.hstack((X, np.zeros((X.shape[0], n_col - X.shape[1])))) 48 | X = X.tocsr() 49 | else: 50 | X = np.hstack((X, np.zeros((X.shape[0], n_col - X.shape[1])))) 51 | 52 | elif X.shape[1] > n_col: 53 | if sparse.issparse(X): 54 | X = X.tocsc()[:, :-(X.shape[1] - n_col)] 55 | X = X.tocsr() 56 | else: 57 | X = X[:, :-(X.shape[1] - n_col)] 58 | 59 | return X 60 | 61 | 62 | def rank(x): 63 | """Rank a vector x. Ties will be averaged.""" 64 | 65 | unique, idx_inverse = np.unique(x, return_inverse=True) 66 | 67 | unique_rank_sum = np.zeros_like(unique) 68 | unique_rank_count = np.zeros_like(unique) 69 | 70 | np.add.at(unique_rank_sum, idx_inverse, x.argsort().argsort()) 71 | np.add.at(unique_rank_count, idx_inverse, 1) 72 | 73 | unique_rank_mean = unique_rank_sum.astype(np.float) / unique_rank_count 74 | 75 | return unique_rank_mean[idx_inverse] 76 | 77 | 78 | def set_min_max(x, lb, ub): 79 | x[x < lb] = lb 80 | x[x > ub] = ub 81 | 82 | return x 83 | 84 | 85 | def point(rank, n_team, n_teammate=1, t=0): 86 | """Calculate Kaggle points to earn after a competition. 87 | 88 | Args: 89 | rank (int): final ranking in the private leaderboard. 90 | n_team (int): the number of teams participated in the competition. 91 | n_teammate (int): the number of team members in my team. 92 | t (int): the number of days since the competition ends. 93 | 94 | Returns: 95 | returns Kaggle points to earn after a compeittion. 
96 | """ 97 | return (1e5 / np.sqrt(n_teammate) * (rank ** -.75) * 98 | np.log10(1 + np.log10(n_team)) * np.exp(-t / 500)) 99 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension 2 | from Cython.Distutils import build_ext 3 | 4 | import numpy as np 5 | 6 | try: 7 | from pypandoc import convert 8 | read_md = lambda f: convert(f, 'rst') 9 | except ImportError: 10 | print("warning: pypandoc module not found, could not convert Markdown to RST") 11 | read_md = lambda f: open(f, 'r').read() 12 | 13 | setup( 14 | name='Kaggler', 15 | version='0.4.1', 16 | 17 | author='Jeong-Yoon Lee', 18 | author_email='jeongyoon.lee1@gmail.com', 19 | 20 | packages=['kaggler', 21 | 'kaggler.model', 22 | 'kaggler.metrics', 23 | 'kaggler.online_model', 24 | 'kaggler.preprocessing', 25 | 'kaggler.test'], 26 | url='https://github.com/jeongyoonlee/Kaggler', 27 | license='LICENSE.txt', 28 | 29 | description='Code for Kaggle Data Science Competitions.', 30 | long_description=read_md('README.md'), 31 | 32 | install_requires=[ 33 | 'cython', 34 | 'numpy', 35 | 'scipy >= 0.14.0', 36 | 'scikit-learn >= 0.15.0', 37 | 'statsmodels >= 0.5.0', 38 | ], 39 | 40 | cmdclass={'build_ext': build_ext}, 41 | ext_modules=[Extension('kaggler.online_model.ftrl', 42 | ['kaggler/online_model/ftrl.pyx'], 43 | libraries=[], 44 | include_dirs=[np.get_include(), '.'], 45 | extra_compile_args=['-O3']), 46 | Extension('kaggler.online_model.sgd', 47 | ['kaggler/online_model/sgd.pyx'], 48 | libraries=[], 49 | include_dirs=[np.get_include(), '.'], 50 | extra_compile_args=['-O3']), 51 | Extension('kaggler.online_model.fm', 52 | 
['kaggler/online_model/fm.pyx'], 53 | libraries=[], 54 | include_dirs=[np.get_include(), '.'], 55 | extra_compile_args=['-O3']), 56 | Extension('kaggler.online_model.nn', 57 | ['kaggler/online_model/nn.pyx'], 58 | libraries=[], 59 | include_dirs=[np.get_include(), '.'], 60 | extra_compile_args=['-O3']), 61 | Extension('kaggler.online_model.nn_h2', 62 | ['kaggler/online_model/nn_h2.pyx'], 63 | libraries=[], 64 | include_dirs=[np.get_include(), '.'], 65 | extra_compile_args=['-O3']), 66 | Extension('kaggler.util', 67 | ['kaggler/util.pyx', 'kaggler/util.pxd'], 68 | libraries=[], 69 | include_dirs=[np.get_include(), '.'], 70 | extra_compile_args=['-O3']), 71 | Extension('kaggler.online_model.ftrl_fm', 72 | ['kaggler/online_model/ftrl_fm.pyx'], 73 | libraries=[], 74 | include_dirs=[np.get_include(), '.'], 75 | extra_compile_args=[ 76 | '-O3', 77 | # '-fopenmp', 78 | ], 79 | # extra_link_args=['-fopenmp'], 80 | ), 81 | 82 | ], 83 | ) 84 | --------------------------------------------------------------------------------