├── .gitignore
├── CHANGES.txt
├── LICENSE
├── LICENSE.txt
├── MANIFEST
├── MANIFEST.in
├── README.md
├── demo
├── ftrl_fm_cython.py
└── mf_qe_nn_clf.py
├── doc
├── Makefile
├── conf.py
├── index.rst
├── kaggler.metrics.rst
├── kaggler.model.rst
├── kaggler.online_model.rst
├── kaggler.preprocessing.rst
├── kaggler.rst
├── kaggler.test.rst
└── modules.rst
├── kaggler
├── __init__.py
├── const.py
├── data_io.py
├── metrics
│ ├── __init__.py
│ ├── classification.py
│ └── regression.py
├── model
│ ├── __init__.py
│ └── nn.py
├── online_model
│ ├── DecisionTree
│ │ ├── OnlineClassificationTree.py
│ │ ├── _tree.pyx
│ │ ├── test.py
│ │ └── utils.pyx
│ ├── __init__.py
│ ├── fm.c
│ ├── fm.pyx
│ ├── ftrl.c
│ ├── ftrl.pyx
│ ├── ftrl_dropout.pyx
│ ├── ftrl_fm.c
│ ├── ftrl_fm.pyx
│ ├── nn.c
│ ├── nn.pyx
│ ├── nn_h2.c
│ ├── nn_h2.pyx
│ ├── sgd.c
│ └── sgd.pyx
├── preprocessing
│ ├── __init__.py
│ └── data.py
├── test
│ ├── __init__.py
│ └── test_sgd.py
├── util.c
├── util.pxd
└── util.pyx
├── setup.cfg
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | _build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 |
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .cache
41 | nosetests.xml
42 | coverage.xml
43 |
44 | # Translations
45 | *.mo
46 | *.pot
47 |
48 | # Django stuff:
49 | *.log
50 |
51 | # Sphinx documentation
52 | docs/_build/
53 |
54 | # PyBuilder
55 | target/
56 |
--------------------------------------------------------------------------------
/CHANGES.txt:
--------------------------------------------------------------------------------
1 | 0.3.4, 2015-02-11 -- Add README.md to MANIFEST.in
2 | 0.1.1, 2014-09-24 -- Fix wrong dependencies
3 | 0.1.0, 2014-07-22 -- Initial release.
4 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
 4 | Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 |
13 | The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. By contrast,
15 | the GNU General Public License is intended to guarantee your freedom to
16 | share and change all versions of a program--to make sure it remains free
17 | software for all its users. We, the Free Software Foundation, use the
18 | GNU General Public License for most of our software; it applies also to
19 | any other work released this way by its authors. You can apply it to
20 | your programs, too.
21 |
22 | When we speak of free software, we are referring to freedom, not
23 | price. Our General Public Licenses are designed to make sure that you
24 | have the freedom to distribute copies of free software (and charge for
25 | them if you wish), that you receive source code or can get it if you
26 | want it, that you can change the software or use pieces of it in new
27 | free programs, and that you know you can do these things.
28 |
29 | To protect your rights, we need to prevent others from denying you
30 | these rights or asking you to surrender the rights. Therefore, you have
31 | certain responsibilities if you distribute copies of the software, or if
32 | you modify it: responsibilities to respect the freedom of others.
33 |
34 | For example, if you distribute copies of such a program, whether
35 | gratis or for a fee, you must pass on to the recipients the same
36 | freedoms that you received. You must make sure that they, too, receive
37 | or can get the source code. And you must show them these terms so they
38 | know their rights.
39 |
40 | Developers that use the GNU GPL protect your rights with two steps:
41 | (1) assert copyright on the software, and (2) offer you this License
42 | giving you legal permission to copy, distribute and/or modify it.
43 |
44 | For the developers' and authors' protection, the GPL clearly explains
45 | that there is no warranty for this free software. For both users' and
46 | authors' sake, the GPL requires that modified versions be marked as
47 | changed, so that their problems will not be attributed erroneously to
48 | authors of previous versions.
49 |
50 | Some devices are designed to deny users access to install or run
51 | modified versions of the software inside them, although the manufacturer
52 | can do so. This is fundamentally incompatible with the aim of
53 | protecting users' freedom to change the software. The systematic
54 | pattern of such abuse occurs in the area of products for individuals to
55 | use, which is precisely where it is most unacceptable. Therefore, we
56 | have designed this version of the GPL to prohibit the practice for those
57 | products. If such problems arise substantially in other domains, we
58 | stand ready to extend this provision to those domains in future versions
59 | of the GPL, as needed to protect the freedom of users.
60 |
61 | Finally, every program is threatened constantly by software patents.
62 | States should not allow patents to restrict development and use of
63 | software on general-purpose computers, but in those that do, we wish to
64 | avoid the special danger that patents applied to a free program could
65 | make it effectively proprietary. To prevent this, the GPL assures that
66 | patents cannot be used to render the program non-free.
67 |
68 | The precise terms and conditions for copying, distribution and
69 | modification follow.
70 |
71 | TERMS AND CONDITIONS
72 |
73 | 0. Definitions.
74 |
75 | "This License" refers to version 3 of the GNU General Public License.
76 |
77 | "Copyright" also means copyright-like laws that apply to other kinds of
78 | works, such as semiconductor masks.
79 |
80 | "The Program" refers to any copyrightable work licensed under this
81 | License. Each licensee is addressed as "you". "Licensees" and
82 | "recipients" may be individuals or organizations.
83 |
84 | To "modify" a work means to copy from or adapt all or part of the work
85 | in a fashion requiring copyright permission, other than the making of an
86 | exact copy. The resulting work is called a "modified version" of the
87 | earlier work or a work "based on" the earlier work.
88 |
89 | A "covered work" means either the unmodified Program or a work based
90 | on the Program.
91 |
92 | To "propagate" a work means to do anything with it that, without
93 | permission, would make you directly or secondarily liable for
94 | infringement under applicable copyright law, except executing it on a
95 | computer or modifying a private copy. Propagation includes copying,
96 | distribution (with or without modification), making available to the
97 | public, and in some countries other activities as well.
98 |
99 | To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies. Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 |
103 | An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License. If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 |
112 | 1. Source Code.
113 |
114 | The "source code" for a work means the preferred form of the work
115 | for making modifications to it. "Object code" means any non-source
116 | form of a work.
117 |
118 | A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 |
123 | The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form. A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 |
134 | The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities. However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work. For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 |
147 | The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 |
151 | The Corresponding Source for a work in source code form is that
152 | same work.
153 |
154 | 2. Basic Permissions.
155 |
156 | All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met. This License explicitly affirms your unlimited
159 | permission to run the unmodified Program. The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work. This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 |
164 | You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force. You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright. Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 |
175 | Conveying under any other circumstances is permitted solely under
176 | the conditions stated below. Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 |
179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 |
181 | No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 |
187 | When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 |
195 | 4. Conveying Verbatim Copies.
196 |
197 | You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 |
205 | You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 |
208 | 5. Conveying Modified Source Versions.
209 |
210 | You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 |
214 | a) The work must carry prominent notices stating that you modified
215 | it, and giving a relevant date.
216 |
217 | b) The work must carry prominent notices stating that it is
218 | released under this License and any conditions added under section
219 | 7. This requirement modifies the requirement in section 4 to
220 | "keep intact all notices".
221 |
222 | c) You must license the entire work, as a whole, under this
223 | License to anyone who comes into possession of a copy. This
224 | License will therefore apply, along with any applicable section 7
225 | additional terms, to the whole of the work, and all its parts,
226 | regardless of how they are packaged. This License gives no
227 | permission to license the work in any other way, but it does not
228 | invalidate such permission if you have separately received it.
229 |
230 | d) If the work has interactive user interfaces, each must display
231 | Appropriate Legal Notices; however, if the Program has interactive
232 | interfaces that do not display Appropriate Legal Notices, your
233 | work need not make them do so.
234 |
235 | A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit. Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 |
245 | 6. Conveying Non-Source Forms.
246 |
247 | You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 |
252 | a) Convey the object code in, or embodied in, a physical product
253 | (including a physical distribution medium), accompanied by the
254 | Corresponding Source fixed on a durable physical medium
255 | customarily used for software interchange.
256 |
257 | b) Convey the object code in, or embodied in, a physical product
258 | (including a physical distribution medium), accompanied by a
259 | written offer, valid for at least three years and valid for as
260 | long as you offer spare parts or customer support for that product
261 | model, to give anyone who possesses the object code either (1) a
262 | copy of the Corresponding Source for all the software in the
263 | product that is covered by this License, on a durable physical
264 | medium customarily used for software interchange, for a price no
265 | more than your reasonable cost of physically performing this
266 | conveying of source, or (2) access to copy the
267 | Corresponding Source from a network server at no charge.
268 |
269 | c) Convey individual copies of the object code with a copy of the
270 | written offer to provide the Corresponding Source. This
271 | alternative is allowed only occasionally and noncommercially, and
272 | only if you received the object code with such an offer, in accord
273 | with subsection 6b.
274 |
275 | d) Convey the object code by offering access from a designated
276 | place (gratis or for a charge), and offer equivalent access to the
277 | Corresponding Source in the same way through the same place at no
278 | further charge. You need not require recipients to copy the
279 | Corresponding Source along with the object code. If the place to
280 | copy the object code is a network server, the Corresponding Source
281 | may be on a different server (operated by you or a third party)
282 | that supports equivalent copying facilities, provided you maintain
283 | clear directions next to the object code saying where to find the
284 | Corresponding Source. Regardless of what server hosts the
285 | Corresponding Source, you remain obligated to ensure that it is
286 | available for as long as needed to satisfy these requirements.
287 |
288 | e) Convey the object code using peer-to-peer transmission, provided
289 | you inform other peers where the object code and Corresponding
290 | Source of the work are being offered to the general public at no
291 | charge under subsection 6d.
292 |
293 | A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 |
297 | A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling. In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage. For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product. A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 |
310 | "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source. The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 |
318 | If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information. But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 |
329 | The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed. Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 |
337 | Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 |
343 | 7. Additional Terms.
344 |
345 | "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law. If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 |
354 | When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it. (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.) You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 |
361 | Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 |
365 | a) Disclaiming warranty or limiting liability differently from the
366 | terms of sections 15 and 16 of this License; or
367 |
368 | b) Requiring preservation of specified reasonable legal notices or
369 | author attributions in that material or in the Appropriate Legal
370 | Notices displayed by works containing it; or
371 |
372 | c) Prohibiting misrepresentation of the origin of that material, or
373 | requiring that modified versions of such material be marked in
374 | reasonable ways as different from the original version; or
375 |
376 | d) Limiting the use for publicity purposes of names of licensors or
377 | authors of the material; or
378 |
379 | e) Declining to grant rights under trademark law for use of some
380 | trade names, trademarks, or service marks; or
381 |
382 | f) Requiring indemnification of licensors and authors of that
383 | material by anyone who conveys the material (or modified versions of
384 | it) with contractual assumptions of liability to the recipient, for
385 | any liability that these contractual assumptions directly impose on
386 | those licensors and authors.
387 |
388 | All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10. If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term. If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 |
398 | If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 |
403 | Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 |
407 | 8. Termination.
408 |
409 | You may not propagate or modify a covered work except as expressly
410 | provided under this License. Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 |
415 | However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 |
422 | Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 |
429 | Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License. If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 |
435 | 9. Acceptance Not Required for Having Copies.
436 |
437 | You are not required to accept this License in order to receive or
438 | run a copy of the Program. Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance. However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work. These actions infringe copyright if you do
443 | not accept this License. Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 |
446 | 10. Automatic Licensing of Downstream Recipients.
447 |
448 | Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License. You are not responsible
451 | for enforcing compliance by third parties with this License.
452 |
453 | An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations. If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 |
463 | You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License. For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 |
471 | 11. Patents.
472 |
473 | A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based. The
475 | work thus licensed is called the contributor's "contributor version".
476 |
477 | A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version. For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 |
487 | Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 |
492 | In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement). To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 |
499 | If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients. "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 |
513 | If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 |
521 | A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License. You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 |
536 | Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 |
540 | 12. No Surrender of Others' Freedom.
541 |
542 | If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License. If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all. For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 |
552 | 13. Use with the GNU Affero General Public License.
553 |
554 | Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work. The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 |
563 | 14. Revised Versions of this License.
564 |
565 | The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time. Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 |
570 | Each version is given a distinguishing version number. If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation. If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 |
579 | If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 |
584 | Later license versions may give you additional or different
585 | permissions. However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 |
589 | 15. Disclaimer of Warranty.
590 |
591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 |
600 | 16. Limitation of Liability.
601 |
602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 |
612 | 17. Interpretation of Sections 15 and 16.
613 |
614 | If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 |
621 | END OF TERMS AND CONDITIONS
622 |
623 | How to Apply These Terms to Your New Programs
624 |
625 | If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 |
629 | To do so, attach the following notices to the program. It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 |
634 |
635 |     Copyright (C) <year>  <name of author>
636 |
637 | This program is free software: you can redistribute it and/or modify
638 | it under the terms of the GNU General Public License as published by
639 | the Free Software Foundation, either version 3 of the License, or
640 | (at your option) any later version.
641 |
642 | This program is distributed in the hope that it will be useful,
643 | but WITHOUT ANY WARRANTY; without even the implied warranty of
644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645 | GNU General Public License for more details.
646 |
647 | You should have received a copy of the GNU General Public License
648 |     along with this program.  If not, see <https://www.gnu.org/licenses/>.
649 |
650 | Also add information on how to contact you by electronic and paper mail.
651 |
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 |
655 |     <program>  Copyright (C) <year>  <name of author>
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 |
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 |
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 |
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 |
13 | The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. By contrast,
15 | the GNU General Public License is intended to guarantee your freedom to
16 | share and change all versions of a program--to make sure it remains free
17 | software for all its users. We, the Free Software Foundation, use the
18 | GNU General Public License for most of our software; it applies also to
19 | any other work released this way by its authors. You can apply it to
20 | your programs, too.
21 |
22 | When we speak of free software, we are referring to freedom, not
23 | price. Our General Public Licenses are designed to make sure that you
24 | have the freedom to distribute copies of free software (and charge for
25 | them if you wish), that you receive source code or can get it if you
26 | want it, that you can change the software or use pieces of it in new
27 | free programs, and that you know you can do these things.
28 |
29 | To protect your rights, we need to prevent others from denying you
30 | these rights or asking you to surrender the rights. Therefore, you have
31 | certain responsibilities if you distribute copies of the software, or if
32 | you modify it: responsibilities to respect the freedom of others.
33 |
34 | For example, if you distribute copies of such a program, whether
35 | gratis or for a fee, you must pass on to the recipients the same
36 | freedoms that you received. You must make sure that they, too, receive
37 | or can get the source code. And you must show them these terms so they
38 | know their rights.
39 |
40 | Developers that use the GNU GPL protect your rights with two steps:
41 | (1) assert copyright on the software, and (2) offer you this License
42 | giving you legal permission to copy, distribute and/or modify it.
43 |
44 | For the developers' and authors' protection, the GPL clearly explains
45 | that there is no warranty for this free software. For both users' and
46 | authors' sake, the GPL requires that modified versions be marked as
47 | changed, so that their problems will not be attributed erroneously to
48 | authors of previous versions.
49 |
50 | Some devices are designed to deny users access to install or run
51 | modified versions of the software inside them, although the manufacturer
52 | can do so. This is fundamentally incompatible with the aim of
53 | protecting users' freedom to change the software. The systematic
54 | pattern of such abuse occurs in the area of products for individuals to
55 | use, which is precisely where it is most unacceptable. Therefore, we
56 | have designed this version of the GPL to prohibit the practice for those
57 | products. If such problems arise substantially in other domains, we
58 | stand ready to extend this provision to those domains in future versions
59 | of the GPL, as needed to protect the freedom of users.
60 |
61 | Finally, every program is threatened constantly by software patents.
62 | States should not allow patents to restrict development and use of
63 | software on general-purpose computers, but in those that do, we wish to
64 | avoid the special danger that patents applied to a free program could
65 | make it effectively proprietary. To prevent this, the GPL assures that
66 | patents cannot be used to render the program non-free.
67 |
68 | The precise terms and conditions for copying, distribution and
69 | modification follow.
70 |
71 | TERMS AND CONDITIONS
72 |
73 | 0. Definitions.
74 |
75 | "This License" refers to version 3 of the GNU General Public License.
76 |
77 | "Copyright" also means copyright-like laws that apply to other kinds of
78 | works, such as semiconductor masks.
79 |
80 | "The Program" refers to any copyrightable work licensed under this
81 | License. Each licensee is addressed as "you". "Licensees" and
82 | "recipients" may be individuals or organizations.
83 |
84 | To "modify" a work means to copy from or adapt all or part of the work
85 | in a fashion requiring copyright permission, other than the making of an
86 | exact copy. The resulting work is called a "modified version" of the
87 | earlier work or a work "based on" the earlier work.
88 |
89 | A "covered work" means either the unmodified Program or a work based
90 | on the Program.
91 |
92 | To "propagate" a work means to do anything with it that, without
93 | permission, would make you directly or secondarily liable for
94 | infringement under applicable copyright law, except executing it on a
95 | computer or modifying a private copy. Propagation includes copying,
96 | distribution (with or without modification), making available to the
97 | public, and in some countries other activities as well.
98 |
99 | To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies. Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 |
103 | An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License. If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 |
112 | 1. Source Code.
113 |
114 | The "source code" for a work means the preferred form of the work
115 | for making modifications to it. "Object code" means any non-source
116 | form of a work.
117 |
118 | A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 |
123 | The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form. A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 |
134 | The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities. However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work. For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 |
147 | The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 |
151 | The Corresponding Source for a work in source code form is that
152 | same work.
153 |
154 | 2. Basic Permissions.
155 |
156 | All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met. This License explicitly affirms your unlimited
159 | permission to run the unmodified Program. The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work. This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 |
164 | You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force. You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright. Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 |
175 | Conveying under any other circumstances is permitted solely under
176 | the conditions stated below. Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 |
179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 |
181 | No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 |
187 | When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 |
195 | 4. Conveying Verbatim Copies.
196 |
197 | You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 |
205 | You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 |
208 | 5. Conveying Modified Source Versions.
209 |
210 | You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 |
214 | a) The work must carry prominent notices stating that you modified
215 | it, and giving a relevant date.
216 |
217 | b) The work must carry prominent notices stating that it is
218 | released under this License and any conditions added under section
219 | 7. This requirement modifies the requirement in section 4 to
220 | "keep intact all notices".
221 |
222 | c) You must license the entire work, as a whole, under this
223 | License to anyone who comes into possession of a copy. This
224 | License will therefore apply, along with any applicable section 7
225 | additional terms, to the whole of the work, and all its parts,
226 | regardless of how they are packaged. This License gives no
227 | permission to license the work in any other way, but it does not
228 | invalidate such permission if you have separately received it.
229 |
230 | d) If the work has interactive user interfaces, each must display
231 | Appropriate Legal Notices; however, if the Program has interactive
232 | interfaces that do not display Appropriate Legal Notices, your
233 | work need not make them do so.
234 |
235 | A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit. Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 |
245 | 6. Conveying Non-Source Forms.
246 |
247 | You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 |
252 | a) Convey the object code in, or embodied in, a physical product
253 | (including a physical distribution medium), accompanied by the
254 | Corresponding Source fixed on a durable physical medium
255 | customarily used for software interchange.
256 |
257 | b) Convey the object code in, or embodied in, a physical product
258 | (including a physical distribution medium), accompanied by a
259 | written offer, valid for at least three years and valid for as
260 | long as you offer spare parts or customer support for that product
261 | model, to give anyone who possesses the object code either (1) a
262 | copy of the Corresponding Source for all the software in the
263 | product that is covered by this License, on a durable physical
264 | medium customarily used for software interchange, for a price no
265 | more than your reasonable cost of physically performing this
266 | conveying of source, or (2) access to copy the
267 | Corresponding Source from a network server at no charge.
268 |
269 | c) Convey individual copies of the object code with a copy of the
270 | written offer to provide the Corresponding Source. This
271 | alternative is allowed only occasionally and noncommercially, and
272 | only if you received the object code with such an offer, in accord
273 | with subsection 6b.
274 |
275 | d) Convey the object code by offering access from a designated
276 | place (gratis or for a charge), and offer equivalent access to the
277 | Corresponding Source in the same way through the same place at no
278 | further charge. You need not require recipients to copy the
279 | Corresponding Source along with the object code. If the place to
280 | copy the object code is a network server, the Corresponding Source
281 | may be on a different server (operated by you or a third party)
282 | that supports equivalent copying facilities, provided you maintain
283 | clear directions next to the object code saying where to find the
284 | Corresponding Source. Regardless of what server hosts the
285 | Corresponding Source, you remain obligated to ensure that it is
286 | available for as long as needed to satisfy these requirements.
287 |
288 | e) Convey the object code using peer-to-peer transmission, provided
289 | you inform other peers where the object code and Corresponding
290 | Source of the work are being offered to the general public at no
291 | charge under subsection 6d.
292 |
293 | A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 |
297 | A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling. In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage. For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product. A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 |
310 | "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source. The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 |
318 | If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information. But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 |
329 | The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed. Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 |
337 | Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 |
343 | 7. Additional Terms.
344 |
345 | "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law. If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 |
354 | When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it. (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.) You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 |
361 | Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 |
365 | a) Disclaiming warranty or limiting liability differently from the
366 | terms of sections 15 and 16 of this License; or
367 |
368 | b) Requiring preservation of specified reasonable legal notices or
369 | author attributions in that material or in the Appropriate Legal
370 | Notices displayed by works containing it; or
371 |
372 | c) Prohibiting misrepresentation of the origin of that material, or
373 | requiring that modified versions of such material be marked in
374 | reasonable ways as different from the original version; or
375 |
376 | d) Limiting the use for publicity purposes of names of licensors or
377 | authors of the material; or
378 |
379 | e) Declining to grant rights under trademark law for use of some
380 | trade names, trademarks, or service marks; or
381 |
382 | f) Requiring indemnification of licensors and authors of that
383 | material by anyone who conveys the material (or modified versions of
384 | it) with contractual assumptions of liability to the recipient, for
385 | any liability that these contractual assumptions directly impose on
386 | those licensors and authors.
387 |
388 | All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10. If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term. If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 |
398 | If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 |
403 | Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 |
407 | 8. Termination.
408 |
409 | You may not propagate or modify a covered work except as expressly
410 | provided under this License. Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 |
415 | However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 |
422 | Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 |
429 | Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License. If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 |
435 | 9. Acceptance Not Required for Having Copies.
436 |
437 | You are not required to accept this License in order to receive or
438 | run a copy of the Program. Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance. However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work. These actions infringe copyright if you do
443 | not accept this License. Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 |
446 | 10. Automatic Licensing of Downstream Recipients.
447 |
448 | Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License. You are not responsible
451 | for enforcing compliance by third parties with this License.
452 |
453 | An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations. If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 |
463 | You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License. For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 |
471 | 11. Patents.
472 |
473 | A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based. The
475 | work thus licensed is called the contributor's "contributor version".
476 |
477 | A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version. For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 |
487 | Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 |
492 | In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement). To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 |
499 | If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients. "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 |
513 | If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 |
521 | A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License. You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 |
536 | Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 |
540 | 12. No Surrender of Others' Freedom.
541 |
542 | If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License. If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all. For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 |
552 | 13. Use with the GNU Affero General Public License.
553 |
554 | Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work. The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 |
563 | 14. Revised Versions of this License.
564 |
565 | The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time. Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 |
570 | Each version is given a distinguishing version number. If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation. If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 |
579 | If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 |
584 | Later license versions may give you additional or different
585 | permissions. However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 |
589 | 15. Disclaimer of Warranty.
590 |
591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 |
600 | 16. Limitation of Liability.
601 |
602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 |
612 | 17. Interpretation of Sections 15 and 16.
613 |
614 | If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 |
621 | END OF TERMS AND CONDITIONS
622 |
623 | How to Apply These Terms to Your New Programs
624 |
625 | If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 |
629 | To do so, attach the following notices to the program. It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 |
634 |
635 | Copyright (C) <year>  <name of author>
636 |
637 | This program is free software: you can redistribute it and/or modify
638 | it under the terms of the GNU General Public License as published by
639 | the Free Software Foundation, either version 3 of the License, or
640 | (at your option) any later version.
641 |
642 | This program is distributed in the hope that it will be useful,
643 | but WITHOUT ANY WARRANTY; without even the implied warranty of
644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645 | GNU General Public License for more details.
646 |
647 | You should have received a copy of the GNU General Public License
648 | along with this program. If not, see <http://www.gnu.org/licenses/>.
649 |
650 | Also add information on how to contact you by electronic and paper mail.
651 |
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 |
655 | <program>  Copyright (C) <year>  <author>
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 |
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 |
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <http://www.gnu.org/licenses/>.
668 |
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
674 | <http://www.gnu.org/philosophy/why-not-lgpl.html>.
675 |
--------------------------------------------------------------------------------
/MANIFEST:
--------------------------------------------------------------------------------
1 | # file GENERATED by distutils, do NOT edit
2 | CHANGES.txt
3 | LICENSE.txt
4 | README.txt
5 | setup.py
6 | kaggler/__init__.py
7 | kaggler/const.py
8 | kaggler/logger.py
9 | kaggler/nn_auc.py
10 | kaggler/util.py
11 | kaggler/test/__init__.py
12 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt README.md
2 | recursive-include docs *.txt
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Kaggler
2 | Kaggler is a Python package for Kaggle data science competitions and distributed under the version 3 of the GNU General Public License.
3 |
4 | It provides online learning algorithms for classification - inspired by Kaggle user [tinrtgu's code](http://goo.gl/K8hQBx). It uses the sparse input format that handles large sparse data efficiently. Core code is optimized for speed by using Cython.
5 |
6 | # Algorithms
7 | Currently algorithms available are as follows:
8 |
9 | ## Online learning algorithms
10 | * Stochastic Gradient Descent (SGD)
11 | * Follow-the-Regularized-Leader (FTRL)
12 | * Follow-the-Regularized-Leader with Factorization Machine (FTRL_FM)
13 | * Factorization Machine (FM)
14 | * Neural Networks (NN) - with a single (NN) or two (NN_H2) ReLU hidden layers
15 | * Decision Tree
16 |
17 | ## Batch learning algorithm
18 | * Neural Networks (NN) - with a single hidden layer and L-BFGS optimization
19 |
20 | # Install
21 | ## Using pip
22 | Python package is available at PyPI for pip installation:
23 | ```
24 | sudo pip install -U Kaggler
25 | ```
26 |
27 | ## From source code
28 | If you want to install it from source code:
29 | ```
30 | python setup.py build_ext --inplace
31 | sudo python setup.py install
32 | ```
33 |
34 | # Input Format
35 | libsvm style sparse file format is used.
36 | ```
37 | 1 1:1 4:1 5:0.5
38 | 0 2:1 5:1
39 | ```
40 |
41 | # Example
42 | ```
43 | from kaggler.online_model import SGD, FTRL, FM, NN
44 |
45 | # SGD
46 | clf = SGD(a=.01, # learning rate
47 | l1=1e-6, # L1 regularization parameter
48 | l2=1e-6, # L2 regularization parameter
49 | n=2**20, # number of hashed features
50 | epoch=10, # number of epochs
51 | interaction=True) # use feature interaction or not
52 |
53 | # FTRL
54 | clf = FTRL(a=.1, # alpha in the per-coordinate rate
55 | b=1, # beta in the per-coordinate rate
56 | l1=1., # L1 regularization parameter
57 | l2=1., # L2 regularization parameter
58 | n=2**20, # number of hashed features
59 | epoch=1, # number of epochs
60 | interaction=True) # use feature interaction or not
61 |
62 | # FM
63 | clf = FM(n=1e5, # number of features
64 | epoch=100, # number of epochs
65 | dim=4, # size of factors for interactions
66 | a=.01) # learning rate
67 |
68 | # NN
69 | clf = NN(n=1e5, # number of features
70 | epoch=10, # number of epochs
71 | h=16, # number of hidden units
72 | a=.1, # learning rate
73 | l2=1e-6) # L2 regularization parameter
74 |
75 | # online training and prediction directly with a libsvm file
76 | for x, y in clf.read_sparse('train.sparse'):
77 | p = clf.predict_one(x) # predict for an input
78 | clf.update_one(x, p - y) # update the model with the target using error
79 |
80 | for x, _ in clf.read_sparse('test.sparse'):
81 | p = clf.predict_one(x)
82 |
83 | # online training and prediction with a scipy sparse matrix
84 | from sklearn.datasets import load_svmlight_file
85 |
86 | X, y = load_svmlight_file('train.sparse')
87 |
88 | clf.fit(X, y)
89 | p = clf.predict(X)
90 | ```
91 |
92 | # Package Documentation
93 | Package documentation is available [here](http://pythonhosted.org//Kaggler).
94 |
--------------------------------------------------------------------------------
/demo/ftrl_fm_cython.py:
--------------------------------------------------------------------------------
# time pypy-2.4 -u runmodel.py | tee output_0.txt
"""Train an FTRL-FM learner on the Redhat competition data and write a
submission CSV.

Reads pre-split X_train/X_test CSVs from `path`, trains FTRL_FM with AUC as
the validation metric, pickles the trained model, then predicts on the test
file and writes `submission_ftrl_fm_<fm_dim>.csv`.
"""
from kaggler.online_model.ftrl_fm import FTRL_FM
import random
from math import log
import numpy as np
from datetime import datetime
import pandas as pd
from sklearn.cross_validation import KFold
from sklearn.metrics import roc_auc_score

#### RANDOM SEED ####
seed = 1024
np.random.seed(seed)
#####################

####################
#### PARAMETERS ####
####################

reportFrequency = 1000
path = "E:\\Redhat\\"
trainingFile = "E:\\Redhat\\train_le_date.csv"
testingFile = "E:\\Redhat\\test_le_date.csv"

# One-off KFold split that produced X_train.csv / X_test.csv (kept for reference):
# train = pd.read_csv(trainingFile)
# test = pd.read_csv(testingFile)
# y = train['outcome'].values
# skf = KFold(len(y), n_folds=4, shuffle=False, random_state=seed)
# for ind_tr, ind_te in skf:
#     X_train = train.iloc[ind_tr]
#     X_test = train.iloc[ind_te]
#     break
# X_train.to_csv(path + 'X_train.csv', index=False)
# X_test.to_csv(path + 'X_test.csv', index=False)

fm_dim = 4        # size of the FM interaction factors
fm_initDev = .01  # std dev used to initialize the FM weights

alpha = 0.1       # FTRL learning rate (linear weights)
beta = 1.

alpha_fm = .01    # FTRL learning rate (FM weights)
beta_fm = 1.

p_D = 22
D = 2 ** p_D      # hashed feature-space size

L1 = 0.1
L2 = 1.0
L1_fm = 0.1
L2_fm = 1.0

# NOTE(review): unused -- the fit/predict calls below hard-code n_epochs=5.
n_epochs = 3

####
start = datetime.now()

# initialize a FM learner
learner = FTRL_FM(fm_dim, fm_initDev, L1, L2, L1_fm, L2_fm, D, alpha, beta,
                  alpha_fm=alpha_fm, beta_fm=beta_fm)

learner.fit(trainingFile=open(path + 'X_train.csv'), n_epochs=5,
            validationFile=open(path + 'X_test.csv'),
            eval_metric=roc_auc_score, reportFrequency=reportFrequency)

# save the weights
# w_outfile = path + "param.w.txt"
# w_fm_outfile = path + "param.w_fm.txt"
# learner.write_w(w_outfile)
# learner.write_w_fm(w_fm_outfile)
pd.to_pickle(learner, path + 'ftrl_fm.pkl')


test = pd.read_csv(path + 'test_le_date.csv')
activity_id = test['activity_id']
print('Make submission')
# X_t = [X_t[:,i] for i in range(X_t.shape[1])]
y_preds = learner.predict(testingFile=open(testingFile), n_epochs=5)
submission = pd.DataFrame()
submission['activity_id'] = activity_id
# BUGFIX: was `submission['outcome'] = outcome` -- `outcome` was never defined
# (NameError); the predictions returned above are what belongs here.
submission['outcome'] = y_preds
# BUGFIX: was `% dim` -- `dim` was never defined (NameError); use fm_dim.
submission.to_csv('submission_ftrl_fm_%s.csv' % fm_dim, index=False)
79 |
--------------------------------------------------------------------------------
/demo/mf_qe_nn_clf.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from scipy import sparse as ssp
4 | from sklearn.preprocessing import LabelEncoder,LabelBinarizer,MinMaxScaler,OneHotEncoder,StandardScaler,Normalizer
5 | from sklearn.linear_model import LogisticRegression
6 | from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
7 | from sklearn.feature_selection import SelectFromModel
8 | from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
9 | from sklearn.datasets import dump_svmlight_file,load_svmlight_file
10 | from sklearn.svm import LinearSVC
11 | from sklearn.ensemble import RandomForestClassifier
12 | from sklearn.cross_validation import KFold,StratifiedKFold
13 | from sklearn.metrics import roc_auc_score,accuracy_score
14 | from keras.preprocessing import sequence
15 | from keras.callbacks import ModelCheckpoint
16 | from keras import backend as K
17 | from keras.layers import Input, Embedding, LSTM, Dense,Flatten, Dropout, merge,Convolution1D,MaxPooling1D,Lambda
18 | from keras.layers.normalization import BatchNormalization
19 | from keras.optimizers import SGD,Nadam
20 | from keras.layers.advanced_activations import PReLU,LeakyReLU,ELU,SReLU
21 | from keras.models import Model
22 | from keras.utils.visualize_util import plot
23 | import distance
24 | import xgboost as xgb
25 |
# Reproducibility: pin NumPy's global RNG.
seed = 1024
np.random.seed(seed)

# All input/output files live one directory up.
path = "../"
30 |
31 |
def str_jaccard(str1, str2):
    """Return the Jaccard distance between two strings.

    Thin wrapper around distance.jaccard (third-party `distance` package).
    """
    return distance.jaccard(str1, str2)
35 |
36 |
# ---------------------------------------------------------------------------
# Data preparation.  Builds the module-level objects consumed further down:
#   X, y   -- train (question_id, expert_id) pairs and labels
#   X_t    -- test pairs
#   num_q, num_e -- id-vocabulary sizes for the embedding layers
# ---------------------------------------------------------------------------
question_numeric = ['char_4_q', 'char_5_q', 'char_6_q']

train = pd.read_csv(path + 'invited_info_train.txt',
                    dtype={'expert_id': str, 'question_id': str})
expert_id = train['expert_id'].values
expert_id = LabelEncoder().fit_transform(expert_id)

test = pd.read_csv(path + 'validate_nolabel.txt',
                   dtype={'expert_id': str, 'question_id': str}).fillna(-1)
test.columns = ['question_id', 'expert_id', 'label']
len_train = train.shape[0]

# Stack train and test so the label encoders below see every id.
train = pd.concat([train, test])

expert = pd.read_csv(path + 'user_info.txt', dtype={'expert_id': str})
question = pd.read_csv(path + 'question_info.txt',
                       dtype={'question_id': str}).fillna(-1)
question['char_3_q'] = question['char_3_q'].astype(str)

# '/'-delimited tag strings -> space-delimited tokens for the vectorizers.
expert['char_1'] = expert['char_1'].apply(lambda v: v.replace('/', ' '))
expert['char_2'] = expert['char_2'].apply(lambda v: v.replace('/', ' '))
expert['char_3'] = expert['char_3'].apply(lambda v: v.replace('/', ' '))

question['char_2_q'] = question['char_2_q'].apply(lambda v: v.replace('/', ' '))
question['char_3_q'] = question['char_3_q'].apply(lambda v: v.replace('/', ' '))

count_char_1 = CountVectorizer(ngram_range=(1, 3))
tfidf_char_2 = TfidfVectorizer(ngram_range=(1, 3))
tfidf_char_3 = TfidfVectorizer(ngram_range=(1, 3))

count_char_1.fit(expert['char_1'].values)
tfidf_char_2.fit(expert['char_2'].values.tolist()
                 + question['char_2_q'].values.tolist())
tfidf_char_3.fit(expert['char_3'].values.tolist()
                 + question['char_3_q'].values.tolist())

lb_char_1_q = LabelBinarizer(sparse_output=True)
lb_char_1_q.fit(question['char_1_q'].values)

train = pd.merge(train, expert, on='expert_id', how='left')  # .fillna(' ')
train = pd.merge(train, question, on='question_id', how='left')

le = LabelEncoder()
train['question_id'] = le.fit_transform(train['question_id'].values)
train['expert_id'] = le.fit_transform(train['expert_id'].values)

y = train['label'].values
features = [
    'question_id',
    'expert_id',
]

X = train[features].values
# Text features below were tried and then disabled:
# X = OneHotEncoder().fit_transform(X).tocsr()
# X_char_1 = count_char_1.transform(train['char_1'].values)
# X_char_2 = tfidf_char_2.transform(train['char_2'].values)
# X_char_3 = tfidf_char_3.transform(train['char_3'].values)
# X_char_1_q = lb_char_1_q.fit_transform(train['char_1_q'].values)
# X_char_2_q = tfidf_char_2.transform(train['char_2_q'].values)
# X_char_3_q = tfidf_char_3.transform(train['char_3_q'].values)
# stand_char_4_5_6_q = StandardScaler()
# stand_char_4_5_6_q.fit(train[question_numeric].values)
# X_char_4_5_6_q = stand_char_4_5_6_q.transform(train[question_numeric].values)

print ('X raw', X.shape)

# Cosine-similarity metafeatures (disabled):
# sim_char_2 = []
# for expert_char_2, question_char_2 in zip(X_char_2, X_char_2_q):
#     cos_sim_2 = pairwise_distances(expert_char_2, question_char_2,
#                                    metric='cosine')[0][0]
#     sim_char_2.append(cos_sim_2)
# sim_char_2 = np.expand_dims(np.array(sim_char_2), 1)
#
# sim_char_3 = []
# for expert_char_3, question_char_3 in zip(X_char_3, X_char_3_q):
#     cos_sim_3 = pairwise_distances(expert_char_3, question_char_3,
#                                    metric='cosine')[0][0]
#     sim_char_3.append(cos_sim_3)
# sim_char_3 = np.expand_dims(np.array(sim_char_3), 1)
#
# X = ssp.hstack([
#     X,
#     # X_char_1, X_char_2, X_char_3,
#     # X_char_1_q, X_char_2_q, X_char_3_q,
#     # X_char_4_5_6_q,
#     # sim_char_2, sim_char_3,
# ]).tocsr()
#
# dump_svmlight_file(X, y, path + 'data.svm')
# data, y_all = load_svmlight_file(path + 'data.svm')

y_all = y
data = X
num_q = len(np.unique(data[:, 0]))
num_e = len(np.unique(data[:, 1]))
del X
del y

# Split back into train and test portions (test rows were appended above).
X = data[:len_train]
y = y_all[:len_train]
X_t = data[len_train:]
del data
del y_all
147 |
def make_mf_lr(X, y, clf, X_test, n_round=3):
    """Build an out-of-fold metafeature from @clf's predict_proba.

    Runs `n_round` repetitions of a shuffled 2-fold StratifiedKFold.  In
    each repetition every train row gets exactly one out-of-fold prediction
    (accumulated into mf_tr), and each of the two folds contributes half of
    a full-test prediction (accumulated into mf_te), so dividing both by
    n_round yields per-repetition averages.

    Args:
        X, y: training features and labels.
        clf: classifier exposing fit/predict_proba.
        X_test: test features to average predictions over.
        n_round: number of repeated 2-fold splits.

    Returns:
        (mf_tr, mf_te): train and test metafeature arrays.
    """
    # NOTE: the original placed this description after the first statement,
    # where it is a no-op string, and used Python-2-only print statements.
    print(clf)
    mf_tr = np.zeros(X.shape[0])
    mf_te = np.zeros(X_test.shape[0])
    for i in range(n_round):
        skf = StratifiedKFold(y, n_folds=2, shuffle=True,
                              random_state=42 + i * 1000)
        for ind_tr, ind_te in skf:
            X_tr, X_te = X[ind_tr], X[ind_te]
            y_tr, y_te = y[ind_tr], y[ind_te]

            clf.fit(X_tr, y_tr)
            mf_tr[ind_te] += clf.predict_proba(X_te)[:, 1]
            # each of the 2 folds contributes half of the test prediction
            mf_te += clf.predict_proba(X_test)[:, 1] * 0.5
            y_pred = clf.predict_proba(X_te)[:, 1]
            score = roc_auc_score(y_te, y_pred)
            print('pred[{}] score:{}'.format(i, score))
    return (mf_tr / n_round, mf_te / n_round)
175 |
176 |
def make_mf_lsvc(X, y, clf, X_test, n_round=3):
    """Build an out-of-fold metafeature from a margin classifier.

    Same scheme as make_mf_lr but uses decision_function (raw margins)
    instead of predict_proba: `n_round` repetitions of a shuffled 2-fold
    StratifiedKFold, out-of-fold margins accumulated for train rows and
    half-weighted full-test margins accumulated per fold.

    Args:
        X, y: training features and labels.
        clf: classifier exposing fit/decision_function (e.g. LinearSVC).
        X_test: test features to average margins over.
        n_round: number of repeated 2-fold splits.

    Returns:
        (mf_tr, mf_te): train and test metafeature arrays.
    """
    # NOTE: the original placed this description after the first statement,
    # where it is a no-op string, and used Python-2-only print statements.
    print(clf)
    mf_tr = np.zeros(X.shape[0])
    mf_te = np.zeros(X_test.shape[0])
    for i in range(n_round):
        skf = StratifiedKFold(y, n_folds=2, shuffle=True,
                              random_state=42 + i * 1000)
        for ind_tr, ind_te in skf:
            X_tr, X_te = X[ind_tr], X[ind_te]
            y_tr, y_te = y[ind_tr], y[ind_te]

            clf.fit(X_tr, y_tr)
            mf_tr[ind_te] += clf.decision_function(X_te)
            # each of the 2 folds contributes half of the test prediction
            mf_te += clf.decision_function(X_test) * 0.5
            y_pred = clf.decision_function(X_te)
            score = roc_auc_score(y_te, y_pred)
            print('pred[{}] score:{}'.format(i, score))
    return (mf_tr / n_round, mf_te / n_round)
204 |
def make_mf_nn(X, y, X_test, n_round=3):
    """Build an out-of-fold metafeature from the Keras model of build_model.

    Same scheme as make_mf_lr, but a fresh network is built for every fold
    and the model takes its two id columns as separate inputs.

    Args:
        X: integer array with columns (question_id, expert_id).
        y: binary labels.
        X_test: test pairs, same layout as X.
        n_round: number of repeated 2-fold splits.

    Returns:
        (mf_tr, mf_te): train and test metafeature arrays.
    """
    # NOTE: removed an unused `from kaggler.online_model.ftrl import FTRL`,
    # a misplaced docstring, and Python-2-only print statements.
    mf_tr = np.zeros(X.shape[0])
    mf_te = np.zeros(X_test.shape[0])
    for i in range(n_round):
        skf = StratifiedKFold(y, n_folds=2, shuffle=True,
                              random_state=42 + i * 1000)
        for ind_tr, ind_te in skf:
            # fresh, untrained model per fold
            clf = build_model(X)
            # split each id column into its own input array
            X_tr = [X[:, 0][ind_tr], X[:, 1][ind_tr]]
            X_te = [X[:, 0][ind_te], X[:, 1][ind_te]]

            y_tr = y[ind_tr]
            y_te = y[ind_te]

            clf.fit(X_tr, y_tr, nb_epoch=2, batch_size=128,
                    validation_data=[X_te, y_te])
            mf_tr[ind_te] += clf.predict(X_te).ravel()
            # each of the 2 folds contributes half of the test prediction
            mf_te += clf.predict([X_test[:, 0], X_test[:, 1]]).ravel() * 0.5
            y_pred = clf.predict(X_te).ravel()
            score = roc_auc_score(y_te, y_pred)
            print('pred[{}] score:{}'.format(i, score))
    return (mf_tr / n_round, mf_te / n_round)
233 |
def build_model(X, dim=128):
    """Build and compile the two-input embedding network.

    Each input is a single integer id (question, expert).  Both are embedded
    into `dim` dimensions (vocabulary sizes come from the module-level num_q
    and num_e), flattened, concatenated, passed through a 512-unit SReLU
    dense layer with heavy dropout, and squashed to a sigmoid probability.

    Args:
        X: unused; kept for signature compatibility with existing callers.
        dim: embedding size for both id embeddings.

    Returns:
        A compiled Keras Model (Nadam optimizer, binary cross-entropy loss).
    """
    inputs_p = Input(shape=(1,), dtype='int32')
    embed_p = Embedding(
        num_q,
        dim,
        dropout=0.2,
        input_length=1
    )(inputs_p)

    inputs_d = Input(shape=(1,), dtype='int32')
    embed_d = Embedding(
        num_e,
        dim,
        dropout=0.2,
        input_length=1
    )(inputs_d)

    flatten_p = Flatten()(embed_p)
    flatten_d = Flatten()(embed_d)
    flatten = merge([
        flatten_p,
        flatten_d,
    ], mode='concat')

    fc1 = Dense(512)(flatten)
    fc1 = SReLU()(fc1)
    dp1 = Dropout(0.7)(fc1)

    outputs = Dense(1, activation='sigmoid', name='outputs')(dp1)

    inputs = [
        inputs_p,
        inputs_d,
    ]

    model = Model(input=inputs, output=outputs)
    # NOTE: removed an unused, discarded SGD optimizer that the original
    # constructed here; only Nadam was ever passed to compile().
    nadam = Nadam()
    model.compile(
        optimizer=nadam,
        loss='binary_crossentropy'
    )

    return model
286 |
# Build the NN metafeature over 10 repeated splits and persist it for stacking.
mf_nn_clf = make_mf_nn(X, y, X_t, n_round=10)
pd.to_pickle(mf_nn_clf, path + 'mf_nn_clf.pkl')
289 |
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = _build
9 |
10 | # User-friendly check for sphinx-build
11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
13 | endif
14 |
15 | # Internal variables.
16 | PAPEROPT_a4 = -D latex_paper_size=a4
17 | PAPEROPT_letter = -D latex_paper_size=letter
18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
19 | # the i18n builder cannot share the environment and doctrees with the others
20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
21 |
22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
23 |
24 | help:
25 | 	@echo "Please use \`make <target>' where <target> is one of"
26 | @echo " html to make standalone HTML files"
27 | @echo " dirhtml to make HTML files named index.html in directories"
28 | @echo " singlehtml to make a single large HTML file"
29 | @echo " pickle to make pickle files"
30 | @echo " json to make JSON files"
31 | @echo " htmlhelp to make HTML files and a HTML help project"
32 | @echo " qthelp to make HTML files and a qthelp project"
33 | @echo " devhelp to make HTML files and a Devhelp project"
34 | @echo " epub to make an epub"
35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
36 | @echo " latexpdf to make LaTeX files and run them through pdflatex"
37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
38 | @echo " text to make text files"
39 | @echo " man to make manual pages"
40 | @echo " texinfo to make Texinfo files"
41 | @echo " info to make Texinfo files and run them through makeinfo"
42 | @echo " gettext to make PO message catalogs"
43 | @echo " changes to make an overview of all changed/added/deprecated items"
44 | @echo " xml to make Docutils-native XML files"
45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes"
46 | @echo " linkcheck to check all external links for integrity"
47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)"
48 |
49 | clean:
50 | rm -rf $(BUILDDIR)/*
51 |
52 | html:
53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
54 | @echo
55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
56 |
57 | dirhtml:
58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
59 | @echo
60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
61 |
62 | singlehtml:
63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
64 | @echo
65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
66 |
67 | pickle:
68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
69 | @echo
70 | @echo "Build finished; now you can process the pickle files."
71 |
72 | json:
73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
74 | @echo
75 | @echo "Build finished; now you can process the JSON files."
76 |
77 | htmlhelp:
78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
79 | @echo
80 | @echo "Build finished; now you can run HTML Help Workshop with the" \
81 | ".hhp project file in $(BUILDDIR)/htmlhelp."
82 |
83 | qthelp:
84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
85 | @echo
86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \
87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Kaggler.qhcp"
89 | @echo "To view the help file:"
90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Kaggler.qhc"
91 |
92 | devhelp:
93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
94 | @echo
95 | @echo "Build finished."
96 | @echo "To view the help file:"
97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/Kaggler"
98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Kaggler"
99 | @echo "# devhelp"
100 |
101 | epub:
102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
103 | @echo
104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
105 |
106 | latex:
107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
108 | @echo
109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \
111 | "(use \`make latexpdf' here to do that automatically)."
112 |
113 | latexpdf:
114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
115 | @echo "Running LaTeX files through pdflatex..."
116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf
117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
118 |
119 | latexpdfja:
120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
121 | @echo "Running LaTeX files through platex and dvipdfmx..."
122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
124 |
125 | text:
126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
127 | @echo
128 | @echo "Build finished. The text files are in $(BUILDDIR)/text."
129 |
130 | man:
131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
132 | @echo
133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
134 |
135 | texinfo:
136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
137 | @echo
138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
139 | @echo "Run \`make' in that directory to run these through makeinfo" \
140 | "(use \`make info' here to do that automatically)."
141 |
142 | info:
143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
144 | @echo "Running Texinfo files through makeinfo..."
145 | make -C $(BUILDDIR)/texinfo info
146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
147 |
148 | gettext:
149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
150 | @echo
151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
152 |
153 | changes:
154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
155 | @echo
156 | @echo "The overview file is in $(BUILDDIR)/changes."
157 |
158 | linkcheck:
159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
160 | @echo
161 | @echo "Link check complete; look for any errors in the above output " \
162 | "or in $(BUILDDIR)/linkcheck/output.txt."
163 |
164 | doctest:
165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
166 | @echo "Testing of doctests in the sources finished, look at the " \
167 | "results in $(BUILDDIR)/doctest/output.txt."
168 |
169 | xml:
170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
171 | @echo
172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
173 |
174 | pseudoxml:
175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
176 | @echo
177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
178 |
--------------------------------------------------------------------------------
/doc/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Kaggler documentation build configuration file, created by
4 | # sphinx-quickstart on Tue Feb 10 04:55:59 2015.
5 | #
6 | # This file is execfile()d with the current directory set to its
7 | # containing dir.
8 | #
9 | # Note that not all possible configuration values are present in this
10 | # autogenerated file.
11 | #
12 | # All configuration values have a default; values that are commented out
13 | # serve to show the default.
14 |
15 | import sys
16 | import os
17 |
18 | # If extensions (or modules to document with autodoc) are in another directory,
19 | # add these directories to sys.path here. If the directory is relative to the
20 | # documentation root, use os.path.abspath to make it absolute, like shown here.
21 | #sys.path.insert(0, os.path.abspath('.'))
22 | sys.path.insert(0, os.path.abspath("../.."))
23 |
24 | # -- General configuration ------------------------------------------------
25 |
26 | # If your documentation needs a minimal Sphinx version, state it here.
27 | #needs_sphinx = '1.0'
28 |
29 | # Add any Sphinx extension module names here, as strings. They can be
30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
31 | # ones.
32 | extensions = [
33 | 'sphinx.ext.autodoc',
34 | 'sphinxcontrib.napoleon',
35 | 'sphinx.ext.doctest',
36 | 'sphinx.ext.intersphinx',
37 | 'sphinx.ext.todo',
38 | 'sphinx.ext.coverage',
39 | 'sphinx.ext.mathjax',
40 | 'sphinx.ext.viewcode',
41 | ]
42 |
43 | # Add any paths that contain templates here, relative to this directory.
44 | templates_path = ['_templates']
45 |
46 | # The suffix of source filenames.
47 | source_suffix = '.rst'
48 |
49 | # The encoding of source files.
50 | #source_encoding = 'utf-8-sig'
51 |
52 | # The master toctree document.
53 | master_doc = 'index'
54 |
55 | # General information about the project.
56 | project = u'Kaggler'
57 | copyright = u'2015, Jeong-Yoon Lee'
58 |
59 | # The version info for the project you're documenting, acts as replacement for
60 | # |version| and |release|, also used in various other places throughout the
61 | # built documents.
62 | #
63 | # The short X.Y version.
64 | version = '0.4'
65 | # The full version, including alpha/beta/rc tags.
66 | release = '0.4.1'
67 |
68 | # The language for content autogenerated by Sphinx. Refer to documentation
69 | # for a list of supported languages.
70 | #language = None
71 |
72 | # There are two options for replacing |today|: either, you set today to some
73 | # non-false value, then it is used:
74 | #today = ''
75 | # Else, today_fmt is used as the format for a strftime call.
76 | #today_fmt = '%B %d, %Y'
77 |
78 | # List of patterns, relative to source directory, that match files and
79 | # directories to ignore when looking for source files.
80 | exclude_patterns = ['_build']
81 |
82 | # The reST default role (used for this markup: `text`) to use for all
83 | # documents.
84 | #default_role = None
85 |
86 | # If true, '()' will be appended to :func: etc. cross-reference text.
87 | #add_function_parentheses = True
88 |
89 | # If true, the current module name will be prepended to all description
90 | # unit titles (such as .. function::).
91 | #add_module_names = True
92 |
93 | # If true, sectionauthor and moduleauthor directives will be shown in the
94 | # output. They are ignored by default.
95 | #show_authors = False
96 |
97 | # The name of the Pygments (syntax highlighting) style to use.
98 | pygments_style = 'sphinx'
99 |
100 | # A list of ignored prefixes for module index sorting.
101 | #modindex_common_prefix = []
102 |
103 | # If true, keep warnings as "system message" paragraphs in the built documents.
104 | #keep_warnings = False
105 |
106 |
107 | # -- Options for HTML output ----------------------------------------------
108 |
109 | # The theme to use for HTML and HTML Help pages. See the documentation for
110 | # a list of builtin themes.
111 | html_theme = 'default'
112 |
113 | # Theme options are theme-specific and customize the look and feel of a theme
114 | # further. For a list of options available for each theme, see the
115 | # documentation.
116 | #html_theme_options = {}
117 |
118 | # Add any paths that contain custom themes here, relative to this directory.
119 | #html_theme_path = []
120 |
121 | # The name for this set of Sphinx documents. If None, it defaults to
122 | # "<project> v<release> documentation".
123 | #html_title = None
124 |
125 | # A shorter title for the navigation bar. Default is the same as html_title.
126 | #html_short_title = None
127 |
128 | # The name of an image file (relative to this directory) to place at the top
129 | # of the sidebar.
130 | #html_logo = None
131 |
132 | # The name of an image file (within the static path) to use as favicon of the
133 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
134 | # pixels large.
135 | #html_favicon = None
136 |
137 | # Add any paths that contain custom static files (such as style sheets) here,
138 | # relative to this directory. They are copied after the builtin static files,
139 | # so a file named "default.css" will overwrite the builtin "default.css".
140 | html_static_path = ['_static']
141 |
142 | # Add any extra paths that contain custom files (such as robots.txt or
143 | # .htaccess) here, relative to this directory. These files are copied
144 | # directly to the root of the documentation.
145 | #html_extra_path = []
146 |
147 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
148 | # using the given strftime format.
149 | #html_last_updated_fmt = '%b %d, %Y'
150 |
151 | # If true, SmartyPants will be used to convert quotes and dashes to
152 | # typographically correct entities.
153 | #html_use_smartypants = True
154 |
155 | # Custom sidebar templates, maps document names to template names.
156 | #html_sidebars = {}
157 |
158 | # Additional templates that should be rendered to pages, maps page names to
159 | # template names.
160 | #html_additional_pages = {}
161 |
162 | # If false, no module index is generated.
163 | #html_domain_indices = True
164 |
165 | # If false, no index is generated.
166 | #html_use_index = True
167 |
168 | # If true, the index is split into individual pages for each letter.
169 | #html_split_index = False
170 |
171 | # If true, links to the reST sources are added to the pages.
172 | #html_show_sourcelink = True
173 |
174 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
175 | #html_show_sphinx = True
176 |
177 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
178 | #html_show_copyright = True
179 |
180 | # If true, an OpenSearch description file will be output, and all pages will
181 | # contain a <link> tag referring to it. The value of this option must be the
182 | # base URL from which the finished HTML is served.
183 | #html_use_opensearch = ''
184 |
185 | # This is the file name suffix for HTML files (e.g. ".xhtml").
186 | #html_file_suffix = None
187 |
188 | # Output file base name for HTML help builder.
189 | htmlhelp_basename = 'Kagglerdoc'
190 |
191 |
192 | # -- Options for LaTeX output ---------------------------------------------
193 |
194 | latex_elements = {
195 | # The paper size ('letterpaper' or 'a4paper').
196 | #'papersize': 'letterpaper',
197 |
198 | # The font size ('10pt', '11pt' or '12pt').
199 | #'pointsize': '10pt',
200 |
201 | # Additional stuff for the LaTeX preamble.
202 | #'preamble': '',
203 | }
204 |
205 | # Grouping the document tree into LaTeX files. List of tuples
206 | # (source start file, target name, title,
207 | # author, documentclass [howto, manual, or own class]).
208 | latex_documents = [
209 | ('index', 'Kaggler.tex', u'Kaggler Documentation',
210 | u'Jeong-Yoon Lee', 'manual'),
211 | ]
212 |
213 | # The name of an image file (relative to this directory) to place at the top of
214 | # the title page.
215 | #latex_logo = None
216 |
217 | # For "manual" documents, if this is true, then toplevel headings are parts,
218 | # not chapters.
219 | #latex_use_parts = False
220 |
221 | # If true, show page references after internal links.
222 | #latex_show_pagerefs = False
223 |
224 | # If true, show URL addresses after external links.
225 | #latex_show_urls = False
226 |
227 | # Documents to append as an appendix to all manuals.
228 | #latex_appendices = []
229 |
230 | # If false, no module index is generated.
231 | #latex_domain_indices = True
232 |
233 |
234 | # -- Options for manual page output ---------------------------------------
235 |
236 | # One entry per manual page. List of tuples
237 | # (source start file, name, description, authors, manual section).
238 | man_pages = [
239 | ('index', 'kaggler', u'Kaggler Documentation',
240 | [u'Jeong-Yoon Lee'], 1)
241 | ]
242 |
243 | # If true, show URL addresses after external links.
244 | #man_show_urls = False
245 |
246 |
247 | # -- Options for Texinfo output -------------------------------------------
248 |
249 | # Grouping the document tree into Texinfo files. List of tuples
250 | # (source start file, target name, title, author,
251 | # dir menu entry, description, category)
252 | texinfo_documents = [
253 | ('index', 'Kaggler', u'Kaggler Documentation',
254 | u'Jeong-Yoon Lee', 'Kaggler', 'One line description of project.',
255 | 'Miscellaneous'),
256 | ]
257 |
258 | # Documents to append as an appendix to all manuals.
259 | #texinfo_appendices = []
260 |
261 | # If false, no module index is generated.
262 | #texinfo_domain_indices = True
263 |
264 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
265 | #texinfo_show_urls = 'footnote'
266 |
267 | # If true, do not generate a @detailmenu in the "Top" node's menu.
268 | #texinfo_no_detailmenu = False
269 |
270 |
271 | # Example configuration for intersphinx: refer to the Python standard library.
272 | intersphinx_mapping = {'http://docs.python.org/': None}
273 |
274 | # Napoleon settings
275 | napoleon_google_docstring = True
276 | napoleon_numpy_docstring = True
277 | napoleon_include_private_with_doc = False
278 | napoleon_include_special_with_doc = True
279 | napoleon_use_admonition_for_examples = False
280 | napoleon_use_admonition_for_notes = False
281 | napoleon_use_admonition_for_references = False
282 | napoleon_use_ivar = False
283 | napoleon_use_param = True
284 | napoleon_use_rtype = True
285 |
--------------------------------------------------------------------------------
/doc/index.rst:
--------------------------------------------------------------------------------
1 | kaggler package
2 | ===============
3 |
4 | Subpackages
5 | -----------
6 |
7 | .. toctree::
8 |
9 | kaggler.metrics
10 | kaggler.online_model
11 | kaggler.preprocessing
12 | kaggler.test
13 |
14 | Submodules
15 | ----------
16 |
17 | kaggler.const module
18 | --------------------
19 |
20 | .. automodule:: kaggler.const
21 | :members:
22 | :undoc-members:
23 | :show-inheritance:
24 |
25 | kaggler.data_io module
26 | ----------------------
27 |
28 | .. automodule:: kaggler.data_io
29 | :members:
30 | :undoc-members:
31 | :show-inheritance:
32 |
33 | kaggler.util module
34 | -------------------
35 |
36 | .. automodule:: kaggler.util
37 | :members:
38 | :undoc-members:
39 | :show-inheritance:
40 |
41 |
42 | Module contents
43 | ---------------
44 |
45 | .. automodule:: kaggler
46 | :members:
47 | :undoc-members:
48 | :show-inheritance:
49 |
--------------------------------------------------------------------------------
/doc/kaggler.metrics.rst:
--------------------------------------------------------------------------------
1 | kaggler.metrics package
2 | ============================
3 |
4 | Submodules
5 | ----------
6 |
7 | kaggler.metrics.classification module
8 | -------------------------------------
9 |
10 | .. automodule:: kaggler.metrics.classification
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | kaggler.metrics.regression module
16 | ---------------------------------
17 |
18 | .. automodule:: kaggler.metrics.regression
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 | Module contents
24 | ---------------
25 |
26 | .. automodule:: kaggler.metrics
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
--------------------------------------------------------------------------------
/doc/kaggler.model.rst:
--------------------------------------------------------------------------------
1 | kaggler.model package
2 | =====================
3 |
4 | Submodules
5 | ----------
6 |
7 | kaggler.model.nn module
8 | -----------------------
9 |
10 | .. automodule:: kaggler.model.nn
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 |
16 | Module contents
17 | ---------------
18 |
19 | .. automodule:: kaggler.model
20 | :members:
21 | :undoc-members:
22 | :show-inheritance:
23 |
--------------------------------------------------------------------------------
/doc/kaggler.online_model.rst:
--------------------------------------------------------------------------------
1 | kaggler.online_model package
2 | ============================
3 |
4 | Submodules
5 | ----------
6 |
7 | kaggler.online_model.fm module
8 | ------------------------------
9 |
10 | .. automodule:: kaggler.online_model.fm
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | kaggler.online_model.ftrl module
16 | --------------------------------
17 |
18 | .. automodule:: kaggler.online_model.ftrl
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 | kaggler.online_model.ftrl_dropout module
24 | ----------------------------------------
25 |
26 | .. automodule:: kaggler.online_model.ftrl_dropout
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 | kaggler.online_model.nn module
32 | ------------------------------
33 |
34 | .. automodule:: kaggler.online_model.nn
35 | :members:
36 | :undoc-members:
37 | :show-inheritance:
38 |
39 | kaggler.online_model.nn_h2 module
40 | ---------------------------------
41 |
42 | .. automodule:: kaggler.online_model.nn_h2
43 | :members:
44 | :undoc-members:
45 | :show-inheritance:
46 |
47 | kaggler.online_model.sgd module
48 | -------------------------------
49 |
50 | .. automodule:: kaggler.online_model.sgd
51 | :members:
52 | :undoc-members:
53 | :show-inheritance:
54 |
55 |
56 | Module contents
57 | ---------------
58 |
59 | .. automodule:: kaggler.online_model
60 | :members:
61 | :undoc-members:
62 | :show-inheritance:
63 |
--------------------------------------------------------------------------------
/doc/kaggler.preprocessing.rst:
--------------------------------------------------------------------------------
1 | kaggler.preprocessing package
2 | =============================
3 |
4 | Submodules
5 | ----------
6 |
7 | kaggler.preprocessing.data module
8 | ---------------------------------
9 |
10 | .. automodule:: kaggler.preprocessing.data
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | Module contents
16 | ---------------
17 |
18 | .. automodule:: kaggler.preprocessing
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
--------------------------------------------------------------------------------
/doc/kaggler.rst:
--------------------------------------------------------------------------------
1 | kaggler package
2 | ===============
3 |
4 | Subpackages
5 | -----------
6 |
7 | .. toctree::
8 |
9 | kaggler.metrics
10 | kaggler.online_model
11 | kaggler.preprocessing
12 | kaggler.test
13 |
14 | Submodules
15 | ----------
16 |
17 | kaggler.const module
18 | --------------------
19 |
20 | .. automodule:: kaggler.const
21 | :members:
22 | :undoc-members:
23 | :show-inheritance:
24 |
25 | kaggler.data_io module
26 | ----------------------
27 |
28 | .. automodule:: kaggler.data_io
29 | :members:
30 | :undoc-members:
31 | :show-inheritance:
32 |
33 | kaggler.util module
34 | -------------------
35 |
36 | .. automodule:: kaggler.util
37 | :members:
38 | :undoc-members:
39 | :show-inheritance:
40 |
41 |
42 | Module contents
43 | ---------------
44 |
45 | .. automodule:: kaggler
46 | :members:
47 | :undoc-members:
48 | :show-inheritance:
49 |
--------------------------------------------------------------------------------
/doc/kaggler.test.rst:
--------------------------------------------------------------------------------
1 | kaggler.test package
2 | ====================
3 |
4 | Submodules
5 | ----------
6 |
7 | kaggler.test.test_sgd module
8 | ----------------------------
9 |
10 | .. automodule:: kaggler.test.test_sgd
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 |
16 | Module contents
17 | ---------------
18 |
19 | .. automodule:: kaggler.test
20 | :members:
21 | :undoc-members:
22 | :show-inheritance:
23 |
--------------------------------------------------------------------------------
/doc/modules.rst:
--------------------------------------------------------------------------------
1 | kaggler
2 | =======
3 |
4 | .. toctree::
5 | :maxdepth: 4
6 |
7 | kaggler
8 |
--------------------------------------------------------------------------------
/kaggler/__init__.py:
--------------------------------------------------------------------------------
# Package version string.
__version__ = '0.4.1'
# Public subpackages/modules exported by the package.
__all__ = ['const',
           'data_io',
           'metrics',
           'model',
           'online_model',
           'preprocessing',
           'util']
9 |
--------------------------------------------------------------------------------
/kaggler/const.py:
--------------------------------------------------------------------------------
# Fixed random seed for reproducible runs (consumers outside this view).
FIXED_SEED = 2015
# Seconds per minute; used e.g. by model.nn to report elapsed time in minutes.
SEC_PER_MIN = 60
3 |
--------------------------------------------------------------------------------
/kaggler/data_io.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import load_svmlight_file
2 |
3 | import heapq
4 | import numpy as np
5 |
6 |
def is_number(s):
    """Check if a value can be parsed as a number.

    Args:
        s: value to test; anything accepted by ``float()`` counts.

    Returns:
        bool: True if ``float(s)`` succeeds, False otherwise.
    """

    try:
        float(s)
        return True
    except (ValueError, TypeError):
        # ValueError: non-numeric string such as 'abc'.
        # TypeError: non-string/number input such as None or a list,
        # which previously propagated out of this function.
        return False
15 |
16 |
def load_data(path, dense=False):
    """Load data from a CSV or libsvm format file.

    Args:
        path (str): A path to the CSV or libsvm format file containing data.
        dense (boolean): An optional variable indicating if the return matrix
            should be dense. By default, it is false.

    Returns:
        Feature matrix X and target vector y.

    Raises:
        NotImplementedError: if the file is neither CSV nor libsvm format.
    """

    # Peek at the first line to detect the file format.
    with open(path, 'r') as f:
        line = f.readline().strip()

    if ':' in line:
        # libsvm format: "label index:value index:value ..."
        X, y = load_svmlight_file(path)
        X = X.astype(np.float32)
        if dense:
            X = X.todense()
    elif ',' in line:
        # CSV with the target in the first column; skip a header row
        # unless the first field of the first line is numeric.
        X = np.loadtxt(path, delimiter=',',
                       skiprows=0 if is_number(line.split(',')[0]) else 1)
        y = X[:, 0]
        X = X[:, 1:]
    else:
        # Python 3 compatible raise (was the Python-2-only comma syntax,
        # a SyntaxError on Python 3).
        raise NotImplementedError("Neither CSV nor LibSVM formatted file.")

    return X, y
43 |
44 |
def read_sps(path):
    """Iterate over a space-separated sparse file.

    Each line is expected to look like ``"label feat1 feat2 ..."``.
    Yields a ``(features, label)`` pair per line, where ``features`` is
    the list of raw feature tokens and ``label`` is an int.
    """
    with open(path) as handle:
        for row in handle:
            tokens = row.rstrip().split(' ')
            yield tokens[1:], int(tokens[0])
51 |
52 |
def shuf_file(f, shuf_win):
    """Yield the lines of ``f`` in a pseudo-shuffled order.

    A bounded min-heap of size ``shuf_win`` keyed on each line's hash is
    used: once the window is full, every new line displaces the smallest
    buffered entry, so lines come out roughly hash-ordered within a
    sliding window rather than fully randomized.
    """
    window = []
    for line in f:
        entry = (hash(line), line)
        if len(window) < shuf_win:
            heapq.heappush(window, entry)
        else:
            _, shuffled = heapq.heappushpop(window, entry)
            yield shuffled

    # Drain whatever is still buffered in the window.
    while window:
        _, shuffled = heapq.heappop(window)
        yield shuffled
66 |
--------------------------------------------------------------------------------
/kaggler/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | from .classification import auc
2 | from .classification import logloss
3 | from .regression import gini
4 | from .regression import rmse
5 |
--------------------------------------------------------------------------------
/kaggler/metrics/classification.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | from sklearn.metrics import roc_auc_score as auc
3 | from sklearn.metrics import log_loss
4 |
5 |
def logloss(y, p):
    """Bounded log loss error.

    Args:
        y (numpy.array): target
        p (numpy.array): prediction

    Returns:
        bounded log loss error
    """

    # Clip on a copy so the caller's prediction array is NOT mutated.
    # The previous version clipped ``p`` in place as a hidden side effect.
    p = p.copy()
    p[p < 1e-15] = 1e-15
    p[p > 1 - 1e-15] = 1 - 1e-15
    return log_loss(y, p)
20 |
--------------------------------------------------------------------------------
/kaggler/metrics/regression.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | from sklearn.metrics import mean_squared_error
3 |
4 | import numpy as np
5 |
6 |
def rmse(y, p):
    """Root Mean Squared Error (RMSE).

    Args:
        y (numpy.array): target
        p (numpy.array): prediction

    Returns:
        e (numpy.float64): RMSE
    """

    # Shapes must agree before the element-wise error is meaningful.
    assert y.shape == p.shape

    mse = mean_squared_error(y, p)
    return np.sqrt(mse)
22 |
23 |
def gini(y, p):
    """Normalized Gini Coefficient.

    Args:
        y (numpy.array): target
        p (numpy.array): prediction

    Returns:
        e (numpy.float64): normalized Gini coefficient
    """

    # Shapes must agree before the coefficient is meaningful.
    assert y.shape == p.shape
    n = y.shape[0]

    # Targets reordered from largest to smallest, once by the target
    # itself and once by the prediction.
    stacked = np.array([y, p]).transpose()
    by_true = stacked[stacked[:, 0].argsort()][::-1, 0]
    by_pred = stacked[stacked[:, 1].argsort()][::-1, 0]

    # Lorenz curves for both orderings plus the diagonal baseline.
    lorenz_true = np.cumsum(by_true) / np.sum(by_true)
    lorenz_pred = np.cumsum(by_pred) / np.sum(by_pred)
    baseline = np.linspace(1 / n, 1, n)

    # Gini coefficients: area between each Lorenz curve and the baseline.
    area_true = np.sum(baseline - lorenz_true)
    area_pred = np.sum(baseline - lorenz_pred)

    # Normalize by the Gini of a perfect prediction.
    return area_pred / area_true
57 |
--------------------------------------------------------------------------------
/kaggler/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .nn import NN
2 |
--------------------------------------------------------------------------------
/kaggler/model/nn.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | from scipy import sparse
3 | from scipy.optimize import minimize
4 | from sklearn.metrics import roc_auc_score
5 |
6 | import logging
7 | import numpy as np
8 | import time
9 |
10 | from ..const import SEC_PER_MIN
11 |
12 |
13 | class NN(object):
14 | """Implement a neural network with a single h layer."""
15 |
    def __init__(self, n=5, h=10, b=100000, l1=.0, l2=.0, random_state=None):
        """Initialize the NN class object.

        Args:
            n (int): number of epoches
            h (int): number of h nodes
            b (int): number of input examples to be processed together to find
                the second order gradient for back-propagation
            l1 (float): regularization parameter for weights between the input
                and hidden layers
            l2 (float): regularization parameter for weights between the hidden
                and output layers.
            random_state (int or None): seed passed to ``np.random.seed``.
        """

        # NOTE(review): seeds numpy's *global* RNG as a side effect, which
        # affects other code using np.random in the same process.
        np.random.seed(random_state)
        self.h = h        # number of hidden nodes
        self.b = b        # mini-batch size used by fit()
        self.n = n        # number of training epochs
        self.l1 = l1
        self.l2 = l2
        self.n_opt = 0    # epoch at which the best validation AUC was seen
37 |
38 | def fit(self, X, y, X_val=None, y_val=None):
39 | """Train a network with the quasi-Newton method.
40 |
41 | Args:
42 | X (np.array of float): feature matrix for training
43 | y (np.array of float): target values for training
44 | X_val (np.array of float): feature matrix for validation
45 | y_val (np.array of float): target values for validation
46 | """
47 | y = y.reshape((len(y), 1))
48 |
49 | if sparse.issparse(X):
50 | X = X.tocsr()
51 |
52 | if X_val is not None:
53 | n_val = len(y_val)
54 | y_val = y_val.reshape((n_val, 1))
55 |
56 | # Set initial weights randomly.
57 | self.i = X.shape[1]
58 | self.l1 = self.l1 / self.i
59 | self.w = (np.random.rand((self.i + 2) * self.h + 1) - .5) * 1e-6
60 | self.w_opt = self.w
61 | self.n_opt = 0
62 |
63 | logging.info('training ...')
64 | n_obs = X.shape[0]
65 | batch = self.b
66 | n_epoch = self.n
67 | idx = range(n_obs)
68 | self.auc_opt = .5
69 |
70 | start = time.time()
71 | print('\tEPOCH TRAIN VALID BEST TIME (m)')
72 | print('\t--------------------------------------------')
73 |
74 | # Before training
75 | p = self.predict_raw(X)
76 | auc = roc_auc_score(y, p)
77 | auc_val = auc
78 | if X_val is not None:
79 | p_val = self.predict_raw(X_val)
80 | auc_val = roc_auc_score(y_val, p_val)
81 |
82 | print('\t{:3d}: {:.6f} {:.6f} {:.6f} {:.2f}'.format(
83 | 0, auc, auc_val, self.auc_opt,
84 | (time.time() - start) / SEC_PER_MIN))
85 |
86 | # Use 'while' instead of 'for' to increase n_epoch if the validation
87 | # error keeps improving at the end of n_epoch
88 | epoch = 1
89 | while epoch <= n_epoch:
90 | # Shuffle inputs every epoch - it helps avoiding the local optimum
91 | # when batch < n_obs.
92 | np.random.shuffle(idx)
93 |
94 | # Find the optimal weights for batch input examples.
95 | # If batch == 1, it's the stochastic optimization, which is slow
96 | # but uses minimal memory. If batch == n_obs, it's the batch
97 | # optimization, which is fast but uses maximum memory.
98 | # Otherwise, it's the mini-batch optimization, which balances the
99 | # speed and space trade-offs.
100 | for i in range(int(n_obs / batch) + 1):
101 | if (i + 1) * batch > n_obs:
102 | sub_idx = idx[batch * i:n_obs]
103 | else:
104 | sub_idx = idx[batch * i:batch * (i + 1)]
105 |
106 | x = X[sub_idx]
107 | neg_idx = [n_idx for n_idx, n_y in enumerate(y[sub_idx]) if n_y == 0.]
108 | pos_idx = [p_idx for p_idx, p_y in enumerate(y[sub_idx]) if p_y == 1.]
109 | x0 = x[neg_idx]
110 | x1 = x[pos_idx]
111 | # Update weights to minimize the cost function using the
112 | # quasi-Newton method (L-BFGS-B), where:
113 | # func -- cost function
114 | # jac -- jacobian (derivative of the cost function)
115 | # maxiter -- number of iterations for L-BFGS-B
116 | ret = minimize(self.func,
117 | self.w,
118 | args=(x0, x1),
119 | method='L-BFGS-B',
120 | jac=self.fprime,
121 | options={'maxiter': 5})
122 | self.w = ret.x
123 |
124 | p = self.predict_raw(X)
125 | auc = roc_auc_score(y, p)
126 | auc_val = auc
127 |
128 | if X_val is not None:
129 | p_val = self.predict_raw(X_val)
130 | auc_val = roc_auc_score(y_val, p_val)
131 |
132 | if auc_val > self.auc_opt:
133 | self.auc_opt = auc_val
134 | self.w_opt = self.w
135 | self.n_opt = epoch
136 |
137 | # If validation auc is still improving after n_epoch,
138 | # try 10 more epochs
139 | if epoch == n_epoch:
140 | n_epoch += 5
141 |
142 | print('\t{:3d}: {:.6f} {:.6f} {:.6f} {:.2f}'.format(
143 | epoch, auc, auc_val, self.auc_opt,
144 | (time.time() - start) / SEC_PER_MIN))
145 |
146 | epoch += 1
147 |
148 | if X_val is not None:
149 | print('Optimal epoch is {0} ({1:.6f})'.format(self.n_opt,
150 | self.auc_opt))
151 | self.w = self.w_opt
152 |
153 | logging.info('done training')
154 |
155 | def predict(self, X):
156 | """Predict targets for a feature matrix.
157 |
158 | Args:
159 | X (np.array of float): feature matrix for prediction
160 |
161 | Returns:
162 |
163 | """
164 | logging.info('predicting ...')
165 | ps = self.predict_raw(X)
166 |
167 | return sigm(ps[:, 0])
168 |
169 | def predict_raw(self, X):
170 | """Predict targets for a feature matrix.
171 |
172 | Args:
173 | X (np.array of float): feature matrix for prediction
174 | """
175 | # b -- bias for the input and h layers
176 | b = np.ones((X.shape[0], 1))
177 | w2 = self.w[-(self.h + 1):].reshape(self.h + 1, 1)
178 | w1 = self.w[:-(self.h + 1)].reshape(self.i + 1, self.h)
179 |
180 | # Make X to have the same number of columns as self.i.
181 | # Because of the sparse matrix representation, X for prediction can
182 | # have a different number of columns.
183 | if X.shape[1] > self.i:
184 | # If X has more columns, cut extra columns.
185 | X = X[:, :self.i]
186 | elif X.shape[1] < self.i:
187 | # If X has less columns, cut the rows of the weight matrix between
188 | # the input and h layers instead of X itself because the SciPy
189 | # sparse matrix does not support .set_shape() yet.
190 | idx = range(X.shape[1])
191 | idx.append(self.i) # Include the last row for the bias
192 | w1 = w1[idx, :]
193 |
194 | if sparse.issparse(X):
195 | return np.hstack((sigm(sparse.hstack((X, b)).dot(w1)), b)).dot(w2)
196 | else:
197 | return np.hstack((sigm(np.hstack((X, b)).dot(w1)), b)).dot(w2)
198 |
    def func(self, w, *args):
        """Return the cost of the network for a candidate weight vector.

        The cost is a pairwise ranking loss: n sampled
        (negative, positive) prediction pairs are scored with
        (1 - p1 + p0) ** 2, which shrinks as positive examples score
        higher than negative ones (an AUC-style objective).

        Args:
            w (array of float): weight vectors such that:
                w[:-h1] -- weights between the input and h layers
                w[-h1:] -- weights between the h and output layers
            args: negative examples (args[0]) and positive examples (args[1])

        Returns:
            combined cost of the average pairwise squared loss and the two
            regularization terms
        """
        # x0 -- negative examples, x1 -- positive examples
        x0 = args[0]
        x1 = args[1]

        n0 = x0.shape[0]
        n1 = x1.shape[0]

        # n -- number of (negative, positive) pairs to evaluate, sampled
        # with replacement from each class.
        n = max(n0, n1) * 10
        idx0 = np.random.choice(range(n0), size=n)
        idx1 = np.random.choice(range(n1), size=n)

        # b -- bias for the input and h layers
        b0 = np.ones((n0, 1))
        b1 = np.ones((n1, 1))
        i1 = self.i + 1
        h = self.h
        h1 = h + 1

        # Predict for features -- cannot use predict_raw() because here
        # different weights can be used.
        if sparse.issparse(x0):
            p0 = np.hstack((sigm(sparse.hstack((x0, b0)).dot(w[:-h1].reshape(
                i1, h))), b0)).dot(w[-h1:].reshape(h1, 1))
            p1 = np.hstack((sigm(sparse.hstack((x1, b1)).dot(w[:-h1].reshape(
                i1, h))), b1)).dot(w[-h1:].reshape(h1, 1))
        else:
            p0 = np.hstack((sigm(np.hstack((x0, b0)).dot(w[:-h1].reshape(
                i1, h))), b0)).dot(w[-h1:].reshape(h1, 1))
            p1 = np.hstack((sigm(np.hstack((x1, b1)).dot(w[:-h1].reshape(
                i1, h))), b1)).dot(w[-h1:].reshape(h1, 1))

        # Keep only the sampled pairs.
        p0 = p0[idx0]
        p1 = p1[idx1]

        # Return the cost that consists of the average pairwise squared loss +
        # regularization for weights between the input and h layers +
        # regularization for weights between the h and output layers.
        # NOTE(review): despite the attribute names, both penalty terms are
        # squared (L2-style) norms; self.l1 merely scales the first one.
        #return .5 * (sum((1 - sigm(p1 - p0)) ** 2) + self.l1 * sum(w[:-h1] ** 2) +
        return .5 * (sum((1 - p1 + p0) ** 2) / n +
                     self.l1 * sum(w[:-h1] ** 2) / (i1 * h) +
                     self.l2 * sum(w[-h1:] ** 2) / h1)
252 |
    def fprime(self, w, *args):
        """Return the gradient of the cost function at w.

        Uses the same pairwise sampling scheme as func(), but the pair
        indices are redrawn independently here, so this is a stochastic
        gradient estimate rather than the exact derivative of the value
        func() returned for the same w.

        Args:
            w (array of float): weight vectors such that:
                w[:-h1] -- weights between the input and h layers
                w[-h1:] -- weights between the h and output layers
            args: negative examples (args[0]) and positive examples (args[1])

        Returns:
            gradients of the cost function for predictions
        """

        # x0 -- negative examples, x1 -- positive examples
        x0 = args[0]
        x1 = args[1]

        n0 = x0.shape[0]
        n1 = x1.shape[0]

        # n -- number of (negative, positive) pairs to evaluate
        n = max(n0, n1) * 10
        idx0 = np.random.choice(range(n0), size=n)
        idx1 = np.random.choice(range(n1), size=n)

        # b -- bias for the input and h layers
        b = np.ones((n, 1))
        i1 = self.i + 1
        h = self.h
        h1 = h + 1

        # w1 -- input-to-hidden weights, w2 -- hidden-to-output weights
        w2 = w[-h1:].reshape(h1, 1)
        w1 = w[:-h1].reshape(i1, h)

        # Select the sampled rows and append the bias column.
        if sparse.issparse(x0):
            x0 = x0.tocsr()[idx0]
            x1 = x1.tocsr()[idx1]
            xb0 = sparse.hstack((x0, b))
            xb1 = sparse.hstack((x1, b))
        else:
            x0 = x0[idx0]
            x1 = x1[idx1]
            xb0 = np.hstack((x0, b))
            xb1 = np.hstack((x1, b))

        # Forward pass: z -- hidden activations (plus bias column),
        # y -- raw output scores.
        z0 = np.hstack((sigm(xb0.dot(w1)), b))
        z1 = np.hstack((sigm(xb1.dot(w1)), b))
        y0 = z0.dot(w2)
        y1 = z1.dot(w2)

        # e -- per-pair residual of the (1 - p1 + p0) ** 2 loss in func().
        #e = 1 - sigm(y1 - y0)
        #dy = e * dsigm(y1 - y0)
        e = 1 - (y1 - y0)
        dy = e / n

        # Calculate the derivative of the cost function w.r.t. F and w2 where:
        # F -- weights between the input and h layers
        # w2 -- weights between the h and output layers
        dw1 = -(xb1.T.dot(dy.dot(w2[:-1].reshape(1, h)) * dsigm(xb1.dot(w1))) -
                xb0.T.dot(dy.dot(w2[:-1].reshape(1, h)) * dsigm(xb0.dot(w1)))
                ).reshape(i1 * h) + self.l1 * w[:-h1] / (i1 * h)
        dw2 = -(z1 - z0).T.dot(dy).reshape(h1) + self.l2 * w[-h1:] / h1

        return np.append(dw1, dw2)
316 |
317 |
def sigm(x):
    """Return the value of the sigmoid function at x.

    Args:
        x (np.array of float or float)

    Returns:
        value(s) of the sigmoid function for x.
    """
    # Cap the exponent's input from below so np.exp() cannot overflow;
    # for x <= -20 the sigmoid is already ~2e-9, so the cap does not
    # meaningfully affect the result.
    capped = np.maximum(x, -20)
    return 1 / (1 + np.exp(-capped))
331 |
332 |
def dsigm(x):
    """Return the value of derivative of sigmoid function w.r.t. x.

    Args:
        x (np.array of float or float)

    Returns:
        derivative(s) of the sigmoid function w.r.t. x.
    """
    # d/dx sigm(x) = sigm(x) * (1 - sigm(x)); evaluate the sigmoid once
    # and reuse it.
    s = sigm(x)
    return s * (1 - s)
343 |
--------------------------------------------------------------------------------
/kaggler/online_model/DecisionTree/OnlineClassificationTree.py:
--------------------------------------------------------------------------------
1 | from _tree import Tree
2 | from OnlineDecisionTree import *
3 | from utils import *
4 | import numpy as np
5 | import pandas as pd
6 |
class ClassificationTree(Tree):
    """Online classification tree node using Gini impurity for splits."""

    def __init__(
        self,
        number_of_features,
        number_of_functions=10,
        min_sample_split=200,
        predict_initialize=None,
    ):
        """Initialize a tree node.

        Args:
            number_of_features: total number of input features
            number_of_functions: number of randomly selected candidate
                split features
            min_sample_split: number of samples required before a split
                is attempted
            predict_initialize: statistics inherited from the parent node
                ({'count_dict': ...}); defaults to empty counts
        """
        # Use a None sentinel instead of a mutable default dict so that
        # every node gets its own statistics dictionary.
        if predict_initialize is None:
            predict_initialize = {'count_dict': {}}
        # Constant values
        self.number_of_features = number_of_features
        self.number_of_functions = number_of_functions
        self.min_sample_split = min_sample_split
        self.predict_initialize = predict_initialize
        self.max_sample = 1000
        # Dynamic values
        self.left = None
        self.right = None
        self.randomly_selected_features = []
        self._randomly_select()
        self.criterion = None

    def _calculate_split_score(self, split):
        """
        calculate the score of the split:
            score = current_error - after_split_error
        """
        left_error = gini(split['left'])
        right_error = gini(split['right'])
        error = gini(self.Y)
        # if the split is any good, the score should be greater than 0
        total = float(len(self.Y))
        score = error - 1 / total * (len(split['left']) * left_error\
                + len(split['right']) * right_error)
        return score

    def _apply_best_split(self):
        """Turn this leaf into an internal node using the best split found,
        creating left/right children that inherit the split's counts."""
        best_split, best_split_score = self._find_best_split()
        if best_split_score > 0:
            self.criterion = lambda x : x[best_split['feature']] \
                    > best_split['value']
            # create the left child
            self.left = ClassificationTree(
                number_of_features=self.number_of_features,
                number_of_functions=self.number_of_functions,
                min_sample_split=self.min_sample_split,
                predict_initialize={
                    'count_dict': count_dict(best_split['left']),
                }
            )
            # create the right child
            self.right = ClassificationTree(
                number_of_features=self.number_of_features,
                number_of_functions=self.number_of_functions,
                min_sample_split=self.min_sample_split,
                predict_initialize={
                    'count_dict': count_dict(best_split['right']),
                }
            )
            # Collect garbage
            self.samples = {}
            self.Y = []

    def predict(self, x):
        """
        Make prediction recursively. Use both the samples inside the current
        node and the statistics inherited from parent.
        """
        if self._is_leaf():
            d1 = self.predict_initialize['count_dict']
            d2 = count_dict(self.Y)
            # items() works on both Python 2 and 3 (iteritems() is 2-only).
            for key, value in d1.items():
                if key in d2:
                    d2[key] += value
                else:
                    d2[key] = value
            return argmax(d2)
        else:
            if self.criterion(x):
                return self.right.predict(x)
            else:
                return self.left.predict(x)
93 |
--------------------------------------------------------------------------------
/kaggler/online_model/DecisionTree/_tree.pyx:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 | cimport numpy as np
4 |
5 | from utils import *
6 |
7 | ctypedef np.int_t DTYPE_t
8 |
cdef class Tree:
    """Base class for an online decision tree node.

    Each node buffers (feature value, target) samples for a random subset
    of features until it has seen enough examples, then applies the best
    binary split and recursively delegates to its children.
    """

    def __cinit__(
        self,
        int number_of_features,
        int number_of_functions=10,
        int min_sample_split=20,
        dict predict_initialize=None
    ):
        # Use a None sentinel instead of a mutable default dict so every
        # node gets its own statistics dictionary.
        if predict_initialize is None:
            predict_initialize = {
                'mean': 2.0,
                'variance': 1.0,
                'num_samples': 0
            }
        # Constant values
        self.number_of_features = number_of_features
        self.number_of_functions = number_of_functions
        self.min_sample_split = min_sample_split
        self.predict_initialize = predict_initialize
        self.max_sample = 100
        # Dynamic values
        self.left = None
        self.right = None
        self.randomly_selected_features = []
        self._randomly_select()
        self.criterion = None

    def _randomly_select(self):
        """Pick number_of_functions distinct feature indices at random and
        reset the per-feature sample buffers."""
        # Check the number of randomly selected features
        if self.number_of_features < self.number_of_functions:
            raise Exception("The feature number is more than maximum")

        # Randomly select features into a set, and then transform to a list
        self.randomly_selected_features = set([])
        while len(self.randomly_selected_features) < self.number_of_functions:
            self.randomly_selected_features.add(\
                random.randint(0, self.number_of_features-1))
        self.randomly_selected_features = list(self.randomly_selected_features)

        # Initialize the samples belong to the node
        self.samples = {}
        self.Y = []
        for feature in self.randomly_selected_features:
            self.samples[feature] = []

    def _is_leaf(self):
        # A node is a leaf until a split criterion has been set; compare
        # with `is` since None is a singleton.
        return self.criterion is None

    cpdef update(self, np.ndarray x, y):
        """
        Update the model according to a single (x, y) input.

        If the current node is a leaf, then update the samples of the
        current node.

        Else update its left or right node recursively according to the
        value of x.
        When the left and right child are created, they inherit mean and
        sample count information from the parent.
        """
        cdef int N
        if self._is_leaf():
            N = len(self.Y)
            # Stop buffering once max_sample examples are stored.
            if N <= self.max_sample:
                self._update_samples(x, y)
            # Attempt a split at two fixed sample-count checkpoints.
            if N == self.min_sample_split or N == 2 * self.min_sample_split:
                self._apply_best_split()

        else:
            if self.criterion(x):
                self.right.update(x, y)
            else:
                self.left.update(x, y)

    cpdef _update_samples(self, np.ndarray x, DTYPE_t y):
        """Append (feature value, target) to every selected feature's buffer."""
        cdef int feature
        for feature in self.randomly_selected_features:
            self.samples[feature].append((x[feature], y))
        self.Y.append(y)

    cpdef tuple _find_best_split(self):
        """Return (best_split dict, best score) over all buffered candidates;
        the dict is empty when no split beats a score of 0."""
        cdef dict best_split = {}
        cdef double best_split_score = 0
        cdef int feature
        cdef double value
        cdef DTYPE_t prediction
        cdef list sample_feature
        cdef list left, right
        cdef dict split
        cdef double split_score
        # Try all the selected features and values combination, find the best
        for feature in self.randomly_selected_features:
            for (value, prediction) in self.samples[feature]:
                sample_feature = self.samples[feature]
                left, right = bin_split(sample_feature, value)

                split = {
                    'left': left,
                    'right': right,
                    'value': value,
                    'feature': feature,
                }

                split_score = self._calculate_split_score(split)
                if split_score > best_split_score:
                    best_split = split
                    best_split_score = split_score

        return best_split, best_split_score
118 |
--------------------------------------------------------------------------------
/kaggler/online_model/DecisionTree/test.py:
--------------------------------------------------------------------------------
1 | import profile
2 | import numpy as np
3 | import pandas as pd
4 | from sklearn import preprocessing
5 | from OnlineClassificationTree import *
6 |
def test():
    """Train an online ClassificationTree on the first 50k rows of
    dataset.csv and print the number of correct predictions on the rest.
    """
    filename = "dataset.csv"
    df = pd.read_csv(filename, header = 0)
    data = df.values
    # Last column is the label; encode labels as integers.
    y = data[:, -1]
    lbl_enc = preprocessing.LabelEncoder()
    y = lbl_enc.fit_transform(y)
    data = data[:, 0:-1]
    # Simple holdout split: first 50000 rows train, the rest test.
    train = data[0:50000]
    ytrain = y[0:50000]
    test = data[50000:]
    ytest = y[50000:]
    # NOTE(review): number_of_features=93 is hard-coded -- presumably it
    # matches dataset.csv's column count; verify against the data.
    learner = ClassificationTree(number_of_features=93)

    # Online training: feed one example at a time (Python 2 print syntax).
    for t, x in enumerate(train):
        learner.update(x, ytrain[t])
        if t % 1000 == 0:
            print t
    correct_num = 0
    for t, x in enumerate(test):
        y_pred = learner.predict(x)
        if y_pred == ytest[t]:
            correct_num += 1
        if t % 1000 == 0:
            print t

    print correct_num

if __name__ == '__main__':
    profile.run("test()")
37 |
--------------------------------------------------------------------------------
/kaggler/online_model/DecisionTree/utils.pyx:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | cimport numpy as np
3 | cimport cython
4 | from libc.math cimport sqrt, abs
5 |
6 | ctypedef np.int_t DTYPE_t
7 |
cpdef DTYPE_t argmax(dict d):
    """Return the key with the largest value in d (0 if d is empty)."""
    cdef double max_count = 0
    cdef double value
    cdef DTYPE_t key
    cdef DTYPE_t max_class = 0
    # items() works on both Python 2 and 3; iteritems() is Python-2-only.
    # The previous running total of values was never used and is removed.
    for key, value in d.items():
        if value > max_count:
            max_count = value
            max_class = key
    return max_class
20 |
21 |
def predict_max(list a):
    """Return the most frequent label in a."""
    counts = count_dict(a)
    return argmax(counts)
24 |
cpdef dict count_dict(list a):
    """Return a mapping from each label in a to its frequency."""
    cdef DTYPE_t x
    cdef dict d = {}
    for x in a:
        if x in d:
            d[x] += 1
        else:
            d[x] = 1
    return d
32 |
cpdef double mean_squared_error(list x):
    """Return the root of the mean squared deviation of x from its mean.

    NOTE(review): despite the name, the sqrt makes this the RMS deviation
    (population standard deviation), not a mean squared error; the name is
    kept because callers may rely on the current behavior.
    """
    cdef np.ndarray xnp
    xnp = np.array(x)
    # Center on the mean; (xnp * xnp.T) is elementwise for a 1-D array.
    xnp = xnp - xnp.mean()
    return sqrt((xnp * xnp.T).mean())
38 |
cpdef double mean_absolute_error(list x):
    """Return the mean absolute deviation of x from its mean."""
    cdef np.ndarray xnp
    xnp = np.array(x)
    xnp = xnp - xnp.mean()
    # NOTE(review): `abs` is cimported from libc.math at the top of this
    # module; applying it to an ndarray relies on Cython's builtin-abs
    # handling -- verify this compiles as intended (np.abs would be explicit).
    return abs(xnp).mean()
44 |
cpdef double gini(list x):
    """Return the Gini impurity of the labels in x.

    gini = 1 - sum over classes of p(class) ** 2
    """
    cdef dict d = {}
    cdef double total
    cdef list to_square
    cdef np.ndarray to_square2
    cdef DTYPE_t y
    # Count occurrences of each label.
    for y in x:
        d.setdefault(y, 0)
        d[y] += 1
    total = len(x)
    to_square = []
    cdef double value
    cdef DTYPE_t key
    # items() works on both Python 2 and 3; iteritems() is Python-2-only.
    for key, value in d.items():
        to_square.append(value/total)
    to_square2 = np.array(to_square)
    return 1 - (to_square2 * to_square2.T).sum()
62 |
cpdef tuple bin_split(list sample_feature, double feature_value):
    """Split (value, label) pairs on feature_value.

    Returns a (left, right) tuple of label lists: left holds labels whose
    feature value is <= feature_value, right holds the rest.
    """
    cdef list left, right
    cdef tuple x
    left = []
    right = []
    for x in sample_feature:
        if x[0] <= feature_value:
            left.append(x[1])
        else:
            right.append(x[1])
    return left, right
69 |
--------------------------------------------------------------------------------
/kaggler/online_model/__init__.py:
--------------------------------------------------------------------------------
1 | from .ftrl import FTRL
2 | from .fm import FM
3 | from .nn import NN
4 | from .nn_h2 import NN_H2
5 | from .sgd import SGD
6 |
--------------------------------------------------------------------------------
/kaggler/online_model/fm.pyx:
--------------------------------------------------------------------------------
1 | # cython: boundscheck=False
2 | # cython: wraparound=False
3 | # cython: cdivision=True
4 | from __future__ import division
5 | import numpy as np
6 |
7 | cimport cython
8 | from libc.math cimport sqrt, abs
9 | from ..util cimport sigm
10 | cimport numpy as np
11 |
12 | from cython.parallel import prange, parallel, threadid
13 |
14 | np.import_array()
15 |
16 |
cdef class FM:
    """Factorization Machine online learner.

    Attributes:
        n (int): number of input features
        epoch (int): number of epochs
        k (int): size of factors for interactions
        a (double): initial learning rate
        w0 (double): weight for bias
        c0 (double): counter for the bias weight
        w (array of double): feature weights
        c (array of double): counters for weights
        V (array of double): feature weights for factors
    """

    cdef unsigned int epoch
    cdef unsigned int n
    cdef unsigned int k
    cdef double a
    cdef double w0
    cdef double c0
    cdef double[:] w
    cdef double[:] c
    cdef double[:] V

    def __init__(self,
                 unsigned int n,
                 unsigned int epoch=100,
                 unsigned int dim=4,
                 double a=0.01,
                 seed=0):
        """Initialize the FM class object.

        Args:
            n (int): number of input features
            epoch (int): number of epochs
            dim (int): size of factors for interactions
            a (double): initial learning rate
            seed (int): random seed
        """
        cdef int i

        rng = np.random.RandomState(seed)

        self.n = n          # # of features
        self.epoch = epoch  # # of epochs
        self.k = dim        # interaction dimension
        self.a = a          # learning rate

        # initialize weights, factorized interactions, and counts
        self.w0 = 0.
        self.c0 = 0.
        self.w = np.zeros((self.n,), dtype=np.float64)
        self.c = np.zeros((self.n,), dtype=np.float64)
        self.V = (rng.rand(self.n * self.k) - .5) * 1e-6

    def __repr__(self):
        # The interaction dimension is stored as self.k; this cdef class
        # declares no `dim` attribute, so the old `self.dim` raised
        # AttributeError.
        return ('FM(n={}, epoch={}, dim={}, a={})').format(
            self.n, self.epoch, self.k, self.a
        )

    def read_sparse(self, path):
        """Apply hashing trick to the libsvm format sparse file.

        Args:
            path (str): a file path to the libsvm format sparse file

        Yields:
            x (list of tuple): a list of (index, value) of non-zero features
            y (int): target value
        """
        for line in open(path):
            xs = line.rstrip().split(' ')

            y = int(xs[0])
            idx = []
            val = []
            for item in xs[1:]:
                i, v = item.split(':')
                idx.append(int(i))
                val.append(float(v))

            # Materialize the pairs: predict_one()/update_one() expect a
            # list, and zip() returns a one-shot iterator on Python 3.
            yield list(zip(idx, val)), y

    def fit(self, X, y):
        """Update the model with a sparse input feature matrix and its targets.

        Args:
            X (scipy.sparse.csr_matrix): a list of (index, value) of non-zero features
            y (numpy.array): targets

        Returns:
            updated model weights and counts
        """
        n = X.shape[0]
        for epoch in range(self.epoch):
            for row in range(n):
                # list() because the same pairs are consumed twice below
                # (by predict_one() and then update_one()); a bare zip()
                # object would be exhausted after the first pass on Python 3.
                x = list(zip(X[row].indices, X[row].data))
                self.update_one(x, self.predict_one(x) - y[row])

    def predict(self, X):
        """Predict for a sparse matrix X.

        Args:
            X (scipy.sparse.csr_matrix): a sparse matrix for input features

        Returns:
            p (numpy.array): predictions for input features
        """

        p = np.zeros((X.shape[0], ), dtype=np.float64)
        for row in range(X.shape[0]):
            # predict_one() is declared with a `list` parameter, so the
            # zip() result must be materialized.
            p[row] = self.predict_one(list(zip(X[row].indices, X[row].data)))

        return p

    def predict_one(self, list x):
        """Predict for features.

        Args:
            x (list of tuple): a list of (index, value) of non-zero features

        Returns:
            p (double): a prediction for input features
        """
        cdef int i
        cdef int k
        cdef double v
        cdef double p
        cdef double wx
        cdef double[:] vx
        cdef double[:] v2x2

        wx = 0.
        vx = np.zeros((self.k,), dtype=np.float64)
        v2x2 = np.zeros((self.k,), dtype=np.float64)
        for i, v in x:
            wx += self.w[i] * v
            for k in range(self.k):
                vx[k] += self.V[i * self.k + k] * v
                v2x2[k] += (self.V[i * self.k + k] ** 2) * (v ** 2)

        # 2nd-order term via the O(nk) identity:
        # sum_{i<j} <V_i, V_j> x_i x_j = .5 * sum_k ((sum_i V_ik x_i)^2
        #                                            - sum_i V_ik^2 x_i^2)
        p = self.w0 + wx
        for k in range(self.k):
            p += .5 * (vx[k] ** 2 - v2x2[k])

        return sigm(p)

    def update_one(self, list x, double e):
        """Update the model.

        Args:
            x (list of tuple): a list of (index, value) of non-zero features
            e (double): error between the prediction of the model and target

        Returns:
            updated model weights and counts
        """
        cdef int i
        cdef int k
        cdef int f
        cdef double v
        cdef double g2
        cdef double dl_dw
        cdef double[:] vx

        # calculate v_f * x in advance
        vx = np.zeros((self.k,), dtype=np.float64)
        for i, v in x:
            for k in range(self.k):
                vx[k] += self.V[i * self.k + k] * v

        # update w0, w, V, c0, and c with an AdaGrad-style per-weight rate
        g2 = e * e

        self.w0 -= self.a / (sqrt(self.c0) + 1) * e
        for i, v in x:
            dl_dw = self.a / (sqrt(self.c[i]) + 1) * e * v
            self.w[i] -= dl_dw
            for f in range(self.k):
                self.V[i * self.k + f] -= dl_dw * (vx[f] -
                                                   self.V[i * self.k + f] * v)

            self.c[i] += g2

        self.c0 += g2
205 |
--------------------------------------------------------------------------------
/kaggler/online_model/ftrl.pyx:
--------------------------------------------------------------------------------
1 | # cython: boundscheck=False
2 | # cython: wraparound=False
3 | # cython: cdivision=True
4 | from __future__ import division
5 | import numpy as np
6 |
7 | cimport cython
8 | from libc.math cimport sqrt, abs
9 | from ..util cimport sigm
10 | cimport numpy as np
11 |
12 |
13 | np.import_array()
14 |
15 |
cdef class FTRL:
    """FTRL online learner with the hashing trick using liblinear format data.

    inspired by Kaggle user tinrtgu's code at http://goo.gl/K8hQBx
    original FTRL paper is available at http://goo.gl/iqIaH0

    Attributes:
        n (int): number of features after hashing trick
        epoch (int): number of epochs
        a (double): alpha in the per-coordinate rate
        b (double): beta in the per-coordinate rate
        l1 (double): L1 regularization parameter
        l2 (double): L2 regularization parameter
        w (array of double): feature weights
        c (array of double): counters for weights
        z (array of double): lazy weights
        interaction (boolean): whether to use 2nd order interaction or not
    """

    cdef double a  # learning rate
    cdef double b
    cdef double l1
    cdef double l2
    cdef unsigned int epoch
    cdef unsigned int n
    cdef bint interaction
    cdef double[:] w
    cdef double[:] c
    cdef double[:] z

    def __init__(self,
                 double a=0.01,
                 double b=1.,
                 double l1=1.,
                 double l2=1.,
                 unsigned int n=2**20,
                 unsigned int epoch=1,
                 bint interaction=True):
        """Initialize the FTRL class object.

        Args:
            a (double): alpha in the per-coordinate rate
            b (double): beta in the per-coordinate rate
            l1 (double): L1 regularization parameter
            l2 (double): L2 regularization parameter
            n (int): number of features after hashing trick
            epoch (int): number of epochs
            interaction (boolean): whether to use 2nd order interaction or not
        """

        self.a = a
        self.b = b
        self.l1 = l1
        self.l2 = l2
        self.n = n
        self.epoch = epoch
        self.interaction = interaction

        # initialize weights and counts; index n is the bias term
        self.w = np.zeros((self.n + 1,), dtype=np.float64)
        self.c = np.zeros((self.n + 1,), dtype=np.float64)
        self.z = np.zeros((self.n + 1,), dtype=np.float64)

    def __repr__(self):
        return ('FTRL(a={}, b={}, l1={}, l2={}, n={}, epoch={}, interaction={})').format(
            self.a, self.b, self.l1, self.l2, self.n, self.epoch, self.interaction
        )

    def _indices(self, list x):
        """Yield hashed weight indices for x: the bias, each feature, and
        (optionally) each sorted feature pair."""
        cdef unsigned int index
        cdef int l
        cdef int i
        cdef int j

        # return the index of the bias term
        yield self.n

        # NOTE(review): `abs` is cimported from libc.math above; verify it
        # handles the full range of hash() values on 64-bit builds.
        for index in x:
            yield abs(hash(index)) % self.n

        if self.interaction:
            l = len(x)
            x = sorted(x)
            # range() works on both Python 2 and 3; xrange is 2-only.
            for i in range(l):
                for j in range(i + 1, l):
                    yield abs(hash('{}_{}'.format(x[i], x[j]))) % self.n

    def read_sparse(self, path):
        """Apply hashing trick to the libsvm format sparse file.

        Args:
            path (str): a file path to the libsvm format sparse file

        Yields:
            x (list of int): a list of index of non-zero features
            y (int): target value
        """
        for line in open(path):
            xs = line.rstrip().split(' ')

            y = int(xs[0])
            x = []
            for item in xs[1:]:
                index, _ = item.split(':')
                x.append(index)

            yield x, y

    def fit(self, X, y):
        """Update the model with a sparse input feature matrix and its targets.

        Args:
            X (scipy.sparse.csr_matrix): a list of (index, value) of non-zero features
            y (numpy.array): targets

        Returns:
            updated model weights and counts
        """
        for epoch in range(self.epoch):
            for row in range(X.shape[0]):
                x = list(X[row].indices)
                self.update_one(x, self.predict_one(x) - y[row])

    def predict(self, X):
        """Predict for a sparse matrix X.

        Args:
            X (scipy.sparse.csr_matrix): a sparse matrix for input features

        Returns:
            p (numpy.array): predictions for input features
        """
        p = np.zeros((X.shape[0], ), dtype=np.float64)
        for row in range(X.shape[0]):
            p[row] = self.predict_one(list(X[row].indices))

        return p

    def update_one(self, list x, double e):
        """Update the model.

        Args:
            x (list of int): a list of index of non-zero features
            e (double): error between prediction of the model and target

        Returns:
            updates model weights and counts
        """
        cdef int i
        cdef double e2
        cdef double s

        e2 = e * e
        for i in self._indices(x):
            # per-coordinate adaptive step from the accumulated squared errors
            s = (sqrt(self.c[i] + e2) - sqrt(self.c[i])) / self.a
            self.w[i] += e - s * self.z[i]
            self.c[i] += e2

    def predict_one(self, list x):
        """Predict for features.

        Args:
            x (list of int): a list of index of non-zero features

        Returns:
            p (double): a prediction for input features
        """
        cdef int i
        cdef double sign
        cdef double wTx

        wTx = 0.
        for i in self._indices(x):
            sign = -1. if self.w[i] < 0 else 1.
            # L1 soft-thresholding: small accumulated weights contribute 0.
            if sign * self.w[i] <= self.l1:
                self.z[i] = 0.
            else:
                self.z[i] = (sign * self.l1 - self.w[i]) / \
                            ((self.b + sqrt(self.c[i])) / self.a + self.l2)

            wTx += self.z[i]

        return sigm(wTx)
199 |
--------------------------------------------------------------------------------
/kaggler/online_model/ftrl_dropout.pyx:
--------------------------------------------------------------------------------
1 | from csv import DictReader
2 | from math import exp, log, sqrt
3 |
4 | import cPickle as pickle
5 | import gzip
6 | import random
7 |
8 |
9 | class ftrl_proximal(object):
10 | ''' Our main algorithm: Follow the regularized leader - proximal
11 |
12 | In short,
13 | this is an adaptive-learning-rate sparse logistic-regression with
14 | efficient L1-L2-regularization
15 |
16 | Reference:
17 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf
18 | '''
19 |
    def __init__(self, alpha, beta, L1, L2, D, interaction=False, dropout=1.0):
        ''' Initialize the FTRL-proximal learner.

        INPUT:
            alpha: learning-rate scale
            beta: learning-rate smoothing term
            L1: L1 regularization strength
            L2: L2 regularization strength
            D: dimension of the (hashed) feature space
            interaction: if True, also hash pairwise feature interactions
            dropout: probability of KEEPING a weight during an update
                     (1.0 disables dropout; see update())
        '''
        # parameters
        self.alpha = alpha
        self.beta = beta
        self.L1 = L1
        self.L2 = L2

        # feature related parameters
        self.D = D
        self.interaction = interaction
        self.dropout = dropout

        # model
        # n: squared sum of past gradients
        # z: weights
        # w: lazy weights
        self.n = [0.] * D
        self.z = [0.] * D

        self.w = [0.] * D # use this for execution speed up
40 |
    def _indices(self, x):
        ''' A helper generator that yields the indices in x

        The purpose of this generator is to make the following
        code a bit cleaner when doing feature interaction.

        INPUT:
            x: a list of hashed feature indices; x[0] is the bias term

        YIELDS:
            each index of x, then (if self.interaction) a hashed index
            for every feature pair
        '''

        for i in x:
            yield i

        if self.interaction:
            L = len(x)
            # xrange is Python-2-only; this module targets Python 2
            # (see the cPickle import at the top of the file).
            for i in xrange(1, L): # skip bias term, so we start at 1
                for j in xrange(i+1, L):
                    # one-hot encode interactions with hash trick
                    yield abs(hash(str(x[i]) + '_' + str(x[j]))) % self.D
57 |
58 | def predict(self, x, dropped = None):
59 | ''' Get probability estimation on x
60 |
61 | INPUT:
62 | x: features
63 |
64 | OUTPUT:
65 | probability of p(y = 1 | x; w)
66 | '''
67 | # params
68 | dropout = self.dropout
69 |
70 | # model
71 | w = self.w
72 |
73 | # wTx is the inner product of w and x
74 | wTx = 0.
75 | for j, i in enumerate(self._indices(x)):
76 |
77 | if dropped != None and dropped[j]:
78 | continue
79 |
80 | wTx += w[i]
81 |
82 | if dropped != None: wTx /= dropout
83 |
84 | # bounded sigmoid function, this is the probability estimation
85 | return 1. / (1. + exp(-max(min(wTx, 35.), -35.)))
86 |
87 | def update(self, x, y):
88 | ''' Update model using x, p, y
89 |
90 | INPUT:
91 | x: feature, a list of indices
92 | p: click probability prediction of our model
93 | y: answer
94 |
95 | MODIFIES:
96 | self.n: increase by squared gradient
97 | self.z: weights
98 | '''
99 |
100 | # parameters
101 | alpha = self.alpha
102 | beta = self.beta
103 | L1 = self.L1
104 | L2 = self.L2
105 |
106 | # model
107 | n = self.n
108 | z = self.z
109 | w = self.w # no need to change this, it won't gain anything
110 | dropout = self.dropout
111 |
112 | ind = [ i for i in self._indices(x)]
113 |
114 | if dropout == 1:
115 | dropped = None
116 | else:
117 | dropped = [random.random() > dropout for i in xrange(0,len(ind))]
118 |
119 | p = self.predict(x, dropped)
120 |
121 | # gradient under logloss
122 | g = p - y
123 |
124 | # update z and n
125 | for j, i in enumerate(ind):
126 |
127 | # implement dropout as overfitting prevention
128 | if dropped != None and dropped[j]: continue
129 |
130 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha
131 | z[i] += g - sigma * w[i]
132 | n[i] += g * g
133 |
134 | sign = -1. if z[i] < 0 else 1. # get sign of z[i]
135 |
136 | # build w on the fly using z and n, hence the name - lazy weights -
137 | if sign * z[i] <= L1:
138 | # w[i] vanishes due to L1 regularization
139 | w[i] = 0.
140 | else:
141 | # apply prediction time L1, L2 regularization to z and get w
142 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2)
143 |
144 | def read_csv(self, f_train):
145 | ''' GENERATOR: Apply hash-trick to the original csv row
146 | and for simplicity, we one-hot-encode everything
147 |
148 | INPUT:
149 | path: path to training or testing file
150 |
151 | YIELDS:
152 | ID: id of the instance, mainly useless
153 | x: a list of hashed and one-hot-encoded 'indices'
154 | we only need the index since all values are either 0 or 1
155 | y: y = 1 if we have a click, else we have y = 0
156 | '''
157 | for t, row in enumerate(DictReader(f_train)):
158 | # process id
159 | ID = row['id']
160 | del row['id']
161 |
162 | # process clicks
163 | y = 0.
164 | if 'click' in row:
165 | if row['click'] == '1':
166 | y = 1.
167 | del row['click']
168 |
169 | # turn hour really into hour, it was originally YYMMDDHH
170 |
171 | date = row['hour'][0:6]
172 | row['hour'] = row['hour'][6:]
173 |
174 | # stderr.write("_%s_" % date)
175 |
176 | # extract date
177 | row['wd'] = str(int(date) % 7)
178 | row['wd_hour'] = "%s_%s" % (row['wd'], row['hour'])
179 |
180 | # build x
181 | x = [0] # 0 is the index of the bias term
182 | for key in row:
183 | value = row[key]
184 |
185 | # one-hot encode everything with hash trick
186 | index = abs(hash(key + '_' + value)) % self.D
187 | x.append(index)
188 |
189 | yield t, ID, x, y
190 |
191 | def write_model(self, model, model_save, args):
192 | with gzip.open(model_save, "wb") as model_file:
193 | pickle.dump((args, model), model_file)
194 |
195 | def load_model(self, model_save):
196 | with gzip.open(model_save, "rb") as model_file:
197 | (p, model) = pickle.load(model_file)
198 |
199 | return model
200 |
--------------------------------------------------------------------------------
/kaggler/online_model/ftrl_fm.pyx:
--------------------------------------------------------------------------------
1 | ''' Based on Tinrtgu's FTRL code: http://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory
2 | '''
3 |
4 | from csv import DictReader
5 | cimport cython
6 | from libc.math cimport exp, copysign, log, sqrt
7 | import numpy as np
8 | import copy
9 | cimport numpy as np
10 | np.import_array()
11 | from cython.parallel import parallel
12 | from datetime import datetime
13 | import random
14 |
cdef class FTRL_FM:
    """FTRL-proximal online learner with factorization machine interactions.

    First order weights are trained with per-coordinate FTRL-proximal;
    pairwise feature interactions are modeled by factorization machine
    (FM) weight vectors, also trained with per-coordinate FTRL updates.
    """
    cdef double alpha           # learning rate for first order terms
    cdef double beta
    cdef double alpha_fm        # learning rate for FM weights
    cdef double beta_fm
    cdef double L1
    cdef double L2
    cdef double L1_fm
    cdef double L2_fm
    cdef double L1_fm_tmp
    cdef double L2_fm_tmp
    cdef unsigned int fm_dim
    cdef unsigned int D
    cdef double fm_initDev
    cdef double dropoutRate

    cdef unsigned int epoch
    cdef bint interaction
    cdef double[:] w            # lazy first order weights
    cdef double[:] n            # squared sum of past first order gradients
    cdef double[:] z            # FTRL first order weights
    cdef dict n_fm              # per-feature FM gradient accumulators
    cdef dict z_fm              # per-feature FM FTRL weights
    cdef dict w_fm              # per-feature lazy FM weights

    def __init__(
        self,
        unsigned int fm_dim=4,
        double fm_initDev=0.01,
        double L1=0.0,
        double L2=0.0,
        double L1_fm=0.0,
        double L2_fm=0.0,
        unsigned int D=2**22,
        double alpha=0.005,
        double beta=1.0,
        double alpha_fm = .1,
        double beta_fm = 1.0,
        double dropoutRate = 1.0
    ):
        ''' initialize the factorization machine.'''

        self.alpha = alpha              # learning rate parameter alpha
        self.beta = beta                # learning rate parameter beta
        self.L1 = L1                    # L1 regularizer for first order terms
        self.L2 = L2                    # L2 regularizer for first order terms
        self.alpha_fm = alpha_fm        # learning rate parameter alpha for FM
        self.beta_fm = beta_fm          # learning rate parameter beta for FM
        # L1/L2 regularizers for FM weights. L1 is only used after one epoch
        # of training, because small initializations are needed for gradient.
        self.L1_fm = L1_fm
        self.L2_fm = L2_fm
        self.fm_dim = fm_dim            # dimension of factorization
        self.fm_initDev = fm_initDev    # stddev for random init of FM weights
        # dropout rate (which is actually the inclusion rate), i.e.
        # dropoutRate = .8 indicates a probability of .2 of dropping a feature
        self.dropoutRate = dropoutRate

        # remember the FM regularizers; fit() restores them after epoch 0
        self.L1_fm_tmp = L1_fm
        self.L2_fm_tmp = L2_fm

        # bug fix: the default used to read 2*22 (= 44), which is a typo for
        # 2**22 given that data() hashes features modulo this size
        self.D = D

        # model
        # n: squared sum of past gradients
        # z: weights
        # w: lazy weights

        # let index 0 be bias term to avoid collisions.
        self.n = np.zeros(self.D + 1, dtype=np.float64)
        self.z = np.zeros(self.D + 1, dtype=np.float64)
        self.w = np.zeros(self.D + 1, dtype=np.float64)

        self.n_fm = {}
        self.z_fm = {}
        self.w_fm = {}

    def init_fm(self, unsigned int i):
        ''' initialize the factorization weight vector for variable i.

        z_fm is drawn from N(0, fm_initDev) because an all-zero start
        would make every interaction gradient zero.
        '''
        cdef unsigned int k
        if i not in self.n_fm:
            self.n_fm[i] = np.zeros(self.fm_dim, dtype=np.float64)
            self.w_fm[i] = np.zeros(self.fm_dim, dtype=np.float64)
            self.z_fm[i] = np.zeros(self.fm_dim, dtype=np.float64)

            for k in range(self.fm_dim):
                self.z_fm[i][k] = random.gauss(0., self.fm_initDev)

    def predict_raw(self, list x):
        ''' Compute the raw score prior to the logit transformation.

        Rebuilds the lazy weights w and w_fm from z/n on the fly, then
        returns bias + first order terms + FM pairwise interaction terms.
        '''
        # model shorthands
        n = self.n
        w_fm = self.w_fm

        cdef double raw_y = 0.
        cdef unsigned int i
        cdef unsigned int j
        cdef double sign
        cdef unsigned int len_x
        cdef unsigned int k

        # bias contribution (index 0); no regularization for the bias
        self.w[0] = (- self.z[0]) / ((self.beta + sqrt(self.n[0])) / self.alpha)
        raw_y += self.w[0]

        # calculate the first order contribution.
        for i in x:
            sign = -1. if self.z[i] < 0. else 1.  # get sign of z[i]

            if sign * self.z[i] <= self.L1:
                self.w[i] = 0.
            else:
                self.w[i] = (sign * self.L1 - self.z[i]) / ((self.beta + sqrt(n[i])) / self.alpha + self.L2)

            raw_y += self.w[i]

        len_x = len(x)
        # refresh the lazy FM weights for every active feature ...
        for i in x:
            self.init_fm(i)
            for k in range(self.fm_dim):
                sign = -1. if self.z_fm[i][k] < 0. else 1.  # sign of z_fm[i][k]

                if sign * self.z_fm[i][k] <= self.L1_fm:
                    self.w_fm[i][k] = 0.
                else:
                    self.w_fm[i][k] = (sign * self.L1_fm - self.z_fm[i][k]) / ((self.beta_fm + sqrt(self.n_fm[i][k])) / self.alpha_fm + self.L2_fm)

        # ... then add every pairwise interaction term
        for i in range(len_x):
            for j in range(i + 1, len_x):
                for k in range(self.fm_dim):
                    raw_y += w_fm[x[i]][k] * w_fm[x[j]][k]

        return raw_y

    def predict_one(self, list x):
        ''' predict the probability (logit of the bounded raw score).
        '''
        return 1. / (1. + exp(- max(min(self.predict_raw(x), 35.), -35.)))

    def dropout(self, list x):
        ''' dropout variables in list x, in place.

        Each element survives with probability self.dropoutRate.
        bug fix: the original deleted items while iterating with
        enumerate(), which skips the element right after every deletion;
        filtering into a new list keeps each element independently.
        '''
        x[:] = [var for var in x if random.random() <= self.dropoutRate]

    def dropoutThenPredict(self, list x):
        ''' first dropout some variables and then predict the logit using
        the dropped out data.
        '''
        self.dropout(x)
        return self.predict_one(x)

    def predictWithDroppedOutModel(self, list x):
        ''' predict using all data, with a model trained with dropout:
        the raw score is scaled by the inclusion rate.
        '''
        return 1. / (1. + exp(- max(min(self.predict_raw(x) * self.dropoutRate, 35.), -35.)))

    def update(self, list x, double p, double y):
        ''' Update the parameters using FTRL (Follow the Regularized Leader).

        INPUT:
            x: list of active feature indices
            p: the model's predicted probability for x
            y: answer, 0. or 1.
        '''
        cdef double g
        cdef double g_fm
        cdef int len_x
        cdef int i
        cdef int j
        cdef int k
        cdef double sigma
        cdef dict fm_sum

        # cost gradient with respect to raw prediction.
        g = p - y

        fm_sum = {}  # sums for calculating gradients for FM.
        len_x = len(x)

        # update the first order weights; index 0 is the bias term
        for i in x + [0]:
            sigma = (sqrt(self.n[i] + g * g) - sqrt(self.n[i])) / self.alpha
            self.z[i] += g - sigma * self.w[i]
            self.n[i] += g * g

            # initialize the sum of the FM interaction weights.
            fm_sum[i] = np.zeros(self.fm_dim)

        # sum the gradients for FM interaction weights.
        for i in range(len_x):
            for j in range(len_x):
                if i != j:
                    for k in range(self.fm_dim):
                        fm_sum[x[i]][k] += self.w_fm[x[j]][k]

        # FTRL update for the FM weights
        for i in x:
            for k in range(self.fm_dim):
                g_fm = g * fm_sum[i][k]
                sigma = (sqrt(self.n_fm[i][k] + g_fm * g_fm) - sqrt(self.n_fm[i][k])) / self.alpha_fm
                self.z_fm[i][k] += g_fm - sigma * self.w_fm[i][k]
                self.n_fm[i][k] += g_fm * g_fm

    def write_w(self, filePath):
        ''' write out the first order weights w to a file.
        '''
        with open(filePath, "w") as f_out:
            for i, w in enumerate(self.w):
                f_out.write("%i,%f\n" % (i, w))

    def write_w_fm(self, filePath):
        ''' write out the factorization machine weights to a file.
        '''
        with open(filePath, "w") as f_out:
            # bug fix: items() instead of Python-2-only iteritems()
            for k, w_fm in self.w_fm.items():
                f_out.write("%i,%s\n" % (k, ",".join([str(w) for w in w_fm])))

    def predict(self, testingFile, hashSalt='salt'):
        ''' Predict a probability for every row of a test file.

        OUTPUT: list of predicted probabilities, in file order.
        '''
        cdef list x
        cdef double y
        cdef unsigned int t
        cdef double p
        cdef list y_preds = []
        for t, ID, x, y in data(testingFile, self.D, hashSalt, loop=False):
            p = self.predict_one(x)
            y_preds.append(p)
        return y_preds

    def evaluate(self, validationFile, eval_metric, hashSalt='salt'):
        ''' Score the model on a validation file.

        eval_metric is called as eval_metric(y_true, y_pred).
        '''
        cdef list x
        cdef double y
        cdef unsigned int t
        cdef double p
        cdef list y_preds = []
        cdef list y_test = []
        for t, ID, x, y in data(validationFile, self.D, hashSalt, loop=False):
            p = self.predict_one(x)
            y_preds.append(p)
            y_test.append(y)
        # bug fix: compare predictions against the true labels; the original
        # passed y_preds twice, so the score was meaningless
        score = eval_metric(y_test, y_preds)
        return score

    def fit(self, trainingFile, hashSalt='salt', n_epochs=5, reportFrequency=10000, validationFile=None, eval_metric=None):
        ''' Train the model for n_epochs epochs over trainingFile,
        reporting progressive logloss every reportFrequency rows and
        optionally an evaluation score on validationFile.
        '''
        cdef int e
        cdef double progressiveLoss = 0.
        cdef double progressiveCount = 0.
        cdef list x
        cdef double y
        cdef unsigned int t
        cdef double p
        cdef double loss
        start = datetime.now()
        print("Start Training:")
        for e in range(n_epochs):

            # if it is the first epoch, then don't use L1_fm or L2_fm:
            # the FM weights start near zero and need to grow first
            if e == 0:
                self.L1_fm = 0.
                self.L2_fm = 0.
            else:
                self.L1_fm = self.L1_fm_tmp
                # bug fix: restore L2_fm from L2_fm_tmp (was L1_fm_tmp)
                self.L2_fm = self.L2_fm_tmp

            # NOTE(review): loop=True makes data() restart forever, so this
            # inner loop never finishes an epoch on its own - confirm intent
            for t, ID, x, y in data(trainingFile, self.D, hashSalt, loop=True):
                p = self.predict_one(x)
                loss = logLoss(p, y)
                self.update(x, p, y)
                progressiveLoss += loss
                progressiveCount += 1.
                if t % reportFrequency == 0:
                    print("Epoch %d\tcount: %d\tProgressive Loss: %f" % (e, t, progressiveLoss / progressiveCount))
                    if validationFile is not None and eval_metric is not None:
                        eval_score = self.evaluate(validationFile, eval_metric)
                        print("Epoch %d\tcount: %d\tEvaluation score: %f" % (e, t, eval_score))

            # bug fix: the original printed cvLoss / cvCount, which were
            # never updated (0. / 0. -> ZeroDivisionError)
            print("Epoch %d finished.\telapsed time: %s" % (e, str(datetime.now() - start)))
            if validationFile is not None and eval_metric is not None:
                eval_score = self.evaluate(validationFile, eval_metric)
                # bug fix: the original format string contained a stray \f
                # (form feed) escape and a typo ("Evaludation")
                print("Epoch %d finished\tcount: %d\tEvaluation score: %f" % (e, t, eval_score))
350 |
351 |
def logLoss(double p, double y):
    '''
    calculate the log loss cost
    p: prediction [0, 1]
    y: actual value {0, 1}
    '''
    # clip p away from 0 and 1 so the log never blows up
    p = min(max(p, 1e-15), 1. - 1e-15)
    if y == 1.:
        return -log(p)
    return -log(1. - p)
360 |
def data(filePath, hashSize, hashSalt, loop=False):
    ''' generator for data using hash trick

    INPUT:
        filePath: open csv source read with csv.DictReader
        hashSize: size of the hash space for feature indices
        hashSalt: String with which to salt the hash function
        loop: if True, restart from the beginning forever.
              NOTE(review): restarting only works if filePath can be
              iterated more than once - confirm against callers.

    YIELDS:
        t: row counter within the current pass
        ID: the activity id of the row
        x: list of hashed feature indices (0 is reserved for the bias)
        y: outcome label, 0. or 1.

    The original duplicated the whole loop body for the loop/no-loop
    branches (and did an unused `import os`); a single loop with a final
    break is equivalent and keeps one copy of the parsing logic.
    '''
    cdef unsigned int t
    cdef double y
    cdef list x
    cdef str value
    cdef unsigned int index
    cdef dict row

    while True:
        for t, row in enumerate(DictReader(filePath)):
            ID = row['activity_id']
            del row['activity_id']

            del row['outcome_isnull']

            y = 0.
            if 'outcome' in row:
                if row['outcome'] == '1':
                    y = 1.
                del row['outcome']

            x = []

            for key in row:
                value = row[key]

                # 1 is added to hash index because I want 0 to indicate the bias term.
                index = abs(hash(hashSalt + key + '_' + value)) % hashSize + 1
                x.append(index)

            yield t, ID, x, y

        if not loop:
            break
429 |
--------------------------------------------------------------------------------
/kaggler/online_model/nn.pyx:
--------------------------------------------------------------------------------
1 | # cython: boundscheck=False
2 | # cython: wraparound=False
3 | # cython: cdivision=True
4 | from __future__ import division
5 | import numpy as np
6 |
7 | cimport cython
8 | from libc.math cimport sqrt, abs
9 | from ..util cimport sigm
10 | cimport numpy as np
11 |
12 |
13 | np.import_array()
14 |
15 |
cdef class NN:
    """Neural Network with a single ReLU hidden layer online learner.

    Attributes:
        n (int): number of input units
        epoch (int): number of epochs
        h (int): number of hidden units
        a (double): initial learning rate
        l2 (double): L2 regularization parameter
        w0 (array of double): weights between the input and hidden layers
        w1 (array of double): weights between the hidden and output layers
        z (array of double): hidden units
        c (double): counter
        c1 (array of double): counters for hidden units
    """

    cdef unsigned int epoch     # number of epochs
    cdef unsigned int n         # number of input units
    cdef unsigned int h         # number of hidden units
    cdef double a               # learning rate
    cdef double l2              # L2 regularization parameter
    cdef double[:] w0           # weights between the input and hidden layers
    cdef double[:] w1           # weights between the hidden and output layers
    cdef double[:] z            # hidden units
    cdef double c               # counter
    cdef double[:] c0           # counters for input units
    cdef double[:] c1           # counters for hidden units

    def __init__(self,
                 unsigned int n,
                 unsigned int epoch=10,
                 unsigned int h=10,
                 double a=0.01,
                 double l2=0.,
                 unsigned int seed=0):
        """Initialize the NN class object.

        Args:
            n (int): number of input units
            epoch (int): number of epochs
            h (int): number of the hidden units
            a (double): initial learning rate
            l2 (double): L2 regularization parameter
            seed (unsigned int): random seed
        """

        rng = np.random.RandomState(seed)

        self.epoch = epoch
        self.n = n
        self.h = h

        self.a = a
        self.l2 = l2

        # small random init breaks the symmetry between hidden units
        self.w1 = (rng.rand(self.h + 1) - .5) * 1e-6
        self.w0 = (rng.rand((self.n + 1) * self.h) - .5) * 1e-6

        # hidden units in the hidden layer
        self.z = np.zeros((self.h,), dtype=np.float64)

        # counters for biases and inputs
        self.c = 0.
        self.c1 = np.zeros((self.h,), dtype=np.float64)
        self.c0 = np.zeros((self.n,), dtype=np.float64)

    def __repr__(self):
        return ('NN(n={}, epoch={}, h={}, a={}, l2={})').format(
            self.n, self.epoch, self.h, self.a, self.l2
        )

    def read_sparse(self, path):
        """Read a libsvm format sparse file line by line.

        Args:
            path (str): a file path to the libsvm format sparse file

        Yields:
            x (list of (int, double)): non-zero features as (index, value)
            y (int): target value
        """
        for line in open(path):
            xs = line.rstrip().split(' ')

            y = int(xs[0])
            idx = []
            val = []
            for item in xs[1:]:
                i, v = item.split(':')
                idx.append(int(i) % self.n)
                val.append(float(v))

            # bug fix: materialize the pairs; on Python 3 zip() is a lazy
            # iterator, but consumers expect a reusable list
            yield list(zip(idx, val)), y

    def fit(self, X, y):
        """Update the model with a sparse input feature matrix and its targets.

        Args:
            X (scipy.sparse.csr_matrix): a list of (index, value) of non-zero features
            y (numpy.array): targets

        Returns:
            updated model weights and counts
        """
        for epoch in range(self.epoch):
            for row in range(X.shape[0]):
                # bug fix: list() is required because x is consumed twice
                # below and predict_one() expects a list (zip() is a
                # one-shot iterator on Python 3)
                x = list(zip(X[row].indices, X[row].data))
                self.update_one(x, self.predict_one(x) - y[row])

    def predict(self, X):
        """Predict for a sparse matrix X.

        Args:
            X (scipy.sparse.csr_matrix): a sparse matrix for input features

        Returns:
            p (numpy.array): predictions for input features
        """

        p = np.zeros((X.shape[0], ), dtype=np.float64)
        for row in range(X.shape[0]):
            # bug fix: predict_one() is typed `list x`; wrap the Py3 zip
            # iterator in a list
            p[row] = self.predict_one(list(zip(X[row].indices, X[row].data)))

        return p

    def predict_one(self, list x):
        """Predict for features.

        Args:
            x (list of tuple): a list of (index, value) of non-zero features

        Returns:
            p (double): a prediction for input features
        """
        cdef double p
        cdef int j
        cdef int i
        cdef double v

        # starting with the bias in the hidden layer
        p = self.w1[self.h]

        # calculating and adding values of hidden units
        for j in range(self.h):
            # starting with the bias in the input layer
            self.z[j] = self.w0[self.n * self.h + j]

            # calculating and adding values of input units
            for i, v in x:
                self.z[j] += self.w0[i * self.h + j] * v

            # apply the ReLU activation function to the hidden unit
            self.z[j] = self.z[j] if self.z[j] > 0. else 0.

            p += self.w1[j] * self.z[j]

        # apply the sigmoid activation function to the output unit
        return sigm(p)

    def update_one(self, list x, double e):
        """Update the model with one observation.

        NOTE: assumes predict_one() was called right before with the same
        x, so that self.z holds the hidden activations for this input.

        Args:
            x (list of tuple): a list of (index, value) of non-zero features
            e (double): error between the prediction of the model and target

        Returns:
            updated model weights and counts
        """
        cdef int j
        cdef int i
        cdef double dl_dy
        cdef double dl_dz
        cdef double dl_dw1
        cdef double dl_dw0
        cdef double v

        dl_dy = e  # dl/dy * (initial learning rate)

        # starting with the bias in the hidden layer
        self.w1[self.h] -= (dl_dy + self.l2 * self.w1[self.h]) * self.a / (sqrt(self.c) + 1)
        for j in range(self.h):
            # update weights related to non-zero hidden units
            if self.z[j] == 0.:
                continue

            # update weights between the hidden units and output
            # dl/dw1 = dl/dy * dy/dw1 = dl/dy * z
            dl_dw1 = dl_dy * self.z[j]
            self.w1[j] -= (dl_dw1 + self.l2 * self.w1[j]) * self.a / (sqrt(self.c1[j]) + 1)

            # starting with the bias in the input layer
            # dl/dz = dl/dy * dy/dz = dl/dy * w1
            dl_dz = dl_dy * self.w1[j]
            self.w0[self.n * self.h + j] -= (dl_dz +
                                             self.l2 * self.w0[self.n * self.h + j]) * self.a / (sqrt(self.c1[j]) + 1)
            # update weights related to non-zero input units
            for i, v in x:
                # update weights between the hidden unit j and input i
                # dl/dw0 = dl/dz * dz/dw0 = dl/dz * v
                dl_dw0 = dl_dz * v
                self.w0[i * self.h + j] -= (dl_dw0 +
                                            self.l2 * self.w0[i * self.h + j]) * self.a / (sqrt(self.c0[i]) + 1)

                # update counter for the input i
                self.c0[i] += dl_dw0 * dl_dw0

            # update counter for the hidden unit j
            self.c1[j] += dl_dw1 * dl_dw1

        # update overall counter
        self.c += dl_dy * dl_dy
231 |
--------------------------------------------------------------------------------
/kaggler/online_model/nn_h2.pyx:
--------------------------------------------------------------------------------
1 | # cython: boundscheck=False
2 | # cython: wraparound=False
3 | # cython: cdivision=True
4 | from __future__ import division
5 | import numpy as np
6 |
7 | cimport cython
8 | from libc.math cimport sqrt, abs
9 | from ..util cimport sigm
10 | cimport numpy as np
11 |
12 |
13 | np.import_array()
14 |
15 |
cdef class NN_H2:
    """Neural Network with 2 ReLU hidden layers online learner.

    Attributes:
        n (int): number of input units
        epoch (int): number of epochs
        h1 (int): number of the 1st level hidden units
        h2 (int): number of the 2nd level hidden units
        a (double): initial learning rate
        l2 (double): L2 regularization parameter
        w0 (array of double): weights between the input and 1st hidden layers
        w1 (array of double): weights between the 1st and 2nd hidden layers
        w2 (array of double): weights between the 2nd hidden and output layers
        z1 (array of double): 1st level hidden units
        z2 (array of double): 2nd level hidden units
        c (double): counter
        c1 (array of double): counters for 1st level hidden units
        c2 (array of double): counters for 2nd level hidden units
    """

    # bug fix: `epoch` was assigned in __init__ but never declared; cdef
    # classes have no instance __dict__, so the assignment would fail
    cdef unsigned int epoch     # number of epochs
    cdef unsigned int n         # number of input units
    cdef unsigned int h1        # number of the 1st level hidden units
    cdef unsigned int h2        # number of the 2nd level hidden units
    cdef double a               # learning rate
    cdef double l2              # L2 regularization parameter
    cdef double[:] w0           # weights between the input and 1st hidden layers
    cdef double[:] w1           # weights between the 1st and 2nd hidden layers
    cdef double[:] w2           # weights between the 2nd hidden and output layers
    cdef double[:] z1           # 1st level hidden units
    cdef double[:] z2           # 2nd level hidden units
    cdef double c               # counter
    cdef double[:] c0           # counters for input units
    cdef double[:] c1           # counters for 1st level hidden units
    cdef double[:] c2           # counters for 2nd level hidden units

    def __init__(self,
                 unsigned int n,
                 unsigned int epoch=10,
                 unsigned int h1=128,
                 unsigned int h2=256,
                 double a=0.01,
                 double l2=0.,
                 unsigned int seed=0):
        """Initialize the NN class object.

        Args:
            n (int): number of input units
            epoch (int): number of epochs
            h1 (int): number of the 1st level hidden units
            h2 (int): number of the 2nd level hidden units
            a (double): initial learning rate
            l2 (double): L2 regularization parameter
            seed (unsigned int): random seed
        """

        rng = np.random.RandomState(seed)

        self.n = n
        self.epoch = epoch
        self.h1 = h1
        self.h2 = h2

        self.a = a
        self.l2 = l2

        # weights between the output and 2nd hidden layer
        self.w2 = (rng.rand(self.h2 + 1) - .5) * 1e-7

        # weights between the 2nd hidden layer and 1st hidden layer
        self.w1 = (rng.rand((self.h1 + 1) * self.h2) - .5) * 1e-7

        # weights between the 1st hidden layer and inputs
        self.w0 = (rng.rand((self.n + 1) * self.h1) - .5) * 1e-7

        # hidden units in the 2nd hidden layer
        self.z2 = np.zeros((self.h2,), dtype=np.float64)

        # hidden units in the 1st hidden layer
        self.z1 = np.zeros((self.h1,), dtype=np.float64)

        # counters for the hidden units and inputs
        self.c = 0.
        self.c2 = np.zeros((self.h2,), dtype=np.float64)
        self.c1 = np.zeros((self.h1,), dtype=np.float64)
        self.c0 = np.zeros((self.n,), dtype=np.float64)

    def __repr__(self):
        return ('NN_H2(n={}, epoch={}, h1={}, h2={}, a={}, l2={})').format(
            self.n, self.epoch, self.h1, self.h2, self.a, self.l2
        )

    def read_sparse(self, path):
        """Read the libsvm format sparse file line by line.

        Args:
            path (str): a file path to the libsvm format sparse file

        Yields:
            x (list of (int, double)): non-zero features as (index, value)
            y (int): target value
        """
        for line in open(path):
            xs = line.rstrip().split(' ')

            y = int(xs[0])
            idx = []
            val = []
            for item in xs[1:]:
                i, v = item.split(':')
                idx.append(abs(hash(i)) % self.n)
                val.append(float(v))

            # bug fix: materialize the pairs; on Python 3 zip() is a lazy
            # iterator, but consumers expect a reusable list
            yield list(zip(idx, val)), y

    def fit(self, X, y):
        """Update the model with a sparse input feature matrix and its targets.

        Args:
            X (scipy.sparse.csr_matrix): a list of (index, value) of non-zero features
            y (numpy.array): targets

        Returns:
            updated model weights and counts
        """
        for epoch in range(self.epoch):
            for row in range(X.shape[0]):
                # bug fix: list() is required because x is consumed twice
                # below and predict_one() expects a list (zip() is a
                # one-shot iterator on Python 3)
                x = list(zip(X[row].indices, X[row].data))
                self.update_one(x, self.predict_one(x) - y[row])

    def predict(self, X):
        """Predict for a sparse matrix X.

        Args:
            X (scipy.sparse.csr_matrix): a sparse matrix for input features

        Returns:
            p (numpy.array): predictions for input features
        """

        p = np.zeros((X.shape[0], ), dtype=np.float64)
        for row in range(X.shape[0]):
            # bug fix: predict_one() is typed `list x`; wrap the Py3 zip
            # iterator in a list
            p[row] = self.predict_one(list(zip(X[row].indices, X[row].data)))

        return p

    def predict_one(self, list x):
        """Predict for features.

        Args:
            x (list of tuple): a list of (index, value) of non-zero features

        Returns:
            p (double): a prediction for input features
        """
        cdef double p
        cdef int k
        cdef int j
        cdef int i
        cdef double v

        # starting from the bias in the 2nd hidden layer
        p = self.w2[self.h2]

        # calculating and adding values of 2nd level hidden units
        for k in range(self.h2):
            # staring with the bias in the 1st hidden layer
            self.z2[k] = self.w1[self.h1 * self.h2 + k]

            # calculating and adding values of 1st level hidden units
            for j in range(self.h1):
                # starting with the bias in the input layer
                self.z1[j] = self.w0[self.n * self.h1 + j]

                # calculating and adding values of input units
                for i, v in x:
                    self.z1[j] += self.w0[i * self.h1 + j] * v

                # apply the ReLU activation function to the first level hidden unit
                self.z1[j] = self.z1[j] if self.z1[j] > 0. else 0.

                self.z2[k] += self.w1[j * self.h2 + k] * self.z1[j]

            # apply the ReLU activation function to the 2nd level hidden unit
            self.z2[k] = self.z2[k] if self.z2[k] > 0. else 0.

            p += self.w2[k] * self.z2[k]

        # apply the sigmoid activation function to the output unit
        return sigm(p)

    def update_one(self, list x, double e):
        """Update the model.

        Args:
            x (list of tuple): a list of (index, value) of non-zero features
            e (double): error between the prediction of the model and target

        Returns:
            updated model weights and counts
        """
        cdef int k
        cdef int j
        cdef int i
        cdef double dl_dy
        cdef double dl_dz1
        cdef double dl_dz2
        cdef double dl_dw0
        cdef double dl_dw1
        cdef double dl_dw2
        cdef double v

        # XXX: assuming predict() was called right before with the same idx and
        # val inputs. Otherwise self.z will be incorrect for updates.
        dl_dy = e  # dl/dy * (initial learning rate)

        # starting with the bias in the 2nd hidden layer
        self.w2[self.h2] -= (dl_dy + self.l2 * self.w2[self.h2]) * self.a / (sqrt(self.c) + 1)
        for k in range(self.h2):
            # update weights related to non-zero 2nd level hidden units
            if self.z2[k] == 0.:
                continue

            # update weights between the 2nd hidden units and output
            # dl/dw2 = dl/dy * dy/dw2 = dl/dy * z2
            dl_dw2 = dl_dy * self.z2[k]
            self.w2[k] -= (dl_dw2 + self.l2 * self.w2[k]) * self.a / (sqrt(self.c2[k]) + 1)

            # starting with the bias in the 1st hidden layer
            # dl/dz2 = dl/dy * dy/dz2 = dl/dy * w2
            dl_dz2 = dl_dy * self.w2[k]
            self.w1[self.h1 * self.h2 + k] -= (dl_dz2 +
                                               self.l2 * self.w1[self.h1 * self.h2 + k]) * self.a / (sqrt(self.c2[k]) + 1)
            for j in range(self.h1):
                # update weights realted to non-zero hidden units
                if self.z1[j] == 0.:
                    continue

                # update weights between the hidden units and output
                # dl/dw1 = dl/dz2 * dz2/dw1 = dl/dz2 * z1
                dl_dw1 = dl_dz2 * self.z1[j]
                # bug fix: the L2 term must regularize the weight being
                # updated, w1[j * h2 + k], not w1[j]
                self.w1[j * self.h2 + k] -= (dl_dw1 + self.l2 * self.w1[j * self.h2 + k]) * self.a / (sqrt(self.c1[j]) + 1)

                # starting with the bias in the input layer
                # dl/dz1 = dl/dz2 * dz2/dz1 = dl/dz2 * w1
                dl_dz1 = dl_dz2 * self.w1[j * self.h2 + k]
                self.w0[self.n * self.h1 + j] -= (dl_dz1 +
                                                  self.l2 * self.w0[self.n * self.h1 + j]) * self.a / (sqrt(self.c1[j]) + 1)
                # update weights related to non-zero input units
                for i, v in x:
                    # update weights between the hidden unit j and input i
                    # dl/dw0 = dl/dz1 * dz/dw0 = dl/dz1 * v
                    dl_dw0 = dl_dz1 * v
                    self.w0[i * self.h1 + j] -= (dl_dw0 +
                                                 self.l2 * self.w0[i * self.h1 + j]) * self.a / (sqrt(self.c0[i]) + 1)

                    # update counter for the input i
                    self.c0[i] += dl_dw0 * dl_dw0

                # update counter for the 1st level hidden unit j
                self.c1[j] += dl_dw1 * dl_dw1

            # update counter for the 2nd level hidden unit k
            self.c2[k] += dl_dw2 * dl_dw2

        # update overall counter
        self.c += dl_dy * dl_dy
285 |
--------------------------------------------------------------------------------
/kaggler/online_model/sgd.pyx:
--------------------------------------------------------------------------------
1 | # cython: boundscheck=False
2 | # cython: wraparound=False
3 | # cython: cdivision=True
4 | from __future__ import division
5 | import numpy as np
6 |
7 | cimport cython
8 | from libc.math cimport sqrt, abs
9 | from ..util cimport sigm
10 | cimport numpy as np
11 |
12 |
13 | np.import_array()
14 |
15 |
cdef class SGD:
    """Simple online learner using a hashing trick.

    Attributes:
        epoch (int): number of epochs
        n (int): number of features after hashing trick
        a (double): initial learning rate
        l1 (double): L1 regularization parameter
        l2 (double): L2 regularization parameter
        w (array of double): feature weights
        c (array of double): counters for weights
        interaction (boolean): whether to use 2nd order interaction or not
    """
    cdef unsigned int epoch
    cdef unsigned int n
    cdef double a
    cdef double l1
    cdef double l2
    cdef double[:] w
    cdef double[:] c
    cdef bint interaction

    def __init__(self,
                 double a=0.01,
                 double l1=0.0,
                 double l2=0.0,
                 unsigned int n=2**20,
                 unsigned int epoch=10,
                 bint interaction=True):
        """Initialize the SGD class object.

        Args:
            a (double): initial learning rate
            l1 (double): L1 regularization parameter
            l2 (double): L2 regularization parameter
            n (int): number of features after hashing trick
            epoch (int): number of epochs
            interaction (boolean): whether to use 2nd order interaction or not
        """

        self.epoch = epoch
        self.n = n      # # of features
        self.a = a      # learning rate
        self.l1 = l1
        self.l2 = l2

        # initialize weights and counts; index n is reserved for the bias.
        self.w = np.zeros((self.n + 1,), dtype=np.float64)
        self.c = np.zeros((self.n + 1,), dtype=np.float64)
        self.interaction = interaction

    def __repr__(self):
        return ('SGD(a={}, l1={}, l2={}, n={}, epoch={}, interaction={})').format(
            self.a, self.l1, self.l2, self.n, self.epoch, self.interaction
        )

    def _indices(self, list x):
        """Yield hashed weight indices for features in x.

        The bias index, self.n, is always yielded first.  When interaction
        is enabled, indices for all sorted feature pairs follow.
        """
        cdef unsigned int index
        cdef int l
        cdef int i
        cdef int j

        yield self.n

        for index in x:
            yield abs(hash(index)) % self.n

        if self.interaction:
            l = len(x)
            x = sorted(x)
            # range instead of Python-2-only xrange for Python 3 compatibility
            for i in range(l):
                for j in range(i + 1, l):
                    yield abs(hash('{}_{}'.format(x[i], x[j]))) % self.n

    def read_sparse(self, path):
        """Apply hashing trick to the libsvm format sparse file.

        Args:
            path (str): a file path to the libsvm format sparse file

        Yields:
            x (list of int): a list of index of non-zero features
            y (int): target value
        """
        for line in open(path):
            xs = line.rstrip().split(' ')

            y = int(xs[0])
            x = []
            for item in xs[1:]:
                index, _ = item.split(':')
                x.append(abs(hash(index)) % self.n)

            yield x, y

    def fit(self, X, y):
        """Update the model with a sparse input feature matrix and its targets.

        Args:
            X (scipy.sparse.csr_matrix): a sparse matrix of input features
            y (numpy.array): targets

        Returns:
            updated model weights and counts
        """
        for epoch in range(self.epoch):
            for row in range(X.shape[0]):
                x = list(X[row].indices)
                self.update_one(x, self.predict_one(x) - y[row])

    def predict(self, X):
        """Predict for a sparse matrix X.

        Args:
            X (scipy.sparse.csr_matrix): a sparse matrix for input features

        Returns:
            p (numpy.array): predictions for input features
        """
        p = np.zeros((X.shape[0], ), dtype=np.float64)
        for row in range(X.shape[0]):
            p[row] = self.predict_one(list(X[row].indices))

        return p

    def predict_one(self, list x):
        """Predict for features.

        Args:
            x (list of int): a list of index of non-zero features

        Returns:
            p (double): a prediction for input features
        """
        cdef int i
        cdef double wTx

        wTx = 0.
        for i in self._indices(x):
            wTx += self.w[i]

        return sigm(wTx)

    def update_one(self, list x, double e):
        """Update the model.

        Args:
            x (list of int): a list of index of non-zero features
            e (double): error between the prediction of the model and target

        Returns:
            updated model weights and counts
        """
        cdef int i
        cdef double g2

        g2 = e * e
        for i in self._indices(x):
            # gradient step with L1/L2 penalties; learning rate decays with
            # the accumulated squared error count per weight (AdaGrad-style).
            self.w[i] -= (e +
                          (self.l1 if self.w[i] >= 0. else -self.l1) +
                          self.l2 * self.w[i]) * self.a / (sqrt(self.c[i]) + 1)
            self.c[i] += g2
180 |
--------------------------------------------------------------------------------
/kaggler/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | from .data import OneHotEncoder
2 | from .data import LabelEncoder
3 | from .data import Normalizer
4 |
--------------------------------------------------------------------------------
/kaggler/preprocessing/data.py:
--------------------------------------------------------------------------------
1 | from scipy import sparse
2 | from scipy.stats import norm
3 | from statsmodels.distributions.empirical_distribution import ECDF
4 | import logging
5 | import numpy as np
6 | import pandas as pd
7 |
8 |
9 | NAN_INT = 7535805
10 |
11 |
class Normalizer(object):
    """Normalizer that transforms numerical columns into normal distribution.

    Attributes:
        ecdfs (list of empirical CDF): empirical CDFs for columns
    """

    def fit(self, X, y=None):
        """Fit one empirical CDF per column of X.

        Args:
            X (numpy.array): numerical columns to fit
            y: ignored; present for a scikit-learn-compatible signature

        Returns:
            self, so calls can be chained (consistent with LabelEncoder.fit)
        """
        self.ecdfs = [ECDF(X[:, col]) for col in range(X.shape[1])]

        return self

    def transform(self, X):
        """Normalize numerical columns.

        Args:
            X (numpy.array) : numerical columns to normalize

        Returns:
            X (numpy.array): normalized numerical columns
        """

        for col in range(X.shape[1]):
            X[:, col] = self._transform_col(X[:, col], col)

        return X

    def fit_transform(self, X, y=None):
        """Fit empirical CDFs and normalize numerical columns in one pass.

        Args:
            X (numpy.array) : numerical columns to normalize

        Returns:
            X (numpy.array): normalized numerical columns
        """
        # reuse fit() and transform() rather than duplicating their logic
        return self.fit(X, y).transform(X)

    def _transform_col(self, x, col):
        """Normalize one numerical column.

        Args:
            x (numpy.array): a numerical column to normalize
            col (int): column index

        Returns:
            A normalized feature vector.
        """
        # squeeze the ECDF output into (0.001, 0.999) so norm.ppf never
        # returns +/- infinity at the distribution tails
        return norm.ppf(self.ecdfs[col](x) * .998 + .001)
70 |
71 |
class LabelEncoder(object):
    """Label Encoder that groups infrequent values into one label.

    Attributes:
        min_obs (int): minimum number of observation to assign a label.
        label_encoders (list of dict): label encoders for columns
        label_maxes (list of int): maximum of labels for columns
    """

    def __init__(self, min_obs=10):
        """Initialize the LabelEncoder class object.

        Args:
            min_obs (int): minimum number of observation to assign a label.
        """

        self.min_obs = min_obs

    def __repr__(self):
        return ('LabelEncoder(min_obs={})').format(self.min_obs)

    def _get_label_encoder_and_max(self, x):
        """Return a mapping from values and its maximum of a column to integer labels.

        Args:
            x (numpy.array): a categorical column to encode.

        Returns:
            label_encoder (dict): mapping from values of features to integers
            max_label (int): maximum label
        """

        # NaN cannot be used as a key for dict. So replace it with a random integer.
        x[pd.isnull(x)] = NAN_INT

        # count each unique value
        label_count = {}
        for label in x:
            label_count[label] = label_count.get(label, 0) + 1

        # add unique values appearing at least min_obs times to the encoder;
        # values seen fewer times share the implicit label 0.
        label_encoder = {}
        label_index = 1
        labels_not_encoded = 0
        for label in label_count.keys():
            if label_count[label] >= self.min_obs:
                label_encoder[label] = label_index
                label_index += 1
            else:
                labels_not_encoded += 1

        max_label = label_index - 1

        # if every label is encoded, then replace the maximum label with 0 so
        # that total number of labels encoded is (# of total labels - 1).
        if labels_not_encoded == 0:
            for label in label_encoder:
                # find the label with the maximum encoded value
                if label_encoder[label] == max_label:
                    # set the value of the label to 0 and decrease the maximum
                    # by 1.
                    label_encoder[label] = 0
                    max_label -= 1
                    break

        return label_encoder, max_label

    def _transform_col(self, x, col):
        """Encode one categorical column into labels.

        Args:
            x (numpy.array): a categorical column to encode
            col (int): column index

        Returns:
            x (numpy.array): a column with labels.
        """

        label_encoder = self.label_encoders[col]

        # replace NaNs with the pre-defined random integer
        x[pd.isnull(x)] = NAN_INT

        # values without an encoding keep the default label 0
        labels = np.zeros((x.shape[0], ))
        for label in label_encoder:
            labels[x == label] = label_encoder[label]

        return labels

    def fit(self, X, y=None):
        """Fit label encoders for all columns of X.

        Args:
            X (numpy.array): categorical columns to encode
            y: ignored; present for a scikit-learn-compatible signature

        Returns:
            self
        """
        self.label_encoders = [None] * X.shape[1]
        self.label_maxes = [None] * X.shape[1]

        for col in range(X.shape[1]):
            self.label_encoders[col], self.label_maxes[col] = \
                self._get_label_encoder_and_max(X[:, col])

        return self

    def transform(self, X):
        """Encode categorical columns into label encoded columns.

        Args:
            X (numpy.array): categorical columns to encode

        Returns:
            X (numpy.array): label encoded columns
        """

        for col in range(X.shape[1]):
            X[:, col] = self._transform_col(X[:, col], col)

        return X

    def fit_transform(self, X, y=None):
        """Encode categorical columns into label encoded columns

        Args:
            X (numpy.array): categorical columns to encode

        Returns:
            X (numpy.array): label encoded columns
        """
        # columns are independent, so fitting all encoders first and then
        # transforming is equivalent to the interleaved per-column version
        return self.fit(X, y).transform(X)
209 |
210 |
class OneHotEncoder(object):
    """One-Hot-Encoder that groups infrequent values into one dummy variable.

    Attributes:
        min_obs (int): minimum number of observation to create a dummy variable
        label_encoder (LabelEncoder): label encoder that maps column values
            to integer labels before dummy encoding
    """

    def __init__(self, min_obs=10):
        """Initialize the OneHotEncoder class object.

        Args:
            min_obs (int): minimum number of observation to create a dummy variable
        """

        self.min_obs = min_obs
        self.label_encoder = LabelEncoder(min_obs)

    def __repr__(self):
        return ('OneHotEncoder(min_obs={})').format(self.min_obs)

    def _transform_col(self, x, col):
        """Encode one categorical column into sparse matrix with one-hot-encoding.

        Args:
            x (numpy.array): a categorical column to encode
            col (int): column index

        Returns:
            X (scipy.sparse.coo_matrix): sparse matrix encoding a categorical
                variable into dummy variables, or None if the column has no
                non-zero label
        """

        labels = self.label_encoder._transform_col(x, col)
        label_max = self.label_encoder.label_maxes[col]

        # build row and column index for non-zero values of a sparse matrix
        index = np.array(range(len(labels)))
        i = index[labels > 0]
        j = labels[labels > 0] - 1  # column index starts from 0

        if len(i) > 0:
            return sparse.coo_matrix((np.ones_like(i), (i, j)),
                                     shape=(x.shape[0], label_max))
        else:
            # if there is no non-zero value, return no matrix
            return None

    def fit(self, X, y=None):
        self.label_encoder.fit(X)

        return self

    def transform(self, X):
        """Encode categorical columns into sparse matrix with one-hot-encoding.

        Args:
            X (numpy.array): categorical columns to encode

        Returns:
            X_new (scipy.sparse.coo_matrix): sparse matrix encoding categorical
                variables into dummy variables
        """

        # BUG FIX: previously the accumulator was keyed on col == 0, so if the
        # first column produced no dummy variables (None), X_new was referenced
        # before assignment.  Track the accumulator explicitly instead.
        X_new = None
        for col in range(X.shape[1]):
            X_col = self._transform_col(X[:, col], col)
            if X_col is not None:
                if X_new is None:
                    X_new = X_col
                else:
                    X_new = sparse.hstack((X_new, X_col))

            logging.debug('{} --> {} features'.format(
                col, self.label_encoder.label_maxes[col])
            )

        return X_new

    def fit_transform(self, X, y=None):
        """Encode categorical columns into sparse matrix with one-hot-encoding.

        Args:
            X (numpy.array): categorical columns to encode

        Returns:
            sparse matrix encoding categorical variables into dummy variables
        """

        return self.fit(X).transform(X)
304 |
--------------------------------------------------------------------------------
/kaggler/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qqgeogor/Kaggler/f53ab7f47eec731648fa03064ec3b7fc11f92396/kaggler/test/__init__.py
--------------------------------------------------------------------------------
/kaggler/test/test_sgd.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 |
4 | from kaggler.online_model import SGD
5 |
6 |
# libsvm-format rows ("<label> <index>:<value> ...") used to build a dummy file.
DUMMY_SPARSE_STR = """0 1:1 3:1 10:1
0 3:1 5:1
1 4:1 6:1 8:1 10:1"""

# Expected labels and per-row feature counts parsed from the dummy file above.
DUMMY_Y = [0, 0, 1]
DUMMY_LEN_X = [3, 2, 4]
13 |
class TestSGD(unittest.TestCase):
    """Tests for SGD's sparse-file reader."""

    def setUp(self):
        # Small hash space keeps the test fast; interaction terms enabled to
        # exercise pair hashing as well.
        self.model = SGD(n=2**10, a=0.1, l1=1, l2=1, interaction=True)
        self.sparse_file = '/tmp/dummy.sps'

        # Write the dummy libsvm-format file the tests read back.
        with open(self.sparse_file, 'w') as f:
            f.write(DUMMY_SPARSE_STR)

    def tearDown(self):
        # Clean up the dummy file if it is still around.
        if os.path.isfile(self.sparse_file):
            os.remove(self.sparse_file)

    def test_read_sparse(self):
        lengths = []
        targets = []
        for features, target in self.model.read_sparse(self.sparse_file):
            # hashed feature indices within a row must be collision-free
            self.assertEqual(len(set(features)), len(features))

            targets.append(target)
            lengths.append(len(features))

        # target values parsed correctly
        self.assertEqual(targets, DUMMY_Y)

        # per-row feature counts parsed correctly
        self.assertEqual(lengths, DUMMY_LEN_X)
44 |
45 |
# Allow running the tests directly: python test_sgd.py
if __name__ == '__main__':
    unittest.main()
48 |
49 |
--------------------------------------------------------------------------------
/kaggler/util.pxd:
--------------------------------------------------------------------------------
# C-level min/max helpers shared by the Cython modules in this package.
cdef inline double fmax(double a, double b): return a if a >= b else b
cdef inline double fmin(double a, double b): return a if a <= b else b

# Bounded sigmoid; implemented in util.pyx.
cdef double sigm(double x)
5 |
--------------------------------------------------------------------------------
/kaggler/util.pyx:
--------------------------------------------------------------------------------
1 | # cython: boundscheck=False
2 | # cython: wraparound=False
3 | # cython: cdivision=True
4 | from __future__ import division
5 | from scipy import sparse
6 |
7 | import logging
8 | import numpy as np
9 |
10 | cimport cython
11 | from libc.math cimport exp, log
12 | cimport numpy as np
13 |
14 |
15 | np.import_array()
16 |
17 |
cdef double sigm(double x):
    """Sigmoid of x, with the input clamped to [-20, 20] to avoid overflow."""
    cdef double clamped
    clamped = fmax(fmin(x, 20.0), -20.0)
    return 1.0 / (1.0 + exp(-clamped))
21 |
22 |
def get_downsampled_index(n, rate=0.):
    """Randomly pick ``int(n * rate)`` distinct indices from ``range(n)``.

    Args:
        n (int): size of the index space to sample from.
        rate (float): fraction of indices to keep.

    Returns:
        numpy.array of sampled indices, without replacement.
    """
    n_sample = int(n * rate)

    return np.random.choice(range(n), n_sample, replace=False)
27 |
28 |
def get_downsampled_index0(x, rate=0., threshold=0.):
    """Downsample the entries of x at or below threshold, keeping the rest.

    Args:
        x (numpy.array): values to split by threshold.
        rate (float): fraction of the "zero" (<= threshold) entries to keep.
        threshold (float): boundary between kept and downsampled entries.

    Returns:
        list of shuffled indices: all above-threshold positions plus a random
        subset of the rest.
    """
    idx_pos = np.where(x > threshold)[0]
    idx_zero = np.where(x <= threshold)[0]
    n_keep = int(len(idx_zero) * rate)
    idx_zero_kept = np.random.choice(idx_zero, n_keep, replace=False)

    combined = list(idx_zero_kept) + list(idx_pos)
    np.random.shuffle(combined)

    return combined
40 |
41 |
def set_column_width(X, n_col):
    """Pad with zero columns or truncate X so it has exactly n_col columns.

    Args:
        X (numpy.array or scipy sparse matrix): input matrix.
        n_col (int): desired number of columns.

    Returns:
        X with n_col columns; sparse inputs come back as CSR.
    """
    n_current = X.shape[1]

    if n_current < n_col:
        pad = np.zeros((X.shape[0], n_col - n_current))
        if sparse.issparse(X):
            X = sparse.hstack((X, pad)).tocsr()
        else:
            X = np.hstack((X, pad))
    elif n_current > n_col:
        # slicing [:, :n_col] is equivalent to dropping the trailing columns
        if sparse.issparse(X):
            X = X.tocsc()[:, :n_col].tocsr()
        else:
            X = X[:, :n_col]

    return X
60 |
61 |
def rank(x):
    """Rank a vector x. Ties will be averaged.

    Args:
        x (numpy.array): values to rank.

    Returns:
        numpy.array of float ranks (0-based); tied values share the mean of
        the ranks they span.
    """

    unique, idx_inverse = np.unique(x, return_inverse=True)

    unique_rank_sum = np.zeros_like(unique)
    unique_rank_count = np.zeros_like(unique)

    # double argsort yields each element's 0-based rank in sorted order
    np.add.at(unique_rank_sum, idx_inverse, x.argsort().argsort())
    np.add.at(unique_rank_count, idx_inverse, 1)

    # np.float was removed in NumPy 1.20+; use the builtin float instead.
    unique_rank_mean = unique_rank_sum.astype(float) / unique_rank_count

    return unique_rank_mean[idx_inverse]
76 |
77 |
def set_min_max(x, lb, ub):
    """Clip the values of x into [lb, ub] in place and return x."""
    np.clip(x, lb, ub, out=x)

    return x
83 |
84 |
def point(rank, n_team, n_teammate=1, t=0):
    """Calculate Kaggle points to earn after a competition.

    Args:
        rank (int): final ranking in the private leaderboard.
        n_team (int): the number of teams participated in the competition.
        n_teammate (int): the number of team members in my team.
        t (int): the number of days since the competition ends.

    Returns:
        returns Kaggle points to earn after a compeittion.
    """
    base = 1e5 / np.sqrt(n_teammate)
    rank_factor = rank ** -.75
    size_factor = np.log10(1 + np.log10(n_team))
    decay = np.exp(-t / 500)

    return base * rank_factor * size_factor * decay
99 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, Extension
2 | from Cython.Distutils import build_ext
3 |
4 | import numpy as np
5 |
try:
    from pypandoc import convert

    def read_md(f):
        """Convert a Markdown file to RST for the long description."""
        return convert(f, 'rst')
except ImportError:
    print("warning: pypandoc module not found, could not convert Markdown to RST")

    def read_md(f):
        """Fall back to returning the raw Markdown text."""
        return open(f, 'r').read()


def _cython_ext(name, sources):
    """Build an -O3 compiled Cython Extension with the NumPy headers."""
    return Extension(name, sources,
                     libraries=[],
                     include_dirs=[np.get_include(), '.'],
                     extra_compile_args=['-O3'])


setup(
    name='Kaggler',
    version='0.4.1',

    author='Jeong-Yoon Lee',
    author_email='jeongyoon.lee1@gmail.com',

    packages=['kaggler',
              'kaggler.model',
              'kaggler.metrics',
              'kaggler.online_model',
              'kaggler.preprocessing',
              'kaggler.test'],
    url='https://github.com/jeongyoonlee/Kaggler',
    license='LICENSE.txt',

    description='Code for Kaggle Data Science Competitions.',
    long_description=read_md('README.md'),

    install_requires=[
        'cython',
        'numpy',
        'scipy >= 0.14.0',
        'scikit-learn >= 0.15.0',
        'statsmodels >= 0.5.0',
    ],

    cmdclass={'build_ext': build_ext},
    ext_modules=[
        _cython_ext('kaggler.online_model.ftrl',
                    ['kaggler/online_model/ftrl.pyx']),
        _cython_ext('kaggler.online_model.sgd',
                    ['kaggler/online_model/sgd.pyx']),
        _cython_ext('kaggler.online_model.fm',
                    ['kaggler/online_model/fm.pyx']),
        _cython_ext('kaggler.online_model.nn',
                    ['kaggler/online_model/nn.pyx']),
        _cython_ext('kaggler.online_model.nn_h2',
                    ['kaggler/online_model/nn_h2.pyx']),
        _cython_ext('kaggler.util',
                    ['kaggler/util.pyx', 'kaggler/util.pxd']),
        # NOTE: '-fopenmp' compile/link flags for ftrl_fm were previously
        # commented out and remain disabled; plain -O3 is used.
        _cython_ext('kaggler.online_model.ftrl_fm',
                    ['kaggler/online_model/ftrl_fm.pyx']),
    ],
)
84 |
--------------------------------------------------------------------------------