├── .gitignore
├── LICENSE
├── README.md
├── auto_smart
├── LICENSE
├── MANIFEST.in
├── README.md
├── ac.c
├── ac.pyx
├── auto_smart
│ ├── CONSTANT.py
│ ├── PATHS.py
│ ├── __init__.py
│ ├── automl
│ │ ├── __init__.py
│ │ ├── auto_lgb.py
│ │ ├── automl.py
│ │ ├── autosample.py
│ │ └── model_selection.py
│ ├── config.py
│ ├── data_tools.py
│ ├── feat
│ │ ├── __init__.py
│ │ ├── default_feat.py
│ │ ├── default_merge_feat.py
│ │ ├── feat.py
│ │ ├── feat_pipeline.py
│ │ ├── feat_selection.py
│ │ ├── merge_feat.py
│ │ └── merge_feat_pipeline.py
│ ├── feat_context.py
│ ├── feat_engine.py
│ ├── merger.py
│ ├── metadata
│ ├── model.py
│ ├── model_input.py
│ ├── preprocessor
│ │ ├── __init__.py
│ │ └── preprocessor.py
│ ├── table
│ │ ├── __init__.py
│ │ ├── graph.py
│ │ └── table.py
│ └── util.py
└── setup.py
└── demo
├── data
├── test
│ └── main_test.data
└── train
│ ├── info.json
│ ├── main_train.data
│ ├── main_train.solution
│ ├── table_1.data
│ ├── table_2.data
│ └── table_3.data
└── demo.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | #*.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | .DS_Store
107 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 |
13 | The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. By contrast,
15 | the GNU General Public License is intended to guarantee your freedom to
16 | share and change all versions of a program--to make sure it remains free
17 | software for all its users. We, the Free Software Foundation, use the
18 | GNU General Public License for most of our software; it applies also to
19 | any other work released this way by its authors. You can apply it to
20 | your programs, too.
21 |
22 | When we speak of free software, we are referring to freedom, not
23 | price. Our General Public Licenses are designed to make sure that you
24 | have the freedom to distribute copies of free software (and charge for
25 | them if you wish), that you receive source code or can get it if you
26 | want it, that you can change the software or use pieces of it in new
27 | free programs, and that you know you can do these things.
28 |
29 | To protect your rights, we need to prevent others from denying you
30 | these rights or asking you to surrender the rights. Therefore, you have
31 | certain responsibilities if you distribute copies of the software, or if
32 | you modify it: responsibilities to respect the freedom of others.
33 |
34 | For example, if you distribute copies of such a program, whether
35 | gratis or for a fee, you must pass on to the recipients the same
36 | freedoms that you received. You must make sure that they, too, receive
37 | or can get the source code. And you must show them these terms so they
38 | know their rights.
39 |
40 | Developers that use the GNU GPL protect your rights with two steps:
41 | (1) assert copyright on the software, and (2) offer you this License
42 | giving you legal permission to copy, distribute and/or modify it.
43 |
44 | For the developers' and authors' protection, the GPL clearly explains
45 | that there is no warranty for this free software. For both users' and
46 | authors' sake, the GPL requires that modified versions be marked as
47 | changed, so that their problems will not be attributed erroneously to
48 | authors of previous versions.
49 |
50 | Some devices are designed to deny users access to install or run
51 | modified versions of the software inside them, although the manufacturer
52 | can do so. This is fundamentally incompatible with the aim of
53 | protecting users' freedom to change the software. The systematic
54 | pattern of such abuse occurs in the area of products for individuals to
55 | use, which is precisely where it is most unacceptable. Therefore, we
56 | have designed this version of the GPL to prohibit the practice for those
57 | products. If such problems arise substantially in other domains, we
58 | stand ready to extend this provision to those domains in future versions
59 | of the GPL, as needed to protect the freedom of users.
60 |
61 | Finally, every program is threatened constantly by software patents.
62 | States should not allow patents to restrict development and use of
63 | software on general-purpose computers, but in those that do, we wish to
64 | avoid the special danger that patents applied to a free program could
65 | make it effectively proprietary. To prevent this, the GPL assures that
66 | patents cannot be used to render the program non-free.
67 |
68 | The precise terms and conditions for copying, distribution and
69 | modification follow.
70 |
71 | TERMS AND CONDITIONS
72 |
73 | 0. Definitions.
74 |
75 | "This License" refers to version 3 of the GNU General Public License.
76 |
77 | "Copyright" also means copyright-like laws that apply to other kinds of
78 | works, such as semiconductor masks.
79 |
80 | "The Program" refers to any copyrightable work licensed under this
81 | License. Each licensee is addressed as "you". "Licensees" and
82 | "recipients" may be individuals or organizations.
83 |
84 | To "modify" a work means to copy from or adapt all or part of the work
85 | in a fashion requiring copyright permission, other than the making of an
86 | exact copy. The resulting work is called a "modified version" of the
87 | earlier work or a work "based on" the earlier work.
88 |
89 | A "covered work" means either the unmodified Program or a work based
90 | on the Program.
91 |
92 | To "propagate" a work means to do anything with it that, without
93 | permission, would make you directly or secondarily liable for
94 | infringement under applicable copyright law, except executing it on a
95 | computer or modifying a private copy. Propagation includes copying,
96 | distribution (with or without modification), making available to the
97 | public, and in some countries other activities as well.
98 |
99 | To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies. Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 |
103 | An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License. If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 |
112 | 1. Source Code.
113 |
114 | The "source code" for a work means the preferred form of the work
115 | for making modifications to it. "Object code" means any non-source
116 | form of a work.
117 |
118 | A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 |
123 | The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form. A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 |
134 | The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities. However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work. For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 |
147 | The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 |
151 | The Corresponding Source for a work in source code form is that
152 | same work.
153 |
154 | 2. Basic Permissions.
155 |
156 | All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met. This License explicitly affirms your unlimited
159 | permission to run the unmodified Program. The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work. This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 |
164 | You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force. You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright. Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 |
175 | Conveying under any other circumstances is permitted solely under
176 | the conditions stated below. Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 |
179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 |
181 | No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 |
187 | When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 |
195 | 4. Conveying Verbatim Copies.
196 |
197 | You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 |
205 | You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 |
208 | 5. Conveying Modified Source Versions.
209 |
210 | You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 |
214 | a) The work must carry prominent notices stating that you modified
215 | it, and giving a relevant date.
216 |
217 | b) The work must carry prominent notices stating that it is
218 | released under this License and any conditions added under section
219 | 7. This requirement modifies the requirement in section 4 to
220 | "keep intact all notices".
221 |
222 | c) You must license the entire work, as a whole, under this
223 | License to anyone who comes into possession of a copy. This
224 | License will therefore apply, along with any applicable section 7
225 | additional terms, to the whole of the work, and all its parts,
226 | regardless of how they are packaged. This License gives no
227 | permission to license the work in any other way, but it does not
228 | invalidate such permission if you have separately received it.
229 |
230 | d) If the work has interactive user interfaces, each must display
231 | Appropriate Legal Notices; however, if the Program has interactive
232 | interfaces that do not display Appropriate Legal Notices, your
233 | work need not make them do so.
234 |
235 | A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit. Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 |
245 | 6. Conveying Non-Source Forms.
246 |
247 | You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 |
252 | a) Convey the object code in, or embodied in, a physical product
253 | (including a physical distribution medium), accompanied by the
254 | Corresponding Source fixed on a durable physical medium
255 | customarily used for software interchange.
256 |
257 | b) Convey the object code in, or embodied in, a physical product
258 | (including a physical distribution medium), accompanied by a
259 | written offer, valid for at least three years and valid for as
260 | long as you offer spare parts or customer support for that product
261 | model, to give anyone who possesses the object code either (1) a
262 | copy of the Corresponding Source for all the software in the
263 | product that is covered by this License, on a durable physical
264 | medium customarily used for software interchange, for a price no
265 | more than your reasonable cost of physically performing this
266 | conveying of source, or (2) access to copy the
267 | Corresponding Source from a network server at no charge.
268 |
269 | c) Convey individual copies of the object code with a copy of the
270 | written offer to provide the Corresponding Source. This
271 | alternative is allowed only occasionally and noncommercially, and
272 | only if you received the object code with such an offer, in accord
273 | with subsection 6b.
274 |
275 | d) Convey the object code by offering access from a designated
276 | place (gratis or for a charge), and offer equivalent access to the
277 | Corresponding Source in the same way through the same place at no
278 | further charge. You need not require recipients to copy the
279 | Corresponding Source along with the object code. If the place to
280 | copy the object code is a network server, the Corresponding Source
281 | may be on a different server (operated by you or a third party)
282 | that supports equivalent copying facilities, provided you maintain
283 | clear directions next to the object code saying where to find the
284 | Corresponding Source. Regardless of what server hosts the
285 | Corresponding Source, you remain obligated to ensure that it is
286 | available for as long as needed to satisfy these requirements.
287 |
288 | e) Convey the object code using peer-to-peer transmission, provided
289 | you inform other peers where the object code and Corresponding
290 | Source of the work are being offered to the general public at no
291 | charge under subsection 6d.
292 |
293 | A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 |
297 | A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling. In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage. For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product. A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 |
310 | "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source. The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 |
318 | If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information. But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 |
329 | The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed. Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 |
337 | Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 |
343 | 7. Additional Terms.
344 |
345 | "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law. If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 |
354 | When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it. (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.) You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 |
361 | Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 |
365 | a) Disclaiming warranty or limiting liability differently from the
366 | terms of sections 15 and 16 of this License; or
367 |
368 | b) Requiring preservation of specified reasonable legal notices or
369 | author attributions in that material or in the Appropriate Legal
370 | Notices displayed by works containing it; or
371 |
372 | c) Prohibiting misrepresentation of the origin of that material, or
373 | requiring that modified versions of such material be marked in
374 | reasonable ways as different from the original version; or
375 |
376 | d) Limiting the use for publicity purposes of names of licensors or
377 | authors of the material; or
378 |
379 | e) Declining to grant rights under trademark law for use of some
380 | trade names, trademarks, or service marks; or
381 |
382 | f) Requiring indemnification of licensors and authors of that
383 | material by anyone who conveys the material (or modified versions of
384 | it) with contractual assumptions of liability to the recipient, for
385 | any liability that these contractual assumptions directly impose on
386 | those licensors and authors.
387 |
388 | All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10. If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term. If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 |
398 | If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 |
403 | Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 |
407 | 8. Termination.
408 |
409 | You may not propagate or modify a covered work except as expressly
410 | provided under this License. Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 |
415 | However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 |
422 | Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 |
429 | Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License. If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 |
435 | 9. Acceptance Not Required for Having Copies.
436 |
437 | You are not required to accept this License in order to receive or
438 | run a copy of the Program. Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance. However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work. These actions infringe copyright if you do
443 | not accept this License. Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 |
446 | 10. Automatic Licensing of Downstream Recipients.
447 |
448 | Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License. You are not responsible
451 | for enforcing compliance by third parties with this License.
452 |
453 | An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations. If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 |
463 | You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License. For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 |
471 | 11. Patents.
472 |
473 | A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based. The
475 | work thus licensed is called the contributor's "contributor version".
476 |
477 | A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version. For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 |
487 | Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 |
492 | In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement). To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 |
499 | If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients. "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 |
513 | If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 |
521 | A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License. You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 |
536 | Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 |
540 | 12. No Surrender of Others' Freedom.
541 |
542 | If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License. If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all. For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 |
552 | 13. Use with the GNU Affero General Public License.
553 |
554 | Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work. The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 |
563 | 14. Revised Versions of this License.
564 |
565 | The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time. Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 |
570 | Each version is given a distinguishing version number. If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation. If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 |
579 | If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 |
584 | Later license versions may give you additional or different
585 | permissions. However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 |
589 | 15. Disclaimer of Warranty.
590 |
591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 |
600 | 16. Limitation of Liability.
601 |
602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 |
612 | 17. Interpretation of Sections 15 and 16.
613 |
614 | If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 |
621 | END OF TERMS AND CONDITIONS
622 |
623 | How to Apply These Terms to Your New Programs
624 |
625 | If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 |
629 | To do so, attach the following notices to the program. It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 |
634 |
635 | Copyright (C)
636 |
637 | This program is free software: you can redistribute it and/or modify
638 | it under the terms of the GNU General Public License as published by
639 | the Free Software Foundation, either version 3 of the License, or
640 | (at your option) any later version.
641 |
642 | This program is distributed in the hope that it will be useful,
643 | but WITHOUT ANY WARRANTY; without even the implied warranty of
644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645 | GNU General Public License for more details.
646 |
647 | You should have received a copy of the GNU General Public License
648 | along with this program. If not, see .
649 |
650 | Also add information on how to contact you by electronic and paper mail.
651 |
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 |
655 | Copyright (C)
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 |
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 |
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | .
668 |
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
674 | .
675 |
676 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | [](https://github.com/DeepBlueAI/AutoSmart/blob/master/LICENSE)
3 | # The introduction of AutoSmart
4 | The 1st place solution for KDD Cup 2019 AutoML Track
5 |
6 | # How to install
7 |
8 | Requirements: [Cython with C compiler](https://docs.cython.org/en/latest/src/quickstart/install.html).
9 |
10 | clone or download autosmart package, run
11 |
12 | ```python
13 | python setup.py install
14 | ```
15 |
16 | # How to use
17 | ```python
18 | import auto_smart
19 |
20 | info = auto_smart.read_info("data")
21 | train_data,train_label = auto_smart.read_train("data",info)
22 | test_data = auto_smart.read_test("data",info)
23 | auto_smart.train_and_predict(train_data,train_label,info,test_data)
24 | ```
25 | # Data Sample
26 | ### Data
27 |
28 | This page describes the datasets that our system can deal with.
29 |
30 | #### Components
31 | Each dataset is split into two subsets, namely the training set and the testing set.
32 |
33 | Both sets have:
34 |
35 | - a **main table file** that stores the **main table** (label excluded);
36 | - multiple **related table files** that store the **related tables**;
37 | - an **info dictionary** that contains important information about the dataset, including table relations;
38 | - The training set has an additional **label file** that stores **labels** associated with the **main table**.
39 |
40 | ### Table files
41 |
42 | Each **table file** is a CSV file that stores a table (main or related), with '**\t**' as the delimiter. The first row indicates the names of features, a.k.a 'schema', and the following rows are the records.
43 |
44 | The type of each feature can be found in the info dictionary that will be introduced soon.
45 |
46 | There are 4 types of features, indicated by "cat", "num", "multi-cat", and "time", respectively:
47 |
48 | - **cat**: categorical feature, an integer
49 | - **num**: numerical Feature, a real value.
50 | - **multi-cat**: multi-value categorical Feature: a set of integers, split by the comma. The size of the set is not fixed and can be different for each instance. For example, topics of an article, words in a title, items bought by a user and so on.
51 | - **time**: time feature, an integer that indicates the timestamp.
52 |
53 |
54 | ### Label file
55 | The **label file** is associated only with the main table in the training set. It is a CSV file that contains only one column, with the first row as the header and the remaining indicating labels associated with instances in the main table.
56 |
57 | ### info dictionary
58 | Important information about each dataset is stored in a python dictionary structure named as **info**, which acts as an input of this system. Generally,you need to manually generate the dictionary information info.json file. Here we give details about info.
59 |
60 | 
61 |
62 | Descriptions of the keys in info:
63 |
64 | - **time_budget**: time budget for this dataset (sec).
65 | - **time_col**: the column name of the primary timestamp; Each dataset has one unique time_col; time_col is definitely contained in the main table, but not necessarily in a related table;
66 | - **start_time**: DEPRECATED.
67 | - **tables**: a dictionary that stores information about tables. Each key indicates a table, and its corresponding value is a dictionary that indicates the type of each column in this table. Two kinds of keys are contained in tables:
68 | - **main**: the main table;
69 | - **table_{i}**: the i-th related table.
70 | - There are 4 types of features, indicated by "cat", "num", "multi-cat", and "time", respectively:
71 | - **cat**: categorical feature, an integer
72 | - **num**: numerical Feature, a real value.
73 | - **multi-cat**: multi-value categorical Feature: a set of integers, split by the comma. The size of the set is not fixed and can be different for each instance. For example, topics of an article, words in a title, items bought by a user and so on.
74 | - **time**: time feature, an integer that indicates the timestamp.
75 |
76 | - **relations**: a list that stores table relations in the dataset. Each relation can be represented as an ordered table pair (**table_A**, **table_B**), a key column **key** that appears in both tables and acts as the pivot of table joining, and a relation type **type**. Different relation types will be introduced shortly.
77 |
78 | #### Relations Between Tables
79 | Four table relations are considered in this system:
80 |
81 | - **one-to-one** (1-1): the key columns in both **table_A** and **table_B** have no duplicated values;
82 | - **one-to-many** (1-M): the key column in **table_A** has no duplicated values, but that in **table_B** may have duplicated values;
83 | - **many-to-one** (M-1): the key column in **table_A** may have duplicated values, but that in **table_B** has no duplicated values;
84 | - **many-to-many** (M-M): the key columns in both **table_A** and **table_B** may have duplicated values.
85 |
86 |
87 |
88 |
89 | # Contact Us
90 | DeepBlueAI: 1229991666@qq.com
91 |
--------------------------------------------------------------------------------
/auto_smart/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.pyx
--------------------------------------------------------------------------------
/auto_smart/README.md:
--------------------------------------------------------------------------------
1 | 
2 | [](https://github.com/DeepBlueAI/AutoSmart/blob/master/LICENSE)
3 | # The introduction of AutoSmart
4 | The 1st place solution for KDD Cup 2019 AutoML Track
5 |
6 | # How to use
7 | This is the link to the competition:https://codalab.lri.fr/competitions/559
8 |
9 | # Contact Us
10 | DeepBlueAI: 1229991666@qq.com
11 |
--------------------------------------------------------------------------------
/auto_smart/ac.pyx:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | from cython cimport boundscheck,wraparound
4 |
5 |
6 |
7 |
8 | @boundscheck(False)
9 | @wraparound(False)
10 | def pre_tuple_encode_func(int[:] muldata, int[:] muldatalens,K):
11 | cdef:
12 | int index = 0
13 | int i,j,N = muldatalens.shape[0]
14 | int les
15 | # list tmp = []
16 | int c_K = K
17 | dict map_dict = {}
18 | int ids = 0
19 |
20 |
21 | ans = np.zeros( N ,dtype=np.float )
22 |
23 | for i in range(N):
24 | les = muldatalens[i]
25 | if les == 0:
26 | ans[i] = np.nan
27 | else:
28 | tmp = []
29 | if c_K > les:
30 | for j in range(index,index+les):
31 | tmp.append(muldata[j])
32 | else:
33 | for j in range(index,index+c_K):
34 | tmp.append(muldata[j])
35 |
36 | thash = tuple( tmp )
37 | if thash in map_dict:
38 | ans[i] = map_dict[ thash ]
39 | else:
40 | map_dict[ thash ] = ids
41 | ans[i] = ids
42 | ids += 1
43 |
44 | index += les
45 |
46 | return ans
47 |
48 |
49 | @boundscheck(False)
50 | @wraparound(False)
51 | def post_tuple_encode_func(int[:] muldata, int[:] muldatalens,K):
52 | cdef:
53 | int index = 0
54 | int i,j,N = muldatalens.shape[0]
55 | int les
56 | # list tmp = []
57 | int c_K = K
58 | dict map_dict = {}
59 | int ids = 0
60 |
61 |
62 | ans = np.zeros( N ,dtype=np.float )
63 |
64 | for i in range(N):
65 | les = muldatalens[i]
66 | if les == 0:
67 | ans[i] = np.nan
68 | else:
69 | tmp = []
70 | if c_K > les:
71 | for j in range(index,index+les):
72 | tmp.append(muldata[j])
73 | else:
74 | for j in range(index+les-c_K,index+les):
75 | tmp.append(muldata[j])
76 |
77 | thash = tuple( tmp )
78 | if thash in map_dict:
79 | ans[i] = map_dict[ thash ]
80 | else:
81 | map_dict[ thash ] = ids
82 | ans[i] = ids
83 | ids += 1
84 |
85 | index += les
86 |
87 | return ans
88 |
89 |
90 | @boundscheck(False)
91 | @wraparound(False)
92 | def tuple_encode_func_1(int[:] muldata, int[:] muldatalens):
93 | cdef:
94 | int index = 0
95 | int i,j,N = muldatalens.shape[0]
96 | int les
97 | dict map_dict = {}
98 | int ids = 1
99 |
100 |
101 | ans = np.zeros( N ,dtype=np.float )
102 |
103 | for i in range(N):
104 | les = muldatalens[i]
105 | if les == 0:
106 | ans[i] = np.nan
107 | else:
108 | tmp = []
109 | for j in range(index,index+les):
110 | tmp.append(muldata[j])
111 |
112 | thash = tuple( tmp )
113 | if thash in map_dict:
114 | ans[i] = map_dict[ thash ]
115 | else:
116 | map_dict[ thash ] = ids
117 | ans[i] = ids
118 | ids += 1
119 |
120 | index += les
121 |
122 | return ans
123 |
124 | #@boundscheck(False)
125 | #@wraparound(False)
126 | #def tuple_encode_func_2(vals):
127 | # cdef:
128 | # int idx,N = vals.shape[0]
129 | # dict map_dict = {}
130 | # int ids = 0
131 | #
132 | # ans = np.zeros( N ,dtype=np.float )
133 | #
134 | # for idx in range(N):
135 | # i = vals[idx]
136 | # if type(i)==float or i==():
137 | # ans[idx] = np.nan
138 | # else:
139 | # if i in map_dict:
140 | # ans[idx] = map_dict[ i ]
141 | # else:
142 | # map_dict[ i ] = ids
143 | # ans[idx] = ids
144 | # ids += 1
145 | #
146 | # return ans
147 |
148 |
149 |
150 | @boundscheck(False)
151 | @wraparound(False)
152 | def cat_in_multi( int[:] muldata, int[:] muldatalens, int[:] catdata ):
153 | cdef:
154 | int index = 0
155 | int i,j,N = muldatalens.shape[0]
156 | int les
157 | int cat
158 | int flag
159 | # list ans = []
160 |
161 | ans = np.zeros( N ,dtype=np.int8 )
162 |
163 | for i in range(N):
164 | les = muldatalens[i]
165 | flag = 0
166 | cat = catdata[ i ]
167 | for j in range(index,index+les):
168 | if muldata[j] == cat:
169 | flag = 1
170 | break
171 |
172 | if flag :
173 | ans[i] = 1
174 | else:
175 | ans[i] = 0
176 |
177 | index += les
178 | return ans
179 |
180 | @boundscheck(False)
181 | @wraparound(False)
182 | def cat_rank_multi( int[:] muldata, int[:] muldatalens, int[:] catdata ):
183 | cdef:
184 | int index = 0
185 | int i,j,N = muldatalens.shape[0]
186 | int les
187 | int cat
188 | int flag
189 | # list ans = []
190 |
191 | ans = np.zeros( N ,dtype=np.int16 )
192 |
193 | for i in range(N):
194 | les = muldatalens[i]
195 | flag = 0
196 | cat = catdata[ i ]
197 | for j in range(index,index+les):
198 | if muldata[j] == cat:
199 | flag = j-index+1
200 | break
201 | ans[i] = flag
202 | index += les
203 | return ans
204 |
205 |
206 | @boundscheck(False)
207 | @wraparound(False)
208 | def cat_frank_multi( int[:] muldata, int[:] muldatalens, int[:] catdata ):
209 | cdef:
210 | int index = 0
211 | int i,j,N = muldatalens.shape[0]
212 | int les
213 | int cat
214 | int flag
215 | # list ans = []
216 |
217 | ans = np.zeros( N ,dtype=np.int16 )
218 |
219 | for i in range(N):
220 | les = muldatalens[i]
221 | flag = 0
222 | cat = catdata[ i ]
223 | for j in range(index,index+les):
224 | if muldata[j] == cat:
225 | flag = index+les - j
226 | break
227 | ans[i] = flag
228 | index += les
229 | return ans
230 |
231 |
232 | @boundscheck(False)
233 | @wraparound(False)
234 | def get_need_data( vals ):
235 | cdef:
236 | int idx,N = vals.shape[0]
237 | list datas = []
238 | list datalen = []
239 |
240 | for idx in range(N):
241 | i = vals[idx]
242 | if type(i) == float:
243 | datalen.append( 0 )
244 | else:
245 | datas.extend( i )
246 | datalen.append( len(i) )
247 |
248 | return datas,datalen
249 |
250 |
251 |
252 |
253 | @boundscheck(False)
254 | @wraparound(False)
255 | def mscat_fit(vals ):
256 | cdef:
257 | set ans = set()
258 | int idx,N = vals.shape[0]
259 |
260 | for idx in range(N):
261 | val = vals[idx]
262 | if type(val) == float:
263 | continue
264 | ans.update( val.split(',') )
265 |
266 | return ans
267 |
268 | @boundscheck(False)
269 | @wraparound(False)
270 | def mscat_trans(vals,cats):
271 | cdef:
272 | dict cat2index = {index: i + 1 for i,index in enumerate(cats)}
273 | list ans = []
274 | int idx,N = vals.shape[0]
275 | list tmp = []
276 |
277 |
278 | for idx in range(N):
279 | val = vals[idx]
280 | if type(val) == float:
281 | ans.append( tuple() )
282 | else:
283 | tmp = []
284 | x = val.split(',')
285 | for i in x:
286 | tmp.append( cat2index[i] )
287 |
288 | ans.append( tuple( tmp ) )
289 |
290 | return ans
291 |
292 |
293 |
--------------------------------------------------------------------------------
/auto_smart/auto_smart/CONSTANT.py:
--------------------------------------------------------------------------------
1 | NUMERICAL_TYPE = "num"
2 | NUMERICAL_PREFIX = "n_"
3 |
4 | CATEGORY_TYPE = "cat"
5 | CATEGORY_PREFIX = "c_"
6 |
7 | TIME_TYPE = "time"
8 | TIME_PREFIX = "t_"
9 |
10 | MULTI_CAT_TYPE = "multi-cat"
11 | MULTI_CAT_PREFIX = "m_"
12 | MULTI_CAT_DELIMITER = ","
13 |
14 | BINARY_TYPE = "binary"
15 | BINARY_PREFIX = 'b_'
16 |
17 | MAIN_TABLE_NAME = "main"
18 | MAIN_TABLE_TEST_NAME = "main_test"
19 | TABLE_PREFIX = "table_"
20 |
21 | LABEL = "label"
22 |
23 | type2prefix = {
24 | NUMERICAL_TYPE:NUMERICAL_PREFIX,
25 | CATEGORY_TYPE:CATEGORY_PREFIX,
26 | TIME_TYPE:TIME_PREFIX,
27 | MULTI_CAT_TYPE:MULTI_CAT_PREFIX,
28 | BINARY_TYPE: BINARY_PREFIX
29 | }
30 |
31 | THREAD_NUM = 4
32 |
33 | SEED = 2222
34 | JOBS = 7
35 |
36 | CAT_SHIFT = 1
37 |
38 | MAX_SAMPLE_NUM = 1000000
39 |
40 | TIME_MIN_BINS = 1000
41 | SEGMENTS = 100
42 |
43 | LESS_LIMIT = 10
44 | SMOOTH_SHIFT = 100
45 | DEVIATION_SHIFT = 100
46 |
47 | KEYWORDS = ["label",'index']
48 |
49 | SPLIT = -1
50 |
51 | round_opt = False
52 |
53 | SAMPLE_NUM = 210000
54 |
55 | USE_ENSEMBLE = 1
56 |
--------------------------------------------------------------------------------
/auto_smart/auto_smart/PATHS.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from datetime import datetime
3 | version = datetime.now().strftime('%Y%m%d%H%M%S')
4 | print('version:{}'.format(version))
5 | feature_importance_path = '../importances/'
6 |
7 |
--------------------------------------------------------------------------------
/auto_smart/auto_smart/__init__.py:
--------------------------------------------------------------------------------
1 | name = "example_pkg"
2 | import os
3 | import sys
4 | ABSPATH = os.path.abspath(os.path.realpath(os.path.dirname(__file__)))
5 | sys.path.append(ABSPATH)
6 |
7 | from auto_smart.model import Model
8 | import numpy as np
9 | import pandas as pd
10 | import json
11 | from os.path import join
12 | from datetime import datetime
13 |
14 |
15 | TYPE_MAP = {
16 | 'time': str,
17 | 'cat': str,
18 | 'multi-cat': str,
19 | 'num': np.float64
20 | }
21 |
22 | def read_info(datapath):
23 | with open(join(datapath, 'train', 'info.json'), 'r') as info_fp:
24 | info = json.load(info_fp)
25 | return info
26 |
27 | def read_train(datapath, info):
28 | train_data = {}
29 | for table_name, columns in info['tables'].items():
30 |
31 | table_dtype = {key: TYPE_MAP[val] for key, val in columns.items()}
32 |
33 | if table_name == 'main':
34 | table_path = join(datapath, 'train', 'main_train.data')
35 | else:
36 | table_path = join(datapath, 'train', f'{table_name}.data')
37 |
38 | date_list = [key for key, val in columns.items() if val == 'time']
39 |
40 | train_data[table_name] = pd.read_csv(
41 | table_path, sep='\t', dtype=table_dtype, parse_dates=date_list,
42 | date_parser=lambda millisecs: millisecs if np.isnan(
43 | float(millisecs)) else datetime.fromtimestamp(
44 | float(millisecs)/1000))
45 |
46 | # get train label
47 | train_label = pd.read_csv(
48 | join(datapath, 'train', 'main_train.solution'))['label']
49 | return train_data, train_label
50 |
51 |
52 | def read_test(datapath, info):
53 | # get test data
54 | main_columns = info['tables']['main']
55 | table_dtype = {key: TYPE_MAP[val] for key, val in main_columns.items()}
56 |
57 | table_path = join(datapath, 'test', 'main_test.data')
58 |
59 | date_list = [key for key, val in main_columns.items() if val == 'time']
60 |
61 | test_data = pd.read_csv(
62 | table_path, sep='\t', dtype=table_dtype, parse_dates=date_list,
63 | date_parser=lambda millisecs: millisecs if np.isnan(
64 | float(millisecs)) else datetime.fromtimestamp(
65 | float(millisecs) / 1000))
66 | return test_data
67 |
68 | def train_and_predict(train_data,train_label,info,test_data):
69 | cmodel = Model(info)
70 | cmodel.fit(train_data, train_label)
71 | return cmodel.predict(test_data)
72 |
73 |
--------------------------------------------------------------------------------
/auto_smart/auto_smart/automl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepBlueAI/AutoSmart/99a6b44ce5b6520a2463c8a4b3acc675584533be/auto_smart/auto_smart/automl/__init__.py
--------------------------------------------------------------------------------
/auto_smart/auto_smart/automl/auto_lgb.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import lightgbm as lgb
3 | import numpy as np
4 | import CONSTANT
5 | from util import log, timeclass
6 | from .automl import AutoML
7 | import pandas as pd
8 | import gc
9 | from . import autosample
10 | import time
11 | import copy
12 | from sklearn.metrics import roc_auc_score
13 |
14 | class AutoLGB(AutoML):
15 | def __init__(self):
16 | self.params = {
17 | "boosting_type": "gbdt",
18 | "objective": "binary",
19 | "metric": "auc",
20 | "verbosity": 1,
21 | "seed": CONSTANT.SEED,
22 | "num_threads": CONSTANT.THREAD_NUM
23 | }
24 |
25 | self.hyperparams = {
26 | 'num_leaves': 31,
27 | 'max_depth': -1,
28 | 'min_child_samples': 20,
29 | 'max_bin':255,
30 | 'subsample': 0.8,
31 | 'subsample_freq': 1,
32 | 'colsample_bytree': 0.8,
33 | 'min_child_weight': 0.001,
34 | 'subsample_for_bin': 200000,
35 | 'min_split_gain': 0.02,
36 | 'reg_alpha': 0.1,
37 | 'reg_lambda': 0.1,
38 | }
39 |
40 | self.early_stopping_rounds = 50
41 |
42 | @timeclass(cls='AutoLGB')
43 | def predict(self,X):
44 | X = X[self.columns]
45 | X.columns = self.new_feat_name_cols
46 | return self.model.predict(X)
47 |
48 | @timeclass(cls='AutoLGB')
49 | def ensemble_train(self,X,y,categories,config,len_test):
50 | feat_name = list(X.columns)
51 | self.ensemble_models = []
52 | self.ensemble_columns = []
53 | columns = list(X.columns)
54 | log(f'lgb training set shape: {X.shape}')
55 | pos = (y==1).sum()
56 | neg = (y==0).sum()
57 | log(f'pos {pos} neg {neg}')
58 |
59 | self.columns = columns
60 | max_sample_num = len(y)
61 |
62 | feat_name_cols = list(X.columns)
63 | feat_name_maps = { feat_name_cols[i] : str(i) for i in range(len(feat_name_cols)) }
64 | f_feat_name_maps = { str(i) : feat_name_cols[i] for i in range(len(feat_name_cols)) }
65 | new_feat_name_cols = [ feat_name_maps[i] for i in feat_name_cols ]
66 | X.columns = new_feat_name_cols
67 | categories = [ feat_name_maps[i] for i in categories ]
68 | self.f_feat_name_maps = f_feat_name_maps
69 | self.new_feat_name_cols = new_feat_name_cols
70 |
71 | all_columns = list(X.columns)
72 |
73 | start_time = time.time()
74 | i = 0
75 | cur_columns = all_columns
76 | seed = np.random.randint(2019*i,2019*(i+1))
77 | X_train,y_train = autosample.downsampling(X,y,max_sample_num,seed)
78 | X_train = X_train[cur_columns]
79 | gc.collect()
80 |
81 | colset = set(X_train.columns)
82 | cur_categorical = [col for col in categories if col in colset]
83 | pos = (y_train==1).sum()
84 | neg = (y_train==0).sum()
85 |
86 | params = self.params
87 | hyperparams = self.hyperparams
88 | params['seed'] = seed
89 |
90 | X_train = X_train.astype(np.float32)
91 | gc.collect()
92 | y_train = y_train.astype(np.float32)
93 | gc.collect()
94 | X_train = X_train.values
95 | gc.collect()
96 | y_train = y_train.values
97 | gc.collect()
98 |
99 | train_data = lgb.Dataset(X_train, label=y_train,feature_name=feat_name)
100 | del X_train,y_train
101 | gc.collect()
102 |
103 | model = lgb.train({**params, **hyperparams},
104 | train_data,
105 | num_boost_round=self.best_iteration,
106 | feature_name=cur_columns,
107 | categorical_feature=cur_categorical,
108 | learning_rates = self.learning_rates[:self.best_iteration])
109 |
110 | self.ensemble_columns.append(cur_columns)
111 | self.ensemble_models.append(model)
112 | end_time = time.time()
113 |
114 | model_use_time = end_time - start_time
115 | del train_data
116 |
117 | gc.collect()
118 |
119 | start_time = time.time()
120 | temp = X.iloc[:100000]
121 |
122 | temp = temp.astype(np.float32)
123 | gc.collect()
124 | temp = temp.values
125 | gc.collect()
126 |
127 | model.predict(temp)
128 |
129 | end_time = time.time()
130 | model_test_use_time = (end_time-start_time)
131 | model_test_use_time = len_test/temp.shape[0] * model_test_use_time
132 | model_use_time = model_use_time + model_test_use_time
133 | del temp,model
134 |
135 | rest_time = config.budget/10*9-(end_time-config.start_time)
136 | if rest_time <= 0:
137 | rest_model_num = 0
138 | else:
139 | rest_model_num = int(rest_time // model_use_time)
140 |
141 | if rest_model_num >= 50:
142 | rest_model_num = 50
143 |
144 | if rest_model_num >= 1:
145 | rest_model_num -= 1
146 |
147 | if not CONSTANT.USE_ENSEMBLE:
148 | rest_model_num = 0
149 |
150 | for i in range(1,rest_model_num+1):
151 |
152 | seed = np.random.randint(2019*i,2019*(i+1))
153 |
154 | cur_columns = list(pd.Series(all_columns).sample(frac=0.85,replace=False,random_state=seed))
155 |
156 | X_train,y_train = autosample.downsampling(X,y,max_sample_num,seed)
157 | X_train = X_train[cur_columns]
158 | gc.collect()
159 |
160 | colset = set(X_train.columns)
161 | cur_categorical = [col for col in categories if col in colset]
162 |
163 | pos = (y_train==1).sum()
164 | neg = (y_train==0).sum()
165 |
166 | params = self.params
167 | hyperparams = self.hyperparams
168 | params['seed'] = seed
169 |
170 | num_leaves = hyperparams['num_leaves']
171 | num_leaves = num_leaves + np.random.randint(-int(num_leaves/10),int(num_leaves/10)+7)
172 |
173 | lrs = np.array(self.learning_rates)
174 | rands = 1 + 0.2*np.random.rand(len(lrs))
175 | lrs = list(lrs * rands)
176 |
177 | cur_iteration = self.best_iteration
178 | cur_iteration = cur_iteration + np.random.randint(-30,40)
179 | if cur_iteration > len(lrs):
180 | cur_iteration = len(lrs)
181 |
182 | if cur_iteration <= 10:
183 | cur_iteration = self.best_iteration
184 |
185 | cur_hyperparams = copy.deepcopy(hyperparams)
186 | cur_hyperparams['num_leaves'] = num_leaves
187 |
188 | X_train = X_train.astype(np.float32)
189 | gc.collect()
190 | y_train = y_train.astype(np.float32)
191 | gc.collect()
192 | X_train = X_train.values
193 | gc.collect()
194 | y_train = y_train.values
195 | gc.collect()
196 |
197 | train_data = lgb.Dataset(X_train, label=y_train,feature_name=cur_columns)
198 | del X_train,y_train
199 | gc.collect()
200 |
201 | model = lgb.train({**params, **cur_hyperparams},
202 | train_data,
203 | num_boost_round=cur_iteration,
204 | feature_name=cur_columns,
205 | categorical_feature=cur_categorical,
206 | learning_rates = lrs[:cur_iteration])
207 |
208 |
209 | self.ensemble_columns.append(cur_columns)
210 | self.ensemble_models.append(model)
211 |
212 | del train_data
213 | gc.collect()
214 |
215 | X.columns = self.columns
216 |
217 |
218 | @timeclass(cls='AutoLGB')
219 | def ensemble_predict(self,X):
220 | X = X[self.columns]
221 | gc.collect()
222 |
223 | X.columns = self.new_feat_name_cols
224 |
225 | preds = []
226 | for model,cur_cols in zip(self.ensemble_models,self.ensemble_columns):
227 | gc.collect()
228 | tX = X[cur_cols]
229 | gc.collect()
230 | tX = tX.astype(np.float32)
231 | gc.collect()
232 | tX = tX.values
233 | gc.collect()
234 |
235 | preds.append(model.predict( tX ))
236 | gc.collect()
237 |
238 | if len(preds) == 1:
239 | pred = preds[0]
240 |
241 | if len(preds) > 1:
242 | total_model_num = len(preds)
243 |
244 | main_model_weight = 8 / (8 + 2 * (total_model_num-1))
245 | rest_model_weight = 2 / (8 + 2 * (total_model_num-1))
246 | pred = preds[0] * main_model_weight
247 | for i in range(1,total_model_num):
248 | pred = pred + rest_model_weight * preds[i]
249 |
250 | return pred
251 |
252 | @timeclass(cls='AutoLGB')
253 | def ensemble_predict_test(self,X):
254 | X = X[self.columns]
255 | gc.collect()
256 |
257 | X.columns = self.new_feat_name_cols
258 | log(f'ensemble models {len(self.ensemble_models)}')
259 | preds = []
260 | for model,cur_cols in zip(self.ensemble_models,self.ensemble_columns):
261 | gc.collect()
262 | tX = X[cur_cols]
263 | gc.collect()
264 | tX = tX.astype(np.float32)
265 | gc.collect()
266 | tX = tX.values
267 | gc.collect()
268 |
269 | preds.append(model.predict( tX ))
270 | gc.collect()
271 |
272 | if len(preds) == 1:
273 | pred = preds[0]
274 |
275 | if len(preds) > 1:
276 | total_model_num = len(preds)
277 |
278 | main_model_weight = 8 / (8 + 2 * (total_model_num-1))
279 | rest_model_weight = 2 / (8 + 2 * (total_model_num-1))
280 | pred = preds[0] * main_model_weight
281 | for i in range(1,total_model_num):
282 | pred = pred + rest_model_weight * preds[i]
283 |
284 | return pred,preds[0]
285 |
286 | def get_log_lr(self,num_boost_round,max_lr,min_lr):
287 | learning_rates = [max_lr+(min_lr-max_lr)/np.log(num_boost_round)*np.log(i) for i in range(1,num_boost_round+1)]
288 | return learning_rates
289 |
290 | def set_num_leaves(self,X,y):
291 | t = len(y)
292 | t = X.shape[1]*(t/40000)
293 | level = t**0.225 + 1.5
294 | num_leaves = int(2**level) + 10
295 | num_leaves = min(num_leaves, 128)
296 | num_leaves = max(num_leaves, 32)
297 | self.hyperparams['num_leaves'] = num_leaves
298 |
299 | def set_min_child_samples(self, X,y ):
300 | min_child_samples = ( (X.shape[0]/20000)**0.6 ) *15
301 | min_child_samples = int(min_child_samples)
302 | min_child_samples = min(min_child_samples, 150)
303 | min_child_samples = max(min_child_samples, 15)
304 |
305 | self.hyperparams['min_child_samples'] = min_child_samples
306 |
307 | @timeclass(cls='AutoLGB')
308 | def lr_opt(self,train_data,valid_data,categories):
309 | params = self.params
310 | hyperparams = self.hyperparams
311 |
312 | max_lrs = [0.1,0.08,0.05,0.02]
313 | min_lrs = [0.04,0.02,0.01,0.005]
314 |
315 | num_boost_round = self.num_boost_round
316 | max_num_boost_round = min(400,num_boost_round)
317 | best_score = -1
318 | best_loop = -1
319 | lr = None
320 |
321 | scores = []
322 | lrs = []
323 | for max_lr,min_lr in zip(max_lrs,min_lrs):
324 | learning_rates = self.get_log_lr(num_boost_round,max_lr,min_lr)
325 |
326 | model = lgb.train({**params, **hyperparams}, train_data, num_boost_round=max_num_boost_round,\
327 | categorical_feature=categories,learning_rates = learning_rates[:max_num_boost_round]
328 | )
329 | pred = model.predict(valid_data.data)
330 | score = roc_auc_score(valid_data.label,pred)
331 | scores.append(score)
332 | lrs.append(learning_rates)
333 | del model, pred
334 | gc.collect()
335 |
336 | best_loop = np.argmax(scores)
337 | best_score = np.max(scores)
338 | lr = lrs[best_loop]
339 | log(f'scores {scores}')
340 | log(f'loop {best_loop}')
341 | log(f'lr max {lr[0]} min {lr[-1]}')
342 | log(f'lr best score {best_score}')
343 | return lr
344 |
345 | @timeclass(cls='AutoLGB')
346 | def num_leaves_opt(self,train_data,valid_data,categories):
347 | params = self.params
348 | hyperparams = self.hyperparams
349 | num_leaves = [31,63,127,255]
350 |
351 | num_boost_round = 500
352 | best_iteration = -1
353 | i = 0
354 | best_score = -1
355 | best_loop = -1
356 | best_num_leaves = None
357 |
358 | for leaves in num_leaves:
359 | hyperparams['num_leaves'] = leaves
360 | model = lgb.train({**params, **hyperparams}, train_data, num_boost_round=num_boost_round,\
361 | valid_sets=[valid_data], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=100,\
362 | categorical_feature=categories,learning_rates = self.learning_rates
363 | )
364 |
365 | score = model.best_score["valid_0"][params["metric"]]
366 | if score > best_score:
367 | best_num_leaves = leaves
368 | best_iteration = model.best_iteration
369 | best_score = score
370 | best_loop = i
371 |
372 | return best_num_leaves
373 |
374 | @timeclass(cls='AutoLGB')
375 | def subsample_opt(self,num_samples):
376 | samples = num_samples
377 | if samples > 1000000:
378 | samples = 1000000
379 |
380 | if samples<200000:
381 | subsample = 0.95 - samples/1000000
382 | return subsample
383 |
384 | subsample = 0.85-samples/2500000
385 | return subsample
386 |
387 | @timeclass(cls='AutoLGB')
388 | def colsample_bytree_opt(self,num_feature):
389 | if num_feature > 500:
390 | num_feature = 500
391 |
392 | if num_feature > 100:
393 | colsample_bytree = 0.8 - num_feature/2000
394 | else:
395 | colsample_bytree = 0.95 - num_feature/500
396 |
397 | return colsample_bytree
398 |
399 | @timeclass(cls='AutoLGB')
400 | def param_compute(self,X,y,categories,config):
401 | feat_name = list(X.columns)
402 | colsample_bytree = self.colsample_bytree_opt(X.shape[1])
403 | self.hyperparams['colsample_bytree'] = colsample_bytree
404 |
405 | max_sample_num = len(y)
406 | subsample = self.subsample_opt(autosample.downsampling_y(y,max_sample_num).shape[0])
407 | self.hyperparams['subsample'] = subsample
408 |
409 | max_sample_num = min(len(y),50000)
410 | X_sample,y_sample = autosample.downsampling(X,y,max_sample_num)
411 | gc.collect()
412 | params = self.params
413 |
414 | start_time = time.time()
415 | X_sample = X_sample.astype(np.float32)
416 | gc.collect()
417 | y_sample = y_sample.astype(np.float32)
418 | gc.collect()
419 | X_sample = X_sample.values
420 | gc.collect()
421 | y_sample = y_sample.values
422 | gc.collect()
423 | end_time = time.time()
424 | transfer_time = end_time-start_time
425 |
426 | time_number_boost_round1 = 15
427 | start_time = time.time()
428 | train_data = lgb.Dataset(X_sample, label=y_sample,feature_name=feat_name)
429 |
430 | gc.collect()
431 |
432 | lgb.train({**params, **self.hyperparams}, train_data, num_boost_round=time_number_boost_round1,\
433 | categorical_feature=categories,)
434 |
435 | end_time = time.time()
436 |
437 | model_use_time1 = end_time - start_time
438 |
439 | time_number_boost_round2 = time_number_boost_round1*2
440 |
441 | del train_data
442 | gc.collect()
443 |
444 | start_time = time.time()
445 | train_data = lgb.Dataset(X_sample, label=y_sample,feature_name=feat_name)
446 | del X_sample,y_sample
447 | gc.collect()
448 | lgb.train({**params, **self.hyperparams}, train_data, num_boost_round=time_number_boost_round2,\
449 | categorical_feature=categories,)
450 |
451 | del train_data
452 | gc.collect()
453 | end_time = time.time()
454 |
455 | model_use_time2 = end_time - start_time
456 |
457 | boost_time = (model_use_time2 - model_use_time1)
458 | boost_round = time_number_boost_round2 - time_number_boost_round1
459 | preprocess_time = model_use_time1 - boost_time
460 | model_sample_time = 4 * (transfer_time + preprocess_time + (boost_time * (400/boost_round))) + 5
461 |
462 | max_sample_num = len(y)
463 | X,y = autosample.downsampling(X,y,max_sample_num)
464 |
465 | gc.collect()
466 | pos = (y==1).sum()
467 | neg = (y==0).sum()
468 |
469 | gc.collect()
470 | params = self.params
471 |
472 | time_number_boost_round1 = 15
473 |
474 | start_time = time.time()
475 | X = X.astype(np.float32)
476 | gc.collect()
477 | y = y.astype(np.float32)
478 | gc.collect()
479 | X = X.values
480 | gc.collect()
481 | y = y.values
482 | gc.collect()
483 | end_time = time.time()
484 |
485 | transfer_time = end_time-start_time
486 |
487 | start_time = time.time()
488 | train_data = lgb.Dataset(X, label=y,feature_name=feat_name)
489 |
490 | gc.collect()
491 |
492 |
493 | lgb.train({**params, **self.hyperparams}, train_data, num_boost_round=time_number_boost_round1,\
494 | categorical_feature=categories,)
495 |
496 | del train_data
497 | gc.collect()
498 | end_time = time.time()
499 |
500 | model_use_time1 = end_time - start_time
501 |
502 | time_number_boost_round2 = time_number_boost_round1*2
503 |
504 | start_time = time.time()
505 | train_data = lgb.Dataset(X, label=y,feature_name=feat_name)
506 | del X,y
507 | gc.collect()
508 | lgb.train({**params, **self.hyperparams}, train_data, num_boost_round=time_number_boost_round2,\
509 | categorical_feature=categories,)
510 |
511 | del train_data
512 | gc.collect()
513 | end_time = time.time()
514 |
515 | model_use_time2 = end_time - start_time
516 |
517 | boost_time = (model_use_time2 - model_use_time1)
518 | boost_round = time_number_boost_round2 - time_number_boost_round1
519 | preprocess_time = model_use_time1 - boost_time
520 |
521 | rest_time = config.budget/10*9-(end_time-config.start_time)-model_sample_time-10
522 |
523 | self.num_boost_round = 20
524 | for number_boost_round in [700,600,500,400,300,200,100,50]:
525 | real_model_time = (transfer_time + preprocess_time + (boost_time * (number_boost_round/boost_round)))
526 | if real_model_time > rest_time:
527 | continue
528 | else:
529 | self.num_boost_round = number_boost_round
530 | break
531 |
532 | gc.collect()
533 |
534 | @timeclass(cls='AutoLGB')
535 | def param_opt(self,X_train,y_train,X_valid,y_valid,categories):
536 | feat_name = list(X_train.columns)
537 |
538 | pos = (y_train==1).sum()
539 | neg = (y_train==0).sum()
540 | val_pos = (y_valid==1).sum()
541 | val_neg = (y_valid==0).sum()
542 |
543 | max_sample_num = min(len(y_train),50000)
544 | X,y = autosample.downsampling(X_train,y_train,max_sample_num)
545 |
546 | pos = (y==1).sum()
547 | neg = (y==0).sum()
548 |
549 | train_data = lgb.Dataset(X, label=y,feature_name=feat_name)
550 | del X,y
551 | gc.collect()
552 |
553 | valid_data = lgb.Dataset(X_valid, label=y_valid,feature_name=feat_name,free_raw_data=False)
554 | del X_valid,y_valid
555 | gc.collect()
556 |
557 | lr = self.lr_opt(train_data,valid_data,categories)
558 | self.learning_rates = lr
559 |
560 | self.best_iteration = self.num_boost_round
561 |
562 | del train_data
563 | gc.collect()
564 |
565 | num_boost_round = self.num_boost_round
566 | params = self.params
567 | max_sample_num = len(y_train)
568 |
569 | X,y = autosample.downsampling(X_train,y_train,max_sample_num)
570 | del X_train,y_train
571 |
572 | gc.collect()
573 | pos = (y==1).sum()
574 | neg = (y==0).sum()
575 |
576 | X = X.astype(np.float32)
577 | gc.collect()
578 | y = y.astype(np.float32)
579 | gc.collect()
580 | X = X.values
581 | gc.collect()
582 | y = y.values
583 | gc.collect()
584 |
585 | train_data = lgb.Dataset(X, label=y,feature_name=feat_name)
586 |
587 | del X,y
588 | gc.collect()
589 |
590 | model = lgb.train({**params, **self.hyperparams}, train_data, num_boost_round=num_boost_round,\
591 | valid_sets=[valid_data], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=100,\
592 | categorical_feature=categories,learning_rates = self.learning_rates
593 | )
594 | gc.collect()
595 |
596 | best_model = model
597 |
598 | best_score = model.best_score["valid_0"][params["metric"]]
599 |
600 | if model.best_iteration > 50:
601 | self.best_iteration = model.best_iteration
602 | elif model.current_iteration() > 50:
603 | self.best_iteration = model.current_iteration()
604 | else:
605 | self.best_iteration = 50
606 |
607 | return best_model,best_score
608 |
609 | def get_importances(self):
610 | model = self.model
611 | importances = pd.DataFrame({'features':[ self.f_feat_name_maps[i] for i in model.feature_name() ] ,
612 | 'importances':model.feature_importance()})
613 |
614 | importances.sort_values('importances',ascending=False,inplace=True)
615 |
616 | return importances
617 |
618 | @timeclass(cls='AutoLGB')
619 | def ensemble_predict_train(self,X):
620 | X = X[X.columns]
621 | X.columns = self.new_feat_name_cols
622 |
623 | preds = []
624 | for model in self.ensemble_models:
625 | preds.append(model.predict(X))
626 |
627 | pred = np.stack(preds,axis=1).mean(axis=1)
628 | return pred
629 |
630 | def get_ensemble_importances(self):
631 | model = self.ensemble_models[0]
632 | importances = pd.DataFrame({'features':[ self.f_feat_name_maps[i] for i in model.feature_name() ] ,
633 | 'importances':model.feature_importance()})
634 |
635 | importances.sort_values('importances',ascending=False,inplace=True)
636 |
637 | return importances
638 |
639 | @timeclass(cls='AutoLGB')
640 | def param_opt_new(self,X_train,y_train,X_valid,y_valid,categories):
641 | feat_name = list(X_train.columns)
642 |
643 | pos = (y_train==1).sum()
644 | neg = (y_train==0).sum()
645 | val_pos = (y_valid==1).sum()
646 | val_neg = (y_valid==0).sum()
647 | log(f'training set pos {pos} neg {neg}')
648 | log(f'validation set pos {val_pos} neg {val_neg}')
649 |
650 | max_sample_num = min(len(y_train),50000)
651 | X,y = autosample.downsampling(X_train,y_train,max_sample_num)
652 |
653 | pos = (y==1).sum()
654 | neg = (y==0).sum()
655 | log(f'opt downsampling set pos {pos} neg {neg}')
656 |
657 | X = X.astype(np.float32)
658 | gc.collect()
659 | y = y.astype(np.float32)
660 | gc.collect()
661 | X = X.values
662 | gc.collect()
663 | y = y.values
664 | gc.collect()
665 |
666 | train_data = lgb.Dataset(X, label=y,feature_name=feat_name)
667 | del X,y
668 | gc.collect()
669 |
670 | valid_data = lgb.Dataset(X_valid, label=y_valid,feature_name=feat_name,free_raw_data=False)
671 | del X_valid,y_valid
672 | gc.collect()
673 |
674 | lr = self.lr_opt(train_data,valid_data,categories)
675 | del train_data
676 | gc.collect()
677 | self.learning_rates = lr
678 |
679 | self.best_iteration = self.num_boost_round
680 | log(f'pass round opt, use best iteration as {self.best_iteration}')
681 |
--------------------------------------------------------------------------------
/auto_smart/auto_smart/automl/automl.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | class AutoML:
4 | def __init__(self):
5 | self.params = {
6 |
7 | }
8 |
9 | def train(self,X,y,categories):
10 | pass
11 |
12 | def predict(self,X):
13 | pass
14 |
15 |
16 | def param_opt(self,X_train,y_train,X_valid,y_valid,categories):
17 | pass
18 |
19 |
--------------------------------------------------------------------------------
/auto_smart/auto_smart/automl/autosample.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import pandas as pd
3 |
4 | def get_downsampling_num(npos,nneg,sample_num,unbalanced_ratio,min_neg_pos_ratio=2):
5 |
6 | reverse = False
7 | ntol = npos + nneg
8 | if npos>nneg:
9 | reverse = True
10 | tmp = npos
11 | npos = nneg
12 | nneg = tmp
13 |
14 | max_sample_num = min(npos, nneg)*(unbalanced_ratio+1)
15 | if max_sample_num>sample_num:
16 | max_sample_num = sample_num
17 |
18 | if npos+nneg > max_sample_num:
19 |
20 | if nneg/npos <= min_neg_pos_ratio:
21 | pos_num = npos/ntol * max_sample_num
22 | neg_num = nneg/ntol * max_sample_num
23 |
24 | elif nneg/npos <= unbalanced_ratio:
25 | if npos > max_sample_num/(min_neg_pos_ratio+1):
26 | pos_num = max_sample_num/(min_neg_pos_ratio+1)
27 | neg_num = max_sample_num - pos_num
28 | else:
29 | pos_num = npos
30 | neg_num = max_sample_num - pos_num
31 |
32 | elif nneg/npos > unbalanced_ratio:
33 | if npos > max_sample_num/(unbalanced_ratio+1):
34 | pos_num = max_sample_num/(unbalanced_ratio+1)
35 | neg_num = max_sample_num - pos_num
36 |
37 | else:
38 | pos_num = npos
39 | neg_num = max_sample_num - npos
40 |
41 | else:
42 | neg_num = nneg
43 | pos_num = npos
44 |
45 | if neg_num/pos_num > unbalanced_ratio:
46 | neg_num = pos_num*unbalanced_ratio
47 |
48 | neg_num = int(neg_num)
49 | pos_num = int(pos_num)
50 | if reverse:
51 | return neg_num,pos_num
52 |
53 | return pos_num,neg_num
54 |
55 | def sample(X,frac,seed,y=None):
56 | if frac == 1:
57 | X = X.sample(frac=1,random_state=seed)
58 | elif frac > 1:
59 | mul = int(frac)
60 | frac = frac - int(frac)
61 | X_res = X.sample(frac=frac,random_state=seed)
62 | X = pd.concat([X] * mul + [X_res])
63 | else:
64 | X = X.sample(frac=frac,random_state=seed)
65 |
66 | if y is not None:
67 | y = y.loc[X.index]
68 | return X,y
69 | return X
70 |
71 |
72 | def downsampling_num(y,max_sample_num):
73 | npos = (y==1).sum()
74 | nneg = (y==0).sum()
75 |
76 |
77 | min_num = min(npos,nneg)
78 | min_num = max(min_num,1000)
79 |
80 | if min_num < 8000:
81 | unbalanced_ratio = 10 - (min_num//1000)
82 | else:
83 | unbalanced_ratio = 3
84 |
85 | pos_num,neg_num = get_downsampling_num(npos,nneg,max_sample_num,unbalanced_ratio)
86 | return pos_num,neg_num
87 |
88 |
89 | def class_sample(X,y,pos_num,neg_num,seed=2019):
90 |
91 | npos = float((y == 1).sum())
92 | nneg = len(y) - npos
93 |
94 | pos_frac = pos_num / npos
95 | neg_frac = neg_num / nneg
96 |
97 | X_pos = X[y == 1]
98 | X_pos = sample(X_pos,pos_frac,seed)
99 |
100 | X_neg = X[y == 0]
101 | X_neg = sample(X_neg,neg_frac,seed)
102 |
103 | X = pd.concat([X_pos,X_neg])
104 |
105 | X,y = sample(X,1,seed,y)
106 |
107 | return X,y
108 |
109 | def downsampling(X,y,max_sample_num,seed=2019):
110 | pos_num,neg_num = downsampling_num(y,max_sample_num)
111 | return class_sample(X,y,pos_num,neg_num,seed)
112 |
113 | def class_sample_y(y,pos_num,neg_num,seed=2019):
114 |
115 | npos = float((y == 1).sum())
116 | nneg = len(y) - npos
117 |
118 | pos_frac = pos_num / npos
119 | neg_frac = neg_num / nneg
120 |
121 | y_pos = y[y == 1]
122 | y_pos = sample(y_pos,pos_frac,seed)
123 |
124 | y_neg = y[y == 0]
125 | y_neg = sample(y_neg,neg_frac,seed)
126 |
127 | y = pd.concat([y_pos,y_neg])
128 |
129 | y = sample(y,1,seed)
130 |
131 | return y
132 |
133 | def downsampling_y(y,max_sample_num,seed=2019):
134 | pos_num,neg_num = downsampling_num(y,max_sample_num)
135 | y = class_sample_y(y,pos_num,neg_num,seed)
136 | return y
137 |
138 |
139 |
--------------------------------------------------------------------------------
/auto_smart/auto_smart/automl/model_selection.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 |
4 | def time_train_test_split(X,y,test_rate=0.2,shuffle=True,random_state=1):
5 | length = X.shape[0]
6 |
7 |
8 | test_size = int(length * test_rate)
9 | train_size = length - test_size
10 |
11 | X_train = X.iloc[:train_size]
12 | y_train = y.iloc[:train_size]
13 | X_test = X.iloc[train_size:]
14 | y_test = y.iloc[train_size:]
15 |
16 | if shuffle:
17 | np.random.seed(random_state)
18 | idx = np.arange(train_size)
19 | np.random.shuffle(idx)
20 | X_train = X_train.iloc[idx]
21 | y_train = y_train.iloc[idx]
22 |
23 | return X_train,y_train,X_test,y_test
24 |
--------------------------------------------------------------------------------
/auto_smart/auto_smart/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | class Config:
4 | def __init__(self, start_time,budget):
5 | if budget >= 1000:
6 | self.keys_order2_cat_max = 50
7 | self.keys_order2_num_max = 50
8 |
9 | self.keys_order2_cat_maxmin = 10
10 | self.keys_order2_num_maxmin = 10
11 | self.keys_order2_num_std = 5
12 |
13 | self.keys_order2_bin_num_max = 20
14 | self.keys_order2_bin_cat_max = 20
15 |
16 | self.all_order2_cat_max = 7
17 | self.all_order2_num_max = 7
18 |
19 |
20 | self.keys_order3_num_max = 10
21 | self.keys_order3_cat_max = 10
22 |
23 | self.wait_feat_selection_num = 30
24 | self.wait_feat_selection_num_all = 20
25 |
26 | self.start_time = start_time
27 | self.budget = budget
28 | else:
29 | self.keys_order2_cat_max = 40
30 | self.keys_order2_num_max = 40
31 |
32 | self.keys_order2_cat_maxmin = 10
33 | self.keys_order2_num_maxmin = 10
34 | self.keys_order2_num_std = 5
35 |
36 | self.keys_order2_bin_num_max = 10
37 | self.keys_order2_bin_cat_max = 10
38 |
39 | self.all_order2_cat_max = 7
40 | self.all_order2_num_max = 7
41 |
42 | self.keys_order3_num_max = 10
43 | self.keys_order3_cat_max = 10
44 |
45 | self.wait_feat_selection_num = 30
46 | self.wait_feat_selection_num_all = 20
47 |
48 | self.start_time = start_time
49 | self.budget = budget
50 |
51 |
--------------------------------------------------------------------------------
/auto_smart/auto_smart/data_tools.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | import pandas as pd
4 |
5 |
6 | def downcast(series,accuracy_loss = True, min_float_type='float16'):
7 | if series.dtype == np.int64:
8 | ii8 = np.iinfo(np.int8)
9 | ii16 = np.iinfo(np.int16)
10 | ii32 = np.iinfo(np.int32)
11 | max_value = series.max()
12 | min_value = series.min()
13 |
14 | if max_value <= ii8.max and min_value >= ii8.min:
15 | return series.astype(np.int8)
16 | elif max_value <= ii16.max and min_value >= ii16.min:
17 | return series.astype(np.int16)
18 | elif max_value <= ii32.max and min_value >= ii32.min:
19 | return series.astype(np.int32)
20 | else:
21 | return series
22 |
23 | elif series.dtype == np.float64:
24 | fi16 = np.finfo(np.float16)
25 | fi32 = np.finfo(np.float32)
26 |
27 | if accuracy_loss:
28 | max_value = series.max()
29 | min_value = series.min()
30 | if np.isnan(max_value):
31 | max_value = 0
32 |
33 | if np.isnan(min_value):
34 | min_value = 0
35 |
36 | if min_float_type=='float16' and max_value <= fi16.max and min_value >= fi16.min:
37 | return series.astype(np.float16)
38 | elif max_value <= fi32.max and min_value >= fi32.min:
39 | return series.astype(np.float32)
40 | else:
41 | return series
42 | else:
43 | tmp = series[~pd.isna(series)]
44 | if(len(tmp)==0):
45 | return series.astype(np.float16)
46 |
47 | if (tmp == tmp.astype(np.float16)).sum() == len(tmp):
48 | return series.astype(np.float16)
49 | elif (tmp == tmp.astype(np.float32)).sum() == len(tmp):
50 | return series.astype(np.float32)
51 |
52 | else:
53 | return series
54 |
55 | else:
56 | return series
57 |
58 | def gen_segs_array(shape0,nseg):
59 | segs = np.zeros(shape0)
60 | block_size = int(shape0/nseg)+1
61 | for i in range(nseg):
62 | segs[i*block_size:(i+1)*block_size] = i
63 | return segs
64 |
65 |
66 | def gen_segs_tuple(shape0,nseg):
67 | segs = []
68 | block_size = int(shape0/nseg)
69 | i = -1
70 | for i in range(nseg-1):
71 | segs.append( (i*block_size,(i+1)*block_size) )
72 | segs.append(((i+1)*block_size,shape0))
73 | return segs
74 |
75 |
76 | def gen_segs_tuple_by_time_nseg(shape0,nseg,time_series):
77 | block_size = None
78 | if time_series is None:
79 | block_size = int(shape0/nseg)+1
80 | else:
81 | max_time = time_series.max().value
82 | min_time = time_series.min().value
83 | block_size = int( (max_time-min_time)/nseg )
84 | return block_size
85 |
86 | def gen_combine_cats(df, cols):
87 |
88 | category = df[cols[0]].astype('float64')
89 | for col in cols[1:]:
90 | mx = df[col].max()
91 | category *= mx
92 | category += df[col]
93 | return category
94 |
95 | def gen_segs_tuple_by_time_size(shape0,block_size,time_series):
96 | segs = []
97 | if time_series is None:
98 | nseg = int(shape0/block_size)
99 | block_size = int( shape0/nseg ) + 1
100 | for i in range(nseg):
101 | segs.append( (i*block_size,(i+1)*block_size) )
102 | else:
103 | max_time = time_series.max().value
104 | min_time = time_series.min().value
105 | nseg = int( (max_time-min_time)/block_size )
106 | if nseg == 0:
107 | nseg = 1
108 | block_size = int( (max_time-min_time)/nseg ) + 1
109 | t = time_series.reset_index(drop=True)
110 | t = t.astype('int64')
111 |
112 |
113 | for i in range(nseg):
114 |
115 | l_time = min_time + i*block_size
116 | r_time = min_time + (i+1)*block_size
117 | if i == nseg-1:
118 | r_time = max_time+1
119 | indexs = t[ (l_time<=t) & (t < r_time) ].index
120 | l_index = indexs[0]
121 | r_index = indexs[-1]+1
122 | segs.append( (l_index,r_index) )
123 |
124 | return segs
125 |
126 |
127 |
--------------------------------------------------------------------------------
/auto_smart/auto_smart/feat/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepBlueAI/AutoSmart/99a6b44ce5b6520a2463c8a4b3acc675584533be/auto_smart/auto_smart/feat/__init__.py
--------------------------------------------------------------------------------
/auto_smart/auto_smart/feat/default_merge_feat.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .merge_feat import O2O,M2O,O2M,M2M,TimeM2M,PreO2O,PreM2O,PreO2M,PreM2M,PreTimeM2M
4 | from util import timeclass
5 | import CONSTANT
6 | import pandas as pd
7 | import numpy as np
8 |
9 | from joblib import Parallel, delayed
10 | from feat_context import FeatContext
11 | import util
12 | from data_tools import downcast
13 | import gc
14 | namespace = 'default'
15 |
16 | class M2OJoin(M2O):
17 | def fit(self,U,V):
18 | pass
19 |
20 | @timeclass(cls='M2OJoin')
21 | def transform(self,U,V):
22 | v = V.data
23 | key = self.key
24 | v = v.set_index(key)
25 | new_cols = []
26 | col2type = {}
27 | col2block = {}
28 | for col in v.columns:
29 | feat_type = V.col2type[col]
30 | new_col = FeatContext.gen_merge_feat_name(namespace,self.__class__.__name__,col,feat_type,V.name)
31 | new_cols.append(new_col)
32 | col2type[new_col] = feat_type
33 |
34 | if col in V.col2block:
35 | block_id = V.col2block[col]
36 | col2block[new_col] = block_id
37 |
38 | v.columns = new_cols
39 | return v,col2type,col2block
40 |
41 | @timeclass(cls='M2OJoin')
42 | def fit_transform(self,U,V):
43 | return self.transform(U,V)
44 |
45 | class M2MKeyCount(M2M):
46 | @timeclass(cls='M2MKeyCount')
47 | def fit(self,U,V):
48 | pass
49 |
50 | @timeclass(cls='M2MKeyCount')
51 | def transform(self,U,V):
52 | v = V.data
53 | key = self.key
54 | col2type = {}
55 | ss = v.groupby(key)[key].count()
56 | ss = downcast(ss)
57 | feat_type = CONSTANT.NUMERICAL_TYPE
58 | new_col = key+'_M2MKeyCount'
59 | new_col = FeatContext.gen_merge_feat_name(namespace,self.__class__.__name__,new_col,feat_type,V.name)
60 | ss.name = new_col
61 | col2type[new_col] = feat_type
62 | return pd.DataFrame(ss),col2type,{}
63 |
64 | @timeclass(cls='M2MKeyCount')
65 | def fit_transform(self,U,V):
66 | return self.transform(U,V)
67 |
68 | class M2MNumMean(M2M):
69 | @timeclass(cls='M2MNumMean')
70 | def fit(self,U,V):
71 | pass
72 |
73 | @timeclass(cls='M2MNumMean')
74 | def transform(self,U,V):
75 | v = V.data
76 | key = self.key
77 | col2type = {}
78 |
79 | def func(df):
80 | key = df.columns[0]
81 | col = df.columns[1]
82 | df[col] = df[col].astype('float32')
83 |
84 | ss = df.groupby(key)[col].mean()
85 | ss = downcast(ss)
86 | return ss
87 |
88 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(v[[key,col]]) for col in V.num_cols)
89 | if res:
90 | new_cols = []
91 | for col in V.num_cols:
92 | feat_type = CONSTANT.NUMERICAL_TYPE
93 | col = col+'_M2MNumMean'
94 | new_col = FeatContext.gen_merge_feat_name(namespace,self.__class__.__name__,col,feat_type,V.name)
95 | new_cols.append(new_col)
96 | col2type[new_col] = feat_type
97 |
98 | tmp = pd.concat(res,axis=1)
99 | tmp.columns = new_cols
100 | return tmp,col2type,{}
101 | return pd.DataFrame(),col2type,{}
102 |
103 | @timeclass(cls='M2MNumMean')
104 | def fit_transform(self,U,V):
105 | return self.transform(U,V)
106 |
107 | class TimeM2MnewLastData(M2M):
108 | @timeclass(cls='TimeM2MnewLastData')
109 | def fit(self,U,V):
110 | pass
111 |
112 | @timeclass(cls='TimeM2MnewLastData')
113 | def transform(self,U,V):
114 | key = self.key
115 |
116 | if U.key_time_col != V.key_time_col:
117 | return
118 |
119 | key_time_col = V.key_time_col
120 |
121 | todo_cols = V.multi_cat_cols
122 | if not todo_cols:
123 | return
124 |
125 | v = V.data[[V.key_time_col,key] + todo_cols]
126 | u = U.data[[U.key_time_col,key]]
127 |
128 | u_index = u.index
129 | u.reset_index(drop=True,inplace=True)
130 | col2type = {}
131 | col2block = {}
132 |
133 | u.index = -u.index-1
134 | v_large = pd.concat([v,u])
135 | v_large.sort_values(by=[key,key_time_col],inplace=True)
136 |
137 | symbol = 1
138 | key_diff = v_large[key].diff()
139 | for col in todo_cols:
140 | v_large[col].loc[key_diff!=0].replace(np.nan,symbol)
141 |
142 | new_cols = []
143 | for col in todo_cols:
144 | feat_type = CONSTANT.MULTI_CAT_TYPE
145 | new_col = FeatContext.gen_merge_feat_name(namespace,self.__class__.__name__,col,feat_type,V.name)
146 | new_cols.append(new_col)
147 | col2type[new_col] = feat_type
148 | if col in V.col2block:
149 | col2block[new_col] = V.col2block[col]
150 |
151 | def func(series):
152 | ss = series.fillna(method='ffill')
153 | ss = ss.replace(symbol,np.nan)
154 | return ss
155 |
156 | res = Parallel(n_jobs=CONSTANT.JOBS, require='sharedmem')(delayed(func)(v_large[col]) for col in todo_cols)
157 | if res:
158 | tmp = pd.concat(res,axis=1)
159 | del res
160 | gc.collect()
161 |
162 | tmp.columns = new_cols
163 | tmp = tmp.loc[tmp.index<0]
164 | tmp.index = -(tmp.index+1)
165 |
166 | tmp.sort_index(inplace=True)
167 | tmp.index = u_index
168 | del u_index
169 | gc.collect()
170 | U.data[new_cols] = tmp
171 | del tmp
172 | gc.collect()
173 | U.update_data(U.data,col2type,None,None,col2block,None)
174 |
175 | @timeclass(cls='TimeM2MnewLastData')
176 | def fit_transform(self,U,V):
177 | self.transform(U,V)
178 |
179 | class M2MDataLast(TimeM2M):
180 | @timeclass(cls='M2MDataLast')
181 | def fit(self,U,V):
182 | pass
183 |
184 | @timeclass(cls='M2MDataLast')
185 | def transform(self,U,V):
186 | data = V.data
187 | key = self.key
188 | col2type = {}
189 | col2block = {}
190 |
191 | col_sets = []
192 | cols = list(data.columns)
193 |
194 | if key in cols:
195 | cols.remove(key)
196 |
197 | del_cols = []
198 | for col in cols:
199 | if col in V.col2type:
200 | if V.col2type[col] == CONSTANT.NUMERICAL_TYPE:
201 | del_cols.append(col)
202 |
203 | for col in del_cols:
204 | if col in cols:
205 | cols.remove(col)
206 |
207 | if len(cols)==0:
208 | return pd.DataFrame(),{},{}
209 | cols_len = 20
210 | cols_num = len(cols)
211 | if cols_num % cols_len == 0:
212 | blocks = int(cols_num / cols_len)
213 | else:
214 | blocks = int(cols_num / cols_len) + 1
215 |
216 | for i in range(blocks):
217 | col_t = []
218 | for j in range(i*cols_len,(i+1)*cols_len):
219 | if j < len(cols):
220 | col_t.append(cols[j])
221 | col_sets.append(col_t)
222 |
223 | feats = []
224 | for col_set in col_sets:
225 |
226 | feats.append( data.groupby( key )[ col_set ].last() )
227 | if feats:
228 | df = pd.concat(feats,axis=1)
229 |
230 | new_cols = []
231 | for col in df.columns:
232 | feat_type = V.col2type[col]
233 | new_col = FeatContext.gen_merge_feat_name(namespace,self.__class__.__name__,col,feat_type,V.name)
234 | new_cols.append(new_col)
235 | col2type[new_col] = feat_type
236 |
237 |
238 | if col in V.col2block:
239 | block_id = V.col2block[col]
240 | col2block[new_col] = block_id
241 |
242 | df.columns = new_cols
243 | return df,col2type,col2block
244 | else:
245 | return pd.DataFrame(),{},{}
246 |
247 | @timeclass(cls='M2MDataLast')
248 | def fit_transform(self,U,V):
249 | self.fit(U,V)
250 | return self.transform(U,V)
--------------------------------------------------------------------------------
/auto_smart/auto_smart/feat/feat.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | class Feat:
4 | def __init__(self,config):
5 | self.config = config
6 |
7 | def fit(self,X,y):
8 | pass
9 |
10 | def transform(self,X):
11 | pass
12 |
13 | def fit_transform(self,X,y):
14 | pass
--------------------------------------------------------------------------------
/auto_smart/auto_smart/feat/feat_pipeline.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .default_feat import *
4 | from .feat_selection import LGBFeatureSelection,LGBFeatureSelectionWait,LGBFeatureSelectionLast
5 |
6 | class FeatPipeline:
7 | def __init__(self):
8 | self.order1s = []
9 |
10 | class DefaultFeatPipeline(FeatPipeline):
11 | def __init__(self):
12 | super(DefaultFeatPipeline,self).__init__()
13 | self.main_init()
14 |
15 | def main_init(self):
16 | self.order1s = [
17 | PreMcToNumpy,McCatRank,
18 |
19 | OriginSession,\
20 |
21 | ApartCatRecognize,\
22 |
23 | KeysCountDIY,
24 | UserKeyCntDIY,SessionKeyCntDIY,\
25 |
26 | KeysTimeDiffAndFuture,
27 |
28 | UserSessionNuniqueDIY,\
29 | UserSessionCntDivNuniqueDIY,\
30 | UserKeyNuniqueDIY, SessionKeyNuniqueDIY,\
31 | UserKeyCntDivNuniqueDIY,SessionKeyCntDivNuniqueDIY,\
32 |
33 | KeysCumCntRateAndReverse,
34 |
35 | UserKeyCumCntRateAndReverse,
36 |
37 | KeyTimeDate,
38 | KeyTimeBin,
39 | KeysBinCntDIY,
40 |
41 | CatCountDIY,
42 | LGBFeatureSelection,\
43 | ]
44 |
45 | self.keys_order2s = [
46 | KeysNumMeanOrder2MinusSelfNew,
47 | KeysNumMaxMinOrder2MinusSelfNew,
48 | KeysNumStd,
49 | KeysCatCntOrder2New,
50 |
51 | LGBFeatureSelectionWait,
52 | ]
53 |
54 | self.all_order2s = [
55 | BinsCatCntOrder2DIYNew,
56 | BinsNumMeanOrder2DIYNew,
57 | CatNumMeanOrder2DIYNew,
58 | CatCntOrder2DIYNew,
59 |
60 | LGBFeatureSelectionWait
61 | ]
62 |
63 | self.post_order1s = [
64 | TimeNum,
65 | ]
66 |
67 | self.merge_order1s = [
68 | CatSegCtrOrigin,
69 | CatMeanEncoding,
70 |
71 | LGBFeatureSelectionLast,
72 | ]
73 |
74 |
--------------------------------------------------------------------------------
/auto_smart/auto_smart/feat/feat_selection.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from util import timeclass,log
3 | import CONSTANT
4 | from model_input import FeatOutput
5 | from automl import autosample
6 | import gc
7 | import lightgbm as lgb
8 | import pandas as pd
9 | from .feat import Feat
10 | import time
11 | import numpy as np
12 |
13 | def lgb_train(X,y):
14 | num_boost_round = 100
15 | num_leaves = 63
16 |
17 | params = {
18 | 'boosting_type': 'gbdt',
19 | 'objective': 'binary',
20 | 'metric': "None",
21 | 'learning_rate': 0.1,
22 | 'num_leaves': num_leaves,
23 | 'max_depth': -1,
24 | 'min_child_samples': 20,
25 | 'max_bin':255,
26 | 'subsample': 0.9,
27 | 'subsample_freq': 1,
28 | 'colsample_bytree': 1,
29 | 'min_child_weight': 0.001,
30 | 'subsample_for_bin': 200000,
31 | 'min_split_gain': 0.02,
32 | 'reg_alpha': 0.1,
33 | 'reg_lambda': 0.1,
34 | 'seed': CONSTANT.SEED,
35 | 'nthread': CONSTANT.THREAD_NUM,
36 | }
37 |
38 | data = X.data
39 |
40 | y_train = y
41 |
42 | max_sample_num = min(len(y_train),50000)
43 | y_train = autosample.downsampling_y(y_train,max_sample_num)
44 |
45 | X_train = data.loc[y_train.index]
46 |
47 | X.data = X_train
48 | feat_output = FeatOutput()
49 | X_train,y_train,categories = feat_output.fit_transform_output(X,y_train)
50 |
51 | X.data = data
52 | gc.collect()
53 |
54 | feat_name_cols = list(X_train.columns)
55 | feat_name_maps = { feat_name_cols[i] : str(i) for i in range(len(feat_name_cols)) }
56 | f_feat_name_maps = { str(i) : feat_name_cols[i] for i in range(len(feat_name_cols)) }
57 | new_feat_name_cols = [ feat_name_maps[i] for i in feat_name_cols ]
58 | X_train.columns = new_feat_name_cols
59 |
60 | dtrain = lgb.Dataset(X_train,y_train,feature_name=list(X_train.columns))
61 | model = lgb.train(params,dtrain,
62 | num_boost_round=num_boost_round,
63 | categorical_feature=[],
64 | )
65 |
66 | df_imp = pd.DataFrame({'features': [ f_feat_name_maps[i] for i in model.feature_name() ] ,
67 | 'importances':model.feature_importance()})
68 |
69 | df_imp.sort_values('importances',ascending=False,inplace=True)
70 |
71 | return df_imp
72 |
73 | class LGBFeatureSelection(Feat):
74 | @timeclass(cls='LGBFeatureSelection')
75 | def fit(self,X,y):
76 | now = time.time()
77 | log(f'LGBFeatureSelection:{now-self.config.start_time}')
78 |
79 | threshold = 5
80 | df_imp = lgb_train(X,y)
81 | log(f'importances sum {df_imp["importances"].sum()}')
82 | if df_imp["importances"].sum() != 6200:
83 | keep_feats = list(df_imp.loc[df_imp['importances'] >= threshold,'features'])
84 | if len(keep_feats) < 150:
85 | useful_feats = list(df_imp.loc[df_imp['importances'] > 0,'features'])
86 | if len(useful_feats) <= 150:
87 | keep_feats = useful_feats
88 | else:
89 | df_imp_sorted = df_imp.sort_values(by='importances',ascending=False)
90 | keep_feats = list(df_imp_sorted['features'].iloc[:150])
91 | else:
92 | keep_feats = list(df_imp.loc[df_imp['importances'] >= threshold,'features'])
93 |
94 | log(f'keep feats num {len(keep_feats)}')
95 |
96 | keep_cats = []
97 |
98 | keep_cats_set = set()
99 | cat_set = set(X.cat_cols)
100 |
101 | for feat in keep_feats:
102 |
103 | if X.col2type[feat] == CONSTANT.CATEGORY_TYPE:
104 | if feat in cat_set:
105 | if feat not in keep_cats_set:
106 | keep_cats_set.add(feat)
107 | keep_cats.append(feat)
108 |
109 | elif feat in X.col2source_cat:
110 | keep_feat = X.col2source_cat[feat]
111 | if keep_feat in cat_set:
112 | if keep_feat not in keep_cats_set:
113 | keep_cats_set.add(keep_feat)
114 | keep_cats.append(keep_feat)
115 |
116 | drop_feats = list(set(df_imp['features'].tolist()) - set(keep_feats))
117 |
118 | drop_feats = list(set(drop_feats) - keep_cats_set)
119 | self.drop_feats = drop_feats
120 | log(f'total feat num:{df_imp.shape[0]}, drop feat num:{len(self.drop_feats)}')
121 |
122 | keep_nums = []
123 | for feat in keep_feats:
124 | if X.col2type[feat] == CONSTANT.NUMERICAL_TYPE:
125 | keep_nums.append(feat)
126 |
127 | keep_binaries = []
128 | for feat in keep_feats:
129 | if X.col2type[feat] == CONSTANT.BINARY_TYPE:
130 | keep_binaries.append(feat)
131 |
132 | assert(len(set(keep_cats) & set(drop_feats))==0)
133 | assert(len(set(keep_nums) & set(drop_feats))==0)
134 | assert(len(set(keep_binaries) & set(drop_feats))==0)
135 |
136 | X.reset_combine_cols(keep_cats,keep_nums,keep_binaries)
137 |
138 | @timeclass(cls='LGBFeatureSelection')
139 | def transform(self,X):
140 | X.drop_data(self.drop_feats)
141 | return self.drop_feats
142 |
143 | @timeclass(cls='LGBFeatureSelection')
144 | def fit_transform(self,X,y):
145 | self.fit(X,y)
146 | self.transform(X)
147 | return self.drop_feats
148 |
149 | class LGBFeatureSelectionLast(Feat):
150 | @timeclass(cls='LGBFeatureSelectionLast')
151 | def fit(self,X,y):
152 | now = time.time()
153 | log(f'LGBFeatureSelectionLast:{now-self.config.start_time}')
154 |
155 | start_time = time.time()
156 | df_imp = lgb_train(X,y)
157 |
158 | data = X.data
159 | shape = data.shape
160 | y_pos = len(y[y==1])
161 | y_neg = len(y[y==0])
162 | unbalance_ratio = y_pos / y_neg if y_pos > y_neg else y_neg / y_pos
163 | memory_usage = pd.Series(np.zeros(shape[0]),dtype=np.float32).memory_usage() / 1024 / 1024 / 1024
164 | gc.collect()
165 |
166 | if unbalance_ratio >= 7:
167 | memory_constrain = 2
168 | elif unbalance_ratio >= 4:
169 | memory_constrain = 1.8
170 | else:
171 | memory_constrain = 1.6
172 |
173 | col_constrain = int(memory_constrain / memory_usage)
174 |
175 | end_time = time.time()
176 |
177 | use_time = end_time-start_time
178 | user_time_rate = use_time / self.config.budget
179 |
180 | if user_time_rate > 0.1:
181 | threshold = 13
182 | elif user_time_rate > 0.09:
183 | threshold = 12
184 | elif user_time_rate > 0.08:
185 | threshold = 11
186 | elif user_time_rate > 0.07:
187 | threshold = 10
188 | elif user_time_rate > 0.06:
189 | threshold = 9
190 | elif user_time_rate > 0.05:
191 | threshold = 8
192 | elif user_time_rate > 0.04:
193 | threshold = 7
194 | elif user_time_rate > 0.03:
195 | threshold = 6
196 | else:
197 | threshold = 5
198 |
199 | log(f'LGBFeatureSelectionLast threshold {threshold}')
200 |
201 | log(f'importances sum {df_imp["importances"].sum()}')
202 | if df_imp["importances"].sum() != 6200:
203 | keep_feats = list(df_imp.loc[df_imp['importances'] >= threshold,'features'])
204 | if len(keep_feats) < 150:
205 | useful_feats = list(df_imp.loc[df_imp['importances'] > 0,'features'])
206 | if len(useful_feats) <= 150:
207 | keep_feats = useful_feats
208 | else:
209 | df_imp_sorted = df_imp.sort_values(by='importances',ascending=False)
210 | keep_feats = list(df_imp_sorted['features'].iloc[:150])
211 | else:
212 | keep_feats = list(df_imp.loc[df_imp['importances'] >= threshold,'features'])
213 |
214 | keep_cats = []
215 |
216 | keep_cats_set = set()
217 | cat_set = set(X.cat_cols)
218 |
219 | for feat in keep_feats:
220 |
221 | if X.col2type[feat] == CONSTANT.CATEGORY_TYPE:
222 | if feat in cat_set:
223 | if feat not in keep_cats_set:
224 | keep_cats_set.add(feat)
225 | keep_cats.append(feat)
226 |
227 | elif feat in X.col2source_cat:
228 | keep_feat = X.col2source_cat[feat]
229 | if keep_feat in cat_set:
230 | if keep_feat not in keep_cats_set:
231 | keep_cats_set.add(keep_feat)
232 | keep_cats.append(keep_feat)
233 |
234 | drop_feats = list(set(df_imp['features'].tolist()) - set(keep_feats))
235 |
236 | drop_feats = list(set(drop_feats) - keep_cats_set)
237 | self.drop_feats = drop_feats
238 | log(f'total feat num:{df_imp.shape[0]}, drop feat num:{len(self.drop_feats)}')
239 |
240 | keep_nums = []
241 | for feat in keep_feats:
242 | if X.col2type[feat] == CONSTANT.NUMERICAL_TYPE:
243 | keep_nums.append(feat)
244 |
245 | keep_binaries = []
246 | for feat in keep_feats:
247 | if X.col2type[feat] == CONSTANT.BINARY_TYPE:
248 | keep_binaries.append(feat)
249 |
250 | assert(len(set(keep_cats) & set(drop_feats))==0)
251 | assert(len(set(keep_nums) & set(drop_feats))==0)
252 | assert(len(set(keep_binaries) & set(drop_feats))==0)
253 |
254 | X.reset_combine_cols(keep_cats,keep_nums,keep_binaries)
255 |
256 | rest_cols = len(df_imp) - len(self.drop_feats)
257 | if rest_cols > col_constrain:
258 | real_keep_feats = set(df_imp['features'].iloc[:col_constrain].tolist())
259 | real_drop_feats = list(set(df_imp['features'].tolist()) - real_keep_feats)
260 | self.drop_feats = real_drop_feats
261 |
262 | @timeclass(cls='LGBFeatureSelectionLast')
263 | def transform(self,X):
264 | X.drop_data(self.drop_feats)
265 | return self.drop_feats
266 |
267 | @timeclass(cls='LGBFeatureSelectionLast')
268 | def fit_transform(self,X,y):
269 | self.fit(X,y)
270 | self.transform(X)
271 | return self.drop_feats
272 |
273 | class LGBFeatureSelectionWait(Feat):
274 | @timeclass(cls='LGBFeatureSelectionWait')
275 | def fit(self,X,y):
276 | now = time.time()
277 | log(f'LGBFeatureSelection:{now-self.config.start_time}')
278 |
279 | threshold = 5
280 | df_imp = lgb_train(X,y)
281 | drop_feats = set(df_imp.loc[df_imp['importances'] < threshold,'features'])
282 | keep_feats = list(df_imp.loc[df_imp['importances'] >= threshold,'features'])
283 |
284 | df_imp.set_index('features',inplace=True)
285 | for cols in X.wait_selection_cols:
286 | drops = df_imp.loc[cols].sort_values(by='importances',ascending=False).index[self.config.wait_feat_selection_num:]
287 | drops = set(drops)
288 | drop_feats = drop_feats | drops
289 |
290 | keep_cats = []
291 |
292 | keep_cats_set = set()
293 | cat_set = set(X.cat_cols)
294 | for feat in keep_feats:
295 |
296 | if X.col2type[feat] == CONSTANT.CATEGORY_TYPE:
297 | if feat in cat_set:
298 | if feat not in keep_cats_set:
299 | keep_cats_set.add(feat)
300 | keep_cats.append(feat)
301 |
302 | elif feat in X.col2source_cat:
303 | keep_feat = X.col2source_cat[feat]
304 | if keep_feat in cat_set:
305 | if keep_feat not in keep_cats_set:
306 | keep_cats_set.add(keep_feat)
307 | keep_cats.append(keep_feat)
308 |
309 |
310 | drop_feats = drop_feats - keep_cats_set
311 | drop_feats = list(drop_feats)
312 | self.drop_feats = drop_feats
313 | X.empty_wait_selection_cols()
314 | log(f'total feat num:{df_imp.shape[0]}, drop feat num:{len(self.drop_feats)}')
315 |
316 | assert(len(set(keep_cats) & set(drop_feats))==0)
317 |
318 | @timeclass(cls='LGBFeatureSelectionWait')
319 | def transform(self,X):
320 | X.drop_data(self.drop_feats)
321 | return self.drop_feats
322 |
323 | @timeclass(cls='LGBFeatureSelectionWait')
324 | def fit_transform(self,X,y):
325 | self.fit(X,y)
326 | self.transform(X)
327 | return self.drop_feats
328 |
--------------------------------------------------------------------------------
/auto_smart/auto_smart/feat/merge_feat.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | class MergeFeat:
4 | def __init__(self,key):
5 | self.key = key
6 |
7 | def fit(self,U,V):
8 | pass
9 |
10 | def transform(self,U,V):
11 | pass
12 |
13 | def fit_transform(self,U,V):
14 | pass
15 |
16 | class PreTimeM2M(MergeFeat):
17 | pass
18 |
19 | class PreO2O(MergeFeat):
20 | pass
21 |
22 | class PreM2O(MergeFeat):
23 | pass
24 |
25 | class PreO2M(MergeFeat):
26 | pass
27 |
28 | class PreM2M(MergeFeat):
29 | pass
30 |
31 | class O2O(MergeFeat):
32 | pass
33 |
34 | class M2O(MergeFeat):
35 | pass
36 |
37 | class O2M(MergeFeat):
38 | pass
39 |
40 |
41 | class M2M(MergeFeat):
42 | pass
43 |
44 | class TimeM2M(MergeFeat):
45 | pass
46 |
47 | class CmjTimeM2M(MergeFeat):
48 | def __init__(self,key,time_key,u_key_time_col):
49 | self.key = key
50 | self.time_key = time_key
51 | self.u_key_time_col = u_key_time_col
52 |
53 | def fit(self,T):
54 | pass
55 |
56 | def transform(self,T):
57 | pass
58 |
59 | def fit_transform(self,T):
60 | pass
--------------------------------------------------------------------------------
/auto_smart/auto_smart/feat/merge_feat_pipeline.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from .default_merge_feat import *
3 |
4 | class MergeFeatPipeline:
5 | def __init__(self):
6 | self.preM2Ms = []
7 | self.preO2Ms = []
8 |
9 | self.TimeM2Ms = []
10 | self.newTimeM2Ms = []
11 |
12 | self.O2Ms = []
13 | self.M2Ms = []
14 |
15 | self.preM2Os = []
16 | self.preO2Os = []
17 |
18 | self.O2Os = []
19 | self.M2Os = []
20 |
21 |
22 | class DeafultMergeFeatPipeline(MergeFeatPipeline):
23 | def __init__(self):
24 | super(DeafultMergeFeatPipeline,self).__init__()
25 |
26 | self.main_init()
27 |
28 | def main_init(self):
29 |
30 | self.newTimeM2Ms = [TimeM2MnewLastData]
31 |
32 | self.preM2Ms = []
33 | self.M2Ms = [M2MKeyCount, M2MNumMean,M2MDataLast]
34 |
35 | self.preO2Ms = []
36 | self.O2Ms = [M2MKeyCount, M2MNumMean,M2MDataLast]
37 |
38 | self.preO2Os = []
39 | self.O2Os = [M2OJoin]
40 |
41 | self.preM2Os = []
42 | self.M2Os = [M2OJoin]
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
--------------------------------------------------------------------------------
/auto_smart/auto_smart/feat_context.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import CONSTANT
3 |
4 | class FeatContext:
5 | @staticmethod
6 | def gen_feat_name(namespace,cls_name,feat_name,feat_type):
7 | prefix = CONSTANT.type2prefix[feat_type]
8 |
9 |
10 | return f"{prefix}{cls_name}:{feat_name}:{namespace}"
11 |
12 | @staticmethod
13 | def gen_merge_name(table_name,feat_name,feat_type):
14 | prefix = CONSTANT.type2prefix[feat_type]
15 | return f"{prefix}{table_name}.({feat_name})"
16 |
17 | @staticmethod
18 | def gen_merge_feat_name(namespace,cls_name,feat_name,feat_type,table_name):
19 | feat_name = FeatContext.gen_feat_name(namespace,cls_name,feat_name,feat_type)
20 | return FeatContext.gen_merge_name(table_name,feat_name,feat_type)
21 |
--------------------------------------------------------------------------------
/auto_smart/auto_smart/feat_engine.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from feat.feat_pipeline import FeatPipeline
4 | from util import timeclass
5 |
6 | class FeatEngine:
7 | def __init__(self, feat_pipeline: FeatPipeline, config):
8 | self.feat_pipeline = feat_pipeline
9 | self.config = config
10 |
11 | @timeclass(cls='FeatEngine')
12 | def fit_order1(self,table,y):
13 | self.feats_order1 = []
14 | for feat_cls in self.feat_pipeline.order1s:
15 | feat = feat_cls(self.config)
16 | feat.fit(table,y)
17 | self.feats_order1.append(feat)
18 |
19 | @timeclass(cls='FeatEngine')
20 | def transform_order1(self,table):
21 | for feat in self.feats_order1:
22 | feat.transform(table)
23 |
24 | @timeclass(cls='FeatEngine')
25 | def fit_transform_order1(self,table,y):
26 | self.feats_order1 = []
27 | for feat_cls in self.feat_pipeline.order1s:
28 | feat = feat_cls(self.config)
29 | feat.fit_transform(table,y)
30 | self.feats_order1.append(feat)
31 |
32 |
33 | @timeclass(cls='FeatEngine')
34 | def fit_keys_order2(self,table,y):
35 | self.feats_keys_order2 = []
36 | for feat_cls in self.feat_pipeline.keys_order2s:
37 | feat = feat_cls(self.config)
38 | feat.fit(table,y)
39 | self.feats_keys_order2.append(feat)
40 |
41 | @timeclass(cls='FeatEngine')
42 | def transform_keys_order2(self,table):
43 | for feat in self.feats_keys_order2:
44 | feat.transform(table)
45 |
46 | @timeclass(cls='FeatEngine')
47 | def fit_transform_keys_order2(self,table,y,sample=False,selection=True):
48 | if not self.feat_pipeline.keys_order2s:
49 | return
50 |
51 | if sample:
52 | self.feats_keys_order2 = []
53 | self.keys_order2_new_cols = []
54 | for feat_cls in self.feat_pipeline.keys_order2s[:-1]:
55 | feat = feat_cls(self.config)
56 | new_cols = feat.fit_transform(table,y)
57 | self.feats_keys_order2.append(feat)
58 | self.keys_order2_new_cols.append(set(new_cols))
59 |
60 | feat_cls = self.feat_pipeline.keys_order2s[-1]
61 | feat = feat_cls(self.config)
62 | drop_feats = set(feat.fit_transform(table,y))
63 | self.feats_keys_order2.append(feat)
64 | for i in range(len(self.keys_order2_new_cols)):
65 | self.keys_order2_new_cols[i] = (set(self.keys_order2_new_cols[i]) - drop_feats)
66 |
67 | if not sample:
68 | if selection:
69 | self.feats_keys_order2 = []
70 | for i,feat_cls in enumerate(self.feat_pipeline.keys_order2s):
71 | feat = feat_cls(self.config)
72 | feat.fit_transform(table,y)
73 | self.feats_keys_order2.append(feat)
74 | if not selection:
75 | for i,feat_cls in enumerate(self.feat_pipeline.keys_order2s[:-1]):
76 | feat = feat_cls(self.config)
77 | feat.fit_transform(table,y,self.keys_order2_new_cols[i])
78 | self.feats_keys_order2.append(feat)
79 |
80 | @timeclass(cls='FeatEngine')
81 | def fit_all_order2(self,table,y):
82 | self.feats_all_order2 = []
83 | for feat_cls in self.feat_pipeline.all_order2s:
84 | feat = feat_cls(self.config)
85 | feat.fit(table,y)
86 | self.feats_all_order2.append(feat)
87 |
88 | @timeclass(cls='FeatEngine')
89 | def transform_all_order2(self,table):
90 | for feat in self.feats_all_order2:
91 | feat.transform(table)
92 |
93 | @timeclass(cls='FeatEngine')
94 | def fit_transform_all_order2(self,table,y,sample=False,selection=True):
95 | if not self.feat_pipeline.all_order2s:
96 | return
97 |
98 | if sample:
99 | self.feats_all_order2 = []
100 | self.all_order2_new_cols = []
101 | for feat_cls in self.feat_pipeline.all_order2s[:-1]:
102 | feat = feat_cls(self.config)
103 | new_cols = feat.fit_transform(table,y)
104 | self.feats_all_order2.append(feat)
105 | self.all_order2_new_cols.append(set(new_cols))
106 |
107 | feat_cls = self.feat_pipeline.all_order2s[-1]
108 | feat = feat_cls(self.config)
109 | drop_feats = set(feat.fit_transform(table,y))
110 | self.feats_all_order2.append(feat)
111 | for i in range(len(self.all_order2_new_cols)):
112 | self.all_order2_new_cols[i] = set(self.all_order2_new_cols[i]) - drop_feats
113 |
114 | if not sample:
115 | if selection:
116 | self.feats_all_order2 = []
117 | for i,feat_cls in enumerate(self.feat_pipeline.all_order2s):
118 | feat = feat_cls(self.config)
119 | feat.fit_transform(table,y)
120 | self.feats_all_order2.append(feat)
121 | if not selection:
122 | for i,feat_cls in enumerate(self.feat_pipeline.all_order2s[:-1]):
123 | feat = feat_cls(self.config)
124 | feat.fit_transform(table,y,self.all_order2_new_cols[i])
125 | self.feats_all_order2.append(feat)
126 |
127 | @timeclass(cls='FeatEngine')
128 | def fit_keys_order3(self,table,y):
129 | self.feats_keys_order3 = []
130 | for feat_cls in self.feat_pipeline.keys_order3s:
131 | feat = feat_cls(self.config)
132 | feat.fit(table,y)
133 | self.feats_keys_order3.append(feat)
134 |
135 | @timeclass(cls='FeatEngine')
136 | def transform_keys_order3(self,table):
137 | for feat in self.feats_keys_order3:
138 | feat.transform(table)
139 |
140 | @timeclass(cls='FeatEngine')
141 | def fit_transform_keys_order3(self,table,y):
142 | self.feats_keys_order3 = []
143 | for feat_cls in self.feat_pipeline.keys_order3s:
144 | feat = feat_cls(self.config)
145 | feat.fit_transform(table,y)
146 | self.feats_keys_order3.append(feat)
147 |
148 |
149 | @timeclass(cls='FeatEngine')
150 | def fit_post_order1(self,table,y):
151 | self.feats_post_order1 = []
152 | for feat_cls in self.feat_pipeline.post_order1s:
153 | feat = feat_cls(self.config)
154 | feat.fit(table,y)
155 | self.feats_post_order1.append(feat)
156 |
157 | @timeclass(cls='FeatEngine')
158 | def transform_post_order1(self,table):
159 | for feat in self.feats_post_order1:
160 | feat.transform(table)
161 |
162 | @timeclass(cls='FeatEngine')
163 | def fit_transform_post_order1(self,table,y):
164 | self.feats_post_order1 = []
165 | for feat_cls in self.feat_pipeline.post_order1s:
166 | feat = feat_cls(self.config)
167 | feat.fit_transform(table,y)
168 | self.feats_post_order1.append(feat)
169 |
170 | @timeclass(cls='FeatEngine')
171 | def fit_merge_order1(self,table,y):
172 | self.feats_merge_order1 = []
173 | for feat_cls in self.feat_pipeline.merge_order1s:
174 | feat = feat_cls(self.config)
175 | feat.fit(table,y)
176 | self.feats_merge_order1.append(feat)
177 |
178 | @timeclass(cls='FeatEngine')
179 | def transform_merge_order1(self,table):
180 | for feat in self.feats_merge_order1:
181 | feat.transform(table)
182 |
183 | @timeclass(cls='FeatEngine')
184 | def fit_transform_merge_order1(self,table,y):
185 | self.feats_merge_order1 = []
186 | for feat_cls in self.feat_pipeline.merge_order1s:
187 | feat = feat_cls(self.config)
188 | feat.fit_transform(table,y)
189 | self.feats_merge_order1.append(feat)
--------------------------------------------------------------------------------
/auto_smart/auto_smart/merger.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import pandas as pd
3 |
4 | import CONSTANT
5 | from util import log, timeclass
6 | from feat.merge_feat_pipeline import MergeFeatPipeline
7 | import copy
8 | import gc
9 | from data_tools import downcast
10 |
11 | class Merger:
12 | def __init__(self,merge_feat_pipeline: MergeFeatPipeline):
13 | self.merge_feat_pipeline = merge_feat_pipeline
14 |
15 |
16 |
17 | @timeclass(cls='Merger')
18 | def merge(self,key,u,v,ttype,z2f):
19 | feats = []
20 | col2type = {}
21 | col2groupby = {}
22 | col2block = {}
23 |
24 | if u.key_time_col is not None and v.key_time_col is not None and ttype=='many_to_many':
25 |
26 | if z2f and self.merge_timem2m and (key in u.user_cols):
27 | self.merge_timem2m = False
28 | for merge_feat_cls in self.merge_feat_pipeline.newTimeM2Ms:
29 | merge_feat = merge_feat_cls(key)
30 | merge_feat.fit_transform(u,v)
31 |
32 | for merge_feat_cls in self.merge_feat_pipeline.preM2Ms:
33 | merge_feat = merge_feat_cls(key)
34 | merge_feat.fit_transform(u,v)
35 |
36 | for merge_feat_cls in self.merge_feat_pipeline.M2Ms:
37 | merge_feat = merge_feat_cls(key)
38 | v_feat,v_col2type,v_col2block = merge_feat.fit_transform(u,v)
39 | feats.append(v_feat)
40 | col2type.update(v_col2type)
41 | col2block.update(v_col2block)
42 |
43 | elif ttype == 'one_to_one':
44 | for merge_feat_cls in self.merge_feat_pipeline.preO2Os:
45 | merge_feat = merge_feat_cls(key)
46 | merge_feat.fit_transform(u,v)
47 |
48 | for merge_feat_cls in self.merge_feat_pipeline.O2Os:
49 | merge_feat = merge_feat_cls(key)
50 | v_feat,v_col2type,v_col2block = merge_feat.fit_transform(u,v)
51 | feats.append(v_feat)
52 | col2type.update(v_col2type)
53 | col2block.update(v_col2block)
54 |
55 | elif ttype == 'many_to_one':
56 | for merge_feat_cls in self.merge_feat_pipeline.preM2Os:
57 | merge_feat = merge_feat_cls(key)
58 | merge_feat.fit_transform(u,v)
59 |
60 | for merge_feat_cls in self.merge_feat_pipeline.M2Os:
61 | merge_feat = merge_feat_cls(key)
62 | v_feat,v_col2type,v_col2block = merge_feat.fit_transform(u,v)
63 | feats.append(v_feat)
64 | col2type.update(v_col2type)
65 | col2block.update(v_col2block)
66 |
67 | elif ttype == 'one_to_many':
68 | for merge_feat_cls in self.merge_feat_pipeline.preO2Ms:
69 | merge_feat = merge_feat_cls(key)
70 | merge_feat.fit_transform(u,v)
71 |
72 | for merge_feat_cls in self.merge_feat_pipeline.O2Ms:
73 | merge_feat = merge_feat_cls(key)
74 | v_feat,v_col2type,v_col2block = merge_feat.fit_transform(u,v)
75 | feats.append(v_feat)
76 | col2type.update(v_col2type)
77 | col2block.update(v_col2block)
78 |
79 | elif ttype == 'many_to_many':
80 | for merge_feat_cls in self.merge_feat_pipeline.preM2Ms:
81 | merge_feat = merge_feat_cls(key)
82 | merge_feat.fit_transform(u,v)
83 |
84 | for merge_feat_cls in self.merge_feat_pipeline.M2Ms:
85 | merge_feat = merge_feat_cls(key)
86 | v_feat,v_col2type,v_col2block = merge_feat.fit_transform(u,v)
87 | feats.append(v_feat)
88 | col2type.update(v_col2type)
89 | col2block.update(v_col2block)
90 | if feats:
91 | feat = pd.concat(feats,axis=1)
92 | col2groupby = {col:key for col in feat.columns}
93 |
94 | del feats,v
95 | gc.collect()
96 |
97 | data = u.data
98 | index = data.index
99 | data.set_index(key,inplace=True)
100 |
101 | cols = list(feat.columns)
102 | data[cols] = feat
103 | data.reset_index(key,inplace=True)
104 | data[key] = downcast(data[key],accuracy_loss=False)
105 | data.index= index
106 |
107 | u.update_data(data,col2type,col2groupby,None,col2block,None)
108 |
109 | @timeclass(cls='Merger')
110 | def dfs(self,u_name, graph):
111 | depth = graph.depth
112 | name2table = graph.name2table
113 | rel_graph = graph.rel_graph
114 |
115 | u = name2table[u_name]
116 | log(f"enter {u_name}")
117 | for edge in rel_graph[u_name]:
118 | v_name = edge['to']
119 | if depth[v_name]['depth'] <= depth[u_name]['depth']:
120 | continue
121 |
122 | v = self.dfs(v_name, graph)
123 | key = edge['key']
124 | assert len(key) == 1
125 | key = key[0]
126 | type_ = edge['type']
127 |
128 | log(f"join {u_name} <--{type_}--t {v_name}")
129 | self.merge(key,u,v,type_,0)
130 |
131 | log(f"join {u_name} <--{type_}--nt {v_name}")
132 |
133 | del v
134 |
135 | log(f"leave {u_name}")
136 | return u
137 |
138 | @timeclass(cls='Merger')
139 | def merge_to_main_fit_transform(self,graph):
140 | depth = graph.depth
141 | name2table = graph.name2table
142 |
143 | u_name = CONSTANT.MAIN_TABLE_NAME
144 | u = name2table[u_name]
145 | rel_graph = graph.rel_graph
146 |
147 | table2feat = {}
148 | for edge in rel_graph[u_name]:
149 | v_name = edge['to']
150 | if depth[v_name]['depth'] <= depth[u_name]['depth']:
151 | continue
152 |
153 | v = name2table[v_name]
154 | key = edge['key']
155 | assert len(key) == 1
156 | key = key[0]
157 | type_ = edge['type']
158 |
159 | log(f"join {u_name} <--{type_}--t {v_name}")
160 | table2feat[v_name] = self.merge(key,u,v,type_,1)
161 | log(f"join {u_name} <--{type_}--nt {v_name}")
162 |
163 | self.table2feat = table2feat
164 | return u
165 |
166 | @timeclass(cls='Merger')
167 | def merge_table(self,graph):
168 | self.use_all_time_m2m = False
169 | if graph.M2M_relation_cnt < 3:
170 | self.use_all_time_m2m = True
171 |
172 | self.merge_timem2m = True
173 |
174 | graph.build_depth()
175 |
176 | depth = graph.depth
177 | u_name = CONSTANT.MAIN_TABLE_NAME
178 | rel_graph = graph.rel_graph
179 |
180 | for edge in rel_graph[u_name]:
181 | v_name = edge['to']
182 | if depth[v_name]['depth'] <= depth[u_name]['depth']:
183 | continue
184 |
185 | self.dfs(v_name,graph)
186 |
--------------------------------------------------------------------------------
/auto_smart/auto_smart/metadata:
--------------------------------------------------------------------------------
1 | description: Provides prediction model to be executed by the ingestion program
--------------------------------------------------------------------------------
/auto_smart/auto_smart/model.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import copy
6 | import CONSTANT
7 | from util import log, timeclass
8 | from table.graph import Graph
9 | from sklearn.metrics import roc_auc_score
10 | from feat.merge_feat_pipeline import DeafultMergeFeatPipeline
11 | from feat.feat_pipeline import DefaultFeatPipeline
12 |
13 | from merger import Merger
14 | from feat_engine import FeatEngine
15 | from model_input import FeatOutput
16 | from automl.model_selection import time_train_test_split
17 | from automl.auto_lgb import AutoLGB
18 | from PATHS import feature_importance_path,version
19 | from datetime import datetime
20 | import gc
21 | from config import Config
22 | import time
23 |
24 | class Model:
25 | auc = []
26 | ensemble_auc = []
27 | ensemble_train_auc = []
28 |
29 | def __init__(self, info):
30 | self.info = copy.deepcopy(info)
31 | self.tables = None
32 |
33 | def shuffle(self,X,y,random_state):
34 | idx = np.arange(len(X))
35 | np.random.shuffle(idx)
36 | X = X.iloc[idx]
37 | y = y.iloc[idx]
38 | return X,y
39 |
40 | def release_tables(self,Xs,graph):
41 |
42 | for name in graph.tables:
43 | del Xs[name]
44 | del graph.name2table[name]
45 |
46 | gc.collect()
47 |
48 | @timeclass(cls='Model')
49 | def my_fit(self, Xs, y,X_test):
50 | np.random.seed(CONSTANT.SEED)
51 |
52 | split = CONSTANT.SPLIT
53 |
54 | self.split = split
55 |
56 | log(f'split {split}')
57 |
58 | if split == -1:
59 | config = Config(time.time(),self.info['time_budget'])
60 |
61 | X_test.index = -X_test.index-1
62 |
63 | main_shape = Xs[CONSTANT.MAIN_TABLE_NAME].shape[0]
64 | main_max_shape = 2888888
65 | main_min_shape = min( main_shape,100000 )
66 |
67 | test_shape = X_test.shape[0]
68 | max_accept_shape = 3999999
69 |
70 | if main_shape + test_shape > max_accept_shape:
71 | sample_main_shape = max_accept_shape - test_shape
72 | if sample_main_shape > main_max_shape:
73 | sample_main_shape = main_max_shape
74 | if sample_main_shape < main_min_shape:
75 | sample_main_shape = main_min_shape
76 | log(f'start sample main table. origin main shape {main_shape} test shape {test_shape} sample rows num {sample_main_shape}')
77 | if 'time_col' in self.info:
78 | key_time_col = self.info['time_col']
79 | if key_time_col in Xs[CONSTANT.MAIN_TABLE_NAME].columns:
80 | Xs[CONSTANT.MAIN_TABLE_NAME].sort_values(by=key_time_col,inplace=True)
81 | Xs[CONSTANT.MAIN_TABLE_NAME] = Xs[CONSTANT.MAIN_TABLE_NAME].iloc[-sample_main_shape:]
82 | gc.collect()
83 |
84 |
85 | Xs[CONSTANT.MAIN_TABLE_NAME] = pd.concat([Xs[CONSTANT.MAIN_TABLE_NAME], X_test])
86 |
87 | X_test.drop(X_test.columns,axis=1,inplace=True)
88 | gc.collect()
89 |
90 | graph = Graph(self.info,Xs)
91 | graph.sort_tables()
92 | train_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[Xs[CONSTANT.MAIN_TABLE_NAME].index>=0]
93 | y = y.loc[train_index]
94 | test_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[Xs[CONSTANT.MAIN_TABLE_NAME].index<0]
95 |
96 | graph.preprocess_fit_transform()
97 | gc.collect()
98 |
99 | merge_feat_pipeline = DeafultMergeFeatPipeline()
100 | merger = Merger(merge_feat_pipeline)
101 |
102 | merger.merge_table(graph)
103 | main_table = merger.merge_to_main_fit_transform(graph)
104 | self.release_tables(Xs,graph)
105 | del merger
106 | del graph
107 | gc.collect()
108 |
109 | feat_pipeline = DefaultFeatPipeline()
110 | feat_engine = FeatEngine(feat_pipeline,config)
111 | feat_engine.fit_transform_order1(main_table,y)
112 |
113 | sample_for_combine_features = True
114 |
115 | if sample_for_combine_features:
116 | main_data = main_table.data
117 | train_data = main_data.loc[main_data.index>=0]
118 |
119 | del main_data
120 |
121 | sample_num = CONSTANT.SAMPLE_NUM
122 | train_shape = train_data.shape
123 |
124 | if train_shape[0] <= sample_num:
125 | sample_for_combine_features = False
126 | else:
127 | data_tail_new = train_data.iloc[-sample_num:]
128 |
129 | gc.collect()
130 |
131 | y_tail_new = y.loc[data_tail_new.index]
132 |
133 | table_tail_new = copy.deepcopy(main_table)
134 | table_tail_new.data = data_tail_new
135 |
136 | del data_tail_new
137 | gc.collect()
138 |
139 | feat_engine.fit_transform_all_order2(table_tail_new,y_tail_new,sample=True)
140 | feat_engine.fit_transform_keys_order2(table_tail_new,y_tail_new,sample=True)
141 |
142 | del table_tail_new,y_tail_new
143 | gc.collect()
144 |
145 | feat_engine.fit_transform_all_order2(main_table,y,selection=False)
146 | feat_engine.fit_transform_keys_order2(main_table,y,selection=False)
147 |
148 | feat_engine.fit_transform_post_order1(main_table,y)
149 |
150 | if not sample_for_combine_features:
151 | gc.collect()
152 |
153 | feat_engine.fit_transform_all_order2(main_table,y)
154 | feat_engine.fit_transform_keys_order2(main_table,y)
155 |
156 | feat_engine.fit_transform_keys_order3(main_table,y)
157 | feat_engine.fit_transform_post_order1(main_table,y)
158 |
159 |
160 | del feat_engine
161 | gc.collect()
162 |
163 |
164 | X_test = main_table.data.loc[test_index]
165 | main_table.data = main_table.data.loc[train_index]
166 |
167 | gc.collect()
168 |
169 | test_table = copy.deepcopy(main_table)
170 | test_table.data = X_test
171 | self.test_table = test_table
172 | len_test = X_test.shape[0]
173 | gc.collect()
174 |
175 | feat_engine = FeatEngine(feat_pipeline,config)
176 | feat_engine.fit_transform_merge_order1(main_table,y)
177 | self.feat_engine = feat_engine
178 |
179 | feat_output = FeatOutput()
180 | self.feat_output = feat_output
181 | X,y,categories = feat_output.final_fit_transform_output(main_table,y)
182 |
183 | del main_table
184 | gc.collect()
185 |
186 | lgb = AutoLGB()
187 |
188 | lgb.param_compute(X,y,categories,config)
189 | X_train,y_train,X_test,y_test = time_train_test_split(X,y,test_rate=0.2)
190 |
191 | lgb.param_opt_new(X_train,y_train,X_test,y_test,categories)
192 |
193 | gc.collect()
194 |
195 | del X_train,y_train,X_test,y_test
196 |
197 | gc.collect()
198 |
199 | X,y = self.shuffle(X,y,2019)
200 | gc.collect()
201 |
202 | lgb.ensemble_train(X,y,categories,config,len_test)
203 |
204 | gc.collect()
205 |
206 | importances = lgb.get_ensemble_importances()
207 |
208 | self.model = lgb
209 | del X,y
210 |
211 | elif split == -2:
212 |
213 | config = Config(time.time(),self.info['time_budget'])
214 |
215 | Xs[CONSTANT.MAIN_TABLE_NAME] = pd.concat([Xs[CONSTANT.MAIN_TABLE_NAME], ])
216 |
217 | gc.collect()
218 |
219 | graph = Graph(self.info,Xs)
220 | graph.sort_tables()
221 | train_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[Xs[CONSTANT.MAIN_TABLE_NAME].index>=0]
222 | y = y.loc[train_index]
223 |
224 | graph.preprocess_fit_transform()
225 | gc.collect()
226 |
227 | merge_feat_pipeline = DeafultMergeFeatPipeline()
228 | merger = Merger(merge_feat_pipeline)
229 |
230 | merger.merge_table(graph)
231 | main_table = merger.merge_to_main_fit_transform(graph)
232 | self.release_tables(Xs,graph)
233 | del merger
234 | del graph
235 | gc.collect()
236 |
237 | feat_pipeline = DefaultFeatPipeline()
238 | feat_engine = FeatEngine(feat_pipeline,config)
239 | feat_engine.fit_transform_order1(main_table,y)
240 |
241 | sample_for_combine_features = True
242 |
243 | if sample_for_combine_features:
244 | main_data = main_table.data
245 | train_data = main_data.loc[main_data.index>=0]
246 |
247 | del main_data
248 |
249 | sample_num = CONSTANT.SAMPLE_NUM
250 | train_shape = train_data.shape
251 |
252 | if train_shape[0] <= sample_num:
253 | sample_for_combine_features = False
254 | else:
255 | data_tail_new = train_data.iloc[-sample_num:]
256 |
257 | gc.collect()
258 | log(f'sample data shape {data_tail_new.shape}')
259 |
260 | y_tail_new = y.loc[data_tail_new.index]
261 |
262 | table_tail_new = copy.deepcopy(main_table)
263 | table_tail_new.data = data_tail_new
264 |
265 | del data_tail_new
266 | gc.collect()
267 |
268 | feat_engine.fit_transform_all_order2(table_tail_new,y_tail_new,sample=True)
269 | feat_engine.fit_transform_keys_order2(table_tail_new,y_tail_new,sample=True)
270 |
271 | del table_tail_new,y_tail_new
272 | gc.collect()
273 |
274 | feat_engine.fit_transform_all_order2(main_table,y,selection=False)
275 | feat_engine.fit_transform_keys_order2(main_table,y,selection=False)
276 | feat_engine.fit_transform_post_order1(main_table,y)
277 |
278 | if not sample_for_combine_features:
279 | gc.collect()
280 |
281 | feat_engine.fit_transform_all_order2(main_table,y)
282 | feat_engine.fit_transform_keys_order2(main_table,y)
283 | feat_engine.fit_transform_keys_order3(main_table,y)
284 | feat_engine.fit_transform_post_order1(main_table,y)
285 |
286 | del feat_engine
287 | gc.collect()
288 |
289 | main_table.data = main_table.data.loc[train_index]
290 |
291 | gc.collect()
292 |
293 | def split_table(table,y):
294 | X = table.data
295 | X_train,y_train,X_test,y_test = time_train_test_split(X,y,shuffle=False,test_rate=0.2)
296 | table1 = copy.deepcopy(table)
297 | table1.data = X_train
298 | table2 = copy.deepcopy(table)
299 | table2.data = X_test
300 | return table1,y_train,table2,y_test
301 |
302 | table1,y_train,table2,y_test = split_table(main_table,y)
303 |
304 | feat_engine = FeatEngine(feat_pipeline,config)
305 | feat_engine.fit_transform_merge_order1(table1,y_train)
306 | self.feat_engine = feat_engine
307 |
308 | feat_output = FeatOutput()
309 | self.feat_output = feat_output
310 |
311 | X_train,y_train,categories = feat_output.fit_transform_output(table1,y_train)
312 |
313 | gc.collect()
314 | self.feat_engine.transform_merge_order1(table2)
315 | X_test = self.feat_output.transform_output(table2)
316 |
317 | lgb = AutoLGB()
318 |
319 | lgb.param_compute(X_train,y_train,categories,config)
320 |
321 | lgb.param_opt_new(X_train,y_train,X_test,y_test,categories)
322 |
323 | len_test = X_test.shape[0]
324 |
325 | lgb.ensemble_train(X_train,y_train,categories,config,len_test)
326 | gc.collect()
327 |
328 | pred,pred0 = lgb.ensemble_predict_test(X_test)
329 |
330 | auc = roc_auc_score(y_test,pred0)
331 | print('source AUC:',auc)
332 |
333 | auc = roc_auc_score(y_test,pred)
334 | Model.ensemble_auc.append(auc)
335 | print('ensemble AUC:',auc)
336 |
337 | importances = lgb.get_ensemble_importances()
338 |
339 | self.model = lgb
340 |
341 | del X_train,y_train,X_test,y_test
342 | gc.collect()
343 |
344 | paths = os.path.join(feature_importance_path,version)
345 | if not os.path.exists(paths):
346 | os.makedirs(paths)
347 | importances.to_csv(os.path.join(paths,'{}_importances.csv'.format(datetime.now().strftime('%Y%m%d%H%M%S'))),index=False)
348 |
349 | @timeclass(cls='Model')
350 | def fit(self, Xs, y):
351 | self.Xs = Xs
352 | self.y = y
353 |
354 |
355 | @timeclass(cls='Model')
356 | def predict(self, X_test):
357 |
358 | self.my_fit(self.Xs, self.y, X_test)
359 |
360 | gc.collect()
361 |
362 | if self.split != -2:
363 | main_table = self.test_table
364 | self.feat_engine.transform_merge_order1(main_table)
365 | X = self.feat_output.transform_output(main_table)
366 |
367 | X.index = -(X.index+1)
368 | X.sort_index(inplace=True)
369 |
370 | result = self.model.ensemble_predict(X)
371 | return pd.Series(result)
372 |
373 | else:
374 | return pd.Series()
375 |
--------------------------------------------------------------------------------
/auto_smart/auto_smart/model_input.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | from util import log, timeit, timeclass
5 | import numpy as np
6 | import gc
7 | import sys
8 |
9 | class FeatOutput:
10 | @timeclass(cls='FeatOutput')
11 | def transform_output(self,table):
12 | X = table.data
13 |
14 | self.drop_non_numerical_column(table,X)
15 | self.drop_post_drop_column(table,X)
16 |
17 | return X
18 |
19 | @timeclass(cls='FeatOutput')
20 |
21 | def fit_transform_output(self,table,y):
22 | X = table.data.copy()
23 |
24 | self.drop_non_numerical_column(table,X)
25 | self.drop_post_drop_column(table,X)
26 |
27 | categories = self.get_categories(table,X)
28 |
29 | return X,y,categories
30 |
31 | def final_fit_transform_output(self,table,y):
32 | X = table.data
33 |
34 |
35 | self.drop_non_numerical_column(table,X)
36 | self.drop_post_drop_column(table,X)
37 |
38 | categories = self.get_categories(table,X)
39 |
40 | return X,y,categories
41 |
42 | @timeclass(cls='FeatOutput')
43 | def fillna(self,table,X):
44 | for col in table.num_cols:
45 | X[col] = X[col].fillna(X[col].mean())
46 |
47 |
48 | def get_categories(self,table,X):
49 | categories = []
50 | col_set = set(X.columns)
51 | for col in table.cat_cols:
52 | if col in col_set:
53 | if X[col].nunique() <= 15:
54 | categories.append(col)
55 |
56 |
57 | return categories
58 |
59 | @timeclass(cls='FeatOutput')
60 | def drop_non_numerical_column(self,table,X):
61 | if table.key_time_col is not None:
62 |
63 | X.drop(table.key_time_col,axis=1,inplace=True)
64 | gc.collect()
65 |
66 | if len(table.time_cols) != 0:
67 | X.drop(table.time_cols,axis=1,inplace=True)
68 |
69 | if len(table.multi_cat_cols) != 0:
70 | X.drop(table.multi_cat_cols,axis=1,inplace=True)
71 |
72 | @timeclass(cls='FeatOutput')
73 | def drop_post_drop_column(self,table,X):
74 | if len(table.post_drop_set) != 0:
75 | drop_cols = list(table.post_drop_set)
76 | X.drop(drop_cols,axis=1,inplace=True)
77 | log(f'post drop cols:{drop_cols}')
78 |
79 | @timeclass(cls='FeatOutput')
80 | def drop_cat_column(self,table,X):
81 | X.drop(list(set(table.session_cols + table.user_cols + table.key_cols + table.cat_cols)&set(X.columns)),axis=1,inplace=True)
82 |
83 | @timeclass(cls='FeatOutput')
84 | def cat_hash(self,table,X):
85 | for col in table.user_cols + table.key_cols + table.cat_cols:
86 | X[col] = X[col] % 15
87 |
88 | @timeclass(cls='FeatOutput')
89 | def cat_process(self,train_table,test_table):
90 | X = train_table
91 |
92 | train = train_table.data
93 | test = test_table.data
94 | for col in X.user_cols + X.key_cols + X.cat_cols:
95 | inter = set(train[col].unique()) & set(test[col].unique())
96 | train.loc[~(train[col].isin(inter)),col] = np.nan
97 | test.loc[~(test[col].isin(inter)),col] = np.nan
98 |
99 | @timeclass(cls='FeatOutput')
100 | def drop_tail(self,train_table,test_table):
101 | X = train_table
102 |
103 | train = train_table.data
104 | test = test_table.data
105 | for col in X.key_cols + X.cat_cols:
106 | vc = train[col].value_counts()
107 | vc.loc[vc==1] = np.nan
108 | train[col] = train[col].map(vc)
109 | test[col] = test[col].map(vc)
110 |
--------------------------------------------------------------------------------
/auto_smart/auto_smart/preprocessor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepBlueAI/AutoSmart/99a6b44ce5b6520a2463c8a4b3acc675584533be/auto_smart/auto_smart/preprocessor/__init__.py
--------------------------------------------------------------------------------
/auto_smart/auto_smart/preprocessor/preprocessor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import pandas as pd
3 | import numpy as np
4 | import ac
5 | import CONSTANT
6 | from data_tools import downcast
7 | from joblib import Parallel, delayed
8 | from util import timeclass
9 | from feat_context import FeatContext
10 |
11 | namespace = 'preprocess'
12 |
13 | class Preprocessor:
14 | def __init__(self):
15 | pass
16 |
17 | def fit(self,ss):
18 | pass
19 |
20 | def transform(self,ss):
21 | pass
22 |
23 | def fit_transform(self,ss):
24 | pass
25 |
26 | class GeneralPreprocessor(Preprocessor):
27 | def __init__(self):
28 | self.K = 5
29 |
30 | @timeclass(cls='GeneralPreprocessor')
31 | def transform(self,X):
32 |
33 | todo_list = X.multi_cat_cols
34 | if todo_list != []:
35 |
36 | col2muldatas = {}
37 | col2muldatalens = {}
38 |
39 | data = X.data[todo_list]
40 | for col in todo_list:
41 | vals = data[col].values
42 | datas,datalen = ac.get_need_data(vals)
43 |
44 | if len(datalen) != data.shape[0]:
45 | raise Exception('An error with data length happens!!')
46 |
47 | col2muldatas[col] = np.array(datas,dtype='int64').astype(np.int32)
48 | col2muldatalens[col] = np.array(datalen,dtype='int32')
49 |
50 | data = X.data[todo_list]
51 | col2type = {}
52 | col2groupby = {}
53 | for col in data.columns:
54 | data[col] = ac.tuple_encode_func_1(col2muldatas[col],col2muldatalens[col])
55 |
56 | new_cols = []
57 | for col in todo_list:
58 | feat_type = CONSTANT.CATEGORY_TYPE
59 | new_col = col+'_MCEncode'
60 | new_col = FeatContext.gen_feat_name(namespace,self.__class__.__name__,new_col,feat_type)
61 | new_cols.append(new_col)
62 | col2type[new_col] = feat_type
63 | col2groupby[new_col] = col
64 |
65 | data.columns = new_cols
66 | df = X.data
67 | for col in data.columns:
68 | df[col] = downcast(data[col],accuracy_loss=False)
69 |
70 | X.update_data(df,col2type,col2groupby)
71 |
72 | df = X.data
73 | index = df.index
74 | col2type = {}
75 | col2groupby = {}
76 | for col in todo_list:
77 | new_col = col+'_MCLenAsCat'
78 | feat_type = CONSTANT.CATEGORY_TYPE
79 | new_col = FeatContext.gen_feat_name(namespace,self.__class__.__name__,new_col,feat_type)
80 | df[new_col] = downcast( pd.Series( col2muldatalens[col],index ),accuracy_loss=False)
81 |
82 | col2type[new_col] = feat_type
83 | col2groupby[new_col] = col
84 |
85 | X.update_data(df,col2type,col2groupby)
86 |
87 | todo_list = X.time_cols
88 |
89 | if todo_list != []:
90 | df = X.data
91 | col2type = {}
92 | for col in X.time_cols:
93 | new_col = col+'_TimeNum'
94 | feat_type = CONSTANT.NUMERICAL_TYPE
95 | new_col = FeatContext.gen_feat_name(namespace,self.__class__.__name__,new_col,feat_type)
96 |
97 | ss = (df[col] - pd.to_datetime('1970-01-01')).dt.total_seconds()
98 | ss[ss<0] = np.nan
99 | min_time = ss.min()
100 | ss = ss-min_time
101 |
102 | df[new_col] = downcast(ss)
103 |
104 | col2type[new_col] = feat_type
105 |
106 | if len(col2type) > 0:
107 | X.update_data(df,col2type,None)
108 |
109 | @timeclass(cls='GeneralPreprocessor')
110 | def fit_transform(self,X):
111 | return self.transform(X)
112 |
113 | class BinaryPreprocessor(Preprocessor):
114 | def __init__(self):
115 | self.col2cats = {}
116 |
117 | @timeclass(cls='BinaryPreprocessor')
118 | def fit(self,X):
119 | def func(ss):
120 | cats = pd.Categorical(ss).categories
121 | return cats
122 |
123 | df = X.data
124 | todo_cols = X.binary_cols
125 |
126 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(df[col]) for col in todo_cols)
127 | for col,cats in zip(todo_cols,res):
128 | self.col2cats[col] = cats
129 |
130 | @timeclass(cls='BinaryPreprocessor')
131 | def transform(self,X):
132 |
133 | def func(ss,cats):
134 | codes = pd.Categorical(ss,categories=cats).codes
135 | codes = codes.astype('float16')
136 | codes[codes==-1] = np.nan
137 |
138 | return codes
139 |
140 | df = X.data
141 | todo_cols = X.binary_cols
142 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(df[col],self.col2cats[col]) for col in todo_cols)
143 | for col,codes in zip(todo_cols,res):
144 | df[col] = codes
145 |
146 | @timeclass(cls='BinaryPreprocessor')
147 | def fit_transform(self,X):
148 | self.fit(X)
149 | self.transform(X)
150 |
151 | class MSCatPreprocessor(Preprocessor):
152 | def __init__(self):
153 | self.cats = []
154 |
155 | def fit(self,ss):
156 | vals = ss.values
157 |
158 | ss = pd.Series( list(ac.mscat_fit(vals)) )
159 |
160 | if ss.name is None:
161 | ss.name = 'ss'
162 |
163 | cats = ss.dropna().drop_duplicates().values
164 |
165 | if len(self.cats) == 0:
166 | self.cats = sorted(list(cats))
167 | else:
168 | added_cats = sorted(set(cats) - set(self.cats))
169 | self.cats.extend(added_cats)
170 |
171 | def transform(self,ss,kind):
172 |
173 | if kind == CONSTANT.CATEGORY_TYPE:
174 |
175 | codes = pd.Categorical(ss,categories=self.cats).codes + CONSTANT.CAT_SHIFT
176 | codes = codes.astype('float')
177 | codes[codes==(CONSTANT.CAT_SHIFT-1)] = np.nan
178 |
179 | codes = downcast(codes,accuracy_loss=False)
180 | return codes
181 | else:
182 | codes = pd.Series( ac.mscat_trans(ss.values,self.cats) , index = ss.index )
183 | return codes
184 |
185 | def fit_transform(self,ss):
186 | return self.transform(ss)
187 |
188 | class NumPreprocessor(Preprocessor):
189 | def fit(self,X):
190 | pass
191 |
192 | def transform(self,X):
193 | df = X.data
194 | todo_cols = X.num_cols
195 | for col in todo_cols:
196 | df[col] = downcast(df[col])
197 |
198 | def fit_transform(self,X):
199 | return self.transform(X)
200 |
201 | class UniquePreprocessor(Preprocessor):
202 | @timeclass(cls='UniquePreprocessor')
203 | def fit(self,X):
204 | def func(ss):
205 | length = len(ss.unique())
206 | if length <= 1:
207 | return True
208 | else:
209 | return False
210 |
211 | df = X.data
212 | todo_cols = X.cat_cols + X.multi_cat_cols + X.num_cols + X.time_cols + X.binary_cols
213 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(df[col]) for col in todo_cols)
214 |
215 | drop_cols = []
216 | for col,unique in zip(todo_cols,res):
217 | if unique:
218 | drop_cols.append(col)
219 |
220 | self.drop_cols = drop_cols
221 |
222 | @timeclass(cls='UniquePreprocessor')
223 | def transform(self,X):
224 | X.drop_data(self.drop_cols)
225 |
226 | @timeclass(cls='UniquePreprocessor')
227 | def fit_transform(self,X):
228 | self.fit(X)
229 | self.transform(X)
230 |
231 | class AllDiffPreprocessor(Preprocessor):
232 | @timeclass(cls='AllDiffPreprocessor')
233 | def fit(self,X):
234 | def func(ss):
235 | length = len(ss.unique())
236 | if length >= len(ss)-10:
237 | return True
238 | else:
239 | return False
240 |
241 | df = X.data
242 | todo_cols = X.cat_cols
243 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(df[col]) for col in todo_cols)
244 |
245 | drop_cols = []
246 | for col,all_diff in zip(todo_cols,res):
247 | if all_diff:
248 | drop_cols.append(col)
249 |
250 | self.drop_cols = drop_cols
251 |
252 | @timeclass(cls='AllDiffPreprocessor')
253 | def transform(self,X):
254 | X.drop_data(self.drop_cols)
255 |
256 | @timeclass(cls='AllDiffPreprocessor')
257 | def fit_transform(self,X):
258 | self.fit(X)
259 | self.transform(X)
260 |
--------------------------------------------------------------------------------
/auto_smart/auto_smart/table/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepBlueAI/AutoSmart/99a6b44ce5b6520a2463c8a4b3acc675584533be/auto_smart/auto_smart/table/__init__.py
--------------------------------------------------------------------------------
/auto_smart/auto_smart/table/graph.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .table import Table
4 | from preprocessor.preprocessor import MSCatPreprocessor
5 | import pandas as pd
6 | import CONSTANT
7 | from util import timeclass, log
8 | from collections import defaultdict, deque
9 | import gc
10 | from joblib import Parallel, delayed
11 |
12 | class Graph:
13 | def __init__(self,info,tables):
14 |
15 | self.info = info
16 |
17 | self.table2info = info['tables']
18 | self.relations = info['relations']
19 | self.key_time_col = info['time_col']
20 |
21 | self.M2M_relation_cnt = 0
22 | for relation in info['relations']:
23 | if relation['type'] == "many_to_many":
24 | self.M2M_relation_cnt = self.M2M_relation_cnt + 1
25 |
26 | self.key_col_set = None
27 | self.user_col = None
28 |
29 | self.name2table = {}
30 | self.tables = []
31 |
32 | key_col_set = set()
33 | for relation in info['relations']:
34 | key_col_set.update(relation['key'])
35 | self.key_col_set = key_col_set
36 |
37 | user_col = None
38 | for tname,table in tables.items():
39 | key_cols = []
40 | if tname == CONSTANT.MAIN_TABLE_NAME:
41 | for col in self.table2info[tname]:
42 | if col in self.key_col_set:
43 | key_cols.append(col)
44 |
45 | user_col = self.recognize_user_col(tables[tname],key_cols)
46 |
47 | self.user_col = user_col
48 | del user_col
49 |
50 | main_cat_cols = []
51 | session_col = None
52 | for tname,table in tables.items():
53 | if tname == CONSTANT.MAIN_TABLE_NAME:
54 | for col in self.table2info[tname]:
55 | type_ = self.table2info[tname][col]
56 | if type_ == CONSTANT.CATEGORY_TYPE and col!=self.user_col and col not in key_col_set:
57 | main_cat_cols.append(col)
58 |
59 | session_cols = self.recognize_session_col(tables[tname],main_cat_cols,self.user_col)
60 |
61 |
62 | self.main_session_cols = session_cols
63 | del main_cat_cols
64 | del session_col
65 |
66 | for tname,table in tables.items():
67 | key_cols = []
68 | key_time_col = None
69 | user_cols = []
70 |
71 | for col in self.table2info[tname]:
72 |
73 | if col in self.key_col_set and col != self.user_col:
74 | key_cols.append(col)
75 |
76 | if col == self.user_col:
77 | user_cols.append(col)
78 |
79 | if col == self.key_time_col:
80 | key_time_col = col
81 |
82 | cat_cols = []
83 | for col in self.table2info[tname]:
84 | type_ = self.table2info[tname][col]
85 | if type_ == CONSTANT.CATEGORY_TYPE:
86 | cat_cols.append(col)
87 |
88 | binary_cols = self.recognize_binary_col(tables[tname],cat_cols)
89 | for col in binary_cols:
90 | self.table2info[tname][col] = CONSTANT.BINARY_TYPE
91 |
92 | self.tables.append(tname)
93 | if tname == CONSTANT.MAIN_TABLE_NAME:
94 | self.name2table[tname] = Table(tables[tname],self.table2info[tname],self.main_session_cols,user_cols,key_cols,key_time_col,tname)
95 |
96 | else:
97 | self.name2table[tname] = Table(tables[tname],self.table2info[tname],[],user_cols,key_cols,key_time_col,tname)
98 |
99 | if tname == CONSTANT.MAIN_TABLE_NAME:
100 | self.main_key_cols = key_cols
101 | self.main_key_time_col = key_time_col
102 | self.main_user_col = user_cols
103 | self.main_table_info = self.table2info[tname]
104 |
105 | block2name,name2block = self.init_graph_to_blocks()
106 | self.block2name = block2name
107 | self.name2block = name2block
108 |
109 | for tname in self.name2table:
110 | self.name2table[tname].block2name = block2name
111 | self.name2table[tname].name2block = name2block
112 |
113 | for tname in self.name2table:
114 | col2block = {}
115 | for col in self.name2table[tname].data.columns:
116 | name = tname + ':' + col
117 |
118 | if name in self.name2block:
119 | block_id = self.name2block[name]
120 | col2block[col] = block_id
121 |
122 | self.name2table[tname].col2block = col2block
123 |
124 | for tname in self.name2table:
125 | col2table = {}
126 | for col in self.name2table[tname].data.columns:
127 | col2table[col] = tname
128 |
129 | self.name2table[tname].col2table = col2table
130 |
131 | @timeclass(cls='Graph')
132 | def init_graph_to_blocks(self):
133 | mode = 'all'
134 | if mode == 'all':
135 | t_datas = []
136 | t_names = []
137 |
138 | for t_name in self.name2table:
139 | t_table = self.name2table[t_name]
140 | t_data = t_table.data
141 | t_data_num = t_data.shape[0]
142 | t_limit_num = 100000
143 | if t_limit_num > t_data_num:
144 | t_limit_num = t_data_num
145 | t_sample_frac = t_limit_num / t_data_num
146 | t_data = t_data.sample(frac=t_sample_frac,random_state=CONSTANT.SEED)
147 |
148 | t_datas.append(t_data)
149 | t_names.append(t_name)
150 |
151 | all_cat_cols = []
152 | all_cat2type = {}
153 | for t_data,t_name in zip(t_datas,t_names):
154 |
155 | for col in t_data.columns:
156 | col2type = self.table2info[ t_name ][ col ]
157 | new_col = t_name+':'+col
158 | if col2type == CONSTANT.MULTI_CAT_TYPE or col2type == CONSTANT.CATEGORY_TYPE:
159 | all_cat_cols.append(new_col)
160 | all_cat2type[new_col] = col2type
161 |
162 | mc_graph = {}
163 | all_cat_len = len(all_cat_cols)
164 | for i in range(all_cat_len):
165 | name1 = all_cat_cols[i]
166 | mc_graph[name1] = {}
167 | for j in range(all_cat_len):
168 | name2 = all_cat_cols[j]
169 | mc_graph[name1][name2] = 0
170 |
171 | for t1 in range(len(t_datas)):
172 | t_data_1 = t_datas[t1]
173 | t_name_1 = t_names[t1]
174 | for col1 in t_data_1.columns:
175 | if col1 in self.key_col_set:
176 | name1 = t_name_1+':'+col1
177 |
178 | for t2 in range(len(t_datas)):
179 | t_data_2 = t_datas[t2]
180 | t_name_2 = t_names[t2]
181 | for col2 in t_data_2.columns:
182 | if col2 == col1:
183 | name2 = t_name_2+':'+col2
184 | mc_graph[name1][name2] = 1
185 | mc_graph[name2][name1] = 1
186 |
187 | log('init mcgraph')
188 |
189 | all_cat2set = {}
190 |
191 | for t_data,t_name in zip(t_datas,t_names):
192 | for col in t_data.columns:
193 | new_col = t_name+':'+col
194 | if new_col in all_cat2type:
195 | cur_set = set()
196 | if all_cat2type[new_col] == CONSTANT.MULTI_CAT_TYPE:
197 |
198 | for val in t_data[col]:
199 | if type(val) == float:
200 | continue
201 | cur_set.update(val.split(CONSTANT.MULTI_CAT_DELIMITER))
202 |
203 | elif all_cat2type[new_col] == CONSTANT.CATEGORY_TYPE:
204 | cur_set = set(t_data[col].dropna())
205 |
206 | all_cat2set[new_col] = cur_set
207 |
208 | all_cat_len = len(all_cat_cols)
209 | for i in range(all_cat_len):
210 | for j in range(i+1,all_cat_len):
211 | name1 = all_cat_cols[i]
212 | name2 = all_cat_cols[j]
213 |
214 | len1 = len(all_cat2set[name1])
215 | len2 = len(all_cat2set[name2])
216 |
217 | less_len = min(len1,len2)
218 | if less_len <= 1:
219 | continue
220 |
221 | if mc_graph[name1][name2]==1 or mc_graph[name2][name1] == 1:
222 | continue
223 |
224 | if len(all_cat2set[name1] & all_cat2set[name2])/less_len > 0.1:
225 | mc_graph[name1][name2] = 1
226 | mc_graph[name2][name1] = 1
227 |
228 | block2name = {}
229 |
230 | block_id = 0
231 | vis = {}
232 | nodes = list(mc_graph.keys())
233 | def dfs(now,block_id):
234 | block2name[block_id].append(now)
235 | for nex in nodes:
236 | if mc_graph[now][nex] and ( not (nex in vis) ):
237 | vis[nex] = 1
238 | dfs(nex,block_id)
239 |
240 | for now in nodes:
241 | if now in vis:
242 | continue
243 | vis[now] = 1
244 | block_id += 1
245 | block2name[block_id] = []
246 | dfs(now,block_id)
247 |
248 | name2block = {}
249 |
250 | for block in block2name:
251 | for col in block2name[block]:
252 | name2block[col] = block
253 | log(f'blocks: {block2name}')
254 | return block2name,name2block
255 |
256 | elif mode == 'part':
257 | pass
258 |
259 | @timeclass(cls='Graph')
260 | def sort_tables(self):
261 | for tname in self.name2table:
262 | table = self.name2table[tname]
263 | if table.key_time_col is not None:
264 | table.data.sort_values(by=table.key_time_col,inplace=True)
265 |
266 | @timeclass(cls='Graph')
267 | def sort_main_table(self):
268 | table = self.name2table[CONSTANT.MAIN_TABLE_NAME]
269 | if table.key_time_col is not None:
270 | table.data.sort_values(by=table.key_time_col,inplace=True)
271 |
272 | @timeclass(cls='Graph')
273 | def recognize_session_col(self,data,cat_cols,user_col):
274 | if user_col is None:
275 | return []
276 |
277 | user_nunique = data[user_col].nunique()
278 | session_cols = []
279 |
280 | def func(df,user_nunique):
281 | cat_col = df.columns[0]
282 | user_col = df.columns[1]
283 | cat_nunique = df[cat_col].nunique()
284 |
285 | if (cat_nunique <= user_nunique) or (cat_nunique >= df.shape[0]-10):
286 | return False
287 |
288 | if (df.groupby(cat_col)[user_col].nunique()>1).sum()>10:
289 | return False
290 |
291 | return True
292 |
293 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(data[[col,user_col]],user_nunique) for col in cat_cols)
294 |
295 | for col,is_session in zip(cat_cols,res):
296 | if is_session:
297 | session_cols.append(col)
298 |
299 | return session_cols
300 |
301 | @timeclass(cls='Graph')
302 | def recognize_binary_col(self,data,cat_cols):
303 | def func(ss):
304 | ss = ss.unique()
305 | if len(ss) == 3:
306 | if pd.isna(ss).sum() == 1:
307 | return True
308 | if len(ss) == 2:
309 | return True
310 | return False
311 |
312 | binary_cols = []
313 |
314 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(data[col]) for col in cat_cols)
315 |
316 | for col,is_binary in zip(cat_cols,res):
317 | if is_binary:
318 | binary_cols.append(col)
319 |
320 | return binary_cols
321 |
322 | @timeclass(cls='Graph')
323 | def recognize_user_col(self,data,key_cols):
324 | user_col = None
325 | nunique = -1
326 | for col in key_cols:
327 | nnum = data[col].nunique()
328 | if nnum > nunique:
329 | user_col = col
330 | nunique = nnum
331 | return user_col
332 |
333 | @timeclass(cls='Graph')
334 | def preprocess_fit_transform(self):
335 | log('start mscat')
336 |
337 | mscat_block2preprocessor = {}
338 | for block_id in range(1,len(self.block2name)+1):
339 | mscat_block2preprocessor[block_id] = MSCatPreprocessor()
340 | ss = {}
341 | for block_id in range(1,len(self.block2name)+1):
342 | ss[block_id] = pd.Series()
343 |
344 | t_datas = []
345 | t_names = []
346 | for t_name in self.name2table:
347 | t_table = self.name2table[t_name]
348 | t_data = t_table.data
349 |
350 | t_datas.append(t_data)
351 | t_names.append(t_name)
352 |
353 | for t in range(len(t_datas)):
354 | t_data = t_datas[t]
355 | t_name = t_names[t]
356 | for col in t_data.columns:
357 | coltype = self.table2info[ t_name ][col]
358 | if coltype == CONSTANT.MULTI_CAT_TYPE or coltype == CONSTANT.CATEGORY_TYPE:
359 | name = t_name + ':' + col
360 | if name in self.name2block:
361 | block_id = self.name2block[name]
362 | ss[block_id] = pd.concat([ss[block_id],t_data[col].drop_duplicates()])
363 |
364 | for block_id in range(1,len(self.block2name)+1):
365 | mscat_block2preprocessor[block_id].fit(ss[block_id])
366 |
367 | for tname,table in self.name2table.items():
368 | table.preprocess_fit_transform(mscat_block2preprocessor)
369 |
370 | gc.collect()
371 |
372 | def set_main_table(self,table):
373 | tname = CONSTANT.MAIN_TABLE_NAME
374 | self.name2table[CONSTANT.MAIN_TABLE_NAME] = Table(table,self.main_table_info,self.main_session_cols,self.main_user_col,self.main_key_cols,self.main_key_time_col,tname)
375 | gc.collect()
376 |
377 | @timeclass(cls='Graph')
378 | def bfs(self,root_name, graph, depth):
379 | depth[CONSTANT.MAIN_TABLE_NAME]['depth'] = 0
380 | queue = deque([root_name])
381 | while queue:
382 | u_name = queue.popleft()
383 | for edge in graph[u_name]:
384 | v_name = edge['to']
385 | if 'depth' not in depth[v_name]:
386 | depth[v_name]['depth'] = depth[u_name]['depth'] + 1
387 | queue.append(v_name)
388 |
389 | @timeclass(cls='Graph')
390 | def build_depth(self):
391 | rel_graph = defaultdict(list)
392 | depth = {}
393 |
394 | for tname in self.tables:
395 | depth[tname] = {}
396 |
397 | for rel in self.relations:
398 | ta = rel['table_A']
399 | tb = rel['table_B']
400 | rel_graph[ta].append({
401 | "to": tb,
402 | "key": rel['key'],
403 | "type": rel['type']
404 | })
405 | rel_graph[tb].append({
406 | "to": ta,
407 | "key": rel['key'],
408 | "type": '_'.join(rel['type'].split('_')[::-1])
409 | })
410 | self.bfs(CONSTANT.MAIN_TABLE_NAME, rel_graph, depth)
411 |
412 | self.rel_graph = rel_graph
413 | self.depth = depth
414 |
415 |
416 |
--------------------------------------------------------------------------------
/auto_smart/auto_smart/table/table.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from preprocessor.preprocessor import *
3 | import CONSTANT
4 | from util import timeclass,log
5 | import gc
6 |
7 | class Table:
8 | def __init__(self,data,table_info,session_cols,user_cols,key_cols,key_time_col,tname):
9 | self.name = tname
10 |
11 | self.col2type = {}
12 | self.col2groupby = {}
13 | self.col2block = {}
14 | self.col2istime = {}
15 |
16 | self.col2muldatas = {}
17 | self.col2muldatalens = {}
18 |
19 | self.user_cols = user_cols
20 | self.session_cols = []
21 |
22 | self.block2name = {}
23 | self.name2block = {}
24 |
25 | for col in session_cols:
26 | if len(self.user_cols) > 0:
27 | self.session_cols.append(col)
28 | self.col2groupby[col] = self.user_cols[0]
29 |
30 | self.key_time_col = key_time_col
31 | self.key_cols = key_cols
32 |
33 | self.cat_cols = None
34 |
35 | self.binary_cols = None
36 | self.multi_cat_cols = None
37 | self.num_cols = None
38 |
39 | self.time_cols = None
40 |
41 | self.bin_cols = []
42 |
43 | self.update_data(data,table_info,None)
44 |
45 | log(f'session_cols:{self.session_cols}')
46 | log(f'user_cols:{self.user_cols}')
47 | log(f'key_cols:{self.key_cols}')
48 | log(f'cat_cols:{self.cat_cols}')
49 | log(f'binary_cols:{self.binary_cols}')
50 | log(f'multi_cat_cols:{self.multi_cat_cols}')
51 | log(f'key_time_col:{self.key_time_col}')
52 | log(f'time_cols:{self.time_cols}')
53 | log(f'num_cols:{self.num_cols}')
54 |
55 | self.apart_cat_set = set()
56 | self.post_drop_set = set()
57 |
58 | self.col2source_cat = {}
59 |
60 | self.combine_cat_cols = []
61 | self.combine_num_cols = []
62 | self.combine_binary_cols = []
63 | self.wait_selection_cols = []
64 |
65 | def add_session_col(self,col):
66 | self.session_cols.append(col)
67 | self.col2type[col] = CONSTANT.CATEGORY_TYPE
68 | if len(self.user_cols) > 0:
69 | self.col2groupby[col] = self.user_cols[0]
70 |
71 | def get_groupby_cols(self,by,cols):
72 | new_cols = []
73 | bys = set()
74 | bys.add(by)
75 | while by in self.col2groupby:
76 | by = self.col2groupby[by]
77 | bys.add(by)
78 |
79 | for col in cols:
80 | is_skip = False
81 | cur = col
82 | while True:
83 | if cur in bys:
84 | is_skip = True
85 | break
86 |
87 | if cur in self.col2groupby:
88 | cur = self.col2groupby[cur]
89 | else:
90 | break
91 |
92 | if not is_skip:
93 | new_cols.append(col)
94 |
95 | return new_cols
96 |
97 | def get_not_apart_cat_cols(self,cols):
98 | new_cols = []
99 | for col in cols:
100 | if col not in self.apart_cat_set:
101 | new_cols.append(col)
102 | return new_cols
103 |
104 | def drop_data(self,cols):
105 | drop_cols = []
106 | for col in cols:
107 | if col not in self.session_cols\
108 | and col not in self.user_cols\
109 | and col not in self.key_cols\
110 | and col != self.key_time_col:
111 | drop_cols.append(col)
112 | if len(drop_cols)>0:
113 | self.data.drop(drop_cols,axis=1,inplace=True)
114 | self.drop_data_cols(drop_cols)
115 |
116 | def drop_data_cols(self,drop_cols):
117 | for col in drop_cols:
118 | self.col2type.pop(col)
119 | if col in self.col2groupby:
120 | self.col2groupby.pop(col)
121 |
122 | self.type_reset()
123 | self.drop_combine_cols(drop_cols)
124 |
125 | def drop_combine_cols(self,drop_cols):
126 | drop_cols_set = set(drop_cols)
127 |
128 | combine_cat_cols = []
129 | combine_num_cols = []
130 | combine_binary_cols = []
131 |
132 | for col in self.combine_cat_cols:
133 | if col not in drop_cols_set:
134 | combine_cat_cols.append(col)
135 |
136 | for col in self.combine_num_cols:
137 | if col not in drop_cols_set:
138 | combine_num_cols.append(col)
139 |
140 | for col in self.combine_binary_cols:
141 | if col not in drop_cols_set:
142 | combine_binary_cols.append(col)
143 |
144 | self.combine_cat_cols = combine_cat_cols
145 | self.combine_num_cols = combine_num_cols
146 | self.combine_binary_cols = combine_binary_cols
147 |
148 | def add_apart_cat_cols(self,cols):
149 | self.apart_cat_set.update(cols)
150 |
151 | def add_post_drop_cols(self,cols):
152 | self.post_drop_set.update(cols)
153 |
154 | def add_wait_selection_cols(self,cols):
155 | self.wait_selection_cols.append(cols)
156 |
157 | def empty_wait_selection_cols(self):
158 | self.wait_selection_cols = []
159 |
160 | def update_data(self,data,col2type,col2groupby,col2source_cat=None,col2block=None,col2istime=None):
161 |
162 | self.data = data
163 | self.update_col2type(col2type)
164 | if col2groupby is not None:
165 | self.update_col2groupby(col2groupby)
166 |
167 | if col2block is not None:
168 | self.update_col2block(col2block)
169 | if col2istime is not None:
170 | self.update_col2istime(col2istime)
171 |
172 | if col2source_cat is not None:
173 | self.update_col2source_cat(col2source_cat)
174 | gc.collect()
175 |
176 | def update_col2block(self,col2block):
177 | self.col2block.update(col2block)
178 |
179 | def update_col2istime(self,col2istime):
180 | self.col2istime.update(col2istime)
181 |
182 | def update_col2groupby(self,col2groupby):
183 | self.col2groupby.update(col2groupby)
184 |
185 | def update_col2source_cat(self,col2source_cat):
186 | self.col2source_cat.update(col2source_cat)
187 |
188 | def update_col2type(self,col2type):
189 | self.col2type.update(col2type)
190 | self.type_reset()
191 |
192 | def reset_combine_cols(self,combine_cat_cols=None,combine_num_cols=None,combine_binary_cols=None):
193 | self.combine_cat_cols = combine_cat_cols
194 | self.combine_num_cols = combine_num_cols
195 | self.combine_binary_cols = combine_binary_cols
196 |
197 | def type_reset(self):
198 |
199 | cat_cols = []
200 | binary_cols = []
201 | multi_cat_cols = []
202 | num_cols = []
203 | time_cols = []
204 |
205 | for cname,ctype in self.col2type.items():
206 | if (ctype == CONSTANT.CATEGORY_TYPE) \
207 | and (cname not in self.key_cols)\
208 | and (cname not in self.user_cols)\
209 | and (cname not in self.session_cols):
210 | cat_cols.append(cname)
211 | elif ctype == CONSTANT.BINARY_TYPE:
212 | binary_cols.append(cname)
213 | elif ctype == CONSTANT.MULTI_CAT_TYPE:
214 | multi_cat_cols.append(cname)
215 | elif ctype == CONSTANT.NUMERICAL_TYPE:
216 | num_cols.append(cname)
217 | elif ctype == CONSTANT.TIME_TYPE and cname != self.key_time_col:
218 | time_cols.append(cname)
219 |
220 | self.cat_cols = sorted(cat_cols)
221 | self.binary_cols = sorted(binary_cols)
222 | self.num_cols = sorted(num_cols)
223 | self.multi_cat_cols = sorted(multi_cat_cols)
224 | self.time_cols = sorted(time_cols)
225 |
226 | @timeclass(cls='Table')
227 | def preprocess_fit_transform(self,mscat_group2preprocessor):
228 |
229 | for col in (self.cat_cols+self.multi_cat_cols+self.user_cols+self.key_cols+self.session_cols):
230 | name = self.name+':'+col
231 | if name in self.name2block:
232 | block_id = self.name2block[name]
233 | self.data[col] = mscat_group2preprocessor[block_id].transform(self.data[col],self.col2type[col])
234 |
235 | unique_preprocessor = UniquePreprocessor()
236 | unique_preprocessor.fit_transform(self)
237 |
238 | all_diff_preprocessor = AllDiffPreprocessor()
239 | all_diff_preprocessor.fit_transform(self)
240 |
241 | binary_preprocessor = BinaryPreprocessor()
242 | binary_preprocessor.fit_transform(self)
243 |
244 | num_preprocess = NumPreprocessor()
245 | num_preprocess.fit_transform(self)
246 |
247 | general_preprocessor = GeneralPreprocessor()
248 | general_preprocessor.fit_transform(self)
249 |
--------------------------------------------------------------------------------
/auto_smart/auto_smart/util.py:
--------------------------------------------------------------------------------
1 |
2 | import time
3 | from typing import Any
4 |
5 |
6 | import functools
7 | nesting_level = 0
8 | is_start = None
9 |
10 |
11 | class Timer:
12 | def __init__(self):
13 | self.start = time.time()
14 | self.history = [self.start]
15 |
16 | def check(self, info):
17 | current = time.time()
18 | log(f"[{info}] spend {current - self.history[-1]:0.2f} sec")
19 | self.history.append(current)
20 |
21 |
22 | def timeclass(cls):
23 | def timeit(method, start_log=None):
24 | @functools.wraps(method)
25 | def timed(*args, **kw):
26 | global is_start
27 | global nesting_level
28 |
29 | if not is_start:
30 | print()
31 |
32 | is_start = True
33 | log(f"Start [{cls}.{method.__name__}]:" + (start_log if start_log else ""))
34 | log(f'Start time: {time.strftime("%Y-%m-%d %H:%M:%S")}')
35 | nesting_level += 1
36 |
37 | start_time = time.time()
38 | result = method(*args, **kw)
39 | end_time = time.time()
40 |
41 | nesting_level -= 1
42 | log(f"End [{cls}.{method.__name__}]. Time elapsed: {end_time - start_time:0.2f} sec.")
43 | log(f'End time: {time.strftime("%Y-%m-%d %H:%M:%S")}')
44 | is_start = False
45 |
46 | return result
47 |
48 | return timed
49 | return timeit
50 |
51 | def timeit(method, start_log=None):
52 | @functools.wraps(method)
53 | def timed(*args, **kw):
54 | global is_start
55 | global nesting_level
56 |
57 | if not is_start:
58 | print()
59 |
60 | is_start = True
61 | log(f"Start [{method.__name__}]:" + (start_log if start_log else ""))
62 | nesting_level += 1
63 |
64 | start_time = time.time()
65 | result = method(*args, **kw)
66 | end_time = time.time()
67 |
68 | nesting_level -= 1
69 | log(f"End [{method.__name__}]. Time elapsed: {end_time - start_time:0.2f} sec.")
70 | is_start = False
71 |
72 | return result
73 |
74 | return timed
75 |
76 |
77 | def log(entry: Any):
78 | global nesting_level
79 | space = "-" * (4 * nesting_level)
80 | print(f"{space}{entry}")
81 |
82 | def show_dataframe(df):
83 | if len(df) <= 30:
84 | print(f"content=\n"
85 | f"{df}")
86 | else:
87 | print(f"dataframe is too large to show the content, over {len(df)} rows")
88 |
89 | if len(df.dtypes) <= 100:
90 | print(f"types=\n"
91 | f"{df.dtypes}\n")
92 | else:
93 | print(f"dataframe is too wide to show the dtypes, over {len(df.dtypes)} columns")
94 |
95 |
96 |
97 |
98 |
--------------------------------------------------------------------------------
/auto_smart/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | import setuptools
5 | from Cython.Build import cythonize
6 |
7 | setuptools.setup(
8 | name='AutoSmart',
9 | version='0.0.2',
10 | author='DeepBlueAI',
11 | author_email='1229991666@qq.com',
12 | url='https://github.com/DeepBlueAI/AutoSmart',
13 | description=u'The 1st place solution for KDD Cup 2019 AutoML Track',
14 | packages=setuptools.find_packages(),
15 | install_requires=[
16 | "hyperopt",
17 | "lightgbm==2.3.0",
18 | "joblib",
19 | "pandas",
20 | ],
21 | ext_modules = cythonize("ac.pyx"),
22 | classifiers=[
23 | "Programming Language :: Python :: 3",
24 | "License :: OSI Approved :: GNU General Public License (GPL)",
25 | "Operating System :: OS Independent",
26 | ],
27 | )
28 |
--------------------------------------------------------------------------------
/demo/data/train/info.json:
--------------------------------------------------------------------------------
1 | {
2 | "time_budget": 300,
3 | "time_col": "t_01",
4 | "start_time": 1550654179,
5 | "tables": {
6 | "main": {
7 | "t_01": "time",
8 | "c_1": "cat",
9 | "c_2": "cat",
10 | "n_1": "num",
11 | "n_2": "num",
12 | "c_3": "cat",
13 | "c_02": "cat",
14 | "c_01": "cat"
15 | },
16 | "table_1": {
17 | "c_01": "cat",
18 | "c_1": "cat",
19 | "c_2": "cat",
20 | "n_1": "num",
21 | "c_3": "cat",
22 | "c_4": "cat",
23 | "t_1": "time",
24 | "t_2": "time",
25 | "n_2": "num",
26 | "n_3": "num",
27 | "n_4": "num",
28 | "n_5": "num",
29 | "m_1": "multi-cat",
30 | "m_2": "multi-cat",
31 | "m_3": "multi-cat",
32 | "m_4": "multi-cat",
33 | "m_5": "multi-cat",
34 | "m_6": "multi-cat"
35 | },
36 | "table_2": {
37 | "c_02": "cat",
38 | "c_1": "cat",
39 | "c_2": "cat",
40 | "c_3": "cat",
41 | "c_4": "cat",
42 | "t_1": "time"
43 | },
44 | "table_3": {
45 | "n_1": "num",
46 | "c_02": "cat",
47 | "t_01": "time"
48 | }
49 | },
50 | "relations": [
51 | {
52 | "table_A": "main",
53 | "table_B": "table_1",
54 | "key": ["c_01"],
55 | "type": "many_to_one"
56 | },
57 | {
58 | "table_A": "main",
59 | "table_B": "table_2",
60 | "key": ["c_02"],
61 | "type": "many_to_one"
62 | },
63 | {
64 | "table_A": "main",
65 | "table_B": "table_3",
66 | "key": ["c_02"],
67 | "type": "many_to_one"
68 | }
69 | ]
70 | }
71 |
--------------------------------------------------------------------------------
/demo/demo.py:
--------------------------------------------------------------------------------
1 | import auto_smart
2 |
3 | info = auto_smart.read_info("data")
4 | train_data,train_label = auto_smart.read_train("data",info)
5 | test_data = auto_smart.read_test("data",info)
6 |
7 | auto_smart.train_and_predict(train_data,train_label,info,test_data)
8 |
9 |
--------------------------------------------------------------------------------