├── .gitignore
├── LICENSE
├── README.md
├── auto_smart
│   ├── LICENSE
│   ├── MANIFEST.in
│   ├── README.md
│   ├── ac.c
│   ├── ac.pyx
│   ├── auto_smart
│   │   ├── CONSTANT.py
│   │   ├── PATHS.py
│   │   ├── __init__.py
│   │   ├── automl
│   │   │   ├── __init__.py
│   │   │   ├── auto_lgb.py
│   │   │   ├── automl.py
│   │   │   ├── autosample.py
│   │   │   └── model_selection.py
│   │   ├── config.py
│   │   ├── data_tools.py
│   │   ├── feat
│   │   │   ├── __init__.py
│   │   │   ├── default_feat.py
│   │   │   ├── default_merge_feat.py
│   │   │   ├── feat.py
│   │   │   ├── feat_pipeline.py
│   │   │   ├── feat_selection.py
│   │   │   ├── merge_feat.py
│   │   │   └── merge_feat_pipeline.py
│   │   ├── feat_context.py
│   │   ├── feat_engine.py
│   │   ├── merger.py
│   │   ├── metadata
│   │   ├── model.py
│   │   ├── model_input.py
│   │   ├── preprocessor
│   │   │   ├── __init__.py
│   │   │   └── preprocessor.py
│   │   ├── table
│   │   │   ├── __init__.py
│   │   │   ├── graph.py
│   │   │   └── table.py
│   │   └── util.py
│   └── setup.py
└── demo
    ├── data
    │   ├── test
    │   │   └── main_test.data
    │   └── train
    │       ├── info.json
    │       ├── main_train.data
    │       ├── main_train.solution
    │       ├── table_1.data
    │       ├── table_2.data
    │       └── table_3.data
    └── demo.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | #*.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 | 
50 | # Translations
51 | *.mo
52 | *.pot
53 | 
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 | 
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 | 
63 | # Scrapy stuff:
64 | .scrapy
65 | 
66 | # Sphinx documentation
67 | docs/_build/
68 | 
69 | # PyBuilder
70 | target/
71 | 
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 | 
75 | # pyenv
76 | .python-version
77 | 
78 | # celery beat schedule file
79 | celerybeat-schedule
80 | 
81 | # SageMath parsed files
82 | *.sage.py
83 | 
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 | 
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 | 
97 | # Rope project settings
98 | .ropeproject
99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | .DS_Store
107 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |                     GNU GENERAL PUBLIC LICENSE
2 |                        Version 3, 29 June 2007
3 | 
4 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5 |  Everyone is permitted to copy and distribute verbatim copies
6 |  of this license document, but changing it is not allowed.
7 | 
8 |                             Preamble
9 | 
10 |   The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 | 
13 |   The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. 
By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 
76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. 
However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 
196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 
309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 
360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 
476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. 
You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 
583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | 635 | Copyright (C) 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see . 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 
652 |   If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 |     <program>  Copyright (C) <year>  <name of author>
656 |     This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 |     This is free software, and you are welcome to redistribute it
658 |     under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License.  Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 | 
664 |   You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 | 
669 |   The GNU General Public License does not permit incorporating your program
670 | into proprietary programs.  If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library.  If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License.  But first, please read
674 | <https://www.gnu.org/philosophy/why-not-lgpl.html>.
675 | 
676 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![Alt text](https://www.deepblueai.com/usr/deepblue/v3/images/logo.png "DeepBlue")
2 | [![license](https://img.shields.io/badge/license-GPL%203.0-green.svg)](https://github.com/DeepBlueAI/AutoSmart/blob/master/LICENSE)
3 | # Introduction to AutoSmart
4 | The 1st place solution for the KDD Cup 2019 AutoML Track.
5 | 
6 | # How to install
7 | 
8 | Requirements: [Cython with a C compiler](https://docs.cython.org/en/latest/src/quickstart/install.html).
9 | 
10 | Clone or download the autosmart package, then run:
11 | 
12 | ```bash
13 | python setup.py install
14 | ```
15 | 
16 | # How to use
17 | ```python
18 | import auto_smart
19 | 
20 | info = auto_smart.read_info("data")
21 | train_data, train_label = auto_smart.read_train("data", info)
22 | test_data = auto_smart.read_test("data", info)
23 | auto_smart.train_and_predict(train_data, train_label, info, test_data)
24 | ```
25 | # Data Sample
26 | ### Data
27 | 
28 | This page describes the datasets that our system can handle.
29 | 
30 | #### Components
31 | Each dataset is split into two subsets: the training set and the testing set.
32 | 
33 | Both sets have:
34 | 
35 | - a **main table file** that stores the **main table** (label excluded);
36 | - multiple **related table files** that store the **related tables**;
37 | - an **info dictionary** that contains important information about the dataset, including the table relations;
38 | - additionally, the training set has a **label file** that stores the **labels** associated with the **main table**.
39 | 
40 | ### Table files
41 | 
42 | Each **table file** is a CSV-format file that stores a table (main or related), using '**\t**' as the delimiter (i.e., tab-separated). The first row holds the feature names, a.k.a. the 'schema', and the following rows are the records.
43 | 
44 | The type of each feature can be found in the info dictionary, which is introduced below.
45 | 
46 | There are 4 types of features, indicated by "cat", "num", "multi-cat", and "time", respectively:
47 | 
48 | - **cat**: categorical feature, an integer;
49 | - **num**: numerical feature, a real value;
50 | - **multi-cat**: multi-value categorical feature: a set of integers separated by commas. The size of the set is not fixed and can be different for each instance, e.g., the topics of an article, the words in a title, or the items bought by a user;
51 | - **time**: time feature, an integer that indicates the timestamp.
52 | 
53 | 
54 | ### Label file
55 | The **label file** is associated only with the main table in the training set. It is a CSV file that contains a single column, with the first row as the header and the remaining rows holding the labels associated with the instances in the main table.
56 | 
57 | ### info dictionary
58 | Important information about each dataset is stored in a python dictionary structure named **info**, which acts as an input to this system. Generally, you need to generate this dictionary manually as an info.json file. Here we give details about info; an illustrative info.json is sketched after the relations list below.
59 | 
60 | ![Alt text](https://i.ibb.co/4dQxCRD/info.png "datainfo")
61 | 
62 | Descriptions of the keys in info:
63 | 
64 | - **time_budget**: the time budget for this dataset, in seconds.
65 | - **time_col**: the column name of the primary timestamp. Each dataset has exactly one time_col; it is always present in the main table, but not necessarily in a related table.
66 | - **start_time**: DEPRECATED.
67 | - **tables**: a dictionary that stores information about the tables. Each key indicates a table, and its corresponding value is a dictionary that gives the type of each column in that table. Two kinds of keys are contained in **tables**:
68 |     - **main**: the main table;
69 |     - **table_{i}**: the i-th related table.
70 |     - There are 4 types of features, indicated by "cat", "num", "multi-cat", and "time", respectively:
71 |         - **cat**: categorical feature, an integer;
72 |         - **num**: numerical feature, a real value;
73 |         - **multi-cat**: multi-value categorical feature: a set of integers separated by commas. The size of the set is not fixed and can be different for each instance;
74 |         - **time**: time feature, an integer that indicates the timestamp.
75 | 
76 | - **relations**: a list that stores the table relations in the dataset. Each relation is represented as an ordered table pair (**table_A**, **table_B**), a key column **key** that appears in both tables and acts as the pivot of the table join, and a relation type **type**. The relation types are introduced below.
77 | 
78 | #### Relations Between Tables
79 | Four table relations are considered in this system:
80 | 
81 | - **one-to-one** (1-1): the key columns in both **table_A** and **table_B** have no duplicated values;
82 | - **one-to-many** (1-M): the key column in **table_A** has no duplicated values, but that in **table_B** may have duplicated values;
83 | - **many-to-one** (M-1): the key column in **table_A** may have duplicated values, but that in **table_B** has no duplicated values;
84 | - **many-to-many** (M-M): the key columns in both **table_A** and **table_B** may have duplicated values.
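85 | 
86 | To make the expected layout concrete, below is a minimal sketch that generates an info.json for a hypothetical dataset with one related table. All concrete names and values (the column names, the time budget, and the exact spelling of the relation-type string) are illustrative assumptions based on the description above, not values taken from a real dataset.
87 | 
88 | ```python
89 | import json
90 | 
91 | # Illustrative info dictionary: the structure (keys and nesting) follows the
92 | # description above, but every concrete name and value here is hypothetical.
93 | info = {
94 |     "time_budget": 300,        # seconds allowed for the whole run
95 |     "time_col": "t_1",         # primary timestamp; always in the main table
96 |     "start_time": 0,           # DEPRECATED
97 |     "tables": {
98 |         "main": {
99 |             "t_1": "time",
100 |             "c_1": "cat",
101 |             "n_1": "num",
102 |             "m_1": "multi-cat"
103 |         },
104 |         "table_1": {           # the 1st related table
105 |             "c_1": "cat",
106 |             "n_2": "num"
107 |         }
108 |     },
109 |     "relations": [
110 |         {
111 |             "table_A": "main",
112 |             "table_B": "table_1",
113 |             "key": ["c_1"],          # the pivot column shared by both tables
114 |             "type": "many_to_one"    # assumed spelling of the M-1 relation type
115 |         }
116 |     ]
117 | }
118 | 
119 | with open("data/train/info.json", "w") as fp:
120 |     json.dump(info, fp, indent=2)
121 | ```
122 | 
123 | With such a file in place, `auto_smart.read_info("data")` loads the dictionary, and `read_train`/`read_test` use the **tables** entry to pick column dtypes and to parse the **time** columns.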
124 | 
125 | 
126 | # Contact Us
127 | DeepBlueAI: 1229991666@qq.com
128 | 
--------------------------------------------------------------------------------
/auto_smart/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.pyx
--------------------------------------------------------------------------------
/auto_smart/README.md:
--------------------------------------------------------------------------------
1 | ![Alt text](https://www.deepblueai.com/usr/deepblue/v3/images/logo.png "DeepBlue")
2 | [![license](https://img.shields.io/badge/license-GPL%203.0-green.svg)](https://github.com/DeepBlueAI/AutoSmart/blob/master/LICENSE)
3 | # Introduction to AutoSmart
4 | The 1st place solution for the KDD Cup 2019 AutoML Track.
5 | 
6 | # How to use
7 | This is the link to the competition: https://codalab.lri.fr/competitions/559
8 | 
9 | # Contact Us
10 | DeepBlueAI: 1229991666@qq.com
11 | 
--------------------------------------------------------------------------------
/auto_smart/ac.pyx:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | from cython cimport boundscheck, wraparound
4 | 
5 | 
6 | 
7 | 
8 | @boundscheck(False)
9 | @wraparound(False)
10 | def pre_tuple_encode_func(int[:] muldata, int[:] muldatalens, K):  # id-encode the tuple of the first K values of each multi-cat cell
11 |     cdef:
12 |         int index = 0
13 |         int i, j, N = muldatalens.shape[0]
14 |         int les
15 |         # list tmp = []
16 |         int c_K = K
17 |         dict map_dict = {}
18 |         int ids = 0
19 | 
20 | 
21 |     ans = np.zeros( N, dtype=np.float64 )  # np.float was removed in NumPy 1.24; use an explicit float64
22 | 
23 |     for i in range(N):
24 |         les = muldatalens[i]
25 |         if les == 0:
26 |             ans[i] = np.nan
27 |         else:
28 |             tmp = []
29 |             if c_K > les:
30 |                 for j in range(index, index+les):
31 |                     tmp.append(muldata[j])
32 |             else:
33 |                 for j in range(index, index+c_K):
34 |                     tmp.append(muldata[j])
35 | 
36 |             thash = tuple( tmp )
37 |             if thash in map_dict:
38 |                 ans[i] = map_dict[ thash ]
39 |             else:
40 |                 map_dict[ thash ] = ids
41 |                 ans[i] = ids
42 |                 ids += 1
43 | 
44 |         index += les
45 | 
46 |     return ans
47 | 
48 | 
49 | @boundscheck(False)
50 | @wraparound(False)
51 | def post_tuple_encode_func(int[:] muldata, int[:] muldatalens, K):  # id-encode the tuple of the last K values of each multi-cat cell
52 |     cdef:
53 |         int index = 0
54 |         int i, j, N = muldatalens.shape[0]
55 |         int les
56 |         # list tmp = []
57 |         int c_K = K
58 |         dict map_dict = {}
59 |         int ids = 0
60 | 
61 | 
62 |     ans = np.zeros( N, dtype=np.float64 )
63 | 
64 |     for i in range(N):
65 |         les = muldatalens[i]
66 |         if les == 0:
67 |             ans[i] = np.nan
68 |         else:
69 |             tmp = []
70 |             if c_K > les:
71 |                 for j in range(index, index+les):
72 |                     tmp.append(muldata[j])
73 |             else:
74 |                 for j in range(index+les-c_K, index+les):
75 |                     tmp.append(muldata[j])
76 | 
77 |             thash = tuple( tmp )
78 |             if thash in map_dict:
79 |                 ans[i] = map_dict[ thash ]
80 |             else:
81 |                 map_dict[ thash ] = ids
82 |                 ans[i] = ids
83 |                 ids += 1
84 | 
85 |         index += les
86 | 
87 |     return ans
88 | 
89 | 
90 | @boundscheck(False)
91 | @wraparound(False)
92 | def tuple_encode_func_1(int[:] muldata, int[:] muldatalens):  # id-encode the full value tuple of each multi-cat cell
93 |     cdef:
94 |         int index = 0
95 |         int i, j, N = muldatalens.shape[0]
96 |         int les
97 |         dict map_dict = {}
98 |         int ids = 1
99 | 
100 | 
101 |     ans = np.zeros( N, dtype=np.float64 )
102 | 
103 |     for i in range(N):
104 |         les = muldatalens[i]
105 |         if les == 0:
106 |             ans[i] = np.nan
107 |         else:
108 |             tmp = []
109 |             for j in range(index, index+les):
110 |                 tmp.append(muldata[j])
111 | 
112 |             thash = tuple( tmp )
113 |             if thash in map_dict:
114 |                 ans[i] = map_dict[ thash ]
115 |             else:
116 |                 map_dict[ thash ] = ids
117 |                 ans[i] = ids
118 |                 ids += 1
119 | 
120 |         index += les
121 | 
122 |     return ans
123 | 
124 | #@boundscheck(False)
125 | #@wraparound(False)
126 | #def tuple_encode_func_2(vals):
127 | #    cdef:
128 | #        int idx,N = vals.shape[0]
129 | #        dict map_dict = {}
130 | #        int ids = 0
131 | #
132 | #    ans = np.zeros( N ,dtype=np.float )
133 | #
134 | #    for idx in range(N):
135 | #        i = vals[idx]
136 | #        if type(i)==float or i==():
137 | #            ans[idx] = np.nan
138 | #        else:
139 | #            if i in map_dict:
140 | #                ans[idx] = map_dict[ i ]
141 | #            else:
142 | #                map_dict[ i ] = ids
143 | #                ans[idx] = ids
144 | #                ids += 1
145 | #
146 | #    return ans
147 | 
148 | 
149 | 
150 | @boundscheck(False)
151 | @wraparound(False)
152 | def cat_in_multi( int[:] muldata, int[:] muldatalens, int[:] catdata ):  # 1 if catdata[i] occurs in the i-th multi-cat list, else 0
153 |     cdef:
154 |         int index = 0
155 |         int i, j, N = muldatalens.shape[0]
156 |         int les
157 |         int cat
158 |         int flag
159 |         # list ans = []
160 | 
161 |     ans = np.zeros( N, dtype=np.int8 )
162 | 
163 |     for i in range(N):
164 |         les = muldatalens[i]
165 |         flag = 0
166 |         cat = catdata[ i ]
167 |         for j in range(index, index+les):
168 |             if muldata[j] == cat:
169 |                 flag = 1
170 |                 break
171 | 
172 |         if flag :
173 |             ans[i] = 1
174 |         else:
175 |             ans[i] = 0
176 | 
177 |         index += les
178 |     return ans
179 | 
180 | @boundscheck(False)
181 | @wraparound(False)
182 | def cat_rank_multi( int[:] muldata, int[:] muldatalens, int[:] catdata ):  # 1-based position of catdata[i] in the i-th multi-cat list (0 if absent)
183 |     cdef:
184 |         int index = 0
185 |         int i, j, N = muldatalens.shape[0]
186 |         int les
187 |         int cat
188 |         int flag
189 |         # list ans = []
190 | 
191 |     ans = np.zeros( N, dtype=np.int16 )
192 | 
193 |     for i in range(N):
194 |         les = muldatalens[i]
195 |         flag = 0
196 |         cat = catdata[ i ]
197 |         for j in range(index, index+les):
198 |             if muldata[j] == cat:
199 |                 flag = j-index+1
200 |                 break
201 |         ans[i] = flag
202 |         index += les
203 |     return ans
204 | 
205 | 
206 | @boundscheck(False)
207 | @wraparound(False)
208 | def cat_frank_multi( int[:] muldata, int[:] muldatalens, int[:] catdata ):  # position of the first match counted from the end of the i-th list (0 if absent)
209 |     cdef:
210 |         int index = 0
211 |         int i, j, N = muldatalens.shape[0]
212 |         int les
213 |         int cat
214 |         int flag
215 |         # list ans = []
216 | 
217 |     ans = np.zeros( N, dtype=np.int16 )
218 | 
219 |     for i in range(N):
220 |         les = muldatalens[i]
221 |         flag = 0
222 |         cat = catdata[ i ]
223 |         for j in range(index, index+les):
224 |             if muldata[j] == cat:
225 |                 flag = index+les - j
226 |                 break
227 |         ans[i] = flag
228 |         index += les
229 |     return ans
230 | 
231 | 
232 | 
233 | @boundscheck(False)
234 | @wraparound(False)
235 | def get_need_data( vals ):  # flatten a column of tuples into (values, per-row lengths)
236 |     cdef:
237 |         int idx, N = vals.shape[0]
238 |         list datas = []
239 |         list datalen = []
240 | 
241 |     for idx in range(N):
242 |         i = vals[idx]
243 |         if type(i) == float:  # NaN marks an empty cell
244 |             datalen.append( 0 )
245 |         else:
246 |             datas.extend( i )
247 |             datalen.append( len(i) )
248 | 
249 |     return datas, datalen
250 | 
251 | 
252 | 
253 | @boundscheck(False)
254 | @wraparound(False)
255 | def mscat_fit(vals):  # collect the set of distinct category strings over all cells
256 |     cdef:
257 |         set ans = set()
258 |         int idx, N = vals.shape[0]
259 | 
260 |     for idx in range(N):
261 |         val = vals[idx]
262 |         if type(val) == float:
263 |             continue
264 |         ans.update( val.split(',') )
265 | 
266 |     return ans
267 | 
268 | @boundscheck(False)
269 | @wraparound(False)
270 | def mscat_trans(vals, cats):  # map each comma-separated cell to a tuple of 1-based category ids
271 |     cdef:
272 |         dict cat2index = {index: i + 1 for i, index in enumerate(cats)}
273 |         list ans = []
274 |         int idx, N = vals.shape[0]
275 |         list tmp = []
276 | 
277 | 
278 |     for idx in range(N):
279 |         val = vals[idx]
280 |         if type(val) == float:
281 |             ans.append( tuple() )
282 |         else:
283 |             tmp = []
284 |             x = val.split(',')
285 |             for i in x:
286 |                 tmp.append( cat2index[i] )
287 | 
288 |             ans.append( tuple( tmp ) )
289 | 
290 |     return ans
291 | 
292 | 
293 | 
--------------------------------------------------------------------------------
/auto_smart/auto_smart/CONSTANT.py:
--------------------------------------------------------------------------------
1 | NUMERICAL_TYPE = "num"
2 | NUMERICAL_PREFIX = "n_"
3 | 
4 | CATEGORY_TYPE = "cat"
5 | CATEGORY_PREFIX = "c_"
6 | 
7 | TIME_TYPE = "time"
8 | TIME_PREFIX = "t_"
9 | 
10 | MULTI_CAT_TYPE = "multi-cat"
11 | MULTI_CAT_PREFIX = "m_"
12 | MULTI_CAT_DELIMITER = ","
13 | 
14 | BINARY_TYPE = "binary"
15 | BINARY_PREFIX = 'b_'
16 | 
17 | MAIN_TABLE_NAME = "main"
18 | MAIN_TABLE_TEST_NAME = "main_test"
19 | TABLE_PREFIX = "table_"
20 | 
21 | LABEL = "label"
22 | 
23 | type2prefix = {
24 |     NUMERICAL_TYPE: NUMERICAL_PREFIX,
25 |     CATEGORY_TYPE: CATEGORY_PREFIX,
26 |     TIME_TYPE: TIME_PREFIX,
27 |     MULTI_CAT_TYPE: MULTI_CAT_PREFIX,
28 |     BINARY_TYPE: BINARY_PREFIX
29 | }
30 | 
31 | THREAD_NUM = 4
32 | 
33 | SEED = 2222
34 | JOBS = 7
35 | 
36 | CAT_SHIFT = 1
37 | 
38 | MAX_SAMPLE_NUM = 1000000
39 | 
40 | TIME_MIN_BINS = 1000
41 | SEGMENTS = 100
42 | 
43 | LESS_LIMIT = 10
44 | SMOOTH_SHIFT = 100
45 | DEVIATION_SHIFT = 100
46 | 
47 | KEYWORDS = ["label", 'index']
48 | 
49 | SPLIT = -1
50 | 
51 | round_opt = False
52 | 
53 | SAMPLE_NUM = 210000
54 | 
55 | USE_ENSEMBLE = 1
56 | 
--------------------------------------------------------------------------------
/auto_smart/auto_smart/PATHS.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from datetime import datetime
3 | version = datetime.now().strftime('%Y%m%d%H%M%S')
4 | print('version:{}'.format(version))
5 | feature_importance_path = '../importances/'
6 | 
7 | 
--------------------------------------------------------------------------------
/auto_smart/auto_smart/__init__.py:
--------------------------------------------------------------------------------
1 | name = "auto_smart"
2 | import os
3 | import sys
4 | ABSPATH = os.path.abspath(os.path.realpath(os.path.dirname(__file__)))
5 | sys.path.append(ABSPATH)
6 | 
7 | from auto_smart.model import Model
8 | import numpy as np
9 | import pandas as pd
10 | import json
11 | from os.path import join
12 | from datetime import datetime
13 | 
14 | 
15 | TYPE_MAP = {
16 |     'time': str,
17 |     'cat': str,
18 |     'multi-cat': str,
19 |     'num': np.float64
20 | }
21 | 
22 | def read_info(datapath):
23 |     with open(join(datapath, 'train', 'info.json'), 'r') as info_fp:
24 |         info = json.load(info_fp)
25 |     return info
26 | 
27 | def read_train(datapath, info):
28 |     train_data = {}
29 |     for table_name, columns in info['tables'].items():
30 | 
31 |         table_dtype = {key: TYPE_MAP[val] for key, val in columns.items()}
32 | 
33 |         if table_name == 'main':
34 |             table_path = join(datapath, 'train', 'main_train.data')
35 |         else:
36 |             table_path = join(datapath, 'train', f'{table_name}.data')
37 | 
38 |         date_list = [key for key, val in columns.items() if val == 'time']
39 | 
40 |         train_data[table_name] = pd.read_csv(  # timestamps are epoch milliseconds; NaN passes through
41 |             table_path, sep='\t', dtype=table_dtype, parse_dates=date_list,
42 |             date_parser=lambda millisecs: millisecs if np.isnan(
43 |                 float(millisecs)) else datetime.fromtimestamp(
44 |                 float(millisecs)/1000))
45 | 
46 |     # get train label
47 |     train_label = pd.read_csv(
48 |         join(datapath, 'train', 'main_train.solution'))['label']
49 |     return train_data, train_label
50 | 
51 | 
52 | def read_test(datapath, info):
53 |     # get test data
54 |     main_columns = info['tables']['main']
TYPE_MAP[val] for key, val in main_columns.items()} 56 | 57 | table_path = join(datapath, 'test', 'main_test.data') 58 | 59 | date_list = [key for key, val in main_columns.items() if val == 'time'] 60 | 61 | test_data = pd.read_csv( 62 | table_path, sep='\t', dtype=table_dtype, parse_dates=date_list, 63 | date_parser=lambda millisecs: millisecs if np.isnan( 64 | float(millisecs)) else datetime.fromtimestamp( 65 | float(millisecs) / 1000)) 66 | return test_data 67 | 68 | def train_and_predict(train_data,train_label,info,test_data): 69 | cmodel = Model(info) 70 | cmodel.fit(train_data, train_label) 71 | return cmodel.predict(test_data) 72 | 73 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/automl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeepBlueAI/AutoSmart/99a6b44ce5b6520a2463c8a4b3acc675584533be/auto_smart/auto_smart/automl/__init__.py -------------------------------------------------------------------------------- /auto_smart/auto_smart/automl/auto_lgb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import lightgbm as lgb 3 | import numpy as np 4 | import CONSTANT 5 | from util import log, timeclass 6 | from .automl import AutoML 7 | import pandas as pd 8 | import gc 9 | from . import autosample 10 | import time 11 | import copy 12 | from sklearn.metrics import roc_auc_score 13 | 14 | class AutoLGB(AutoML): 15 | def __init__(self): 16 | self.params = { 17 | "boosting_type": "gbdt", 18 | "objective": "binary", 19 | "metric": "auc", 20 | "verbosity": 1, 21 | "seed": CONSTANT.SEED, 22 | "num_threads": CONSTANT.THREAD_NUM 23 | } 24 | 25 | self.hyperparams = { 26 | 'num_leaves': 31, 27 | 'max_depth': -1, 28 | 'min_child_samples': 20, 29 | 'max_bin':255, 30 | 'subsample': 0.8, 31 | 'subsample_freq': 1, 32 | 'colsample_bytree': 0.8, 33 | 'min_child_weight': 0.001, 34 | 'subsample_for_bin': 200000, 35 | 'min_split_gain': 0.02, 36 | 'reg_alpha': 0.1, 37 | 'reg_lambda': 0.1, 38 | } 39 | 40 | self.early_stopping_rounds = 50 41 | 42 | @timeclass(cls='AutoLGB') 43 | def predict(self,X): 44 | X = X[self.columns] 45 | X.columns = self.new_feat_name_cols 46 | return self.model.predict(X) 47 | 48 | @timeclass(cls='AutoLGB') 49 | def ensemble_train(self,X,y,categories,config,len_test): 50 | feat_name = list(X.columns) 51 | self.ensemble_models = [] 52 | self.ensemble_columns = [] 53 | columns = list(X.columns) 54 | log(f'lgb training set shape: {X.shape}') 55 | pos = (y==1).sum() 56 | neg = (y==0).sum() 57 | log(f'pos {pos} neg {neg}') 58 | 59 | self.columns = columns 60 | max_sample_num = len(y) 61 | 62 | feat_name_cols = list(X.columns) 63 | feat_name_maps = { feat_name_cols[i] : str(i) for i in range(len(feat_name_cols)) } 64 | f_feat_name_maps = { str(i) : feat_name_cols[i] for i in range(len(feat_name_cols)) } 65 | new_feat_name_cols = [ feat_name_maps[i] for i in feat_name_cols ] 66 | X.columns = new_feat_name_cols 67 | categories = [ feat_name_maps[i] for i in categories ] 68 | self.f_feat_name_maps = f_feat_name_maps 69 | self.new_feat_name_cols = new_feat_name_cols 70 | 71 | all_columns = list(X.columns) 72 | 73 | start_time = time.time() 74 | i = 0 75 | cur_columns = all_columns 76 | seed = np.random.randint(2019*i,2019*(i+1)) 77 | X_train,y_train = autosample.downsampling(X,y,max_sample_num,seed) 78 | X_train = X_train[cur_columns] 79 | gc.collect() 80 | 81 | colset = 
set(X_train.columns) 82 | cur_categorical = [col for col in categories if col in colset] 83 | pos = (y_train==1).sum() 84 | neg = (y_train==0).sum() 85 | 86 | params = self.params 87 | hyperparams = self.hyperparams 88 | params['seed'] = seed 89 | 90 | X_train = X_train.astype(np.float32) 91 | gc.collect() 92 | y_train = y_train.astype(np.float32) 93 | gc.collect() 94 | X_train = X_train.values 95 | gc.collect() 96 | y_train = y_train.values 97 | gc.collect() 98 | 99 | train_data = lgb.Dataset(X_train, label=y_train,feature_name=feat_name) 100 | del X_train,y_train 101 | gc.collect() 102 | 103 | model = lgb.train({**params, **hyperparams}, 104 | train_data, 105 | num_boost_round=self.best_iteration, 106 | feature_name=cur_columns, 107 | categorical_feature=cur_categorical, 108 | learning_rates = self.learning_rates[:self.best_iteration]) 109 | 110 | self.ensemble_columns.append(cur_columns) 111 | self.ensemble_models.append(model) 112 | end_time = time.time() 113 | 114 | model_use_time = end_time - start_time 115 | del train_data 116 | 117 | gc.collect() 118 | 119 | start_time = time.time() 120 | temp = X.iloc[:100000] 121 | 122 | temp = temp.astype(np.float32) 123 | gc.collect() 124 | temp = temp.values 125 | gc.collect() 126 | 127 | model.predict(temp) 128 | 129 | end_time = time.time() 130 | model_test_use_time = (end_time-start_time) 131 | model_test_use_time = len_test/temp.shape[0] * model_test_use_time 132 | model_use_time = model_use_time + model_test_use_time 133 | del temp,model 134 | 135 | rest_time = config.budget/10*9-(end_time-config.start_time) 136 | if rest_time <= 0: 137 | rest_model_num = 0 138 | else: 139 | rest_model_num = int(rest_time // model_use_time) 140 | 141 | if rest_model_num >= 50: 142 | rest_model_num = 50 143 | 144 | if rest_model_num >= 1: 145 | rest_model_num -= 1 146 | 147 | if not CONSTANT.USE_ENSEMBLE: 148 | rest_model_num = 0 149 | 150 | for i in range(1,rest_model_num+1): 151 | 152 | seed = np.random.randint(2019*i,2019*(i+1)) 153 | 154 | cur_columns = list(pd.Series(all_columns).sample(frac=0.85,replace=False,random_state=seed)) 155 | 156 | X_train,y_train = autosample.downsampling(X,y,max_sample_num,seed) 157 | X_train = X_train[cur_columns] 158 | gc.collect() 159 | 160 | colset = set(X_train.columns) 161 | cur_categorical = [col for col in categories if col in colset] 162 | 163 | pos = (y_train==1).sum() 164 | neg = (y_train==0).sum() 165 | 166 | params = self.params 167 | hyperparams = self.hyperparams 168 | params['seed'] = seed 169 | 170 | num_leaves = hyperparams['num_leaves'] 171 | num_leaves = num_leaves + np.random.randint(-int(num_leaves/10),int(num_leaves/10)+7) 172 | 173 | lrs = np.array(self.learning_rates) 174 | rands = 1 + 0.2*np.random.rand(len(lrs)) 175 | lrs = list(lrs * rands) 176 | 177 | cur_iteration = self.best_iteration 178 | cur_iteration = cur_iteration + np.random.randint(-30,40) 179 | if cur_iteration > len(lrs): 180 | cur_iteration = len(lrs) 181 | 182 | if cur_iteration <= 10: 183 | cur_iteration = self.best_iteration 184 | 185 | cur_hyperparams = copy.deepcopy(hyperparams) 186 | cur_hyperparams['num_leaves'] = num_leaves 187 | 188 | X_train = X_train.astype(np.float32) 189 | gc.collect() 190 | y_train = y_train.astype(np.float32) 191 | gc.collect() 192 | X_train = X_train.values 193 | gc.collect() 194 | y_train = y_train.values 195 | gc.collect() 196 | 197 | train_data = lgb.Dataset(X_train, label=y_train,feature_name=cur_columns) 198 | del X_train,y_train 199 | gc.collect() 200 | 201 | model = lgb.train({**params, 
**cur_hyperparams}, 202 | train_data, 203 | num_boost_round=cur_iteration, 204 | feature_name=cur_columns, 205 | categorical_feature=cur_categorical, 206 | learning_rates = lrs[:cur_iteration]) 207 | 208 | 209 | self.ensemble_columns.append(cur_columns) 210 | self.ensemble_models.append(model) 211 | 212 | del train_data 213 | gc.collect() 214 | 215 | X.columns = self.columns 216 | 217 | 218 | @timeclass(cls='AutoLGB') 219 | def ensemble_predict(self,X): 220 | X = X[self.columns] 221 | gc.collect() 222 | 223 | X.columns = self.new_feat_name_cols 224 | 225 | preds = [] 226 | for model,cur_cols in zip(self.ensemble_models,self.ensemble_columns): 227 | gc.collect() 228 | tX = X[cur_cols] 229 | gc.collect() 230 | tX = tX.astype(np.float32) 231 | gc.collect() 232 | tX = tX.values 233 | gc.collect() 234 | 235 | preds.append(model.predict( tX )) 236 | gc.collect() 237 | 238 | if len(preds) == 1: 239 | pred = preds[0] 240 | 241 | if len(preds) > 1: 242 | total_model_num = len(preds) 243 | 244 | main_model_weight = 8 / (8 + 2 * (total_model_num-1)) 245 | rest_model_weight = 2 / (8 + 2 * (total_model_num-1)) 246 | pred = preds[0] * main_model_weight 247 | for i in range(1,total_model_num): 248 | pred = pred + rest_model_weight * preds[i] 249 | 250 | return pred 251 | 252 | @timeclass(cls='AutoLGB') 253 | def ensemble_predict_test(self,X): 254 | X = X[self.columns] 255 | gc.collect() 256 | 257 | X.columns = self.new_feat_name_cols 258 | log(f'ensemble models {len(self.ensemble_models)}') 259 | preds = [] 260 | for model,cur_cols in zip(self.ensemble_models,self.ensemble_columns): 261 | gc.collect() 262 | tX = X[cur_cols] 263 | gc.collect() 264 | tX = tX.astype(np.float32) 265 | gc.collect() 266 | tX = tX.values 267 | gc.collect() 268 | 269 | preds.append(model.predict( tX )) 270 | gc.collect() 271 | 272 | if len(preds) == 1: 273 | pred = preds[0] 274 | 275 | if len(preds) > 1: 276 | total_model_num = len(preds) 277 | 278 | main_model_weight = 8 / (8 + 2 * (total_model_num-1)) 279 | rest_model_weight = 2 / (8 + 2 * (total_model_num-1)) 280 | pred = preds[0] * main_model_weight 281 | for i in range(1,total_model_num): 282 | pred = pred + rest_model_weight * preds[i] 283 | 284 | return pred,preds[0] 285 | 286 | def get_log_lr(self,num_boost_round,max_lr,min_lr): 287 | learning_rates = [max_lr+(min_lr-max_lr)/np.log(num_boost_round)*np.log(i) for i in range(1,num_boost_round+1)] 288 | return learning_rates 289 | 290 | def set_num_leaves(self,X,y): 291 | t = len(y) 292 | t = X.shape[1]*(t/40000) 293 | level = t**0.225 + 1.5 294 | num_leaves = int(2**level) + 10 295 | num_leaves = min(num_leaves, 128) 296 | num_leaves = max(num_leaves, 32) 297 | self.hyperparams['num_leaves'] = num_leaves 298 | 299 | def set_min_child_samples(self, X,y ): 300 | min_child_samples = ( (X.shape[0]/20000)**0.6 ) *15 301 | min_child_samples = int(min_child_samples) 302 | min_child_samples = min(min_child_samples, 150) 303 | min_child_samples = max(min_child_samples, 15) 304 | 305 | self.hyperparams['min_child_samples'] = min_child_samples 306 | 307 | @timeclass(cls='AutoLGB') 308 | def lr_opt(self,train_data,valid_data,categories): 309 | params = self.params 310 | hyperparams = self.hyperparams 311 | 312 | max_lrs = [0.1,0.08,0.05,0.02] 313 | min_lrs = [0.04,0.02,0.01,0.005] 314 | 315 | num_boost_round = self.num_boost_round 316 | max_num_boost_round = min(400,num_boost_round) 317 | best_score = -1 318 | best_loop = -1 319 | lr = None 320 | 321 | scores = [] 322 | lrs = [] 323 | for max_lr,min_lr in zip(max_lrs,min_lrs): 324 | 
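# Probe each candidate log-decay schedule for at most 400 rounds and score it
# by validation AUC; the best-scoring schedule is kept for the full training run.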
learning_rates = self.get_log_lr(num_boost_round,max_lr,min_lr) 325 | 326 | model = lgb.train({**params, **hyperparams}, train_data, num_boost_round=max_num_boost_round,\ 327 | categorical_feature=categories,learning_rates = learning_rates[:max_num_boost_round] 328 | ) 329 | pred = model.predict(valid_data.data) 330 | score = roc_auc_score(valid_data.label,pred) 331 | scores.append(score) 332 | lrs.append(learning_rates) 333 | del model, pred 334 | gc.collect() 335 | 336 | best_loop = np.argmax(scores) 337 | best_score = np.max(scores) 338 | lr = lrs[best_loop] 339 | log(f'scores {scores}') 340 | log(f'loop {best_loop}') 341 | log(f'lr max {lr[0]} min {lr[-1]}') 342 | log(f'lr best score {best_score}') 343 | return lr 344 | 345 | @timeclass(cls='AutoLGB') 346 | def num_leaves_opt(self,train_data,valid_data,categories): 347 | params = self.params 348 | hyperparams = self.hyperparams 349 | num_leaves = [31,63,127,255] 350 | 351 | num_boost_round = 500 352 | best_iteration = -1 353 | i = 0 354 | best_score = -1 355 | best_loop = -1 356 | best_num_leaves = None 357 | 358 | for leaves in num_leaves: 359 | hyperparams['num_leaves'] = leaves 360 | model = lgb.train({**params, **hyperparams}, train_data, num_boost_round=num_boost_round,\ 361 | valid_sets=[valid_data], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=100,\ 362 | categorical_feature=categories,learning_rates = self.learning_rates 363 | ) 364 | 365 | score = model.best_score["valid_0"][params["metric"]] 366 | if score > best_score: 367 | best_num_leaves = leaves 368 | best_iteration = model.best_iteration 369 | best_score = score 370 | best_loop = i 371 | 372 | return best_num_leaves 373 | 374 | @timeclass(cls='AutoLGB') 375 | def subsample_opt(self,num_samples): 376 | samples = num_samples 377 | if samples > 1000000: 378 | samples = 1000000 379 | 380 | if samples<200000: 381 | subsample = 0.95 - samples/1000000 382 | return subsample 383 | 384 | subsample = 0.85-samples/2500000 385 | return subsample 386 | 387 | @timeclass(cls='AutoLGB') 388 | def colsample_bytree_opt(self,num_feature): 389 | if num_feature > 500: 390 | num_feature = 500 391 | 392 | if num_feature > 100: 393 | colsample_bytree = 0.8 - num_feature/2000 394 | else: 395 | colsample_bytree = 0.95 - num_feature/500 396 | 397 | return colsample_bytree 398 | 399 | @timeclass(cls='AutoLGB') 400 | def param_compute(self,X,y,categories,config): 401 | feat_name = list(X.columns) 402 | colsample_bytree = self.colsample_bytree_opt(X.shape[1]) 403 | self.hyperparams['colsample_bytree'] = colsample_bytree 404 | 405 | max_sample_num = len(y) 406 | subsample = self.subsample_opt(autosample.downsampling_y(y,max_sample_num).shape[0]) 407 | self.hyperparams['subsample'] = subsample 408 | 409 | max_sample_num = min(len(y),50000) 410 | X_sample,y_sample = autosample.downsampling(X,y,max_sample_num) 411 | gc.collect() 412 | params = self.params 413 | 414 | start_time = time.time() 415 | X_sample = X_sample.astype(np.float32) 416 | gc.collect() 417 | y_sample = y_sample.astype(np.float32) 418 | gc.collect() 419 | X_sample = X_sample.values 420 | gc.collect() 421 | y_sample = y_sample.values 422 | gc.collect() 423 | end_time = time.time() 424 | transfer_time = end_time-start_time 425 | 426 | time_number_boost_round1 = 15 427 | start_time = time.time() 428 | train_data = lgb.Dataset(X_sample, label=y_sample,feature_name=feat_name) 429 | 430 | gc.collect() 431 | 432 | lgb.train({**params, **self.hyperparams}, train_data, num_boost_round=time_number_boost_round1,\ 433 | 
categorical_feature=categories,) 434 | 435 | end_time = time.time() 436 | 437 | model_use_time1 = end_time - start_time 438 | 439 | time_number_boost_round2 = time_number_boost_round1*2 440 | 441 | del train_data 442 | gc.collect() 443 | 444 | start_time = time.time() 445 | train_data = lgb.Dataset(X_sample, label=y_sample,feature_name=feat_name) 446 | del X_sample,y_sample 447 | gc.collect() 448 | lgb.train({**params, **self.hyperparams}, train_data, num_boost_round=time_number_boost_round2,\ 449 | categorical_feature=categories,) 450 | 451 | del train_data 452 | gc.collect() 453 | end_time = time.time() 454 | 455 | model_use_time2 = end_time - start_time 456 | 457 | boost_time = (model_use_time2 - model_use_time1) 458 | boost_round = time_number_boost_round2 - time_number_boost_round1 459 | preprocess_time = model_use_time1 - boost_time 460 | model_sample_time = 4 * (transfer_time + preprocess_time + (boost_time * (400/boost_round))) + 5 461 | 462 | max_sample_num = len(y) 463 | X,y = autosample.downsampling(X,y,max_sample_num) 464 | 465 | gc.collect() 466 | pos = (y==1).sum() 467 | neg = (y==0).sum() 468 | 469 | gc.collect() 470 | params = self.params 471 | 472 | time_number_boost_round1 = 15 473 | 474 | start_time = time.time() 475 | X = X.astype(np.float32) 476 | gc.collect() 477 | y = y.astype(np.float32) 478 | gc.collect() 479 | X = X.values 480 | gc.collect() 481 | y = y.values 482 | gc.collect() 483 | end_time = time.time() 484 | 485 | transfer_time = end_time-start_time 486 | 487 | start_time = time.time() 488 | train_data = lgb.Dataset(X, label=y,feature_name=feat_name) 489 | 490 | gc.collect() 491 | 492 | 493 | lgb.train({**params, **self.hyperparams}, train_data, num_boost_round=time_number_boost_round1,\ 494 | categorical_feature=categories,) 495 | 496 | del train_data 497 | gc.collect() 498 | end_time = time.time() 499 | 500 | model_use_time1 = end_time - start_time 501 | 502 | time_number_boost_round2 = time_number_boost_round1*2 503 | 504 | start_time = time.time() 505 | train_data = lgb.Dataset(X, label=y,feature_name=feat_name) 506 | del X,y 507 | gc.collect() 508 | lgb.train({**params, **self.hyperparams}, train_data, num_boost_round=time_number_boost_round2,\ 509 | categorical_feature=categories,) 510 | 511 | del train_data 512 | gc.collect() 513 | end_time = time.time() 514 | 515 | model_use_time2 = end_time - start_time 516 | 517 | boost_time = (model_use_time2 - model_use_time1) 518 | boost_round = time_number_boost_round2 - time_number_boost_round1 519 | preprocess_time = model_use_time1 - boost_time 520 | 521 | rest_time = config.budget/10*9-(end_time-config.start_time)-model_sample_time-10 522 | 523 | self.num_boost_round = 20 524 | for number_boost_round in [700,600,500,400,300,200,100,50]: 525 | real_model_time = (transfer_time + preprocess_time + (boost_time * (number_boost_round/boost_round))) 526 | if real_model_time > rest_time: 527 | continue 528 | else: 529 | self.num_boost_round = number_boost_round 530 | break 531 | 532 | gc.collect() 533 | 534 | @timeclass(cls='AutoLGB') 535 | def param_opt(self,X_train,y_train,X_valid,y_valid,categories): 536 | feat_name = list(X_train.columns) 537 | 538 | pos = (y_train==1).sum() 539 | neg = (y_train==0).sum() 540 | val_pos = (y_valid==1).sum() 541 | val_neg = (y_valid==0).sum() 542 | 543 | max_sample_num = min(len(y_train),50000) 544 | X,y = autosample.downsampling(X_train,y_train,max_sample_num) 545 | 546 | pos = (y==1).sum() 547 | neg = (y==0).sum() 548 | 549 | train_data = lgb.Dataset(X, 
label=y,feature_name=feat_name) 550 | del X,y 551 | gc.collect() 552 | 553 | valid_data = lgb.Dataset(X_valid, label=y_valid,feature_name=feat_name,free_raw_data=False) 554 | del X_valid,y_valid 555 | gc.collect() 556 | 557 | lr = self.lr_opt(train_data,valid_data,categories) 558 | self.learning_rates = lr 559 | 560 | self.best_iteration = self.num_boost_round 561 | 562 | del train_data 563 | gc.collect() 564 | 565 | num_boost_round = self.num_boost_round 566 | params = self.params 567 | max_sample_num = len(y_train) 568 | 569 | X,y = autosample.downsampling(X_train,y_train,max_sample_num) 570 | del X_train,y_train 571 | 572 | gc.collect() 573 | pos = (y==1).sum() 574 | neg = (y==0).sum() 575 | 576 | X = X.astype(np.float32) 577 | gc.collect() 578 | y = y.astype(np.float32) 579 | gc.collect() 580 | X = X.values 581 | gc.collect() 582 | y = y.values 583 | gc.collect() 584 | 585 | train_data = lgb.Dataset(X, label=y,feature_name=feat_name) 586 | 587 | del X,y 588 | gc.collect() 589 | 590 | model = lgb.train({**params, **self.hyperparams}, train_data, num_boost_round=num_boost_round,\ 591 | valid_sets=[valid_data], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=100,\ 592 | categorical_feature=categories,learning_rates = self.learning_rates 593 | ) 594 | gc.collect() 595 | 596 | best_model = model 597 | 598 | best_score = model.best_score["valid_0"][params["metric"]] 599 | 600 | if model.best_iteration > 50: 601 | self.best_iteration = model.best_iteration 602 | elif model.current_iteration() > 50: 603 | self.best_iteration = model.current_iteration() 604 | else: 605 | self.best_iteration = 50 606 | 607 | return best_model,best_score 608 | 609 | def get_importances(self): 610 | model = self.model 611 | importances = pd.DataFrame({'features':[ self.f_feat_name_maps[i] for i in model.feature_name() ] , 612 | 'importances':model.feature_importance()}) 613 | 614 | importances.sort_values('importances',ascending=False,inplace=True) 615 | 616 | return importances 617 | 618 | @timeclass(cls='AutoLGB') 619 | def ensemble_predict_train(self,X): 620 | X = X[X.columns] 621 | X.columns = self.new_feat_name_cols 622 | 623 | preds = [] 624 | for model in self.ensemble_models: 625 | preds.append(model.predict(X)) 626 | 627 | pred = np.stack(preds,axis=1).mean(axis=1) 628 | return pred 629 | 630 | def get_ensemble_importances(self): 631 | model = self.ensemble_models[0] 632 | importances = pd.DataFrame({'features':[ self.f_feat_name_maps[i] for i in model.feature_name() ] , 633 | 'importances':model.feature_importance()}) 634 | 635 | importances.sort_values('importances',ascending=False,inplace=True) 636 | 637 | return importances 638 | 639 | @timeclass(cls='AutoLGB') 640 | def param_opt_new(self,X_train,y_train,X_valid,y_valid,categories): 641 | feat_name = list(X_train.columns) 642 | 643 | pos = (y_train==1).sum() 644 | neg = (y_train==0).sum() 645 | val_pos = (y_valid==1).sum() 646 | val_neg = (y_valid==0).sum() 647 | log(f'training set pos {pos} neg {neg}') 648 | log(f'validation set pos {val_pos} neg {val_neg}') 649 | 650 | max_sample_num = min(len(y_train),50000) 651 | X,y = autosample.downsampling(X_train,y_train,max_sample_num) 652 | 653 | pos = (y==1).sum() 654 | neg = (y==0).sum() 655 | log(f'opt downsampling set pos {pos} neg {neg}') 656 | 657 | X = X.astype(np.float32) 658 | gc.collect() 659 | y = y.astype(np.float32) 660 | gc.collect() 661 | X = X.values 662 | gc.collect() 663 | y = y.values 664 | gc.collect() 665 | 666 | train_data = lgb.Dataset(X, label=y,feature_name=feat_name) 
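# The arrays were downcast to float32 and materialized as bare ndarrays above
# (roughly half the float64 footprint); the Dataset now holds its own reference
# to them, so the local names X and y are no longer needed.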
667 | del X,y 668 | gc.collect() 669 | 670 | valid_data = lgb.Dataset(X_valid, label=y_valid,feature_name=feat_name,free_raw_data=False) 671 | del X_valid,y_valid 672 | gc.collect() 673 | 674 | lr = self.lr_opt(train_data,valid_data,categories) 675 | del train_data 676 | gc.collect() 677 | self.learning_rates = lr 678 | 679 | self.best_iteration = self.num_boost_round 680 | log(f'pass round opt, use best iteration as {self.best_iteration}') 681 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/automl/automl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | class AutoML: 4 | def __init__(self): 5 | self.params = { 6 | 7 | } 8 | 9 | def train(self,X,y,categories): 10 | pass 11 | 12 | def predict(self,X): 13 | pass 14 | 15 | 16 | def param_opt(self,X_train,y_train,X_valid,y_valid,categories): 17 | pass 18 | 19 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/automl/autosample.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas as pd 3 | 4 | def get_downsampling_num(npos,nneg,sample_num,unbalanced_ratio,min_neg_pos_ratio=2): 5 | 6 | reverse = False 7 | ntol = npos + nneg 8 | if npos>nneg: 9 | reverse = True 10 | tmp = npos 11 | npos = nneg 12 | nneg = tmp 13 | 14 | max_sample_num = min(npos, nneg)*(unbalanced_ratio+1) 15 | if max_sample_num>sample_num: 16 | max_sample_num = sample_num 17 | 18 | if npos+nneg > max_sample_num: 19 | 20 | if nneg/npos <= min_neg_pos_ratio: 21 | pos_num = npos/ntol * max_sample_num 22 | neg_num = nneg/ntol * max_sample_num 23 | 24 | elif nneg/npos <= unbalanced_ratio: 25 | if npos > max_sample_num/(min_neg_pos_ratio+1): 26 | pos_num = max_sample_num/(min_neg_pos_ratio+1) 27 | neg_num = max_sample_num - pos_num 28 | else: 29 | pos_num = npos 30 | neg_num = max_sample_num - pos_num 31 | 32 | elif nneg/npos > unbalanced_ratio: 33 | if npos > max_sample_num/(unbalanced_ratio+1): 34 | pos_num = max_sample_num/(unbalanced_ratio+1) 35 | neg_num = max_sample_num - pos_num 36 | 37 | else: 38 | pos_num = npos 39 | neg_num = max_sample_num - npos 40 | 41 | else: 42 | neg_num = nneg 43 | pos_num = npos 44 | 45 | if neg_num/pos_num > unbalanced_ratio: 46 | neg_num = pos_num*unbalanced_ratio 47 | 48 | neg_num = int(neg_num) 49 | pos_num = int(pos_num) 50 | if reverse: 51 | return neg_num,pos_num 52 | 53 | return pos_num,neg_num 54 | 55 | def sample(X,frac,seed,y=None): 56 | if frac == 1: 57 | X = X.sample(frac=1,random_state=seed) 58 | elif frac > 1: 59 | mul = int(frac) 60 | frac = frac - int(frac) 61 | X_res = X.sample(frac=frac,random_state=seed) 62 | X = pd.concat([X] * mul + [X_res]) 63 | else: 64 | X = X.sample(frac=frac,random_state=seed) 65 | 66 | if y is not None: 67 | y = y.loc[X.index] 68 | return X,y 69 | return X 70 | 71 | 72 | def downsampling_num(y,max_sample_num): 73 | npos = (y==1).sum() 74 | nneg = (y==0).sum() 75 | 76 | 77 | min_num = min(npos,nneg) 78 | min_num = max(min_num,1000) 79 | 80 | if min_num < 8000: 81 | unbalanced_ratio = 10 - (min_num//1000) 82 | else: 83 | unbalanced_ratio = 3 84 | 85 | pos_num,neg_num = get_downsampling_num(npos,nneg,max_sample_num,unbalanced_ratio) 86 | return pos_num,neg_num 87 | 88 | 89 | def class_sample(X,y,pos_num,neg_num,seed=2019): 90 | 91 | npos = float((y == 1).sum()) 92 | nneg = len(y) - npos 93 | 94 | pos_frac = pos_num / npos 95 | neg_frac = neg_num / nneg 
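# pos_frac and neg_frac are per-class sampling ratios. sample() above also
# accepts frac > 1 (it stacks floor(frac) full copies plus a fractional
# remainder), although the downsampling callers appear to always keep frac <= 1.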
96 | 97 | X_pos = X[y == 1] 98 | X_pos = sample(X_pos,pos_frac,seed) 99 | 100 | X_neg = X[y == 0] 101 | X_neg = sample(X_neg,neg_frac,seed) 102 | 103 | X = pd.concat([X_pos,X_neg]) 104 | 105 | X,y = sample(X,1,seed,y) 106 | 107 | return X,y 108 | 109 | def downsampling(X,y,max_sample_num,seed=2019): 110 | pos_num,neg_num = downsampling_num(y,max_sample_num) 111 | return class_sample(X,y,pos_num,neg_num,seed) 112 | 113 | def class_sample_y(y,pos_num,neg_num,seed=2019): 114 | 115 | npos = float((y == 1).sum()) 116 | nneg = len(y) - npos 117 | 118 | pos_frac = pos_num / npos 119 | neg_frac = neg_num / nneg 120 | 121 | y_pos = y[y == 1] 122 | y_pos = sample(y_pos,pos_frac,seed) 123 | 124 | y_neg = y[y == 0] 125 | y_neg = sample(y_neg,neg_frac,seed) 126 | 127 | y = pd.concat([y_pos,y_neg]) 128 | 129 | y = sample(y,1,seed) 130 | 131 | return y 132 | 133 | def downsampling_y(y,max_sample_num,seed=2019): 134 | pos_num,neg_num = downsampling_num(y,max_sample_num) 135 | y = class_sample_y(y,pos_num,neg_num,seed) 136 | return y 137 | 138 | 139 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/automl/model_selection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | def time_train_test_split(X,y,test_rate=0.2,shuffle=True,random_state=1): 5 | length = X.shape[0] 6 | 7 | 8 | test_size = int(length * test_rate) 9 | train_size = length - test_size 10 | 11 | X_train = X.iloc[:train_size] 12 | y_train = y.iloc[:train_size] 13 | X_test = X.iloc[train_size:] 14 | y_test = y.iloc[train_size:] 15 | 16 | if shuffle: 17 | np.random.seed(random_state) 18 | idx = np.arange(train_size) 19 | np.random.shuffle(idx) 20 | X_train = X_train.iloc[idx] 21 | y_train = y_train.iloc[idx] 22 | 23 | return X_train,y_train,X_test,y_test 24 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | class Config: 4 | def __init__(self, start_time,budget): 5 | if budget >= 1000: 6 | self.keys_order2_cat_max = 50 7 | self.keys_order2_num_max = 50 8 | 9 | self.keys_order2_cat_maxmin = 10 10 | self.keys_order2_num_maxmin = 10 11 | self.keys_order2_num_std = 5 12 | 13 | self.keys_order2_bin_num_max = 20 14 | self.keys_order2_bin_cat_max = 20 15 | 16 | self.all_order2_cat_max = 7 17 | self.all_order2_num_max = 7 18 | 19 | 20 | self.keys_order3_num_max = 10 21 | self.keys_order3_cat_max = 10 22 | 23 | self.wait_feat_selection_num = 30 24 | self.wait_feat_selection_num_all = 20 25 | 26 | self.start_time = start_time 27 | self.budget = budget 28 | else: 29 | self.keys_order2_cat_max = 40 30 | self.keys_order2_num_max = 40 31 | 32 | self.keys_order2_cat_maxmin = 10 33 | self.keys_order2_num_maxmin = 10 34 | self.keys_order2_num_std = 5 35 | 36 | self.keys_order2_bin_num_max = 10 37 | self.keys_order2_bin_cat_max = 10 38 | 39 | self.all_order2_cat_max = 7 40 | self.all_order2_num_max = 7 41 | 42 | self.keys_order3_num_max = 10 43 | self.keys_order3_cat_max = 10 44 | 45 | self.wait_feat_selection_num = 30 46 | self.wait_feat_selection_num_all = 20 47 | 48 | self.start_time = start_time 49 | self.budget = budget 50 | 51 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/data_tools.py: -------------------------------------------------------------------------------- 1 | # 
-*- coding: utf-8 -*- 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | def downcast(series,accuracy_loss = True, min_float_type='float16'): 7 | if series.dtype == np.int64: 8 | ii8 = np.iinfo(np.int8) 9 | ii16 = np.iinfo(np.int16) 10 | ii32 = np.iinfo(np.int32) 11 | max_value = series.max() 12 | min_value = series.min() 13 | 14 | if max_value <= ii8.max and min_value >= ii8.min: 15 | return series.astype(np.int8) 16 | elif max_value <= ii16.max and min_value >= ii16.min: 17 | return series.astype(np.int16) 18 | elif max_value <= ii32.max and min_value >= ii32.min: 19 | return series.astype(np.int32) 20 | else: 21 | return series 22 | 23 | elif series.dtype == np.float64: 24 | fi16 = np.finfo(np.float16) 25 | fi32 = np.finfo(np.float32) 26 | 27 | if accuracy_loss: 28 | max_value = series.max() 29 | min_value = series.min() 30 | if np.isnan(max_value): 31 | max_value = 0 32 | 33 | if np.isnan(min_value): 34 | min_value = 0 35 | 36 | if min_float_type=='float16' and max_value <= fi16.max and min_value >= fi16.min: 37 | return series.astype(np.float16) 38 | elif max_value <= fi32.max and min_value >= fi32.min: 39 | return series.astype(np.float32) 40 | else: 41 | return series 42 | else: 43 | tmp = series[~pd.isna(series)] 44 | if(len(tmp)==0): 45 | return series.astype(np.float16) 46 | 47 | if (tmp == tmp.astype(np.float16)).sum() == len(tmp): 48 | return series.astype(np.float16) 49 | elif (tmp == tmp.astype(np.float32)).sum() == len(tmp): 50 | return series.astype(np.float32) 51 | 52 | else: 53 | return series 54 | 55 | else: 56 | return series 57 | 58 | def gen_segs_array(shape0,nseg): 59 | segs = np.zeros(shape0) 60 | block_size = int(shape0/nseg)+1 61 | for i in range(nseg): 62 | segs[i*block_size:(i+1)*block_size] = i 63 | return segs 64 | 65 | 66 | def gen_segs_tuple(shape0,nseg): 67 | segs = [] 68 | block_size = int(shape0/nseg) 69 | i = -1 70 | for i in range(nseg-1): 71 | segs.append( (i*block_size,(i+1)*block_size) ) 72 | segs.append(((i+1)*block_size,shape0)) 73 | return segs 74 | 75 | 76 | def gen_segs_tuple_by_time_nseg(shape0,nseg,time_series): 77 | block_size = None 78 | if time_series is None: 79 | block_size = int(shape0/nseg)+1 80 | else: 81 | max_time = time_series.max().value 82 | min_time = time_series.min().value 83 | block_size = int( (max_time-min_time)/nseg ) 84 | return block_size 85 | 86 | def gen_combine_cats(df, cols): 87 | 88 | category = df[cols[0]].astype('float64') 89 | for col in cols[1:]: 90 | mx = df[col].max() 91 | category *= mx 92 | category += df[col] 93 | return category 94 | 95 | def gen_segs_tuple_by_time_size(shape0,block_size,time_series): 96 | segs = [] 97 | if time_series is None: 98 | nseg = int(shape0/block_size) 99 | block_size = int( shape0/nseg ) + 1 100 | for i in range(nseg): 101 | segs.append( (i*block_size,(i+1)*block_size) ) 102 | else: 103 | max_time = time_series.max().value 104 | min_time = time_series.min().value 105 | nseg = int( (max_time-min_time)/block_size ) 106 | if nseg == 0: 107 | nseg = 1 108 | block_size = int( (max_time-min_time)/nseg ) + 1 109 | t = time_series.reset_index(drop=True) 110 | t = t.astype('int64') 111 | 112 | 113 | for i in range(nseg): 114 | 115 | l_time = min_time + i*block_size 116 | r_time = min_time + (i+1)*block_size 117 | if i == nseg-1: 118 | r_time = max_time+1 119 | indexs = t[ (l_time<=t) & (t < r_time) ].index 120 | l_index = indexs[0] 121 | r_index = indexs[-1]+1 122 | segs.append( (l_index,r_index) ) 123 | 124 | return segs 125 | 126 | 127 | 
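# Illustrative note (not part of the original source): gen_combine_cats packs
# several category columns into a single code by positional arithmetic, e.g.
# combined = a * max(b) + b for two columns a and b. With a in {1,2,3} and
# b in {1,2} this yields the distinct codes 3,4,5,6,7,8 -- collision-free
# because CAT_SHIFT keeps category codes >= 1.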
-------------------------------------------------------------------------------- /auto_smart/auto_smart/feat/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeepBlueAI/AutoSmart/99a6b44ce5b6520a2463c8a4b3acc675584533be/auto_smart/auto_smart/feat/__init__.py -------------------------------------------------------------------------------- /auto_smart/auto_smart/feat/default_merge_feat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .merge_feat import O2O,M2O,O2M,M2M,TimeM2M,PreO2O,PreM2O,PreO2M,PreM2M,PreTimeM2M 4 | from util import timeclass 5 | import CONSTANT 6 | import pandas as pd 7 | import numpy as np 8 | 9 | from joblib import Parallel, delayed 10 | from feat_context import FeatContext 11 | import util 12 | from data_tools import downcast 13 | import gc 14 | namespace = 'default' 15 | 16 | class M2OJoin(M2O): 17 | def fit(self,U,V): 18 | pass 19 | 20 | @timeclass(cls='M2OJoin') 21 | def transform(self,U,V): 22 | v = V.data 23 | key = self.key 24 | v = v.set_index(key) 25 | new_cols = [] 26 | col2type = {} 27 | col2block = {} 28 | for col in v.columns: 29 | feat_type = V.col2type[col] 30 | new_col = FeatContext.gen_merge_feat_name(namespace,self.__class__.__name__,col,feat_type,V.name) 31 | new_cols.append(new_col) 32 | col2type[new_col] = feat_type 33 | 34 | if col in V.col2block: 35 | block_id = V.col2block[col] 36 | col2block[new_col] = block_id 37 | 38 | v.columns = new_cols 39 | return v,col2type,col2block 40 | 41 | @timeclass(cls='M2OJoin') 42 | def fit_transform(self,U,V): 43 | return self.transform(U,V) 44 | 45 | class M2MKeyCount(M2M): 46 | @timeclass(cls='M2MKeyCount') 47 | def fit(self,U,V): 48 | pass 49 | 50 | @timeclass(cls='M2MKeyCount') 51 | def transform(self,U,V): 52 | v = V.data 53 | key = self.key 54 | col2type = {} 55 | ss = v.groupby(key)[key].count() 56 | ss = downcast(ss) 57 | feat_type = CONSTANT.NUMERICAL_TYPE 58 | new_col = key+'_M2MKeyCount' 59 | new_col = FeatContext.gen_merge_feat_name(namespace,self.__class__.__name__,new_col,feat_type,V.name) 60 | ss.name = new_col 61 | col2type[new_col] = feat_type 62 | return pd.DataFrame(ss),col2type,{} 63 | 64 | @timeclass(cls='M2MKeyCount') 65 | def fit_transform(self,U,V): 66 | return self.transform(U,V) 67 | 68 | class M2MNumMean(M2M): 69 | @timeclass(cls='M2MNumMean') 70 | def fit(self,U,V): 71 | pass 72 | 73 | @timeclass(cls='M2MNumMean') 74 | def transform(self,U,V): 75 | v = V.data 76 | key = self.key 77 | col2type = {} 78 | 79 | def func(df): 80 | key = df.columns[0] 81 | col = df.columns[1] 82 | df[col] = df[col].astype('float32') 83 | 84 | ss = df.groupby(key)[col].mean() 85 | ss = downcast(ss) 86 | return ss 87 | 88 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(v[[key,col]]) for col in V.num_cols) 89 | if res: 90 | new_cols = [] 91 | for col in V.num_cols: 92 | feat_type = CONSTANT.NUMERICAL_TYPE 93 | col = col+'_M2MNumMean' 94 | new_col = FeatContext.gen_merge_feat_name(namespace,self.__class__.__name__,col,feat_type,V.name) 95 | new_cols.append(new_col) 96 | col2type[new_col] = feat_type 97 | 98 | tmp = pd.concat(res,axis=1) 99 | tmp.columns = new_cols 100 | return tmp,col2type,{} 101 | return pd.DataFrame(),col2type,{} 102 | 103 | @timeclass(cls='M2MNumMean') 104 | def fit_transform(self,U,V): 105 | return self.transform(U,V) 106 | 107 | class TimeM2MnewLastData(M2M): 108 | @timeclass(cls='TimeM2MnewLastData') 109 | def 
fit(self,U,V): 110 | pass 111 | 112 | @timeclass(cls='TimeM2MnewLastData') 113 | def transform(self,U,V): 114 | key = self.key 115 | 116 | if U.key_time_col != V.key_time_col: 117 | return 118 | 119 | key_time_col = V.key_time_col 120 | 121 | todo_cols = V.multi_cat_cols 122 | if not todo_cols: 123 | return 124 | 125 | v = V.data[[V.key_time_col,key] + todo_cols] 126 | u = U.data[[U.key_time_col,key]] 127 | 128 | u_index = u.index 129 | u.reset_index(drop=True,inplace=True) 130 | col2type = {} 131 | col2block = {} 132 | 133 | u.index = -u.index-1 134 | v_large = pd.concat([v,u]) 135 | v_large.sort_values(by=[key,key_time_col],inplace=True) 136 | 137 | symbol = 1 138 | key_diff = v_large[key].diff() 139 | for col in todo_cols: 140 | v_large.loc[key_diff!=0,col] = v_large.loc[key_diff!=0,col].fillna(symbol) # mark key boundaries so the ffill below cannot leak values across keys 141 | 142 | new_cols = [] 143 | for col in todo_cols: 144 | feat_type = CONSTANT.MULTI_CAT_TYPE 145 | new_col = FeatContext.gen_merge_feat_name(namespace,self.__class__.__name__,col,feat_type,V.name) 146 | new_cols.append(new_col) 147 | col2type[new_col] = feat_type 148 | if col in V.col2block: 149 | col2block[new_col] = V.col2block[col] 150 | 151 | def func(series): 152 | ss = series.fillna(method='ffill') 153 | ss = ss.replace(symbol,np.nan) 154 | return ss 155 | 156 | res = Parallel(n_jobs=CONSTANT.JOBS, require='sharedmem')(delayed(func)(v_large[col]) for col in todo_cols) 157 | if res: 158 | tmp = pd.concat(res,axis=1) 159 | del res 160 | gc.collect() 161 | 162 | tmp.columns = new_cols 163 | tmp = tmp.loc[tmp.index<0] 164 | tmp.index = -(tmp.index+1) 165 | 166 | tmp.sort_index(inplace=True) 167 | tmp.index = u_index 168 | del u_index 169 | gc.collect() 170 | U.data[new_cols] = tmp 171 | del tmp 172 | gc.collect() 173 | U.update_data(U.data,col2type,None,None,col2block,None) 174 | 175 | @timeclass(cls='TimeM2MnewLastData') 176 | def fit_transform(self,U,V): 177 | self.transform(U,V) 178 | 179 | class M2MDataLast(TimeM2M): 180 | @timeclass(cls='M2MDataLast') 181 | def fit(self,U,V): 182 | pass 183 | 184 | @timeclass(cls='M2MDataLast') 185 | def transform(self,U,V): 186 | data = V.data 187 | key = self.key 188 | col2type = {} 189 | col2block = {} 190 | 191 | col_sets = [] 192 | cols = list(data.columns) 193 | 194 | if key in cols: 195 | cols.remove(key) 196 | 197 | del_cols = [] 198 | for col in cols: 199 | if col in V.col2type: 200 | if V.col2type[col] == CONSTANT.NUMERICAL_TYPE: 201 | del_cols.append(col) 202 | 203 | for col in del_cols: 204 | if col in cols: 205 | cols.remove(col) 206 | 207 | if len(cols)==0: 208 | return pd.DataFrame(),{},{} 209 | cols_len = 20 210 | cols_num = len(cols) 211 | if cols_num % cols_len == 0: 212 | blocks = int(cols_num / cols_len) 213 | else: 214 | blocks = int(cols_num / cols_len) + 1 215 | 216 | for i in range(blocks): 217 | col_t = [] 218 | for j in range(i*cols_len,(i+1)*cols_len): 219 | if j < len(cols): 220 | col_t.append(cols[j]) 221 | col_sets.append(col_t) 222 | 223 | feats = [] 224 | for col_set in col_sets: 225 | 226 | feats.append( data.groupby( key )[ col_set ].last() ) 227 | if feats: 228 | df = pd.concat(feats,axis=1) 229 | 230 | new_cols = [] 231 | for col in df.columns: 232 | feat_type = V.col2type[col] 233 | new_col = FeatContext.gen_merge_feat_name(namespace,self.__class__.__name__,col,feat_type,V.name) 234 | new_cols.append(new_col) 235 | col2type[new_col] = feat_type 236 | 237 | 238 | if col in V.col2block: 239 | block_id = V.col2block[col] 240 | col2block[new_col] = block_id 241 | 242 | df.columns = new_cols 243 | return df,col2type,col2block 244 |
else: 245 | return pd.DataFrame(),{},{} 246 | 247 | @timeclass(cls='M2MDataLast') 248 | def fit_transform(self,U,V): 249 | self.fit(U,V) 250 | return self.transform(U,V) -------------------------------------------------------------------------------- /auto_smart/auto_smart/feat/feat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | class Feat: 4 | def __init__(self,config): 5 | self.config = config 6 | 7 | def fit(self,X,y): 8 | pass 9 | 10 | def transform(self,X): 11 | pass 12 | 13 | def fit_transform(self,X,y): 14 | pass -------------------------------------------------------------------------------- /auto_smart/auto_smart/feat/feat_pipeline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .default_feat import * 4 | from .feat_selection import LGBFeatureSelection,LGBFeatureSelectionWait,LGBFeatureSelectionLast 5 | 6 | class FeatPipeline: 7 | def __init__(self): 8 | self.order1s = [] 9 | 10 | class DefaultFeatPipeline(FeatPipeline): 11 | def __init__(self): 12 | super(DefaultFeatPipeline,self).__init__() 13 | self.main_init() 14 | 15 | def main_init(self): 16 | self.order1s = [ 17 | PreMcToNumpy,McCatRank, 18 | 19 | OriginSession,\ 20 | 21 | ApartCatRecognize,\ 22 | 23 | KeysCountDIY, 24 | UserKeyCntDIY,SessionKeyCntDIY,\ 25 | 26 | KeysTimeDiffAndFuture, 27 | 28 | UserSessionNuniqueDIY,\ 29 | UserSessionCntDivNuniqueDIY,\ 30 | UserKeyNuniqueDIY, SessionKeyNuniqueDIY,\ 31 | UserKeyCntDivNuniqueDIY,SessionKeyCntDivNuniqueDIY,\ 32 | 33 | KeysCumCntRateAndReverse, 34 | 35 | UserKeyCumCntRateAndReverse, 36 | 37 | KeyTimeDate, 38 | KeyTimeBin, 39 | KeysBinCntDIY, 40 | 41 | CatCountDIY, 42 | LGBFeatureSelection,\ 43 | ] 44 | 45 | self.keys_order2s = [ 46 | KeysNumMeanOrder2MinusSelfNew, 47 | KeysNumMaxMinOrder2MinusSelfNew, 48 | KeysNumStd, 49 | KeysCatCntOrder2New, 50 | 51 | LGBFeatureSelectionWait, 52 | ] 53 | 54 | self.all_order2s = [ 55 | BinsCatCntOrder2DIYNew, 56 | BinsNumMeanOrder2DIYNew, 57 | CatNumMeanOrder2DIYNew, 58 | CatCntOrder2DIYNew, 59 | 60 | LGBFeatureSelectionWait 61 | ] 62 | 63 | self.post_order1s = [ 64 | TimeNum, 65 | ] 66 | 67 | self.merge_order1s = [ 68 | CatSegCtrOrigin, 69 | CatMeanEncoding, 70 | 71 | LGBFeatureSelectionLast, 72 | ] 73 | 74 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/feat/feat_selection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from util import timeclass,log 3 | import CONSTANT 4 | from model_input import FeatOutput 5 | from automl import autosample 6 | import gc 7 | import lightgbm as lgb 8 | import pandas as pd 9 | from .feat import Feat 10 | import time 11 | import numpy as np 12 | 13 | def lgb_train(X,y): 14 | num_boost_round = 100 15 | num_leaves = 63 16 | 17 | params = { 18 | 'boosting_type': 'gbdt', 19 | 'objective': 'binary', 20 | 'metric': "None", 21 | 'learning_rate': 0.1, 22 | 'num_leaves': num_leaves, 23 | 'max_depth': -1, 24 | 'min_child_samples': 20, 25 | 'max_bin':255, 26 | 'subsample': 0.9, 27 | 'subsample_freq': 1, 28 | 'colsample_bytree': 1, 29 | 'min_child_weight': 0.001, 30 | 'subsample_for_bin': 200000, 31 | 'min_split_gain': 0.02, 32 | 'reg_alpha': 0.1, 33 | 'reg_lambda': 0.1, 34 | 'seed': CONSTANT.SEED, 35 | 'nthread': CONSTANT.THREAD_NUM, 36 | } 37 | 38 | data = X.data 39 | 40 | y_train = y 41 | 42 | max_sample_num = min(len(y_train),50000) 43 | 
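# The selection probe trains 100 rounds of 63-leaf trees on at most 50k rows,
# keeping it cheap relative to the time budget. Note that 100 * (63 - 1) = 6200
# is the total split count of a fully grown forest, which appears to be why the
# selection classes below treat importances.sum() == 6200 as a sentinel value.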
y_train = autosample.downsampling_y(y_train,max_sample_num) 44 | 45 | X_train = data.loc[y_train.index] 46 | 47 | X.data = X_train 48 | feat_output = FeatOutput() 49 | X_train,y_train,categories = feat_output.fit_transform_output(X,y_train) 50 | 51 | X.data = data 52 | gc.collect() 53 | 54 | feat_name_cols = list(X_train.columns) 55 | feat_name_maps = { feat_name_cols[i] : str(i) for i in range(len(feat_name_cols)) } 56 | f_feat_name_maps = { str(i) : feat_name_cols[i] for i in range(len(feat_name_cols)) } 57 | new_feat_name_cols = [ feat_name_maps[i] for i in feat_name_cols ] 58 | X_train.columns = new_feat_name_cols 59 | 60 | dtrain = lgb.Dataset(X_train,y_train,feature_name=list(X_train.columns)) 61 | model = lgb.train(params,dtrain, 62 | num_boost_round=num_boost_round, 63 | categorical_feature=[], 64 | ) 65 | 66 | df_imp = pd.DataFrame({'features': [ f_feat_name_maps[i] for i in model.feature_name() ] , 67 | 'importances':model.feature_importance()}) 68 | 69 | df_imp.sort_values('importances',ascending=False,inplace=True) 70 | 71 | return df_imp 72 | 73 | class LGBFeatureSelection(Feat): 74 | @timeclass(cls='LGBFeatureSelection') 75 | def fit(self,X,y): 76 | now = time.time() 77 | log(f'LGBFeatureSelection:{now-self.config.start_time}') 78 | 79 | threshold = 5 80 | df_imp = lgb_train(X,y) 81 | log(f'importances sum {df_imp["importances"].sum()}') 82 | if df_imp["importances"].sum() != 6200: 83 | keep_feats = list(df_imp.loc[df_imp['importances'] >= threshold,'features']) 84 | if len(keep_feats) < 150: 85 | useful_feats = list(df_imp.loc[df_imp['importances'] > 0,'features']) 86 | if len(useful_feats) <= 150: 87 | keep_feats = useful_feats 88 | else: 89 | df_imp_sorted = df_imp.sort_values(by='importances',ascending=False) 90 | keep_feats = list(df_imp_sorted['features'].iloc[:150]) 91 | else: 92 | keep_feats = list(df_imp.loc[df_imp['importances'] >= threshold,'features']) 93 | 94 | log(f'keep feats num {len(keep_feats)}') 95 | 96 | keep_cats = [] 97 | 98 | keep_cats_set = set() 99 | cat_set = set(X.cat_cols) 100 | 101 | for feat in keep_feats: 102 | 103 | if X.col2type[feat] == CONSTANT.CATEGORY_TYPE: 104 | if feat in cat_set: 105 | if feat not in keep_cats_set: 106 | keep_cats_set.add(feat) 107 | keep_cats.append(feat) 108 | 109 | elif feat in X.col2source_cat: 110 | keep_feat = X.col2source_cat[feat] 111 | if keep_feat in cat_set: 112 | if keep_feat not in keep_cats_set: 113 | keep_cats_set.add(keep_feat) 114 | keep_cats.append(keep_feat) 115 | 116 | drop_feats = list(set(df_imp['features'].tolist()) - set(keep_feats)) 117 | 118 | drop_feats = list(set(drop_feats) - keep_cats_set) 119 | self.drop_feats = drop_feats 120 | log(f'total feat num:{df_imp.shape[0]}, drop feat num:{len(self.drop_feats)}') 121 | 122 | keep_nums = [] 123 | for feat in keep_feats: 124 | if X.col2type[feat] == CONSTANT.NUMERICAL_TYPE: 125 | keep_nums.append(feat) 126 | 127 | keep_binaries = [] 128 | for feat in keep_feats: 129 | if X.col2type[feat] == CONSTANT.BINARY_TYPE: 130 | keep_binaries.append(feat) 131 | 132 | assert(len(set(keep_cats) & set(drop_feats))==0) 133 | assert(len(set(keep_nums) & set(drop_feats))==0) 134 | assert(len(set(keep_binaries) & set(drop_feats))==0) 135 | 136 | X.reset_combine_cols(keep_cats,keep_nums,keep_binaries) 137 | 138 | @timeclass(cls='LGBFeatureSelection') 139 | def transform(self,X): 140 | X.drop_data(self.drop_feats) 141 | return self.drop_feats 142 | 143 | @timeclass(cls='LGBFeatureSelection') 144 | def fit_transform(self,X,y): 145 | self.fit(X,y) 146 | 
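# fit() has recorded self.drop_feats from the importance probe; transform()
# now drops those columns from X in place.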
self.transform(X) 147 | return self.drop_feats 148 | 149 | class LGBFeatureSelectionLast(Feat): 150 | @timeclass(cls='LGBFeatureSelectionLast') 151 | def fit(self,X,y): 152 | now = time.time() 153 | log(f'LGBFeatureSelectionLast:{now-self.config.start_time}') 154 | 155 | start_time = time.time() 156 | df_imp = lgb_train(X,y) 157 | 158 | data = X.data 159 | shape = data.shape 160 | y_pos = len(y[y==1]) 161 | y_neg = len(y[y==0]) 162 | unbalance_ratio = y_pos / y_neg if y_pos > y_neg else y_neg / y_pos 163 | memory_usage = pd.Series(np.zeros(shape[0]),dtype=np.float32).memory_usage() / 1024 / 1024 / 1024 164 | gc.collect() 165 | 166 | if unbalance_ratio >= 7: 167 | memory_constrain = 2 168 | elif unbalance_ratio >= 4: 169 | memory_constrain = 1.8 170 | else: 171 | memory_constrain = 1.6 172 | 173 | col_constrain = int(memory_constrain / memory_usage) 174 | 175 | end_time = time.time() 176 | 177 | use_time = end_time-start_time 178 | user_time_rate = use_time / self.config.budget 179 | 180 | if user_time_rate > 0.1: 181 | threshold = 13 182 | elif user_time_rate > 0.09: 183 | threshold = 12 184 | elif user_time_rate > 0.08: 185 | threshold = 11 186 | elif user_time_rate > 0.07: 187 | threshold = 10 188 | elif user_time_rate > 0.06: 189 | threshold = 9 190 | elif user_time_rate > 0.05: 191 | threshold = 8 192 | elif user_time_rate > 0.04: 193 | threshold = 7 194 | elif user_time_rate > 0.03: 195 | threshold = 6 196 | else: 197 | threshold = 5 198 | 199 | log(f'LGBFeatureSelectionLast threshold {threshold}') 200 | 201 | log(f'importances sum {df_imp["importances"].sum()}') 202 | if df_imp["importances"].sum() != 6200: 203 | keep_feats = list(df_imp.loc[df_imp['importances'] >= threshold,'features']) 204 | if len(keep_feats) < 150: 205 | useful_feats = list(df_imp.loc[df_imp['importances'] > 0,'features']) 206 | if len(useful_feats) <= 150: 207 | keep_feats = useful_feats 208 | else: 209 | df_imp_sorted = df_imp.sort_values(by='importances',ascending=False) 210 | keep_feats = list(df_imp_sorted['features'].iloc[:150]) 211 | else: 212 | keep_feats = list(df_imp.loc[df_imp['importances'] >= threshold,'features']) 213 | 214 | keep_cats = [] 215 | 216 | keep_cats_set = set() 217 | cat_set = set(X.cat_cols) 218 | 219 | for feat in keep_feats: 220 | 221 | if X.col2type[feat] == CONSTANT.CATEGORY_TYPE: 222 | if feat in cat_set: 223 | if feat not in keep_cats_set: 224 | keep_cats_set.add(feat) 225 | keep_cats.append(feat) 226 | 227 | elif feat in X.col2source_cat: 228 | keep_feat = X.col2source_cat[feat] 229 | if keep_feat in cat_set: 230 | if keep_feat not in keep_cats_set: 231 | keep_cats_set.add(keep_feat) 232 | keep_cats.append(keep_feat) 233 | 234 | drop_feats = list(set(df_imp['features'].tolist()) - set(keep_feats)) 235 | 236 | drop_feats = list(set(drop_feats) - keep_cats_set) 237 | self.drop_feats = drop_feats 238 | log(f'total feat num:{df_imp.shape[0]}, drop feat num:{len(self.drop_feats)}') 239 | 240 | keep_nums = [] 241 | for feat in keep_feats: 242 | if X.col2type[feat] == CONSTANT.NUMERICAL_TYPE: 243 | keep_nums.append(feat) 244 | 245 | keep_binaries = [] 246 | for feat in keep_feats: 247 | if X.col2type[feat] == CONSTANT.BINARY_TYPE: 248 | keep_binaries.append(feat) 249 | 250 | assert(len(set(keep_cats) & set(drop_feats))==0) 251 | assert(len(set(keep_nums) & set(drop_feats))==0) 252 | assert(len(set(keep_binaries) & set(drop_feats))==0) 253 | 254 | X.reset_combine_cols(keep_cats,keep_nums,keep_binaries) 255 | 256 | rest_cols = len(df_imp) - len(self.drop_feats) 257 | if rest_cols > 
col_constrain: 258 | real_keep_feats = set(df_imp['features'].iloc[:col_constrain].tolist()) 259 | real_drop_feats = list(set(df_imp['features'].tolist()) - real_keep_feats) 260 | self.drop_feats = real_drop_feats 261 | 262 | @timeclass(cls='LGBFeatureSelectionLast') 263 | def transform(self,X): 264 | X.drop_data(self.drop_feats) 265 | return self.drop_feats 266 | 267 | @timeclass(cls='LGBFeatureSelectionLast') 268 | def fit_transform(self,X,y): 269 | self.fit(X,y) 270 | self.transform(X) 271 | return self.drop_feats 272 | 273 | class LGBFeatureSelectionWait(Feat): 274 | @timeclass(cls='LGBFeatureSelectionWait') 275 | def fit(self,X,y): 276 | now = time.time() 277 | log(f'LGBFeatureSelection:{now-self.config.start_time}') 278 | 279 | threshold = 5 280 | df_imp = lgb_train(X,y) 281 | drop_feats = set(df_imp.loc[df_imp['importances'] < threshold,'features']) 282 | keep_feats = list(df_imp.loc[df_imp['importances'] >= threshold,'features']) 283 | 284 | df_imp.set_index('features',inplace=True) 285 | for cols in X.wait_selection_cols: 286 | drops = df_imp.loc[cols].sort_values(by='importances',ascending=False).index[self.config.wait_feat_selection_num:] 287 | drops = set(drops) 288 | drop_feats = drop_feats | drops 289 | 290 | keep_cats = [] 291 | 292 | keep_cats_set = set() 293 | cat_set = set(X.cat_cols) 294 | for feat in keep_feats: 295 | 296 | if X.col2type[feat] == CONSTANT.CATEGORY_TYPE: 297 | if feat in cat_set: 298 | if feat not in keep_cats_set: 299 | keep_cats_set.add(feat) 300 | keep_cats.append(feat) 301 | 302 | elif feat in X.col2source_cat: 303 | keep_feat = X.col2source_cat[feat] 304 | if keep_feat in cat_set: 305 | if keep_feat not in keep_cats_set: 306 | keep_cats_set.add(keep_feat) 307 | keep_cats.append(keep_feat) 308 | 309 | 310 | drop_feats = drop_feats - keep_cats_set 311 | drop_feats = list(drop_feats) 312 | self.drop_feats = drop_feats 313 | X.empty_wait_selection_cols() 314 | log(f'total feat num:{df_imp.shape[0]}, drop feat num:{len(self.drop_feats)}') 315 | 316 | assert(len(set(keep_cats) & set(drop_feats))==0) 317 | 318 | @timeclass(cls='LGBFeatureSelectionWait') 319 | def transform(self,X): 320 | X.drop_data(self.drop_feats) 321 | return self.drop_feats 322 | 323 | @timeclass(cls='LGBFeatureSelectionWait') 324 | def fit_transform(self,X,y): 325 | self.fit(X,y) 326 | self.transform(X) 327 | return self.drop_feats 328 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/feat/merge_feat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | class MergeFeat: 4 | def __init__(self,key): 5 | self.key = key 6 | 7 | def fit(self,U,V): 8 | pass 9 | 10 | def transform(self,U,V): 11 | pass 12 | 13 | def fit_transform(self,U,V): 14 | pass 15 | 16 | class PreTimeM2M(MergeFeat): 17 | pass 18 | 19 | class PreO2O(MergeFeat): 20 | pass 21 | 22 | class PreM2O(MergeFeat): 23 | pass 24 | 25 | class PreO2M(MergeFeat): 26 | pass 27 | 28 | class PreM2M(MergeFeat): 29 | pass 30 | 31 | class O2O(MergeFeat): 32 | pass 33 | 34 | class M2O(MergeFeat): 35 | pass 36 | 37 | class O2M(MergeFeat): 38 | pass 39 | 40 | 41 | class M2M(MergeFeat): 42 | pass 43 | 44 | class TimeM2M(MergeFeat): 45 | pass 46 | 47 | class CmjTimeM2M(MergeFeat): 48 | def __init__(self,key,time_key,u_key_time_col): 49 | self.key = key 50 | self.time_key = time_key 51 | self.u_key_time_col = u_key_time_col 52 | 53 | def fit(self,T): 54 | pass 55 | 56 | def transform(self,T): 57 | pass 58 | 59 | def 
fit_transform(self,T): 60 | pass -------------------------------------------------------------------------------- /auto_smart/auto_smart/feat/merge_feat_pipeline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .default_merge_feat import * 3 | 4 | class MergeFeatPipeline: 5 | def __init__(self): 6 | self.preM2Ms = [] 7 | self.preO2Ms = [] 8 | 9 | self.TimeM2Ms = [] 10 | self.newTimeM2Ms = [] 11 | 12 | self.O2Ms = [] 13 | self.M2Ms = [] 14 | 15 | self.preM2Os = [] 16 | self.preO2Os = [] 17 | 18 | self.O2Os = [] 19 | self.M2Os = [] 20 | 21 | 22 | class DeafultMergeFeatPipeline(MergeFeatPipeline): 23 | def __init__(self): 24 | super(DeafultMergeFeatPipeline,self).__init__() 25 | 26 | self.main_init() 27 | 28 | def main_init(self): 29 | 30 | self.newTimeM2Ms = [TimeM2MnewLastData] 31 | 32 | self.preM2Ms = [] 33 | self.M2Ms = [M2MKeyCount, M2MNumMean,M2MDataLast] 34 | 35 | self.preO2Ms = [] 36 | self.O2Ms = [M2MKeyCount, M2MNumMean,M2MDataLast] 37 | 38 | self.preO2Os = [] 39 | self.O2Os = [M2OJoin] 40 | 41 | self.preM2Os = [] 42 | self.M2Os = [M2OJoin] 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/feat_context.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import CONSTANT 3 | 4 | class FeatContext: 5 | @staticmethod 6 | def gen_feat_name(namespace,cls_name,feat_name,feat_type): 7 | prefix = CONSTANT.type2prefix[feat_type] 8 | 9 | 10 | return f"{prefix}{cls_name}:{feat_name}:{namespace}" 11 | 12 | @staticmethod 13 | def gen_merge_name(table_name,feat_name,feat_type): 14 | prefix = CONSTANT.type2prefix[feat_type] 15 | return f"{prefix}{table_name}.({feat_name})" 16 | 17 | @staticmethod 18 | def gen_merge_feat_name(namespace,cls_name,feat_name,feat_type,table_name): 19 | feat_name = FeatContext.gen_feat_name(namespace,cls_name,feat_name,feat_type) 20 | return FeatContext.gen_merge_name(table_name,feat_name,feat_type) 21 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/feat_engine.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from feat.feat_pipeline import FeatPipeline 4 | from util import timeclass 5 | 6 | class FeatEngine: 7 | def __init__(self, feat_pipeline: FeatPipeline, config): 8 | self.feat_pipeline = feat_pipeline 9 | self.config = config 10 | 11 | @timeclass(cls='FeatEngine') 12 | def fit_order1(self,table,y): 13 | self.feats_order1 = [] 14 | for feat_cls in self.feat_pipeline.order1s: 15 | feat = feat_cls(self.config) 16 | feat.fit(table,y) 17 | self.feats_order1.append(feat) 18 | 19 | @timeclass(cls='FeatEngine') 20 | def transform_order1(self,table): 21 | for feat in self.feats_order1: 22 | feat.transform(table) 23 | 24 | @timeclass(cls='FeatEngine') 25 | def fit_transform_order1(self,table,y): 26 | self.feats_order1 = [] 27 | for feat_cls in self.feat_pipeline.order1s: 28 | feat = feat_cls(self.config) 29 | feat.fit_transform(table,y) 30 | self.feats_order1.append(feat) 31 | 32 | 33 | @timeclass(cls='FeatEngine') 34 | def fit_keys_order2(self,table,y): 35 | self.feats_keys_order2 = [] 36 | for feat_cls in self.feat_pipeline.keys_order2s: 37 | feat = feat_cls(self.config) 38 | feat.fit(table,y) 39 | self.feats_keys_order2.append(feat) 40 | 41 | 
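# FeatEngine applies the pipeline in stages -- order1s, keys_order2s,
# all_order2s, (optionally keys_order3s), post_order1s, then merge_order1s --
# and every stage exposes the same fit / transform / fit_transform triad.
# Most stages end with an LGB-based selection feat that prunes what the
# stage just added.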
@timeclass(cls='FeatEngine') 42 | def transform_keys_order2(self,table): 43 | for feat in self.feats_keys_order2: 44 | feat.transform(table) 45 | 46 | @timeclass(cls='FeatEngine') 47 | def fit_transform_keys_order2(self,table,y,sample=False,selection=True): 48 | if not self.feat_pipeline.keys_order2s: 49 | return 50 | 51 | if sample: 52 | self.feats_keys_order2 = [] 53 | self.keys_order2_new_cols = [] 54 | for feat_cls in self.feat_pipeline.keys_order2s[:-1]: 55 | feat = feat_cls(self.config) 56 | new_cols = feat.fit_transform(table,y) 57 | self.feats_keys_order2.append(feat) 58 | self.keys_order2_new_cols.append(set(new_cols)) 59 | 60 | feat_cls = self.feat_pipeline.keys_order2s[-1] 61 | feat = feat_cls(self.config) 62 | drop_feats = set(feat.fit_transform(table,y)) 63 | self.feats_keys_order2.append(feat) 64 | for i in range(len(self.keys_order2_new_cols)): 65 | self.keys_order2_new_cols[i] = (set(self.keys_order2_new_cols[i]) - drop_feats) 66 | 67 | if not sample: 68 | if selection: 69 | self.feats_keys_order2 = [] 70 | for i,feat_cls in enumerate(self.feat_pipeline.keys_order2s): 71 | feat = feat_cls(self.config) 72 | feat.fit_transform(table,y) 73 | self.feats_keys_order2.append(feat) 74 | if not selection: 75 | for i,feat_cls in enumerate(self.feat_pipeline.keys_order2s[:-1]): 76 | feat = feat_cls(self.config) 77 | feat.fit_transform(table,y,self.keys_order2_new_cols[i]) 78 | self.feats_keys_order2.append(feat) 79 | 80 | @timeclass(cls='FeatEngine') 81 | def fit_all_order2(self,table,y): 82 | self.feats_all_order2 = [] 83 | for feat_cls in self.feat_pipeline.all_order2s: 84 | feat = feat_cls(self.config) 85 | feat.fit(table,y) 86 | self.feats_all_order2.append(feat) 87 | 88 | @timeclass(cls='FeatEngine') 89 | def transform_all_order2(self,table): 90 | for feat in self.feats_all_order2: 91 | feat.transform(table) 92 | 93 | @timeclass(cls='FeatEngine') 94 | def fit_transform_all_order2(self,table,y,sample=False,selection=True): 95 | if not self.feat_pipeline.all_order2s: 96 | return 97 | 98 | if sample: 99 | self.feats_all_order2 = [] 100 | self.all_order2_new_cols = [] 101 | for feat_cls in self.feat_pipeline.all_order2s[:-1]: 102 | feat = feat_cls(self.config) 103 | new_cols = feat.fit_transform(table,y) 104 | self.feats_all_order2.append(feat) 105 | self.all_order2_new_cols.append(set(new_cols)) 106 | 107 | feat_cls = self.feat_pipeline.all_order2s[-1] 108 | feat = feat_cls(self.config) 109 | drop_feats = set(feat.fit_transform(table,y)) 110 | self.feats_all_order2.append(feat) 111 | for i in range(len(self.all_order2_new_cols)): 112 | self.all_order2_new_cols[i] = set(self.all_order2_new_cols[i]) - drop_feats 113 | 114 | if not sample: 115 | if selection: 116 | self.feats_all_order2 = [] 117 | for i,feat_cls in enumerate(self.feat_pipeline.all_order2s): 118 | feat = feat_cls(self.config) 119 | feat.fit_transform(table,y) 120 | self.feats_all_order2.append(feat) 121 | if not selection: 122 | for i,feat_cls in enumerate(self.feat_pipeline.all_order2s[:-1]): 123 | feat = feat_cls(self.config) 124 | feat.fit_transform(table,y,self.all_order2_new_cols[i]) 125 | self.feats_all_order2.append(feat) 126 | 127 | @timeclass(cls='FeatEngine') 128 | def fit_keys_order3(self,table,y): 129 | self.feats_keys_order3 = [] 130 | for feat_cls in self.feat_pipeline.keys_order3s: 131 | feat = feat_cls(self.config) 132 | feat.fit(table,y) 133 | self.feats_keys_order3.append(feat) 134 | 135 | @timeclass(cls='FeatEngine') 136 | def transform_keys_order3(self,table): 137 | for feat in 
self.feats_keys_order3: 138 | feat.transform(table) 139 | 140 | @timeclass(cls='FeatEngine') 141 | def fit_transform_keys_order3(self,table,y): 142 | self.feats_keys_order3 = [] 143 | for feat_cls in self.feat_pipeline.keys_order3s: 144 | feat = feat_cls(self.config) 145 | feat.fit_transform(table,y) 146 | self.feats_keys_order3.append(feat) 147 | 148 | 149 | @timeclass(cls='FeatEngine') 150 | def fit_post_order1(self,table,y): 151 | self.feats_post_order1 = [] 152 | for feat_cls in self.feat_pipeline.post_order1s: 153 | feat = feat_cls(self.config) 154 | feat.fit(table,y) 155 | self.feats_post_order1.append(feat) 156 | 157 | @timeclass(cls='FeatEngine') 158 | def transform_post_order1(self,table): 159 | for feat in self.feats_post_order1: 160 | feat.transform(table) 161 | 162 | @timeclass(cls='FeatEngine') 163 | def fit_transform_post_order1(self,table,y): 164 | self.feats_post_order1 = [] 165 | for feat_cls in self.feat_pipeline.post_order1s: 166 | feat = feat_cls(self.config) 167 | feat.fit_transform(table,y) 168 | self.feats_post_order1.append(feat) 169 | 170 | @timeclass(cls='FeatEngine') 171 | def fit_merge_order1(self,table,y): 172 | self.feats_merge_order1 = [] 173 | for feat_cls in self.feat_pipeline.merge_order1s: 174 | feat = feat_cls(self.config) 175 | feat.fit(table,y) 176 | self.feats_merge_order1.append(feat) 177 | 178 | @timeclass(cls='FeatEngine') 179 | def transform_merge_order1(self,table): 180 | for feat in self.feats_merge_order1: 181 | feat.transform(table) 182 | 183 | @timeclass(cls='FeatEngine') 184 | def fit_transform_merge_order1(self,table,y): 185 | self.feats_merge_order1 = [] 186 | for feat_cls in self.feat_pipeline.merge_order1s: 187 | feat = feat_cls(self.config) 188 | feat.fit_transform(table,y) 189 | self.feats_merge_order1.append(feat) -------------------------------------------------------------------------------- /auto_smart/auto_smart/merger.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas as pd 3 | 4 | import CONSTANT 5 | from util import log, timeclass 6 | from feat.merge_feat_pipeline import MergeFeatPipeline 7 | import copy 8 | import gc 9 | from data_tools import downcast 10 | 11 | class Merger: 12 | def __init__(self,merge_feat_pipeline: MergeFeatPipeline): 13 | self.merge_feat_pipeline = merge_feat_pipeline 14 | 15 | 16 | 17 | @timeclass(cls='Merger') 18 | def merge(self,key,u,v,ttype,z2f): 19 | feats = [] 20 | col2type = {} 21 | col2groupby = {} 22 | col2block = {} 23 | 24 | if u.key_time_col is not None and v.key_time_col is not None and ttype=='many_to_many': 25 | 26 | if z2f and self.merge_timem2m and (key in u.user_cols): 27 | self.merge_timem2m = False 28 | for merge_feat_cls in self.merge_feat_pipeline.newTimeM2Ms: 29 | merge_feat = merge_feat_cls(key) 30 | merge_feat.fit_transform(u,v) 31 | 32 | for merge_feat_cls in self.merge_feat_pipeline.preM2Ms: 33 | merge_feat = merge_feat_cls(key) 34 | merge_feat.fit_transform(u,v) 35 | 36 | for merge_feat_cls in self.merge_feat_pipeline.M2Ms: 37 | merge_feat = merge_feat_cls(key) 38 | v_feat,v_col2type,v_col2block = merge_feat.fit_transform(u,v) 39 | feats.append(v_feat) 40 | col2type.update(v_col2type) 41 | col2block.update(v_col2block) 42 | 43 | elif ttype == 'one_to_one': 44 | for merge_feat_cls in self.merge_feat_pipeline.preO2Os: 45 | merge_feat = merge_feat_cls(key) 46 | merge_feat.fit_transform(u,v) 47 | 48 | for merge_feat_cls in self.merge_feat_pipeline.O2Os: 49 | merge_feat = merge_feat_cls(key) 50 | 
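# NOTE: each merge-feature class used in these loops shares one contract:
# fit_transform(u, v) returns (frame of new columns for u, col->type map,
# col->block map), which Merger.merge accumulates per relation type and
# joins onto the parent table through `key` further below. The pre-steps
# (preO2Os, preM2Ms, ...) are invoked only for their side effects on v.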
v_feat,v_col2type,v_col2block = merge_feat.fit_transform(u,v) 51 | feats.append(v_feat) 52 | col2type.update(v_col2type) 53 | col2block.update(v_col2block) 54 | 55 | elif ttype == 'many_to_one': 56 | for merge_feat_cls in self.merge_feat_pipeline.preM2Os: 57 | merge_feat = merge_feat_cls(key) 58 | merge_feat.fit_transform(u,v) 59 | 60 | for merge_feat_cls in self.merge_feat_pipeline.M2Os: 61 | merge_feat = merge_feat_cls(key) 62 | v_feat,v_col2type,v_col2block = merge_feat.fit_transform(u,v) 63 | feats.append(v_feat) 64 | col2type.update(v_col2type) 65 | col2block.update(v_col2block) 66 | 67 | elif ttype == 'one_to_many': 68 | for merge_feat_cls in self.merge_feat_pipeline.preO2Ms: 69 | merge_feat = merge_feat_cls(key) 70 | merge_feat.fit_transform(u,v) 71 | 72 | for merge_feat_cls in self.merge_feat_pipeline.O2Ms: 73 | merge_feat = merge_feat_cls(key) 74 | v_feat,v_col2type,v_col2block = merge_feat.fit_transform(u,v) 75 | feats.append(v_feat) 76 | col2type.update(v_col2type) 77 | col2block.update(v_col2block) 78 | 79 | elif ttype == 'many_to_many': 80 | for merge_feat_cls in self.merge_feat_pipeline.preM2Ms: 81 | merge_feat = merge_feat_cls(key) 82 | merge_feat.fit_transform(u,v) 83 | 84 | for merge_feat_cls in self.merge_feat_pipeline.M2Ms: 85 | merge_feat = merge_feat_cls(key) 86 | v_feat,v_col2type,v_col2block = merge_feat.fit_transform(u,v) 87 | feats.append(v_feat) 88 | col2type.update(v_col2type) 89 | col2block.update(v_col2block) 90 | if feats: 91 | feat = pd.concat(feats,axis=1) 92 | col2groupby = {col:key for col in feat.columns} 93 | 94 | del feats,v 95 | gc.collect() 96 | 97 | data = u.data 98 | index = data.index 99 | data.set_index(key,inplace=True) 100 | 101 | cols = list(feat.columns) 102 | data[cols] = feat 103 | data.reset_index(key,inplace=True) 104 | data[key] = downcast(data[key],accuracy_loss=False) 105 | data.index= index 106 | 107 | u.update_data(data,col2type,col2groupby,None,col2block,None) 108 | 109 | @timeclass(cls='Merger') 110 | def dfs(self,u_name, graph): 111 | depth = graph.depth 112 | name2table = graph.name2table 113 | rel_graph = graph.rel_graph 114 | 115 | u = name2table[u_name] 116 | log(f"enter {u_name}") 117 | for edge in rel_graph[u_name]: 118 | v_name = edge['to'] 119 | if depth[v_name]['depth'] <= depth[u_name]['depth']: 120 | continue 121 | 122 | v = self.dfs(v_name, graph) 123 | key = edge['key'] 124 | assert len(key) == 1 125 | key = key[0] 126 | type_ = edge['type'] 127 | 128 | log(f"join {u_name} <--{type_}--t {v_name}") 129 | self.merge(key,u,v,type_,0) 130 | 131 | log(f"join {u_name} <--{type_}--nt {v_name}") 132 | 133 | del v 134 | 135 | log(f"leave {u_name}") 136 | return u 137 | 138 | @timeclass(cls='Merger') 139 | def merge_to_main_fit_transform(self,graph): 140 | depth = graph.depth 141 | name2table = graph.name2table 142 | 143 | u_name = CONSTANT.MAIN_TABLE_NAME 144 | u = name2table[u_name] 145 | rel_graph = graph.rel_graph 146 | 147 | table2feat = {} 148 | for edge in rel_graph[u_name]: 149 | v_name = edge['to'] 150 | if depth[v_name]['depth'] <= depth[u_name]['depth']: 151 | continue 152 | 153 | v = name2table[v_name] 154 | key = edge['key'] 155 | assert len(key) == 1 156 | key = key[0] 157 | type_ = edge['type'] 158 | 159 | log(f"join {u_name} <--{type_}--t {v_name}") 160 | table2feat[v_name] = self.merge(key,u,v,type_,1) 161 | log(f"join {u_name} <--{type_}--nt {v_name}") 162 | 163 | self.table2feat = table2feat 164 | return u 165 | 166 | @timeclass(cls='Merger') 167 | def merge_table(self,graph): 168 | self.use_all_time_m2m = 
False 169 | if graph.M2M_relation_cnt < 3: 170 | self.use_all_time_m2m = True 171 | 172 | self.merge_timem2m = True 173 | 174 | graph.build_depth() 175 | 176 | depth = graph.depth 177 | u_name = CONSTANT.MAIN_TABLE_NAME 178 | rel_graph = graph.rel_graph 179 | 180 | for edge in rel_graph[u_name]: 181 | v_name = edge['to'] 182 | if depth[v_name]['depth'] <= depth[u_name]['depth']: 183 | continue 184 | 185 | self.dfs(v_name,graph) 186 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/metadata: -------------------------------------------------------------------------------- 1 | description: Provides prediction model to be executed by the ingestion program -------------------------------------------------------------------------------- /auto_smart/auto_smart/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import copy 6 | import CONSTANT 7 | from util import log, timeclass 8 | from table.graph import Graph 9 | from sklearn.metrics import roc_auc_score 10 | from feat.merge_feat_pipeline import DeafultMergeFeatPipeline 11 | from feat.feat_pipeline import DefaultFeatPipeline 12 | 13 | from merger import Merger 14 | from feat_engine import FeatEngine 15 | from model_input import FeatOutput 16 | from automl.model_selection import time_train_test_split 17 | from automl.auto_lgb import AutoLGB 18 | from PATHS import feature_importance_path,version 19 | from datetime import datetime 20 | import gc 21 | from config import Config 22 | import time 23 | 24 | class Model: 25 | auc = [] 26 | ensemble_auc = [] 27 | ensemble_train_auc = [] 28 | 29 | def __init__(self, info): 30 | self.info = copy.deepcopy(info) 31 | self.tables = None 32 | 33 | def shuffle(self,X,y,random_state): 34 | idx = np.arange(len(X)) 35 | np.random.shuffle(idx) 36 | X = X.iloc[idx] 37 | y = y.iloc[idx] 38 | return X,y 39 | 40 | def release_tables(self,Xs,graph): 41 | 42 | for name in graph.tables: 43 | del Xs[name] 44 | del graph.name2table[name] 45 | 46 | gc.collect() 47 | 48 | @timeclass(cls='Model') 49 | def my_fit(self, Xs, y,X_test): 50 | np.random.seed(CONSTANT.SEED) 51 | 52 | split = CONSTANT.SPLIT 53 | 54 | self.split = split 55 | 56 | log(f'split {split}') 57 | 58 | if split == -1: 59 | config = Config(time.time(),self.info['time_budget']) 60 | 61 | X_test.index = -X_test.index-1 62 | 63 | main_shape = Xs[CONSTANT.MAIN_TABLE_NAME].shape[0] 64 | main_max_shape = 2888888 65 | main_min_shape = min( main_shape,100000 ) 66 | 67 | test_shape = X_test.shape[0] 68 | max_accept_shape = 3999999 69 | 70 | if main_shape + test_shape > max_accept_shape: 71 | sample_main_shape = max_accept_shape - test_shape 72 | if sample_main_shape > main_max_shape: 73 | sample_main_shape = main_max_shape 74 | if sample_main_shape < main_min_shape: 75 | sample_main_shape = main_min_shape 76 | log(f'start sample main table. 
origin main shape {main_shape} test shape {test_shape} sample rows num {sample_main_shape}') 77 | if 'time_col' in self.info: 78 | key_time_col = self.info['time_col'] 79 | if key_time_col in Xs[CONSTANT.MAIN_TABLE_NAME].columns: 80 | Xs[CONSTANT.MAIN_TABLE_NAME].sort_values(by=key_time_col,inplace=True) 81 | Xs[CONSTANT.MAIN_TABLE_NAME] = Xs[CONSTANT.MAIN_TABLE_NAME].iloc[-sample_main_shape:] 82 | gc.collect() 83 | 84 | 85 | Xs[CONSTANT.MAIN_TABLE_NAME] = pd.concat([Xs[CONSTANT.MAIN_TABLE_NAME], X_test]) 86 | 87 | X_test.drop(X_test.columns,axis=1,inplace=True) 88 | gc.collect() 89 | 90 | graph = Graph(self.info,Xs) 91 | graph.sort_tables() 92 | train_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[Xs[CONSTANT.MAIN_TABLE_NAME].index>=0] 93 | y = y.loc[train_index] 94 | test_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[Xs[CONSTANT.MAIN_TABLE_NAME].index<0] 95 | 96 | graph.preprocess_fit_transform() 97 | gc.collect() 98 | 99 | merge_feat_pipeline = DeafultMergeFeatPipeline() 100 | merger = Merger(merge_feat_pipeline) 101 | 102 | merger.merge_table(graph) 103 | main_table = merger.merge_to_main_fit_transform(graph) 104 | self.release_tables(Xs,graph) 105 | del merger 106 | del graph 107 | gc.collect() 108 | 109 | feat_pipeline = DefaultFeatPipeline() 110 | feat_engine = FeatEngine(feat_pipeline,config) 111 | feat_engine.fit_transform_order1(main_table,y) 112 | 113 | sample_for_combine_features = True 114 | 115 | if sample_for_combine_features: 116 | main_data = main_table.data 117 | train_data = main_data.loc[main_data.index>=0] 118 | 119 | del main_data 120 | 121 | sample_num = CONSTANT.SAMPLE_NUM 122 | train_shape = train_data.shape 123 | 124 | if train_shape[0] <= sample_num: 125 | sample_for_combine_features = False 126 | else: 127 | data_tail_new = train_data.iloc[-sample_num:] 128 | 129 | gc.collect() 130 | 131 | y_tail_new = y.loc[data_tail_new.index] 132 | 133 | table_tail_new = copy.deepcopy(main_table) 134 | table_tail_new.data = data_tail_new 135 | 136 | del data_tail_new 137 | gc.collect() 138 | 139 | feat_engine.fit_transform_all_order2(table_tail_new,y_tail_new,sample=True) 140 | feat_engine.fit_transform_keys_order2(table_tail_new,y_tail_new,sample=True) 141 | 142 | del table_tail_new,y_tail_new 143 | gc.collect() 144 | 145 | feat_engine.fit_transform_all_order2(main_table,y,selection=False) 146 | feat_engine.fit_transform_keys_order2(main_table,y,selection=False) 147 | 148 | feat_engine.fit_transform_post_order1(main_table,y) 149 | 150 | if not sample_for_combine_features: 151 | gc.collect() 152 | 153 | feat_engine.fit_transform_all_order2(main_table,y) 154 | feat_engine.fit_transform_keys_order2(main_table,y) 155 | 156 | feat_engine.fit_transform_keys_order3(main_table,y) 157 | feat_engine.fit_transform_post_order1(main_table,y) 158 | 159 | 160 | del feat_engine 161 | gc.collect() 162 | 163 | 164 | X_test = main_table.data.loc[test_index] 165 | main_table.data = main_table.data.loc[train_index] 166 | 167 | gc.collect() 168 | 169 | test_table = copy.deepcopy(main_table) 170 | test_table.data = X_test 171 | self.test_table = test_table 172 | len_test = X_test.shape[0] 173 | gc.collect() 174 | 175 | feat_engine = FeatEngine(feat_pipeline,config) 176 | feat_engine.fit_transform_merge_order1(main_table,y) 177 | self.feat_engine = feat_engine 178 | 179 | feat_output = FeatOutput() 180 | self.feat_output = feat_output 181 | X,y,categories = feat_output.final_fit_transform_output(main_table,y) 182 | 183 | del main_table 184 | gc.collect() 185 | 186 | lgb = AutoLGB() 187 | 188 | 
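# NOTE: training order below: estimate base LightGBM parameters on the full
# matrix, tune them against a time-ordered holdout, then retrain an ensemble
# on the shuffled full data. time_train_test_split is assumed to cut the
# time-sorted rows so the most recent fraction becomes the validation fold,
# roughly (a sketch of the assumed contract, not the exact implementation):
#
#     cut = int(len(X) * (1 - 0.2))
#     X_train, y_train = X.iloc[:cut], y.iloc[:cut]
#     X_test,  y_test  = X.iloc[cut:], y.iloc[cut:]
#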
lgb.param_compute(X,y,categories,config) 189 | X_train,y_train,X_test,y_test = time_train_test_split(X,y,test_rate=0.2) 190 | 191 | lgb.param_opt_new(X_train,y_train,X_test,y_test,categories) 192 | 193 | gc.collect() 194 | 195 | del X_train,y_train,X_test,y_test 196 | 197 | gc.collect() 198 | 199 | X,y = self.shuffle(X,y,2019) 200 | gc.collect() 201 | 202 | lgb.ensemble_train(X,y,categories,config,len_test) 203 | 204 | gc.collect() 205 | 206 | importances = lgb.get_ensemble_importances() 207 | 208 | self.model = lgb 209 | del X,y 210 | 211 | elif split == -2: 212 | 213 | config = Config(time.time(),self.info['time_budget']) 214 | 215 | Xs[CONSTANT.MAIN_TABLE_NAME] = pd.concat([Xs[CONSTANT.MAIN_TABLE_NAME], ]) 216 | 217 | gc.collect() 218 | 219 | graph = Graph(self.info,Xs) 220 | graph.sort_tables() 221 | train_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[Xs[CONSTANT.MAIN_TABLE_NAME].index>=0] 222 | y = y.loc[train_index] 223 | 224 | graph.preprocess_fit_transform() 225 | gc.collect() 226 | 227 | merge_feat_pipeline = DeafultMergeFeatPipeline() 228 | merger = Merger(merge_feat_pipeline) 229 | 230 | merger.merge_table(graph) 231 | main_table = merger.merge_to_main_fit_transform(graph) 232 | self.release_tables(Xs,graph) 233 | del merger 234 | del graph 235 | gc.collect() 236 | 237 | feat_pipeline = DefaultFeatPipeline() 238 | feat_engine = FeatEngine(feat_pipeline,config) 239 | feat_engine.fit_transform_order1(main_table,y) 240 | 241 | sample_for_combine_features = True 242 | 243 | if sample_for_combine_features: 244 | main_data = main_table.data 245 | train_data = main_data.loc[main_data.index>=0] 246 | 247 | del main_data 248 | 249 | sample_num = CONSTANT.SAMPLE_NUM 250 | train_shape = train_data.shape 251 | 252 | if train_shape[0] <= sample_num: 253 | sample_for_combine_features = False 254 | else: 255 | data_tail_new = train_data.iloc[-sample_num:] 256 | 257 | gc.collect() 258 | log(f'sample data shape {data_tail_new.shape}') 259 | 260 | y_tail_new = y.loc[data_tail_new.index] 261 | 262 | table_tail_new = copy.deepcopy(main_table) 263 | table_tail_new.data = data_tail_new 264 | 265 | del data_tail_new 266 | gc.collect() 267 | 268 | feat_engine.fit_transform_all_order2(table_tail_new,y_tail_new,sample=True) 269 | feat_engine.fit_transform_keys_order2(table_tail_new,y_tail_new,sample=True) 270 | 271 | del table_tail_new,y_tail_new 272 | gc.collect() 273 | 274 | feat_engine.fit_transform_all_order2(main_table,y,selection=False) 275 | feat_engine.fit_transform_keys_order2(main_table,y,selection=False) 276 | feat_engine.fit_transform_post_order1(main_table,y) 277 | 278 | if not sample_for_combine_features: 279 | gc.collect() 280 | 281 | feat_engine.fit_transform_all_order2(main_table,y) 282 | feat_engine.fit_transform_keys_order2(main_table,y) 283 | feat_engine.fit_transform_keys_order3(main_table,y) 284 | feat_engine.fit_transform_post_order1(main_table,y) 285 | 286 | del feat_engine 287 | gc.collect() 288 | 289 | main_table.data = main_table.data.loc[train_index] 290 | 291 | gc.collect() 292 | 293 | def split_table(table,y): 294 | X = table.data 295 | X_train,y_train,X_test,y_test = time_train_test_split(X,y,shuffle=False,test_rate=0.2) 296 | table1 = copy.deepcopy(table) 297 | table1.data = X_train 298 | table2 = copy.deepcopy(table) 299 | table2.data = X_test 300 | return table1,y_train,table2,y_test 301 | 302 | table1,y_train,table2,y_test = split_table(main_table,y) 303 | 304 | feat_engine = FeatEngine(feat_pipeline,config) 305 | feat_engine.fit_transform_merge_order1(table1,y_train) 306 
| self.feat_engine = feat_engine 307 | 308 | feat_output = FeatOutput() 309 | self.feat_output = feat_output 310 | 311 | X_train,y_train,categories = feat_output.fit_transform_output(table1,y_train) 312 | 313 | gc.collect() 314 | self.feat_engine.transform_merge_order1(table2) 315 | X_test = self.feat_output.transform_output(table2) 316 | 317 | lgb = AutoLGB() 318 | 319 | lgb.param_compute(X_train,y_train,categories,config) 320 | 321 | lgb.param_opt_new(X_train,y_train,X_test,y_test,categories) 322 | 323 | len_test = X_test.shape[0] 324 | 325 | lgb.ensemble_train(X_train,y_train,categories,config,len_test) 326 | gc.collect() 327 | 328 | pred,pred0 = lgb.ensemble_predict_test(X_test) 329 | 330 | auc = roc_auc_score(y_test,pred0) 331 | print('source AUC:',auc) 332 | 333 | auc = roc_auc_score(y_test,pred) 334 | Model.ensemble_auc.append(auc) 335 | print('ensemble AUC:',auc) 336 | 337 | importances = lgb.get_ensemble_importances() 338 | 339 | self.model = lgb 340 | 341 | del X_train,y_train,X_test,y_test 342 | gc.collect() 343 | 344 | paths = os.path.join(feature_importance_path,version) 345 | if not os.path.exists(paths): 346 | os.makedirs(paths) 347 | importances.to_csv(os.path.join(paths,'{}_importances.csv'.format(datetime.now().strftime('%Y%m%d%H%M%S'))),index=False) 348 | 349 | @timeclass(cls='Model') 350 | def fit(self, Xs, y): 351 | self.Xs = Xs 352 | self.y = y 353 | 354 | 355 | @timeclass(cls='Model') 356 | def predict(self, X_test): 357 | 358 | self.my_fit(self.Xs, self.y, X_test) 359 | 360 | gc.collect() 361 | 362 | if self.split != -2: 363 | main_table = self.test_table 364 | self.feat_engine.transform_merge_order1(main_table) 365 | X = self.feat_output.transform_output(main_table) 366 | 367 | X.index = -(X.index+1) 368 | X.sort_index(inplace=True) 369 | 370 | result = self.model.ensemble_predict(X) 371 | return pd.Series(result) 372 | 373 | else: 374 | return pd.Series() 375 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/model_input.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from util import log, timeit, timeclass 5 | import numpy as np 6 | import gc 7 | import sys 8 | 9 | class FeatOutput: 10 | @timeclass(cls='FeatOutput') 11 | def transform_output(self,table): 12 | X = table.data 13 | 14 | self.drop_non_numerical_column(table,X) 15 | self.drop_post_drop_column(table,X) 16 | 17 | return X 18 | 19 | @timeclass(cls='FeatOutput') 20 | 21 | def fit_transform_output(self,table,y): 22 | X = table.data.copy() 23 | 24 | self.drop_non_numerical_column(table,X) 25 | self.drop_post_drop_column(table,X) 26 | 27 | categories = self.get_categories(table,X) 28 | 29 | return X,y,categories 30 | 31 | def final_fit_transform_output(self,table,y): 32 | X = table.data 33 | 34 | 35 | self.drop_non_numerical_column(table,X) 36 | self.drop_post_drop_column(table,X) 37 | 38 | categories = self.get_categories(table,X) 39 | 40 | return X,y,categories 41 | 42 | @timeclass(cls='FeatOutput') 43 | def fillna(self,table,X): 44 | for col in table.num_cols: 45 | X[col] = X[col].fillna(X[col].mean()) 46 | 47 | 48 | def get_categories(self,table,X): 49 | categories = [] 50 | col_set = set(X.columns) 51 | for col in table.cat_cols: 52 | if col in col_set: 53 | if X[col].nunique() <= 15: 54 | categories.append(col) 55 | 56 | 57 | return categories 58 | 59 | @timeclass(cls='FeatOutput') 60 | def drop_non_numerical_column(self,table,X): 61 | if table.key_time_col is not None: 62 
| 63 | X.drop(table.key_time_col,axis=1,inplace=True) 64 | gc.collect() 65 | 66 | if len(table.time_cols) != 0: 67 | X.drop(table.time_cols,axis=1,inplace=True) 68 | 69 | if len(table.multi_cat_cols) != 0: 70 | X.drop(table.multi_cat_cols,axis=1,inplace=True) 71 | 72 | @timeclass(cls='FeatOutput') 73 | def drop_post_drop_column(self,table,X): 74 | if len(table.post_drop_set) != 0: 75 | drop_cols = list(table.post_drop_set) 76 | X.drop(drop_cols,axis=1,inplace=True) 77 | log(f'post drop cols:{drop_cols}') 78 | 79 | @timeclass(cls='FeatOutput') 80 | def drop_cat_column(self,table,X): 81 | X.drop(list(set(table.session_cols + table.user_cols + table.key_cols + table.cat_cols)&set(X.columns)),axis=1,inplace=True) 82 | 83 | @timeclass(cls='FeatOutput') 84 | def cat_hash(self,table,X): 85 | for col in table.user_cols + table.key_cols + table.cat_cols: 86 | X[col] = X[col] % 15 87 | 88 | @timeclass(cls='FeatOutput') 89 | def cat_process(self,train_table,test_table): 90 | X = train_table 91 | 92 | train = train_table.data 93 | test = test_table.data 94 | for col in X.user_cols + X.key_cols + X.cat_cols: 95 | inter = set(train[col].unique()) & set(test[col].unique()) 96 | train.loc[~(train[col].isin(inter)),col] = np.nan 97 | test.loc[~(test[col].isin(inter)),col] = np.nan 98 | 99 | @timeclass(cls='FeatOutput') 100 | def drop_tail(self,train_table,test_table): 101 | X = train_table 102 | 103 | train = train_table.data 104 | test = test_table.data 105 | for col in X.key_cols + X.cat_cols: 106 | vc = train[col].value_counts() 107 | vc.loc[vc==1] = np.nan 108 | train[col] = train[col].map(vc) 109 | test[col] = test[col].map(vc) 110 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/preprocessor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeepBlueAI/AutoSmart/99a6b44ce5b6520a2463c8a4b3acc675584533be/auto_smart/auto_smart/preprocessor/__init__.py -------------------------------------------------------------------------------- /auto_smart/auto_smart/preprocessor/preprocessor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas as pd 3 | import numpy as np 4 | import ac 5 | import CONSTANT 6 | from data_tools import downcast 7 | from joblib import Parallel, delayed 8 | from util import timeclass 9 | from feat_context import FeatContext 10 | 11 | namespace = 'preprocess' 12 | 13 | class Preprocessor: 14 | def __init__(self): 15 | pass 16 | 17 | def fit(self,ss): 18 | pass 19 | 20 | def transform(self,ss): 21 | pass 22 | 23 | def fit_transform(self,ss): 24 | pass 25 | 26 | class GeneralPreprocessor(Preprocessor): 27 | def __init__(self): 28 | self.K = 5 29 | 30 | @timeclass(cls='GeneralPreprocessor') 31 | def transform(self,X): 32 | 33 | todo_list = X.multi_cat_cols 34 | if todo_list != []: 35 | 36 | col2muldatas = {} 37 | col2muldatalens = {} 38 | 39 | data = X.data[todo_list] 40 | for col in todo_list: 41 | vals = data[col].values 42 | datas,datalen = ac.get_need_data(vals) 43 | 44 | if len(datalen) != data.shape[0]: 45 | raise Exception('An error with data length happens!!') 46 | 47 | col2muldatas[col] = np.array(datas,dtype='int64').astype(np.int32) 48 | col2muldatalens[col] = np.array(datalen,dtype='int32') 49 | 50 | data = X.data[todo_list] 51 | col2type = {} 52 | col2groupby = {} 53 | for col in data.columns: 54 | data[col] = ac.tuple_encode_func_1(col2muldatas[col],col2muldatalens[col]) 
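# NOTE: the Cython helper above flattens every multi-value cell into one
# contiguous token array plus a per-row length array, e.g. (illustrative):
#
#     rows:    ["a,b", "c", "a"]
#     datas:   [1, 2, 3, 1]    # one integer code per token
#     datalen: [2, 1, 1]       # tokens per row
#
# tuple_encode_func_1 is then assumed to map each row's token tuple to a
# single categorical code, so rows with the same token combination share one
# id (the *_MCEncode column created below).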
55 | 56 | new_cols = [] 57 | for col in todo_list: 58 | feat_type = CONSTANT.CATEGORY_TYPE 59 | new_col = col+'_MCEncode' 60 | new_col = FeatContext.gen_feat_name(namespace,self.__class__.__name__,new_col,feat_type) 61 | new_cols.append(new_col) 62 | col2type[new_col] = feat_type 63 | col2groupby[new_col] = col 64 | 65 | data.columns = new_cols 66 | df = X.data 67 | for col in data.columns: 68 | df[col] = downcast(data[col],accuracy_loss=False) 69 | 70 | X.update_data(df,col2type,col2groupby) 71 | 72 | df = X.data 73 | index = df.index 74 | col2type = {} 75 | col2groupby = {} 76 | for col in todo_list: 77 | new_col = col+'_MCLenAsCat' 78 | feat_type = CONSTANT.CATEGORY_TYPE 79 | new_col = FeatContext.gen_feat_name(namespace,self.__class__.__name__,new_col,feat_type) 80 | df[new_col] = downcast( pd.Series( col2muldatalens[col],index ),accuracy_loss=False) 81 | 82 | col2type[new_col] = feat_type 83 | col2groupby[new_col] = col 84 | 85 | X.update_data(df,col2type,col2groupby) 86 | 87 | todo_list = X.time_cols 88 | 89 | if todo_list != []: 90 | df = X.data 91 | col2type = {} 92 | for col in X.time_cols: 93 | new_col = col+'_TimeNum' 94 | feat_type = CONSTANT.NUMERICAL_TYPE 95 | new_col = FeatContext.gen_feat_name(namespace,self.__class__.__name__,new_col,feat_type) 96 | 97 | ss = (df[col] - pd.to_datetime('1970-01-01')).dt.total_seconds() 98 | ss[ss<0] = np.nan 99 | min_time = ss.min() 100 | ss = ss-min_time 101 | 102 | df[new_col] = downcast(ss) 103 | 104 | col2type[new_col] = feat_type 105 | 106 | if len(col2type) > 0: 107 | X.update_data(df,col2type,None) 108 | 109 | @timeclass(cls='GeneralPreprocessor') 110 | def fit_transform(self,X): 111 | return self.transform(X) 112 | 113 | class BinaryPreprocessor(Preprocessor): 114 | def __init__(self): 115 | self.col2cats = {} 116 | 117 | @timeclass(cls='BinaryPreprocessor') 118 | def fit(self,X): 119 | def func(ss): 120 | cats = pd.Categorical(ss).categories 121 | return cats 122 | 123 | df = X.data 124 | todo_cols = X.binary_cols 125 | 126 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(df[col]) for col in todo_cols) 127 | for col,cats in zip(todo_cols,res): 128 | self.col2cats[col] = cats 129 | 130 | @timeclass(cls='BinaryPreprocessor') 131 | def transform(self,X): 132 | 133 | def func(ss,cats): 134 | codes = pd.Categorical(ss,categories=cats).codes 135 | codes = codes.astype('float16') 136 | codes[codes==-1] = np.nan 137 | 138 | return codes 139 | 140 | df = X.data 141 | todo_cols = X.binary_cols 142 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(df[col],self.col2cats[col]) for col in todo_cols) 143 | for col,codes in zip(todo_cols,res): 144 | df[col] = codes 145 | 146 | @timeclass(cls='BinaryPreprocessor') 147 | def fit_transform(self,X): 148 | self.fit(X) 149 | self.transform(X) 150 | 151 | class MSCatPreprocessor(Preprocessor): 152 | def __init__(self): 153 | self.cats = [] 154 | 155 | def fit(self,ss): 156 | vals = ss.values 157 | 158 | ss = pd.Series( list(ac.mscat_fit(vals)) ) 159 | 160 | if ss.name is None: 161 | ss.name = 'ss' 162 | 163 | cats = ss.dropna().drop_duplicates().values 164 | 165 | if len(self.cats) == 0: 166 | self.cats = sorted(list(cats)) 167 | else: 168 | added_cats = sorted(set(cats) - set(self.cats)) 169 | self.cats.extend(added_cats) 170 | 171 | def transform(self,ss,kind): 172 | 173 | if kind == CONSTANT.CATEGORY_TYPE: 174 | 175 | codes = pd.Categorical(ss,categories=self.cats).codes + CONSTANT.CAT_SHIFT 176 | codes = codes.astype('float') 177 | 
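# NOTE: pd.Categorical assigns code -1 to values missing from `categories`;
# after adding CONSTANT.CAT_SHIFT those rows land exactly on CAT_SHIFT-1,
# which the next statement converts back to NaN so unseen or missing
# categories stay missing downstream.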
codes[codes==(CONSTANT.CAT_SHIFT-1)] = np.nan 178 | 179 | codes = downcast(codes,accuracy_loss=False) 180 | return codes 181 | else: 182 | codes = pd.Series( ac.mscat_trans(ss.values,self.cats) , index = ss.index ) 183 | return codes 184 | 185 | def fit_transform(self,ss): 186 | return self.transform(ss) 187 | 188 | class NumPreprocessor(Preprocessor): 189 | def fit(self,X): 190 | pass 191 | 192 | def transform(self,X): 193 | df = X.data 194 | todo_cols = X.num_cols 195 | for col in todo_cols: 196 | df[col] = downcast(df[col]) 197 | 198 | def fit_transform(self,X): 199 | return self.transform(X) 200 | 201 | class UniquePreprocessor(Preprocessor): 202 | @timeclass(cls='UniquePreprocessor') 203 | def fit(self,X): 204 | def func(ss): 205 | length = len(ss.unique()) 206 | if length <= 1: 207 | return True 208 | else: 209 | return False 210 | 211 | df = X.data 212 | todo_cols = X.cat_cols + X.multi_cat_cols + X.num_cols + X.time_cols + X.binary_cols 213 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(df[col]) for col in todo_cols) 214 | 215 | drop_cols = [] 216 | for col,unique in zip(todo_cols,res): 217 | if unique: 218 | drop_cols.append(col) 219 | 220 | self.drop_cols = drop_cols 221 | 222 | @timeclass(cls='UniquePreprocessor') 223 | def transform(self,X): 224 | X.drop_data(self.drop_cols) 225 | 226 | @timeclass(cls='UniquePreprocessor') 227 | def fit_transform(self,X): 228 | self.fit(X) 229 | self.transform(X) 230 | 231 | class AllDiffPreprocessor(Preprocessor): 232 | @timeclass(cls='AllDiffPreprocessor') 233 | def fit(self,X): 234 | def func(ss): 235 | length = len(ss.unique()) 236 | if length >= len(ss)-10: 237 | return True 238 | else: 239 | return False 240 | 241 | df = X.data 242 | todo_cols = X.cat_cols 243 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(df[col]) for col in todo_cols) 244 | 245 | drop_cols = [] 246 | for col,all_diff in zip(todo_cols,res): 247 | if all_diff: 248 | drop_cols.append(col) 249 | 250 | self.drop_cols = drop_cols 251 | 252 | @timeclass(cls='AllDiffPreprocessor') 253 | def transform(self,X): 254 | X.drop_data(self.drop_cols) 255 | 256 | @timeclass(cls='AllDiffPreprocessor') 257 | def fit_transform(self,X): 258 | self.fit(X) 259 | self.transform(X) 260 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/table/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeepBlueAI/AutoSmart/99a6b44ce5b6520a2463c8a4b3acc675584533be/auto_smart/auto_smart/table/__init__.py -------------------------------------------------------------------------------- /auto_smart/auto_smart/table/graph.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .table import Table 4 | from preprocessor.preprocessor import MSCatPreprocessor 5 | import pandas as pd 6 | import CONSTANT 7 | from util import timeclass, log 8 | from collections import defaultdict, deque 9 | import gc 10 | from joblib import Parallel, delayed 11 | 12 | class Graph: 13 | def __init__(self,info,tables): 14 | 15 | self.info = info 16 | 17 | self.table2info = info['tables'] 18 | self.relations = info['relations'] 19 | self.key_time_col = info['time_col'] 20 | 21 | self.M2M_relation_cnt = 0 22 | for relation in info['relations']: 23 | if relation['type'] == "many_to_many": 24 | self.M2M_relation_cnt = self.M2M_relation_cnt + 1 25 | 26 | self.key_col_set = None 27 | 
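# NOTE: the "user" column is guessed in recognize_user_col below as the
# main-table key with the highest cardinality; session columns are detected
# relative to it, and category/multi-cat columns that share vocabulary are
# grouped into "blocks" so a single MSCatPreprocessor can fit one shared
# label space per block (see init_graph_to_blocks).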
self.user_col = None 28 | 29 | self.name2table = {} 30 | self.tables = [] 31 | 32 | key_col_set = set() 33 | for relation in info['relations']: 34 | key_col_set.update(relation['key']) 35 | self.key_col_set = key_col_set 36 | 37 | user_col = None 38 | for tname,table in tables.items(): 39 | key_cols = [] 40 | if tname == CONSTANT.MAIN_TABLE_NAME: 41 | for col in self.table2info[tname]: 42 | if col in self.key_col_set: 43 | key_cols.append(col) 44 | 45 | user_col = self.recognize_user_col(tables[tname],key_cols) 46 | 47 | self.user_col = user_col 48 | del user_col 49 | 50 | main_cat_cols = [] 51 | session_col = None 52 | for tname,table in tables.items(): 53 | if tname == CONSTANT.MAIN_TABLE_NAME: 54 | for col in self.table2info[tname]: 55 | type_ = self.table2info[tname][col] 56 | if type_ == CONSTANT.CATEGORY_TYPE and col!=self.user_col and col not in key_col_set: 57 | main_cat_cols.append(col) 58 | 59 | session_cols = self.recognize_session_col(tables[tname],main_cat_cols,self.user_col) 60 | 61 | 62 | self.main_session_cols = session_cols 63 | del main_cat_cols 64 | del session_col 65 | 66 | for tname,table in tables.items(): 67 | key_cols = [] 68 | key_time_col = None 69 | user_cols = [] 70 | 71 | for col in self.table2info[tname]: 72 | 73 | if col in self.key_col_set and col != self.user_col: 74 | key_cols.append(col) 75 | 76 | if col == self.user_col: 77 | user_cols.append(col) 78 | 79 | if col == self.key_time_col: 80 | key_time_col = col 81 | 82 | cat_cols = [] 83 | for col in self.table2info[tname]: 84 | type_ = self.table2info[tname][col] 85 | if type_ == CONSTANT.CATEGORY_TYPE: 86 | cat_cols.append(col) 87 | 88 | binary_cols = self.recognize_binary_col(tables[tname],cat_cols) 89 | for col in binary_cols: 90 | self.table2info[tname][col] = CONSTANT.BINARY_TYPE 91 | 92 | self.tables.append(tname) 93 | if tname == CONSTANT.MAIN_TABLE_NAME: 94 | self.name2table[tname] = Table(tables[tname],self.table2info[tname],self.main_session_cols,user_cols,key_cols,key_time_col,tname) 95 | 96 | else: 97 | self.name2table[tname] = Table(tables[tname],self.table2info[tname],[],user_cols,key_cols,key_time_col,tname) 98 | 99 | if tname == CONSTANT.MAIN_TABLE_NAME: 100 | self.main_key_cols = key_cols 101 | self.main_key_time_col = key_time_col 102 | self.main_user_col = user_cols 103 | self.main_table_info = self.table2info[tname] 104 | 105 | block2name,name2block = self.init_graph_to_blocks() 106 | self.block2name = block2name 107 | self.name2block = name2block 108 | 109 | for tname in self.name2table: 110 | self.name2table[tname].block2name = block2name 111 | self.name2table[tname].name2block = name2block 112 | 113 | for tname in self.name2table: 114 | col2block = {} 115 | for col in self.name2table[tname].data.columns: 116 | name = tname + ':' + col 117 | 118 | if name in self.name2block: 119 | block_id = self.name2block[name] 120 | col2block[col] = block_id 121 | 122 | self.name2table[tname].col2block = col2block 123 | 124 | for tname in self.name2table: 125 | col2table = {} 126 | for col in self.name2table[tname].data.columns: 127 | col2table[col] = tname 128 | 129 | self.name2table[tname].col2table = col2table 130 | 131 | @timeclass(cls='Graph') 132 | def init_graph_to_blocks(self): 133 | mode = 'all' 134 | if mode == 'all': 135 | t_datas = [] 136 | t_names = [] 137 | 138 | for t_name in self.name2table: 139 | t_table = self.name2table[t_name] 140 | t_data = t_table.data 141 | t_data_num = t_data.shape[0] 142 | t_limit_num = 100000 143 | if t_limit_num > t_data_num: 144 | t_limit_num = 
t_data_num 145 | t_sample_frac = t_limit_num / t_data_num 146 | t_data = t_data.sample(frac=t_sample_frac,random_state=CONSTANT.SEED) 147 | 148 | t_datas.append(t_data) 149 | t_names.append(t_name) 150 | 151 | all_cat_cols = [] 152 | all_cat2type = {} 153 | for t_data,t_name in zip(t_datas,t_names): 154 | 155 | for col in t_data.columns: 156 | col2type = self.table2info[ t_name ][ col ] 157 | new_col = t_name+':'+col 158 | if col2type == CONSTANT.MULTI_CAT_TYPE or col2type == CONSTANT.CATEGORY_TYPE: 159 | all_cat_cols.append(new_col) 160 | all_cat2type[new_col] = col2type 161 | 162 | mc_graph = {} 163 | all_cat_len = len(all_cat_cols) 164 | for i in range(all_cat_len): 165 | name1 = all_cat_cols[i] 166 | mc_graph[name1] = {} 167 | for j in range(all_cat_len): 168 | name2 = all_cat_cols[j] 169 | mc_graph[name1][name2] = 0 170 | 171 | for t1 in range(len(t_datas)): 172 | t_data_1 = t_datas[t1] 173 | t_name_1 = t_names[t1] 174 | for col1 in t_data_1.columns: 175 | if col1 in self.key_col_set: 176 | name1 = t_name_1+':'+col1 177 | 178 | for t2 in range(len(t_datas)): 179 | t_data_2 = t_datas[t2] 180 | t_name_2 = t_names[t2] 181 | for col2 in t_data_2.columns: 182 | if col2 == col1: 183 | name2 = t_name_2+':'+col2 184 | mc_graph[name1][name2] = 1 185 | mc_graph[name2][name1] = 1 186 | 187 | log('init mcgraph') 188 | 189 | all_cat2set = {} 190 | 191 | for t_data,t_name in zip(t_datas,t_names): 192 | for col in t_data.columns: 193 | new_col = t_name+':'+col 194 | if new_col in all_cat2type: 195 | cur_set = set() 196 | if all_cat2type[new_col] == CONSTANT.MULTI_CAT_TYPE: 197 | 198 | for val in t_data[col]: 199 | if type(val) == float: 200 | continue 201 | cur_set.update(val.split(CONSTANT.MULTI_CAT_DELIMITER)) 202 | 203 | elif all_cat2type[new_col] == CONSTANT.CATEGORY_TYPE: 204 | cur_set = set(t_data[col].dropna()) 205 | 206 | all_cat2set[new_col] = cur_set 207 | 208 | all_cat_len = len(all_cat_cols) 209 | for i in range(all_cat_len): 210 | for j in range(i+1,all_cat_len): 211 | name1 = all_cat_cols[i] 212 | name2 = all_cat_cols[j] 213 | 214 | len1 = len(all_cat2set[name1]) 215 | len2 = len(all_cat2set[name2]) 216 | 217 | less_len = min(len1,len2) 218 | if less_len <= 1: 219 | continue 220 | 221 | if mc_graph[name1][name2]==1 or mc_graph[name2][name1] == 1: 222 | continue 223 | 224 | if len(all_cat2set[name1] & all_cat2set[name2])/less_len > 0.1: 225 | mc_graph[name1][name2] = 1 226 | mc_graph[name2][name1] = 1 227 | 228 | block2name = {} 229 | 230 | block_id = 0 231 | vis = {} 232 | nodes = list(mc_graph.keys()) 233 | def dfs(now,block_id): 234 | block2name[block_id].append(now) 235 | for nex in nodes: 236 | if mc_graph[now][nex] and ( not (nex in vis) ): 237 | vis[nex] = 1 238 | dfs(nex,block_id) 239 | 240 | for now in nodes: 241 | if now in vis: 242 | continue 243 | vis[now] = 1 244 | block_id += 1 245 | block2name[block_id] = [] 246 | dfs(now,block_id) 247 | 248 | name2block = {} 249 | 250 | for block in block2name: 251 | for col in block2name[block]: 252 | name2block[col] = block 253 | log(f'blocks: {block2name}') 254 | return block2name,name2block 255 | 256 | elif mode == 'part': 257 | pass 258 | 259 | @timeclass(cls='Graph') 260 | def sort_tables(self): 261 | for tname in self.name2table: 262 | table = self.name2table[tname] 263 | if table.key_time_col is not None: 264 | table.data.sort_values(by=table.key_time_col,inplace=True) 265 | 266 | @timeclass(cls='Graph') 267 | def sort_main_table(self): 268 | table = self.name2table[CONSTANT.MAIN_TABLE_NAME] 269 | if table.key_time_col is not 
None: 270 | table.data.sort_values(by=table.key_time_col,inplace=True) 271 | 272 | @timeclass(cls='Graph') 273 | def recognize_session_col(self,data,cat_cols,user_col): 274 | if user_col is None: 275 | return [] 276 | 277 | user_nunique = data[user_col].nunique() 278 | session_cols = [] 279 | 280 | def func(df,user_nunique): 281 | cat_col = df.columns[0] 282 | user_col = df.columns[1] 283 | cat_nunique = df[cat_col].nunique() 284 | 285 | if (cat_nunique <= user_nunique) or (cat_nunique >= df.shape[0]-10): 286 | return False 287 | 288 | if (df.groupby(cat_col)[user_col].nunique()>1).sum()>10: 289 | return False 290 | 291 | return True 292 | 293 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(data[[col,user_col]],user_nunique) for col in cat_cols) 294 | 295 | for col,is_session in zip(cat_cols,res): 296 | if is_session: 297 | session_cols.append(col) 298 | 299 | return session_cols 300 | 301 | @timeclass(cls='Graph') 302 | def recognize_binary_col(self,data,cat_cols): 303 | def func(ss): 304 | ss = ss.unique() 305 | if len(ss) == 3: 306 | if pd.isna(ss).sum() == 1: 307 | return True 308 | if len(ss) == 2: 309 | return True 310 | return False 311 | 312 | binary_cols = [] 313 | 314 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(data[col]) for col in cat_cols) 315 | 316 | for col,is_binary in zip(cat_cols,res): 317 | if is_binary: 318 | binary_cols.append(col) 319 | 320 | return binary_cols 321 | 322 | @timeclass(cls='Graph') 323 | def recognize_user_col(self,data,key_cols): 324 | user_col = None 325 | nunique = -1 326 | for col in key_cols: 327 | nnum = data[col].nunique() 328 | if nnum > nunique: 329 | user_col = col 330 | nunique = nnum 331 | return user_col 332 | 333 | @timeclass(cls='Graph') 334 | def preprocess_fit_transform(self): 335 | log('start mscat') 336 | 337 | mscat_block2preprocessor = {} 338 | for block_id in range(1,len(self.block2name)+1): 339 | mscat_block2preprocessor[block_id] = MSCatPreprocessor() 340 | ss = {} 341 | for block_id in range(1,len(self.block2name)+1): 342 | ss[block_id] = pd.Series() 343 | 344 | t_datas = [] 345 | t_names = [] 346 | for t_name in self.name2table: 347 | t_table = self.name2table[t_name] 348 | t_data = t_table.data 349 | 350 | t_datas.append(t_data) 351 | t_names.append(t_name) 352 | 353 | for t in range(len(t_datas)): 354 | t_data = t_datas[t] 355 | t_name = t_names[t] 356 | for col in t_data.columns: 357 | coltype = self.table2info[ t_name ][col] 358 | if coltype == CONSTANT.MULTI_CAT_TYPE or coltype == CONSTANT.CATEGORY_TYPE: 359 | name = t_name + ':' + col 360 | if name in self.name2block: 361 | block_id = self.name2block[name] 362 | ss[block_id] = pd.concat([ss[block_id],t_data[col].drop_duplicates()]) 363 | 364 | for block_id in range(1,len(self.block2name)+1): 365 | mscat_block2preprocessor[block_id].fit(ss[block_id]) 366 | 367 | for tname,table in self.name2table.items(): 368 | table.preprocess_fit_transform(mscat_block2preprocessor) 369 | 370 | gc.collect() 371 | 372 | def set_main_table(self,table): 373 | tname = CONSTANT.MAIN_TABLE_NAME 374 | self.name2table[CONSTANT.MAIN_TABLE_NAME] = Table(table,self.main_table_info,self.main_session_cols,self.main_user_col,self.main_key_cols,self.main_key_time_col,tname) 375 | gc.collect() 376 | 377 | @timeclass(cls='Graph') 378 | def bfs(self,root_name, graph, depth): 379 | depth[CONSTANT.MAIN_TABLE_NAME]['depth'] = 0 380 | queue = deque([root_name]) 381 | while queue: 382 | u_name = queue.popleft() 383 | for edge in graph[u_name]: 384 | v_name = 
edge['to'] 385 | if 'depth' not in depth[v_name]: 386 | depth[v_name]['depth'] = depth[u_name]['depth'] + 1 387 | queue.append(v_name) 388 | 389 | @timeclass(cls='Graph') 390 | def build_depth(self): 391 | rel_graph = defaultdict(list) 392 | depth = {} 393 | 394 | for tname in self.tables: 395 | depth[tname] = {} 396 | 397 | for rel in self.relations: 398 | ta = rel['table_A'] 399 | tb = rel['table_B'] 400 | rel_graph[ta].append({ 401 | "to": tb, 402 | "key": rel['key'], 403 | "type": rel['type'] 404 | }) 405 | rel_graph[tb].append({ 406 | "to": ta, 407 | "key": rel['key'], 408 | "type": '_'.join(rel['type'].split('_')[::-1]) 409 | }) 410 | self.bfs(CONSTANT.MAIN_TABLE_NAME, rel_graph, depth) 411 | 412 | self.rel_graph = rel_graph 413 | self.depth = depth 414 | 415 | 416 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/table/table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from preprocessor.preprocessor import * 3 | import CONSTANT 4 | from util import timeclass,log 5 | import gc 6 | 7 | class Table: 8 | def __init__(self,data,table_info,session_cols,user_cols,key_cols,key_time_col,tname): 9 | self.name = tname 10 | 11 | self.col2type = {} 12 | self.col2groupby = {} 13 | self.col2block = {} 14 | self.col2istime = {} 15 | 16 | self.col2muldatas = {} 17 | self.col2muldatalens = {} 18 | 19 | self.user_cols = user_cols 20 | self.session_cols = [] 21 | 22 | self.block2name = {} 23 | self.name2block = {} 24 | 25 | for col in session_cols: 26 | if len(self.user_cols) > 0: 27 | self.session_cols.append(col) 28 | self.col2groupby[col] = self.user_cols[0] 29 | 30 | self.key_time_col = key_time_col 31 | self.key_cols = key_cols 32 | 33 | self.cat_cols = None 34 | 35 | self.binary_cols = None 36 | self.multi_cat_cols = None 37 | self.num_cols = None 38 | 39 | self.time_cols = None 40 | 41 | self.bin_cols = [] 42 | 43 | self.update_data(data,table_info,None) 44 | 45 | log(f'session_cols:{self.session_cols}') 46 | log(f'user_cols:{self.user_cols}') 47 | log(f'key_cols:{self.key_cols}') 48 | log(f'cat_cols:{self.cat_cols}') 49 | log(f'binary_cols:{self.binary_cols}') 50 | log(f'multi_cat_cols:{self.multi_cat_cols}') 51 | log(f'key_time_col:{self.key_time_col}') 52 | log(f'time_cols:{self.time_cols}') 53 | log(f'num_cols:{self.num_cols}') 54 | 55 | self.apart_cat_set = set() 56 | self.post_drop_set = set() 57 | 58 | self.col2source_cat = {} 59 | 60 | self.combine_cat_cols = [] 61 | self.combine_num_cols = [] 62 | self.combine_binary_cols = [] 63 | self.wait_selection_cols = [] 64 | 65 | def add_session_col(self,col): 66 | self.session_cols.append(col) 67 | self.col2type[col] = CONSTANT.CATEGORY_TYPE 68 | if len(self.user_cols) > 0: 69 | self.col2groupby[col] = self.user_cols[0] 70 | 71 | def get_groupby_cols(self,by,cols): 72 | new_cols = [] 73 | bys = set() 74 | bys.add(by) 75 | while by in self.col2groupby: 76 | by = self.col2groupby[by] 77 | bys.add(by) 78 | 79 | for col in cols: 80 | is_skip = False 81 | cur = col 82 | while True: 83 | if cur in bys: 84 | is_skip = True 85 | break 86 | 87 | if cur in self.col2groupby: 88 | cur = self.col2groupby[cur] 89 | else: 90 | break 91 | 92 | if not is_skip: 93 | new_cols.append(col) 94 | 95 | return new_cols 96 | 97 | def get_not_apart_cat_cols(self,cols): 98 | new_cols = [] 99 | for col in cols: 100 | if col not in self.apart_cat_set: 101 | new_cols.append(col) 102 | return new_cols 103 | 104 | def drop_data(self,cols): 105 | 
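# Drops feature columns from the underlying frame while protecting the
# structural columns (session/user/key/key-time), then syncs the
# col2type/col2groupby bookkeeping via drop_data_cols below.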
drop_cols = [] 106 | for col in cols: 107 | if col not in self.session_cols\ 108 | and col not in self.user_cols\ 109 | and col not in self.key_cols\ 110 | and col != self.key_time_col: 111 | drop_cols.append(col) 112 | if len(drop_cols)>0: 113 | self.data.drop(drop_cols,axis=1,inplace=True) 114 | self.drop_data_cols(drop_cols) 115 | 116 | def drop_data_cols(self,drop_cols): 117 | for col in drop_cols: 118 | self.col2type.pop(col) 119 | if col in self.col2groupby: 120 | self.col2groupby.pop(col) 121 | 122 | self.type_reset() 123 | self.drop_combine_cols(drop_cols) 124 | 125 | def drop_combine_cols(self,drop_cols): 126 | drop_cols_set = set(drop_cols) 127 | 128 | combine_cat_cols = [] 129 | combine_num_cols = [] 130 | combine_binary_cols = [] 131 | 132 | for col in self.combine_cat_cols: 133 | if col not in drop_cols_set: 134 | combine_cat_cols.append(col) 135 | 136 | for col in self.combine_num_cols: 137 | if col not in drop_cols_set: 138 | combine_num_cols.append(col) 139 | 140 | for col in self.combine_binary_cols: 141 | if col not in drop_cols_set: 142 | combine_binary_cols.append(col) 143 | 144 | self.combine_cat_cols = combine_cat_cols 145 | self.combine_num_cols = combine_num_cols 146 | self.combine_binary_cols = combine_binary_cols 147 | 148 | def add_apart_cat_cols(self,cols): 149 | self.apart_cat_set.update(cols) 150 | 151 | def add_post_drop_cols(self,cols): 152 | self.post_drop_set.update(cols) 153 | 154 | def add_wait_selection_cols(self,cols): 155 | self.wait_selection_cols.append(cols) 156 | 157 | def empty_wait_selection_cols(self): 158 | self.wait_selection_cols = [] 159 | 160 | def update_data(self,data,col2type,col2groupby,col2source_cat=None,col2block=None,col2istime=None): 161 | 162 | self.data = data 163 | self.update_col2type(col2type) 164 | if col2groupby is not None: 165 | self.update_col2groupby(col2groupby) 166 | 167 | if col2block is not None: 168 | self.update_col2block(col2block) 169 | if col2istime is not None: 170 | self.update_col2istime(col2istime) 171 | 172 | if col2source_cat is not None: 173 | self.update_col2source_cat(col2source_cat) 174 | gc.collect() 175 | 176 | def update_col2block(self,col2block): 177 | self.col2block.update(col2block) 178 | 179 | def update_col2istime(self,col2istime): 180 | self.col2istime.update(col2istime) 181 | 182 | def update_col2groupby(self,col2groupby): 183 | self.col2groupby.update(col2groupby) 184 | 185 | def update_col2source_cat(self,col2source_cat): 186 | self.col2source_cat.update(col2source_cat) 187 | 188 | def update_col2type(self,col2type): 189 | self.col2type.update(col2type) 190 | self.type_reset() 191 | 192 | def reset_combine_cols(self,combine_cat_cols=None,combine_num_cols=None,combine_binary_cols=None): 193 | self.combine_cat_cols = combine_cat_cols 194 | self.combine_num_cols = combine_num_cols 195 | self.combine_binary_cols = combine_binary_cols 196 | 197 | def type_reset(self): 198 | 199 | cat_cols = [] 200 | binary_cols = [] 201 | multi_cat_cols = [] 202 | num_cols = [] 203 | time_cols = [] 204 | 205 | for cname,ctype in self.col2type.items(): 206 | if (ctype == CONSTANT.CATEGORY_TYPE) \ 207 | and (cname not in self.key_cols)\ 208 | and (cname not in self.user_cols)\ 209 | and (cname not in self.session_cols): 210 | cat_cols.append(cname) 211 | elif ctype == CONSTANT.BINARY_TYPE: 212 | binary_cols.append(cname) 213 | elif ctype == CONSTANT.MULTI_CAT_TYPE: 214 | multi_cat_cols.append(cname) 215 | elif ctype == CONSTANT.NUMERICAL_TYPE: 216 | num_cols.append(cname) 217 | elif ctype == CONSTANT.TIME_TYPE 
and cname != self.key_time_col: 218 | time_cols.append(cname) 219 | 220 | self.cat_cols = sorted(cat_cols) 221 | self.binary_cols = sorted(binary_cols) 222 | self.num_cols = sorted(num_cols) 223 | self.multi_cat_cols = sorted(multi_cat_cols) 224 | self.time_cols = sorted(time_cols) 225 | 226 | @timeclass(cls='Table') 227 | def preprocess_fit_transform(self,mscat_group2preprocessor): 228 | 229 | for col in (self.cat_cols+self.multi_cat_cols+self.user_cols+self.key_cols+self.session_cols): 230 | name = self.name+':'+col 231 | if name in self.name2block: 232 | block_id = self.name2block[name] 233 | self.data[col] = mscat_group2preprocessor[block_id].transform(self.data[col],self.col2type[col]) 234 | 235 | unique_preprocessor = UniquePreprocessor() 236 | unique_preprocessor.fit_transform(self) 237 | 238 | all_diff_preprocessor = AllDiffPreprocessor() 239 | all_diff_preprocessor.fit_transform(self) 240 | 241 | binary_preprocessor = BinaryPreprocessor() 242 | binary_preprocessor.fit_transform(self) 243 | 244 | num_preprocess = NumPreprocessor() 245 | num_preprocess.fit_transform(self) 246 | 247 | general_preprocessor = GeneralPreprocessor() 248 | general_preprocessor.fit_transform(self) 249 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/util.py: -------------------------------------------------------------------------------- 1 | 2 | import time 3 | from typing import Any 4 | 5 | 6 | import functools 7 | nesting_level = 0 8 | is_start = None 9 | 10 | 11 | class Timer: 12 | def __init__(self): 13 | self.start = time.time() 14 | self.history = [self.start] 15 | 16 | def check(self, info): 17 | current = time.time() 18 | log(f"[{info}] spend {current - self.history[-1]:0.2f} sec") 19 | self.history.append(current) 20 | 21 | 22 | def timeclass(cls): 23 | def timeit(method, start_log=None): 24 | @functools.wraps(method) 25 | def timed(*args, **kw): 26 | global is_start 27 | global nesting_level 28 | 29 | if not is_start: 30 | print() 31 | 32 | is_start = True 33 | log(f"Start [{cls}.{method.__name__}]:" + (start_log if start_log else "")) 34 | log(f'Start time: {time.strftime("%Y-%m-%d %H:%M:%S")}') 35 | nesting_level += 1 36 | 37 | start_time = time.time() 38 | result = method(*args, **kw) 39 | end_time = time.time() 40 | 41 | nesting_level -= 1 42 | log(f"End [{cls}.{method.__name__}]. Time elapsed: {end_time - start_time:0.2f} sec.") 43 | log(f'End time: {time.strftime("%Y-%m-%d %H:%M:%S")}') 44 | is_start = False 45 | 46 | return result 47 | 48 | return timed 49 | return timeit 50 | 51 | def timeit(method, start_log=None): 52 | @functools.wraps(method) 53 | def timed(*args, **kw): 54 | global is_start 55 | global nesting_level 56 | 57 | if not is_start: 58 | print() 59 | 60 | is_start = True 61 | log(f"Start [{method.__name__}]:" + (start_log if start_log else "")) 62 | nesting_level += 1 63 | 64 | start_time = time.time() 65 | result = method(*args, **kw) 66 | end_time = time.time() 67 | 68 | nesting_level -= 1 69 | log(f"End [{method.__name__}]. 
Time elapsed: {end_time - start_time:0.2f} sec.") 70 | is_start = False 71 | 72 | return result 73 | 74 | return timed 75 | 76 | 77 | def log(entry: Any): 78 | global nesting_level 79 | space = "-" * (4 * nesting_level) 80 | print(f"{space}{entry}") 81 | 82 | def show_dataframe(df): 83 | if len(df) <= 30: 84 | print(f"content=\n" 85 | f"{df}") 86 | else: 87 | print(f"dataframe is too large to show the content, over {len(df)} rows") 88 | 89 | if len(df.dtypes) <= 100: 90 | print(f"types=\n" 91 | f"{df.dtypes}\n") 92 | else: 93 | print(f"dataframe is too wide to show the dtypes, over {len(df.dtypes)} columns") 94 | 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /auto_smart/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import setuptools 5 | from Cython.Build import cythonize 6 | 7 | setuptools.setup( 8 | name='AutoSmart', 9 | version='0.0.2', 10 | author='DeepBlueAI', 11 | author_email='1229991666@qq.com', 12 | url='https://github.com/DeepBlueAI/AutoSmart', 13 | description=u'The 1st place solution for KDD Cup 2019 AutoML Track', 14 | packages=setuptools.find_packages(), 15 | install_requires=[ 16 | "hyperopt", 17 | "lightgbm==2.3.0", 18 | "joblib", 19 | "pandas", 20 | ], 21 | ext_modules = cythonize("ac.pyx"), 22 | classifiers=[ 23 | "Programming Language :: Python :: 3", 24 | "License :: OSI Approved :: GNU General Public License (GPL)", 25 | "Operating System :: OS Independent", 26 | ], 27 | ) 28 | -------------------------------------------------------------------------------- /demo/data/train/info.json: -------------------------------------------------------------------------------- 1 | { 2 | "time_budget": 300, 3 | "time_col": "t_01", 4 | "start_time": 1550654179, 5 | "tables": { 6 | "main": { 7 | "t_01": "time", 8 | "c_1": "cat", 9 | "c_2": "cat", 10 | "n_1": "num", 11 | "n_2": "num", 12 | "c_3": "cat", 13 | "c_02": "cat", 14 | "c_01": "cat" 15 | }, 16 | "table_1": { 17 | "c_01": "cat", 18 | "c_1": "cat", 19 | "c_2": "cat", 20 | "n_1": "num", 21 | "c_3": "cat", 22 | "c_4": "cat", 23 | "t_1": "time", 24 | "t_2": "time", 25 | "n_2": "num", 26 | "n_3": "num", 27 | "n_4": "num", 28 | "n_5": "num", 29 | "m_1": "multi-cat", 30 | "m_2": "multi-cat", 31 | "m_3": "multi-cat", 32 | "m_4": "multi-cat", 33 | "m_5": "multi-cat", 34 | "m_6": "multi-cat" 35 | }, 36 | "table_2": { 37 | "c_02": "cat", 38 | "c_1": "cat", 39 | "c_2": "cat", 40 | "c_3": "cat", 41 | "c_4": "cat", 42 | "t_1": "time" 43 | }, 44 | "table_3": { 45 | "n_1": "num", 46 | "c_02": "cat", 47 | "t_01": "time" 48 | } 49 | }, 50 | "relations": [ 51 | { 52 | "table_A": "main", 53 | "table_B": "table_1", 54 | "key": ["c_01"], 55 | "type": "many_to_one" 56 | }, 57 | { 58 | "table_A": "main", 59 | "table_B": "table_2", 60 | "key": ["c_02"], 61 | "type": "many_to_one" 62 | }, 63 | { 64 | "table_A": "main", 65 | "table_B": "table_3", 66 | "key": ["c_02"], 67 | "type": "many_to_one" 68 | } 69 | ] 70 | } 71 | -------------------------------------------------------------------------------- /demo/demo.py: -------------------------------------------------------------------------------- 1 | import auto_smart 2 | 3 | info = auto_smart.read_info("data") 4 | train_data,train_label = auto_smart.read_train("data",info) 5 | test_data = auto_smart.read_test("data",info) 6 | 7 | auto_smart.train_and_predict(train_data,train_label,info,test_data) 8 | 9 | 
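# The demo assumes the directory layout shown at the top of this dump:
# data/train with info.json plus the main and side tables, and data/test
# with main_test.data; train_and_predict then runs the full AutoSmart
# pipeline (merge -> feature engineering -> LightGBM ensemble) end to end.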
--------------------------------------------------------------------------------