├── .gitignore
├── LICENSE
├── README.md
├── auto_smart
│   ├── LICENSE
│   ├── MANIFEST.in
│   ├── README.md
│   ├── ac.c
│   ├── ac.pyx
│   ├── auto_smart
│   │   ├── CONSTANT.py
│   │   ├── PATHS.py
│   │   ├── __init__.py
│   │   ├── automl
│   │   │   ├── __init__.py
│   │   │   ├── auto_lgb.py
│   │   │   ├── automl.py
│   │   │   ├── autosample.py
│   │   │   └── model_selection.py
│   │   ├── config.py
│   │   ├── data_tools.py
│   │   ├── feat
│   │   │   ├── __init__.py
│   │   │   ├── default_feat.py
│   │   │   ├── default_merge_feat.py
│   │   │   ├── feat.py
│   │   │   ├── feat_pipeline.py
│   │   │   ├── feat_selection.py
│   │   │   ├── merge_feat.py
│   │   │   └── merge_feat_pipeline.py
│   │   ├── feat_context.py
│   │   ├── feat_engine.py
│   │   ├── merger.py
│   │   ├── metadata
│   │   ├── model.py
│   │   ├── model_input.py
│   │   ├── preprocessor
│   │   │   ├── __init__.py
│   │   │   └── preprocessor.py
│   │   ├── table
│   │   │   ├── __init__.py
│   │   │   ├── graph.py
│   │   │   └── table.py
│   │   └── util.py
│   └── setup.py
└── demo
    ├── data
    │   ├── test
    │   │   └── main_test.data
    │   └── train
    │       ├── info.json
    │       ├── main_train.data
    │       ├── main_train.solution
    │       ├── table_1.data
    │       ├── table_2.data
    │       └── table_3.data
    └── demo.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | #*.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 | 
50 | # Translations
51 | *.mo
52 | *.pot
53 | 
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 | 
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 | 
63 | # Scrapy stuff:
64 | .scrapy
65 | 
66 | # Sphinx documentation
67 | docs/_build/
68 | 
69 | # PyBuilder
70 | target/
71 | 
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 | 
75 | # pyenv
76 | .python-version
77 | 
78 | # celery beat schedule file
79 | celerybeat-schedule
80 | 
81 | # SageMath parsed files
82 | *.sage.py
83 | 
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 | 
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 | 
97 | # Rope project settings
98 | .ropeproject
99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | .DS_Store
107 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |                     GNU GENERAL PUBLIC LICENSE
2 |                        Version 3, 29 June 2007
3 | 
4 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5 |  Everyone is permitted to copy and distribute verbatim copies
6 |  of this license document, but changing it is not allowed.
7 | 
8 |                             Preamble
9 | 
10 |   The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 | 
13 |   The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. 
By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 
76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. 
However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 
196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 
309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 
360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 
476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. 
You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 
583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | 635 | Copyright (C) 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see . 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 
652 |   If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 |     <program>  Copyright (C) <year>  <name of author>
656 |     This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 |     This is free software, and you are welcome to redistribute it
658 |     under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License.  Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 | 
664 |   You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 | 
669 |   The GNU General Public License does not permit incorporating your program
670 | into proprietary programs.  If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library.  If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License.  But first, please read
674 | <https://www.gnu.org/philosophy/why-not-lgpl.html>.
675 | 
676 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![Alt text](https://www.deepblueai.com/usr/deepblue/v3/images/logo.png "DeepBlue")
2 | [![license](https://img.shields.io/badge/license-GPL%203.0-green.svg)](https://github.com/DeepBlueAI/AutoSmart/blob/master/LICENSE)
3 | # Introduction to AutoSmart
4 | The 1st place solution for the KDD Cup 2019 AutoML Track.
5 | 
6 | # How to install
7 | 
8 | Requirements: [Cython with a C compiler](https://docs.cython.org/en/latest/src/quickstart/install.html).
9 | 
10 | Clone or download the autosmart package, then run:
11 | 
12 | ```bash
13 | python setup.py install
14 | ```
15 | 
16 | # How to use
17 | ```python
18 | import auto_smart
19 | 
20 | info = auto_smart.read_info("data")
21 | train_data, train_label = auto_smart.read_train("data", info)
22 | test_data = auto_smart.read_test("data", info)
23 | auto_smart.train_and_predict(train_data, train_label, info, test_data)
24 | ```
25 | # Data Sample
26 | ### Data
27 | 
28 | This page describes the datasets that our system can handle.
29 | 
30 | #### Components
31 | Each dataset is split into two subsets: the training set and the testing set.
32 | 
33 | Both sets have:
34 | 
35 | - a **main table file** that stores the **main table** (label excluded);
36 | - multiple **related table files** that store the **related tables**;
37 | - an **info dictionary** that contains important information about the dataset, including the table relations;
38 | - additionally, the training set has a **label file** that stores the **labels** associated with the **main table**.
39 | 
40 | ### Table files
41 | 
42 | Each **table file** is a CSV-format file that stores a table (main or related), using '**\t**' as the delimiter (i.e., tab-separated). The first row holds the feature names, a.k.a. the 'schema', and the following rows are the records.
43 | 
44 | The type of each feature can be found in the info dictionary, which is introduced below.
45 | 
46 | There are 4 types of features, indicated by "cat", "num", "multi-cat", and "time", respectively:
47 | 
48 | - **cat**: categorical feature, an integer;
49 | - **num**: numerical feature, a real value;
50 | - **multi-cat**: multi-value categorical feature: a set of integers separated by commas. The size of the set is not fixed and can be different for each instance, e.g., the topics of an article, the words in a title, or the items bought by a user;
51 | - **time**: time feature, an integer that indicates the timestamp.
52 | 
53 | 
54 | ### Label file
55 | The **label file** is associated only with the main table in the training set. It is a CSV file that contains a single column, with the first row as the header and the remaining rows holding the labels associated with the instances in the main table.
56 | 
57 | ### info dictionary
58 | Important information about each dataset is stored in a python dictionary structure named **info**, which acts as an input to this system. Generally, you need to generate this dictionary manually as an info.json file. Here we give details about info; an illustrative info.json is sketched after the relations list below.
59 | 
60 | ![Alt text](https://i.ibb.co/4dQxCRD/info.png "datainfo")
61 | 
62 | Descriptions of the keys in info:
63 | 
64 | - **time_budget**: the time budget for this dataset, in seconds.
65 | - **time_col**: the column name of the primary timestamp. Each dataset has exactly one time_col; it is always present in the main table, but not necessarily in a related table.
66 | - **start_time**: DEPRECATED.
67 | - **tables**: a dictionary that stores information about the tables. Each key indicates a table, and its corresponding value is a dictionary that gives the type of each column in that table. Two kinds of keys are contained in **tables**:
68 |     - **main**: the main table;
69 |     - **table_{i}**: the i-th related table.
70 |     - There are 4 types of features, indicated by "cat", "num", "multi-cat", and "time", respectively:
71 |         - **cat**: categorical feature, an integer;
72 |         - **num**: numerical feature, a real value;
73 |         - **multi-cat**: multi-value categorical feature: a set of integers separated by commas. The size of the set is not fixed and can be different for each instance;
74 |         - **time**: time feature, an integer that indicates the timestamp.
75 | 
76 | - **relations**: a list that stores the table relations in the dataset. Each relation is represented as an ordered table pair (**table_A**, **table_B**), a key column **key** that appears in both tables and acts as the pivot of the table join, and a relation type **type**. The relation types are introduced below.
77 | 
78 | #### Relations Between Tables
79 | Four table relations are considered in this system:
80 | 
81 | - **one-to-one** (1-1): the key columns in both **table_A** and **table_B** have no duplicated values;
82 | - **one-to-many** (1-M): the key column in **table_A** has no duplicated values, but that in **table_B** may have duplicated values;
83 | - **many-to-one** (M-1): the key column in **table_A** may have duplicated values, but that in **table_B** has no duplicated values;
84 | - **many-to-many** (M-M): the key columns in both **table_A** and **table_B** may have duplicated values.
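85 | 
86 | To make the expected layout concrete, below is a minimal sketch that generates an info.json for a hypothetical dataset with one related table. All concrete names and values (the column names, the time budget, and the exact spelling of the relation-type string) are illustrative assumptions based on the description above, not values taken from a real dataset.
87 | 
88 | ```python
89 | import json
90 | 
91 | # Illustrative info dictionary: the structure (keys and nesting) follows the
92 | # description above, but every concrete name and value here is hypothetical.
93 | info = {
94 |     "time_budget": 300,        # seconds allowed for the whole run
95 |     "time_col": "t_1",         # primary timestamp; always in the main table
96 |     "start_time": 0,           # DEPRECATED
97 |     "tables": {
98 |         "main": {
99 |             "t_1": "time",
100 |             "c_1": "cat",
101 |             "n_1": "num",
102 |             "m_1": "multi-cat"
103 |         },
104 |         "table_1": {           # the 1st related table
105 |             "c_1": "cat",
106 |             "n_2": "num"
107 |         }
108 |     },
109 |     "relations": [
110 |         {
111 |             "table_A": "main",
112 |             "table_B": "table_1",
113 |             "key": ["c_1"],          # the pivot column shared by both tables
114 |             "type": "many_to_one"    # assumed spelling of the M-1 relation type
115 |         }
116 |     ]
117 | }
118 | 
119 | with open("data/train/info.json", "w") as fp:
120 |     json.dump(info, fp, indent=2)
121 | ```
122 | 
123 | With such a file in place, `auto_smart.read_info("data")` loads the dictionary, and `read_train`/`read_test` use the **tables** entry to pick column dtypes and to parse the **time** columns.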
124 | 
125 | 
126 | # Contact Us
127 | DeepBlueAI: 1229991666@qq.com
128 | 
--------------------------------------------------------------------------------
/auto_smart/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.pyx
--------------------------------------------------------------------------------
/auto_smart/README.md:
--------------------------------------------------------------------------------
1 | ![Alt text](https://www.deepblueai.com/usr/deepblue/v3/images/logo.png "DeepBlue")
2 | [![license](https://img.shields.io/badge/license-GPL%203.0-green.svg)](https://github.com/DeepBlueAI/AutoSmart/blob/master/LICENSE)
3 | # Introduction to AutoSmart
4 | The 1st place solution for the KDD Cup 2019 AutoML Track.
5 | 
6 | # How to use
7 | This is the link to the competition: https://codalab.lri.fr/competitions/559
8 | 
9 | # Contact Us
10 | DeepBlueAI: 1229991666@qq.com
11 | 
--------------------------------------------------------------------------------
/auto_smart/ac.pyx:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | from cython cimport boundscheck, wraparound
4 | 
5 | 
6 | 
7 | 
8 | @boundscheck(False)
9 | @wraparound(False)
10 | def pre_tuple_encode_func(int[:] muldata, int[:] muldatalens, K):  # id-encode the tuple of the first K values of each multi-cat cell
11 |     cdef:
12 |         int index = 0
13 |         int i, j, N = muldatalens.shape[0]
14 |         int les
15 |         # list tmp = []
16 |         int c_K = K
17 |         dict map_dict = {}
18 |         int ids = 0
19 | 
20 | 
21 |     ans = np.zeros( N, dtype=np.float64 )  # np.float was removed in NumPy 1.24; use an explicit float64
22 | 
23 |     for i in range(N):
24 |         les = muldatalens[i]
25 |         if les == 0:
26 |             ans[i] = np.nan
27 |         else:
28 |             tmp = []
29 |             if c_K > les:
30 |                 for j in range(index, index+les):
31 |                     tmp.append(muldata[j])
32 |             else:
33 |                 for j in range(index, index+c_K):
34 |                     tmp.append(muldata[j])
35 | 
36 |             thash = tuple( tmp )
37 |             if thash in map_dict:
38 |                 ans[i] = map_dict[ thash ]
39 |             else:
40 |                 map_dict[ thash ] = ids
41 |                 ans[i] = ids
42 |                 ids += 1
43 | 
44 |         index += les
45 | 
46 |     return ans
47 | 
48 | 
49 | @boundscheck(False)
50 | @wraparound(False)
51 | def post_tuple_encode_func(int[:] muldata, int[:] muldatalens, K):  # id-encode the tuple of the last K values of each multi-cat cell
52 |     cdef:
53 |         int index = 0
54 |         int i, j, N = muldatalens.shape[0]
55 |         int les
56 |         # list tmp = []
57 |         int c_K = K
58 |         dict map_dict = {}
59 |         int ids = 0
60 | 
61 | 
62 |     ans = np.zeros( N, dtype=np.float64 )
63 | 
64 |     for i in range(N):
65 |         les = muldatalens[i]
66 |         if les == 0:
67 |             ans[i] = np.nan
68 |         else:
69 |             tmp = []
70 |             if c_K > les:
71 |                 for j in range(index, index+les):
72 |                     tmp.append(muldata[j])
73 |             else:
74 |                 for j in range(index+les-c_K, index+les):
75 |                     tmp.append(muldata[j])
76 | 
77 |             thash = tuple( tmp )
78 |             if thash in map_dict:
79 |                 ans[i] = map_dict[ thash ]
80 |             else:
81 |                 map_dict[ thash ] = ids
82 |                 ans[i] = ids
83 |                 ids += 1
84 | 
85 |         index += les
86 | 
87 |     return ans
88 | 
89 | 
90 | @boundscheck(False)
91 | @wraparound(False)
92 | def tuple_encode_func_1(int[:] muldata, int[:] muldatalens):  # id-encode the full value tuple of each multi-cat cell
93 |     cdef:
94 |         int index = 0
95 |         int i, j, N = muldatalens.shape[0]
96 |         int les
97 |         dict map_dict = {}
98 |         int ids = 1
99 | 
100 | 
101 |     ans = np.zeros( N, dtype=np.float64 )
102 | 
103 |     for i in range(N):
104 |         les = muldatalens[i]
105 |         if les == 0:
106 |             ans[i] = np.nan
107 |         else:
108 |             tmp = []
109 |             for j in range(index, index+les):
110 |                 tmp.append(muldata[j])
111 | 
112 |             thash = tuple( tmp )
113 |             if thash in map_dict:
114 |                 ans[i] = map_dict[ thash ]
115 |             else:
116 |                 map_dict[ thash ] = ids
117 |                 ans[i] = ids
118 |                 ids += 1
119 | 
120 |         index += les
121 | 
122 |     return ans
123 | 
124 | #@boundscheck(False)
125 | #@wraparound(False)
126 | #def tuple_encode_func_2(vals):
127 | #    cdef:
128 | #        int idx,N = vals.shape[0]
129 | #        dict map_dict = {}
130 | #        int ids = 0
131 | #
132 | #    ans = np.zeros( N ,dtype=np.float )
133 | #
134 | #    for idx in range(N):
135 | #        i = vals[idx]
136 | #        if type(i)==float or i==():
137 | #            ans[idx] = np.nan
138 | #        else:
139 | #            if i in map_dict:
140 | #                ans[idx] = map_dict[ i ]
141 | #            else:
142 | #                map_dict[ i ] = ids
143 | #                ans[idx] = ids
144 | #                ids += 1
145 | #
146 | #    return ans
147 | 
148 | 
149 | 
150 | @boundscheck(False)
151 | @wraparound(False)
152 | def cat_in_multi( int[:] muldata, int[:] muldatalens, int[:] catdata ):  # 1 if catdata[i] occurs in the i-th multi-cat list, else 0
153 |     cdef:
154 |         int index = 0
155 |         int i, j, N = muldatalens.shape[0]
156 |         int les
157 |         int cat
158 |         int flag
159 |         # list ans = []
160 | 
161 |     ans = np.zeros( N, dtype=np.int8 )
162 | 
163 |     for i in range(N):
164 |         les = muldatalens[i]
165 |         flag = 0
166 |         cat = catdata[ i ]
167 |         for j in range(index, index+les):
168 |             if muldata[j] == cat:
169 |                 flag = 1
170 |                 break
171 | 
172 |         if flag :
173 |             ans[i] = 1
174 |         else:
175 |             ans[i] = 0
176 | 
177 |         index += les
178 |     return ans
179 | 
180 | @boundscheck(False)
181 | @wraparound(False)
182 | def cat_rank_multi( int[:] muldata, int[:] muldatalens, int[:] catdata ):  # 1-based position of catdata[i] in the i-th multi-cat list (0 if absent)
183 |     cdef:
184 |         int index = 0
185 |         int i, j, N = muldatalens.shape[0]
186 |         int les
187 |         int cat
188 |         int flag
189 |         # list ans = []
190 | 
191 |     ans = np.zeros( N, dtype=np.int16 )
192 | 
193 |     for i in range(N):
194 |         les = muldatalens[i]
195 |         flag = 0
196 |         cat = catdata[ i ]
197 |         for j in range(index, index+les):
198 |             if muldata[j] == cat:
199 |                 flag = j-index+1
200 |                 break
201 |         ans[i] = flag
202 |         index += les
203 |     return ans
204 | 
205 | 
206 | @boundscheck(False)
207 | @wraparound(False)
208 | def cat_frank_multi( int[:] muldata, int[:] muldatalens, int[:] catdata ):  # position of the first match counted from the end of the i-th list (0 if absent)
209 |     cdef:
210 |         int index = 0
211 |         int i, j, N = muldatalens.shape[0]
212 |         int les
213 |         int cat
214 |         int flag
215 |         # list ans = []
216 | 
217 |     ans = np.zeros( N, dtype=np.int16 )
218 | 
219 |     for i in range(N):
220 |         les = muldatalens[i]
221 |         flag = 0
222 |         cat = catdata[ i ]
223 |         for j in range(index, index+les):
224 |             if muldata[j] == cat:
225 |                 flag = index+les - j
226 |                 break
227 |         ans[i] = flag
228 |         index += les
229 |     return ans
230 | 
231 | 
232 | 
233 | @boundscheck(False)
234 | @wraparound(False)
235 | def get_need_data( vals ):  # flatten a column of tuples into (values, per-row lengths)
236 |     cdef:
237 |         int idx, N = vals.shape[0]
238 |         list datas = []
239 |         list datalen = []
240 | 
241 |     for idx in range(N):
242 |         i = vals[idx]
243 |         if type(i) == float:  # NaN marks an empty cell
244 |             datalen.append( 0 )
245 |         else:
246 |             datas.extend( i )
247 |             datalen.append( len(i) )
248 | 
249 |     return datas, datalen
250 | 
251 | 
252 | 
253 | @boundscheck(False)
254 | @wraparound(False)
255 | def mscat_fit(vals):  # collect the set of distinct category strings over all cells
256 |     cdef:
257 |         set ans = set()
258 |         int idx, N = vals.shape[0]
259 | 
260 |     for idx in range(N):
261 |         val = vals[idx]
262 |         if type(val) == float:
263 |             continue
264 |         ans.update( val.split(',') )
265 | 
266 |     return ans
267 | 
268 | @boundscheck(False)
269 | @wraparound(False)
270 | def mscat_trans(vals, cats):  # map each comma-separated cell to a tuple of 1-based category ids
271 |     cdef:
272 |         dict cat2index = {index: i + 1 for i, index in enumerate(cats)}
273 |         list ans = []
274 |         int idx, N = vals.shape[0]
275 |         list tmp = []
276 | 
277 | 
278 |     for idx in range(N):
279 |         val = vals[idx]
280 |         if type(val) == float:
281 |             ans.append( tuple() )
282 |         else:
283 |             tmp = []
284 |             x = val.split(',')
285 |             for i in x:
286 |                 tmp.append( cat2index[i] )
287 | 
288 |             ans.append( tuple( tmp ) )
289 | 
290 |     return ans
291 | 
292 | 
293 | 
--------------------------------------------------------------------------------
/auto_smart/auto_smart/CONSTANT.py:
--------------------------------------------------------------------------------
1 | NUMERICAL_TYPE = "num"
2 | NUMERICAL_PREFIX = "n_"
3 | 
4 | CATEGORY_TYPE = "cat"
5 | CATEGORY_PREFIX = "c_"
6 | 
7 | TIME_TYPE = "time"
8 | TIME_PREFIX = "t_"
9 | 
10 | MULTI_CAT_TYPE = "multi-cat"
11 | MULTI_CAT_PREFIX = "m_"
12 | MULTI_CAT_DELIMITER = ","
13 | 
14 | BINARY_TYPE = "binary"
15 | BINARY_PREFIX = 'b_'
16 | 
17 | MAIN_TABLE_NAME = "main"
18 | MAIN_TABLE_TEST_NAME = "main_test"
19 | TABLE_PREFIX = "table_"
20 | 
21 | LABEL = "label"
22 | 
23 | type2prefix = {
24 |     NUMERICAL_TYPE: NUMERICAL_PREFIX,
25 |     CATEGORY_TYPE: CATEGORY_PREFIX,
26 |     TIME_TYPE: TIME_PREFIX,
27 |     MULTI_CAT_TYPE: MULTI_CAT_PREFIX,
28 |     BINARY_TYPE: BINARY_PREFIX
29 | }
30 | 
31 | THREAD_NUM = 4
32 | 
33 | SEED = 2222
34 | JOBS = 7
35 | 
36 | CAT_SHIFT = 1
37 | 
38 | MAX_SAMPLE_NUM = 1000000
39 | 
40 | TIME_MIN_BINS = 1000
41 | SEGMENTS = 100
42 | 
43 | LESS_LIMIT = 10
44 | SMOOTH_SHIFT = 100
45 | DEVIATION_SHIFT = 100
46 | 
47 | KEYWORDS = ["label", 'index']
48 | 
49 | SPLIT = -1
50 | 
51 | round_opt = False
52 | 
53 | SAMPLE_NUM = 210000
54 | 
55 | USE_ENSEMBLE = 1
56 | 
--------------------------------------------------------------------------------
/auto_smart/auto_smart/PATHS.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from datetime import datetime
3 | version = datetime.now().strftime('%Y%m%d%H%M%S')
4 | print('version:{}'.format(version))
5 | feature_importance_path = '../importances/'
6 | 
7 | 
--------------------------------------------------------------------------------
/auto_smart/auto_smart/__init__.py:
--------------------------------------------------------------------------------
1 | name = "auto_smart"
2 | import os
3 | import sys
4 | ABSPATH = os.path.abspath(os.path.realpath(os.path.dirname(__file__)))
5 | sys.path.append(ABSPATH)
6 | 
7 | from auto_smart.model import Model
8 | import numpy as np
9 | import pandas as pd
10 | import json
11 | from os.path import join
12 | from datetime import datetime
13 | 
14 | 
15 | TYPE_MAP = {
16 |     'time': str,
17 |     'cat': str,
18 |     'multi-cat': str,
19 |     'num': np.float64
20 | }
21 | 
22 | def read_info(datapath):
23 |     with open(join(datapath, 'train', 'info.json'), 'r') as info_fp:
24 |         info = json.load(info_fp)
25 |     return info
26 | 
27 | def read_train(datapath, info):
28 |     train_data = {}
29 |     for table_name, columns in info['tables'].items():
30 | 
31 |         table_dtype = {key: TYPE_MAP[val] for key, val in columns.items()}
32 | 
33 |         if table_name == 'main':
34 |             table_path = join(datapath, 'train', 'main_train.data')
35 |         else:
36 |             table_path = join(datapath, 'train', f'{table_name}.data')
37 | 
38 |         date_list = [key for key, val in columns.items() if val == 'time']
39 | 
40 |         train_data[table_name] = pd.read_csv(  # timestamps are epoch milliseconds; NaN passes through
41 |             table_path, sep='\t', dtype=table_dtype, parse_dates=date_list,
42 |             date_parser=lambda millisecs: millisecs if np.isnan(
43 |                 float(millisecs)) else datetime.fromtimestamp(
44 |                 float(millisecs)/1000))
45 | 
46 |     # get train label
47 |     train_label = pd.read_csv(
48 |         join(datapath, 'train', 'main_train.solution'))['label']
49 |     return train_data, train_label
50 | 
51 | 
52 | def read_test(datapath, info):
53 |     # get test data
54 |     main_columns = info['tables']['main']
TYPE_MAP[val] for key, val in main_columns.items()} 56 | 57 | table_path = join(datapath, 'test', 'main_test.data') 58 | 59 | date_list = [key for key, val in main_columns.items() if val == 'time'] 60 | 61 | test_data = pd.read_csv( 62 | table_path, sep='\t', dtype=table_dtype, parse_dates=date_list, 63 | date_parser=lambda millisecs: millisecs if np.isnan( 64 | float(millisecs)) else datetime.fromtimestamp( 65 | float(millisecs) / 1000)) 66 | return test_data 67 | 68 | def train_and_predict(train_data,train_label,info,test_data): 69 | cmodel = Model(info) 70 | cmodel.fit(train_data, train_label) 71 | return cmodel.predict(test_data) 72 | 73 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/automl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeepBlueAI/AutoSmart/99a6b44ce5b6520a2463c8a4b3acc675584533be/auto_smart/auto_smart/automl/__init__.py -------------------------------------------------------------------------------- /auto_smart/auto_smart/automl/auto_lgb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import lightgbm as lgb 3 | import numpy as np 4 | import CONSTANT 5 | from util import log, timeclass 6 | from .automl import AutoML 7 | import pandas as pd 8 | import gc 9 | from . import autosample 10 | import time 11 | import copy 12 | from sklearn.metrics import roc_auc_score 13 | 14 | class AutoLGB(AutoML): 15 | def __init__(self): 16 | self.params = { 17 | "boosting_type": "gbdt", 18 | "objective": "binary", 19 | "metric": "auc", 20 | "verbosity": 1, 21 | "seed": CONSTANT.SEED, 22 | "num_threads": CONSTANT.THREAD_NUM 23 | } 24 | 25 | self.hyperparams = { 26 | 'num_leaves': 31, 27 | 'max_depth': -1, 28 | 'min_child_samples': 20, 29 | 'max_bin':255, 30 | 'subsample': 0.8, 31 | 'subsample_freq': 1, 32 | 'colsample_bytree': 0.8, 33 | 'min_child_weight': 0.001, 34 | 'subsample_for_bin': 200000, 35 | 'min_split_gain': 0.02, 36 | 'reg_alpha': 0.1, 37 | 'reg_lambda': 0.1, 38 | } 39 | 40 | self.early_stopping_rounds = 50 41 | 42 | @timeclass(cls='AutoLGB') 43 | def predict(self,X): 44 | X = X[self.columns] 45 | X.columns = self.new_feat_name_cols 46 | return self.model.predict(X) 47 | 48 | @timeclass(cls='AutoLGB') 49 | def ensemble_train(self,X,y,categories,config,len_test): 50 | feat_name = list(X.columns) 51 | self.ensemble_models = [] 52 | self.ensemble_columns = [] 53 | columns = list(X.columns) 54 | log(f'lgb training set shape: {X.shape}') 55 | pos = (y==1).sum() 56 | neg = (y==0).sum() 57 | log(f'pos {pos} neg {neg}') 58 | 59 | self.columns = columns 60 | max_sample_num = len(y) 61 | 62 | feat_name_cols = list(X.columns) 63 | feat_name_maps = { feat_name_cols[i] : str(i) for i in range(len(feat_name_cols)) } 64 | f_feat_name_maps = { str(i) : feat_name_cols[i] for i in range(len(feat_name_cols)) } 65 | new_feat_name_cols = [ feat_name_maps[i] for i in feat_name_cols ] 66 | X.columns = new_feat_name_cols 67 | categories = [ feat_name_maps[i] for i in categories ] 68 | self.f_feat_name_maps = f_feat_name_maps 69 | self.new_feat_name_cols = new_feat_name_cols 70 | 71 | all_columns = list(X.columns) 72 | 73 | start_time = time.time() 74 | i = 0 75 | cur_columns = all_columns 76 | seed = np.random.randint(2019*i,2019*(i+1)) 77 | X_train,y_train = autosample.downsampling(X,y,max_sample_num,seed) 78 | X_train = X_train[cur_columns] 79 | gc.collect() 80 | 81 | colset = 
set(X_train.columns) 82 | cur_categorical = [col for col in categories if col in colset] 83 | pos = (y_train==1).sum() 84 | neg = (y_train==0).sum() 85 | 86 | params = self.params 87 | hyperparams = self.hyperparams 88 | params['seed'] = seed 89 | 90 | X_train = X_train.astype(np.float32) 91 | gc.collect() 92 | y_train = y_train.astype(np.float32) 93 | gc.collect() 94 | X_train = X_train.values 95 | gc.collect() 96 | y_train = y_train.values 97 | gc.collect() 98 | 99 | train_data = lgb.Dataset(X_train, label=y_train,feature_name=feat_name) 100 | del X_train,y_train 101 | gc.collect() 102 | 103 | model = lgb.train({**params, **hyperparams}, 104 | train_data, 105 | num_boost_round=self.best_iteration, 106 | feature_name=cur_columns, 107 | categorical_feature=cur_categorical, 108 | learning_rates = self.learning_rates[:self.best_iteration]) 109 | 110 | self.ensemble_columns.append(cur_columns) 111 | self.ensemble_models.append(model) 112 | end_time = time.time() 113 | 114 | model_use_time = end_time - start_time 115 | del train_data 116 | 117 | gc.collect() 118 | 119 | start_time = time.time() 120 | temp = X.iloc[:100000] 121 | 122 | temp = temp.astype(np.float32) 123 | gc.collect() 124 | temp = temp.values 125 | gc.collect() 126 | 127 | model.predict(temp) 128 | 129 | end_time = time.time() 130 | model_test_use_time = (end_time-start_time) 131 | model_test_use_time = len_test/temp.shape[0] * model_test_use_time 132 | model_use_time = model_use_time + model_test_use_time 133 | del temp,model 134 | 135 | rest_time = config.budget/10*9-(end_time-config.start_time) 136 | if rest_time <= 0: 137 | rest_model_num = 0 138 | else: 139 | rest_model_num = int(rest_time // model_use_time) 140 | 141 | if rest_model_num >= 50: 142 | rest_model_num = 50 143 | 144 | if rest_model_num >= 1: 145 | rest_model_num -= 1 146 | 147 | if not CONSTANT.USE_ENSEMBLE: 148 | rest_model_num = 0 149 | 150 | for i in range(1,rest_model_num+1): 151 | 152 | seed = np.random.randint(2019*i,2019*(i+1)) 153 | 154 | cur_columns = list(pd.Series(all_columns).sample(frac=0.85,replace=False,random_state=seed)) 155 | 156 | X_train,y_train = autosample.downsampling(X,y,max_sample_num,seed) 157 | X_train = X_train[cur_columns] 158 | gc.collect() 159 | 160 | colset = set(X_train.columns) 161 | cur_categorical = [col for col in categories if col in colset] 162 | 163 | pos = (y_train==1).sum() 164 | neg = (y_train==0).sum() 165 | 166 | params = self.params 167 | hyperparams = self.hyperparams 168 | params['seed'] = seed 169 | 170 | num_leaves = hyperparams['num_leaves'] 171 | num_leaves = num_leaves + np.random.randint(-int(num_leaves/10),int(num_leaves/10)+7) 172 | 173 | lrs = np.array(self.learning_rates) 174 | rands = 1 + 0.2*np.random.rand(len(lrs)) 175 | lrs = list(lrs * rands) 176 | 177 | cur_iteration = self.best_iteration 178 | cur_iteration = cur_iteration + np.random.randint(-30,40) 179 | if cur_iteration > len(lrs): 180 | cur_iteration = len(lrs) 181 | 182 | if cur_iteration <= 10: 183 | cur_iteration = self.best_iteration 184 | 185 | cur_hyperparams = copy.deepcopy(hyperparams) 186 | cur_hyperparams['num_leaves'] = num_leaves 187 | 188 | X_train = X_train.astype(np.float32) 189 | gc.collect() 190 | y_train = y_train.astype(np.float32) 191 | gc.collect() 192 | X_train = X_train.values 193 | gc.collect() 194 | y_train = y_train.values 195 | gc.collect() 196 | 197 | train_data = lgb.Dataset(X_train, label=y_train,feature_name=cur_columns) 198 | del X_train,y_train 199 | gc.collect() 200 | 201 | model = lgb.train({**params, 
**cur_hyperparams}, 202 | train_data, 203 | num_boost_round=cur_iteration, 204 | feature_name=cur_columns, 205 | categorical_feature=cur_categorical, 206 | learning_rates = lrs[:cur_iteration]) 207 | 208 | 209 | self.ensemble_columns.append(cur_columns) 210 | self.ensemble_models.append(model) 211 | 212 | del train_data 213 | gc.collect() 214 | 215 | X.columns = self.columns 216 | 217 | 218 | @timeclass(cls='AutoLGB') 219 | def ensemble_predict(self,X): 220 | X = X[self.columns] 221 | gc.collect() 222 | 223 | X.columns = self.new_feat_name_cols 224 | 225 | preds = [] 226 | for model,cur_cols in zip(self.ensemble_models,self.ensemble_columns): 227 | gc.collect() 228 | tX = X[cur_cols] 229 | gc.collect() 230 | tX = tX.astype(np.float32) 231 | gc.collect() 232 | tX = tX.values 233 | gc.collect() 234 | 235 | preds.append(model.predict( tX )) 236 | gc.collect() 237 | 238 | if len(preds) == 1: 239 | pred = preds[0] 240 | 241 | if len(preds) > 1: 242 | total_model_num = len(preds) 243 | 244 | main_model_weight = 8 / (8 + 2 * (total_model_num-1)) 245 | rest_model_weight = 2 / (8 + 2 * (total_model_num-1)) 246 | pred = preds[0] * main_model_weight 247 | for i in range(1,total_model_num): 248 | pred = pred + rest_model_weight * preds[i] 249 | 250 | return pred 251 | 252 | @timeclass(cls='AutoLGB') 253 | def ensemble_predict_test(self,X): 254 | X = X[self.columns] 255 | gc.collect() 256 | 257 | X.columns = self.new_feat_name_cols 258 | log(f'ensemble models {len(self.ensemble_models)}') 259 | preds = [] 260 | for model,cur_cols in zip(self.ensemble_models,self.ensemble_columns): 261 | gc.collect() 262 | tX = X[cur_cols] 263 | gc.collect() 264 | tX = tX.astype(np.float32) 265 | gc.collect() 266 | tX = tX.values 267 | gc.collect() 268 | 269 | preds.append(model.predict( tX )) 270 | gc.collect() 271 | 272 | if len(preds) == 1: 273 | pred = preds[0] 274 | 275 | if len(preds) > 1: 276 | total_model_num = len(preds) 277 | 278 | main_model_weight = 8 / (8 + 2 * (total_model_num-1)) 279 | rest_model_weight = 2 / (8 + 2 * (total_model_num-1)) 280 | pred = preds[0] * main_model_weight 281 | for i in range(1,total_model_num): 282 | pred = pred + rest_model_weight * preds[i] 283 | 284 | return pred,preds[0] 285 | 286 | def get_log_lr(self,num_boost_round,max_lr,min_lr): 287 | learning_rates = [max_lr+(min_lr-max_lr)/np.log(num_boost_round)*np.log(i) for i in range(1,num_boost_round+1)] 288 | return learning_rates 289 | 290 | def set_num_leaves(self,X,y): 291 | t = len(y) 292 | t = X.shape[1]*(t/40000) 293 | level = t**0.225 + 1.5 294 | num_leaves = int(2**level) + 10 295 | num_leaves = min(num_leaves, 128) 296 | num_leaves = max(num_leaves, 32) 297 | self.hyperparams['num_leaves'] = num_leaves 298 | 299 | def set_min_child_samples(self, X,y ): 300 | min_child_samples = ( (X.shape[0]/20000)**0.6 ) *15 301 | min_child_samples = int(min_child_samples) 302 | min_child_samples = min(min_child_samples, 150) 303 | min_child_samples = max(min_child_samples, 15) 304 | 305 | self.hyperparams['min_child_samples'] = min_child_samples 306 | 307 | @timeclass(cls='AutoLGB') 308 | def lr_opt(self,train_data,valid_data,categories): 309 | params = self.params 310 | hyperparams = self.hyperparams 311 | 312 | max_lrs = [0.1,0.08,0.05,0.02] 313 | min_lrs = [0.04,0.02,0.01,0.005] 314 | 315 | num_boost_round = self.num_boost_round 316 | max_num_boost_round = min(400,num_boost_round) 317 | best_score = -1 318 | best_loop = -1 319 | lr = None 320 | 321 | scores = [] 322 | lrs = [] 323 | for max_lr,min_lr in zip(max_lrs,min_lrs): 324 | 
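# Probe each candidate log-decay schedule for at most 400 rounds and score it
# by validation AUC; the best-scoring schedule is kept for the full training run.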
learning_rates = self.get_log_lr(num_boost_round,max_lr,min_lr) 325 | 326 | model = lgb.train({**params, **hyperparams}, train_data, num_boost_round=max_num_boost_round,\ 327 | categorical_feature=categories,learning_rates = learning_rates[:max_num_boost_round] 328 | ) 329 | pred = model.predict(valid_data.data) 330 | score = roc_auc_score(valid_data.label,pred) 331 | scores.append(score) 332 | lrs.append(learning_rates) 333 | del model, pred 334 | gc.collect() 335 | 336 | best_loop = np.argmax(scores) 337 | best_score = np.max(scores) 338 | lr = lrs[best_loop] 339 | log(f'scores {scores}') 340 | log(f'loop {best_loop}') 341 | log(f'lr max {lr[0]} min {lr[-1]}') 342 | log(f'lr best score {best_score}') 343 | return lr 344 | 345 | @timeclass(cls='AutoLGB') 346 | def num_leaves_opt(self,train_data,valid_data,categories): 347 | params = self.params 348 | hyperparams = self.hyperparams 349 | num_leaves = [31,63,127,255] 350 | 351 | num_boost_round = 500 352 | best_iteration = -1 353 | i = 0 354 | best_score = -1 355 | best_loop = -1 356 | best_num_leaves = None 357 | 358 | for leaves in num_leaves: 359 | hyperparams['num_leaves'] = leaves 360 | model = lgb.train({**params, **hyperparams}, train_data, num_boost_round=num_boost_round,\ 361 | valid_sets=[valid_data], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=100,\ 362 | categorical_feature=categories,learning_rates = self.learning_rates 363 | ) 364 | 365 | score = model.best_score["valid_0"][params["metric"]] 366 | if score > best_score: 367 | best_num_leaves = leaves 368 | best_iteration = model.best_iteration 369 | best_score = score 370 | best_loop = i 371 | 372 | return best_num_leaves 373 | 374 | @timeclass(cls='AutoLGB') 375 | def subsample_opt(self,num_samples): 376 | samples = num_samples 377 | if samples > 1000000: 378 | samples = 1000000 379 | 380 | if samples<200000: 381 | subsample = 0.95 - samples/1000000 382 | return subsample 383 | 384 | subsample = 0.85-samples/2500000 385 | return subsample 386 | 387 | @timeclass(cls='AutoLGB') 388 | def colsample_bytree_opt(self,num_feature): 389 | if num_feature > 500: 390 | num_feature = 500 391 | 392 | if num_feature > 100: 393 | colsample_bytree = 0.8 - num_feature/2000 394 | else: 395 | colsample_bytree = 0.95 - num_feature/500 396 | 397 | return colsample_bytree 398 | 399 | @timeclass(cls='AutoLGB') 400 | def param_compute(self,X,y,categories,config): 401 | feat_name = list(X.columns) 402 | colsample_bytree = self.colsample_bytree_opt(X.shape[1]) 403 | self.hyperparams['colsample_bytree'] = colsample_bytree 404 | 405 | max_sample_num = len(y) 406 | subsample = self.subsample_opt(autosample.downsampling_y(y,max_sample_num).shape[0]) 407 | self.hyperparams['subsample'] = subsample 408 | 409 | max_sample_num = min(len(y),50000) 410 | X_sample,y_sample = autosample.downsampling(X,y,max_sample_num) 411 | gc.collect() 412 | params = self.params 413 | 414 | start_time = time.time() 415 | X_sample = X_sample.astype(np.float32) 416 | gc.collect() 417 | y_sample = y_sample.astype(np.float32) 418 | gc.collect() 419 | X_sample = X_sample.values 420 | gc.collect() 421 | y_sample = y_sample.values 422 | gc.collect() 423 | end_time = time.time() 424 | transfer_time = end_time-start_time 425 | 426 | time_number_boost_round1 = 15 427 | start_time = time.time() 428 | train_data = lgb.Dataset(X_sample, label=y_sample,feature_name=feat_name) 429 | 430 | gc.collect() 431 | 432 | lgb.train({**params, **self.hyperparams}, train_data, num_boost_round=time_number_boost_round1,\ 433 | 
categorical_feature=categories,) 434 | 435 | end_time = time.time() 436 | 437 | model_use_time1 = end_time - start_time 438 | 439 | time_number_boost_round2 = time_number_boost_round1*2 440 | 441 | del train_data 442 | gc.collect() 443 | 444 | start_time = time.time() 445 | train_data = lgb.Dataset(X_sample, label=y_sample,feature_name=feat_name) 446 | del X_sample,y_sample 447 | gc.collect() 448 | lgb.train({**params, **self.hyperparams}, train_data, num_boost_round=time_number_boost_round2,\ 449 | categorical_feature=categories,) 450 | 451 | del train_data 452 | gc.collect() 453 | end_time = time.time() 454 | 455 | model_use_time2 = end_time - start_time 456 | 457 | boost_time = (model_use_time2 - model_use_time1) 458 | boost_round = time_number_boost_round2 - time_number_boost_round1 459 | preprocess_time = model_use_time1 - boost_time 460 | model_sample_time = 4 * (transfer_time + preprocess_time + (boost_time * (400/boost_round))) + 5 461 | 462 | max_sample_num = len(y) 463 | X,y = autosample.downsampling(X,y,max_sample_num) 464 | 465 | gc.collect() 466 | pos = (y==1).sum() 467 | neg = (y==0).sum() 468 | 469 | gc.collect() 470 | params = self.params 471 | 472 | time_number_boost_round1 = 15 473 | 474 | start_time = time.time() 475 | X = X.astype(np.float32) 476 | gc.collect() 477 | y = y.astype(np.float32) 478 | gc.collect() 479 | X = X.values 480 | gc.collect() 481 | y = y.values 482 | gc.collect() 483 | end_time = time.time() 484 | 485 | transfer_time = end_time-start_time 486 | 487 | start_time = time.time() 488 | train_data = lgb.Dataset(X, label=y,feature_name=feat_name) 489 | 490 | gc.collect() 491 | 492 | 493 | lgb.train({**params, **self.hyperparams}, train_data, num_boost_round=time_number_boost_round1,\ 494 | categorical_feature=categories,) 495 | 496 | del train_data 497 | gc.collect() 498 | end_time = time.time() 499 | 500 | model_use_time1 = end_time - start_time 501 | 502 | time_number_boost_round2 = time_number_boost_round1*2 503 | 504 | start_time = time.time() 505 | train_data = lgb.Dataset(X, label=y,feature_name=feat_name) 506 | del X,y 507 | gc.collect() 508 | lgb.train({**params, **self.hyperparams}, train_data, num_boost_round=time_number_boost_round2,\ 509 | categorical_feature=categories,) 510 | 511 | del train_data 512 | gc.collect() 513 | end_time = time.time() 514 | 515 | model_use_time2 = end_time - start_time 516 | 517 | boost_time = (model_use_time2 - model_use_time1) 518 | boost_round = time_number_boost_round2 - time_number_boost_round1 519 | preprocess_time = model_use_time1 - boost_time 520 | 521 | rest_time = config.budget/10*9-(end_time-config.start_time)-model_sample_time-10 522 | 523 | self.num_boost_round = 20 524 | for number_boost_round in [700,600,500,400,300,200,100,50]: 525 | real_model_time = (transfer_time + preprocess_time + (boost_time * (number_boost_round/boost_round))) 526 | if real_model_time > rest_time: 527 | continue 528 | else: 529 | self.num_boost_round = number_boost_round 530 | break 531 | 532 | gc.collect() 533 | 534 | @timeclass(cls='AutoLGB') 535 | def param_opt(self,X_train,y_train,X_valid,y_valid,categories): 536 | feat_name = list(X_train.columns) 537 | 538 | pos = (y_train==1).sum() 539 | neg = (y_train==0).sum() 540 | val_pos = (y_valid==1).sum() 541 | val_neg = (y_valid==0).sum() 542 | 543 | max_sample_num = min(len(y_train),50000) 544 | X,y = autosample.downsampling(X_train,y_train,max_sample_num) 545 | 546 | pos = (y==1).sum() 547 | neg = (y==0).sum() 548 | 549 | train_data = lgb.Dataset(X, 
label=y,feature_name=feat_name) 550 | del X,y 551 | gc.collect() 552 | 553 | valid_data = lgb.Dataset(X_valid, label=y_valid,feature_name=feat_name,free_raw_data=False) 554 | del X_valid,y_valid 555 | gc.collect() 556 | 557 | lr = self.lr_opt(train_data,valid_data,categories) 558 | self.learning_rates = lr 559 | 560 | self.best_iteration = self.num_boost_round 561 | 562 | del train_data 563 | gc.collect() 564 | 565 | num_boost_round = self.num_boost_round 566 | params = self.params 567 | max_sample_num = len(y_train) 568 | 569 | X,y = autosample.downsampling(X_train,y_train,max_sample_num) 570 | del X_train,y_train 571 | 572 | gc.collect() 573 | pos = (y==1).sum() 574 | neg = (y==0).sum() 575 | 576 | X = X.astype(np.float32) 577 | gc.collect() 578 | y = y.astype(np.float32) 579 | gc.collect() 580 | X = X.values 581 | gc.collect() 582 | y = y.values 583 | gc.collect() 584 | 585 | train_data = lgb.Dataset(X, label=y,feature_name=feat_name) 586 | 587 | del X,y 588 | gc.collect() 589 | 590 | model = lgb.train({**params, **self.hyperparams}, train_data, num_boost_round=num_boost_round,\ 591 | valid_sets=[valid_data], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=100,\ 592 | categorical_feature=categories,learning_rates = self.learning_rates 593 | ) 594 | gc.collect() 595 | 596 | best_model = model 597 | 598 | best_score = model.best_score["valid_0"][params["metric"]] 599 | 600 | if model.best_iteration > 50: 601 | self.best_iteration = model.best_iteration 602 | elif model.current_iteration() > 50: 603 | self.best_iteration = model.current_iteration() 604 | else: 605 | self.best_iteration = 50 606 | 607 | return best_model,best_score 608 | 609 | def get_importances(self): 610 | model = self.model 611 | importances = pd.DataFrame({'features':[ self.f_feat_name_maps[i] for i in model.feature_name() ] , 612 | 'importances':model.feature_importance()}) 613 | 614 | importances.sort_values('importances',ascending=False,inplace=True) 615 | 616 | return importances 617 | 618 | @timeclass(cls='AutoLGB') 619 | def ensemble_predict_train(self,X): 620 | X = X[X.columns] 621 | X.columns = self.new_feat_name_cols 622 | 623 | preds = [] 624 | for model in self.ensemble_models: 625 | preds.append(model.predict(X)) 626 | 627 | pred = np.stack(preds,axis=1).mean(axis=1) 628 | return pred 629 | 630 | def get_ensemble_importances(self): 631 | model = self.ensemble_models[0] 632 | importances = pd.DataFrame({'features':[ self.f_feat_name_maps[i] for i in model.feature_name() ] , 633 | 'importances':model.feature_importance()}) 634 | 635 | importances.sort_values('importances',ascending=False,inplace=True) 636 | 637 | return importances 638 | 639 | @timeclass(cls='AutoLGB') 640 | def param_opt_new(self,X_train,y_train,X_valid,y_valid,categories): 641 | feat_name = list(X_train.columns) 642 | 643 | pos = (y_train==1).sum() 644 | neg = (y_train==0).sum() 645 | val_pos = (y_valid==1).sum() 646 | val_neg = (y_valid==0).sum() 647 | log(f'training set pos {pos} neg {neg}') 648 | log(f'validation set pos {val_pos} neg {val_neg}') 649 | 650 | max_sample_num = min(len(y_train),50000) 651 | X,y = autosample.downsampling(X_train,y_train,max_sample_num) 652 | 653 | pos = (y==1).sum() 654 | neg = (y==0).sum() 655 | log(f'opt downsampling set pos {pos} neg {neg}') 656 | 657 | X = X.astype(np.float32) 658 | gc.collect() 659 | y = y.astype(np.float32) 660 | gc.collect() 661 | X = X.values 662 | gc.collect() 663 | y = y.values 664 | gc.collect() 665 | 666 | train_data = lgb.Dataset(X, label=y,feature_name=feat_name) 
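# The arrays were downcast to float32 and materialized as bare ndarrays above
# (roughly half the float64 footprint); the Dataset now holds its own reference
# to them, so the local names X and y are no longer needed.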
667 | del X,y 668 | gc.collect() 669 | 670 | valid_data = lgb.Dataset(X_valid, label=y_valid,feature_name=feat_name,free_raw_data=False) 671 | del X_valid,y_valid 672 | gc.collect() 673 | 674 | lr = self.lr_opt(train_data,valid_data,categories) 675 | del train_data 676 | gc.collect() 677 | self.learning_rates = lr 678 | 679 | self.best_iteration = self.num_boost_round 680 | log(f'pass round opt, use best iteration as {self.best_iteration}') 681 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/automl/automl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | class AutoML: 4 | def __init__(self): 5 | self.params = { 6 | 7 | } 8 | 9 | def train(self,X,y,categories): 10 | pass 11 | 12 | def predict(self,X): 13 | pass 14 | 15 | 16 | def param_opt(self,X_train,y_train,X_valid,y_valid,categories): 17 | pass 18 | 19 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/automl/autosample.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas as pd 3 | 4 | def get_downsampling_num(npos,nneg,sample_num,unbalanced_ratio,min_neg_pos_ratio=2): 5 | 6 | reverse = False 7 | ntol = npos + nneg 8 | if npos>nneg: 9 | reverse = True 10 | tmp = npos 11 | npos = nneg 12 | nneg = tmp 13 | 14 | max_sample_num = min(npos, nneg)*(unbalanced_ratio+1) 15 | if max_sample_num>sample_num: 16 | max_sample_num = sample_num 17 | 18 | if npos+nneg > max_sample_num: 19 | 20 | if nneg/npos <= min_neg_pos_ratio: 21 | pos_num = npos/ntol * max_sample_num 22 | neg_num = nneg/ntol * max_sample_num 23 | 24 | elif nneg/npos <= unbalanced_ratio: 25 | if npos > max_sample_num/(min_neg_pos_ratio+1): 26 | pos_num = max_sample_num/(min_neg_pos_ratio+1) 27 | neg_num = max_sample_num - pos_num 28 | else: 29 | pos_num = npos 30 | neg_num = max_sample_num - pos_num 31 | 32 | elif nneg/npos > unbalanced_ratio: 33 | if npos > max_sample_num/(unbalanced_ratio+1): 34 | pos_num = max_sample_num/(unbalanced_ratio+1) 35 | neg_num = max_sample_num - pos_num 36 | 37 | else: 38 | pos_num = npos 39 | neg_num = max_sample_num - npos 40 | 41 | else: 42 | neg_num = nneg 43 | pos_num = npos 44 | 45 | if neg_num/pos_num > unbalanced_ratio: 46 | neg_num = pos_num*unbalanced_ratio 47 | 48 | neg_num = int(neg_num) 49 | pos_num = int(pos_num) 50 | if reverse: 51 | return neg_num,pos_num 52 | 53 | return pos_num,neg_num 54 | 55 | def sample(X,frac,seed,y=None): 56 | if frac == 1: 57 | X = X.sample(frac=1,random_state=seed) 58 | elif frac > 1: 59 | mul = int(frac) 60 | frac = frac - int(frac) 61 | X_res = X.sample(frac=frac,random_state=seed) 62 | X = pd.concat([X] * mul + [X_res]) 63 | else: 64 | X = X.sample(frac=frac,random_state=seed) 65 | 66 | if y is not None: 67 | y = y.loc[X.index] 68 | return X,y 69 | return X 70 | 71 | 72 | def downsampling_num(y,max_sample_num): 73 | npos = (y==1).sum() 74 | nneg = (y==0).sum() 75 | 76 | 77 | min_num = min(npos,nneg) 78 | min_num = max(min_num,1000) 79 | 80 | if min_num < 8000: 81 | unbalanced_ratio = 10 - (min_num//1000) 82 | else: 83 | unbalanced_ratio = 3 84 | 85 | pos_num,neg_num = get_downsampling_num(npos,nneg,max_sample_num,unbalanced_ratio) 86 | return pos_num,neg_num 87 | 88 | 89 | def class_sample(X,y,pos_num,neg_num,seed=2019): 90 | 91 | npos = float((y == 1).sum()) 92 | nneg = len(y) - npos 93 | 94 | pos_frac = pos_num / npos 95 | neg_frac = neg_num / nneg 
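# pos_frac and neg_frac are per-class sampling ratios. sample() above also
# accepts frac > 1 (it stacks floor(frac) full copies plus a fractional
# remainder), although the downsampling callers appear to always keep frac <= 1.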
96 | 97 | X_pos = X[y == 1] 98 | X_pos = sample(X_pos,pos_frac,seed) 99 | 100 | X_neg = X[y == 0] 101 | X_neg = sample(X_neg,neg_frac,seed) 102 | 103 | X = pd.concat([X_pos,X_neg]) 104 | 105 | X,y = sample(X,1,seed,y) 106 | 107 | return X,y 108 | 109 | def downsampling(X,y,max_sample_num,seed=2019): 110 | pos_num,neg_num = downsampling_num(y,max_sample_num) 111 | return class_sample(X,y,pos_num,neg_num,seed) 112 | 113 | def class_sample_y(y,pos_num,neg_num,seed=2019): 114 | 115 | npos = float((y == 1).sum()) 116 | nneg = len(y) - npos 117 | 118 | pos_frac = pos_num / npos 119 | neg_frac = neg_num / nneg 120 | 121 | y_pos = y[y == 1] 122 | y_pos = sample(y_pos,pos_frac,seed) 123 | 124 | y_neg = y[y == 0] 125 | y_neg = sample(y_neg,neg_frac,seed) 126 | 127 | y = pd.concat([y_pos,y_neg]) 128 | 129 | y = sample(y,1,seed) 130 | 131 | return y 132 | 133 | def downsampling_y(y,max_sample_num,seed=2019): 134 | pos_num,neg_num = downsampling_num(y,max_sample_num) 135 | y = class_sample_y(y,pos_num,neg_num,seed) 136 | return y 137 | 138 | 139 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/automl/model_selection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | def time_train_test_split(X,y,test_rate=0.2,shuffle=True,random_state=1): 5 | length = X.shape[0] 6 | 7 | 8 | test_size = int(length * test_rate) 9 | train_size = length - test_size 10 | 11 | X_train = X.iloc[:train_size] 12 | y_train = y.iloc[:train_size] 13 | X_test = X.iloc[train_size:] 14 | y_test = y.iloc[train_size:] 15 | 16 | if shuffle: 17 | np.random.seed(random_state) 18 | idx = np.arange(train_size) 19 | np.random.shuffle(idx) 20 | X_train = X_train.iloc[idx] 21 | y_train = y_train.iloc[idx] 22 | 23 | return X_train,y_train,X_test,y_test 24 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | class Config: 4 | def __init__(self, start_time,budget): 5 | if budget >= 1000: 6 | self.keys_order2_cat_max = 50 7 | self.keys_order2_num_max = 50 8 | 9 | self.keys_order2_cat_maxmin = 10 10 | self.keys_order2_num_maxmin = 10 11 | self.keys_order2_num_std = 5 12 | 13 | self.keys_order2_bin_num_max = 20 14 | self.keys_order2_bin_cat_max = 20 15 | 16 | self.all_order2_cat_max = 7 17 | self.all_order2_num_max = 7 18 | 19 | 20 | self.keys_order3_num_max = 10 21 | self.keys_order3_cat_max = 10 22 | 23 | self.wait_feat_selection_num = 30 24 | self.wait_feat_selection_num_all = 20 25 | 26 | self.start_time = start_time 27 | self.budget = budget 28 | else: 29 | self.keys_order2_cat_max = 40 30 | self.keys_order2_num_max = 40 31 | 32 | self.keys_order2_cat_maxmin = 10 33 | self.keys_order2_num_maxmin = 10 34 | self.keys_order2_num_std = 5 35 | 36 | self.keys_order2_bin_num_max = 10 37 | self.keys_order2_bin_cat_max = 10 38 | 39 | self.all_order2_cat_max = 7 40 | self.all_order2_num_max = 7 41 | 42 | self.keys_order3_num_max = 10 43 | self.keys_order3_cat_max = 10 44 | 45 | self.wait_feat_selection_num = 30 46 | self.wait_feat_selection_num_all = 20 47 | 48 | self.start_time = start_time 49 | self.budget = budget 50 | 51 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/data_tools.py: -------------------------------------------------------------------------------- 1 | # 
-*- coding: utf-8 -*- 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | def downcast(series,accuracy_loss = True, min_float_type='float16'): 7 | if series.dtype == np.int64: 8 | ii8 = np.iinfo(np.int8) 9 | ii16 = np.iinfo(np.int16) 10 | ii32 = np.iinfo(np.int32) 11 | max_value = series.max() 12 | min_value = series.min() 13 | 14 | if max_value <= ii8.max and min_value >= ii8.min: 15 | return series.astype(np.int8) 16 | elif max_value <= ii16.max and min_value >= ii16.min: 17 | return series.astype(np.int16) 18 | elif max_value <= ii32.max and min_value >= ii32.min: 19 | return series.astype(np.int32) 20 | else: 21 | return series 22 | 23 | elif series.dtype == np.float64: 24 | fi16 = np.finfo(np.float16) 25 | fi32 = np.finfo(np.float32) 26 | 27 | if accuracy_loss: 28 | max_value = series.max() 29 | min_value = series.min() 30 | if np.isnan(max_value): 31 | max_value = 0 32 | 33 | if np.isnan(min_value): 34 | min_value = 0 35 | 36 | if min_float_type=='float16' and max_value <= fi16.max and min_value >= fi16.min: 37 | return series.astype(np.float16) 38 | elif max_value <= fi32.max and min_value >= fi32.min: 39 | return series.astype(np.float32) 40 | else: 41 | return series 42 | else: 43 | tmp = series[~pd.isna(series)] 44 | if(len(tmp)==0): 45 | return series.astype(np.float16) 46 | 47 | if (tmp == tmp.astype(np.float16)).sum() == len(tmp): 48 | return series.astype(np.float16) 49 | elif (tmp == tmp.astype(np.float32)).sum() == len(tmp): 50 | return series.astype(np.float32) 51 | 52 | else: 53 | return series 54 | 55 | else: 56 | return series 57 | 58 | def gen_segs_array(shape0,nseg): 59 | segs = np.zeros(shape0) 60 | block_size = int(shape0/nseg)+1 61 | for i in range(nseg): 62 | segs[i*block_size:(i+1)*block_size] = i 63 | return segs 64 | 65 | 66 | def gen_segs_tuple(shape0,nseg): 67 | segs = [] 68 | block_size = int(shape0/nseg) 69 | i = -1 70 | for i in range(nseg-1): 71 | segs.append( (i*block_size,(i+1)*block_size) ) 72 | segs.append(((i+1)*block_size,shape0)) 73 | return segs 74 | 75 | 76 | def gen_segs_tuple_by_time_nseg(shape0,nseg,time_series): 77 | block_size = None 78 | if time_series is None: 79 | block_size = int(shape0/nseg)+1 80 | else: 81 | max_time = time_series.max().value 82 | min_time = time_series.min().value 83 | block_size = int( (max_time-min_time)/nseg ) 84 | return block_size 85 | 86 | def gen_combine_cats(df, cols): 87 | 88 | category = df[cols[0]].astype('float64') 89 | for col in cols[1:]: 90 | mx = df[col].max() 91 | category *= mx 92 | category += df[col] 93 | return category 94 | 95 | def gen_segs_tuple_by_time_size(shape0,block_size,time_series): 96 | segs = [] 97 | if time_series is None: 98 | nseg = int(shape0/block_size) 99 | block_size = int( shape0/nseg ) + 1 100 | for i in range(nseg): 101 | segs.append( (i*block_size,(i+1)*block_size) ) 102 | else: 103 | max_time = time_series.max().value 104 | min_time = time_series.min().value 105 | nseg = int( (max_time-min_time)/block_size ) 106 | if nseg == 0: 107 | nseg = 1 108 | block_size = int( (max_time-min_time)/nseg ) + 1 109 | t = time_series.reset_index(drop=True) 110 | t = t.astype('int64') 111 | 112 | 113 | for i in range(nseg): 114 | 115 | l_time = min_time + i*block_size 116 | r_time = min_time + (i+1)*block_size 117 | if i == nseg-1: 118 | r_time = max_time+1 119 | indexs = t[ (l_time<=t) & (t < r_time) ].index 120 | l_index = indexs[0] 121 | r_index = indexs[-1]+1 122 | segs.append( (l_index,r_index) ) 123 | 124 | return segs 125 | 126 | 127 | 
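# Illustrative note (not part of the original source): gen_combine_cats packs
# several category columns into a single code by positional arithmetic, e.g.
# combined = a * max(b) + b for two columns a and b. With a in {1,2,3} and
# b in {1,2} this yields the distinct codes 3,4,5,6,7,8 -- collision-free
# because CAT_SHIFT keeps category codes >= 1.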
-------------------------------------------------------------------------------- /auto_smart/auto_smart/feat/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeepBlueAI/AutoSmart/99a6b44ce5b6520a2463c8a4b3acc675584533be/auto_smart/auto_smart/feat/__init__.py -------------------------------------------------------------------------------- /auto_smart/auto_smart/feat/default_merge_feat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .merge_feat import O2O,M2O,O2M,M2M,TimeM2M,PreO2O,PreM2O,PreO2M,PreM2M,PreTimeM2M 4 | from util import timeclass 5 | import CONSTANT 6 | import pandas as pd 7 | import numpy as np 8 | 9 | from joblib import Parallel, delayed 10 | from feat_context import FeatContext 11 | import util 12 | from data_tools import downcast 13 | import gc 14 | namespace = 'default' 15 | 16 | class M2OJoin(M2O): 17 | def fit(self,U,V): 18 | pass 19 | 20 | @timeclass(cls='M2OJoin') 21 | def transform(self,U,V): 22 | v = V.data 23 | key = self.key 24 | v = v.set_index(key) 25 | new_cols = [] 26 | col2type = {} 27 | col2block = {} 28 | for col in v.columns: 29 | feat_type = V.col2type[col] 30 | new_col = FeatContext.gen_merge_feat_name(namespace,self.__class__.__name__,col,feat_type,V.name) 31 | new_cols.append(new_col) 32 | col2type[new_col] = feat_type 33 | 34 | if col in V.col2block: 35 | block_id = V.col2block[col] 36 | col2block[new_col] = block_id 37 | 38 | v.columns = new_cols 39 | return v,col2type,col2block 40 | 41 | @timeclass(cls='M2OJoin') 42 | def fit_transform(self,U,V): 43 | return self.transform(U,V) 44 | 45 | class M2MKeyCount(M2M): 46 | @timeclass(cls='M2MKeyCount') 47 | def fit(self,U,V): 48 | pass 49 | 50 | @timeclass(cls='M2MKeyCount') 51 | def transform(self,U,V): 52 | v = V.data 53 | key = self.key 54 | col2type = {} 55 | ss = v.groupby(key)[key].count() 56 | ss = downcast(ss) 57 | feat_type = CONSTANT.NUMERICAL_TYPE 58 | new_col = key+'_M2MKeyCount' 59 | new_col = FeatContext.gen_merge_feat_name(namespace,self.__class__.__name__,new_col,feat_type,V.name) 60 | ss.name = new_col 61 | col2type[new_col] = feat_type 62 | return pd.DataFrame(ss),col2type,{} 63 | 64 | @timeclass(cls='M2MKeyCount') 65 | def fit_transform(self,U,V): 66 | return self.transform(U,V) 67 | 68 | class M2MNumMean(M2M): 69 | @timeclass(cls='M2MNumMean') 70 | def fit(self,U,V): 71 | pass 72 | 73 | @timeclass(cls='M2MNumMean') 74 | def transform(self,U,V): 75 | v = V.data 76 | key = self.key 77 | col2type = {} 78 | 79 | def func(df): 80 | key = df.columns[0] 81 | col = df.columns[1] 82 | df[col] = df[col].astype('float32') 83 | 84 | ss = df.groupby(key)[col].mean() 85 | ss = downcast(ss) 86 | return ss 87 | 88 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(v[[key,col]]) for col in V.num_cols) 89 | if res: 90 | new_cols = [] 91 | for col in V.num_cols: 92 | feat_type = CONSTANT.NUMERICAL_TYPE 93 | col = col+'_M2MNumMean' 94 | new_col = FeatContext.gen_merge_feat_name(namespace,self.__class__.__name__,col,feat_type,V.name) 95 | new_cols.append(new_col) 96 | col2type[new_col] = feat_type 97 | 98 | tmp = pd.concat(res,axis=1) 99 | tmp.columns = new_cols 100 | return tmp,col2type,{} 101 | return pd.DataFrame(),col2type,{} 102 | 103 | @timeclass(cls='M2MNumMean') 104 | def fit_transform(self,U,V): 105 | return self.transform(U,V) 106 | 107 | class TimeM2MnewLastData(M2M): 108 | @timeclass(cls='TimeM2MnewLastData') 109 | def 
fit(self,U,V): 110 | pass 111 | 112 | @timeclass(cls='TimeM2MnewLastData') 113 | def transform(self,U,V): 114 | key = self.key 115 | 116 | if U.key_time_col != V.key_time_col: 117 | return 118 | 119 | key_time_col = V.key_time_col 120 | 121 | todo_cols = V.multi_cat_cols 122 | if not todo_cols: 123 | return 124 | 125 | v = V.data[[V.key_time_col,key] + todo_cols] 126 | u = U.data[[U.key_time_col,key]] 127 | 128 | u_index = u.index 129 | u.reset_index(drop=True,inplace=True) 130 | col2type = {} 131 | col2block = {} 132 | 133 | u.index = -u.index-1 134 | v_large = pd.concat([v,u]) 135 | v_large.sort_values(by=[key,key_time_col],inplace=True) 136 | 137 | symbol = 1 138 | key_diff = v_large[key].diff() 139 | for col in todo_cols: 140 | v_large.loc[key_diff!=0,col] = v_large.loc[key_diff!=0,col].fillna(symbol) # mark key boundaries so the ffill below cannot leak values across keys 141 | 142 | new_cols = [] 143 | for col in todo_cols: 144 | feat_type = CONSTANT.MULTI_CAT_TYPE 145 | new_col = FeatContext.gen_merge_feat_name(namespace,self.__class__.__name__,col,feat_type,V.name) 146 | new_cols.append(new_col) 147 | col2type[new_col] = feat_type 148 | if col in V.col2block: 149 | col2block[new_col] = V.col2block[col] 150 | 151 | def func(series): 152 | ss = series.fillna(method='ffill') 153 | ss = ss.replace(symbol,np.nan) 154 | return ss 155 | 156 | res = Parallel(n_jobs=CONSTANT.JOBS, require='sharedmem')(delayed(func)(v_large[col]) for col in todo_cols) 157 | if res: 158 | tmp = pd.concat(res,axis=1) 159 | del res 160 | gc.collect() 161 | 162 | tmp.columns = new_cols 163 | tmp = tmp.loc[tmp.index<0] 164 | tmp.index = -(tmp.index+1) 165 | 166 | tmp.sort_index(inplace=True) 167 | tmp.index = u_index 168 | del u_index 169 | gc.collect() 170 | U.data[new_cols] = tmp 171 | del tmp 172 | gc.collect() 173 | U.update_data(U.data,col2type,None,None,col2block,None) 174 | 175 | @timeclass(cls='TimeM2MnewLastData') 176 | def fit_transform(self,U,V): 177 | self.transform(U,V) 178 | 179 | class M2MDataLast(TimeM2M): 180 | @timeclass(cls='M2MDataLast') 181 | def fit(self,U,V): 182 | pass 183 | 184 | @timeclass(cls='M2MDataLast') 185 | def transform(self,U,V): 186 | data = V.data 187 | key = self.key 188 | col2type = {} 189 | col2block = {} 190 | 191 | col_sets = [] 192 | cols = list(data.columns) 193 | 194 | if key in cols: 195 | cols.remove(key) 196 | 197 | del_cols = [] 198 | for col in cols: 199 | if col in V.col2type: 200 | if V.col2type[col] == CONSTANT.NUMERICAL_TYPE: 201 | del_cols.append(col) 202 | 203 | for col in del_cols: 204 | if col in cols: 205 | cols.remove(col) 206 | 207 | if len(cols)==0: 208 | return pd.DataFrame(),{},{} 209 | cols_len = 20 210 | cols_num = len(cols) 211 | if cols_num % cols_len == 0: 212 | blocks = int(cols_num / cols_len) 213 | else: 214 | blocks = int(cols_num / cols_len) + 1 215 | 216 | for i in range(blocks): 217 | col_t = [] 218 | for j in range(i*cols_len,(i+1)*cols_len): 219 | if j < len(cols): 220 | col_t.append(cols[j]) 221 | col_sets.append(col_t) 222 | 223 | feats = [] 224 | for col_set in col_sets: 225 | 226 | feats.append( data.groupby( key )[ col_set ].last() ) 227 | if feats: 228 | df = pd.concat(feats,axis=1) 229 | 230 | new_cols = [] 231 | for col in df.columns: 232 | feat_type = V.col2type[col] 233 | new_col = FeatContext.gen_merge_feat_name(namespace,self.__class__.__name__,col,feat_type,V.name) 234 | new_cols.append(new_col) 235 | col2type[new_col] = feat_type 236 | 237 | 238 | if col in V.col2block: 239 | block_id = V.col2block[col] 240 | col2block[new_col] = block_id 241 | 242 | df.columns = new_cols 243 | return df,col2type,col2block 244 |
else: 245 | return pd.DataFrame(),{},{} 246 | 247 | @timeclass(cls='M2MDataLast') 248 | def fit_transform(self,U,V): 249 | self.fit(U,V) 250 | return self.transform(U,V) -------------------------------------------------------------------------------- /auto_smart/auto_smart/feat/feat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | class Feat: 4 | def __init__(self,config): 5 | self.config = config 6 | 7 | def fit(self,X,y): 8 | pass 9 | 10 | def transform(self,X): 11 | pass 12 | 13 | def fit_transform(self,X,y): 14 | pass -------------------------------------------------------------------------------- /auto_smart/auto_smart/feat/feat_pipeline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .default_feat import * 4 | from .feat_selection import LGBFeatureSelection,LGBFeatureSelectionWait,LGBFeatureSelectionLast 5 | 6 | class FeatPipeline: 7 | def __init__(self): 8 | self.order1s = [] 9 | 10 | class DefaultFeatPipeline(FeatPipeline): 11 | def __init__(self): 12 | super(DefaultFeatPipeline,self).__init__() 13 | self.main_init() 14 | 15 | def main_init(self): 16 | self.order1s = [ 17 | PreMcToNumpy,McCatRank, 18 | 19 | OriginSession,\ 20 | 21 | ApartCatRecognize,\ 22 | 23 | KeysCountDIY, 24 | UserKeyCntDIY,SessionKeyCntDIY,\ 25 | 26 | KeysTimeDiffAndFuture, 27 | 28 | UserSessionNuniqueDIY,\ 29 | UserSessionCntDivNuniqueDIY,\ 30 | UserKeyNuniqueDIY, SessionKeyNuniqueDIY,\ 31 | UserKeyCntDivNuniqueDIY,SessionKeyCntDivNuniqueDIY,\ 32 | 33 | KeysCumCntRateAndReverse, 34 | 35 | UserKeyCumCntRateAndReverse, 36 | 37 | KeyTimeDate, 38 | KeyTimeBin, 39 | KeysBinCntDIY, 40 | 41 | CatCountDIY, 42 | LGBFeatureSelection,\ 43 | ] 44 | 45 | self.keys_order2s = [ 46 | KeysNumMeanOrder2MinusSelfNew, 47 | KeysNumMaxMinOrder2MinusSelfNew, 48 | KeysNumStd, 49 | KeysCatCntOrder2New, 50 | 51 | LGBFeatureSelectionWait, 52 | ] 53 | 54 | self.all_order2s = [ 55 | BinsCatCntOrder2DIYNew, 56 | BinsNumMeanOrder2DIYNew, 57 | CatNumMeanOrder2DIYNew, 58 | CatCntOrder2DIYNew, 59 | 60 | LGBFeatureSelectionWait 61 | ] 62 | 63 | self.post_order1s = [ 64 | TimeNum, 65 | ] 66 | 67 | self.merge_order1s = [ 68 | CatSegCtrOrigin, 69 | CatMeanEncoding, 70 | 71 | LGBFeatureSelectionLast, 72 | ] 73 | 74 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/feat/feat_selection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from util import timeclass,log 3 | import CONSTANT 4 | from model_input import FeatOutput 5 | from automl import autosample 6 | import gc 7 | import lightgbm as lgb 8 | import pandas as pd 9 | from .feat import Feat 10 | import time 11 | import numpy as np 12 | 13 | def lgb_train(X,y): 14 | num_boost_round = 100 15 | num_leaves = 63 16 | 17 | params = { 18 | 'boosting_type': 'gbdt', 19 | 'objective': 'binary', 20 | 'metric': "None", 21 | 'learning_rate': 0.1, 22 | 'num_leaves': num_leaves, 23 | 'max_depth': -1, 24 | 'min_child_samples': 20, 25 | 'max_bin':255, 26 | 'subsample': 0.9, 27 | 'subsample_freq': 1, 28 | 'colsample_bytree': 1, 29 | 'min_child_weight': 0.001, 30 | 'subsample_for_bin': 200000, 31 | 'min_split_gain': 0.02, 32 | 'reg_alpha': 0.1, 33 | 'reg_lambda': 0.1, 34 | 'seed': CONSTANT.SEED, 35 | 'nthread': CONSTANT.THREAD_NUM, 36 | } 37 | 38 | data = X.data 39 | 40 | y_train = y 41 | 42 | max_sample_num = min(len(y_train),50000) 43 | 
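# The selection probe trains 100 rounds of 63-leaf trees on at most 50k rows,
# keeping it cheap relative to the time budget. Note that 100 * (63 - 1) = 6200
# is the total split count of a fully grown forest, which appears to be why the
# selection classes below treat importances.sum() == 6200 as a sentinel value.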
y_train = autosample.downsampling_y(y_train,max_sample_num) 44 | 45 | X_train = data.loc[y_train.index] 46 | 47 | X.data = X_train 48 | feat_output = FeatOutput() 49 | X_train,y_train,categories = feat_output.fit_transform_output(X,y_train) 50 | 51 | X.data = data 52 | gc.collect() 53 | 54 | feat_name_cols = list(X_train.columns) 55 | feat_name_maps = { feat_name_cols[i] : str(i) for i in range(len(feat_name_cols)) } 56 | f_feat_name_maps = { str(i) : feat_name_cols[i] for i in range(len(feat_name_cols)) } 57 | new_feat_name_cols = [ feat_name_maps[i] for i in feat_name_cols ] 58 | X_train.columns = new_feat_name_cols 59 | 60 | dtrain = lgb.Dataset(X_train,y_train,feature_name=list(X_train.columns)) 61 | model = lgb.train(params,dtrain, 62 | num_boost_round=num_boost_round, 63 | categorical_feature=[], 64 | ) 65 | 66 | df_imp = pd.DataFrame({'features': [ f_feat_name_maps[i] for i in model.feature_name() ] , 67 | 'importances':model.feature_importance()}) 68 | 69 | df_imp.sort_values('importances',ascending=False,inplace=True) 70 | 71 | return df_imp 72 | 73 | class LGBFeatureSelection(Feat): 74 | @timeclass(cls='LGBFeatureSelection') 75 | def fit(self,X,y): 76 | now = time.time() 77 | log(f'LGBFeatureSelection:{now-self.config.start_time}') 78 | 79 | threshold = 5 80 | df_imp = lgb_train(X,y) 81 | log(f'importances sum {df_imp["importances"].sum()}') 82 | if df_imp["importances"].sum() != 6200: 83 | keep_feats = list(df_imp.loc[df_imp['importances'] >= threshold,'features']) 84 | if len(keep_feats) < 150: 85 | useful_feats = list(df_imp.loc[df_imp['importances'] > 0,'features']) 86 | if len(useful_feats) <= 150: 87 | keep_feats = useful_feats 88 | else: 89 | df_imp_sorted = df_imp.sort_values(by='importances',ascending=False) 90 | keep_feats = list(df_imp_sorted['features'].iloc[:150]) 91 | else: 92 | keep_feats = list(df_imp.loc[df_imp['importances'] >= threshold,'features']) 93 | 94 | log(f'keep feats num {len(keep_feats)}') 95 | 96 | keep_cats = [] 97 | 98 | keep_cats_set = set() 99 | cat_set = set(X.cat_cols) 100 | 101 | for feat in keep_feats: 102 | 103 | if X.col2type[feat] == CONSTANT.CATEGORY_TYPE: 104 | if feat in cat_set: 105 | if feat not in keep_cats_set: 106 | keep_cats_set.add(feat) 107 | keep_cats.append(feat) 108 | 109 | elif feat in X.col2source_cat: 110 | keep_feat = X.col2source_cat[feat] 111 | if keep_feat in cat_set: 112 | if keep_feat not in keep_cats_set: 113 | keep_cats_set.add(keep_feat) 114 | keep_cats.append(keep_feat) 115 | 116 | drop_feats = list(set(df_imp['features'].tolist()) - set(keep_feats)) 117 | 118 | drop_feats = list(set(drop_feats) - keep_cats_set) 119 | self.drop_feats = drop_feats 120 | log(f'total feat num:{df_imp.shape[0]}, drop feat num:{len(self.drop_feats)}') 121 | 122 | keep_nums = [] 123 | for feat in keep_feats: 124 | if X.col2type[feat] == CONSTANT.NUMERICAL_TYPE: 125 | keep_nums.append(feat) 126 | 127 | keep_binaries = [] 128 | for feat in keep_feats: 129 | if X.col2type[feat] == CONSTANT.BINARY_TYPE: 130 | keep_binaries.append(feat) 131 | 132 | assert(len(set(keep_cats) & set(drop_feats))==0) 133 | assert(len(set(keep_nums) & set(drop_feats))==0) 134 | assert(len(set(keep_binaries) & set(drop_feats))==0) 135 | 136 | X.reset_combine_cols(keep_cats,keep_nums,keep_binaries) 137 | 138 | @timeclass(cls='LGBFeatureSelection') 139 | def transform(self,X): 140 | X.drop_data(self.drop_feats) 141 | return self.drop_feats 142 | 143 | @timeclass(cls='LGBFeatureSelection') 144 | def fit_transform(self,X,y): 145 | self.fit(X,y) 146 | 
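# fit() has recorded self.drop_feats from the importance probe; transform()
# now drops those columns from X in place.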
self.transform(X) 147 | return self.drop_feats 148 | 149 | class LGBFeatureSelectionLast(Feat): 150 | @timeclass(cls='LGBFeatureSelectionLast') 151 | def fit(self,X,y): 152 | now = time.time() 153 | log(f'LGBFeatureSelectionLast:{now-self.config.start_time}') 154 | 155 | start_time = time.time() 156 | df_imp = lgb_train(X,y) 157 | 158 | data = X.data 159 | shape = data.shape 160 | y_pos = len(y[y==1]) 161 | y_neg = len(y[y==0]) 162 | unbalance_ratio = y_pos / y_neg if y_pos > y_neg else y_neg / y_pos 163 | memory_usage = pd.Series(np.zeros(shape[0]),dtype=np.float32).memory_usage() / 1024 / 1024 / 1024 164 | gc.collect() 165 | 166 | if unbalance_ratio >= 7: 167 | memory_constrain = 2 168 | elif unbalance_ratio >= 4: 169 | memory_constrain = 1.8 170 | else: 171 | memory_constrain = 1.6 172 | 173 | col_constrain = int(memory_constrain / memory_usage) 174 | 175 | end_time = time.time() 176 | 177 | use_time = end_time-start_time 178 | user_time_rate = use_time / self.config.budget 179 | 180 | if user_time_rate > 0.1: 181 | threshold = 13 182 | elif user_time_rate > 0.09: 183 | threshold = 12 184 | elif user_time_rate > 0.08: 185 | threshold = 11 186 | elif user_time_rate > 0.07: 187 | threshold = 10 188 | elif user_time_rate > 0.06: 189 | threshold = 9 190 | elif user_time_rate > 0.05: 191 | threshold = 8 192 | elif user_time_rate > 0.04: 193 | threshold = 7 194 | elif user_time_rate > 0.03: 195 | threshold = 6 196 | else: 197 | threshold = 5 198 | 199 | log(f'LGBFeatureSelectionLast threshold {threshold}') 200 | 201 | log(f'importances sum {df_imp["importances"].sum()}') 202 | if df_imp["importances"].sum() != 6200: 203 | keep_feats = list(df_imp.loc[df_imp['importances'] >= threshold,'features']) 204 | if len(keep_feats) < 150: 205 | useful_feats = list(df_imp.loc[df_imp['importances'] > 0,'features']) 206 | if len(useful_feats) <= 150: 207 | keep_feats = useful_feats 208 | else: 209 | df_imp_sorted = df_imp.sort_values(by='importances',ascending=False) 210 | keep_feats = list(df_imp_sorted['features'].iloc[:150]) 211 | else: 212 | keep_feats = list(df_imp.loc[df_imp['importances'] >= threshold,'features']) 213 | 214 | keep_cats = [] 215 | 216 | keep_cats_set = set() 217 | cat_set = set(X.cat_cols) 218 | 219 | for feat in keep_feats: 220 | 221 | if X.col2type[feat] == CONSTANT.CATEGORY_TYPE: 222 | if feat in cat_set: 223 | if feat not in keep_cats_set: 224 | keep_cats_set.add(feat) 225 | keep_cats.append(feat) 226 | 227 | elif feat in X.col2source_cat: 228 | keep_feat = X.col2source_cat[feat] 229 | if keep_feat in cat_set: 230 | if keep_feat not in keep_cats_set: 231 | keep_cats_set.add(keep_feat) 232 | keep_cats.append(keep_feat) 233 | 234 | drop_feats = list(set(df_imp['features'].tolist()) - set(keep_feats)) 235 | 236 | drop_feats = list(set(drop_feats) - keep_cats_set) 237 | self.drop_feats = drop_feats 238 | log(f'total feat num:{df_imp.shape[0]}, drop feat num:{len(self.drop_feats)}') 239 | 240 | keep_nums = [] 241 | for feat in keep_feats: 242 | if X.col2type[feat] == CONSTANT.NUMERICAL_TYPE: 243 | keep_nums.append(feat) 244 | 245 | keep_binaries = [] 246 | for feat in keep_feats: 247 | if X.col2type[feat] == CONSTANT.BINARY_TYPE: 248 | keep_binaries.append(feat) 249 | 250 | assert(len(set(keep_cats) & set(drop_feats))==0) 251 | assert(len(set(keep_nums) & set(drop_feats))==0) 252 | assert(len(set(keep_binaries) & set(drop_feats))==0) 253 | 254 | X.reset_combine_cols(keep_cats,keep_nums,keep_binaries) 255 | 256 | rest_cols = len(df_imp) - len(self.drop_feats) 257 | if rest_cols > 
col_constrain: 258 | real_keep_feats = set(df_imp['features'].iloc[:col_constrain].tolist()) 259 | real_drop_feats = list(set(df_imp['features'].tolist()) - real_keep_feats) 260 | self.drop_feats = real_drop_feats 261 | 262 | @timeclass(cls='LGBFeatureSelectionLast') 263 | def transform(self,X): 264 | X.drop_data(self.drop_feats) 265 | return self.drop_feats 266 | 267 | @timeclass(cls='LGBFeatureSelectionLast') 268 | def fit_transform(self,X,y): 269 | self.fit(X,y) 270 | self.transform(X) 271 | return self.drop_feats 272 | 273 | class LGBFeatureSelectionWait(Feat): 274 | @timeclass(cls='LGBFeatureSelectionWait') 275 | def fit(self,X,y): 276 | now = time.time() 277 | log(f'LGBFeatureSelection:{now-self.config.start_time}') 278 | 279 | threshold = 5 280 | df_imp = lgb_train(X,y) 281 | drop_feats = set(df_imp.loc[df_imp['importances'] < threshold,'features']) 282 | keep_feats = list(df_imp.loc[df_imp['importances'] >= threshold,'features']) 283 | 284 | df_imp.set_index('features',inplace=True) 285 | for cols in X.wait_selection_cols: 286 | drops = df_imp.loc[cols].sort_values(by='importances',ascending=False).index[self.config.wait_feat_selection_num:] 287 | drops = set(drops) 288 | drop_feats = drop_feats | drops 289 | 290 | keep_cats = [] 291 | 292 | keep_cats_set = set() 293 | cat_set = set(X.cat_cols) 294 | for feat in keep_feats: 295 | 296 | if X.col2type[feat] == CONSTANT.CATEGORY_TYPE: 297 | if feat in cat_set: 298 | if feat not in keep_cats_set: 299 | keep_cats_set.add(feat) 300 | keep_cats.append(feat) 301 | 302 | elif feat in X.col2source_cat: 303 | keep_feat = X.col2source_cat[feat] 304 | if keep_feat in cat_set: 305 | if keep_feat not in keep_cats_set: 306 | keep_cats_set.add(keep_feat) 307 | keep_cats.append(keep_feat) 308 | 309 | 310 | drop_feats = drop_feats - keep_cats_set 311 | drop_feats = list(drop_feats) 312 | self.drop_feats = drop_feats 313 | X.empty_wait_selection_cols() 314 | log(f'total feat num:{df_imp.shape[0]}, drop feat num:{len(self.drop_feats)}') 315 | 316 | assert(len(set(keep_cats) & set(drop_feats))==0) 317 | 318 | @timeclass(cls='LGBFeatureSelectionWait') 319 | def transform(self,X): 320 | X.drop_data(self.drop_feats) 321 | return self.drop_feats 322 | 323 | @timeclass(cls='LGBFeatureSelectionWait') 324 | def fit_transform(self,X,y): 325 | self.fit(X,y) 326 | self.transform(X) 327 | return self.drop_feats 328 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/feat/merge_feat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | class MergeFeat: 4 | def __init__(self,key): 5 | self.key = key 6 | 7 | def fit(self,U,V): 8 | pass 9 | 10 | def transform(self,U,V): 11 | pass 12 | 13 | def fit_transform(self,U,V): 14 | pass 15 | 16 | class PreTimeM2M(MergeFeat): 17 | pass 18 | 19 | class PreO2O(MergeFeat): 20 | pass 21 | 22 | class PreM2O(MergeFeat): 23 | pass 24 | 25 | class PreO2M(MergeFeat): 26 | pass 27 | 28 | class PreM2M(MergeFeat): 29 | pass 30 | 31 | class O2O(MergeFeat): 32 | pass 33 | 34 | class M2O(MergeFeat): 35 | pass 36 | 37 | class O2M(MergeFeat): 38 | pass 39 | 40 | 41 | class M2M(MergeFeat): 42 | pass 43 | 44 | class TimeM2M(MergeFeat): 45 | pass 46 | 47 | class CmjTimeM2M(MergeFeat): 48 | def __init__(self,key,time_key,u_key_time_col): 49 | self.key = key 50 | self.time_key = time_key 51 | self.u_key_time_col = u_key_time_col 52 | 53 | def fit(self,T): 54 | pass 55 | 56 | def transform(self,T): 57 | pass 58 | 59 | def 
fit_transform(self,T): 60 | pass -------------------------------------------------------------------------------- /auto_smart/auto_smart/feat/merge_feat_pipeline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .default_merge_feat import * 3 | 4 | class MergeFeatPipeline: 5 | def __init__(self): 6 | self.preM2Ms = [] 7 | self.preO2Ms = [] 8 | 9 | self.TimeM2Ms = [] 10 | self.newTimeM2Ms = [] 11 | 12 | self.O2Ms = [] 13 | self.M2Ms = [] 14 | 15 | self.preM2Os = [] 16 | self.preO2Os = [] 17 | 18 | self.O2Os = [] 19 | self.M2Os = [] 20 | 21 | 22 | class DeafultMergeFeatPipeline(MergeFeatPipeline): 23 | def __init__(self): 24 | super(DeafultMergeFeatPipeline,self).__init__() 25 | 26 | self.main_init() 27 | 28 | def main_init(self): 29 | 30 | self.newTimeM2Ms = [TimeM2MnewLastData] 31 | 32 | self.preM2Ms = [] 33 | self.M2Ms = [M2MKeyCount, M2MNumMean,M2MDataLast] 34 | 35 | self.preO2Ms = [] 36 | self.O2Ms = [M2MKeyCount, M2MNumMean,M2MDataLast] 37 | 38 | self.preO2Os = [] 39 | self.O2Os = [M2OJoin] 40 | 41 | self.preM2Os = [] 42 | self.M2Os = [M2OJoin] 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/feat_context.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import CONSTANT 3 | 4 | class FeatContext: 5 | @staticmethod 6 | def gen_feat_name(namespace,cls_name,feat_name,feat_type): 7 | prefix = CONSTANT.type2prefix[feat_type] 8 | 9 | 10 | return f"{prefix}{cls_name}:{feat_name}:{namespace}" 11 | 12 | @staticmethod 13 | def gen_merge_name(table_name,feat_name,feat_type): 14 | prefix = CONSTANT.type2prefix[feat_type] 15 | return f"{prefix}{table_name}.({feat_name})" 16 | 17 | @staticmethod 18 | def gen_merge_feat_name(namespace,cls_name,feat_name,feat_type,table_name): 19 | feat_name = FeatContext.gen_feat_name(namespace,cls_name,feat_name,feat_type) 20 | return FeatContext.gen_merge_name(table_name,feat_name,feat_type) 21 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/feat_engine.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from feat.feat_pipeline import FeatPipeline 4 | from util import timeclass 5 | 6 | class FeatEngine: 7 | def __init__(self, feat_pipeline: FeatPipeline, config): 8 | self.feat_pipeline = feat_pipeline 9 | self.config = config 10 | 11 | @timeclass(cls='FeatEngine') 12 | def fit_order1(self,table,y): 13 | self.feats_order1 = [] 14 | for feat_cls in self.feat_pipeline.order1s: 15 | feat = feat_cls(self.config) 16 | feat.fit(table,y) 17 | self.feats_order1.append(feat) 18 | 19 | @timeclass(cls='FeatEngine') 20 | def transform_order1(self,table): 21 | for feat in self.feats_order1: 22 | feat.transform(table) 23 | 24 | @timeclass(cls='FeatEngine') 25 | def fit_transform_order1(self,table,y): 26 | self.feats_order1 = [] 27 | for feat_cls in self.feat_pipeline.order1s: 28 | feat = feat_cls(self.config) 29 | feat.fit_transform(table,y) 30 | self.feats_order1.append(feat) 31 | 32 | 33 | @timeclass(cls='FeatEngine') 34 | def fit_keys_order2(self,table,y): 35 | self.feats_keys_order2 = [] 36 | for feat_cls in self.feat_pipeline.keys_order2s: 37 | feat = feat_cls(self.config) 38 | feat.fit(table,y) 39 | self.feats_keys_order2.append(feat) 40 | 41 | 
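# FeatEngine applies the pipeline in stages -- order1s, keys_order2s,
# all_order2s, (optionally keys_order3s), post_order1s, then merge_order1s --
# and every stage exposes the same fit / transform / fit_transform triad.
# Most stages end with an LGB-based selection feat that prunes what the
# stage just added.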
@timeclass(cls='FeatEngine') 42 | def transform_keys_order2(self,table): 43 | for feat in self.feats_keys_order2: 44 | feat.transform(table) 45 | 46 | @timeclass(cls='FeatEngine') 47 | def fit_transform_keys_order2(self,table,y,sample=False,selection=True): 48 | if not self.feat_pipeline.keys_order2s: 49 | return 50 | 51 | if sample: 52 | self.feats_keys_order2 = [] 53 | self.keys_order2_new_cols = [] 54 | for feat_cls in self.feat_pipeline.keys_order2s[:-1]: 55 | feat = feat_cls(self.config) 56 | new_cols = feat.fit_transform(table,y) 57 | self.feats_keys_order2.append(feat) 58 | self.keys_order2_new_cols.append(set(new_cols)) 59 | 60 | feat_cls = self.feat_pipeline.keys_order2s[-1] 61 | feat = feat_cls(self.config) 62 | drop_feats = set(feat.fit_transform(table,y)) 63 | self.feats_keys_order2.append(feat) 64 | for i in range(len(self.keys_order2_new_cols)): 65 | self.keys_order2_new_cols[i] = (set(self.keys_order2_new_cols[i]) - drop_feats) 66 | 67 | if not sample: 68 | if selection: 69 | self.feats_keys_order2 = [] 70 | for i,feat_cls in enumerate(self.feat_pipeline.keys_order2s): 71 | feat = feat_cls(self.config) 72 | feat.fit_transform(table,y) 73 | self.feats_keys_order2.append(feat) 74 | if not selection: 75 | for i,feat_cls in enumerate(self.feat_pipeline.keys_order2s[:-1]): 76 | feat = feat_cls(self.config) 77 | feat.fit_transform(table,y,self.keys_order2_new_cols[i]) 78 | self.feats_keys_order2.append(feat) 79 | 80 | @timeclass(cls='FeatEngine') 81 | def fit_all_order2(self,table,y): 82 | self.feats_all_order2 = [] 83 | for feat_cls in self.feat_pipeline.all_order2s: 84 | feat = feat_cls(self.config) 85 | feat.fit(table,y) 86 | self.feats_all_order2.append(feat) 87 | 88 | @timeclass(cls='FeatEngine') 89 | def transform_all_order2(self,table): 90 | for feat in self.feats_all_order2: 91 | feat.transform(table) 92 | 93 | @timeclass(cls='FeatEngine') 94 | def fit_transform_all_order2(self,table,y,sample=False,selection=True): 95 | if not self.feat_pipeline.all_order2s: 96 | return 97 | 98 | if sample: 99 | self.feats_all_order2 = [] 100 | self.all_order2_new_cols = [] 101 | for feat_cls in self.feat_pipeline.all_order2s[:-1]: 102 | feat = feat_cls(self.config) 103 | new_cols = feat.fit_transform(table,y) 104 | self.feats_all_order2.append(feat) 105 | self.all_order2_new_cols.append(set(new_cols)) 106 | 107 | feat_cls = self.feat_pipeline.all_order2s[-1] 108 | feat = feat_cls(self.config) 109 | drop_feats = set(feat.fit_transform(table,y)) 110 | self.feats_all_order2.append(feat) 111 | for i in range(len(self.all_order2_new_cols)): 112 | self.all_order2_new_cols[i] = set(self.all_order2_new_cols[i]) - drop_feats 113 | 114 | if not sample: 115 | if selection: 116 | self.feats_all_order2 = [] 117 | for i,feat_cls in enumerate(self.feat_pipeline.all_order2s): 118 | feat = feat_cls(self.config) 119 | feat.fit_transform(table,y) 120 | self.feats_all_order2.append(feat) 121 | if not selection: 122 | for i,feat_cls in enumerate(self.feat_pipeline.all_order2s[:-1]): 123 | feat = feat_cls(self.config) 124 | feat.fit_transform(table,y,self.all_order2_new_cols[i]) 125 | self.feats_all_order2.append(feat) 126 | 127 | @timeclass(cls='FeatEngine') 128 | def fit_keys_order3(self,table,y): 129 | self.feats_keys_order3 = [] 130 | for feat_cls in self.feat_pipeline.keys_order3s: 131 | feat = feat_cls(self.config) 132 | feat.fit(table,y) 133 | self.feats_keys_order3.append(feat) 134 | 135 | @timeclass(cls='FeatEngine') 136 | def transform_keys_order3(self,table): 137 | for feat in 
self.feats_keys_order3: 138 | feat.transform(table) 139 | 140 | @timeclass(cls='FeatEngine') 141 | def fit_transform_keys_order3(self,table,y): 142 | self.feats_keys_order3 = [] 143 | for feat_cls in self.feat_pipeline.keys_order3s: 144 | feat = feat_cls(self.config) 145 | feat.fit_transform(table,y) 146 | self.feats_keys_order3.append(feat) 147 | 148 | 149 | @timeclass(cls='FeatEngine') 150 | def fit_post_order1(self,table,y): 151 | self.feats_post_order1 = [] 152 | for feat_cls in self.feat_pipeline.post_order1s: 153 | feat = feat_cls(self.config) 154 | feat.fit(table,y) 155 | self.feats_post_order1.append(feat) 156 | 157 | @timeclass(cls='FeatEngine') 158 | def transform_post_order1(self,table): 159 | for feat in self.feats_post_order1: 160 | feat.transform(table) 161 | 162 | @timeclass(cls='FeatEngine') 163 | def fit_transform_post_order1(self,table,y): 164 | self.feats_post_order1 = [] 165 | for feat_cls in self.feat_pipeline.post_order1s: 166 | feat = feat_cls(self.config) 167 | feat.fit_transform(table,y) 168 | self.feats_post_order1.append(feat) 169 | 170 | @timeclass(cls='FeatEngine') 171 | def fit_merge_order1(self,table,y): 172 | self.feats_merge_order1 = [] 173 | for feat_cls in self.feat_pipeline.merge_order1s: 174 | feat = feat_cls(self.config) 175 | feat.fit(table,y) 176 | self.feats_merge_order1.append(feat) 177 | 178 | @timeclass(cls='FeatEngine') 179 | def transform_merge_order1(self,table): 180 | for feat in self.feats_merge_order1: 181 | feat.transform(table) 182 | 183 | @timeclass(cls='FeatEngine') 184 | def fit_transform_merge_order1(self,table,y): 185 | self.feats_merge_order1 = [] 186 | for feat_cls in self.feat_pipeline.merge_order1s: 187 | feat = feat_cls(self.config) 188 | feat.fit_transform(table,y) 189 | self.feats_merge_order1.append(feat) -------------------------------------------------------------------------------- /auto_smart/auto_smart/merger.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas as pd 3 | 4 | import CONSTANT 5 | from util import log, timeclass 6 | from feat.merge_feat_pipeline import MergeFeatPipeline 7 | import copy 8 | import gc 9 | from data_tools import downcast 10 | 11 | class Merger: 12 | def __init__(self,merge_feat_pipeline: MergeFeatPipeline): 13 | self.merge_feat_pipeline = merge_feat_pipeline 14 | 15 | 16 | 17 | @timeclass(cls='Merger') 18 | def merge(self,key,u,v,ttype,z2f): 19 | feats = [] 20 | col2type = {} 21 | col2groupby = {} 22 | col2block = {} 23 | 24 | if u.key_time_col is not None and v.key_time_col is not None and ttype=='many_to_many': 25 | 26 | if z2f and self.merge_timem2m and (key in u.user_cols): 27 | self.merge_timem2m = False 28 | for merge_feat_cls in self.merge_feat_pipeline.newTimeM2Ms: 29 | merge_feat = merge_feat_cls(key) 30 | merge_feat.fit_transform(u,v) 31 | 32 | for merge_feat_cls in self.merge_feat_pipeline.preM2Ms: 33 | merge_feat = merge_feat_cls(key) 34 | merge_feat.fit_transform(u,v) 35 | 36 | for merge_feat_cls in self.merge_feat_pipeline.M2Ms: 37 | merge_feat = merge_feat_cls(key) 38 | v_feat,v_col2type,v_col2block = merge_feat.fit_transform(u,v) 39 | feats.append(v_feat) 40 | col2type.update(v_col2type) 41 | col2block.update(v_col2block) 42 | 43 | elif ttype == 'one_to_one': 44 | for merge_feat_cls in self.merge_feat_pipeline.preO2Os: 45 | merge_feat = merge_feat_cls(key) 46 | merge_feat.fit_transform(u,v) 47 | 48 | for merge_feat_cls in self.merge_feat_pipeline.O2Os: 49 | merge_feat = merge_feat_cls(key) 50 | 
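# NOTE: each merge-feature class used in these loops shares one contract:
# fit_transform(u, v) returns (frame of new columns for u, col->type map,
# col->block map), which Merger.merge accumulates per relation type and
# joins onto the parent table through `key` further below. The pre-steps
# (preO2Os, preM2Ms, ...) are invoked only for their side effects on v.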
v_feat,v_col2type,v_col2block = merge_feat.fit_transform(u,v) 51 | feats.append(v_feat) 52 | col2type.update(v_col2type) 53 | col2block.update(v_col2block) 54 | 55 | elif ttype == 'many_to_one': 56 | for merge_feat_cls in self.merge_feat_pipeline.preM2Os: 57 | merge_feat = merge_feat_cls(key) 58 | merge_feat.fit_transform(u,v) 59 | 60 | for merge_feat_cls in self.merge_feat_pipeline.M2Os: 61 | merge_feat = merge_feat_cls(key) 62 | v_feat,v_col2type,v_col2block = merge_feat.fit_transform(u,v) 63 | feats.append(v_feat) 64 | col2type.update(v_col2type) 65 | col2block.update(v_col2block) 66 | 67 | elif ttype == 'one_to_many': 68 | for merge_feat_cls in self.merge_feat_pipeline.preO2Ms: 69 | merge_feat = merge_feat_cls(key) 70 | merge_feat.fit_transform(u,v) 71 | 72 | for merge_feat_cls in self.merge_feat_pipeline.O2Ms: 73 | merge_feat = merge_feat_cls(key) 74 | v_feat,v_col2type,v_col2block = merge_feat.fit_transform(u,v) 75 | feats.append(v_feat) 76 | col2type.update(v_col2type) 77 | col2block.update(v_col2block) 78 | 79 | elif ttype == 'many_to_many': 80 | for merge_feat_cls in self.merge_feat_pipeline.preM2Ms: 81 | merge_feat = merge_feat_cls(key) 82 | merge_feat.fit_transform(u,v) 83 | 84 | for merge_feat_cls in self.merge_feat_pipeline.M2Ms: 85 | merge_feat = merge_feat_cls(key) 86 | v_feat,v_col2type,v_col2block = merge_feat.fit_transform(u,v) 87 | feats.append(v_feat) 88 | col2type.update(v_col2type) 89 | col2block.update(v_col2block) 90 | if feats: 91 | feat = pd.concat(feats,axis=1) 92 | col2groupby = {col:key for col in feat.columns} 93 | 94 | del feats,v 95 | gc.collect() 96 | 97 | data = u.data 98 | index = data.index 99 | data.set_index(key,inplace=True) 100 | 101 | cols = list(feat.columns) 102 | data[cols] = feat 103 | data.reset_index(key,inplace=True) 104 | data[key] = downcast(data[key],accuracy_loss=False) 105 | data.index= index 106 | 107 | u.update_data(data,col2type,col2groupby,None,col2block,None) 108 | 109 | @timeclass(cls='Merger') 110 | def dfs(self,u_name, graph): 111 | depth = graph.depth 112 | name2table = graph.name2table 113 | rel_graph = graph.rel_graph 114 | 115 | u = name2table[u_name] 116 | log(f"enter {u_name}") 117 | for edge in rel_graph[u_name]: 118 | v_name = edge['to'] 119 | if depth[v_name]['depth'] <= depth[u_name]['depth']: 120 | continue 121 | 122 | v = self.dfs(v_name, graph) 123 | key = edge['key'] 124 | assert len(key) == 1 125 | key = key[0] 126 | type_ = edge['type'] 127 | 128 | log(f"join {u_name} <--{type_}--t {v_name}") 129 | self.merge(key,u,v,type_,0) 130 | 131 | log(f"join {u_name} <--{type_}--nt {v_name}") 132 | 133 | del v 134 | 135 | log(f"leave {u_name}") 136 | return u 137 | 138 | @timeclass(cls='Merger') 139 | def merge_to_main_fit_transform(self,graph): 140 | depth = graph.depth 141 | name2table = graph.name2table 142 | 143 | u_name = CONSTANT.MAIN_TABLE_NAME 144 | u = name2table[u_name] 145 | rel_graph = graph.rel_graph 146 | 147 | table2feat = {} 148 | for edge in rel_graph[u_name]: 149 | v_name = edge['to'] 150 | if depth[v_name]['depth'] <= depth[u_name]['depth']: 151 | continue 152 | 153 | v = name2table[v_name] 154 | key = edge['key'] 155 | assert len(key) == 1 156 | key = key[0] 157 | type_ = edge['type'] 158 | 159 | log(f"join {u_name} <--{type_}--t {v_name}") 160 | table2feat[v_name] = self.merge(key,u,v,type_,1) 161 | log(f"join {u_name} <--{type_}--nt {v_name}") 162 | 163 | self.table2feat = table2feat 164 | return u 165 | 166 | @timeclass(cls='Merger') 167 | def merge_table(self,graph): 168 | self.use_all_time_m2m = 
False 169 | if graph.M2M_relation_cnt < 3: 170 | self.use_all_time_m2m = True 171 | 172 | self.merge_timem2m = True 173 | 174 | graph.build_depth() 175 | 176 | depth = graph.depth 177 | u_name = CONSTANT.MAIN_TABLE_NAME 178 | rel_graph = graph.rel_graph 179 | 180 | for edge in rel_graph[u_name]: 181 | v_name = edge['to'] 182 | if depth[v_name]['depth'] <= depth[u_name]['depth']: 183 | continue 184 | 185 | self.dfs(v_name,graph) 186 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/metadata: -------------------------------------------------------------------------------- 1 | description: Provides prediction model to be executed by the ingestion program -------------------------------------------------------------------------------- /auto_smart/auto_smart/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import copy 6 | import CONSTANT 7 | from util import log, timeclass 8 | from table.graph import Graph 9 | from sklearn.metrics import roc_auc_score 10 | from feat.merge_feat_pipeline import DeafultMergeFeatPipeline 11 | from feat.feat_pipeline import DefaultFeatPipeline 12 | 13 | from merger import Merger 14 | from feat_engine import FeatEngine 15 | from model_input import FeatOutput 16 | from automl.model_selection import time_train_test_split 17 | from automl.auto_lgb import AutoLGB 18 | from PATHS import feature_importance_path,version 19 | from datetime import datetime 20 | import gc 21 | from config import Config 22 | import time 23 | 24 | class Model: 25 | auc = [] 26 | ensemble_auc = [] 27 | ensemble_train_auc = [] 28 | 29 | def __init__(self, info): 30 | self.info = copy.deepcopy(info) 31 | self.tables = None 32 | 33 | def shuffle(self,X,y,random_state): 34 | idx = np.arange(len(X)) 35 | np.random.shuffle(idx) 36 | X = X.iloc[idx] 37 | y = y.iloc[idx] 38 | return X,y 39 | 40 | def release_tables(self,Xs,graph): 41 | 42 | for name in graph.tables: 43 | del Xs[name] 44 | del graph.name2table[name] 45 | 46 | gc.collect() 47 | 48 | @timeclass(cls='Model') 49 | def my_fit(self, Xs, y,X_test): 50 | np.random.seed(CONSTANT.SEED) 51 | 52 | split = CONSTANT.SPLIT 53 | 54 | self.split = split 55 | 56 | log(f'split {split}') 57 | 58 | if split == -1: 59 | config = Config(time.time(),self.info['time_budget']) 60 | 61 | X_test.index = -X_test.index-1 62 | 63 | main_shape = Xs[CONSTANT.MAIN_TABLE_NAME].shape[0] 64 | main_max_shape = 2888888 65 | main_min_shape = min( main_shape,100000 ) 66 | 67 | test_shape = X_test.shape[0] 68 | max_accept_shape = 3999999 69 | 70 | if main_shape + test_shape > max_accept_shape: 71 | sample_main_shape = max_accept_shape - test_shape 72 | if sample_main_shape > main_max_shape: 73 | sample_main_shape = main_max_shape 74 | if sample_main_shape < main_min_shape: 75 | sample_main_shape = main_min_shape 76 | log(f'start sample main table. 
origin main shape {main_shape} test shape {test_shape} sample rows num {sample_main_shape}') 77 | if 'time_col' in self.info: 78 | key_time_col = self.info['time_col'] 79 | if key_time_col in Xs[CONSTANT.MAIN_TABLE_NAME].columns: 80 | Xs[CONSTANT.MAIN_TABLE_NAME].sort_values(by=key_time_col,inplace=True) 81 | Xs[CONSTANT.MAIN_TABLE_NAME] = Xs[CONSTANT.MAIN_TABLE_NAME].iloc[-sample_main_shape:] 82 | gc.collect() 83 | 84 | 85 | Xs[CONSTANT.MAIN_TABLE_NAME] = pd.concat([Xs[CONSTANT.MAIN_TABLE_NAME], X_test]) 86 | 87 | X_test.drop(X_test.columns,axis=1,inplace=True) 88 | gc.collect() 89 | 90 | graph = Graph(self.info,Xs) 91 | graph.sort_tables() 92 | train_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[Xs[CONSTANT.MAIN_TABLE_NAME].index>=0] 93 | y = y.loc[train_index] 94 | test_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[Xs[CONSTANT.MAIN_TABLE_NAME].index<0] 95 | 96 | graph.preprocess_fit_transform() 97 | gc.collect() 98 | 99 | merge_feat_pipeline = DeafultMergeFeatPipeline() 100 | merger = Merger(merge_feat_pipeline) 101 | 102 | merger.merge_table(graph) 103 | main_table = merger.merge_to_main_fit_transform(graph) 104 | self.release_tables(Xs,graph) 105 | del merger 106 | del graph 107 | gc.collect() 108 | 109 | feat_pipeline = DefaultFeatPipeline() 110 | feat_engine = FeatEngine(feat_pipeline,config) 111 | feat_engine.fit_transform_order1(main_table,y) 112 | 113 | sample_for_combine_features = True 114 | 115 | if sample_for_combine_features: 116 | main_data = main_table.data 117 | train_data = main_data.loc[main_data.index>=0] 118 | 119 | del main_data 120 | 121 | sample_num = CONSTANT.SAMPLE_NUM 122 | train_shape = train_data.shape 123 | 124 | if train_shape[0] <= sample_num: 125 | sample_for_combine_features = False 126 | else: 127 | data_tail_new = train_data.iloc[-sample_num:] 128 | 129 | gc.collect() 130 | 131 | y_tail_new = y.loc[data_tail_new.index] 132 | 133 | table_tail_new = copy.deepcopy(main_table) 134 | table_tail_new.data = data_tail_new 135 | 136 | del data_tail_new 137 | gc.collect() 138 | 139 | feat_engine.fit_transform_all_order2(table_tail_new,y_tail_new,sample=True) 140 | feat_engine.fit_transform_keys_order2(table_tail_new,y_tail_new,sample=True) 141 | 142 | del table_tail_new,y_tail_new 143 | gc.collect() 144 | 145 | feat_engine.fit_transform_all_order2(main_table,y,selection=False) 146 | feat_engine.fit_transform_keys_order2(main_table,y,selection=False) 147 | 148 | feat_engine.fit_transform_post_order1(main_table,y) 149 | 150 | if not sample_for_combine_features: 151 | gc.collect() 152 | 153 | feat_engine.fit_transform_all_order2(main_table,y) 154 | feat_engine.fit_transform_keys_order2(main_table,y) 155 | 156 | feat_engine.fit_transform_keys_order3(main_table,y) 157 | feat_engine.fit_transform_post_order1(main_table,y) 158 | 159 | 160 | del feat_engine 161 | gc.collect() 162 | 163 | 164 | X_test = main_table.data.loc[test_index] 165 | main_table.data = main_table.data.loc[train_index] 166 | 167 | gc.collect() 168 | 169 | test_table = copy.deepcopy(main_table) 170 | test_table.data = X_test 171 | self.test_table = test_table 172 | len_test = X_test.shape[0] 173 | gc.collect() 174 | 175 | feat_engine = FeatEngine(feat_pipeline,config) 176 | feat_engine.fit_transform_merge_order1(main_table,y) 177 | self.feat_engine = feat_engine 178 | 179 | feat_output = FeatOutput() 180 | self.feat_output = feat_output 181 | X,y,categories = feat_output.final_fit_transform_output(main_table,y) 182 | 183 | del main_table 184 | gc.collect() 185 | 186 | lgb = AutoLGB() 187 | 188 | 
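# NOTE: training order below: estimate base LightGBM parameters on the full
# matrix, tune them against a time-ordered holdout, then retrain an ensemble
# on the shuffled full data. time_train_test_split is assumed to cut the
# time-sorted rows so the most recent fraction becomes the validation fold,
# roughly (a sketch of the assumed contract, not the exact implementation):
#
#     cut = int(len(X) * (1 - 0.2))
#     X_train, y_train = X.iloc[:cut], y.iloc[:cut]
#     X_test,  y_test  = X.iloc[cut:], y.iloc[cut:]
#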
lgb.param_compute(X,y,categories,config) 189 | X_train,y_train,X_test,y_test = time_train_test_split(X,y,test_rate=0.2) 190 | 191 | lgb.param_opt_new(X_train,y_train,X_test,y_test,categories) 192 | 193 | gc.collect() 194 | 195 | del X_train,y_train,X_test,y_test 196 | 197 | gc.collect() 198 | 199 | X,y = self.shuffle(X,y,2019) 200 | gc.collect() 201 | 202 | lgb.ensemble_train(X,y,categories,config,len_test) 203 | 204 | gc.collect() 205 | 206 | importances = lgb.get_ensemble_importances() 207 | 208 | self.model = lgb 209 | del X,y 210 | 211 | elif split == -2: 212 | 213 | config = Config(time.time(),self.info['time_budget']) 214 | 215 | Xs[CONSTANT.MAIN_TABLE_NAME] = pd.concat([Xs[CONSTANT.MAIN_TABLE_NAME], ]) 216 | 217 | gc.collect() 218 | 219 | graph = Graph(self.info,Xs) 220 | graph.sort_tables() 221 | train_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[Xs[CONSTANT.MAIN_TABLE_NAME].index>=0] 222 | y = y.loc[train_index] 223 | 224 | graph.preprocess_fit_transform() 225 | gc.collect() 226 | 227 | merge_feat_pipeline = DeafultMergeFeatPipeline() 228 | merger = Merger(merge_feat_pipeline) 229 | 230 | merger.merge_table(graph) 231 | main_table = merger.merge_to_main_fit_transform(graph) 232 | self.release_tables(Xs,graph) 233 | del merger 234 | del graph 235 | gc.collect() 236 | 237 | feat_pipeline = DefaultFeatPipeline() 238 | feat_engine = FeatEngine(feat_pipeline,config) 239 | feat_engine.fit_transform_order1(main_table,y) 240 | 241 | sample_for_combine_features = True 242 | 243 | if sample_for_combine_features: 244 | main_data = main_table.data 245 | train_data = main_data.loc[main_data.index>=0] 246 | 247 | del main_data 248 | 249 | sample_num = CONSTANT.SAMPLE_NUM 250 | train_shape = train_data.shape 251 | 252 | if train_shape[0] <= sample_num: 253 | sample_for_combine_features = False 254 | else: 255 | data_tail_new = train_data.iloc[-sample_num:] 256 | 257 | gc.collect() 258 | log(f'sample data shape {data_tail_new.shape}') 259 | 260 | y_tail_new = y.loc[data_tail_new.index] 261 | 262 | table_tail_new = copy.deepcopy(main_table) 263 | table_tail_new.data = data_tail_new 264 | 265 | del data_tail_new 266 | gc.collect() 267 | 268 | feat_engine.fit_transform_all_order2(table_tail_new,y_tail_new,sample=True) 269 | feat_engine.fit_transform_keys_order2(table_tail_new,y_tail_new,sample=True) 270 | 271 | del table_tail_new,y_tail_new 272 | gc.collect() 273 | 274 | feat_engine.fit_transform_all_order2(main_table,y,selection=False) 275 | feat_engine.fit_transform_keys_order2(main_table,y,selection=False) 276 | feat_engine.fit_transform_post_order1(main_table,y) 277 | 278 | if not sample_for_combine_features: 279 | gc.collect() 280 | 281 | feat_engine.fit_transform_all_order2(main_table,y) 282 | feat_engine.fit_transform_keys_order2(main_table,y) 283 | feat_engine.fit_transform_keys_order3(main_table,y) 284 | feat_engine.fit_transform_post_order1(main_table,y) 285 | 286 | del feat_engine 287 | gc.collect() 288 | 289 | main_table.data = main_table.data.loc[train_index] 290 | 291 | gc.collect() 292 | 293 | def split_table(table,y): 294 | X = table.data 295 | X_train,y_train,X_test,y_test = time_train_test_split(X,y,shuffle=False,test_rate=0.2) 296 | table1 = copy.deepcopy(table) 297 | table1.data = X_train 298 | table2 = copy.deepcopy(table) 299 | table2.data = X_test 300 | return table1,y_train,table2,y_test 301 | 302 | table1,y_train,table2,y_test = split_table(main_table,y) 303 | 304 | feat_engine = FeatEngine(feat_pipeline,config) 305 | feat_engine.fit_transform_merge_order1(table1,y_train) 306 
| self.feat_engine = feat_engine 307 | 308 | feat_output = FeatOutput() 309 | self.feat_output = feat_output 310 | 311 | X_train,y_train,categories = feat_output.fit_transform_output(table1,y_train) 312 | 313 | gc.collect() 314 | self.feat_engine.transform_merge_order1(table2) 315 | X_test = self.feat_output.transform_output(table2) 316 | 317 | lgb = AutoLGB() 318 | 319 | lgb.param_compute(X_train,y_train,categories,config) 320 | 321 | lgb.param_opt_new(X_train,y_train,X_test,y_test,categories) 322 | 323 | len_test = X_test.shape[0] 324 | 325 | lgb.ensemble_train(X_train,y_train,categories,config,len_test) 326 | gc.collect() 327 | 328 | pred,pred0 = lgb.ensemble_predict_test(X_test) 329 | 330 | auc = roc_auc_score(y_test,pred0) 331 | print('source AUC:',auc) 332 | 333 | auc = roc_auc_score(y_test,pred) 334 | Model.ensemble_auc.append(auc) 335 | print('ensemble AUC:',auc) 336 | 337 | importances = lgb.get_ensemble_importances() 338 | 339 | self.model = lgb 340 | 341 | del X_train,y_train,X_test,y_test 342 | gc.collect() 343 | 344 | paths = os.path.join(feature_importance_path,version) 345 | if not os.path.exists(paths): 346 | os.makedirs(paths) 347 | importances.to_csv(os.path.join(paths,'{}_importances.csv'.format(datetime.now().strftime('%Y%m%d%H%M%S'))),index=False) 348 | 349 | @timeclass(cls='Model') 350 | def fit(self, Xs, y): 351 | self.Xs = Xs 352 | self.y = y 353 | 354 | 355 | @timeclass(cls='Model') 356 | def predict(self, X_test): 357 | 358 | self.my_fit(self.Xs, self.y, X_test) 359 | 360 | gc.collect() 361 | 362 | if self.split != -2: 363 | main_table = self.test_table 364 | self.feat_engine.transform_merge_order1(main_table) 365 | X = self.feat_output.transform_output(main_table) 366 | 367 | X.index = -(X.index+1) 368 | X.sort_index(inplace=True) 369 | 370 | result = self.model.ensemble_predict(X) 371 | return pd.Series(result) 372 | 373 | else: 374 | return pd.Series() 375 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/model_input.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from util import log, timeit, timeclass 5 | import numpy as np 6 | import gc 7 | import sys 8 | 9 | class FeatOutput: 10 | @timeclass(cls='FeatOutput') 11 | def transform_output(self,table): 12 | X = table.data 13 | 14 | self.drop_non_numerical_column(table,X) 15 | self.drop_post_drop_column(table,X) 16 | 17 | return X 18 | 19 | @timeclass(cls='FeatOutput') 20 | 21 | def fit_transform_output(self,table,y): 22 | X = table.data.copy() 23 | 24 | self.drop_non_numerical_column(table,X) 25 | self.drop_post_drop_column(table,X) 26 | 27 | categories = self.get_categories(table,X) 28 | 29 | return X,y,categories 30 | 31 | def final_fit_transform_output(self,table,y): 32 | X = table.data 33 | 34 | 35 | self.drop_non_numerical_column(table,X) 36 | self.drop_post_drop_column(table,X) 37 | 38 | categories = self.get_categories(table,X) 39 | 40 | return X,y,categories 41 | 42 | @timeclass(cls='FeatOutput') 43 | def fillna(self,table,X): 44 | for col in table.num_cols: 45 | X[col] = X[col].fillna(X[col].mean()) 46 | 47 | 48 | def get_categories(self,table,X): 49 | categories = [] 50 | col_set = set(X.columns) 51 | for col in table.cat_cols: 52 | if col in col_set: 53 | if X[col].nunique() <= 15: 54 | categories.append(col) 55 | 56 | 57 | return categories 58 | 59 | @timeclass(cls='FeatOutput') 60 | def drop_non_numerical_column(self,table,X): 61 | if table.key_time_col is not None: 62 
| 63 | X.drop(table.key_time_col,axis=1,inplace=True) 64 | gc.collect() 65 | 66 | if len(table.time_cols) != 0: 67 | X.drop(table.time_cols,axis=1,inplace=True) 68 | 69 | if len(table.multi_cat_cols) != 0: 70 | X.drop(table.multi_cat_cols,axis=1,inplace=True) 71 | 72 | @timeclass(cls='FeatOutput') 73 | def drop_post_drop_column(self,table,X): 74 | if len(table.post_drop_set) != 0: 75 | drop_cols = list(table.post_drop_set) 76 | X.drop(drop_cols,axis=1,inplace=True) 77 | log(f'post drop cols:{drop_cols}') 78 | 79 | @timeclass(cls='FeatOutput') 80 | def drop_cat_column(self,table,X): 81 | X.drop(list(set(table.session_cols + table.user_cols + table.key_cols + table.cat_cols)&set(X.columns)),axis=1,inplace=True) 82 | 83 | @timeclass(cls='FeatOutput') 84 | def cat_hash(self,table,X): 85 | for col in table.user_cols + table.key_cols + table.cat_cols: 86 | X[col] = X[col] % 15 87 | 88 | @timeclass(cls='FeatOutput') 89 | def cat_process(self,train_table,test_table): 90 | X = train_table 91 | 92 | train = train_table.data 93 | test = test_table.data 94 | for col in X.user_cols + X.key_cols + X.cat_cols: 95 | inter = set(train[col].unique()) & set(test[col].unique()) 96 | train.loc[~(train[col].isin(inter)),col] = np.nan 97 | test.loc[~(test[col].isin(inter)),col] = np.nan 98 | 99 | @timeclass(cls='FeatOutput') 100 | def drop_tail(self,train_table,test_table): 101 | X = train_table 102 | 103 | train = train_table.data 104 | test = test_table.data 105 | for col in X.key_cols + X.cat_cols: 106 | vc = train[col].value_counts() 107 | vc.loc[vc==1] = np.nan 108 | train[col] = train[col].map(vc) 109 | test[col] = test[col].map(vc) 110 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/preprocessor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeepBlueAI/AutoSmart/99a6b44ce5b6520a2463c8a4b3acc675584533be/auto_smart/auto_smart/preprocessor/__init__.py -------------------------------------------------------------------------------- /auto_smart/auto_smart/preprocessor/preprocessor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas as pd 3 | import numpy as np 4 | import ac 5 | import CONSTANT 6 | from data_tools import downcast 7 | from joblib import Parallel, delayed 8 | from util import timeclass 9 | from feat_context import FeatContext 10 | 11 | namespace = 'preprocess' 12 | 13 | class Preprocessor: 14 | def __init__(self): 15 | pass 16 | 17 | def fit(self,ss): 18 | pass 19 | 20 | def transform(self,ss): 21 | pass 22 | 23 | def fit_transform(self,ss): 24 | pass 25 | 26 | class GeneralPreprocessor(Preprocessor): 27 | def __init__(self): 28 | self.K = 5 29 | 30 | @timeclass(cls='GeneralPreprocessor') 31 | def transform(self,X): 32 | 33 | todo_list = X.multi_cat_cols 34 | if todo_list != []: 35 | 36 | col2muldatas = {} 37 | col2muldatalens = {} 38 | 39 | data = X.data[todo_list] 40 | for col in todo_list: 41 | vals = data[col].values 42 | datas,datalen = ac.get_need_data(vals) 43 | 44 | if len(datalen) != data.shape[0]: 45 | raise Exception('An error with data length happens!!') 46 | 47 | col2muldatas[col] = np.array(datas,dtype='int64').astype(np.int32) 48 | col2muldatalens[col] = np.array(datalen,dtype='int32') 49 | 50 | data = X.data[todo_list] 51 | col2type = {} 52 | col2groupby = {} 53 | for col in data.columns: 54 | data[col] = ac.tuple_encode_func_1(col2muldatas[col],col2muldatalens[col]) 
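# NOTE: the Cython helper above flattens every multi-value cell into one
# contiguous token array plus a per-row length array, e.g. (illustrative):
#
#     rows:    ["a,b", "c", "a"]
#     datas:   [1, 2, 3, 1]    # one integer code per token
#     datalen: [2, 1, 1]       # tokens per row
#
# tuple_encode_func_1 is then assumed to map each row's token tuple to a
# single categorical code, so rows with the same token combination share one
# id (the *_MCEncode column created below).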
55 | 56 | new_cols = [] 57 | for col in todo_list: 58 | feat_type = CONSTANT.CATEGORY_TYPE 59 | new_col = col+'_MCEncode' 60 | new_col = FeatContext.gen_feat_name(namespace,self.__class__.__name__,new_col,feat_type) 61 | new_cols.append(new_col) 62 | col2type[new_col] = feat_type 63 | col2groupby[new_col] = col 64 | 65 | data.columns = new_cols 66 | df = X.data 67 | for col in data.columns: 68 | df[col] = downcast(data[col],accuracy_loss=False) 69 | 70 | X.update_data(df,col2type,col2groupby) 71 | 72 | df = X.data 73 | index = df.index 74 | col2type = {} 75 | col2groupby = {} 76 | for col in todo_list: 77 | new_col = col+'_MCLenAsCat' 78 | feat_type = CONSTANT.CATEGORY_TYPE 79 | new_col = FeatContext.gen_feat_name(namespace,self.__class__.__name__,new_col,feat_type) 80 | df[new_col] = downcast( pd.Series( col2muldatalens[col],index ),accuracy_loss=False) 81 | 82 | col2type[new_col] = feat_type 83 | col2groupby[new_col] = col 84 | 85 | X.update_data(df,col2type,col2groupby) 86 | 87 | todo_list = X.time_cols 88 | 89 | if todo_list != []: 90 | df = X.data 91 | col2type = {} 92 | for col in X.time_cols: 93 | new_col = col+'_TimeNum' 94 | feat_type = CONSTANT.NUMERICAL_TYPE 95 | new_col = FeatContext.gen_feat_name(namespace,self.__class__.__name__,new_col,feat_type) 96 | 97 | ss = (df[col] - pd.to_datetime('1970-01-01')).dt.total_seconds() 98 | ss[ss<0] = np.nan 99 | min_time = ss.min() 100 | ss = ss-min_time 101 | 102 | df[new_col] = downcast(ss) 103 | 104 | col2type[new_col] = feat_type 105 | 106 | if len(col2type) > 0: 107 | X.update_data(df,col2type,None) 108 | 109 | @timeclass(cls='GeneralPreprocessor') 110 | def fit_transform(self,X): 111 | return self.transform(X) 112 | 113 | class BinaryPreprocessor(Preprocessor): 114 | def __init__(self): 115 | self.col2cats = {} 116 | 117 | @timeclass(cls='BinaryPreprocessor') 118 | def fit(self,X): 119 | def func(ss): 120 | cats = pd.Categorical(ss).categories 121 | return cats 122 | 123 | df = X.data 124 | todo_cols = X.binary_cols 125 | 126 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(df[col]) for col in todo_cols) 127 | for col,cats in zip(todo_cols,res): 128 | self.col2cats[col] = cats 129 | 130 | @timeclass(cls='BinaryPreprocessor') 131 | def transform(self,X): 132 | 133 | def func(ss,cats): 134 | codes = pd.Categorical(ss,categories=cats).codes 135 | codes = codes.astype('float16') 136 | codes[codes==-1] = np.nan 137 | 138 | return codes 139 | 140 | df = X.data 141 | todo_cols = X.binary_cols 142 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(df[col],self.col2cats[col]) for col in todo_cols) 143 | for col,codes in zip(todo_cols,res): 144 | df[col] = codes 145 | 146 | @timeclass(cls='BinaryPreprocessor') 147 | def fit_transform(self,X): 148 | self.fit(X) 149 | self.transform(X) 150 | 151 | class MSCatPreprocessor(Preprocessor): 152 | def __init__(self): 153 | self.cats = [] 154 | 155 | def fit(self,ss): 156 | vals = ss.values 157 | 158 | ss = pd.Series( list(ac.mscat_fit(vals)) ) 159 | 160 | if ss.name is None: 161 | ss.name = 'ss' 162 | 163 | cats = ss.dropna().drop_duplicates().values 164 | 165 | if len(self.cats) == 0: 166 | self.cats = sorted(list(cats)) 167 | else: 168 | added_cats = sorted(set(cats) - set(self.cats)) 169 | self.cats.extend(added_cats) 170 | 171 | def transform(self,ss,kind): 172 | 173 | if kind == CONSTANT.CATEGORY_TYPE: 174 | 175 | codes = pd.Categorical(ss,categories=self.cats).codes + CONSTANT.CAT_SHIFT 176 | codes = codes.astype('float') 177 | 
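# NOTE: pd.Categorical assigns code -1 to values missing from `categories`;
# after adding CONSTANT.CAT_SHIFT those rows land exactly on CAT_SHIFT-1,
# which the next statement converts back to NaN so unseen or missing
# categories stay missing downstream.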
codes[codes==(CONSTANT.CAT_SHIFT-1)] = np.nan 178 | 179 | codes = downcast(codes,accuracy_loss=False) 180 | return codes 181 | else: 182 | codes = pd.Series( ac.mscat_trans(ss.values,self.cats) , index = ss.index ) 183 | return codes 184 | 185 | def fit_transform(self,ss): 186 | return self.transform(ss) 187 | 188 | class NumPreprocessor(Preprocessor): 189 | def fit(self,X): 190 | pass 191 | 192 | def transform(self,X): 193 | df = X.data 194 | todo_cols = X.num_cols 195 | for col in todo_cols: 196 | df[col] = downcast(df[col]) 197 | 198 | def fit_transform(self,X): 199 | return self.transform(X) 200 | 201 | class UniquePreprocessor(Preprocessor): 202 | @timeclass(cls='UniquePreprocessor') 203 | def fit(self,X): 204 | def func(ss): 205 | length = len(ss.unique()) 206 | if length <= 1: 207 | return True 208 | else: 209 | return False 210 | 211 | df = X.data 212 | todo_cols = X.cat_cols + X.multi_cat_cols + X.num_cols + X.time_cols + X.binary_cols 213 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(df[col]) for col in todo_cols) 214 | 215 | drop_cols = [] 216 | for col,unique in zip(todo_cols,res): 217 | if unique: 218 | drop_cols.append(col) 219 | 220 | self.drop_cols = drop_cols 221 | 222 | @timeclass(cls='UniquePreprocessor') 223 | def transform(self,X): 224 | X.drop_data(self.drop_cols) 225 | 226 | @timeclass(cls='UniquePreprocessor') 227 | def fit_transform(self,X): 228 | self.fit(X) 229 | self.transform(X) 230 | 231 | class AllDiffPreprocessor(Preprocessor): 232 | @timeclass(cls='AllDiffPreprocessor') 233 | def fit(self,X): 234 | def func(ss): 235 | length = len(ss.unique()) 236 | if length >= len(ss)-10: 237 | return True 238 | else: 239 | return False 240 | 241 | df = X.data 242 | todo_cols = X.cat_cols 243 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(df[col]) for col in todo_cols) 244 | 245 | drop_cols = [] 246 | for col,all_diff in zip(todo_cols,res): 247 | if all_diff: 248 | drop_cols.append(col) 249 | 250 | self.drop_cols = drop_cols 251 | 252 | @timeclass(cls='AllDiffPreprocessor') 253 | def transform(self,X): 254 | X.drop_data(self.drop_cols) 255 | 256 | @timeclass(cls='AllDiffPreprocessor') 257 | def fit_transform(self,X): 258 | self.fit(X) 259 | self.transform(X) 260 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/table/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeepBlueAI/AutoSmart/99a6b44ce5b6520a2463c8a4b3acc675584533be/auto_smart/auto_smart/table/__init__.py -------------------------------------------------------------------------------- /auto_smart/auto_smart/table/graph.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .table import Table 4 | from preprocessor.preprocessor import MSCatPreprocessor 5 | import pandas as pd 6 | import CONSTANT 7 | from util import timeclass, log 8 | from collections import defaultdict, deque 9 | import gc 10 | from joblib import Parallel, delayed 11 | 12 | class Graph: 13 | def __init__(self,info,tables): 14 | 15 | self.info = info 16 | 17 | self.table2info = info['tables'] 18 | self.relations = info['relations'] 19 | self.key_time_col = info['time_col'] 20 | 21 | self.M2M_relation_cnt = 0 22 | for relation in info['relations']: 23 | if relation['type'] == "many_to_many": 24 | self.M2M_relation_cnt = self.M2M_relation_cnt + 1 25 | 26 | self.key_col_set = None 27 | 
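# NOTE: the "user" column is guessed in recognize_user_col below as the
# main-table key with the highest cardinality; session columns are detected
# relative to it, and category/multi-cat columns that share vocabulary are
# grouped into "blocks" so a single MSCatPreprocessor can fit one shared
# label space per block (see init_graph_to_blocks).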
self.user_col = None 28 | 29 | self.name2table = {} 30 | self.tables = [] 31 | 32 | key_col_set = set() 33 | for relation in info['relations']: 34 | key_col_set.update(relation['key']) 35 | self.key_col_set = key_col_set 36 | 37 | user_col = None 38 | for tname,table in tables.items(): 39 | key_cols = [] 40 | if tname == CONSTANT.MAIN_TABLE_NAME: 41 | for col in self.table2info[tname]: 42 | if col in self.key_col_set: 43 | key_cols.append(col) 44 | 45 | user_col = self.recognize_user_col(tables[tname],key_cols) 46 | 47 | self.user_col = user_col 48 | del user_col 49 | 50 | main_cat_cols = [] 51 | session_col = None 52 | for tname,table in tables.items(): 53 | if tname == CONSTANT.MAIN_TABLE_NAME: 54 | for col in self.table2info[tname]: 55 | type_ = self.table2info[tname][col] 56 | if type_ == CONSTANT.CATEGORY_TYPE and col!=self.user_col and col not in key_col_set: 57 | main_cat_cols.append(col) 58 | 59 | session_cols = self.recognize_session_col(tables[tname],main_cat_cols,self.user_col) 60 | 61 | 62 | self.main_session_cols = session_cols 63 | del main_cat_cols 64 | del session_col 65 | 66 | for tname,table in tables.items(): 67 | key_cols = [] 68 | key_time_col = None 69 | user_cols = [] 70 | 71 | for col in self.table2info[tname]: 72 | 73 | if col in self.key_col_set and col != self.user_col: 74 | key_cols.append(col) 75 | 76 | if col == self.user_col: 77 | user_cols.append(col) 78 | 79 | if col == self.key_time_col: 80 | key_time_col = col 81 | 82 | cat_cols = [] 83 | for col in self.table2info[tname]: 84 | type_ = self.table2info[tname][col] 85 | if type_ == CONSTANT.CATEGORY_TYPE: 86 | cat_cols.append(col) 87 | 88 | binary_cols = self.recognize_binary_col(tables[tname],cat_cols) 89 | for col in binary_cols: 90 | self.table2info[tname][col] = CONSTANT.BINARY_TYPE 91 | 92 | self.tables.append(tname) 93 | if tname == CONSTANT.MAIN_TABLE_NAME: 94 | self.name2table[tname] = Table(tables[tname],self.table2info[tname],self.main_session_cols,user_cols,key_cols,key_time_col,tname) 95 | 96 | else: 97 | self.name2table[tname] = Table(tables[tname],self.table2info[tname],[],user_cols,key_cols,key_time_col,tname) 98 | 99 | if tname == CONSTANT.MAIN_TABLE_NAME: 100 | self.main_key_cols = key_cols 101 | self.main_key_time_col = key_time_col 102 | self.main_user_col = user_cols 103 | self.main_table_info = self.table2info[tname] 104 | 105 | block2name,name2block = self.init_graph_to_blocks() 106 | self.block2name = block2name 107 | self.name2block = name2block 108 | 109 | for tname in self.name2table: 110 | self.name2table[tname].block2name = block2name 111 | self.name2table[tname].name2block = name2block 112 | 113 | for tname in self.name2table: 114 | col2block = {} 115 | for col in self.name2table[tname].data.columns: 116 | name = tname + ':' + col 117 | 118 | if name in self.name2block: 119 | block_id = self.name2block[name] 120 | col2block[col] = block_id 121 | 122 | self.name2table[tname].col2block = col2block 123 | 124 | for tname in self.name2table: 125 | col2table = {} 126 | for col in self.name2table[tname].data.columns: 127 | col2table[col] = tname 128 | 129 | self.name2table[tname].col2table = col2table 130 | 131 | @timeclass(cls='Graph') 132 | def init_graph_to_blocks(self): 133 | mode = 'all' 134 | if mode == 'all': 135 | t_datas = [] 136 | t_names = [] 137 | 138 | for t_name in self.name2table: 139 | t_table = self.name2table[t_name] 140 | t_data = t_table.data 141 | t_data_num = t_data.shape[0] 142 | t_limit_num = 100000 143 | if t_limit_num > t_data_num: 144 | t_limit_num = 
t_data_num 145 | t_sample_frac = t_limit_num / t_data_num 146 | t_data = t_data.sample(frac=t_sample_frac,random_state=CONSTANT.SEED) 147 | 148 | t_datas.append(t_data) 149 | t_names.append(t_name) 150 | 151 | all_cat_cols = [] 152 | all_cat2type = {} 153 | for t_data,t_name in zip(t_datas,t_names): 154 | 155 | for col in t_data.columns: 156 | col2type = self.table2info[ t_name ][ col ] 157 | new_col = t_name+':'+col 158 | if col2type == CONSTANT.MULTI_CAT_TYPE or col2type == CONSTANT.CATEGORY_TYPE: 159 | all_cat_cols.append(new_col) 160 | all_cat2type[new_col] = col2type 161 | 162 | mc_graph = {} 163 | all_cat_len = len(all_cat_cols) 164 | for i in range(all_cat_len): 165 | name1 = all_cat_cols[i] 166 | mc_graph[name1] = {} 167 | for j in range(all_cat_len): 168 | name2 = all_cat_cols[j] 169 | mc_graph[name1][name2] = 0 170 | 171 | for t1 in range(len(t_datas)): 172 | t_data_1 = t_datas[t1] 173 | t_name_1 = t_names[t1] 174 | for col1 in t_data_1.columns: 175 | if col1 in self.key_col_set: 176 | name1 = t_name_1+':'+col1 177 | 178 | for t2 in range(len(t_datas)): 179 | t_data_2 = t_datas[t2] 180 | t_name_2 = t_names[t2] 181 | for col2 in t_data_2.columns: 182 | if col2 == col1: 183 | name2 = t_name_2+':'+col2 184 | mc_graph[name1][name2] = 1 185 | mc_graph[name2][name1] = 1 186 | 187 | log('init mcgraph') 188 | 189 | all_cat2set = {} 190 | 191 | for t_data,t_name in zip(t_datas,t_names): 192 | for col in t_data.columns: 193 | new_col = t_name+':'+col 194 | if new_col in all_cat2type: 195 | cur_set = set() 196 | if all_cat2type[new_col] == CONSTANT.MULTI_CAT_TYPE: 197 | 198 | for val in t_data[col]: 199 | if type(val) == float: 200 | continue 201 | cur_set.update(val.split(CONSTANT.MULTI_CAT_DELIMITER)) 202 | 203 | elif all_cat2type[new_col] == CONSTANT.CATEGORY_TYPE: 204 | cur_set = set(t_data[col].dropna()) 205 | 206 | all_cat2set[new_col] = cur_set 207 | 208 | all_cat_len = len(all_cat_cols) 209 | for i in range(all_cat_len): 210 | for j in range(i+1,all_cat_len): 211 | name1 = all_cat_cols[i] 212 | name2 = all_cat_cols[j] 213 | 214 | len1 = len(all_cat2set[name1]) 215 | len2 = len(all_cat2set[name2]) 216 | 217 | less_len = min(len1,len2) 218 | if less_len <= 1: 219 | continue 220 | 221 | if mc_graph[name1][name2]==1 or mc_graph[name2][name1] == 1: 222 | continue 223 | 224 | if len(all_cat2set[name1] & all_cat2set[name2])/less_len > 0.1: 225 | mc_graph[name1][name2] = 1 226 | mc_graph[name2][name1] = 1 227 | 228 | block2name = {} 229 | 230 | block_id = 0 231 | vis = {} 232 | nodes = list(mc_graph.keys()) 233 | def dfs(now,block_id): 234 | block2name[block_id].append(now) 235 | for nex in nodes: 236 | if mc_graph[now][nex] and ( not (nex in vis) ): 237 | vis[nex] = 1 238 | dfs(nex,block_id) 239 | 240 | for now in nodes: 241 | if now in vis: 242 | continue 243 | vis[now] = 1 244 | block_id += 1 245 | block2name[block_id] = [] 246 | dfs(now,block_id) 247 | 248 | name2block = {} 249 | 250 | for block in block2name: 251 | for col in block2name[block]: 252 | name2block[col] = block 253 | log(f'blocks: {block2name}') 254 | return block2name,name2block 255 | 256 | elif mode == 'part': 257 | pass 258 | 259 | @timeclass(cls='Graph') 260 | def sort_tables(self): 261 | for tname in self.name2table: 262 | table = self.name2table[tname] 263 | if table.key_time_col is not None: 264 | table.data.sort_values(by=table.key_time_col,inplace=True) 265 | 266 | @timeclass(cls='Graph') 267 | def sort_main_table(self): 268 | table = self.name2table[CONSTANT.MAIN_TABLE_NAME] 269 | if table.key_time_col is not 
None: 270 | table.data.sort_values(by=table.key_time_col,inplace=True) 271 | 272 | @timeclass(cls='Graph') 273 | def recognize_session_col(self,data,cat_cols,user_col): 274 | if user_col is None: 275 | return [] 276 | 277 | user_nunique = data[user_col].nunique() 278 | session_cols = [] 279 | 280 | def func(df,user_nunique): 281 | cat_col = df.columns[0] 282 | user_col = df.columns[1] 283 | cat_nunique = df[cat_col].nunique() 284 | 285 | if (cat_nunique <= user_nunique) or (cat_nunique >= df.shape[0]-10): 286 | return False 287 | 288 | if (df.groupby(cat_col)[user_col].nunique()>1).sum()>10: 289 | return False 290 | 291 | return True 292 | 293 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(data[[col,user_col]],user_nunique) for col in cat_cols) 294 | 295 | for col,is_session in zip(cat_cols,res): 296 | if is_session: 297 | session_cols.append(col) 298 | 299 | return session_cols 300 | 301 | @timeclass(cls='Graph') 302 | def recognize_binary_col(self,data,cat_cols): 303 | def func(ss): 304 | ss = ss.unique() 305 | if len(ss) == 3: 306 | if pd.isna(ss).sum() == 1: 307 | return True 308 | if len(ss) == 2: 309 | return True 310 | return False 311 | 312 | binary_cols = [] 313 | 314 | res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(data[col]) for col in cat_cols) 315 | 316 | for col,is_binary in zip(cat_cols,res): 317 | if is_binary: 318 | binary_cols.append(col) 319 | 320 | return binary_cols 321 | 322 | @timeclass(cls='Graph') 323 | def recognize_user_col(self,data,key_cols): 324 | user_col = None 325 | nunique = -1 326 | for col in key_cols: 327 | nnum = data[col].nunique() 328 | if nnum > nunique: 329 | user_col = col 330 | nunique = nnum 331 | return user_col 332 | 333 | @timeclass(cls='Graph') 334 | def preprocess_fit_transform(self): 335 | log('start mscat') 336 | 337 | mscat_block2preprocessor = {} 338 | for block_id in range(1,len(self.block2name)+1): 339 | mscat_block2preprocessor[block_id] = MSCatPreprocessor() 340 | ss = {} 341 | for block_id in range(1,len(self.block2name)+1): 342 | ss[block_id] = pd.Series() 343 | 344 | t_datas = [] 345 | t_names = [] 346 | for t_name in self.name2table: 347 | t_table = self.name2table[t_name] 348 | t_data = t_table.data 349 | 350 | t_datas.append(t_data) 351 | t_names.append(t_name) 352 | 353 | for t in range(len(t_datas)): 354 | t_data = t_datas[t] 355 | t_name = t_names[t] 356 | for col in t_data.columns: 357 | coltype = self.table2info[ t_name ][col] 358 | if coltype == CONSTANT.MULTI_CAT_TYPE or coltype == CONSTANT.CATEGORY_TYPE: 359 | name = t_name + ':' + col 360 | if name in self.name2block: 361 | block_id = self.name2block[name] 362 | ss[block_id] = pd.concat([ss[block_id],t_data[col].drop_duplicates()]) 363 | 364 | for block_id in range(1,len(self.block2name)+1): 365 | mscat_block2preprocessor[block_id].fit(ss[block_id]) 366 | 367 | for tname,table in self.name2table.items(): 368 | table.preprocess_fit_transform(mscat_block2preprocessor) 369 | 370 | gc.collect() 371 | 372 | def set_main_table(self,table): 373 | tname = CONSTANT.MAIN_TABLE_NAME 374 | self.name2table[CONSTANT.MAIN_TABLE_NAME] = Table(table,self.main_table_info,self.main_session_cols,self.main_user_col,self.main_key_cols,self.main_key_time_col,tname) 375 | gc.collect() 376 | 377 | @timeclass(cls='Graph') 378 | def bfs(self,root_name, graph, depth): 379 | depth[CONSTANT.MAIN_TABLE_NAME]['depth'] = 0 380 | queue = deque([root_name]) 381 | while queue: 382 | u_name = queue.popleft() 383 | for edge in graph[u_name]: 384 | v_name = 
edge['to'] 385 | if 'depth' not in depth[v_name]: 386 | depth[v_name]['depth'] = depth[u_name]['depth'] + 1 387 | queue.append(v_name) 388 | 389 | @timeclass(cls='Graph') 390 | def build_depth(self): 391 | rel_graph = defaultdict(list) 392 | depth = {} 393 | 394 | for tname in self.tables: 395 | depth[tname] = {} 396 | 397 | for rel in self.relations: 398 | ta = rel['table_A'] 399 | tb = rel['table_B'] 400 | rel_graph[ta].append({ 401 | "to": tb, 402 | "key": rel['key'], 403 | "type": rel['type'] 404 | }) 405 | rel_graph[tb].append({ 406 | "to": ta, 407 | "key": rel['key'], 408 | "type": '_'.join(rel['type'].split('_')[::-1]) 409 | }) 410 | self.bfs(CONSTANT.MAIN_TABLE_NAME, rel_graph, depth) 411 | 412 | self.rel_graph = rel_graph 413 | self.depth = depth 414 | 415 | 416 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/table/table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from preprocessor.preprocessor import * 3 | import CONSTANT 4 | from util import timeclass,log 5 | import gc 6 | 7 | class Table: 8 | def __init__(self,data,table_info,session_cols,user_cols,key_cols,key_time_col,tname): 9 | self.name = tname 10 | 11 | self.col2type = {} 12 | self.col2groupby = {} 13 | self.col2block = {} 14 | self.col2istime = {} 15 | 16 | self.col2muldatas = {} 17 | self.col2muldatalens = {} 18 | 19 | self.user_cols = user_cols 20 | self.session_cols = [] 21 | 22 | self.block2name = {} 23 | self.name2block = {} 24 | 25 | for col in session_cols: 26 | if len(self.user_cols) > 0: 27 | self.session_cols.append(col) 28 | self.col2groupby[col] = self.user_cols[0] 29 | 30 | self.key_time_col = key_time_col 31 | self.key_cols = key_cols 32 | 33 | self.cat_cols = None 34 | 35 | self.binary_cols = None 36 | self.multi_cat_cols = None 37 | self.num_cols = None 38 | 39 | self.time_cols = None 40 | 41 | self.bin_cols = [] 42 | 43 | self.update_data(data,table_info,None) 44 | 45 | log(f'session_cols:{self.session_cols}') 46 | log(f'user_cols:{self.user_cols}') 47 | log(f'key_cols:{self.key_cols}') 48 | log(f'cat_cols:{self.cat_cols}') 49 | log(f'binary_cols:{self.binary_cols}') 50 | log(f'multi_cat_cols:{self.multi_cat_cols}') 51 | log(f'key_time_col:{self.key_time_col}') 52 | log(f'time_cols:{self.time_cols}') 53 | log(f'num_cols:{self.num_cols}') 54 | 55 | self.apart_cat_set = set() 56 | self.post_drop_set = set() 57 | 58 | self.col2source_cat = {} 59 | 60 | self.combine_cat_cols = [] 61 | self.combine_num_cols = [] 62 | self.combine_binary_cols = [] 63 | self.wait_selection_cols = [] 64 | 65 | def add_session_col(self,col): 66 | self.session_cols.append(col) 67 | self.col2type[col] = CONSTANT.CATEGORY_TYPE 68 | if len(self.user_cols) > 0: 69 | self.col2groupby[col] = self.user_cols[0] 70 | 71 | def get_groupby_cols(self,by,cols): 72 | new_cols = [] 73 | bys = set() 74 | bys.add(by) 75 | while by in self.col2groupby: 76 | by = self.col2groupby[by] 77 | bys.add(by) 78 | 79 | for col in cols: 80 | is_skip = False 81 | cur = col 82 | while True: 83 | if cur in bys: 84 | is_skip = True 85 | break 86 | 87 | if cur in self.col2groupby: 88 | cur = self.col2groupby[cur] 89 | else: 90 | break 91 | 92 | if not is_skip: 93 | new_cols.append(col) 94 | 95 | return new_cols 96 | 97 | def get_not_apart_cat_cols(self,cols): 98 | new_cols = [] 99 | for col in cols: 100 | if col not in self.apart_cat_set: 101 | new_cols.append(col) 102 | return new_cols 103 | 104 | def drop_data(self,cols): 105 | 
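# Drops feature columns from the underlying frame while protecting the
# structural columns (session/user/key/key-time), then syncs the
# col2type/col2groupby bookkeeping via drop_data_cols below.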
drop_cols = [] 106 | for col in cols: 107 | if col not in self.session_cols\ 108 | and col not in self.user_cols\ 109 | and col not in self.key_cols\ 110 | and col != self.key_time_col: 111 | drop_cols.append(col) 112 | if len(drop_cols)>0: 113 | self.data.drop(drop_cols,axis=1,inplace=True) 114 | self.drop_data_cols(drop_cols) 115 | 116 | def drop_data_cols(self,drop_cols): 117 | for col in drop_cols: 118 | self.col2type.pop(col) 119 | if col in self.col2groupby: 120 | self.col2groupby.pop(col) 121 | 122 | self.type_reset() 123 | self.drop_combine_cols(drop_cols) 124 | 125 | def drop_combine_cols(self,drop_cols): 126 | drop_cols_set = set(drop_cols) 127 | 128 | combine_cat_cols = [] 129 | combine_num_cols = [] 130 | combine_binary_cols = [] 131 | 132 | for col in self.combine_cat_cols: 133 | if col not in drop_cols_set: 134 | combine_cat_cols.append(col) 135 | 136 | for col in self.combine_num_cols: 137 | if col not in drop_cols_set: 138 | combine_num_cols.append(col) 139 | 140 | for col in self.combine_binary_cols: 141 | if col not in drop_cols_set: 142 | combine_binary_cols.append(col) 143 | 144 | self.combine_cat_cols = combine_cat_cols 145 | self.combine_num_cols = combine_num_cols 146 | self.combine_binary_cols = combine_binary_cols 147 | 148 | def add_apart_cat_cols(self,cols): 149 | self.apart_cat_set.update(cols) 150 | 151 | def add_post_drop_cols(self,cols): 152 | self.post_drop_set.update(cols) 153 | 154 | def add_wait_selection_cols(self,cols): 155 | self.wait_selection_cols.append(cols) 156 | 157 | def empty_wait_selection_cols(self): 158 | self.wait_selection_cols = [] 159 | 160 | def update_data(self,data,col2type,col2groupby,col2source_cat=None,col2block=None,col2istime=None): 161 | 162 | self.data = data 163 | self.update_col2type(col2type) 164 | if col2groupby is not None: 165 | self.update_col2groupby(col2groupby) 166 | 167 | if col2block is not None: 168 | self.update_col2block(col2block) 169 | if col2istime is not None: 170 | self.update_col2istime(col2istime) 171 | 172 | if col2source_cat is not None: 173 | self.update_col2source_cat(col2source_cat) 174 | gc.collect() 175 | 176 | def update_col2block(self,col2block): 177 | self.col2block.update(col2block) 178 | 179 | def update_col2istime(self,col2istime): 180 | self.col2istime.update(col2istime) 181 | 182 | def update_col2groupby(self,col2groupby): 183 | self.col2groupby.update(col2groupby) 184 | 185 | def update_col2source_cat(self,col2source_cat): 186 | self.col2source_cat.update(col2source_cat) 187 | 188 | def update_col2type(self,col2type): 189 | self.col2type.update(col2type) 190 | self.type_reset() 191 | 192 | def reset_combine_cols(self,combine_cat_cols=None,combine_num_cols=None,combine_binary_cols=None): 193 | self.combine_cat_cols = combine_cat_cols 194 | self.combine_num_cols = combine_num_cols 195 | self.combine_binary_cols = combine_binary_cols 196 | 197 | def type_reset(self): 198 | 199 | cat_cols = [] 200 | binary_cols = [] 201 | multi_cat_cols = [] 202 | num_cols = [] 203 | time_cols = [] 204 | 205 | for cname,ctype in self.col2type.items(): 206 | if (ctype == CONSTANT.CATEGORY_TYPE) \ 207 | and (cname not in self.key_cols)\ 208 | and (cname not in self.user_cols)\ 209 | and (cname not in self.session_cols): 210 | cat_cols.append(cname) 211 | elif ctype == CONSTANT.BINARY_TYPE: 212 | binary_cols.append(cname) 213 | elif ctype == CONSTANT.MULTI_CAT_TYPE: 214 | multi_cat_cols.append(cname) 215 | elif ctype == CONSTANT.NUMERICAL_TYPE: 216 | num_cols.append(cname) 217 | elif ctype == CONSTANT.TIME_TYPE 
and cname != self.key_time_col: 218 | time_cols.append(cname) 219 | 220 | self.cat_cols = sorted(cat_cols) 221 | self.binary_cols = sorted(binary_cols) 222 | self.num_cols = sorted(num_cols) 223 | self.multi_cat_cols = sorted(multi_cat_cols) 224 | self.time_cols = sorted(time_cols) 225 | 226 | @timeclass(cls='Table') 227 | def preprocess_fit_transform(self,mscat_group2preprocessor): 228 | 229 | for col in (self.cat_cols+self.multi_cat_cols+self.user_cols+self.key_cols+self.session_cols): 230 | name = self.name+':'+col 231 | if name in self.name2block: 232 | block_id = self.name2block[name] 233 | self.data[col] = mscat_group2preprocessor[block_id].transform(self.data[col],self.col2type[col]) 234 | 235 | unique_preprocessor = UniquePreprocessor() 236 | unique_preprocessor.fit_transform(self) 237 | 238 | all_diff_preprocessor = AllDiffPreprocessor() 239 | all_diff_preprocessor.fit_transform(self) 240 | 241 | binary_preprocessor = BinaryPreprocessor() 242 | binary_preprocessor.fit_transform(self) 243 | 244 | num_preprocess = NumPreprocessor() 245 | num_preprocess.fit_transform(self) 246 | 247 | general_preprocessor = GeneralPreprocessor() 248 | general_preprocessor.fit_transform(self) 249 | -------------------------------------------------------------------------------- /auto_smart/auto_smart/util.py: -------------------------------------------------------------------------------- 1 | 2 | import time 3 | from typing import Any 4 | 5 | 6 | import functools 7 | nesting_level = 0 8 | is_start = None 9 | 10 | 11 | class Timer: 12 | def __init__(self): 13 | self.start = time.time() 14 | self.history = [self.start] 15 | 16 | def check(self, info): 17 | current = time.time() 18 | log(f"[{info}] spend {current - self.history[-1]:0.2f} sec") 19 | self.history.append(current) 20 | 21 | 22 | def timeclass(cls): 23 | def timeit(method, start_log=None): 24 | @functools.wraps(method) 25 | def timed(*args, **kw): 26 | global is_start 27 | global nesting_level 28 | 29 | if not is_start: 30 | print() 31 | 32 | is_start = True 33 | log(f"Start [{cls}.{method.__name__}]:" + (start_log if start_log else "")) 34 | log(f'Start time: {time.strftime("%Y-%m-%d %H:%M:%S")}') 35 | nesting_level += 1 36 | 37 | start_time = time.time() 38 | result = method(*args, **kw) 39 | end_time = time.time() 40 | 41 | nesting_level -= 1 42 | log(f"End [{cls}.{method.__name__}]. Time elapsed: {end_time - start_time:0.2f} sec.") 43 | log(f'End time: {time.strftime("%Y-%m-%d %H:%M:%S")}') 44 | is_start = False 45 | 46 | return result 47 | 48 | return timed 49 | return timeit 50 | 51 | def timeit(method, start_log=None): 52 | @functools.wraps(method) 53 | def timed(*args, **kw): 54 | global is_start 55 | global nesting_level 56 | 57 | if not is_start: 58 | print() 59 | 60 | is_start = True 61 | log(f"Start [{method.__name__}]:" + (start_log if start_log else "")) 62 | nesting_level += 1 63 | 64 | start_time = time.time() 65 | result = method(*args, **kw) 66 | end_time = time.time() 67 | 68 | nesting_level -= 1 69 | log(f"End [{method.__name__}]. 
Time elapsed: {end_time - start_time:0.2f} sec.") 70 | is_start = False 71 | 72 | return result 73 | 74 | return timed 75 | 76 | 77 | def log(entry: Any): 78 | global nesting_level 79 | space = "-" * (4 * nesting_level) 80 | print(f"{space}{entry}") 81 | 82 | def show_dataframe(df): 83 | if len(df) <= 30: 84 | print(f"content=\n" 85 | f"{df}") 86 | else: 87 | print(f"dataframe is too large to show the content, over {len(df)} rows") 88 | 89 | if len(df.dtypes) <= 100: 90 | print(f"types=\n" 91 | f"{df.dtypes}\n") 92 | else: 93 | print(f"dataframe is too wide to show the dtypes, over {len(df.dtypes)} columns") 94 | 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /auto_smart/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import setuptools 5 | from Cython.Build import cythonize 6 | 7 | setuptools.setup( 8 | name='AutoSmart', 9 | version='0.0.2', 10 | author='DeepBlueAI', 11 | author_email='1229991666@qq.com', 12 | url='https://github.com/DeepBlueAI/AutoSmart', 13 | description=u'The 1st place solution for KDD Cup 2019 AutoML Track', 14 | packages=setuptools.find_packages(), 15 | install_requires=[ 16 | "hyperopt", 17 | "lightgbm==2.3.0", 18 | "joblib", 19 | "pandas", 20 | ], 21 | ext_modules = cythonize("ac.pyx"), 22 | classifiers=[ 23 | "Programming Language :: Python :: 3", 24 | "License :: OSI Approved :: GNU General Public License (GPL)", 25 | "Operating System :: OS Independent", 26 | ], 27 | ) 28 | -------------------------------------------------------------------------------- /demo/data/train/info.json: -------------------------------------------------------------------------------- 1 | { 2 | "time_budget": 300, 3 | "time_col": "t_01", 4 | "start_time": 1550654179, 5 | "tables": { 6 | "main": { 7 | "t_01": "time", 8 | "c_1": "cat", 9 | "c_2": "cat", 10 | "n_1": "num", 11 | "n_2": "num", 12 | "c_3": "cat", 13 | "c_02": "cat", 14 | "c_01": "cat" 15 | }, 16 | "table_1": { 17 | "c_01": "cat", 18 | "c_1": "cat", 19 | "c_2": "cat", 20 | "n_1": "num", 21 | "c_3": "cat", 22 | "c_4": "cat", 23 | "t_1": "time", 24 | "t_2": "time", 25 | "n_2": "num", 26 | "n_3": "num", 27 | "n_4": "num", 28 | "n_5": "num", 29 | "m_1": "multi-cat", 30 | "m_2": "multi-cat", 31 | "m_3": "multi-cat", 32 | "m_4": "multi-cat", 33 | "m_5": "multi-cat", 34 | "m_6": "multi-cat" 35 | }, 36 | "table_2": { 37 | "c_02": "cat", 38 | "c_1": "cat", 39 | "c_2": "cat", 40 | "c_3": "cat", 41 | "c_4": "cat", 42 | "t_1": "time" 43 | }, 44 | "table_3": { 45 | "n_1": "num", 46 | "c_02": "cat", 47 | "t_01": "time" 48 | } 49 | }, 50 | "relations": [ 51 | { 52 | "table_A": "main", 53 | "table_B": "table_1", 54 | "key": ["c_01"], 55 | "type": "many_to_one" 56 | }, 57 | { 58 | "table_A": "main", 59 | "table_B": "table_2", 60 | "key": ["c_02"], 61 | "type": "many_to_one" 62 | }, 63 | { 64 | "table_A": "main", 65 | "table_B": "table_3", 66 | "key": ["c_02"], 67 | "type": "many_to_one" 68 | } 69 | ] 70 | } 71 | -------------------------------------------------------------------------------- /demo/demo.py: -------------------------------------------------------------------------------- 1 | import auto_smart 2 | 3 | info = auto_smart.read_info("data") 4 | train_data,train_label = auto_smart.read_train("data",info) 5 | test_data = auto_smart.read_test("data",info) 6 | 7 | auto_smart.train_and_predict(train_data,train_label,info,test_data) 8 | 9 | 
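# The demo assumes the directory layout shown at the top of this dump:
# data/train with info.json plus the main and side tables, and data/test
# with main_test.data; train_and_predict then runs the full AutoSmart
# pipeline (merge -> feature engineering -> LightGBM ensemble) end to end.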
--------------------------------------------------------------------------------