33 | """
34 | logger = logging.getLogger(__file__)
35 | logging_level = getattr(logging, verbosity_level)
36 | logger.setLevel(logging_level)
37 | formatter = logging.Formatter(
38 | fmt='%(asctime)s %(levelname)s %(filename)s: %(message)s')
39 | stdout_handler = logging.StreamHandler(sys.stdout)
40 | stdout_handler.setLevel(logging_level)
41 | stdout_handler.setFormatter(formatter)
42 | logger.addHandler(stdout_handler)
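    # With use_error_log, WARNING-and-above records are additionally
    # emitted on stderr (they still go to stdout via the handler above).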
    if use_error_log:
        stderr_handler = logging.StreamHandler(sys.stderr)
        stderr_handler.setLevel(logging.WARNING)
        stderr_handler.setFormatter(formatter)
        logger.addHandler(stderr_handler)
    logger.propagate = False
    return logger


LOGGER = get_logger(VERBOSITY_LEVEL)


def _here(*args):
    """Helper function for getting the current directory of the script."""
    here_dir = os.path.dirname(os.path.realpath(__file__))
    return os.path.abspath(join(here_dir, *args))


def _get_solution(solution_dir):
    """Get the solution array from the solution directory."""
    solution_file = join(solution_dir, SOLUTION_FILE)
    solution = pd.read_csv(solution_file, sep='\t')
    return solution


def _get_prediction(prediction_dir):
    """Read the 'label' column of the predictions file."""
    pred_file = join(prediction_dir, 'predictions')
    return pd.read_csv(pred_file)['label']


def _get_score(solution_dir, prediction_dir):
    """Compute the accuracy of the predictions against the solution."""
    LOGGER.info('===== get solution')
    solution = _get_solution(solution_dir)['label']
    LOGGER.info('===== read prediction')
    prediction = _get_prediction(prediction_dir)
    if solution.shape != prediction.shape:
        raise ValueError(f"Bad prediction shape: {prediction.shape}. "
                         f"Expected shape: {solution.shape}")

    LOGGER.info('===== calculate score')
    LOGGER.debug(f'solution shape = {solution.shape}')
    LOGGER.debug(f'prediction shape = {prediction.shape}')
    score = accuracy_score(solution, prediction)

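    # Build a per-label diagnostic table (label counts/ratios and error
    # counts/ratios); it is only logged at DEBUG level and does not
    # affect the returned score.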
    def get_df(counter, name):
        """Turn a counter into a one-column DataFrame indexed by sorted keys."""
        counter = dict(sorted(counter.items()))
        return pd.DataFrame({name: list(counter.values())},
                            index=list(counter.keys()))

    labels_count = Counter(solution)
    length = len(solution)
    labels = get_df(labels_count, "Label num")
    labels_ratio = get_df({k: labels_count[k] / length for k in labels_count},
                          "Label ratio")
    errors_count = Counter(solution[solution != prediction])
    errors = get_df(errors_count, "Error")
    errors_ratio = get_df({k: errors_count[k] / labels_count[k]
                           for k in errors_count}, "Error ratio")
    desc = labels.join(labels_ratio).join(errors).join(errors_ratio)
    LOGGER.debug(f"Desc:\n{desc}")

    return score


def _update_score(args, duration):
    """Compute the score and update scores.txt and the learning curve page."""
    score = _get_score(solution_dir=args.solution_dir,
                       prediction_dir=args.prediction_dir)
    # Update learning curve page (detailed_results.html)
    _write_scores_html(args.score_dir)
    # Write score
    LOGGER.info('===== write score')
    write_score(args.score_dir, score, duration)
    LOGGER.info(f"accuracy: {score:.4}")
    return score


def _init_scores_html(detailed_results_filepath):
    """Create detailed_results.html with a placeholder message."""
    # The page auto-refreshes every 5 seconds while ingestion runs.
    html_head = ('<html><head> <meta http-equiv="refresh" content="5"> '
                 '</head><body><pre>')
    html_end = '</pre></body></html>'
    with open(detailed_results_filepath, 'a') as html_file:
        html_file.write(html_head)
        html_file.write("Starting training process... <br> Please be patient. "
                        "Learning curves will be generated when first "
                        "predictions are made.")
        html_file.write(html_end)


def _write_scores_html(score_dir, auto_refresh=True, append=False):
    """(Re)write detailed_results.html in score_dir."""
    filename = 'detailed_results.html'
    if auto_refresh:
        html_head = ('<html><head> <meta http-equiv="refresh" content="5"> '
                     '</head><body><pre>')
    else:
        html_head = '<html><body><pre>'
    html_end = '</pre></body></html>'
    mode = 'a' if append else 'w'
    filepath = join(score_dir, filename)
    with open(filepath, mode) as html_file:
        html_file.write(html_head)
        html_file.write(html_end)
    LOGGER.debug(f"Wrote learning curve page to {filepath}")


def write_score(score_dir, score, duration):
    """Write score and duration to score_dir/scores.txt

    The file has two lines, e.g.:
        score: 0.9123
        Duration: 42.0
    """
    score_filename = join(score_dir, 'scores.txt')
    with open(score_filename, 'w') as ftmp:
        ftmp.write(f'score: {score}\n')
        ftmp.write(f'Duration: {duration}\n')
    LOGGER.debug(f"Wrote to score_filename={score_filename} with "
                 f"score={score}, duration={duration}")


class IngestionError(Exception):
    """Ingestion error."""


class ScoringError(Exception):
    """Scoring error."""


def get_ingestion_info(prediction_dir):
    """Read the ingestion information (e.g. ingestion_duration) from end.yaml."""
    ingestion_info = None
    endfile_path = os.path.join(prediction_dir, 'end.yaml')

    if not os.path.isfile(endfile_path):
        raise IngestionError("[-] No end.yaml exists, ingestion failed")

    LOGGER.info('===== Detected end.yaml file, get ingestion information')
    with open(endfile_path, 'r') as ftmp:
        ingestion_info = yaml.safe_load(ftmp)

    return ingestion_info


def get_ingestion_pid(prediction_dir):
    """Get the pid of the ingestion process."""
    # Wait up to 60 seconds for ingestion to start and write 'start.txt';
    # otherwise, raise an exception.
    wait_time = 60
    startfile = os.path.join(prediction_dir, 'start.txt')
    lockfile = os.path.join(prediction_dir, 'start.txt.lock')

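    # 'start.txt' is written by the ingestion program and contains its
    # pid; the lock file guards against reading the file mid-write.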
    for i in range(wait_time):
        if os.path.exists(startfile):
            with FileLock(lockfile):
                with open(startfile, 'r') as ftmp:
                    ingestion_pid = ftmp.read()
            LOGGER.info(
                f'Detected the start of ingestion after {i} seconds.')
            return int(ingestion_pid)
        else:
            time.sleep(1)
    raise IngestionError(f"[-] Failed: scoring didn't detect the start of "
                         f"ingestion after {wait_time} seconds.")


def is_process_alive(ingestion_pid):
    """Check whether the ingestion process is still alive."""
    try:
        os.kill(ingestion_pid, 0)
    except OSError:
        return False
    else:
        return True


def _parse_args():
    # Default I/O directories:
    root_dir = _here(os.pardir)
    default_solution_dir = join(root_dir, "sample_data")
    default_prediction_dir = join(root_dir, "sample_result_submission")
    default_score_dir = join(root_dir, "scoring_output")
    parser = argparse.ArgumentParser()
    parser.add_argument('--solution_dir', type=str,
                        default=default_solution_dir,
                        help=("Directory storing the solution with true "
                              "labels, e.g. adult.solution."))
    parser.add_argument('--prediction_dir', type=str,
                        default=default_prediction_dir,
                        help=("Directory storing the predictions. It should "
                              "contain e.g. [start.txt, adult.predict_0, "
                              "adult.predict_1, ..., end.yaml]."))
    parser.add_argument('--score_dir', type=str,
                        default=default_score_dir,
                        help=("Directory storing the scoring output, e.g. "
                              "`scores.txt` and `detailed_results.html`."))
    args = parser.parse_args()
    LOGGER.debug(f"Parsed args are: {args}")
    LOGGER.debug("-" * 50)
    LOGGER.debug(f"Using solution_dir: {args.solution_dir}")
    LOGGER.debug(f"Using prediction_dir: {args.prediction_dir}")
    LOGGER.debug(f"Using score_dir: {args.score_dir}")
    return args


def _init(args):
    if not os.path.isdir(args.score_dir):
        os.mkdir(args.score_dir)
    detailed_results_filepath = join(
        args.score_dir, 'detailed_results.html')
    # Initialize detailed_results.html
    _init_scores_html(detailed_results_filepath)


def _finalize(score, scoring_start):
    """Finalize the scoring and log the final score."""
    duration = time.time() - scoring_start
    LOGGER.info(
        "[+] Successfully finished scoring! "
        f"Scoring duration: {duration:.2} sec. "
        f"The score of your algorithm on the task is: {score:.6}.")

    LOGGER.info("[Scoring terminated]")


def main():
    """Main entry of the scoring program."""
    scoring_start = time.time()
    LOGGER.info('===== init scoring program')
    args = _parse_args()
    _init(args)
    score = DEFAULT_SCORE

    ingestion_pid = get_ingestion_pid(args.prediction_dir)

    LOGGER.info("===== wait for the exit of ingestion.")
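    # Poll the ingestion process once per second until it terminates.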
    while is_process_alive(ingestion_pid):
        time.sleep(1)

    # Compute/write score. The 'end.yaml' file written by the ingestion
    # program signals its end and carries the ingestion duration.
    ingestion_info = get_ingestion_info(args.prediction_dir)
    duration = ingestion_info['ingestion_duration']
    score = _update_score(args, duration)

    _finalize(score, scoring_start)


if __name__ == "__main__":
    main()