├── .github
    └── workflows
    │   ├── publish_pypi.yml
    │   ├── publish_test_pypi.yml
    │   └── tests.yml
├── .gitignore
├── .readthedocs.yml
├── COPYING
├── README.rst
├── docs
    ├── Makefile
    ├── make.bat
    └── source
    │   ├── api_reference.rst
    │   ├── conf.py
    │   ├── datasets.rst
    │   ├── development.rst
    │   ├── ensemble_integration.rst
    │   ├── getting_started.rst
    │   ├── index.rst
    │   ├── permutation_interpreter.rst
    │   └── tutorial.ipynb
├── eipy
    ├── __init__.py
    ├── additional_ensembles.py
    ├── datasets.py
    ├── ei.py
    ├── interpretation.py
    ├── metrics.py
    └── utils.py
├── pyproject.toml
├── setup.cfg
├── setup.py
├── tests
    ├── __init__.py
    ├── test_ei.py
    └── test_load_data.py
└── tox.ini


/.github/workflows/publish_pypi.yml:
--------------------------------------------------------------------------------
 1 | # Build the package with Poetry and upload it to PyPI whenever a GitHub
 2 | # release is published.
 3 | name: Publish to PyPI
 4 | 
 5 | on:
 6 |   release:
 7 |     types: [published]
 8 | 
 9 | jobs:
10 |   publish:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - name: Check out the repository
14 |         uses: actions/checkout@v4
15 |         with:
16 |           # Fetch the full history instead of a shallow clone; presumably
17 |           # required by poetry-dynamic-versioning (installed below) to derive
18 |           # the package version from git metadata -- confirm in pyproject.toml.
19 |           fetch-depth: 0
20 | 
21 |       - name: Set up Python
22 |         uses: actions/setup-python@v5
23 |         with:
24 |           python-version: '3.11'
25 | 
26 |       - name: Install poetry
27 |         run: |
28 |           pip install pipx
29 |           pipx install poetry
30 |           pipx inject poetry "poetry-dynamic-versioning[plugin]"
31 | 
32 |       - name: Install dependencies
33 |         run: |
34 |           poetry install
35 | 
36 |       - name: Publish package to PyPI
37 |         env:
38 |           # Hand the secret to the shell through an environment variable so it
39 |           # is never spliced into the script text and can be safely quoted.
40 |           PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
41 |         run: |
42 |           poetry config pypi-token.pypi "$PYPI_TOKEN"
43 |           poetry publish --build
44 | 


--------------------------------------------------------------------------------
/.github/workflows/publish_test_pypi.yml:
--------------------------------------------------------------------------------
 1 | # Manually triggered workflow that builds the package with Poetry and uploads
 2 | # it to TestPyPI.
 3 | name: Publish to Test PyPI
 4 | on:
 5 |   workflow_dispatch:
 6 | 
 7 | jobs:
 8 |   publish:
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |       - name: Check out the repository
12 |         uses: actions/checkout@v4
13 |         with:
14 |           # Fetch the full history instead of a shallow clone; presumably
15 |           # required by poetry-dynamic-versioning (installed below) to derive
16 |           # the package version from git metadata -- confirm in pyproject.toml.
17 |           fetch-depth: 0
18 | 
19 |       - name: Set up Python
20 |         uses: actions/setup-python@v5
21 |         with:
22 |           python-version: '3.11'
23 | 
24 |       - name: Install poetry
25 |         run: |
26 |           pip install pipx
27 |           pipx install poetry
28 |           pipx inject poetry "poetry-dynamic-versioning[plugin]"
29 | 
30 |       - name: Install dependencies
31 |         run: |
32 |           poetry install
33 | 
34 |       - name: Publish package to TestPyPI
35 |         env:
36 |           # Hand the secret to the shell through an environment variable so it
37 |           # is never spliced into the script text and can be safely quoted.
38 |           TEST_PYPI_TOKEN: ${{ secrets.TEST_PYPI_TOKEN }}
39 |         run: |
40 |           poetry config repositories.test-pypi https://test.pypi.org/legacy/
41 |           poetry config pypi-token.test-pypi "$TEST_PYPI_TOKEN"
42 |           poetry publish -r test-pypi --build
43 | 


--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
 1 | # Run the tox test suite across the OS / Python matrix below and upload the
 2 | # coverage report to Codecov.
 3 | name: Tests
 4 | 
 5 | on:
 6 |   push:
 7 |     branches:
 8 |       - main
 9 |     # Pushes that only touch the files below do not trigger a test run.
10 |     paths-ignore:
11 |       - '**/README.rst'
12 |       - 'docs/**'
13 |       - '.github/workflows/**'
14 |       - '**/.readthedocs.yml'
15 |       - '**/COPYING'
16 |       - '**/.gitignore'
17 |   pull_request:
18 |     branches:
19 |       - main
20 |     # Keep this list identical to the one under `push` above.
21 |     paths-ignore:
22 |       - '**/README.rst'
23 |       - 'docs/**'
24 |       - '.github/workflows/**'
25 |       - '**/.readthedocs.yml'
26 |       - '**/COPYING'
27 |       - '**/.gitignore'
28 | 
29 | jobs:
30 |   tests:
31 |     runs-on: ${{ matrix.os }}
32 |     strategy:
33 |       matrix:
34 |         os: [ubuntu-latest, windows-latest, macos-latest]
35 |         # Versions stay quoted so that 3.10 is not parsed as the float 3.1.
36 |         python-version: ['3.8', '3.9', '3.10', '3.11']
37 | 
38 |     steps:
39 |       - name: Check out the repository
40 |         uses: actions/checkout@v4
41 |       - name: Set up Python ${{ matrix.python-version }}
42 |         uses: actions/setup-python@v5
43 |         with:
44 |           python-version: ${{ matrix.python-version }}
45 | 
46 |       - name: Install poetry
47 |         run: |
48 |           pip install poetry
49 | 
50 |       - name: Install dependencies
51 |         run: |
52 |           poetry install
53 |           pip install tox tox-gh-actions
54 | 
55 |       - name: Test with tox
56 |         run: poetry run tox
57 | 
58 |       - name: Upload coverage reports to Codecov
59 |         uses: codecov/codecov-action@v4.0.1
60 |         with:
61 |           token: ${{ secrets.CODECOV_TOKEN }}
62 |           slug: GauravPandeyLab/eipy
63 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python caches and local environments
2 | .venv
3 | .tox
4 | __pycache__
5 | # Generated docs, coverage data and the Poetry lock file
6 | docs/build
7 | coverage.xml
8 | .coverage
9 | poetry.lock
10 | 


--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
 1 | # Read the Docs configuration file for Sphinx projects
 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 3 | # If the build fails, check https://docs.readthedocs.io/en/latest/build-customization.html#install-dependencies-with-poetry for updates.
 4 | 
 5 | # Required
 6 | version: 2
 7 | 
 8 | # Set the OS, Python version and other tools you might need
 9 | build:
10 |   os: ubuntu-22.04
11 |   tools:
12 |     python: "3.10"
13 |   jobs:
14 |     post_create_environment:
15 |       # Install poetry
16 |       # https://python-poetry.org/docs/#installing-manually
17 |       - pip install poetry
18 |     post_install:
19 |       # Install the project together with its 'docs' dependency group
20 |       # https://python-poetry.org/docs/managing-dependencies/#dependency-groups
21 |       # VIRTUAL_ENV needs to be set manually for now.
22 |       # See https://github.com/readthedocs/readthedocs.org/pull/11152/
23 |       - VIRTUAL_ENV=$READTHEDOCS_VIRTUALENV_PATH poetry install --with docs
24 | 
25 | # Build documentation in the "docs/" directory with Sphinx
26 | sphinx:
27 |   configuration: docs/source/conf.py
28 |   # You can configure Sphinx to use a different builder, for instance the dirhtml builder for simpler URLs
29 |   # builder: "dirhtml"
30 |   # Fail on all warnings to avoid broken references
31 |   # fail_on_warning: true
32 | 
33 | # Optionally build your docs in additional formats such as PDF and ePub
34 | # formats:
35 | #    - pdf
36 | #    - epub
37 | 
38 | # Optional but recommended: declare the Python requirements needed
39 | # to build your documentation
40 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
41 | # python:
42 | #    install:
43 | #    - requirements: docs/requirements.txt
44 | 


--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 3, 29 June 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 |                             Preamble
  9 | 
 10 |   The GNU General Public License is a free, copyleft license for
 11 | software and other kinds of works.
 12 | 
 13 |   The licenses for most software and other practical works are designed
 14 | to take away your freedom to share and change the works.  By contrast,
 15 | the GNU General Public License is intended to guarantee your freedom to
 16 | share and change all versions of a program--to make sure it remains free
 17 | software for all its users.  We, the Free Software Foundation, use the
 18 | GNU General Public License for most of our software; it applies also to
 19 | any other work released this way by its authors.  You can apply it to
 20 | your programs, too.
 21 | 
 22 |   When we speak of free software, we are referring to freedom, not
 23 | price.  Our General Public Licenses are designed to make sure that you
 24 | have the freedom to distribute copies of free software (and charge for
 25 | them if you wish), that you receive source code or can get it if you
 26 | want it, that you can change the software or use pieces of it in new
 27 | free programs, and that you know you can do these things.
 28 | 
 29 |   To protect your rights, we need to prevent others from denying you
 30 | these rights or asking you to surrender the rights.  Therefore, you have
 31 | certain responsibilities if you distribute copies of the software, or if
 32 | you modify it: responsibilities to respect the freedom of others.
 33 | 
 34 |   For example, if you distribute copies of such a program, whether
 35 | gratis or for a fee, you must pass on to the recipients the same
 36 | freedoms that you received.  You must make sure that they, too, receive
 37 | or can get the source code.  And you must show them these terms so they
 38 | know their rights.
 39 | 
 40 |   Developers that use the GNU GPL protect your rights with two steps:
 41 | (1) assert copyright on the software, and (2) offer you this License
 42 | giving you legal permission to copy, distribute and/or modify it.
 43 | 
 44 |   For the developers' and authors' protection, the GPL clearly explains
 45 | that there is no warranty for this free software.  For both users' and
 46 | authors' sake, the GPL requires that modified versions be marked as
 47 | changed, so that their problems will not be attributed erroneously to
 48 | authors of previous versions.
 49 | 
 50 |   Some devices are designed to deny users access to install or run
 51 | modified versions of the software inside them, although the manufacturer
 52 | can do so.  This is fundamentally incompatible with the aim of
 53 | protecting users' freedom to change the software.  The systematic
 54 | pattern of such abuse occurs in the area of products for individuals to
 55 | use, which is precisely where it is most unacceptable.  Therefore, we
 56 | have designed this version of the GPL to prohibit the practice for those
 57 | products.  If such problems arise substantially in other domains, we
 58 | stand ready to extend this provision to those domains in future versions
 59 | of the GPL, as needed to protect the freedom of users.
 60 | 
 61 |   Finally, every program is threatened constantly by software patents.
 62 | States should not allow patents to restrict development and use of
 63 | software on general-purpose computers, but in those that do, we wish to
 64 | avoid the special danger that patents applied to a free program could
 65 | make it effectively proprietary.  To prevent this, the GPL assures that
 66 | patents cannot be used to render the program non-free.
 67 | 
 68 |   The precise terms and conditions for copying, distribution and
 69 | modification follow.
 70 | 
 71 |                        TERMS AND CONDITIONS
 72 | 
 73 |   0. Definitions.
 74 | 
 75 |   "This License" refers to version 3 of the GNU General Public License.
 76 | 
 77 |   "Copyright" also means copyright-like laws that apply to other kinds of
 78 | works, such as semiconductor masks.
 79 | 
 80 |   "The Program" refers to any copyrightable work licensed under this
 81 | License.  Each licensee is addressed as "you".  "Licensees" and
 82 | "recipients" may be individuals or organizations.
 83 | 
 84 |   To "modify" a work means to copy from or adapt all or part of the work
 85 | in a fashion requiring copyright permission, other than the making of an
 86 | exact copy.  The resulting work is called a "modified version" of the
 87 | earlier work or a work "based on" the earlier work.
 88 | 
 89 |   A "covered work" means either the unmodified Program or a work based
 90 | on the Program.
 91 | 
 92 |   To "propagate" a work means to do anything with it that, without
 93 | permission, would make you directly or secondarily liable for
 94 | infringement under applicable copyright law, except executing it on a
 95 | computer or modifying a private copy.  Propagation includes copying,
 96 | distribution (with or without modification), making available to the
 97 | public, and in some countries other activities as well.
 98 | 
 99 |   To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies.  Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 | 
103 |   An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License.  If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 | 
112 |   1. Source Code.
113 | 
114 |   The "source code" for a work means the preferred form of the work
115 | for making modifications to it.  "Object code" means any non-source
116 | form of a work.
117 | 
118 |   A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 | 
123 |   The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form.  A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 | 
134 |   The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities.  However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work.  For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 | 
147 |   The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 | 
151 |   The Corresponding Source for a work in source code form is that
152 | same work.
153 | 
154 |   2. Basic Permissions.
155 | 
156 |   All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met.  This License explicitly affirms your unlimited
159 | permission to run the unmodified Program.  The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work.  This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 | 
164 |   You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force.  You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright.  Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 | 
175 |   Conveying under any other circumstances is permitted solely under
176 | the conditions stated below.  Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 | 
179 |   3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 | 
181 |   No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 | 
187 |   When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 | 
195 |   4. Conveying Verbatim Copies.
196 | 
197 |   You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 | 
205 |   You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 | 
208 |   5. Conveying Modified Source Versions.
209 | 
210 |   You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 | 
214 |     a) The work must carry prominent notices stating that you modified
215 |     it, and giving a relevant date.
216 | 
217 |     b) The work must carry prominent notices stating that it is
218 |     released under this License and any conditions added under section
219 |     7.  This requirement modifies the requirement in section 4 to
220 |     "keep intact all notices".
221 | 
222 |     c) You must license the entire work, as a whole, under this
223 |     License to anyone who comes into possession of a copy.  This
224 |     License will therefore apply, along with any applicable section 7
225 |     additional terms, to the whole of the work, and all its parts,
226 |     regardless of how they are packaged.  This License gives no
227 |     permission to license the work in any other way, but it does not
228 |     invalidate such permission if you have separately received it.
229 | 
230 |     d) If the work has interactive user interfaces, each must display
231 |     Appropriate Legal Notices; however, if the Program has interactive
232 |     interfaces that do not display Appropriate Legal Notices, your
233 |     work need not make them do so.
234 | 
235 |   A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit.  Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 | 
245 |   6. Conveying Non-Source Forms.
246 | 
247 |   You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 | 
252 |     a) Convey the object code in, or embodied in, a physical product
253 |     (including a physical distribution medium), accompanied by the
254 |     Corresponding Source fixed on a durable physical medium
255 |     customarily used for software interchange.
256 | 
257 |     b) Convey the object code in, or embodied in, a physical product
258 |     (including a physical distribution medium), accompanied by a
259 |     written offer, valid for at least three years and valid for as
260 |     long as you offer spare parts or customer support for that product
261 |     model, to give anyone who possesses the object code either (1) a
262 |     copy of the Corresponding Source for all the software in the
263 |     product that is covered by this License, on a durable physical
264 |     medium customarily used for software interchange, for a price no
265 |     more than your reasonable cost of physically performing this
266 |     conveying of source, or (2) access to copy the
267 |     Corresponding Source from a network server at no charge.
268 | 
269 |     c) Convey individual copies of the object code with a copy of the
270 |     written offer to provide the Corresponding Source.  This
271 |     alternative is allowed only occasionally and noncommercially, and
272 |     only if you received the object code with such an offer, in accord
273 |     with subsection 6b.
274 | 
275 |     d) Convey the object code by offering access from a designated
276 |     place (gratis or for a charge), and offer equivalent access to the
277 |     Corresponding Source in the same way through the same place at no
278 |     further charge.  You need not require recipients to copy the
279 |     Corresponding Source along with the object code.  If the place to
280 |     copy the object code is a network server, the Corresponding Source
281 |     may be on a different server (operated by you or a third party)
282 |     that supports equivalent copying facilities, provided you maintain
283 |     clear directions next to the object code saying where to find the
284 |     Corresponding Source.  Regardless of what server hosts the
285 |     Corresponding Source, you remain obligated to ensure that it is
286 |     available for as long as needed to satisfy these requirements.
287 | 
288 |     e) Convey the object code using peer-to-peer transmission, provided
289 |     you inform other peers where the object code and Corresponding
290 |     Source of the work are being offered to the general public at no
291 |     charge under subsection 6d.
292 | 
293 |   A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 | 
297 |   A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling.  In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage.  For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product.  A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 | 
310 |   "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source.  The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 | 
318 |   If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information.  But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 | 
329 |   The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed.  Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 | 
337 |   Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 | 
343 |   7. Additional Terms.
344 | 
345 |   "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law.  If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 | 
354 |   When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it.  (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.)  You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 | 
361 |   Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 | 
365 |     a) Disclaiming warranty or limiting liability differently from the
366 |     terms of sections 15 and 16 of this License; or
367 | 
368 |     b) Requiring preservation of specified reasonable legal notices or
369 |     author attributions in that material or in the Appropriate Legal
370 |     Notices displayed by works containing it; or
371 | 
372 |     c) Prohibiting misrepresentation of the origin of that material, or
373 |     requiring that modified versions of such material be marked in
374 |     reasonable ways as different from the original version; or
375 | 
376 |     d) Limiting the use for publicity purposes of names of licensors or
377 |     authors of the material; or
378 | 
379 |     e) Declining to grant rights under trademark law for use of some
380 |     trade names, trademarks, or service marks; or
381 | 
382 |     f) Requiring indemnification of licensors and authors of that
383 |     material by anyone who conveys the material (or modified versions of
384 |     it) with contractual assumptions of liability to the recipient, for
385 |     any liability that these contractual assumptions directly impose on
386 |     those licensors and authors.
387 | 
388 |   All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10.  If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term.  If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 | 
398 |   If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 | 
403 |   Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 | 
407 |   8. Termination.
408 | 
409 |   You may not propagate or modify a covered work except as expressly
410 | provided under this License.  Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 | 
415 |   However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 | 
422 |   Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 | 
429 |   Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License.  If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 | 
435 |   9. Acceptance Not Required for Having Copies.
436 | 
437 |   You are not required to accept this License in order to receive or
438 | run a copy of the Program.  Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance.  However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work.  These actions infringe copyright if you do
443 | not accept this License.  Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 | 
446 |   10. Automatic Licensing of Downstream Recipients.
447 | 
448 |   Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License.  You are not responsible
451 | for enforcing compliance by third parties with this License.
452 | 
453 |   An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations.  If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 | 
463 |   You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License.  For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 | 
471 |   11. Patents.
472 | 
473 |   A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based.  The
475 | work thus licensed is called the contributor's "contributor version".
476 | 
477 |   A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version.  For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 | 
487 |   Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 | 
492 |   In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement).  To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 | 
499 |   If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients.  "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 | 
513 |   If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 | 
521 |   A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License.  You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 | 
536 |   Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 | 
540 |   12. No Surrender of Others' Freedom.
541 | 
542 |   If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License.  If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all.  For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 | 
552 |   13. Use with the GNU Affero General Public License.
553 | 
554 |   Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work.  The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 | 
563 |   14. Revised Versions of this License.
564 | 
565 |   The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time.  Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 | 
570 |   Each version is given a distinguishing version number.  If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation.  If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 | 
579 |   If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 | 
584 |   Later license versions may give you additional or different
585 | permissions.  However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 | 
589 |   15. Disclaimer of Warranty.
590 | 
591 |   THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 | 
600 |   16. Limitation of Liability.
601 | 
602 |   IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 | 
612 |   17. Interpretation of Sections 15 and 16.
613 | 
614 |   If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 | 
621 |                      END OF TERMS AND CONDITIONS
622 | 
623 |             How to Apply These Terms to Your New Programs
624 | 
625 |   If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 | 
629 |   To do so, attach the following notices to the program.  It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 | 
634 |     <one line to give the program's name and a brief idea of what it does.>
635 |     Copyright (C) <year>  <name of author>
636 | 
637 |     This program is free software: you can redistribute it and/or modify
638 |     it under the terms of the GNU General Public License as published by
639 |     the Free Software Foundation, either version 3 of the License, or
640 |     (at your option) any later version.
641 | 
642 |     This program is distributed in the hope that it will be useful,
643 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
644 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
645 |     GNU General Public License for more details.
646 | 
647 |     You should have received a copy of the GNU General Public License
648 |     along with this program.  If not, see <https://www.gnu.org/licenses/>.
649 | 
650 | Also add information on how to contact you by electronic and paper mail.
651 | 
652 |   If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 |     <program>  Copyright (C) <year>  <name of author>
656 |     This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 |     This is free software, and you are welcome to redistribute it
658 |     under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License.  Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 | 
664 |   You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 | 
669 |   The GNU General Public License does not permit incorporating your program
670 | into proprietary programs.  If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library.  If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License.  But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | |Tests| |Coverage| |ReadTheDocs| |PythonVersion| |PyPI| |Black| |License|
 2 | 
 3 | .. |Tests| image:: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml/badge.svg
 4 |   :target:  https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml
 5 | 
 6 | .. |Coverage| image:: https://codecov.io/gh/GauravPandeyLab/eipy/graph/badge.svg?token=M2AU2XWJB8 
 7 |   :target: https://codecov.io/gh/GauravPandeyLab/eipy
 8 | 
 9 | .. |ReadTheDocs| image:: https://readthedocs.org/projects/eipy/badge/?version=latest
10 |   :target: https://eipy.readthedocs.io/en/latest/
11 | 
12 | .. |PyPI| image:: https://img.shields.io/pypi/v/ensemble-integration
13 |   :target: https://pypi.org/project/ensemble-integration/
14 | 
15 | .. |PythonVersion| image:: https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue
16 | 
17 | .. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg
18 |   :target: https://github.com/psf/black
19 | 
20 | .. |License| image:: https://img.shields.io/badge/License-GPLv3-blue
21 |   :target: https://github.com/GauravPandeyLab/eipy/blob/main/COPYING
22 | 
23 | 
24 | ``ensemble-integration``: Integrating multi-modal data for predictive modeling
25 | ==============================================================================
26 | 
27 | ``ensemble-integration`` (or ``eipy``) leverages multi-modal data to build classifiers using a late fusion approach. 
28 | In eipy, base predictors are trained on each modality before being ensembled at the late stage. 
29 | 
30 | This implementation of eipy can utilize `sklearn-like <https://scikit-learn.org/>`_ models only; therefore, for unstructured data,
31 | e.g. images, it is recommended to perform feature selection prior to using eipy. We hope to allow for a wider range of base predictors, 
32 | i.e. deep learning methods, in future releases. A key feature of ``eipy`` is its built-in nested cross-validation approach, allowing for a 
33 | fair comparison of a collection of user-defined ensemble methods.
34 | 
35 | Documentation, including tutorials, is available at `https://eipy.readthedocs.io/en/latest/ <https://eipy.readthedocs.io/en/latest/>`_.
36 | 
37 | Installation
38 | ------------
39 | 
40 | As usual it is recommended to set up a virtual environment prior to installation. 
41 | You can install ensemble-integration with pip:
42 | 
43 | ``pip install ensemble-integration``
44 | 
45 | Citation
46 | --------
47 | 
48 | If you use ``ensemble-integration`` in a scientific publication please cite the following:
49 | 
50 | Jamie J. R. Bennett, Yan Chak Li and Gaurav Pandey. *An Open-Source Python Package for Multi-modal Data Integration using Heterogeneous Ensembles*, https://doi.org/10.48550/arXiv.2401.09582.
51 | 
52 | Yan Chak Li, Linhua Wang, Jeffrey N Law, T M Murali, Gaurav Pandey. *Integrating multimodal data through interpretable heterogeneous ensembles*, Bioinformatics Advances, Volume 2, Issue 1, 2022, vbac065, https://doi.org/10.1093/bioadv/vbac065.
53 | 
54 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | # Every target given to make is forwarded to "sphinx-build -M <target>".
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | # "Makefile" is declared phony so the catch-all rule below is not used to remake it.
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | REM Work from the directory that contains this script; popd at the end restores the caller's directory.
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | REM SPHINXBUILD may be preset in the environment; it defaults to sphinx-build.
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | REM Probe for sphinx-build. An errorlevel of 9009 or higher is reported as command not found.
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | 	echo.
16 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | 	echo.installed, then set the SPHINXBUILD environment variable to point
18 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | 	echo.may add the Sphinx directory to PATH.
20 | 	echo.
21 | 	echo.If you don't have Sphinx installed, grab it from
22 | 	echo.https://www.sphinx-doc.org/
23 | 	exit /b 1
24 | )
25 | REM With no argument show the Sphinx help; otherwise the first argument is passed to sphinx-build -M.
26 | if "%1" == "" goto help
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/source/api_reference.rst:
--------------------------------------------------------------------------------
1 | API Reference
2 | =============
3 | 
4 | .. toctree::
5 |     :maxdepth: 2
6 | 
7 |     ensemble_integration
8 |     permutation_interpreter
9 |     datasets


--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # This file only contains a selection of the most common options. For a full
 4 | # list see the documentation:
 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 6 | 
 7 | # -- Path setup --------------------------------------------------------------
 8 | 
 9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 | sys.path.insert(0, os.path.abspath('../eipy'))
16 | 
17 | # -- Project information -----------------------------------------------------
18 | 
19 | project = 'Ensemble Integration'
20 | copyright = '2023, Jamie J. R. Bennett, Yan Chak Li, Aviad Susman, Gaurav Pandey'
21 | author = 'Jamie J. R. Bennett, Yan Chak Li, Aviad Susman, Gaurav Pandey'
22 | 
23 | # -- General configuration ---------------------------------------------------
24 | 
25 | # Add any Sphinx extension module names here, as strings. They can be
26 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
27 | # ones.
28 | extensions = ['sphinx.ext.autodoc', 
29 |               'sphinx.ext.autosummary',
30 |               'sphinx.ext.coverage', 
31 |               'numpydoc',
32 |               'sphinx.ext.napoleon',
33 |               'sphinx_autodoc_typehints',
34 |               'sphinx_rtd_theme',
35 |               'nbsphinx',
36 |               'sphinx_copybutton',
37 | ]
38 | 
39 | # Set order in autodoc
40 | autodoc_member_order = 'bysource'
41 | 
42 | # generate autosummary even if no references
43 | autosummary_generate = True
44 | 
45 | # Whether to create a Sphinx table of contents for the lists of class 
46 | # methods and attributes. If a table of contents is made, Sphinx expects 
47 | # each entry to have a separate page. True by default.
48 | numpydoc_class_members_toctree = False
49 | 
50 | # html sourcecode link
51 | html_show_sourcelink = False
52 |  
53 | # Add any paths that contain templates here, relative to this directory.
54 | templates_path = ['_templates']
55 | 
56 | # List of patterns, relative to source directory, that match files and
57 | # directories to ignore when looking for source files.
58 | # This pattern also affects html_static_path and html_extra_path.
59 | exclude_patterns = []
60 | 
61 | # The name of the Pygments (syntax highlighting) style to use.
62 | pygments_style = "sphinx"
63 | 
64 | # If true, '()' will be appended to :func: etc. cross-reference text.
65 | add_function_parentheses = False
66 | 
67 | # -- Options for HTML output -------------------------------------------------
68 | 
69 | # The theme to use for HTML and HTML Help pages.  See the documentation for
70 | # a list of builtin themes.
71 | 
72 | html_theme = 'sphinx_rtd_theme'
73 | 
74 | # Add any paths that contain custom static files (such as style sheets) here,
75 | # relative to this directory. They are copied after the builtin static files,
76 | # so a file named "default.css" will overwrite the builtin "default.css".
77 | # html_static_path = ['_static']


--------------------------------------------------------------------------------
/docs/source/datasets.rst:
--------------------------------------------------------------------------------
1 | Datasets
2 | --------
3 | 
4 | If you use the datasets below in a scientific study, please cite the relevant publication given in the function's docstring.
5 | 
6 | .. autofunction:: eipy.datasets.load_diabetes


--------------------------------------------------------------------------------
/docs/source/development.rst:
--------------------------------------------------------------------------------
 1 | Development
 2 | ===========
 3 | 
 4 | We welcome contributions to the development of ``ensemble-integration``. To contribute, follow the instructions below to submit a pull request:
 5 | 
 6 | 1. **Install Python**. First of all make sure you have a supported version of Python on your local machine (see `GitHub <https://github.com/GauravPandeyLab/eipy>`__ for supported versions).
 7 | 2. **Install Poetry**. ``eipy`` uses Poetry to manage dependencies. To install Poetry follow the instructions on their `website <https://python-poetry.org/docs/>`__.
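 8 | 3. **Fork the repo**. Create a fork on `GitHub <https://github.com/GauravPandeyLab/eipy>`__, then clone your fork locally:
 9 | 
10 | .. code-block:: console
11 | 
12 |    git clone https://github.com/<your-username>/eipy.git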
13 | 
14 | 4. **Set up a virtual environment**. Navigate to the ``eipy`` directory and create and activate a virtual environment.
15 | 
16 | .. code-block:: console
17 | 
18 |    python -m venv .venv
19 |    source .venv/bin/activate
20 | 
21 | 5. **Install dependencies**. If editing the documentation make sure to include the ``--with docs`` argument.
22 | 
23 | .. code-block:: console
24 | 
25 |    poetry install --with docs
26 | 
27 | 6. **Make contributions**.
28 | 
29 | 7. **Linting and formatting**. We use Flake8 for linting and Black for formatting. For linting, type, for example,
30 | 
31 | .. code-block:: console
32 | 
33 |    flake8 eipy/ei.py
34 | 
35 | For formatting type, for example,
36 | 
37 | .. code-block:: console
38 | 
39 |    black eipy/ei.py
40 | 
41 | 8. **Run tests**. All tests can be found in the tests folder and can be run by typing
42 | 
43 | .. code-block:: console
44 | 
45 |    pytest
46 | 
47 | Note that new test file names must have the prefix ``test_``.
48 | 
49 | 9. **Submit pull request**. Updates must be made via a pull request. Internal users should note that pushing 
50 | to the main branch has been disabled.
51 | 
52 | 10. **Publishing new versions to PyPI** (internal only). We now use `poetry-dynamic-versioning <https://github.com/mtkennerly/poetry-dynamic-versioning>`__ 
53 | to update version numbers in pyproject.toml automatically. You can publish to
54 | PyPI by creating a new `release <https://github.com/GauravPandeyLab/eipy/releases>`__,
55 | which will run the "Publish to PyPI" workflow. This workflow determines the PyPI version number from the
56 | GitHub release tag, which you should increment manually.
57 | Note: to test things out first, you can try manually running the "Publish to test PyPI" workflow.
58 | 


--------------------------------------------------------------------------------
/docs/source/ensemble_integration.rst:
--------------------------------------------------------------------------------
1 | EnsembleIntegration
2 | -------------------
3 | 
4 | .. autoclass:: eipy.ei.EnsembleIntegration
5 |     :members: fit_base, fit_ensemble, predict, save, load


--------------------------------------------------------------------------------
/docs/source/getting_started.rst:
--------------------------------------------------------------------------------
 1 | Getting started
 2 | ===============
 3 | 
 4 | Ensemble Integration focuses mainly on
 5 | `stacked generalization <https://www.sciencedirect.com/science/article/abs/pii/S0893608005800231>`_,
 6 | as a method for late data fusion, but other ensemble methods including 
 7 | `ensemble selection <https://dl.acm.org/doi/10.1145/1015330.1015432>`_ are available for
 8 | comparison. 
 9 | 
10 | Base predictor training is performed in a nested cross-validation setup to allow for an unbiased comparison
11 | of ensemble methods, so that the user can select the method with the best performance.
12 | A final model can then be trained on all available data.
13 | 
14 | Source code
15 | -----------
16 | 
17 | The source code for eipy is available on `GitHub <https://github.com/GauravPandeyLab/eipy>`_.
18 | 
19 | Installation
20 | ------------
21 | 
22 | As usual it is recommended to set up a virtual environment prior to installation. 
23 | You can install ensemble-integration with pip:
24 | 
25 | .. code-block:: console
26 | 
27 |    pip install ensemble-integration
28 | 
29 | Citation
30 | --------
31 | 
32 | If you use eipy in a scientific publication please cite the following:
33 | 
34 | Jamie J. R. Bennett, Yan Chak Li and Gaurav Pandey. An Open-Source Python Package for Multi-modal Data Integration using Heterogeneous Ensembles, https://doi.org/10.48550/arXiv.2401.09582.
35 | 
36 | Yan Chak Li, Linhua Wang, Jeffrey N Law, T M Murali, Gaurav Pandey. Integrating multimodal data through interpretable heterogeneous ensembles, Bioinformatics Advances, Volume 2, Issue 1, 2022, vbac065, https://doi.org/10.1093/bioadv/vbac065.
37 | 
38 | 


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | ``ensemble-integration``: Integrating multi-modal data for predictive modeling
 2 | ==============================================================================
 3 | 
 4 | ``ensemble-integration`` (or ``eipy``) leverages multi-modal data to build classifiers using a late fusion approach. 
 5 | In eipy, base predictors are trained on each modality before being ensembled at the late stage. 
 6 | 
 7 | This implementation of eipy can utilize `sklearn-like <https://scikit-learn.org/>`_ models only; therefore, for unstructured data,
 8 | e.g. images, it is recommended to perform feature selection prior to using eipy. We hope to allow for a wider range of base predictors, 
 9 | i.e. deep learning methods, in future releases. A key feature of ``eipy`` is its built-in nested cross-validation approach, allowing for a 
10 | fair comparison of a collection of user-defined ensemble methods.
11 | 
12 | For more details see the `original publication <https://doi.org/10.1093/bioadv/vbac065>`_.
13 | 
14 | .. toctree::
15 |    :maxdepth: 2
16 |    :caption: Contents:
17 |    
18 |    getting_started
19 |    tutorial.ipynb
20 |    api_reference
21 |    development
22 | 
23 | .. Indices and tables
24 | .. ==================
25 | 
26 | .. * :ref:`genindex`
27 | .. * :ref:`modindex`
28 | .. * :ref:`search`
29 | 


--------------------------------------------------------------------------------
/docs/source/permutation_interpreter.rst:
--------------------------------------------------------------------------------
1 | PermutationInterpreter
2 | ----------------------
3 | 
4 | .. autoclass:: eipy.interpretation.PermutationInterpreter
5 |     :members: rank_product_score, local_feature_rank, local_model_rank


--------------------------------------------------------------------------------
/docs/source/tutorial.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Tutorial"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
  14 |     "In this tutorial we fairly compare a number of ensemble methods using EI's built-in nested cross-validation implementation, and show how predictions can be made with the selected final model. We then show how we can interpret the model by calculating feature rankings.\n",
 15 |     "\n",
 16 |     "### Performance analysis and selection of ensemble methods\n",
 17 |     "\n",
 18 |     "First of all let's import some `sklearn` models, `EnsembleIntegration` and some additional ensemble methods:"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 44,
 24 |    "metadata": {},
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier\n",
 28 |     "from sklearn.tree import DecisionTreeClassifier\n",
 29 |     "from sklearn.neighbors import KNeighborsClassifier\n",
 30 |     "from sklearn.linear_model import LogisticRegression\n",
 31 |     "from sklearn.naive_bayes import GaussianNB\n",
 32 |     "from sklearn.svm import SVC\n",
 33 |     "from sklearn.neural_network import MLPClassifier\n",
 34 |     "from xgboost import XGBClassifier\n",
 35 |     "import pandas as pd\n",
 36 |     "from eipy.ei import EnsembleIntegration\n",
 37 |     "from eipy.additional_ensembles import MeanAggregation, CES"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "markdown",
 42 |    "metadata": {},
 43 |    "source": [
 44 |     "Next, load the multi-modal youth diabetes dataset."
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": 45,
 50 |    "metadata": {},
 51 |    "outputs": [
 52 |     {
 53 |      "name": "stdout",
 54 |      "output_type": "stream",
 55 |      "text": [
 56 |       "Modalities available are ['Sociodemographic', 'Health status', 'Diet', 'Other lifestyle behaviors']\n"
 57 |      ]
 58 |     }
 59 |    ],
 60 |    "source": [
 61 |     "from eipy.datasets import load_diabetes\n",
 62 |     "\n",
 63 |     "data = load_diabetes()\n",
 64 |     "X_train, X_test, y_train, y_test = data['X_train'], data['X_test'], data['y_train'], data['y_test']\n",
 65 |     "print('Modalities available are', list(X_train.keys()))"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "markdown",
 70 |    "metadata": {},
 71 |    "source": [
 72 |     "Define metrics of interest. `fmax_score` is a custom metric that outputs both a score and a corresponding threshold."
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": 46,
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "from eipy.metrics import fmax_score\n",
 82 |     "from sklearn.metrics import roc_auc_score, matthews_corrcoef\n",
 83 |     "\n",
 84 |     "metrics = {\n",
 85 |     "            'f_max': fmax_score,\n",
 86 |     "            'auc': roc_auc_score,\n",
 87 |     "            'mcc': matthews_corrcoef\n",
 88 |     "            }"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "markdown",
 93 |    "metadata": {},
 94 |    "source": [
 95 |     "Define base predictors:"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": 47,
101 |    "metadata": {},
102 |    "outputs": [],
103 |    "source": [
104 |     "base_predictors = {\n",
105 |     "                    'ADAB': AdaBoostClassifier(),\n",
106 |     "                    'XGB': XGBClassifier(),\n",
107 |     "                    'DT': DecisionTreeClassifier(),\n",
108 |     "                    'RF': RandomForestClassifier(), \n",
109 |     "                    'GB': GradientBoostingClassifier(),\n",
110 |     "                    'KNN': KNeighborsClassifier(),\n",
111 |     "                    'LR': LogisticRegression(),\n",
112 |     "                    'NB': GaussianNB(),\n",
113 |     "                    'MLP': MLPClassifier(),\n",
114 |     "                    'SVM': SVC(probability=True),\n",
115 |     "}"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "markdown",
120 |    "metadata": {},
121 |    "source": [
122 |     "Initialise Ensemble Integration:"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": 48,
128 |    "metadata": {},
129 |    "outputs": [],
130 |    "source": [
131 |     "EI = EnsembleIntegration(\n",
132 |     "                        base_predictors=base_predictors,\n",
133 |     "                        k_outer=5,\n",
134 |     "                        k_inner=5,\n",
135 |     "                        n_samples=1,\n",
136 |     "                        sampling_strategy=\"undersampling\",\n",
137 |     "                        sampling_aggregation=None,\n",
138 |     "                        n_jobs=-1,\n",
139 |     "                        metrics=metrics,\n",
140 |     "                        random_state=38,\n",
141 |     "                        project_name=\"diabetes\",\n",
142 |     "                        model_building=True,\n",
143 |     "                        )"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "markdown",
148 |    "metadata": {},
149 |    "source": [
150 |     "Fit base predictors on each modality. Remember to include the unique modality name."
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": 49,
156 |    "metadata": {},
157 |    "outputs": [
158 |     {
159 |      "name": "stdout",
160 |      "output_type": "stream",
161 |      "text": [
162 |       "Training base predictors on Sociodemographic...\n",
163 |       "        \n",
164 |       "... for ensemble performance analysis...\n"
165 |      ]
166 |     },
167 |     {
168 |      "name": "stderr",
169 |      "output_type": "stream",
170 |      "text": [
171 |       "Generating ensemble training data: |██████████|100%\n",
172 |       "Generating ensemble test data: |██████████|100%\n"
173 |      ]
174 |     },
175 |     {
176 |      "name": "stdout",
177 |      "output_type": "stream",
178 |      "text": [
179 |       "\n",
180 |       "... for final ensemble...\n"
181 |      ]
182 |     },
183 |     {
184 |      "name": "stderr",
185 |      "output_type": "stream",
186 |      "text": [
187 |       "Generating ensemble training data: |██████████|100%\n",
188 |       "Training final base predictors: |██████████|100%\n"
189 |      ]
190 |     },
191 |     {
192 |      "name": "stdout",
193 |      "output_type": "stream",
194 |      "text": [
195 |       "\n",
196 |       "\n",
197 |       "Training base predictors on Health status...\n",
198 |       "        \n",
199 |       "... for ensemble performance analysis...\n"
200 |      ]
201 |     },
202 |     {
203 |      "name": "stderr",
204 |      "output_type": "stream",
205 |      "text": [
206 |       "Generating ensemble training data: |██████████|100%\n",
207 |       "Generating ensemble test data: |██████████|100%\n"
208 |      ]
209 |     },
210 |     {
211 |      "name": "stdout",
212 |      "output_type": "stream",
213 |      "text": [
214 |       "\n",
215 |       "... for final ensemble...\n"
216 |      ]
217 |     },
218 |     {
219 |      "name": "stderr",
220 |      "output_type": "stream",
221 |      "text": [
222 |       "Generating ensemble training data: |██████████|100%\n",
223 |       "Training final base predictors: |██████████|100%\n"
224 |      ]
225 |     },
226 |     {
227 |      "name": "stdout",
228 |      "output_type": "stream",
229 |      "text": [
230 |       "\n",
231 |       "\n",
232 |       "Training base predictors on Diet...\n",
233 |       "        \n",
234 |       "... for ensemble performance analysis...\n"
235 |      ]
236 |     },
237 |     {
238 |      "name": "stderr",
239 |      "output_type": "stream",
240 |      "text": [
241 |       "Generating ensemble training data: |██████████|100%\n",
242 |       "Generating ensemble test data: |██████████|100%\n"
243 |      ]
244 |     },
245 |     {
246 |      "name": "stdout",
247 |      "output_type": "stream",
248 |      "text": [
249 |       "\n",
250 |       "... for final ensemble...\n"
251 |      ]
252 |     },
253 |     {
254 |      "name": "stderr",
255 |      "output_type": "stream",
256 |      "text": [
257 |       "Generating ensemble training data: |██████████|100%\n",
258 |       "Training final base predictors: |██████████|100%\n"
259 |      ]
260 |     },
261 |     {
262 |      "name": "stdout",
263 |      "output_type": "stream",
264 |      "text": [
265 |       "\n",
266 |       "\n",
267 |       "Training base predictors on Other lifestyle behaviors...\n",
268 |       "        \n",
269 |       "... for ensemble performance analysis...\n"
270 |      ]
271 |     },
272 |     {
273 |      "name": "stderr",
274 |      "output_type": "stream",
275 |      "text": [
276 |       "Generating ensemble training data: |██████████|100%\n",
277 |       "Generating ensemble test data: |██████████|100%\n"
278 |      ]
279 |     },
280 |     {
281 |      "name": "stdout",
282 |      "output_type": "stream",
283 |      "text": [
284 |       "\n",
285 |       "... for final ensemble...\n"
286 |      ]
287 |     },
288 |     {
289 |      "name": "stderr",
290 |      "output_type": "stream",
291 |      "text": [
292 |       "Generating ensemble training data: |██████████|100%\n",
293 |       "Training final base predictors: |██████████|100%\n"
294 |      ]
295 |     },
296 |     {
297 |      "name": "stdout",
298 |      "output_type": "stream",
299 |      "text": [
300 |       "\n",
301 |       "\n"
302 |      ]
303 |     }
304 |    ],
305 |    "source": [
306 |     "for name, modality in X_train.items():\n",
307 |     "    EI.fit_base(modality, y_train, modality_name=name)"
308 |    ]
309 |   },
310 |   {
311 |    "cell_type": "markdown",
312 |    "metadata": {},
313 |    "source": [
314 |     "We can check the cross-validated performance of each base predictor on each modality with the `base_summary` dictionary. The metric scores are stored in a dataframe and can be accessed with the `metrics` key. The corresponding threshold values used to threshold the probability vector can be accessed with the `thresholds` key."
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "code",
319 |    "execution_count": 50,
320 |    "metadata": {},
321 |    "outputs": [
322 |     {
323 |      "data": {
324 |       "text/html": [
325 |        "<div>\n",
326 |        "<style scoped>\n",
327 |        "    .dataframe tbody tr th:only-of-type {\n",
328 |        "        vertical-align: middle;\n",
329 |        "    }\n",
330 |        "\n",
331 |        "    .dataframe tbody tr th {\n",
332 |        "        vertical-align: top;\n",
333 |        "    }\n",
334 |        "\n",
335 |        "    .dataframe thead tr th {\n",
336 |        "        text-align: left;\n",
337 |        "    }\n",
338 |        "</style>\n",
339 |        "<table border=\"1\" class=\"dataframe\">\n",
340 |        "  <thead>\n",
341 |        "    <tr>\n",
342 |        "      <th>modality</th>\n",
343 |        "      <th colspan=\"10\" halign=\"left\">Diet</th>\n",
344 |        "      <th>...</th>\n",
345 |        "      <th colspan=\"10\" halign=\"left\">Sociodemographic</th>\n",
346 |        "    </tr>\n",
347 |        "    <tr>\n",
348 |        "      <th>base predictor</th>\n",
349 |        "      <th>ADAB</th>\n",
350 |        "      <th>DT</th>\n",
351 |        "      <th>GB</th>\n",
352 |        "      <th>KNN</th>\n",
353 |        "      <th>LR</th>\n",
354 |        "      <th>MLP</th>\n",
355 |        "      <th>NB</th>\n",
356 |        "      <th>RF</th>\n",
357 |        "      <th>SVM</th>\n",
358 |        "      <th>XGB</th>\n",
359 |        "      <th>...</th>\n",
360 |        "      <th>ADAB</th>\n",
361 |        "      <th>DT</th>\n",
362 |        "      <th>GB</th>\n",
363 |        "      <th>KNN</th>\n",
364 |        "      <th>LR</th>\n",
365 |        "      <th>MLP</th>\n",
366 |        "      <th>NB</th>\n",
367 |        "      <th>RF</th>\n",
368 |        "      <th>SVM</th>\n",
369 |        "      <th>XGB</th>\n",
370 |        "    </tr>\n",
371 |        "  </thead>\n",
372 |        "  <tbody>\n",
373 |        "    <tr>\n",
374 |        "      <th>f_max</th>\n",
375 |        "      <td>0.235738</td>\n",
376 |        "      <td>0.222865</td>\n",
377 |        "      <td>0.235282</td>\n",
378 |        "      <td>0.224144</td>\n",
379 |        "      <td>0.251120</td>\n",
380 |        "      <td>0.228173</td>\n",
381 |        "      <td>0.225817</td>\n",
382 |        "      <td>0.234442</td>\n",
383 |        "      <td>0.239102</td>\n",
384 |        "      <td>0.231233</td>\n",
385 |        "      <td>...</td>\n",
386 |        "      <td>0.264833</td>\n",
387 |        "      <td>0.227289</td>\n",
388 |        "      <td>0.271730</td>\n",
389 |        "      <td>0.235114</td>\n",
390 |        "      <td>0.274428</td>\n",
391 |        "      <td>0.248038</td>\n",
392 |        "      <td>0.247952</td>\n",
393 |        "      <td>0.264974</td>\n",
394 |        "      <td>0.271551</td>\n",
395 |        "      <td>0.254824</td>\n",
396 |        "    </tr>\n",
397 |        "    <tr>\n",
398 |        "      <th>auc</th>\n",
399 |        "      <td>0.559842</td>\n",
400 |        "      <td>0.525266</td>\n",
401 |        "      <td>0.564876</td>\n",
402 |        "      <td>0.520080</td>\n",
403 |        "      <td>0.592825</td>\n",
404 |        "      <td>0.537618</td>\n",
405 |        "      <td>0.532803</td>\n",
406 |        "      <td>0.556643</td>\n",
407 |        "      <td>0.568244</td>\n",
408 |        "      <td>0.554326</td>\n",
409 |        "      <td>...</td>\n",
410 |        "      <td>0.626564</td>\n",
411 |        "      <td>0.538970</td>\n",
412 |        "      <td>0.630678</td>\n",
413 |        "      <td>0.562166</td>\n",
414 |        "      <td>0.630872</td>\n",
415 |        "      <td>0.584445</td>\n",
416 |        "      <td>0.587617</td>\n",
417 |        "      <td>0.618888</td>\n",
418 |        "      <td>0.629168</td>\n",
419 |        "      <td>0.594908</td>\n",
420 |        "    </tr>\n",
421 |        "    <tr>\n",
422 |        "      <th>mcc</th>\n",
423 |        "      <td>0.062157</td>\n",
424 |        "      <td>0.033347</td>\n",
425 |        "      <td>0.059942</td>\n",
426 |        "      <td>0.018450</td>\n",
427 |        "      <td>0.094138</td>\n",
428 |        "      <td>0.041208</td>\n",
429 |        "      <td>0.038954</td>\n",
430 |        "      <td>0.054486</td>\n",
431 |        "      <td>0.062126</td>\n",
432 |        "      <td>0.057688</td>\n",
433 |        "      <td>...</td>\n",
434 |        "      <td>0.115635</td>\n",
435 |        "      <td>0.051554</td>\n",
436 |        "      <td>0.125628</td>\n",
437 |        "      <td>0.063908</td>\n",
438 |        "      <td>0.130140</td>\n",
439 |        "      <td>0.090261</td>\n",
440 |        "      <td>0.081534</td>\n",
441 |        "      <td>0.113642</td>\n",
442 |        "      <td>0.128070</td>\n",
443 |        "      <td>0.098644</td>\n",
444 |        "    </tr>\n",
445 |        "  </tbody>\n",
446 |        "</table>\n",
447 |        "<p>3 rows × 40 columns</p>\n",
448 |        "</div>"
449 |       ],
450 |       "text/plain": [
451 |        "modality            Diet                                                    \\\n",
452 |        "base predictor      ADAB        DT        GB       KNN        LR       MLP   \n",
453 |        "f_max           0.235738  0.222865  0.235282  0.224144  0.251120  0.228173   \n",
454 |        "auc             0.559842  0.525266  0.564876  0.520080  0.592825  0.537618   \n",
455 |        "mcc             0.062157  0.033347  0.059942  0.018450  0.094138  0.041208   \n",
456 |        "\n",
457 |        "modality                                                ... Sociodemographic  \\\n",
458 |        "base predictor        NB        RF       SVM       XGB  ...             ADAB   \n",
459 |        "f_max           0.225817  0.234442  0.239102  0.231233  ...         0.264833   \n",
460 |        "auc             0.532803  0.556643  0.568244  0.554326  ...         0.626564   \n",
461 |        "mcc             0.038954  0.054486  0.062126  0.057688  ...         0.115635   \n",
462 |        "\n",
463 |        "modality                                                                    \\\n",
464 |        "base predictor        DT        GB       KNN        LR       MLP        NB   \n",
465 |        "f_max           0.227289  0.271730  0.235114  0.274428  0.248038  0.247952   \n",
466 |        "auc             0.538970  0.630678  0.562166  0.630872  0.584445  0.587617   \n",
467 |        "mcc             0.051554  0.125628  0.063908  0.130140  0.090261  0.081534   \n",
468 |        "\n",
469 |        "modality                                      \n",
470 |        "base predictor        RF       SVM       XGB  \n",
471 |        "f_max           0.264974  0.271551  0.254824  \n",
472 |        "auc             0.618888  0.629168  0.594908  \n",
473 |        "mcc             0.113642  0.128070  0.098644  \n",
474 |        "\n",
475 |        "[3 rows x 40 columns]"
476 |       ]
477 |      },
478 |      "execution_count": 50,
479 |      "metadata": {},
480 |      "output_type": "execute_result"
481 |     }
482 |    ],
483 |    "source": [
484 |     "EI.base_summary['metrics']"
485 |    ]
486 |   },
487 |   {
488 |    "cell_type": "markdown",
489 |    "metadata": {},
490 |    "source": [
491 |     "Now let's define some ensemble models for stacked generalization. We add an \"S.\" prefix to the keys of stacking algorithms."
492 |    ]
493 |   },
494 |   {
495 |    "cell_type": "code",
496 |    "execution_count": 51,
497 |    "metadata": {},
498 |    "outputs": [],
499 |    "source": [
500 |     "ensemble_predictors = {     \n",
501 |     "                    'Mean' : MeanAggregation(),\n",
502 |     "                    'CES' : CES(scoring=lambda y_test, y_pred: fmax_score(y_test, y_pred)[0]),\n",
503 |     "                    'S.ADAB': AdaBoostClassifier(),\n",
504 |     "                    'S.XGB': XGBClassifier(),\n",
505 |     "                    'S.DT': DecisionTreeClassifier(),\n",
506 |     "                    \"S.RF\": RandomForestClassifier(), \n",
507 |     "                    'S.GB': GradientBoostingClassifier(),\n",
508 |     "                    'S.KNN': KNeighborsClassifier(),\n",
509 |     "                    'S.LR': LogisticRegression(),\n",
510 |     "                    'S.NB': GaussianNB(),\n",
511 |     "                    'S.MLP': MLPClassifier(),\n",
512 |     "                    'S.SVM': SVC(probability=True),\n",
513 |     "}"
514 |    ]
515 |   },
516 |   {
517 |    "cell_type": "markdown",
518 |    "metadata": {},
519 |    "source": [
520 |     "Fit ensemble models:"
521 |    ]
522 |   },
523 |   {
524 |    "cell_type": "code",
525 |    "execution_count": 52,
526 |    "metadata": {},
527 |    "outputs": [
528 |     {
529 |      "name": "stderr",
530 |      "output_type": "stream",
531 |      "text": [
532 |       "Analyzing ensembles: |          |  0%"
533 |      ]
534 |     },
535 |     {
536 |      "name": "stderr",
537 |      "output_type": "stream",
538 |      "text": [
539 |       "Analyzing ensembles: |██████████|100%\n",
540 |       "Training final ensemble models: |██████████|100%\n"
541 |      ]
542 |     },
543 |     {
544 |      "data": {
545 |       "text/plain": [
546 |        "<eipy.ei.EnsembleIntegration at 0x7f05c036b010>"
547 |       ]
548 |      },
549 |      "execution_count": 52,
550 |      "metadata": {},
551 |      "output_type": "execute_result"
552 |     }
553 |    ],
554 |    "source": [
555 |     "EI.fit_ensemble(ensemble_predictors=ensemble_predictors)"
556 |    ]
557 |   },
558 |   {
559 |    "cell_type": "markdown",
560 |    "metadata": {},
561 |    "source": [
562 |     "Check the ensemble summary with `ensemble_summary`:"
563 |    ]
564 |   },
565 |   {
566 |    "cell_type": "code",
567 |    "execution_count": 53,
568 |    "metadata": {},
569 |    "outputs": [
570 |     {
571 |      "data": {
572 |       "text/html": [
573 |        "<div>\n",
574 |        "<style scoped>\n",
575 |        "    .dataframe tbody tr th:only-of-type {\n",
576 |        "        vertical-align: middle;\n",
577 |        "    }\n",
578 |        "\n",
579 |        "    .dataframe tbody tr th {\n",
580 |        "        vertical-align: top;\n",
581 |        "    }\n",
582 |        "\n",
583 |        "    .dataframe thead th {\n",
584 |        "        text-align: right;\n",
585 |        "    }\n",
586 |        "</style>\n",
587 |        "<table border=\"1\" class=\"dataframe\">\n",
588 |        "  <thead>\n",
589 |        "    <tr style=\"text-align: right;\">\n",
590 |        "      <th></th>\n",
591 |        "      <th>Mean</th>\n",
592 |        "      <th>CES</th>\n",
593 |        "      <th>S.ADAB</th>\n",
594 |        "      <th>S.XGB</th>\n",
595 |        "      <th>S.DT</th>\n",
596 |        "      <th>S.RF</th>\n",
597 |        "      <th>S.GB</th>\n",
598 |        "      <th>S.KNN</th>\n",
599 |        "      <th>S.LR</th>\n",
600 |        "      <th>S.NB</th>\n",
601 |        "      <th>S.MLP</th>\n",
602 |        "      <th>S.SVM</th>\n",
603 |        "    </tr>\n",
604 |        "  </thead>\n",
605 |        "  <tbody>\n",
606 |        "    <tr>\n",
607 |        "      <th>f_max</th>\n",
608 |        "      <td>0.279307</td>\n",
609 |        "      <td>0.297275</td>\n",
610 |        "      <td>0.292202</td>\n",
611 |        "      <td>0.260111</td>\n",
612 |        "      <td>0.222865</td>\n",
613 |        "      <td>0.27744</td>\n",
614 |        "      <td>0.295322</td>\n",
615 |        "      <td>0.228696</td>\n",
616 |        "      <td>0.304192</td>\n",
617 |        "      <td>0.300064</td>\n",
618 |        "      <td>0.278088</td>\n",
619 |        "      <td>0.240511</td>\n",
620 |        "    </tr>\n",
621 |        "    <tr>\n",
622 |        "      <th>auc</th>\n",
623 |        "      <td>0.648915</td>\n",
624 |        "      <td>0.664662</td>\n",
625 |        "      <td>0.652463</td>\n",
626 |        "      <td>0.611726</td>\n",
627 |        "      <td>0.523156</td>\n",
628 |        "      <td>0.64466</td>\n",
629 |        "      <td>0.667541</td>\n",
630 |        "      <td>0.552092</td>\n",
631 |        "      <td>0.678055</td>\n",
632 |        "      <td>0.673285</td>\n",
633 |        "      <td>0.642210</td>\n",
634 |        "      <td>0.573379</td>\n",
635 |        "    </tr>\n",
636 |        "    <tr>\n",
637 |        "      <th>mcc</th>\n",
638 |        "      <td>0.140871</td>\n",
639 |        "      <td>0.154157</td>\n",
640 |        "      <td>0.060963</td>\n",
641 |        "      <td>0.057347</td>\n",
642 |        "      <td>0.046134</td>\n",
643 |        "      <td>0.03141</td>\n",
644 |        "      <td>0.053177</td>\n",
645 |        "      <td>0.050426</td>\n",
646 |        "      <td>0.014030</td>\n",
647 |        "      <td>0.168266</td>\n",
648 |        "      <td>0.056389</td>\n",
649 |        "      <td>0.004238</td>\n",
650 |        "    </tr>\n",
651 |        "  </tbody>\n",
652 |        "</table>\n",
653 |        "</div>"
654 |       ],
655 |       "text/plain": [
656 |        "           Mean       CES    S.ADAB     S.XGB      S.DT     S.RF      S.GB  \\\n",
657 |        "f_max  0.279307  0.297275  0.292202  0.260111  0.222865  0.27744  0.295322   \n",
658 |        "auc    0.648915  0.664662  0.652463  0.611726  0.523156  0.64466  0.667541   \n",
659 |        "mcc    0.140871  0.154157  0.060963  0.057347  0.046134  0.03141  0.053177   \n",
660 |        "\n",
661 |        "          S.KNN      S.LR      S.NB     S.MLP     S.SVM  \n",
662 |        "f_max  0.228696  0.304192  0.300064  0.278088  0.240511  \n",
663 |        "auc    0.552092  0.678055  0.673285  0.642210  0.573379  \n",
664 |        "mcc    0.050426  0.014030  0.168266  0.056389  0.004238  "
665 |       ]
666 |      },
667 |      "execution_count": 53,
668 |      "metadata": {},
669 |      "output_type": "execute_result"
670 |     }
671 |    ],
672 |    "source": [
673 |     "EI.ensemble_summary['metrics']"
674 |    ]
675 |   },
676 |   {
677 |    "cell_type": "markdown",
678 |    "metadata": {},
679 |    "source": [
680 |     "The LR stacking algorithm has the best $\\text{F}_\\text{max}$ performance (the preferred metric for imbalanced datasets), so let's select it as our final model.\n",
681 |     "\n",
682 |     "### Predictions on unseen data\n",
683 |     "\n",
684 |     "Since we ran EI with `model_building=True`, we can make predictions. Let's predict the test set and apply the $\\text{F}_\\text{max}$ threshold calculated during training:"
685 |    ]
686 |   },
687 |   {
688 |    "cell_type": "code",
689 |    "execution_count": 54,
690 |    "metadata": {},
691 |    "outputs": [
692 |     {
693 |      "name": "stdout",
694 |      "output_type": "stream",
695 |      "text": [
696 |       "[0. 1. 1. ... 0. 1. 1.]\n"
697 |      ]
698 |     }
699 |    ],
700 |    "source": [
701 |     "y_pred = EI.predict(X_dict=X_test, ensemble_model_key='S.LR')\n",
702 |     "\n",
703 |     "threshold = EI.ensemble_summary['thresholds']['S.LR']['f_max']\n",
704 |     "\n",
705 |     "y_pred[y_pred>=threshold] = 1\n",
706 |     "y_pred[y_pred<threshold] = 0\n",
707 |     "\n",
708 |     "print(y_pred)"
709 |    ]
710 |   },
711 |   {
712 |    "cell_type": "markdown",
713 |    "metadata": {},
714 |    "source": [
715 |     "\n",
716 |     "### Interpreting the final model\n",
717 |     "\n",
718 |     "We now use `PermutationInterpreter` to interpret the final LR stacked generalization model. Let's first import `PermutationInterpreter` and our chosen metric, and initialise the interpreter:"
719 |    ]
720 |   },
721 |   {
722 |    "cell_type": "code",
723 |    "execution_count": 55,
724 |    "metadata": {},
725 |    "outputs": [],
726 |    "source": [
727 |     "from eipy.interpretation import PermutationInterpreter\n",
728 |     "\n",
729 |     "interpreter = PermutationInterpreter(EI=EI,\n",
730 |     "                                     metric=lambda y_test, y_pred: fmax_score(y_test, y_pred)[0],\n",
731 |     "                                     ensemble_predictor_keys=['S.LR'],\n",
732 |     "                                     n_jobs=-1)"
733 |    ]
734 |   },
735 |   {
736 |    "cell_type": "markdown",
737 |    "metadata": {},
738 |    "source": [
739 |     "Calculate feature importance scores:"
740 |    ]
741 |   },
742 |   {
743 |    "cell_type": "code",
744 |    "execution_count": 56,
745 |    "metadata": {},
746 |    "outputs": [
747 |     {
748 |      "name": "stdout",
749 |      "output_type": "stream",
750 |      "text": [
751 |       "Interpreting ensembles...\n",
752 |       "\n"
753 |      ]
754 |     },
755 |     {
756 |      "name": "stderr",
757 |      "output_type": "stream",
758 |      "text": [
759 |       "Calculating local feature ranks: |██████████|100%\n",
760 |       "Calculating local model ranks: |██████████|100%"
761 |      ]
762 |     },
763 |     {
764 |      "name": "stdout",
765 |      "output_type": "stream",
766 |      "text": [
767 |       "Calculating combined rank product score...\n",
768 |       "... complete!\n"
769 |      ]
770 |     },
771 |     {
772 |      "name": "stderr",
773 |      "output_type": "stream",
774 |      "text": [
775 |       "\n"
776 |      ]
777 |     },
778 |     {
779 |      "data": {
780 |       "text/plain": [
781 |        "<eipy.interpretation.PermutationInterpreter at 0x7f06008e1590>"
782 |       ]
783 |      },
784 |      "execution_count": 56,
785 |      "metadata": {},
786 |      "output_type": "execute_result"
787 |     }
788 |    ],
789 |    "source": [
790 |     "interpreter.rank_product_score(X_dict=X_test, y=y_test)"
791 |    ]
792 |   },
793 |   {
794 |    "cell_type": "markdown",
795 |    "metadata": {},
796 |    "source": [
797 |     "We can now inspect the most important features for model prediction:"
798 |    ]
799 |   },
800 |   {
801 |    "cell_type": "code",
802 |    "execution_count": 57,
803 |    "metadata": {},
804 |    "outputs": [
805 |     {
806 |      "data": {
807 |       "text/html": [
808 |        "<div>\n",
809 |        "<style scoped>\n",
810 |        "    .dataframe tbody tr th:only-of-type {\n",
811 |        "        vertical-align: middle;\n",
812 |        "    }\n",
813 |        "\n",
814 |        "    .dataframe tbody tr th {\n",
815 |        "        vertical-align: top;\n",
816 |        "    }\n",
817 |        "\n",
818 |        "    .dataframe thead th {\n",
819 |        "        text-align: right;\n",
820 |        "    }\n",
821 |        "</style>\n",
822 |        "<table border=\"1\" class=\"dataframe\">\n",
823 |        "  <thead>\n",
824 |        "    <tr style=\"text-align: right;\">\n",
825 |        "      <th></th>\n",
826 |        "      <th>modality</th>\n",
827 |        "      <th>feature</th>\n",
828 |        "      <th>RPS</th>\n",
829 |        "      <th>feature rank</th>\n",
830 |        "      <th>ensemble method</th>\n",
831 |        "    </tr>\n",
832 |        "  </thead>\n",
833 |        "  <tbody>\n",
834 |        "    <tr>\n",
835 |        "      <th>38</th>\n",
836 |        "      <td>Health status</td>\n",
837 |        "      <td>BMXHT</td>\n",
838 |        "      <td>0.104007</td>\n",
839 |        "      <td>1.0</td>\n",
840 |        "      <td>S.LR</td>\n",
841 |        "    </tr>\n",
842 |        "    <tr>\n",
843 |        "      <th>21</th>\n",
844 |        "      <td>Sociodemographic</td>\n",
845 |        "      <td>FdStmp</td>\n",
846 |        "      <td>0.126047</td>\n",
847 |        "      <td>2.0</td>\n",
848 |        "      <td>S.LR</td>\n",
849 |        "    </tr>\n",
850 |        "    <tr>\n",
851 |        "      <th>0</th>\n",
852 |        "      <td>Sociodemographic</td>\n",
853 |        "      <td>RIAGENDR</td>\n",
854 |        "      <td>0.126740</td>\n",
855 |        "      <td>3.0</td>\n",
856 |        "      <td>S.LR</td>\n",
857 |        "    </tr>\n",
858 |        "    <tr>\n",
859 |        "      <th>1</th>\n",
860 |        "      <td>Sociodemographic</td>\n",
861 |        "      <td>RIDAGEYR</td>\n",
862 |        "      <td>0.154443</td>\n",
863 |        "      <td>4.0</td>\n",
864 |        "      <td>S.LR</td>\n",
865 |        "    </tr>\n",
866 |        "    <tr>\n",
867 |        "      <th>29</th>\n",
868 |        "      <td>Sociodemographic</td>\n",
869 |        "      <td>OvntPT</td>\n",
870 |        "      <td>0.160676</td>\n",
871 |        "      <td>5.0</td>\n",
872 |        "      <td>S.LR</td>\n",
873 |        "    </tr>\n",
874 |        "    <tr>\n",
875 |        "      <th>...</th>\n",
876 |        "      <td>...</td>\n",
877 |        "      <td>...</td>\n",
878 |        "      <td>...</td>\n",
879 |        "      <td>...</td>\n",
880 |        "      <td>...</td>\n",
881 |        "    </tr>\n",
882 |        "    <tr>\n",
883 |        "      <th>35</th>\n",
884 |        "      <td>Sociodemographic</td>\n",
885 |        "      <td>InSchool</td>\n",
886 |        "      <td>0.380912</td>\n",
887 |        "      <td>104.0</td>\n",
888 |        "      <td>S.LR</td>\n",
889 |        "    </tr>\n",
890 |        "    <tr>\n",
891 |        "      <th>3</th>\n",
892 |        "      <td>Sociodemographic</td>\n",
893 |        "      <td>DMDHRGND</td>\n",
894 |        "      <td>0.383682</td>\n",
895 |        "      <td>105.0</td>\n",
896 |        "      <td>S.LR</td>\n",
897 |        "    </tr>\n",
898 |        "    <tr>\n",
899 |        "      <th>71</th>\n",
900 |        "      <td>Diet</td>\n",
901 |        "      <td>VDrkGr</td>\n",
902 |        "      <td>0.394518</td>\n",
903 |        "      <td>106.0</td>\n",
904 |        "      <td>S.LR</td>\n",
905 |        "    </tr>\n",
906 |        "    <tr>\n",
907 |        "      <th>105</th>\n",
908 |        "      <td>Other lifestyle behaviors</td>\n",
909 |        "      <td>HHSmkNum</td>\n",
910 |        "      <td>0.422812</td>\n",
911 |        "      <td>107.0</td>\n",
912 |        "      <td>S.LR</td>\n",
913 |        "    </tr>\n",
914 |        "    <tr>\n",
915 |        "      <th>88</th>\n",
916 |        "      <td>Diet</td>\n",
917 |        "      <td>Pmps</td>\n",
918 |        "      <td>0.430820</td>\n",
919 |        "      <td>108.0</td>\n",
920 |        "      <td>S.LR</td>\n",
921 |        "    </tr>\n",
922 |        "  </tbody>\n",
923 |        "</table>\n",
924 |        "<p>108 rows × 5 columns</p>\n",
925 |        "</div>"
926 |       ],
927 |       "text/plain": [
928 |        "                      modality   feature       RPS  feature rank  \\\n",
929 |        "38               Health status     BMXHT  0.104007           1.0   \n",
930 |        "21            Sociodemographic    FdStmp  0.126047           2.0   \n",
931 |        "0             Sociodemographic  RIAGENDR  0.126740           3.0   \n",
932 |        "1             Sociodemographic  RIDAGEYR  0.154443           4.0   \n",
933 |        "29            Sociodemographic    OvntPT  0.160676           5.0   \n",
934 |        "..                         ...       ...       ...           ...   \n",
935 |        "35            Sociodemographic  InSchool  0.380912         104.0   \n",
936 |        "3             Sociodemographic  DMDHRGND  0.383682         105.0   \n",
937 |        "71                        Diet    VDrkGr  0.394518         106.0   \n",
938 |        "105  Other lifestyle behaviors  HHSmkNum  0.422812         107.0   \n",
939 |        "88                        Diet      Pmps  0.430820         108.0   \n",
940 |        "\n",
941 |        "    ensemble method  \n",
942 |        "38             S.LR  \n",
943 |        "21             S.LR  \n",
944 |        "0              S.LR  \n",
945 |        "1              S.LR  \n",
946 |        "29             S.LR  \n",
947 |        "..              ...  \n",
948 |        "35             S.LR  \n",
949 |        "3              S.LR  \n",
950 |        "71             S.LR  \n",
951 |        "105            S.LR  \n",
952 |        "88             S.LR  \n",
953 |        "\n",
954 |        "[108 rows x 5 columns]"
955 |       ]
956 |      },
957 |      "execution_count": 57,
958 |      "metadata": {},
959 |      "output_type": "execute_result"
960 |     }
961 |    ],
962 |    "source": [
963 |     "ranking_dataframe = interpreter.ensemble_feature_ranking['S.LR']\n",
964 |     "\n",
965 |     "ranking_dataframe"
966 |    ]
967 |   }
968 |  ],
969 |  "metadata": {
970 |   "kernelspec": {
971 |    "display_name": "tf",
972 |    "language": "python",
973 |    "name": "python3"
974 |   },
975 |   "language_info": {
976 |    "codemirror_mode": {
977 |     "name": "ipython",
978 |     "version": 3
979 |    },
980 |    "file_extension": ".py",
981 |    "mimetype": "text/x-python",
982 |    "name": "python",
983 |    "nbconvert_exporter": "python",
984 |    "pygments_lexer": "ipython3",
985 |    "version": "3.11.5"
986 |   },
987 |   "orig_nbformat": 4,
988 |   "vscode": {
989 |    "interpreter": {
990 |     "hash": "11e74c3c36c376ffcb66f65df8248706fe68363becca747991fd07d52526dccb"
991 |    }
992 |   }
993 |  },
994 |  "nbformat": 4,
995 |  "nbformat_minor": 2
996 | }
997 | 


--------------------------------------------------------------------------------
/eipy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GauravPandeyLab/eipy/4fdaceefec4c3090e17a5be6e8c582d0465cc0a3/eipy/__init__.py


--------------------------------------------------------------------------------
/eipy/additional_ensembles.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | import numpy as np
  3 | from numpy import (
  4 |     argmax,
  5 |     argmin,
  6 |     sqrt,
  7 | )
  8 | import pandas as pd
  9 | 
 10 | from sklearn.utils.validation import check_is_fitted
 11 | from sklearn.base import BaseEstimator, ClassifierMixin
 12 | from sklearn.utils.multiclass import unique_labels
 13 | 
 14 | 
 15 | class MeanAggregation(BaseEstimator, ClassifierMixin):
 16 |     """
 17 |     Trivially takes the mean of X.
 18 |     """
 19 | 
 20 |     def __init__(self):
 21 |         pass
 22 | 
 23 |     def fit(self, X, y):
 24 |         self.classes_ = unique_labels(y)
 25 | 
 26 |         self.X_ = X
 27 |         self.y_ = y
 28 | 
 29 |         return self
 30 | 
 31 |     def predict_proba(self, X):
 32 |         check_is_fitted(self)
 33 |         predict_positive = X.mean(axis=1)
 34 |         return np.transpose(np.array([1 - predict_positive, predict_positive]))
 35 | 
 36 | 
 37 | class MedianAggregation(BaseEstimator, ClassifierMixin):
 38 |     """
 39 |     Trivially takes the median of X.
 40 |     """
 41 | 
 42 |     def __init__(self):
 43 |         pass
 44 | 
 45 |     def fit(self, X, y):
 46 |         self.classes_ = unique_labels(y)
 47 | 
 48 |         self.X_ = X
 49 |         self.y_ = y
 50 | 
 51 |         return self
 52 | 
 53 |     def predict_proba(self, X):
 54 |         check_is_fitted(self)
 55 |         predict_positive = X.median(axis=1)
 56 |         return np.transpose(np.array([1 - predict_positive, predict_positive]))
 57 | 
 58 | 
 59 | class CES(BaseEstimator, ClassifierMixin):
 60 |     """
 61 |     Caruana et al's Ensemble Selection.
 62 | 
 63 |     Caruana R. et al. (2006) Getting the most out of ensemble selection.
 64 |     In: Sixth International Conference on Data
 65 |     Mining (ICDM'06), 2006 IEEE, Piscataway, NJ, USA, pp. 828-833.
 66 |     """
 67 | 
 68 |     def __init__(
 69 |         self,
 70 |         scoring,
 71 |         max_ensemble_size=50,
 72 |         random_state=0,
 73 |         greater_is_better=True,
 74 |     ):
 75 |         if random_state is not None:
 76 |             random.seed(random_state)
 77 |         self.seed = random_state
 78 |         self.scoring = scoring
 79 |         self.max_ensemble_size = max_ensemble_size
 80 |         self.selected_ensemble = []
 81 |         self.train_performance = []
 82 |         self.greater_is_better = greater_is_better
 83 |         self.argbest = argmax if greater_is_better else argmin
 84 |         self.best = max if greater_is_better else min
 85 |         self.random_state = random_state
 86 | 
 87 |     def fit(self, X, y):
 88 |         # Store the classes seen during fit
 89 |         self.classes_ = unique_labels(y)
 90 | 
 91 |         self.X_ = X
 92 |         self.y_ = y
 93 | 
 94 |         # Return the classifier
 95 | 
 96 |         self.selected_ensemble = []
 97 |         self.train_performance = []
 98 | 
 99 |         self.rng_generator = np.random.default_rng(seed=self.random_state)
100 |         best_classifiers = X.apply(lambda x: self.scoring(y, x)).sort_values(
101 |             ascending=self.greater_is_better
102 |         )
103 | 
104 |         for i in range(min(self.max_ensemble_size, len(best_classifiers))):
105 |             best_candidate = self.select_candidate_enhanced(
106 |                 X, y, best_classifiers, self.selected_ensemble, i
107 |             )
108 |             self.selected_ensemble.append(best_candidate)
109 |             self.train_performance.append(self.get_performance(X, y))
110 | 
111 |         train_performance_df = pd.DataFrame.from_records(self.train_performance)
112 |         best_ensemble_size = self.get_best_performer(train_performance_df)[
113 |             "ensemble_size"
114 |         ].values
115 |         self.best_ensemble = train_performance_df["ensemble"][
116 |             : best_ensemble_size.item(0) + 1
117 |         ]
118 | 
119 |         return self
120 | 
121 |     def predict_proba(self, X):
122 |         check_is_fitted(self)
123 | 
124 |         ces_bp_df = X[self.best_ensemble]
125 |         predict_positive = ces_bp_df.mean(axis=1).values
126 |         return np.transpose(np.array([1 - predict_positive, predict_positive]))
127 | 
128 |     def select_candidate_enhanced(self, X, y, best_classifiers, ensemble, i):
129 |         initial_ensemble_size = 2
130 |         max_candidates = 50
131 |         if len(ensemble) >= initial_ensemble_size:
132 |             candidates = self.rng_generator.choice(
133 |                 best_classifiers.index.values,
134 |                 min(max_candidates, len(best_classifiers)),
135 |                 replace=False,
136 |             )
137 |             candidate_scores = [
138 |                 self.scoring(y, X[ensemble + [candidate]].mean(axis=1))
139 |                 for candidate in candidates
140 |             ]
141 |             best_candidate = candidates[self.argbest(candidate_scores)]
142 |         else:
143 |             best_candidate = best_classifiers.index.values[i]
144 |         return best_candidate
145 | 
146 |     def get_performance(self, X, y):
147 |         predictions = X[self.selected_ensemble].mean(axis=1)
148 |         score = self.scoring(y, predictions)
149 | 
150 |         return {
151 |             "seed": self.seed,
152 |             "score": score,
153 |             "ensemble": self.selected_ensemble[-1],
154 |             "ensemble_size": len(self.selected_ensemble),
155 |         }
156 | 
157 |     def get_best_performer(self, df, one_se=False):
158 |         if not one_se:
159 |             return df[df.score == self.best(df.score)].head(1)
160 |         se = df.score.std() / sqrt(df.shape[0] - 1)
161 |         if self.greater_is_better:
162 |             return df[df.score >= (self.best(df.score) - se)].head(1)
163 |         return df[df.score <= (self.best(df.score) + se)].head(1)
164 | 


--------------------------------------------------------------------------------
/eipy/datasets.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import os
  3 | from os import environ, makedirs
  4 | from os.path import expanduser, join
  5 | import wget
  6 | import zipfile
  7 | 
  8 | 
  9 | def load_diabetes():
 10 |     """
 11 |     Loads a multi-modal youth diabetes dataset.
 12 | 
 13 |     More information about this dataset can be found in the following publication:
 14 | 
 15 |     Catherine McDonough, Yan Chak Li, Nita Vangeepuram, Bian Liu, Gaurav Pandey.
 16 |     Facilitating youth diabetes studies with the most comprehensive epidemiological
 17 |     dataset available through a public web portal. medRxiv 2023.08.02.23293517.
 18 |     https://doi.org/10.1101/2023.08.02.23293517
 19 | 
 20 |     Returns
 21 |     -------
 22 |     data : dict
 23 |         Dictionary with keys 'X_train', 'y_train', 'X_test', 'y_test', 'data_dict'.
 24 | 
 25 |     """
 26 |     zenodo_link = "https://zenodo.org/records/10035422/files/diabetes.zip?download=1"
 27 |     # Get data path
 28 |     data_path = get_data_home()
 29 |     folder_ext = "diabetes"
 30 |     data_ext_path = join(data_path, folder_ext)
 31 |     # check data downloaded before
 32 |     folder_exist = os.path.exists(data_ext_path)
 33 |     zip_exist = os.path.exists(data_ext_path + ".zip")
 34 |     if not folder_exist:
 35 |         if not zip_exist:
 36 |             wget.download(zenodo_link, out=data_path)
 37 |         downloaded_path = data_ext_path + ".zip"
 38 |         with zipfile.ZipFile(downloaded_path, "r") as zip_ref:
 39 |             zip_ref.extractall(data_path)
 40 | 
 41 |     _file_path = data_ext_path
 42 |     modality_keys = [
 43 |         "Sociodemographic",
 44 |         "Health status",
 45 |         "Diet",
 46 |         "Other lifestyle behaviors",
 47 |     ]
 48 |     _train_suffix = "9916"
 49 |     _test_suffix = "1618"
 50 |     X_train = {k: _load_csv(_file_path, k, _train_suffix) for k in modality_keys}
 51 |     X_test = {k: _load_csv(_file_path, k, _test_suffix) for k in modality_keys}
 52 |     y_train = _load_csv(_file_path, "outcomes_label", _train_suffix)
 53 |     y_test = _load_csv(_file_path, "outcomes_label", _test_suffix)
 54 |     dictionary = pd.read_csv(join(_file_path, "data_dictionary.csv"))
 55 | 
 56 |     return {
 57 |         "X_train": X_train,
 58 |         "y_train": y_train,
 59 |         "X_test": X_test,
 60 |         "y_test": y_test,
 61 |         "data_dict": dictionary,
 62 |     }
 63 | 
 64 | 
 65 | def _load_csv(file_path, fn, suffix):
 66 |     return pd.read_csv(join(file_path, f"{fn}_{suffix}.csv"), index_col=0)
 67 | 
 68 | 
 69 | def get_data_home(data_home=None):
 70 |     """Return the path of the eipy data directory.
 71 | 
 72 |     This function is referring from scikit-learn.
 73 | 
 74 |     This folder is used by some large dataset loaders to avoid downloading the
 75 |     data several times.
 76 | 
 77 |     By default the data directory is set to a folder named 'eipy_data' in the
 78 |     user home folder.
 79 | 
 80 |     Alternatively, it can be set by the 'EIPY_DATA' environment
 81 |     variable or programmatically by giving an explicit folder path. The '~'
 82 |     symbol is expanded to the user home folder.
 83 | 
 84 |     If the folder does not already exist, it is automatically created.
 85 | 
 86 |     Parameters
 87 |     ----------
 88 |     data_home : str or path-like, default=None
 89 |         The path to scikit-learn data directory. If `None`, the default path
 90 |         is `~/eipy_data`.
 91 | 
 92 |     Returns
 93 |     -------
 94 |     data_home: str
 95 |         The path to eipy data directory.
 96 |     """
 97 |     if data_home is None:
 98 |         data_home = environ.get("EIPY_DATA", join("~", "eipy_data"))
 99 |     data_home = expanduser(data_home)
100 |     makedirs(data_home, exist_ok=True)
101 |     return data_home
102 | 


--------------------------------------------------------------------------------
/eipy/ei.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Ensemble Integration
  3 | 
  4 | @author: Jamie Bennett, Yan Chak (Richard) Li, Aviad Susman
  5 | """
  6 | 
  7 | import pandas as pd
  8 | import numpy as np
  9 | import random
 10 | import dill as pickle
 11 | import copy
 12 | from tqdm import tqdm
 13 | from sklearn.utils._testing import ignore_warnings
 14 | from sklearn.exceptions import ConvergenceWarning
 15 | from sklearn.model_selection import StratifiedKFold
 16 | from sklearn.base import clone
 17 | from joblib import Parallel, delayed
 18 | import warnings
 19 | from eipy.utils import (
 20 |     X_is_dict,
 21 |     X_to_numpy,
 22 |     y_to_numpy,
 23 |     set_predictor_seeds,
 24 |     random_integers,
 25 |     sample,
 26 |     retrieve_X_y,
 27 |     append_modality,
 28 |     safe_predict_proba,
 29 |     dummy_cv,
 30 |     bar_format,
 31 | )
 32 | from eipy.metrics import (
 33 |     base_summary,
 34 |     ensemble_summary,
 35 | )
 36 | 
# Installs an "ignore" filter for DeprecationWarning when this module is
# imported; the filter is process-wide, not limited to eipy code.
warnings.filterwarnings("ignore", category=DeprecationWarning)
 38 | 
 39 | 
 40 | class EnsembleIntegration:
 41 |     """
 42 |     Ensemble Integration.
 43 | 
 44 |     Train and test a variety of ensemble classification algorithms using a nested cross
 45 |     validation approach.
 46 | 
 47 |     Parameters
 48 |     ----------
 49 |     base_predictors : dict, default=None
 50 |         Dictionary of (sklearn-like) base predictors. Can also be passed in the
 51 |         fit_base method.
 52 |     ensemble_predictors : dict, default=None
 53 |         Dictionary of (sklearn-like) stacking algorithms. Can also be passed in the
 54 |         fit_ensemble method.
 55 |     k_outer : int, default=5
 56 |         Number of outer folds.
 57 |     k_inner : int, default=5
 58 |         Number of inner folds.
 59 |     n_samples : int, default=1
 60 |         The number of samples to take when balancing classes. Ignored if
 61 |         sampling_strategy is None.
    sampling_strategy : str or None, default='undersampling'
        The sampling method for class balancing. Can be set to 'undersampling',
        'oversampling', 'hybrid' or None.
 65 |     sampling_aggregation : str, default=None
 66 |         Method for combining multiple samples. Only relevant when n_samples>1. Can be
 67 |         'mean' or None.
    metrics : dict, default=None
        A dictionary of metrics for which to evaluate ensembles. If left as default=None,
        the fmax_score (maximized F1-score) and roc_auc_score (AUC) are calculated.
    n_jobs : int, default=1
        Number of workers for parallelization in joblib.
 75 |     random_state : int, default=None
 76 |         Random state for cross-validation and use in some models.
 77 |     parallel_backend : str, default='loky'
 78 |         Backend to use in joblib. See joblib.Parallel() for other options.
 79 |     project_name : str, default='project'
 80 |         Name of project.
 81 |     model_building : bool, default=True
 82 |         Whether or not to train and save final models.
 83 |     verbose : int, default=1
 84 |         Verbosity level. Can be set to 0 or 1.
 85 | 
 86 |     Attributes
 87 |     ----------
 88 |     base_summary : dict
 89 |         Summary of performance scores for each base predictor. Scores can be accessed
 90 |         using the 'metrics' key and corresponding thresholds (if applicable) can be
 91 |         accessed in the 'thresholds' key.
 92 |     ensemble_summary : dict
 93 |         Summary of performance scores for each ensemble method. Scores can be accessed
 94 |         using the 'metrics' key and corresponding thresholds (if applicable) can be
 95 |         accessed in the 'thresholds' key.
 96 |     ensemble_training_data : list of pandas.DataFrame
 97 |         Training data for ensemble methods, for each outer fold.
        len(ensemble_training_data) = k_outer
 99 |     ensemble_test_data : list of pandas.DataFrame
100 |         Test data for ensemble methods, for each outer fold.
        len(ensemble_test_data) = k_outer
102 |     ensemble_predictions : pandas.DataFrame
103 |         Combined predictions (across all outer folds) made by each ensemble method.
104 |     modality_names : list of str
105 |         List of modalities in the order in which they were passed to EnsembleIntegration.
106 |     n_features_per_modality : list of int
107 |         List of number of features in each modality corresponding to modality_names.
108 |     feature_names : dict
109 |         Feature names for each modality passed to fit_base.
110 |     random_numbers_for_samples : list of int
111 |         Random numbers used to sample each training fold.
112 |     final_models : dict
113 |         Dictionary of the form {"base models": {}, "ensemble models": {}}.
114 |         Populated if model_building=True.
115 |     ensemble_training_data_final: list of pandas.DataFrame
116 |         List containing single dataframe of training data. Final models are
117 |         trained on all available data.
118 |     cv_outer : StratifiedKFold
119 |         StratifiedKFold() cross validator from sklearn.
120 |     cv_inner : StratifiedKFold
121 |         StratifiedKFold() cross validator from sklearn.
122 | 
123 |     """
124 | 
125 |     def __init__(
126 |         self,
127 |         base_predictors=None,
128 |         ensemble_predictors=None,
129 |         k_outer=5,
130 |         k_inner=5,
131 |         n_samples=1,
132 |         sampling_strategy="undersampling",
133 |         sampling_aggregation=None,
134 |         n_jobs=1,
135 |         metrics=None,
136 |         random_state=None,
137 |         parallel_backend="loky",
138 |         project_name="project",
139 |         model_building=True,
140 |         verbose=1,
141 |     ):
142 |         if random_state is not None:
143 |             random.seed(random_state)
144 | 
145 |         self.base_predictors = base_predictors
146 |         self.ensemble_predictors = ensemble_predictors
147 |         self.k_outer = k_outer
148 |         self.k_inner = k_inner
149 |         self.n_samples = n_samples
150 |         self.sampling_strategy = sampling_strategy
151 |         self.sampling_aggregation = sampling_aggregation
152 |         self.n_jobs = n_jobs
153 |         self.metrics = metrics
154 |         self.random_state = random_state
155 |         self.parallel_backend = parallel_backend
156 |         self.project_name = project_name
157 |         self.model_building = model_building
158 |         self.verbose = verbose
159 | 
160 |         self.final_models = {
161 |             "base models": {},
162 |             "ensemble models": {},
163 |         }  # for final model
164 |         self.ensemble_training_data_final = None  # for final model
165 | 
166 |         self.cv_outer = StratifiedKFold(
167 |             n_splits=self.k_outer, shuffle=True, random_state=self.random_state
168 |         )
169 | 
170 |         self.cv_inner = StratifiedKFold(
171 |             n_splits=self.k_inner, shuffle=True, random_state=self.random_state
172 |         )
173 | 
174 |         self.ensemble_training_data = None
175 |         self.ensemble_test_data = None
176 |         self.base_summary = None
177 | 
178 |         self.ensemble_predictions = None
179 |         self.ensemble_summary = None
180 | 
181 |         self.modality_names = []
182 |         self.n_features_per_modality = []
183 | 
184 |         self.random_numbers_for_samples = random_integers(
185 |             n_integers=n_samples, seed=self.random_state
186 |         )
187 |         self.feature_names = {}
188 | 
189 |     def fit_base(self, X, y, base_predictors=None, modality_name=None):
190 |         """
191 |         Train base predictors and generate ensemble train/test data.
192 | 
193 |         Parameters
194 |         ----------
195 |         X : array of shape (n_samples, n_features)
196 |             Training vector, where n_samples is the number of samples and
197 |             n_features is the number of features.
198 |         y : array of shape (n_samples,)
199 |             Target vector relative to X.
200 | 
201 |         Returns
202 |         -------
203 |         self
204 |             Ensemble train/test data and fitted final base predictors.
205 | 
206 |         """
207 | 
208 |         print(
209 |             f"""Training base predictors on {modality_name}...
210 |         \n... for ensemble performance analysis..."""
211 |         )
212 |         #  convert y to a numpy array
213 |         y = y_to_numpy(y)
214 | 
215 |         #  check if base_predictors are passed here
216 |         if base_predictors is not None:
217 |             self.base_predictors = base_predictors  # update base predictors
218 | 
219 |         #  set random_states in base_predictors
220 |         set_predictor_seeds(self.base_predictors, self.random_state)
221 | 
222 |         #  check data format and train accordingly
223 |         if X_is_dict(X):
224 |             for modality_name, modality in X.items():
225 |                 self._fit_base(
226 |                     X=modality,
227 |                     y=y,
228 |                     base_predictors=base_predictors,
229 |                     modality_name=modality_name,
230 |                 )
231 |         else:
232 |             self._fit_base(
233 |                 X=X, y=y, base_predictors=base_predictors, modality_name=modality_name
234 |             )
235 | 
236 |     @ignore_warnings(category=ConvergenceWarning)
237 |     def fit_ensemble(self, ensemble_predictors=None):
238 |         """
239 |         Train ensemble predictors on data generated by fit_base.
240 | 
241 |         Parameters
242 |         ----------
243 |         ensemble_predictors : dict, default=None
244 |             Dictionary of (sklearn-like) stacking algorithms.
245 | 
246 |         Returns
247 |         -------
248 |         self
249 |             Summary of ensemble predictor performance and fitted final ensemble models.
250 |         """
251 | 
252 |         if ensemble_predictors is not None:
253 |             self.ensemble_predictors = ensemble_predictors
254 | 
255 |         set_predictor_seeds(self.ensemble_predictors, self.random_state)
256 | 
257 |         y_test_combined = []
258 | 
259 |         for fold_id in range(self.k_outer):
260 |             _, y_test = retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id])
261 |             y_test_combined.extend(y_test)
262 | 
263 |         ensemble_predictions = {}
264 | 
265 |         for model_name, model in tqdm(
266 |             self.ensemble_predictors.items(),
267 |             desc="Analyzing ensembles",
268 |             bar_format=bar_format,
269 |         ):
270 |             y_pred_combined = []
271 | 
272 |             for fold_id in range(self.k_outer):
273 |                 X_train, y_train = retrieve_X_y(
274 |                     labelled_data=self.ensemble_training_data[fold_id]
275 |                 )
276 |                 X_test, _ = retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id])
277 | 
278 |                 if self.sampling_aggregation == "mean":
279 |                     X_train = X_train.T.groupby(level=[0, 1]).mean().T
280 |                     X_test = X_test.T.groupby(level=[0, 1]).mean().T
281 | 
282 |                 model.fit(X_train, y_train)
283 |                 y_pred = safe_predict_proba(model, X_test)
284 |                 y_pred_combined.extend(y_pred)
285 | 
286 |             ensemble_predictions[model_name] = y_pred_combined
287 | 
288 |         ensemble_predictions["labels"] = y_test_combined
289 | 
290 |         self.ensemble_predictions = pd.DataFrame.from_dict(ensemble_predictions)
291 |         self.ensemble_summary = ensemble_summary(
292 |             self.ensemble_predictions, self.metrics
293 |         )
294 | 
295 |         if self.model_building:
296 |             for model_name, model in tqdm(
297 |                 self.ensemble_predictors.items(),
298 |                 desc="Training final ensemble models",
299 |                 bar_format=bar_format,
300 |             ):
301 |                 X_train, y_train = retrieve_X_y(
302 |                     labelled_data=self.ensemble_training_data_final[0]
303 |                 )
304 | 
305 |                 if self.sampling_aggregation == "mean":
306 |                     X_train = X_train.T.groupby(level=[0, 1]).mean().T
307 |                     X_test = X_test.T.groupby(level=[0, 1]).mean().T
308 | 
309 |                 model.fit(X_train, y_train)
310 | 
311 |                 self.final_models["ensemble models"][model_name] = pickle.dumps(model)
312 | 
313 |         return self
314 | 
315 |     def predict(self, X_dict, ensemble_model_key):
316 |         """
317 |         Predict class labels for samples in X
318 | 
319 |         Parameters
320 |         ----------
321 |         X_dict : dict
322 |             Dictionary of X modalities each having n_samples. Keys and n_features
323 |             must match those seen by fit_base.
324 |         ensemble_model_key :
325 |             The key of the ensemble method selected during performance analysis.
326 | 
327 |         Returns
328 |         -------
329 |         y_pred : array of shape (n_samples,)
330 |             Vector containing the class labels for each sample.
331 |         """
332 | 
333 |         ensemble_prediction_data = None
334 | 
335 |         for i in range(len(self.modality_names)):
336 |             modality_name = self.modality_names[i]
337 |             X = X_dict[modality_name]
338 | 
339 |             X, _ = X_to_numpy(X)
340 | 
341 |             base_models = copy.deepcopy(self.final_models["base models"][modality_name])
342 |             self.base_predictors = {}
343 |             for base_model_dict in base_models:
344 |                 if base_model_dict["model name"] not in self.base_predictors.keys():
345 |                     self.base_predictors[base_model_dict["model name"]] = 0
346 | 
347 |                 base_model = pickle.loads(base_model_dict["pickled model"])
348 |                 y_pred = safe_predict_proba(base_model, X)
349 | 
350 |                 base_model_dict["fold id"] = 0
351 |                 base_model_dict["y_pred"] = y_pred
352 | 
353 |             combined_predictions = self._combine_predictions_outer(
354 |                 base_models, modality_name, model_building=True
355 |             )
356 |             ensemble_prediction_data = append_modality(
357 |                 ensemble_prediction_data, combined_predictions, model_building=True
358 |             )
359 |         ensemble_prediction_data = ensemble_prediction_data[0]
360 | 
361 |         if self.sampling_aggregation == "mean":
362 |             ensemble_prediction_data = (
363 |                 ensemble_prediction_data.T.groupby(level=[0, 1]).mean().T
364 |             )
365 | 
366 |         ensemble_model = pickle.loads(
367 |             self.final_models["ensemble models"][ensemble_model_key]
368 |         )
369 | 
370 |         y_pred = safe_predict_proba(ensemble_model, ensemble_prediction_data)
371 |         return y_pred
372 | 
373 |     @ignore_warnings(category=ConvergenceWarning)
374 |     def _fit_base(self, X, y, base_predictors=None, modality_name=None):
375 |         X, feature_names = X_to_numpy(X)
376 | 
377 |         self.modality_names.append(modality_name)
378 |         self.feature_names[modality_name] = feature_names
379 |         self.n_features_per_modality.append(X.shape[1])
380 | 
381 |         ensemble_training_data_modality = self._fit_base_inner(
382 |             X=X,
383 |             y=y,
384 |             cv_outer=self.cv_outer,
385 |             cv_inner=self.cv_inner,
386 |             base_predictors=self.base_predictors,
387 |             modality_name=modality_name,
388 |         )
389 | 
390 |         self.ensemble_training_data = append_modality(
391 |             self.ensemble_training_data, ensemble_training_data_modality
392 |         )
393 | 
394 |         ensemble_test_data_modality = self._fit_base_outer(
395 |             X=X,
396 |             y=y,
397 |             cv_outer=self.cv_outer,
398 |             base_predictors=self.base_predictors,
399 |             modality_name=modality_name,
400 |         )
401 | 
402 |         self.ensemble_test_data = append_modality(
403 |             self.ensemble_test_data, ensemble_test_data_modality
404 |         )  # append data to dataframe
405 | 
406 |         # create a summary of base predictor performance
407 |         self.base_summary = base_summary(self.ensemble_test_data, self.metrics)
408 | 
409 |         if self.model_building:
410 |             self._fit_base_final(X=X, y=y, modality_name=modality_name)
411 | 
412 |         print("\n")
413 | 
414 |         return self
415 | 
416 |     def _fit_base_final(self, X, y, modality_name=None):
417 |         """
418 |         Train a final base predictor model to be used by predict()
419 |         """
420 |         print("\n... for final ensemble...")
421 | 
422 |         ensemble_training_data_modality = self._fit_base_inner(
423 |             X=X,
424 |             y=y,
425 |             cv_inner=self.cv_inner,
426 |             cv_outer=dummy_cv(),  # returns indices of X with an empty set of test indices
427 |             base_predictors=self.base_predictors,
428 |             modality_name=modality_name,
429 |         )
430 | 
431 |         self.ensemble_training_data_final = append_modality(
432 |             self.ensemble_training_data_final, ensemble_training_data_modality
433 |         )
434 | 
435 |         base_model_list_of_dicts = self._fit_base_outer(
436 |             X=X,
437 |             y=y,
438 |             cv_outer=dummy_cv(),  # returns indices of X with an empty set of test indices
439 |             base_predictors=self.base_predictors,
440 |             modality_name=modality_name,
441 |             model_building=self.model_building,
442 |         )
443 | 
444 |         self.final_models["base models"][modality_name] = base_model_list_of_dicts
445 | 
446 |     def _fit_base_inner(
447 |         self, X, y, cv_outer, cv_inner, base_predictors=None, modality_name=None
448 |     ):
449 |         """
450 |         Perform a round of (inner) k-fold cross validation on each outer
451 |         training set. For generating ensemble training data.
452 |         """
453 | 
454 |         if base_predictors is not None:
455 |             self.base_predictors = base_predictors  # update base predictors
456 | 
457 |         # dictionaries for ensemble train/test data for each outer fold
458 |         ensemble_training_data_modality = []
459 | 
460 |         # define joblib Parallel function
461 |         with Parallel(
462 |             n_jobs=self.n_jobs, verbose=0, backend=self.parallel_backend
463 |         ) as parallel:
464 |             for _outer_fold_id, (train_index_outer, _test_index_outer) in enumerate(
465 |                 tqdm(
466 |                     cv_outer.split(X, y),
467 |                     total=cv_outer.n_splits,
468 |                     desc="Generating ensemble training data",
469 |                     bar_format=bar_format,
470 |                 )
471 |             ):
472 |                 X_train_outer = X[train_index_outer]
473 |                 y_train_outer = y[train_index_outer]
474 | 
475 |                 # spawn n_jobs jobs for each sample, inner_fold and model
476 |                 output = parallel(
477 |                     delayed(self._train_predict_single_base_predictor)(
478 |                         X=X_train_outer,
479 |                         y=y_train_outer,
480 |                         model_params=model_params,
481 |                         fold_params=inner_fold_params,
482 |                         sample_state=sample_state,
483 |                     )
484 |                     for model_params in self.base_predictors.items()
485 |                     for inner_fold_params in enumerate(
486 |                         cv_inner.split(X_train_outer, y_train_outer)
487 |                     )
488 |                     for sample_state in enumerate(self.random_numbers_for_samples)
489 |                 )
490 | 
491 |                 combined_predictions = self._combine_predictions_inner(
492 |                     output, modality_name
493 |                 )
494 |                 ensemble_training_data_modality.append(combined_predictions)
495 | 
496 |         return ensemble_training_data_modality
497 | 
498 |     def _fit_base_outer(
499 |         self,
500 |         X,
501 |         y,
502 |         cv_outer,
503 |         base_predictors=None,
504 |         modality_name=None,
505 |         model_building=False,
506 |     ):
507 |         """
508 |         Train each base predictor on each outer training set. For generating ensemble test data.
509 |         """
510 | 
511 |         if model_building:
512 |             progress_string = "Training final base predictors"
513 |         else:
514 |             progress_string = "Generating ensemble test data"
515 | 
516 |         if base_predictors is not None:
517 |             self.base_predictors = base_predictors  # update base predictors
518 | 
519 |         # define joblib Parallel function
520 |         with Parallel(
521 |             n_jobs=self.n_jobs, verbose=0, backend=self.parallel_backend
522 |         ) as parallel:
523 |             # spawn job for each sample, outer_fold and model
524 |             output = parallel(
525 |                 delayed(self._train_predict_single_base_predictor)(
526 |                     X=X,
527 |                     y=y,
528 |                     model_params=model_params,
529 |                     fold_params=outer_fold_params,
530 |                     sample_state=sample_state,
531 |                     model_building=model_building,
532 |                 )
533 |                 for model_params in tqdm(
534 |                     self.base_predictors.items(),
535 |                     desc=progress_string,
536 |                     bar_format=bar_format,
537 |                 )
538 |                 for outer_fold_params in enumerate(cv_outer.split(X, y))
539 |                 for sample_state in enumerate(self.random_numbers_for_samples)
540 |             )
541 | 
542 |         if model_building:
543 |             return output
544 |         else:
545 |             return self._combine_predictions_outer(output, modality_name)
546 | 
547 |     @ignore_warnings(category=ConvergenceWarning)
548 |     def _train_predict_single_base_predictor(
549 |         self, X, y, model_params, fold_params, sample_state, model_building=False
550 |     ):
551 |         """
552 |         Train/test single base predictor, on a given training fold,
553 |         subject to a given sampling strategy.
554 |         """
555 | 
556 |         model_name, model = model_params
557 | 
558 |         model = clone(model)
559 | 
560 |         fold_id, (train_index, test_index) = fold_params
561 |         sample_id, sample_random_state = sample_state
562 | 
563 |         X_train, X_test = X[train_index], X[test_index]
564 |         y_train, y_test = y[train_index], y[test_index]
565 |         X_sample, y_sample = sample(
566 |             X_train,
567 |             y_train,
568 |             strategy=self.sampling_strategy,
569 |             random_state=sample_random_state,
570 |         )
571 | 
572 |         model.fit(X_sample, y_sample)
573 | 
574 |         if model_building:
575 |             results_dict = {
576 |                 "model name": model_name,
577 |                 "sample id": sample_id,
578 |                 "pickled model": pickle.dumps(
579 |                     model
580 |                 ),  # pickle model to reduce memory usage. use pickle.loads() to de-serialize
581 |             }
582 | 
583 |         else:
584 |             y_pred = safe_predict_proba(model, X_test)
585 | 
586 |             results_dict = {
587 |                 "model name": model_name,
588 |                 "sample id": sample_id,
589 |                 "fold id": fold_id,
590 |                 "y_pred": y_pred,
591 |                 "labels": y_test,
592 |             }
593 | 
594 |         return results_dict
595 | 
596 |     def _combine_predictions_inner(self, list_of_dicts, modality):
597 |         """
598 |         Combine the predictions arising from the inner cross validation.
599 |         """
600 | 
601 |         # dictionary to store predictions
602 |         combined_predictions = {}
603 |         # combine fold predictions for each model
604 |         for model_name in self.base_predictors.keys():
605 |             for sample_id in range(self.n_samples):
606 |                 model_predictions = np.concatenate(
607 |                     list(
608 |                         d["y_pred"]
609 |                         for d in list_of_dicts
610 |                         if d["model name"] == model_name and d["sample id"] == sample_id
611 |                     )
612 |                 )
613 |                 combined_predictions[modality, model_name, sample_id] = (
614 |                     model_predictions
615 |                 )
616 |         labels = np.concatenate(
617 |             list(
618 |                 d["labels"]
619 |                 for d in list_of_dicts
620 |                 if d["model name"] == list(self.base_predictors.keys())[0]
621 |                 and d["sample id"] == 0
622 |             )
623 |         )
624 |         combined_predictions = pd.DataFrame(combined_predictions).rename_axis(
625 |             ["modality", "base predictor", "sample"], axis=1
626 |         )
627 |         combined_predictions["labels"] = labels
628 |         return combined_predictions
629 | 
630 |     def _combine_predictions_outer(self, list_of_dicts, modality, model_building=False):
631 |         """
632 |         Combine the predictions arising from the inner cross validation.
633 |         """
634 | 
635 |         if model_building:
636 |             k_outer = 1
637 |         else:
638 |             k_outer = self.k_outer
639 | 
640 |         combined_predictions = []
641 | 
642 |         for fold_id in range(k_outer):
643 |             predictions = {}
644 |             for model_name in self.base_predictors.keys():
645 |                 for sample_id in range(self.n_samples):
646 |                     model_predictions = list(
647 |                         d["y_pred"]
648 |                         for d in list_of_dicts
649 |                         if d["fold id"] == fold_id
650 |                         and d["model name"] == model_name
651 |                         and d["sample id"] == sample_id
652 |                     )
653 |                     predictions[modality, model_name, sample_id] = model_predictions[0]
654 |             predictions = pd.DataFrame(predictions)
655 | 
656 |             if not model_building:
657 |                 labels = [
658 |                     d["labels"]
659 |                     for d in list_of_dicts
660 |                     if d["fold id"] == fold_id
661 |                     and d["model name"] == list(self.base_predictors.keys())[0]
662 |                     and d["sample id"] == 0
663 |                 ]
664 |                 predictions["labels"] = labels[0]
665 | 
666 |             combined_predictions.append(
667 |                 predictions.rename_axis(
668 |                     ["modality", "base predictor", "sample"], axis=1
669 |                 )
670 |             )
671 | 
672 |         return combined_predictions
673 | 
674 |     def save(self, path=None):
675 |         """
676 |         Save to path.
677 | 
678 |         Parameters
679 |         ----------
680 | 
681 |         path : optional, default=None
682 |             Path to save the EnsembleIntegration class object.
683 |         """
684 | 
685 |         if path is None:
686 |             path = f"EI.{self.project_name}"
687 |         with open(path, "wb") as f:
688 |             pickle.dump(self, f)
689 |         print(f"\nSaved to {path}\n")
690 | 
691 |     @classmethod
692 |     def load(cls, path):
693 |         """
694 |         Load from path.
695 | 
696 |         Parameters
697 |         ----------
698 | 
699 |         path : str
700 |             Path to load the EnsembleIntegration class object.
701 |         """
702 |         with open(path, "rb") as f:
703 |             return pickle.load(f)
704 | 


--------------------------------------------------------------------------------
/eipy/interpretation.py:
--------------------------------------------------------------------------------
  1 | from sklearn.inspection import permutation_importance
  2 | from eipy.utils import X_to_numpy, retrieve_X_y, bar_format, y_to_numpy
  3 | import pandas as pd
  4 | from tqdm import tqdm
  5 | import numpy as np
  6 | import copy
  7 | from sklearn.metrics import make_scorer
  8 | import dill as pickle
  9 | from itertools import groupby
 10 | from operator import itemgetter
 11 | from sklearn.ensemble import VotingClassifier
 12 | from sklearn.preprocessing import LabelEncoder
 13 | from eipy.metrics import fmax_score
 14 | 
 15 | import warnings
 16 | 
 17 | 
 18 | class PermutationInterpreter:
 19 |     """
 20 |     Permuation importance based interpreter.
 21 | 
 22 |     This method utilizes sklearn's `permutation_importance
 23 |     <https://scikit-learn.org/stable/modules/generated/sklearn.inspection.permutation_importance.html>`_
 24 |     function.
 25 | 
 26 |     EI : EnsembleIntegration class object
 27 |         Fitted EnsembleIntegration model, i.e. with model_building=True.
 28 |     metric : function
 29 |         sklearn-like metric function. If None, the fmax score is used.
 30 |     n_repeats : int, default=10
 31 |         Number of repeats in PermutationImportance.
 32 |     ensemble_predictor_keys: default='all'
 33 |         Ensemble predictor keys used in EnsembleIntegration. If 'all' then all
 34 |         ensemble predictors seen by EI are interpreted. Recommended to pass a
 35 |         subset of ensemble_predctor keys as a list.
 36 |     metric_greater_is_better: default=True
 37 |         Metric greater is better.
 38 | 
 39 |     Attributes
 40 |     ----------
 41 |     ensemble_feature_ranking : pandas.DataFrame
 42 |         Feature rankings for each ensemble method.
 43 |     LFR : pandas.DataFrame
 44 |         Local feature rankings for each base predictor.
 45 |     LMR : pandas.Dataframe
 46 |         self.LMR = None
 47 | 
 48 |     Returns
 49 |     -------
 50 |     self
 51 |         Feature rankings of final ensemble models trained with EnsembleIntegration.
 52 | 
 53 |     """
 54 | 
 55 |     def __init__(
 56 |         self,
 57 |         EI,
 58 |         metric=None,
 59 |         ensemble_predictor_keys="all",  # can be "all" or a list of keys for ensemble methods
 60 |         n_repeats=10,
 61 |         n_jobs=1,
 62 |         metric_greater_is_better=True,
 63 |     ):
 64 |         self.EI = EI
 65 | 
 66 |         if metric is None:  # use fmax score if metric not specified
 67 |             self.metric = lambda y_test, y_pred: fmax_score(y_test, y_pred)[0]
 68 |         else:
 69 |             self.metric = metric
 70 | 
 71 |         self.n_repeats = n_repeats
 72 |         self.n_jobs = n_jobs
 73 |         self.ensemble_predictor_keys = ensemble_predictor_keys
 74 |         self.metric_greater_is_better = metric_greater_is_better
 75 | 
 76 |         self.LFR = None
 77 |         self.LMR = None
 78 | 
 79 |     def rank_product_score(self, X_dict, y):
 80 |         """
 81 |         Compute feature ranking of ensemble methods using LFR and LMR.
 82 | 
 83 |         Parameters
 84 |         ----------
 85 |         X_dict : dict
 86 |             Dictionary of X modalities. Keys and n_features
 87 |             must match those seen by EnsembleIntegration.fit_base().
 88 |         y : array of shape (n_samples,)
 89 |             Target vector relative to X.
 90 | 
 91 |         Returns
 92 |         -------
 93 |         self
 94 |             Feature ranking of ensemble methods
 95 |         """
 96 | 
 97 |         print("Interpreting ensembles...\n")
 98 | 
 99 |         if self.ensemble_predictor_keys == "all":
100 |             ensemble_predictor_keys = self.EI.ensemble_predictors.keys()
101 |         else:
102 |             ensemble_predictor_keys = self.ensemble_predictor_keys
103 | 
104 |         if self.LFR is None:
105 |             self.local_feature_rank(X_dict, y_to_numpy(y))
106 | 
107 |         if self.LMR is None:
108 |             self.local_model_rank(ensemble_predictor_keys=ensemble_predictor_keys)
109 | 
110 |         print("Calculating combined rank product score...")
111 | 
112 |         feature_ranking_list = {}
113 |         self.merged_lmr_lfr = {}
114 |         for model_name in ensemble_predictor_keys:
115 |             lmr_interest = self.LMR[self.LMR["ensemble_method"] == model_name].copy()
116 |             self.merged_lmr_lfr[model_name] = pd.merge(
117 |                 lmr_interest,
118 |                 self.LFR,
119 |                 how="right",
120 |                 left_on=["base predictor", "modality"],
121 |                 right_on=["base predictor", "modality"],
122 |             )
123 | 
124 |             self.merged_lmr_lfr[model_name]["LMR_LFR_product"] = (
125 |                 self.merged_lmr_lfr[model_name]["LMR"]
126 |                 * self.merged_lmr_lfr[model_name]["LFR"]
127 |             )
128 |             # take mean of LMR*LFR for each feature
129 |             RPS_list = {"modality": [], "feature": [], "RPS": []}
130 | 
131 |             for modal in self.merged_lmr_lfr[model_name]["modality"].unique():
132 |                 merged_lmr_lfr_modal = self.merged_lmr_lfr[model_name].loc[
133 |                     self.merged_lmr_lfr[model_name]["modality"] == modal
134 |                 ]
135 |                 for feat in merged_lmr_lfr_modal["local_feature_id"].unique():
136 |                     RPS_list["modality"].append(modal)
137 |                     RPS_list["feature"].append(feat)
138 |                     RPS_list["RPS"].append(
139 |                         merged_lmr_lfr_modal.loc[
140 |                             merged_lmr_lfr_modal["local_feature_id"] == feat,
141 |                             "LMR_LFR_product",
142 |                         ].mean()
143 |                     )
144 |             RPS_df = pd.DataFrame(RPS_list)
145 |             RPS_df["feature rank"] = RPS_df["RPS"].rank(ascending=True)
146 |             RPS_df["ensemble method"] = model_name
147 |             RPS_df.sort_values(by="feature rank", inplace=True)
148 |             feature_ranking_list[model_name] = RPS_df
149 |         self.ensemble_feature_ranking = feature_ranking_list
150 |         print("... complete!")
151 | 
152 |         return self
153 | 
154 |     def local_feature_rank(self, X_dict, y):
155 |         """
156 |         Local Feature Ranks (LFRs) for each base predictor
157 | 
158 |         Parameters
159 |         ----------
160 |         X_dict : dict
161 |             Dictionary of X modalities. Keys and n_features
162 |             must match those seen by EnsembleIntegration.fit_base().
163 |         y : array of shape (n_samples,)
164 |             Target vector relative to X.
165 | 
166 |         Returns
167 |         -------
168 |         self
169 |             Local feature ranks.
170 |         """
171 | 
172 |         importance_list = []
173 | 
174 |         for modality_name in tqdm(
175 |             self.EI.modality_names,
176 |             desc="Calculating local feature ranks",
177 |             bar_format=bar_format,
178 |         ):
179 |             X = X_dict[modality_name]
180 |             X, feature_names = X_to_numpy(X)
181 | 
182 |             # check feature names were seen during training
183 |             if len(self.EI.feature_names[modality_name]) > 1:
184 |                 # check feature names are the same and warn if not
185 |                 if self.EI.feature_names[modality_name] != feature_names:
186 |                     warnings.warn(
187 |                         "Feature names do not match those seen during training",
188 |                         category=Warning,
189 |                         stacklevel=2,
190 |                     )
191 |             else:
192 |                 # check if features have been passed now
193 |                 if len(feature_names) > 1:
194 |                     warnings.warn(
195 |                         """Feature names have been passed to interpreter but none
196 |                         were seen during training.""",
197 |                         category=Warning,
198 |                         stacklevel=2,
199 |                     )
200 | 
201 |             # if no feature names passed assign an id
202 |             if len(feature_names) != X.shape[1]:
203 |                 feature_names = np.arange(X.shape[1])
204 | 
205 |             base_models = copy.deepcopy(
206 |                 self.EI.final_models["base models"][modality_name]
207 |             )
208 | 
209 |             base_models = sorted(base_models, key=itemgetter("model name"))
210 | 
211 |             for _key, base_models_per_sample in groupby(
212 |                 base_models, key=itemgetter("model name")
213 |             ):
214 |                 list_of_base_models = []
215 | 
216 |                 for base_model_dict in base_models_per_sample:
217 |                     base_model = pickle.loads(base_model_dict["pickled model"])
218 |                     list_of_base_models.append(
219 |                         (
220 |                             str(base_model_dict["sample id"]),
221 |                             base_model,
222 |                         )
223 |                     )  # list of tuples for VotingClassifier
224 | 
225 |                 if (
226 |                     len(list_of_base_models) > 1
227 |                 ):  # take mean of base predictors with different sample ids
228 |                     ###################################################################
229 |                     #  This code is a work around and may be fragile. We use VotingClassifier
230 |                     # to combine models trained on different samples (taking a mean of model
231 |                     # output). The current sklearn implementation of VotingClassifier does not
232 |                     # accept pretrained models, so we set parameters ourselves to allow it. In
233 |                     # the future it may be possible to use VotingClassifier alone without
234 |                     # additional code. An sklearn-like model is needed to be passed to
235 |                     # permutation_importance.
236 | 
237 |                     model = VotingClassifier(
238 |                         estimators=list_of_base_models,
239 |                         voting="soft",
240 |                         weights=np.ones(len(list_of_base_models)),
241 |                     )  # average predictions of models built on different data samples
242 | 
243 |                     model.estimators_ = [j for _, j in list_of_base_models]
244 |                     model.le_ = LabelEncoder().fit(y)
245 |                     model.classes_ = model.le_.classes_
246 | 
247 |                     ##################################################################
248 | 
249 |                 else:
250 |                     model = list_of_base_models[0][1]
251 | 
252 |                 needs_proba = hasattr(base_model, "predict_proba")
253 |                 scorer_ = make_scorer(
254 |                     self.metric,
255 |                     greater_is_better=self.metric_greater_is_better,
256 |                     needs_proba=needs_proba,
257 |                 )
258 | 
259 |                 pi = permutation_importance(
260 |                     estimator=model,
261 |                     X=X,
262 |                     y=y,
263 |                     n_repeats=self.n_repeats,
264 |                     n_jobs=self.n_jobs,
265 |                     random_state=self.EI.random_state,
266 |                     scoring=scorer_,
267 |                 )
268 | 
269 |                 pi_df = pd.DataFrame(
270 |                     {
271 |                         "local_importance_mean": pi.importances_mean,
272 |                         "local_importance_std": pi.importances_std,
273 |                         "local_feature_id": feature_names,
274 |                     }
275 |                 )
276 | 
277 |                 pi_df["base predictor"] = base_model_dict["model name"]
278 |                 pi_df["modality"] = modality_name
279 |                 pi_df["LFR"] = pi_df["local_importance_mean"].rank(
280 |                     pct=True, ascending=False
281 |                 )
282 |                 importance_list.append(pi_df)
283 | 
284 |         self.LFR = pd.concat(importance_list)
285 | 
286 |         return self
287 | 
288 |     def local_model_rank(self, ensemble_predictor_keys):
289 |         """
290 |         Local Model Ranks (LMRs)
291 | 
292 |         Parameters
293 |         ----------
294 |         ensemble_predictor_keys : list of str
295 |             List of ensemble predictor keys that will be used to select
296 |             ensembles classifiers to interpret.
297 | 
298 |         Returns
299 |         -------
300 |         self
301 |             Local model ranks.
302 |         """
303 |         #  load ensemble training data from EI training
304 | 
305 |         ensemble_X_train, ensemble_y_train = retrieve_X_y(
306 |             labelled_data=self.EI.ensemble_training_data_final[0]
307 |         )
308 | 
309 |         if self.EI.sampling_aggregation == "mean":
310 |             ensemble_X_train = ensemble_X_train.T.groupby(level=[0, 1]).mean().T
311 | 
312 |         #  calculate importance for ensemble models of interest
313 | 
314 |         lm_pi_list = []
315 | 
316 |         ensemble_models = copy.deepcopy(self.EI.final_models["ensemble models"])
317 | 
318 |         ensemble_models = [ensemble_models[key] for key in ensemble_predictor_keys]
319 | 
320 |         ensemble_models = dict(zip(ensemble_predictor_keys, ensemble_models))
321 | 
322 |         for model_name, model in tqdm(
323 |             ensemble_models.items(),
324 |             desc="Calculating local model ranks",
325 |             bar_format=bar_format,
326 |         ):
327 |             ensemble_predictor = pickle.loads(model)
328 | 
329 |             if ("Mean" in model_name) or ("Median" in model_name):
330 |                 importances_mean = np.ones(len(ensemble_X_train.columns))
331 |                 importances_std = np.zeros(len(ensemble_X_train.columns))
332 | 
333 |             elif model_name == "CES":
334 |                 model_selected_freq = []
335 |                 for bp in ensemble_X_train.columns:
336 |                     model_selected_freq.append(
337 |                         ensemble_predictor.selected_ensemble.count(bp)
338 |                     )
339 |                 importances_mean = model_selected_freq
340 |                 importances_std = np.ones(len(ensemble_X_train.columns)) * np.nan
341 | 
342 |             else:
343 |                 needs_proba = hasattr(model, "predict_proba")
344 |                 scorer_ = make_scorer(
345 |                     self.metric,
346 |                     greater_is_better=self.metric_greater_is_better,
347 |                     needs_proba=needs_proba,
348 |                 )
349 |                 pi = permutation_importance(
350 |                     estimator=ensemble_predictor,
351 |                     X=ensemble_X_train,
352 |                     y=ensemble_y_train,
353 |                     n_repeats=self.n_repeats,
354 |                     n_jobs=-1,
355 |                     random_state=self.EI.random_state,
356 |                     scoring=scorer_,
357 |                 )
358 | 
359 |                 importances_mean = pi.importances_mean
360 |                 importances_std = pi.importances_std
361 | 
362 |             pi_df = pd.DataFrame(
363 |                 {
364 |                     "local_importance_mean": importances_mean,
365 |                     "local_importance_std": importances_std,
366 |                     "base predictor": [
367 |                         column_name[1] for column_name in ensemble_X_train.columns
368 |                     ],
369 |                     "modality": [
370 |                         column_name[0] for column_name in ensemble_X_train.columns
371 |                     ],
372 |                 }
373 |             )
374 | 
375 |             pi_df["ensemble_method"] = model_name
376 |             pi_df["LMR"] = pi_df["local_importance_mean"].rank(
377 |                 pct=True, ascending=False
378 |             )
379 |             lm_pi_list.append(pi_df)
380 |         self.LMR = pd.concat(lm_pi_list)
381 | 
382 |         return self
383 | 


--------------------------------------------------------------------------------
/eipy/metrics.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import inspect
  4 | from eipy.utils import minority_class
  5 | from sklearn.metrics import roc_auc_score, precision_recall_curve
  6 | 
  7 | 
  8 | def fmax_score(y_test, y_score, beta=1.0, pos_label=1):
  9 |     fmax_score, _, _, threshold_fmax = fmax_precision_recall_threshold(
 10 |         y_test, y_score, beta=beta, pos_label=pos_label
 11 |     )
 12 |     return fmax_score, threshold_fmax
 13 | 
 14 | 
 15 | def fmax_precision_recall_threshold(labels, y_score, beta=1.0, pos_label=1):
 16 |     """
 17 |     Radivojac, P. et al. (2013). A Large-Scale Evaluation of Computational Protein
 18 |     Function Prediction. Nature Methods, 10(3), 221-227.
 19 |     Manning, C. D. et al. (2008). Evaluation in Information Retrieval. In
 20 |     Introduction to Information Retrieval. Cambridge University Press.
 21 |     """
 22 |     if pos_label == 0:
 23 |         labels = 1 - np.array(labels)
 24 |         y_score = 1 - np.array(y_score)
 25 | 
 26 |     precision_scores, recall_scores, thresholds = precision_recall_curve(
 27 |         labels, y_score
 28 |     )
 29 | 
 30 |     np.seterr(divide="ignore", invalid="ignore")
 31 |     f_scores = (
 32 |         (1 + beta**2)
 33 |         * (precision_scores * recall_scores)
 34 |         / ((beta**2 * precision_scores) + recall_scores)
 35 |     )
 36 | 
 37 |     arg_fmax = np.nanargmax(f_scores)
 38 | 
 39 |     fmax_score = f_scores[arg_fmax]
 40 |     precision_fmax = precision_scores[arg_fmax]
 41 |     recall_fmax = recall_scores[arg_fmax]
 42 |     threshold_fmax = thresholds[arg_fmax]
 43 | 
 44 |     return fmax_score, precision_fmax, recall_fmax, threshold_fmax
 45 | 
 46 | 
 47 | def try_metric_with_pos_label(y_true, y_pred, metric, pos_label):
 48 |     """
 49 |     Compute score for a given metric.
 50 |     """
 51 |     if "pos_label" in inspect.signature(metric).parameters:
 52 |         score = metric(y_true, y_pred, pos_label=pos_label)
 53 |     else:
 54 |         score = metric(y_true, y_pred)
 55 |     return score
 56 | 
 57 | 
 58 | def scores(y_true, y_pred, metrics):
 59 |     """
 60 |     Compute all metrics for a single set of predictions. Returns a dictionary
 61 |     containing metric keys, each paired to a tuple (score, threshold).
 62 |     """
 63 | 
 64 |     # default metrics to calculate
 65 |     if metrics is None:
 66 |         metrics = {"fmax (minority)": fmax_score, "auc": roc_auc_score}
 67 | 
 68 |     pos_label = minority_class(y_true)  # gives value 1 or 0
 69 | 
 70 |     metric_threshold_dict = {}
 71 | 
 72 |     for metric_key, metric in metrics.items():
 73 |         # if y_pred parameter exists in metric function then y
 74 |         # should be target prediction vector
 75 |         if "y_pred" in inspect.signature(metric).parameters:
 76 |             # calculate metric for target vector with threshold=0.5
 77 |             metric_threshold_dict[metric_key] = (
 78 |                 try_metric_with_pos_label(
 79 |                     y_true, (np.array(y_pred) >= 0.5).astype(int), metric, pos_label
 80 |                 ),
 81 |                 0.5,
 82 |             )
 83 |         # if y_score parameter exists in metric function then y should be probability vector
 84 |         elif "y_score" in inspect.signature(metric).parameters:
 85 |             metric_results = try_metric_with_pos_label(
 86 |                 y_true, y_pred, metric, pos_label
 87 |             )
 88 |             if isinstance(
 89 |                 metric_results, tuple
 90 |             ):  # if metric includes threshold value as tuple
 91 |                 metric_threshold_dict[metric_key] = metric_results
 92 |             else:  # add np.nan threshold if not outputted
 93 |                 metric_threshold_dict[metric_key] = metric_results, np.nan
 94 | 
 95 |     return metric_threshold_dict
 96 | 
 97 | 
 98 | def scores_matrix(X, labels, metrics):
 99 |     """
100 |     Calculate metrics and threshold (if applicable) for each column
101 |     (set of predictions) in matrix X
102 |     """
103 | 
104 |     scores_dict = {}
105 |     for column in X.columns:
106 |         column_temp = X[column]
107 |         metrics_per_column = scores(labels, column_temp, metrics)
108 |         # metric_names = list(metrics.keys())
109 |         for metric_key in metrics_per_column.keys():
110 |             if not (metric_key in scores_dict):
111 |                 scores_dict[metric_key] = [metrics_per_column[metric_key]]
112 |             else:
113 |                 scores_dict[metric_key].append(metrics_per_column[metric_key])
114 | 
115 |     return scores_dict
116 | 
117 | 
def create_metric_threshold_dataframes(X, labels, metrics):
    """
    Build two dataframes sharing the columns of X: one row of scores and one
    row of thresholds per metric. thresholds_df contains NaN if threshold
    not applicable.
    """

    per_metric = scores_matrix(X, labels, metrics)

    metrics_df = pd.DataFrame(columns=X.columns)
    thresholds_df = pd.DataFrame(columns=X.columns)
    for metric_key, pairs in per_metric.items():
        # split the (score, threshold) pairs into a row of each
        metric_row, threshold_row = list(zip(*pairs))
        metrics_df.loc[metric_key] = metric_row
        thresholds_df.loc[metric_key] = threshold_row
    return metrics_df, thresholds_df
131 | 
132 | 
def create_metric_threshold_dict(X, labels, metrics):
    """
    Return the metric and threshold dataframes for X in a dictionary with
    the keys "metrics" and "thresholds".
    """
    metrics_df, thresholds_df = create_metric_threshold_dataframes(
        X, labels, metrics
    )
    return {"metrics": metrics_df, "thresholds": thresholds_df}
139 | 
140 | 
def base_summary(ensemble_test_dataframes, metrics):
    """
    Create a base predictor performance summary by concatenating data across test folds

    Parameters
    ----------
    ensemble_test_dataframes : list of pandas.DataFrame
        One dataframe per test fold. Each has a "labels" column and
        prediction columns indexed by a MultiIndex whose first two levels
        identify the base predictor; remaining levels are averaged out.
    metrics : dict or None
        Forwarded to ``create_metric_threshold_dict``.

    Returns
    -------
    The value returned by ``create_metric_threshold_dict`` for the averaged,
    concatenated predictions and the concatenated labels.
    """
    labels = pd.concat([df["labels"] for df in ensemble_test_dataframes])
    ensemble_test_averaged_samples = pd.concat(
        [
            # Group the transposed frame rather than passing axis=1, which is
            # deprecated in pandas; the result is the same table.
            df.drop(columns=["labels"], level=0).T.groupby(level=[0, 1]).mean().T
            for df in ensemble_test_dataframes
        ]
    )
    return create_metric_threshold_dict(ensemble_test_averaged_samples, labels, metrics)
153 | 
154 | 
def ensemble_summary(ensemble_predictions, metrics):
    """
    Summarise ensemble performance: every column of ensemble_predictions other
    than "labels" is scored against the "labels" column.
    """
    true_labels = ensemble_predictions["labels"]
    predictions = ensemble_predictions.drop(columns=["labels"])
    return create_metric_threshold_dict(predictions, true_labels, metrics)
159 | 
160 | 
161 | # These two functions are an attempt at maximizing/minimizing any metric
162 | # def metric_scaler_function(arg, y_true, y_pred, metric, pos_label, multiplier):
163 | #         threshold = np.sort(np.unique(y_pred))[int(np.round(arg))]
164 | #         y_binary = (y_pred >= threshold).astype(int)
165 | #         return multiplier * try_metric_with_pos_label(y_true, y_binary, metric, pos_label)
166 | 
167 | # def max_min_score(y_true, y_pred, metric, pos_label, max_min):
168 | #     '''
169 | #     Compute maximized/minimized score for a given metric.
170 | #     '''
171 | 
172 | #     if max_min=='max':
173 | #         multiplier = -1
174 | #     elif max_min=='min':
175 | #         multiplier = 1
176 | 
177 | #     optimized_result = minimize_scalar(
178 | #                                         metric_scaler_function,
179 | #                                         args=(y_true, y_pred, metric, pos_label, multiplier),
180 | #                                         bounds=(0, len(np.unique(y_pred))-1),
181 | #                                         method='bounded'
182 | #                                         )
183 | 
184 | #     threshold = np.sort(np.unique(y_pred))[int(np.round(optimized_result.x))]
185 | #     score = multiplier * optimized_result.fun
186 | 
187 | #     return score, threshold
188 | #
189 | 


--------------------------------------------------------------------------------
/eipy/utils.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import random
  4 | 
  5 | from imblearn.under_sampling import RandomUnderSampler
  6 | from imblearn.over_sampling import RandomOverSampler
  7 | 
  8 | # from tensorflow.keras.backend import clear_session
  9 | import warnings
 10 | from sklearn.pipeline import Pipeline
 11 | from sklearn.exceptions import UndefinedMetricWarning
 12 | 
# Silence scikit-learn's UndefinedMetricWarning globally once this module is
# imported.
warnings.filterwarnings(action="ignore", category=UndefinedMetricWarning)

# Progress-bar layout: description, the bar itself, then an integer percentage.
# NOTE(review): presumably passed to tqdm as `bar_format` -- confirm at call sites.
bar_format = "{desc}: |{bar}|{percentage:3.0f}%"
 16 | 
 17 | 
 18 | def minority_class(y_true):
 19 |     if np.bincount(y_true)[0] < np.bincount(y_true)[1]:
 20 |         minority_class = 0
 21 |     else:
 22 |         minority_class = 1
 23 |     return minority_class
 24 | 
 25 | 
 26 | def set_predictor_seeds(base_predictors, random_state):
 27 |     for _, v in base_predictors.items():
 28 |         if type(v) == Pipeline:
 29 |             est_ = list(v.named_steps)[-1]
 30 |             if hasattr(v[est_], "random_state") and hasattr(v[est_], "set_params"):
 31 |                 v.set_params(**{"{}__random_state".format(est_): random_state})
 32 |         if hasattr(v, "random_state") and hasattr(v, "set_params"):
 33 |             v.set_params(**{"random_state": random_state})
 34 | 
 35 | 
 36 | def X_is_dict(X):
 37 |     if isinstance(X, dict):
 38 |         return True
 39 |     else:
 40 |         return False
 41 | 
 42 | 
 43 | def X_dict_to_numpy(X_dict):
 44 |     """
 45 |     Retrieve feature names and convert arrays to numpy.
 46 |     """
 47 |     X_dict_numpy = {}
 48 |     feature_names = {}
 49 |     for key, X in X_dict.items():
 50 |         X_dict_numpy[key], feature_names[key] = X_to_numpy(X)
 51 |     return X_dict_numpy, feature_names
 52 | 
 53 | 
 54 | def X_to_numpy(X):
 55 |     """
 56 |     Return X as a numpy array, with feature names if applicable.
 57 |     """
 58 |     if isinstance(X, np.ndarray):
 59 |         return X, []
 60 |     elif isinstance(X, pd.DataFrame):
 61 |         return X.to_numpy(), X.columns.to_list()
 62 |     else:
 63 |         raise TypeError(
 64 |             """Object must be a numpy array, a pandas dataframe
 65 |             or a dictionary containing either."""
 66 |         )
 67 | 
 68 | 
 69 | def y_to_numpy(y):
 70 |     """
 71 |     Check y is numpy array and convert if not.
 72 |     """
 73 |     _y = None
 74 |     if isinstance(y, np.ndarray):
 75 |         _y = y
 76 |     elif isinstance(y, list):
 77 |         _y = np.array(y)
 78 |     elif isinstance(y, (pd.Series)):
 79 |         _y = y.to_numpy()
 80 |     elif isinstance(y, (pd.DataFrame)):
 81 |         _y = np.squeeze(y.to_numpy())
 82 |     else:
 83 |         raise TypeError(
 84 |             """Object must be a numpy array, list
 85 |             or pandas Series."""
 86 |         )
 87 | 
 88 |     if not is_binary_array(_y):
 89 |         raise ValueError("y must contain binary values.")
 90 | 
 91 |     return _y
 92 | 
 93 | 
 94 | def is_binary_array(arr):
 95 |     if all(x == 0 or x == 1 or x == 0.0 or x == 1.0 for x in arr):
 96 |         return True
 97 |     else:
 98 |         return False
 99 | 
100 | 
class dummy_cv:
    """
    Minimal stand-in for a cross-validation splitter: split() yields a single
    (train, test) pair in which every sample index is in the training set and
    the test set is an empty list.
    """

    def __init__(self, n_splits=1):
        self.n_splits = n_splits

    def get_n_splits(self, X, y, groups=None):
        """Return the n_splits value given at construction."""
        return self.n_splits

    def split(self, X, y, groups=None):
        """Yield all row indices of X as the training set, with no test rows."""
        all_indices = np.arange(len(X))
        yield all_indices, []
111 | 
112 | 
def safe_predict_proba(model, X):
    """
    Predict with model on X, preferring probabilities.

    When the model has a predict_proba method, column 1 of its output is
    returned; otherwise the result of predict is returned.
    """
    if not hasattr(model, "predict_proba"):
        return model.predict(X)
    return model.predict_proba(X)[:, 1]
119 | 
120 | 
def random_integers(n_integers=1, seed=42):
    """
    Return n_integers distinct integers drawn from range(0, 10000).

    The module-level `random` generator is re-seeded with `seed` first, so the
    result is reproducible for a given seed.
    """
    random.seed(seed)
    candidate_pool = range(0, 10000)
    return random.sample(candidate_pool, n_integers)
124 | 
125 | 
def sample(X, y, strategy, random_state):
    """
    Resample (X, y) according to the chosen class-balancing strategy.

    Parameters
    ----------
    X : array of features (indexed with boolean masks in the hybrid branch).
    y : numpy array of binary labels.
    strategy : None, "undersampling", "oversampling" or "hybrid"
        None returns the inputs untouched. "undersampling" and "oversampling"
        delegate to imblearn's RandomUnderSampler / RandomOverSampler.
        "hybrid" takes the majority-class rows from an undersampled copy and
        the minority-class rows from an oversampled copy; the sampling ratios
        are derived from half of the original sample count.
    random_state : seed forwarded to the imblearn samplers.

    Returns
    -------
    (X_resampled, y_resampled)

    Raises
    ------
    ValueError
        If strategy is not one of the supported values (previously this
        surfaced as an UnboundLocalError at the return statement).
    """
    if strategy is None:
        return X, y

    if strategy == "undersampling":
        sampler = RandomUnderSampler(random_state=random_state)
        X_resampled, y_resampled = sampler.fit_resample(X=X, y=y)
        return X_resampled, y_resampled

    if strategy == "oversampling":
        sampler = RandomOverSampler(random_state=random_state)
        X_resampled, y_resampled = sampler.fit_resample(X=X, y=y)
        return X_resampled, y_resampled

    if strategy == "hybrid":
        y_total = y.shape[0]
        y_pos = float(np.sum(y == 1))
        if (y_pos / y_total) < 0.5:
            maj_class, y_min_count, y_maj_count = 0, y_pos, y_total - y_pos
        else:
            maj_class, y_min_count, y_maj_count = 1, y_total - y_pos, y_pos

        half_total = y_total / 2
        rus = RandomUnderSampler(
            random_state=random_state, sampling_strategy=y_min_count / half_total
        )
        ros = RandomOverSampler(
            random_state=random_state, sampling_strategy=half_total / y_maj_count
        )

        # Keep only the majority rows of the undersampled data ...
        X_maj, y_maj = rus.fit_resample(X=X, y=y)
        maj_mask = y_maj == maj_class
        X_maj, y_maj = X_maj[maj_mask], y_maj[maj_mask]

        # ... and only the minority rows of the oversampled data.
        X_min, y_min = ros.fit_resample(X=X, y=y)
        min_mask = y_min != maj_class
        X_min, y_min = X_min[min_mask], y_min[min_mask]

        X_resampled = np.concatenate([X_maj, X_min])
        y_resampled = np.concatenate([y_maj, y_min])
        return X_resampled, y_resampled

    raise ValueError(
        "Unknown sampling strategy: {!r}. Expected None, 'undersampling', "
        "'oversampling' or 'hybrid'.".format(strategy)
    )
162 | 
163 | 
def retrieve_X_y(labelled_data):
    """
    Split labelled_data into features and labels.

    Returns (X, y): X is labelled_data without the "labels" entry at level 0
    of its column index, and y is the "labels" data flattened to a 1-D array.
    """
    label_values = np.ravel(labelled_data["labels"])
    features = labelled_data.drop(columns=["labels"], level=0)
    return features, label_values
168 | 
169 | 
def append_modality(current_data, modality_data, model_building=False):
    """
    Append a new modality's per-fold dataframes to the data collected so far.

    Parameters
    ----------
    current_data : list of pandas.DataFrame or None
        Per-fold data gathered from earlier modalities. None means this is the
        first modality, in which case modality_data is returned as-is.
    modality_data : list of pandas.DataFrame
        Per-fold data for the new modality, indexed by fold position.
    model_building : bool, default False
        When False, the last column of each dataframe is treated as the labels
        column: the labels of both sides must match, and the existing labels
        column is dropped so that only the new modality's copy remains.
        When True, the dataframes are concatenated column-wise unchanged.

    Returns
    -------
    list of pandas.DataFrame (or modality_data itself when current_data is None)

    Raises
    ------
    ValueError
        If, with model_building False, the labels of any fold differ between
        the existing data and the new modality. Previously a mismatch was only
        detected when *every* label differed, and then merely printed before
        returning a truncated list.
    """
    if current_data is None:
        return modality_data

    combined_dataframe = []
    for fold, dataframe in enumerate(current_data):
        new_fold = modality_data[fold]
        if model_building:
            combined_dataframe.append(pd.concat((dataframe, new_fold), axis=1))
            continue

        existing_labels = dataframe.iloc[:, -1].to_numpy()
        new_labels = new_fold.iloc[:, -1].to_numpy()
        if not np.array_equal(existing_labels, new_labels):
            raise ValueError(
                "Labels do not match across modalities (fold {}).".format(fold)
            )
        combined_dataframe.append(
            pd.concat((dataframe.iloc[:, :-1], new_fold), axis=1)
        )
    return combined_dataframe
193 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"]
 3 | build-backend = "poetry_dynamic_versioning.backend"
 4 | 
 5 | [tool.poetry]
 6 | name = "ensemble-integration"
 7 | version = "0.0.0"
 8 | readme = "README.rst"
 9 | description = "Ensemble Integration: a customizable pipeline for generating multi-modal, heterogeneous ensembles"
10 | authors = ["Jamie Bennett", "Yan Chak (Richard) Li", "Aviad Susman", "Gaurav Pandey"]
11 | license = "GNU General Public License version 3"
12 | classifiers = [
13 |     "Programming Language :: Python :: 3",
14 |     "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
15 |     "Operating System :: OS Independent",
16 |     "Operating System :: Microsoft :: Windows",
17 |     "Operating System :: POSIX :: Linux",
18 |     "Operating System :: MacOS",
19 | ]
20 | packages = [
21 |     { include = "eipy"},
22 | ]
23 | 
24 | [tool.poetry.urls]
25 | "Homepage" = "https://github.com/GauravPandeyLab/eipy"
26 | "Documentation" = "https://eipy.readthedocs.io/en/latest/"
27 | 
28 | [tool.poetry.dependencies]
29 | python = ">=3.8"
30 | imbalanced-learn = ">=0.11"
31 | joblib = ">=1.3"
32 | numpy = ">=1.24"
33 | pandas = ">=1.4"
34 | scikit-learn = ">=1.2,<1.3"
35 | scipy = {version = ">=1.0,<1.12", python = ">=3.8,<3.13"}
36 | xgboost = ">=1.7"
37 | pandoc = "^2.3"
38 | dill = "^0.3.7"
39 | wget = "^3.2"
40 | tqdm = "^4.66.2"
41 | 
42 | [tool.poetry.group.dev.dependencies]
43 | pytest = ">=6.0"
44 | flake8 = ">=3.9"
45 | flake8-bugbear = ">=23"
46 | tox = ">=3.9"
47 | pytest-cov = ">=3.0"
48 | black = ">=23.0"
49 | ipykernel = "^6.25.2"
50 | flask = "^3.0.2"
51 | 
52 | [tool.poetry.group.docs]
53 | optional = true
54 | 
55 | [tool.poetry.group.docs.dependencies]
56 | sphinx = "^4.2"
57 | sphinx-autodoc-typehints = "^1.12"
58 | sphinx_rtd_theme = "1.3.0"
59 | nbsphinx = "0.9.3"
60 | sphinx-copybutton = "0.5.2"
61 | numpydoc = "1.5.0"
62 | 
63 | [tool.pytest.ini_options]
64 | addopts = "--cov=eipy --cov-report xml --cov-append"
65 | testpaths = [
66 |     "tests"
67 | ]
68 | 
69 | [tool.poetry-dynamic-versioning]
70 | enable = true
71 | vcs = "git"
72 | style = "semver"
73 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 88
3 | select = C,E,F,W,B,B950
4 | extend-ignore = E203, E501, W503


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | 
# Run setuptools' setup() with no explicit arguments when executed as a script.
# NOTE(review): package metadata appears to be declared in pyproject.toml
# (Poetry) -- confirm this shim is still required.
if __name__ == "__main__":
    setup()


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GauravPandeyLab/eipy/4fdaceefec4c3090e17a5be6e8c582d0465cc0a3/tests/__init__.py


--------------------------------------------------------------------------------
/tests/test_ei.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | 
  3 | @pytest.mark.parametrize(
  4 |     "sampling_strategy, dtype",
  5 |     [   
  6 |         (None, "numpy_array"),
  7 |         ("undersampling", "numpy_array"),
  8 |         ("oversampling", "numpy_array"),
  9 |         ("hybrid", "numpy_array"),
 10 |         ("undersampling", "pandas_df")
 11 |     ],
 12 | )
 13 | 
 14 | def test_ensemble_integration(sampling_strategy, dtype):
 15 | 
 16 |     from sklearn.linear_model import LogisticRegression
 17 |     from sklearn.pipeline import Pipeline
 18 |     from sklearn.preprocessing import StandardScaler
 19 |     from xgboost import XGBClassifier
 20 |     from sklearn.datasets import make_classification
 21 |     from eipy.ei import EnsembleIntegration
 22 |     from eipy.additional_ensembles import MeanAggregation, MedianAggregation, CES
 23 |     import pandas as pd
 24 |     from sklearn.metrics import roc_auc_score
 25 |     from eipy.metrics import fmax_score
 26 | 
 27 |     # Generate toy data for testing
 28 |     X, y = make_classification(n_samples=100, n_features=10, n_classes=2, weights=[0.6, 0.4], n_redundant=0)
 29 |     
 30 |     X_1 = X[:, :4]
 31 |     X_2 = X[:, 4:]
 32 | 
 33 |     if dtype=="numpy_array":
 34 |         modalities = {
 35 |                     "modality_1": X_1,
 36 |                     "modality_2": X_2
 37 |                     }
 38 |     elif dtype=="pandas_df":
 39 |         modalities = {
 40 |                     "modality_1": pd.DataFrame(X_1, columns=['a', 'b', 'c', 'd']),
 41 |                     "modality_2": pd.DataFrame(X_2, columns=['e', 'f', 'g', 'h', 'i', 'j']),
 42 |                     }
 43 | 
 44 |     # Create base predictor models
 45 |     base_predictors = {
 46 |         'LR': Pipeline([('scaler', StandardScaler()), ('lr', LogisticRegression())]),
 47 |         'XGB': XGBClassifier()
 48 |     }
 49 | 
 50 |     metrics = {
 51 |         'f_max': fmax_score,
 52 |         'auc': roc_auc_score
 53 |     }
 54 | 
 55 |     # Initialize EnsembleIntegration
 56 |     EI = EnsembleIntegration(base_predictors=base_predictors,
 57 |                              k_outer=2,
 58 |                              k_inner=2,
 59 |                              n_samples=2,
 60 |                              sampling_strategy=sampling_strategy,
 61 |                              sampling_aggregation="mean",
 62 |                              n_jobs=-1,
 63 |                              metrics=metrics,
 64 |                              random_state=42,
 65 |                              project_name="demo",
 66 |                              model_building=True)
 67 | 
 68 |         # Train base models
 69 |     for name, modality in modalities.items():
 70 |         EI.fit_base(modality, y, base_predictors, modality_name=name)
 71 | 
 72 |     # Train ensemble models
 73 |     ensemble_predictors = {
 74 |         "Mean": MeanAggregation(),
 75 |         "Median": MedianAggregation(),
 76 |         "CES": CES(scoring=lambda y_test, y_pred: fmax_score(y_test, y_pred)[0]),
 77 |         "S.LR": Pipeline([('scaler', StandardScaler()), ('lr', LogisticRegression())]),
 78 |     }
 79 | 
 80 |     EI.fit_ensemble(ensemble_predictors=ensemble_predictors)
 81 | 
 82 |     # Predict
 83 |     EI.predict(modalities, ensemble_model_key='S.LR')
 84 | 
 85 |     # Assertions
 86 | 
 87 |     # Check if the trained base models and ensemble models are not None
 88 |     assert EI.base_summary is not None
 89 |     assert EI.ensemble_summary is not None
 90 |     assert EI.final_models is not {"base models": {}, "ensemble models": {}}
 91 | 
 92 |     from eipy.interpretation import PermutationInterpreter
 93 | 
 94 |     interpreter = PermutationInterpreter(
 95 |                                         EI=EI,
 96 |                                         metric=lambda y_test, y_pred: fmax_score(y_test, y_pred)[0],
 97 |                                         ensemble_predictor_keys=['S.LR', 'Mean'],
 98 |                                         n_repeats=1,
 99 |                                         n_jobs=1,
100 |                                         metric_greater_is_better=True
101 |                                         )
102 |     
103 |     interpreter.rank_product_score(X_dict=modalities, y=y)
104 | 
105 |     assert interpreter.ensemble_feature_ranking is not None
106 | 
107 |     if dtype=="pandas_df":
108 |         assert list(EI.feature_names.keys()) == ["modality_1", "modality_2"]
109 |         assert EI.feature_names["modality_1"] == ["a", "b", "c", "d"]
110 |         assert EI.feature_names["modality_2"] == ["e", "f", "g", "h", "i", "j"]


--------------------------------------------------------------------------------
/tests/test_load_data.py:
--------------------------------------------------------------------------------
1 | from eipy.datasets import load_diabetes
2 | import pytest
3 | 
def test_load_diabetes():
    """Smoke test: calling load_diabetes() completes without raising."""
    load_diabetes()


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py38, py39, py310, py311, black, flake8
 3 | isolated_build = true
 4 | 
 5 | [gh-actions]
 6 | python = 
 7 |     3.8: py38
 8 |     3.9: py39
 9 |     3.10: py310
10 |     3.11: py311
11 | 
12 | [testenv]
13 | setenv = 
14 |     PYTHONPATH = {toxinidir}
15 | allowlist_externals = poetry
16 | commands_pre =
17 |     poetry install --no-root --sync
18 | commands =
19 |     poetry run pytest tests/ --import-mode importlib
20 | 
21 | [testenv:black]
22 | basepython = python3.11
23 | commands = black eipy
24 | 
25 | [testenv:flake8]
26 | basepython = python3.11
27 | commands = flake8 eipy


--------------------------------------------------------------------------------