├── .coveragerc ├── .github └── workflows │ ├── python-package.yml │ └── python-publish.yml ├── .gitignore ├── .travis.yml ├── AUTHORS.md ├── LICENSE ├── README.md ├── polyssifier ├── __init__.py ├── logger.py ├── poly_utils.py ├── polyssifier.py └── report.py ├── requirements.txt ├── sample └── example.ipynb ├── setup.cfg ├── setup.py ├── tests ├── test_classification.py ├── test_multiclass.py ├── test_polynomial.py └── test_regression.py └── uploadPip.sh /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | */tests* 4 | parallel = False 5 | 6 | [report] 7 | exclude_lines = 8 | pragma: no cover 9 | def __repr__ 10 | raise NotImplementedError 11 | if __name__ == .__main__.: 12 | def parse_args 13 | def make_argument_parser 14 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.9"] 20 | 21 | steps: 22 | - uses: actions/checkout@v2 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v2 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest pytest-cov 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | - name: Lint with flake8 33 | run: | 34 | # stop the build if there are Python syntax errors or undefined names 35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 37 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 38 | - name: Test with pytest 39 | run: | 40 | python -m pytest --cov=polyssifier --cov-report xml tests 41 | 42 | - name: Upload coverage data to coveralls.io 43 | run: | 44 | python -m pip install coveralls==2.2 45 | coveralls --service=github 46 | env: 47 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | jobs: 16 | deploy: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: '3.x' 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install build 30 | - name: Build package 31 | run: python -m build 32 | - name: Publish package 33 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 34 | with: 35 | user: __token__ 36 | password: ${{ secrets.PYPI_API_TOKEN }} 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | temp*.png 43 | .ipynb* 44 | 45 | # Translations 46 | *.mo 47 | *.pot 48 | 49 | # Django stuff: 50 | *.log 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | # PyBuilder 56 | target/ 57 | 58 | # figures 59 | *.pdf 60 | *.svg 61 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # vim ft=yaml 2 | # travis-ci.org definition for Nimfa tests 3 | language: python 4 | 5 | sudo: false 6 | 7 | cache: pip 8 | apt: true 9 | 10 | addons: 11 | apt: 12 | packages: 13 | - build-essential 14 | - libatlas-dev 15 | - libatlas-base-dev 16 | - liblapack-dev 17 | - g++ 18 | - gfortran 19 | python: 20 | - "2.7" 21 | - "3.4" 22 | - "3.5" 23 | - "3.6" 24 | 25 | install: 26 | - pip install --upgrade pip setuptools wheel 27 | - travis_wait pip install --only-binary=numpy,scipy numpy scipy 28 | - pip install python-coveralls pytest-cov pytest matplotlib 29 | - pip install pandas scikit-learn 30 | 31 | before_script: 32 | - "export DISPLAY=:99.0" 33 | - "sh -e /etc/init.d/xvfb start" 34 | - sleep 3 # give xvfb some time to start 35 | - "export PYTHONPATH=$PYTHONPATH:." 36 | 37 | script: 38 | - cd "$TRAVIS_BUILD_DIR/tests"; py.test --cov=polyssifier 39 | 40 | after_success: 41 | - coveralls 42 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | Contributors in no particular order. 
2 | 3 | * [Alvaro Ulloa](https://github.com/alvarouc) 4 | * [Stephen Eyerly](https://github.com/seyerly) 5 | * [Vamsi Krishna](https://github.com/ismav) 6 | * [Devon Hjelm](https://github.com/rdevon) 7 | * [Sergey Pliz](https://github.com/pliz) 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 
58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 
176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 
287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | 341 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Coverage Status](https://coveralls.io/repos/github/alvarouc/polyssifier/badge.svg)](https://coveralls.io/github/alvarouc/polyssifier) 2 | ![example workflow](https://github.com/alvarouc/polyssifier/actions/workflows/python-package.yml/badge.svg) 3 | 4 | Polyssifier 5 | =========== 6 | 7 | Polyssifier runs a multitude of machine learning models on data. It reports scores, confusion matrices, predictions, and plots the scores ranked by classifier performance. 
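For example, a minimal end-to-end run on synthetic data (here generated with scikit-learn's `make_classification`; the options used below are described in the Features section) could look like this:

```python
from sklearn.datasets import make_classification
from polyssifier import poly

# Synthetic binary classification problem
data, label = make_classification(n_samples=200, n_features=20, random_state=0)
report = poly(data, label, n_folds=5, scoring='auc',
              exclude=['Multilayer Perceptron'], concurrency=1)
print(report.scores)      # per-fold train/test scores, one column pair per model
print(report.confusions)  # aggregated confusion matrix per classifier
```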
8 |
9 | ## Installation
10 | ```bash
11 | pip install polyssifier
12 | ```
13 |
14 | ## How to use
15 | ### For classification
16 | ```python
17 | from polyssifier import poly
18 | # Load data
19 | data = np.load("/path/to/data.npy")
20 | label = np.load("/path/to/labels.npy")
21 | # Run analysis
22 | report = poly(data, label, n_folds=8)
23 | # Plot results
24 | report.plot_scores()
25 | report.plot_features(ntop=10)
26 | ```
27 |
28 | ### For regression
29 | ```python
30 | from polyssifier import polyr
31 | # Load data
32 | data = np.load("/path/to/data.npy")
33 | target = np.load("/path/to/target.npy")
34 | # Run analysis
35 | report = polyr(data, target, n_folds=8)
36 | # Plot results
37 | report.plot_scores()
38 | report.plot_features(ntop=10)
39 | ```
40 |
41 | ### In the terminal
42 | ```bash
43 | poly data.npy label.npy --concurrency 10
44 | ```
45 |
46 | ### Requirements
47 | - scikit-learn
48 | - NumPy
49 | - Pandas
50 |
51 | ### Features
52 | - Cross-validated scores.
53 | - Reports F1 score (scoring='f1') or ROC AUC (scoring='auc') for classification
54 | - Reports MSE (scoring='mse') or R^2 (scoring='r2') for regression
55 | - Feature ranking for compatible models (Logistic Regression, Linear SVM, Random Forest)
56 | - Parallel processing.
57 |   - Control the number of parallel processes with 'concurrency'.
58 |   - We recommend setting concurrency to half the number of cores in your system.
59 | - Saves trained models for future use in case of server malfunction.
60 |   - Set project_name to identify an experiment.
61 | - Activate the feature selection step by setting
62 |   - feature_selection=True
63 | - Automatically scales your data with scale=True
64 |
65 | Example: on [sample/example.ipynb](sample/example.ipynb)
66 |
67 | It includes the following classifiers:
68 |
69 | - Multilayer Perceptron
70 | - Nearest Neighbors
71 | - Linear SVM
72 | - RBF SVM
73 | - Decision Tree
74 | - Random Forest
75 | - Logistic Regression
76 | - Naive Bayes
77 | - Voting Classifier
78 |
79 | and the following regressors:
80 |
81 | - Linear Regression
82 | - Bayesian Ridge
83 | - PassiveAggressiveRegressor
84 | - GaussianProcessRegressor
85 | - Ridge
86 | - Lasso
87 | - Lars
88 | - LassoLars
89 | - OrthogonalMatchingPursuit
90 | - ElasticNet
91 |
92 | You can exclude any of these models by providing a list of names as follows:
93 | ```python
94 | from polyssifier import poly
95 |
96 | report = poly(data, label, n_folds=8,
97 |               exclude=['Multilayer Perceptron'])
98 | ```
99 |
--------------------------------------------------------------------------------
/polyssifier/__init__.py:
--------------------------------------------------------------------------------
1 | from .polyssifier import poly, polyr
2 | from .poly_utils import build_regressors, build_classifiers
--------------------------------------------------------------------------------
/polyssifier/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 |
4 | def make_logger(name=''):
5 |     formatter = logging.Formatter(
6 |         '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
7 |     logger = logging.getLogger(name)
8 |     logger.setLevel(logging.DEBUG)
9 |     fh = logging.FileHandler('{}.log'.format(name))
10 |     fh.setLevel(logging.DEBUG)
11 |     fh.setFormatter(formatter)
12 |     ch = logging.StreamHandler()
13 |     ch.setLevel(logging.INFO)
14 |     ch.setFormatter(formatter)
15 |     # add the handlers to the logger
16 |     logger.addHandler(fh)
17 |     logger.addHandler(ch)
18 |
19 |     return logger
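# Illustrative usage: a logger from make_logger('experiment1') writes
# DEBUG-and-above records to 'experiment1.log' and echoes INFO-and-above
# records to the console:
#
#     logger = make_logger('experiment1')
#     logger.info('printed to the console and saved to experiment1.log')
#     logger.debug('saved to experiment1.log only')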
--------------------------------------------------------------------------------
/polyssifier/poly_utils.py:
--------------------------------------------------------------------------------
1 | from sklearn.neighbors import KNeighborsClassifier
2 | from sklearn.svm import LinearSVC, SVC
3 | from sklearn.tree import DecisionTreeClassifier
4 | from sklearn.ensemble import RandomForestClassifier
5 | from sklearn.linear_model import (LogisticRegression,
6 |                                   LinearRegression,
7 |                                   BayesianRidge,
8 |                                   Ridge, Lasso,
9 |                                   ElasticNet, Lars, LassoLars,
10 |                                   OrthogonalMatchingPursuit,
11 |                                   PassiveAggressiveRegressor)
12 | from sklearn.naive_bayes import GaussianNB
13 | from sklearn.neural_network import MLPClassifier as MLP
14 | from sklearn.gaussian_process import GaussianProcessRegressor
15 | import collections
16 | import numpy as np
17 | from sklearn.feature_selection import SelectKBest, f_regression
18 | from sklearn.pipeline import make_pipeline
19 | from sklearn.preprocessing import StandardScaler
20 | from sklearn.gaussian_process.kernels import RBF
21 |
22 |
23 | class MyVoter(object):
24 |     """
25 |     Voter Classifier
26 |     Receives fitted classifiers and runs majority voting
27 |     """
28 |
29 |     def __init__(self, estimators):
30 |         '''
31 |         estimators: List of fitted classifiers
32 |         '''
33 |         self.estimators_ = estimators
34 |
35 |     def predict(self, X):
36 |         predictions = np.asarray(
37 |             [clf.predict(X) for clf in self.estimators_]).T
38 |         maj = np.apply_along_axis(
39 |             lambda x: np.argmax(np.bincount(x)), axis=1,
40 |             arr=predictions.astype('int'))
41 |         return maj
42 |
43 |
44 | class MyRegressionAverager(object):
45 |     """
46 |     Regression averager
47 |     Receives fitted regressors and averages the predictions of the regressors.
48 |     """
49 |
50 |     def __init__(self, estimators):
51 |         '''
52 |         estimators: List of fitted regressors
53 |         '''
54 |         self.estimators_ = estimators
55 |
56 |     def predict(self, X):
57 |         predictions = np.asarray(
58 |             [reg.predict(X) for reg in self.estimators_]).T
59 |
60 |         avg = np.average(predictions, axis=1)
61 |         return avg
62 |
63 |
64 | class MyRegressionMedianer(object):
65 |     """
66 |     Regression medianer
67 |     Receives fitted regressors and takes the median of the regressors' predictions.
68 |     """
69 |
70 |     def __init__(self, estimators):
71 |         '''
72 |         estimators: List of fitted regressors
73 |         '''
74 |         self.estimators_ = estimators
75 |
76 |     def predict(self, X):
77 |         predictions = np.asarray(
78 |             [reg.predict(X) for reg in self.estimators_]).T
79 |
80 |         avg = np.median(predictions, axis=1)
81 |         return avg
82 |
83 |
84 | def build_classifiers(exclude, scale, feature_selection, nCols):
85 |     '''
86 |     Input:
87 |     - exclude: list of names of classifiers to exclude from the analysis
88 |     - scale: True or False. Scale data before fitting classifier
89 |     - feature_selection: True or False. Run feature selection before
90 |     fitting classifier
91 |     - nCols: Number of columns in input dataset to classifiers
92 |
93 |     Output:
94 |     Dictionary with classifier name as keys.
95 | - 'clf': Classifier object 96 | - 'parameters': Dictionary with parameters of 'clf' as keys 97 | ''' 98 | classifiers = collections.OrderedDict() 99 | 100 | if 'Multilayer Perceptron' not in exclude: 101 | classifiers['Multilayer Perceptron'] = { 102 | 'clf': MLP(), 103 | 'parameters': {'hidden_layer_sizes': [(100, 50), (50, 25)], 104 | 'max_iter': [500]} 105 | } 106 | 107 | if 'Nearest Neighbors' not in exclude: 108 | classifiers['Nearest Neighbors'] = { 109 | 'clf': KNeighborsClassifier(), 110 | 'parameters': {'n_neighbors': [1, 5, 10, 20]}} 111 | 112 | if 'SVM' not in exclude: 113 | classifiers['SVM'] = { 114 | 'clf': SVC(C=1, probability=True, cache_size=10000, 115 | class_weight='balanced'), 116 | 'parameters': {'kernel': ['rbf', 'poly'], 117 | 'C': [0.01, 0.1, 1]}} 118 | 119 | if 'Linear SVM' not in exclude: 120 | classifiers['Linear SVM'] = { 121 | 'clf': LinearSVC(dual=False, class_weight='balanced'), 122 | 'parameters': {'C': [0.01, 0.1, 1], 123 | 'penalty': ['l1', 'l2']}} 124 | 125 | if 'Decision Tree' not in exclude: 126 | classifiers['Decision Tree'] = { 127 | 'clf': DecisionTreeClassifier(max_depth=None, 128 | max_features='auto'), 129 | 'parameters': {}} 130 | 131 | if 'Random Forest' not in exclude: 132 | classifiers['Random Forest'] = { 133 | 'clf': RandomForestClassifier(max_depth=None, 134 | n_estimators=10, 135 | max_features='auto'), 136 | 'parameters': {'n_estimators': list(range(5, 20))}} 137 | 138 | if 'Logistic Regression' not in exclude: 139 | classifiers['Logistic Regression'] = { 140 | 'clf': LogisticRegression(fit_intercept=True, solver='lbfgs', 141 | penalty='l2'), 142 | 'parameters': {'C': [0.001, 0.1, 1]}} 143 | 144 | if 'Naive Bayes' not in exclude: 145 | classifiers['Naive Bayes'] = { 146 | 'clf': GaussianNB(), 147 | 'parameters': {}} 148 | # classifiers['Voting'] = {} 149 | 150 | def name(x): 151 | """ 152 | :param x: The name of the classifier 153 | :return: The class of the final estimator in lower case form 154 | """ 155 | return x['clf']._final_estimator.__class__.__name__.lower() 156 | 157 | for key, val in classifiers.items(): 158 | if not scale and not feature_selection: 159 | break 160 | steps = [] 161 | if scale: 162 | steps.append(StandardScaler()) 163 | if feature_selection: 164 | steps.append(SelectKBest(f_regression, k='all')) 165 | steps.append(classifiers[key]['clf']) 166 | classifiers[key]['clf'] = make_pipeline(*steps) 167 | # Reorganize paramenter list for grid search 168 | new_dict = {} 169 | for keyp in classifiers[key]['parameters']: 170 | new_dict[name(classifiers[key]) + '__' + 171 | keyp] = classifiers[key]['parameters'][keyp] 172 | classifiers[key]['parameters'] = new_dict 173 | if nCols > 5 and feature_selection: 174 | classifiers[key]['parameters']['selectkbest__k'] = np.linspace( 175 | np.round(nCols / 5), nCols, 5).astype('int').tolist() 176 | 177 | return classifiers 178 | 179 | 180 | def build_regressors(exclude, scale, feature_selection, nCols): 181 | ''' 182 | This method builds an ordered dictionary of regressors, where the key is the name of the 183 | regressor and the value of each key contains a standard dictionary with two keys itself. The first key called 184 | 'reg' points to the regression object, which is created by scikit learn. The second key called 'parameters' 185 | points to another regular map containing the parameters which are associated with the particular regression model. 186 | These parameters are used by grid search in polyssifier.py when finding the best model. 
If parameters are not 187 | defined then grid search is not performed on that particular regression model, so the model's default parameters 188 | are used instead to find the best model for the particular data. 189 | ''' 190 | regressors = collections.OrderedDict() 191 | 192 | if 'Linear Regression' not in exclude: 193 | regressors['Linear Regression'] = { 194 | 'reg': LinearRegression(), 195 | 'parameters': {} # Best to leave default parameters 196 | } 197 | 198 | if 'Bayesian Ridge' not in exclude: 199 | regressors['Bayesian Ridge'] = { 200 | 'reg': BayesianRidge(), 201 | 'parameters': {} # Investigate if alpha and lambda parameters should be changed 202 | } 203 | 204 | if 'PassiveAggressiveRegressor' not in exclude: 205 | regressors['PassiveAggressiveRegressor'] = { 206 | 'reg': PassiveAggressiveRegressor(), 207 | 'parameters': {'C': [0.5, 1.0, 1.5] 208 | } 209 | } 210 | 211 | if 'GaussianProcessRegressor' not in exclude: 212 | regressors['GaussianProcessRegressor'] = { 213 | 'reg': GaussianProcessRegressor(), 214 | 'parameters': { 215 | 'alpha': [0.01, 0.1, 1.0, 10.0], 216 | 'kernel': [RBF(x) for x in [0.01, 1.0, 100.0, 1000.0]], 217 | } 218 | } 219 | 220 | if 'Ridge' not in exclude: 221 | regressors['Ridge'] = { 222 | 'reg': Ridge(), 223 | 'parameters': { 224 | 'alpha': [0.25, 0.50, 0.75, 1.00] 225 | } 226 | } 227 | 228 | if 'Lasso' not in exclude: 229 | regressors['Lasso'] = { 230 | 'reg': Lasso(), 231 | 'parameters': { 232 | 'alpha': [0.25, 0.50, 0.75, 1.00] 233 | } 234 | } 235 | 236 | if 'Lars' not in exclude: 237 | regressors['Lars'] = { 238 | 'reg': Lars(), 239 | 'parameters': {} # Best to leave the default parameters 240 | } 241 | 242 | if 'LassoLars' not in exclude: 243 | regressors['LassoLars'] = { 244 | 'reg': LassoLars(), 245 | 'parameters': {'alpha': [0.25, 0.50, 0.75, 1.00, 10.0]} 246 | } 247 | 248 | if 'OrthogonalMatchingPursuit' not in exclude: 249 | regressors['OrthogonalMatchingPursuit'] = { 250 | 'reg': OrthogonalMatchingPursuit(), 251 | 'parameters': {} # Best to leave default parameters 252 | } 253 | 254 | if 'ElasticNet' not in exclude: 255 | regressors['ElasticNet'] = { 256 | 'reg': ElasticNet(), 257 | 'parameters': {'alpha': [0.25, 0.50, 0.75, 1.00], 258 | 'l1_ratio': [0.25, 0.50, 0.75, 1.00]} 259 | } 260 | 261 | def name(x): 262 | """ 263 | :param x: The name of the regressor 264 | :return: The class of the final regression estimator in lower case form 265 | """ 266 | return x['reg']._final_estimator.__class__.__name__.lower() 267 | 268 | for key, val in regressors.items(): 269 | if not scale and not feature_selection: 270 | break 271 | steps = [] 272 | if scale: 273 | steps.append(StandardScaler()) 274 | if feature_selection: 275 | steps.append(SelectKBest(f_regression, k='all')) 276 | steps.append(regressors[key]['reg']) 277 | regressors[key]['reg'] = make_pipeline(*steps) 278 | # Reorganize paramenter list for grid search 279 | new_dict = {} 280 | for keyp in regressors[key]['parameters']: 281 | new_dict[name(regressors[key]) + '__' + 282 | keyp] = regressors[key]['parameters'][keyp] 283 | regressors[key]['parameters'] = new_dict 284 | if nCols > 5 and feature_selection: 285 | regressors[key]['parameters']['selectkbest__k'] = np.linspace( 286 | np.round(nCols / 5), nCols, 5).astype('int').tolist() 287 | 288 | return regressors 289 | -------------------------------------------------------------------------------- /polyssifier/polyssifier.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python
2 | import sys
3 | import argparse
4 | import numpy as np
5 | import pickle as p
6 | from multiprocessing import Manager, Pool
7 | import os
8 | import pandas as pd
9 | from copy import deepcopy
10 | from sklearn.model_selection import StratifiedKFold, GridSearchCV, KFold
11 | from sklearn.metrics import (f1_score, confusion_matrix, roc_auc_score,
12 |                              mean_squared_error, r2_score)
13 | import joblib
14 | import time
15 | from sklearn.preprocessing import LabelEncoder
16 | from itertools import starmap
17 | from .poly_utils import (build_classifiers, MyVoter, build_regressors,
18 |                          MyRegressionMedianer)
19 | from .report import Report
20 | import logging
21 | from .logger import make_logger
22 | sys.setrecursionlimit(10000)
23 | logger = make_logger('polyssifier')
24 |
25 |
26 | def poly(data, label, n_folds=10, scale=True, exclude=[],
27 |          feature_selection=False, save=False, scoring='auc',
28 |          project_name='', concurrency=1, verbose=True):
29 |     '''
30 |     Input
31 |     data = numpy matrix with as many rows as samples
32 |     label = numpy vector that labels each data row
33 |     n_folds = number of folds to run
34 |     scale = whether to scale data or not
35 |     exclude = list of classifiers to exclude from the analysis
36 |     feature_selection = whether to use feature selection or not (anova)
37 |     save = whether to save intermediate steps or not
38 |     scoring = Type of score to use ['auc', 'f1']
39 |     project_name = prefix used to save the intermediate steps
40 |     concurrency = number of parallel jobs to run
41 |     verbose = whether to print results or not
42 |     Output
43 |     scores = matrix with scores for each fold and classifier
44 |     confusions = confusion matrix for each classifier
45 |     predictions = cross-validated predictions for each classifier
46 |     '''
47 |     if verbose:
48 |         logger.setLevel(logging.DEBUG)
49 |     else:
50 |         logger.setLevel(logging.ERROR)
51 |
52 |     assert label.shape[0] == data.shape[0],\
53 |         "Label dimensions do not match data number of rows"
54 |     _le = LabelEncoder()
55 |     _le.fit(label)
56 |     label = _le.transform(label)
57 |     n_class = len(np.unique(label))
58 |     logger.info(f'Detected {n_class} classes in label')
59 |
60 |     if save and not os.path.exists('poly_{}/models'.format(project_name)):
61 |         os.makedirs('poly_{}/models'.format(project_name))
62 |
63 |     logger.info('Building classifiers ...')
64 |     classifiers = build_classifiers(exclude, scale,
65 |                                     feature_selection,
66 |                                     data.shape[1])
67 |
68 |     scores = pd.DataFrame(columns=pd.MultiIndex.from_product(
69 |         [classifiers.keys(), ['train', 'test']]),
70 |         index=range(n_folds))
71 |     predictions = pd.DataFrame(columns=classifiers.keys(),
72 |                                index=range(data.shape[0]))
73 |     test_prob = pd.DataFrame(columns=classifiers.keys(),
74 |                              index=range(data.shape[0]))
75 |     confusions = {}
76 |     coefficients = {}
77 |     # !fitted_clfs =
78 |     # pd.DataFrame(columns=classifiers.keys(), index = range(n_folds))
79 |
80 |     logger.info('Initialization, done.')
81 |
82 |     skf = StratifiedKFold(n_splits=n_folds, random_state=1988, shuffle=True)
83 |     skf.get_n_splits(np.zeros(data.shape[0]), label)
84 |     kf = list(skf.split(np.zeros(data.shape[0]), label))
85 |
86 |     # Parallel processing of tasks
87 |     manager = Manager()
88 |     args = manager.list()
89 |     args.append({})  # Store inputs
90 |     shared = args[0]
91 |     shared['kf'] = kf
92 |     shared['X'] = data
93 |     shared['y'] = label
94 |     args[0] = shared
95 |
96 |     args2 = []
97 |     for clf_name, val in classifiers.items():
98 |         for n_fold in range(n_folds):
99 |             args2.append((args, clf_name, val,
n_fold, project_name, 100 | save, scoring)) 101 | 102 | if concurrency == 1: 103 | result = list(starmap(fit_clf, args2)) 104 | else: 105 | pool = Pool(processes=concurrency) 106 | result = pool.starmap(fit_clf, args2) 107 | pool.close() 108 | 109 | fitted_clfs = {key: [] for key in classifiers} 110 | 111 | # Gather results 112 | for clf_name in classifiers: 113 | coefficients[clf_name] = [] 114 | temp = np.zeros((n_class, n_class)) 115 | temp_pred = np.zeros((data.shape[0], )) 116 | temp_prob = np.zeros((data.shape[0], )) 117 | clfs = fitted_clfs[clf_name] 118 | for n in range(n_folds): 119 | train_score, test_score, prediction, prob, confusion,\ 120 | coefs, fitted_clf = result.pop(0) 121 | clfs.append(fitted_clf) 122 | scores.loc[n, (clf_name, 'train')] = train_score 123 | scores.loc[n, (clf_name, 'test')] = test_score 124 | temp += confusion 125 | temp_prob[kf[n][1]] = prob 126 | temp_pred[kf[n][1]] = _le.inverse_transform(prediction) 127 | coefficients[clf_name].append(coefs) 128 | 129 | confusions[clf_name] = temp 130 | predictions[clf_name] = temp_pred 131 | test_prob[clf_name] = temp_prob 132 | 133 | # Voting 134 | fitted_clfs = pd.DataFrame(fitted_clfs) 135 | scores['Voting', 'train'] = np.zeros((n_folds, )) 136 | scores['Voting', 'test'] = np.zeros((n_folds, )) 137 | temp = np.zeros((n_class, n_class)) 138 | temp_pred = np.zeros((data.shape[0], )) 139 | for n, (train, test) in enumerate(kf): 140 | clf = MyVoter(fitted_clfs.loc[n].values) 141 | X, y = data[train, :], label[train] 142 | scores.loc[n, ('Voting', 'train')] = _scorer(clf, X, y) 143 | X, y = data[test, :], label[test] 144 | scores.loc[n, ('Voting', 'test')] = _scorer(clf, X, y) 145 | temp_pred[test] = clf.predict(X) 146 | temp += confusion_matrix(y, temp_pred[test]) 147 | 148 | confusions['Voting'] = temp 149 | predictions['Voting'] = temp_pred 150 | test_prob['Voting'] = temp_pred 151 | ###### 152 | 153 | # saving confusion matrices 154 | if save: 155 | with open('poly_' + project_name + '/confusions.pkl', 'wb') as f: 156 | p.dump(confusions, f, protocol=2) 157 | 158 | if verbose: 159 | print(scores.astype('float').describe().transpose() 160 | [['mean', 'std', 'min', 'max']]) 161 | return Report(scores=scores, confusions=confusions, 162 | predictions=predictions, test_prob=test_prob, 163 | coefficients=coefficients, 164 | feature_selection=feature_selection) 165 | 166 | 167 | def _scorer(clf, X, y): 168 | '''Function that scores a classifier according to what is available as a 169 | predict function. 
170 |     Input:
171 |     - clf = Fitted classifier object
172 |     - X = input data matrix
173 |     - y = true labels used for scoring
174 |     Output:
175 |     - AUC score for binary classification or F1 for multiclass
176 |     The order of priority is as follows:
177 |     - predict_proba
178 |     - decision_function
179 |     - predict
180 |     '''
181 |     n_class = len(np.unique(y))
182 |     if n_class == 2:
183 |         if hasattr(clf, 'predict_proba'):
184 |             ypred = clf.predict_proba(X)
185 |             try:
186 |                 ypred = ypred[:, 1]
187 |             except:
188 |                 print('predict proba return shape {}'.format(ypred.shape))
189 |
190 |             assert len(ypred.shape) == 1,\
191 |                 'predict proba return shape {}'.format(ypred.shape)
192 |         elif hasattr(clf, 'decision_function'):
193 |             ypred = clf.decision_function(X)
194 |             assert len(ypred.shape) == 1,\
195 |                 'decision_function return shape {}'.format(ypred.shape)
196 |         else:
197 |             ypred = clf.predict(X)
198 |         score = roc_auc_score(y, ypred)
199 |     else:
200 |         score = f1_score(y, clf.predict(X), average='weighted')
201 |     return score
202 |
203 |
204 | def fit_clf(args, clf_name, val, n_fold, project_name, save, scoring):
205 |     '''
206 |     Multiprocess safe function that fits classifiers
207 |     args: shared dictionary that contains
208 |         X: all data
209 |         y: all labels
210 |         kf: list of train and test indexes for each fold
211 |     clf_name: name of the classifier model
212 |     val: dictionary with
213 |         clf: sklearn compatible classifier
214 |         parameters: dictionary with parameters, can be used for grid search
215 |     n_fold: index of the fold to fit
216 |     project_name: string with the project folder name to save model
217 |     '''
218 |     train, test = args[0]['kf'][n_fold]
219 |     X = args[0]['X'][train, :]
220 |     y = args[0]['y'][train]
221 |     file_name = 'poly_{}/models/{}_{}.p'.format(
222 |         project_name, clf_name, n_fold + 1)
223 |     start = time.time()
224 |     if save and os.path.isfile(file_name):
225 |         logger.info('Loading {} {}'.format(file_name, n_fold))
226 |         clf = joblib.load(file_name)
227 |     else:
228 |         logger.info('Training {} {}'.format(clf_name, n_fold))
229 |         clf = deepcopy(val['clf'])
230 |         if val['parameters']:
231 |             clf = GridSearchCV(clf, val['parameters'], n_jobs=1, cv=3,
232 |                                scoring=_scorer)
233 |         clf.fit(X, y)
234 |         if save:
235 |             joblib.dump(clf, file_name)
236 |
237 |     train_score = _scorer(clf, X, y)
238 |
239 |     X = args[0]['X'][test, :]
240 |     y = args[0]['y'][test]
241 |     # Scores
242 |     test_score = _scorer(clf, X, y)
243 |     ypred = clf.predict(X)
244 |     if hasattr(clf, 'predict_proba'):
245 |         # For compatibility with different sklearn versions
246 |         yprob = clf.predict_proba(X)
247 |         try:
248 |             yprob = yprob[:, 1]
249 |         except:
250 |             print('predict proba return shape {}'.format(yprob.shape))
251 |
252 |     elif hasattr(clf, 'decision_function'):
253 |         yprob = clf.decision_function(X)
254 |         try:
255 |             yprob = yprob[:, 1]
256 |         except:
257 |             print('decision_function return shape {}'.format(yprob.shape))
258 |
259 |         assert len(yprob.shape) == 1,\
260 |             'decision_function return shape {}'.format(yprob.shape)
261 |
262 |     confusion = confusion_matrix(y, ypred)
263 |     duration = time.time() - start
264 |     logger.info('{0:25} {1:2}: Train {2:.2f}/Test {3:.2f}, {4:.2f} sec'.format(
265 |         clf_name, n_fold, train_score, test_score, duration))
266 |
267 |     # Feature importance
268 |     if hasattr(clf, 'steps'):
269 |         temp = clf.steps[-1][1]
270 |     elif hasattr(clf, 'best_estimator_'):
271 |         if hasattr(clf.best_estimator_, 'steps'):
272 |             temp = clf.best_estimator_.steps[-1][1]
273 |         else:
274 |             temp = clf.best_estimator_
275 |     try:
276 |         if hasattr(temp,
'coef_'):
277 |             coefficients = temp.coef_
278 |         elif hasattr(temp, 'feature_importances_'):
279 |             coefficients = temp.feature_importances_
280 |         else:
281 |             coefficients = None
282 |     except:
283 |         coefficients = None
284 |
285 |     return (train_score, test_score,
286 |             ypred, yprob,  # predictions and probabilities
287 |             confusion,  # confusion matrix
288 |             coefficients,  # Coefficients for feature ranking
289 |             clf)  # fitted clf
290 |
291 |
292 | def create_polynomial(data, degree):
293 |     '''
294 |     :param data: the data (numpy matrix) which will have its data vectors raised to powers
295 |     :param degree: the degree of the polynomial model we wish to fit
296 |     :return: a new data matrix of the specified degree (for polynomial fitting purposes)
297 |     '''
298 |
299 |     # First we make an empty matrix which is the size of what we wish to pass through to the linear regressors
300 |     height_of_pass_through = data.shape[0]
301 |     width_of_pass_through = degree * data.shape[1]
302 |     to_pass_through = np.zeros(
303 |         shape=(height_of_pass_through, width_of_pass_through))
304 |
305 |     # These are the width and height of each "exponentiated" matrix
306 |     height_exponential_matrix = data.shape[0]
307 |     width_exponential_matrix = data.shape[1]
308 |
309 |     for i in range(degree):
310 |         to_add_in = data ** (i + 1)
311 |         for j in range(height_exponential_matrix):
312 |             for k in range(width_exponential_matrix):
313 |                 to_pass_through.itemset(
314 |                     (j, k + i * width_exponential_matrix), (to_add_in.item(j, k)))
315 |     return to_pass_through
316 |
317 |
318 | def polyr(data, label, n_folds=10, scale=True, exclude=[],
319 |           feature_selection=False, num_degrees=1, save=False, scoring='r2',
320 |           project_name='', concurrency=1, verbose=True):
321 |     '''
322 |     Input
323 |     data = numpy matrix with as many rows as samples
324 |     label = numpy vector with the target value of each data row
325 |     n_folds = number of folds to run
326 |     scale = whether to scale data or not
327 |     exclude = list of regressors to exclude from the analysis
328 |     feature_selection = whether to use feature selection or not (anova)
329 |     num_degrees = the degree of the polynomial model to fit to the data (default is linear)
330 |     save = whether to save intermediate steps or not
331 |     scoring = Type of score to use ['mse', 'r2']
332 |     project_name = prefix used to save the intermediate steps
333 |     concurrency = number of parallel jobs to run
334 |     verbose = whether to print results or not
335 |
336 |     Output
337 |     scores = matrix with scores for each fold and regressor
338 |     confusions = empty dictionary (kept for interface compatibility with poly)
339 |     predictions = cross-validated predictions for each regressor
340 |     '''
341 |     if num_degrees != 1:
342 |         polynomial_data = create_polynomial(data, num_degrees)
343 |         return polyr(data=polynomial_data, label=label, n_folds=n_folds, scale=scale, exclude=exclude,
344 |                      feature_selection=feature_selection, num_degrees=1, save=save, scoring=scoring,
345 |                      project_name=project_name, concurrency=concurrency, verbose=verbose)
346 |
347 |     assert label.shape[0] == data.shape[0],\
348 |         "Label dimensions do not match data number of rows"
349 |
350 |     # If the user wishes to save the intermediate steps and there is not already a polyr models directory then
351 |     # this statement creates one.
352 |     if save and not os.path.exists('polyr_{}/models'.format(project_name)):
353 |         os.makedirs('polyr_{}/models'.format(project_name))
354 |
355 |     # Whether or not intermediate steps will be printed out.
356 |     if verbose:
357 |         logger.setLevel(logging.DEBUG)
358 |     else:
359 |         logger.setLevel(logging.ERROR)
360 |     logger.info('Building regressors ...')
361 |
362 |     # The main regressors dictionary
363 |     regressors = build_regressors(exclude, scale,
364 |                                   feature_selection,
365 |                                   data.shape[1])
366 |
367 |     scores = pd.DataFrame(columns=pd.MultiIndex.from_product(
368 |         [regressors.keys(), ['train', 'test']]),
369 |         index=range(n_folds))
370 |     predictions = pd.DataFrame(columns=regressors.keys(),
371 |                                index=range(data.shape[0]))
372 |     test_prob = pd.DataFrame(columns=regressors.keys(),
373 |                              index=range(data.shape[0]))
374 |     confusions = {}
375 |     coefficients = {}
376 |     # !fitted_regs =
377 |     # pd.DataFrame(columns=regressors.keys(), index = range(n_folds))
378 |
379 |     logger.info('Initialization, done.')
380 |
381 |     # This provides train/test indices to split data in train/test sets.
382 |     skf = KFold(n_splits=n_folds)  # , random_state=1988)
383 |     skf.get_n_splits(np.zeros(data.shape[0]), label)
384 |     kf = list(skf.split(np.zeros(data.shape[0]), label))
385 |
386 |     # Parallel processing of tasks
387 |     manager = Manager()
388 |     args = manager.list()
389 |     args.append({})  # Store inputs
390 |     shared = args[0]
391 |     shared['kf'] = kf
392 |     shared['X'] = data
393 |     shared['y'] = label
394 |     args[0] = shared
395 |
396 |     args2 = []
397 |     for reg_name, val in regressors.items():
398 |         for n_fold in range(n_folds):
399 |             args2.append((args, reg_name, val, n_fold, project_name,
400 |                           save, scoring))
401 |
402 |     if concurrency == 1:
403 |         result = list(starmap(fit_reg, args2))
404 |     else:
405 |         pool = Pool(processes=concurrency)
406 |         result = pool.starmap(fit_reg, args2)
407 |         pool.close()
408 |
409 |     fitted_regs = {key: [] for key in regressors}
410 |
411 |     # Gather results
412 |     for reg_name in regressors:
413 |         coefficients[reg_name] = []
414 |         temp_pred = np.zeros((data.shape[0], ))
415 |         temp_prob = np.zeros((data.shape[0], ))
416 |         regs = fitted_regs[reg_name]
417 |         for n in range(n_folds):
418 |             train_score, test_score, prediction, prob,\
419 |                 coefs, fitted_reg = result.pop(0)
420 |             regs.append(fitted_reg)
421 |             scores.loc[n, (reg_name, 'train')] = train_score
422 |             scores.loc[n, (reg_name, 'test')] = test_score
423 |             temp_prob[kf[n][1]] = prob
424 |             temp_pred[kf[n][1]] = prediction
425 |             coefficients[reg_name].append(coefs)
426 |
427 |         predictions[reg_name] = temp_pred
428 |         test_prob[reg_name] = temp_prob
429 |
430 |     # This computes the median of the predictions of the regressors.
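# For each fold, MyRegressionMedianer (defined in poly_utils.py) stacks the
# fold's fitted regressors and returns the element-wise median of their
# predictions, which is less sensitive to a single badly fit model than a
# plain mean. A rough sketch of what it computes:
#     preds = np.asarray([reg.predict(X) for reg in regs])  # (n_models, n_samples)
#     ensemble = np.median(preds, axis=0)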
431 |     fitted_regs = pd.DataFrame(fitted_regs)
432 |     scores['Median', 'train'] = np.zeros((n_folds, ))
433 |     scores['Median', 'test'] = np.zeros((n_folds, ))
434 |     temp_pred = np.zeros((data.shape[0], ))
435 |     for n, (train, test) in enumerate(kf):
436 |         reg = MyRegressionMedianer(fitted_regs.loc[n].values)
437 |         X, y = data[train, :], label[train]
438 |         scores.loc[n, ('Median', 'train')] = _reg_scorer(reg, X, y, scoring)
439 |         X, y = data[test, :], label[test]
440 |         scores.loc[n, ('Median', 'test')] = _reg_scorer(reg, X, y, scoring)
441 |         temp_pred[test] = reg.predict(X)
442 |
443 |     predictions['Median'] = temp_pred
444 |
445 |     if verbose:
446 |         print(scores.astype('float').describe().transpose()
447 |               [['mean', 'std', 'min', 'max']])
448 |     return Report(scores=scores, confusions=confusions,
449 |                   predictions=predictions, test_prob=test_prob,
450 |                   coefficients=coefficients, scoring=scoring,
451 |                   feature_selection=feature_selection)
452 |
453 |
454 | def _reg_scorer(reg, X, y, scoring):
455 |     '''Function that scores a regressor according to what is available as a
456 |     predict function.
457 |     Input:
458 |     - reg = Fitted regressor object
459 |     - X = input data matrix
460 |     - y = corresponding values to the data matrix
461 |     Output:
462 |     - The mean squared error or R-squared value for the given regressor and data. The default scoring is
463 |     the R-squared value.
464 |     '''
465 |     if scoring == 'mse':
466 |         return mean_squared_error(y, reg.predict(X))
467 |     else:
468 |         return r2_score(y, reg.predict(X))
469 |
470 |
471 | def fit_reg(args, reg_name, val, n_fold, project_name, save, scoring):
472 |     '''
473 |     Multiprocess safe function that fits regressors
474 |     args: shared dictionary that contains
475 |         X: all data
476 |         y: all labels
477 |         kf: list of train and test indexes for each fold
478 |     reg_name: name of the regressor model
479 |     val: dictionary with
480 |         reg: sklearn compatible regressor
481 |         parameters: dictionary with parameters, can be used for grid search
482 |     n_fold: index of the fold to fit
483 |     project_name: string with the project folder name to save model
484 |     '''
485 |
486 |     # Creates the scoring string to pass into grid search.
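# Note: GridSearchCV always maximizes its scoring function, so 'mse' has to
# be mapped to scikit-learn's 'neg_mean_squared_error' (higher, i.e. less
# negative, is better); 'r2' passes through unchanged and is also used as
# the fallback for any unrecognized value.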
487 |     if scoring == 'mse':
488 |         scorestring = 'neg_mean_squared_error'
489 |     elif scoring == 'r2':
490 |         scorestring = 'r2'
491 |     else:
492 |         scorestring = 'r2'
493 |
494 |     train, test = args[0]['kf'][n_fold]
495 |     X = args[0]['X'][train, :]
496 |     y = args[0]['y'][train]
497 |     file_name = 'polyr_{}/models/{}_{}.p'.format(
498 |         project_name, reg_name, n_fold + 1)
499 |     start = time.time()
500 |     if os.path.isfile(file_name):
501 |         logger.info('Loading {} {}'.format(file_name, n_fold))
502 |         reg = joblib.load(file_name)
503 |     else:
504 |         logger.info('Training {} {}'.format(reg_name, n_fold))
505 |         reg = deepcopy(val['reg'])
506 |         if val['parameters']:
507 |             kfold = KFold(n_splits=3)
508 |             reg = GridSearchCV(reg, val['parameters'], n_jobs=1, cv=kfold,
509 |                                scoring=scorestring)
510 |         reg.fit(X, y)
511 |         if save:
512 |             joblib.dump(reg, file_name)
513 |
514 |     train_score = _reg_scorer(reg, X, y, scoring)
515 |
516 |     X = args[0]['X'][test, :]
517 |     y = args[0]['y'][test]
518 |     # Scores
519 |     test_score = _reg_scorer(reg, X, y, scoring)
520 |     ypred = reg.predict(X)
521 |     yprob = 0  # placeholder: regressors do not produce probabilities
522 |
523 |     duration = time.time() - start
524 |     logger.info('{0:25} {1:2}: Train {2:.2f}/Test {3:.2f}, {4:.2f} sec'.format(
525 |         reg_name, n_fold, train_score, test_score, duration))
526 |
527 |     # Feature importance; fall back to the bare estimator if needed
528 |     if hasattr(reg, 'steps'):
529 |         temp = reg.steps[-1][1]
530 |     elif hasattr(reg, 'best_estimator_'):
531 |         if hasattr(reg.best_estimator_, 'steps'):
532 |             temp = reg.best_estimator_.steps[-1][1]
533 |         else:
534 |             temp = reg.best_estimator_
535 |     else:
536 |         temp = reg
537 |     if hasattr(temp, 'coef_'):
538 |         coefficients = temp.coef_
539 |     elif hasattr(temp, 'feature_importances_'):
540 |         coefficients = temp.feature_importances_
541 |     else:
542 |         coefficients = None
543 |     return (train_score, test_score,
544 |             ypred, yprob,  # predictions and probabilities
545 |             coefficients,  # coefficients for feature ranking
546 |             reg)  # the fitted regressor
547 |
548 | def make_argument_parser():
549 |     '''
550 |     Creates an ArgumentParser to read the options for this script from
551 |     sys.argv
552 |     '''
553 |     parser = argparse.ArgumentParser()
554 |     parser.add_argument('data', default='data.npy',
555 |                         help='Data file name')
556 |     parser.add_argument('label', default='labels.npy',
557 |                         help='Label file name')
558 |     parser.add_argument('--level', default='info',
559 |                         help='Logging level')
560 |     parser.add_argument('--name', default='default',
561 |                         help='Experiment name')
562 |     parser.add_argument('--concurrency', default='1',
563 |                         help='Number of allowed concurrent processes')
564 |
565 |     return parser
566 |
567 |
568 | if __name__ == '__main__':
569 |
570 |     parser = make_argument_parser()
571 |     args = parser.parse_args()
572 |
573 |     if args.level == 'info':
574 |         logger.setLevel(logging.INFO)
575 |     else:
576 |         logger.setLevel(logging.DEBUG)
577 |
578 |     data = np.load(args.data)
579 |     label = np.load(args.label)
580 |     labelcopy = deepcopy(label)
581 |
582 |     logger.info(
583 |         'Starting analysis with {} workers'.format(args.concurrency))
584 |
585 |     # Heuristic: more than 50 unique label values is treated as a regression
586 |     # problem; anything else is treated as classification.
587 |     if len(np.unique(labelcopy)) > 50:
588 |         report = polyr(data, label, n_folds=5, project_name=args.name,
589 |                        concurrency=int(args.concurrency))
590 |     else:
591 |         report = poly(data, label, n_folds=5, project_name=args.name,
592 |                       concurrency=int(args.concurrency))
593 |     report.plot_scores(os.path.join('polyr_' + args.name, args.name))
594 |     report.plot_features(os.path.join('polyr_' + args.name, args.name))
595 |
--------------------------------------------------------------------------------
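For orientation, `polyr` above can be driven through the script entry point or directly from Python. A minimal usage sketch follows; the 'demo' project name and output path are hypothetical, and `save=False` keeps the run from persisting model pickles:

    from sklearn.datasets import load_diabetes
    from polyssifier import polyr

    diabetes = load_diabetes()
    # Fit every included regressor over 5 folds and collect
    # cross-validated scores, predictions, and coefficients.
    report = polyr(diabetes.data, diabetes.target, n_folds=5,
                   scoring='r2', concurrency=1, save=False,
                   project_name='demo')
    report.plot_scores('demo_scores')  # writes demo_scores.pdf and .svg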
/polyssifier/report.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import matplotlib.patches as mpatches
4 | from .logger import make_logger
5 | from scipy.stats import rankdata
6 | from functools import partial
7 |
8 | log = make_logger('Report')
9 |
10 |
11 | class Report(object):
12 |     """Report class that contains results from running polyssifier
13 |     """
14 |
15 |     def __init__(self, scores, confusions, predictions,
16 |                  test_prob, coefficients, feature_selection,
17 |                  scoring='auc'):
18 |         self.scores = scores
19 |         self.confusions = confusions
20 |         self.predictions = predictions
21 |         self.test_proba = test_prob
22 |         self.coefficients = coefficients
23 |         self.scoring = scoring
24 |         self._feature_selection = feature_selection
25 |
26 |     def plot_scores(self, path='temp'):
27 |         plot_scores(self.scores, self.scoring, path)
28 |
29 |     def plot_features(self, ntop=3, path='temp',
30 |                       coef_names=None):
31 |         if self._feature_selection:
32 |             log.warning(
33 |                 'Feature importance is not available when feature_selection=True; rerun with feature_selection=False')
34 |         else:
35 |             plot_features(coefs=self.coefficients,
36 |                           coef_names=coef_names,
37 |                           ntop=ntop, file_name=path)
38 |
39 |
40 | def plot_features(coefs, coef_names=None,
41 |                   ntop=3, file_name='temp'):
42 |
43 |     fs = {}
44 |
45 |     for key, val in coefs.items():
46 |
47 |         if val[0] is not None:
48 |             val = np.array(val).squeeze()  # [folds, labels, coefs]
49 |             if len(val.shape) == 2:
50 |                 fs[key] = val
51 |             else:
52 |                 fs[key] = val.mean(axis=1)
53 |
54 |     n_coefs = fs[list(fs.keys())[0]].shape[-1]
55 |     if coef_names is None:
56 |         coef_names = np.array([str(c + 1) for c in range(n_coefs)])
57 |     else:
58 |         coef_names = np.array(coef_names)
59 |
60 |     for key, val in fs.items():
61 |
62 |         figure_path = file_name + '_' + key + '_feature_ranking.png'
63 |         log.info('Plotting %s coefs to %s', key, figure_path)
64 |         plt.figure(figsize=(10, 10))
65 |         # plot mean coefficient weights with their across-fold deviation
66 |
67 |         mean = np.mean(val, axis=0)
68 |         std = np.std(val, axis=0)
69 |         idx = np.argsort(np.abs(mean))
70 |
71 |         topm = mean[idx][-ntop:][::-1]
72 |         tops = std[idx][-ntop:][::-1]
73 |         plt.subplot(211)
74 |         plt.bar(range(ntop), topm, yerr=tops,
75 |                 tick_label=(coef_names[idx][-ntop:][::-1]))
76 |         plt.title('{}: Feature importance'.format(key))
77 |         plt.xlabel('Feature index')
78 |
79 |         # plot coefficient ranks (0 is the strongest feature)
80 |         rank = n_coefs - np.apply_along_axis(
81 |             partial(rankdata, method='max'), axis=1, arr=np.abs(val))
82 |         rank_mean = rank.mean(axis=0)
83 |         rank_std = rank.std(axis=0)
84 |         idx = np.argsort(rank_mean)
85 |         topm = rank_mean[idx][:ntop]
86 |         tops = rank_std[idx][:ntop]
87 |
88 |         plt.subplot(212)
89 |         plt.bar(range(ntop), topm, yerr=tops,
90 |                 tick_label=coef_names[idx][:ntop])
91 |         plt.title('{}: Feature rank'.format(key))
92 |         plt.xlabel('Feature index')
93 |         plt.grid(axis='y')
94 |         plt.tight_layout()
95 |         plt.savefig(figure_path)
96 |
97 |
98 | def plot_scores(scores, scoring='auc', file_name='temp', min_val=None):
99 |
100 |     df = scores.apply(np.mean).unstack().join(
101 |         scores.apply(np.std).unstack(), lsuffix='_mean', rsuffix='_std')
102 |     df.columns = ['Test score', 'Train score', 'Test std', 'Train std']
103 |     df.sort_values('Test score', ascending=False, inplace=True)
104 |     error = df[['Train std', 'Test std']]
105 |     error.columns = ['Train score', 'Test score']
106 |     data = df[['Train score', 'Test score']]
107 |
108 |     nc = df.shape[0]
109 |
110 |     ax1 = data.plot(kind='bar', yerr=error, colormap='coolwarm',
111 |                     figsize=(nc * 2, 5), alpha=1)
112 |
113 |
114 |
115 |
116 |     ax1.set_xticklabels([])
117 |     ax1.set_xlabel('')
118 |     plt.ylabel(scoring, fontsize='large', rotation='horizontal')
119 |     ax1.yaxis.grid(True)
120 |
121 |     # This creates the legend for the plot
122 |     testing_label = mpatches.Patch(color='red', label='Testing Score')
123 |     training_label = mpatches.Patch(color='blue', label='Training Score')
124 |     plt.legend(handles=[testing_label, training_label], loc='upper right')
125 |
126 |     temp = np.array(data)
127 |
128 |     # Size the y-axis according to the scoring metric in use: r2 is plotted
129 |     # on [0, 1], mse from 0 to the observed max, anything else data-driven.
130 |     if scoring == 'r2':
131 |         ymax = 1
132 |         ymin = 0
133 |     elif scoring == 'mse':
134 |         ymin = 0
135 |         ymax = temp.max()
136 |     else:
137 |         ymin = max(temp.min() - .1, 0) if min_val is None else min_val
138 |         ymax = 1
139 |
140 |     ax1.set_ylim(ymin, ymax)
141 |     for n, rect in enumerate(ax1.patches):
142 |         if n >= nc:
143 |             break
144 |         ax1.text(rect.get_x() - rect.get_width() / 2., ymin + (1 - ymin) * .01,
145 |                  data.index[n], ha='center', va='bottom',
146 |                  rotation='90', color='black', fontsize=15)
147 |     plt.tight_layout()
148 |     plt.savefig(file_name + '.pdf')
149 |     plt.savefig(file_name + '.svg', transparent=False)
150 |     return ax1
151 |
--------------------------------------------------------------------------------
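A note on the rank computation in `plot_features` above: `rankdata` ranks values in ascending order, so subtracting the ranks of the absolute coefficients from the feature count makes 0 the strongest feature. A self-contained sketch of the same idea, with made-up coefficients (`plot_features` additionally uses `method='max'`, which only matters for ties):

    import numpy as np
    from scipy.stats import rankdata

    coefs = np.array([[0.1, -0.9, 0.3],    # fold 1
                      [0.2, -0.8, 0.1]])   # fold 2; shape [folds, features]
    # Per fold: rank |coef| ascending, then flip so 0 marks the top feature.
    rank = coefs.shape[1] - np.apply_along_axis(rankdata, 1, np.abs(coefs))
    print(rank.mean(axis=0))  # [1.5, 0., 1.5]: feature 2 is best in both folds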
/requirements.txt:
--------------------------------------------------------------------------------
1 | attrs==21.3.0
2 | cycler==0.11.0
3 | importlib-metadata==4.8.3
4 | iniconfig==1.1.1
5 | joblib==1.1.0
6 | kiwisolver==1.3.1
7 | matplotlib==3.3.4
8 | numpy==1.22.0
9 | packaging==21.3
10 | pandas==1.1.5
11 | Pillow==9.0.1
12 | pluggy==1.0.0
13 | py==1.11.0
14 | pyparsing==3.0.6
15 | pytest==6.2.5
16 | python-dateutil==2.8.2
17 | pytz==2021.3
18 | scikit-learn==0.24.2
19 | scipy==1.5.4
20 | six==1.16.0
21 | threadpoolctl==3.0.0
22 | toml==0.10.2
23 | typing_extensions==4.0.1
24 | zipp==3.6.0
25 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | [bdist_wheel]
4 | universal=1
5 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | """A setuptools based setup module.
2 |
3 | See:
4 | https://packaging.python.org/en/latest/distributing.html
5 | https://github.com/pypa/sampleproject
6 | """
7 |
8 | # Always prefer setuptools over distutils
9 | from setuptools import setup, find_packages
10 | # To use a consistent encoding
11 | from codecs import open
12 | from os import path
13 |
14 | here = path.abspath(path.dirname(__file__))
15 |
16 | setup(
17 |     name='polyssifier',
18 |
19 |     # Versions should comply with PEP440. For a discussion on single-sourcing
20 |     # the version across setup.py and the project code, see
21 |     # https://packaging.python.org/en/latest/single_source_version.html
22 |     version='0.5.6',
23 |
24 |     description='Data exploration tool for assessing optimal classification methods',
25 |
26 |     # The project's main homepage.
27 |     url='https://github.com/alvarouc/polyssifier',
28 |
29 |     # Author details
30 |     author='Alvaro Ulloa',
31 |     author_email='alvarouc@gmail.com',
32 |
33 |     # Choose your license
34 |     license='GPLv3',
35 |
36 |     # See https://pypi.python.org/pypi?%3Aaction=list_classifiers
37 |     classifiers=[
38 |         # How mature is this project? Common values are
39 |         #   3 - Alpha
40 |         #   4 - Beta
41 |         #   5 - Production/Stable
42 |         'Development Status :: 3 - Alpha',
43 |
44 |         # Indicate who your project is intended for
45 |         'Intended Audience :: Developers',
46 |         'Topic :: Software Development :: Build Tools',
47 |
48 |         # Pick your license as you wish (should match "license" above)
49 |         'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
50 |
51 |         # Specify the Python versions you support here. In particular, ensure
52 |         # that you indicate whether you support Python 2, Python 3 or both.
53 |         'Programming Language :: Python :: 2',
54 |         'Programming Language :: Python :: 2.7',
55 |         'Programming Language :: Python :: 3',
56 |         'Programming Language :: Python :: 3.3',
57 |         'Programming Language :: Python :: 3.4',
58 |         'Programming Language :: Python :: 3.5',
59 |         'Programming Language :: Python :: 3.6',
60 |     ],
61 |
62 |     # What does your project relate to?
63 |     keywords='classification machine learning data science',
64 |
65 |     # You can just specify the packages manually here if your project is
66 |     # simple. Or you can use find_packages().
67 |     packages=['polyssifier', ],
68 |
69 |     # List run-time dependencies here. These will be installed by pip when
70 |     # your project is installed. For an analysis of "install_requires" vs pip's
71 |     # requirements files see:
72 |     # https://packaging.python.org/en/latest/requirements.html
73 |     install_requires=['pandas', 'scikit-learn', 'numpy', 'scipy', 'matplotlib'],
74 |
75 | )
76 |
--------------------------------------------------------------------------------
/tests/test_classification.py:
--------------------------------------------------------------------------------
1 | import matplotlib  # noqa: E402
2 | matplotlib.use('Agg')  # noqa: E402
3 | # import sys
4 | # sys.path.append('../')  # noqa: E402
5 | from polyssifier import poly  # noqa: E402
6 |
7 | from sklearn.datasets import make_classification
8 | import warnings
9 | import pytest
10 |
11 | warnings.filterwarnings("ignore", category=DeprecationWarning)
12 |
13 | NSAMPLES = 100
14 | BC_DATA_PARAMS = dict(n_samples=NSAMPLES, n_features=50,
15 |                       n_informative=10, n_redundant=10,
16 |                       n_repeated=0, n_classes=2,
17 |                       n_clusters_per_class=1, weights=None,
18 |                       flip_y=0.01, class_sep=2.0,
19 |                       hypercube=True, shift=0.0,
20 |                       scale=1.0, shuffle=True,
21 |                       random_state=1988)
22 |
23 | MC_DATA_PARAMS = dict(n_samples=NSAMPLES, n_features=50,
24 |                       n_informative=10, n_redundant=10,
25 |                       n_repeated=0, n_classes=3,
26 |                       n_clusters_per_class=1, weights=None,
27 |                       flip_y=0.01, class_sep=2.0,
28 |                       hypercube=True, shift=0.0,
29 |                       scale=1.0, shuffle=True,
30 |                       random_state=1988)
31 |
32 |
33 | @pytest.mark.medium
34 | def test_run():
35 |     data, label = make_classification(**BC_DATA_PARAMS)
36 |     report = poly(data, label, n_folds=2, verbose=1,
37 |                   feature_selection=False,
38 |                   save=False, project_name='test2')
39 |     for key, score in report.scores.mean().items():
40 |         assert score < 5, '{} score is too high'.format(key)
41 |
42 |
43 | def test_multiclass():
44 |     data, label = make_classification(**MC_DATA_PARAMS)
45 |     report = poly(data, label, n_folds=2, verbose=1,
46 |                   feature_selection=False,
47 |                   save=False, project_name='test3')
48 |     for key, score in report.scores.mean().items():
49 |         assert score < 5, '{} score is too high'.format(key)
50 |
51 |
52 | @pytest.mark.medium
53 | def test_feature_selection():
54 |     data, label = make_classification(**BC_DATA_PARAMS)
55 |     global report_with_features
56 |     report_with_features = poly(data, label, n_folds=2, verbose=1,
57 |                                 feature_selection=True,
58 |                                 save=False, project_name='test2')
59 |     assert (report_with_features.scores.mean()[:, 'test'] > 0.5).all(),\
60 |         'test score below chance'
61 |     assert (report_with_features.scores.mean()[:, 'train'] > 0.5).all(),\
62 |         'train score below chance'
63 |
64 |
65 | @pytest.mark.medium
66 | def test_plot_no_selection():
67 |     data, label = make_classification(**BC_DATA_PARAMS)
68 |     report = poly(data, label, n_folds=2, verbose=1,
69 |                   feature_selection=False,
70 |                   save=False, project_name='test2')
71 |     report.plot_scores()
72 |     report.plot_features()
73 |
74 |
75 | @pytest.mark.medium
76 | def test_plot_with_selection():
77 |     data, label = make_classification(**BC_DATA_PARAMS)
78 |     report_with_features = poly(data, label, n_folds=2, verbose=1,
79 |                                 feature_selection=True,
80 |                                 save=False, project_name='test2')
81 |
82 |     report_with_features.plot_scores()
83 |     report_with_features.plot_features()
--------------------------------------------------------------------------------
/tests/test_multiclass.py:
--------------------------------------------------------------------------------
1 | import matplotlib  # noqa: E402
2 | matplotlib.use('Agg')  # noqa: E402
3 | import sys
4 | sys.path.append('../')  # noqa: E402
5 | from polyssifier import poly  # noqa: E402
6 |
7 | from sklearn.datasets import make_classification
8 | import warnings
9 | import pytest
10 |
11 | warnings.filterwarnings("ignore", category=DeprecationWarning)
12 |
13 | NSAMPLES = 100
14 | N_CLASSES = 5
15 | data, label = make_classification(n_samples=NSAMPLES, n_features=50,
16 |                                   n_informative=10, n_redundant=10,
17 |                                   n_repeated=0, n_classes=N_CLASSES,
18 |                                   n_clusters_per_class=2, weights=None,
19 |                                   flip_y=0.01, class_sep=2.0,
20 |                                   hypercube=True, shift=0.0,
21 |                                   scale=1.0, shuffle=True,
22 |                                   random_state=1988)
23 |
24 |
25 | def test_run():
26 |     report = poly(data, label, n_folds=2, verbose=1,
27 |                   feature_selection=False,
28 |                   save=False, project_name='test2')
29 |     for key, score in report.scores.mean().items():
30 |         assert score < 5, '{} score is too high'.format(key)
31 |
32 |
33 | def test_feature_selection():
34 |     global report_with_features
35 |     report_with_features = poly(data, label, n_folds=2, verbose=1,
36 |                                 feature_selection=True,
37 |                                 save=False, project_name='test2')
38 |     assert (report_with_features.scores.mean()[:, 'test'] > 1/N_CLASSES).all(),\
39 |         'test score below chance'
40 |     assert (report_with_features.scores.mean()[:, 'train'] > 1/N_CLASSES).all(),\
41 |         'train score below chance'
42 |
43 |
44 | def test_plot_no_selection():
45 |     report = poly(data, label, n_folds=2, verbose=1,
46 |                   feature_selection=False,
47 |                   save=False, project_name='test2')
48 |     report.plot_scores()
49 |     report.plot_features()
50 |
51 |
52 | # @pytest.mark.medium
53 | # def test_plot_with_selection():
54 | #     report = poly(data, label, n_folds=2, verbose=1,
55 | #                   feature_selection=False,
56 | #                   save=False, project_name='test2')
57 |
58 | #     report_with_features.plot_scores()
59 | #     report_with_features.plot_features()
--------------------------------------------------------------------------------
/tests/test_polynomial.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import sys
4 | sys.path.append('../')
5 | from polyssifier.polyssifier import create_polynomial
6 |
7 |
8 | @pytest.mark.medium
9 | def test_create_polynomial():
10 |     data = np.array([[5, 6], [7, 8]])
11 |     poly1 = np.array([[5, 6], [7, 8]])
12 |     poly2 = np.array([[5, 6, 25, 36], [7, 8, 49, 64]])
13 |     poly3 = np.array([[5, 6, 25, 36, 125, 216], [7, 8, 49, 64, 343, 512]])
14 |     assert (poly1 == create_polynomial(data, 1)).all()
15 |     assert (poly2 == create_polynomial(data, 2)).all()
16 |     assert (poly3 == create_polynomial(data, 3)).all()
--------------------------------------------------------------------------------
/tests/test_regression.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import warnings
3 | import numpy as np
4 | import os
5 | # import sys
6 | # sys.path.append('../')
7 | from polyssifier import polyr
8 | from sklearn.datasets import load_diabetes
9 |
10 | warnings.filterwarnings("ignore")
11 | diabetes_data = load_diabetes().data
12 | diabetes_target = load_diabetes().target
13 |
14 |
15 | @pytest.mark.medium
16 | def test_feature_selection_regression():
17 |     global report_with_features
18 |     report_with_features = polyr(
19 |         diabetes_data, diabetes_target, n_folds=2,
20 |         verbose=1, concurrency=1,
21 |         feature_selection=True, scoring='r2',
22 |         save=False, project_name='test_feature_selection')
23 |     assert (report_with_features.scores.mean()[:, 'test'] > 0.2).all(),\
24 |         'test score too low'
25 |     assert (report_with_features.scores.mean()[:, 'train'] > 0.2).all(),\
26 |         'train score too low'
27 |
28 |     for key, ypred in report_with_features.predictions.items():
29 |         # per-sample root-sum-square error as a rough prediction-error proxy
30 |         err = np.linalg.norm(ypred - diabetes_target) / len(diabetes_target)
31 |         assert err < 5, '{} prediction error is too high'.format(key)
32 |
33 |
34 | @pytest.mark.medium
35 | def test_run_regression():
36 |     global report
37 |     report = polyr(diabetes_data, diabetes_target, n_folds=2,
38 |                    verbose=1, concurrency=1,
39 |                    feature_selection=False, scoring='r2',
40 |                    save=False, project_name='test_regression')
41 |     assert (report.scores.mean()[:, 'test'] > 0.2).all(),\
42 |         'test score too low'
43 |     assert (report.scores.mean()[:, 'train'] > 0.2).all(),\
44 |         'train score too low'
45 |
46 |
47 | @pytest.mark.medium
48 | def test_polynomial_model():
49 |     # Lars is excluded because it performs poorly on this data.
50 |     polynomial_report = polyr(
51 |         diabetes_data, diabetes_target, n_folds=2, num_degrees=3,
52 |         verbose=1, concurrency=1, feature_selection=False, save=False,
53 |         project_name='polynomial_test', exclude=['Lars'])
54 |     assert (polynomial_report.scores.mean()[:, 'test'] > 0.25).all(), \
55 |         'low test score'
56 |
57 |
58 | @pytest.mark.medium
59 | def test_plot_scores_no_selection():
60 |     report.plot_scores()
61 |     report.plot_features()
62 |
63 |
64 | @pytest.mark.medium
65 | def test_plot_features_with_selection():
66 |     report_with_features.plot_scores()
67 |     report_with_features.plot_features()
68 |
69 |
70 | def setup_function(function):
71 |     """ Setup any state tied to the execution of the given function.
72 |     Invoked for every test function in the module.
73 |     """
74 |
75 |
76 | def teardown_function(function):
77 |     """ Teardown any state that was previously set up with a setup_function
78 |     call.
79 |     """
80 |
81 |     file_paths = [
82 |         'temp_Bayesian Ridge_feature_ranking.png',
83 |         'temp_Decision Tree_feature_ranking.png',
84 |         'temp_ElasticNet_feature_ranking.png',
85 |         'temp_Lars_feature_ranking.png',
86 |         'temp_Lasso_feature_ranking.png',
87 |         'temp_LassoLars_feature_ranking.png',
88 |         'temp_Linear Regression_feature_ranking.png',
89 |         'temp_Linear SVM_feature_ranking.png',
90 |         'temp_Logistic Regression_feature_ranking.png',
91 |         'temp_OrthogonalMatchingPursuit_feature_ranking.png',
92 |         'temp_PassiveAggressiveRegressor_feature_ranking.png',
93 |         'temp.pdf',
94 |         'temp_Random Forest_feature_ranking.png',
95 |         'temp_Ridge_feature_ranking.png',
96 |         'temp.svg',
97 |         'Report.log',
98 |     ]
99 |     for path in file_paths:
100 |         if os.path.exists(path):
101 |             os.remove(path)
--------------------------------------------------------------------------------
/uploadPip.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Prompt for a new version, update setup.py, tag, and upload to PyPI
4 | version="$(python setup.py --version)"
5 | echo "Current version $version"
6 | read -p "Enter new version: " newVersion
7 | sed -i ".backup" "s/$version/$newVersion/g" setup.py
8 | git tag "$newVersion" -m "from $version to $newVersion"
9 | git push --tags origin master
10 |
11 | python setup.py sdist
12 | twine upload "dist/*$newVersion*"
13 |
--------------------------------------------------------------------------------
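A final practical note on the model persistence used by `fit_reg`: with `save=True`, each fold's (possibly grid-searched) model is written to `polyr_<project_name>/models/<regressor>_<fold>.p` and can be reloaded with joblib. A sketch, where the 'demo' project, the Ridge model name, and `X_new` are hypothetical:

    import joblib
    import numpy as np

    # Load fold 1 of a Ridge model from a project named 'demo'
    reg = joblib.load('polyr_demo/models/Ridge_1.p')
    X_new = np.random.rand(5, 10)  # must match the training feature count
    print(reg.predict(X_new))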