├── .coveragerc ├── .github └── workflows │ ├── python-package.yml │ └── python-publish.yml ├── .gitignore ├── .travis.yml ├── AUTHORS.md ├── LICENSE ├── README.md ├── polyssifier ├── __init__.py ├── logger.py ├── poly_utils.py ├── polyssifier.py └── report.py ├── requirements.txt ├── sample └── example.ipynb ├── setup.cfg ├── setup.py ├── tests ├── test_classification.py ├── test_multiclass.py ├── test_polynomial.py └── test_regression.py └── uploadPip.sh /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | */tests* 4 | parallel = False 5 | 6 | [report] 7 | exclude_lines = 8 | pragma: no cover 9 | def __repr__ 10 | raise NotImplementedError 11 | if __name__ == .__main__.: 12 | def parse_args 13 | def make_argument_parser 14 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.9"] 20 | 21 | steps: 22 | - uses: actions/checkout@v2 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v2 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest pytest-cov 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | - name: Lint with flake8 33 | run: | 34 | # stop the build if there are Python syntax errors or undefined names 35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 37 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 38 | - name: Test with pytest 39 | run: | 40 | python -m pytest --cov=polyssifier --cov-report xml tests 41 | 42 | - name: Upload coverage data to coveralls.io 43 | run: | 44 | python -m pip install coveralls==2.2 45 | coveralls --service=github 46 | env: 47 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | jobs: 16 | deploy: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: '3.x' 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install build 30 | - name: Build package 31 | run: python -m build 32 | - name: Publish package 33 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 34 | with: 35 | user: __token__ 36 | password: ${{ secrets.PYPI_API_TOKEN }} 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | temp*.png 43 | .ipynb* 44 | 45 | # Translations 46 | *.mo 47 | *.pot 48 | 49 | # Django stuff: 50 | *.log 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | # PyBuilder 56 | target/ 57 | 58 | # figures 59 | *.pdf 60 | *.svg 61 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # vim ft=yaml 2 | # travis-ci.org definition for Nimfa tests 3 | language: python 4 | 5 | sudo: false 6 | 7 | cache: pip 8 | apt: true 9 | 10 | addons: 11 | apt: 12 | packages: 13 | - build-essential 14 | - libatlas-dev 15 | - libatlas-base-dev 16 | - liblapack-dev 17 | - g++ 18 | - gfortran 19 | python: 20 | - "2.7" 21 | - "3.4" 22 | - "3.5" 23 | - "3.6" 24 | 25 | install: 26 | - pip install --upgrade pip setuptools wheel 27 | - travis_wait pip install --only-binary=numpy,scipy numpy scipy 28 | - pip install python-coveralls pytest-cov pytest matplotlib 29 | - pip install pandas scikit-learn 30 | 31 | before_script: 32 | - "export DISPLAY=:99.0" 33 | - "sh -e /etc/init.d/xvfb start" 34 | - sleep 3 # give xvfb some time to start 35 | - "export PYTHONPATH=$PYTHONPATH:." 36 | 37 | script: 38 | - cd "$TRAVIS_BUILD_DIR/tests"; py.test --cov=polyssifier 39 | 40 | after_success: 41 | - coveralls 42 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | Contributors in no particular order. 
2 | 3 | * [Alvaro Ulloa](https://github.com/alvarouc) 4 | * [Stephen Eyerly](https://github.com/seyerly) 5 | * [Vamsi Krishna](https://github.com/ismav) 6 | * [Devon Hjelm](https://github.com/rdevon) 7 | * [Sergey Pliz](https://github.com/pliz) 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 
58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 
176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 
287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | 341 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Coverage Status](https://coveralls.io/repos/github/alvarouc/polyssifier/badge.svg)](https://coveralls.io/github/alvarouc/polyssifier) 2 | ![example workflow](https://github.com/alvarouc/polyssifier/actions/workflows/python-package.yml/badge.svg) 3 | 4 | Polyssifier 5 | =========== 6 | 7 | Polyssifier runs a multitude of machine learning models on data. It reports scores, confusion matrices, predictions, and plots the scores ranked by classifier performance. 
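For example, a minimal end-to-end run on synthetic data (here generated with scikit-learn's `make_classification`; the options used below are described in the Features section) could look like this:

```python
from sklearn.datasets import make_classification
from polyssifier import poly

# Synthetic binary classification problem
data, label = make_classification(n_samples=200, n_features=20, random_state=0)
report = poly(data, label, n_folds=5, scoring='auc',
              exclude=['Multilayer Perceptron'], concurrency=1)
print(report.scores)      # per-fold train/test scores, one column pair per model
print(report.confusions)  # aggregated confusion matrix per classifier
```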
8 |
9 | ## Installation
10 | ```bash
11 | pip install polyssifier
12 | ```
13 |
14 | ## How to use
15 | ### For classification
16 | ```python
17 | from polyssifier import poly
18 | # Load data
19 | data = np.load("/path/to/data.npy")
20 | label = np.load("/path/to/labels.npy")
21 | # Run analysis
22 | report = poly(data, label, n_folds=8)
23 | # Plot results
24 | report.plot_scores()
25 | report.plot_features(ntop=10)
26 | ```
27 |
28 | ### For regression
29 | ```python
30 | from polyssifier import polyr
31 | # Load data
32 | data = np.load("/path/to/data.npy")
33 | target = np.load("/path/to/target.npy")
34 | # Run analysis
35 | report = polyr(data, target, n_folds=8)
36 | # Plot results
37 | report.plot_scores()
38 | report.plot_features(ntop=10)
39 | ```
40 |
41 | ### In the terminal
42 | ```bash
43 | poly data.npy label.npy --concurrency 10
44 | ```
45 |
46 | ### Requirements
47 | - scikit-learn
48 | - NumPy
49 | - Pandas
50 |
51 | ### Features
52 | - Cross-validated scores.
53 | - Reports F1 score (scoring='f1') or ROC AUC (scoring='auc') for classification
54 | - Reports MSE (scoring='mse') or R^2 (scoring='r2') for regression
55 | - Feature ranking for compatible models (Logistic Regression, Linear SVM, Random Forest)
56 | - Parallel processing.
57 |   - Control the number of parallel processes with 'concurrency'.
58 |   - We recommend setting concurrency to half the number of cores in your system.
59 | - Saves trained models for future use in case of server malfunction.
60 |   - Set project_name to identify an experiment.
61 | - Activate the feature selection step by setting
62 |   - feature_selection=True
63 | - Automatically scales your data with scale=True
64 |
65 | Example: on [sample/example.ipynb](sample/example.ipynb)
66 |
67 | It includes the following classifiers:
68 |
69 | - Multilayer Perceptron
70 | - Nearest Neighbors
71 | - Linear SVM
72 | - RBF SVM
73 | - Decision Tree
74 | - Random Forest
75 | - Logistic Regression
76 | - Naive Bayes
77 | - Voting Classifier
78 |
79 | and the following regressors:
80 |
81 | - Linear Regression
82 | - Bayesian Ridge
83 | - PassiveAggressiveRegressor
84 | - GaussianProcessRegressor
85 | - Ridge
86 | - Lasso
87 | - Lars
88 | - LassoLars
89 | - OrthogonalMatchingPursuit
90 | - ElasticNet
91 |
92 | You can exclude any of these models by providing a list of names as follows:
93 | ```python
94 | from polyssifier import poly
95 |
96 | report = poly(data, label, n_folds=8,
97 |               exclude=['Multilayer Perceptron'])
98 | ```
99 |
--------------------------------------------------------------------------------
/polyssifier/__init__.py:
--------------------------------------------------------------------------------
1 | from .polyssifier import poly, polyr
2 | from .poly_utils import build_regressors, build_classifiers
--------------------------------------------------------------------------------
/polyssifier/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 |
4 | def make_logger(name=''):
5 |     formatter = logging.Formatter(
6 |         '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
7 |     logger = logging.getLogger(name)
8 |     logger.setLevel(logging.DEBUG)
9 |     fh = logging.FileHandler('{}.log'.format(name))
10 |     fh.setLevel(logging.DEBUG)
11 |     fh.setFormatter(formatter)
12 |     ch = logging.StreamHandler()
13 |     ch.setLevel(logging.INFO)
14 |     ch.setFormatter(formatter)
15 |     # add the handlers to the logger
16 |     logger.addHandler(fh)
17 |     logger.addHandler(ch)
18 |
19 |     return logger
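# Illustrative usage: a logger from make_logger('experiment1') writes
# DEBUG-and-above records to 'experiment1.log' and echoes INFO-and-above
# records to the console:
#
#     logger = make_logger('experiment1')
#     logger.info('printed to the console and saved to experiment1.log')
#     logger.debug('saved to experiment1.log only')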
--------------------------------------------------------------------------------
/polyssifier/poly_utils.py:
--------------------------------------------------------------------------------
1 | from sklearn.neighbors import KNeighborsClassifier
2 | from sklearn.svm import LinearSVC, SVC
3 | from sklearn.tree import DecisionTreeClassifier
4 | from sklearn.ensemble import RandomForestClassifier
5 | from sklearn.linear_model import (LogisticRegression,
6 |                                   LinearRegression,
7 |                                   BayesianRidge,
8 |                                   Ridge, Lasso,
9 |                                   ElasticNet, Lars, LassoLars,
10 |                                   OrthogonalMatchingPursuit,
11 |                                   PassiveAggressiveRegressor)
12 | from sklearn.naive_bayes import GaussianNB
13 | from sklearn.neural_network import MLPClassifier as MLP
14 | from sklearn.gaussian_process import GaussianProcessRegressor
15 | import collections
16 | import numpy as np
17 | from sklearn.feature_selection import SelectKBest, f_regression
18 | from sklearn.pipeline import make_pipeline
19 | from sklearn.preprocessing import StandardScaler
20 | from sklearn.gaussian_process.kernels import RBF
21 |
22 |
23 | class MyVoter(object):
24 |     """
25 |     Voter Classifier
26 |     Receives fitted classifiers and runs majority voting
27 |     """
28 |
29 |     def __init__(self, estimators):
30 |         '''
31 |         estimators: List of fitted classifiers
32 |         '''
33 |         self.estimators_ = estimators
34 |
35 |     def predict(self, X):
36 |         predictions = np.asarray(
37 |             [clf.predict(X) for clf in self.estimators_]).T
38 |         maj = np.apply_along_axis(
39 |             lambda x: np.argmax(np.bincount(x)), axis=1,
40 |             arr=predictions.astype('int'))
41 |         return maj
42 |
43 |
44 | class MyRegressionAverager(object):
45 |     """
46 |     Regression averager
47 |     Receives fitted regressors and averages the predictions of the regressors.
48 |     """
49 |
50 |     def __init__(self, estimators):
51 |         '''
52 |         estimators: List of fitted regressors
53 |         '''
54 |         self.estimators_ = estimators
55 |
56 |     def predict(self, X):
57 |         predictions = np.asarray(
58 |             [reg.predict(X) for reg in self.estimators_]).T
59 |
60 |         avg = np.average(predictions, axis=1)
61 |         return avg
62 |
63 |
64 | class MyRegressionMedianer(object):
65 |     """
66 |     Regression medianer
67 |     Receives fitted regressors and takes the median of the regressors' predictions.
68 |     """
69 |
70 |     def __init__(self, estimators):
71 |         '''
72 |         estimators: List of fitted regressors
73 |         '''
74 |         self.estimators_ = estimators
75 |
76 |     def predict(self, X):
77 |         predictions = np.asarray(
78 |             [reg.predict(X) for reg in self.estimators_]).T
79 |
80 |         avg = np.median(predictions, axis=1)
81 |         return avg
82 |
83 |
84 | def build_classifiers(exclude, scale, feature_selection, nCols):
85 |     '''
86 |     Input:
87 |     - exclude: list of names of classifiers to exclude from the analysis
88 |     - scale: True or False. Scale data before fitting classifier
89 |     - feature_selection: True or False. Run feature selection before
90 |     fitting classifier
91 |     - nCols: Number of columns in input dataset to classifiers
92 |
93 |     Output:
94 |     Dictionary with classifier name as keys.
95 | - 'clf': Classifier object 96 | - 'parameters': Dictionary with parameters of 'clf' as keys 97 | ''' 98 | classifiers = collections.OrderedDict() 99 | 100 | if 'Multilayer Perceptron' not in exclude: 101 | classifiers['Multilayer Perceptron'] = { 102 | 'clf': MLP(), 103 | 'parameters': {'hidden_layer_sizes': [(100, 50), (50, 25)], 104 | 'max_iter': [500]} 105 | } 106 | 107 | if 'Nearest Neighbors' not in exclude: 108 | classifiers['Nearest Neighbors'] = { 109 | 'clf': KNeighborsClassifier(), 110 | 'parameters': {'n_neighbors': [1, 5, 10, 20]}} 111 | 112 | if 'SVM' not in exclude: 113 | classifiers['SVM'] = { 114 | 'clf': SVC(C=1, probability=True, cache_size=10000, 115 | class_weight='balanced'), 116 | 'parameters': {'kernel': ['rbf', 'poly'], 117 | 'C': [0.01, 0.1, 1]}} 118 | 119 | if 'Linear SVM' not in exclude: 120 | classifiers['Linear SVM'] = { 121 | 'clf': LinearSVC(dual=False, class_weight='balanced'), 122 | 'parameters': {'C': [0.01, 0.1, 1], 123 | 'penalty': ['l1', 'l2']}} 124 | 125 | if 'Decision Tree' not in exclude: 126 | classifiers['Decision Tree'] = { 127 | 'clf': DecisionTreeClassifier(max_depth=None, 128 | max_features='auto'), 129 | 'parameters': {}} 130 | 131 | if 'Random Forest' not in exclude: 132 | classifiers['Random Forest'] = { 133 | 'clf': RandomForestClassifier(max_depth=None, 134 | n_estimators=10, 135 | max_features='auto'), 136 | 'parameters': {'n_estimators': list(range(5, 20))}} 137 | 138 | if 'Logistic Regression' not in exclude: 139 | classifiers['Logistic Regression'] = { 140 | 'clf': LogisticRegression(fit_intercept=True, solver='lbfgs', 141 | penalty='l2'), 142 | 'parameters': {'C': [0.001, 0.1, 1]}} 143 | 144 | if 'Naive Bayes' not in exclude: 145 | classifiers['Naive Bayes'] = { 146 | 'clf': GaussianNB(), 147 | 'parameters': {}} 148 | # classifiers['Voting'] = {} 149 | 150 | def name(x): 151 | """ 152 | :param x: The name of the classifier 153 | :return: The class of the final estimator in lower case form 154 | """ 155 | return x['clf']._final_estimator.__class__.__name__.lower() 156 | 157 | for key, val in classifiers.items(): 158 | if not scale and not feature_selection: 159 | break 160 | steps = [] 161 | if scale: 162 | steps.append(StandardScaler()) 163 | if feature_selection: 164 | steps.append(SelectKBest(f_regression, k='all')) 165 | steps.append(classifiers[key]['clf']) 166 | classifiers[key]['clf'] = make_pipeline(*steps) 167 | # Reorganize paramenter list for grid search 168 | new_dict = {} 169 | for keyp in classifiers[key]['parameters']: 170 | new_dict[name(classifiers[key]) + '__' + 171 | keyp] = classifiers[key]['parameters'][keyp] 172 | classifiers[key]['parameters'] = new_dict 173 | if nCols > 5 and feature_selection: 174 | classifiers[key]['parameters']['selectkbest__k'] = np.linspace( 175 | np.round(nCols / 5), nCols, 5).astype('int').tolist() 176 | 177 | return classifiers 178 | 179 | 180 | def build_regressors(exclude, scale, feature_selection, nCols): 181 | ''' 182 | This method builds an ordered dictionary of regressors, where the key is the name of the 183 | regressor and the value of each key contains a standard dictionary with two keys itself. The first key called 184 | 'reg' points to the regression object, which is created by scikit learn. The second key called 'parameters' 185 | points to another regular map containing the parameters which are associated with the particular regression model. 186 | These parameters are used by grid search in polyssifier.py when finding the best model. 
If parameters are not 187 | defined then grid search is not performed on that particular regression model, so the model's default parameters 188 | are used instead to find the best model for the particular data. 189 | ''' 190 | regressors = collections.OrderedDict() 191 | 192 | if 'Linear Regression' not in exclude: 193 | regressors['Linear Regression'] = { 194 | 'reg': LinearRegression(), 195 | 'parameters': {} # Best to leave default parameters 196 | } 197 | 198 | if 'Bayesian Ridge' not in exclude: 199 | regressors['Bayesian Ridge'] = { 200 | 'reg': BayesianRidge(), 201 | 'parameters': {} # Investigate if alpha and lambda parameters should be changed 202 | } 203 | 204 | if 'PassiveAggressiveRegressor' not in exclude: 205 | regressors['PassiveAggressiveRegressor'] = { 206 | 'reg': PassiveAggressiveRegressor(), 207 | 'parameters': {'C': [0.5, 1.0, 1.5] 208 | } 209 | } 210 | 211 | if 'GaussianProcessRegressor' not in exclude: 212 | regressors['GaussianProcessRegressor'] = { 213 | 'reg': GaussianProcessRegressor(), 214 | 'parameters': { 215 | 'alpha': [0.01, 0.1, 1.0, 10.0], 216 | 'kernel': [RBF(x) for x in [0.01, 1.0, 100.0, 1000.0]], 217 | } 218 | } 219 | 220 | if 'Ridge' not in exclude: 221 | regressors['Ridge'] = { 222 | 'reg': Ridge(), 223 | 'parameters': { 224 | 'alpha': [0.25, 0.50, 0.75, 1.00] 225 | } 226 | } 227 | 228 | if 'Lasso' not in exclude: 229 | regressors['Lasso'] = { 230 | 'reg': Lasso(), 231 | 'parameters': { 232 | 'alpha': [0.25, 0.50, 0.75, 1.00] 233 | } 234 | } 235 | 236 | if 'Lars' not in exclude: 237 | regressors['Lars'] = { 238 | 'reg': Lars(), 239 | 'parameters': {} # Best to leave the default parameters 240 | } 241 | 242 | if 'LassoLars' not in exclude: 243 | regressors['LassoLars'] = { 244 | 'reg': LassoLars(), 245 | 'parameters': {'alpha': [0.25, 0.50, 0.75, 1.00, 10.0]} 246 | } 247 | 248 | if 'OrthogonalMatchingPursuit' not in exclude: 249 | regressors['OrthogonalMatchingPursuit'] = { 250 | 'reg': OrthogonalMatchingPursuit(), 251 | 'parameters': {} # Best to leave default parameters 252 | } 253 | 254 | if 'ElasticNet' not in exclude: 255 | regressors['ElasticNet'] = { 256 | 'reg': ElasticNet(), 257 | 'parameters': {'alpha': [0.25, 0.50, 0.75, 1.00], 258 | 'l1_ratio': [0.25, 0.50, 0.75, 1.00]} 259 | } 260 | 261 | def name(x): 262 | """ 263 | :param x: The name of the regressor 264 | :return: The class of the final regression estimator in lower case form 265 | """ 266 | return x['reg']._final_estimator.__class__.__name__.lower() 267 | 268 | for key, val in regressors.items(): 269 | if not scale and not feature_selection: 270 | break 271 | steps = [] 272 | if scale: 273 | steps.append(StandardScaler()) 274 | if feature_selection: 275 | steps.append(SelectKBest(f_regression, k='all')) 276 | steps.append(regressors[key]['reg']) 277 | regressors[key]['reg'] = make_pipeline(*steps) 278 | # Reorganize paramenter list for grid search 279 | new_dict = {} 280 | for keyp in regressors[key]['parameters']: 281 | new_dict[name(regressors[key]) + '__' + 282 | keyp] = regressors[key]['parameters'][keyp] 283 | regressors[key]['parameters'] = new_dict 284 | if nCols > 5 and feature_selection: 285 | regressors[key]['parameters']['selectkbest__k'] = np.linspace( 286 | np.round(nCols / 5), nCols, 5).astype('int').tolist() 287 | 288 | return regressors 289 | -------------------------------------------------------------------------------- /polyssifier/polyssifier.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python
2 | import sys
3 | import argparse
4 | import numpy as np
5 | import pickle as p
6 | from multiprocessing import Manager, Pool
7 | import os
8 | import pandas as pd
9 | from copy import deepcopy
10 | from sklearn.model_selection import StratifiedKFold, GridSearchCV, KFold
11 | from sklearn.metrics import (f1_score, confusion_matrix, roc_auc_score,
12 |                              mean_squared_error, r2_score)
13 | import joblib
14 | import time
15 | from sklearn.preprocessing import LabelEncoder
16 | from itertools import starmap
17 | from .poly_utils import (build_classifiers, MyVoter, build_regressors,
18 |                          MyRegressionMedianer)
19 | from .report import Report
20 | import logging
21 | from .logger import make_logger
22 | sys.setrecursionlimit(10000)
23 | logger = make_logger('polyssifier')
24 |
25 |
26 | def poly(data, label, n_folds=10, scale=True, exclude=[],
27 |          feature_selection=False, save=False, scoring='auc',
28 |          project_name='', concurrency=1, verbose=True):
29 |     '''
30 |     Input
31 |     data = numpy matrix with as many rows as samples
32 |     label = numpy vector that labels each data row
33 |     n_folds = number of folds to run
34 |     scale = whether to scale data or not
35 |     exclude = list of classifiers to exclude from the analysis
36 |     feature_selection = whether to use feature selection or not (anova)
37 |     save = whether to save intermediate steps or not
38 |     scoring = Type of score to use ['auc', 'f1']
39 |     project_name = prefix used to save the intermediate steps
40 |     concurrency = number of parallel jobs to run
41 |     verbose = whether to print results or not
42 |     Output
43 |     scores = matrix with scores for each fold and classifier
44 |     confusions = confusion matrix for each classifier
45 |     predictions = cross-validated predictions for each classifier
46 |     '''
47 |     if verbose:
48 |         logger.setLevel(logging.DEBUG)
49 |     else:
50 |         logger.setLevel(logging.ERROR)
51 |
52 |     assert label.shape[0] == data.shape[0],\
53 |         "Label dimensions do not match data number of rows"
54 |     _le = LabelEncoder()
55 |     _le.fit(label)
56 |     label = _le.transform(label)
57 |     n_class = len(np.unique(label))
58 |     logger.info(f'Detected {n_class} classes in label')
59 |
60 |     if save and not os.path.exists('poly_{}/models'.format(project_name)):
61 |         os.makedirs('poly_{}/models'.format(project_name))
62 |
63 |     logger.info('Building classifiers ...')
64 |     classifiers = build_classifiers(exclude, scale,
65 |                                     feature_selection,
66 |                                     data.shape[1])
67 |
68 |     scores = pd.DataFrame(columns=pd.MultiIndex.from_product(
69 |         [classifiers.keys(), ['train', 'test']]),
70 |         index=range(n_folds))
71 |     predictions = pd.DataFrame(columns=classifiers.keys(),
72 |                                index=range(data.shape[0]))
73 |     test_prob = pd.DataFrame(columns=classifiers.keys(),
74 |                              index=range(data.shape[0]))
75 |     confusions = {}
76 |     coefficients = {}
77 |     # !fitted_clfs =
78 |     # pd.DataFrame(columns=classifiers.keys(), index = range(n_folds))
79 |
80 |     logger.info('Initialization, done.')
81 |
82 |     skf = StratifiedKFold(n_splits=n_folds, random_state=1988, shuffle=True)
83 |     skf.get_n_splits(np.zeros(data.shape[0]), label)
84 |     kf = list(skf.split(np.zeros(data.shape[0]), label))
85 |
86 |     # Parallel processing of tasks
87 |     manager = Manager()
88 |     args = manager.list()
89 |     args.append({})  # Store inputs
90 |     shared = args[0]
91 |     shared['kf'] = kf
92 |     shared['X'] = data
93 |     shared['y'] = label
94 |     args[0] = shared
95 |
96 |     args2 = []
97 |     for clf_name, val in classifiers.items():
98 |         for n_fold in range(n_folds):
99 |             args2.append((args, clf_name, val,
n_fold, project_name, 100 | save, scoring)) 101 | 102 | if concurrency == 1: 103 | result = list(starmap(fit_clf, args2)) 104 | else: 105 | pool = Pool(processes=concurrency) 106 | result = pool.starmap(fit_clf, args2) 107 | pool.close() 108 | 109 | fitted_clfs = {key: [] for key in classifiers} 110 | 111 | # Gather results 112 | for clf_name in classifiers: 113 | coefficients[clf_name] = [] 114 | temp = np.zeros((n_class, n_class)) 115 | temp_pred = np.zeros((data.shape[0], )) 116 | temp_prob = np.zeros((data.shape[0], )) 117 | clfs = fitted_clfs[clf_name] 118 | for n in range(n_folds): 119 | train_score, test_score, prediction, prob, confusion,\ 120 | coefs, fitted_clf = result.pop(0) 121 | clfs.append(fitted_clf) 122 | scores.loc[n, (clf_name, 'train')] = train_score 123 | scores.loc[n, (clf_name, 'test')] = test_score 124 | temp += confusion 125 | temp_prob[kf[n][1]] = prob 126 | temp_pred[kf[n][1]] = _le.inverse_transform(prediction) 127 | coefficients[clf_name].append(coefs) 128 | 129 | confusions[clf_name] = temp 130 | predictions[clf_name] = temp_pred 131 | test_prob[clf_name] = temp_prob 132 | 133 | # Voting 134 | fitted_clfs = pd.DataFrame(fitted_clfs) 135 | scores['Voting', 'train'] = np.zeros((n_folds, )) 136 | scores['Voting', 'test'] = np.zeros((n_folds, )) 137 | temp = np.zeros((n_class, n_class)) 138 | temp_pred = np.zeros((data.shape[0], )) 139 | for n, (train, test) in enumerate(kf): 140 | clf = MyVoter(fitted_clfs.loc[n].values) 141 | X, y = data[train, :], label[train] 142 | scores.loc[n, ('Voting', 'train')] = _scorer(clf, X, y) 143 | X, y = data[test, :], label[test] 144 | scores.loc[n, ('Voting', 'test')] = _scorer(clf, X, y) 145 | temp_pred[test] = clf.predict(X) 146 | temp += confusion_matrix(y, temp_pred[test]) 147 | 148 | confusions['Voting'] = temp 149 | predictions['Voting'] = temp_pred 150 | test_prob['Voting'] = temp_pred 151 | ###### 152 | 153 | # saving confusion matrices 154 | if save: 155 | with open('poly_' + project_name + '/confusions.pkl', 'wb') as f: 156 | p.dump(confusions, f, protocol=2) 157 | 158 | if verbose: 159 | print(scores.astype('float').describe().transpose() 160 | [['mean', 'std', 'min', 'max']]) 161 | return Report(scores=scores, confusions=confusions, 162 | predictions=predictions, test_prob=test_prob, 163 | coefficients=coefficients, 164 | feature_selection=feature_selection) 165 | 166 | 167 | def _scorer(clf, X, y): 168 | '''Function that scores a classifier according to what is available as a 169 | predict function. 
170 |     Input:
171 |     - clf = Fitted classifier object
172 |     - X = input data matrix
173 |     - y = true labels used for scoring
174 |     Output:
175 |     - AUC score for binary classification or F1 for multiclass
176 |     The order of priority is as follows:
177 |     - predict_proba
178 |     - decision_function
179 |     - predict
180 |     '''
181 |     n_class = len(np.unique(y))
182 |     if n_class == 2:
183 |         if hasattr(clf, 'predict_proba'):
184 |             ypred = clf.predict_proba(X)
185 |             try:
186 |                 ypred = ypred[:, 1]
187 |             except:
188 |                 print('predict proba return shape {}'.format(ypred.shape))
189 |
190 |             assert len(ypred.shape) == 1,\
191 |                 'predict proba return shape {}'.format(ypred.shape)
192 |         elif hasattr(clf, 'decision_function'):
193 |             ypred = clf.decision_function(X)
194 |             assert len(ypred.shape) == 1,\
195 |                 'decision_function return shape {}'.format(ypred.shape)
196 |         else:
197 |             ypred = clf.predict(X)
198 |         score = roc_auc_score(y, ypred)
199 |     else:
200 |         score = f1_score(y, clf.predict(X), average='weighted')
201 |     return score
202 |
203 |
204 | def fit_clf(args, clf_name, val, n_fold, project_name, save, scoring):
205 |     '''
206 |     Multiprocess safe function that fits classifiers
207 |     args: shared dictionary that contains
208 |         X: all data
209 |         y: all labels
210 |         kf: list of train and test indexes for each fold
211 |     clf_name: name of the classifier model
212 |     val: dictionary with
213 |         clf: sklearn compatible classifier
214 |         parameters: dictionary with parameters, can be used for grid search
215 |     n_fold: index of the fold to fit
216 |     project_name: string with the project folder name to save model
217 |     '''
218 |     train, test = args[0]['kf'][n_fold]
219 |     X = args[0]['X'][train, :]
220 |     y = args[0]['y'][train]
221 |     file_name = 'poly_{}/models/{}_{}.p'.format(
222 |         project_name, clf_name, n_fold + 1)
223 |     start = time.time()
224 |     if save and os.path.isfile(file_name):
225 |         logger.info('Loading {} {}'.format(file_name, n_fold))
226 |         clf = joblib.load(file_name)
227 |     else:
228 |         logger.info('Training {} {}'.format(clf_name, n_fold))
229 |         clf = deepcopy(val['clf'])
230 |         if val['parameters']:
231 |             clf = GridSearchCV(clf, val['parameters'], n_jobs=1, cv=3,
232 |                                scoring=_scorer)
233 |         clf.fit(X, y)
234 |         if save:
235 |             joblib.dump(clf, file_name)
236 |
237 |     train_score = _scorer(clf, X, y)
238 |
239 |     X = args[0]['X'][test, :]
240 |     y = args[0]['y'][test]
241 |     # Scores
242 |     test_score = _scorer(clf, X, y)
243 |     ypred = clf.predict(X)
244 |     if hasattr(clf, 'predict_proba'):
245 |         # For compatibility with different sklearn versions
246 |         yprob = clf.predict_proba(X)
247 |         try:
248 |             yprob = yprob[:, 1]
249 |         except:
250 |             print('predict proba return shape {}'.format(yprob.shape))
251 |
252 |     elif hasattr(clf, 'decision_function'):
253 |         yprob = clf.decision_function(X)
254 |         try:
255 |             yprob = yprob[:, 1]
256 |         except:
257 |             print('decision_function return shape {}'.format(yprob.shape))
258 |
259 |         assert len(yprob.shape) == 1,\
260 |             'decision_function return shape {}'.format(yprob.shape)
261 |
262 |     confusion = confusion_matrix(y, ypred)
263 |     duration = time.time() - start
264 |     logger.info('{0:25} {1:2}: Train {2:.2f}/Test {3:.2f}, {4:.2f} sec'.format(
265 |         clf_name, n_fold, train_score, test_score, duration))
266 |
267 |     # Feature importance
268 |     if hasattr(clf, 'steps'):
269 |         temp = clf.steps[-1][1]
270 |     elif hasattr(clf, 'best_estimator_'):
271 |         if hasattr(clf.best_estimator_, 'steps'):
272 |             temp = clf.best_estimator_.steps[-1][1]
273 |         else:
274 |             temp = clf.best_estimator_
275 |     try:
276 |         if hasattr(temp,
'coef_'):
277 |             coefficients = temp.coef_
278 |         elif hasattr(temp, 'feature_importances_'):
279 |             coefficients = temp.feature_importances_
280 |         else:
281 |             coefficients = None
282 |     except:
283 |         coefficients = None
284 |
285 |     return (train_score, test_score,
286 |             ypred, yprob,  # predictions and probabilities
287 |             confusion,  # confusion matrix
288 |             coefficients,  # Coefficients for feature ranking
289 |             clf)  # fitted clf
290 |
291 |
292 | def create_polynomial(data, degree):
293 |     '''
294 |     :param data: the data (numpy matrix) which will have its data vectors raised to powers
295 |     :param degree: the degree of the polynomial model we wish to fit
296 |     :return: a new data matrix of the specified degree (for polynomial fitting purposes)
297 |     '''
298 |
299 |     # First we make an empty matrix which is the size of what we wish to pass through to the linear regressors
300 |     height_of_pass_through = data.shape[0]
301 |     width_of_pass_through = degree * data.shape[1]
302 |     to_pass_through = np.zeros(
303 |         shape=(height_of_pass_through, width_of_pass_through))
304 |
305 |     # These are the width and height of each "exponentiated" matrix
306 |     height_exponential_matrix = data.shape[0]
307 |     width_exponential_matrix = data.shape[1]
308 |
309 |     for i in range(degree):
310 |         to_add_in = data ** (i + 1)
311 |         for j in range(height_exponential_matrix):
312 |             for k in range(width_exponential_matrix):
313 |                 to_pass_through.itemset(
314 |                     (j, k + i * width_exponential_matrix), (to_add_in.item(j, k)))
315 |     return to_pass_through
316 |
317 |
318 | def polyr(data, label, n_folds=10, scale=True, exclude=[],
319 |           feature_selection=False, num_degrees=1, save=False, scoring='r2',
320 |           project_name='', concurrency=1, verbose=True):
321 |     '''
322 |     Input
323 |     data = numpy matrix with as many rows as samples
324 |     label = numpy vector with the target value of each data row
325 |     n_folds = number of folds to run
326 |     scale = whether to scale data or not
327 |     exclude = list of regressors to exclude from the analysis
328 |     feature_selection = whether to use feature selection or not (anova)
329 |     num_degrees = the degree of the polynomial model to fit to the data (default is linear)
330 |     save = whether to save intermediate steps or not
331 |     scoring = Type of score to use ['mse', 'r2']
332 |     project_name = prefix used to save the intermediate steps
333 |     concurrency = number of parallel jobs to run
334 |     verbose = whether to print results or not
335 |
336 |     Output
337 |     scores = matrix with scores for each fold and regressor
338 |     confusions = empty dictionary (kept for interface compatibility with poly)
339 |     predictions = cross-validated predictions for each regressor
340 |     '''
341 |     if num_degrees != 1:
342 |         polynomial_data = create_polynomial(data, num_degrees)
343 |         return polyr(data=polynomial_data, label=label, n_folds=n_folds, scale=scale, exclude=exclude,
344 |                      feature_selection=feature_selection, num_degrees=1, save=save, scoring=scoring,
345 |                      project_name=project_name, concurrency=concurrency, verbose=verbose)
346 |
347 |     assert label.shape[0] == data.shape[0],\
348 |         "Label dimensions do not match data number of rows"
349 |
350 |     # If the user wishes to save the intermediate steps and there is not already a polyr models directory then
351 |     # this statement creates one.
352 |     if save and not os.path.exists('polyr_{}/models'.format(project_name)):
353 |         os.makedirs('polyr_{}/models'.format(project_name))
354 |
355 |     # Whether or not intermediate steps will be printed out.
356 |     if verbose:
357 |         logger.setLevel(logging.DEBUG)
358 |     else:
359 |         logger.setLevel(logging.ERROR)
360 |     logger.info('Building regressors ...')
361 |
362 |     # The main regressors dictionary
363 |     regressors = build_regressors(exclude, scale,
364 |                                   feature_selection,
365 |                                   data.shape[1])
366 |
367 |     scores = pd.DataFrame(columns=pd.MultiIndex.from_product(
368 |         [regressors.keys(), ['train', 'test']]),
369 |         index=range(n_folds))
370 |     predictions = pd.DataFrame(columns=regressors.keys(),
371 |                                index=range(data.shape[0]))
372 |     test_prob = pd.DataFrame(columns=regressors.keys(),
373 |                              index=range(data.shape[0]))
374 |     confusions = {}
375 |     coefficients = {}
376 |     # !fitted_regs =
377 |     # pd.DataFrame(columns=regressors.keys(), index = range(n_folds))
378 |
379 |     logger.info('Initialization, done.')
380 |
381 |     # This provides train/test indices to split data in train/test sets.
382 |     skf = KFold(n_splits=n_folds)  # , random_state=1988)
383 |     skf.get_n_splits(np.zeros(data.shape[0]), label)
384 |     kf = list(skf.split(np.zeros(data.shape[0]), label))
385 |
386 |     # Parallel processing of tasks
387 |     manager = Manager()
388 |     args = manager.list()
389 |     args.append({})  # Store inputs
390 |     shared = args[0]
391 |     shared['kf'] = kf
392 |     shared['X'] = data
393 |     shared['y'] = label
394 |     args[0] = shared
395 |
396 |     args2 = []
397 |     for reg_name, val in regressors.items():
398 |         for n_fold in range(n_folds):
399 |             args2.append((args, reg_name, val, n_fold, project_name,
400 |                           save, scoring))
401 |
402 |     if concurrency == 1:
403 |         result = list(starmap(fit_reg, args2))
404 |     else:
405 |         pool = Pool(processes=concurrency)
406 |         result = pool.starmap(fit_reg, args2)
407 |         pool.close()
408 |
409 |     fitted_regs = {key: [] for key in regressors}
410 |
411 |     # Gather results
412 |     for reg_name in regressors:
413 |         coefficients[reg_name] = []
414 |         temp_pred = np.zeros((data.shape[0], ))
415 |         temp_prob = np.zeros((data.shape[0], ))
416 |         regs = fitted_regs[reg_name]
417 |         for n in range(n_folds):
418 |             train_score, test_score, prediction, prob,\
419 |                 coefs, fitted_reg = result.pop(0)
420 |             regs.append(fitted_reg)
421 |             scores.loc[n, (reg_name, 'train')] = train_score
422 |             scores.loc[n, (reg_name, 'test')] = test_score
423 |             temp_prob[kf[n][1]] = prob
424 |             temp_pred[kf[n][1]] = prediction
425 |             coefficients[reg_name].append(coefs)
426 |
427 |         predictions[reg_name] = temp_pred
428 |         test_prob[reg_name] = temp_prob
429 |
430 |     # This computes the median of the predictions of the regressors.
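# For each fold, MyRegressionMedianer (defined in poly_utils.py) stacks the
# fold's fitted regressors and returns the element-wise median of their
# predictions, which is less sensitive to a single badly fit model than a
# plain mean. A rough sketch of what it computes:
#     preds = np.asarray([reg.predict(X) for reg in regs])  # (n_models, n_samples)
#     ensemble = np.median(preds, axis=0)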
431 |     fitted_regs = pd.DataFrame(fitted_regs)
432 |     scores['Median', 'train'] = np.zeros((n_folds, ))
433 |     scores['Median', 'test'] = np.zeros((n_folds, ))
434 |     temp_pred = np.zeros((data.shape[0], ))
435 |     for n, (train, test) in enumerate(kf):
436 |         reg = MyRegressionMedianer(fitted_regs.loc[n].values)
437 |         X, y = data[train, :], label[train]
438 |         scores.loc[n, ('Median', 'train')] = _reg_scorer(reg, X, y, scoring)
439 |         X, y = data[test, :], label[test]
440 |         scores.loc[n, ('Median', 'test')] = _reg_scorer(reg, X, y, scoring)
441 |         temp_pred[test] = reg.predict(X)
442 |
443 |     predictions['Median'] = temp_pred
444 |
445 |     if verbose:
446 |         print(scores.astype('float').describe().transpose()
447 |               [['mean', 'std', 'min', 'max']])
448 |     return Report(scores=scores, confusions=confusions,
449 |                   predictions=predictions, test_prob=test_prob,
450 |                   coefficients=coefficients, scoring=scoring,
451 |                   feature_selection=feature_selection)
452 |
453 |
454 | def _reg_scorer(reg, X, y, scoring):
455 |     '''Function that scores a regressor according to what is available as a
456 |     predict function.
457 |     Input:
458 |     - reg = Fitted regressor object
459 |     - X = input data matrix
460 |     - y = corresponding values to the data matrix
461 |     Output:
462 |     - The mean squared error or R-squared value for the given regressor and data. The default scoring is
463 |     the R-squared value.
464 |     '''
465 |     if scoring == 'mse':
466 |         return mean_squared_error(y, reg.predict(X))
467 |     else:
468 |         return r2_score(y, reg.predict(X))
469 |
470 |
471 | def fit_reg(args, reg_name, val, n_fold, project_name, save, scoring):
472 |     '''
473 |     Multiprocess safe function that fits regressors
474 |     args: shared dictionary that contains
475 |         X: all data
476 |         y: all labels
477 |         kf: list of train and test indexes for each fold
478 |     reg_name: name of the regressor model
479 |     val: dictionary with
480 |         reg: sklearn compatible regressor
481 |         parameters: dictionary with parameters, can be used for grid search
482 |     n_fold: index of the fold to fit
483 |     project_name: string with the project folder name to save model
484 |     '''
485 |
486 |     # Creates the scoring string to pass into grid search.
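# Note: GridSearchCV always maximizes its scoring function, so 'mse' has to
# be mapped to scikit-learn's 'neg_mean_squared_error' (higher, i.e. less
# negative, is better); 'r2' passes through unchanged and is also used as
# the fallback for any unrecognized value.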
487 |     if scoring == 'mse':
488 |         scorestring = 'neg_mean_squared_error'
489 |     elif scoring == 'r2':
490 |         scorestring = 'r2'
491 |     else:
492 |         scorestring = 'r2'
493 |
494 |     train, test = args[0]['kf'][n_fold]
495 |     X = args[0]['X'][train, :]
496 |     y = args[0]['y'][train]
497 |     file_name = 'polyr_{}/models/{}_{}.p'.format(
498 |         project_name, reg_name, n_fold + 1)
499 |     start = time.time()
500 |     if os.path.isfile(file_name):
501 |         logger.info('Loading {} {}'.format(file_name, n_fold))
502 |         reg = joblib.load(file_name)
503 |     else:
504 |         logger.info('Training {} {}'.format(reg_name, n_fold))
505 |         reg = deepcopy(val['reg'])
506 |         if val['parameters']:
507 |             kfold = KFold(n_splits=3)
508 |             reg = GridSearchCV(reg, val['parameters'], n_jobs=1, cv=kfold,
509 |                                scoring=scorestring)
510 |         reg.fit(X, y)
511 |         if save:
512 |             joblib.dump(reg, file_name)
513 |
514 |     train_score = _reg_scorer(reg, X, y, scoring)
515 |
516 |     X = args[0]['X'][test, :]
517 |     y = args[0]['y'][test]
518 |     # Scores
519 |     test_score = _reg_scorer(reg, X, y, scoring)
520 |     ypred = reg.predict(X)
521 |     yprob = 0  # placeholder: regressors do not produce probabilities
522 |
523 |     duration = time.time() - start
524 |     logger.info('{0:25} {1:2}: Train {2:.2f}/Test {3:.2f}, {4:.2f} sec'.format(
525 |         reg_name, n_fold, train_score, test_score, duration))
526 |
527 |     # Feature importance; fall back to the bare estimator if needed
528 |     if hasattr(reg, 'steps'):
529 |         temp = reg.steps[-1][1]
530 |     elif hasattr(reg, 'best_estimator_'):
531 |         if hasattr(reg.best_estimator_, 'steps'):
532 |             temp = reg.best_estimator_.steps[-1][1]
533 |         else:
534 |             temp = reg.best_estimator_
535 |     else:
536 |         temp = reg
537 |     if hasattr(temp, 'coef_'):
538 |         coefficients = temp.coef_
539 |     elif hasattr(temp, 'feature_importances_'):
540 |         coefficients = temp.feature_importances_
541 |     else:
542 |         coefficients = None
543 |     return (train_score, test_score,
544 |             ypred, yprob,  # predictions and probabilities
545 |             coefficients,  # coefficients for feature ranking
546 |             reg)  # the fitted regressor
547 |
548 | def make_argument_parser():
549 |     '''
550 |     Creates an ArgumentParser to read the options for this script from
551 |     sys.argv
552 |     '''
553 |     parser = argparse.ArgumentParser()
554 |     parser.add_argument('data', default='data.npy',
555 |                         help='Data file name')
556 |     parser.add_argument('label', default='labels.npy',
557 |                         help='Label file name')
558 |     parser.add_argument('--level', default='info',
559 |                         help='Logging level')
560 |     parser.add_argument('--name', default='default',
561 |                         help='Experiment name')
562 |     parser.add_argument('--concurrency', default='1',
563 |                         help='Number of allowed concurrent processes')
564 |
565 |     return parser
566 |
567 |
568 | if __name__ == '__main__':
569 |
570 |     parser = make_argument_parser()
571 |     args = parser.parse_args()
572 |
573 |     if args.level == 'info':
574 |         logger.setLevel(logging.INFO)
575 |     else:
576 |         logger.setLevel(logging.DEBUG)
577 |
578 |     data = np.load(args.data)
579 |     label = np.load(args.label)
580 |     labelcopy = deepcopy(label)
581 |
582 |     logger.info(
583 |         'Starting analysis with {} workers'.format(args.concurrency))
584 |
585 |     # Heuristic: more than 50 unique label values is treated as a regression
586 |     # problem; anything else is treated as classification.
587 |     if len(np.unique(labelcopy)) > 50:
588 |         report = polyr(data, label, n_folds=5, project_name=args.name,
589 |                        concurrency=int(args.concurrency))
590 |     else:
591 |         report = poly(data, label, n_folds=5, project_name=args.name,
592 |                       concurrency=int(args.concurrency))
593 |     report.plot_scores(os.path.join('polyr_' + args.name, args.name))
594 |     report.plot_features(os.path.join('polyr_' + args.name, args.name))
595 |
--------------------------------------------------------------------------------
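For orientation, `polyr` above can be driven through the script entry point or directly from Python. A minimal usage sketch follows; the 'demo' project name and output path are hypothetical, and `save=False` keeps the run from persisting model pickles:

    from sklearn.datasets import load_diabetes
    from polyssifier import polyr

    diabetes = load_diabetes()
    # Fit every included regressor over 5 folds and collect
    # cross-validated scores, predictions, and coefficients.
    report = polyr(diabetes.data, diabetes.target, n_folds=5,
                   scoring='r2', concurrency=1, save=False,
                   project_name='demo')
    report.plot_scores('demo_scores')  # writes demo_scores.pdf and .svg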
/polyssifier/report.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import matplotlib.patches as mpatches
4 | from .logger import make_logger
5 | from scipy.stats import rankdata
6 | from functools import partial
7 |
8 | log = make_logger('Report')
9 |
10 |
11 | class Report(object):
12 |     """Report class that contains results from running polyssifier
13 |     """
14 |
15 |     def __init__(self, scores, confusions, predictions,
16 |                  test_prob, coefficients, feature_selection,
17 |                  scoring='auc'):
18 |         self.scores = scores
19 |         self.confusions = confusions
20 |         self.predictions = predictions
21 |         self.test_proba = test_prob
22 |         self.coefficients = coefficients
23 |         self.scoring = scoring
24 |         self._feature_selection = feature_selection
25 |
26 |     def plot_scores(self, path='temp'):
27 |         plot_scores(self.scores, self.scoring, path)
28 |
29 |     def plot_features(self, ntop=3, path='temp',
30 |                       coef_names=None):
31 |         if self._feature_selection:
32 |             log.warning(
33 |                 'Feature importance is not available when feature_selection=True; rerun with feature_selection=False')
34 |         else:
35 |             plot_features(coefs=self.coefficients,
36 |                           coef_names=coef_names,
37 |                           ntop=ntop, file_name=path)
38 |
39 |
40 | def plot_features(coefs, coef_names=None,
41 |                   ntop=3, file_name='temp'):
42 |
43 |     fs = {}
44 |
45 |     for key, val in coefs.items():
46 |
47 |         if val[0] is not None:
48 |             val = np.array(val).squeeze()  # [folds, labels, coefs]
49 |             if len(val.shape) == 2:
50 |                 fs[key] = val
51 |             else:
52 |                 fs[key] = val.mean(axis=1)
53 |
54 |     n_coefs = fs[list(fs.keys())[0]].shape[-1]
55 |     if coef_names is None:
56 |         coef_names = np.array([str(c + 1) for c in range(n_coefs)])
57 |     else:
58 |         coef_names = np.array(coef_names)
59 |
60 |     for key, val in fs.items():
61 |
62 |         figure_path = file_name + '_' + key + '_feature_ranking.png'
63 |         log.info('Plotting %s coefs to %s', key, figure_path)
64 |         plt.figure(figsize=(10, 10))
65 |         # plot mean coefficient weights with their across-fold deviation
66 |
67 |         mean = np.mean(val, axis=0)
68 |         std = np.std(val, axis=0)
69 |         idx = np.argsort(np.abs(mean))
70 |
71 |         topm = mean[idx][-ntop:][::-1]
72 |         tops = std[idx][-ntop:][::-1]
73 |         plt.subplot(211)
74 |         plt.bar(range(ntop), topm, yerr=tops,
75 |                 tick_label=(coef_names[idx][-ntop:][::-1]))
76 |         plt.title('{}: Feature importance'.format(key))
77 |         plt.xlabel('Feature index')
78 |
79 |         # plot coefficient ranks (0 is the strongest feature)
80 |         rank = n_coefs - np.apply_along_axis(
81 |             partial(rankdata, method='max'), axis=1, arr=np.abs(val))
82 |         rank_mean = rank.mean(axis=0)
83 |         rank_std = rank.std(axis=0)
84 |         idx = np.argsort(rank_mean)
85 |         topm = rank_mean[idx][:ntop]
86 |         tops = rank_std[idx][:ntop]
87 |
88 |         plt.subplot(212)
89 |         plt.bar(range(ntop), topm, yerr=tops,
90 |                 tick_label=coef_names[idx][:ntop])
91 |         plt.title('{}: Feature rank'.format(key))
92 |         plt.xlabel('Feature index')
93 |         plt.grid(axis='y')
94 |         plt.tight_layout()
95 |         plt.savefig(figure_path)
96 |
97 |
98 | def plot_scores(scores, scoring='auc', file_name='temp', min_val=None):
99 |
100 |     df = scores.apply(np.mean).unstack().join(
101 |         scores.apply(np.std).unstack(), lsuffix='_mean', rsuffix='_std')
102 |     df.columns = ['Test score', 'Train score', 'Test std', 'Train std']
103 |     df.sort_values('Test score', ascending=False, inplace=True)
104 |     error = df[['Train std', 'Test std']]
105 |     error.columns = ['Train score', 'Test score']
106 |     data = df[['Train score', 'Test score']]
107 |
108 |     nc = df.shape[0]
109 |
110 |     ax1 = data.plot(kind='bar', yerr=error, colormap='coolwarm',
111 |                     figsize=(nc * 2, 5), alpha=1)
112 |
113 |
114 |
115 |
116 |     ax1.set_xticklabels([])
117 |     ax1.set_xlabel('')
118 |     plt.ylabel(scoring, fontsize='large', rotation='horizontal')
119 |     ax1.yaxis.grid(True)
120 |
121 |     # This creates the legend for the plot
122 |     testing_label = mpatches.Patch(color='red', label='Testing Score')
123 |     training_label = mpatches.Patch(color='blue', label='Training Score')
124 |     plt.legend(handles=[testing_label, training_label], loc='upper right')
125 |
126 |     temp = np.array(data)
127 |
128 |     # Size the y-axis according to the scoring metric in use: r2 is plotted
129 |     # on [0, 1], mse from 0 to the observed max, anything else data-driven.
130 |     if scoring == 'r2':
131 |         ymax = 1
132 |         ymin = 0
133 |     elif scoring == 'mse':
134 |         ymin = 0
135 |         ymax = temp.max()
136 |     else:
137 |         ymin = max(temp.min() - .1, 0) if min_val is None else min_val
138 |         ymax = 1
139 |
140 |     ax1.set_ylim(ymin, ymax)
141 |     for n, rect in enumerate(ax1.patches):
142 |         if n >= nc:
143 |             break
144 |         ax1.text(rect.get_x() - rect.get_width() / 2., ymin + (1 - ymin) * .01,
145 |                  data.index[n], ha='center', va='bottom',
146 |                  rotation='90', color='black', fontsize=15)
147 |     plt.tight_layout()
148 |     plt.savefig(file_name + '.pdf')
149 |     plt.savefig(file_name + '.svg', transparent=False)
150 |     return ax1
151 |
--------------------------------------------------------------------------------
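A note on the rank computation in `plot_features` above: `rankdata` ranks values in ascending order, so subtracting the ranks of the absolute coefficients from the feature count makes 0 the strongest feature. A self-contained sketch of the same idea, with made-up coefficients (`plot_features` additionally uses `method='max'`, which only matters for ties):

    import numpy as np
    from scipy.stats import rankdata

    coefs = np.array([[0.1, -0.9, 0.3],    # fold 1
                      [0.2, -0.8, 0.1]])   # fold 2; shape [folds, features]
    # Per fold: rank |coef| ascending, then flip so 0 marks the top feature.
    rank = coefs.shape[1] - np.apply_along_axis(rankdata, 1, np.abs(coefs))
    print(rank.mean(axis=0))  # [1.5, 0., 1.5]: feature 2 is best in both folds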
/requirements.txt:
--------------------------------------------------------------------------------
1 | attrs==21.3.0
2 | cycler==0.11.0
3 | importlib-metadata==4.8.3
4 | iniconfig==1.1.1
5 | joblib==1.1.0
6 | kiwisolver==1.3.1
7 | matplotlib==3.3.4
8 | numpy==1.22.0
9 | packaging==21.3
10 | pandas==1.1.5
11 | Pillow==9.0.1
12 | pluggy==1.0.0
13 | py==1.11.0
14 | pyparsing==3.0.6
15 | pytest==6.2.5
16 | python-dateutil==2.8.2
17 | pytz==2021.3
18 | scikit-learn==0.24.2
19 | scipy==1.5.4
20 | six==1.16.0
21 | threadpoolctl==3.0.0
22 | toml==0.10.2
23 | typing_extensions==4.0.1
24 | zipp==3.6.0
25 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | [bdist_wheel]
4 | universal=1
5 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | """A setuptools based setup module.
2 |
3 | See:
4 | https://packaging.python.org/en/latest/distributing.html
5 | https://github.com/pypa/sampleproject
6 | """
7 |
8 | # Always prefer setuptools over distutils
9 | from setuptools import setup, find_packages
10 | # To use a consistent encoding
11 | from codecs import open
12 | from os import path
13 |
14 | here = path.abspath(path.dirname(__file__))
15 |
16 | setup(
17 |     name='polyssifier',
18 |
19 |     # Versions should comply with PEP440. For a discussion on single-sourcing
20 |     # the version across setup.py and the project code, see
21 |     # https://packaging.python.org/en/latest/single_source_version.html
22 |     version='0.5.6',
23 |
24 |     description='Data exploration tool for assessing optimal classification methods',
25 |
26 |     # The project's main homepage.
27 |     url='https://github.com/alvarouc/polyssifier',
28 |
29 |     # Author details
30 |     author='Alvaro Ulloa',
31 |     author_email='alvarouc@gmail.com',
32 |
33 |     # Choose your license
34 |     license='GPLv3',
35 |
36 |     # See https://pypi.python.org/pypi?%3Aaction=list_classifiers
37 |     classifiers=[
38 |         # How mature is this project? Common values are
39 |         #   3 - Alpha
40 |         #   4 - Beta
41 |         #   5 - Production/Stable
42 |         'Development Status :: 3 - Alpha',
43 |
44 |         # Indicate who your project is intended for
45 |         'Intended Audience :: Developers',
46 |         'Topic :: Software Development :: Build Tools',
47 |
48 |         # Pick your license as you wish (should match "license" above)
49 |         'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
50 |
51 |         # Specify the Python versions you support here. In particular, ensure
52 |         # that you indicate whether you support Python 2, Python 3 or both.
53 |         'Programming Language :: Python :: 2',
54 |         'Programming Language :: Python :: 2.7',
55 |         'Programming Language :: Python :: 3',
56 |         'Programming Language :: Python :: 3.3',
57 |         'Programming Language :: Python :: 3.4',
58 |         'Programming Language :: Python :: 3.5',
59 |         'Programming Language :: Python :: 3.6',
60 |     ],
61 |
62 |     # What does your project relate to?
63 |     keywords='classification machine learning data science',
64 |
65 |     # You can just specify the packages manually here if your project is
66 |     # simple. Or you can use find_packages().
67 |     packages=['polyssifier', ],
68 |
69 |     # List run-time dependencies here. These will be installed by pip when
70 |     # your project is installed. For an analysis of "install_requires" vs pip's
71 |     # requirements files see:
72 |     # https://packaging.python.org/en/latest/requirements.html
73 |     install_requires=['pandas', 'scikit-learn', 'numpy', 'scipy', 'matplotlib'],
74 |
75 | )
76 |
--------------------------------------------------------------------------------
/tests/test_classification.py:
--------------------------------------------------------------------------------
1 | import matplotlib  # noqa: E402
2 | matplotlib.use('Agg')  # noqa: E402
3 | # import sys
4 | # sys.path.append('../')  # noqa: E402
5 | from polyssifier import poly  # noqa: E402
6 |
7 | from sklearn.datasets import make_classification
8 | import warnings
9 | import pytest
10 |
11 | warnings.filterwarnings("ignore", category=DeprecationWarning)
12 |
13 | NSAMPLES = 100
14 | BC_DATA_PARAMS = dict(n_samples=NSAMPLES, n_features=50,
15 |                       n_informative=10, n_redundant=10,
16 |                       n_repeated=0, n_classes=2,
17 |                       n_clusters_per_class=1, weights=None,
18 |                       flip_y=0.01, class_sep=2.0,
19 |                       hypercube=True, shift=0.0,
20 |                       scale=1.0, shuffle=True,
21 |                       random_state=1988)
22 |
23 | MC_DATA_PARAMS = dict(n_samples=NSAMPLES, n_features=50,
24 |                       n_informative=10, n_redundant=10,
25 |                       n_repeated=0, n_classes=3,
26 |                       n_clusters_per_class=1, weights=None,
27 |                       flip_y=0.01, class_sep=2.0,
28 |                       hypercube=True, shift=0.0,
29 |                       scale=1.0, shuffle=True,
30 |                       random_state=1988)
31 |
32 |
33 | @pytest.mark.medium
34 | def test_run():
35 |     data, label = make_classification(**BC_DATA_PARAMS)
36 |     report = poly(data, label, n_folds=2, verbose=1,
37 |                   feature_selection=False,
38 |                   save=False, project_name='test2')
39 |     for key, score in report.scores.mean().items():
40 |         assert score < 5, '{} score is too high'.format(key)
41 |
42 |
43 | def test_multiclass():
44 |     data, label = make_classification(**MC_DATA_PARAMS)
45 |     report = poly(data, label, n_folds=2, verbose=1,
46 |                   feature_selection=False,
47 |                   save=False, project_name='test3')
48 |     for key, score in report.scores.mean().items():
49 |         assert score < 5, '{} score is too high'.format(key)
50 |
51 |
52 | @pytest.mark.medium
53 | def test_feature_selection():
54 |     data, label = make_classification(**BC_DATA_PARAMS)
55 |     global report_with_features
56 |     report_with_features = poly(data, label, n_folds=2, verbose=1,
57 |                                 feature_selection=True,
58 |                                 save=False, project_name='test2')
59 |     assert (report_with_features.scores.mean()[:, 'test'] > 0.5).all(),\
60 |         'test score below chance'
61 |     assert (report_with_features.scores.mean()[:, 'train'] > 0.5).all(),\
62 |         'train score below chance'
63 |
64 |
65 | @pytest.mark.medium
66 | def test_plot_no_selection():
67 |     data, label = make_classification(**BC_DATA_PARAMS)
68 |     report = poly(data, label, n_folds=2, verbose=1,
69 |                   feature_selection=False,
70 |                   save=False, project_name='test2')
71 |     report.plot_scores()
72 |     report.plot_features()
73 |
74 |
75 | @pytest.mark.medium
76 | def test_plot_with_selection():
77 |     data, label = make_classification(**BC_DATA_PARAMS)
78 |     report_with_features = poly(data, label, n_folds=2, verbose=1,
79 |                                 feature_selection=True,
80 |                                 save=False, project_name='test2')
81 |
82 |     report_with_features.plot_scores()
83 |     report_with_features.plot_features()
--------------------------------------------------------------------------------
/tests/test_multiclass.py:
--------------------------------------------------------------------------------
1 | import matplotlib  # noqa: E402
2 | matplotlib.use('Agg')  # noqa: E402
3 | import sys
4 | sys.path.append('../')  # noqa: E402
5 | from polyssifier import poly  # noqa: E402
6 |
7 | from sklearn.datasets import make_classification
8 | import warnings
9 | import pytest
10 |
11 | warnings.filterwarnings("ignore", category=DeprecationWarning)
12 |
13 | NSAMPLES = 100
14 | N_CLASSES = 5
15 | data, label = make_classification(n_samples=NSAMPLES, n_features=50,
16 |                                   n_informative=10, n_redundant=10,
17 |                                   n_repeated=0, n_classes=N_CLASSES,
18 |                                   n_clusters_per_class=2, weights=None,
19 |                                   flip_y=0.01, class_sep=2.0,
20 |                                   hypercube=True, shift=0.0,
21 |                                   scale=1.0, shuffle=True,
22 |                                   random_state=1988)
23 |
24 |
25 | def test_run():
26 |     report = poly(data, label, n_folds=2, verbose=1,
27 |                   feature_selection=False,
28 |                   save=False, project_name='test2')
29 |     for key, score in report.scores.mean().items():
30 |         assert score < 5, '{} score is too high'.format(key)
31 |
32 |
33 | def test_feature_selection():
34 |     global report_with_features
35 |     report_with_features = poly(data, label, n_folds=2, verbose=1,
36 |                                 feature_selection=True,
37 |                                 save=False, project_name='test2')
38 |     assert (report_with_features.scores.mean()[:, 'test'] > 1/N_CLASSES).all(),\
39 |         'test score below chance'
40 |     assert (report_with_features.scores.mean()[:, 'train'] > 1/N_CLASSES).all(),\
41 |         'train score below chance'
42 |
43 |
44 | def test_plot_no_selection():
45 |     report = poly(data, label, n_folds=2, verbose=1,
46 |                   feature_selection=False,
47 |                   save=False, project_name='test2')
48 |     report.plot_scores()
49 |     report.plot_features()
50 |
51 |
52 | # @pytest.mark.medium
53 | # def test_plot_with_selection():
54 | #     report = poly(data, label, n_folds=2, verbose=1,
55 | #                   feature_selection=False,
56 | #                   save=False, project_name='test2')
57 |
58 | #     report_with_features.plot_scores()
59 | #     report_with_features.plot_features()
--------------------------------------------------------------------------------
/tests/test_polynomial.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import sys
4 | sys.path.append('../')
5 | from polyssifier.polyssifier import create_polynomial
6 |
7 |
8 | @pytest.mark.medium
9 | def test_create_polynomial():
10 |     data = np.array([[5, 6], [7, 8]])
11 |     poly1 = np.array([[5, 6], [7, 8]])
12 |     poly2 = np.array([[5, 6, 25, 36], [7, 8, 49, 64]])
13 |     poly3 = np.array([[5, 6, 25, 36, 125, 216], [7, 8, 49, 64, 343, 512]])
14 |     assert (poly1 == create_polynomial(data, 1)).all()
15 |     assert (poly2 == create_polynomial(data, 2)).all()
16 |     assert (poly3 == create_polynomial(data, 3)).all()
--------------------------------------------------------------------------------
/tests/test_regression.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import warnings
3 | import numpy as np
4 | import os
5 | # import sys
6 | # sys.path.append('../')
7 | from polyssifier import polyr
8 | from sklearn.datasets import load_diabetes
9 |
10 | warnings.filterwarnings("ignore")
11 | diabetes_data = load_diabetes().data
12 | diabetes_target = load_diabetes().target
13 |
14 |
15 | @pytest.mark.medium
16 | def test_feature_selection_regression():
17 |     global report_with_features
18 |     report_with_features = polyr(
19 |         diabetes_data, diabetes_target, n_folds=2,
20 |         verbose=1, concurrency=1,
21 |         feature_selection=True, scoring='r2',
22 |         save=False, project_name='test_feature_selection')
23 |     assert (report_with_features.scores.mean()[:, 'test'] > 0.2).all(),\
24 |         'test score too low'
25 |     assert (report_with_features.scores.mean()[:, 'train'] > 0.2).all(),\
26 |         'train score too low'
27 |
28 |     for key, ypred in report_with_features.predictions.items():
29 |         # per-sample root-sum-square error as a rough prediction-error proxy
30 |         err = np.linalg.norm(ypred - diabetes_target) / len(diabetes_target)
31 |         assert err < 5, '{} prediction error is too high'.format(key)
32 |
33 |
34 | @pytest.mark.medium
35 | def test_run_regression():
36 |     global report
37 |     report = polyr(diabetes_data, diabetes_target, n_folds=2,
38 |                    verbose=1, concurrency=1,
39 |                    feature_selection=False, scoring='r2',
40 |                    save=False, project_name='test_regression')
41 |     assert (report.scores.mean()[:, 'test'] > 0.2).all(),\
42 |         'test score too low'
43 |     assert (report.scores.mean()[:, 'train'] > 0.2).all(),\
44 |         'train score too low'
45 |
46 |
47 | @pytest.mark.medium
48 | def test_polynomial_model():
49 |     # Lars is excluded because it performs poorly on this data.
50 |     polynomial_report = polyr(
51 |         diabetes_data, diabetes_target, n_folds=2, num_degrees=3,
52 |         verbose=1, concurrency=1, feature_selection=False, save=False,
53 |         project_name='polynomial_test', exclude=['Lars'])
54 |     assert (polynomial_report.scores.mean()[:, 'test'] > 0.25).all(), \
55 |         'low test score'
56 |
57 |
58 | @pytest.mark.medium
59 | def test_plot_scores_no_selection():
60 |     report.plot_scores()
61 |     report.plot_features()
62 |
63 |
64 | @pytest.mark.medium
65 | def test_plot_features_with_selection():
66 |     report_with_features.plot_scores()
67 |     report_with_features.plot_features()
68 |
69 |
70 | def setup_function(function):
71 |     """ Setup any state tied to the execution of the given function.
72 |     Invoked for every test function in the module.
73 |     """
74 |
75 |
76 | def teardown_function(function):
77 |     """ Teardown any state that was previously set up with a setup_function
78 |     call.
79 |     """
80 |
81 |     file_paths = [
82 |         'temp_Bayesian Ridge_feature_ranking.png',
83 |         'temp_Decision Tree_feature_ranking.png',
84 |         'temp_ElasticNet_feature_ranking.png',
85 |         'temp_Lars_feature_ranking.png',
86 |         'temp_Lasso_feature_ranking.png',
87 |         'temp_LassoLars_feature_ranking.png',
88 |         'temp_Linear Regression_feature_ranking.png',
89 |         'temp_Linear SVM_feature_ranking.png',
90 |         'temp_Logistic Regression_feature_ranking.png',
91 |         'temp_OrthogonalMatchingPursuit_feature_ranking.png',
92 |         'temp_PassiveAggressiveRegressor_feature_ranking.png',
93 |         'temp.pdf',
94 |         'temp_Random Forest_feature_ranking.png',
95 |         'temp_Ridge_feature_ranking.png',
96 |         'temp.svg',
97 |         'Report.log',
98 |     ]
99 |     for path in file_paths:
100 |         if os.path.exists(path):
101 |             os.remove(path)
--------------------------------------------------------------------------------
/uploadPip.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Prompt for a new version, update setup.py, tag, and upload to PyPI
4 | version="$(python setup.py --version)"
5 | echo "Current version $version"
6 | read -p "Enter new version: " newVersion
7 | sed -i ".backup" "s/$version/$newVersion/g" setup.py
8 | git tag "$newVersion" -m "from $version to $newVersion"
9 | git push --tags origin master
10 |
11 | python setup.py sdist
12 | twine upload "dist/*$newVersion*"
13 |
--------------------------------------------------------------------------------
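A final practical note on the model persistence used by `fit_reg`: with `save=True`, each fold's (possibly grid-searched) model is written to `polyr_<project_name>/models/<regressor>_<fold>.p` and can be reloaded with joblib. A sketch, where the 'demo' project, the Ridge model name, and `X_new` are hypothetical:

    import joblib
    import numpy as np

    # Load fold 1 of a Ridge model from a project named 'demo'
    reg = joblib.load('polyr_demo/models/Ridge_1.p')
    X_new = np.random.rand(5, 10)  # must match the training feature count
    print(reg.predict(X_new))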