├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── counterfactuals ├── LICENSE ├── README.md ├── __init__.py ├── base_proxy.py ├── counterfactual_search.py ├── explainer.py └── tests │ ├── __init__.py │ ├── classifier_test.py │ ├── regression_test.py │ └── test_proxies.py ├── paper └── FSE21-ML-Misprediction-Preprint.pdf └── rule_induction ├── LICENSE ├── README.md ├── __init__.py ├── cfg.py ├── diagnoser.py ├── requirements.txt ├── rules ├── __init__.py ├── cache.py ├── conjuncts.py ├── predicate.py ├── rule.py └── ruleset.py ├── timing.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Mac specific 4 | .DS_Store 5 | 6 | ### Python ### 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | cover/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | .pybuilder/ 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | # For a library or package, you might want to ignore these files since the code is 93 | # intended to run in multiple environments; otherwise, check them in: 94 | # .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 104 | __pypackages__/ 105 | 106 | # Celery stuff 107 | celerybeat-schedule 108 | celerybeat.pid 109 | 110 | # SageMath parsed files 111 | *.sage.py 112 | 113 | # Environments 114 | .env 115 | .venv 116 | env/ 117 | venv/ 118 | ENV/ 119 | env.bak/ 120 | venv.bak/ 121 | 122 | # Spyder project settings 123 | .spyderproject 124 | .spyproject 125 | 126 | # Rope project settings 127 | .ropeproject 128 | 129 | # mkdocs documentation 130 | /site 131 | 132 | # mypy 133 | .mypy_cache/ 134 | .dmypy.json 135 | dmypy.json 136 | 137 | # Pyre type checker 138 | .pyre/ 139 | 140 | # pytype static type analyzer 141 | .pytype/ 142 | 143 | # Cython debug symbols 144 | cython_debug/ 145 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. 
Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at opensource-conduct@fb.com. All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to mmd 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `master`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. 
Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: https://code.facebook.com/cla 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to mmd, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-NonCommercial 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 
14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. 
A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-NonCommercial 4.0 International Public 58 | License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-NonCommercial 4.0 International Public License ("Public 63 | License"). To the extent this Public License may be interpreted as a 64 | contract, You are granted the Licensed Rights in consideration of Your 65 | acceptance of these terms and conditions, and the Licensor grants You 66 | such rights in consideration of benefits the Licensor receives from 67 | making the Licensed Material available under these terms and 68 | conditions. 69 | 70 | Section 1 -- Definitions. 71 | 72 | a. Adapted Material means material subject to Copyright and Similar 73 | Rights that is derived from or based upon the Licensed Material 74 | and in which the Licensed Material is translated, altered, 75 | arranged, transformed, or otherwise modified in a manner requiring 76 | permission under the Copyright and Similar Rights held by the 77 | Licensor. For purposes of this Public License, where the Licensed 78 | Material is a musical work, performance, or sound recording, 79 | Adapted Material is always produced where the Licensed Material is 80 | synched in timed relation with a moving image. 81 | 82 | b. Adapter's License means the license You apply to Your Copyright 83 | and Similar Rights in Your contributions to Adapted Material in 84 | accordance with the terms and conditions of this Public License. 85 | 86 | c. 
Copyright and Similar Rights means copyright and/or similar rights 87 | closely related to copyright including, without limitation, 88 | performance, broadcast, sound recording, and Sui Generis Database 89 | Rights, without regard to how the rights are labeled or 90 | categorized. For purposes of this Public License, the rights 91 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 92 | Rights. 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. NonCommercial means not primarily intended for or directed towards 116 | commercial advantage or monetary compensation. For purposes of 117 | this Public License, the exchange of the Licensed Material for 118 | other material subject to Copyright and Similar Rights by digital 119 | file-sharing or similar means is NonCommercial provided there is 120 | no payment of monetary compensation in connection with the 121 | exchange. 122 | 123 | j. 
Share means to provide material to the public by any means or 124 | process that requires permission under the Licensed Rights, such 125 | as reproduction, public display, public performance, distribution, 126 | dissemination, communication, or importation, and to make material 127 | available to the public including in ways that members of the 128 | public may access the material from a place and at a time 129 | individually chosen by them. 130 | 131 | k. Sui Generis Database Rights means rights other than copyright 132 | resulting from Directive 96/9/EC of the European Parliament and of 133 | the Council of 11 March 1996 on the legal protection of databases, 134 | as amended and/or succeeded, as well as other essentially 135 | equivalent rights anywhere in the world. 136 | 137 | l. You means the individual or entity exercising the Licensed Rights 138 | under this Public License. Your has a corresponding meaning. 139 | 140 | Section 2 -- Scope. 141 | 142 | a. License grant. 143 | 144 | 1. Subject to the terms and conditions of this Public License, 145 | the Licensor hereby grants You a worldwide, royalty-free, 146 | non-sublicensable, non-exclusive, irrevocable license to 147 | exercise the Licensed Rights in the Licensed Material to: 148 | 149 | a. reproduce and Share the Licensed Material, in whole or 150 | in part, for NonCommercial purposes only; and 151 | 152 | b. produce, reproduce, and Share Adapted Material for 153 | NonCommercial purposes only. 154 | 155 | 2. Exceptions and Limitations. For the avoidance of doubt, where 156 | Exceptions and Limitations apply to Your use, this Public 157 | License does not apply, and You do not need to comply with 158 | its terms and conditions. 159 | 160 | 3. Term. The term of this Public License is specified in Section 161 | 6(a). 162 | 163 | 4. Media and formats; technical modifications allowed. 
The 164 | Licensor authorizes You to exercise the Licensed Rights in 165 | all media and formats whether now known or hereafter created, 166 | and to make technical modifications necessary to do so. The 167 | Licensor waives and/or agrees not to assert any right or 168 | authority to forbid You from making technical modifications 169 | necessary to exercise the Licensed Rights, including 170 | technical modifications necessary to circumvent Effective 171 | Technological Measures. For purposes of this Public License, 172 | simply making modifications authorized by this Section 2(a) 173 | (4) never produces Adapted Material. 174 | 175 | 5. Downstream recipients. 176 | 177 | a. Offer from the Licensor -- Licensed Material. Every 178 | recipient of the Licensed Material automatically 179 | receives an offer from the Licensor to exercise the 180 | Licensed Rights under the terms and conditions of this 181 | Public License. 182 | 183 | b. No downstream restrictions. You may not offer or impose 184 | any additional or different terms or conditions on, or 185 | apply any Effective Technological Measures to, the 186 | Licensed Material if doing so restricts exercise of the 187 | Licensed Rights by any recipient of the Licensed 188 | Material. 189 | 190 | 6. No endorsement. Nothing in this Public License constitutes or 191 | may be construed as permission to assert or imply that You 192 | are, or that Your use of the Licensed Material is, connected 193 | with, or sponsored, endorsed, or granted official status by, 194 | the Licensor or others designated to receive attribution as 195 | provided in Section 3(a)(1)(A)(i). 196 | 197 | b. Other rights. 198 | 199 | 1. 
Moral rights, such as the right of integrity, are not 200 | licensed under this Public License, nor are publicity, 201 | privacy, and/or other similar personality rights; however, to 202 | the extent possible, the Licensor waives and/or agrees not to 203 | assert any such rights held by the Licensor to the limited 204 | extent necessary to allow You to exercise the Licensed 205 | Rights, but not otherwise. 206 | 207 | 2. Patent and trademark rights are not licensed under this 208 | Public License. 209 | 210 | 3. To the extent possible, the Licensor waives any right to 211 | collect royalties from You for the exercise of the Licensed 212 | Rights, whether directly or through a collecting society 213 | under any voluntary or waivable statutory or compulsory 214 | licensing scheme. In all other cases the Licensor expressly 215 | reserves any right to collect such royalties, including when 216 | the Licensed Material is used other than for NonCommercial 217 | purposes. 218 | 219 | Section 3 -- License Conditions. 220 | 221 | Your exercise of the Licensed Rights is expressly made subject to the 222 | following conditions. 223 | 224 | a. Attribution. 225 | 226 | 1. If You Share the Licensed Material (including in modified 227 | form), You must: 228 | 229 | a. retain the following if it is supplied by the Licensor 230 | with the Licensed Material: 231 | 232 | i. identification of the creator(s) of the Licensed 233 | Material and any others designated to receive 234 | attribution, in any reasonable manner requested by 235 | the Licensor (including by pseudonym if 236 | designated); 237 | 238 | ii. a copyright notice; 239 | 240 | iii. a notice that refers to this Public License; 241 | 242 | iv. a notice that refers to the disclaimer of 243 | warranties; 244 | 245 | v. a URI or hyperlink to the Licensed Material to the 246 | extent reasonably practicable; 247 | 248 | b. 
indicate if You modified the Licensed Material and 249 | retain an indication of any previous modifications; and 250 | 251 | c. indicate the Licensed Material is licensed under this 252 | Public License, and include the text of, or the URI or 253 | hyperlink to, this Public License. 254 | 255 | 2. You may satisfy the conditions in Section 3(a)(1) in any 256 | reasonable manner based on the medium, means, and context in 257 | which You Share the Licensed Material. For example, it may be 258 | reasonable to satisfy the conditions by providing a URI or 259 | hyperlink to a resource that includes the required 260 | information. 261 | 262 | 3. If requested by the Licensor, You must remove any of the 263 | information required by Section 3(a)(1)(A) to the extent 264 | reasonably practicable. 265 | 266 | 4. If You Share Adapted Material You produce, the Adapter's 267 | License You apply must not prevent recipients of the Adapted 268 | Material from complying with this Public License. 269 | 270 | Section 4 -- Sui Generis Database Rights. 271 | 272 | Where the Licensed Rights include Sui Generis Database Rights that 273 | apply to Your use of the Licensed Material: 274 | 275 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 276 | to extract, reuse, reproduce, and Share all or a substantial 277 | portion of the contents of the database for NonCommercial purposes 278 | only; 279 | 280 | b. if You include all or a substantial portion of the database 281 | contents in a database in which You have Sui Generis Database 282 | Rights, then the database in which You have Sui Generis Database 283 | Rights (but not its individual contents) is Adapted Material; and 284 | 285 | c. You must comply with the conditions in Section 3(a) if You Share 286 | all or a substantial portion of the contents of the database. 
287 | 288 | For the avoidance of doubt, this Section 4 supplements and does not 289 | replace Your obligations under this Public License where the Licensed 290 | Rights include other Copyright and Similar Rights. 291 | 292 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 293 | 294 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 295 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 296 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 297 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 298 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 299 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 300 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 301 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 302 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 303 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 304 | 305 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 306 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 307 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 308 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 309 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 310 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 311 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 312 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 313 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 314 | 315 | c. The disclaimer of warranties and limitation of liability provided 316 | above shall be interpreted in a manner that, to the extent 317 | possible, most closely approximates an absolute disclaimer and 318 | waiver of all liability. 319 | 320 | Section 6 -- Term and Termination. 321 | 322 | a. 
This Public License applies for the term of the Copyright and 323 | Similar Rights licensed here. However, if You fail to comply with 324 | this Public License, then Your rights under this Public License 325 | terminate automatically. 326 | 327 | b. Where Your right to use the Licensed Material has terminated under 328 | Section 6(a), it reinstates: 329 | 330 | 1. automatically as of the date the violation is cured, provided 331 | it is cured within 30 days of Your discovery of the 332 | violation; or 333 | 334 | 2. upon express reinstatement by the Licensor. 335 | 336 | For the avoidance of doubt, this Section 6(b) does not affect any 337 | right the Licensor may have to seek remedies for Your violations 338 | of this Public License. 339 | 340 | c. For the avoidance of doubt, the Licensor may also offer the 341 | Licensed Material under separate terms or conditions or stop 342 | distributing the Licensed Material at any time; however, doing so 343 | will not terminate this Public License. 344 | 345 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 346 | License. 347 | 348 | Section 7 -- Other Terms and Conditions. 349 | 350 | a. The Licensor shall not be bound by any additional or different 351 | terms or conditions communicated by You unless expressly agreed. 352 | 353 | b. Any arrangements, understandings, or agreements regarding the 354 | Licensed Material not stated herein are separate from and 355 | independent of the terms and conditions of this Public License. 356 | 357 | Section 8 -- Interpretation. 358 | 359 | a. For the avoidance of doubt, this Public License does not, and 360 | shall not be interpreted to, reduce, limit, restrict, or impose 361 | conditions on any use of the Licensed Material that could lawfully 362 | be made without permission under this Public License. 363 | 364 | b. 
To the extent possible, if any provision of this Public License is 365 | deemed unenforceable, it shall be automatically reformed to the 366 | minimum extent necessary to make it enforceable. If the provision 367 | cannot be reformed, it shall be severed from this Public License 368 | without affecting the enforceability of the remaining terms and 369 | conditions. 370 | 371 | c. No term or condition of this Public License will be waived and no 372 | failure to comply consented to unless expressly agreed to by the 373 | Licensor. 374 | 375 | d. Nothing in this Public License constitutes or may be interpreted 376 | as a limitation upon, or waiver of, any privileges and immunities 377 | that apply to the Licensor or You, including from the legal 378 | processes of any jurisdiction or authority. 379 | 380 | ======================================================================= 381 | 382 | Creative Commons is not a party to its public 383 | licenses. Notwithstanding, Creative Commons may elect to apply one of 384 | its public licenses to material it publishes and in those instances 385 | will be considered the “Licensor.” The text of the Creative Commons 386 | public licenses is dedicated to the public domain under the CC0 Public 387 | Domain Dedication. Except for the limited purpose of indicating that 388 | material is shared under a Creative Commons public license or as 389 | otherwise permitted by the Creative Commons policies published at 390 | creativecommons.org/policies, Creative Commons does not authorize the 391 | use of the trademark "Creative Commons" or any other trademark or logo 392 | of Creative Commons without its prior written consent including, 393 | without limitation, in connection with any unauthorized modifications 394 | to any of its public licenses or any other arrangements, 395 | understandings, or agreements concerning use of licensed material. For 396 | the avoidance of doubt, this paragraph does not form part of the 397 | public licenses. 
398 | 399 | Creative Commons may be contacted at creativecommons.org. 400 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MMD: Machine Learning Model Diagnosis and Counterfactual Explanations for Models of Code 2 | 3 | Machine learning models often mispredict, and it is hard to tell when and why. 4 | This repository hosts two model diagnosis tools that support understanding the inner workings of black-box models. 5 | 6 | ## MMD 7 | 8 | We developed a technique, *MMD* ([rule_induction](/rule_induction)), that systematically discovers rules that characterize a subset of the input space of a machine learning model where the model is more likely to mispredict. 9 | 10 | This work has been published at the International Conference on Foundations in Software Engineering (FSE'21): J. Cito, I. Dillig, S. Kim, V. Murali, S. Chandra, [Explaining Mispredictions of Machine Learning Models using Rule Induction](https://github.com/facebookresearch/mmd/blob/main/paper/FSE21-ML-Misprediction-Preprint.pdf). 11 | 12 | ```bibtex 13 | @inproceedings{explaining_mispredictions:21, 14 | title={Explaining mispredictions of machine learning models using rule induction}, 15 | author={Cito, J{\"u}rgen and Dillig, Isil and Kim, Seohyun and Murali, Vijayaraghavan and Chandra, Satish}, 16 | booktitle={Proceedings of the 29th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering}, 17 | pages={716--727}, 18 | year={2021} 19 | } 20 | ``` 21 | 22 | ## Counterfactual Explanations for Models of Code 23 | 24 | Counterfactual explanations ([counterfactuals](/counterfactuals)) constitute minimal changes to the input space under which the model 25 | “changes its mind”. The contrast between original input and perturbed input is considered an explanation. 
26 | 27 | This work has been published at the International Conference on Software Engineering (ICSE'22), Software Engineering in Practice: J. Cito, I. Dillig, V. Murali, S. Chandra, [Counterfactual Explanations for Models of Code](https://arxiv.org/pdf/2111.05711.pdf). 28 | 29 | ```bibtex 30 | @inproceedings{code_counterfactuals:22, 31 | title={Counterfactual Explanations for Models of Code}, 32 | author={Cito, J{\"u}rgen and Dillig, Isil and Murali, Vijayaraghavan and Chandra, Satish}, 33 | booktitle = {44th {IEEE/ACM} International Conference on Software Engineering: 34 | Software Engineering in Practice, {ICSE} {(SEIP)} 2022, Madrid, Spain, 35 | May 25-27, 2022}, 36 | year={2022} 37 | } 38 | ``` 39 | 40 | 41 | 42 | ## Requirements 43 | 44 | * Python 3.8 45 | * Pandas 46 | 47 | ## License 48 | 49 | Both projects are CC-BY-NC 4.0 (Attr Non-Commercial Inter.) (e.g., FAIR) licensed, as found in the LICENSE file. 50 | -------------------------------------------------------------------------------- /counterfactuals/LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-NonCommercial 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 
14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. 
A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-NonCommercial 4.0 International Public 58 | License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-NonCommercial 4.0 International Public License ("Public 63 | License"). To the extent this Public License may be interpreted as a 64 | contract, You are granted the Licensed Rights in consideration of Your 65 | acceptance of these terms and conditions, and the Licensor grants You 66 | such rights in consideration of benefits the Licensor receives from 67 | making the Licensed Material available under these terms and 68 | conditions. 69 | 70 | Section 1 -- Definitions. 71 | 72 | a. Adapted Material means material subject to Copyright and Similar 73 | Rights that is derived from or based upon the Licensed Material 74 | and in which the Licensed Material is translated, altered, 75 | arranged, transformed, or otherwise modified in a manner requiring 76 | permission under the Copyright and Similar Rights held by the 77 | Licensor. For purposes of this Public License, where the Licensed 78 | Material is a musical work, performance, or sound recording, 79 | Adapted Material is always produced where the Licensed Material is 80 | synched in timed relation with a moving image. 81 | 82 | b. Adapter's License means the license You apply to Your Copyright 83 | and Similar Rights in Your contributions to Adapted Material in 84 | accordance with the terms and conditions of this Public License. 85 | 86 | c. 
Copyright and Similar Rights means copyright and/or similar rights 87 | closely related to copyright including, without limitation, 88 | performance, broadcast, sound recording, and Sui Generis Database 89 | Rights, without regard to how the rights are labeled or 90 | categorized. For purposes of this Public License, the rights 91 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 92 | Rights. 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. NonCommercial means not primarily intended for or directed towards 116 | commercial advantage or monetary compensation. For purposes of 117 | this Public License, the exchange of the Licensed Material for 118 | other material subject to Copyright and Similar Rights by digital 119 | file-sharing or similar means is NonCommercial provided there is 120 | no payment of monetary compensation in connection with the 121 | exchange. 122 | 123 | j. 
Share means to provide material to the public by any means or 124 | process that requires permission under the Licensed Rights, such 125 | as reproduction, public display, public performance, distribution, 126 | dissemination, communication, or importation, and to make material 127 | available to the public including in ways that members of the 128 | public may access the material from a place and at a time 129 | individually chosen by them. 130 | 131 | k. Sui Generis Database Rights means rights other than copyright 132 | resulting from Directive 96/9/EC of the European Parliament and of 133 | the Council of 11 March 1996 on the legal protection of databases, 134 | as amended and/or succeeded, as well as other essentially 135 | equivalent rights anywhere in the world. 136 | 137 | l. You means the individual or entity exercising the Licensed Rights 138 | under this Public License. Your has a corresponding meaning. 139 | 140 | Section 2 -- Scope. 141 | 142 | a. License grant. 143 | 144 | 1. Subject to the terms and conditions of this Public License, 145 | the Licensor hereby grants You a worldwide, royalty-free, 146 | non-sublicensable, non-exclusive, irrevocable license to 147 | exercise the Licensed Rights in the Licensed Material to: 148 | 149 | a. reproduce and Share the Licensed Material, in whole or 150 | in part, for NonCommercial purposes only; and 151 | 152 | b. produce, reproduce, and Share Adapted Material for 153 | NonCommercial purposes only. 154 | 155 | 2. Exceptions and Limitations. For the avoidance of doubt, where 156 | Exceptions and Limitations apply to Your use, this Public 157 | License does not apply, and You do not need to comply with 158 | its terms and conditions. 159 | 160 | 3. Term. The term of this Public License is specified in Section 161 | 6(a). 162 | 163 | 4. Media and formats; technical modifications allowed. 
The 164 | Licensor authorizes You to exercise the Licensed Rights in 165 | all media and formats whether now known or hereafter created, 166 | and to make technical modifications necessary to do so. The 167 | Licensor waives and/or agrees not to assert any right or 168 | authority to forbid You from making technical modifications 169 | necessary to exercise the Licensed Rights, including 170 | technical modifications necessary to circumvent Effective 171 | Technological Measures. For purposes of this Public License, 172 | simply making modifications authorized by this Section 2(a) 173 | (4) never produces Adapted Material. 174 | 175 | 5. Downstream recipients. 176 | 177 | a. Offer from the Licensor -- Licensed Material. Every 178 | recipient of the Licensed Material automatically 179 | receives an offer from the Licensor to exercise the 180 | Licensed Rights under the terms and conditions of this 181 | Public License. 182 | 183 | b. No downstream restrictions. You may not offer or impose 184 | any additional or different terms or conditions on, or 185 | apply any Effective Technological Measures to, the 186 | Licensed Material if doing so restricts exercise of the 187 | Licensed Rights by any recipient of the Licensed 188 | Material. 189 | 190 | 6. No endorsement. Nothing in this Public License constitutes or 191 | may be construed as permission to assert or imply that You 192 | are, or that Your use of the Licensed Material is, connected 193 | with, or sponsored, endorsed, or granted official status by, 194 | the Licensor or others designated to receive attribution as 195 | provided in Section 3(a)(1)(A)(i). 196 | 197 | b. Other rights. 198 | 199 | 1. 
Moral rights, such as the right of integrity, are not 200 | licensed under this Public License, nor are publicity, 201 | privacy, and/or other similar personality rights; however, to 202 | the extent possible, the Licensor waives and/or agrees not to 203 | assert any such rights held by the Licensor to the limited 204 | extent necessary to allow You to exercise the Licensed 205 | Rights, but not otherwise. 206 | 207 | 2. Patent and trademark rights are not licensed under this 208 | Public License. 209 | 210 | 3. To the extent possible, the Licensor waives any right to 211 | collect royalties from You for the exercise of the Licensed 212 | Rights, whether directly or through a collecting society 213 | under any voluntary or waivable statutory or compulsory 214 | licensing scheme. In all other cases the Licensor expressly 215 | reserves any right to collect such royalties, including when 216 | the Licensed Material is used other than for NonCommercial 217 | purposes. 218 | 219 | Section 3 -- License Conditions. 220 | 221 | Your exercise of the Licensed Rights is expressly made subject to the 222 | following conditions. 223 | 224 | a. Attribution. 225 | 226 | 1. If You Share the Licensed Material (including in modified 227 | form), You must: 228 | 229 | a. retain the following if it is supplied by the Licensor 230 | with the Licensed Material: 231 | 232 | i. identification of the creator(s) of the Licensed 233 | Material and any others designated to receive 234 | attribution, in any reasonable manner requested by 235 | the Licensor (including by pseudonym if 236 | designated); 237 | 238 | ii. a copyright notice; 239 | 240 | iii. a notice that refers to this Public License; 241 | 242 | iv. a notice that refers to the disclaimer of 243 | warranties; 244 | 245 | v. a URI or hyperlink to the Licensed Material to the 246 | extent reasonably practicable; 247 | 248 | b. 
indicate if You modified the Licensed Material and 249 | retain an indication of any previous modifications; and 250 | 251 | c. indicate the Licensed Material is licensed under this 252 | Public License, and include the text of, or the URI or 253 | hyperlink to, this Public License. 254 | 255 | 2. You may satisfy the conditions in Section 3(a)(1) in any 256 | reasonable manner based on the medium, means, and context in 257 | which You Share the Licensed Material. For example, it may be 258 | reasonable to satisfy the conditions by providing a URI or 259 | hyperlink to a resource that includes the required 260 | information. 261 | 262 | 3. If requested by the Licensor, You must remove any of the 263 | information required by Section 3(a)(1)(A) to the extent 264 | reasonably practicable. 265 | 266 | 4. If You Share Adapted Material You produce, the Adapter's 267 | License You apply must not prevent recipients of the Adapted 268 | Material from complying with this Public License. 269 | 270 | Section 4 -- Sui Generis Database Rights. 271 | 272 | Where the Licensed Rights include Sui Generis Database Rights that 273 | apply to Your use of the Licensed Material: 274 | 275 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 276 | to extract, reuse, reproduce, and Share all or a substantial 277 | portion of the contents of the database for NonCommercial purposes 278 | only; 279 | 280 | b. if You include all or a substantial portion of the database 281 | contents in a database in which You have Sui Generis Database 282 | Rights, then the database in which You have Sui Generis Database 283 | Rights (but not its individual contents) is Adapted Material; and 284 | 285 | c. You must comply with the conditions in Section 3(a) if You Share 286 | all or a substantial portion of the contents of the database. 
287 | 288 | For the avoidance of doubt, this Section 4 supplements and does not 289 | replace Your obligations under this Public License where the Licensed 290 | Rights include other Copyright and Similar Rights. 291 | 292 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 293 | 294 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 295 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 296 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 297 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 298 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 299 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 300 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 301 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 302 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 303 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 304 | 305 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 306 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 307 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 308 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 309 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 310 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 311 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 312 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 313 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 314 | 315 | c. The disclaimer of warranties and limitation of liability provided 316 | above shall be interpreted in a manner that, to the extent 317 | possible, most closely approximates an absolute disclaimer and 318 | waiver of all liability. 319 | 320 | Section 6 -- Term and Termination. 321 | 322 | a. 
This Public License applies for the term of the Copyright and 323 | Similar Rights licensed here. However, if You fail to comply with 324 | this Public License, then Your rights under this Public License 325 | terminate automatically. 326 | 327 | b. Where Your right to use the Licensed Material has terminated under 328 | Section 6(a), it reinstates: 329 | 330 | 1. automatically as of the date the violation is cured, provided 331 | it is cured within 30 days of Your discovery of the 332 | violation; or 333 | 334 | 2. upon express reinstatement by the Licensor. 335 | 336 | For the avoidance of doubt, this Section 6(b) does not affect any 337 | right the Licensor may have to seek remedies for Your violations 338 | of this Public License. 339 | 340 | c. For the avoidance of doubt, the Licensor may also offer the 341 | Licensed Material under separate terms or conditions or stop 342 | distributing the Licensed Material at any time; however, doing so 343 | will not terminate this Public License. 344 | 345 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 346 | License. 347 | 348 | Section 7 -- Other Terms and Conditions. 349 | 350 | a. The Licensor shall not be bound by any additional or different 351 | terms or conditions communicated by You unless expressly agreed. 352 | 353 | b. Any arrangements, understandings, or agreements regarding the 354 | Licensed Material not stated herein are separate from and 355 | independent of the terms and conditions of this Public License. 356 | 357 | Section 8 -- Interpretation. 358 | 359 | a. For the avoidance of doubt, this Public License does not, and 360 | shall not be interpreted to, reduce, limit, restrict, or impose 361 | conditions on any use of the Licensed Material that could lawfully 362 | be made without permission under this Public License. 363 | 364 | b. 
To the extent possible, if any provision of this Public License is 365 | deemed unenforceable, it shall be automatically reformed to the 366 | minimum extent necessary to make it enforceable. If the provision 367 | cannot be reformed, it shall be severed from this Public License 368 | without affecting the enforceability of the remaining terms and 369 | conditions. 370 | 371 | c. No term or condition of this Public License will be waived and no 372 | failure to comply consented to unless expressly agreed to by the 373 | Licensor. 374 | 375 | d. Nothing in this Public License constitutes or may be interpreted 376 | as a limitation upon, or waiver of, any privileges and immunities 377 | that apply to the Licensor or You, including from the legal 378 | processes of any jurisdiction or authority. 379 | 380 | ======================================================================= 381 | 382 | Creative Commons is not a party to its public 383 | licenses. Notwithstanding, Creative Commons may elect to apply one of 384 | its public licenses to material it publishes and in those instances 385 | will be considered the “Licensor.” The text of the Creative Commons 386 | public licenses is dedicated to the public domain under the CC0 Public 387 | Domain Dedication. Except for the limited purpose of indicating that 388 | material is shared under a Creative Commons public license or as 389 | otherwise permitted by the Creative Commons policies published at 390 | creativecommons.org/policies, Creative Commons does not authorize the 391 | use of the trademark "Creative Commons" or any other trademark or logo 392 | of Creative Commons without its prior written consent including, 393 | without limitation, in connection with any unauthorized modifications 394 | to any of its public licenses or any other arrangements, 395 | understandings, or agreements concerning use of licensed material. For 396 | the avoidance of doubt, this paragraph does not form part of the 397 | public licenses. 
398 | 399 | Creative Commons may be contacted at creativecommons.org. 400 | -------------------------------------------------------------------------------- /counterfactuals/README.md: -------------------------------------------------------------------------------- 1 | # Counterfactual Explanations for Models of Code 2 | 3 | Counterfactual explanations ([counterfactuals](/counterfactuals)) constitute minimal changes to the input space under which the model 4 | “changes its mind”. The contrast between original input and perturbed input is considered an explanation. 5 | 6 | This work has been published at the International Conference on Software Engineering (ICSE'22), Software Engineering in Practice: J. Cito, I. Dillig, V. Murali, S. Chandra, [Counterfactual Explanations for Models of Code](https://arxiv.org/pdf/2111.05711.pdf). 7 | 8 | ```bibtex 9 | @inproceedings{code_counterfactuals:22, 10 | title={Counterfactual Explanations for Models of Code}, 11 | author={Cito, J{\"u}rgen and Dillig, Isil and Murali, Vijayaraghavan and Chandra, Satish}, 12 | booktitle = {44th {IEEE/ACM} International Conference on Software Engineering: 13 | Software Engineering in Practice, {ICSE} {(SEIP)} 2022, Pittsburgh, USA, 14 | May 25-27, 2022}, 15 | year={2022} 16 | } 17 | ``` 18 | 19 | ## Requirements 20 | 21 | * Python 3.8 22 | 23 | ## Tests 24 | 25 | In the `counterfactuals` folder, run `python tests/regression_test.py` and `python tests/classifier_test.py` 26 | 27 | ## License 28 | 29 | CC-BY-NC 4.0 (Attr Non-Commercial Inter.) (e.g., FAIR) licensed, as found in the LICENSE file. 
class BasePerturbationProxy:
    """Adapter between a classifier and the counterfactual search.

    The defaults treat a document as space-separated tokens and perturb it
    by deleting tokens. Subclasses must implement `classify`; the other
    methods may be overridden for a different tokenisation, a different
    perturbation, or real batching.
    """

    # Last document passed to `document_to_perturbation_space`.
    initial_document: str

    def classify(self, document) -> Tuple[bool, float]:
        """Classify one document and return a `(label, score)` pair.

        Raises:
            NotImplementedError: always; subclasses must override this.
        """
        # Raise rather than return the exception class: a returned class
        # only fails later, when the caller tries to unpack it.
        raise NotImplementedError

    def batch_classify(self, document_list: List) -> List[Tuple[bool, float]]:
        """Classify every document in `document_list`, preserving order.

        The standard implementation assumes a sequential `classify`
        (i.e., batch size 1); an empty input yields an empty list.
        """
        return [self.classify(document) for document in document_list]

    def document_to_perturbation_space(self, document: str) -> List[str]:
        """Split `document` on single spaces into its perturbation space.

        The perturbation space consists of all words in the document. The
        document is also remembered in `self.initial_document`.
        """
        self.initial_document = document
        return document.split(' ')

    def perturb_positions(self, perturbation_space: List, positions: List[int]) -> str:
        """Return the document with the tokens at `positions` removed.

        The remaining tokens keep their order and are re-joined with
        single spaces.
        """
        removed = set(positions)  # O(1) membership while scanning the tokens
        return ' '.join(
            token
            for index, token in enumerate(perturbation_space)
            if index not in removed
        )
class BaseCounterfactualSearch:
    """Interface for counterfactual search strategies."""

    def search(self, document, proxy: "BasePerturbationProxy" = None):
        """Search `document` for counterfactual explanations.

        `proxy` is optional so that strategies holding their own proxy
        (such as `GreedySearch`) can be called with the document alone.

        Raises:
            NotImplementedError: always; subclasses must override this.
        """
        raise NotImplementedError


class GreedySearch(BaseCounterfactualSearch):
    """Greedy best-first search over sets of perturbed positions."""

    def __init__(self, proxy: "BasePerturbationProxy", iterations: int = 2):
        """Store the proxy and the number of extra expansion rounds.

        The search runs `iterations + 1` rounds: one over single positions,
        then `iterations` rounds that each grow the best candidate by one.
        """
        self.proxy = proxy
        self.iterations = iterations

    def search(self, document):
        """Find position sets whose perturbation changes the classification.

        Returns:
            A tuple `(sequence, explanations, perturbation_tracking)`:
            `sequence` is the perturbation space of `document`;
            `explanations` holds `(positions, classification, score)` for
            every perturbation that changed the class; and
            `perturbation_tracking` holds the matching perturbed documents.
        """
        output = self.proxy.classify(document)
        # accounting for both sequential and parallel implementation of classify
        initial_classification, _ = output[0] if isinstance(output, list) else output
        sequence = self.proxy.document_to_perturbation_space(document)
        exploration_candidates = []

        # In the first iteration, we can expand by every token in the
        # sequence. As we find more explanations, this set shrinks.
        possible_expansions = set(range(len(sequence)))

        explanations = []
        perturbation_tracking = []

        for iteration in range(self.iterations + 1):
            best_candidate = choose_best_candidate(exploration_candidates)
            if not isinstance(best_candidate, list):
                best_candidate = [best_candidate]

            # Expand the candidate by one position. Positions are visited
            # in sorted order so results do not depend on set iteration
            # order. Positions already in the candidate are skipped:
            # perturbing one twice gives the same document and would only
            # waste a forward pass.
            candidates = [
                best_candidate + [position]
                for position in sorted(possible_expansions)
                if position not in best_candidate
            ]
            if not candidates:
                continue

            counterfactual_documents = [
                self.proxy.perturb_positions(sequence, candidate_positions)
                for candidate_positions in candidates
            ]
            logging.debug("Candidates in Iteration %s: %s", iteration, candidates)
            logging.debug("Batching %s forward passes", len(counterfactual_documents))
            candidate_classifications = self.proxy.batch_classify(counterfactual_documents)

            for candidate_positions, counterfactual_document, (classification, score) in zip(
                candidates, counterfactual_documents, candidate_classifications
            ):
                logging.debug(
                    "Document after perturbing positions %s: %s",
                    candidate_positions,
                    counterfactual_document,
                )
                if initial_classification != classification:
                    # A classification change is a counterfactual
                    # explanation. Record the perturbed positions, the new
                    # classification and the new score, and stop expanding
                    # by those positions.
                    logging.debug(
                        "Adding the following position perturbations to "
                        "explanations in iteration %s: %s",
                        iteration,
                        candidate_positions,
                    )
                    explanations.append((candidate_positions, classification, score))
                    perturbation_tracking.append(counterfactual_document)
                    possible_expansions = possible_expansions - set(candidate_positions)
                else:
                    # No change yet: keep it as a candidate to grow later.
                    add_exploration_candidate(exploration_candidates, candidate_positions, score)
        return sequence, explanations, perturbation_tracking


def choose_best_candidate(exploration_candidates):
    """Pop and return the candidate with the lowest stored score.

    `exploration_candidates` is a heap of `(score, candidate)` pairs as
    built by `add_exploration_candidate`. Returns `[]` when it is empty.
    """
    if not exploration_candidates:
        return []
    _, candidate = heapq.heappop(exploration_candidates)
    return candidate


def add_exploration_candidate(exploration_candidates, candidate, score):
    """Push `candidate` onto the heap with a length-penalised score.

    Each position beyond the first adds 0.1 to the score, so longer
    explanation candidates are explored later.
    """
    penalty = (len(candidate) - 1) * 0.1
    heapq.heappush(exploration_candidates, (score + penalty, candidate))
class SequenceExplainer:
    """Runs a counterfactual search and wraps its result."""

    def __init__(self, counterfactual_search: "BaseCounterfactualSearch" = None):
        """Use `counterfactual_search`, or a `GreedySearch` over a
        `BasePerturbationProxy` when none is given."""
        if counterfactual_search is None:
            counterfactual_search = GreedySearch(BasePerturbationProxy())
        self.counterfactual_search = counterfactual_search

    def explain(self, document):
        """Search `document` and return a `SequenceExplanation`.

        The search is timed with `time.perf_counter`; the elapsed time is
        stored truncated to whole seconds.
        """
        start = time.perf_counter()
        sequence, full_explanations, perturbation_tracking = self.counterfactual_search.search(document)
        end = time.perf_counter()
        return SequenceExplanation(
            sequence,
            full_explanations,
            perturbation_tracking,
            execution_time=int(end - start),
            original_document=document,
        )


class SequenceExplanation:
    """Result of one counterfactual search over a document.

    `explanations` is the list produced by the search; the first element of
    each entry is a list of positions into `document_sequence` and the
    second is what `full()` reports alongside them.
    """

    def __init__(
        self,
        document_sequence: List,
        explanations: List,
        perturbation_tracking: List,
        execution_time: int = 0,
        original_document=None,
    ):
        self.document_sequence = document_sequence
        self.explanations = explanations
        self.perturbation_tracking = perturbation_tracking
        self.execution_time = execution_time
        self.original_document = original_document

    def has_explanations(self):
        """Return True when at least one explanation was found."""
        return len(self.explanations) > 0

    def human_readable(self):
        """Return, per explanation, the document items at its positions.

        Same as `full()` but without the positions.
        """
        return [
            [self.document_sequence[pos] for pos in explanation[0]]
            for explanation in self.explanations
        ]

    def set_original_document(self, original_document):
        """Replace the stored original document."""
        self.original_document = original_document

    def full(self):
        """Return, per explanation, `(items, second_element)`.

        `items` is a list of `(position, document item at that position)`
        pairs and `second_element` is the explanation's second entry.
        """
        return [
            (
                [(pos, self.document_sequence[pos]) for pos in explanation[0]],
                explanation[1],
            )
            for explanation in self.explanations
        ]

    def __repr__(self):
        """Return the string form of `full()`: a list of explanations, each
        item pairing a document position with the item at that position."""
        return str(self.full())

    def __str__(self):
        """Return the string form of `human_readable()`."""
        return str(self.human_readable())
(1-0)*self.classifier.FORBIDDEN_WORD_SCORE_INCREASE 35 | 36 | 'let us go way over budget with vegetables celery pepper eggplant carrot' : 37 | 0.5+(4-1)*self.classifier.FORBIDDEN_WORD_SCORE_INCREASE 38 | } 39 | for sentence, expected_score in expected_outputs.items(): 40 | expected_label = 1 if expected_score >= 0.5 else 0 41 | expected_classifier_output = (expected_label, min(1.0, expected_score)) 42 | actual_classifier_output = self.classifier.predict(sentence) 43 | self.assertEqual(actual_classifier_output, expected_classifier_output) 44 | 45 | if __name__ == '__main__': 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /counterfactuals/tests/regression_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import unittest 4 | import sys 5 | sys.path.append("..") 6 | 7 | from typing import List, Tuple, Set 8 | 9 | from counterfactuals.explainer import SequenceExplainer 10 | 11 | from tests.test_proxies import RemoveWordsPerturbation 12 | from counterfactuals.counterfactual_search import GreedySearch 13 | 14 | 15 | class RegressionTest(unittest.TestCase): 16 | """ 17 | Testing for regressions on a small (generated) dataset 18 | """ 19 | def setUp(self): 20 | # instantiate mocked model and appropraite proxies 21 | self.proxy = RemoveWordsPerturbation() 22 | self.explainer = SequenceExplainer(GreedySearch(self.proxy)) 23 | 24 | def test_empty_input(self): 25 | empty_input = '' 26 | expected_explanation_count = 0 27 | actual_explanation_count = len(self.explainer.explain(empty_input).explanations) 28 | self.assertEqual(actual_explanation_count, expected_explanation_count, 29 | msg='We should not be able to generate explanations for an empty input') 30 | 31 | def test_forbidden_word_input(self): 32 | forbidden_sentence = 'let us mix offensive apple and pear with forbidden pepper' 33 | 34 | # removing 'pepper' is the only explanation possible 
class OffensiveSentenceClassifier:
    """
    Toy rule-based classifier used as the mocked model in the tests.

    A sentence is scored by counting its whitespace-separated tokens that
    appear in the forbidden or the offensive word list:

    - the first forbidden word contributes 0.5, every additional forbidden
      word contributes FORBIDDEN_WORD_SCORE_INCREASE
    - every offensive word contributes OFFENSIVE_WORD_SCORE_INCREASE

    A token that is in both lists is counted as forbidden only.
    """

    OFFENSIVE_WORD_SCORE_INCREASE: float = 0.15
    FORBIDDEN_WORD_SCORE_INCREASE: float = 0.2

    def __init__(
        self,
        offensive_words: List[str],
        forbidden_words: List[str]
    ):
        self.forbidden_words = forbidden_words
        self.offensive_words = offensive_words

    def predict(self, sentence: str) -> Tuple[int, float]:
        """
        Classify a sentence.

        Parameters
        ----------
        sentence : str
            Text to classify; tokens are separated by any run of whitespace

        Returns
        -------
        (label, score)
            label is 1 if the unclamped score is >= 0.5, else 0
            score is clamped to at most 1.0
        """
        # Sets give O(1) membership tests. They are rebuilt per call so that
        # changes made to the public word lists after construction are honoured.
        forbidden = set(self.forbidden_words)
        offensive = set(self.offensive_words)

        forbidden_word_count = 0
        offensive_word_count = 0
        # split() without an argument splits on any whitespace (spaces, tabs,
        # newlines) and never yields empty tokens, unlike split(' ')
        for token in sentence.split():
            if token in forbidden:
                forbidden_word_count += 1
            elif token in offensive:
                offensive_word_count += 1

        # if a forbidden word is used, score is at least 0.5
        # and goes up in increments from there
        score = 0.0
        if forbidden_word_count > 0:
            score += 0.5 + ((forbidden_word_count - 1) * self.FORBIDDEN_WORD_SCORE_INCREASE)

        # Add OFFENSIVE_WORD_SCORE_INCREASE for each offensive word
        score += offensive_word_count * self.OFFENSIVE_WORD_SCORE_INCREASE

        # the label is decided on the unclamped score, the returned score is capped at 1.0
        label = 1 if score >= 0.5 else 0

        return (label, min(score, 1.0))
-------------------------------------------------------------------------------- 1 | Attribution-NonCommercial 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. 
This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-NonCommercial 4.0 International Public 58 | License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-NonCommercial 4.0 International Public License ("Public 63 | License"). To the extent this Public License may be interpreted as a 64 | contract, You are granted the Licensed Rights in consideration of Your 65 | acceptance of these terms and conditions, and the Licensor grants You 66 | such rights in consideration of benefits the Licensor receives from 67 | making the Licensed Material available under these terms and 68 | conditions. 
69 | 70 | Section 1 -- Definitions. 71 | 72 | a. Adapted Material means material subject to Copyright and Similar 73 | Rights that is derived from or based upon the Licensed Material 74 | and in which the Licensed Material is translated, altered, 75 | arranged, transformed, or otherwise modified in a manner requiring 76 | permission under the Copyright and Similar Rights held by the 77 | Licensor. For purposes of this Public License, where the Licensed 78 | Material is a musical work, performance, or sound recording, 79 | Adapted Material is always produced where the Licensed Material is 80 | synched in timed relation with a moving image. 81 | 82 | b. Adapter's License means the license You apply to Your Copyright 83 | and Similar Rights in Your contributions to Adapted Material in 84 | accordance with the terms and conditions of this Public License. 85 | 86 | c. Copyright and Similar Rights means copyright and/or similar rights 87 | closely related to copyright including, without limitation, 88 | performance, broadcast, sound recording, and Sui Generis Database 89 | Rights, without regard to how the rights are labeled or 90 | categorized. For purposes of this Public License, the rights 91 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 92 | Rights. 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. 
Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. NonCommercial means not primarily intended for or directed towards 116 | commercial advantage or monetary compensation. For purposes of 117 | this Public License, the exchange of the Licensed Material for 118 | other material subject to Copyright and Similar Rights by digital 119 | file-sharing or similar means is NonCommercial provided there is 120 | no payment of monetary compensation in connection with the 121 | exchange. 122 | 123 | j. Share means to provide material to the public by any means or 124 | process that requires permission under the Licensed Rights, such 125 | as reproduction, public display, public performance, distribution, 126 | dissemination, communication, or importation, and to make material 127 | available to the public including in ways that members of the 128 | public may access the material from a place and at a time 129 | individually chosen by them. 130 | 131 | k. Sui Generis Database Rights means rights other than copyright 132 | resulting from Directive 96/9/EC of the European Parliament and of 133 | the Council of 11 March 1996 on the legal protection of databases, 134 | as amended and/or succeeded, as well as other essentially 135 | equivalent rights anywhere in the world. 136 | 137 | l. You means the individual or entity exercising the Licensed Rights 138 | under this Public License. Your has a corresponding meaning. 139 | 140 | Section 2 -- Scope. 141 | 142 | a. License grant. 143 | 144 | 1. 
Subject to the terms and conditions of this Public License, 145 | the Licensor hereby grants You a worldwide, royalty-free, 146 | non-sublicensable, non-exclusive, irrevocable license to 147 | exercise the Licensed Rights in the Licensed Material to: 148 | 149 | a. reproduce and Share the Licensed Material, in whole or 150 | in part, for NonCommercial purposes only; and 151 | 152 | b. produce, reproduce, and Share Adapted Material for 153 | NonCommercial purposes only. 154 | 155 | 2. Exceptions and Limitations. For the avoidance of doubt, where 156 | Exceptions and Limitations apply to Your use, this Public 157 | License does not apply, and You do not need to comply with 158 | its terms and conditions. 159 | 160 | 3. Term. The term of this Public License is specified in Section 161 | 6(a). 162 | 163 | 4. Media and formats; technical modifications allowed. The 164 | Licensor authorizes You to exercise the Licensed Rights in 165 | all media and formats whether now known or hereafter created, 166 | and to make technical modifications necessary to do so. The 167 | Licensor waives and/or agrees not to assert any right or 168 | authority to forbid You from making technical modifications 169 | necessary to exercise the Licensed Rights, including 170 | technical modifications necessary to circumvent Effective 171 | Technological Measures. For purposes of this Public License, 172 | simply making modifications authorized by this Section 2(a) 173 | (4) never produces Adapted Material. 174 | 175 | 5. Downstream recipients. 176 | 177 | a. Offer from the Licensor -- Licensed Material. Every 178 | recipient of the Licensed Material automatically 179 | receives an offer from the Licensor to exercise the 180 | Licensed Rights under the terms and conditions of this 181 | Public License. 182 | 183 | b. No downstream restrictions. 
You may not offer or impose 184 | any additional or different terms or conditions on, or 185 | apply any Effective Technological Measures to, the 186 | Licensed Material if doing so restricts exercise of the 187 | Licensed Rights by any recipient of the Licensed 188 | Material. 189 | 190 | 6. No endorsement. Nothing in this Public License constitutes or 191 | may be construed as permission to assert or imply that You 192 | are, or that Your use of the Licensed Material is, connected 193 | with, or sponsored, endorsed, or granted official status by, 194 | the Licensor or others designated to receive attribution as 195 | provided in Section 3(a)(1)(A)(i). 196 | 197 | b. Other rights. 198 | 199 | 1. Moral rights, such as the right of integrity, are not 200 | licensed under this Public License, nor are publicity, 201 | privacy, and/or other similar personality rights; however, to 202 | the extent possible, the Licensor waives and/or agrees not to 203 | assert any such rights held by the Licensor to the limited 204 | extent necessary to allow You to exercise the Licensed 205 | Rights, but not otherwise. 206 | 207 | 2. Patent and trademark rights are not licensed under this 208 | Public License. 209 | 210 | 3. To the extent possible, the Licensor waives any right to 211 | collect royalties from You for the exercise of the Licensed 212 | Rights, whether directly or through a collecting society 213 | under any voluntary or waivable statutory or compulsory 214 | licensing scheme. In all other cases the Licensor expressly 215 | reserves any right to collect such royalties, including when 216 | the Licensed Material is used other than for NonCommercial 217 | purposes. 218 | 219 | Section 3 -- License Conditions. 220 | 221 | Your exercise of the Licensed Rights is expressly made subject to the 222 | following conditions. 223 | 224 | a. Attribution. 225 | 226 | 1. If You Share the Licensed Material (including in modified 227 | form), You must: 228 | 229 | a. 
retain the following if it is supplied by the Licensor 230 | with the Licensed Material: 231 | 232 | i. identification of the creator(s) of the Licensed 233 | Material and any others designated to receive 234 | attribution, in any reasonable manner requested by 235 | the Licensor (including by pseudonym if 236 | designated); 237 | 238 | ii. a copyright notice; 239 | 240 | iii. a notice that refers to this Public License; 241 | 242 | iv. a notice that refers to the disclaimer of 243 | warranties; 244 | 245 | v. a URI or hyperlink to the Licensed Material to the 246 | extent reasonably practicable; 247 | 248 | b. indicate if You modified the Licensed Material and 249 | retain an indication of any previous modifications; and 250 | 251 | c. indicate the Licensed Material is licensed under this 252 | Public License, and include the text of, or the URI or 253 | hyperlink to, this Public License. 254 | 255 | 2. You may satisfy the conditions in Section 3(a)(1) in any 256 | reasonable manner based on the medium, means, and context in 257 | which You Share the Licensed Material. For example, it may be 258 | reasonable to satisfy the conditions by providing a URI or 259 | hyperlink to a resource that includes the required 260 | information. 261 | 262 | 3. If requested by the Licensor, You must remove any of the 263 | information required by Section 3(a)(1)(A) to the extent 264 | reasonably practicable. 265 | 266 | 4. If You Share Adapted Material You produce, the Adapter's 267 | License You apply must not prevent recipients of the Adapted 268 | Material from complying with this Public License. 269 | 270 | Section 4 -- Sui Generis Database Rights. 271 | 272 | Where the Licensed Rights include Sui Generis Database Rights that 273 | apply to Your use of the Licensed Material: 274 | 275 | a. 
for the avoidance of doubt, Section 2(a)(1) grants You the right 276 | to extract, reuse, reproduce, and Share all or a substantial 277 | portion of the contents of the database for NonCommercial purposes 278 | only; 279 | 280 | b. if You include all or a substantial portion of the database 281 | contents in a database in which You have Sui Generis Database 282 | Rights, then the database in which You have Sui Generis Database 283 | Rights (but not its individual contents) is Adapted Material; and 284 | 285 | c. You must comply with the conditions in Section 3(a) if You Share 286 | all or a substantial portion of the contents of the database. 287 | 288 | For the avoidance of doubt, this Section 4 supplements and does not 289 | replace Your obligations under this Public License where the Licensed 290 | Rights include other Copyright and Similar Rights. 291 | 292 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 293 | 294 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 295 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 296 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 297 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 298 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 299 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 300 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 301 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 302 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 303 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 304 | 305 | b. 
TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 306 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 307 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 308 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 309 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 310 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 311 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 312 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 313 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 314 | 315 | c. The disclaimer of warranties and limitation of liability provided 316 | above shall be interpreted in a manner that, to the extent 317 | possible, most closely approximates an absolute disclaimer and 318 | waiver of all liability. 319 | 320 | Section 6 -- Term and Termination. 321 | 322 | a. This Public License applies for the term of the Copyright and 323 | Similar Rights licensed here. However, if You fail to comply with 324 | this Public License, then Your rights under this Public License 325 | terminate automatically. 326 | 327 | b. Where Your right to use the Licensed Material has terminated under 328 | Section 6(a), it reinstates: 329 | 330 | 1. automatically as of the date the violation is cured, provided 331 | it is cured within 30 days of Your discovery of the 332 | violation; or 333 | 334 | 2. upon express reinstatement by the Licensor. 335 | 336 | For the avoidance of doubt, this Section 6(b) does not affect any 337 | right the Licensor may have to seek remedies for Your violations 338 | of this Public License. 339 | 340 | c. For the avoidance of doubt, the Licensor may also offer the 341 | Licensed Material under separate terms or conditions or stop 342 | distributing the Licensed Material at any time; however, doing so 343 | will not terminate this Public License. 344 | 345 | d. 
Sections 1, 5, 6, 7, and 8 survive termination of this Public 346 | License. 347 | 348 | Section 7 -- Other Terms and Conditions. 349 | 350 | a. The Licensor shall not be bound by any additional or different 351 | terms or conditions communicated by You unless expressly agreed. 352 | 353 | b. Any arrangements, understandings, or agreements regarding the 354 | Licensed Material not stated herein are separate from and 355 | independent of the terms and conditions of this Public License. 356 | 357 | Section 8 -- Interpretation. 358 | 359 | a. For the avoidance of doubt, this Public License does not, and 360 | shall not be interpreted to, reduce, limit, restrict, or impose 361 | conditions on any use of the Licensed Material that could lawfully 362 | be made without permission under this Public License. 363 | 364 | b. To the extent possible, if any provision of this Public License is 365 | deemed unenforceable, it shall be automatically reformed to the 366 | minimum extent necessary to make it enforceable. If the provision 367 | cannot be reformed, it shall be severed from this Public License 368 | without affecting the enforceability of the remaining terms and 369 | conditions. 370 | 371 | c. No term or condition of this Public License will be waived and no 372 | failure to comply consented to unless expressly agreed to by the 373 | Licensor. 374 | 375 | d. Nothing in this Public License constitutes or may be interpreted 376 | as a limitation upon, or waiver of, any privileges and immunities 377 | that apply to the Licensor or You, including from the legal 378 | processes of any jurisdiction or authority. 379 | 380 | ======================================================================= 381 | 382 | Creative Commons is not a party to its public 383 | licenses. 
Notwithstanding, Creative Commons may elect to apply one of 384 | its public licenses to material it publishes and in those instances 385 | will be considered the “Licensor.” The text of the Creative Commons 386 | public licenses is dedicated to the public domain under the CC0 Public 387 | Domain Dedication. Except for the limited purpose of indicating that 388 | material is shared under a Creative Commons public license or as 389 | otherwise permitted by the Creative Commons policies published at 390 | creativecommons.org/policies, Creative Commons does not authorize the 391 | use of the trademark "Creative Commons" or any other trademark or logo 392 | of Creative Commons without its prior written consent including, 393 | without limitation, in connection with any unauthorized modifications 394 | to any of its public licenses or any other arrangements, 395 | understandings, or agreements concerning use of licensed material. For 396 | the avoidance of doubt, this paragraph does not form part of the 397 | public licenses. 398 | 399 | Creative Commons may be contacted at creativecommons.org. 400 | -------------------------------------------------------------------------------- /rule_induction/README.md: -------------------------------------------------------------------------------- 1 | # MMD: Machine Learning Misprediction Diagnoser 2 | 3 | Machine learning models often mispredict, and it is hard to tell when and why. We developed a technique, *MMD*, that systematically discovers rules that characterize a subset of the input space of a machine learning model where the model is more likely to mispredict. 4 | 5 | Our work has been published at the International Conference on Foundations in Software Engineering (FSE'21): J. Cito, I. Dillig, S. Kim, V. Murali, S. Chandra, [Explaining Mispredictions of Machine Learning Models using Rule Induction](https://github.com/facebookresearch/mmd/blob/main/paper/FSE21-ML-Misprediction-Preprint.pdf). 
6 | 7 | ```bibtex 8 | @inproceedings{explaining_mispredictions:21, 9 | title={Explaining mispredictions of machine learning models using rule induction}, 10 | author={Cito, J{\"u}rgen and Dillig, Isil and Kim, Seohyun and Murali, Vijayaraghavan and Chandra, Satish}, 11 | booktitle={Proceedings of the 29th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering}, 12 | pages={716--727}, 13 | year={2021} 14 | } 15 | ``` 16 | 17 | ## Requirements 18 | 19 | * Python 3.8 20 | * Pandas 21 | 22 | ## License 23 | 24 | MMD is CC-BY-NC 4.0 (Attr Non-Commercial Inter.) (e.g., FAIR) licensed, as found in the LICENSE file. 25 | -------------------------------------------------------------------------------- /rule_induction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/mmd/dd65bafd57aa52e3c46e6167ee1bf5907e8a1c3b/rule_induction/__init__.py -------------------------------------------------------------------------------- /rule_induction/cfg.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """ 8 | For selecting the seed set of parameters. This is the minimum positive deviation from 9 | the base rate of occurrence of that feature in the population. Only then we consider it interesting. 10 | 11 | In other words: We compute how often a predicate occurs within the target subgroup 12 | and compare it to how often it occurs in the overall population. 
@dataclass
class Settings:
    """
    Configurable settings for rule discovery (consumed by `discover`).

    beam_width : int
        Beam width controls the width of the beam during beam search.
        A large beam leads to more precision, but is slower

    num_rules : int
        Number of final rules to display

    num_bins : int
        The number of bins to use for discretization.

    binning_method : BinningMethod
        Binning method; currently only EQFreq and EQWidth are supported

    minimum_relative_coverage : float
        Determines the minimum relative amount of coverage (percentage of rows) each subgroup should have.
        minimum_relative_coverage=1.5 means that each inferred subgroup has to cover at least 1.5% of the total rows in the dataset

    target_coverage : float
        The percentage of target that we want to cover (only applicable if Settings.disjunctions=True)

    all_rules : bool
        If True (and disjunctions is False), `discover` returns every generated rule
        instead of only the best num_rules Pareto-optimal ones

    disjunctions : bool
        Indicates whether the algorithm should also find disjunctions

    debug_print_function : Callable
        Forwarded by `discover` to `debug_list_log` for debug output.
        Defaults to None (presumably "no debug output" -- confirm in util.debug_list_log)
    """
    beam_width: int = 10
    num_rules: int = 3
    num_bins: int = 5
    binning_method: BinningMethod = BinningMethod.EQFreq
    # annotated as float (not int): the value is a fractional percentage
    minimum_relative_coverage: float = 0.1
    target_coverage: float = 0.6
    all_rules: bool = False
    disjunctions: bool = False
    debug_print_function: Callable = None
target_attrib 91 | (e.g., for True, we want to predict when it's true) 92 | 93 | relevant_attributes : Dict[str, str] 94 | Mapping from relevant attributes to consider to their type (D (discrete), I (Int), C (Continuous)) 95 | 96 | config : Settings 97 | Contains all configurable settings 98 | 99 | sample : pd.DataFrame 100 | A representative sample of the full dataset (optional) 101 | 102 | Returns 103 | ------- 104 | Result containing final rules 105 | List of subgroups (conjunction of rules) describing the phenomenon of choice (target_attribute) 106 | """ 107 | 108 | if config is None: 109 | config = Settings() 110 | 111 | # partition into positive and negative data subsets (wrt. target_attribute) 112 | (positive_df, negative_df) = partition(df, target) 113 | 114 | all_predicates = Predicates.generate(df, relevant_attributes, config.num_bins, config.binning_method) 115 | 116 | # print features before and after filtering 117 | debug_list_log("Predicates before filter: {}".format(len(all_predicates)), all_predicates, config.debug_print_function) 118 | 119 | # filter predicates that do not meet our thresholds 120 | relevant_predicates = Predicates.filter(all_predicates, df, positive_df, cfg.pred_relevance_threshold, config.minimum_relative_coverage) 121 | debug_list_log("Predicates after filter: {}".format(len(relevant_predicates)), relevant_predicates, config.debug_print_function) 122 | 123 | # generate beam_width number of rules 124 | rules = generate_conjuncts(df, target, relevant_predicates, config.beam_width) 125 | 126 | # return all rules if requested in the config 127 | if config.all_rules and not config.disjunctions: 128 | time_log = Timing.get_time_log() 129 | return Result(rules, df, target, time_log) 130 | 131 | # find the best num_rules that are Pareto optimal 132 | pareto_optimal_rules = sorted_pareto_optimal_rules(rules, df, target) 133 | best_rules = pareto_optimal_rules[:config.num_rules] 134 | debug_list_log("Best (pareto-optimal) rules: ", 
best_rules, config.debug_print_function) 135 | 136 | if config.disjunctions: 137 | final_rule_sets = [] 138 | for r in best_rules: 139 | rs = generate_ruleset(r, df, target, relevant_predicates, 140 | config.target_coverage, config.beam_width) 141 | final_rule_sets.append(rs) 142 | return DisjunctiveResult(final_rule_sets, df, target) 143 | 144 | time_log = Timing.get_time_log() 145 | return Result(best_rules, df, target, time_log) 146 | 147 | 148 | class ConfusionMatrixRuleSet(ConfusionMatrix): 149 | def coverage(self, ruleset, df=None): 150 | if df is None: 151 | df = self.df 152 | 153 | subgroups = [df.query(rule.dataframe_query()) for rule in ruleset.elems] 154 | subgroups_merged = pd.concat(subgroups, join='outer', axis=1).drop_duplicates() 155 | return subgroups_merged 156 | 157 | 158 | class Result: 159 | def __init__(self, rules: List, df: pd.DataFrame, target: Target, time_log=None): 160 | self.rules = rules 161 | self.df = df 162 | self.target = target 163 | self.time_log = time_log 164 | 165 | def __repr__(self): 166 | final = [] 167 | for r in self.rules: 168 | final.append(str(r)) 169 | return "\n".join(final) 170 | 171 | """ Parses rule descriptions and turns them into a Result object 172 | 173 | Parameters 174 | ---------- 175 | 176 | df : pd.DataFrame 177 | Tabular data as Pandas data frame 178 | 179 | rule_str : str 180 | A set of rules separated by newlines (\n) 181 | Each rule (on each line) is a conjunction of predicates (separated by '&') 182 | 183 | target: Target 184 | Target attribute and value 185 | 186 | Returns 187 | ------- 188 | Result containing rules parsed from the string input 189 | 190 | """ 191 | @classmethod 192 | def import_rules(cls, rule_str: str, df: pd.DataFrame, target: Target): 193 | rules = list(map(Rule.from_string, rule_str.split('\n'))) 194 | return cls(rules, df, target) 195 | 196 | def dataframe(self): 197 | result = [] 198 | for rule in self.rules: 199 | result.append(self._rule_stats(rule, self.df, self.target)) 
200 | return pd.DataFrame(result).sort_values(by='precision', ascending=False) 201 | 202 | # computes precision, recall, coverage for each rule 203 | def _rule_stats(self, rule: Rule, df: pd.DataFrame, target): 204 | stats = ConfusionMatrix(df, target) 205 | rule_coverage = stats.num_covered(rule) 206 | return { 207 | 'rule' : str(rule), 208 | 'precision' : stats.get_tp_ratio(rule), 209 | 'recall' : stats.get_coverage_ratio(rule), 210 | 'rule_coverage' : rule_coverage/len(df), 211 | } 212 | 213 | def _print_rule(self, rule, df: pd.DataFrame, target): 214 | target_name, target_value = target 215 | rule_stat = self._rule_stats(rule, df, target) 216 | stats = ConfusionMatrix(df, target) 217 | 218 | print(bold("Subgroup: {}".format(str(rule)))) 219 | 220 | overall_coverage = stats.num_covered(rule) 221 | 222 | print("% of subgroup in population (Full Dataset):\t{}% ({} rows)".format( 223 | percent_format(rule_stat['rule_coverage']), 224 | overall_coverage) 225 | ) 226 | 227 | print("Precision: P({}={} | {}) = {}%".format( 228 | target_name, 229 | target_value, 230 | str(rule), 231 | percent_format(rule_stat['precision']) 232 | )) 233 | 234 | print("Recall: P({} | {}={}) = {}%".format( 235 | str(rule), 236 | target_name, 237 | target_value, 238 | percent_format(rule_stat['recall']) 239 | )) 240 | 241 | def print(self, df: pd.DataFrame = None): 242 | if df is None: 243 | df = self.df 244 | 245 | print("Subgroup Discovery Result\n") 246 | print("Found {} subgroups".format(bold(str(len(self.rules))))) 247 | 248 | print(bold("Dataset")) 249 | print("Target: {}={}".format(self.target[0], self.target[1])) 250 | print("# Rows:\t{}".format(df.shape[0])) 251 | print("# Cols:\t{}".format(df.shape[1])) 252 | 253 | pos, _ = partition(df, self.target) 254 | print("% Target in dataset {}%".format(percent_format(len(pos)/len(df)))) 255 | 256 | for rule in self.rules: 257 | print("="*40) 258 | self._print_rule(rule, df, self.target) 259 | 260 | def get_rule_stats(self): 261 | return [ 
262 | self._rule_stats(rule, self.df, self.target) 263 | for rule in self.rules 264 | ] 265 | 266 | 267 | class DisjunctiveResult: 268 | def __init__(self, rulesets: List[RuleSet], df: pd.DataFrame, target: Target): 269 | self.rulesets = rulesets 270 | self.df = df 271 | self.target = target 272 | 273 | # computes precision, recall, coverage for each rule 274 | def _rule_stats(self, ruleset: RuleSet, df: pd.DataFrame, target): 275 | stats = ConfusionMatrixRuleSet(df, target) 276 | rule_coverage = stats.num_covered(ruleset) 277 | return { 278 | 'ruleset' : str(ruleset), 279 | 'precision' : stats.get_tp_ratio(ruleset), 280 | 'recall' : stats.get_coverage_ratio(ruleset), 281 | 'rule_coverage' : rule_coverage/len(df) 282 | } 283 | 284 | def print(self): 285 | for ruleset in self.rulesets: 286 | print("#"*40) 287 | print(ruleset) 288 | print("#"*40) 289 | for rule in ruleset.elems: 290 | Result([rule], self.df, self.target).print() 291 | 292 | def dataframe(self): 293 | result = [] 294 | for ruleset in self.rulesets: 295 | result.append(self._rule_stats(ruleset, self.df, self.target)) 296 | return pd.DataFrame(result).sort_values(by='precision', ascending=False) 297 | -------------------------------------------------------------------------------- /rule_induction/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.3.2 2 | -------------------------------------------------------------------------------- /rule_induction/rules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/mmd/dd65bafd57aa52e3c46e6167ee1bf5907e8a1c3b/rule_induction/rules/__init__.py -------------------------------------------------------------------------------- /rule_induction/rules/cache.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
# Mapping from cache keys (rule string + data frame digest, see get_cache_key)
# to a tuple (q, tps, fps, num_covered)
cache = {}


def get_cache_key(rule, df: pd.DataFrame):
    """
    Returns the cache key used for memoizing the evaluation of a rule on a data frame.

    The key combines str(rule) with a digest over the hash of EVERY row
    (index included) of df. Using str() of the hash Series is not enough:
    pandas abbreviates the textual form of long Series, so data frames that
    only differ in rows hidden by the abbreviation would share a key.

    Parameters
    ----------
    rule : object
        Rule (or rule set); only its string form is used
    df : pd.DataFrame
        Data the rule is evaluated on

    Returns
    -------
    str
        "<rule>:<hex digest>"
    """
    import hashlib  # function-scope import: this module imports nothing else from the stdlib

    row_hashes = hash_pandas_object(df)
    digest = hashlib.sha256(row_hashes.to_numpy().tobytes()).hexdigest()
    return str(rule) + ":" + digest
# Find a set of beam_width conjuncts (rules) based on our goodness criteria (using beam search)
# a Rule (aka conjunct) is a set of predicates conjoined by AND
# Note that objective function is implemented in Rule.eval
@time_log
def generate_conjuncts(df: pd.DataFrame, target, predicates, beam_width):
    """
    Runs a beam search over conjunctions of predicates.

    Parameters
    ----------
    df : pd.DataFrame
        Data the rules are evaluated on
    target
        Target (attribute, value) used to split df into positives / negatives
    predicates
        Candidate predicates used as seeds and as extensions
    beam_width : int
        Number of rules kept in the beam

    Returns
    -------
    dict
        Mapping from Rule to its q value
    """
    stats = ConfusionMatrix(df, target)
    num_pos = stats.pos_df.shape[0]
    num_total = df.shape[0]
    min_support = math.sqrt(num_pos) / num_total

    # beam and new_beam are mappings from rules to q_values
    # seed the beam with one single-predicate rule per predicate
    beam = {}
    for pred in predicates:
        r = Rule()
        r.add_conjunct(pred)
        beam[r] = r.eval(stats)

    if beam_width > len(predicates):
        new_beam = beam.copy()
    else:
        # keep every seed whose q value reaches the beam_width-th best q value
        values = sorted(beam.values(), reverse=True)
        cutoff = values[beam_width - 1]
        new_beam = {key: value for (key, value) in beam.items() if value >= cutoff}
    # beam: rules expanded in this round; new_beam: beam under construction
    beam = new_beam.copy()

    i = 0
    while True:
        improved = False
        # iterate over a snapshot: new_beam (aliased by beam after the first
        # round) is modified while candidates are generated
        for rule in list(beam):
            for pred in predicates:
                if rule.check_useless(pred):
                    continue

                new_rule = Rule(rule)
                new_rule.add_conjunct(pred)

                if is_duplicate(new_rule, new_beam):
                    continue

                q = new_rule.eval(stats)
                num_tp = len(stats.true_positive(new_rule))
                exceeds_min_support = num_tp / df.shape[0] >= min_support
                if not exceeds_min_support:
                    continue

                (worst_rule, worst_q) = get_worst_rule(new_beam, beam_width)

                if q <= worst_q:
                    continue

                # TP(new_rule) < TP(existing) AND
                # FP(new_rule) > FP(existing)
                if is_irrelevant(new_rule, new_beam, stats):
                    continue
                # NOTE(review): the worst rule is only evicted when its q value
                # is positive, so the beam can exceed beam_width otherwise.
                if worst_q > 0:
                    new_beam.pop(worst_rule)
                new_beam[new_rule] = q
                improved = True

        if i == 0:
            # clean_beam returns a new mapping, so its result has to be kept
            new_beam = clean_beam(new_beam)

        if not improved:
            break

        beam = new_beam
        i += 1
    return beam


# is rule implied by existing rule in the beam?
def is_redundant(rule, new_beam):
    return any(r.implies(rule) for r in new_beam)


# is rule a duplicate of another rule in the beam?
def is_duplicate(rule, new_beam):
    return any(r == rule for r in new_beam)


# find worst rule in beam
def get_worst_rule(beam, beam_width):
    """
    Returns (rule, q) of the rule with the lowest q value in the beam.

    Returns (None, -1) while the beam still has room (fewer than beam_width
    rules) or is empty, i.e. when there is nothing to evict.
    """
    if len(beam) < beam_width or not beam:
        return (None, -1)
    # min() returns the first rule among those sharing the lowest q value
    worst_rule = min(beam, key=beam.get)
    return (worst_rule, beam[worst_rule])


def clean_beam(beam):
    """Returns a copy of the beam without rules that have no predicates."""
    return {r: q for r, q in beam.items() if len(r.elems) != 0}


def print_beam(beam, stats: ConfusionMatrix):
    """Prints every rule of the beam in ascending order of q value."""
    sorted_beam = sorted(beam.items(), key=lambda x: x[1])
    for (rule, val) in sorted_beam:
        tpr = stats.get_tp_ratio(rule)
        coverage = stats.get_coverage_ratio(rule)
        print(rule, "q value: ", val, "\n", " True positive ratio ", tpr, " coverage: ", coverage)


# A rule R is irrelevant if, for some existing rule of the beam,
# TP(R) is a subset of TP(existing) and FP(existing) is a subset of FP(R)
def is_irrelevant(rule: Rule, new_beam, stats: ConfusionMatrix):
    (new_q, new_tp, new_fp) = rule.get_eval_result(stats)
    for r in new_beam:
        if len(r.elems) == 0:
            continue
        (old_q, old_tp, old_fp) = r.get_eval_result(stats)
        if new_tp.issubset(old_tp) and old_fp.issubset(new_fp):
            return True
    return False
# Predicate of the form attrib OP value
class Predicate:
    """A single comparison ``attribute OP value`` (OP is one of ==, !=, <=, >)."""

    def __init__(self, attribute_name, operator, value):
        self.name = attribute_name
        self.op = operator
        self.val = value

    def _get_uid(self):
        return (self.name, self.op, self.val)

    def __str__(self):
        # (in)equality values are always rendered as a double-quoted string
        # NOTE(review): this also quotes non-string values (e.g. 3 -> "3");
        # confirm that discrete attributes always hold strings.
        if self.op == '==' or self.op == '!=':
            value = '"{}"'.format(self.val)
        else:
            value = str(self.val)
        return self.name + self.op + value

    def dataframe_query(self):
        return self.__str__()

    @classmethod
    def from_string(cls, predicate_str: str):
        """
        Parses a predicate such as ``age>30`` or ``color=="red"``.

        Inverse of __str__ for string values: surrounding double quotes of an
        (in)equality value are removed again. A single '=' is read as '=='.

        Raises
        ------
        ValueError
            If predicate_str contains no operator
        """
        # split at the first operator only, the value may contain operator characters
        parts = re.split(r'(==|=|!=|<=|<|>=|>)', predicate_str, maxsplit=1)
        if len(parts) != 3:
            raise ValueError("not a predicate: {!r}".format(predicate_str))
        attribute_name, operator, value = parts
        attribute_name = attribute_name.strip()
        value = value.strip()
        # partial order in predicate indicates numerical type
        if operator[0] == '<' or operator[0] == '>':
            value = ast.literal_eval(value)
        elif len(value) >= 2 and value[0] == '"' and value[-1] == '"':
            # undo the quoting applied by __str__
            value = value[1:-1]
        if operator == '=':
            operator = '=='
        return cls(attribute_name, operator, value)

    def num_positive(self, df: pd.DataFrame):
        """Returns the number of rows of df that satisfy this predicate."""
        return len(df.query(self.dataframe_query()))

    def eval(self, instance):
        """Evaluates the predicate on one data point (anything indexable by attribute name)."""
        inst_val = instance[self.name]
        if self.op == ">":
            return inst_val > self.val
        elif self.op == "<=":
            return inst_val <= self.val
        elif self.op == "==":
            return inst_val == self.val
        elif self.op == "!=":
            return inst_val != self.val

        raise Exception("attribute type not supported" + self.op)

    def __eq__(self, other):
        return self.name == other.name and self.op == other.op and self.val == other.val

    def __hash__(self):
        # consistent with __eq__: hash of (name, op, val)
        return hash(self._get_uid())

    def implies(self, other):
        """Returns True if every value satisfying this predicate also satisfies other."""
        if self.name != other.name:
            return False

        if self.op == other.op and self.val == other.val:
            return True

        if self.op == ">":
            if other.op == ">":
                return self.val > other.val
            if other.op == "!=":
                # x > a excludes every v <= a (including v == a)
                return self.val >= other.val
            return False

        if self.op == "<=":
            if other.op == "<=":
                return self.val <= other.val
            if other.op == "!=":
                # x <= a only excludes v > a (x may be equal to a)
                return self.val < other.val
            return False

        if self.op == "==":
            if other.op == ">":
                return self.val > other.val
            if other.op == "<=":
                return self.val <= other.val
            if other.op == "!=":
                return self.val != other.val
            return False

        if self.op == "!=":
            return False

        raise Exception("attribute type not supported" + self.op)


def filter(predicates: Set[Predicate], df: pd.DataFrame, pos: pd.DataFrame, predicate_relevance_threshold=0.01, minimum_relative_coverage=0):
    """
    This is a pre-processing step to identify a subset of interesting predicates
    to consider in the seed set.
    It considers a predicate to be "interesting" if the data filtered according
    to this predicate has a slightly higher occurrence of trait compared
    to whole population

    Parameters
    ----------

    predicates : Set[Predicate]
        Candidate predicates

    df : pd.DataFrame
        Tabular data as Pandas data frame

    pos : pd.DataFrame
        Rows of df for which the target holds

    predicate_relevance_threshold : float
        Minimum amount by which the target ratio among the rows satisfying a
        predicate has to exceed the target ratio of the whole data

    minimum_relative_coverage : float
        Minimum share of rows (in %) a predicate has to cover

    Returns
    -------
    filtered: Set[Predicate]
        A subset of filtered predicates
    """
    filtered = set()
    num_pos = pos.shape[0]
    num_total = df.shape[0]
    if num_total == 0:
        # no data: no predicate can be relevant
        return filtered
    ratio_whole = num_pos / num_total

    for predicate in predicates:
        num_pass = predicate.num_positive(df)
        if num_pass == 0:
            continue
        # minimum_relative_coverage is given in %
        if num_pass / num_total < (minimum_relative_coverage/100):
            continue
        num_tp = predicate.num_positive(pos)
        ratio_current_predicate = num_tp / num_pass

        diff = ratio_current_predicate - ratio_whole

        if diff > predicate_relevance_threshold:
            filtered.add(predicate)
    return filtered


def generate(df: pd.DataFrame, relevant_attributes: Dict[str, str], num_bins: int, binning_method: "BinningMethod") -> Set[Predicate]:
    """
    Generates a set of predicates based on the dataset and search hyper-parameters

    Parameters
    ----------

    df : pd.DataFrame
        Tabular data as Pandas data frame

    relevant_attributes : Dict[str, str]
        Mapping from attribute name to its type: 'D' (discrete),
        'I' (int) or 'C' (continuous)

    num_bins : int
        Number of bins

    binning_method : BinningMethod
        Either equal frequency or width binning

    Returns
    -------
    features: Set[Predicate]
        The generated predicates

    Raises
    ------
    Exception
        If an attribute type is not one of 'D', 'I', 'C'
    """
    features: Set[Predicate] = set()
    for (attribute_name, attribute_type) in relevant_attributes.items():
        values = get_values_for_attributes(df, attribute_name)
        if attribute_type == 'D':
            features |= generate_discrete_predicates(attribute_name, values)
        elif attribute_type == 'C' or attribute_type == 'I':
            features |= generate_continous_predicates(attribute_name, values, num_bins, binning_method)
        else:
            raise Exception("attribute type not supported")
    return features


# Generates predicates for continuous values using discretization
# strategy specified by binning_method
def generate_continous_predicates(attribute_name: str, values: List, num_bins: int, binning_method: "BinningMethod") -> Set[Predicate]:
    """
    Returns the predicates ``attribute <= cutoff`` and ``attribute > cutoff``
    for every cutoff of the chosen discretization.

    Raises
    ------
    ValueError
        If binning_method is not a supported BinningMethod
    """
    if binning_method == BinningMethod.EQFreq:
        cutoffs = equi_freq(values, num_bins)
    elif binning_method == BinningMethod.EQWidth:
        cutoffs = equi_width(values, num_bins)
    else:
        raise ValueError("binning method not supported: {}".format(binning_method))

    preds = set()
    for val in cutoffs:
        preds.add(Predicate(attribute_name, "<=", val))
        preds.add(Predicate(attribute_name, ">", val))

    return preds


def generate_discrete_predicates(attribute_name: str, values: Set) -> Set[Predicate]:
    """Returns == and != predicates for every value (only == for two-valued attributes)."""
    preds = set()
    # In a boolean attribute (e.g., True, False), there is no need to generate 4 predicates:
    # (attribute_name != values[0]) implies (attribute_name == values[1])
    if len(values) == 2:
        values = list(values)
        preds.add(Predicate(attribute_name, "==", values[0]))
        preds.add(Predicate(attribute_name, "==", values[1]))
        return preds

    for v in values:
        preds.add(Predicate(attribute_name, "==", v))
        preds.add(Predicate(attribute_name, "!=", v))
    return preds


# Returns cut-offs for discretization based on equal frequency binning
def equi_freq(values: Set, num_bins: int):
    """
    Returns the set of cut-offs of an equal frequency binning.

    The sorted values are split into num_bins chunks of equal size; the
    largest value of every chunk is a cut-off. If there are fewer values than
    bins, the square-root choice is used for the number of bins.
    Returns an empty set for empty input.
    """
    sorted_values = sorted(values)
    num_values = len(sorted_values)
    if num_values == 0:
        return set()

    # number of bins > number of unique values
    if num_bins > num_values:
        # Use square-root choice
        num_bins = math.ceil(math.sqrt(num_values))

    values_ratio = int(num_values / num_bins)

    # last element of the i-th chunk of values_ratio elements
    return {sorted_values[(i + 1) * values_ratio - 1] for i in range(num_bins)}


# Returns cut-offs for discretization based on equal width binning
def equi_width(values: Set, num_bins: int):
    """
    Returns the list of cut-offs of an equal width binning.

    The range [min, max] of the values is split into num_bins intervals of
    equal width; the num_bins - 1 interior interval boundaries are the
    cut-offs. Returns an empty list if there is nothing to split (no values,
    a single distinct value, or fewer than two bins).
    """
    distinct = sorted(set(values))
    if not distinct or num_bins <= 1:
        return []

    min_val = distinct[0]
    max_val = distinct[-1]
    val_range = max_val - min_val
    if val_range == 0:
        return []

    width = val_range / num_bins
    # boundaries are relative to min_val so shifted / negative / float data work
    return [min_val + i * width for i in range(1, num_bins)]


def get_values_for_attributes(df: pd.DataFrame, attribute_name: str) -> Set:
    """Returns the set of distinct values of a column."""
    return set(df[attribute_name].unique())


class Rule:
    # Rule : is a conjunction of predicates. We also call this a 'conjunct'
    def __init__(self, orig=None):
        # copy constructor when another rule is passed
        self.elems = set(orig.elems) if orig else set()

    def add_conjunct(self, pred):
        self.elems.add(pred)

    def check_useless(self, pred):
        """
        Returns True if adding pred cannot yield a useful rule because the
        rule already constrains the same attribute in a conflicting or
        subsuming way.
        """
        # predicates use '==' ('=' is accepted as an alias of it)
        equality_ops = ("=", "==")
        for p in self.elems:
            if pred.name != p.name:
                continue
            if p.op == ">":
                if pred.op == ">" or pred.op in equality_ops or pred.op == "!=":
                    return True
            if p.op == "<=":
                if pred.op == "<=" or pred.op in equality_ops or pred.op == "!=":
                    return True
            if p.op in equality_ops:
                return True

            if p.op == "!=":
                return True

        return False

    # does this rule imply another one?
    def implies(self, other):
        # every predicate of other has to be implied by some predicate of this rule
        return all(
            any(self_pred.implies(other_pred) for self_pred in self.elems)
            for other_pred in other.elems
        )

    def is_subset(self, other):
        return all(any(p == q for q in other.elems) for p in self.elems)

    def __eq__(self, other):
        return self.is_subset(other) and other.is_subset(self)

    def __hash__(self):
        # hash over the predicates, consistent with __eq__, so that equal
        # rules are matched during beam search
        return hash(frozenset(self.elems))

    def __str__(self):
        # sorted: the text (also used for cache keys and queries) must not
        # depend on the iteration order of the predicate set
        return ' & '.join(sorted(str(p) for p in self.elems))

    def __repr__(self):
        return self.__str__()

    def dataframe_query(self):
        return self.__str__()

    @classmethod
    def from_string(cls, rule_str : str):
        """Parses a conjunction of predicates separated by '&'."""
        r = cls()
        for predicate_str in rule_str.split('&'):
            r.add_conjunct(Predicate.from_string(predicate_str.strip()))
        return r

    # Returns a triple (Q_value, TPs, FPs)
    def get_eval_result(self, stats):
        cache_key = get_cache_key(self, stats.df)
        if cache_key not in cache:
            # eval() stores its result in the cache
            self.eval(stats)
            assert cache_key in cache
        (q, tps, fps, nc) = cache[cache_key]
        return (q, tps, fps)

    # Does this rule evaluate to true or false for this point?
    def eval_point(self, point):
        return self.covers(point)

    # Does this rule cover this point (i.e., does it evaluate to true?)
    def covers(self, point):
        return all(pred.eval(point) for pred in self.elems)

    def get_size(self):
        return len(self.elems)

    # Important function for calculating objective (q) value
    def eval(self, stats: "ConfusionMatrix"):
        """
        Returns the q value of this rule on the data of stats.

        q is the weighted sum (weights from cfg) of the rule's coverage ratio,
        its true positive ratio and a size score. The result is memoized in
        the cache together with the true positives, false positives and the
        number of covered rows.
        """
        cache_key = get_cache_key(self, stats.df)
        if cache_key in cache:
            (q, tps, fps, nc) = cache[cache_key]
            return q

        tps = stats.true_positive(self)
        fps = stats.false_positive(self)
        nc = stats.num_covered(self)

        # Guide rule size with cfg.max_size parameter
        size_score = 1 - self.get_size() / cfg.max_size

        q = stats.get_coverage_ratio(self) * cfg.coverage_conjunct \
            + stats.get_tp_ratio(self) * cfg.tpr_conjunct \
            + size_score * cfg.size_conjunct

        cache[cache_key] = (q, tps, fps, nc)
        return q

    def is_better(self, other_rule, stats: "ConfusionMatrix"):
        """
        It's better if it's better both in terms of tpr and coverage
        This is used for filtering out rules that are not on the Pareto frontier
        """
        self_tpr = stats.get_tp_ratio(self)
        other_tpr = stats.get_tp_ratio(other_rule)
        self_coverage = stats.get_coverage_ratio(self)
        other_coverage = stats.get_coverage_ratio(other_rule)
        return (self_tpr > other_tpr and self_coverage > other_coverage)
def generate_ruleset(
    rule: "Rule",
    df: pd.DataFrame,
    target,
    predicates: "Set[Predicate]",
    target_coverage: float,
    beam_width: int
):
    """
    Discovers a set of rules until target_coverage is achieved or until a certain number of
    rules are generated. The idea is to find one rule, then filter out part of the
    data set covered by that rule, and then find the next rule etc. until
    termination condition holds (e.g., target coverage is achieved)

    rule : Rule
        Initial rule seed

    df : pd.DataFrame
        Full dataset

    target : Target
        Target variable (attribute_name, attribute_value)

    predicates : Set[Predicate]
        Relevant predicates

    target_coverage : float
        The share of target_attrib that we want to cover; compared against
        the sum of the coverage ratios of the rules

    beam_width : int
        Beam width controls the width of the beam during beam search.
        A large beam leads to more precision, but is slower

    Returns
    -------
    rs: RuleSet
        Mined ruleset

    """
    rs = RuleSet()
    rs.add_rule(rule)  # start with the rule that was passed in

    cur_df = df
    # statistics are always computed on the full dataset
    stats = ConfusionMatrix(df, target)

    for _ in range(cfg.MAX_ITERS):
        # remove the points for which the 'rule' was true
        cur_df = filter_by_rule(cur_df, rule)
        if len(cur_df) == 0:
            # every row is covered already, nothing left to explain
            break

        rules = generate_conjuncts(cur_df, target, predicates, beam_width)
        if len(rules) == 0:
            break

        rules = filter_rules(rules, stats)
        if len(rules) == 0:
            break
        best_rules = sorted_pareto_optimal_rules(rules, df, target)
        rule = best_rules[0]  # pick the very top one

        if len(rule.elems) == 0:
            break

        rs.add_rule(rule)

        # Coverage of a disjunctive ruleset is the coverage sum of all conjunctions
        cur_coverage = sum(stats.get_coverage_ratio(r) for r in rs.elems)
        if cur_coverage >= target_coverage:
            break

    return rs


# Filters out parts of data that are covered by the given rule
def filter_by_rule(df: pd.DataFrame, rule: "Rule"):
    """
    Returns a copy of df without the rows for which rule.eval_point is true.

    Rows are selected by position, so rows sharing an index label with a
    covered row are kept. df itself is not modified.
    """
    kept_positions = [
        position
        for position, (_, row) in enumerate(df.iterrows())
        if not rule.eval_point(row)
    ]
    return df.iloc[kept_positions].copy()


# performs statistical significant test to identify interesting rules
def filter_rules(rules, stats):
    """Returns the sub-mapping of rules (rule -> q value) that pass pass_threshold."""
    return {r: q for r, q in rules.items() if pass_threshold(r, stats)}


# checks whether the given rule or rule set
# passes various thresholds in terms
# of true positive ratio and coverage
def pass_threshold(r, stats: "ConfusionMatrix"):
    num_pos = stats.pos_df.shape[0]
    num_total = stats.df.shape[0]
    ratio_whole = num_pos / num_total
    min_support = math.sqrt(num_pos) / num_total

    # enough true positives?
    num_tp = len(stats.true_positive(r))
    if num_tp / num_total < min_support:
        return False
    # clearly more precise than the base rate of the whole dataset?
    cur_tpr = stats.get_tp_ratio(r)
    if cur_tpr - ratio_whole < cfg.rule_relevance_threshold:
        return False
    # enough coverage of the target?
    cur_coverage = stats.get_coverage_ratio(r)
    return cur_coverage > cfg.rule_coverage_threshold


class RuleSet:
    """
    Ruleset : is a set of Rules, implicitly connected by a disjunction.
    """
    def __init__(self, orig=None):
        # copy constructor when another rule set is passed
        self.elems = set(orig.elems) if orig else set()

    def add_rule(self, rule):
        self.elems.add(rule)

    def __str__(self):
        # sorted: the text (also used for cache keys) must not depend on the
        # iteration order of the rule set
        return ' | '.join(sorted(f'({p})' for p in self.elems))

    def __eq__(self, other):
        """Overrides the default implementation"""
        return self.equals(other)

    def __hash__(self):
        return hash(frozenset(self.elems))

    # Does this ruleSet cover this point (i.e., does it evaluate to true?)
    def covers(self, point):
        return any(rule.eval_point(point) for rule in self.elems)

    # Does this rule evaluate to true or false for this point?
    def eval_point(self, point):
        return self.covers(point)

    def is_subset(self, other):
        return all(any(r == x for x in other.elems) for r in self.elems)

    def equals(self, other):
        return self.is_subset(other) and other.is_subset(self)

    def impliesRule(self, rule):
        return any(r.implies(rule) for r in self.elems)

    def impliesRuleSet(self, other):
        return all(self.impliesRule(r) for r in other.elems)

    def check_useless(self, rule):
        """Returns True if rule is implied by this rule set or implies all of its rules."""
        if self.impliesRule(rule):
            return True
        rs = RuleSet()
        rs.add_rule(rule)
        return rs.impliesRuleSet(self)

    def get_size(self):
        return sum(r.get_size() for r in self.elems)
class TimeLog:
    """List of timing entries ({'name': ..., 'execution_time': ...}) of timed function calls."""

    def __init__(self, time_log, debug=False):
        # time_log: list the entries are appended to
        # debug: print every entry as soon as it is added
        self.time_log = time_log
        self.debug = debug

    @staticmethod
    def _format_entry(time_log_entry):
        # single place for the textual form of an entry
        return "{}: {:2.4f}s".format(time_log_entry['name'], time_log_entry['execution_time'])

    def add_time_log(self, name, start, end):
        """Appends an entry for a call named `name` that ran from `start` to `end` (seconds)."""
        time_log_entry = {'name' : name, 'execution_time': end - start }
        self.time_log.append(time_log_entry)
        if self.debug:
            print(self._format_entry(time_log_entry))

    def __str__(self):
        return "\n".join(map(self._format_entry, self.time_log))

_time_log_container = []
TIMING_DEBUG=False
_time_log = TimeLog(_time_log_container, TIMING_DEBUG)

def get_time_log():
    """Returns a copy of the entries recorded so far and starts a fresh, empty log."""
    global _time_log, _time_log_container
    snapshot = copy.deepcopy(_time_log)
    # Reset time log entries
    _time_log_container = []
    _time_log = TimeLog(_time_log_container, TIMING_DEBUG)
    return snapshot


def time_log(f):
    """
    Decorator that records the execution time of every call of f in the
    module-level time log, under the name "<module>/<function>".

    The time is recorded even if f raises; the exception is propagated.
    """
    @wraps(f)
    def wrap(*args, **kw):
        start = time.perf_counter()
        try:
            return f(*args, **kw)
        finally:
            end = time.perf_counter()
            name = f.__module__ + "/" + f.__name__
            # _time_log is looked up at call time, so entries go to the log
            # created by the latest get_time_log() reset
            _time_log.add_time_log(name, start, end)
    return wrap
def partition(
    df: pd.DataFrame,
    target
):
    """ Partitions data into positive and negative subsets

    Parameters
    ----------

    df : pd.DataFrame
        Tabular data as Pandas data frame

    target : Tuple[str, object]
        Pair (target_name, target_value): the column we care about
        (e.g., misprediction) and the value that marks a row as positive

    Returns
    -------
    (pos_data, neg_data): Tuple[pd.DataFrame, pd.DataFrame]
        Tuple of positive and negative subsets of data in df wrt. target attribute and value

    """
    target_name, target_value = target
    is_positive = df[target_name] == target_value
    positive_df = df[is_positive]
    negative_df = df[df[target_name] != target_value]
    return (positive_df, negative_df)


class ConfusionMatrix:
    """Statistics of rules on a data frame split into positive and negative rows wrt. a target."""

    def __init__(self, df: pd.DataFrame, target):
        self.df = df
        self.pos_df, self.neg_df = partition(df, target)

    # returns the index labels of the positive rows covered by rule
    def true_positive(self, rule):
        cache_key = get_cache_key(rule, self.df)
        if cache_key in cache:
            (q, tps, fps, nc) = cache[cache_key]
            return tps
        # the index labels are all that is needed, no need to walk the rows
        return set(self.coverage(rule, self.pos_df).index)

    # returns the index labels of the negative rows covered by rule
    def false_positive(self, rule):
        cache_key = get_cache_key(rule, self.df)
        if cache_key in cache:
            (q, tps, fps, nc) = cache[cache_key]
            return fps
        return set(self.coverage(rule, self.neg_df).index)

    # returns the number of data points covered by rule
    def num_covered(self, rule):
        cache_key = get_cache_key(rule, self.df)
        if cache_key in cache:
            (q, tps, fps, nc) = cache[cache_key]
            return nc
        return len(self.coverage(rule))

    # returns the rows of df (self.df if None) selected by the rule's query
    def coverage(self, rule, df=None):
        if df is None:
            df = self.df
        return df.query(rule.dataframe_query())

    # P(feature | misprediction) - recall
    def get_coverage_ratio(self, rule):
        num_tp = len(self.true_positive(rule))
        pos_length = self.pos_df.shape[0]
        return num_tp / pos_length if pos_length > 0 else 0

    # P(misprediction | feature) - precision
    def get_tp_ratio(self, rule):
        num_tp = len(self.true_positive(rule))
        nc = self.num_covered(rule)
        if nc == 0:
            return 0
        return num_tp / nc


# Returns the rules sorted by descending q value after
# filtering out non-pareto-optimal rules
@time_log
def sorted_pareto_optimal_rules(rules, df: pd.DataFrame, target):
    """
    Parameters
    ----------
    rules : dict
        Mapping from rule to q value
    df : pd.DataFrame
        Data the rules are compared on
    target
        Target (attribute, value)

    Returns
    -------
    list
        The rules on the Pareto frontier, best q value first
    """
    # filters out rules that are not on the Pareto frontier
    def retain_pareto_optimal(rules: dict, stats: ConfusionMatrix):
        retained = {}
        filtered = set()
        for r1 in rules.keys():
            keep = True
            for r2 in rules.keys():
                if r1 == r2:
                    continue
                # rules that were filtered out themselves are not used for comparison
                if r2 in filtered:
                    continue
                # there is some rule that is better than r1,
                # so we don't need to keep r1
                if r2.is_better(r1, stats):
                    keep = False
                    filtered.add(r1)
                    break
            if keep:
                retained[r1] = rules[r1]
        return retained

    stats = ConfusionMatrix(df, target)
    pareto_optimal = retain_pareto_optimal(rules, stats)
    sorted_rules = sorted(pareto_optimal.items(), key=lambda x: x[1], reverse=True)
    # leave out score
    return [rule for rule, _ in sorted_rules]


class BinningMethod(Enum):
    # equal frequency binning
    EQFreq = 1
    # equal width binning
    EQWidth = 2

# Logs msg followed by every item of `list` (if given) using log_method
# (logging.debug when None).
# NOTE: the parameter name `list` shadows the builtin; it is kept because
# callers may pass it by keyword.
def debug_list_log(msg, list=None, log_method:Callable = None):
    if log_method is None:
        log_method = logging.debug

    log_method(msg)
    if list is not None:
        for list_item in list:
            log_method(list_item)

# Wraps text in the ANSI escape codes for bold
def bold(text: str):
    return '\033[1m' + text + '\033[0m'

# Converts a ratio in [0, 1] to a percentage rounded to two decimals
def percent_format(percent):
    assert(percent >= 0 and percent <= 1)
    return round(percent*100, 2)