├── .github └── workflows │ └── pkg-test.yml ├── .gitignore ├── LICENSE ├── README.md ├── labelmodels ├── __init__.py ├── hmm.py ├── label_model.py ├── linked_hmm.py ├── naive_bayes.py └── partial_labels.py ├── setup.py └── test ├── test_hmm.py ├── test_linked_hmm.py ├── test_naive_bayes.py ├── test_partial_labels.py └── util.py /.github/workflows/pkg-test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.8"] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | pip install . 33 | - name: Lint with flake8 34 | run: | 35 | # stop the build if there are Python syntax errors or undefined names 36 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 37 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 38 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 39 | - name: Test with pytest 40 | run: | 41 | pytest test/ 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # PyCharm 2 | .idea 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 
| /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | .idea/ 110 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Label Models 2 | 3 | [![Package Test Status](https://github.com/BatsResearch/labelmodels/actions/workflows/pkg-test.yml/badge.svg)](https://github.com/BatsResearch/labelmodels/actions/workflows/pkg-test.yml) 4 | 5 | Lightweight implementations of generative label models for weakly supervised machine learning 6 | 7 | # Example Usage - Naive Bayes Model 8 | ```python 9 | # Let votes be an m x n matrix where m is the number of data examples, n is the 10 | # number of label sources, and each element is in the set {0, 1, ..., k}, where 11 | # k is the number of classes. If votes_{ij} is 0, it means that label source j 12 | # abstains from voting on example i. 
13 | 14 | # As an example, we create a random votes matrix for binary classification with 15 | # 1000 examples and 5 label sources 16 | import numpy as np 17 | votes = np.random.randint(0, 3, size=(1000, 5)) 18 | 19 | # We now can create a Naive Bayes generative model to estimate the accuracies 20 | # of these label sources 21 | from labelmodels import NaiveBayes 22 | 23 | # We initialize the model by specifying that there are 2 classes (binary 24 | # classification) and 5 label sources 25 | model = NaiveBayes(num_classes=2, num_lfs=5) 26 | 27 | # Next, we estimate the model's parameters 28 | model.estimate_label_model(votes) 29 | print(model.get_accuracies()) 30 | 31 | # We can obtain a posterior distribution over the true labels 32 | labels = model.get_label_distribution(votes) 33 | ``` 34 | 35 | 36 | 37 | # Example Usage - Partial Label Model 38 | ```python 39 | # Let votes be an m x n matrix where m is the number of data examples, n is the 40 | # number of label sources, and each element is in the set {0, 1, ..., k_l}, where 41 | # k_l is the number of label partitions for partial labeling functions PLF_{l}. If votes_{ij} is 0, 42 | # it means that partial label source j abstains from voting on example i. 43 | 44 | # As an example, we create a random votes matrix for classification with 45 | # 1000 examples and 3 label sources 46 | import numpy as np 47 | import torch 48 | 49 | # label_partition is a table that specifies 0-indexed PLF's label partition configurations, for this brief example, 50 | # we have 3 PLFs each separating the 3-class label space into two partitions. For 0-th PLF, it partitions the label space 51 | # into \{1\} and \{2,3\}. Notice the class label is 1-indexed. 
52 | # The label_partition configures the label partitions mapping in format as {PLF's index: [partition_1, partition_2, ..., partition_{k_l}]}
53 | simple_label_partition = {
54 |     0: [[1], [2, 3]],
55 |     1: [[2], [1, 3]],
56 |     2: [[3], [1, 2]]
57 | }
58 | num_sources = len(simple_label_partition)
59 | num_classes = 3
60 | votes = np.random.randint(0, 3, size=(1000, 3))
61 | 
62 | device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
63 | 
64 | # We can now create a partial label model to estimate the accuracies
65 | # of these partial labeling functions
66 | from labelmodels import PartialLabelModel
67 | # We initialize the model by specifying that there are 3 classes and 3
68 | # partial labeling functions
69 | model = PartialLabelModel(num_classes=num_classes,
70 |                           label_partition=simple_label_partition,
71 |                           preset_classbalance=None,
72 |                           device=device)
73 | # Next, we estimate the model's parameters
74 | model.estimate_label_model(votes)
75 | print(model.get_accuracies())
76 | 
77 | # We can obtain a posterior distribution over the true labels
78 | labels = model.get_label_distribution(votes)
79 | ```
80 | 
81 | ## Citation
82 | 
83 | Please cite the following paper if you are using our tool. Thank you!
84 | 
85 | [Esteban Safranchik](https://www.linkedin.com/in/safranchik/), Shiying Luo, [Stephen H. Bach](http://cs.brown.edu/people/sbach/). "Weakly Supervised Sequence Tagging From Noisy Rules". In 34th AAAI Conference on Artificial Intelligence, 2020.
86 | 
87 | ```
88 | @inproceedings{safranchik2020weakly,
89 |     title = {Weakly Supervised Sequence Tagging From Noisy Rules},
90 |     author = {Safranchik, Esteban and Luo, Shiying and Bach, Stephen H.},
91 |     booktitle = {AAAI},
92 |     year = 2020,
93 | }
94 | ```
95 | 
96 | [Peilin Yu](https://www.yupeilin.com), [Tiffany Ding](https://tiffanyding.github.io/)
97 | , [Stephen H. Bach](http://cs.brown.edu/people/sbach/). "Learning from Multiple Noisy Partial Labelers".
Artificial
98 | Intelligence and Statistics (AISTATS), 2022.
99 | 
100 | ```
101 | @inproceedings{yu2022nplm,
102 |     title = {Learning from Multiple Noisy Partial Labelers},
103 |     author = {Yu, Peilin and Ding, Tiffany and Bach, Stephen H.},
104 |     booktitle = {Artificial Intelligence and Statistics (AISTATS)},
105 |     year = 2022,
106 | }
107 | ```
--------------------------------------------------------------------------------
/labelmodels/__init__.py:
--------------------------------------------------------------------------------
from .hmm import HMM
from .label_model import LearningConfig
from .linked_hmm import LinkedHMM
from .naive_bayes import NaiveBayes
from .partial_labels import PartialLabelModel
--------------------------------------------------------------------------------
/labelmodels/hmm.py:
--------------------------------------------------------------------------------
from .label_model import ClassConditionalLabelModel, LearningConfig, init_random
import numpy as np
from scipy import sparse
import torch
from torch import nn


class HMM(ClassConditionalLabelModel):
    """A generative label model that treats a sequence of true class labels as a
    Markov chain, as in a hidden Markov model, and treats all labeling functions
    as conditionally independent given the corresponding true class label, as
    in a Naive Bayes model.

    Proposed for crowdsourced sequence annotations in: A. T. Nguyen, B. C.
    Wallace, J. J. Li, A. Nenkova, and M. Lease. Aggregating and Predicting
    Sequence Labels from Crowd Annotations. In Annual Meeting of the Association
    for Computational Linguistics, 2017.
    """

    def __init__(self, num_classes, num_lfs, init_acc=.9, acc_prior=1,
                 balance_prior=1):
        """Constructor.

        Initializes labeling function accuracies using optional argument and all
        other model parameters uniformly.

        :param num_classes: number of target classes, i.e., binary
                            classification = 2
        :param num_lfs: number of labeling functions to model
        :param init_acc: initial estimated labeling function accuracy, must
                            be a float in [0,1]
        :param acc_prior: strength of regularization of estimated labeling
                          function accuracies toward their initial values
        """
        super().__init__(num_classes, num_lfs, init_acc, acc_prior)

        # Unnormalized log-potentials; normalized lazily via logsumexp in
        # _get_norm_start_balance() / _get_norm_transitions()
        self.start_balance = nn.Parameter(torch.zeros([num_classes]))
        self.transitions = nn.Parameter(torch.zeros([num_classes, num_classes]))

        # Weight on the negative-entropy regularizer added in
        # _get_regularization_loss()
        self.balance_prior = balance_prior

    def forward(self, votes, seq_starts):
        """
        Computes log likelihood of sequence of labeling function outputs for
        each (sequence) example in batch.

        For efficiency, this function prefers that votes is an instance of
        scipy.sparse.coo_matrix. You can avoid a conversion by passing in votes
        with this class.

        :param votes: m x n matrix in {0, ..., k}, where m is the sum of the
                      lengths of the sequences in the batch, n is the number of
                      labeling functions and k is the number of classes
        :param seq_starts: vector of length l of row indices in votes indicating
                           the start of each sequence, where l is the number of
                           sequences in the batch. So, votes[seq_starts[i]] is
                           the row vector of labeling function outputs for the
                           first element in the ith sequence
        :return: vector of length l, where element is the log-likelihood of the
                 corresponding sequence of outputs in votes
        """
        jll = self._get_labeling_function_likelihoods(votes)
        norm_start_balance = self._get_norm_start_balance()
        norm_transitions = self._get_norm_transitions()
        # Forward-algorithm recursion in log space: after this loop,
        # jll[i, c] is the joint log-likelihood of the observations up to
        # position i of its sequence with label c at position i.
        for i in range(0, votes.shape[0]):
            if i in seq_starts:
                jll[i] += norm_start_balance
            else:
                joint_class_pair = jll[i-1, :].clone().unsqueeze(1)
                joint_class_pair = joint_class_pair.repeat(1, self.num_classes)
                joint_class_pair += norm_transitions

                jll[i] += joint_class_pair.logsumexp(0)
        # Each sequence ends one row before the next one starts; the final
        # sequence ends on the last row. The first start (row 0) produces a
        # spurious end index of -1, which is dropped here.
        # NOTE(review): remove(-1) raises ValueError if seq_starts does not
        # contain 0 -- callers are expected to index the first sequence at 0.
        seq_ends = [x - 1 for x in seq_starts] + [votes.shape[0]-1]
        seq_ends.remove(-1)
        # Marginal log-likelihood of each sequence: sum over the final label.
        mll = torch.logsumexp(jll[seq_ends], dim=1)
        return mll

    def estimate_label_model(self, votes, seq_starts, config=None):
        """Estimates the parameters of the label model based on observed
        labeling function outputs.

        Note that a minibatch's size refers to the number of sequences in the
        minibatch.

        :param votes: m x n matrix in {0, ..., k}, where m is the sum of the
                      lengths of the sequences in the data, n is the number of
                      labeling functions and k is the number of classes
        :param seq_starts: vector of length l of row indices in votes indicating
                           the start of each sequence, where l is the number of
                           sequences in the batch. So, votes[seq_starts[i]] is
                           the row vector of labeling function outputs for the
                           first element in the ith sequence
        :param config: optional LearningConfig instance. If None, initialized
                       with default constructor
        """
        if config is None:
            config = LearningConfig()

        # Initializes random seed
        init_random(config.random_seed)

        # Converts to CSR and integers to standardize input
        votes = sparse.csr_matrix(votes, dtype=np.int32)
        seq_starts = np.array(seq_starts, dtype=np.int32)

        batches = self._create_minibatches(
            votes, seq_starts, config.batch_size, shuffle_seqs=True)

        self._do_estimate_label_model(batches, config)

    def get_most_probable_labels(self, votes, seq_starts):
        """
        Computes the most probable underlying sequence of labels given function
        outputs

        :param votes: m x n matrix in {0, ..., k}, where m is the sum of the
                      lengths of the sequences in the data, n is the number of
                      labeling functions and k is the number of classes
        :param seq_starts: vector of length l of row indices in votes indicating
                           the start of each sequence, where l is the number of
                           sequences in the batch. So, votes[seq_starts[i]] is
                           the row vector of labeling function outputs for the
                           first element in the ith sequence
        :return: vector of length m, where element is the most likely predicted labels
        """
        # Converts to CSR and integers to standardize input
        votes = sparse.csr_matrix(votes, dtype=np.int32)
        seq_starts = np.array(seq_starts, dtype=np.int32)

        out = np.ndarray((votes.shape[0],), dtype=np.int32)

        offset = 0
        # Note: the loop variables shadow the full votes/seq_starts; inside the
        # loop they refer to the current minibatch (with batch-relative starts).
        for votes, seq_starts in self._create_minibatches(votes, seq_starts, 32):
            jll = self._get_labeling_function_likelihoods(votes)
            norm_start_balance = self._get_norm_start_balance()
            norm_transitions = self._get_norm_transitions()

            # Viterbi forward pass: jll[i, c] becomes the max joint
            # log-likelihood ending with label c at position i; bt stores the
            # argmax backpointers (best previous label for each current label).
            T = votes.shape[0]
            bt = torch.zeros([T, self.num_classes])
            for i in range(0, T):
                if i in seq_starts:
                    jll[i] += norm_start_balance
                else:
                    p = jll[i-1].clone().unsqueeze(1).repeat(
                        1, self.num_classes) + norm_transitions
                    jll[i] += torch.max(p, dim=0)[0]
                    bt[i, :] = torch.argmax(p, dim=0)

            # Backtrace from the end of the batch: at each sequence end pick
            # the argmax label, then follow backpointers toward the start.
            seq_ends = [x - 1 for x in seq_starts] + [votes.shape[0] - 1]
            res = []
            j = T-1
            while j >= 0:
                if j in seq_ends:
                    res.append(torch.argmax(jll[j, :]).item())
                if j in seq_starts:
                    # Label for a start position was already appended by the
                    # previous iteration's backpointer lookup.
                    j -= 1
                    continue
                res.append(int(bt[j, res[-1]].item()))
                j -= 1
            # Internal labels are 0-indexed; reported labels are 1-indexed.
            res = [x + 1 for x in res]
            res.reverse()

            for i in range(len(res)):
                out[offset + i] = res[i]
            offset += len(res)
        return out

    def get_label_distribution(self, votes, seq_starts):
        """Returns the unary and pairwise marginals over true labels estimated
        by the model.

        :param votes: m x n matrix in {0, ..., k}, where m is the sum of the
                      lengths of the sequences in the data, n is the number of
                      labeling functions and k is the number of classes
        :param seq_starts: vector of length l of row indices in votes indicating
                           the start of each sequence, where l is the number of
                           sequences in the batch. So, votes[seq_starts[i]] is
                           the row vector of labeling function outputs for the
                           first element in the ith sequence
        :return: p_unary, p_pairwise where p_unary is a m x k matrix representing
                 the marginal distributions over individual labels, and p_pairwise
                 is a m x k x k tensor representing pairwise marginals over the
                 ith and (i+1)th labels. For the last element in a sequence, the
                 k x k matrix will be all zeros.
        """
        # Converts to CSR and integers to standardize input
        votes = sparse.csr_matrix(votes, dtype=np.int32)
        seq_starts = np.array(seq_starts, dtype=np.int32)

        out_unary = np.zeros((votes.shape[0], self.num_classes))
        out_pairwise = np.zeros((votes.shape[0], self.num_classes, self.num_classes))

        offset = 0
        # As above, the loop variables shadow the full inputs with per-batch views.
        for votes, seq_starts in self._create_minibatches(votes, seq_starts, 32):
            # Computes observation likelihoods and initializes alpha and beta messages
            cll = self._get_labeling_function_likelihoods(votes)
            alpha = torch.zeros(cll.shape)
            beta = torch.zeros(cll.shape)

            # Computes alpha
            next_seq = 0
            for i in range(votes.shape[0]):
                if next_seq == len(seq_starts) or i < seq_starts[next_seq]:
                    # i is not the start of a sequence
                    temp = alpha[i-1].unsqueeze(1).repeat(1, self.num_classes)
                    temp = temp + self._get_norm_transitions()
                    alpha[i] = cll[i] + temp.logsumexp(0)
                else:
                    # i is the start of a sequence
                    alpha[i] = cll[i] + self._get_norm_start_balance()
                    next_seq += 1

            # Computes beta
            # NOTE(review): beta at sequence ends is initialized to 1 rather
            # than 0 (= log 1) in log space; the constant offset cancels when
            # the unary and pairwise marginals are normalized below.
            this_seq = seq_starts.shape[0] - 1
            beta[-1, :] = 1
            for i in range(votes.shape[0] - 2, -1, -1):
                if i == seq_starts[this_seq] - 1:
                    # End of sequence
                    beta[i, :] = 1
                    this_seq -= 1
                else:
                    temp = beta[i+1] + cll[i+1]
                    temp = temp.unsqueeze(1).repeat(1, self.num_classes)
                    temp = temp + self._get_norm_transitions().transpose(0, 1)
                    beta[i, :] = temp.logsumexp(0)

            # Computes p_unary
            p_unary = alpha + beta
            temp = p_unary.logsumexp(1).unsqueeze(1).repeat(1, self.num_classes)
            p_unary = p_unary - temp
            for i in range(p_unary.shape[0]):
                # Max-shifted exponentiation for numerical stability.
                p = torch.exp(p_unary[i, :] - torch.max(p_unary[i, :]))
                out_unary[offset + i, :] = (p / p.sum()).detach()

            # Computes p_pairwise
            p_pairwise = torch.zeros(
                (votes.shape[0], self.num_classes, self.num_classes))
            for i in range(p_pairwise.shape[0] - 1):
                p_pairwise[i, :, :] = self._get_norm_transitions()
                p_pairwise[i] += alpha[i].unsqueeze(1).repeat(1, self.num_classes)
                p_pairwise[i] += cll[i+1].unsqueeze(0).repeat(self.num_classes, 1)
                p_pairwise[i] += beta[i+1].unsqueeze(0).repeat(self.num_classes, 1)

                denom = p_pairwise[i].view(-1).logsumexp(0)
                denom = denom.unsqueeze(0).unsqueeze(1)
                denom = denom.repeat(self.num_classes, self.num_classes)
                p_pairwise[i] -= denom

                out_pairwise[offset + i, :, :] = torch.exp(p_pairwise[i]).detach()

            offset += votes.shape[0]

        return out_unary, out_pairwise

    def get_start_balance(self):
        """Returns the model's estimated class balance for the start of a
        sequence

        :return: a NumPy array with one element in [0,1] for each target class,
                 representing the estimated prior probability that the first
                 element in an example sequence has that label
        """
        return np.exp(self._get_norm_start_balance().detach().numpy())

    def get_transition_matrix(self):
        """Returns the model's estimated transition distribution from class
        label to class label in a sequence.

        :return: a k x k Numpy array, in which each element i, j is the
                 probability p(c_{t+1} = j + 1 | c_{t} = i + 1)
        """
        return np.exp(self._get_norm_transitions().detach().numpy())

    def _create_minibatches(self, votes, seq_starts, batch_size, shuffle_seqs=False):
        # Splits the data into batches of whole sequences. Returns a list of
        # (votes, seq_starts) pairs, where seq_starts is batch-relative.
        # Computes explicit seq ends so that we can shuffle the sequences
        seq_ends = np.ndarray((seq_starts.shape[0],), dtype=np.int32)
        for i in range(1, seq_starts.shape[0]):
            seq_ends[i-1] = seq_starts[i] - 1
        seq_ends[-1] = votes.shape[0] - 1

        # Shuffles the sequences by shuffling the start and end index vectors
        if shuffle_seqs:
            index = np.arange(np.shape(seq_starts)[0])
            np.random.shuffle(index)
            seq_starts = seq_starts[index]
            seq_ends = seq_ends[index]

        # Splits seq_starts
        seq_start_batches = [np.array(
            seq_starts[i * batch_size: ((i + 1) * batch_size)],
            copy=True)
            for i in range(int(np.ceil(len(seq_starts) / batch_size)))
        ]
        # Sentinel index (number of rows) appended to the final batch; zip()
        # below pairs it off so each (start, end) pair stays within bounds.
        seq_start_batches[-1] = np.concatenate((seq_start_batches[-1], [votes.shape[0]]))

        # Splits seq_ends
        seq_end_batches = [
            np.array(seq_ends[i * batch_size: ((i + 1) * batch_size + 1)], copy=True)
            for i in range(int(np.ceil(len(seq_ends) / batch_size)))
        ]
        seq_end_batches[-1] = np.concatenate((seq_end_batches[-1], [votes.shape[0]]))

        # Builds vote_batches and relative seq_start_batches
        vote_batches = []
        rel_seq_start_batches = []
        for seq_start_batch, seq_end_batch in zip(seq_start_batches, seq_end_batches):
            vote_batch = []
            rel_seq_start_batch = np.zeros((len(seq_start_batch),), dtype=np.int32)
            total_len = 0
            for i, (start, end) in enumerate(zip(seq_start_batch, seq_end_batch)):
                vote_batch.append(votes[start:end+1])
                rel_seq_start_batch[i] = total_len
                total_len += end - start + 1
            # COO format preferred by forward(); copy detaches from the
            # original CSR storage.
            vote_batches.append(sparse.coo_matrix(sparse.vstack(vote_batch), copy=True))
            rel_seq_start_batches.append(rel_seq_start_batch)

        return list(zip(vote_batches, rel_seq_start_batches))

    def _get_regularization_loss(self):
        # Adds a negative-entropy penalty on the start balance and transition
        # distributions (pushing them toward uniform) on top of the accuracy
        # regularization from the parent class.
        neg_entropy = 0.0

        # Start balance
        norm_start_balance = self._get_norm_start_balance()
        exp_class_balance = torch.exp(norm_start_balance)
        for k in range(self.num_classes):
            neg_entropy += norm_start_balance[k] * exp_class_balance[k]

        # Transitions
        norm_transitions = self._get_norm_transitions()
        for i in range(self.num_classes):
            exp_transitions = torch.exp(norm_transitions[i])
            for k in range(self.num_classes):
                neg_entropy += norm_transitions[i, k] * exp_transitions[k]

        entropy_prior = self.balance_prior * neg_entropy

        return super()._get_regularization_loss() + entropy_prior

    def _get_norm_start_balance(self):
        # Log-softmax of the start-balance parameters.
        return self.start_balance - self.start_balance.logsumexp(0)

    def _get_norm_transitions(self):
        # Row-wise log-softmax: each row is a log transition distribution.
        denom = self.transitions.logsumexp(1).unsqueeze(1).repeat(1, self.num_classes)
        return self.transitions - denom
--------------------------------------------------------------------------------
/labelmodels/label_model.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import numpy as np
3 | from scipy import sparse
4 | import torch
5 | import torch.nn as nn
6 | 
7 | 
8 | class LabelModel(nn.Module):
9 |     """Parent class for all generative label models.
10 | 
11 |     Concrete subclasses should implement at least forward(),
12 |     estimate_label_model(), and get_label_distribution().
13 |     """
14 |     def forward(self, *args):
15 |         """Computes the marginal log-likelihood of a batch of observed
16 |         function outputs provided as input.
class LabelModel(nn.Module):
    """Parent class for all generative label models.

    Concrete subclasses should implement at least forward(),
    estimate_label_model(), and get_label_distribution().
    """
    def forward(self, *args):
        """Computes the marginal log-likelihood of a batch of observed
        function outputs provided as input.

        :param args: batch of observed function outputs and related metadata
        :return: 1-d tensor of log-likelihoods, one for each input example
        """
        raise NotImplementedError

    def estimate_label_model(self, *args, config=None):
        """Learns the parameters of the label model from observed
        function outputs.

        Subclasses that implement this method should call
        _do_estimate_label_model() if possible, to provide consistent
        behavior.

        :param args: observed function outputs and related metadata
        :param config: an instance of LearningConfig. If None, will
                       initialize with default LearningConfig constructor
        """
        raise NotImplementedError

    def get_label_distribution(self, *args):
        """Returns the estimated posterior distribution over true labels
        given observed function outputs.

        :param args: observed function outputs and related metadata
        :return: distribution over true labels. Structure depends on model
                 type
        """
        raise NotImplementedError

    def get_most_probable_labels(self, *args):
        """Returns the most probable true labels given observed function
        outputs.

        :param args: observed function outputs and related metadata
        :return: 1-d Numpy array of most probable labels
        """
        raise NotImplementedError

    def _do_estimate_label_model(self, batches, config):
        """Internal method for optimizing model parameters with SGD.

        :param batches: sequence of inputs to forward(). The sequence must
                        contain tuples, even if forward() takes one
                        argument (besides self)
        :param config: an instance of LearningConfig
        """
        logging.info(vars(config))

        # Sets up optimization hyperparameters
        optimizer = torch.optim.SGD(
            self.parameters(), lr=config.step_size, momentum=config.momentum,
            weight_decay=0)
        if config.step_schedule is not None and config.step_size_mult is not None:
            scheduler = torch.optim.lr_scheduler.MultiStepLR(
                optimizer, config.step_schedule, gamma=config.step_size_mult)
        else:
            scheduler = None

        # Iterates over epochs
        for epoch in range(config.epochs):
            logging.info('Epoch {}/{}'.format(epoch + 1, config.epochs))

            # Sets model to training mode
            self.train()
            running_loss = 0.0

            # Iterates over training data
            for inputs in batches:
                optimizer.zero_grad()
                log_likelihood = self(*inputs)
                loss = -1 * torch.mean(log_likelihood)
                loss += self._get_regularization_loss()
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
            epoch_loss = running_loss / len(batches)
            logging.info('Train Loss: %.6f', epoch_loss)

            # Fix: the learning-rate schedule is stepped after the epoch's
            # optimizer updates. The original called scheduler.step() at
            # the top of each epoch, which applied every decay milestone
            # one epoch early and triggers a UserWarning on PyTorch >= 1.1.
            if scheduler is not None:
                scheduler.step()

    def _get_regularization_loss(self):
        """Gets the value of the regularization loss for the current values
        of the model's parameters

        :return: regularization loss
        """
        return 0.0


class ClassConditionalLabelModel(LabelModel):
    """
    Abstract parent class for generative label models that assume labeling
    functions are conditionally independent given the true label, and that
    each labeling function is characterized by the following parameters:
        * a propensity, which is the probability that it does not abstain
        * class-conditional accuracies, each of which is the probability
          that the labeling function's output is correct given that the
          true label has a certain value. It is assumed that when a
          labeling function makes a mistake, the label it outputs is
          chosen uniformly at random
    """
    def __init__(self, num_classes, num_lfs, init_acc, acc_prior):
        """Constructor.

        Initializes label source accuracies argument and propensities
        uniformly.

        :param num_classes: number of target classes, i.e., binary
                            classification = 2
        :param num_lfs: number of labeling functions to model
        :param init_acc: initial estimated labeling function accuracy, must
                         be a float strictly between 0 and 1 (the endpoints
                         are excluded because the value is converted to
                         log-odds below)
        :param acc_prior: strength of regularization of estimated labeling
                          function accuracies toward their initial values
        """
        super().__init__()

        # Converts init_acc to half log-odds, the scale used by the
        # accuracy parameters (get_accuracies applies the inverse map)
        init_acc = -1 * np.log(1.0 / init_acc - 1) / 2

        init_param = torch.tensor(
            [[init_acc] * num_classes for _ in range(num_lfs)])
        self.accuracy = nn.Parameter(init_param)
        self.propensity = nn.Parameter(torch.zeros([num_lfs]))

        # Saves state; note that init_acc is stored on the log-odds scale
        self.num_classes = num_classes
        self.num_lfs = num_lfs
        self.init_acc = init_acc
        self.acc_prior = acc_prior

    def get_accuracies(self):
        """Returns the model's estimated labeling function accuracies

        :return: a NumPy array with one element in [0,1] for each labeling
                 function, representing the estimated probability that
                 the corresponding labeling function correctly outputs
                 the true class label, given that it does not abstain
        """
        acc = self.accuracy.detach().numpy()
        return np.exp(acc) / (np.exp(acc) + np.exp(-1 * acc))

    def get_propensities(self):
        """Returns the model's estimated labeling function propensities,
        i.e., the probability that a labeling function does not abstain

        :return: a NumPy array with one element in [0,1] for each labeling
                 function, representing the estimated probability that
                 the corresponding labeling function does not abstain
        """
        prop = self.propensity.detach().numpy()
        return np.exp(prop) / (np.exp(prop) + 1)

    def _get_labeling_function_likelihoods(self, votes):
        """Computes conditional log-likelihood of labeling function votes
        given class as an m x k matrix.

        For efficiency, this function prefers that votes is an instance of
        scipy.sparse.coo_matrix. You can avoid a conversion by passing in
        votes with this class.

        :param votes: m x n matrix in {0, ..., k}, where m is the sum of
                      the lengths of the sequences in the batch, n is the
                      number of labeling functions and k is the number of
                      classes
        :return: matrix of dimension m x k, where each element is the
                 conditional log-likelihood of votes given class
        """
        if not isinstance(votes, sparse.coo_matrix):
            votes = sparse.coo_matrix(votes)

        # Initializes conditional log-likelihood of votes as an m x k matrix
        cll = torch.zeros(votes.shape[0], self.num_classes)

        # Initializes normalizing constants
        z_prop = self.propensity.unsqueeze(1)
        z_prop = torch.cat((z_prop, torch.zeros((self.num_lfs, 1))), dim=1)
        z_prop = torch.logsumexp(z_prop, dim=1)

        z_acc = self.accuracy.unsqueeze(2)
        z_acc = torch.cat((z_acc, -1 * self.accuracy.unsqueeze(2)), dim=2)
        z_acc = torch.logsumexp(z_acc, dim=2)

        # Subtracts normalizing constant for propensities from cll
        # (since it applies to all outcomes)
        cll -= torch.sum(z_prop)

        # Log of the number of incorrect labels, hoisted out of the loop
        # (a mistaken vote is spread uniformly over the k - 1 wrong labels)
        log_num_mistakes = torch.log(torch.tensor(self.num_classes - 1.0))

        # Loops over votes and classes to compute conditional log-likelihood
        for i, j, v in zip(votes.row, votes.col, votes.data):
            for k in range(self.num_classes):
                if v == (k + 1):
                    # Vote agrees with class k + 1
                    logp = self.propensity[j] + self.accuracy[j, k] - z_acc[j, k]
                    cll[i, k] += logp
                elif v != 0:
                    # Non-abstaining vote that disagrees with class k + 1
                    logp = self.propensity[j] - self.accuracy[j, k] - z_acc[j, k]
                    logp -= log_num_mistakes
                    cll[i, k] += logp

        return cll

    def _get_regularization_loss(self):
        """Computes the regularization loss of the model:
        acc_prior * \\|accuracy - init_acc\\|_2

        :return: value of regularization loss
        """
        return self.acc_prior * torch.norm(self.accuracy - self.init_acc)


class LearningConfig(object):
    """Container for hyperparameters used by label models during learning"""

    def __init__(self):
        """Initializes all hyperparameters to default values"""
        self.epochs = 10            # number of passes over the training data
        self.batch_size = 64        # examples (or sequences) per minibatch
        self.step_size = 0.01       # SGD learning rate
        self.step_schedule = None   # epochs at which to decay the step size
        self.step_size_mult = None  # multiplicative decay factor
        self.momentum = 0.9         # SGD momentum
        self.random_seed = 0        # seed passed to init_random


def init_random(seed):
    """Initializes PyTorch and NumPy random seeds.

    Also sets the CuDNN back end to deterministic.

    :param seed: integer to use as random seed
    """
    torch.backends.cudnn.deterministic = True

    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    logging.info("Random seed: %d", seed)
def __init__(self, num_classes, num_labeling_funcs, num_linking_funcs,
             init_acc=.9, acc_prior=1, balance_prior=1):
    """Constructor.

    Initializes labeling and linking function accuracies using optional
    argument and all other model parameters uniformly.

    :param num_classes: number of target classes, i.e., binary
                        classification = 2
    :param num_labeling_funcs: number of labeling functions to model
    :param num_linking_funcs: number of linking functions to model
    :param init_acc: initial estimated labeling and linking function
                     accuracy, must be a float strictly between 0 and 1
    :param acc_prior: strength of regularization of estimated labeling and
                      linking function accuracies toward their initial
                      values
    :param balance_prior: strength of regularization of the estimated
                          start-balance and transition distributions toward
                          uniform (this parameter was previously
                          undocumented)
    """
    super().__init__(num_classes, num_labeling_funcs, init_acc, acc_prior)

    # self.init_acc has already been converted to the log-odds scale by
    # the superclass constructor, so the linking accuracies start at the
    # same initial estimate as the labeling accuracies
    self.link_accuracy = nn.Parameter(
        torch.tensor([self.init_acc] * num_linking_funcs))
    self.link_propensity = nn.Parameter(torch.zeros([num_linking_funcs]))
    self.start_balance = nn.Parameter(torch.zeros([num_classes]))
    self.transitions = nn.Parameter(torch.zeros([num_classes, num_classes]))

    # Saves state
    self.num_linking_funcs = num_linking_funcs
    self.balance_prior = balance_prior

def forward(self, label_votes, link_votes, seq_starts):
    """Computes log likelihood of sequence of labeling and linking function
    outputs for each (sequence) example in batch.

    For efficiency, this function prefers that label_votes and link_votes
    are instances of scipy.sparse.coo_matrix. You can avoid a conversion by
    passing them in as this class.

    :param label_votes: m x n matrix in {0, ..., k}, where m is the sum of
                        the lengths of the sequences in the batch, n is the
                        number of labeling functions and k is the number of
                        classes
    :param link_votes: m x n matrix in {-1, 0, 1}, where m is the sum of
                       the lengths of the sequences in the batch and n is
                       the number of linking functions
    :param seq_starts: vector of length l of row indices in votes indicating
                       the start of each sequence, where l is the number of
                       sequences in the batch. So, label_votes[seq_starts[i]]
                       is the row vector of labeling function outputs for
                       the first element in the ith sequence
    :return: vector of length l, where each element is the log-likelihood
             of the corresponding sequence of outputs in votes
    """
    jll = self._get_labeling_function_likelihoods(label_votes)
    link_cll = self._get_linking_function_likelihoods(link_votes)
    norm_start_balance = self._get_norm_start_balance()
    norm_transitions = self._get_norm_transitions()

    # Hoists membership testing out of the loop; seq_starts may be a
    # NumPy array, for which `in` is a linear scan
    start_set = {int(s) for s in seq_starts}

    for i in range(jll.shape[0]):
        if i in start_set:
            # First element of a sequence: add the start-balance prior
            jll[i] += norm_start_balance
        else:
            joint_class_pair = jll[i-1, :].clone().unsqueeze(1)
            joint_class_pair = joint_class_pair.repeat(1, self.num_classes)
            joint_class_pair += norm_transitions

            # Adds contributions from links
            joint_class_pair += link_cll[i]

            # Finishes computing joint log likelihood
            jll[i] += joint_class_pair.logsumexp(0)

    # Each sequence's total likelihood lives in the row of its last
    # element; the -1 produced by the sequence starting at row 0 is
    # dropped.
    # NOTE(review): when seq_starts carries the sentinel entry appended by
    # _create_minibatches for the final batch, jll.shape[0] - 1 appears
    # twice in seq_ends, so the final batch counts its last sequence twice
    # in the loss — confirm whether this is intended.
    seq_ends = [x - 1 for x in seq_starts] + [jll.shape[0] - 1]
    seq_ends.remove(-1)
    mll = torch.logsumexp(jll[seq_ends], dim=1)
    return mll
So, label_votes[seq_starts[i]] 123 | is the row vector of labeling function outputs for the 124 | first element in the ith sequence 125 | :param config: optional LearningConfig instance. If None, initialized 126 | with default constructor 127 | """ 128 | if config is None: 129 | config = LearningConfig() 130 | 131 | # Initializes random seed 132 | init_random(config.random_seed) 133 | 134 | # Converts to CSR and integers to standardize input 135 | label_votes = sparse.csr_matrix(label_votes, dtype=np.int32) 136 | link_votes = sparse.csr_matrix(link_votes, dtype=np.int32) 137 | seq_starts = np.array(seq_starts, dtype=np.int32) 138 | 139 | batches = self._create_minibatches( 140 | label_votes, link_votes, seq_starts, config.batch_size, shuffle_seqs=True) 141 | 142 | self._do_estimate_label_model(batches, config) 143 | 144 | def get_label_accuracies(self): 145 | """Alias for ClassConditionalModel.get_accuracies() 146 | :return: estimated labeling function accuracies 147 | """ 148 | return self.get_accuracies() 149 | 150 | def get_link_accuracies(self): 151 | """Returns the model's estimated linking function accuracies 152 | :return: a NumPy array with one element in [0,1] for each labeling 153 | function, representing the estimated probability that 154 | the corresponding linking function correctly identifies a pair 155 | of consecutive labels, given that it does not abstain 156 | """ 157 | acc = self.link_accuracy.detach().numpy() 158 | return np.exp(acc) / (np.exp(acc) + np.exp(-1 * acc)) 159 | 160 | def get_label_propensities(self): 161 | """Alias for ClassConditionalModel.get_propensities() 162 | :return: estimated labeling function propensities 163 | """ 164 | return self.get_propensities() 165 | 166 | def get_link_propensities(self): 167 | """Returns the model's estimated linking function propensities, i.e., 168 | the probability that a linking function does not abstain 169 | :return: a NumPy array with one element in [0,1] for each linking 170 | function, 
representing the estimated probability that 171 | the corresponding linking function does not abstain 172 | """ 173 | prop = self.link_propensity.detach().numpy() 174 | return np.exp(prop) / (np.exp(prop) + 1) 175 | 176 | def get_most_probable_labels(self, label_votes, link_votes, seq_starts): 177 | """ 178 | Computes the most probable underlying sequence nodes given function 179 | outputs 180 | 181 | :param label_votes: m x n matrix in {0, ..., k}, where m is the sum of 182 | the lengths of the sequences in the batch, n is the 183 | number of labeling functions and k is the number of 184 | classes 185 | :param link_votes: m x n matrix in {-1, 0, 1}, where m is the sum of 186 | the lengths of the sequences in the batch and n is the 187 | number of linking functions 188 | :param seq_starts: vector of length l of row indices in votes indicating 189 | the start of each sequence, where l is the number of 190 | sequences in the batch. So, label_votes[seq_starts[i]] 191 | is the row vector of labeling function outputs for the 192 | first element in the ith sequence 193 | :return: vector of length m, where element is the most likely predicted labels 194 | """ 195 | # Converts to CSR and integers to standardize input 196 | label_votes = sparse.csr_matrix(label_votes, dtype=np.int32) 197 | link_votes = sparse.csr_matrix(link_votes, dtype=np.int32) 198 | seq_starts = np.array(seq_starts, dtype=np.int32) 199 | 200 | out = np.ndarray((label_votes.shape[0],), dtype=np.int32) 201 | 202 | offset = 0 203 | for label_votes, link_votes, seq_starts in self._create_minibatches( 204 | label_votes, link_votes, seq_starts, 32): 205 | # Initializes joint log likelihood with labeling function likelihood 206 | jll = self._get_labeling_function_likelihoods(label_votes) 207 | link_cll = self._get_linking_function_likelihoods(link_votes) 208 | norm_start_balance = self._get_norm_start_balance() 209 | norm_transitions = self._get_norm_transitions() 210 | 211 | T = label_votes.shape[0] 212 | bt 
= torch.zeros([T, self.num_classes]) 213 | for i in range(0, T): 214 | if i in seq_starts: 215 | jll[i] += norm_start_balance 216 | else: 217 | p = jll[i-1].clone().unsqueeze(1).repeat( 218 | 1, self.num_classes) + norm_transitions 219 | p += link_cll[i] 220 | jll[i] += torch.max(p, dim=0)[0] 221 | bt[i, :] = torch.argmax(p, dim=0) 222 | 223 | seq_ends = [x - 1 for x in seq_starts] + [label_votes.shape[0] - 1] 224 | res = [] 225 | j = T-1 226 | while j >= 0: 227 | if j in seq_ends: 228 | res.append(torch.argmax(jll[j, :]).item()) 229 | if j in seq_starts: 230 | j -= 1 231 | continue 232 | res.append(int(bt[j, res[-1]].item())) 233 | j -= 1 234 | res = [x + 1 for x in res] 235 | res.reverse() 236 | 237 | for i in range(len(res)): 238 | out[offset + i] = res[i] 239 | offset += len(res) 240 | return out 241 | 242 | def get_label_distribution(self, label_votes, link_votes, seq_starts): 243 | """Returns the unary and pairwise marginals over true labels estimated 244 | by the model. 245 | 246 | :param label_votes: m x n matrix in {0, ..., k}, where m is the sum of 247 | the lengths of the sequences in the batch, n is the 248 | number of labeling functions and k is the number of 249 | classes 250 | :param link_votes: m x n matrix in {-1, 0, 1}, where m is the sum of 251 | the lengths of the sequences in the batch and n is the 252 | number of linking functions 253 | :param seq_starts: vector of length l of row indices in votes indicating 254 | the start of each sequence, where l is the number of 255 | sequences in the batch. So, label_votes[seq_starts[i]] 256 | is the row vector of labeling function outputs for the 257 | first element in the ith sequence 258 | :return: p_unary, p_pairwise where p_unary is a m x k matrix representing 259 | the marginal distributions over individual labels, and p_pairwise 260 | is a m x k x k tensor representing pairwise marginals over the 261 | ith and (i+1)th labels. 
def get_label_distribution(self, label_votes, link_votes, seq_starts):
    """Returns the unary and pairwise marginals over true labels estimated
    by the model, via the forward-backward algorithm.

    :param label_votes: m x n matrix in {0, ..., k}, where m is the sum of
                        the lengths of the sequences in the batch, n is the
                        number of labeling functions and k is the number of
                        classes
    :param link_votes: m x n matrix in {-1, 0, 1}, where m is the sum of
                       the lengths of the sequences in the batch and n is
                       the number of linking functions
    :param seq_starts: vector of length l of row indices in votes indicating
                       the start of each sequence, where l is the number of
                       sequences in the batch
    :return: p_unary, p_pairwise where p_unary is a m x k matrix representing
             the marginal distributions over individual labels, and
             p_pairwise is a m x k x k tensor representing pairwise
             marginals over the ith and (i+1)th labels. For the last
             element in a sequence, the k x k matrix will be all zeros.
    """
    # Converts to CSR and integers to standardize input
    label_votes = sparse.csr_matrix(label_votes, dtype=np.int32)
    link_votes = sparse.csr_matrix(link_votes, dtype=np.int32)
    seq_starts = np.array(seq_starts, dtype=np.int32)

    out_unary = np.zeros((label_votes.shape[0], self.num_classes))
    out_pairwise = np.zeros(
        (label_votes.shape[0], self.num_classes, self.num_classes))

    offset = 0
    # Loop variables renamed so the full input matrices are not shadowed
    for batch_label_votes, batch_link_votes, batch_seq_starts in \
            self._create_minibatches(label_votes, link_votes, seq_starts, 32):
        # Computes observation likelihoods and initializes alpha and beta
        # messages (all quantities are in log space)
        label_cll = self._get_labeling_function_likelihoods(batch_label_votes)
        link_cll = self._get_linking_function_likelihoods(batch_link_votes)
        norm_transitions = self._get_norm_transitions()  # loop-invariant
        alpha = torch.zeros(label_cll.shape)
        beta = torch.zeros(label_cll.shape)
        m = batch_label_votes.shape[0]

        # Computes alpha (forward messages)
        next_seq = 0
        for i in range(m):
            if next_seq == len(batch_seq_starts) or i < batch_seq_starts[next_seq]:
                # i is not the start of a sequence
                temp = alpha[i-1].unsqueeze(1).repeat(1, self.num_classes)
                temp = temp + norm_transitions
                temp += link_cll[i]
                alpha[i] = label_cll[i] + temp.logsumexp(0)
            else:
                # i is the start of a sequence
                alpha[i] = label_cll[i] + self._get_norm_start_balance()
                next_seq += 1

        # Computes beta (backward messages)
        this_seq = batch_seq_starts.shape[0] - 1
        # Fix: skips the sentinel entry (== m) that _create_minibatches
        # appends to the final batch's seq_starts. The original left
        # this_seq pointing at the sentinel, so the sequence-end test
        # below could never fire in that batch and beta messages leaked
        # across every sequence boundary.
        while this_seq > 0 and batch_seq_starts[this_seq] >= m:
            this_seq -= 1
        # Fix: sequence ends are initialized to 0 = log(1); the original
        # used 1, a uniform additive constant that cancels in the
        # normalizations below, so outputs are unchanged by this
        beta[-1, :] = 0.0
        for i in range(m - 2, -1, -1):
            if i == batch_seq_starts[this_seq] - 1:
                # End of sequence
                beta[i, :] = 0.0
                this_seq -= 1
            else:
                temp = beta[i+1] + label_cll[i+1]
                temp = temp.unsqueeze(1).repeat(1, self.num_classes)
                temp = temp + norm_transitions.transpose(0, 1)
                temp += link_cll[i+1]
                beta[i, :] = temp.logsumexp(0)

        # Computes p_unary by normalizing alpha + beta per row
        p_unary = alpha + beta
        temp = p_unary.logsumexp(1).unsqueeze(1).repeat(1, self.num_classes)
        p_unary = p_unary - temp
        for i in range(p_unary.shape[0]):
            p = torch.exp(p_unary[i, :] - torch.max(p_unary[i, :]))
            out_unary[offset + i, :] = (p / p.sum()).detach()

        # Computes p_pairwise
        # NOTE(review): this loop also fills rows that straddle a sequence
        # boundary within a batch (pairing a sequence's last element with
        # the next sequence's first), so the "all zeros for the last
        # element" promise in the docstring holds only for each batch's
        # final row — confirm whether that is intended.
        p_pairwise = torch.zeros((m, self.num_classes, self.num_classes))
        for i in range(p_pairwise.shape[0] - 1):
            p_pairwise[i, :, :] = norm_transitions
            p_pairwise[i] += alpha[i].unsqueeze(1).repeat(1, self.num_classes)
            p_pairwise[i] += label_cll[i+1].unsqueeze(0).repeat(self.num_classes, 1)
            p_pairwise[i] += beta[i+1].unsqueeze(0).repeat(self.num_classes, 1)
            p_pairwise[i] += link_cll[i+1]

            denom = p_pairwise[i].view(-1).logsumexp(0)
            denom = denom.unsqueeze(0).unsqueeze(1)
            denom = denom.repeat(self.num_classes, self.num_classes)
            p_pairwise[i] -= denom

            out_pairwise[offset + i, :, :] = torch.exp(p_pairwise[i]).detach()

        offset += m

    return out_unary, out_pairwise

def get_start_balance(self):
    """Returns the model's estimated class balance for the start of a
    sequence

    :return: a NumPy array with one element in [0,1] for each target class,
             representing the estimated prior probability that the first
             element in an example sequence has that label
    """
    return np.exp(self._get_norm_start_balance().detach().numpy())
352 | 353 | :return: a k x k Numpy array, in which each element i, j is the 354 | probability p(c_{t+1} = j + 1 | c_{t} = i + 1) 355 | """ 356 | return np.exp(self._get_norm_transitions().detach().numpy()) 357 | 358 | def _create_minibatches(self, label_votes, link_votes, seq_starts, 359 | batch_size, shuffle_seqs=False): 360 | if label_votes.shape[0] != link_votes.shape[0]: 361 | raise ValueError("label_votes and link_votes must have same number " 362 | "of rows") 363 | 364 | # Computes explicit seq ends so that we can shuffle the sequences 365 | seq_ends = np.ndarray((seq_starts.shape[0],), dtype=np.int32) 366 | for i in range(1, seq_starts.shape[0]): 367 | seq_ends[i - 1] = seq_starts[i] - 1 368 | seq_ends[-1] = label_votes.shape[0] - 1 369 | 370 | # Shuffles the sequences by shuffling the start and end index vectors 371 | if shuffle_seqs: 372 | index = np.arange(np.shape(seq_starts)[0]) 373 | np.random.shuffle(index) 374 | seq_starts = seq_starts[index] 375 | seq_ends = seq_ends[index] 376 | 377 | # Splits seq_starts 378 | seq_start_batches = [np.array( 379 | seq_starts[i * batch_size: ((i + 1) * batch_size)], 380 | copy=True) 381 | for i in range(int(np.ceil(len(seq_starts) / batch_size))) 382 | ] 383 | seq_start_batches[-1] = np.concatenate((seq_start_batches[-1], 384 | [label_votes.shape[0]])) 385 | 386 | # Splits seq_ends 387 | seq_end_batches = [ 388 | np.array(seq_ends[i * batch_size: ((i + 1) * batch_size + 1)], copy=True) 389 | for i in range(int(np.ceil(len(seq_ends) / batch_size))) 390 | ] 391 | seq_end_batches[-1] = np.concatenate((seq_end_batches[-1], 392 | [label_votes.shape[0]])) 393 | 394 | # Builds label_vote_batches, link_vote_batches and relative seq_start_batches 395 | label_vote_batches = [] 396 | link_vote_batches = [] 397 | rel_seq_start_batches = [] 398 | for seq_start_batch, seq_end_batch in zip(seq_start_batches, seq_end_batches): 399 | label_vote_batch = [] 400 | link_vote_batch = [] 401 | rel_seq_start_batch = 
np.zeros((len(seq_start_batch),), dtype=np.int32) 402 | total_len = 0 403 | for i, (start, end) in enumerate(zip(seq_start_batch, seq_end_batch)): 404 | label_vote_batch.append(label_votes[start:end + 1]) 405 | link_vote_batch.append(link_votes[start:end + 1]) 406 | rel_seq_start_batch[i] = total_len 407 | total_len += end - start + 1 408 | label_vote_batches.append( 409 | sparse.coo_matrix(sparse.vstack(label_vote_batch), copy=True)) 410 | link_vote_batches.append( 411 | sparse.coo_matrix(sparse.vstack(link_vote_batch), copy=True)) 412 | rel_seq_start_batches.append(rel_seq_start_batch) 413 | 414 | return list(zip(label_vote_batches, link_vote_batches, rel_seq_start_batches)) 415 | 416 | def _get_linking_function_likelihoods(self, votes): 417 | if type(votes) != sparse.coo_matrix: 418 | votes = sparse.coo_matrix(votes) 419 | 420 | cll = torch.zeros((votes.shape[0], self.num_classes, self.num_classes)) 421 | 422 | # Initializes normalizing constants 423 | z_prop = self.link_propensity.unsqueeze(1) 424 | z_prop = torch.cat((z_prop, torch.zeros((self.num_linking_funcs, 1))), dim=1) 425 | z_prop = torch.logsumexp(z_prop, dim=1) 426 | 427 | z_acc = self.link_accuracy.unsqueeze(1) 428 | z_acc = torch.cat((z_acc, -1 * self.link_accuracy.unsqueeze(1)), dim=1) 429 | z_acc = torch.logsumexp(z_acc, dim=1) 430 | 431 | # Subtracts normalizing constant for propensities from cll 432 | # (since it applies to all outcomes) 433 | cll -= torch.sum(z_prop) 434 | 435 | # Loops over votes and classes to compute conditional log-likelihood 436 | for i, j, v in zip(votes.row, votes.col, votes.data): 437 | if v != 1 and v != -1: 438 | continue 439 | 440 | for k1 in range(self.num_classes): 441 | for k2 in range(self.num_classes): 442 | if k1 == k2: 443 | if v == 1: 444 | cll[i, k1, k2] += self.link_propensity[j] 445 | cll[i, k1, k2] += self.link_accuracy[j] 446 | cll[i, k1, k2] -= z_acc[j] 447 | else: 448 | cll[i, k1, k2] += self.link_propensity[j] 449 | cll[i, k1, k2] -= 
self.link_accuracy[j] 450 | cll[i, k1, k2] -= z_acc[j] 451 | else: 452 | if v == 1: 453 | cll[i, k1, k2] += self.link_propensity[j] 454 | cll[i, k1, k2] -= self.link_accuracy[j] 455 | cll[i, k1, k2] -= z_acc[j] 456 | else: 457 | cll[i, k1, k2] += self.link_propensity[j] 458 | cll[i, k1, k2] += self.link_accuracy[j] 459 | cll[i, k1, k2] -= z_acc[j] 460 | 461 | return cll 462 | 463 | def _get_regularization_loss(self): 464 | neg_entropy = 0.0 465 | 466 | # Start balance 467 | norm_start_balance = self._get_norm_start_balance() 468 | exp_class_balance = torch.exp(norm_start_balance) 469 | for k in range(self.num_classes): 470 | neg_entropy += norm_start_balance[k] * exp_class_balance[k] 471 | 472 | # Transitions 473 | norm_transitions = self._get_norm_transitions() 474 | for i in range(self.num_classes): 475 | exp_transitions = torch.exp(norm_transitions[i]) 476 | for k in range(self.num_classes): 477 | neg_entropy += norm_transitions[i, k] * exp_transitions[k] 478 | 479 | entropy_prior = self.balance_prior * neg_entropy 480 | 481 | # Accuracy prior 482 | acc = torch.cat((self.accuracy.view(-1), self.link_accuracy)) 483 | acc_prior = self.acc_prior * torch.norm(acc - self.init_acc) 484 | 485 | return acc_prior + entropy_prior 486 | 487 | def _get_norm_start_balance(self): 488 | return self.start_balance - self.start_balance.logsumexp(0) 489 | 490 | def _get_norm_transitions(self): 491 | denom = self.transitions.logsumexp(1).unsqueeze(1).repeat(1, self.num_classes) 492 | return self.transitions - denom 493 | -------------------------------------------------------------------------------- /labelmodels/naive_bayes.py: -------------------------------------------------------------------------------- 1 | from .label_model import ClassConditionalLabelModel, LearningConfig, init_random 2 | import numpy as np 3 | from scipy import sparse 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class NaiveBayes(ClassConditionalLabelModel): 9 | """A generative label model that 
class NaiveBayes(ClassConditionalLabelModel):
    """A generative label model that assumes that all labeling functions are
    conditionally independent given the true class label, i.e., the naive Bayes
    assumption.

    Proposed in: A. P. Dawid and A. M. Skene. Maximum likelihood
    estimation of observer error-rates using the EM algorithm.
    Journal of the Royal Statistical Society C, 28(1):20-28, 1979.

    Proposed for labeling functions in: A. Ratner, C. De Sa, S. Wu, D. Selsam,
    and C. Re. Data programming: Creating large training sets, quickly. In
    Neural Information Processing Systems, 2016.
    """

    def __init__(self, num_classes, num_lfs, init_acc=.9, acc_prior=0.025,
                 balance_prior=0.025, learn_class_balance=True):
        """Constructor.

        Initializes labeling function accuracies using optional argument and all
        other model parameters uniformly.

        :param num_classes: number of target classes, i.e., binary
                            classification = 2
        :param num_lfs: number of labeling functions to model
        :param init_acc: initial estimated labeling function accuracy, must
                         be a float in [0,1]
        :param acc_prior: strength of regularization of estimated labeling
                          function accuracies toward their initial values
        :param balance_prior: strength of the entropy regularizer on the
                              estimated class balance
        :param learn_class_balance: whether to estimate the distribution over
                                    target classes (True) or assume to be
                                    uniform (False)
        """
        super().__init__(num_classes, num_lfs, init_acc, acc_prior)
        # Unnormalized log class balance; frozen when not learned
        self.class_balance = nn.Parameter(
            torch.zeros([num_classes]), requires_grad=learn_class_balance)

        self.balance_prior = balance_prior

    def forward(self, votes):
        """Computes log likelihood of labeling function outputs for each
        example in the batch.

        For efficiency, this function prefers that votes is an instance of
        scipy.sparse.coo_matrix. You can avoid a conversion by passing in votes
        with this class.

        :param votes: m x n matrix in {0, ..., k}, where m is the batch size,
                      n is the number of labeling functions and k is the number
                      of classes
        :return: 1-d tensor of length m, where each element is the
                 log-likelihood of the corresponding row in labels
        """
        class_ll = self._get_norm_class_balance()
        conditional_ll = self._get_labeling_function_likelihoods(votes)
        joint_ll = conditional_ll + class_ll
        # Marginalizes out the latent class label
        return torch.logsumexp(joint_ll, dim=1)

    def estimate_label_model(self, votes, config=None):
        """Estimates the parameters of the label model based on observed
        labeling function outputs.

        :param votes: m x n matrix in {0, ..., k}, where m is the batch size,
                      n is the number of labeling functions and k is the number
                      of classes
        :param config: optional LearningConfig instance. If None, initialized
                       with default constructor
        """
        if config is None:
            config = LearningConfig()

        # Initializes random seed
        init_random(config.random_seed)

        # Converts to CSR to standardize input
        votes = sparse.csr_matrix(votes, dtype=np.int32)

        batches = self._create_minibatches(
            votes, config.batch_size, shuffle_rows=True)
        self._do_estimate_label_model(batches, config)

    def get_label_distribution(self, votes):
        """Returns the posterior distribution over true labels given labeling
        function outputs according to the model

        :param votes: m x n matrix in {0, ..., k}, where m is the batch size,
                      n is the number of labeling functions and k is the number
                      of classes
        :return: m x k matrix, where each row is the posterior distribution over
                 the true class label for the corresponding example
        """
        # Converts to CSR to standardize input
        votes = sparse.csr_matrix(votes, dtype=np.int32)

        # np.zeros instead of the uninitialized np.ndarray constructor
        labels = np.zeros((votes.shape[0], self.num_classes))
        batches = self._create_minibatches(votes, 4096, shuffle_rows=False)

        offset = 0
        # Class balance does not depend on the batch; hoisted out of the loop
        class_balance = self._get_norm_class_balance()
        for (batch,) in batches:
            lf_likelihood = self._get_labeling_function_likelihoods(batch)
            jll = class_balance + lf_likelihood
            # Numerically stable softmax over classes, vectorized over the
            # batch (replaces the per-element Python loops)
            jll = jll - torch.max(jll, dim=1, keepdim=True)[0]
            p = torch.exp(jll)
            p = p / torch.sum(p, dim=1, keepdim=True)
            labels[offset:offset + batch.shape[0]] = p.detach().numpy()
            offset += batch.shape[0]

        return labels

    def get_most_probable_labels(self, votes):
        """Returns the most probable true labels given observed function outputs.

        :param votes: m x n matrix in {0, ..., k}, where m is the batch size,
                      n is the number of labeling functions and k is the number
                      of classes
        :return: 1-d Numpy array of most probable labels (1-indexed)
        """
        return np.argmax(self.get_label_distribution(votes), axis=1) + 1

    def get_class_balance(self):
        """Returns the model's estimated class balance

        :return: a NumPy array with one element in [0,1] for each target class,
                 representing the estimated prior probability that an example
                 has that label
        """
        return np.exp(self._get_norm_class_balance().detach().numpy())

    def _create_minibatches(self, votes, batch_size, shuffle_rows=False):
        """Slices votes into 1-tuples of COO minibatches, optionally shuffling
        the rows first."""
        if shuffle_rows:
            index = np.arange(np.shape(votes)[0])
            np.random.shuffle(index)
            votes = votes[index, :]

        # Creates minibatches; each is a 1-tuple so it can be star-unpacked
        # into forward()
        batches = [(sparse.coo_matrix(
                        votes[i * batch_size: (i + 1) * batch_size, :],
                        copy=True),)
                   for i in range(int(np.ceil(votes.shape[0] / batch_size)))
                   ]

        return batches

    def _get_regularization_loss(self):
        """Adds a negative-entropy prior on the class balance to the parent
        class's accuracy regularization."""
        norm_class_balance = self._get_norm_class_balance()
        # sum(p * log p) over classes, vectorized
        neg_entropy = torch.sum(norm_class_balance * torch.exp(norm_class_balance))
        entropy_prior = self.balance_prior * neg_entropy

        return super()._get_regularization_loss() + entropy_prior

    def _get_norm_class_balance(self):
        # Log-softmax of the class-balance parameters
        return self.class_balance - torch.logsumexp(self.class_balance, dim=0)
from .label_model import LabelModel, init_random, LearningConfig
import numpy as np
from scipy import sparse
import torch
from torch import nn
from copy import deepcopy as dc
import logging


class PartialLabelLearningConfig(LearningConfig):
    """Container for hyperparameters used by PartialLabelModel during learning"""

    def __init__(self):
        """Initializes all hyperparameters to default values"""
        super().__init__()
        self.epochs = 200
        self.batch_size = 8192
        self.step_size = 0.1
        # 'p' selects ReduceLROnPlateau, 'c' CyclicLR; a list of epoch
        # milestones selects MultiStepLR (see _do_estimate_label_model)
        self.step_schedule = 'p'
        self.step_size_mult = 0.1
        self.momentum = 0.8


class PartialLabelModel(LabelModel):
    """A generative label model that assumes that all partial labeling
    functions (PLFs) are conditionally independent given the true class label.
    A naive Bayes distribution is assumed.
    """

    def __init__(self, num_classes, label_partition, init_acc=0.7,
                 preset_classbalance=None, learn_class_balance=True,
                 device='cpu'):
        """Constructor.

        Initializes labeling function accuracies using optional argument and
        all other model parameters uniformly.

        :param num_classes: number of target classes, i.e., binary
                            classification = 2
        :param label_partition: partial labeling function configuration, a dict
                                {PLF index: [partition_1, ..., partition_{k_l}]}
                                mapping each PLF to its label partitions
                                (class ids are 1-indexed)
        :param init_acc: initial estimated labeling and linking function
                         accuracy, must be a float in [0,1]
        :param preset_classbalance: None to learn the class balance, or a
                                    fixed probability tensor to hold it constant
        :param learn_class_balance: whether to estimate the class balance when
                                    no preset is given
        :param device: calculation device, e.g. 'cpu' or 'cuda'
        """
        super().__init__()

        self.device = device
        # Falls back to CPU whenever CUDA is unavailable
        if not torch.cuda.is_available():
            self.device = 'cpu'
        self.preset_classbalance = preset_classbalance
        self.num_classes = num_classes
        # Inverse of acc = exp(a) / (exp(a) + exp(-a)), so get_accuracies()
        # maps the initial parameter back to init_acc
        self.init_acc = -1 * np.log(1.0 / init_acc - 1) / 2
        self.label_partition = label_partition
        self.num_df = len(label_partition)

        if self.preset_classbalance is not None:
            self.class_balance = torch.nn.Parameter(
                torch.log(self.preset_classbalance),
                requires_grad=False
            )
        else:
            self.class_balance = torch.nn.Parameter(
                torch.zeros([self.num_classes], device=self.device),
                requires_grad=learn_class_balance
            )

        self.accuracy = torch.nn.Parameter(
            torch.ones([self.num_df, self.num_classes], device=self.device) * self.init_acc,
            requires_grad=True
        )

        self.propensity = torch.nn.Parameter(
            torch.zeros([self.num_df], device=self.device),
            requires_grad=True
        )

        # ct[f, c]: index of the partition of PLF f that contains class c+1
        self.ct = torch.zeros([self.num_df, self.num_classes])
        # poslib/neglib[f, c]: number of partitions of PLF f that do /
        # do not contain class c+1
        self.poslib = torch.zeros([self.num_df, self.num_classes])
        self.neglib = torch.zeros([self.num_df, self.num_classes])

        self._validate_and_sort_partitions()

        for fid, clusters in self.label_partition.items():
            for cluster_id, cluster in enumerate(clusters):
                for class_id in cluster:
                    self.poslib[fid, class_id - 1] += 1
                    self.ct[fid, class_id - 1] = cluster_id
            self.neglib[fid, :] = len(clusters) - self.poslib[fid, :]
        # Avoids zero counts so -log(poslib) in _setup stays finite
        self.poslib[self.poslib == 0] = 1

    def _validate_and_sort_partitions(self):
        """Sorts each label group in place and validates the partition setup:
        no class may appear in every group, and every class must be covered."""
        for fid, clusters in self.label_partition.items():
            in_all_groups = set(clusters[0])
            covered = set()
            for cluster_id, cluster in enumerate(clusters):
                cluster.sort()
                self.label_partition[fid][cluster_id] = cluster
                in_all_groups &= set(cluster)
                covered |= set(cluster)
            if len(in_all_groups) > 0:
                raise RuntimeError('Setup Violation: No class can appear in all groups!')
            if len(covered) < self.num_classes:
                raise RuntimeError('Setup Violation: Class must appear at least once! Please setup a dummy label group if necessary!')

    def forward(self, votes, bid):
        """Computes log likelihood of labeling function outputs for each
        example in the batch.

        :param votes: one batch of (0-indexed) votes as a LongTensor, as
                      produced by _setup
        :param bid: index of this batch within the helper tensors built
                    by _setup
        :return: 1-d tensor of per-example log-likelihoods
        """
        class_ll = self._get_norm_class_balance()
        conditional_ll = self._cll(votes, bid)
        joint_ll = conditional_ll + class_ll
        # Marginalizes out the latent class label
        return torch.logsumexp(joint_ll, dim=1)

    def estimate_label_model(self, votes, config=None):
        """Estimates the parameters of the label model based on observed
        labeling function outputs.

        :param votes: m x n matrix of PLF outputs; entries are in
                      {0, 1, ..., k_l}, where 0 means abstain and k_l is the
                      number of partitions of PLF_l
        :param config: optional PartialLabelLearningConfig; defaults are used
                       when None
        """
        if config is None:
            config = PartialLabelLearningConfig()

        # Initializes random seed
        init_random(config.random_seed)

        batches = self._setup(votes, config.batch_size, shuffle=True)

        self._do_estimate_label_model(batches, config)

    def get_label_distribution(self, votes, annot_batch_sz=2048):
        """Returns the posterior distribution over true labels given labeling
        function outputs according to the model

        :param votes: m x n matrix where each element is in {0, 1, ..., k_l},
                      where k_l is the number of label partitions of PLF_l
        :param annot_batch_sz: number of examples scored per batch
        :return: m x k matrix, where each row is the posterior distribution
                 over the true class label for the corresponding example
        """
        self.eval()
        batches = self._setup(votes, annot_batch_sz)

        # np.zeros instead of the uninitialized np.ndarray constructor
        labels = np.zeros((votes.shape[0], self.num_classes))
        for batch_id, batch_votes in enumerate(batches):
            class_balance = self._get_norm_class_balance()
            lf_likelihood = self._cll(batch_votes, batch_id)
            jll = class_balance + lf_likelihood
            # Numerically stable softmax over classes
            P = torch.exp(jll - torch.max(jll, dim=1)[0].unsqueeze(1).repeat(1, self.num_classes))
            P /= torch.sum(P, dim=1).unsqueeze(1).repeat(1, self.num_classes)
            labels[batch_id * annot_batch_sz:
                   batch_id * annot_batch_sz + batch_votes.shape[0]] = P.detach().cpu().numpy()
            if 'cuda' in self.device:
                torch.cuda.empty_cache()
        return labels

    def get_most_probable_labels(self, votes):
        """Returns the most probable true labels given observed function outputs.

        :param votes: m x n matrix where each element is in {0, 1, ..., k_l}
        :return: 1-d Numpy array of most probable labels (1-indexed)
        """
        return np.argmax(self.get_label_distribution(votes), axis=1) + 1

    def get_class_balance(self):
        """Returns the model's estimated class balance

        :return: a NumPy array with one element in [0,1] for each target class,
                 representing the estimated prior probability that an example
                 has that label
        """
        return np.exp(self._get_norm_class_balance().detach().cpu().numpy())

    def get_accuracies(self):
        """Returns the model's estimated labeling function accuracies

        :return: a NumPy array with one element in [0,1] for each labeling
                 function and class, representing the estimated probability
                 that the function correctly outputs the true class label,
                 given that it does not abstain
        """
        acc = self.accuracy.detach().cpu().numpy()
        # Scaled sigmoid, the inverse of the init_acc transform above
        return np.exp(acc) / (np.exp(acc) + np.exp(-1 * acc))

    def get_propensities(self):
        """Returns the model's estimated labeling function propensities, i.e.,
        the probability that a labeling function does not abstain

        :return: a NumPy array with one element in [0,1] for each labeling
                 function
        """
        prop = self.propensity.detach().cpu().numpy()
        return np.exp(prop) / (np.exp(prop) + 1)

    def _do_estimate_label_model(self, batches, config):
        """Internal method for optimizing model parameters.

        :param batches: sequence of inputs to forward(); each element is one
                        batch of votes
        :param config: an instance of PartialLabelLearningConfig
        """
        optimizer = torch.optim.Adam(
            self.parameters(), lr=config.step_size,
            weight_decay=0)

        if config.step_schedule == 'p':
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, min_lr=1e-10, factor=config.step_size_mult)
        elif config.step_schedule == 'c':
            scheduler = torch.optim.lr_scheduler.CyclicLR(
                optimizer, base_lr=1e-1, max_lr=0.2)
        elif config.step_schedule is not None and config.step_size_mult is not None:
            scheduler = torch.optim.lr_scheduler.MultiStepLR(
                optimizer, config.step_schedule, gamma=config.step_size_mult)
        else:
            scheduler = None

        self.train()

        for epoch in range(config.epochs):
            # Detached snapshot for the convergence check (the original
            # deepcopied the Parameter, keeping an unnecessary graph handle)
            prev_acc = self.accuracy.detach().clone()
            logging.info('Epoch {}/{}'.format(epoch + 1, config.epochs))
            running_loss = 0.0
            for i_batch, inputs in enumerate(batches):
                optimizer.zero_grad()
                log_likelihood = self(inputs, i_batch)
                loss = -1 * torch.mean(log_likelihood)
                loss += self._get_regularization_loss()
                loss.backward()
                optimizer.step()
                # .item() detaches the value so each step's graph can be
                # freed; accumulating the tensor leaked memory over an epoch
                running_loss += loss.item()
            epoch_loss = running_loss / len(batches)
            logging.info('Train Loss: %.6f', epoch_loss)
            if torch.sum(torch.abs(self.accuracy - prev_acc)) < 1e-7:
                logging.info('1e-7 Criterion Reached: Epoch')
                break
            if scheduler is not None:
                if config.step_schedule == 'p':
                    scheduler.step(epoch_loss)
                else:
                    scheduler.step()

        if 'cuda' in self.device:
            torch.cuda.empty_cache()

    def _setup(self, votes, batch_size, shuffle=False):
        """Sets up and precalculates/populates helper tensors for _cll.

        :param votes: full PLFs votes input
        :param batch_size: number of instances in one batch
        :param shuffle: whether rows of the given votes should be shuffled

        :return: list of batched vote tensors, one per batch
        """
        # Normalizing to 0-indexed LPs (abstain 0 becomes -1).
        batches = self._create_minibatches(votes - 1, batch_size, shuffle)
        cth = self.ct.unsqueeze(0).repeat(batch_size, 1, 1)
        # c: +1 where the voted partition contains the class, else -1
        # n: partition-count normalizer (later -log'ed)
        # a: 1 where the PLF voted, 0 where it abstained
        # p: same abstention mask at the (instance, PLF) level
        self.c = torch.zeros([len(batches), batch_size, self.num_df, self.num_classes])
        self.n = torch.zeros([len(batches), batch_size, self.num_df, self.num_classes])
        self.a = torch.ones([len(batches), batch_size, self.num_df, self.num_classes])
        self.p = torch.ones([len(batches), batch_size, self.num_df])
        # All batches except the last are exactly batch_size rows
        for bid in range(len(batches) - 1):
            extb = batches[bid].unsqueeze(2).repeat(1, 1, self.num_classes)
            self.c[bid] = torch.where(torch.eq(cth, extb), torch.tensor(1.0), torch.tensor(-1.0))
            self.a[bid] = torch.where(extb == -1, torch.tensor(0.0), torch.tensor(1.0))
            marker = torch.where(self.c[bid] == 1, torch.tensor(1.0), torch.tensor(0.0))
            self.n[bid] = (1 - marker) * self.neglib + marker * self.poslib
            self.p[bid] = torch.where(batches[bid] == -1, torch.tensor(0.0), torch.tensor(1.0))

        # The last batch may be short; only its first last_bz rows are real.
        last_bz = len(batches[-1])
        last_extb = batches[-1].unsqueeze(2).repeat(1, 1, self.num_classes)
        self.c[-1, :last_bz] = torch.where(
            torch.eq(cth[:last_bz, :, :], last_extb),
            torch.tensor(1.0), torch.tensor(-1.0))
        marker = torch.where(self.c[-1, :last_bz] == 1, torch.tensor(1.0), torch.tensor(0.0))
        self.a[-1, :last_bz] = torch.where(last_extb == -1, torch.tensor(0.0), torch.tensor(1.0))
        self.n[-1, :last_bz] = (1 - marker) * self.neglib + marker * self.poslib
        # NOTE(review): padding rows of the last batch keep n == 0, so -log
        # yields inf there; _cll slices to the real batch size, so the
        # padding is never read — confirm if _cll's slicing ever changes.
        self.n = -torch.log(self.n)
        self.p[-1, :last_bz] = torch.where(batches[-1] == -1, torch.tensor(0.0), torch.tensor(1.0))
        return batches

    def _get_regularization_loss(self):
        """Gets the value of the regularization loss for the current values of
        the model's parameters

        :return: regularization loss (this model uses none)
        """
        return 0.0

    def _get_norm_class_balance(self):
        # Log-softmax of the class-balance parameters
        return self.class_balance - torch.logsumexp(self.class_balance, dim=0)

    def _cll(self, votes, bid):
        """Calculates the class-conditioned log-likelihood for batched votes.

        :param votes: current batch of votes
        :param bid: batch id for current votes

        :return: 2-d tensor of class-conditioned log-likelihoods for the
                 given votes and batch index
        """
        num_inst = votes.shape[0]

        # Normalizer for accuracies: logsumexp(acc, -acc) per (PLF, class)
        za = self.accuracy.unsqueeze(2)
        za = torch.cat((za, -1 * za), dim=2)
        za = -torch.logsumexp(za, dim=2).unsqueeze(0).repeat(num_inst, 1, 1)

        # Normalizer for propensities: logsumexp(prop, 0) per PLF
        z_plh = torch.zeros((self.num_df, 1)).to(self.device)
        zp = self.propensity.unsqueeze(1)
        zp = torch.cat((zp, z_plh), dim=1)
        zp = -torch.logsumexp(zp, dim=1).unsqueeze(0).unsqueeze(-1).repeat(num_inst, 1, self.num_classes)

        cp = self.propensity.unsqueeze(0).unsqueeze(-1).repeat(num_inst, 1, self.num_classes)
        ca = self.accuracy.unsqueeze(0).repeat(num_inst, 1, 1)
        ab = self.a[bid][:num_inst].to(self.device)
        cc = self.c[bid][:num_inst].to(self.device)
        cn = self.n[bid][:num_inst].to(self.device)

        # Abstentions (ab == 0) contribute only the propensity normalizer zp
        cll = torch.sum(((ca * cc + cn + cp + za) * ab) + zp, dim=1)
        return cll

    def _create_minibatches(self, votes, batch_size, shuffle_rows=False):
        """Creates (optionally shuffled) batched votes for parallelized
        estimation.

        :param votes: full PLFs votes input (already 0-indexed)
        :param batch_size: number of instances in one batch
        :param shuffle_rows: whether rows of the given votes should be shuffled

        :return: list of LongTensors of shape [<= batch_size, num_df]
        """
        if shuffle_rows:
            index = np.arange(np.shape(votes)[0])
            np.random.shuffle(index)
            votes = votes[index, :]

        batches = [
            torch.LongTensor(votes[i * batch_size: (i + 1) * batch_size, :].astype(np.int32))
            for i in range(int(np.ceil(votes.shape[0] / batch_size)))
        ]

        return batches
# Packaging metadata for the labelmodels distribution.
from setuptools import setup, find_packages

_METADATA = dict(
    name='labelmodels',
    version='0.0.1',
    url='https://github.com/BatsResearch/labelmodels.git',
    author='Shiying Luo, Stephen Bach',
    author_email='shiying_luo@brown.edu, sbach@cs.brown.edu',
    description='Lightweight implementations of generative label models for '
                'weakly supervised machine learning',
    # Every package directory containing an __init__.py
    packages=find_packages(),
    install_requires=['numpy >= 1.11', 'scipy >= 1.1', 'torch >= 1.4'],
)

setup(**_METADATA)
.5], [.3, .7]]) 28 | 29 | labels_train, seq_starts_train, gold_train = _generate_data( 30 | 1000, 8, 12, n, accuracies, propensities, start_balance, transitions 31 | ) 32 | 33 | model = HMM(k, n, acc_prior=0.0, balance_prior=0.0) 34 | model.estimate_label_model(labels_train, seq_starts_train) 35 | 36 | for i in range(n): 37 | for j in range(k): 38 | diff = accuracies[i, j] - model.get_accuracies()[i, j] 39 | self.assertAlmostEqual(diff, 0.0, places=1) 40 | for i in range(n): 41 | diff = propensities[i] - model.get_propensities()[i] 42 | self.assertAlmostEqual(diff, 0.0, places=1) 43 | for i in range(k): 44 | diff = start_balance[i] - model.get_start_balance()[i] 45 | self.assertAlmostEqual(diff, 0.0, places=1) 46 | for i in range(k): 47 | for j in range(k): 48 | diff = transitions[i, j] - model.get_transition_matrix()[i, j] 49 | self.assertAlmostEqual(diff, 0.0, places=1) 50 | 51 | def test_estimate_label_model_multiclass(self): 52 | n = 5 53 | k = 3 54 | 55 | accuracies = np.array([[.9, .8, .9], 56 | [.6, .7, .9], 57 | [.6, .6, .9], 58 | [.7, .6, .9], 59 | [.8, .8, .9]]) 60 | propensities = np.array([.9] * n) 61 | start_balance = np.array([.3, .3, .4]) 62 | transitions = np.array([[.5, .3, .2], 63 | [.3, .4, .3], 64 | [.2, .5, .3]]) 65 | 66 | labels_train, seq_starts_train, gold_train = _generate_data( 67 | 1000, 8, 12, n, accuracies, propensities, start_balance, transitions 68 | ) 69 | 70 | model = HMM(k, n, acc_prior=0.0, balance_prior=0.0) 71 | model.estimate_label_model(labels_train, seq_starts_train) 72 | 73 | for i in range(n): 74 | for j in range(k): 75 | diff = accuracies[i, j] - model.get_accuracies()[i, j] 76 | self.assertAlmostEqual(diff, 0.0, places=1) 77 | for i in range(n): 78 | diff = propensities[i] - model.get_propensities()[i] 79 | self.assertAlmostEqual(diff, 0.0, places=1) 80 | for i in range(k): 81 | diff = start_balance[i] - model.get_start_balance()[i] 82 | self.assertAlmostEqual(diff, 0.0, places=1) 83 | for i in range(k): 84 | for j in 
range(k): 85 | diff = transitions[i, j] - model.get_transition_matrix()[i, j] 86 | self.assertAlmostEqual(diff, 0.0, places=1) 87 | 88 | def test_get_most_probable_labels(self): 89 | m = 500 90 | n = 10 91 | k = 3 92 | 93 | model = HMM(k, n, acc_prior=0.0) 94 | with torch.no_grad(): 95 | model.start_balance[0] = 0 96 | model.start_balance[1] = 0.5 97 | for i in range(n): 98 | model.propensity[i] = 2 99 | for j in range(k): 100 | model.accuracy[i, j] = 2 101 | for i in range(k): 102 | for j in range(k): 103 | model.transitions[i, j] = 1 if i == j else 0 104 | 105 | labels_train, seq_starts_train, gold_train = _generate_data( 106 | m, 8, 12, n, 107 | model.get_accuracies(), 108 | model.get_propensities(), 109 | model.get_start_balance(), 110 | model.get_transition_matrix()) 111 | 112 | predictions = model.get_most_probable_labels(labels_train, seq_starts_train) 113 | correct = 0 114 | for i in range(len(predictions)): 115 | if predictions[i] == gold_train[i]: 116 | correct += 1 117 | accuracy = correct / float(len(predictions)) 118 | self.assertGreaterEqual(accuracy, .95) 119 | 120 | def test_get_label_distribution(self): 121 | m = 500 122 | n = 10 123 | k = 3 124 | 125 | model = HMM(k, n, acc_prior=0.0) 126 | with torch.no_grad(): 127 | model.start_balance[0] = 0 128 | model.start_balance[1] = 0.5 129 | for i in range(n): 130 | model.propensity[i] = 2 131 | for j in range(k): 132 | model.accuracy[i, j] = 2 133 | for i in range(k): 134 | for j in range(k): 135 | model.transitions[i, j] = 1 if i == j else 0 136 | 137 | labels_train, seq_starts_train, gold_train = _generate_data( 138 | m, 8, 12, n, 139 | model.get_accuracies(), 140 | model.get_propensities(), 141 | model.get_start_balance(), 142 | model.get_transition_matrix()) 143 | 144 | p_unary, p_pairwise = model.get_label_distribution( 145 | labels_train, seq_starts_train) 146 | 147 | # Makes predictions using both unary and pairwise marginals 148 | pred_unary = np.argmax(p_unary, axis=1) + 1 149 | pred_pairwise = 
np.zeros((labels_train.shape[0],), dtype=np.int32) 150 | next_seq = 0 151 | for i in range(labels_train.shape[0] - 1): 152 | if next_seq == len(seq_starts_train) or i < seq_starts_train[next_seq] - 1: 153 | # i is neither the start nor end of a sequence 154 | pred_pairwise[i+1] = np.argmax(p_pairwise[i][pred_pairwise[i]]) 155 | elif i == seq_starts_train[next_seq]: 156 | # i is the start of a sequence 157 | a, b = np.unravel_index(p_pairwise[i].argmax(), (k, k)) 158 | pred_pairwise[i], pred_pairwise[i + 1] = a, b 159 | next_seq += 1 160 | else: 161 | # i is the end of a sequence 162 | pass 163 | pred_pairwise += 1 164 | 165 | # Checks that predictions are accurate 166 | for predictions in (pred_unary, pred_pairwise): 167 | correct = 0 168 | for i in range(len(predictions)): 169 | if predictions[i] == gold_train[i]: 170 | correct += 1 171 | accuracy = correct / float(len(predictions)) 172 | self.assertGreaterEqual(accuracy, .95) 173 | 174 | ##### ensure marginalization of p_pairwise matches p_unary 175 | # create a simple and trained HMM. 
176 | n = 5 177 | k = 2 178 | 179 | accuracies = np.array([[.9, .8], 180 | [.6, .7], 181 | [.6, .6], 182 | [.7, .6], 183 | [.8, .8]]) 184 | propensities = np.array([.9] * n) 185 | start_balance = np.array([.3, .7]) 186 | transitions = np.array([[.5, .5], [.3, .7]]) 187 | 188 | labels_train, seq_starts_train, gold_train = _generate_data( 189 | 10, 8, 12, n, accuracies, propensities, start_balance, transitions 190 | ) 191 | 192 | model = HMM(k, n, acc_prior=0.0, balance_prior=0.0) 193 | model.estimate_label_model(labels_train, seq_starts_train) 194 | 195 | # get unary and pairwise marginals 196 | p_unary, p_pairwise = model.get_label_distribution(labels_train, seq_starts_train) 197 | 198 | # marginalize pairwise over t + 1 199 | for un, pa in zip(p_unary, np.sum(p_pairwise, axis= 2)): 200 | if np.sum(pa)==0: # last pairwise_marginal is empty because it does not have t + 1 transition 201 | continue 202 | for i in range(un.shape[0]): 203 | self.assertAlmostEqual(un[i], pa[i], places=3) 204 | 205 | # marginalize pairwise over t 206 | for i, pa in enumerate(np.sum(p_pairwise, axis= 1)): 207 | if i + 1 >= len(p_unary) or i + 1 in seq_starts_train: 208 | # skip if p_unary[i + 1[ goes into new sequence 209 | continue 210 | un = p_unary[i + 1] 211 | for j in range(un.shape[0]): 212 | self.assertAlmostEqual(un[j], pa[j], places=3) 213 | 214 | 215 | def _generate_data(num_seqs, min_seq, max_seq, num_lfs, accuracies, 216 | propensities, start_balance, transitions): 217 | # Generates sequence starts 218 | seq_starts = np.zeros((num_seqs,), dtype=np.int32) 219 | total_len = 0 220 | for i in range(num_seqs): 221 | seq_len = np.random.randint(min_seq, max_seq + 1) 222 | total_len += seq_len 223 | if i + 1 < num_seqs: 224 | seq_starts[i + 1] = total_len 225 | 226 | # Generates sequences of gold labels 227 | gold = np.zeros((total_len,), dtype=np.int32) 228 | next_start = 0 229 | for i in range(total_len): 230 | if next_start < len(seq_starts) and i == seq_starts[next_start]: 231 | 
balance = start_balance 232 | next_start += 1 233 | else: 234 | balance = np.squeeze(transitions[gold[i - 1] - 1]) 235 | 236 | gold[i] = np.argmax(np.random.multinomial(1, balance)) + 1 237 | 238 | # Generates labeling function outputs conditioned on gold labels 239 | row = [] 240 | col = [] 241 | val = [] 242 | for i in range(total_len): 243 | for j in range(num_lfs): 244 | if np.random.random() < propensities[j]: 245 | row.append(i) 246 | col.append(j) 247 | if np.random.random() < accuracies[j, gold[i] - 1]: 248 | val.append(gold[i]) 249 | else: 250 | p_mistake = 1 / (len(start_balance) - 1) 251 | dist = [p_mistake] * (len(start_balance) + 1) 252 | dist[0] = 0 253 | dist[gold[i]] = 0 254 | val.append(np.argmax(np.random.multinomial(1, dist))) 255 | 256 | labels = sparse.coo_matrix((val, (row, col)), shape=(total_len, num_lfs)) 257 | 258 | return labels, seq_starts, gold 259 | 260 | 261 | if __name__ == '__main__': 262 | unittest.main() 263 | -------------------------------------------------------------------------------- /test/test_linked_hmm.py: -------------------------------------------------------------------------------- 1 | from labelmodels import LinkedHMM, LearningConfig 2 | import numpy as np 3 | from scipy import sparse 4 | import torch 5 | import unittest 6 | 7 | 8 | class TestLinkedHMM(unittest.TestCase): 9 | 10 | def setUp(self): 11 | np.random.seed(0) 12 | 13 | def tearDown(self): 14 | pass 15 | 16 | def test_estimate_label_model_binary(self): 17 | n1 = 5 18 | n2 = 3 19 | k = 2 20 | 21 | label_accuracies = np.array([[.9, .8], 22 | [.6, .7], 23 | [.6, .6], 24 | [.7, .6], 25 | [.8, .8]]) 26 | link_accuracies = np.array([.8, .6, .8]) 27 | label_propensities = np.array([.9] * n1) 28 | link_propensities = np.array([.9] * n1) 29 | start_balance = np.array([.3, .7]) 30 | transitions = np.array([[.5, .5], [.3, .7]]) 31 | 32 | labels, links, seq_starts, gold = _generate_data( 33 | 1000, 8, 12, n1, n2, 34 | label_accuracies, 35 | link_accuracies, 36 | 
def test_estimate_label_model_multiclass(self):
    """Checks that LinkedHMM recovers the generating parameters of a
    three-class problem.

    Fixes: link_propensities previously had length n1 (number of labeling
    functions) instead of n2 (number of linking functions), and the link
    accuracy check ran each assertion k times inside a j-independent loop.
    """
    n1 = 5
    n2 = 3
    k = 3

    label_accuracies = np.array([[.9, .8, .5],
                                 [.6, .7, .3],
                                 [.6, .6, .8],
                                 [.7, .6, .6],
                                 [.8, .8, .9]])
    link_accuracies = np.array([.8, .6, .8])
    label_propensities = np.array([.9] * n1)
    # One propensity per *linking* function (was erroneously n1)
    link_propensities = np.array([.9] * n2)
    start_balance = np.array([.3, .3, .4])
    transitions = np.array([[.5, .3, .2],
                            [.4, .3, .3],
                            [.3, .3, .4]])

    labels, links, seq_starts, gold = _generate_data(
        1000, 8, 12, n1, n2,
        label_accuracies,
        link_accuracies,
        label_propensities,
        link_propensities,
        start_balance,
        transitions
    )

    model = LinkedHMM(k, n1, n2, acc_prior=0.0, balance_prior=0.0)
    config = LearningConfig()
    config.epochs = 4
    model.estimate_label_model(labels, links, seq_starts, config=config)

    for i in range(n1):
        for j in range(k):
            diff = label_accuracies[i, j] - model.get_accuracies()[i, j]
            self.assertAlmostEqual(diff, 0.0, places=1)
    for i in range(n2):
        diff = link_accuracies[i] - model.get_link_accuracies()[i]
        self.assertAlmostEqual(diff, 0.0, places=1)
    for i in range(n1):
        diff = label_propensities[i] - model.get_propensities()[i]
        self.assertAlmostEqual(diff, 0.0, places=1)
    for i in range(n2):
        diff = link_propensities[i] - model.get_link_propensities()[i]
        self.assertAlmostEqual(diff, 0.0, places=1)
    for i in range(k):
        diff = start_balance[i] - model.get_start_balance()[i]
        self.assertAlmostEqual(diff, 0.0, places=1)
    for i in range(k):
        for j in range(k):
            diff = transitions[i, j] - model.get_transition_matrix()[i, j]
            self.assertAlmostEqual(diff, 0.0, places=1)
0 156 | for i in range(len(predictions)): 157 | if predictions[i] == gold[i]: 158 | correct += 1 159 | accuracy = correct / float(len(predictions)) 160 | self.assertGreaterEqual(accuracy, .95) 161 | 162 | def test_get_label_distribution(self): 163 | m = 500 164 | n1 = 3 165 | n2 = 5 166 | k = 3 167 | 168 | model = LinkedHMM(k, n1, n2) 169 | with torch.no_grad(): 170 | model.start_balance[0] = 0 171 | model.start_balance[1] = 0.5 172 | for i in range(n1): 173 | model.propensity[i] = 0 174 | for j in range(k): 175 | model.accuracy[i, j] = 1 176 | for i in range(n2): 177 | model.link_propensity[i] = 0 178 | model.link_accuracy[i] = 1.5 179 | for i in range(k): 180 | for j in range(k): 181 | model.transitions[i, j] = 1 if i == j else 0 182 | 183 | labels, links, seq_starts, gold = _generate_data( 184 | m, 8, 12, n1, n2, 185 | model.get_label_accuracies(), 186 | model.get_link_accuracies(), 187 | model.get_label_propensities(), 188 | model.get_link_propensities(), 189 | model.get_start_balance(), 190 | model.get_transition_matrix()) 191 | 192 | p_unary, p_pairwise = model.get_label_distribution( 193 | labels, links, seq_starts) 194 | 195 | # Makes predictions using both unary and pairwise marginals 196 | pred_unary = np.argmax(p_unary, axis=1) + 1 197 | pred_pairwise = np.zeros((labels.shape[0],), dtype=np.int32) 198 | next_seq = 0 199 | for i in range(labels.shape[0] - 1): 200 | if next_seq == len(seq_starts) or i < seq_starts[next_seq] - 1: 201 | # i is neither the start nor end of a sequence 202 | pred_pairwise[i+1] = np.argmax(p_pairwise[i][pred_pairwise[i]]) 203 | elif i == seq_starts[next_seq]: 204 | # i is the start of a sequence 205 | a, b = np.unravel_index(p_pairwise[i].argmax(), (k, k)) 206 | pred_pairwise[i], pred_pairwise[i + 1] = a, b 207 | next_seq += 1 208 | else: 209 | # i is the end of a sequence 210 | pass 211 | pred_pairwise += 1 212 | 213 | # Checks that predictions are accurate 214 | for predictions in (pred_unary, pred_pairwise): 215 | correct = 0 
216 | for i in range(len(predictions)): 217 | if predictions[i] == gold[i]: 218 | correct += 1 219 | accuracy = correct / float(len(predictions)) 220 | self.assertGreaterEqual(accuracy, .95) 221 | 222 | ##### ensure marginalization of p_pairwise matches p_unary 223 | # create a simple and trained linkHMM. 224 | n1 = 5 225 | n2 = 3 226 | k = 2 227 | 228 | label_accuracies = np.array([[.9, .8], 229 | [.6, .7], 230 | [.6, .6], 231 | [.7, .6], 232 | [.8, .8]]) 233 | link_accuracies = np.array([.8, .6, .8]) 234 | label_propensities = np.array([.9] * n1) 235 | link_propensities = np.array([.9] * n1) 236 | start_balance = np.array([.3, .7]) 237 | transitions = np.array([[.5, .5], [.3, .7]]) 238 | 239 | labels, links, seq_starts, gold = _generate_data( 240 | 10, 8, 12, n1, n2, 241 | label_accuracies, 242 | link_accuracies, 243 | label_propensities, 244 | link_propensities, 245 | start_balance, 246 | transitions 247 | ) 248 | model = LinkedHMM(k, n1, n2, acc_prior=0.0, balance_prior=0.0) 249 | config = LearningConfig() 250 | config.epochs = 3 251 | model.estimate_label_model(labels, links, seq_starts, config=config) 252 | 253 | # get unary and pairwise marginals 254 | p_unary, p_pairwise = model.get_label_distribution(labels, links, seq_starts) 255 | 256 | # marginalize pairwise over t + 1 257 | for un, pa in zip(p_unary, np.sum(p_pairwise, axis= 2)): 258 | if np.sum(pa)==0: # last pairwise_marginal is empty because it does not have t + 1 transition 259 | continue 260 | for i in range(un.shape[0]): 261 | self.assertAlmostEqual(un[i], pa[i], places=3) 262 | 263 | # marginalize pairwise over t 264 | for i, pa in enumerate(np.sum(p_pairwise, axis= 1)): 265 | if i + 1 >= len(p_unary) or i + 1 in seq_starts: 266 | # skip if p_unary[i + 1[ goes into new sequence 267 | continue 268 | un = p_unary[i + 1] 269 | for j in range(un.shape[0]): 270 | self.assertAlmostEqual(un[j], pa[j], places=3) 271 | 272 | 273 | def _generate_data(num_seqs, min_seq, max_seq, num_label_funcs, 
num_link_funcs, 274 | label_accs, link_accs, label_propensities, link_propensities, 275 | start_balance, transitions): 276 | # Generates sequence starts 277 | seq_starts = np.zeros((num_seqs,), dtype=np.int32) 278 | total_len = 0 279 | for i in range(num_seqs): 280 | seq_len = np.random.randint(min_seq, max_seq + 1) 281 | total_len += seq_len 282 | if i + 1 < num_seqs: 283 | seq_starts[i + 1] = total_len 284 | 285 | # Generates sequences of gold labels 286 | gold = np.zeros((total_len,), dtype=np.int32) 287 | next_start = 0 288 | for i in range(total_len): 289 | if next_start < len(seq_starts) and i == seq_starts[next_start]: 290 | balance = start_balance 291 | next_start += 1 292 | else: 293 | balance = np.squeeze(transitions[gold[i - 1] - 1]) 294 | 295 | gold[i] = np.argmax(np.random.multinomial(1, balance)) + 1 296 | 297 | # Generates labeling function outputs conditioned on gold labels 298 | row = [] 299 | col = [] 300 | val = [] 301 | for i in range(total_len): 302 | for j in range(num_label_funcs): 303 | if np.random.random() < label_propensities[j]: 304 | row.append(i) 305 | col.append(j) 306 | if np.random.random() < label_accs[j, gold[i] - 1]: 307 | val.append(gold[i]) 308 | else: 309 | p_mistake = 1 / (len(start_balance) - 1) 310 | dist = [p_mistake] * (len(start_balance) + 1) 311 | dist[0] = 0 312 | dist[gold[i]] = 0 313 | val.append(np.argmax(np.random.multinomial(1, dist))) 314 | 315 | labels = sparse.coo_matrix((val, (row, col)), shape=(total_len, num_label_funcs)) 316 | 317 | # Generates linking function outputs conditioned on gold labels 318 | row = [] 319 | col = [] 320 | val = [] 321 | next_seq = 0 322 | for i in range(total_len): 323 | if next_seq < len(seq_starts) and i == seq_starts[next_seq]: 324 | next_seq += 1 325 | else: 326 | for j in range(num_link_funcs): 327 | if np.random.random() < link_propensities[j]: 328 | row.append(i) 329 | col.append(j) 330 | if np.random.random() < link_accs[j]: 331 | val.append(1 if gold[i-1] == gold[i] else 
-1) 332 | else: 333 | val.append(-1 if gold[i-1] == gold[i] else 1) 334 | 335 | links = sparse.coo_matrix((val, (row, col)), shape=(total_len, num_link_funcs)) 336 | 337 | return labels, links, seq_starts, gold 338 | 339 | 340 | if __name__ == '__main__': 341 | unittest.main() 342 | -------------------------------------------------------------------------------- /test/test_naive_bayes.py: -------------------------------------------------------------------------------- 1 | from labelmodels import NaiveBayes 2 | import numpy as np 3 | from scipy import sparse 4 | import util 5 | import torch 6 | import unittest 7 | 8 | 9 | class TestNaiveBayes(unittest.TestCase): 10 | 11 | def setUp(self): 12 | np.random.seed(0) 13 | 14 | def tearDown(self): 15 | pass 16 | 17 | def test_estimate_label_model_binary(self): 18 | m = 25000 19 | n = 5 20 | accuracies = np.array([[.9, .8], 21 | [.6, .7], 22 | [.5, .5], 23 | [.7, .6], 24 | [.8, .8]]) 25 | propensities = np.array([.3] * n) 26 | class_balance = np.array([.4, .6]) 27 | 28 | labels_train, gold_train = _generate_data( 29 | m, n, accuracies, propensities, class_balance) 30 | 31 | model = NaiveBayes(2, n, acc_prior=0.0, balance_prior=0.0) 32 | model.estimate_label_model(labels_train) 33 | 34 | for j in range(n): 35 | for k in range(2): 36 | diff = accuracies[j, k] - model.get_accuracies()[j, k] 37 | self.assertAlmostEqual(diff, 0.0, places=1) 38 | for j in range(n): 39 | diff = propensities[j] - model.get_propensities()[j] 40 | self.assertAlmostEqual(diff, 0.0, places=1) 41 | for k in range(len(class_balance)): 42 | diff = class_balance[k] - model.get_class_balance()[k] 43 | self.assertAlmostEqual(diff, 0.0, places=1) 44 | 45 | def test_estimate_label_model_multiclass(self): 46 | m = 25000 47 | n = 5 48 | accuracies = np.array([[.9, .8, .5], 49 | [.6, .7, .5], 50 | [.5, .5, .9], 51 | [.7, .6, .7], 52 | [.8, .8, .7]]) 53 | propensities = np.array([.2] * n) 54 | class_balance = np.array([.3, .4, .3]) 55 | 56 | labels_train, 
gold_train = _generate_data( 57 | m, n, accuracies, propensities, class_balance) 58 | 59 | model = NaiveBayes(3, n, acc_prior=0.0, balance_prior=0.0) 60 | model.estimate_label_model(labels_train) 61 | 62 | for j in range(n): 63 | for k in range(2): 64 | diff = accuracies[j, k] - model.get_accuracies()[j, k] 65 | self.assertAlmostEqual(diff, 0.0, places=1) 66 | for j in range(n): 67 | diff = propensities[j] - model.get_propensities()[j] 68 | self.assertAlmostEqual(diff, 0.0, places=1) 69 | for k in range(len(class_balance)): 70 | diff = class_balance[k] - model.get_class_balance()[k] 71 | self.assertAlmostEqual(diff, 0.0, places=1) 72 | 73 | def test_get_most_probable_labels_binary(self): 74 | m = 10000 75 | n = 5 76 | k = 2 77 | 78 | model = NaiveBayes(k, n) 79 | with torch.no_grad(): 80 | model.class_balance[0] = 0 81 | model.class_balance[1] = 0.5 82 | for i in range(n): 83 | model.propensity[i] = 2 84 | for j in range(k): 85 | model.accuracy[i, j] = 2 86 | 87 | labels_train, gold_train = _generate_data( 88 | m, n, 89 | model.get_accuracies(), 90 | model.get_propensities(), 91 | model.get_class_balance()) 92 | 93 | # Checks label inference 94 | labels = model.get_most_probable_labels(labels_train) 95 | correct = 0 96 | for i in range(m): 97 | if gold_train[i] == labels[i]: 98 | correct += 1 99 | 100 | self.assertGreater(float(correct) / m, .95) 101 | 102 | def test_get_most_probable_labels_multiclass(self): 103 | m = 10000 104 | n = 5 105 | k = 3 106 | 107 | model = NaiveBayes(k, n) 108 | with torch.no_grad(): 109 | model.class_balance[0] = 0 110 | model.class_balance[1] = 0.5 111 | model.class_balance[2] = 0.5 112 | for i in range(n): 113 | model.propensity[i] = 2 114 | for j in range(k): 115 | model.accuracy[i, j] = 2 116 | 117 | labels_train, gold_train = _generate_data( 118 | m, n, 119 | model.get_accuracies(), 120 | model.get_propensities(), 121 | model.get_class_balance()) 122 | 123 | # Checks label inference 124 | labels = 
model.get_most_probable_labels(labels_train) 125 | correct = 0 126 | for i in range(m): 127 | if gold_train[i] == labels[i]: 128 | correct += 1 129 | 130 | self.assertGreater(float(correct) / m, .95) 131 | 132 | def test_estimate_model_input_formats(self): 133 | m = 1000 134 | n = 3 135 | 136 | accuracies = np.array([[.8, .8, .8], 137 | [.8, .8, .8], 138 | [.8, .8, .8]]) 139 | propensities = np.array([.5] * n) 140 | class_balance = np.array([.1, .1, .8]) 141 | 142 | labels_train, _ = _generate_data( 143 | m, n, accuracies, propensities, class_balance) 144 | 145 | # Trains the model on the generated data 146 | model = NaiveBayes(3, n) 147 | model.estimate_label_model(labels_train) 148 | accuracies = model.get_accuracies() 149 | propensities = model.get_propensities() 150 | class_balance = model.get_class_balance() 151 | 152 | # Checks that other input formats work and do not change the results 153 | for data in util.get_all_formats(labels_train): 154 | model = NaiveBayes(3, n) 155 | model.estimate_label_model(data) 156 | diff = np.sum(np.abs(accuracies - model.get_accuracies())) 157 | self.assertAlmostEqual(float(diff), 0.0) 158 | diff = np.sum(np.abs(propensities - model.get_propensities())) 159 | self.assertAlmostEqual(float(diff), 0.0) 160 | diff = np.sum(np.abs(class_balance - model.get_class_balance())) 161 | self.assertAlmostEqual(float(diff), 0.0) 162 | 163 | def test_get_label_input_formats(self): 164 | m = 1000 165 | n = 3 166 | 167 | accuracies = np.array([[.8, .8, .8], 168 | [.8, .8, .8], 169 | [.8, .8, .8]]) 170 | propensities = np.array([.5] * n) 171 | class_balance = np.array([.1, .1, .8]) 172 | 173 | labels_train, _ = _generate_data( 174 | m, n, accuracies, propensities, class_balance) 175 | 176 | # Gets the label distribution for the generated data 177 | model = NaiveBayes(3, n, init_acc=0.8) 178 | distribution = model.get_label_distribution(labels_train) 179 | 180 | # Checks that other input formats work and do not change the results 181 | for data 
in util.get_all_formats(labels_train): 182 | model = NaiveBayes(3, n, init_acc=0.8) 183 | new_distribution = model.get_label_distribution(data) 184 | diff = np.sum(np.abs(distribution - new_distribution)) 185 | self.assertAlmostEqual(float(diff), 0.0) 186 | 187 | 188 | def _generate_data(m, n, accuracies, propensities, class_balance): 189 | gold = np.zeros((m,), dtype=np.int16) 190 | row = [] 191 | col = [] 192 | val = [] 193 | 194 | for i in range(m): 195 | k = np.argmax(np.random.multinomial(1, class_balance)) 196 | gold[i] = k + 1 197 | for j in range(n): 198 | if np.random.random() < propensities[j]: 199 | row.append(i) 200 | col.append(j) 201 | if np.random.random() < accuracies[j, k]: 202 | val.append(gold[i]) 203 | else: 204 | p_mistake = 1 / (len(class_balance) - 1) 205 | dist = [p_mistake] * (len(class_balance) + 1) 206 | dist[0] = 0 207 | dist[gold[i]] = 0 208 | val.append(np.argmax(np.random.multinomial(1, dist))) 209 | 210 | labels = sparse.coo_matrix((val, (row, col)), shape=(m, n)) 211 | return labels, gold 212 | 213 | 214 | if __name__ == '__main__': 215 | unittest.main() 216 | -------------------------------------------------------------------------------- /test/test_partial_labels.py: -------------------------------------------------------------------------------- 1 | from labelmodels import PartialLabelModel 2 | import numpy as np 3 | from scipy import sparse 4 | import torch 5 | import unittest 6 | from random import sample 7 | from copy import deepcopy as dc 8 | 9 | 10 | class Experiment: 11 | def __init__(self, name, num_classes, label_partition, lm_annot_votes, lm_train_votes, lm_annot_labels=None): 12 | self.name = name 13 | self.label_partition = label_partition 14 | self.lm_annot_votes = lm_annot_votes 15 | self.lm_train_votes = lm_train_votes 16 | self.lm_annot_labels = lm_annot_labels 17 | self.num_classes = num_classes 18 | self.num_df = len(label_partition) 19 | 20 | def set_soft_labels(self, soft_labels): 21 | pass 22 | 23 | 24 | class 
LMTask: 25 | def __init__(self, name, num_classes, label_partition, preset_classbalance=None, device='cuda:0'): 26 | self.name = name 27 | 28 | self.labelmodel = PartialLabelModel(num_classes=num_classes, 29 | label_partition=label_partition, 30 | preset_classbalance=preset_classbalance, 31 | device=device) 32 | 33 | def annotate(self, lm_annot_votes, lm_train_votes=None): 34 | if lm_train_votes is not None: 35 | self.labelmodel.estimate_label_model(lm_train_votes) 36 | return self.labelmodel.estimate_label_model(lm_annot_votes) 37 | 38 | def get_accuracy(self): 39 | return self.labelmodel.get_accuracies() 40 | 41 | def get_propensity(self): 42 | return self.labelmodel.get_propensities() 43 | 44 | def get_class_balance(self): 45 | return self.labelmodel.get_class_balance() 46 | 47 | 48 | def Workflow(experimental_data): 49 | lm_task = LMTask('test', num_classes=experimental_data.num_classes, 50 | label_partition=experimental_data.label_partition, 51 | device='cuda:0') 52 | 53 | lm_annot_soft_labels = lm_task.annotate(lm_annot_votes=experimental_data.lm_annot_votes, 54 | lm_train_votes=experimental_data.lm_train_votes) 55 | output = [] 56 | output.append(lm_task.get_accuracy()) 57 | output.append(lm_task.get_class_balance()) 58 | output.append(lm_task.get_propensity()) 59 | return output 60 | 61 | 62 | def setup(): 63 | simple_label_partition = { 64 | 0: [[1], [2, 3]], 65 | 1: [[2], [1, 3]], 66 | 2: [[3], [1, 2]] 67 | } 68 | num_sources = len(simple_label_partition) 69 | num_classes = 4 70 | num_annot_inst = 4096 * 16 71 | 72 | labelmodel_annotation_votes = np.random.randint(2, size=(num_annot_inst, num_sources)) 73 | labelmodel_training_votes = labelmodel_annotation_votes 74 | labelmodel_annotation_labels = np.random.randint(num_classes, size=(num_annot_inst, 1)) + 1 75 | test_data = Experiment('simple-tests', 76 | num_classes=num_classes, 77 | label_partition=simple_label_partition, 78 | lm_annot_votes=labelmodel_annotation_votes, 79 | 
lm_train_votes=labelmodel_training_votes, 80 | lm_annot_labels=labelmodel_annotation_labels) 81 | 82 | return test_data 83 | 84 | 85 | def setup_test(label_partition, accuracies, class_balance, m=4096*8, abstention=None): 86 | votes, gold = _generate_data(m, label_partition, accuracies, class_balance, abstention=abstention) 87 | 88 | return votes, gold 89 | 90 | 91 | def close_estimation(model_acc, true_acc, thresh=0.05, verbose=True): 92 | # assert model_acc.shape == true_acc.shape 93 | res = torch.allclose(torch.Tensor(model_acc), torch.Tensor(true_acc), atol=thresh) 94 | if verbose: 95 | print(res) 96 | return res 97 | 98 | 99 | def actual_cb(gold): 100 | unique, counts = np.unique(gold, return_counts=True) 101 | return counts / sum(counts) 102 | 103 | 104 | class TestPartialLabelModel(unittest.TestCase): 105 | def test_general_accuracy_recovery_0(self): 106 | print('Testing Accuracy Recovery Rate for PLM - 0') 107 | true_cb_0 = [1 / 3, 1 / 3, 1 / 3] 108 | true_acc_0 = np.array( 109 | [[.8, .7, .6], 110 | [.75, .7, .7], 111 | [.5, .7, .65], 112 | [.8, .8, .75], 113 | [.9, .7, .8]]) 114 | label_partition = { 115 | 0: [[1], [2, 3]], 116 | 1: [[1], [2, 3]], 117 | 2: [[1, 2], [3]], 118 | 3: [[1, 2], [3]], 119 | 4: [[1, 3], [2]] 120 | } 121 | votes, gold = setup_test(label_partition, true_acc_0, true_cb_0) 122 | test_data_0 = Experiment('acc-tests-0', 123 | num_classes=3, 124 | label_partition=label_partition, 125 | lm_annot_votes=votes, 126 | lm_train_votes=votes, 127 | lm_annot_labels=gold) 128 | 129 | acc_0, cb_0, _ = Workflow(experimental_data=test_data_0) 130 | 131 | #print(acc_0 - true_acc_0) 132 | self.assertTrue(close_estimation(acc_0, true_acc_0)) 133 | self.assertTrue(close_estimation(cb_0, true_cb_0)) 134 | 135 | def test_general_accuracy_recovery_1(self): 136 | print('Testing Accuracy Recovery Rate for PLM - 1') 137 | true_cb_1 = [.5, .3, .2] 138 | true_acc_1 = np.array( 139 | [[.8, .7, .6], 140 | [.8, .7, .6], 141 | [.5, .9, .6], 142 | [.8, .7, .6], 143 
| [.9, .7, .6]]) 144 | label_partition = { 145 | 0: [[1], [2], [3]], 146 | 1: [[1], [2, 3]], 147 | 2: [[1, 2], [3]], 148 | 3: [[1], [2], [3]], 149 | 4: [[1], [2], [3]] 150 | } 151 | votes, gold = setup_test(label_partition, true_acc_1, true_cb_1) 152 | test_data_1 = Experiment('acc-tests-1', 153 | num_classes=3, 154 | label_partition=label_partition, 155 | lm_annot_votes=votes, 156 | lm_train_votes=votes, 157 | lm_annot_labels=gold) 158 | 159 | acc_1, cb_1, _ = Workflow(experimental_data=test_data_1) 160 | 161 | self.assertTrue(close_estimation(acc_1, true_acc_1)) 162 | self.assertTrue(close_estimation(cb_1, true_cb_1)) 163 | 164 | def test_general_accuracy_recovery_2(self): 165 | print('Testing Accuracy Recovery Rate for PLM - 2') 166 | true_cb_2 = [.4, .2, .4] 167 | true_acc_2 = np.array( 168 | [[.8, .7, .6], 169 | [.8, .8, .7], 170 | [.5, .7, .9], 171 | [.8, .6, .7], 172 | [.9, .7, .6], 173 | [.8, .5, .6]]) 174 | label_partition = { 175 | 0: [[1], [2], [3]], 176 | 1: [[2], [1, 3]], 177 | 2: [[1, 2], [3]], 178 | 3: [[1], [2], [3]], 179 | 4: [[1], [2], [3]], 180 | 5: [[3, 2], [1]] 181 | } 182 | votes, gold = setup_test(label_partition, true_acc_2, true_cb_2) 183 | 184 | test_data_2 = Experiment('acc-tests-2', 185 | num_classes=3, 186 | label_partition=label_partition, 187 | lm_annot_votes=votes, 188 | lm_train_votes=votes, 189 | lm_annot_labels=gold) 190 | 191 | acc_2, cb_2, _ = Workflow(experimental_data=test_data_2) 192 | print(acc_2 - true_acc_2) 193 | print(true_cb_2 - cb_2) 194 | self.assertTrue(close_estimation(acc_2, true_acc_2)) 195 | self.assertTrue(close_estimation(cb_2, true_cb_2)) 196 | 197 | def test_general_accuracy_recovery_3(self): 198 | print('Testing Accuracy Recovery Rate for PLM - 3 with Abstention') 199 | true_cb_3 = [.4, .2, .4] 200 | true_acc_3 = np.array( 201 | [[.8, .7, .6], 202 | [.8, .7, .7], 203 | [.5, .7, .9], 204 | [.8, .6, .7], 205 | [.9, .8, .6], 206 | [.8, .5, .8]]) 207 | label_partition = { 208 | 0: [[1], [2], [3]], 209 | 1: [[2], 
[1, 3]], 210 | 2: [[1, 2], [3]], 211 | 3: [[1], [2], [3]], 212 | 4: [[1], [2, 3]], 213 | 5: [[3, 2], [1]] 214 | } 215 | 216 | abstention = [0.8, 0.9, 0.8, 0.7, 0.8, 0.9] 217 | votes, gold = setup_test(label_partition, true_acc_3, true_cb_3, 218 | abstention=abstention) 219 | 220 | tv = dc(votes) 221 | tv[tv == -1] = 100 222 | tv[tv < 100] = 0 223 | tv = tv / 100 224 | est_abst = np.mean(tv, axis=0) 225 | # print(est_abst) 226 | 227 | test_data_3 = Experiment('acc-tests-2', 228 | num_classes=3, 229 | label_partition=label_partition, 230 | lm_annot_votes=votes, 231 | lm_train_votes=votes, 232 | lm_annot_labels=gold) 233 | acc_3, cb_3, prp_3 = Workflow(test_data_3) 234 | 235 | self.assertTrue(close_estimation(acc_3, true_acc_3)) 236 | self.assertTrue(close_estimation(cb_3, true_cb_3)) 237 | self.assertTrue(close_estimation(prp_3, abstention)) 238 | 239 | def test_general_accuracy_recovery_4(self): 240 | print('Testing Accuracy Recovery Rate for PLM - 4 with Abstention') 241 | true_cb_3 = [.4, .2, .4] 242 | true_acc_3 = np.array( 243 | [[.8, .7, .6], 244 | [.8, .6, .7], 245 | [.5, .7, .9], 246 | [.8, .6, .7], 247 | [.9, .8, .6], 248 | [.8, .5, .8]]) 249 | label_partition = { 250 | 0: [[1], [2], [3]], 251 | 1: [[2], [1, 3]], 252 | 2: [[1, 2], [3]], 253 | 3: [[1], [2], [3]], 254 | 4: [[1], [2, 3]], 255 | 5: [[3, 2], [1]] 256 | } 257 | 258 | abstention = [0.8, 0.9, 0.8, 0.7, 0.8, 0.9] 259 | votes, gold = setup_test(label_partition, true_acc_3, true_cb_3, 260 | abstention=abstention) 261 | 262 | tv = dc(votes) 263 | tv[tv == -1] = 100 264 | tv[tv < 100] = 0 265 | tv = tv / 100 266 | est_abst = np.mean(tv, axis=0) 267 | print(est_abst) 268 | 269 | test_data_3 = Experiment('acc-tests-4', 270 | num_classes=3, 271 | label_partition=label_partition, 272 | lm_annot_votes=votes, 273 | lm_train_votes=votes, 274 | lm_annot_labels=gold) 275 | acc_3, cb_3, prp_3 = Workflow(test_data_3) 276 | self.assertTrue(close_estimation(acc_3, true_acc_3)) 277 | 
self.assertTrue(close_estimation(cb_3, true_cb_3)) 278 | self.assertTrue(close_estimation(prp_3, abstention)) 279 | 280 | 281 | def _generate_data(m, label_partition, accuracies, class_balance, abstention=None): 282 | """ 283 | Generate synthetic data 284 | 285 | :param m: number of examples 286 | :param n: number of sources 287 | :param label_partition: feature id clustering 288 | :param accuracies: n x k matrix of accuracies, where k is number of classes 289 | :param class_balance: k-dim vector representing prior over classes 290 | :param abstention: n-dim vector representing prob of not abstention 291 | :return: m x n matrix of features, m-dim vector of gold class labels 292 | """ 293 | n = len(label_partition) 294 | gold = np.zeros((m,), dtype=np.int16) 295 | votes = np.zeros((m, n), dtype=np.int16) 296 | 297 | for i in range(m): 298 | k = np.argmax(np.random.multinomial(1, class_balance)) 299 | gold[i] = k + 1 300 | for j in range(n): 301 | # Collects correct and incorrect clusters 302 | correct = [] 303 | incorrect = [] 304 | for cid, cluster in enumerate(label_partition[j]): 305 | if k + 1 in cluster: 306 | correct.append(cid+1) 307 | else: 308 | incorrect.append(cid+1) 309 | if np.random.random() < accuracies[j, k]: 310 | votes[i, j] = np.random.choice(correct) 311 | else: 312 | votes[i, j] = np.random.choice(incorrect) 313 | 314 | if abstention is not None: 315 | for idx, prob in enumerate(abstention): 316 | votes[np.array(sample(range(m), int((1 - prob) * m))), idx] = 0 317 | 318 | return votes, gold 319 | 320 | 321 | if __name__ == '__main__': 322 | unittest.main() 323 | -------------------------------------------------------------------------------- /test/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | def get_all_formats(matrix): 6 | """Converts a scipy sparse matrix to a list of copies in formats that should 7 | be supported 8 | 9 | :param matrix: labeling 
function output matrix in a scipy sparse format 10 | :return: list of labeling function output matrices 11 | """ 12 | other_formats = [ 13 | matrix.todense(), 14 | matrix.todense().tolist(), 15 | matrix.todense().astype(np.float64), 16 | matrix.tocoo(), 17 | matrix.tocsc(), 18 | matrix.todia(), 19 | matrix.todok(), 20 | matrix.tolil(), 21 | torch.tensor(matrix.todense()) 22 | ] 23 | return other_formats 24 | --------------------------------------------------------------------------------