├── .github └── workflows │ └── pkg-test.yml ├── .gitignore ├── LICENSE ├── README.md ├── labelmodels ├── __init__.py ├── hmm.py ├── label_model.py ├── linked_hmm.py ├── naive_bayes.py └── partial_labels.py ├── setup.py └── test ├── test_hmm.py ├── test_linked_hmm.py ├── test_naive_bayes.py ├── test_partial_labels.py └── util.py /.github/workflows/pkg-test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.8"] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | pip install . 33 | - name: Lint with flake8 34 | run: | 35 | # stop the build if there are Python syntax errors or undefined names 36 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 37 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 38 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 39 | - name: Test with pytest 40 | run: | 41 | pytest test/ 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # PyCharm 2 | .idea 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 
| /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | .idea/ 110 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Label Models 2 | 3 | [![Package Test Status](https://github.com/BatsResearch/labelmodels/actions/workflows/pkg-test.yml/badge.svg)](https://github.com/BatsResearch/labelmodels/actions/workflows/pkg-test.yml) 4 | 5 | Lightweight implementations of generative label models for weakly supervised machine learning 6 | 7 | # Example Usage - Naive Bayes Model 8 | ```python 9 | # Let votes be an m x n matrix where m is the number of data examples, n is the 10 | # number of label sources, and each element is in the set {0, 1, ..., k}, where 11 | # k is the number of classes. If votes_{ij} is 0, it means that label source j 12 | # abstains from voting on example i. 
13 | 14 | # As an example, we create a random votes matrix for binary classification with 15 | # 1000 examples and 5 label sources 16 | import numpy as np 17 | votes = np.random.randint(0, 3, size=(1000, 5)) 18 | 19 | # We now can create a Naive Bayes generative model to estimate the accuracies 20 | # of these label sources 21 | from labelmodels import NaiveBayes 22 | 23 | # We initialize the model by specifying that there are 2 classes (binary 24 | # classification) and 5 label sources 25 | model = NaiveBayes(num_classes=2, num_lfs=5) 26 | 27 | # Next, we estimate the model's parameters 28 | model.estimate_label_model(votes) 29 | print(model.get_accuracies()) 30 | 31 | # We can obtain a posterior distribution over the true labels 32 | labels = model.get_label_distribution(votes) 33 | ``` 34 | 35 | 36 | 37 | # Example Usage - Partial Label Model 38 | ```python 39 | # Let votes be an m x n matrix where m is the number of data examples, n is the 40 | # number of label sources, and each element is in the set {0, 1, ..., k_l}, where 41 | # k_l is the number of label partitions for partial labeling functions PLF_{l}. If votes_{ij} is 0, 42 | # it means that partial label source j abstains from voting on example i. 43 | 44 | # As an example, we create a random votes matrix for classification with 45 | # 1000 examples and 3 label sources 46 | import numpy as np 47 | import torch 48 | 49 | # label_partition is a table that specifies 0-indexed PLF's label partition configurations, for this brief example, 50 | # we have 3 PLFs each separating the 3-class label space into two partitions. For 0-th PLF, it partitions the label space 51 | # into \{1\} and \{2,3\}. Notice the class label is 1-indexed. 
52 | # The label_partition configures the label partitions mapping in format as {PLF's index: [partition_1, partition_2, ..., partition_{k_l}]}
53 | simple_label_partition = {
54 |     0: [[1], [2, 3]],
55 |     1: [[2], [1, 3]],
56 |     2: [[3], [1, 2]]
57 | }
58 | num_sources = len(simple_label_partition)
59 | num_classes = 3
60 | votes = np.random.randint(0, 3, size=(1000, 3))
61 | 
62 | device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
63 | 
64 | # We can now create a partial label model to estimate the accuracies
65 | # of these partial labeling functions
66 | from labelmodels import PartialLabelModel
67 | # We initialize the model by specifying that there are 3 classes and 3
68 | # partial labeling functions
69 | model = PartialLabelModel(num_classes=num_classes,
70 |                           label_partition=simple_label_partition,
71 |                           preset_classbalance=None,
72 |                           device=device)
73 | # Next, we estimate the model's parameters
74 | model.estimate_label_model(votes)
75 | print(model.get_accuracies())
76 | 
77 | # We can obtain a posterior distribution over the true labels
78 | labels = model.get_label_distribution(votes)
79 | ```
80 | 
81 | ## Citation
82 | 
83 | Please cite the following paper if you are using our tool. Thank you!
84 | 
85 | [Esteban Safranchik](https://www.linkedin.com/in/safranchik/), Shiying Luo, [Stephen H. Bach](http://cs.brown.edu/people/sbach/). "Weakly Supervised Sequence Tagging From Noisy Rules". In 34th AAAI Conference on Artificial Intelligence, 2020.
86 | 
87 | ```
88 | @inproceedings{safranchik2020weakly,
89 |     title = {Weakly Supervised Sequence Tagging From Noisy Rules},
90 |     author = {Safranchik, Esteban and Luo, Shiying and Bach, Stephen H.},
91 |     booktitle = {AAAI},
92 |     year = 2020,
93 | }
94 | ```
95 | 
96 | [Peilin Yu](https://www.yupeilin.com), [Tiffany Ding](https://tiffanyding.github.io/)
97 | , [Stephen H. Bach](http://cs.brown.edu/people/sbach/). "Learning from Multiple Noisy Partial Labelers".
Artificial
98 | Intelligence and Statistics (AISTATS), 2022.
99 | 
100 | ```
101 | @inproceedings{yu2022nplm,
102 |     title = {Learning from Multiple Noisy Partial Labelers},
103 |     author = {Yu, Peilin and Ding, Tiffany and Bach, Stephen H.},
104 |     booktitle = {Artificial Intelligence and Statistics (AISTATS)},
105 |     year = 2022,
106 | }
107 | ```
--------------------------------------------------------------------------------
/labelmodels/__init__.py:
--------------------------------------------------------------------------------
from .hmm import HMM
from .label_model import LearningConfig
from .linked_hmm import LinkedHMM
from .naive_bayes import NaiveBayes
from .partial_labels import PartialLabelModel
--------------------------------------------------------------------------------
/labelmodels/hmm.py:
--------------------------------------------------------------------------------
from .label_model import ClassConditionalLabelModel, LearningConfig, init_random
import numpy as np
from scipy import sparse
import torch
from torch import nn


class HMM(ClassConditionalLabelModel):
    """A generative label model that treats a sequence of true class labels as a
    Markov chain, as in a hidden Markov model, and treats all labeling functions
    as conditionally independent given the corresponding true class label, as
    in a Naive Bayes model.

    Proposed for crowdsourced sequence annotations in: A. T. Nguyen, B. C.
    Wallace, J. J. Li, A. Nenkova, and M. Lease. Aggregating and Predicting
    Sequence Labels from Crowd Annotations. In Annual Meeting of the Association
    for Computational Linguistics, 2017.
    """

    def __init__(self, num_classes, num_lfs, init_acc=.9, acc_prior=1,
                 balance_prior=1):
        """Constructor.

        Initializes labeling function accuracies using optional argument and all
        other model parameters uniformly.

        :param num_classes: number of target classes, i.e., binary
                            classification = 2
        :param num_lfs: number of labeling functions to model
        :param init_acc: initial estimated labeling function accuracy, must
                            be a float in [0,1]
        :param acc_prior: strength of regularization of estimated labeling
                          function accuracies toward their initial values
        """
        super().__init__(num_classes, num_lfs, init_acc, acc_prior)

        # Unnormalized log-potentials; normalized lazily via logsumexp in
        # _get_norm_start_balance() / _get_norm_transitions()
        self.start_balance = nn.Parameter(torch.zeros([num_classes]))
        self.transitions = nn.Parameter(torch.zeros([num_classes, num_classes]))

        # Weight on the negative-entropy regularizer added in
        # _get_regularization_loss()
        self.balance_prior = balance_prior

    def forward(self, votes, seq_starts):
        """
        Computes log likelihood of sequence of labeling function outputs for
        each (sequence) example in batch.

        For efficiency, this function prefers that votes is an instance of
        scipy.sparse.coo_matrix. You can avoid a conversion by passing in votes
        with this class.

        :param votes: m x n matrix in {0, ..., k}, where m is the sum of the
                      lengths of the sequences in the batch, n is the number of
                      labeling functions and k is the number of classes
        :param seq_starts: vector of length l of row indices in votes indicating
                           the start of each sequence, where l is the number of
                           sequences in the batch. So, votes[seq_starts[i]] is
                           the row vector of labeling function outputs for the
                           first element in the ith sequence
        :return: vector of length l, where element is the log-likelihood of the
                 corresponding sequence of outputs in votes
        """
        jll = self._get_labeling_function_likelihoods(votes)
        norm_start_balance = self._get_norm_start_balance()
        norm_transitions = self._get_norm_transitions()
        # Forward-algorithm recursion in log space: after this loop,
        # jll[i, c] is the joint log-likelihood of the observations up to
        # position i of its sequence with label c at position i.
        for i in range(0, votes.shape[0]):
            if i in seq_starts:
                jll[i] += norm_start_balance
            else:
                joint_class_pair = jll[i-1, :].clone().unsqueeze(1)
                joint_class_pair = joint_class_pair.repeat(1, self.num_classes)
                joint_class_pair += norm_transitions

                jll[i] += joint_class_pair.logsumexp(0)
        # Each sequence ends one row before the next one starts; the final
        # sequence ends on the last row. The first start (row 0) produces a
        # spurious end index of -1, which is dropped here.
        # NOTE(review): remove(-1) raises ValueError if seq_starts does not
        # contain 0 -- callers are expected to index the first sequence at 0.
        seq_ends = [x - 1 for x in seq_starts] + [votes.shape[0]-1]
        seq_ends.remove(-1)
        # Marginal log-likelihood of each sequence: sum over the final label.
        mll = torch.logsumexp(jll[seq_ends], dim=1)
        return mll

    def estimate_label_model(self, votes, seq_starts, config=None):
        """Estimates the parameters of the label model based on observed
        labeling function outputs.

        Note that a minibatch's size refers to the number of sequences in the
        minibatch.

        :param votes: m x n matrix in {0, ..., k}, where m is the sum of the
                      lengths of the sequences in the data, n is the number of
                      labeling functions and k is the number of classes
        :param seq_starts: vector of length l of row indices in votes indicating
                           the start of each sequence, where l is the number of
                           sequences in the batch. So, votes[seq_starts[i]] is
                           the row vector of labeling function outputs for the
                           first element in the ith sequence
        :param config: optional LearningConfig instance. If None, initialized
                       with default constructor
        """
        if config is None:
            config = LearningConfig()

        # Initializes random seed
        init_random(config.random_seed)

        # Converts to CSR and integers to standardize input
        votes = sparse.csr_matrix(votes, dtype=np.int32)
        seq_starts = np.array(seq_starts, dtype=np.int32)

        batches = self._create_minibatches(
            votes, seq_starts, config.batch_size, shuffle_seqs=True)

        self._do_estimate_label_model(batches, config)

    def get_most_probable_labels(self, votes, seq_starts):
        """
        Computes the most probable underlying sequence of labels given function
        outputs

        :param votes: m x n matrix in {0, ..., k}, where m is the sum of the
                      lengths of the sequences in the data, n is the number of
                      labeling functions and k is the number of classes
        :param seq_starts: vector of length l of row indices in votes indicating
                           the start of each sequence, where l is the number of
                           sequences in the batch. So, votes[seq_starts[i]] is
                           the row vector of labeling function outputs for the
                           first element in the ith sequence
        :return: vector of length m, where element is the most likely predicted labels
        """
        # Converts to CSR and integers to standardize input
        votes = sparse.csr_matrix(votes, dtype=np.int32)
        seq_starts = np.array(seq_starts, dtype=np.int32)

        out = np.ndarray((votes.shape[0],), dtype=np.int32)

        offset = 0
        # Note: the loop variables shadow the full votes/seq_starts; inside the
        # loop they refer to the current minibatch (with batch-relative starts).
        for votes, seq_starts in self._create_minibatches(votes, seq_starts, 32):
            jll = self._get_labeling_function_likelihoods(votes)
            norm_start_balance = self._get_norm_start_balance()
            norm_transitions = self._get_norm_transitions()

            # Viterbi forward pass: jll[i, c] becomes the max joint
            # log-likelihood ending with label c at position i; bt stores the
            # argmax backpointers (best previous label for each current label).
            T = votes.shape[0]
            bt = torch.zeros([T, self.num_classes])
            for i in range(0, T):
                if i in seq_starts:
                    jll[i] += norm_start_balance
                else:
                    p = jll[i-1].clone().unsqueeze(1).repeat(
                        1, self.num_classes) + norm_transitions
                    jll[i] += torch.max(p, dim=0)[0]
                    bt[i, :] = torch.argmax(p, dim=0)

            # Backtrace from the end of the batch: at each sequence end pick
            # the argmax label, then follow backpointers toward the start.
            seq_ends = [x - 1 for x in seq_starts] + [votes.shape[0] - 1]
            res = []
            j = T-1
            while j >= 0:
                if j in seq_ends:
                    res.append(torch.argmax(jll[j, :]).item())
                if j in seq_starts:
                    # Label for a start position was already appended by the
                    # previous iteration's backpointer lookup.
                    j -= 1
                    continue
                res.append(int(bt[j, res[-1]].item()))
                j -= 1
            # Internal labels are 0-indexed; reported labels are 1-indexed.
            res = [x + 1 for x in res]
            res.reverse()

            for i in range(len(res)):
                out[offset + i] = res[i]
            offset += len(res)
        return out

    def get_label_distribution(self, votes, seq_starts):
        """Returns the unary and pairwise marginals over true labels estimated
        by the model.

        :param votes: m x n matrix in {0, ..., k}, where m is the sum of the
                      lengths of the sequences in the data, n is the number of
                      labeling functions and k is the number of classes
        :param seq_starts: vector of length l of row indices in votes indicating
                           the start of each sequence, where l is the number of
                           sequences in the batch. So, votes[seq_starts[i]] is
                           the row vector of labeling function outputs for the
                           first element in the ith sequence
        :return: p_unary, p_pairwise where p_unary is a m x k matrix representing
                 the marginal distributions over individual labels, and p_pairwise
                 is a m x k x k tensor representing pairwise marginals over the
                 ith and (i+1)th labels. For the last element in a sequence, the
                 k x k matrix will be all zeros.
        """
        # Converts to CSR and integers to standardize input
        votes = sparse.csr_matrix(votes, dtype=np.int32)
        seq_starts = np.array(seq_starts, dtype=np.int32)

        out_unary = np.zeros((votes.shape[0], self.num_classes))
        out_pairwise = np.zeros((votes.shape[0], self.num_classes, self.num_classes))

        offset = 0
        # As above, the loop variables shadow the full inputs with per-batch views.
        for votes, seq_starts in self._create_minibatches(votes, seq_starts, 32):
            # Computes observation likelihoods and initializes alpha and beta messages
            cll = self._get_labeling_function_likelihoods(votes)
            alpha = torch.zeros(cll.shape)
            beta = torch.zeros(cll.shape)

            # Computes alpha
            next_seq = 0
            for i in range(votes.shape[0]):
                if next_seq == len(seq_starts) or i < seq_starts[next_seq]:
                    # i is not the start of a sequence
                    temp = alpha[i-1].unsqueeze(1).repeat(1, self.num_classes)
                    temp = temp + self._get_norm_transitions()
                    alpha[i] = cll[i] + temp.logsumexp(0)
                else:
                    # i is the start of a sequence
                    alpha[i] = cll[i] + self._get_norm_start_balance()
                    next_seq += 1

            # Computes beta
            # NOTE(review): beta at sequence ends is initialized to 1 rather
            # than 0 (= log 1) in log space; the constant offset cancels when
            # the unary and pairwise marginals are normalized below.
            this_seq = seq_starts.shape[0] - 1
            beta[-1, :] = 1
            for i in range(votes.shape[0] - 2, -1, -1):
                if i == seq_starts[this_seq] - 1:
                    # End of sequence
                    beta[i, :] = 1
                    this_seq -= 1
                else:
                    temp = beta[i+1] + cll[i+1]
                    temp = temp.unsqueeze(1).repeat(1, self.num_classes)
                    temp = temp + self._get_norm_transitions().transpose(0, 1)
                    beta[i, :] = temp.logsumexp(0)

            # Computes p_unary
            p_unary = alpha + beta
            temp = p_unary.logsumexp(1).unsqueeze(1).repeat(1, self.num_classes)
            p_unary = p_unary - temp
            for i in range(p_unary.shape[0]):
                # Max-shifted exponentiation for numerical stability.
                p = torch.exp(p_unary[i, :] - torch.max(p_unary[i, :]))
                out_unary[offset + i, :] = (p / p.sum()).detach()

            # Computes p_pairwise
            p_pairwise = torch.zeros(
                (votes.shape[0], self.num_classes, self.num_classes))
            for i in range(p_pairwise.shape[0] - 1):
                p_pairwise[i, :, :] = self._get_norm_transitions()
                p_pairwise[i] += alpha[i].unsqueeze(1).repeat(1, self.num_classes)
                p_pairwise[i] += cll[i+1].unsqueeze(0).repeat(self.num_classes, 1)
                p_pairwise[i] += beta[i+1].unsqueeze(0).repeat(self.num_classes, 1)

                denom = p_pairwise[i].view(-1).logsumexp(0)
                denom = denom.unsqueeze(0).unsqueeze(1)
                denom = denom.repeat(self.num_classes, self.num_classes)
                p_pairwise[i] -= denom

                out_pairwise[offset + i, :, :] = torch.exp(p_pairwise[i]).detach()

            offset += votes.shape[0]

        return out_unary, out_pairwise

    def get_start_balance(self):
        """Returns the model's estimated class balance for the start of a
        sequence

        :return: a NumPy array with one element in [0,1] for each target class,
                 representing the estimated prior probability that the first
                 element in an example sequence has that label
        """
        return np.exp(self._get_norm_start_balance().detach().numpy())

    def get_transition_matrix(self):
        """Returns the model's estimated transition distribution from class
        label to class label in a sequence.

        :return: a k x k Numpy array, in which each element i, j is the
                 probability p(c_{t+1} = j + 1 | c_{t} = i + 1)
        """
        return np.exp(self._get_norm_transitions().detach().numpy())

    def _create_minibatches(self, votes, seq_starts, batch_size, shuffle_seqs=False):
        # Splits the data into batches of whole sequences. Returns a list of
        # (votes, seq_starts) pairs, where seq_starts is batch-relative.
        # Computes explicit seq ends so that we can shuffle the sequences
        seq_ends = np.ndarray((seq_starts.shape[0],), dtype=np.int32)
        for i in range(1, seq_starts.shape[0]):
            seq_ends[i-1] = seq_starts[i] - 1
        seq_ends[-1] = votes.shape[0] - 1

        # Shuffles the sequences by shuffling the start and end index vectors
        if shuffle_seqs:
            index = np.arange(np.shape(seq_starts)[0])
            np.random.shuffle(index)
            seq_starts = seq_starts[index]
            seq_ends = seq_ends[index]

        # Splits seq_starts
        seq_start_batches = [np.array(
            seq_starts[i * batch_size: ((i + 1) * batch_size)],
            copy=True)
            for i in range(int(np.ceil(len(seq_starts) / batch_size)))
        ]
        # Sentinel index (number of rows) appended to the final batch; zip()
        # below pairs it off so each (start, end) pair stays within bounds.
        seq_start_batches[-1] = np.concatenate((seq_start_batches[-1], [votes.shape[0]]))

        # Splits seq_ends
        seq_end_batches = [
            np.array(seq_ends[i * batch_size: ((i + 1) * batch_size + 1)], copy=True)
            for i in range(int(np.ceil(len(seq_ends) / batch_size)))
        ]
        seq_end_batches[-1] = np.concatenate((seq_end_batches[-1], [votes.shape[0]]))

        # Builds vote_batches and relative seq_start_batches
        vote_batches = []
        rel_seq_start_batches = []
        for seq_start_batch, seq_end_batch in zip(seq_start_batches, seq_end_batches):
            vote_batch = []
            rel_seq_start_batch = np.zeros((len(seq_start_batch),), dtype=np.int32)
            total_len = 0
            for i, (start, end) in enumerate(zip(seq_start_batch, seq_end_batch)):
                vote_batch.append(votes[start:end+1])
                rel_seq_start_batch[i] = total_len
                total_len += end - start + 1
            # COO format preferred by forward(); copy detaches from the
            # original CSR storage.
            vote_batches.append(sparse.coo_matrix(sparse.vstack(vote_batch), copy=True))
            rel_seq_start_batches.append(rel_seq_start_batch)

        return list(zip(vote_batches, rel_seq_start_batches))

    def _get_regularization_loss(self):
        # Adds a negative-entropy penalty on the start balance and transition
        # distributions (pushing them toward uniform) on top of the accuracy
        # regularization from the parent class.
        neg_entropy = 0.0

        # Start balance
        norm_start_balance = self._get_norm_start_balance()
        exp_class_balance = torch.exp(norm_start_balance)
        for k in range(self.num_classes):
            neg_entropy += norm_start_balance[k] * exp_class_balance[k]

        # Transitions
        norm_transitions = self._get_norm_transitions()
        for i in range(self.num_classes):
            exp_transitions = torch.exp(norm_transitions[i])
            for k in range(self.num_classes):
                neg_entropy += norm_transitions[i, k] * exp_transitions[k]

        entropy_prior = self.balance_prior * neg_entropy

        return super()._get_regularization_loss() + entropy_prior

    def _get_norm_start_balance(self):
        # Log-softmax of the start-balance parameters.
        return self.start_balance - self.start_balance.logsumexp(0)

    def _get_norm_transitions(self):
        # Row-wise log-softmax: each row is a log transition distribution.
        denom = self.transitions.logsumexp(1).unsqueeze(1).repeat(1, self.num_classes)
        return self.transitions - denom
--------------------------------------------------------------------------------
/labelmodels/label_model.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import numpy as np
3 | from scipy import sparse
4 | import torch
5 | import torch.nn as nn
6 | 
7 | 
8 | class LabelModel(nn.Module):
9 |     """Parent class for all generative label models.
10 | 
11 |     Concrete subclasses should implement at least forward(),
12 |     estimate_label_model(), and get_label_distribution().
13 |     """
14 |     def forward(self, *args):
15 |         """Computes the marginal log-likelihood of a batch of observed
16 |         function outputs provided as input.
class LabelModel(nn.Module):
    """Parent class for all generative label models.

    Concrete subclasses should implement at least forward(),
    estimate_label_model(), and get_label_distribution().
    """
    def forward(self, *args):
        """Computes the marginal log-likelihood of a batch of observed
        function outputs provided as input.

        :param args: batch of observed function outputs and related metadata
        :return: 1-d tensor of log-likelihoods, one for each input example
        """
        raise NotImplementedError

    def estimate_label_model(self, *args, config=None):
        """Learns the parameters of the label model from observed
        function outputs.

        Subclasses that implement this method should call
        _do_estimate_label_model() if possible, to provide consistent
        behavior.

        :param args: observed function outputs and related metadata
        :param config: an instance of LearningConfig. If None, will
                       initialize with default LearningConfig constructor
        """
        raise NotImplementedError

    def get_label_distribution(self, *args):
        """Returns the estimated posterior distribution over true labels
        given observed function outputs.

        :param args: observed function outputs and related metadata
        :return: distribution over true labels. Structure depends on model
                 type
        """
        raise NotImplementedError

    def get_most_probable_labels(self, *args):
        """Returns the most probable true labels given observed function
        outputs.

        :param args: observed function outputs and related metadata
        :return: 1-d Numpy array of most probable labels
        """
        raise NotImplementedError

    def _do_estimate_label_model(self, batches, config):
        """Internal method for optimizing model parameters with SGD.

        :param batches: sequence of inputs to forward(). The sequence must
                        contain tuples, even if forward() takes one
                        argument (besides self)
        :param config: an instance of LearningConfig
        """
        logging.info(vars(config))

        # Sets up optimization hyperparameters
        optimizer = torch.optim.SGD(
            self.parameters(), lr=config.step_size, momentum=config.momentum,
            weight_decay=0)
        if config.step_schedule is not None and config.step_size_mult is not None:
            scheduler = torch.optim.lr_scheduler.MultiStepLR(
                optimizer, config.step_schedule, gamma=config.step_size_mult)
        else:
            scheduler = None

        # Iterates over epochs
        for epoch in range(config.epochs):
            logging.info('Epoch {}/{}'.format(epoch + 1, config.epochs))

            # Sets model to training mode
            self.train()
            running_loss = 0.0

            # Iterates over training data
            for inputs in batches:
                optimizer.zero_grad()
                log_likelihood = self(*inputs)
                loss = -1 * torch.mean(log_likelihood)
                loss += self._get_regularization_loss()
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
            epoch_loss = running_loss / len(batches)
            logging.info('Train Loss: %.6f', epoch_loss)

            # Fix: the learning-rate schedule is stepped after the epoch's
            # optimizer updates. The original called scheduler.step() at
            # the top of each epoch, which applied every decay milestone
            # one epoch early and triggers a UserWarning on PyTorch >= 1.1.
            if scheduler is not None:
                scheduler.step()

    def _get_regularization_loss(self):
        """Gets the value of the regularization loss for the current values
        of the model's parameters

        :return: regularization loss
        """
        return 0.0


class ClassConditionalLabelModel(LabelModel):
    """
    Abstract parent class for generative label models that assume labeling
    functions are conditionally independent given the true label, and that
    each labeling function is characterized by the following parameters:
        * a propensity, which is the probability that it does not abstain
        * class-conditional accuracies, each of which is the probability
          that the labeling function's output is correct given that the
          true label has a certain value. It is assumed that when a
          labeling function makes a mistake, the label it outputs is
          chosen uniformly at random
    """
    def __init__(self, num_classes, num_lfs, init_acc, acc_prior):
        """Constructor.

        Initializes label source accuracies argument and propensities
        uniformly.

        :param num_classes: number of target classes, i.e., binary
                            classification = 2
        :param num_lfs: number of labeling functions to model
        :param init_acc: initial estimated labeling function accuracy, must
                         be a float strictly between 0 and 1 (the endpoints
                         are excluded because the value is converted to
                         log-odds below)
        :param acc_prior: strength of regularization of estimated labeling
                          function accuracies toward their initial values
        """
        super().__init__()

        # Converts init_acc to half log-odds, the scale used by the
        # accuracy parameters (get_accuracies applies the inverse map)
        init_acc = -1 * np.log(1.0 / init_acc - 1) / 2

        init_param = torch.tensor(
            [[init_acc] * num_classes for _ in range(num_lfs)])
        self.accuracy = nn.Parameter(init_param)
        self.propensity = nn.Parameter(torch.zeros([num_lfs]))

        # Saves state; note that init_acc is stored on the log-odds scale
        self.num_classes = num_classes
        self.num_lfs = num_lfs
        self.init_acc = init_acc
        self.acc_prior = acc_prior

    def get_accuracies(self):
        """Returns the model's estimated labeling function accuracies

        :return: a NumPy array with one element in [0,1] for each labeling
                 function, representing the estimated probability that
                 the corresponding labeling function correctly outputs
                 the true class label, given that it does not abstain
        """
        acc = self.accuracy.detach().numpy()
        return np.exp(acc) / (np.exp(acc) + np.exp(-1 * acc))

    def get_propensities(self):
        """Returns the model's estimated labeling function propensities,
        i.e., the probability that a labeling function does not abstain

        :return: a NumPy array with one element in [0,1] for each labeling
                 function, representing the estimated probability that
                 the corresponding labeling function does not abstain
        """
        prop = self.propensity.detach().numpy()
        return np.exp(prop) / (np.exp(prop) + 1)

    def _get_labeling_function_likelihoods(self, votes):
        """Computes conditional log-likelihood of labeling function votes
        given class as an m x k matrix.

        For efficiency, this function prefers that votes is an instance of
        scipy.sparse.coo_matrix. You can avoid a conversion by passing in
        votes with this class.

        :param votes: m x n matrix in {0, ..., k}, where m is the sum of
                      the lengths of the sequences in the batch, n is the
                      number of labeling functions and k is the number of
                      classes
        :return: matrix of dimension m x k, where each element is the
                 conditional log-likelihood of votes given class
        """
        if not isinstance(votes, sparse.coo_matrix):
            votes = sparse.coo_matrix(votes)

        # Initializes conditional log-likelihood of votes as an m x k matrix
        cll = torch.zeros(votes.shape[0], self.num_classes)

        # Initializes normalizing constants
        z_prop = self.propensity.unsqueeze(1)
        z_prop = torch.cat((z_prop, torch.zeros((self.num_lfs, 1))), dim=1)
        z_prop = torch.logsumexp(z_prop, dim=1)

        z_acc = self.accuracy.unsqueeze(2)
        z_acc = torch.cat((z_acc, -1 * self.accuracy.unsqueeze(2)), dim=2)
        z_acc = torch.logsumexp(z_acc, dim=2)

        # Subtracts normalizing constant for propensities from cll
        # (since it applies to all outcomes)
        cll -= torch.sum(z_prop)

        # Log of the number of incorrect labels, hoisted out of the loop
        # (a mistaken vote is spread uniformly over the k - 1 wrong labels)
        log_num_mistakes = torch.log(torch.tensor(self.num_classes - 1.0))

        # Loops over votes and classes to compute conditional log-likelihood
        for i, j, v in zip(votes.row, votes.col, votes.data):
            for k in range(self.num_classes):
                if v == (k + 1):
                    # Vote agrees with class k + 1
                    logp = self.propensity[j] + self.accuracy[j, k] - z_acc[j, k]
                    cll[i, k] += logp
                elif v != 0:
                    # Non-abstaining vote that disagrees with class k + 1
                    logp = self.propensity[j] - self.accuracy[j, k] - z_acc[j, k]
                    logp -= log_num_mistakes
                    cll[i, k] += logp

        return cll

    def _get_regularization_loss(self):
        """Computes the regularization loss of the model:
        acc_prior * \\|accuracy - init_acc\\|_2

        :return: value of regularization loss
        """
        return self.acc_prior * torch.norm(self.accuracy - self.init_acc)


class LearningConfig(object):
    """Container for hyperparameters used by label models during learning"""

    def __init__(self):
        """Initializes all hyperparameters to default values"""
        self.epochs = 10            # number of passes over the training data
        self.batch_size = 64        # examples (or sequences) per minibatch
        self.step_size = 0.01       # SGD learning rate
        self.step_schedule = None   # epochs at which to decay the step size
        self.step_size_mult = None  # multiplicative decay factor
        self.momentum = 0.9         # SGD momentum
        self.random_seed = 0        # seed passed to init_random


def init_random(seed):
    """Initializes PyTorch and NumPy random seeds.

    Also sets the CuDNN back end to deterministic.

    :param seed: integer to use as random seed
    """
    torch.backends.cudnn.deterministic = True

    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    logging.info("Random seed: %d", seed)
def __init__(self, num_classes, num_labeling_funcs, num_linking_funcs,
             init_acc=.9, acc_prior=1, balance_prior=1):
    """Constructor.

    Initializes labeling and linking function accuracies using optional
    argument and all other model parameters uniformly.

    :param num_classes: number of target classes, i.e., binary
                        classification = 2
    :param num_labeling_funcs: number of labeling functions to model
    :param num_linking_funcs: number of linking functions to model
    :param init_acc: initial estimated labeling and linking function
                     accuracy, must be a float strictly between 0 and 1
    :param acc_prior: strength of regularization of estimated labeling and
                      linking function accuracies toward their initial
                      values
    :param balance_prior: strength of regularization of the estimated
                          start-balance and transition distributions toward
                          uniform (this parameter was previously
                          undocumented)
    """
    super().__init__(num_classes, num_labeling_funcs, init_acc, acc_prior)

    # self.init_acc has already been converted to the log-odds scale by
    # the superclass constructor, so the linking accuracies start at the
    # same initial estimate as the labeling accuracies
    self.link_accuracy = nn.Parameter(
        torch.tensor([self.init_acc] * num_linking_funcs))
    self.link_propensity = nn.Parameter(torch.zeros([num_linking_funcs]))
    self.start_balance = nn.Parameter(torch.zeros([num_classes]))
    self.transitions = nn.Parameter(torch.zeros([num_classes, num_classes]))

    # Saves state
    self.num_linking_funcs = num_linking_funcs
    self.balance_prior = balance_prior

def forward(self, label_votes, link_votes, seq_starts):
    """Computes log likelihood of sequence of labeling and linking function
    outputs for each (sequence) example in batch.

    For efficiency, this function prefers that label_votes and link_votes
    are instances of scipy.sparse.coo_matrix. You can avoid a conversion by
    passing them in as this class.

    :param label_votes: m x n matrix in {0, ..., k}, where m is the sum of
                        the lengths of the sequences in the batch, n is the
                        number of labeling functions and k is the number of
                        classes
    :param link_votes: m x n matrix in {-1, 0, 1}, where m is the sum of
                       the lengths of the sequences in the batch and n is
                       the number of linking functions
    :param seq_starts: vector of length l of row indices in votes indicating
                       the start of each sequence, where l is the number of
                       sequences in the batch. So, label_votes[seq_starts[i]]
                       is the row vector of labeling function outputs for
                       the first element in the ith sequence
    :return: vector of length l, where each element is the log-likelihood
             of the corresponding sequence of outputs in votes
    """
    jll = self._get_labeling_function_likelihoods(label_votes)
    link_cll = self._get_linking_function_likelihoods(link_votes)
    norm_start_balance = self._get_norm_start_balance()
    norm_transitions = self._get_norm_transitions()

    # Hoists membership testing out of the loop; seq_starts may be a
    # NumPy array, for which `in` is a linear scan
    start_set = {int(s) for s in seq_starts}

    for i in range(jll.shape[0]):
        if i in start_set:
            # First element of a sequence: add the start-balance prior
            jll[i] += norm_start_balance
        else:
            joint_class_pair = jll[i-1, :].clone().unsqueeze(1)
            joint_class_pair = joint_class_pair.repeat(1, self.num_classes)
            joint_class_pair += norm_transitions

            # Adds contributions from links
            joint_class_pair += link_cll[i]

            # Finishes computing joint log likelihood
            jll[i] += joint_class_pair.logsumexp(0)

    # Each sequence's total likelihood lives in the row of its last
    # element; the -1 produced by the sequence starting at row 0 is
    # dropped.
    # NOTE(review): when seq_starts carries the sentinel entry appended by
    # _create_minibatches for the final batch, jll.shape[0] - 1 appears
    # twice in seq_ends, so the final batch counts its last sequence twice
    # in the loss — confirm whether this is intended.
    seq_ends = [x - 1 for x in seq_starts] + [jll.shape[0] - 1]
    seq_ends.remove(-1)
    mll = torch.logsumexp(jll[seq_ends], dim=1)
    return mll
So, label_votes[seq_starts[i]] 123 | is the row vector of labeling function outputs for the 124 | first element in the ith sequence 125 | :param config: optional LearningConfig instance. If None, initialized 126 | with default constructor 127 | """ 128 | if config is None: 129 | config = LearningConfig() 130 | 131 | # Initializes random seed 132 | init_random(config.random_seed) 133 | 134 | # Converts to CSR and integers to standardize input 135 | label_votes = sparse.csr_matrix(label_votes, dtype=np.int32) 136 | link_votes = sparse.csr_matrix(link_votes, dtype=np.int32) 137 | seq_starts = np.array(seq_starts, dtype=np.int32) 138 | 139 | batches = self._create_minibatches( 140 | label_votes, link_votes, seq_starts, config.batch_size, shuffle_seqs=True) 141 | 142 | self._do_estimate_label_model(batches, config) 143 | 144 | def get_label_accuracies(self): 145 | """Alias for ClassConditionalModel.get_accuracies() 146 | :return: estimated labeling function accuracies 147 | """ 148 | return self.get_accuracies() 149 | 150 | def get_link_accuracies(self): 151 | """Returns the model's estimated linking function accuracies 152 | :return: a NumPy array with one element in [0,1] for each labeling 153 | function, representing the estimated probability that 154 | the corresponding linking function correctly identifies a pair 155 | of consecutive labels, given that it does not abstain 156 | """ 157 | acc = self.link_accuracy.detach().numpy() 158 | return np.exp(acc) / (np.exp(acc) + np.exp(-1 * acc)) 159 | 160 | def get_label_propensities(self): 161 | """Alias for ClassConditionalModel.get_propensities() 162 | :return: estimated labeling function propensities 163 | """ 164 | return self.get_propensities() 165 | 166 | def get_link_propensities(self): 167 | """Returns the model's estimated linking function propensities, i.e., 168 | the probability that a linking function does not abstain 169 | :return: a NumPy array with one element in [0,1] for each linking 170 | function, 
representing the estimated probability that 171 | the corresponding linking function does not abstain 172 | """ 173 | prop = self.link_propensity.detach().numpy() 174 | return np.exp(prop) / (np.exp(prop) + 1) 175 | 176 | def get_most_probable_labels(self, label_votes, link_votes, seq_starts): 177 | """ 178 | Computes the most probable underlying sequence nodes given function 179 | outputs 180 | 181 | :param label_votes: m x n matrix in {0, ..., k}, where m is the sum of 182 | the lengths of the sequences in the batch, n is the 183 | number of labeling functions and k is the number of 184 | classes 185 | :param link_votes: m x n matrix in {-1, 0, 1}, where m is the sum of 186 | the lengths of the sequences in the batch and n is the 187 | number of linking functions 188 | :param seq_starts: vector of length l of row indices in votes indicating 189 | the start of each sequence, where l is the number of 190 | sequences in the batch. So, label_votes[seq_starts[i]] 191 | is the row vector of labeling function outputs for the 192 | first element in the ith sequence 193 | :return: vector of length m, where element is the most likely predicted labels 194 | """ 195 | # Converts to CSR and integers to standardize input 196 | label_votes = sparse.csr_matrix(label_votes, dtype=np.int32) 197 | link_votes = sparse.csr_matrix(link_votes, dtype=np.int32) 198 | seq_starts = np.array(seq_starts, dtype=np.int32) 199 | 200 | out = np.ndarray((label_votes.shape[0],), dtype=np.int32) 201 | 202 | offset = 0 203 | for label_votes, link_votes, seq_starts in self._create_minibatches( 204 | label_votes, link_votes, seq_starts, 32): 205 | # Initializes joint log likelihood with labeling function likelihood 206 | jll = self._get_labeling_function_likelihoods(label_votes) 207 | link_cll = self._get_linking_function_likelihoods(link_votes) 208 | norm_start_balance = self._get_norm_start_balance() 209 | norm_transitions = self._get_norm_transitions() 210 | 211 | T = label_votes.shape[0] 212 | bt 
= torch.zeros([T, self.num_classes]) 213 | for i in range(0, T): 214 | if i in seq_starts: 215 | jll[i] += norm_start_balance 216 | else: 217 | p = jll[i-1].clone().unsqueeze(1).repeat( 218 | 1, self.num_classes) + norm_transitions 219 | p += link_cll[i] 220 | jll[i] += torch.max(p, dim=0)[0] 221 | bt[i, :] = torch.argmax(p, dim=0) 222 | 223 | seq_ends = [x - 1 for x in seq_starts] + [label_votes.shape[0] - 1] 224 | res = [] 225 | j = T-1 226 | while j >= 0: 227 | if j in seq_ends: 228 | res.append(torch.argmax(jll[j, :]).item()) 229 | if j in seq_starts: 230 | j -= 1 231 | continue 232 | res.append(int(bt[j, res[-1]].item())) 233 | j -= 1 234 | res = [x + 1 for x in res] 235 | res.reverse() 236 | 237 | for i in range(len(res)): 238 | out[offset + i] = res[i] 239 | offset += len(res) 240 | return out 241 | 242 | def get_label_distribution(self, label_votes, link_votes, seq_starts): 243 | """Returns the unary and pairwise marginals over true labels estimated 244 | by the model. 245 | 246 | :param label_votes: m x n matrix in {0, ..., k}, where m is the sum of 247 | the lengths of the sequences in the batch, n is the 248 | number of labeling functions and k is the number of 249 | classes 250 | :param link_votes: m x n matrix in {-1, 0, 1}, where m is the sum of 251 | the lengths of the sequences in the batch and n is the 252 | number of linking functions 253 | :param seq_starts: vector of length l of row indices in votes indicating 254 | the start of each sequence, where l is the number of 255 | sequences in the batch. So, label_votes[seq_starts[i]] 256 | is the row vector of labeling function outputs for the 257 | first element in the ith sequence 258 | :return: p_unary, p_pairwise where p_unary is a m x k matrix representing 259 | the marginal distributions over individual labels, and p_pairwise 260 | is a m x k x k tensor representing pairwise marginals over the 261 | ith and (i+1)th labels. 
def get_label_distribution(self, label_votes, link_votes, seq_starts):
    """Returns the unary and pairwise marginals over true labels estimated
    by the model, via the forward-backward algorithm.

    :param label_votes: m x n matrix in {0, ..., k}, where m is the sum of
                        the lengths of the sequences in the batch, n is the
                        number of labeling functions and k is the number of
                        classes
    :param link_votes: m x n matrix in {-1, 0, 1}, where m is the sum of
                       the lengths of the sequences in the batch and n is
                       the number of linking functions
    :param seq_starts: vector of length l of row indices in votes indicating
                       the start of each sequence, where l is the number of
                       sequences in the batch
    :return: p_unary, p_pairwise where p_unary is a m x k matrix representing
             the marginal distributions over individual labels, and
             p_pairwise is a m x k x k tensor representing pairwise
             marginals over the ith and (i+1)th labels. For the last
             element in a sequence, the k x k matrix will be all zeros.
    """
    # Converts to CSR and integers to standardize input
    label_votes = sparse.csr_matrix(label_votes, dtype=np.int32)
    link_votes = sparse.csr_matrix(link_votes, dtype=np.int32)
    seq_starts = np.array(seq_starts, dtype=np.int32)

    out_unary = np.zeros((label_votes.shape[0], self.num_classes))
    out_pairwise = np.zeros(
        (label_votes.shape[0], self.num_classes, self.num_classes))

    offset = 0
    # Loop variables renamed so the full input matrices are not shadowed
    for batch_label_votes, batch_link_votes, batch_seq_starts in \
            self._create_minibatches(label_votes, link_votes, seq_starts, 32):
        # Computes observation likelihoods and initializes alpha and beta
        # messages (all quantities are in log space)
        label_cll = self._get_labeling_function_likelihoods(batch_label_votes)
        link_cll = self._get_linking_function_likelihoods(batch_link_votes)
        norm_transitions = self._get_norm_transitions()  # loop-invariant
        alpha = torch.zeros(label_cll.shape)
        beta = torch.zeros(label_cll.shape)
        m = batch_label_votes.shape[0]

        # Computes alpha (forward messages)
        next_seq = 0
        for i in range(m):
            if next_seq == len(batch_seq_starts) or i < batch_seq_starts[next_seq]:
                # i is not the start of a sequence
                temp = alpha[i-1].unsqueeze(1).repeat(1, self.num_classes)
                temp = temp + norm_transitions
                temp += link_cll[i]
                alpha[i] = label_cll[i] + temp.logsumexp(0)
            else:
                # i is the start of a sequence
                alpha[i] = label_cll[i] + self._get_norm_start_balance()
                next_seq += 1

        # Computes beta (backward messages)
        this_seq = batch_seq_starts.shape[0] - 1
        # Fix: skips the sentinel entry (== m) that _create_minibatches
        # appends to the final batch's seq_starts. The original left
        # this_seq pointing at the sentinel, so the sequence-end test
        # below could never fire in that batch and beta messages leaked
        # across every sequence boundary.
        while this_seq > 0 and batch_seq_starts[this_seq] >= m:
            this_seq -= 1
        # Fix: sequence ends are initialized to 0 = log(1); the original
        # used 1, a uniform additive constant that cancels in the
        # normalizations below, so outputs are unchanged by this
        beta[-1, :] = 0.0
        for i in range(m - 2, -1, -1):
            if i == batch_seq_starts[this_seq] - 1:
                # End of sequence
                beta[i, :] = 0.0
                this_seq -= 1
            else:
                temp = beta[i+1] + label_cll[i+1]
                temp = temp.unsqueeze(1).repeat(1, self.num_classes)
                temp = temp + norm_transitions.transpose(0, 1)
                temp += link_cll[i+1]
                beta[i, :] = temp.logsumexp(0)

        # Computes p_unary by normalizing alpha + beta per row
        p_unary = alpha + beta
        temp = p_unary.logsumexp(1).unsqueeze(1).repeat(1, self.num_classes)
        p_unary = p_unary - temp
        for i in range(p_unary.shape[0]):
            p = torch.exp(p_unary[i, :] - torch.max(p_unary[i, :]))
            out_unary[offset + i, :] = (p / p.sum()).detach()

        # Computes p_pairwise
        # NOTE(review): this loop also fills rows that straddle a sequence
        # boundary within a batch (pairing a sequence's last element with
        # the next sequence's first), so the "all zeros for the last
        # element" promise in the docstring holds only for each batch's
        # final row — confirm whether that is intended.
        p_pairwise = torch.zeros((m, self.num_classes, self.num_classes))
        for i in range(p_pairwise.shape[0] - 1):
            p_pairwise[i, :, :] = norm_transitions
            p_pairwise[i] += alpha[i].unsqueeze(1).repeat(1, self.num_classes)
            p_pairwise[i] += label_cll[i+1].unsqueeze(0).repeat(self.num_classes, 1)
            p_pairwise[i] += beta[i+1].unsqueeze(0).repeat(self.num_classes, 1)
            p_pairwise[i] += link_cll[i+1]

            denom = p_pairwise[i].view(-1).logsumexp(0)
            denom = denom.unsqueeze(0).unsqueeze(1)
            denom = denom.repeat(self.num_classes, self.num_classes)
            p_pairwise[i] -= denom

            out_pairwise[offset + i, :, :] = torch.exp(p_pairwise[i]).detach()

        offset += m

    return out_unary, out_pairwise

def get_start_balance(self):
    """Returns the model's estimated class balance for the start of a
    sequence

    :return: a NumPy array with one element in [0,1] for each target class,
             representing the estimated prior probability that the first
             element in an example sequence has that label
    """
    return np.exp(self._get_norm_start_balance().detach().numpy())
352 | 353 | :return: a k x k Numpy array, in which each element i, j is the 354 | probability p(c_{t+1} = j + 1 | c_{t} = i + 1) 355 | """ 356 | return np.exp(self._get_norm_transitions().detach().numpy()) 357 | 358 | def _create_minibatches(self, label_votes, link_votes, seq_starts, 359 | batch_size, shuffle_seqs=False): 360 | if label_votes.shape[0] != link_votes.shape[0]: 361 | raise ValueError("label_votes and link_votes must have same number " 362 | "of rows") 363 | 364 | # Computes explicit seq ends so that we can shuffle the sequences 365 | seq_ends = np.ndarray((seq_starts.shape[0],), dtype=np.int32) 366 | for i in range(1, seq_starts.shape[0]): 367 | seq_ends[i - 1] = seq_starts[i] - 1 368 | seq_ends[-1] = label_votes.shape[0] - 1 369 | 370 | # Shuffles the sequences by shuffling the start and end index vectors 371 | if shuffle_seqs: 372 | index = np.arange(np.shape(seq_starts)[0]) 373 | np.random.shuffle(index) 374 | seq_starts = seq_starts[index] 375 | seq_ends = seq_ends[index] 376 | 377 | # Splits seq_starts 378 | seq_start_batches = [np.array( 379 | seq_starts[i * batch_size: ((i + 1) * batch_size)], 380 | copy=True) 381 | for i in range(int(np.ceil(len(seq_starts) / batch_size))) 382 | ] 383 | seq_start_batches[-1] = np.concatenate((seq_start_batches[-1], 384 | [label_votes.shape[0]])) 385 | 386 | # Splits seq_ends 387 | seq_end_batches = [ 388 | np.array(seq_ends[i * batch_size: ((i + 1) * batch_size + 1)], copy=True) 389 | for i in range(int(np.ceil(len(seq_ends) / batch_size))) 390 | ] 391 | seq_end_batches[-1] = np.concatenate((seq_end_batches[-1], 392 | [label_votes.shape[0]])) 393 | 394 | # Builds label_vote_batches, link_vote_batches and relative seq_start_batches 395 | label_vote_batches = [] 396 | link_vote_batches = [] 397 | rel_seq_start_batches = [] 398 | for seq_start_batch, seq_end_batch in zip(seq_start_batches, seq_end_batches): 399 | label_vote_batch = [] 400 | link_vote_batch = [] 401 | rel_seq_start_batch = 
np.zeros((len(seq_start_batch),), dtype=np.int32) 402 | total_len = 0 403 | for i, (start, end) in enumerate(zip(seq_start_batch, seq_end_batch)): 404 | label_vote_batch.append(label_votes[start:end + 1]) 405 | link_vote_batch.append(link_votes[start:end + 1]) 406 | rel_seq_start_batch[i] = total_len 407 | total_len += end - start + 1 408 | label_vote_batches.append( 409 | sparse.coo_matrix(sparse.vstack(label_vote_batch), copy=True)) 410 | link_vote_batches.append( 411 | sparse.coo_matrix(sparse.vstack(link_vote_batch), copy=True)) 412 | rel_seq_start_batches.append(rel_seq_start_batch) 413 | 414 | return list(zip(label_vote_batches, link_vote_batches, rel_seq_start_batches)) 415 | 416 | def _get_linking_function_likelihoods(self, votes): 417 | if type(votes) != sparse.coo_matrix: 418 | votes = sparse.coo_matrix(votes) 419 | 420 | cll = torch.zeros((votes.shape[0], self.num_classes, self.num_classes)) 421 | 422 | # Initializes normalizing constants 423 | z_prop = self.link_propensity.unsqueeze(1) 424 | z_prop = torch.cat((z_prop, torch.zeros((self.num_linking_funcs, 1))), dim=1) 425 | z_prop = torch.logsumexp(z_prop, dim=1) 426 | 427 | z_acc = self.link_accuracy.unsqueeze(1) 428 | z_acc = torch.cat((z_acc, -1 * self.link_accuracy.unsqueeze(1)), dim=1) 429 | z_acc = torch.logsumexp(z_acc, dim=1) 430 | 431 | # Subtracts normalizing constant for propensities from cll 432 | # (since it applies to all outcomes) 433 | cll -= torch.sum(z_prop) 434 | 435 | # Loops over votes and classes to compute conditional log-likelihood 436 | for i, j, v in zip(votes.row, votes.col, votes.data): 437 | if v != 1 and v != -1: 438 | continue 439 | 440 | for k1 in range(self.num_classes): 441 | for k2 in range(self.num_classes): 442 | if k1 == k2: 443 | if v == 1: 444 | cll[i, k1, k2] += self.link_propensity[j] 445 | cll[i, k1, k2] += self.link_accuracy[j] 446 | cll[i, k1, k2] -= z_acc[j] 447 | else: 448 | cll[i, k1, k2] += self.link_propensity[j] 449 | cll[i, k1, k2] -= 
self.link_accuracy[j] 450 | cll[i, k1, k2] -= z_acc[j] 451 | else: 452 | if v == 1: 453 | cll[i, k1, k2] += self.link_propensity[j] 454 | cll[i, k1, k2] -= self.link_accuracy[j] 455 | cll[i, k1, k2] -= z_acc[j] 456 | else: 457 | cll[i, k1, k2] += self.link_propensity[j] 458 | cll[i, k1, k2] += self.link_accuracy[j] 459 | cll[i, k1, k2] -= z_acc[j] 460 | 461 | return cll 462 | 463 | def _get_regularization_loss(self): 464 | neg_entropy = 0.0 465 | 466 | # Start balance 467 | norm_start_balance = self._get_norm_start_balance() 468 | exp_class_balance = torch.exp(norm_start_balance) 469 | for k in range(self.num_classes): 470 | neg_entropy += norm_start_balance[k] * exp_class_balance[k] 471 | 472 | # Transitions 473 | norm_transitions = self._get_norm_transitions() 474 | for i in range(self.num_classes): 475 | exp_transitions = torch.exp(norm_transitions[i]) 476 | for k in range(self.num_classes): 477 | neg_entropy += norm_transitions[i, k] * exp_transitions[k] 478 | 479 | entropy_prior = self.balance_prior * neg_entropy 480 | 481 | # Accuracy prior 482 | acc = torch.cat((self.accuracy.view(-1), self.link_accuracy)) 483 | acc_prior = self.acc_prior * torch.norm(acc - self.init_acc) 484 | 485 | return acc_prior + entropy_prior 486 | 487 | def _get_norm_start_balance(self): 488 | return self.start_balance - self.start_balance.logsumexp(0) 489 | 490 | def _get_norm_transitions(self): 491 | denom = self.transitions.logsumexp(1).unsqueeze(1).repeat(1, self.num_classes) 492 | return self.transitions - denom 493 | -------------------------------------------------------------------------------- /labelmodels/naive_bayes.py: -------------------------------------------------------------------------------- 1 | from .label_model import ClassConditionalLabelModel, LearningConfig, init_random 2 | import numpy as np 3 | from scipy import sparse 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class NaiveBayes(ClassConditionalLabelModel): 9 | """A generative label model that 
class NaiveBayes(ClassConditionalLabelModel):
    """A generative label model that assumes that all labeling functions are
    conditionally independent given the true class label, i.e., the naive Bayes
    assumption.

    Proposed in: A. P. Dawid and A. M. Skene. Maximum likelihood
    estimation of observer error-rates using the EM algorithm.
    Journal of the Royal Statistical Society C, 28(1):20-28, 1979.

    Proposed for labeling functions in: A. Ratner, C. De Sa, S. Wu, D. Selsam,
    and C. Re. Data programming: Creating large training sets, quickly. In
    Neural Information Processing Systems, 2016.
    """

    def __init__(self, num_classes, num_lfs, init_acc=.9, acc_prior=0.025,
                 balance_prior=0.025, learn_class_balance=True):
        """Constructor.

        Initializes labeling function accuracies using optional argument and all
        other model parameters uniformly.

        :param num_classes: number of target classes, i.e., binary
                            classification = 2
        :param num_lfs: number of labeling functions to model
        :param init_acc: initial estimated labeling function accuracy, must
                         be a float in [0,1]
        :param acc_prior: strength of regularization of estimated labeling
                          function accuracies toward their initial values
        :param balance_prior: strength of the entropy regularizer on the
                              estimated class balance
        :param learn_class_balance: whether to estimate the distribution over
                                    target classes (True) or assume to be
                                    uniform (False)
        """
        super().__init__(num_classes, num_lfs, init_acc, acc_prior)
        # Unnormalized log class balance; frozen when not learned
        self.class_balance = nn.Parameter(
            torch.zeros([num_classes]), requires_grad=learn_class_balance)

        self.balance_prior = balance_prior

    def forward(self, votes):
        """Computes log likelihood of labeling function outputs for each
        example in the batch.

        For efficiency, this function prefers that votes is an instance of
        scipy.sparse.coo_matrix. You can avoid a conversion by passing in votes
        with this class.

        :param votes: m x n matrix in {0, ..., k}, where m is the batch size,
                      n is the number of labeling functions and k is the number
                      of classes
        :return: 1-d tensor of length m, where each element is the
                 log-likelihood of the corresponding row in labels
        """
        class_ll = self._get_norm_class_balance()
        conditional_ll = self._get_labeling_function_likelihoods(votes)
        joint_ll = conditional_ll + class_ll
        # Marginalizes out the latent class label
        return torch.logsumexp(joint_ll, dim=1)

    def estimate_label_model(self, votes, config=None):
        """Estimates the parameters of the label model based on observed
        labeling function outputs.

        :param votes: m x n matrix in {0, ..., k}, where m is the batch size,
                      n is the number of labeling functions and k is the number
                      of classes
        :param config: optional LearningConfig instance. If None, initialized
                       with default constructor
        """
        if config is None:
            config = LearningConfig()

        # Initializes random seed
        init_random(config.random_seed)

        # Converts to CSR to standardize input
        votes = sparse.csr_matrix(votes, dtype=np.int32)

        batches = self._create_minibatches(
            votes, config.batch_size, shuffle_rows=True)
        self._do_estimate_label_model(batches, config)

    def get_label_distribution(self, votes):
        """Returns the posterior distribution over true labels given labeling
        function outputs according to the model

        :param votes: m x n matrix in {0, ..., k}, where m is the batch size,
                      n is the number of labeling functions and k is the number
                      of classes
        :return: m x k matrix, where each row is the posterior distribution over
                 the true class label for the corresponding example
        """
        # Converts to CSR to standardize input
        votes = sparse.csr_matrix(votes, dtype=np.int32)

        # np.zeros instead of the uninitialized np.ndarray constructor
        labels = np.zeros((votes.shape[0], self.num_classes))
        batches = self._create_minibatches(votes, 4096, shuffle_rows=False)

        offset = 0
        # Class balance does not depend on the batch; hoisted out of the loop
        class_balance = self._get_norm_class_balance()
        for (batch,) in batches:
            lf_likelihood = self._get_labeling_function_likelihoods(batch)
            jll = class_balance + lf_likelihood
            # Numerically stable softmax over classes, vectorized over the
            # batch (replaces the per-element Python loops)
            jll = jll - torch.max(jll, dim=1, keepdim=True)[0]
            p = torch.exp(jll)
            p = p / torch.sum(p, dim=1, keepdim=True)
            labels[offset:offset + batch.shape[0]] = p.detach().numpy()
            offset += batch.shape[0]

        return labels

    def get_most_probable_labels(self, votes):
        """Returns the most probable true labels given observed function outputs.

        :param votes: m x n matrix in {0, ..., k}, where m is the batch size,
                      n is the number of labeling functions and k is the number
                      of classes
        :return: 1-d Numpy array of most probable labels (1-indexed)
        """
        return np.argmax(self.get_label_distribution(votes), axis=1) + 1

    def get_class_balance(self):
        """Returns the model's estimated class balance

        :return: a NumPy array with one element in [0,1] for each target class,
                 representing the estimated prior probability that an example
                 has that label
        """
        return np.exp(self._get_norm_class_balance().detach().numpy())

    def _create_minibatches(self, votes, batch_size, shuffle_rows=False):
        """Slices votes into 1-tuples of COO minibatches, optionally shuffling
        the rows first."""
        if shuffle_rows:
            index = np.arange(np.shape(votes)[0])
            np.random.shuffle(index)
            votes = votes[index, :]

        # Creates minibatches; each is a 1-tuple so it can be star-unpacked
        # into forward()
        batches = [(sparse.coo_matrix(
                        votes[i * batch_size: (i + 1) * batch_size, :],
                        copy=True),)
                   for i in range(int(np.ceil(votes.shape[0] / batch_size)))
                   ]

        return batches

    def _get_regularization_loss(self):
        """Adds a negative-entropy prior on the class balance to the parent
        class's accuracy regularization."""
        norm_class_balance = self._get_norm_class_balance()
        # sum(p * log p) over classes, vectorized
        neg_entropy = torch.sum(norm_class_balance * torch.exp(norm_class_balance))
        entropy_prior = self.balance_prior * neg_entropy

        return super()._get_regularization_loss() + entropy_prior

    def _get_norm_class_balance(self):
        # Log-softmax of the class-balance parameters
        return self.class_balance - torch.logsumexp(self.class_balance, dim=0)
from .label_model import LabelModel, init_random, LearningConfig
import numpy as np
from scipy import sparse
import torch
from torch import nn
from copy import deepcopy as dc
import logging


class PartialLabelLearningConfig(LearningConfig):
    """Container for hyperparameters used by PartialLabelModel during learning"""

    def __init__(self):
        """Initializes all hyperparameters to default values"""
        super().__init__()
        self.epochs = 200
        self.batch_size = 8192
        self.step_size = 0.1
        # 'p' selects ReduceLROnPlateau, 'c' CyclicLR; a list of epoch
        # milestones selects MultiStepLR (see _do_estimate_label_model)
        self.step_schedule = 'p'
        self.step_size_mult = 0.1
        self.momentum = 0.8


class PartialLabelModel(LabelModel):
    """A generative label model that assumes that all partial labeling
    functions (PLFs) are conditionally independent given the true class label.
    A naive Bayes distribution is assumed.
    """

    def __init__(self, num_classes, label_partition, init_acc=0.7,
                 preset_classbalance=None, learn_class_balance=True,
                 device='cpu'):
        """Constructor.

        Initializes labeling function accuracies using optional argument and
        all other model parameters uniformly.

        :param num_classes: number of target classes, i.e., binary
                            classification = 2
        :param label_partition: partial labeling function configuration, a dict
                                {PLF index: [partition_1, ..., partition_{k_l}]}
                                mapping each PLF to its label partitions
                                (class ids are 1-indexed)
        :param init_acc: initial estimated labeling and linking function
                         accuracy, must be a float in [0,1]
        :param preset_classbalance: None to learn the class balance, or a
                                    fixed probability tensor to hold it constant
        :param learn_class_balance: whether to estimate the class balance when
                                    no preset is given
        :param device: calculation device, e.g. 'cpu' or 'cuda'
        """
        super().__init__()

        self.device = device
        # Falls back to CPU whenever CUDA is unavailable
        if not torch.cuda.is_available():
            self.device = 'cpu'
        self.preset_classbalance = preset_classbalance
        self.num_classes = num_classes
        # Inverse of acc = exp(a) / (exp(a) + exp(-a)), so get_accuracies()
        # maps the initial parameter back to init_acc
        self.init_acc = -1 * np.log(1.0 / init_acc - 1) / 2
        self.label_partition = label_partition
        self.num_df = len(label_partition)

        if self.preset_classbalance is not None:
            self.class_balance = torch.nn.Parameter(
                torch.log(self.preset_classbalance),
                requires_grad=False
            )
        else:
            self.class_balance = torch.nn.Parameter(
                torch.zeros([self.num_classes], device=self.device),
                requires_grad=learn_class_balance
            )

        self.accuracy = torch.nn.Parameter(
            torch.ones([self.num_df, self.num_classes], device=self.device) * self.init_acc,
            requires_grad=True
        )

        self.propensity = torch.nn.Parameter(
            torch.zeros([self.num_df], device=self.device),
            requires_grad=True
        )

        # ct[f, c]: index of the partition of PLF f that contains class c+1
        self.ct = torch.zeros([self.num_df, self.num_classes])
        # poslib/neglib[f, c]: number of partitions of PLF f that do /
        # do not contain class c+1
        self.poslib = torch.zeros([self.num_df, self.num_classes])
        self.neglib = torch.zeros([self.num_df, self.num_classes])

        self._validate_and_sort_partitions()

        for fid, clusters in self.label_partition.items():
            for cluster_id, cluster in enumerate(clusters):
                for class_id in cluster:
                    self.poslib[fid, class_id - 1] += 1
                    self.ct[fid, class_id - 1] = cluster_id
            self.neglib[fid, :] = len(clusters) - self.poslib[fid, :]
        # Avoids zero counts so -log(poslib) in _setup stays finite
        self.poslib[self.poslib == 0] = 1

    def _validate_and_sort_partitions(self):
        """Sorts each label group in place and validates the partition setup:
        no class may appear in every group, and every class must be covered."""
        for fid, clusters in self.label_partition.items():
            in_all_groups = set(clusters[0])
            covered = set()
            for cluster_id, cluster in enumerate(clusters):
                cluster.sort()
                self.label_partition[fid][cluster_id] = cluster
                in_all_groups &= set(cluster)
                covered |= set(cluster)
            if len(in_all_groups) > 0:
                raise RuntimeError('Setup Violation: No class can appear in all groups!')
            if len(covered) < self.num_classes:
                raise RuntimeError('Setup Violation: Class must appear at least once! Please setup a dummy label group if necessary!')

    def forward(self, votes, bid):
        """Computes log likelihood of labeling function outputs for each
        example in the batch.

        :param votes: one batch of (0-indexed) votes as a LongTensor, as
                      produced by _setup
        :param bid: index of this batch within the helper tensors built
                    by _setup
        :return: 1-d tensor of per-example log-likelihoods
        """
        class_ll = self._get_norm_class_balance()
        conditional_ll = self._cll(votes, bid)
        joint_ll = conditional_ll + class_ll
        # Marginalizes out the latent class label
        return torch.logsumexp(joint_ll, dim=1)

    def estimate_label_model(self, votes, config=None):
        """Estimates the parameters of the label model based on observed
        labeling function outputs.

        :param votes: m x n matrix of PLF outputs; entries are in
                      {0, 1, ..., k_l}, where 0 means abstain and k_l is the
                      number of partitions of PLF_l
        :param config: optional PartialLabelLearningConfig; defaults are used
                       when None
        """
        if config is None:
            config = PartialLabelLearningConfig()

        # Initializes random seed
        init_random(config.random_seed)

        batches = self._setup(votes, config.batch_size, shuffle=True)

        self._do_estimate_label_model(batches, config)

    def get_label_distribution(self, votes, annot_batch_sz=2048):
        """Returns the posterior distribution over true labels given labeling
        function outputs according to the model

        :param votes: m x n matrix where each element is in {0, 1, ..., k_l},
                      where k_l is the number of label partitions of PLF_l
        :param annot_batch_sz: number of examples scored per batch
        :return: m x k matrix, where each row is the posterior distribution
                 over the true class label for the corresponding example
        """
        self.eval()
        batches = self._setup(votes, annot_batch_sz)

        # np.zeros instead of the uninitialized np.ndarray constructor
        labels = np.zeros((votes.shape[0], self.num_classes))
        for batch_id, batch_votes in enumerate(batches):
            class_balance = self._get_norm_class_balance()
            lf_likelihood = self._cll(batch_votes, batch_id)
            jll = class_balance + lf_likelihood
            # Numerically stable softmax over classes
            P = torch.exp(jll - torch.max(jll, dim=1)[0].unsqueeze(1).repeat(1, self.num_classes))
            P /= torch.sum(P, dim=1).unsqueeze(1).repeat(1, self.num_classes)
            labels[batch_id * annot_batch_sz:
                   batch_id * annot_batch_sz + batch_votes.shape[0]] = P.detach().cpu().numpy()
            if 'cuda' in self.device:
                torch.cuda.empty_cache()
        return labels

    def get_most_probable_labels(self, votes):
        """Returns the most probable true labels given observed function outputs.

        :param votes: m x n matrix where each element is in {0, 1, ..., k_l}
        :return: 1-d Numpy array of most probable labels (1-indexed)
        """
        return np.argmax(self.get_label_distribution(votes), axis=1) + 1

    def get_class_balance(self):
        """Returns the model's estimated class balance

        :return: a NumPy array with one element in [0,1] for each target class,
                 representing the estimated prior probability that an example
                 has that label
        """
        return np.exp(self._get_norm_class_balance().detach().cpu().numpy())

    def get_accuracies(self):
        """Returns the model's estimated labeling function accuracies

        :return: a NumPy array with one element in [0,1] for each labeling
                 function and class, representing the estimated probability
                 that the function correctly outputs the true class label,
                 given that it does not abstain
        """
        acc = self.accuracy.detach().cpu().numpy()
        # Scaled sigmoid, the inverse of the init_acc transform above
        return np.exp(acc) / (np.exp(acc) + np.exp(-1 * acc))

    def get_propensities(self):
        """Returns the model's estimated labeling function propensities, i.e.,
        the probability that a labeling function does not abstain

        :return: a NumPy array with one element in [0,1] for each labeling
                 function
        """
        prop = self.propensity.detach().cpu().numpy()
        return np.exp(prop) / (np.exp(prop) + 1)

    def _do_estimate_label_model(self, batches, config):
        """Internal method for optimizing model parameters.

        :param batches: sequence of inputs to forward(); each element is one
                        batch of votes
        :param config: an instance of PartialLabelLearningConfig
        """
        optimizer = torch.optim.Adam(
            self.parameters(), lr=config.step_size,
            weight_decay=0)

        if config.step_schedule == 'p':
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, min_lr=1e-10, factor=config.step_size_mult)
        elif config.step_schedule == 'c':
            scheduler = torch.optim.lr_scheduler.CyclicLR(
                optimizer, base_lr=1e-1, max_lr=0.2)
        elif config.step_schedule is not None and config.step_size_mult is not None:
            scheduler = torch.optim.lr_scheduler.MultiStepLR(
                optimizer, config.step_schedule, gamma=config.step_size_mult)
        else:
            scheduler = None

        self.train()

        for epoch in range(config.epochs):
            # Detached snapshot for the convergence check (the original
            # deepcopied the Parameter, keeping an unnecessary graph handle)
            prev_acc = self.accuracy.detach().clone()
            logging.info('Epoch {}/{}'.format(epoch + 1, config.epochs))
            running_loss = 0.0
            for i_batch, inputs in enumerate(batches):
                optimizer.zero_grad()
                log_likelihood = self(inputs, i_batch)
                loss = -1 * torch.mean(log_likelihood)
                loss += self._get_regularization_loss()
                loss.backward()
                optimizer.step()
                # .item() detaches the value so each step's graph can be
                # freed; accumulating the tensor leaked memory over an epoch
                running_loss += loss.item()
            epoch_loss = running_loss / len(batches)
            logging.info('Train Loss: %.6f', epoch_loss)
            if torch.sum(torch.abs(self.accuracy - prev_acc)) < 1e-7:
                logging.info('1e-7 Criterion Reached: Epoch')
                break
            if scheduler is not None:
                if config.step_schedule == 'p':
                    scheduler.step(epoch_loss)
                else:
                    scheduler.step()

        if 'cuda' in self.device:
            torch.cuda.empty_cache()

    def _setup(self, votes, batch_size, shuffle=False):
        """Sets up and precalculates/populates helper tensors for _cll.

        :param votes: full PLFs votes input
        :param batch_size: number of instances in one batch
        :param shuffle: whether rows of the given votes should be shuffled

        :return: list of batched vote tensors, one per batch
        """
        # Normalizing to 0-indexed LPs (abstain 0 becomes -1).
        batches = self._create_minibatches(votes - 1, batch_size, shuffle)
        cth = self.ct.unsqueeze(0).repeat(batch_size, 1, 1)
        # c: +1 where the voted partition contains the class, else -1
        # n: partition-count normalizer (later -log'ed)
        # a: 1 where the PLF voted, 0 where it abstained
        # p: same abstention mask at the (instance, PLF) level
        self.c = torch.zeros([len(batches), batch_size, self.num_df, self.num_classes])
        self.n = torch.zeros([len(batches), batch_size, self.num_df, self.num_classes])
        self.a = torch.ones([len(batches), batch_size, self.num_df, self.num_classes])
        self.p = torch.ones([len(batches), batch_size, self.num_df])
        # All batches except the last are exactly batch_size rows
        for bid in range(len(batches) - 1):
            extb = batches[bid].unsqueeze(2).repeat(1, 1, self.num_classes)
            self.c[bid] = torch.where(torch.eq(cth, extb), torch.tensor(1.0), torch.tensor(-1.0))
            self.a[bid] = torch.where(extb == -1, torch.tensor(0.0), torch.tensor(1.0))
            marker = torch.where(self.c[bid] == 1, torch.tensor(1.0), torch.tensor(0.0))
            self.n[bid] = (1 - marker) * self.neglib + marker * self.poslib
            self.p[bid] = torch.where(batches[bid] == -1, torch.tensor(0.0), torch.tensor(1.0))

        # The last batch may be short; only its first last_bz rows are real.
        last_bz = len(batches[-1])
        last_extb = batches[-1].unsqueeze(2).repeat(1, 1, self.num_classes)
        self.c[-1, :last_bz] = torch.where(
            torch.eq(cth[:last_bz, :, :], last_extb),
            torch.tensor(1.0), torch.tensor(-1.0))
        marker = torch.where(self.c[-1, :last_bz] == 1, torch.tensor(1.0), torch.tensor(0.0))
        self.a[-1, :last_bz] = torch.where(last_extb == -1, torch.tensor(0.0), torch.tensor(1.0))
        self.n[-1, :last_bz] = (1 - marker) * self.neglib + marker * self.poslib
        # NOTE(review): padding rows of the last batch keep n == 0, so -log
        # yields inf there; _cll slices to the real batch size, so the
        # padding is never read — confirm if _cll's slicing ever changes.
        self.n = -torch.log(self.n)
        self.p[-1, :last_bz] = torch.where(batches[-1] == -1, torch.tensor(0.0), torch.tensor(1.0))
        return batches

    def _get_regularization_loss(self):
        """Gets the value of the regularization loss for the current values of
        the model's parameters

        :return: regularization loss (this model uses none)
        """
        return 0.0

    def _get_norm_class_balance(self):
        # Log-softmax of the class-balance parameters
        return self.class_balance - torch.logsumexp(self.class_balance, dim=0)

    def _cll(self, votes, bid):
        """Calculates the class-conditioned log-likelihood for batched votes.

        :param votes: current batch of votes
        :param bid: batch id for current votes

        :return: 2-d tensor of class-conditioned log-likelihoods for the
                 given votes and batch index
        """
        num_inst = votes.shape[0]

        # Normalizer for accuracies: logsumexp(acc, -acc) per (PLF, class)
        za = self.accuracy.unsqueeze(2)
        za = torch.cat((za, -1 * za), dim=2)
        za = -torch.logsumexp(za, dim=2).unsqueeze(0).repeat(num_inst, 1, 1)

        # Normalizer for propensities: logsumexp(prop, 0) per PLF
        z_plh = torch.zeros((self.num_df, 1)).to(self.device)
        zp = self.propensity.unsqueeze(1)
        zp = torch.cat((zp, z_plh), dim=1)
        zp = -torch.logsumexp(zp, dim=1).unsqueeze(0).unsqueeze(-1).repeat(num_inst, 1, self.num_classes)

        cp = self.propensity.unsqueeze(0).unsqueeze(-1).repeat(num_inst, 1, self.num_classes)
        ca = self.accuracy.unsqueeze(0).repeat(num_inst, 1, 1)
        ab = self.a[bid][:num_inst].to(self.device)
        cc = self.c[bid][:num_inst].to(self.device)
        cn = self.n[bid][:num_inst].to(self.device)

        # Abstentions (ab == 0) contribute only the propensity normalizer zp
        cll = torch.sum(((ca * cc + cn + cp + za) * ab) + zp, dim=1)
        return cll

    def _create_minibatches(self, votes, batch_size, shuffle_rows=False):
        """Creates (optionally shuffled) batched votes for parallelized
        estimation.

        :param votes: full PLFs votes input (already 0-indexed)
        :param batch_size: number of instances in one batch
        :param shuffle_rows: whether rows of the given votes should be shuffled

        :return: list of LongTensors of shape [<= batch_size, num_df]
        """
        if shuffle_rows:
            index = np.arange(np.shape(votes)[0])
            np.random.shuffle(index)
            votes = votes[index, :]

        batches = [
            torch.LongTensor(votes[i * batch_size: (i + 1) * batch_size, :].astype(np.int32))
            for i in range(int(np.ceil(votes.shape[0] / batch_size)))
        ]

        return batches
# Packaging metadata for the labelmodels distribution.
from setuptools import setup, find_packages

_METADATA = dict(
    name='labelmodels',
    version='0.0.1',
    url='https://github.com/BatsResearch/labelmodels.git',
    author='Shiying Luo, Stephen Bach',
    author_email='shiying_luo@brown.edu, sbach@cs.brown.edu',
    description='Lightweight implementations of generative label models for '
                'weakly supervised machine learning',
    # Every package directory containing an __init__.py
    packages=find_packages(),
    install_requires=['numpy >= 1.11', 'scipy >= 1.1', 'torch >= 1.4'],
)

setup(**_METADATA)
.5], [.3, .7]]) 28 | 29 | labels_train, seq_starts_train, gold_train = _generate_data( 30 | 1000, 8, 12, n, accuracies, propensities, start_balance, transitions 31 | ) 32 | 33 | model = HMM(k, n, acc_prior=0.0, balance_prior=0.0) 34 | model.estimate_label_model(labels_train, seq_starts_train) 35 | 36 | for i in range(n): 37 | for j in range(k): 38 | diff = accuracies[i, j] - model.get_accuracies()[i, j] 39 | self.assertAlmostEqual(diff, 0.0, places=1) 40 | for i in range(n): 41 | diff = propensities[i] - model.get_propensities()[i] 42 | self.assertAlmostEqual(diff, 0.0, places=1) 43 | for i in range(k): 44 | diff = start_balance[i] - model.get_start_balance()[i] 45 | self.assertAlmostEqual(diff, 0.0, places=1) 46 | for i in range(k): 47 | for j in range(k): 48 | diff = transitions[i, j] - model.get_transition_matrix()[i, j] 49 | self.assertAlmostEqual(diff, 0.0, places=1) 50 | 51 | def test_estimate_label_model_multiclass(self): 52 | n = 5 53 | k = 3 54 | 55 | accuracies = np.array([[.9, .8, .9], 56 | [.6, .7, .9], 57 | [.6, .6, .9], 58 | [.7, .6, .9], 59 | [.8, .8, .9]]) 60 | propensities = np.array([.9] * n) 61 | start_balance = np.array([.3, .3, .4]) 62 | transitions = np.array([[.5, .3, .2], 63 | [.3, .4, .3], 64 | [.2, .5, .3]]) 65 | 66 | labels_train, seq_starts_train, gold_train = _generate_data( 67 | 1000, 8, 12, n, accuracies, propensities, start_balance, transitions 68 | ) 69 | 70 | model = HMM(k, n, acc_prior=0.0, balance_prior=0.0) 71 | model.estimate_label_model(labels_train, seq_starts_train) 72 | 73 | for i in range(n): 74 | for j in range(k): 75 | diff = accuracies[i, j] - model.get_accuracies()[i, j] 76 | self.assertAlmostEqual(diff, 0.0, places=1) 77 | for i in range(n): 78 | diff = propensities[i] - model.get_propensities()[i] 79 | self.assertAlmostEqual(diff, 0.0, places=1) 80 | for i in range(k): 81 | diff = start_balance[i] - model.get_start_balance()[i] 82 | self.assertAlmostEqual(diff, 0.0, places=1) 83 | for i in range(k): 84 | for j in 
range(k): 85 | diff = transitions[i, j] - model.get_transition_matrix()[i, j] 86 | self.assertAlmostEqual(diff, 0.0, places=1) 87 | 88 | def test_get_most_probable_labels(self): 89 | m = 500 90 | n = 10 91 | k = 3 92 | 93 | model = HMM(k, n, acc_prior=0.0) 94 | with torch.no_grad(): 95 | model.start_balance[0] = 0 96 | model.start_balance[1] = 0.5 97 | for i in range(n): 98 | model.propensity[i] = 2 99 | for j in range(k): 100 | model.accuracy[i, j] = 2 101 | for i in range(k): 102 | for j in range(k): 103 | model.transitions[i, j] = 1 if i == j else 0 104 | 105 | labels_train, seq_starts_train, gold_train = _generate_data( 106 | m, 8, 12, n, 107 | model.get_accuracies(), 108 | model.get_propensities(), 109 | model.get_start_balance(), 110 | model.get_transition_matrix()) 111 | 112 | predictions = model.get_most_probable_labels(labels_train, seq_starts_train) 113 | correct = 0 114 | for i in range(len(predictions)): 115 | if predictions[i] == gold_train[i]: 116 | correct += 1 117 | accuracy = correct / float(len(predictions)) 118 | self.assertGreaterEqual(accuracy, .95) 119 | 120 | def test_get_label_distribution(self): 121 | m = 500 122 | n = 10 123 | k = 3 124 | 125 | model = HMM(k, n, acc_prior=0.0) 126 | with torch.no_grad(): 127 | model.start_balance[0] = 0 128 | model.start_balance[1] = 0.5 129 | for i in range(n): 130 | model.propensity[i] = 2 131 | for j in range(k): 132 | model.accuracy[i, j] = 2 133 | for i in range(k): 134 | for j in range(k): 135 | model.transitions[i, j] = 1 if i == j else 0 136 | 137 | labels_train, seq_starts_train, gold_train = _generate_data( 138 | m, 8, 12, n, 139 | model.get_accuracies(), 140 | model.get_propensities(), 141 | model.get_start_balance(), 142 | model.get_transition_matrix()) 143 | 144 | p_unary, p_pairwise = model.get_label_distribution( 145 | labels_train, seq_starts_train) 146 | 147 | # Makes predictions using both unary and pairwise marginals 148 | pred_unary = np.argmax(p_unary, axis=1) + 1 149 | pred_pairwise = 
np.zeros((labels_train.shape[0],), dtype=np.int32) 150 | next_seq = 0 151 | for i in range(labels_train.shape[0] - 1): 152 | if next_seq == len(seq_starts_train) or i < seq_starts_train[next_seq] - 1: 153 | # i is neither the start nor end of a sequence 154 | pred_pairwise[i+1] = np.argmax(p_pairwise[i][pred_pairwise[i]]) 155 | elif i == seq_starts_train[next_seq]: 156 | # i is the start of a sequence 157 | a, b = np.unravel_index(p_pairwise[i].argmax(), (k, k)) 158 | pred_pairwise[i], pred_pairwise[i + 1] = a, b 159 | next_seq += 1 160 | else: 161 | # i is the end of a sequence 162 | pass 163 | pred_pairwise += 1 164 | 165 | # Checks that predictions are accurate 166 | for predictions in (pred_unary, pred_pairwise): 167 | correct = 0 168 | for i in range(len(predictions)): 169 | if predictions[i] == gold_train[i]: 170 | correct += 1 171 | accuracy = correct / float(len(predictions)) 172 | self.assertGreaterEqual(accuracy, .95) 173 | 174 | ##### ensure marginalization of p_pairwise matches p_unary 175 | # create a simple and trained HMM. 
176 | n = 5 177 | k = 2 178 | 179 | accuracies = np.array([[.9, .8], 180 | [.6, .7], 181 | [.6, .6], 182 | [.7, .6], 183 | [.8, .8]]) 184 | propensities = np.array([.9] * n) 185 | start_balance = np.array([.3, .7]) 186 | transitions = np.array([[.5, .5], [.3, .7]]) 187 | 188 | labels_train, seq_starts_train, gold_train = _generate_data( 189 | 10, 8, 12, n, accuracies, propensities, start_balance, transitions 190 | ) 191 | 192 | model = HMM(k, n, acc_prior=0.0, balance_prior=0.0) 193 | model.estimate_label_model(labels_train, seq_starts_train) 194 | 195 | # get unary and pairwise marginals 196 | p_unary, p_pairwise = model.get_label_distribution(labels_train, seq_starts_train) 197 | 198 | # marginalize pairwise over t + 1 199 | for un, pa in zip(p_unary, np.sum(p_pairwise, axis= 2)): 200 | if np.sum(pa)==0: # last pairwise_marginal is empty because it does not have t + 1 transition 201 | continue 202 | for i in range(un.shape[0]): 203 | self.assertAlmostEqual(un[i], pa[i], places=3) 204 | 205 | # marginalize pairwise over t 206 | for i, pa in enumerate(np.sum(p_pairwise, axis= 1)): 207 | if i + 1 >= len(p_unary) or i + 1 in seq_starts_train: 208 | # skip if p_unary[i + 1[ goes into new sequence 209 | continue 210 | un = p_unary[i + 1] 211 | for j in range(un.shape[0]): 212 | self.assertAlmostEqual(un[j], pa[j], places=3) 213 | 214 | 215 | def _generate_data(num_seqs, min_seq, max_seq, num_lfs, accuracies, 216 | propensities, start_balance, transitions): 217 | # Generates sequence starts 218 | seq_starts = np.zeros((num_seqs,), dtype=np.int32) 219 | total_len = 0 220 | for i in range(num_seqs): 221 | seq_len = np.random.randint(min_seq, max_seq + 1) 222 | total_len += seq_len 223 | if i + 1 < num_seqs: 224 | seq_starts[i + 1] = total_len 225 | 226 | # Generates sequences of gold labels 227 | gold = np.zeros((total_len,), dtype=np.int32) 228 | next_start = 0 229 | for i in range(total_len): 230 | if next_start < len(seq_starts) and i == seq_starts[next_start]: 231 | 
balance = start_balance 232 | next_start += 1 233 | else: 234 | balance = np.squeeze(transitions[gold[i - 1] - 1]) 235 | 236 | gold[i] = np.argmax(np.random.multinomial(1, balance)) + 1 237 | 238 | # Generates labeling function outputs conditioned on gold labels 239 | row = [] 240 | col = [] 241 | val = [] 242 | for i in range(total_len): 243 | for j in range(num_lfs): 244 | if np.random.random() < propensities[j]: 245 | row.append(i) 246 | col.append(j) 247 | if np.random.random() < accuracies[j, gold[i] - 1]: 248 | val.append(gold[i]) 249 | else: 250 | p_mistake = 1 / (len(start_balance) - 1) 251 | dist = [p_mistake] * (len(start_balance) + 1) 252 | dist[0] = 0 253 | dist[gold[i]] = 0 254 | val.append(np.argmax(np.random.multinomial(1, dist))) 255 | 256 | labels = sparse.coo_matrix((val, (row, col)), shape=(total_len, num_lfs)) 257 | 258 | return labels, seq_starts, gold 259 | 260 | 261 | if __name__ == '__main__': 262 | unittest.main() 263 | -------------------------------------------------------------------------------- /test/test_linked_hmm.py: -------------------------------------------------------------------------------- 1 | from labelmodels import LinkedHMM, LearningConfig 2 | import numpy as np 3 | from scipy import sparse 4 | import torch 5 | import unittest 6 | 7 | 8 | class TestLinkedHMM(unittest.TestCase): 9 | 10 | def setUp(self): 11 | np.random.seed(0) 12 | 13 | def tearDown(self): 14 | pass 15 | 16 | def test_estimate_label_model_binary(self): 17 | n1 = 5 18 | n2 = 3 19 | k = 2 20 | 21 | label_accuracies = np.array([[.9, .8], 22 | [.6, .7], 23 | [.6, .6], 24 | [.7, .6], 25 | [.8, .8]]) 26 | link_accuracies = np.array([.8, .6, .8]) 27 | label_propensities = np.array([.9] * n1) 28 | link_propensities = np.array([.9] * n1) 29 | start_balance = np.array([.3, .7]) 30 | transitions = np.array([[.5, .5], [.3, .7]]) 31 | 32 | labels, links, seq_starts, gold = _generate_data( 33 | 1000, 8, 12, n1, n2, 34 | label_accuracies, 35 | link_accuracies, 36 | 
def test_estimate_label_model_multiclass(self):
    """Checks that LinkedHMM recovers the generating parameters of a
    three-class problem.

    Fixes: link_propensities previously had length n1 (number of labeling
    functions) instead of n2 (number of linking functions), and the link
    accuracy check ran each assertion k times inside a j-independent loop.
    """
    n1 = 5
    n2 = 3
    k = 3

    label_accuracies = np.array([[.9, .8, .5],
                                 [.6, .7, .3],
                                 [.6, .6, .8],
                                 [.7, .6, .6],
                                 [.8, .8, .9]])
    link_accuracies = np.array([.8, .6, .8])
    label_propensities = np.array([.9] * n1)
    # One propensity per *linking* function (was erroneously n1)
    link_propensities = np.array([.9] * n2)
    start_balance = np.array([.3, .3, .4])
    transitions = np.array([[.5, .3, .2],
                            [.4, .3, .3],
                            [.3, .3, .4]])

    labels, links, seq_starts, gold = _generate_data(
        1000, 8, 12, n1, n2,
        label_accuracies,
        link_accuracies,
        label_propensities,
        link_propensities,
        start_balance,
        transitions
    )

    model = LinkedHMM(k, n1, n2, acc_prior=0.0, balance_prior=0.0)
    config = LearningConfig()
    config.epochs = 4
    model.estimate_label_model(labels, links, seq_starts, config=config)

    for i in range(n1):
        for j in range(k):
            diff = label_accuracies[i, j] - model.get_accuracies()[i, j]
            self.assertAlmostEqual(diff, 0.0, places=1)
    for i in range(n2):
        diff = link_accuracies[i] - model.get_link_accuracies()[i]
        self.assertAlmostEqual(diff, 0.0, places=1)
    for i in range(n1):
        diff = label_propensities[i] - model.get_propensities()[i]
        self.assertAlmostEqual(diff, 0.0, places=1)
    for i in range(n2):
        diff = link_propensities[i] - model.get_link_propensities()[i]
        self.assertAlmostEqual(diff, 0.0, places=1)
    for i in range(k):
        diff = start_balance[i] - model.get_start_balance()[i]
        self.assertAlmostEqual(diff, 0.0, places=1)
    for i in range(k):
        for j in range(k):
            diff = transitions[i, j] - model.get_transition_matrix()[i, j]
            self.assertAlmostEqual(diff, 0.0, places=1)
0 156 | for i in range(len(predictions)): 157 | if predictions[i] == gold[i]: 158 | correct += 1 159 | accuracy = correct / float(len(predictions)) 160 | self.assertGreaterEqual(accuracy, .95) 161 | 162 | def test_get_label_distribution(self): 163 | m = 500 164 | n1 = 3 165 | n2 = 5 166 | k = 3 167 | 168 | model = LinkedHMM(k, n1, n2) 169 | with torch.no_grad(): 170 | model.start_balance[0] = 0 171 | model.start_balance[1] = 0.5 172 | for i in range(n1): 173 | model.propensity[i] = 0 174 | for j in range(k): 175 | model.accuracy[i, j] = 1 176 | for i in range(n2): 177 | model.link_propensity[i] = 0 178 | model.link_accuracy[i] = 1.5 179 | for i in range(k): 180 | for j in range(k): 181 | model.transitions[i, j] = 1 if i == j else 0 182 | 183 | labels, links, seq_starts, gold = _generate_data( 184 | m, 8, 12, n1, n2, 185 | model.get_label_accuracies(), 186 | model.get_link_accuracies(), 187 | model.get_label_propensities(), 188 | model.get_link_propensities(), 189 | model.get_start_balance(), 190 | model.get_transition_matrix()) 191 | 192 | p_unary, p_pairwise = model.get_label_distribution( 193 | labels, links, seq_starts) 194 | 195 | # Makes predictions using both unary and pairwise marginals 196 | pred_unary = np.argmax(p_unary, axis=1) + 1 197 | pred_pairwise = np.zeros((labels.shape[0],), dtype=np.int32) 198 | next_seq = 0 199 | for i in range(labels.shape[0] - 1): 200 | if next_seq == len(seq_starts) or i < seq_starts[next_seq] - 1: 201 | # i is neither the start nor end of a sequence 202 | pred_pairwise[i+1] = np.argmax(p_pairwise[i][pred_pairwise[i]]) 203 | elif i == seq_starts[next_seq]: 204 | # i is the start of a sequence 205 | a, b = np.unravel_index(p_pairwise[i].argmax(), (k, k)) 206 | pred_pairwise[i], pred_pairwise[i + 1] = a, b 207 | next_seq += 1 208 | else: 209 | # i is the end of a sequence 210 | pass 211 | pred_pairwise += 1 212 | 213 | # Checks that predictions are accurate 214 | for predictions in (pred_unary, pred_pairwise): 215 | correct = 0 
216 | for i in range(len(predictions)): 217 | if predictions[i] == gold[i]: 218 | correct += 1 219 | accuracy = correct / float(len(predictions)) 220 | self.assertGreaterEqual(accuracy, .95) 221 | 222 | ##### ensure marginalization of p_pairwise matches p_unary 223 | # create a simple and trained linkHMM. 224 | n1 = 5 225 | n2 = 3 226 | k = 2 227 | 228 | label_accuracies = np.array([[.9, .8], 229 | [.6, .7], 230 | [.6, .6], 231 | [.7, .6], 232 | [.8, .8]]) 233 | link_accuracies = np.array([.8, .6, .8]) 234 | label_propensities = np.array([.9] * n1) 235 | link_propensities = np.array([.9] * n1) 236 | start_balance = np.array([.3, .7]) 237 | transitions = np.array([[.5, .5], [.3, .7]]) 238 | 239 | labels, links, seq_starts, gold = _generate_data( 240 | 10, 8, 12, n1, n2, 241 | label_accuracies, 242 | link_accuracies, 243 | label_propensities, 244 | link_propensities, 245 | start_balance, 246 | transitions 247 | ) 248 | model = LinkedHMM(k, n1, n2, acc_prior=0.0, balance_prior=0.0) 249 | config = LearningConfig() 250 | config.epochs = 3 251 | model.estimate_label_model(labels, links, seq_starts, config=config) 252 | 253 | # get unary and pairwise marginals 254 | p_unary, p_pairwise = model.get_label_distribution(labels, links, seq_starts) 255 | 256 | # marginalize pairwise over t + 1 257 | for un, pa in zip(p_unary, np.sum(p_pairwise, axis= 2)): 258 | if np.sum(pa)==0: # last pairwise_marginal is empty because it does not have t + 1 transition 259 | continue 260 | for i in range(un.shape[0]): 261 | self.assertAlmostEqual(un[i], pa[i], places=3) 262 | 263 | # marginalize pairwise over t 264 | for i, pa in enumerate(np.sum(p_pairwise, axis= 1)): 265 | if i + 1 >= len(p_unary) or i + 1 in seq_starts: 266 | # skip if p_unary[i + 1[ goes into new sequence 267 | continue 268 | un = p_unary[i + 1] 269 | for j in range(un.shape[0]): 270 | self.assertAlmostEqual(un[j], pa[j], places=3) 271 | 272 | 273 | def _generate_data(num_seqs, min_seq, max_seq, num_label_funcs, 
num_link_funcs, 274 | label_accs, link_accs, label_propensities, link_propensities, 275 | start_balance, transitions): 276 | # Generates sequence starts 277 | seq_starts = np.zeros((num_seqs,), dtype=np.int32) 278 | total_len = 0 279 | for i in range(num_seqs): 280 | seq_len = np.random.randint(min_seq, max_seq + 1) 281 | total_len += seq_len 282 | if i + 1 < num_seqs: 283 | seq_starts[i + 1] = total_len 284 | 285 | # Generates sequences of gold labels 286 | gold = np.zeros((total_len,), dtype=np.int32) 287 | next_start = 0 288 | for i in range(total_len): 289 | if next_start < len(seq_starts) and i == seq_starts[next_start]: 290 | balance = start_balance 291 | next_start += 1 292 | else: 293 | balance = np.squeeze(transitions[gold[i - 1] - 1]) 294 | 295 | gold[i] = np.argmax(np.random.multinomial(1, balance)) + 1 296 | 297 | # Generates labeling function outputs conditioned on gold labels 298 | row = [] 299 | col = [] 300 | val = [] 301 | for i in range(total_len): 302 | for j in range(num_label_funcs): 303 | if np.random.random() < label_propensities[j]: 304 | row.append(i) 305 | col.append(j) 306 | if np.random.random() < label_accs[j, gold[i] - 1]: 307 | val.append(gold[i]) 308 | else: 309 | p_mistake = 1 / (len(start_balance) - 1) 310 | dist = [p_mistake] * (len(start_balance) + 1) 311 | dist[0] = 0 312 | dist[gold[i]] = 0 313 | val.append(np.argmax(np.random.multinomial(1, dist))) 314 | 315 | labels = sparse.coo_matrix((val, (row, col)), shape=(total_len, num_label_funcs)) 316 | 317 | # Generates linking function outputs conditioned on gold labels 318 | row = [] 319 | col = [] 320 | val = [] 321 | next_seq = 0 322 | for i in range(total_len): 323 | if next_seq < len(seq_starts) and i == seq_starts[next_seq]: 324 | next_seq += 1 325 | else: 326 | for j in range(num_link_funcs): 327 | if np.random.random() < link_propensities[j]: 328 | row.append(i) 329 | col.append(j) 330 | if np.random.random() < link_accs[j]: 331 | val.append(1 if gold[i-1] == gold[i] else 
-1) 332 | else: 333 | val.append(-1 if gold[i-1] == gold[i] else 1) 334 | 335 | links = sparse.coo_matrix((val, (row, col)), shape=(total_len, num_link_funcs)) 336 | 337 | return labels, links, seq_starts, gold 338 | 339 | 340 | if __name__ == '__main__': 341 | unittest.main() 342 | -------------------------------------------------------------------------------- /test/test_naive_bayes.py: -------------------------------------------------------------------------------- 1 | from labelmodels import NaiveBayes 2 | import numpy as np 3 | from scipy import sparse 4 | import util 5 | import torch 6 | import unittest 7 | 8 | 9 | class TestNaiveBayes(unittest.TestCase): 10 | 11 | def setUp(self): 12 | np.random.seed(0) 13 | 14 | def tearDown(self): 15 | pass 16 | 17 | def test_estimate_label_model_binary(self): 18 | m = 25000 19 | n = 5 20 | accuracies = np.array([[.9, .8], 21 | [.6, .7], 22 | [.5, .5], 23 | [.7, .6], 24 | [.8, .8]]) 25 | propensities = np.array([.3] * n) 26 | class_balance = np.array([.4, .6]) 27 | 28 | labels_train, gold_train = _generate_data( 29 | m, n, accuracies, propensities, class_balance) 30 | 31 | model = NaiveBayes(2, n, acc_prior=0.0, balance_prior=0.0) 32 | model.estimate_label_model(labels_train) 33 | 34 | for j in range(n): 35 | for k in range(2): 36 | diff = accuracies[j, k] - model.get_accuracies()[j, k] 37 | self.assertAlmostEqual(diff, 0.0, places=1) 38 | for j in range(n): 39 | diff = propensities[j] - model.get_propensities()[j] 40 | self.assertAlmostEqual(diff, 0.0, places=1) 41 | for k in range(len(class_balance)): 42 | diff = class_balance[k] - model.get_class_balance()[k] 43 | self.assertAlmostEqual(diff, 0.0, places=1) 44 | 45 | def test_estimate_label_model_multiclass(self): 46 | m = 25000 47 | n = 5 48 | accuracies = np.array([[.9, .8, .5], 49 | [.6, .7, .5], 50 | [.5, .5, .9], 51 | [.7, .6, .7], 52 | [.8, .8, .7]]) 53 | propensities = np.array([.2] * n) 54 | class_balance = np.array([.3, .4, .3]) 55 | 56 | labels_train, 
gold_train = _generate_data( 57 | m, n, accuracies, propensities, class_balance) 58 | 59 | model = NaiveBayes(3, n, acc_prior=0.0, balance_prior=0.0) 60 | model.estimate_label_model(labels_train) 61 | 62 | for j in range(n): 63 | for k in range(2): 64 | diff = accuracies[j, k] - model.get_accuracies()[j, k] 65 | self.assertAlmostEqual(diff, 0.0, places=1) 66 | for j in range(n): 67 | diff = propensities[j] - model.get_propensities()[j] 68 | self.assertAlmostEqual(diff, 0.0, places=1) 69 | for k in range(len(class_balance)): 70 | diff = class_balance[k] - model.get_class_balance()[k] 71 | self.assertAlmostEqual(diff, 0.0, places=1) 72 | 73 | def test_get_most_probable_labels_binary(self): 74 | m = 10000 75 | n = 5 76 | k = 2 77 | 78 | model = NaiveBayes(k, n) 79 | with torch.no_grad(): 80 | model.class_balance[0] = 0 81 | model.class_balance[1] = 0.5 82 | for i in range(n): 83 | model.propensity[i] = 2 84 | for j in range(k): 85 | model.accuracy[i, j] = 2 86 | 87 | labels_train, gold_train = _generate_data( 88 | m, n, 89 | model.get_accuracies(), 90 | model.get_propensities(), 91 | model.get_class_balance()) 92 | 93 | # Checks label inference 94 | labels = model.get_most_probable_labels(labels_train) 95 | correct = 0 96 | for i in range(m): 97 | if gold_train[i] == labels[i]: 98 | correct += 1 99 | 100 | self.assertGreater(float(correct) / m, .95) 101 | 102 | def test_get_most_probable_labels_multiclass(self): 103 | m = 10000 104 | n = 5 105 | k = 3 106 | 107 | model = NaiveBayes(k, n) 108 | with torch.no_grad(): 109 | model.class_balance[0] = 0 110 | model.class_balance[1] = 0.5 111 | model.class_balance[2] = 0.5 112 | for i in range(n): 113 | model.propensity[i] = 2 114 | for j in range(k): 115 | model.accuracy[i, j] = 2 116 | 117 | labels_train, gold_train = _generate_data( 118 | m, n, 119 | model.get_accuracies(), 120 | model.get_propensities(), 121 | model.get_class_balance()) 122 | 123 | # Checks label inference 124 | labels = 
model.get_most_probable_labels(labels_train) 125 | correct = 0 126 | for i in range(m): 127 | if gold_train[i] == labels[i]: 128 | correct += 1 129 | 130 | self.assertGreater(float(correct) / m, .95) 131 | 132 | def test_estimate_model_input_formats(self): 133 | m = 1000 134 | n = 3 135 | 136 | accuracies = np.array([[.8, .8, .8], 137 | [.8, .8, .8], 138 | [.8, .8, .8]]) 139 | propensities = np.array([.5] * n) 140 | class_balance = np.array([.1, .1, .8]) 141 | 142 | labels_train, _ = _generate_data( 143 | m, n, accuracies, propensities, class_balance) 144 | 145 | # Trains the model on the generated data 146 | model = NaiveBayes(3, n) 147 | model.estimate_label_model(labels_train) 148 | accuracies = model.get_accuracies() 149 | propensities = model.get_propensities() 150 | class_balance = model.get_class_balance() 151 | 152 | # Checks that other input formats work and do not change the results 153 | for data in util.get_all_formats(labels_train): 154 | model = NaiveBayes(3, n) 155 | model.estimate_label_model(data) 156 | diff = np.sum(np.abs(accuracies - model.get_accuracies())) 157 | self.assertAlmostEqual(float(diff), 0.0) 158 | diff = np.sum(np.abs(propensities - model.get_propensities())) 159 | self.assertAlmostEqual(float(diff), 0.0) 160 | diff = np.sum(np.abs(class_balance - model.get_class_balance())) 161 | self.assertAlmostEqual(float(diff), 0.0) 162 | 163 | def test_get_label_input_formats(self): 164 | m = 1000 165 | n = 3 166 | 167 | accuracies = np.array([[.8, .8, .8], 168 | [.8, .8, .8], 169 | [.8, .8, .8]]) 170 | propensities = np.array([.5] * n) 171 | class_balance = np.array([.1, .1, .8]) 172 | 173 | labels_train, _ = _generate_data( 174 | m, n, accuracies, propensities, class_balance) 175 | 176 | # Gets the label distribution for the generated data 177 | model = NaiveBayes(3, n, init_acc=0.8) 178 | distribution = model.get_label_distribution(labels_train) 179 | 180 | # Checks that other input formats work and do not change the results 181 | for data 
in util.get_all_formats(labels_train): 182 | model = NaiveBayes(3, n, init_acc=0.8) 183 | new_distribution = model.get_label_distribution(data) 184 | diff = np.sum(np.abs(distribution - new_distribution)) 185 | self.assertAlmostEqual(float(diff), 0.0) 186 | 187 | 188 | def _generate_data(m, n, accuracies, propensities, class_balance): 189 | gold = np.zeros((m,), dtype=np.int16) 190 | row = [] 191 | col = [] 192 | val = [] 193 | 194 | for i in range(m): 195 | k = np.argmax(np.random.multinomial(1, class_balance)) 196 | gold[i] = k + 1 197 | for j in range(n): 198 | if np.random.random() < propensities[j]: 199 | row.append(i) 200 | col.append(j) 201 | if np.random.random() < accuracies[j, k]: 202 | val.append(gold[i]) 203 | else: 204 | p_mistake = 1 / (len(class_balance) - 1) 205 | dist = [p_mistake] * (len(class_balance) + 1) 206 | dist[0] = 0 207 | dist[gold[i]] = 0 208 | val.append(np.argmax(np.random.multinomial(1, dist))) 209 | 210 | labels = sparse.coo_matrix((val, (row, col)), shape=(m, n)) 211 | return labels, gold 212 | 213 | 214 | if __name__ == '__main__': 215 | unittest.main() 216 | -------------------------------------------------------------------------------- /test/test_partial_labels.py: -------------------------------------------------------------------------------- 1 | from labelmodels import PartialLabelModel 2 | import numpy as np 3 | from scipy import sparse 4 | import torch 5 | import unittest 6 | from random import sample 7 | from copy import deepcopy as dc 8 | 9 | 10 | class Experiment: 11 | def __init__(self, name, num_classes, label_partition, lm_annot_votes, lm_train_votes, lm_annot_labels=None): 12 | self.name = name 13 | self.label_partition = label_partition 14 | self.lm_annot_votes = lm_annot_votes 15 | self.lm_train_votes = lm_train_votes 16 | self.lm_annot_labels = lm_annot_labels 17 | self.num_classes = num_classes 18 | self.num_df = len(label_partition) 19 | 20 | def set_soft_labels(self, soft_labels): 21 | pass 22 | 23 | 24 | class 
LMTask: 25 | def __init__(self, name, num_classes, label_partition, preset_classbalance=None, device='cuda:0'): 26 | self.name = name 27 | 28 | self.labelmodel = PartialLabelModel(num_classes=num_classes, 29 | label_partition=label_partition, 30 | preset_classbalance=preset_classbalance, 31 | device=device) 32 | 33 | def annotate(self, lm_annot_votes, lm_train_votes=None): 34 | if lm_train_votes is not None: 35 | self.labelmodel.estimate_label_model(lm_train_votes) 36 | return self.labelmodel.estimate_label_model(lm_annot_votes) 37 | 38 | def get_accuracy(self): 39 | return self.labelmodel.get_accuracies() 40 | 41 | def get_propensity(self): 42 | return self.labelmodel.get_propensities() 43 | 44 | def get_class_balance(self): 45 | return self.labelmodel.get_class_balance() 46 | 47 | 48 | def Workflow(experimental_data): 49 | lm_task = LMTask('test', num_classes=experimental_data.num_classes, 50 | label_partition=experimental_data.label_partition, 51 | device='cuda:0') 52 | 53 | lm_annot_soft_labels = lm_task.annotate(lm_annot_votes=experimental_data.lm_annot_votes, 54 | lm_train_votes=experimental_data.lm_train_votes) 55 | output = [] 56 | output.append(lm_task.get_accuracy()) 57 | output.append(lm_task.get_class_balance()) 58 | output.append(lm_task.get_propensity()) 59 | return output 60 | 61 | 62 | def setup(): 63 | simple_label_partition = { 64 | 0: [[1], [2, 3]], 65 | 1: [[2], [1, 3]], 66 | 2: [[3], [1, 2]] 67 | } 68 | num_sources = len(simple_label_partition) 69 | num_classes = 4 70 | num_annot_inst = 4096 * 16 71 | 72 | labelmodel_annotation_votes = np.random.randint(2, size=(num_annot_inst, num_sources)) 73 | labelmodel_training_votes = labelmodel_annotation_votes 74 | labelmodel_annotation_labels = np.random.randint(num_classes, size=(num_annot_inst, 1)) + 1 75 | test_data = Experiment('simple-tests', 76 | num_classes=num_classes, 77 | label_partition=simple_label_partition, 78 | lm_annot_votes=labelmodel_annotation_votes, 79 | 
lm_train_votes=labelmodel_training_votes, 80 | lm_annot_labels=labelmodel_annotation_labels) 81 | 82 | return test_data 83 | 84 | 85 | def setup_test(label_partition, accuracies, class_balance, m=4096*8, abstention=None): 86 | votes, gold = _generate_data(m, label_partition, accuracies, class_balance, abstention=abstention) 87 | 88 | return votes, gold 89 | 90 | 91 | def close_estimation(model_acc, true_acc, thresh=0.05, verbose=True): 92 | # assert model_acc.shape == true_acc.shape 93 | res = torch.allclose(torch.Tensor(model_acc), torch.Tensor(true_acc), atol=thresh) 94 | if verbose: 95 | print(res) 96 | return res 97 | 98 | 99 | def actual_cb(gold): 100 | unique, counts = np.unique(gold, return_counts=True) 101 | return counts / sum(counts) 102 | 103 | 104 | class TestPartialLabelModel(unittest.TestCase): 105 | def test_general_accuracy_recovery_0(self): 106 | print('Testing Accuracy Recovery Rate for PLM - 0') 107 | true_cb_0 = [1 / 3, 1 / 3, 1 / 3] 108 | true_acc_0 = np.array( 109 | [[.8, .7, .6], 110 | [.75, .7, .7], 111 | [.5, .7, .65], 112 | [.8, .8, .75], 113 | [.9, .7, .8]]) 114 | label_partition = { 115 | 0: [[1], [2, 3]], 116 | 1: [[1], [2, 3]], 117 | 2: [[1, 2], [3]], 118 | 3: [[1, 2], [3]], 119 | 4: [[1, 3], [2]] 120 | } 121 | votes, gold = setup_test(label_partition, true_acc_0, true_cb_0) 122 | test_data_0 = Experiment('acc-tests-0', 123 | num_classes=3, 124 | label_partition=label_partition, 125 | lm_annot_votes=votes, 126 | lm_train_votes=votes, 127 | lm_annot_labels=gold) 128 | 129 | acc_0, cb_0, _ = Workflow(experimental_data=test_data_0) 130 | 131 | #print(acc_0 - true_acc_0) 132 | self.assertTrue(close_estimation(acc_0, true_acc_0)) 133 | self.assertTrue(close_estimation(cb_0, true_cb_0)) 134 | 135 | def test_general_accuracy_recovery_1(self): 136 | print('Testing Accuracy Recovery Rate for PLM - 1') 137 | true_cb_1 = [.5, .3, .2] 138 | true_acc_1 = np.array( 139 | [[.8, .7, .6], 140 | [.8, .7, .6], 141 | [.5, .9, .6], 142 | [.8, .7, .6], 143 
| [.9, .7, .6]]) 144 | label_partition = { 145 | 0: [[1], [2], [3]], 146 | 1: [[1], [2, 3]], 147 | 2: [[1, 2], [3]], 148 | 3: [[1], [2], [3]], 149 | 4: [[1], [2], [3]] 150 | } 151 | votes, gold = setup_test(label_partition, true_acc_1, true_cb_1) 152 | test_data_1 = Experiment('acc-tests-1', 153 | num_classes=3, 154 | label_partition=label_partition, 155 | lm_annot_votes=votes, 156 | lm_train_votes=votes, 157 | lm_annot_labels=gold) 158 | 159 | acc_1, cb_1, _ = Workflow(experimental_data=test_data_1) 160 | 161 | self.assertTrue(close_estimation(acc_1, true_acc_1)) 162 | self.assertTrue(close_estimation(cb_1, true_cb_1)) 163 | 164 | def test_general_accuracy_recovery_2(self): 165 | print('Testing Accuracy Recovery Rate for PLM - 2') 166 | true_cb_2 = [.4, .2, .4] 167 | true_acc_2 = np.array( 168 | [[.8, .7, .6], 169 | [.8, .8, .7], 170 | [.5, .7, .9], 171 | [.8, .6, .7], 172 | [.9, .7, .6], 173 | [.8, .5, .6]]) 174 | label_partition = { 175 | 0: [[1], [2], [3]], 176 | 1: [[2], [1, 3]], 177 | 2: [[1, 2], [3]], 178 | 3: [[1], [2], [3]], 179 | 4: [[1], [2], [3]], 180 | 5: [[3, 2], [1]] 181 | } 182 | votes, gold = setup_test(label_partition, true_acc_2, true_cb_2) 183 | 184 | test_data_2 = Experiment('acc-tests-2', 185 | num_classes=3, 186 | label_partition=label_partition, 187 | lm_annot_votes=votes, 188 | lm_train_votes=votes, 189 | lm_annot_labels=gold) 190 | 191 | acc_2, cb_2, _ = Workflow(experimental_data=test_data_2) 192 | print(acc_2 - true_acc_2) 193 | print(true_cb_2 - cb_2) 194 | self.assertTrue(close_estimation(acc_2, true_acc_2)) 195 | self.assertTrue(close_estimation(cb_2, true_cb_2)) 196 | 197 | def test_general_accuracy_recovery_3(self): 198 | print('Testing Accuracy Recovery Rate for PLM - 3 with Abstention') 199 | true_cb_3 = [.4, .2, .4] 200 | true_acc_3 = np.array( 201 | [[.8, .7, .6], 202 | [.8, .7, .7], 203 | [.5, .7, .9], 204 | [.8, .6, .7], 205 | [.9, .8, .6], 206 | [.8, .5, .8]]) 207 | label_partition = { 208 | 0: [[1], [2], [3]], 209 | 1: [[2], 
[1, 3]], 210 | 2: [[1, 2], [3]], 211 | 3: [[1], [2], [3]], 212 | 4: [[1], [2, 3]], 213 | 5: [[3, 2], [1]] 214 | } 215 | 216 | abstention = [0.8, 0.9, 0.8, 0.7, 0.8, 0.9] 217 | votes, gold = setup_test(label_partition, true_acc_3, true_cb_3, 218 | abstention=abstention) 219 | 220 | tv = dc(votes) 221 | tv[tv == -1] = 100 222 | tv[tv < 100] = 0 223 | tv = tv / 100 224 | est_abst = np.mean(tv, axis=0) 225 | # print(est_abst) 226 | 227 | test_data_3 = Experiment('acc-tests-2', 228 | num_classes=3, 229 | label_partition=label_partition, 230 | lm_annot_votes=votes, 231 | lm_train_votes=votes, 232 | lm_annot_labels=gold) 233 | acc_3, cb_3, prp_3 = Workflow(test_data_3) 234 | 235 | self.assertTrue(close_estimation(acc_3, true_acc_3)) 236 | self.assertTrue(close_estimation(cb_3, true_cb_3)) 237 | self.assertTrue(close_estimation(prp_3, abstention)) 238 | 239 | def test_general_accuracy_recovery_4(self): 240 | print('Testing Accuracy Recovery Rate for PLM - 4 with Abstention') 241 | true_cb_3 = [.4, .2, .4] 242 | true_acc_3 = np.array( 243 | [[.8, .7, .6], 244 | [.8, .6, .7], 245 | [.5, .7, .9], 246 | [.8, .6, .7], 247 | [.9, .8, .6], 248 | [.8, .5, .8]]) 249 | label_partition = { 250 | 0: [[1], [2], [3]], 251 | 1: [[2], [1, 3]], 252 | 2: [[1, 2], [3]], 253 | 3: [[1], [2], [3]], 254 | 4: [[1], [2, 3]], 255 | 5: [[3, 2], [1]] 256 | } 257 | 258 | abstention = [0.8, 0.9, 0.8, 0.7, 0.8, 0.9] 259 | votes, gold = setup_test(label_partition, true_acc_3, true_cb_3, 260 | abstention=abstention) 261 | 262 | tv = dc(votes) 263 | tv[tv == -1] = 100 264 | tv[tv < 100] = 0 265 | tv = tv / 100 266 | est_abst = np.mean(tv, axis=0) 267 | print(est_abst) 268 | 269 | test_data_3 = Experiment('acc-tests-4', 270 | num_classes=3, 271 | label_partition=label_partition, 272 | lm_annot_votes=votes, 273 | lm_train_votes=votes, 274 | lm_annot_labels=gold) 275 | acc_3, cb_3, prp_3 = Workflow(test_data_3) 276 | self.assertTrue(close_estimation(acc_3, true_acc_3)) 277 | 
self.assertTrue(close_estimation(cb_3, true_cb_3)) 278 | self.assertTrue(close_estimation(prp_3, abstention)) 279 | 280 | 281 | def _generate_data(m, label_partition, accuracies, class_balance, abstention=None): 282 | """ 283 | Generate synthetic data 284 | 285 | :param m: number of examples 286 | :param n: number of sources 287 | :param label_partition: feature id clustering 288 | :param accuracies: n x k matrix of accuracies, where k is number of classes 289 | :param class_balance: k-dim vector representing prior over classes 290 | :param abstention: n-dim vector representing prob of not abstention 291 | :return: m x n matrix of features, m-dim vector of gold class labels 292 | """ 293 | n = len(label_partition) 294 | gold = np.zeros((m,), dtype=np.int16) 295 | votes = np.zeros((m, n), dtype=np.int16) 296 | 297 | for i in range(m): 298 | k = np.argmax(np.random.multinomial(1, class_balance)) 299 | gold[i] = k + 1 300 | for j in range(n): 301 | # Collects correct and incorrect clusters 302 | correct = [] 303 | incorrect = [] 304 | for cid, cluster in enumerate(label_partition[j]): 305 | if k + 1 in cluster: 306 | correct.append(cid+1) 307 | else: 308 | incorrect.append(cid+1) 309 | if np.random.random() < accuracies[j, k]: 310 | votes[i, j] = np.random.choice(correct) 311 | else: 312 | votes[i, j] = np.random.choice(incorrect) 313 | 314 | if abstention is not None: 315 | for idx, prob in enumerate(abstention): 316 | votes[np.array(sample(range(m), int((1 - prob) * m))), idx] = 0 317 | 318 | return votes, gold 319 | 320 | 321 | if __name__ == '__main__': 322 | unittest.main() 323 | -------------------------------------------------------------------------------- /test/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | def get_all_formats(matrix): 6 | """Converts a scipy sparse matrix to a list of copies in formats that should 7 | be supported 8 | 9 | :param matrix: labeling 
function output matrix in a scipy sparse format 10 | :return: list of labeling function output matrices 11 | """ 12 | other_formats = [ 13 | matrix.todense(), 14 | matrix.todense().tolist(), 15 | matrix.todense().astype(np.float64), 16 | matrix.tocoo(), 17 | matrix.tocsc(), 18 | matrix.todia(), 19 | matrix.todok(), 20 | matrix.tolil(), 21 | torch.tensor(matrix.todense()) 22 | ] 23 | return other_formats 24 | --------------------------------------------------------------------------------