├── .gitignore ├── ALMa ├── __init__.py └── activelearningmanager.py ├── LICENSE ├── README.md ├── examples └── text_classification_with_modAL.py ├── requirements.txt ├── setup.py └── tests └── alma_tests.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | .idea/ 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | -------------------------------------------------------------------------------- /ALMa/__init__.py: -------------------------------------------------------------------------------- 1 | from ALMa.activelearningmanager import ActiveLearningManager 2 | -------------------------------------------------------------------------------- /ALMa/activelearningmanager.py: -------------------------------------------------------------------------------- 1 | import typing 2 | from typing import List, Tuple, Any, Union, Optional, Generic, TypeVar 3 | import numpy as np 4 | 5 | Label = Tuple[int, Union[np.int64,int,float,str]] 6 | LabelList = List[Label] 7 | Sources = List[Any] 8 | 9 | 10 | class ActiveLearningManager: 11 | def __init__( 12 | self, 13 | features: np.ndarray, 14 | labels_dtype: Optional[np.dtype] = None, 15 | sources: Optional[Sources] = None, 16 | ): 17 | """ 18 | 19 | When doing active learning we have our Original Data (OD) Labeled Data [LD] and Unlabeled Data [UD] 20 | where UD and LD are subsets of OD. 21 | The active learner operates on UD and returns indexes relative to it. We want to store those indices with respect 22 | to OD, and sometimes see the subset of labels of LD. (The subset of labels of UD is Null) 23 | 24 | That's a fancy way of saying there is a lot book keeping to be done and this class solves that by doing it for you 25 | 26 | The main idea is that we store a mask (labeeld_mask) of indices that have been labeled and then expose UD , LD 27 | and the labels by using fancy indexing with that mask. The manager exposes a an add_labels method which lets the 28 | user add labels indexed with respect to UD and it will adjust the indices so that they match OD. 29 | 30 | :param features: An array of the features that will be used for AL. 31 | :param labels: Any prexesiting labels. Each label is a tuple(idx,label) 32 | :param source: A list of the original data 33 | """ 34 | self.features = features 35 | 36 | self._labels = np.empty(shape=self.features.shape[0], dtype=labels_dtype) 37 | self.labeled_mask = np.zeros(self.features.shape[0], dtype=bool) 38 | self.sources = np.array(sources if sources else []) 39 | 40 | @property 41 | def labels(self): 42 | """ 43 | 44 | Returns the labels indexed with respect to LD 45 | 46 | """ 47 | return self._labels[self.labeled_mask] 48 | 49 | @property 50 | def unlabeled_mask(self): 51 | """ 52 | 53 | Returns: a mask which is true for all unlabeled points 54 | 55 | """ 56 | return np.logical_not(self.labeled_mask) 57 | 58 | def _update_masks(self, labels: Union[LabelList, Label]): 59 | for label in labels: 60 | self.labeled_mask[label[0]] = True 61 | 62 | def _offset_new_labels(self, labels_for_unlabeled_dataset: LabelList): 63 | """ 64 | This is where the magic happens. 65 | We take self.unlabeled_mask.nonzero()[0] which gives us an array of the indices that appear in the unlabeled 66 | data. So if the original label was at position 0 we look up the "real index" in the unlabeled_indices array to 67 | get it's true index 68 | :param labels_for_unlabeled_dataset A LabelList indexed according to the unlabeled dataset: 69 | :return:labels_for_dataset A LabelList indexed according to the original dataset 70 | """ 71 | if len(self._labels) == 0: 72 | # Nothing to correct in this case 73 | return labels_for_unlabeled_dataset 74 | labels_for_dataset: LabelList = [] 75 | unlabeled_indices_map = self.unlabeled_mask.nonzero()[0] 76 | 77 | for label in labels_for_unlabeled_dataset: 78 | index_in_unlabeled, annotation = label 79 | index_in_dataset = unlabeled_indices_map[index_in_unlabeled] 80 | new_label: Label = (index_in_dataset, annotation) 81 | labels_for_dataset.append(new_label) 82 | return labels_for_dataset 83 | 84 | def add_labels(self, labels: LabelList, offset_to_unlabeled=True): 85 | if isinstance(labels, tuple): # if this is a single example 86 | labels: LabelList = [labels] 87 | elif isinstance(labels, list): 88 | pass 89 | else: 90 | raise Exception( 91 | "Malformed input. Please add either a tuple (ix,label) or a list [(ix,label),..]" 92 | ) 93 | if offset_to_unlabeled: 94 | labels = self._offset_new_labels(labels) 95 | self._update_masks(labels) 96 | for label in labels: 97 | self._labels[label[0]] = label[1] 98 | 99 | @property 100 | def unlabeld(self): 101 | """ 102 | 103 | :return: Returns UD, all of the unlabeled data points 104 | """ 105 | return self.features[self.unlabeled_mask] 106 | 107 | @property 108 | def labeled(self): 109 | """ 110 | :return: Returns LD, all of the labeld data points 111 | """ 112 | return self.features[self.labeled_mask] 113 | 114 | @property 115 | def remaining_sources(self): 116 | """ 117 | 118 | :return: Returns the original data, as opposed to features, with respect to UD 119 | """ 120 | return self.sources[self.unlabeled_mask] 121 | 122 | def get_original_index_from_unlabeled_index(self, ixs: Union[int, List[int]]): 123 | """ 124 | Utility function that takes as input indices from the unlabeled subset and returns the equivalent indices 125 | in the complete array. 126 | Useful for testing purposes, where we have the existing labels and want to take them in the order in which 127 | the active learner specifes. 128 | :param ixs: 129 | :return: 130 | """ 131 | unlabeled_indices = self.unlabeled_mask.nonzero()[0] 132 | if isinstance(ixs, np.int64): 133 | ixs = [ixs] 134 | return list(map(lambda x: unlabeled_indices[x], ixs)) 135 | 136 | 137 | __all__ = [Label, LabelList, ActiveLearningManager] 138 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 LightTag 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ALMa - An Active Learning (data) Manager 2 | 3 | ALMa elimiates the need for bookkeeping when using Active Learning. Read the blog post 4 | on [Active Learning with ALMa](https://www.lighttag.io/blog/active-learning-manager/) 5 | 6 | Made with heart by LightTag - The Text Annotation Tool For Teams. 7 | We use ALMa to facilitate multi annotator active learning. Originally developed as a contribution for [Modal](https://github.com/modAL-python/modAL) 8 | but moved to it's own library 9 | 10 | 11 | ## Install 12 | ``` 13 | pip install ALMa 14 | ``` 15 | 16 | ## Use 17 | Check out the full [example for text classification](examples/text_classification_with_modAL.py) 18 | 19 | ```python 20 | from ALMa import ActiveLearningManager 21 | manager = ActiveLearningManager(my_featurized_data, sources=optional_original_data) 22 | learner = #...some active learning learner 23 | for index in range(N_QUERIES): 24 | index_to_label, query_instance = learner.query(manager.unlabeld) 25 | original_ix = manager.get_original_index_from_unlabeled_index(index_to_label) 26 | y = original_labels_train[original_ix] 27 | label = (index_to_label, y) 28 | manager.add_labels(labels) 29 | learner.teach(X=manager.labeled, y=manager.labels) 30 | 31 | ``` 32 | 33 | -------------------------------------------------------------------------------- /examples/text_classification_with_modAL.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example shows how to use the new data manager class. 3 | For clarity, all the setup has been moved into functions and 4 | the core is in the __main__ section which is commented 5 | 6 | Also look at prepare_manager to see how a DataManager is instantiated 7 | 8 | """ 9 | 10 | from sklearn.datasets import fetch_20newsgroups 11 | from sklearn.ensemble import RandomForestClassifier 12 | import numpy as np 13 | import matplotlib as mpl 14 | import matplotlib.pyplot as plt 15 | from sklearn.feature_extraction.text import TfidfVectorizer 16 | from functools import partial 17 | 18 | 19 | from modAL.models import ActiveLearner 20 | from modAL.batch import uncertainty_batch_sampling 21 | 22 | from ALMa import ActiveLearningManager 23 | 24 | RANDOM_STATE_SEED = 123 25 | np.random.seed(RANDOM_STATE_SEED) 26 | BATCH_SIZE = 5 27 | N_QUERIES = 50 28 | 29 | 30 | def prepare_data(): 31 | SKIP_SIZE = 50 # Skip to make the example go fast. 32 | docs, original_labels = fetch_20newsgroups(return_X_y=True) 33 | docs_train = docs[::SKIP_SIZE] 34 | original_labels_train = original_labels[::SKIP_SIZE] 35 | docs_test = docs[1::SKIP_SIZE] # Offset by one means no overlap 36 | original_labels_test = original_labels[ 37 | 1::SKIP_SIZE 38 | ] # Offset by one means no overlap 39 | return docs_train, original_labels_train, docs_test, original_labels_test 40 | 41 | 42 | def prepare_features(docs_train, docs_test): 43 | vectorizer = TfidfVectorizer( 44 | stop_words="english", ngram_range=(1, 3), max_df=0.9, max_features=5000 45 | ) 46 | 47 | vectors_train = vectorizer.fit_transform(docs_train).toarray() 48 | vectors_test = vectorizer.transform(docs_test).toarray() 49 | return vectors_train, vectors_test 50 | 51 | 52 | def prepare_manager(vectors_train, docs_train): 53 | manager = ActiveLearningManager(vectors_train, sources=docs_train) 54 | return manager 55 | 56 | 57 | def prepare_learner(): 58 | 59 | estimator = RandomForestClassifier() 60 | preset_batch = partial(uncertainty_batch_sampling, n_instances=BATCH_SIZE) 61 | learner = ActiveLearner(estimator=estimator, query_strategy=preset_batch) 62 | return learner 63 | 64 | 65 | def make_pretty_summary_plot(performance_history): 66 | with plt.style.context("seaborn-white"): 67 | fig, ax = plt.subplots(figsize=(8.5, 6), dpi=130) 68 | 69 | ax.plot(performance_history) 70 | ax.scatter(range(len(performance_history)), performance_history, s=13) 71 | 72 | ax.xaxis.set_major_locator( 73 | mpl.ticker.MaxNLocator(nbins=N_QUERIES + 3, integer=True) 74 | ) 75 | ax.xaxis.grid(True) 76 | 77 | ax.yaxis.set_major_locator(mpl.ticker.MaxNLocator(nbins=10)) 78 | ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(xmax=1)) 79 | ax.set_ylim(bottom=0, top=1) 80 | ax.yaxis.grid(True, linestyle="--", alpha=1 / 2) 81 | 82 | ax.set_title("Incremental classification accuracy") 83 | ax.set_xlabel("Query iteration") 84 | ax.set_ylabel("Classification Accuracy") 85 | 86 | plt.show() 87 | 88 | 89 | if __name__ == "__main__": 90 | docs_train, original_labels_train, docs_test, original_labels_test = prepare_data() 91 | vectors_train, vectors_test = prepare_features(docs_train, docs_test) 92 | manager = prepare_manager(vectors_train, docs_train) 93 | learner = prepare_learner() 94 | performance_history = [] 95 | # performance_history.append(learner.score(docs_test, original_labels_test)) 96 | 97 | for i in range(N_QUERIES): 98 | # Check if there are more examples that are not labeled. If not, break 99 | if manager.unlabeld.size == 0: 100 | break 101 | 102 | for index in range(1): 103 | # query the learner as usual, in this case we are using a batch learning strategy 104 | # so indices_to_label is an array 105 | indices_to_label, query_instance = learner.query(manager.unlabeld) 106 | labels = [] # Hold a list of the new labels 107 | for ix in indices_to_label: 108 | """ 109 | Here is the tricky part that the manager solves. The indicies are indexed with respect to unlabeled data 110 | but we want to work with them with respect to the original data. The manager makes this almost transparent 111 | """ 112 | # Map the index that is with respect to unlabeled data back to an index with respect to the whole dataset 113 | original_ix = manager.get_original_index_from_unlabeled_index(ix) 114 | # print(manager.sources[original_ix]) #Show the original data so we can decide what to label 115 | # Now we can lookup the label in the original set of labels without any bookkeeping 116 | y = original_labels_train[original_ix] 117 | # We create a Label instance, a tuple of index and label 118 | # The index should be with respect to the unlabeled data, the add_labels function will automatically 119 | # calculate the offsets 120 | label = (ix, y) 121 | # append the labels to a list 122 | labels.append(label) 123 | # Insert them all at once. 124 | manager.add_labels(labels) 125 | # Note that if you need to add labels with indicies that repsect the original dataset you can do 126 | # manager.add_labels(labels,offset_to_unlabeled=False) 127 | # Now teach as usual 128 | learner.teach(manager.labeled, manager.labels) 129 | performance_history.append(learner.score(vectors_test, original_labels_test)) 130 | # Finnaly make a nice plot 131 | make_pretty_summary_plot(performance_history) 132 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="ALMa", # Replace with your own username 8 | version="0.0.3", 9 | author="Tal Perry", 10 | author_email="tal@lighttag.io", 11 | description="Easily track labeled and unlabeled data for active learning", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/lighttag/ALMa", 15 | packages=setuptools.find_packages(), 16 | classifiers=[ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: MIT License", 19 | "Operating System :: OS Independent", 20 | ], 21 | python_requires='>=3.6', 22 | ) 23 | -------------------------------------------------------------------------------- /tests/alma_tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from ALMa import ActiveLearningManager 4 | 5 | 6 | def first_true(ar: np.ndarray): 7 | return ar.nonzero()[0][0] 8 | 9 | 10 | class TestAddLabels(unittest.TestCase): 11 | def test_test_that_when_the_first_add_is_at_0_it_updates_correctly(self): 12 | features = np.array([[x + y for x in range(10)] for y in range(10)]) 13 | self.assertEqual(features.shape, (10, 10)) 14 | manager = ActiveLearningManager(features=features) 15 | manager.add_labels([(0, 1)]) 16 | self.assertEqual(first_true(manager.labeled_mask), 0) 17 | # the index of the first unlabeled example is one past the first labeled 18 | self.assertEqual(first_true(manager.unlabeled_mask), 1) 19 | 20 | def test_addto_first_continuously(self): 21 | features = np.array([[x + y for x in range(10)] for y in range(10)]) 22 | self.assertEqual(features.shape, (10, 10)) 23 | manager = ActiveLearningManager(features=features) 24 | manager.add_labels([(0, 1)]) 25 | self.assertEqual(first_true(manager.labeled_mask), 0) 26 | self.assertEqual(first_true(manager.unlabeled_mask), 1) 27 | 28 | manager.add_labels([(0, 1)]) 29 | self.assertEqual(first_true(manager.labeled_mask), 0) 30 | self.assertEqual(first_true(manager.unlabeled_mask), 2) 31 | 32 | manager.add_labels([(0, 1)]) 33 | self.assertEqual(first_true(manager.labeled_mask), 0) 34 | self.assertEqual(first_true(manager.unlabeled_mask), 3) 35 | 36 | def test_adding_in_the_middle(self): 37 | features = np.array([[x + y for x in range(10)] for y in range(10)]) 38 | self.assertEqual(features.shape, (10, 10)) 39 | manager = ActiveLearningManager(features=features) 40 | manager.add_labels([(2, 1)]) 41 | self.assertEqual(first_true(manager.labeled_mask), 2) 42 | self.assertEqual(first_true(manager.unlabeled_mask), 0) 43 | 44 | def test_adding_two_in_the_middle(self): 45 | features = np.array([[x + y for x in range(10)] for y in range(10)]) 46 | self.assertEqual(features.shape, (10, 10)) 47 | manager = ActiveLearningManager(features=features) 48 | manager.add_labels([(2, 1)]) 49 | self.assertEqual(first_true(manager.labeled_mask), 2) 50 | self.assertEqual(first_true(manager.unlabeled_mask), 0) 51 | 52 | manager.add_labels([(1, 1)]) 53 | self.assertEqual(first_true(manager.labeled_mask), 1) 54 | # We still didn't label the one at 0 55 | self.assertEqual(first_true(manager.unlabeled_mask), 0) 56 | 57 | def test_adding_two_in_the_middle_and_then_at_0(self): 58 | features = np.array([[x + y for x in range(10)] for y in range(10)]) 59 | self.assertEqual(features.shape, (10, 10)) 60 | manager = ActiveLearningManager(features=features) 61 | manager.add_labels([(2, 1)]) 62 | self.assertEqual(first_true(manager.labeled_mask), 2) 63 | self.assertEqual(first_true(manager.unlabeled_mask), 0) 64 | 65 | manager.add_labels([(1, 1)]) 66 | self.assertEqual(first_true(manager.labeled_mask), 1) 67 | # We still didn't label the one at 0 68 | self.assertEqual(first_true(manager.unlabeled_mask), 0) 69 | 70 | manager.add_labels([(0, 1)]) 71 | self.assertEqual(first_true(manager.labeled_mask), 0) 72 | # we labeled 0,1,2 the next one should be 3 73 | self.assertEqual(first_true(manager.unlabeled_mask), 3) 74 | 75 | 76 | if __name__ == "__main__": 77 | unittest.main() 78 | --------------------------------------------------------------------------------