├── .gitignore
├── ALMa
    ├── __init__.py
    └── activelearningmanager.py
├── LICENSE
├── README.md
├── examples
    └── text_classification_with_modAL.py
├── requirements.txt
├── setup.py
└── tests
    └── alma_tests.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | .idea/
  7 | # C extensions
  8 | *.so
  9 | 
 10 | # Distribution / packaging
 11 | .Python
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | pip-wheel-metadata/
 25 | share/python-wheels/
 26 | *.egg-info/
 27 | .installed.cfg
 28 | *.egg
 29 | MANIFEST
 30 | 
 31 | # PyInstaller
 32 | #  Usually these files are written by a python script from a template
 33 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 34 | *.manifest
 35 | *.spec
 36 | 
 37 | # Installer logs
 38 | pip-log.txt
 39 | pip-delete-this-directory.txt
 40 | 
 41 | # Unit test / coverage reports
 42 | htmlcov/
 43 | .tox/
 44 | .nox/
 45 | .coverage
 46 | .coverage.*
 47 | .cache
 48 | nosetests.xml
 49 | coverage.xml
 50 | *.cover
 51 | *.py,cover
 52 | .hypothesis/
 53 | .pytest_cache/
 54 | 
 55 | # Translations
 56 | *.mo
 57 | *.pot
 58 | 
 59 | # Django stuff:
 60 | *.log
 61 | local_settings.py
 62 | db.sqlite3
 63 | db.sqlite3-journal
 64 | 
 65 | # Flask stuff:
 66 | instance/
 67 | .webassets-cache
 68 | 
 69 | # Scrapy stuff:
 70 | .scrapy
 71 | 
 72 | # Sphinx documentation
 73 | docs/_build/
 74 | 
 75 | # PyBuilder
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | .python-version
 87 | 
 88 | # pipenv
 89 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 90 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 91 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 92 | #   install all needed dependencies.
 93 | #Pipfile.lock
 94 | 
 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 96 | __pypackages__/
 97 | 
 98 | # Celery stuff
 99 | celerybeat-schedule
100 | celerybeat.pid
101 | 
102 | # SageMath parsed files
103 | *.sage.py
104 | 
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 | 
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 | 
118 | # Rope project settings
119 | .ropeproject
120 | 
121 | # mkdocs documentation
122 | /site
123 | 
124 | # mypy
125 | .mypy_cache/
126 | .dmypy.json
127 | dmypy.json
128 | 
129 | # Pyre type checker
130 | .pyre/
131 | 


--------------------------------------------------------------------------------
/ALMa/__init__.py:
--------------------------------------------------------------------------------
1 | from ALMa.activelearningmanager import ActiveLearningManager
2 | 


--------------------------------------------------------------------------------
/ALMa/activelearningmanager.py:
--------------------------------------------------------------------------------
  1 | import typing
  2 | from typing import List, Tuple, Any, Union, Optional, Generic, TypeVar
  3 | import numpy as np
  4 | 
  5 | Label = Tuple[int, Union[np.int64,int,float,str]]  # one (index, annotation) pair
  6 | LabelList = List[Label]  # a batch of labels, as accepted by ActiveLearningManager.add_labels
  7 | Sources = List[Any]  # the original data points, as passed to ActiveLearningManager(sources=...)
  8 | 
  9 | 
 10 | class ActiveLearningManager:
 11 |     def __init__(
 12 |         self,
 13 |         features: np.ndarray,
 14 |         labels_dtype: Optional[np.dtype] = None,
 15 |         sources: Optional[Sources] = None,
 16 |     ):
 17 |         """
 18 | 
 19 |         When doing active learning we have our Original Data (OD) Labeled Data [LD] and Unlabeled Data [UD]
 20 |         where UD and LD are subsets of OD.
 21 |         The active learner operates on UD and returns indexes relative to it. We want to store those indices with respect
 22 |         to OD, and sometimes see the subset of labels of LD. (The subset of labels of UD is Null)
 23 | 
 24 |         That's a fancy way of saying there is a lot book keeping to be done and this class solves that by doing it for you
 25 | 
 26 |         The main idea is that we store a mask (labeeld_mask) of indices that have been labeled and then expose UD , LD
 27 |         and the labels by using fancy indexing with that mask. The manager exposes a an add_labels method which lets the
 28 |         user add labels indexed with respect to UD and it will adjust the indices so that they match OD.
 29 | 
 30 |         :param features: An array of the features that will be used for AL.
 31 |         :param labels: Any prexesiting labels. Each label is a tuple(idx,label)
 32 |         :param source: A list of the original data
 33 |         """
 34 |         self.features = features
 35 | 
 36 |         self._labels = np.empty(shape=self.features.shape[0], dtype=labels_dtype)
 37 |         self.labeled_mask = np.zeros(self.features.shape[0], dtype=bool)
 38 |         self.sources = np.array(sources if sources else [])
 39 | 
 40 |     @property
 41 |     def labels(self):
 42 |         """
 43 | 
 44 |         Returns the labels indexed with respect to LD
 45 | 
 46 |         """
 47 |         return self._labels[self.labeled_mask]
 48 | 
 49 |     @property
 50 |     def unlabeled_mask(self):
 51 |         """
 52 | 
 53 |         Returns: a mask which is true for all unlabeled points
 54 | 
 55 |         """
 56 |         return np.logical_not(self.labeled_mask)
 57 | 
 58 |     def _update_masks(self, labels: Union[LabelList, Label]):
 59 |         for label in labels:
 60 |             self.labeled_mask[label[0]] = True
 61 | 
 62 |     def _offset_new_labels(self, labels_for_unlabeled_dataset: LabelList):
 63 |         """
 64 |         This is where the magic happens.
 65 |         We take self.unlabeled_mask.nonzero()[0] which gives us an array of the indices that appear in the unlabeled
 66 |         data. So if the original label was at position 0 we look up the "real index" in the unlabeled_indices array to
 67 |         get it's true index
 68 |         :param labels_for_unlabeled_dataset A LabelList indexed according to the unlabeled dataset:
 69 |         :return:labels_for_dataset A LabelList indexed according to the original dataset
 70 |         """
 71 |         if len(self._labels) == 0:
 72 |             # Nothing to correct in this case
 73 |             return labels_for_unlabeled_dataset
 74 |         labels_for_dataset: LabelList = []
 75 |         unlabeled_indices_map = self.unlabeled_mask.nonzero()[0]
 76 | 
 77 |         for label in labels_for_unlabeled_dataset:
 78 |             index_in_unlabeled, annotation = label
 79 |             index_in_dataset = unlabeled_indices_map[index_in_unlabeled]
 80 |             new_label: Label = (index_in_dataset, annotation)
 81 |             labels_for_dataset.append(new_label)
 82 |         return labels_for_dataset
 83 | 
 84 |     def add_labels(self, labels: LabelList, offset_to_unlabeled=True):
 85 |         if isinstance(labels, tuple):  # if this is a single example
 86 |             labels: LabelList = [labels]
 87 |         elif isinstance(labels, list):
 88 |             pass
 89 |         else:
 90 |             raise Exception(
 91 |                 "Malformed input. Please add either a tuple (ix,label) or a list [(ix,label),..]"
 92 |             )
 93 |         if offset_to_unlabeled:
 94 |             labels = self._offset_new_labels(labels)
 95 |         self._update_masks(labels)
 96 |         for label in labels:
 97 |             self._labels[label[0]] = label[1]
 98 | 
 99 |     @property
100 |     def unlabeld(self):
101 |         """
102 | 
103 |         :return: Returns UD, all of the unlabeled data points
104 |         """
105 |         return self.features[self.unlabeled_mask]
106 | 
107 |     @property
108 |     def labeled(self):
109 |         """
110 |                 :return: Returns LD, all of the labeld data points
111 |         """
112 |         return self.features[self.labeled_mask]
113 | 
114 |     @property
115 |     def remaining_sources(self):
116 |         """
117 | 
118 |         :return: Returns the original data, as opposed to features, with respect to UD
119 |         """
120 |         return self.sources[self.unlabeled_mask]
121 | 
122 |     def get_original_index_from_unlabeled_index(self, ixs: Union[int, List[int]]):
123 |         """
124 |         Utility function that takes as input indices from the unlabeled subset and returns the equivalent indices
125 |         in the complete array.
126 |         Useful for testing purposes, where we have the existing labels and want to take them in the order in which
127 |         the active learner specifes.
128 |         :param ixs:
129 |         :return:
130 |         """
131 |         unlabeled_indices = self.unlabeled_mask.nonzero()[0]
132 |         if isinstance(ixs, np.int64):
133 |             ixs = [ixs]
134 |         return list(map(lambda x: unlabeled_indices[x], ixs))
135 | 
136 | 
137 | __all__ = [Label, LabelList, ActiveLearningManager]
138 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 LightTag
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # ALMa - An Active Learning (data) Manager
 2 | 
 3 | ALMa eliminates the need for bookkeeping when using Active Learning. Read the blog post
 4 | on [Active Learning with ALMa](https://www.lighttag.io/blog/active-learning-manager/)
 5 |  
 6 | Made with heart by LightTag - The Text Annotation Tool For Teams. 
 7 | We use ALMa to facilitate multi annotator active learning. Originally developed as a contribution for [modAL](https://github.com/modAL-python/modAL)
 8 |  but moved to its own library.
 9 | 
10 | 
11 | ## Install
12 | ```
13 | pip install ALMa
14 | ```
15 | 
16 | ## Use
17 | Check out the full [example for text classification](examples/text_classification_with_modAL.py)
18 |  
19 | ```python
20 | from ALMa import ActiveLearningManager
21 | manager = ActiveLearningManager(my_featurized_data, sources=optional_original_data)
22 | learner = #...some active learning learner
23 | for index in range(N_QUERIES):
24 |     index_to_label, query_instance = learner.query(manager.unlabeld)
25 |     original_ix = manager.get_original_index_from_unlabeled_index(index_to_label)
26 |     y = original_labels_train[original_ix]
27 |     label = (index_to_label, y)
28 |     manager.add_labels(label)
29 |     learner.teach(X=manager.labeled, y=manager.labels)
30 | 
31 | ```
32 | 
33 | 


--------------------------------------------------------------------------------
/examples/text_classification_with_modAL.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This example shows how to use the new data manager class.
  3 | For clarity, all the setup has been moved into functions and
  4 | the core is in the __main__ section which is commented
  5 | 
  6 | Also look at prepare_manager to see how an ActiveLearningManager is instantiated
  7 | 
  8 | """
  9 | 
 10 | from sklearn.datasets import fetch_20newsgroups
 11 | from sklearn.ensemble import RandomForestClassifier
 12 | import numpy as np
 13 | import matplotlib as mpl
 14 | import matplotlib.pyplot as plt
 15 | from sklearn.feature_extraction.text import TfidfVectorizer
 16 | from functools import partial
 17 | 
 18 | 
 19 | from modAL.models import ActiveLearner
 20 | from modAL.batch import uncertainty_batch_sampling
 21 | 
 22 | from ALMa import ActiveLearningManager
 23 | 
 24 | RANDOM_STATE_SEED = 123
 25 | np.random.seed(RANDOM_STATE_SEED)  # seeds numpy's global random state
 26 | BATCH_SIZE = 5  # instances requested from the learner per query (see prepare_learner)
 27 | N_QUERIES = 50  # upper bound on the number of query rounds in the main loop
 28 | 
 29 | 
 30 | def prepare_data():
 31 |     SKIP_SIZE = 50  # Skip to make the example go fast.
 32 |     docs, original_labels = fetch_20newsgroups(return_X_y=True)
 33 |     docs_train = docs[::SKIP_SIZE]
 34 |     original_labels_train = original_labels[::SKIP_SIZE]
 35 |     docs_test = docs[1::SKIP_SIZE]  # Offset by one means no overlap
 36 |     original_labels_test = original_labels[
 37 |         1::SKIP_SIZE
 38 |     ]  # Offset by one means no overlap
 39 |     return docs_train, original_labels_train, docs_test, original_labels_test
 40 | 
 41 | 
 42 | def prepare_features(docs_train, docs_test):
 43 |     vectorizer = TfidfVectorizer(
 44 |         stop_words="english", ngram_range=(1, 3), max_df=0.9, max_features=5000
 45 |     )
 46 | 
 47 |     vectors_train = vectorizer.fit_transform(docs_train).toarray()
 48 |     vectors_test = vectorizer.transform(docs_test).toarray()
 49 |     return vectors_train, vectors_test
 50 | 
 51 | 
 52 | def prepare_manager(vectors_train, docs_train):
 53 |     manager = ActiveLearningManager(vectors_train, sources=docs_train)
 54 |     return manager
 55 | 
 56 | 
 57 | def prepare_learner():
 58 | 
 59 |     estimator = RandomForestClassifier()
 60 |     preset_batch = partial(uncertainty_batch_sampling, n_instances=BATCH_SIZE)
 61 |     learner = ActiveLearner(estimator=estimator, query_strategy=preset_batch)
 62 |     return learner
 63 | 
 64 | 
 65 | def make_pretty_summary_plot(performance_history):
 66 |     with plt.style.context("seaborn-white"):
 67 |         fig, ax = plt.subplots(figsize=(8.5, 6), dpi=130)
 68 | 
 69 |         ax.plot(performance_history)
 70 |         ax.scatter(range(len(performance_history)), performance_history, s=13)
 71 | 
 72 |         ax.xaxis.set_major_locator(
 73 |             mpl.ticker.MaxNLocator(nbins=N_QUERIES + 3, integer=True)
 74 |         )
 75 |         ax.xaxis.grid(True)
 76 | 
 77 |         ax.yaxis.set_major_locator(mpl.ticker.MaxNLocator(nbins=10))
 78 |         ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(xmax=1))
 79 |         ax.set_ylim(bottom=0, top=1)
 80 |         ax.yaxis.grid(True, linestyle="--", alpha=1 / 2)
 81 | 
 82 |         ax.set_title("Incremental classification accuracy")
 83 |         ax.set_xlabel("Query iteration")
 84 |         ax.set_ylabel("Classification Accuracy")
 85 | 
 86 |         plt.show()
 87 | 
 88 | 
 89 | if __name__ == "__main__":
 90 |     docs_train, original_labels_train, docs_test, original_labels_test = prepare_data()
 91 |     vectors_train, vectors_test = prepare_features(docs_train, docs_test)
 92 |     manager = prepare_manager(vectors_train, docs_train)
 93 |     learner = prepare_learner()
 94 |     performance_history = []
 95 |     # performance_history.append(learner.score(docs_test, original_labels_test))
 96 | 
 97 |     for i in range(N_QUERIES):
 98 |         # Check if there are more examples that are not labeled. If not, break
 99 |         if manager.unlabeld.size == 0:
100 |             break
101 | 
102 |         for index in range(1):
103 |             # query the learner as usual, in this case we are using a batch learning strategy
104 |             # so indices_to_label is an array
105 |             indices_to_label, query_instance = learner.query(manager.unlabeld)
106 |             labels = []  # Hold a list of the new labels
107 |             for ix in indices_to_label:
108 |                 """
109 |                 Here is the tricky part that the manager solves. The indicies are indexed with respect to unlabeled data
110 |                 but we want to work with them with respect to the original data. The manager makes this almost transparent
111 |                 """
112 |                 # Map the index that is with respect to unlabeled data back to an index with respect to the whole dataset
113 |                 original_ix = manager.get_original_index_from_unlabeled_index(ix)
114 |                 # print(manager.sources[original_ix]) #Show the original data so we can decide what to label
115 |                 # Now we can lookup the label in the original set of labels without any bookkeeping
116 |                 y = original_labels_train[original_ix]
117 |                 # We create a Label instance, a tuple of index and label
118 |                 # The index should be with respect to the unlabeled data, the add_labels function will automatically
119 |                 # calculate the offsets
120 |                 label = (ix, y)
121 |                 # append the labels to a list
122 |                 labels.append(label)
123 |             # Insert them all at once.
124 |             manager.add_labels(labels)
125 |             # Note that if you need to add labels with indicies that repsect the original dataset you can do
126 |             # manager.add_labels(labels,offset_to_unlabeled=False)
127 |         # Now teach as usual
128 |         learner.teach(manager.labeled, manager.labels)
129 |         performance_history.append(learner.score(vectors_test, original_labels_test))
130 |     # Finnaly make a nice plot
131 |     make_pretty_summary_plot(performance_history)
132 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | 
 3 | with open("README.md", "r") as fh:
 4 |     long_description = fh.read()
 5 | 
 6 | setuptools.setup(
 7 |     name="ALMa", # Replace with your own username
 8 |     version="0.0.3",
 9 |     author="Tal Perry",
10 |     author_email="tal@lighttag.io",
11 |     description="Easily track labeled and unlabeled data for active learning",
12 |     long_description=long_description,
13 |     long_description_content_type="text/markdown",
14 |     url="https://github.com/lighttag/ALMa",
15 |     packages=setuptools.find_packages(),
16 |     classifiers=[
17 |         "Programming Language :: Python :: 3",
18 |         "License :: OSI Approved :: MIT License",
19 |         "Operating System :: OS Independent",
20 |     ],
21 |     python_requires='>=3.6',
22 | )
23 | 


--------------------------------------------------------------------------------
/tests/alma_tests.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import numpy as np
 3 | from ALMa import ActiveLearningManager
 4 | 
 5 | 
 6 | def first_true(ar: np.ndarray):
 7 |     return ar.nonzero()[0][0]
 8 | 
 9 | 
10 | class TestAddLabels(unittest.TestCase):
11 |     def test_test_that_when_the_first_add_is_at_0_it_updates_correctly(self):
12 |         features = np.array([[x + y for x in range(10)] for y in range(10)])
13 |         self.assertEqual(features.shape, (10, 10))
14 |         manager = ActiveLearningManager(features=features)
15 |         manager.add_labels([(0, 1)])
16 |         self.assertEqual(first_true(manager.labeled_mask), 0)
17 |         # the index of the first unlabeled example is one past the first labeled
18 |         self.assertEqual(first_true(manager.unlabeled_mask), 1)
19 | 
20 |     def test_addto_first_continuously(self):
21 |         features = np.array([[x + y for x in range(10)] for y in range(10)])
22 |         self.assertEqual(features.shape, (10, 10))
23 |         manager = ActiveLearningManager(features=features)
24 |         manager.add_labels([(0, 1)])
25 |         self.assertEqual(first_true(manager.labeled_mask), 0)
26 |         self.assertEqual(first_true(manager.unlabeled_mask), 1)
27 | 
28 |         manager.add_labels([(0, 1)])
29 |         self.assertEqual(first_true(manager.labeled_mask), 0)
30 |         self.assertEqual(first_true(manager.unlabeled_mask), 2)
31 | 
32 |         manager.add_labels([(0, 1)])
33 |         self.assertEqual(first_true(manager.labeled_mask), 0)
34 |         self.assertEqual(first_true(manager.unlabeled_mask), 3)
35 | 
36 |     def test_adding_in_the_middle(self):
37 |         features = np.array([[x + y for x in range(10)] for y in range(10)])
38 |         self.assertEqual(features.shape, (10, 10))
39 |         manager = ActiveLearningManager(features=features)
40 |         manager.add_labels([(2, 1)])
41 |         self.assertEqual(first_true(manager.labeled_mask), 2)
42 |         self.assertEqual(first_true(manager.unlabeled_mask), 0)
43 | 
44 |     def test_adding_two_in_the_middle(self):
45 |         features = np.array([[x + y for x in range(10)] for y in range(10)])
46 |         self.assertEqual(features.shape, (10, 10))
47 |         manager = ActiveLearningManager(features=features)
48 |         manager.add_labels([(2, 1)])
49 |         self.assertEqual(first_true(manager.labeled_mask), 2)
50 |         self.assertEqual(first_true(manager.unlabeled_mask), 0)
51 | 
52 |         manager.add_labels([(1, 1)])
53 |         self.assertEqual(first_true(manager.labeled_mask), 1)
54 |         # We still didn't label the one at 0
55 |         self.assertEqual(first_true(manager.unlabeled_mask), 0)
56 | 
57 |     def test_adding_two_in_the_middle_and_then_at_0(self):
58 |         features = np.array([[x + y for x in range(10)] for y in range(10)])
59 |         self.assertEqual(features.shape, (10, 10))
60 |         manager = ActiveLearningManager(features=features)
61 |         manager.add_labels([(2, 1)])
62 |         self.assertEqual(first_true(manager.labeled_mask), 2)
63 |         self.assertEqual(first_true(manager.unlabeled_mask), 0)
64 | 
65 |         manager.add_labels([(1, 1)])
66 |         self.assertEqual(first_true(manager.labeled_mask), 1)
67 |         # We still didn't label the one at 0
68 |         self.assertEqual(first_true(manager.unlabeled_mask), 0)
69 | 
70 |         manager.add_labels([(0, 1)])
71 |         self.assertEqual(first_true(manager.labeled_mask), 0)
72 |         # we labeled 0,1,2 the next one should be 3
73 |         self.assertEqual(first_true(manager.unlabeled_mask), 3)
74 | 
75 | 
76 | if __name__ == "__main__":
77 |     unittest.main()
78 | 


--------------------------------------------------------------------------------