├── .gitignore ├── requirements.txt ├── README.md └── model.py /.gitignore: -------------------------------------------------------------------------------- 1 | /*-backend/ 2 | __pycache__ 3 | .DS_Store -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | label-studio-ml 2 | redis 3 | rq 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spaCy powered Label Studio ML backend 2 | 3 | spaCy integration for Label Studio. Created as an Open Source alternative to Prodigy. 4 | 5 | Benefits: 6 | 7 | * Speed up annotation of data with integrated predictions 8 | * Quickly iterate on your spaCy models 9 | 10 | ## Demo video 11 | 12 | [![Demo video](https://img.youtube.com/vi/F19NT-21uT4/0.jpg)](https://youtu.be/F19NT-21uT4) 13 | 14 | ## Usage 15 | 16 | 1. Clone this repo 17 | 18 | 2. Install requirements 19 | 20 | ``` 21 | pip install -r requirements.txt 22 | ``` 23 | 24 | 3. Initialize a new backend 25 | 26 | ``` 27 | label-studio-ml init my_ml_backend 28 | ``` 29 | 30 | 4. In the `my_ml_backend` directory, add your spaCy `config.cfg` file. You can optionally add a `model-best` folder from a pre-trained model, to get started with predictions straight away. 31 | 32 | 5. Start the backend and add the URL to your Label Studio project settings. 33 | 34 | ``` 35 | label-studio-ml start my_ml_backend 36 | ``` 37 | 38 | 6. As you train new models, they will appear in a `checkpoints` directory. The latest checkpoint will be symlinked to `latest-model`. 
"""spaCy-powered Label Studio ML backend.

Converts Label Studio annotations into spaCy training data (NER, spancat,
textcat), trains checkpointed models via `spacy train`, and serves
predictions back to Label Studio.
"""
import logging
import os
import random
from datetime import datetime
from pathlib import Path

import spacy
from label_studio_ml.model import LabelStudioMLBase
from spacy.cli.train import train
from spacy.tokens import DocBin, Doc

# Constants

# GPU IDs to use. -1 means use the CPU
TRAIN_GPU_ID = -1
PREDICTION_GPU_ID = -1

# Fraction of data to use for evaluation
EVAL_SPLIT = 0.15

# Batch size for predictions
PREDICTION_BATCH_SIZE = 16

# Score threshold for a category to be accepted
TEXTCAT_SCORE_THRESHOLD = 0.5

# Multiple categories per doc?
TEXTCAT_MULTI = False

# Assign annotation groups (Label Studio from_names) to spacy components
LABEL_CONFIG = {
    'ner': [],
    'spancat': [],
    'textcat': []
}

# SpanGroup key to use for the spancat spans
SPANCAT_KEY = 'sc'

# END constants

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


class SpacyModel(LabelStudioMLBase):
    TRAIN_EVENTS = ()

    def __init__(self, **kwargs):
        super(SpacyModel, self).__init__(**kwargs)

        self.model = self.load()
        # .get replaces the `'checkpoint' in ...` double lookup.
        self.model_version = self.train_output.get('checkpoint', 'fallback')

        logger.info("MODEL CHECKPOINT: %s", self.model_version)

    def misc_labels(self):
        """Map each label from control tags NOT claimed by LABEL_CONFIG to
        its from_name/to_name pair.

        These act as a shared fallback merged into every component-specific
        label dict below.
        """
        # Renamed from `map`, which shadowed the builtin.
        labels = {}
        claimed = {name for names in LABEL_CONFIG.values()
                   for name in names}
        for from_name, schema in self.parsed_label_config.items():
            if from_name in claimed:
                continue

            for label in schema['labels']:
                labels[label] = {
                    'from_name': from_name,
                    'to_name': schema['to_name'][0],
                }
        return labels

    def ner_labels(self):
        """Label -> {from_name, to_name} dict for the NER component."""
        return self.misc_labels() | label_dict_from_config(
            self.parsed_label_config, LABEL_CONFIG['ner'])

    def spancat_labels(self):
        """Label -> {from_name, to_name} dict for the spancat component."""
        return self.misc_labels() | label_dict_from_config(
            self.parsed_label_config, LABEL_CONFIG['spancat'])

    def textcat_labels(self):
        """Label -> {from_name, to_name} dict for the textcat component."""
        return self.misc_labels() | label_dict_from_config(
            self.parsed_label_config, LABEL_CONFIG['textcat'])

    def load(self):
        """Load the most recent trained model, falling back to a bundled
        `model-best` directory next to this file; return None if neither
        exists.
        """
        model_dir = os.path.dirname(os.path.realpath(__file__))
        fallback_dir = os.path.join(model_dir, "model-best")

        if PREDICTION_GPU_ID > -1:
            spacy.prefer_gpu(gpu_id=PREDICTION_GPU_ID)

        if 'model_path' in self.train_output and os.path.isdir(self.train_output['model_path']):
            return spacy.load(self.train_output['model_path'])
        elif os.path.isdir(fallback_dir):
            return spacy.load(fallback_dir)

        return None

    def predict(self, tasks, **kwargs):
        """This is where inference happens: model returns
        the list of predictions based on input list of tasks.
        """
        if not self.model:
            logger.error("model has not been trained yet")
            # BUG FIX: the original returned {}, but callers expect the
            # same list type as the trained path below.
            return []

        ner_labels = self.ner_labels()
        spancat_labels = self.spancat_labels()
        textcat_labels = self.textcat_labels()
        predictions = []

        docs = self.model.pipe([t['data']['text']
                                for t in tasks], batch_size=PREDICTION_BATCH_SIZE)
        for doc in docs:
            results = []

            for e in doc.ents:
                config = ner_labels[e.label_]
                results.append({
                    'from_name': config['from_name'],
                    'to_name': config['to_name'],
                    'type': 'labels',
                    'value': {
                        'start': e.start_char,
                        'end': e.end_char,
                        'text': e.text,
                        'labels': [e.label_]
                    }
                })

            # BUG FIX: the loop variable was named SPANCAT_KEY, shadowing
            # the module constant of the same name.
            for group_key in doc.spans:
                for span in doc.spans[group_key]:
                    config = spancat_labels[span.label_]
                    results.append({
                        'from_name': config['from_name'],
                        'to_name': config['to_name'],
                        'type': 'labels',
                        'value': {
                            'start': span.start_char,
                            'end': span.end_char,
                            'text': span.text,
                            'labels': [span.label_]
                        }
                    })

            choices = [choice for choice, score in doc.cats.items()
                       if score >= TEXTCAT_SCORE_THRESHOLD]
            if choices:
                config = textcat_labels[choices[0]]
                results.append({
                    'from_name': config['from_name'],
                    'to_name': config['to_name'],
                    'type': 'choices',
                    'value': {
                        'choices': choices
                    }
                })

            predictions.append({
                'model_version': self.model_version,
                'result': results
            })

        return predictions

    def fit(self, annotations, workdir=None, **kwargs):
        """This is where training happens: train your model given list of
        annotations, then returns dict with created links and resources.
        """
        model_dir = os.path.dirname(os.path.realpath(__file__))
        checkpoint_name = datetime.now().strftime("%Y%m%d%H%M%S")
        checkpoint_dir = os.path.join(
            model_dir, 'checkpoints', checkpoint_name)
        config_path = os.path.join(model_dir, 'config.cfg')

        train_data_path = os.path.join(checkpoint_dir, 'train.spacy')
        dev_data_path = os.path.join(checkpoint_dir, 'dev.spacy')
        model_path = os.path.join(checkpoint_dir, 'model-best')
        latest_path = os.path.join(model_dir, "latest-model")
        latest_path_tmp = os.path.join(model_dir, "latest-model-tmp")

        Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)

        annotations = [a for a in annotations if item_not_cancelled(a)]

        train_data, dev_data = split_annotations(annotations, EVAL_SPLIT)

        annotations_to_docbin(
            train_data,
            ner_labels=self.ner_labels(),
            spancat_labels=self.spancat_labels(),
            textcat_labels=self.textcat_labels()
        ).to_disk(train_data_path)

        annotations_to_docbin(
            dev_data,
            ner_labels=self.ner_labels(),
            spancat_labels=self.spancat_labels(),
            textcat_labels=self.textcat_labels()
        ).to_disk(dev_data_path)

        # try and free GPU memory for training; best-effort, torch may be
        # absent, so failures are logged rather than fatal.
        if TRAIN_GPU_ID > -1:
            try:
                import gc
                import torch
                self.model = None
                gc.collect()
                torch.cuda.empty_cache()
            except Exception:
                logger.debug("could not free GPU memory", exc_info=True)

        train(config_path, checkpoint_dir, use_gpu=TRAIN_GPU_ID, overrides={
            'paths.train': train_data_path, 'paths.dev': dev_data_path})

        # Atomically repoint the `latest-model` symlink. BUG FIX: remove a
        # stale tmp link left by a crashed previous run, otherwise
        # os.symlink raises FileExistsError.
        if os.path.lexists(latest_path_tmp):
            os.remove(latest_path_tmp)
        os.symlink(model_path, latest_path_tmp)
        os.replace(latest_path_tmp, latest_path)

        return {'model_path': model_path, 'checkpoint': checkpoint_name}

# Helper functions


def label_dict_from_config(config, from_names: list[str]):
    """Build label -> {from_name, to_name} from the parsed label config for
    the given control-tag names.
    """
    # Renamed from `map`, which shadowed the builtin.
    labels = {}

    for from_name in from_names:
        schema = config[from_name]
        to_name = schema['to_name'][0]

        for label in schema['labels']:
            labels[label] = {
                'from_name': from_name,
                'to_name': to_name
            }

    return labels


def item_not_cancelled(item):
    """True unless the item's first annotation was cancelled (skipped).

    Uses .get so items lacking the key are kept instead of raising KeyError.
    """
    return not item['annotations'][0].get('was_cancelled', False)


def split_annotations(annotations, split):
    """Randomly split annotations into (train, dev) with `split` fraction
    going to dev. Works on a copy so the caller's list is untouched.
    """
    annotations = list(annotations)
    random.shuffle(annotations)

    dev_len = round(len(annotations) * split)
    return annotations[dev_len:], annotations[:dev_len]


def annotations_to_docbin(annotations, ner_labels, spancat_labels, textcat_labels):
    """Convert Label Studio annotations into a spaCy DocBin.

    Docs with zero or multiple positive categories are dropped when textcat
    annotations are present and TEXTCAT_MULTI is off, since single-label
    textcat requires exactly one positive class per doc.
    """
    nlp = spacy.blank("en")
    db = DocBin()
    has_textcat = False

    docs = []
    for item in annotations:
        if not item['data']['text']:
            continue

        doc = nlp(item['data']['text'])
        annotation = item['annotations'][0]

        for a in annotation['result']:
            if a['type'] == 'labels':
                add_span_to_doc(
                    doc,
                    annotation=a,
                    ner_labels=ner_labels,
                    spancat_labels=spancat_labels
                )
            elif a['type'] == 'choices':
                has_textcat = True
                add_cat_to_doc(doc, a, textcat_labels)

        docs.append(doc)

    for doc in docs:
        if not has_textcat or TEXTCAT_MULTI or doc_has_one_cat(doc):
            db.add(doc)

    return db


def add_span_to_doc(doc: Doc, annotation, ner_labels, spancat_labels):
    """Attach one Label Studio span annotation to `doc` as an entity or a
    spancat span, depending on which label dict claims its label.

    Spans that don't align to token boundaries (char_span returns None) are
    silently skipped.
    """
    val = annotation['value']
    label = val['labels'][0]

    if label not in ner_labels and label not in spancat_labels:
        return

    span = doc.char_span(val['start'], val['end'], label=label)

    if span and label in ner_labels:
        doc.ents = doc.ents + (span,)

    elif span and label in spancat_labels:
        if SPANCAT_KEY in doc.spans:
            doc.spans[SPANCAT_KEY].append(span)
        else:
            doc.spans[SPANCAT_KEY] = [span]


def add_cat_to_doc(doc: Doc, annotation, label_dict):
    """Set doc.cats for every known category: True if selected in the
    annotation's choices, False otherwise (explicit negatives are required
    for textcat training).
    """
    selected = annotation['value']['choices']

    for choice in label_dict:
        doc.cats[choice] = choice in selected


def doc_has_one_cat(doc: Doc):
    """True when exactly one category on the doc is positive."""
    return sum(1 for val in doc.cats.values() if val) == 1