├── .gitignore ├── requirements.txt ├── README.md └── model.py /.gitignore: -------------------------------------------------------------------------------- 1 | /*-backend/ 2 | __pycache__ 3 | .DS_Store -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | label-studio-ml 2 | redis 3 | rq 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spaCy powered Label Studio ML backend 2 | 3 | spaCy integration for Label Studio. Created as an Open Source alternative to Prodigy. 4 | 5 | Benefits: 6 | 7 | * Speed up annotation of data with integrated predictions 8 | * Quickly iterate on your spaCy models 9 | 10 | ## Demo video 11 | 12 | [![Demo video](https://img.youtube.com/vi/F19NT-21uT4/0.jpg)](https://youtu.be/F19NT-21uT4) 13 | 14 | ## Usage 15 | 16 | 1. Clone this repo 17 | 18 | 2. Install requirements 19 | 20 | ``` 21 | pip install -r requirements.txt 22 | ``` 23 | 24 | 3. Initialize a new backend 25 | 26 | ``` 27 | label-studio-ml init my_ml_backend 28 | ``` 29 | 30 | 4. In the `my_ml_backend` directory, add your spaCy `config.cfg` file. You can optionally add a `model-best` folder from a pre-trained model, to get started with predictions straight away. 31 | 32 | 5. Start the backend and add the URL to your Label Studio project settings. 33 | 34 | ``` 35 | label-studio-ml start my_ml_backend 36 | ``` 37 | 38 | 6. As you train new models, they will appear in a `checkpoints` directory. The latest checkpoint will be symlinked to `latest-model`. 
"""spaCy-powered Label Studio ML backend.

Converts Label Studio annotations into spaCy training data (NER, spancat,
textcat), trains checkpointed models via `spacy train`, and serves
predictions back to Label Studio.
"""
import logging
import os
import random
from datetime import datetime
from pathlib import Path

import spacy
from label_studio_ml.model import LabelStudioMLBase
from spacy.cli.train import train
from spacy.tokens import DocBin, Doc

# Constants

# GPU IDs to use. -1 means use the CPU
TRAIN_GPU_ID = -1
PREDICTION_GPU_ID = -1

# Fraction of data to use for evaluation
EVAL_SPLIT = 0.15

# Batch size for predictions
PREDICTION_BATCH_SIZE = 16

# Score threshold for a category to be accepted
TEXTCAT_SCORE_THRESHOLD = 0.5

# Multiple categories per doc?
TEXTCAT_MULTI = False

# Assign annotation groups (Label Studio from_names) to spacy components
LABEL_CONFIG = {
    'ner': [],
    'spancat': [],
    'textcat': []
}

# SpanGroup key to use for the spancat spans
SPANCAT_KEY = 'sc'

# END constants

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


class SpacyModel(LabelStudioMLBase):
    TRAIN_EVENTS = ()

    def __init__(self, **kwargs):
        super(SpacyModel, self).__init__(**kwargs)

        self.model = self.load()
        # .get replaces the `'checkpoint' in ...` double lookup.
        self.model_version = self.train_output.get('checkpoint', 'fallback')

        logger.info("MODEL CHECKPOINT: %s", self.model_version)

    def misc_labels(self):
        """Map each label from control tags NOT claimed by LABEL_CONFIG to
        its from_name/to_name pair.

        These act as a shared fallback merged into every component-specific
        label dict below.
        """
        # Renamed from `map`, which shadowed the builtin.
        labels = {}
        claimed = {name for names in LABEL_CONFIG.values()
                   for name in names}
        for from_name, schema in self.parsed_label_config.items():
            if from_name in claimed:
                continue

            for label in schema['labels']:
                labels[label] = {
                    'from_name': from_name,
                    'to_name': schema['to_name'][0],
                }
        return labels

    def ner_labels(self):
        """Label -> {from_name, to_name} dict for the NER component."""
        return self.misc_labels() | label_dict_from_config(
            self.parsed_label_config, LABEL_CONFIG['ner'])

    def spancat_labels(self):
        """Label -> {from_name, to_name} dict for the spancat component."""
        return self.misc_labels() | label_dict_from_config(
            self.parsed_label_config, LABEL_CONFIG['spancat'])

    def textcat_labels(self):
        """Label -> {from_name, to_name} dict for the textcat component."""
        return self.misc_labels() | label_dict_from_config(
            self.parsed_label_config, LABEL_CONFIG['textcat'])

    def load(self):
        """Load the most recent trained model, falling back to a bundled
        `model-best` directory next to this file; return None if neither
        exists.
        """
        model_dir = os.path.dirname(os.path.realpath(__file__))
        fallback_dir = os.path.join(model_dir, "model-best")

        if PREDICTION_GPU_ID > -1:
            spacy.prefer_gpu(gpu_id=PREDICTION_GPU_ID)

        if 'model_path' in self.train_output and os.path.isdir(self.train_output['model_path']):
            return spacy.load(self.train_output['model_path'])
        elif os.path.isdir(fallback_dir):
            return spacy.load(fallback_dir)

        return None

    def predict(self, tasks, **kwargs):
        """This is where inference happens: model returns
        the list of predictions based on input list of tasks.
        """
        if not self.model:
            logger.error("model has not been trained yet")
            # BUG FIX: the original returned {}, but callers expect the
            # same list type as the trained path below.
            return []

        ner_labels = self.ner_labels()
        spancat_labels = self.spancat_labels()
        textcat_labels = self.textcat_labels()
        predictions = []

        docs = self.model.pipe([t['data']['text']
                                for t in tasks], batch_size=PREDICTION_BATCH_SIZE)
        for doc in docs:
            results = []

            for e in doc.ents:
                config = ner_labels[e.label_]
                results.append({
                    'from_name': config['from_name'],
                    'to_name': config['to_name'],
                    'type': 'labels',
                    'value': {
                        'start': e.start_char,
                        'end': e.end_char,
                        'text': e.text,
                        'labels': [e.label_]
                    }
                })

            # BUG FIX: the loop variable was named SPANCAT_KEY, shadowing
            # the module constant of the same name.
            for group_key in doc.spans:
                for span in doc.spans[group_key]:
                    config = spancat_labels[span.label_]
                    results.append({
                        'from_name': config['from_name'],
                        'to_name': config['to_name'],
                        'type': 'labels',
                        'value': {
                            'start': span.start_char,
                            'end': span.end_char,
                            'text': span.text,
                            'labels': [span.label_]
                        }
                    })

            choices = [choice for choice, score in doc.cats.items()
                       if score >= TEXTCAT_SCORE_THRESHOLD]
            if choices:
                config = textcat_labels[choices[0]]
                results.append({
                    'from_name': config['from_name'],
                    'to_name': config['to_name'],
                    'type': 'choices',
                    'value': {
                        'choices': choices
                    }
                })

            predictions.append({
                'model_version': self.model_version,
                'result': results
            })

        return predictions

    def fit(self, annotations, workdir=None, **kwargs):
        """This is where training happens: train your model given list of
        annotations, then returns dict with created links and resources.
        """
        model_dir = os.path.dirname(os.path.realpath(__file__))
        checkpoint_name = datetime.now().strftime("%Y%m%d%H%M%S")
        checkpoint_dir = os.path.join(
            model_dir, 'checkpoints', checkpoint_name)
        config_path = os.path.join(model_dir, 'config.cfg')

        train_data_path = os.path.join(checkpoint_dir, 'train.spacy')
        dev_data_path = os.path.join(checkpoint_dir, 'dev.spacy')
        model_path = os.path.join(checkpoint_dir, 'model-best')
        latest_path = os.path.join(model_dir, "latest-model")
        latest_path_tmp = os.path.join(model_dir, "latest-model-tmp")

        Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)

        annotations = [a for a in annotations if item_not_cancelled(a)]

        train_data, dev_data = split_annotations(annotations, EVAL_SPLIT)

        annotations_to_docbin(
            train_data,
            ner_labels=self.ner_labels(),
            spancat_labels=self.spancat_labels(),
            textcat_labels=self.textcat_labels()
        ).to_disk(train_data_path)

        annotations_to_docbin(
            dev_data,
            ner_labels=self.ner_labels(),
            spancat_labels=self.spancat_labels(),
            textcat_labels=self.textcat_labels()
        ).to_disk(dev_data_path)

        # try and free GPU memory for training; best-effort, torch may be
        # absent, so failures are logged rather than fatal.
        if TRAIN_GPU_ID > -1:
            try:
                import gc
                import torch
                self.model = None
                gc.collect()
                torch.cuda.empty_cache()
            except Exception:
                logger.debug("could not free GPU memory", exc_info=True)

        train(config_path, checkpoint_dir, use_gpu=TRAIN_GPU_ID, overrides={
            'paths.train': train_data_path, 'paths.dev': dev_data_path})

        # Atomically repoint the `latest-model` symlink. BUG FIX: remove a
        # stale tmp link left by a crashed previous run, otherwise
        # os.symlink raises FileExistsError.
        if os.path.lexists(latest_path_tmp):
            os.remove(latest_path_tmp)
        os.symlink(model_path, latest_path_tmp)
        os.replace(latest_path_tmp, latest_path)

        return {'model_path': model_path, 'checkpoint': checkpoint_name}

# Helper functions


def label_dict_from_config(config, from_names: list[str]):
    """Build label -> {from_name, to_name} from the parsed label config for
    the given control-tag names.
    """
    # Renamed from `map`, which shadowed the builtin.
    labels = {}

    for from_name in from_names:
        schema = config[from_name]
        to_name = schema['to_name'][0]

        for label in schema['labels']:
            labels[label] = {
                'from_name': from_name,
                'to_name': to_name
            }

    return labels


def item_not_cancelled(item):
    """True unless the item's first annotation was cancelled (skipped).

    Uses .get so items lacking the key are kept instead of raising KeyError.
    """
    return not item['annotations'][0].get('was_cancelled', False)


def split_annotations(annotations, split):
    """Randomly split annotations into (train, dev) with `split` fraction
    going to dev. Works on a copy so the caller's list is untouched.
    """
    annotations = list(annotations)
    random.shuffle(annotations)

    dev_len = round(len(annotations) * split)
    return annotations[dev_len:], annotations[:dev_len]


def annotations_to_docbin(annotations, ner_labels, spancat_labels, textcat_labels):
    """Convert Label Studio annotations into a spaCy DocBin.

    Docs with zero or multiple positive categories are dropped when textcat
    annotations are present and TEXTCAT_MULTI is off, since single-label
    textcat requires exactly one positive class per doc.
    """
    nlp = spacy.blank("en")
    db = DocBin()
    has_textcat = False

    docs = []
    for item in annotations:
        if not item['data']['text']:
            continue

        doc = nlp(item['data']['text'])
        annotation = item['annotations'][0]

        for a in annotation['result']:
            if a['type'] == 'labels':
                add_span_to_doc(
                    doc,
                    annotation=a,
                    ner_labels=ner_labels,
                    spancat_labels=spancat_labels
                )
            elif a['type'] == 'choices':
                has_textcat = True
                add_cat_to_doc(doc, a, textcat_labels)

        docs.append(doc)

    for doc in docs:
        if not has_textcat or TEXTCAT_MULTI or doc_has_one_cat(doc):
            db.add(doc)

    return db


def add_span_to_doc(doc: Doc, annotation, ner_labels, spancat_labels):
    """Attach one Label Studio span annotation to `doc` as an entity or a
    spancat span, depending on which label dict claims its label.

    Spans that don't align to token boundaries (char_span returns None) are
    silently skipped.
    """
    val = annotation['value']
    label = val['labels'][0]

    if label not in ner_labels and label not in spancat_labels:
        return

    span = doc.char_span(val['start'], val['end'], label=label)

    if span and label in ner_labels:
        doc.ents = doc.ents + (span,)

    elif span and label in spancat_labels:
        if SPANCAT_KEY in doc.spans:
            doc.spans[SPANCAT_KEY].append(span)
        else:
            doc.spans[SPANCAT_KEY] = [span]


def add_cat_to_doc(doc: Doc, annotation, label_dict):
    """Set doc.cats for every known category: True if selected in the
    annotation's choices, False otherwise (explicit negatives are required
    for textcat training).
    """
    selected = annotation['value']['choices']

    for choice in label_dict:
        doc.cats[choice] = choice in selected


def doc_has_one_cat(doc: Doc):
    """True when exactly one category on the doc is positive."""
    return sum(1 for val in doc.cats.values() if val) == 1