#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""CLI entry point: compute inter-annotator agreement for Doccano projects.

Reads documents either from a live Doccano server (-u host user password
project) or from a local JSON-lines export (-i file), prints a bratiaa F1
report and an nltk agreement summary (kappa / fleiss / alpha / scotts).
"""
import argparse

# Explicit imports instead of the original `from diaa.maker import *` so the
# names used below are visible to readers and linters.
from diaa.maker import (
    get_docs_from_doccano,
    get_docs_from_json,
    get_labels,
    compute_f1_scores,
    calc,
)
import bratiaa as biaa

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', help='input JSON-lines file', required=False)
    parser.add_argument('-u', help='Doccano connection: host user password project_number',
                        nargs='+', default=[], required=False)
    args = parser.parse_args()

    # Prefer the live Doccano server when its arguments are given, otherwise
    # fall back to a local JSON-lines export.
    if args.u:
        docs = get_docs_from_doccano(args.u)
    else:
        docs = get_docs_from_json(args.i)

    labels = get_labels(docs)
    agg = compute_f1_scores(docs)
    biaa.iaa_report(agg)         # token-based F1 agreement report
    agreement = calc(labels)     # chance-corrected agreement coefficients
    print(agreement)
"""Helpers for computing inter-annotator agreement over Doccano annotations.

Converts Doccano JSON documents either into (annotator, item, label) triples
for ``nltk.agreement`` or into on-disk brat standoff files for ``bratiaa``.
"""
import json
import os
import re
import tempfile
from urllib import request

from nltk import agreement
from bratiaa.agree import F1Agreement, partial, input_generator
from bratiaa.evaluation import exact_match_token_evaluation, exact_match_instance_evaluation


def get_docs_from_json(fn):
    """Read a JSON-lines export (one JSON document per line) into a list of dicts.

    Uses ``with`` so the file handle is closed even if a line fails to parse
    (the original leaked the handle on error).
    """
    with open(fn) as fh:
        return [json.loads(line) for line in fh]


def get_docs_from_doccano(url_):
    """Download the annotated documents of one Doccano project.

    ``url_`` is a 4-item sequence: [host, user, password, project_number].
    Returns a list of document dicts, one per line of the project's JSON export.
    """
    host, user_, pass_, project_ = url_[0], url_[1], url_[2], url_[3]

    # Obtain an auth token by POSTing the credentials as a JSON body.
    auth_url = host + "/v1/auth-token"
    req = request.Request(auth_url)
    req.add_header('Content-Type', 'application/json; charset=utf-8')
    payload = json.dumps({'username': user_, 'password': pass_}).encode('utf-8')
    result = request.urlopen(req, payload).read()
    token = json.loads(result)["token"]
    header = {'Content-Type': 'application/json; charset=utf-8',
              'Authorization': 'Token ' + str(token)}

    # Stream the export line by line; close the HTTP response when done
    # (the original never closed it).
    data_url = host + "/v1/projects/" + str(project_) + "/docs/download?q=json&format=json"
    req = request.Request(data_url, headers=header)
    data = []
    with request.urlopen(req) as resp:
        for line in resp:
            data.append(json.loads(line.decode("utf-8")))
    return data


def get_labels(docs):
    """Flatten Doccano docs into (annotator, item, label) triples for nltk.

    Each annotated span becomes one item keyed ``"<doc-index>_<start>_<end>"``.
    An annotator who did not mark a span that somebody else marked is assigned
    the dummy label 0 for that item, so every annotator rates every item --
    which is what ``nltk.agreement.AnnotationTask`` expects.
    """
    labels_ = []
    spans_ = {}
    users_ = []
    for i, doc in enumerate(docs):
        if "annotations" not in doc:
            print("did not find annotations in record " + str(i))
            continue
        for label in doc["annotations"]:
            user_ = label["user"]
            type_ = str(label["label"])
            token_ = str(i) + "_" + str(label["start_offset"]) + "_" + str(label["end_offset"])
            if user_ not in users_:
                users_.append(user_)
            if token_ not in spans_:
                spans_[token_] = {}
            spans_[token_][user_] = type_

    for key in spans_:
        for u in users_:
            if u in spans_[key]:
                labels_.append((str(u), key, int(spans_[key][u])))
            else:
                labels_.append((str(u), key, 0))  # unmarked span -> dummy label
    if not labels_:
        print("no labels found")
    return labels_


def docs_to_ann(docs):
    """Materialize Doccano docs as brat standoff files under a temp directory.

    Layout: ``<tmp>/<annotator>/<doc>.ann`` + ``<tmp>/<annotator>/<doc>.txt``.
    Every annotator directory is padded so it holds the same .txt and .ann
    files (empty .ann where an annotator marked nothing), as bratiaa requires.

    Returns (temp_dir, unique_labels, unique_annotators, unique_ann_files).
    """
    temp_dir = tempfile.mkdtemp()
    labels_ = []
    annotators_ = []
    files_ = []
    texts_ = []
    for i, doc in enumerate(docs):
        # Collapse tabs to spaces to avoid
        # "bratsubset.annotation.InvalidIdError: Invalid id:" in brat parsing.
        text = re.sub(r"[\t]+", " ", doc["text"])
        texts_.append(text)
        if "annotations" not in doc:
            continue
        users_ = {}
        for label in doc["annotations"]:
            user_ = label["user"]
            type_ = str(label["label"])
            labels_.append(type_)
            annotators_.append(str(user_))
            start_ = label["start_offset"]
            end_ = label["end_offset"]
            sub_text = text[int(start_):][:int(end_ - start_)]
            if user_ not in users_:
                users_[user_] = []
            # Brat T-ids are numbered per annotator, starting at T1.
            users_[user_].append(
                ["T" + str(len(users_[user_]) + 1), type_, start_, end_, sub_text])

        for u, values in users_.items():
            dir_user = str(temp_dir) + "/" + str(u)
            file_dir_user = dir_user + "/" + str(i + 1) + ".ann"
            os.makedirs(dir_user, exist_ok=True)
            files_.append(str(i + 1) + ".ann")
            with open(file_dir_user, 'w') as f:
                for v in values:
                    # brat standoff line: Tid<TAB>label start end<TAB>surface text
                    line = str(v[0]) + "\t" + str(v[1]) + " " + str(v[2]) + " " + str(v[3]) + "\t" + str(v[4])
                    f.write(line + "\n")
            with open(dir_user + "/" + str(i + 1) + ".txt", 'w') as f:
                f.write(text + "\n")

    # Make sure every annotator directory has every TXT file.
    for i in range(len(texts_)):
        for u in set(annotators_):
            file_ = temp_dir + "/" + str(u) + "/" + str(i + 1) + ".txt"
            with open(file_, 'w') as f:
                f.write(texts_[i] + "\n")

    # Make sure every annotator directory has every ANN file (touch, keep
    # existing content). The original left this handle open; close it.
    for fname in set(files_):
        for a in set(annotators_):
            open(temp_dir + "/" + str(a) + "/" + str(fname), 'a+').close()

    return temp_dir, list(set(labels_)), list(set(annotators_)), list(set(files_))


def compute_f1_scores(docs):
    """Build a bratiaa F1Agreement over the documents (token-level exact match)."""
    project, labels, annotators, doc_files = docs_to_ann(docs)

    def token_func(text):
        # Word runs or punctuation runs; raw string so \w and \s are regex classes.
        token = re.compile(r'\w+|[^\w\s]+')
        for match in re.finditer(token, text):
            yield match.start(), match.end()

    agg = F1Agreement(partial(input_generator, project), labels,
                      annotators=annotators, documents=doc_files,
                      token_func=token_func,
                      eval_func=exact_match_token_evaluation)
    return agg


def calc(labels):
    """Compute chance-corrected agreement coefficients from (user, item, label) triples.

    Returns a dict with Cohen's kappa, Fleiss-style multi-kappa, Krippendorff's
    alpha, and Scott's pi, as implemented by ``nltk.agreement.AnnotationTask``.
    """
    ratingtask = agreement.AnnotationTask(data=labels)
    return {
        "kappa": ratingtask.kappa(),
        "fleiss": ratingtask.multi_kappa(),
        "alpha": ratingtask.alpha(),
        "scotts": ratingtask.pi(),
    }