#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""CLI entry point: compute inter-annotator agreement for Doccano projects.

Reads documents either from a live Doccano server (-u host user password
project) or from a local JSON-lines export (-i file), prints a bratiaa F1
report and an nltk agreement summary (kappa / fleiss / alpha / scotts).
"""
import argparse

# Explicit imports instead of the original `from diaa.maker import *` so the
# names used below are visible to readers and linters.
from diaa.maker import (
    get_docs_from_doccano,
    get_docs_from_json,
    get_labels,
    compute_f1_scores,
    calc,
)
import bratiaa as biaa

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', help='input JSON-lines file', required=False)
    parser.add_argument('-u', help='Doccano connection: host user password project_number',
                        nargs='+', default=[], required=False)
    args = parser.parse_args()

    # Prefer the live Doccano server when its arguments are given, otherwise
    # fall back to a local JSON-lines export.
    if args.u:
        docs = get_docs_from_doccano(args.u)
    else:
        docs = get_docs_from_json(args.i)

    labels = get_labels(docs)
    agg = compute_f1_scores(docs)
    biaa.iaa_report(agg)         # token-based F1 agreement report
    agreement = calc(labels)     # chance-corrected agreement coefficients
    print(agreement)
"""Helpers for computing inter-annotator agreement over Doccano annotations.

Converts Doccano JSON documents either into (annotator, item, label) triples
for ``nltk.agreement`` or into on-disk brat standoff files for ``bratiaa``.
"""
import json
import os
import re
import tempfile
from urllib import request

from nltk import agreement
from bratiaa.agree import F1Agreement, partial, input_generator
from bratiaa.evaluation import exact_match_token_evaluation, exact_match_instance_evaluation


def get_docs_from_json(fn):
    """Read a JSON-lines export (one JSON document per line) into a list of dicts.

    Uses ``with`` so the file handle is closed even if a line fails to parse
    (the original leaked the handle on error).
    """
    with open(fn) as fh:
        return [json.loads(line) for line in fh]


def get_docs_from_doccano(url_):
    """Download the annotated documents of one Doccano project.

    ``url_`` is a 4-item sequence: [host, user, password, project_number].
    Returns a list of document dicts, one per line of the project's JSON export.
    """
    host, user_, pass_, project_ = url_[0], url_[1], url_[2], url_[3]

    # Obtain an auth token by POSTing the credentials as a JSON body.
    auth_url = host + "/v1/auth-token"
    req = request.Request(auth_url)
    req.add_header('Content-Type', 'application/json; charset=utf-8')
    payload = json.dumps({'username': user_, 'password': pass_}).encode('utf-8')
    result = request.urlopen(req, payload).read()
    token = json.loads(result)["token"]
    header = {'Content-Type': 'application/json; charset=utf-8',
              'Authorization': 'Token ' + str(token)}

    # Stream the export line by line; close the HTTP response when done
    # (the original never closed it).
    data_url = host + "/v1/projects/" + str(project_) + "/docs/download?q=json&format=json"
    req = request.Request(data_url, headers=header)
    data = []
    with request.urlopen(req) as resp:
        for line in resp:
            data.append(json.loads(line.decode("utf-8")))
    return data


def get_labels(docs):
    """Flatten Doccano docs into (annotator, item, label) triples for nltk.

    Each annotated span becomes one item keyed ``"<doc-index>_<start>_<end>"``.
    An annotator who did not mark a span that somebody else marked is assigned
    the dummy label 0 for that item, so every annotator rates every item --
    which is what ``nltk.agreement.AnnotationTask`` expects.
    """
    labels_ = []
    spans_ = {}
    users_ = []
    for i, doc in enumerate(docs):
        if "annotations" not in doc:
            print("did not find annotations in record " + str(i))
            continue
        for label in doc["annotations"]:
            user_ = label["user"]
            type_ = str(label["label"])
            token_ = str(i) + "_" + str(label["start_offset"]) + "_" + str(label["end_offset"])
            if user_ not in users_:
                users_.append(user_)
            if token_ not in spans_:
                spans_[token_] = {}
            spans_[token_][user_] = type_

    for key in spans_:
        for u in users_:
            if u in spans_[key]:
                labels_.append((str(u), key, int(spans_[key][u])))
            else:
                labels_.append((str(u), key, 0))  # unmarked span -> dummy label
    if not labels_:
        print("no labels found")
    return labels_


def docs_to_ann(docs):
    """Materialize Doccano docs as brat standoff files under a temp directory.

    Layout: ``<tmp>/<annotator>/<doc>.ann`` + ``<tmp>/<annotator>/<doc>.txt``.
    Every annotator directory is padded so it holds the same .txt and .ann
    files (empty .ann where an annotator marked nothing), as bratiaa requires.

    Returns (temp_dir, unique_labels, unique_annotators, unique_ann_files).
    """
    temp_dir = tempfile.mkdtemp()
    labels_ = []
    annotators_ = []
    files_ = []
    texts_ = []
    for i, doc in enumerate(docs):
        # Collapse tabs to spaces to avoid
        # "bratsubset.annotation.InvalidIdError: Invalid id:" in brat parsing.
        text = re.sub(r"[\t]+", " ", doc["text"])
        texts_.append(text)
        if "annotations" not in doc:
            continue
        users_ = {}
        for label in doc["annotations"]:
            user_ = label["user"]
            type_ = str(label["label"])
            labels_.append(type_)
            annotators_.append(str(user_))
            start_ = label["start_offset"]
            end_ = label["end_offset"]
            sub_text = text[int(start_):][:int(end_ - start_)]
            if user_ not in users_:
                users_[user_] = []
            # Brat T-ids are numbered per annotator, starting at T1.
            users_[user_].append(
                ["T" + str(len(users_[user_]) + 1), type_, start_, end_, sub_text])

        for u, values in users_.items():
            dir_user = str(temp_dir) + "/" + str(u)
            file_dir_user = dir_user + "/" + str(i + 1) + ".ann"
            os.makedirs(dir_user, exist_ok=True)
            files_.append(str(i + 1) + ".ann")
            with open(file_dir_user, 'w') as f:
                for v in values:
                    # brat standoff line: Tid<TAB>label start end<TAB>surface text
                    line = str(v[0]) + "\t" + str(v[1]) + " " + str(v[2]) + " " + str(v[3]) + "\t" + str(v[4])
                    f.write(line + "\n")
            with open(dir_user + "/" + str(i + 1) + ".txt", 'w') as f:
                f.write(text + "\n")

    # Make sure every annotator directory has every TXT file.
    for i in range(len(texts_)):
        for u in set(annotators_):
            file_ = temp_dir + "/" + str(u) + "/" + str(i + 1) + ".txt"
            with open(file_, 'w') as f:
                f.write(texts_[i] + "\n")

    # Make sure every annotator directory has every ANN file (touch, keep
    # existing content). The original left this handle open; close it.
    for fname in set(files_):
        for a in set(annotators_):
            open(temp_dir + "/" + str(a) + "/" + str(fname), 'a+').close()

    return temp_dir, list(set(labels_)), list(set(annotators_)), list(set(files_))


def compute_f1_scores(docs):
    """Build a bratiaa F1Agreement over the documents (token-level exact match)."""
    project, labels, annotators, doc_files = docs_to_ann(docs)

    def token_func(text):
        # Word runs or punctuation runs; raw string so \w and \s are regex classes.
        token = re.compile(r'\w+|[^\w\s]+')
        for match in re.finditer(token, text):
            yield match.start(), match.end()

    agg = F1Agreement(partial(input_generator, project), labels,
                      annotators=annotators, documents=doc_files,
                      token_func=token_func,
                      eval_func=exact_match_token_evaluation)
    return agg


def calc(labels):
    """Compute chance-corrected agreement coefficients from (user, item, label) triples.

    Returns a dict with Cohen's kappa, Fleiss-style multi-kappa, Krippendorff's
    alpha, and Scott's pi, as implemented by ``nltk.agreement.AnnotationTask``.
    """
    ratingtask = agreement.AnnotationTask(data=labels)
    return {
        "kappa": ratingtask.kappa(),
        "fleiss": ratingtask.multi_kappa(),
        "alpha": ratingtask.alpha(),
        "scotts": ratingtask.pi(),
    }