├── streamlit_dashboard.png
├── mongo_delete.py
├── mongo_load.py
├── LICENSE
├── report_maker.py
├── annotation_streamlit.py
├── custom_ner_manual.py
├── README.md
├── multiuser_ner.py
├── multiuser_db.py
├── multiuser_mark.py
├── multiuser_db_assault.py
├── multiuser_manual_db.py
├── multiuser_db_blocks.py
└── Report.Rmd

/streamlit_dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahalterman/multiuser_prodigy/HEAD/streamlit_dashboard.png
--------------------------------------------------------------------------------
/mongo_delete.py:
--------------------------------------------------------------------------------
1 | import jsonlines
2 | from pymongo import MongoClient
3 | import plac
4 | import sys
5 | 
6 | @plac.annotations(
7 |     coll_name=("Collection to delete", "option", "c", str),
8 |     db_name=("Database to delete from", "option", "d", str))
9 | def delete(coll_name, db_name = "gsr"):
10 |     # quick script for deleting all tasks in a Mongo collection
11 |     client = MongoClient('mongodb://localhost:27017/')
12 |     db = client[db_name]
13 |     coll = db[coll_name]
14 |     count = coll.count()
15 |     conf = input("You're about to delete the collection {0}, which has {1} records. Please type this name to confirm: ".format(coll_name, count))
16 |     if conf != coll_name:
17 |         print("Bye!")
18 |         sys.exit(0)
19 |     if conf == coll_name:
20 |         coll.delete_many({})
21 |         print("Deleted all records from ", coll_name)
22 | 
23 | if __name__ == "__main__":
24 |     plac.call(delete)
25 | 
--------------------------------------------------------------------------------
/mongo_load.py:
--------------------------------------------------------------------------------
1 | import jsonlines
2 | from pymongo import MongoClient
3 | import plac
4 | 
5 | @plac.annotations(
6 |     input_file=("JSONL of tasks to load", "option", "i", str),
7 |     coll_name=("Collection to load into", "option", "c", str),
8 |     db_name=("Database to load into", "option", "d", str))
9 | def load(input_file, coll_name, db_name = "gsr"):
10 |     # quick script for loading new annotation tasks into Mongo
11 |     with jsonlines.open(input_file, "r") as f:
12 |         to_load = list(f.iter())
13 |     for i in to_load:
14 |         i['seen'] = 0
15 |         i['coders'] = []
16 |     print("Loading into db {0}, collection {1}".format(db_name, coll_name))
17 |     client = MongoClient('mongodb://localhost:27017/')
18 |     db = client[db_name]
19 |     coll = db[coll_name]
20 |     print("Before loading:", coll.count())
21 |     coll.insert_many(to_load)
22 |     print("After loading:", coll.count())
23 | 
24 | if __name__ == "__main__":
25 |     plac.call(load)
26 | 
--------------------------------------------------------------------------------
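For reference, these two scripts can be invoked like so; the file and collection names here are only illustrative, and `-d` can be omitted to use the default `gsr` database:

```
python mongo_load.py -i tasks.jsonl -c "my_collection" -d gsr
python mongo_delete.py -c "my_collection" -d gsr
```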
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2018 Andy Halterman
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/report_maker.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | from prodigy.components.db import connect
3 | from pandas import DataFrame
4 | from datetime import datetime, timedelta
5 | from dateutil import parser
6 | import os
7 | import plac
8 | 
9 | @plac.annotations(
10 |     db_name=("Name of Prodigy database to generate export from", "option", "i", str))
11 | def main(db_name):
12 |     db = connect()
13 |     examples = db.get_dataset(db_name)
14 |     print("Total examples: ", len(examples))
15 | 
16 |     diffs = []
17 |     for ex in examples:
18 |         if 'time_returned' in ex.keys() and 'time_loaded' in ex.keys():
19 |             date = parser.parse(ex['time_returned']).strftime("%Y-%m-%d")
20 |             diff = parser.parse(ex['time_returned']) - parser.parse(ex['time_loaded'])
21 |             diff = diff.total_seconds()
22 |             diffs.append({"date" : date,
23 |                           "coder" : ex['active_coder'],
24 |                           "diff" : diff,
25 |                           "id" : ex['id'][-16:],
26 |                           'answer': ex['answer']})
27 | 
28 |     df = DataFrame(diffs)
29 |     df.to_csv("/home/andy/multiuser_prodigy/coding_summary.csv")
30 |     os.system("""/usr/bin/Rscript -e 'library(rmarkdown); rmarkdown::render("multiuser_prodigy/Report.Rmd", "html_document")'""")
31 |     os.system("""echo pwd""")
32 | 
33 | if __name__ == "__main__":
34 |     plac.call(main)
35 | 
--------------------------------------------------------------------------------
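Note that `report_maker.py` renders `multiuser_prodigy/Report.Rmd` by relative path, so it assumes it runs from the directory above the repo; it is meant to be run on a schedule. A hypothetical cron entry (the Python path and dataset name are placeholders) could look like:

```
0 4 * * * cd /home/andy && /usr/bin/python3 multiuser_prodigy/report_maker.py -i my_prodigy_dataset
```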
{}
""" 12 | 13 | @st.cache(allow_output_mutation=True, suppress_st_warning=True) 14 | def setup_mongo(): 15 | client = MongoClient('mongodb://localhost:27017/') 16 | db = client['gsr'] 17 | coll = db['prod_dec_2020_2'] 18 | return coll 19 | 20 | def visualize(coll): 21 | coder = st.selectbox("Select your port/ID number", 22 | [9015, 9016, 9017, 9019, 9020, 9022, 9023, 9025]) 23 | coder = int(coder) 24 | #st.markdown("Total sentences in collection: {}".format(coll.count())) 25 | assigned = coll.count({"assigned_annotators": {"$in" : [coder]}}) 26 | completed = coll.count({"coders": {"$in" : [coder]}}) 27 | st.markdown("Sentences assigned to {}: {}".format(coder, assigned)) 28 | st.markdown("Sentences completed by {}: {}".format(coder, completed)) 29 | st.markdown("Progress:") 30 | try: 31 | prog = completed/assigned 32 | except ZeroDivisionError: 33 | prog = 0 34 | st.progress(prog) 35 | 36 | 37 | st.title('Annotation progress') 38 | st.markdown("Check your annotation progress by selecting your port/ID number") 39 | coll = setup_mongo() 40 | visualize(coll) 41 | -------------------------------------------------------------------------------- /custom_ner_manual.py: -------------------------------------------------------------------------------- 1 | import random 2 | import mmh3 3 | import json 4 | import spacy 5 | import copy 6 | 7 | from .compare import get_questions as get_compare_questions 8 | from ..models.ner import EntityRecognizer, merge_spans 9 | from ..models.matcher import PatternMatcher 10 | from ..components import printers 11 | from ..components.db import connect 12 | from ..components.preprocess import split_sentences, split_spans, split_tokens 13 | from ..components.sorters import prefer_uncertain 14 | from ..components.loaders import get_stream 15 | from ..components.filters import filter_tasks 16 | from ..core import recipe, recipe_args 17 | from ..util import split_evals, get_labels, get_print, combine_models 18 | from ..util import export_model_data, set_hashes, log, prints 19 | from ..util import INPUT_HASH_ATTR, TASK_HASH_ATTR 20 | 21 | 22 | DB = connect() 23 | 24 | @recipe('ner.manual', 25 | dataset=recipe_args['dataset'], 26 | spacy_model=recipe_args['spacy_model'], 27 | source=recipe_args['source'], 28 | api=recipe_args['api'], 29 | loader=recipe_args['loader'], 30 | label=recipe_args['label'], 31 | exclude=recipe_args['exclude']) 32 | def manual(dataset, spacy_model, source=None, api=None, loader=None, 33 | label=None, exclude=None): 34 | """ 35 | Mark spans by token. Requires only a tokenizer and no entity recognizer, 36 | and doesn't do any active learning. 
/custom_ner_manual.py:
--------------------------------------------------------------------------------
1 | import random
2 | import mmh3
3 | import json
4 | import spacy
5 | import copy
6 | 
7 | from .compare import get_questions as get_compare_questions
8 | from ..models.ner import EntityRecognizer, merge_spans
9 | from ..models.matcher import PatternMatcher
10 | from ..components import printers
11 | from ..components.db import connect
12 | from ..components.preprocess import split_sentences, split_spans, split_tokens
13 | from ..components.sorters import prefer_uncertain
14 | from ..components.loaders import get_stream
15 | from ..components.filters import filter_tasks
16 | from ..core import recipe, recipe_args
17 | from ..util import split_evals, get_labels, get_print, combine_models
18 | from ..util import export_model_data, set_hashes, log, prints
19 | from ..util import INPUT_HASH_ATTR, TASK_HASH_ATTR
20 | 
21 | 
22 | DB = connect()
23 | 
24 | @recipe('ner.manual',
25 |         dataset=recipe_args['dataset'],
26 |         spacy_model=recipe_args['spacy_model'],
27 |         source=recipe_args['source'],
28 |         api=recipe_args['api'],
29 |         loader=recipe_args['loader'],
30 |         label=recipe_args['label'],
31 |         exclude=recipe_args['exclude'])
32 | def manual(dataset, spacy_model, source=None, api=None, loader=None,
33 |            label=None, exclude=None):
34 |     """
35 |     Mark spans by token. Requires only a tokenizer and no entity recognizer,
36 |     and doesn't do any active learning.
37 |     """
38 |     log("RECIPE: Starting recipe ner.manual", locals())
39 |     nlp = spacy.load(spacy_model)
40 |     log("RECIPE: Loaded model {}".format(spacy_model))
41 |     labels = get_labels(label, nlp)
42 |     log("RECIPE: Annotating with {} labels".format(len(labels)), labels)
43 |     stream = get_stream(source, api=api, loader=loader, rehash=True,
44 |                         dedup=True, input_key='text')
45 |     stream = split_tokens(nlp, stream)
46 | 
47 |     return {
48 |         'view_id': 'ner_manual',
49 |         'dataset': dataset,
50 |         'stream': stream,
51 |         'exclude': exclude,
52 |         'config': {'labels': labels}
53 |     }
54 | 
55 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # multiuser_prodigy
2 | 
3 | This is a multi-annotator setup for [Prodigy](http://prodi.gy/),
4 | Explosion AI's data annotation tool, that uses a Mongo DB to allocate
5 | annotation tasks to annotators working on different Prodigy instances running
6 | on separate ports. This use case focuses on collecting gold standard
7 | annotations from a team of annotators using Prodigy, rather than on the active
8 | learning, single-annotator setup that Prodigy is primarily intended for.
9 | 
10 | There are a few examples of annotation interfaces in the repo, including code
11 | for annotators working on training an NER model or doing sentence
12 | classification with document context. Each annotator works on the Prodigy
13 | instance/port assigned to them, and a new `DBStream` class handles pulling
14 | the examples assigned to each worker out of the Mongo DB.
15 | 
16 | I've used this setup for three major annotation projects now, but you'll need
17 | to modify the code to get it working for your project as well.
18 | 
19 | ## Mongo database
20 | 
21 | All tasks are stored in a Mongo DB, which allows different logic for how tasks
22 | are assigned to annotators. For instance, examples can go out to annotators
23 | until three annotations are collected, they can go to two predetermined
24 | annotators from the wider pool, or they can be automatically resubmitted
25 | to a third annotator if the first two annotations disagree.
26 | 
27 | You can start a Mongo DB in a Docker container:
28 | 
29 | ```
30 | sudo docker run -d -p 127.0.0.1:27017:27017 -v /home/andy/MIT/multiuser_prodigy/db:/data/db mongo
31 | ```
32 | 
33 | To load a list of tasks into the database:
34 | 
35 | ```
36 | python mongo_load.py -i assault_not_assault.jsonl -c "assault_gsr"
37 | ```
38 | 
39 | where `-i` is a JSONL file of tasks and `-c` specifies the collection name to
40 | load them into.
41 | 
42 | 
43 | 
44 | 
45 | ## Running
46 | 
47 | You'll need to modify the code of `multiuser_db.py` to access the right
48 | collection, set the names/ports of annotators, and the desired interface (NER,
49 | classification, etc.).
50 | 
51 | Then you should launch the processes either in a `screen` or in the background:
52 | 
53 | ```
54 | python multiuser_db.py
55 | ```
56 | 
57 | ## Analysis
58 | 
59 | ![](streamlit_dashboard.png)
60 | 
61 | You can use Streamlit to set up a dashboard so annotators can check their
62 | progress. This one pulls results from the Mongo DB, but you could also query the
63 | Prodigy DB and show results from there.
64 | 
65 | 
66 | A more complicated analysis dashboard setup is in
67 | `Report.Rmd`. This RMarkdown file reads in a CSV of coding information and
68 | generates figures in an HTML page that can be served from the annotation
69 | server. To record information about how long each task takes, add something
70 | like `eg['time_loaded'] = datetime.now().isoformat()` to your stream code and
71 | something like `eg['time_returned'] = datetime.now().isoformat()` to your
72 | update code. `report_maker.py` exports the DB to CSV and knits the RMarkdown on
73 | that CSV.
74 | 
--------------------------------------------------------------------------------
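Condensed to its essentials, the timing pattern the README describes looks something like this inside a custom recipe (a sketch; see `multiuser_mark.py` below for the full working version):

```
from datetime import datetime

def ask_questions(stream):
    for eg in stream:
        eg['time_loaded'] = datetime.now().isoformat()    # when the task was served
        yield eg

def recv_answers(answers):
    for eg in answers:
        eg['time_returned'] = datetime.now().isoformat()  # when the answer came back
```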
/multiuser_ner.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | from multiprocessing import Process
3 | from time import sleep
4 | from prodigy.recipes.ner import batch_train
5 | import atexit
6 | from pathlib import Path
7 | import datetime as dt
8 | 
9 | class MultiProdigy:
10 |     def __init__(self, tag_list = ["LOC", "GPE", "PERSON", "ORG", "DATE", "NORP"]):
11 |         self.tag_list = tag_list
12 |         self.processes = []
13 | 
14 |     def serve_ner(self, ner_label, port):
15 |         print(ner_label)
16 |         # We can actually give everyone the same document. That'll simplify the
17 |         # directory and the update process, and may help the training process.
18 |         #filename = "data/{0}.jsonl".format(ner_label)
19 |         filename = "data/aljazeera_1.jsonl"
20 |         prodigy.serve('ner.teach', "arabic_ner_db", "model-final",
21 |                       filename, None, None, ner_label, None, "arabic_ner_db",
22 |                       port=port)
23 | 
24 |     def serve_ner_manual(self, ner_label, port):
25 |         print(ner_label)
26 |         # We can actually give everyone the same document. That'll simplify the
27 |         # directory and the update process, and may help the training process.
28 |         #filename = "data/{0}.jsonl".format(ner_label)
29 |         filename = "data/aljazeera_1.jsonl"
30 |         prodigy.serve('ner.manual', "arabic_ner_db", "arabic_model",
31 |                       filename, None, None, ner_label, "arabic_ner_db",
32 |                       port=port)
33 | 
34 | 
35 |     def make_prodigies(self):
36 |         for n, tag in enumerate(self.tag_list):
37 |             thread = Process(target=self.serve_ner_manual, args=(tag, 9010 + n))
38 |             #thread = Process(target=self.serve_ner, args=(tag, 9010 + n))
39 |             self.processes.append(thread)
40 | 
41 |     def start_prodigies(self):
42 |         print("Starting Prodigy processes...")
43 |         for p in self.processes:
44 |             p.start()
45 |             sleep(1)
46 | 
47 |     def kill_prodigies(self):
48 |         print("Killing Prodigy threads")
49 |         for i in self.processes:
50 |             try:
51 |                 i.terminate()
52 |             except AttributeError:
53 |                 print("Process {0} doesn't exist?".format(i))
54 |         self.processes = []
55 | 
56 |     def train_and_restart(self):
57 |         print("Re-training model with new annotations...")
58 |         batch_train(dataset="arabic_ner_db",
59 |                     input_model="model-final",
60 |                     n_iter = 10,
61 |                     output_model = Path("arabic_model_updated"))
62 |         print("Model training complete. Restarting service with new model...")
63 |         self.kill_prodigies()
64 |         self.make_prodigies()
65 |         self.start_prodigies()
66 | 
67 |     def make_retrain_time(self):
68 |         # make a datetime for tomorrow at 4 am
69 |         tomorrow = dt.datetime.today() + dt.timedelta(days=1)
70 |         self.retrain_time = dt.datetime.combine(tomorrow, dt.time(4, 0))
71 | 
72 | 
73 | if __name__ == "__main__":
74 |     mp = MultiProdigy()
75 |     mp.make_retrain_time()
76 |     atexit.register(mp.kill_prodigies)
77 |     mp.make_prodigies()
78 |     mp.start_prodigies()
79 |     while True:
80 |         sleep(5)
81 |         if dt.datetime.now() > mp.retrain_time:
82 |             print("Retraining model and scheduling next retraining for tomorrow")
83 |             mp.make_retrain_time() # bump to tomorrow
84 |             mp.train_and_restart()
85 | 
--------------------------------------------------------------------------------
/multiuser_db.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | from multiprocessing import Process
3 | from time import sleep
4 | import atexit
5 | from pathlib import Path
6 | 
7 | from prodigy.components import printers
8 | from prodigy.components.loaders import get_stream
9 | from prodigy.core import recipe, recipe_args
10 | from prodigy.util import TASK_HASH_ATTR, log
11 | from datetime import datetime
12 | from pymongo import MongoClient
13 | import pymongo
14 | from bson.json_util import dumps
15 | from bson.objectid import ObjectId
16 | from random import shuffle
17 | 
18 | from custom_recipes import manual_custom
19 | # Config:
20 | # - add list of coders
21 | # - ?? add port per coder?
22 | # - base file name for files
23 | # - recipe, db, model, output
24 | 
25 | class MultiProdigy:
26 |     def __init__(self,
27 |                  coder_list,
28 |                  db_name,
29 |                  collection_name,
30 |                  recipe_name,
31 |                  view_id,
32 |                  dataset,
33 |                  label=None,
34 |                  model="blank:en"):
35 |         self.coder_list = coder_list
36 |         self.dataset = dataset  # no trailing comma here: it would make this a tuple
37 |         self.db_name = db_name
38 |         self.collection_name = collection_name
39 |         self.processes = []
40 |         self.recipe_name = recipe_name
41 |         self.view_id = view_id
42 |         self.label = label
43 |         self.model = model
44 | 
45 | 
46 |         print("Using recipe ", self.recipe_name)
47 | 
48 |     def serve(self, coder, port):
49 |         print(coder)
50 |         prodigy.serve(self.recipe_name,
51 |                       self.dataset,
52 |                       self.model,
53 |                       coder,
54 |                       self.collection_name,
55 |                       self.db_name,
56 |                       self.label,
57 |                       #view_id=self.view_id,
58 |                       #label=self.label,
59 |                       #None, # api
60 |                       #None, # loader
61 |                       #True, # memorize
62 |                       #None, # exclude
63 |                       port=port,
64 |                       ) # port
65 | 
66 |     def make_prodigies(self):
67 |         for coder_info in self.coder_list:
68 |             # each entry has the coder's name and assigned port
69 |             thread = Process(target=self.serve, kwargs =
70 |                              {"coder": coder_info['name'],
71 |                               "port": coder_info['port']})
72 |             self.processes.append(thread)
73 | 
74 |     def start_prodigies(self):
75 |         print("Starting Prodigy processes...")
76 |         for p in self.processes:
77 |             p.start()
78 |             sleep(1)
79 | 
80 |     def kill_prodigies(self):
81 |         print("Killing Prodigy threads")
82 |         for i in self.processes:
83 |             try:
84 |                 i.terminate()
85 |             except AttributeError:
86 |                 print("Process {0} doesn't exist?".format(i))
87 |         self.processes = []
88 | 
89 | 
90 | if __name__ == "__main__":
91 |     mp = MultiProdigy(coder_list = [{"name" : "Andy", "port" : 9010},
92 |                                     {"name" : "Jill", "port" : 9011}],
93 |                       db_name = "gsr",
94 |                       collection_name = "protest_apsa_en_prod",
95 |                       recipe_name="manual_custom",
96 |                       view_id="manual_custom",
97 |                       dataset="tmp_apsa",
98 |                       label="NOUN,OBJ")
99 |     atexit.register(mp.kill_prodigies)
100 |     mp.make_prodigies()
101 |     mp.start_prodigies()
102 |     while True:
103 |         sleep(30 * 60)
104 |         print("Restarting Prodigy...")
105 |         mp.kill_prodigies()
106 |         mp.make_prodigies()
107 |         mp.start_prodigies()
108 | 
109 |     # if datetime.datetime.now() > mp.retrain_time:
110 |     #     print("Retraining model and scheduling next retraining for tomorrow")
111 |     #     mp.make_retrain_time() # bump to tomorrow
112 |     #     mp.train_and_restart()
113 | 
114 | 
--------------------------------------------------------------------------------
/multiuser_mark.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | from multiprocessing import Process
3 | from time import sleep
4 | from prodigy.recipes.ner import batch_train
5 | import atexit
6 | from pathlib import Path
7 | import datetime as dt
8 | 
9 | from prodigy.components import printers
10 | from prodigy.components.loaders import get_stream
11 | from prodigy.core import recipe, recipe_args
12 | from prodigy.util import TASK_HASH_ATTR, log
13 | from datetime import datetime
14 | from collections import Counter
15 | 
16 | # It's all going to be run by coder name.
17 | 
18 | # Config:
19 | # - add list of coders
20 | # - ?? add port per coder?
21 | # - base file name for files
22 | # - recipe, db, model, output
23 | 
24 | @prodigy.recipe('mark_custom',
25 |                 dataset=recipe_args['dataset'],
26 |                 source=recipe_args['source'],
27 |                 api=recipe_args['api'],
28 |                 loader=recipe_args['loader'],
29 |                 label=recipe_args['label'],
30 |                 view_id=recipe_args['view'],
31 |                 memorize=recipe_args['memorize'],
32 |                 exclude=recipe_args['exclude'])
33 | def mark_custom(dataset, source=None, view_id=None, label='', api=None,
34 |                 loader=None, memorize=False, exclude=None):
35 |     """
36 |     Click through pre-prepared examples, with no model in the loop.
37 |     """
38 |     log('RECIPE: Starting recipe mark', locals())
39 |     stream = list(get_stream(source, api, loader))
40 | 
41 |     counts = Counter()
42 |     memory = {}
43 | 
44 |     def fill_memory(ctrl):
45 |         if memorize:
46 |             examples = ctrl.db.get_dataset(dataset)
47 |             log("RECIPE: Add {} examples from dataset '{}' to memory"
48 |                 .format(len(examples), dataset))
49 |             for eg in examples:
50 |                 memory[eg[TASK_HASH_ATTR]] = eg['answer']
51 | 
52 |     def ask_questions(stream):
53 |         for eg in stream:
54 |             eg['time_loaded'] = datetime.now().isoformat()
55 |             if TASK_HASH_ATTR in eg and eg[TASK_HASH_ATTR] in memory:
56 |                 answer = memory[eg[TASK_HASH_ATTR]]
57 |                 counts[answer] += 1
58 |             else:
59 |                 if label:
60 |                     eg['label'] = label
61 |                 yield eg
62 | 
63 |     def recv_answers(answers):
64 |         for eg in answers:
65 |             counts[eg['answer']] += 1
66 |             memory[eg[TASK_HASH_ATTR]] = eg['answer']
67 |             eg['time_returned'] = datetime.now().isoformat()
68 | 
69 |     def print_results(ctrl):
70 |         print(printers.answers(counts))
71 | 
72 |     def get_progress(session=0, total=0, loss=0):
73 |         progress = sum(counts.values()) / len(stream)  # answered out of total tasks
74 |         return progress
75 | 
76 |     return {
77 |         'view_id': view_id,
78 |         'dataset': dataset,
79 |         'stream': ask_questions(stream),
80 |         'exclude': exclude,
81 |         'update': recv_answers,
82 |         'on_load': fill_memory,
83 |         'on_exit': print_results,
84 |         'config': {'label': label}
85 |     }
86 | 
87 | class MultiProdigy:
88 |     def __init__(self,
89 |                  coder_list = [{"name" : "Daniel", "port" : 9010},
90 |                                {"name" : "Youseff", "port" : 9011},
91 |                                {"name" : "Emad", "port" : 9012},
92 |                                {"name" : "Rafeef", "port" : 9013},
93 |                                {"name" : "Mahmoud", "port" : 9014},
94 |                                {"name" : "Zach", "port" : 9015},
95 |                                {"name" : "Collin", "port" : 9016},
96 |                                ]):
97 |         self.coder_list = coder_list
98 |         self.processes = []
99 | 
100 |     def serve(self, coder, port):
101 |         print(coder)
102 |         base = "data/protest_for_classification_"
103 |         filename = "{0}{1}.jsonl".format(base, coder)
104 |         prodigy.serve('mark_custom', # recipe
105 |                       "gsr_is_protest", # db
106 |                       filename, # input file
107 |                       "classification", # view ID
108 |                       "PROTEST",
109 |                       None, # api
110 |                       None, # loader
111 |                       True, # memorize
112 |                       "gsr_is_protest", # exclude
113 |                       port=port) # port
114 | 
115 |     def make_prodigies(self):
116 |         for coder_info in self.coder_list:
117 |             # each entry has the coder's name and assigned port
118 |             thread = Process(target=self.serve, args = (coder_info['name'], coder_info['port']))
119 |             self.processes.append(thread)
120 | 
121 |     def start_prodigies(self):
122 |         print("Starting Prodigy processes...")
123 |         for p in self.processes:
124 |             p.start()
125 |             sleep(1)
126 | 
127 |     def kill_prodigies(self):
128 |         print("Killing Prodigy threads")
129 |         for i in self.processes:
130 |             try:
131 |                 i.terminate()
132 |             except AttributeError:
133 |                 print("Process {0} doesn't exist?".format(i))
134 |         self.processes = []
135 | 
136 | 
137 | if __name__ == "__main__":
138 |     mp = MultiProdigy()
139 |     #mp.make_retrain_time()
140 |     atexit.register(mp.kill_prodigies)
141 |     mp.make_prodigies()
142 |     mp.start_prodigies()
143 |     while True:
144 |         sleep(5)
145 |     # if dt.datetime.now() > mp.retrain_time:
146 |     #     print("Retraining model and scheduling next retraining for tomorrow")
147 |     #     mp.make_retrain_time() # bump to tomorrow
148 |     #     mp.train_and_restart()
149 | 
150 | 
--------------------------------------------------------------------------------
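The next two files replace the JSONL-file streams above with a Mongo-backed `DBStream`. Reduced to a standalone sketch, the core allocation query looks like this (the collection name is illustrative; `seen` and `coders` are the fields initialized by `mongo_load.py`):

```
from pymongo import MongoClient

coll = MongoClient('mongodb://localhost:27017/')['gsr']['my_tasks']

def tasks_for(coder, n=200):
    # Serve examples this coder hasn't annotated yet, until each
    # example has been seen by three coders.
    return coll.find({"seen": {"$lt": 3},
                      "coders": {"$nin": [coder]}}).limit(n)
```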
/multiuser_db_assault.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | from multiprocessing import Process
3 | from time import sleep
4 | import atexit
5 | from pathlib import Path
6 | 
7 | from prodigy.components import printers
8 | from prodigy.components.loaders import get_stream
9 | from prodigy.core import recipe, recipe_args
10 | from prodigy.util import TASK_HASH_ATTR, log
11 | from datetime import datetime
12 | from pymongo import MongoClient
13 | from bson.json_util import dumps
14 | from bson.objectid import ObjectId
15 | from random import shuffle
16 | from collections import Counter
17 | # Config:
18 | # - add list of coders
19 | # - ?? add port per coder?
20 | # - base file name for files
21 | # - recipe, db, model, output
22 | 
23 | class DBStream:
24 |     def __init__(self, active_coder, collection_name = "silver_assault"):
25 |         self.active_coder = active_coder
26 |         self.coll = setup_mongo(collection_name)
27 |         print("Total tasks in collection: ", self.coll.count())
28 | 
29 |     def get_examples(self):
30 |         print("get_examples called")
31 |         examples = self.coll.find({"$and" : [
32 |             {"seen" : {"$in" : [0,1,2,3]}},
33 |             {"coders" : {"$nin" : [self.active_coder]}}]}).limit(200)
34 |         examples = list(examples)
35 |         print("inside get_examples, this many examples:", len(examples))
36 |         for i in examples:
37 |             i['_id'] = str(i['_id'])
38 |         shuffle(examples)
39 |         self.examples = iter(examples)
40 |         ## !! Need to prioritize examples with 2 or 1 views.
41 | 
42 | def setup_mongo(collection_name, db_name = "gsr"):
43 |     client = MongoClient('mongodb://localhost:27017/')
44 |     db = client[db_name]
45 |     coll = db[collection_name]
46 |     return coll
47 | 
48 | @prodigy.recipe('mark_custom',
49 |                 dataset=recipe_args['dataset'],
50 |                 source=recipe_args['source'],
51 |                 api=recipe_args['api'],
52 |                 loader=recipe_args['loader'],
53 |                 label=recipe_args['label'],
54 |                 view_id=recipe_args['view'],
55 |                 memorize=recipe_args['memorize'],
56 |                 exclude=recipe_args['exclude'])
57 | def mark_custom(dataset, source=None, view_id=None, label='', api=None,
58 |                 loader=None, memorize=False, exclude=None):
59 |     """
60 |     Click through pre-prepared examples, with no model in the loop.
61 |     """
62 |     counts = Counter()  # tally of answers for the exit report
63 |     log('RECIPE: Starting recipe mark', locals())
64 |     coder = source # repurposing input slot
65 |     stream_empty = iter([])
66 |     stream = DBStream(coder, "silver_assault")
67 |     stream.get_examples()
68 |     #print("Initial number of examples in queue:", len(list(stream.examples)))
69 |     #print("Initial examples in queue:", list(stream.examples))
70 | 
71 |     def ask_questions(stream_empty):
72 |         #print("Hitting 'ask_question', with ", len(list(stream.examples)), " in the queue")
73 |         #print(list(stream.examples))
74 |         #print(stream.reced)
75 |         for eg in stream.examples:
76 |             #stream.get_examples()
77 |             eg['time_loaded'] = datetime.now().isoformat()
78 |             # not serializable
79 |             eg['_id'] = str(eg['_id'])
80 |             yield eg
81 | 
82 | 
83 |     #### Problem with the post-answer update.
84 |     ## Not refreshing
85 | 
86 |     def recv_answers(answers):
87 |         for eg in answers:
88 |             print("Answer back: ")#, eg)
89 |             # Get the example from the DB again in case it's changed
90 |             updated_ex = list(stream.coll.find({'_id': ObjectId(eg['_id'])}))
91 |             try:
92 |                 curr_cod = updated_ex[0]['coders']
93 |             except (IndexError, KeyError):
94 |                 curr_cod = []
95 |             # add current coder to the list
96 |             curr_cod.append(coder)
97 |             stream.coll.update_one({"_id": ObjectId(eg['_id'])}, # convert back
98 |                                    {"$set": {"coders": curr_cod,
99 |                                              "seen" : len(curr_cod)}})
100 |             eg['time_returned'] = datetime.now().isoformat()
101 |             eg['active_coder'] = coder
102 |             eg['coders'] = curr_cod
103 |             counts[eg['answer']] += 1
104 |             #stream.get_examples()
105 | 
106 |     def print_results(ctrl):
107 |         print(printers.answers(counts))
108 | 
109 |     def get_progress(session=0, total=0, loss=0):
110 |         done = stream.coll.count({"$or" : [
111 |             {"coders" : coder},
112 |             {"seen" : {"$gte": 3}}]})
113 |         total = stream.coll.count()
114 |         progress = done / total
115 |         return progress
116 | 
117 |     return {
118 |         'view_id': view_id,
119 |         'dataset': dataset,
120 |         'stream': ask_questions(stream_empty),
121 |         'exclude': exclude,
122 |         'progress' : get_progress,
123 |         'update': recv_answers,
124 |         'on_exit': print_results
125 |         #'config': {'label': label}
126 |     }
127 | 
128 | class MultiProdigy:
129 |     def __init__(self,
130 |                  coder_list = [#{"name" : "Daniel", "port" : 9010},
131 |                                #{"name" : "Youseff", "port" : 9011},
132 |                                #{"name" : "Emad", "port" : 9012},
133 |                                {"name" : "Khaled", "port" : 9013}
134 |                                #{"name" : "Mahmoud", "port" : 9014},
135 |                                #{"name" : "Zach", "port" : 9015},
136 |                                #{"name" : "Collin", "port" : 9016},
137 |                                ]):
138 |         self.coder_list = coder_list
139 |         self.processes = []
140 | 
141 |     def serve(self, coder, port):
142 |         print(coder)
143 |         #filename = "{0}{1}.jsonl".format(base, coder)
144 |         prodigy.serve('mark_custom', # recipe
145 |                       "silver_assault", # db
146 |                       coder, # input file, repurposed for coder
147 |                       "classification", # view ID
148 |                       "ASSAULT",
149 |                       None, # api
150 |                       None, # loader
151 |                       True, # memorize
152 |                       None, # exclude
153 |                       port=port) # port
154 | 
155 |     def make_prodigies(self):
156 |         for coder_info in self.coder_list:
157 |             # each entry has the coder's name and assigned port
158 |             thread = Process(target=self.serve, args = (coder_info['name'], coder_info['port']))
159 |             self.processes.append(thread)
160 | 
161 |     def start_prodigies(self):
162 |         print("Starting Prodigy processes...")
163 |         for p in self.processes:
164 |             p.start()
165 |             sleep(1)
166 | 
167 |     def kill_prodigies(self):
168 |         print("Killing Prodigy threads")
169 |         for i in self.processes:
170 |             try:
171 |                 i.terminate()
172 |             except AttributeError:
173 |                 print("Process {0} doesn't exist?".format(i))
174 |         self.processes = []
175 | 
176 | 
177 | if __name__ == "__main__":
178 |     mp = MultiProdigy()
179 |     atexit.register(mp.kill_prodigies)
180 |     mp.make_prodigies()
181 |     mp.start_prodigies()
182 |     while True:
183 |         sleep(60 * 60)
184 |         print("Restarting Prodigy...")
185 |         mp.kill_prodigies()
186 |         mp.make_prodigies()
187 |         mp.start_prodigies()
188 | 
189 |     # if datetime.datetime.now() > mp.retrain_time:
190 |     #     print("Retraining model and scheduling next retraining for tomorrow")
191 |     #     mp.make_retrain_time() # bump to tomorrow
192 |     #     mp.train_and_restart()
193 | 
194 | 
--------------------------------------------------------------------------------
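The `## !! Need to prioritize examples with 2 or 1 views` note above is addressed in `multiuser_manual_db.py` below by sorting on the `seen` counter before limiting; as a minimal sketch of the idea:

```
# Serve partially-annotated examples first so they reach three coders sooner.
examples = coll.find({"seen": {"$in": [0, 1, 2, 3]},
                      "coders": {"$nin": [coder]}}) \
               .sort("seen", pymongo.DESCENDING) \
               .limit(200)
```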
/multiuser_manual_db.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | from multiprocessing import Process
3 | from time import sleep
4 | import atexit
5 | from pathlib import Path
6 | 
7 | from prodigy.components import printers
8 | from prodigy.components.loaders import get_stream
9 | from prodigy.core import recipe, recipe_args
10 | from prodigy.util import TASK_HASH_ATTR, log
11 | from prodigy.components.preprocess import add_tokens
12 | from datetime import datetime
13 | from pymongo import MongoClient
14 | import pymongo
15 | from bson.json_util import dumps
16 | from bson.objectid import ObjectId
17 | from random import shuffle
18 | 
19 | # Config:
20 | # - add list of coders
21 | # - ?? add port per coder?
22 | # - base file name for files
23 | # - recipe, db, model, output
24 | 
25 | import spacy
26 | nlp = spacy.blank("en")
27 | 
28 | def setup_mongo(collection_name, db_name = "gsr"):
29 |     client = MongoClient('mongodb://localhost:27017/')
30 |     db = client[db_name]
31 |     coll = db[collection_name]
32 |     return coll
33 | 
34 | class DBStream:
35 |     """Certain parameters are hard coded, for instance the number of
36 |     annotators to see each example and the Mongo DB name."""
37 |     def __init__(self, active_coder, collection_name):
38 |         self.active_coder = active_coder
39 |         self.coll = setup_mongo(collection_name)
40 |         print("Total tasks in collection: ", self.coll.count())
41 | 
42 |     def get_examples(self):
43 |         print("get_examples called")
44 |         examples = self.coll.find({"$and" : [
45 |             {"seen" : {"$in" : [0,1,2,3]}},
46 |             {"coders" : {"$nin" : [self.active_coder]}}]}).sort("seen", pymongo.DESCENDING).limit(200)
47 |         examples = list(examples)
48 |         print("inside get_examples, this many examples:", len(examples))
49 |         for i in examples:
50 |             if '_id' in i.keys():
51 |                 i['_id'] = str(i['_id'])
52 |         shuffle(examples) # this, of course, obviates the sorting a few lines above...
53 |         self.examples = iter(examples)
54 | 
55 | 
56 | # This decorator has ideas about what keyword arguments to take in.
57 | # Repurpose some of these to convey other information, which is a bit
58 | # ugly.
59 | @prodigy.recipe('manual_custom',
60 |                 dataset=recipe_args['dataset'],
61 |                 source=recipe_args['source'], # use this slot for the coder name
62 |                 api=recipe_args['api'], # use this one for the collection name
63 |                 loader=recipe_args['loader'],
64 |                 label=recipe_args['label'],
65 |                 view_id=recipe_args['view'],
66 |                 memorize=recipe_args['memorize'],
67 |                 exclude=recipe_args['exclude'])
68 | def manual_custom(dataset, source=None, view_id=None, label='', api=None,
69 |                   loader=None, memorize=False, exclude=None):
70 |     """
71 |     Click through pre-prepared examples, with no model in the loop.
72 |     """
73 | 
74 |     log('RECIPE: Starting recipe mark', locals())
75 |     coder = source # repurposing input slot
76 |     stream_empty = iter([])
77 |     stream = DBStream(coder, api) # using the api slot for collection name
78 |     stream.get_examples()
79 | 
80 |     def ask_questions(stream):
81 |         for eg in stream.examples:
82 |             eg['time_loaded'] = datetime.now().isoformat()
83 |             eg['mongo_collection'] = api # record where it came from
84 |             # not serializable
85 |             eg['_id'] = str(eg['_id'])
86 |             # add tokens. add_tokens expects a list...
87 |             ts = add_tokens(nlp, [eg])
88 |             #...and returns a generator
89 |             eg = next(ts)
90 |             yield eg
91 | 
92 | 
93 |     def recv_answers(answers):
94 |         for eg in answers:
95 |             # Retrieve the example from the DB again to get most up-to-date
96 |             # list of coders
97 |             updated_ex = list(stream.coll.find({'_id': ObjectId(eg['_id'])}))
98 |             try:
99 |                 curr_cod = updated_ex[0]['coders']
100 |             except (IndexError, KeyError):
101 |                 curr_cod = []
102 |             # add current coder to the list
103 |             curr_cod.append(coder)
104 |             stream.coll.update_one({"_id": ObjectId(eg['_id'])}, # convert back
105 |                                    {"$set": {"coders": curr_cod,
106 |                                              "seen" : len(curr_cod)}})
107 |             eg['time_returned'] = datetime.now().isoformat() # record submission time
108 |             eg['active_coder'] = coder
109 |             eg['coders'] = curr_cod
110 | 
111 | 
112 |     def get_progress(session=0, total=0, loss=0):
113 |         return None
114 |         #done = stream.coll.count({"$or" : [
115 |         #    {"coders" : coder},
116 |         #    {"seen" : {"$gte": 3}}]})
117 |         #total = stream.coll.count()
118 |         #progress = done / total
119 |         #return progress
120 | 
121 |     return {
122 |         'view_id': view_id,
123 |         'dataset': dataset,
124 |         'stream': ask_questions(stream),
125 |         'exclude': exclude,
126 |         'progress' : get_progress,
127 |         'update': recv_answers,
128 |     }
129 | 
130 | 
131 | 
132 | class MultiProdigy:
133 |     """These are functions that remain the same regardless of the view ID."""
134 |     def __init__(self, coder_list, collection, dataset, view_id = None, label = None):
135 |         self.coder_list = coder_list
136 |         self.collection = collection
137 |         self.dataset = dataset
138 |         self.processes = []
139 |         self.view_id = view_id
140 |         self.label = label
141 | 
142 |     def start_prodigies(self):
143 |         print("Starting Prodigy processes...")
144 |         for p in self.processes:
145 |             p.start()
146 |             sleep(1)
147 | 
148 |     def kill_prodigies(self):
149 |         # Make sure all processes are killed on close
150 |         print("Killing Prodigy threads")
151 |         for i in self.processes:
152 |             try:
153 |                 i.terminate()
154 |             except AttributeError:
155 |                 print("Process {0} doesn't exist?".format(i))
156 |         self.processes = []
157 | 
158 | 
159 | class MultiProdigyManual(MultiProdigy):
160 |     # Only serve() and make_prodigies() are specific to this interface.
161 |     def serve(self, coder, port):
162 |         print(coder)
163 |         prodigy.serve('manual_custom', # recipe
164 |                       self.dataset, # dataset to save it in
165 |                       coder, # input file, repurposed for coder
166 |                       "ner_manual", # view ID
167 |                       self.label,
168 |                       self.collection, # api, repurposed to be collection
169 |                       None, # loader
170 |                       True, # memorize
171 |                       None, # exclude
172 |                       port=port) # port
173 | 
174 |     def make_prodigies(self):
175 |         for coder_info in self.coder_list:
176 |             # each entry has the coder's name and assigned port
177 |             thread = Process(target=self.serve, args =
178 |                              (coder_info['name'],
179 |                               coder_info['port']))
180 |             self.processes.append(thread)
181 | 
182 | 
183 | 
184 | if __name__ == "__main__":
185 |     mp = MultiProdigyManual(
186 |         dataset = "apsa_tmp",
187 |         coder_list = [{"name": "Andy", "port" : 9011}],
188 |         collection = "silver_assault")
189 |     atexit.register(mp.kill_prodigies)
190 |     mp.make_prodigies()
191 |     mp.start_prodigies()
192 |     while True:
193 |         sleep(60 * 60)
194 |         print("Restarting Prodigy...")
195 |         mp.kill_prodigies()
196 |         mp.make_prodigies()
197 |         mp.start_prodigies()
198 | 
199 | 
--------------------------------------------------------------------------------
/multiuser_db_blocks.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | import spacy
3 | from multiprocessing import Process
4 | from time import sleep
5 | import atexit
6 | from pathlib import Path
7 | 
8 | from prodigy.components import printers
9 | from prodigy.components.loaders import get_stream
10 | from prodigy.core import recipe, recipe_args
11 | from prodigy.util import TASK_HASH_ATTR, log
12 | from datetime import datetime
13 | from pymongo import MongoClient
14 | import pymongo
15 | from bson.json_util import dumps
16 | from bson.objectid import ObjectId
17 | from random import shuffle
18 | 
19 | 
20 | class DBStream:
21 |     def __init__(self, active_coder, collection_name):
22 |         print("Using collection: {}".format(collection_name))
23 |         self.active_coder = active_coder
24 |         self.coll = setup_mongo(collection_name)
25 |         print("Total tasks in collection: ", self.coll.count_documents({}))
26 | 
27 |     def get_examples(self):
28 |         print("get_examples called")
29 |         examples = self.coll.find({"$and" : [
30 |             {"assigned_annotators" : {"$in" : [self.active_coder]}}, # check if the task is assigned to the current coder...
31 |             {"coders" : {"$nin" : [self.active_coder]}}]}).sort("sent_id", pymongo.ASCENDING) # ...but the current coder hasn't seen it yet
32 |         examples = list(examples)
33 |         print("inside get_examples, this many examples:", len(examples))
34 |         for i in examples:
35 |             i['_id'] = str(i['_id']) # this gets created by mongo
36 |             i['_task_hash'] = hash(str(i['_id']) + str(self.active_coder)) # NB: builtin hash() is salted per process,
37 |             i['_input_hash'] = hash(str(i['_id']) + str(self.active_coder)) # so these are only stable within one serve session
38 |         self.examples = iter(examples)
39 |         ## !! Need to prioritize examples with 2 or 1 views.
40 | 
41 | def setup_mongo(collection_name, db_name = "gsr"):
42 |     client = MongoClient('mongodb://localhost:27017/')
43 |     db = client[db_name]
44 |     coll = db[collection_name]
45 |     return coll
46 | 
47 | @prodigy.recipe('toi_blocks')
48 | def toi_blocks(dataset, source=None, collection="prod_dec_2020_2"):
49 |     log('RECIPE: Starting recipe mark', locals())
50 |     coder = source # repurposing input slot
51 |     print("Coder from within toi_blocks:", coder)
52 |     stream_empty = iter([])
53 |     stream = DBStream(coder, collection)
54 |     stream.get_examples()
55 |     #print("Initial number of examples in queue:", len(list(stream.examples)))
56 |     #print("Initial examples in queue:", list(stream.examples))
57 | 
58 |     def ask_questions(stream_empty):
59 |         #print("Hitting 'ask_question', with ", len(list(stream.examples)), " in the queue")
60 |         #print(list(stream.examples))
61 |         #print(stream.reced)
62 |         for eg in stream.examples:
63 |             #stream.get_examples()
64 |             eg['time_loaded'] = datetime.now().isoformat()
65 |             eg['active_coder'] = coder
66 |             # not serializable
67 |             eg['_id'] = str(eg['_id'])
68 |             yield eg
69 | 
70 | 
71 |     #### Problem with the post-answer update.
72 |     ## Not refreshing
73 | 
74 |     def recv_answers(answers):
75 |         for eg in answers:
76 |             print("Answer back: ", coder, datetime.now().isoformat())#, eg)
77 |             # Get the example from the DB again in case it's changed
78 |             updated_ex = list(stream.coll.find({'_id': ObjectId(eg['_id'])}))
79 |             try:
80 |                 curr_cod = updated_ex[0]['coders']
81 |             except (IndexError, KeyError):
82 |                 curr_cod = []
83 |             # add current coder to the list
84 |             curr_cod.append(coder)
85 |             stream.coll.update_one({"_id": ObjectId(eg['_id'])}, # convert back
86 |                                    {"$set": {"coders": curr_cod,
87 |                                              "seen" : len(curr_cod),
88 |                                              'time_returned': datetime.now().isoformat(),
89 |                                              'time_loaded': eg['time_loaded'],
90 |                                              'active_coder': coder
91 |                                              }})
92 |             eg['time_returned'] = datetime.now().isoformat()
93 |             eg['seen'] = len(curr_cod)
94 | 
95 |     def print_results(ctrl):
96 |         print("Annotation session ended for coder", coder)
97 | 
98 |     def get_progress(*args, **kwargs):
99 |         done = stream.coll.count_documents({"coders" : coder})
100 |         total = stream.coll.count_documents({})
101 |         return done / total
102 | 
103 |     # We can use the blocks to override certain config and content, and set
104 |     # "text": None for the choice interface so it doesn't also render the text
105 |     blocks = [
106 |         {"view_id": "choice", "text": None},
107 |         {"view_id": "text_input", "field_rows": 3, "field_label": "If you found this example difficult or ambiguous please explain why."}
108 |     ]
109 | 
110 |     return {
111 |         "dataset": dataset, # the dataset to save annotations to
112 |         "view_id": "blocks", # set the view_id to "blocks"
113 |         "stream": ask_questions(stream_empty), # the stream of incoming examples
114 |         "config": {
115 |             "labels": ["RELEVANT"], # the labels for the manual NER interface
116 |             "blocks": blocks, # add the blocks to the config
117 |         },
118 |         'update': recv_answers,
119 |         #"custom_theme": {"cardMaxWidth": "90%"},
120 |     }
121 | 
122 |     # return {
123 |     #     'view_id': view_id,
124 |     #     'dataset': dataset,
125 |     #     'stream': ask_questions(stream_empty),
126 |     #     'exclude': exclude,
127 |     #     "flag": True,
128 |     #     "custom_theme": {"cardMaxWidth": "90%"},
129 |     #     'progress' : get_progress,
130 |     #     'on_exit': print_results
131 |     # }
132 | 
133 | class MultiProdigy:
134 |     def __init__(self,
135 |                  coder_list = [#{"name" : "Andy", "port" : 9010},
136 |                                # {"name" : "Katie", "port" : 9011},
137 |                                # {"name" : "Sheikh", "port" : 9012},
138 |                                #{"name" : 9014, "port" : 9014},
139 |                                {"name" : 9015, "port" : 9015},
140 |                                {"name" : 9016, "port" : 9016},
141 |                                {"name" : 9017, "port" : 9017},
142 |                                #{"name" : 9018, "port" : 9018},
143 |                                {"name" : 9019, "port" : 9019},
144 |                                {"name" : 9020, "port" : 9020},
145 |                                #{"name" : 9021, "port" : 9021},
146 |                                {"name" : 9022, "port" : 9022},
147 |                                {"name" : 9023, "port" : 9023},
148 |                                #{"name" : 9024, "port" : 9024},
149 |                                {"name" : 9025, "port" : 9025},
150 |                                ]):
151 |         self.coder_list = coder_list
152 |         self.processes = []
153 | 
154 |     def serve(self, coder, port):
155 |         print(coder)
156 |         #filename = "{0}{1}.jsonl".format(base, coder)
157 |         prodigy.serve('toi_blocks', # recipe
158 |                       "prod_dec_2020_2", # collection
159 |                       coder, # input file, repurposed for coder
160 |                       port=port) # port
161 | 
162 |     def make_prodigies(self):
163 |         for coder_info in self.coder_list:
164 |             # each entry has the coder's name and assigned port
165 |             thread = Process(target=self.serve, args = (coder_info['name'], coder_info['port']))
166 |             self.processes.append(thread)
167 | 
168 |     def start_prodigies(self):
169 |         print("Starting Prodigy processes...")
170 |         for p in self.processes:
171 |             p.start()
172 |             sleep(1)
173 | 
174 |     def kill_prodigies(self):
175 |         print("Killing Prodigy threads")
176 |         for i in self.processes:
177 |             try:
178 |                 i.terminate()
179 |             except AttributeError:
180 |                 print("Process {0} doesn't exist?".format(i))
181 |         self.processes = []
182 | 
183 | 
184 | if __name__ == "__main__":
185 |     mp = MultiProdigy()
186 |     atexit.register(mp.kill_prodigies)
187 |     mp.make_prodigies()
188 |     mp.start_prodigies()
189 |     while True:
190 |         sleep(60 * 60)
191 |         print("Restarting Prodigy...")
192 |         mp.kill_prodigies()
193 |         mp.make_prodigies()
194 |         mp.start_prodigies()
195 | 
196 | 
197 | 
--------------------------------------------------------------------------------
/Report.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Gold Standard Record Coding Report"
3 | date: "`r paste0(format(Sys.time(), '%d %B %Y, %H:%M:%S', tz='America/New_York',usetz=TRUE), ' (East Coast)')`"
4 | output:
5 |   html_document:
6 |     self_contained: true
7 | ---
8 | 
9 | ```{r setup, include=FALSE}
10 | library(knitr)
11 | opts_chunk$set(fig.width = 5)
12 | opts_chunk$set(fig.height= 4)
13 | opts_chunk$set(echo = TRUE)
14 | opts_chunk$set(warning = FALSE)
15 | opts_chunk$set(message = FALSE)
16 | opts_chunk$set(echo = FALSE)
17 | ```
18 | 
19 | ```{r message = FALSE, echo = FALSE, results = "hide"}
20 | library(ggplot2)
21 | library(RColorBrewer)
22 | library(scales)
23 | library(grid)
24 | library(extrafont)
25 | 
26 | theme_mcm <- function() {
27 |   # Generate the colors for the chart procedurally with RColorBrewer
28 |   palette <- brewer.pal("Greys", n=9)
29 |   #color.background = "#fcfaea"
30 |   #color.background = "#ffffff"
31 |   color.background = "white" #palette[2]
32 |   color.grid.major = palette[3]
33 |   #color.grid.major = palette[3]
34 |   color.axis.text = palette[6]
35 |   color.axis.title = palette[7]
36 |   color.title = palette[8]
37 | 
38 |   # Begin construction of chart
39 |   theme_bw(base_size=12) +
40 | 
41 |     # Set the entire chart region to a light gray color
42 |     theme(panel.background=element_rect(fill=color.background, color=color.background)) +
43 |     theme(plot.background=element_rect(fill=color.background, color=color.background)) +
44 |     theme(panel.border=element_rect(color=color.background)) +
45 | 
46 |     # Format the grid
47 |     theme(panel.grid.major=element_line(color=color.grid.major,size=.15)) +
48 |     theme(panel.grid.minor=element_line(color=color.grid.major,size=.07)) +
49 |     theme(panel.grid.minor.y=element_blank()) +
50 |     theme(axis.ticks=element_blank()) +
51 | 
52 |     # Format the legend
53 |     theme(legend.background = element_rect(fill=color.background)) +
54 |     theme(legend.key = element_rect(fill=color.background)) +
55 |     theme(legend.text = element_text(size=9,color=color.axis.title)) +
56 |     theme(legend.title = element_text(color=color.axis.title)) +
57 | 
58 |     # Set title and axis labels, and format these and tick marks
59 |     theme(plot.title=element_text(color=color.title, size=12, vjust=1.25)) +
60 |     theme(plot.subtitle=element_text(color="#353535", size=11)) + #, vjust=1.25
61 |     theme(plot.caption=element_text(color=color.axis.title)) +
62 |     theme(axis.text.x=element_text(size=9,color=color.axis.text)) +
63 |     theme(axis.text.y=element_text(size=9,color=color.axis.text)) +
64 |     theme(axis.title.x=element_text(size=10,color=color.axis.title, vjust=0)) +
65 |     theme(axis.title.y=element_text(size=10,color=color.axis.title, vjust=1.25)) +
66 |     theme(plot.caption=element_text(size=7,color=palette[4], vjust = 6)) +
67 | 
68 |     # Facets
69 |     theme(strip.background = element_rect(colour = color.background,
70 |                                           fill = palette[3],
71 |                                           size = 0.5)) +
72 |     #theme(strip.text.y = element_text(vjust= -1.75)) +
73 | 
74 |     # Fonts
75 |     theme(text=element_text(family="Tw Cen MT", margin=margin(b=15))) +
76 |     theme(plot.subtitle=element_text(family="Tw Cen MT")) +
77 | 
78 |     # scrunch the titles down closer
79 |     theme(plot.title = element_text(margin = margin(1.5))) +
80 |     theme(plot.subtitle = element_text(margin = margin(1.5))) +
81 | 
82 |     #...and move the legend top-right
83 |     #theme(legend.margin=margin(-20)) +
84 |     #theme(legend.justification = "right") +
85 | 
86 |     # Plot margins
87 |     theme(plot.margin = unit(c(0.35, 0.2, 0.3, 0.35), "cm"))
88 | }
89 | 
90 | hex <- c("#cfa81f", "#847a8e", "#ad4738", "#7c812d", "#008f7d",
91 |          "#6d472f", "#10355e", "#fc9c8e", "#99b8c6", "#ca5f33")
92 | color <- c("yellow", "purple", "red", "green", "teal",
93 |            "brown", "blue", "pink", "sky", "orange")
94 | #ggthemr::ggthemr('dust')
95 | #ggthemr::ggthemr('pale')
96 | theme_set(theme_mcm())
97 | ```
98 | 
99 | ```{r}
100 | library(dplyr)
101 | #library(plotly)
102 | ```
103 | 
104 | 
105 | ```{r}
106 | df <- read.csv("coding_summary.csv")
107 | df$date <- as.Date(df$date)
108 | ```
109 | 
110 | ```{r}
111 | gold <- df %>%
112 |   group_by(id) %>%
113 |   summarize(accept = sum(answer == "accept"),
114 |             count = n()) %>%
115 |   filter(count >= 2, accept == count) %>%
116 |   nrow()
117 | ```
118 | 
119 | ```{r}
120 | gold_reject <- df %>%
121 |   group_by(id) %>%
122 |   summarize(good = sum(answer == "reject") == 3) %>%
123 |   filter(good == TRUE) %>%
124 |   nrow()
125 | ```
126 | 
127 | ```{r}
128 | silver <- df %>%
129 |   group_by(id) %>%
130 |   summarize(good = sum(answer == "accept") >= 2) %>%
131 |   filter(good == TRUE) %>%
132 |   nrow()
133 | ```
134 | 
135 | 
136 | ## Overview
137 | 
138 | We've collected `r nrow(df)` total labels.
139 | There are currently `r gold` gold-standard accepts (every annotator accepted), `r silver` silver-standard accepts (at least two accepts), and `r gold_reject` examples with three rejects.
140 | 
141 | 
142 | ## Statistics
143 | 
144 | ```{r}
145 | p <- df %>%
146 |   group_by(coder) %>%
147 |   summarize(count = n()) %>%
148 |   ggplot(., aes(x = coder, y = count)) +
149 |   geom_bar(stat = "identity", width = 0.4) +
150 |   labs(title = "Sentences coded over all time")
151 | p
152 | ```
153 | 
154 | 
155 | ```{r}
156 | library(lubridate)
157 | week <- today() - dweeks(1)
158 | 
159 | 
160 | p <- df %>%
161 |   filter(as.Date(date) >= week) %>%
162 |   group_by(coder) %>%
163 |   summarize(count = n()) %>%
164 |   ggplot(., aes(x = coder, y = count)) +
165 |   geom_bar(stat = "identity", width = 0.4) +
166 |   labs(title = paste0("Sentences coded for the week of ", today() - dweeks(1), " to ", today()))
167 | p
168 | ```
169 | 
170 | ```{r}
171 | p <- df %>%
172 |   group_by(coder, date) %>%
173 |   summarize(count = n()) %>%
174 |   ggplot(., aes(x = date, y = coder,
175 |                 alpha = count)) +
176 |   geom_tile() +
177 |   labs(title = "Daily contributions per coder",
178 |        x = "Date") +
179 |   scale_x_date(date_minor_breaks = "1 day")
180 | p
181 | ```
182 | 
183 | 
184 | ```{r}
185 | library(ggridges)
186 | 
187 | coder_means <- df %>%
188 |   filter(diff < 300) %>%
189 |   group_by(coder) %>%
190 |   summarize(avg_time = median(diff))
191 | 
192 | p <- df %>%
193 |   filter(diff < 300) %>%
194 |   ggplot(., aes(x = diff, y = coder)) +
195 |   #geom_density_ridges() +
196 |   geom_density_ridges(stat = "binline", bins = 100, scale = 0.95) +
197 |   geom_point(data = coder_means, aes(x = avg_time, y = coder)) +
198 |   geom_vline(xintercept = 60, linetype = 2) +
199 |   labs(title = "Distribution of time per coder",
200 |        subtitle = "Dots indicate median time, dashed line is 60 seconds",
201 |        x = "Seconds per task",
202 |        y = NULL)
203 | p
204 | ```
205 | 
206 | Excluding all tasks that take more than five minutes:
207 | 
208 | ```{r results = "asis"}
209 | time_table <- df %>%
210 |   filter(as.Date(date) >= week, diff < 300) %>%
211 |   group_by(coder) %>%
212 |   summarize(total_time = sum(diff) / 60) %>%
213 |   arrange(desc(total_time))
214 | 
215 | kable(time_table, caption = "Minutes of coding time in past 7 days",
216 |       digits = 1)
217 | ```
218 | 
219 | ```{r results = "asis"}
220 | d <- today()
221 | prev_days <- seq(d-13,d,by='day')
222 | sats <- prev_days[weekdays(prev_days)=='Saturday']
223 | 
224 | time_table <- df %>%
225 |   filter(as.Date(date) >= sats[1], as.Date(date) < sats[2], diff < 300) %>%
226 |   group_by(coder) %>%
227 |   summarize(total_time = sum(diff) / 60) %>%
228 |   arrange(desc(total_time))
229 | 
230 | kable(time_table, caption = paste0("Minutes of coding time in previous pay period (",
231 |                                    paste0(c(sats[1], sats[2]-1), collapse = " to "),
232 |                                    ")"),
233 |       digits = 1)
234 | ```
235 | 
236 | ```{r results = "asis"}
237 | time_table <- df %>%
238 |   filter(diff < 300) %>%
239 |   group_by(coder) %>%
240 |   summarize(total_time = sum(diff) / 60) %>%
241 |   arrange(desc(total_time))
242 | 
243 | kable(time_table, caption = "Minutes of coding time (all time)",
244 |       digits = 1)
245 | ```
246 | 
247 | ```{r}
248 | p <- df %>%
249 |   group_by(coder) %>%
250 |   summarize(mean_accept = mean(answer == "accept"), n = n()) %>%
251 |   mutate(var = (mean_accept * (1 - mean_accept) / n)) %>%
252 |   mutate(upper = mean_accept + 1.96*sqrt(var),
253 |          lower = mean_accept - 1.96*sqrt(var)) %>%
254 |   ggplot(., aes(x = coder, y = mean_accept)) +
255 |   geom_bar(stat = "identity", width = 0.4) +
256 |   geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2) +
257 |   labs(title = "Acceptance rate per coder",
258 |        y = "Acceptance Rate",
259 |        x = NULL)
260 | p
261 | ```
262 | 
263 | 
264 | ```{r}
265 | p <- df %>%
266 |   group_by(id) %>%
267 |   summarize(count = n()) %>%
268 |   group_by(count) %>%
269 |   summarize(examples = n()) %>%
270 |   ggplot(., aes(x = as.factor(count), y = examples)) +
271 |   geom_bar(stat = "identity", width = 0.4) +
272 |   labs(title = "Distribution of completed examples",
273 |        subtitle = "Each example is assigned to three coders",
274 |        x = "Annotations per example")
275 | p
276 | ```
277 | 
278 | ```{r}
279 | p <- df %>%
280 |   group_by(id) %>%
281 |   mutate(count = n()) %>%
282 |   filter(count > 1) %>%
283 |   summarize(var = var(answer == "accept")) %>%
284 |   ggplot(., aes(x = var)) +
285 |   geom_histogram(bins = 50) +
286 |   labs(title = "Variance of answers")
287 | p
288 | ```
289 | 
290 | 
291 | 
292 | 
293 | 
294 | 
--------------------------------------------------------------------------------