├── streamlit_dashboard.png
├── mongo_delete.py
├── mongo_load.py
├── LICENSE
├── report_maker.py
├── annotation_streamlit.py
├── custom_ner_manual.py
├── README.md
├── multiuser_ner.py
├── multiuser_db.py
├── multiuser_mark.py
├── multiuser_db_assault.py
├── multiuser_manual_db.py
├── multiuser_db_blocks.py
└── Report.Rmd

/streamlit_dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahalterman/multiuser_prodigy/HEAD/streamlit_dashboard.png
--------------------------------------------------------------------------------
/mongo_delete.py:
--------------------------------------------------------------------------------
1 | import jsonlines
2 | from pymongo import MongoClient
3 | import plac
4 | import sys
5 | 
6 | @plac.annotations(
7 |     coll_name=("Collection to delete", "option", "c", str),
8 |     db_name=("Database to delete from", "option", "d", str))
9 | def delete(coll_name, db_name = "gsr"):
10 |     # quick script for deleting all tasks in a Mongo collection
11 |     client = MongoClient('mongodb://localhost:27017/')
12 |     db = client[db_name]
13 |     coll = db[coll_name]
14 |     count = coll.count()
15 |     conf = input("You're about to delete the collection {0}, which has {1} records. Please type this name to confirm: ".format(coll_name, count))
16 |     if conf != coll_name:
17 |         print("Bye!")
18 |         sys.exit(0)
19 |     if conf == coll_name:
20 |         coll.delete_many({})
21 |         print("Deleted all records from ", coll_name)
22 | 
23 | if __name__ == "__main__":
24 |     plac.call(delete)
25 | 
--------------------------------------------------------------------------------
/mongo_load.py:
--------------------------------------------------------------------------------
1 | import jsonlines
2 | from pymongo import MongoClient
3 | import plac
4 | 
5 | @plac.annotations(
6 |     input_file=("JSONL of tasks to load", "option", "i", str),
7 |     coll_name=("Collection to load into", "option", "c", str),
8 |     db_name=("Database to load into", "option", "d", str))
9 | def load(input_file, coll_name, db_name = "gsr"):
10 |     # quick script for loading new annotation tasks into Mongo
11 |     with jsonlines.open(input_file, "r") as f:
12 |         to_load = list(f.iter())
13 |     for i in to_load:
14 |         i['seen'] = 0
15 |         i['coders'] = []
16 |     print("Loading into db {0}, collection {1}".format(db_name, coll_name))
17 |     client = MongoClient('mongodb://localhost:27017/')
18 |     db = client[db_name]
19 |     coll = db[coll_name]
20 |     print("Before loading:", coll.count())
21 |     coll.insert_many(to_load)
22 |     print("After loading:", coll.count())
23 | 
24 | if __name__ == "__main__":
25 |     plac.call(load)
26 | 
--------------------------------------------------------------------------------
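For reference, these two scripts can be invoked like so; the file and collection names here are only illustrative, and `-d` can be omitted to use the default `gsr` database:

```
python mongo_load.py -i tasks.jsonl -c "my_collection" -d gsr
python mongo_delete.py -c "my_collection" -d gsr
```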
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2018 Andy Halterman
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/report_maker.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | from prodigy.components.db import connect
3 | from pandas import DataFrame
4 | from datetime import datetime, timedelta
5 | from dateutil import parser
6 | import os
7 | import plac
8 | 
9 | @plac.annotations(
10 |     db_name=("Name of Prodigy database to generate export from", "option", "i", str))
11 | def main(db_name):
12 |     db = connect()
13 |     examples = db.get_dataset(db_name)
14 |     print("Total examples: ", len(examples))
15 | 
16 |     diffs = []
17 |     for ex in examples:
18 |         if 'time_returned' in ex.keys() and 'time_loaded' in ex.keys():
19 |             date = parser.parse(ex['time_returned']).strftime("%Y-%m-%d")
20 |             diff = parser.parse(ex['time_returned']) - parser.parse(ex['time_loaded'])
21 |             diff = diff.total_seconds()
22 |             diffs.append({"date" : date,
23 |                           "coder" : ex['active_coder'],
24 |                           "diff" : diff,
25 |                           "id" : ex['id'][-16:],
26 |                           'answer': ex['answer']})
27 | 
28 |     df = DataFrame(diffs)
29 |     df.to_csv("/home/andy/multiuser_prodigy/coding_summary.csv")
30 |     os.system("""/usr/bin/Rscript -e 'library(rmarkdown); rmarkdown::render("multiuser_prodigy/Report.Rmd", "html_document")'""")
31 |     os.system("""echo pwd""")
32 | 
33 | if __name__ == "__main__":
34 |     plac.call(main)
35 | 
--------------------------------------------------------------------------------
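Note that `report_maker.py` renders `multiuser_prodigy/Report.Rmd` by relative path, so it assumes it runs from the directory above the repo; it is meant to be run on a schedule. A hypothetical cron entry (the Python path and dataset name are placeholders) could look like:

```
0 4 * * * cd /home/andy && /usr/bin/python3 multiuser_prodigy/report_maker.py -i my_prodigy_dataset
```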
{}
""" 12 | 13 | @st.cache(allow_output_mutation=True, suppress_st_warning=True) 14 | def setup_mongo(): 15 | client = MongoClient('mongodb://localhost:27017/') 16 | db = client['gsr'] 17 | coll = db['prod_dec_2020_2'] 18 | return coll 19 | 20 | def visualize(coll): 21 | coder = st.selectbox("Select your port/ID number", 22 | [9015, 9016, 9017, 9019, 9020, 9022, 9023, 9025]) 23 | coder = int(coder) 24 | #st.markdown("Total sentences in collection: {}".format(coll.count())) 25 | assigned = coll.count({"assigned_annotators": {"$in" : [coder]}}) 26 | completed = coll.count({"coders": {"$in" : [coder]}}) 27 | st.markdown("Sentences assigned to {}: {}".format(coder, assigned)) 28 | st.markdown("Sentences completed by {}: {}".format(coder, completed)) 29 | st.markdown("Progress:") 30 | try: 31 | prog = completed/assigned 32 | except ZeroDivisionError: 33 | prog = 0 34 | st.progress(prog) 35 | 36 | 37 | st.title('Annotation progress') 38 | st.markdown("Check your annotation progress by selecting your port/ID number") 39 | coll = setup_mongo() 40 | visualize(coll) 41 | -------------------------------------------------------------------------------- /custom_ner_manual.py: -------------------------------------------------------------------------------- 1 | import random 2 | import mmh3 3 | import json 4 | import spacy 5 | import copy 6 | 7 | from .compare import get_questions as get_compare_questions 8 | from ..models.ner import EntityRecognizer, merge_spans 9 | from ..models.matcher import PatternMatcher 10 | from ..components import printers 11 | from ..components.db import connect 12 | from ..components.preprocess import split_sentences, split_spans, split_tokens 13 | from ..components.sorters import prefer_uncertain 14 | from ..components.loaders import get_stream 15 | from ..components.filters import filter_tasks 16 | from ..core import recipe, recipe_args 17 | from ..util import split_evals, get_labels, get_print, combine_models 18 | from ..util import export_model_data, set_hashes, log, prints 19 | from ..util import INPUT_HASH_ATTR, TASK_HASH_ATTR 20 | 21 | 22 | DB = connect() 23 | 24 | @recipe('ner.manual', 25 | dataset=recipe_args['dataset'], 26 | spacy_model=recipe_args['spacy_model'], 27 | source=recipe_args['source'], 28 | api=recipe_args['api'], 29 | loader=recipe_args['loader'], 30 | label=recipe_args['label'], 31 | exclude=recipe_args['exclude']) 32 | def manual(dataset, spacy_model, source=None, api=None, loader=None, 33 | label=None, exclude=None): 34 | """ 35 | Mark spans by token. Requires only a tokenizer and no entity recognizer, 36 | and doesn't do any active learning. 
/custom_ner_manual.py:
--------------------------------------------------------------------------------
1 | import random
2 | import mmh3
3 | import json
4 | import spacy
5 | import copy
6 | 
7 | from .compare import get_questions as get_compare_questions
8 | from ..models.ner import EntityRecognizer, merge_spans
9 | from ..models.matcher import PatternMatcher
10 | from ..components import printers
11 | from ..components.db import connect
12 | from ..components.preprocess import split_sentences, split_spans, split_tokens
13 | from ..components.sorters import prefer_uncertain
14 | from ..components.loaders import get_stream
15 | from ..components.filters import filter_tasks
16 | from ..core import recipe, recipe_args
17 | from ..util import split_evals, get_labels, get_print, combine_models
18 | from ..util import export_model_data, set_hashes, log, prints
19 | from ..util import INPUT_HASH_ATTR, TASK_HASH_ATTR
20 | 
21 | 
22 | DB = connect()
23 | 
24 | @recipe('ner.manual',
25 |         dataset=recipe_args['dataset'],
26 |         spacy_model=recipe_args['spacy_model'],
27 |         source=recipe_args['source'],
28 |         api=recipe_args['api'],
29 |         loader=recipe_args['loader'],
30 |         label=recipe_args['label'],
31 |         exclude=recipe_args['exclude'])
32 | def manual(dataset, spacy_model, source=None, api=None, loader=None,
33 |            label=None, exclude=None):
34 |     """
35 |     Mark spans by token. Requires only a tokenizer and no entity recognizer,
36 |     and doesn't do any active learning.
37 |     """
38 |     log("RECIPE: Starting recipe ner.manual", locals())
39 |     nlp = spacy.load(spacy_model)
40 |     log("RECIPE: Loaded model {}".format(spacy_model))
41 |     labels = get_labels(label, nlp)
42 |     log("RECIPE: Annotating with {} labels".format(len(labels)), labels)
43 |     stream = get_stream(source, api=api, loader=loader, rehash=True,
44 |                         dedup=True, input_key='text')
45 |     stream = split_tokens(nlp, stream)
46 | 
47 |     return {
48 |         'view_id': 'ner_manual',
49 |         'dataset': dataset,
50 |         'stream': stream,
51 |         'exclude': exclude,
52 |         'config': {'labels': labels}
53 |     }
54 | 
55 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # multiuser_prodigy
2 | 
3 | This is a multi-annotator setup for [Prodigy](http://prodi.gy/),
4 | Explosion AI's data annotation tool, that uses a Mongo DB to allocate
5 | annotation tasks to annotators working on different Prodigy instances running
6 | on separate ports. This use case focuses on collecting gold standard
7 | annotations from a team of annotators using Prodigy, rather than on the active
8 | learning, single-annotator setup that Prodigy is primarily intended for.
9 | 
10 | There are a few examples of annotation interfaces in the repo, including code
11 | for annotators working on training an NER model or doing sentence
12 | classification with document context. Each annotator works on the Prodigy
13 | instance/port assigned to them, and a new `DBStream` class handles pulling
14 | the examples assigned to each worker out of the Mongo DB.
15 | 
16 | I've used this setup for three major annotation projects now, but you'll need
17 | to modify the code to get it working for your project as well.
18 | 
19 | ## Mongo database
20 | 
21 | All tasks are stored in a Mongo DB, which allows different logic for how tasks
22 | are assigned to annotators. For instance, examples can go out to annotators
23 | until three annotations are collected, they can go to two predetermined
24 | annotators from the wider pool, or they can be automatically resubmitted
25 | to a third annotator if the first two annotations disagree.
26 | 
27 | You can start a Mongo DB in a Docker container:
28 | 
29 | ```
30 | sudo docker run -d -p 127.0.0.1:27017:27017 -v /home/andy/MIT/multiuser_prodigy/db:/data/db mongo
31 | ```
32 | 
33 | To load a list of tasks into the database:
34 | 
35 | ```
36 | python mongo_load.py -i assault_not_assault.jsonl -c "assault_gsr"
37 | ```
38 | 
39 | where `-i` is a JSONL file of tasks and `-c` specifies the collection name to
40 | load them into.
41 | 
42 | 
43 | 
44 | 
45 | ## Running
46 | 
47 | You'll need to modify the code of `multiuser_db.py` to access the right
48 | collection, set the names/ports of annotators, and the desired interface (NER,
49 | classification, etc.).
50 | 
51 | Then you should launch the processes either in a `screen` or in the background:
52 | 
53 | ```
54 | python multiuser_db.py
55 | ```
56 | 
57 | ## Analysis
58 | 
59 | ![](streamlit_dashboard.png)
60 | 
61 | You can use Streamlit to set up a dashboard so annotators can check their
62 | progress. This one pulls results from the Mongo DB, but you could also query the
63 | Prodigy DB and show results from there.
64 | 
65 | 
66 | A more complicated analysis dashboard setup is in
67 | `Report.Rmd`. This RMarkdown file reads in a CSV of coding information and
68 | generates figures in an HTML page that can be served from the annotation
69 | server. To record information about how long each task takes, add something
70 | like `eg['time_loaded'] = datetime.now().isoformat()` to your stream code and
71 | something like `eg['time_returned'] = datetime.now().isoformat()` to your
72 | update code. `report_maker.py` exports the DB to CSV and knits the RMarkdown on
73 | that CSV.
74 | 
--------------------------------------------------------------------------------
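Condensed to its essentials, the timing pattern the README describes looks something like this inside a custom recipe (a sketch; see `multiuser_mark.py` below for the full working version):

```
from datetime import datetime

def ask_questions(stream):
    for eg in stream:
        eg['time_loaded'] = datetime.now().isoformat()    # when the task was served
        yield eg

def recv_answers(answers):
    for eg in answers:
        eg['time_returned'] = datetime.now().isoformat()  # when the answer came back
```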
/multiuser_ner.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | from multiprocessing import Process
3 | from time import sleep
4 | from prodigy.recipes.ner import batch_train
5 | import atexit
6 | from pathlib import Path
7 | import datetime as dt
8 | 
9 | class MultiProdigy:
10 |     def __init__(self, tag_list = ["LOC", "GPE", "PERSON", "ORG", "DATE", "NORP"]):
11 |         self.tag_list = tag_list
12 |         self.processes = []
13 | 
14 |     def serve_ner(self, ner_label, port):
15 |         print(ner_label)
16 |         # We can actually give everyone the same document. That'll simplify the
17 |         # directory and the update process, and may help the training process.
18 |         #filename = "data/{0}.jsonl".format(ner_label)
19 |         filename = "data/aljazeera_1.jsonl"
20 |         prodigy.serve('ner.teach', "arabic_ner_db", "model-final",
21 |                       filename, None, None, ner_label, None, "arabic_ner_db",
22 |                       port=port)
23 | 
24 |     def serve_ner_manual(self, ner_label, port):
25 |         print(ner_label)
26 |         # We can actually give everyone the same document. That'll simplify the
27 |         # directory and the update process, and may help the training process.
28 |         #filename = "data/{0}.jsonl".format(ner_label)
29 |         filename = "data/aljazeera_1.jsonl"
30 |         prodigy.serve('ner.manual', "arabic_ner_db", "arabic_model",
31 |                       filename, None, None, ner_label, "arabic_ner_db",
32 |                       port=port)
33 | 
34 | 
35 |     def make_prodigies(self):
36 |         for n, tag in enumerate(self.tag_list):
37 |             thread = Process(target=self.serve_ner_manual, args=(tag, 9010 + n))
38 |             #thread = Process(target=self.serve_ner, args=(tag, 9010 + n))
39 |             self.processes.append(thread)
40 | 
41 |     def start_prodigies(self):
42 |         print("Starting Prodigy processes...")
43 |         for p in self.processes:
44 |             p.start()
45 |             sleep(1)
46 | 
47 |     def kill_prodigies(self):
48 |         print("Killing Prodigy threads")
49 |         for i in self.processes:
50 |             try:
51 |                 i.terminate()
52 |             except AttributeError:
53 |                 print("Process {0} doesn't exist?".format(i))
54 |         self.processes = []
55 | 
56 |     def train_and_restart(self):
57 |         print("Re-training model with new annotations...")
58 |         batch_train(dataset="arabic_ner_db",
59 |                     input_model="model-final",
60 |                     n_iter = 10,
61 |                     output_model = Path("arabic_model_updated"))
62 |         print("Model training complete. Restarting service with new model...")
63 |         self.kill_prodigies()
64 |         self.make_prodigies()
65 |         self.start_prodigies()
66 | 
67 |     def make_retrain_time(self):
68 |         # make a datetime for tomorrow at 4 am
69 |         tomorrow = dt.datetime.today() + dt.timedelta(days=1)
70 |         self.retrain_time = dt.datetime.combine(tomorrow, dt.time(4, 0))
71 | 
72 | 
73 | if __name__ == "__main__":
74 |     mp = MultiProdigy()
75 |     mp.make_retrain_time()
76 |     atexit.register(mp.kill_prodigies)
77 |     mp.make_prodigies()
78 |     mp.start_prodigies()
79 |     while True:
80 |         sleep(5)
81 |         if dt.datetime.now() > mp.retrain_time:
82 |             print("Retraining model and scheduling next retraining for tomorrow")
83 |             mp.make_retrain_time() # bump to tomorrow
84 |             mp.train_and_restart()
85 | 
--------------------------------------------------------------------------------
/multiuser_db.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | from multiprocessing import Process
3 | from time import sleep
4 | import atexit
5 | from pathlib import Path
6 | 
7 | from prodigy.components import printers
8 | from prodigy.components.loaders import get_stream
9 | from prodigy.core import recipe, recipe_args
10 | from prodigy.util import TASK_HASH_ATTR, log
11 | from datetime import datetime
12 | from pymongo import MongoClient
13 | import pymongo
14 | from bson.json_util import dumps
15 | from bson.objectid import ObjectId
16 | from random import shuffle
17 | 
18 | from custom_recipes import manual_custom
19 | # Config:
20 | # - add list of coders
21 | # - ?? add port per coder?
22 | # - base file name for files
23 | # - recipe, db, model, output
24 | 
25 | class MultiProdigy:
26 |     def __init__(self,
27 |                  coder_list,
28 |                  db_name,
29 |                  collection_name,
30 |                  recipe_name,
31 |                  view_id,
32 |                  dataset,
33 |                  label=None,
34 |                  model="blank:en"):
35 |         self.coder_list = coder_list
36 |         self.dataset = dataset  # no trailing comma here: it would make this a tuple
37 |         self.db_name = db_name
38 |         self.collection_name = collection_name
39 |         self.processes = []
40 |         self.recipe_name = recipe_name
41 |         self.view_id = view_id
42 |         self.label = label
43 |         self.model = model
44 | 
45 | 
46 |         print("Using recipe ", self.recipe_name)
47 | 
48 |     def serve(self, coder, port):
49 |         print(coder)
50 |         prodigy.serve(self.recipe_name,
51 |                       self.dataset,
52 |                       self.model,
53 |                       coder,
54 |                       self.collection_name,
55 |                       self.db_name,
56 |                       self.label,
57 |                       #view_id=self.view_id,
58 |                       #label=self.label,
59 |                       #None, # api
60 |                       #None, # loader
61 |                       #True, # memorize
62 |                       #None, # exclude
63 |                       port=port,
64 |                       ) # port
65 | 
66 |     def make_prodigies(self):
67 |         for coder_info in self.coder_list:
68 |             # each entry has the coder's name and assigned port
69 |             thread = Process(target=self.serve, kwargs =
70 |                              {"coder": coder_info['name'],
71 |                               "port": coder_info['port']})
72 |             self.processes.append(thread)
73 | 
74 |     def start_prodigies(self):
75 |         print("Starting Prodigy processes...")
76 |         for p in self.processes:
77 |             p.start()
78 |             sleep(1)
79 | 
80 |     def kill_prodigies(self):
81 |         print("Killing Prodigy threads")
82 |         for i in self.processes:
83 |             try:
84 |                 i.terminate()
85 |             except AttributeError:
86 |                 print("Process {0} doesn't exist?".format(i))
87 |         self.processes = []
88 | 
89 | 
90 | if __name__ == "__main__":
91 |     mp = MultiProdigy(coder_list = [{"name" : "Andy", "port" : 9010},
92 |                                     {"name" : "Jill", "port" : 9011}],
93 |                       db_name = "gsr",
94 |                       collection_name = "protest_apsa_en_prod",
95 |                       recipe_name="manual_custom",
96 |                       view_id="manual_custom",
97 |                       dataset="tmp_apsa",
98 |                       label="NOUN,OBJ")
99 |     atexit.register(mp.kill_prodigies)
100 |     mp.make_prodigies()
101 |     mp.start_prodigies()
102 |     while True:
103 |         sleep(30 * 60)
104 |         print("Restarting Prodigy...")
105 |         mp.kill_prodigies()
106 |         mp.make_prodigies()
107 |         mp.start_prodigies()
108 | 
109 |     # if datetime.datetime.now() > mp.retrain_time:
110 |     #     print("Retraining model and scheduling next retraining for tomorrow")
111 |     #     mp.make_retrain_time() # bump to tomorrow
112 |     #     mp.train_and_restart()
113 | 
114 | 
--------------------------------------------------------------------------------
/multiuser_mark.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | from multiprocessing import Process
3 | from time import sleep
4 | from prodigy.recipes.ner import batch_train
5 | import atexit
6 | from pathlib import Path
7 | import datetime as dt
8 | 
9 | from prodigy.components import printers
10 | from prodigy.components.loaders import get_stream
11 | from prodigy.core import recipe, recipe_args
12 | from prodigy.util import TASK_HASH_ATTR, log
13 | from datetime import datetime
14 | from collections import Counter
15 | 
16 | # It's all going to be run by coder name.
17 | 
18 | # Config:
19 | # - add list of coders
20 | # - ?? add port per coder?
21 | # - base file name for files
22 | # - recipe, db, model, output
23 | 
24 | @prodigy.recipe('mark_custom',
25 |                 dataset=recipe_args['dataset'],
26 |                 source=recipe_args['source'],
27 |                 api=recipe_args['api'],
28 |                 loader=recipe_args['loader'],
29 |                 label=recipe_args['label'],
30 |                 view_id=recipe_args['view'],
31 |                 memorize=recipe_args['memorize'],
32 |                 exclude=recipe_args['exclude'])
33 | def mark_custom(dataset, source=None, view_id=None, label='', api=None,
34 |                 loader=None, memorize=False, exclude=None):
35 |     """
36 |     Click through pre-prepared examples, with no model in the loop.
37 |     """
38 |     log('RECIPE: Starting recipe mark', locals())
39 |     stream = list(get_stream(source, api, loader))
40 | 
41 |     counts = Counter()
42 |     memory = {}
43 | 
44 |     def fill_memory(ctrl):
45 |         if memorize:
46 |             examples = ctrl.db.get_dataset(dataset)
47 |             log("RECIPE: Add {} examples from dataset '{}' to memory"
48 |                 .format(len(examples), dataset))
49 |             for eg in examples:
50 |                 memory[eg[TASK_HASH_ATTR]] = eg['answer']
51 | 
52 |     def ask_questions(stream):
53 |         for eg in stream:
54 |             eg['time_loaded'] = datetime.now().isoformat()
55 |             if TASK_HASH_ATTR in eg and eg[TASK_HASH_ATTR] in memory:
56 |                 answer = memory[eg[TASK_HASH_ATTR]]
57 |                 counts[answer] += 1
58 |             else:
59 |                 if label:
60 |                     eg['label'] = label
61 |                 yield eg
62 | 
63 |     def recv_answers(answers):
64 |         for eg in answers:
65 |             counts[eg['answer']] += 1
66 |             memory[eg[TASK_HASH_ATTR]] = eg['answer']
67 |             eg['time_returned'] = datetime.now().isoformat()
68 | 
69 |     def print_results(ctrl):
70 |         print(printers.answers(counts))
71 | 
72 |     def get_progress(session=0, total=0, loss=0):
73 |         progress = sum(counts.values()) / len(stream)  # answered out of total tasks
74 |         return progress
75 | 
76 |     return {
77 |         'view_id': view_id,
78 |         'dataset': dataset,
79 |         'stream': ask_questions(stream),
80 |         'exclude': exclude,
81 |         'update': recv_answers,
82 |         'on_load': fill_memory,
83 |         'on_exit': print_results,
84 |         'config': {'label': label}
85 |     }
86 | 
87 | class MultiProdigy:
88 |     def __init__(self,
89 |                  coder_list = [{"name" : "Daniel", "port" : 9010},
90 |                                {"name" : "Youseff", "port" : 9011},
91 |                                {"name" : "Emad", "port" : 9012},
92 |                                {"name" : "Rafeef", "port" : 9013},
93 |                                {"name" : "Mahmoud", "port" : 9014},
94 |                                {"name" : "Zach", "port" : 9015},
95 |                                {"name" : "Collin", "port" : 9016},
96 |                                ]):
97 |         self.coder_list = coder_list
98 |         self.processes = []
99 | 
100 |     def serve(self, coder, port):
101 |         print(coder)
102 |         base = "data/protest_for_classification_"
103 |         filename = "{0}{1}.jsonl".format(base, coder)
104 |         prodigy.serve('mark_custom', # recipe
105 |                       "gsr_is_protest", # db
106 |                       filename, # input file
107 |                       "classification", # view ID
108 |                       "PROTEST",
109 |                       None, # api
110 |                       None, # loader
111 |                       True, # memorize
112 |                       "gsr_is_protest", # exclude
113 |                       port=port) # port
114 | 
115 |     def make_prodigies(self):
116 |         for coder_info in self.coder_list:
117 |             # each entry has the coder's name and assigned port
118 |             thread = Process(target=self.serve, args = (coder_info['name'], coder_info['port']))
119 |             self.processes.append(thread)
120 | 
121 |     def start_prodigies(self):
122 |         print("Starting Prodigy processes...")
123 |         for p in self.processes:
124 |             p.start()
125 |             sleep(1)
126 | 
127 |     def kill_prodigies(self):
128 |         print("Killing Prodigy threads")
129 |         for i in self.processes:
130 |             try:
131 |                 i.terminate()
132 |             except AttributeError:
133 |                 print("Process {0} doesn't exist?".format(i))
134 |         self.processes = []
135 | 
136 | 
137 | if __name__ == "__main__":
138 |     mp = MultiProdigy()
139 |     #mp.make_retrain_time()
140 |     atexit.register(mp.kill_prodigies)
141 |     mp.make_prodigies()
142 |     mp.start_prodigies()
143 |     while True:
144 |         sleep(5)
145 |     # if dt.datetime.now() > mp.retrain_time:
146 |     #     print("Retraining model and scheduling next retraining for tomorrow")
147 |     #     mp.make_retrain_time() # bump to tomorrow
148 |     #     mp.train_and_restart()
149 | 
150 | 
--------------------------------------------------------------------------------
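The next two files replace the JSONL-file streams above with a Mongo-backed `DBStream`. Reduced to a standalone sketch, the core allocation query looks like this (the collection name is illustrative; `seen` and `coders` are the fields initialized by `mongo_load.py`):

```
from pymongo import MongoClient

coll = MongoClient('mongodb://localhost:27017/')['gsr']['my_tasks']

def tasks_for(coder, n=200):
    # Serve examples this coder hasn't annotated yet, until each
    # example has been seen by three coders.
    return coll.find({"seen": {"$lt": 3},
                      "coders": {"$nin": [coder]}}).limit(n)
```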
/multiuser_db_assault.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | from multiprocessing import Process
3 | from time import sleep
4 | import atexit
5 | from pathlib import Path
6 | 
7 | from prodigy.components import printers
8 | from prodigy.components.loaders import get_stream
9 | from prodigy.core import recipe, recipe_args
10 | from prodigy.util import TASK_HASH_ATTR, log
11 | from datetime import datetime
12 | from pymongo import MongoClient
13 | from bson.json_util import dumps
14 | from bson.objectid import ObjectId
15 | from random import shuffle
16 | from collections import Counter
17 | # Config:
18 | # - add list of coders
19 | # - ?? add port per coder?
20 | # - base file name for files
21 | # - recipe, db, model, output
22 | 
23 | class DBStream:
24 |     def __init__(self, active_coder, collection_name = "silver_assault"):
25 |         self.active_coder = active_coder
26 |         self.coll = setup_mongo(collection_name)
27 |         print("Total tasks in collection: ", self.coll.count())
28 | 
29 |     def get_examples(self):
30 |         print("get_examples called")
31 |         examples = self.coll.find({"$and" : [
32 |             {"seen" : {"$in" : [0,1,2,3]}},
33 |             {"coders" : {"$nin" : [self.active_coder]}}]}).limit(200)
34 |         examples = list(examples)
35 |         print("inside get_examples, this many examples:", len(examples))
36 |         for i in examples:
37 |             i['_id'] = str(i['_id'])
38 |         shuffle(examples)
39 |         self.examples = iter(examples)
40 |         ## !! Need to prioritize examples with 2 or 1 views.
41 | 
42 | def setup_mongo(collection_name, db_name = "gsr"):
43 |     client = MongoClient('mongodb://localhost:27017/')
44 |     db = client[db_name]
45 |     coll = db[collection_name]
46 |     return coll
47 | 
48 | @prodigy.recipe('mark_custom',
49 |                 dataset=recipe_args['dataset'],
50 |                 source=recipe_args['source'],
51 |                 api=recipe_args['api'],
52 |                 loader=recipe_args['loader'],
53 |                 label=recipe_args['label'],
54 |                 view_id=recipe_args['view'],
55 |                 memorize=recipe_args['memorize'],
56 |                 exclude=recipe_args['exclude'])
57 | def mark_custom(dataset, source=None, view_id=None, label='', api=None,
58 |                 loader=None, memorize=False, exclude=None):
59 |     """
60 |     Click through pre-prepared examples, with no model in the loop.
61 |     """
62 |     counts = Counter()  # tally of answers for the exit report
63 |     log('RECIPE: Starting recipe mark', locals())
64 |     coder = source # repurposing input slot
65 |     stream_empty = iter([])
66 |     stream = DBStream(coder, "silver_assault")
67 |     stream.get_examples()
68 |     #print("Initial number of examples in queue:", len(list(stream.examples)))
69 |     #print("Initial examples in queue:", list(stream.examples))
70 | 
71 |     def ask_questions(stream_empty):
72 |         #print("Hitting 'ask_question', with ", len(list(stream.examples)), " in the queue")
73 |         #print(list(stream.examples))
74 |         #print(stream.reced)
75 |         for eg in stream.examples:
76 |             #stream.get_examples()
77 |             eg['time_loaded'] = datetime.now().isoformat()
78 |             # not serializable
79 |             eg['_id'] = str(eg['_id'])
80 |             yield eg
81 | 
82 | 
83 |     #### Problem with the post-answer update.
84 |     ## Not refreshing
85 | 
86 |     def recv_answers(answers):
87 |         for eg in answers:
88 |             print("Answer back: ")#, eg)
89 |             # Get the example from the DB again in case it's changed
90 |             updated_ex = list(stream.coll.find({'_id': ObjectId(eg['_id'])}))
91 |             try:
92 |                 curr_cod = updated_ex[0]['coders']
93 |             except (IndexError, KeyError):
94 |                 curr_cod = []
95 |             # add current coder to the list
96 |             curr_cod.append(coder)
97 |             stream.coll.update_one({"_id": ObjectId(eg['_id'])}, # convert back
98 |                                    {"$set": {"coders": curr_cod,
99 |                                              "seen" : len(curr_cod)}})
100 |             eg['time_returned'] = datetime.now().isoformat()
101 |             eg['active_coder'] = coder
102 |             eg['coders'] = curr_cod
103 |             counts[eg['answer']] += 1
104 |             #stream.get_examples()
105 | 
106 |     def print_results(ctrl):
107 |         print(printers.answers(counts))
108 | 
109 |     def get_progress(session=0, total=0, loss=0):
110 |         done = stream.coll.count({"$or" : [
111 |             {"coders" : coder},
112 |             {"seen" : {"$gte": 3}}]})
113 |         total = stream.coll.count()
114 |         progress = done / total
115 |         return progress
116 | 
117 |     return {
118 |         'view_id': view_id,
119 |         'dataset': dataset,
120 |         'stream': ask_questions(stream_empty),
121 |         'exclude': exclude,
122 |         'progress' : get_progress,
123 |         'update': recv_answers,
124 |         'on_exit': print_results
125 |         #'config': {'label': label}
126 |     }
127 | 
128 | class MultiProdigy:
129 |     def __init__(self,
130 |                  coder_list = [#{"name" : "Daniel", "port" : 9010},
131 |                                #{"name" : "Youseff", "port" : 9011},
132 |                                #{"name" : "Emad", "port" : 9012},
133 |                                {"name" : "Khaled", "port" : 9013}
134 |                                #{"name" : "Mahmoud", "port" : 9014},
135 |                                #{"name" : "Zach", "port" : 9015},
136 |                                #{"name" : "Collin", "port" : 9016},
137 |                                ]):
138 |         self.coder_list = coder_list
139 |         self.processes = []
140 | 
141 |     def serve(self, coder, port):
142 |         print(coder)
143 |         #filename = "{0}{1}.jsonl".format(base, coder)
144 |         prodigy.serve('mark_custom', # recipe
145 |                       "silver_assault", # db
146 |                       coder, # input file, repurposed for coder
147 |                       "classification", # view ID
148 |                       "ASSAULT",
149 |                       None, # api
150 |                       None, # loader
151 |                       True, # memorize
152 |                       None, # exclude
153 |                       port=port) # port
154 | 
155 |     def make_prodigies(self):
156 |         for coder_info in self.coder_list:
157 |             # each entry has the coder's name and assigned port
158 |             thread = Process(target=self.serve, args = (coder_info['name'], coder_info['port']))
159 |             self.processes.append(thread)
160 | 
161 |     def start_prodigies(self):
162 |         print("Starting Prodigy processes...")
163 |         for p in self.processes:
164 |             p.start()
165 |             sleep(1)
166 | 
167 |     def kill_prodigies(self):
168 |         print("Killing Prodigy threads")
169 |         for i in self.processes:
170 |             try:
171 |                 i.terminate()
172 |             except AttributeError:
173 |                 print("Process {0} doesn't exist?".format(i))
174 |         self.processes = []
175 | 
176 | 
177 | if __name__ == "__main__":
178 |     mp = MultiProdigy()
179 |     atexit.register(mp.kill_prodigies)
180 |     mp.make_prodigies()
181 |     mp.start_prodigies()
182 |     while True:
183 |         sleep(60 * 60)
184 |         print("Restarting Prodigy...")
185 |         mp.kill_prodigies()
186 |         mp.make_prodigies()
187 |         mp.start_prodigies()
188 | 
189 |     # if datetime.datetime.now() > mp.retrain_time:
190 |     #     print("Retraining model and scheduling next retraining for tomorrow")
191 |     #     mp.make_retrain_time() # bump to tomorrow
192 |     #     mp.train_and_restart()
193 | 
194 | 
--------------------------------------------------------------------------------
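The `## !! Need to prioritize examples with 2 or 1 views` note above is addressed in `multiuser_manual_db.py` below by sorting on the `seen` counter before limiting; as a minimal sketch of the idea:

```
# Serve partially-annotated examples first so they reach three coders sooner.
examples = coll.find({"seen": {"$in": [0, 1, 2, 3]},
                      "coders": {"$nin": [coder]}}) \
               .sort("seen", pymongo.DESCENDING) \
               .limit(200)
```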
/multiuser_manual_db.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | from multiprocessing import Process
3 | from time import sleep
4 | import atexit
5 | from pathlib import Path
6 | 
7 | from prodigy.components import printers
8 | from prodigy.components.loaders import get_stream
9 | from prodigy.core import recipe, recipe_args
10 | from prodigy.util import TASK_HASH_ATTR, log
11 | from prodigy.components.preprocess import add_tokens
12 | from datetime import datetime
13 | from pymongo import MongoClient
14 | import pymongo
15 | from bson.json_util import dumps
16 | from bson.objectid import ObjectId
17 | from random import shuffle
18 | 
19 | # Config:
20 | # - add list of coders
21 | # - ?? add port per coder?
22 | # - base file name for files
23 | # - recipe, db, model, output
24 | 
25 | import spacy
26 | nlp = spacy.blank("en")
27 | 
28 | def setup_mongo(collection_name, db_name = "gsr"):
29 |     client = MongoClient('mongodb://localhost:27017/')
30 |     db = client[db_name]
31 |     coll = db[collection_name]
32 |     return coll
33 | 
34 | class DBStream:
35 |     """Certain parameters are hard coded, for instance the number of
36 |     annotators to see each example and the Mongo DB name."""
37 |     def __init__(self, active_coder, collection_name):
38 |         self.active_coder = active_coder
39 |         self.coll = setup_mongo(collection_name)
40 |         print("Total tasks in collection: ", self.coll.count())
41 | 
42 |     def get_examples(self):
43 |         print("get_examples called")
44 |         examples = self.coll.find({"$and" : [
45 |             {"seen" : {"$in" : [0,1,2,3]}},
46 |             {"coders" : {"$nin" : [self.active_coder]}}]}).sort("seen", pymongo.DESCENDING).limit(200)
47 |         examples = list(examples)
48 |         print("inside get_examples, this many examples:", len(examples))
49 |         for i in examples:
50 |             if '_id' in i.keys():
51 |                 i['_id'] = str(i['_id'])
52 |         shuffle(examples) # this, of course, obviates the sorting a few lines above...
53 |         self.examples = iter(examples)
54 | 
55 | 
56 | # This decorator has ideas about what keyword arguments to take in.
57 | # Repurpose some of these to convey other information, which is a bit
58 | # ugly.
59 | @prodigy.recipe('manual_custom',
60 |                 dataset=recipe_args['dataset'],
61 |                 source=recipe_args['source'], # use this slot for the coder name
62 |                 api=recipe_args['api'], # use this one for the collection name
63 |                 loader=recipe_args['loader'],
64 |                 label=recipe_args['label'],
65 |                 view_id=recipe_args['view'],
66 |                 memorize=recipe_args['memorize'],
67 |                 exclude=recipe_args['exclude'])
68 | def manual_custom(dataset, source=None, view_id=None, label='', api=None,
69 |                   loader=None, memorize=False, exclude=None):
70 |     """
71 |     Click through pre-prepared examples, with no model in the loop.
72 |     """
73 | 
74 |     log('RECIPE: Starting recipe mark', locals())
75 |     coder = source # repurposing input slot
76 |     stream_empty = iter([])
77 |     stream = DBStream(coder, api) # using the api slot for collection name
78 |     stream.get_examples()
79 | 
80 |     def ask_questions(stream):
81 |         for eg in stream.examples:
82 |             eg['time_loaded'] = datetime.now().isoformat()
83 |             eg['mongo_collection'] = api # record where it came from
84 |             # not serializable
85 |             eg['_id'] = str(eg['_id'])
86 |             # add tokens. add_tokens expects a list...
87 |             ts = add_tokens(nlp, [eg])
88 |             #...and returns a generator
89 |             eg = next(ts)
90 |             yield eg
91 | 
92 | 
93 |     def recv_answers(answers):
94 |         for eg in answers:
95 |             # Retrieve the example from the DB again to get most up-to-date
96 |             # list of coders
97 |             updated_ex = list(stream.coll.find({'_id': ObjectId(eg['_id'])}))
98 |             try:
99 |                 curr_cod = updated_ex[0]['coders']
100 |             except (IndexError, KeyError):
101 |                 curr_cod = []
102 |             # add current coder to the list
103 |             curr_cod.append(coder)
104 |             stream.coll.update_one({"_id": ObjectId(eg['_id'])}, # convert back
105 |                                    {"$set": {"coders": curr_cod,
106 |                                              "seen" : len(curr_cod)}})
107 |             eg['time_returned'] = datetime.now().isoformat() # record submission time
108 |             eg['active_coder'] = coder
109 |             eg['coders'] = curr_cod
110 | 
111 | 
112 |     def get_progress(session=0, total=0, loss=0):
113 |         return None
114 |         #done = stream.coll.count({"$or" : [
115 |         #    {"coders" : coder},
116 |         #    {"seen" : {"$gte": 3}}]})
117 |         #total = stream.coll.count()
118 |         #progress = done / total
119 |         #return progress
120 | 
121 |     return {
122 |         'view_id': view_id,
123 |         'dataset': dataset,
124 |         'stream': ask_questions(stream),
125 |         'exclude': exclude,
126 |         'progress' : get_progress,
127 |         'update': recv_answers,
128 |     }
129 | 
130 | 
131 | 
132 | class MultiProdigy:
133 |     """These are functions that remain the same regardless of the view ID."""
134 |     def __init__(self, coder_list, collection, dataset, view_id = None, label = None):
135 |         self.coder_list = coder_list
136 |         self.collection = collection
137 |         self.dataset = dataset
138 |         self.processes = []
139 |         self.view_id = view_id
140 |         self.label = label
141 | 
142 |     def start_prodigies(self):
143 |         print("Starting Prodigy processes...")
144 |         for p in self.processes:
145 |             p.start()
146 |             sleep(1)
147 | 
148 |     def kill_prodigies(self):
149 |         # Make sure all processes are killed on close
150 |         print("Killing Prodigy threads")
151 |         for i in self.processes:
152 |             try:
153 |                 i.terminate()
154 |             except AttributeError:
155 |                 print("Process {0} doesn't exist?".format(i))
156 |         self.processes = []
157 | 
158 | 
159 | class MultiProdigyManual(MultiProdigy):
160 |     # Only serve() and make_prodigies() are specific to this interface.
161 |     def serve(self, coder, port):
162 |         print(coder)
163 |         prodigy.serve('manual_custom', # recipe
164 |                       self.dataset, # dataset to save it in
165 |                       coder, # input file, repurposed for coder
166 |                       "ner_manual", # view ID
167 |                       self.label,
168 |                       self.collection, # api, repurposed to be collection
169 |                       None, # loader
170 |                       True, # memorize
171 |                       None, # exclude
172 |                       port=port) # port
173 | 
174 |     def make_prodigies(self):
175 |         for coder_info in self.coder_list:
176 |             # each entry has the coder's name and assigned port
177 |             thread = Process(target=self.serve, args =
178 |                              (coder_info['name'],
179 |                               coder_info['port']))
180 |             self.processes.append(thread)
181 | 
182 | 
183 | 
184 | if __name__ == "__main__":
185 |     mp = MultiProdigyManual(
186 |         dataset = "apsa_tmp",
187 |         coder_list = [{"name": "Andy", "port" : 9011}],
188 |         collection = "silver_assault")
189 |     atexit.register(mp.kill_prodigies)
190 |     mp.make_prodigies()
191 |     mp.start_prodigies()
192 |     while True:
193 |         sleep(60 * 60)
194 |         print("Restarting Prodigy...")
195 |         mp.kill_prodigies()
196 |         mp.make_prodigies()
197 |         mp.start_prodigies()
198 | 
199 | 
--------------------------------------------------------------------------------
/multiuser_db_blocks.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | import spacy
3 | from multiprocessing import Process
4 | from time import sleep
5 | import atexit
6 | from pathlib import Path
7 | 
8 | from prodigy.components import printers
9 | from prodigy.components.loaders import get_stream
10 | from prodigy.core import recipe, recipe_args
11 | from prodigy.util import TASK_HASH_ATTR, log
12 | from datetime import datetime
13 | from pymongo import MongoClient
14 | import pymongo
15 | from bson.json_util import dumps
16 | from bson.objectid import ObjectId
17 | from random import shuffle
18 | 
19 | 
20 | class DBStream:
21 |     def __init__(self, active_coder, collection_name):
22 |         print("Using collection: {}".format(collection_name))
23 |         self.active_coder = active_coder
24 |         self.coll = setup_mongo(collection_name)
25 |         print("Total tasks in collection: ", self.coll.count_documents({}))
26 | 
27 |     def get_examples(self):
28 |         print("get_examples called")
29 |         examples = self.coll.find({"$and" : [
30 |             {"assigned_annotators" : {"$in" : [self.active_coder]}}, # check if the task is assigned to the current coder...
31 |             {"coders" : {"$nin" : [self.active_coder]}}]}).sort("sent_id", pymongo.ASCENDING) # ...but the current coder hasn't seen it yet
32 |         examples = list(examples)
33 |         print("inside get_examples, this many examples:", len(examples))
34 |         for i in examples:
35 |             i['_id'] = str(i['_id']) # this gets created by mongo
36 |             i['_task_hash'] = hash(str(i['_id']) + str(self.active_coder)) # NB: builtin hash() is salted per process,
37 |             i['_input_hash'] = hash(str(i['_id']) + str(self.active_coder)) # so these are only stable within one serve session
38 |         self.examples = iter(examples)
39 |         ## !! Need to prioritize examples with 2 or 1 views.
40 | 
41 | def setup_mongo(collection_name, db_name = "gsr"):
42 |     client = MongoClient('mongodb://localhost:27017/')
43 |     db = client[db_name]
44 |     coll = db[collection_name]
45 |     return coll
46 | 
47 | @prodigy.recipe('toi_blocks')
48 | def toi_blocks(dataset, source=None, collection="prod_dec_2020_2"):
49 |     log('RECIPE: Starting recipe mark', locals())
50 |     coder = source # repurposing input slot
51 |     print("Coder from within toi_blocks:", coder)
52 |     stream_empty = iter([])
53 |     stream = DBStream(coder, collection)
54 |     stream.get_examples()
55 |     #print("Initial number of examples in queue:", len(list(stream.examples)))
56 |     #print("Initial examples in queue:", list(stream.examples))
57 | 
58 |     def ask_questions(stream_empty):
59 |         #print("Hitting 'ask_question', with ", len(list(stream.examples)), " in the queue")
60 |         #print(list(stream.examples))
61 |         #print(stream.reced)
62 |         for eg in stream.examples:
63 |             #stream.get_examples()
64 |             eg['time_loaded'] = datetime.now().isoformat()
65 |             eg['active_coder'] = coder
66 |             # not serializable
67 |             eg['_id'] = str(eg['_id'])
68 |             yield eg
69 | 
70 | 
71 |     #### Problem with the post-answer update.
72 |     ## Not refreshing
73 | 
74 |     def recv_answers(answers):
75 |         for eg in answers:
76 |             print("Answer back: ", coder, datetime.now().isoformat())#, eg)
77 |             # Get the example from the DB again in case it's changed
78 |             updated_ex = list(stream.coll.find({'_id': ObjectId(eg['_id'])}))
79 |             try:
80 |                 curr_cod = updated_ex[0]['coders']
81 |             except (IndexError, KeyError):
82 |                 curr_cod = []
83 |             # add current coder to the list
84 |             curr_cod.append(coder)
85 |             stream.coll.update_one({"_id": ObjectId(eg['_id'])}, # convert back
86 |                                    {"$set": {"coders": curr_cod,
87 |                                              "seen" : len(curr_cod),
88 |                                              'time_returned': datetime.now().isoformat(),
89 |                                              'time_loaded': eg['time_loaded'],
90 |                                              'active_coder': coder
91 |                                              }})
92 |             eg['time_returned'] = datetime.now().isoformat()
93 |             eg['seen'] = len(curr_cod)
94 | 
95 |     def print_results(ctrl):
96 |         print("Annotation session ended for coder", coder)
97 | 
98 |     def get_progress(*args, **kwargs):
99 |         done = stream.coll.count_documents({"coders" : coder})
100 |         total = stream.coll.count_documents({})
101 |         return done / total
102 | 
103 |     # We can use the blocks to override certain config and content, and set
104 |     # "text": None for the choice interface so it doesn't also render the text
105 |     blocks = [
106 |         {"view_id": "choice", "text": None},
107 |         {"view_id": "text_input", "field_rows": 3, "field_label": "If you found this example difficult or ambiguous please explain why."}
108 |     ]
109 | 
110 |     return {
111 |         "dataset": dataset, # the dataset to save annotations to
112 |         "view_id": "blocks", # set the view_id to "blocks"
113 |         "stream": ask_questions(stream_empty), # the stream of incoming examples
114 |         "config": {
115 |             "labels": ["RELEVANT"], # the labels for the manual NER interface
116 |             "blocks": blocks, # add the blocks to the config
117 |         },
118 |         'update': recv_answers,
119 |         #"custom_theme": {"cardMaxWidth": "90%"},
120 |     }
121 | 
122 |     # return {
123 |     #     'view_id': view_id,
124 |     #     'dataset': dataset,
125 |     #     'stream': ask_questions(stream_empty),
126 |     #     'exclude': exclude,
127 |     #     "flag": True,
128 |     #     "custom_theme": {"cardMaxWidth": "90%"},
129 |     #     'progress' : get_progress,
130 |     #     'on_exit': print_results
131 |     # }
132 | 
133 | class MultiProdigy:
134 |     def __init__(self,
135 |                  coder_list = [#{"name" : "Andy", "port" : 9010},
136 |                                # {"name" : "Katie", "port" : 9011},
137 |                                # {"name" : "Sheikh", "port" : 9012},
138 |                                #{"name" : 9014, "port" : 9014},
139 |                                {"name" : 9015, "port" : 9015},
140 |                                {"name" : 9016, "port" : 9016},
141 |                                {"name" : 9017, "port" : 9017},
142 |                                #{"name" : 9018, "port" : 9018},
143 |                                {"name" : 9019, "port" : 9019},
144 |                                {"name" : 9020, "port" : 9020},
145 |                                #{"name" : 9021, "port" : 9021},
146 |                                {"name" : 9022, "port" : 9022},
147 |                                {"name" : 9023, "port" : 9023},
148 |                                #{"name" : 9024, "port" : 9024},
149 |                                {"name" : 9025, "port" : 9025},
150 |                                ]):
151 |         self.coder_list = coder_list
152 |         self.processes = []
153 | 
154 |     def serve(self, coder, port):
155 |         print(coder)
156 |         #filename = "{0}{1}.jsonl".format(base, coder)
157 |         prodigy.serve('toi_blocks', # recipe
158 |                       "prod_dec_2020_2", # collection
159 |                       coder, # input file, repurposed for coder
160 |                       port=port) # port
161 | 
162 |     def make_prodigies(self):
163 |         for coder_info in self.coder_list:
164 |             # each entry has the coder's name and assigned port
165 |             thread = Process(target=self.serve, args = (coder_info['name'], coder_info['port']))
166 |             self.processes.append(thread)
167 | 
168 |     def start_prodigies(self):
169 |         print("Starting Prodigy processes...")
170 |         for p in self.processes:
171 |             p.start()
172 |             sleep(1)
173 | 
174 |     def kill_prodigies(self):
175 |         print("Killing Prodigy threads")
176 |         for i in self.processes:
177 |             try:
178 |                 i.terminate()
179 |             except AttributeError:
180 |                 print("Process {0} doesn't exist?".format(i))
181 |         self.processes = []
182 | 
183 | 
184 | if __name__ == "__main__":
185 |     mp = MultiProdigy()
186 |     atexit.register(mp.kill_prodigies)
187 |     mp.make_prodigies()
188 |     mp.start_prodigies()
189 |     while True:
190 |         sleep(60 * 60)
191 |         print("Restarting Prodigy...")
192 |         mp.kill_prodigies()
193 |         mp.make_prodigies()
194 |         mp.start_prodigies()
195 | 
196 | 
197 | 
--------------------------------------------------------------------------------
/Report.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Gold Standard Record Coding Report"
3 | date: "`r paste0(format(Sys.time(), '%d %B %Y, %H:%M:%S', tz='America/New_York',usetz=TRUE), ' (East Coast)')`"
4 | output:
5 |   html_document:
6 |     self_contained: true
7 | ---
8 | 
9 | ```{r setup, include=FALSE}
10 | library(knitr)
11 | opts_chunk$set(fig.width = 5)
12 | opts_chunk$set(fig.height= 4)
13 | opts_chunk$set(echo = TRUE)
14 | opts_chunk$set(warning = FALSE)
15 | opts_chunk$set(message = FALSE)
16 | opts_chunk$set(echo = FALSE)
17 | ```
18 | 
19 | ```{r message = FALSE, echo = FALSE, results = "hide"}
20 | library(ggplot2)
21 | library(RColorBrewer)
22 | library(scales)
23 | library(grid)
24 | library(extrafont)
25 | 
26 | theme_mcm <- function() {
27 |   # Generate the colors for the chart procedurally with RColorBrewer
28 |   palette <- brewer.pal("Greys", n=9)
29 |   #color.background = "#fcfaea"
30 |   #color.background = "#ffffff"
31 |   color.background = "white" #palette[2]
32 |   color.grid.major = palette[3]
33 |   #color.grid.major = palette[3]
34 |   color.axis.text = palette[6]
35 |   color.axis.title = palette[7]
36 |   color.title = palette[8]
37 | 
38 |   # Begin construction of chart
39 |   theme_bw(base_size=12) +
40 | 
41 |     # Set the entire chart region to a light gray color
42 |     theme(panel.background=element_rect(fill=color.background, color=color.background)) +
43 |     theme(plot.background=element_rect(fill=color.background, color=color.background)) +
44 |     theme(panel.border=element_rect(color=color.background)) +
45 | 
46 |     # Format the grid
47 |     theme(panel.grid.major=element_line(color=color.grid.major,size=.15)) +
48 |     theme(panel.grid.minor=element_line(color=color.grid.major,size=.07)) +
49 |     theme(panel.grid.minor.y=element_blank()) +
50 |     theme(axis.ticks=element_blank()) +
51 | 
52 |     # Format the legend
53 |     theme(legend.background = element_rect(fill=color.background)) +
54 |     theme(legend.key = element_rect(fill=color.background)) +
55 |     theme(legend.text = element_text(size=9,color=color.axis.title)) +
56 |     theme(legend.title = element_text(color=color.axis.title)) +
57 | 
58 |     # Set title and axis labels, and format these and tick marks
59 |     theme(plot.title=element_text(color=color.title, size=12, vjust=1.25)) +
60 |     theme(plot.subtitle=element_text(color="#353535", size=11)) + #, vjust=1.25
61 |     theme(plot.caption=element_text(color=color.axis.title)) +
62 |     theme(axis.text.x=element_text(size=9,color=color.axis.text)) +
63 |     theme(axis.text.y=element_text(size=9,color=color.axis.text)) +
64 |     theme(axis.title.x=element_text(size=10,color=color.axis.title, vjust=0)) +
65 |     theme(axis.title.y=element_text(size=10,color=color.axis.title, vjust=1.25)) +
66 |     theme(plot.caption=element_text(size=7,color=palette[4], vjust = 6)) +
67 | 
68 |     # Facets
69 |     theme(strip.background = element_rect(colour = color.background,
70 |                                           fill = palette[3],
71 |                                           size = 0.5)) +
72 |     #theme(strip.text.y = element_text(vjust= -1.75)) +
73 | 
74 |     # Fonts
75 |     theme(text=element_text(family="Tw Cen MT", margin=margin(b=15))) +
76 |     theme(plot.subtitle=element_text(family="Tw Cen MT")) +
77 | 
78 |     # scrunch the titles down closer
79 |     theme(plot.title = element_text(margin = margin(1.5))) +
80 |     theme(plot.subtitle = element_text(margin = margin(1.5))) +
81 | 
82 |     #...and move the legend top-right
83 |     #theme(legend.margin=margin(-20)) +
84 |     #theme(legend.justification = "right") +
85 | 
86 |     # Plot margins
87 |     theme(plot.margin = unit(c(0.35, 0.2, 0.3, 0.35), "cm"))
88 | }
89 | 
90 | hex <- c("#cfa81f", "#847a8e", "#ad4738", "#7c812d", "#008f7d",
91 |          "#6d472f", "#10355e", "#fc9c8e", "#99b8c6", "#ca5f33")
92 | color <- c("yellow", "purple", "red", "green", "teal",
93 |            "brown", "blue", "pink", "sky", "orange")
94 | #ggthemr::ggthemr('dust')
95 | #ggthemr::ggthemr('pale')
96 | theme_set(theme_mcm())
97 | ```
98 | 
99 | ```{r}
100 | library(dplyr)
101 | #library(plotly)
102 | ```
103 | 
104 | 
105 | ```{r}
106 | df <- read.csv("coding_summary.csv")
107 | df$date <- as.Date(df$date)
108 | ```
109 | 
110 | ```{r}
111 | gold <- df %>%
112 |   group_by(id) %>%
113 |   summarize(accept = sum(answer == "accept"),
114 |             count = n()) %>%
115 |   filter(count >= 2, accept == count) %>%
116 |   nrow()
117 | ```
118 | 
119 | ```{r}
120 | gold_reject <- df %>%
121 |   group_by(id) %>%
122 |   summarize(good = sum(answer == "reject") == 3) %>%
123 |   filter(good == TRUE) %>%
124 |   nrow()
125 | ```
126 | 
127 | ```{r}
128 | silver <- df %>%
129 |   group_by(id) %>%
130 |   summarize(good = sum(answer == "accept") >= 2) %>%
131 |   filter(good == TRUE) %>%
132 |   nrow()
133 | ```
134 | 
135 | 
136 | ## Overview
137 | 
138 | We've collected `r nrow(df)` total labels.
139 | There are currently `r gold` gold-standard accepts (every annotator accepted), `r silver` silver-standard accepts (at least two accepts), and `r gold_reject` examples with three rejects.
140 | 
141 | 
142 | ## Statistics
143 | 
144 | ```{r}
145 | p <- df %>%
146 |   group_by(coder) %>%
147 |   summarize(count = n()) %>%
148 |   ggplot(., aes(x = coder, y = count)) +
149 |   geom_bar(stat = "identity", width = 0.4) +
150 |   labs(title = "Sentences coded over all time")
151 | p
152 | ```
153 | 
154 | 
155 | ```{r}
156 | library(lubridate)
157 | week <- today() - dweeks(1)
158 | 
159 | 
160 | p <- df %>%
161 |   filter(as.Date(date) >= week) %>%
162 |   group_by(coder) %>%
163 |   summarize(count = n()) %>%
164 |   ggplot(., aes(x = coder, y = count)) +
165 |   geom_bar(stat = "identity", width = 0.4) +
166 |   labs(title = paste0("Sentences coded for the week of ", today() - dweeks(1), " to ", today()))
167 | p
168 | ```
169 | 
170 | ```{r}
171 | p <- df %>%
172 |   group_by(coder, date) %>%
173 |   summarize(count = n()) %>%
174 |   ggplot(., aes(x = date, y = coder,
175 |                 alpha = count)) +
176 |   geom_tile() +
177 |   labs(title = "Daily contributions per coder",
178 |        x = "Date") +
179 |   scale_x_date(date_minor_breaks = "1 day")
180 | p
181 | ```
182 | 
183 | 
184 | ```{r}
185 | library(ggridges)
186 | 
187 | coder_means <- df %>%
188 |   filter(diff < 300) %>%
189 |   group_by(coder) %>%
190 |   summarize(avg_time = median(diff))
191 | 
192 | p <- df %>%
193 |   filter(diff < 300) %>%
194 |   ggplot(., aes(x = diff, y = coder)) +
195 |   #geom_density_ridges() +
196 |   geom_density_ridges(stat = "binline", bins = 100, scale = 0.95) +
197 |   geom_point(data = coder_means, aes(x = avg_time, y = coder)) +
198 |   geom_vline(xintercept = 60, linetype = 2) +
199 |   labs(title = "Distribution of time per coder",
200 |        subtitle = "Dots indicate median time, dashed line is 60 seconds",
201 |        x = "Seconds per task",
202 |        y = NULL)
203 | p
204 | ```
205 | 
206 | Excluding all tasks that take more than five minutes:
207 | 
208 | ```{r results = "asis"}
209 | time_table <- df %>%
210 |   filter(as.Date(date) >= week, diff < 300) %>%
211 |   group_by(coder) %>%
212 |   summarize(total_time = sum(diff) / 60) %>%
213 |   arrange(desc(total_time))
214 | 
215 | kable(time_table, caption = "Minutes of coding time in past 7 days",
216 |       digits = 1)
217 | ```
218 | 
219 | ```{r results = "asis"}
220 | d <- today()
221 | prev_days <- seq(d-13,d,by='day')
222 | sats <- prev_days[weekdays(prev_days)=='Saturday']
223 | 
224 | time_table <- df %>%
225 |   filter(as.Date(date) >= sats[1], as.Date(date) < sats[2], diff < 300) %>%
226 |   group_by(coder) %>%
227 |   summarize(total_time = sum(diff) / 60) %>%
228 |   arrange(desc(total_time))
229 | 
230 | kable(time_table, caption = paste0("Minutes of coding time in previous pay period (",
231 |                                    paste0(c(sats[1], sats[2]-1), collapse = " to "),
232 |                                    ")"),
233 |       digits = 1)
234 | ```
235 | 
236 | ```{r results = "asis"}
237 | time_table <- df %>%
238 |   filter(diff < 300) %>%
239 |   group_by(coder) %>%
240 |   summarize(total_time = sum(diff) / 60) %>%
241 |   arrange(desc(total_time))
242 | 
243 | kable(time_table, caption = "Minutes of coding time (all time)",
244 |       digits = 1)
245 | ```
246 | 
247 | ```{r}
248 | p <- df %>%
249 |   group_by(coder) %>%
250 |   summarize(mean_accept = mean(answer == "accept"), n = n()) %>%
251 |   mutate(var = (mean_accept * (1 - mean_accept) / n)) %>%
252 |   mutate(upper = mean_accept + 1.96*sqrt(var),
253 |          lower = mean_accept - 1.96*sqrt(var)) %>%
254 |   ggplot(., aes(x = coder, y = mean_accept)) +
255 |   geom_bar(stat = "identity", width = 0.4) +
256 |   geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2) +
257 |   labs(title = "Acceptance rate per coder",
258 |        y = "Acceptance Rate",
259 |        x = NULL)
260 | p
261 | ```
262 | 
263 | 
264 | ```{r}
265 | p <- df %>%
266 |   group_by(id) %>%
267 |   summarize(count = n()) %>%
268 |   group_by(count) %>%
269 |   summarize(examples = n()) %>%
270 |   ggplot(., aes(x = as.factor(count), y = examples)) +
271 |   geom_bar(stat = "identity", width = 0.4) +
272 |   labs(title = "Distribution of completed examples",
273 |        subtitle = "Each example is assigned to three coders",
274 |        x = "Annotations per example")
275 | p
276 | ```
277 | 
278 | ```{r}
279 | p <- df %>%
280 |   group_by(id) %>%
281 |   mutate(count = n()) %>%
282 |   filter(count > 1) %>%
283 |   summarize(var = var(answer == "accept")) %>%
284 |   ggplot(., aes(x = var)) +
285 |   geom_histogram(bins = 50) +
286 |   labs(title = "Variance of answers")
287 | p
288 | ```
289 | 
290 | 
291 | 
292 | 
293 | 
294 | 
--------------------------------------------------------------------------------