├── streamlit_dashboard.png
├── mongo_delete.py
├── mongo_load.py
├── LICENSE
├── report_maker.py
├── annotation_streamlit.py
├── custom_ner_manual.py
├── README.md
├── multiuser_ner.py
├── multiuser_db.py
├── multiuser_mark.py
├── multiuser_db_assault.py
├── multiuser_manual_db.py
├── multiuser_db_blocks.py
└── Report.Rmd
/streamlit_dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahalterman/multiuser_prodigy/HEAD/streamlit_dashboard.png
--------------------------------------------------------------------------------
/mongo_delete.py:
--------------------------------------------------------------------------------
2 | from pymongo import MongoClient
3 | import plac
4 | import sys
5 |
6 | @plac.annotations(
7 | coll_name=("Collection to delete", "option", "c", str),
8 | db_name=("Database containing the collection", "option", "d", str))
9 | def delete(coll_name, db_name = "gsr"):
10 | # quick script for updating Mongo tasks
11 | client = MongoClient('mongodb://localhost:27017/')
12 | db = client[db_name]
13 | coll = db[coll_name]
14 |     count = coll.count_documents({})
15 | conf = input("You're about to delete the collection {0}, which has {1} records. Please type this name to confirm: ".format(coll_name, count))
16 |     if conf != coll_name:
17 |         print("Bye!")
18 |         sys.exit(0)
19 |     coll.delete_many({})
20 |     print("Deleted all records from", coll_name)
22 |
23 | if __name__ == "__main__":
24 | plac.call(delete)
25 |
--------------------------------------------------------------------------------
/mongo_load.py:
--------------------------------------------------------------------------------
1 | import jsonlines
2 | from pymongo import MongoClient
3 | import plac
4 |
5 | @plac.annotations(
6 | input_file=("JSONL of tasks to load", "option", "i", str),
7 | coll_name=("Collection to load into", "option", "c", str),
8 | db_name=("Database to load into", "option", "d", str))
9 | def load(input_file, coll_name, db_name = "gsr"):
10 | # quick script for updating Mongo tasks
11 | with jsonlines.open(input_file, "r") as f:
12 | to_load = list(f.iter())
13 | for i in to_load:
14 | i['seen'] = 0
15 | i['coders'] = []
16 | print("Loading into db {0}, collection {1}".format(db_name, coll_name))
17 | client = MongoClient('mongodb://localhost:27017/')
18 | db = client[db_name]
19 | coll = db[coll_name]
20 |     print("Before loading:", coll.count_documents({}))
21 |     coll.insert_many(to_load)
22 |     print("After loading:", coll.count_documents({}))
23 |
24 | if __name__ == "__main__":
25 | plac.call(load)
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Andy Halterman
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/report_maker.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | from prodigy.components.db import connect
3 | from pandas import DataFrame
4 | from datetime import datetime, timedelta
5 | from dateutil import parser
6 | import os
7 | import plac
8 |
9 | @plac.annotations(
10 |     db_name=("Name of the Prodigy dataset to export", "option", "i", str))
11 | def main(db_name):
12 | db = connect()
13 | examples = db.get_dataset(db_name)
14 | print("Total examples: ", len(examples))
15 |
16 | diffs = []
17 | for ex in examples:
18 | if 'time_returned' in ex.keys() and 'time_loaded' in ex.keys():
19 | date = parser.parse(ex['time_returned']).strftime("%Y-%m-%d")
20 | diff = parser.parse(ex['time_returned']) - parser.parse(ex['time_loaded'])
21 | diff = diff.total_seconds()
22 | diffs.append({"date" : date,
23 | "coder" : ex['active_coder'],
24 | "diff" : diff,
25 | "id" : ex['id'][-16:],
26 | 'answer': ex['answer']})
27 |
28 | df = DataFrame(diffs)
29 | df.to_csv("/home/andy/multiuser_prodigy/coding_summary.csv")
30 | os.system("""/usr/bin/Rscript -e 'library(rmarkdown); rmarkdown::render("multiuser_prodigy/Report.Rmd", "html_document")'""")
32 |
33 | if __name__ == "__main__":
34 | plac.call(main)
35 |
--------------------------------------------------------------------------------
/annotation_streamlit.py:
--------------------------------------------------------------------------------
1 |
2 | import streamlit as st
3 | import spacy
4 | import base64
5 | import pandas as pd
6 | import jsonlines
7 |
8 | from pymongo import MongoClient
9 |
10 |
11 | HTML_WRAPPER = """{}"""
12 |
13 | @st.cache(allow_output_mutation=True, suppress_st_warning=True)
14 | def setup_mongo():
15 | client = MongoClient('mongodb://localhost:27017/')
16 | db = client['gsr']
17 | coll = db['prod_dec_2020_2']
18 | return coll
19 |
20 | def visualize(coll):
21 | coder = st.selectbox("Select your port/ID number",
22 | [9015, 9016, 9017, 9019, 9020, 9022, 9023, 9025])
23 | coder = int(coder)
24 | #st.markdown("Total sentences in collection: {}".format(coll.count()))
25 |     assigned = coll.count_documents({"assigned_annotators": {"$in" : [coder]}})
26 |     completed = coll.count_documents({"coders": {"$in" : [coder]}})
27 | st.markdown("Sentences assigned to {}: {}".format(coder, assigned))
28 | st.markdown("Sentences completed by {}: {}".format(coder, completed))
29 | st.markdown("Progress:")
30 | try:
31 | prog = completed/assigned
32 | except ZeroDivisionError:
33 | prog = 0
34 | st.progress(prog)
35 |
36 |
37 | st.title('Annotation progress')
38 | st.markdown("Check your annotation progress by selecting your port/ID number")
39 | coll = setup_mongo()
40 | visualize(coll)
41 |
--------------------------------------------------------------------------------
/custom_ner_manual.py:
--------------------------------------------------------------------------------
1 | import random
2 | import mmh3
3 | import json
4 | import spacy
5 | import copy
6 |
7 | from .compare import get_questions as get_compare_questions
8 | from ..models.ner import EntityRecognizer, merge_spans
9 | from ..models.matcher import PatternMatcher
10 | from ..components import printers
11 | from ..components.db import connect
12 | from ..components.preprocess import split_sentences, split_spans, split_tokens
13 | from ..components.sorters import prefer_uncertain
14 | from ..components.loaders import get_stream
15 | from ..components.filters import filter_tasks
16 | from ..core import recipe, recipe_args
17 | from ..util import split_evals, get_labels, get_print, combine_models
18 | from ..util import export_model_data, set_hashes, log, prints
19 | from ..util import INPUT_HASH_ATTR, TASK_HASH_ATTR
20 |
21 |
22 | DB = connect()
23 |
24 | @recipe('ner.manual',
25 | dataset=recipe_args['dataset'],
26 | spacy_model=recipe_args['spacy_model'],
27 | source=recipe_args['source'],
28 | api=recipe_args['api'],
29 | loader=recipe_args['loader'],
30 | label=recipe_args['label'],
31 | exclude=recipe_args['exclude'])
32 | def manual(dataset, spacy_model, source=None, api=None, loader=None,
33 | label=None, exclude=None):
34 | """
35 | Mark spans by token. Requires only a tokenizer and no entity recognizer,
36 | and doesn't do any active learning.
37 | """
38 | log("RECIPE: Starting recipe ner.manual", locals())
39 | nlp = spacy.load(spacy_model)
40 | log("RECIPE: Loaded model {}".format(spacy_model))
41 | labels = get_labels(label, nlp)
42 | log("RECIPE: Annotating with {} labels".format(len(labels)), labels)
43 | stream = get_stream(source, api=api, loader=loader, rehash=True,
44 | dedup=True, input_key='text')
45 | stream = split_tokens(nlp, stream)
46 |
47 | return {
48 | 'view_id': 'ner_manual',
49 | 'dataset': dataset,
50 | 'stream': stream,
51 | 'exclude': exclude,
52 | 'config': {'labels': labels}
53 | }
54 |
55 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # multiuser_prodigy
2 |
3 | This is a multi-annotator setup for [Prodigy](http://prodi.gy/),
4 | Explosion AI's data annotation tool, that uses a Mongo DB to allocate
5 | annotation tasks to annotators working on different Prodigy instances running
6 | on separate ports. This use case focuses on collecting gold standard
7 | annotations from a team of annotators using Prodigy, rather than on the active
8 | learning, single-annotator setup that Prodigy is primarily intended for.
9 |
10 | There are a few examples of annotation interfaces in the repo, including code
11 | for annotators working on training an NER model or doing sentence
12 | classification with document context. Each annotator works on the Prodigy/port
13 | assigned to them, and a new `DBStream` class handles pulling the examples from
14 | Prodigy that are assigned to each worker.
15 |
16 | I've used this setup for three major annotation projects now, but you'll need
17 | to modify the code to get it working for your project as well.
18 |
19 | ## Mongo database
20 |
21 | All tasks are stored in a Mongo DB, which allows different logic for how tasks
22 | are assigned to annotators. For instance, examples can be served to annotators
23 | until three annotations are collected, routed to two predetermined annotators
24 | from the wider pool, or automatically resubmitted to a third annotator when
25 | the first two annotations disagree.
26 |
27 | You can start a Mongo DB in a Docker container:
28 |
29 | ```
30 | sudo docker run -d -p 127.0.0.1:27017:27017 -v /home/andy/MIT/multiuser_prodigy/db:/data/db mongo
31 | ```
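
Before loading tasks, you can check that the container is reachable with a quick
pymongo ping (a sketch; the connection string matches the one used throughout
this repo):

```python
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')
# raises ServerSelectionTimeoutError if the container isn't up
print(client.server_info()["version"])
```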
32 |
33 | To load a list of tasks into the database:
34 |
35 | ```
36 | python mongo_load.py -i assault_not_assault.jsonl -c "assault_gsr"
37 | ```
38 |
39 | where `-i` is a JSONL file of tasks and `-c` specifies the collection name to
40 | load them into.
41 |
42 | "seen" : {"$in" : [0,1]}},
43 | {"coders"
44 |
45 | ## Running
46 |
47 | You'll need to modify the code of `multiuser_db.py` to point it at the right
48 | collection, set the names/ports of your annotators, and choose the desired
49 | interface (NER, classification, etc.).
50 |
51 | Then you should launch the processes either in a `screen` or in the background:
52 |
53 | ```
54 | python multiuser_db.py
55 | ```
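
For reference, the `__main__` block of `multiuser_db.py` wires this together
(these are the values used in this repo; substitute your own collection,
dataset, and labels):

```python
mp = MultiProdigy(coder_list = [{"name" : "Andy", "port" : 9010},
                                {"name" : "Jill", "port" : 9011}],
                  db_name = "gsr",
                  collection_name = "protest_apsa_en_prod",
                  recipe_name = "manual_custom",
                  view_id = "ner_manual",
                  dataset = "tmp_apsa",
                  label = "NOUN,OBJ")
```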
56 |
57 | ## Analysis
58 |
59 | 
60 |
61 | You can use Streamlit to set up a dashboard (see `annotation_streamlit.py`) so
62 | annotators can check their progress. This one pulls results from the Mongo DB,
63 | but you could also query the Prodigy DB and report from there.
64 |
65 |
66 | A more complicated analysis dashboard setup is in
67 | `Report.Rmd`. This RMarkdown file reads in a CSV of coding information and
68 | generates figures in an HTML page that can be served from the annotation
69 | server. To record information about how long each task takes, add something
70 | like `eg['time_loaded'] = datetime.now().isoformat()` to your stream code and
71 | something like `eg['time_returned'] = datetime.now().isoformat()` to your
72 | update code (see the sketch below). `report_maker.py` exports the Prodigy DB to
73 | CSV and knits the RMarkdown on that CSV.
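
A minimal sketch of those two hooks, following the pattern in `multiuser_mark.py`:

```python
from datetime import datetime

def ask_questions(stream):    # stream code: stamp each task on its way out
    for eg in stream:
        eg['time_loaded'] = datetime.now().isoformat()
        yield eg

def recv_answers(answers):    # update code: stamp each task when it comes back
    for eg in answers:
        eg['time_returned'] = datetime.now().isoformat()
```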
74 |
--------------------------------------------------------------------------------
/multiuser_ner.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | from multiprocessing import Process
3 | from time import sleep
4 | from prodigy.recipes.ner import batch_train
5 | import atexit
6 | from pathlib import Path
7 | import datetime as dt
8 |
9 | class MultiProdigy:
10 | def __init__(self, tag_list = ["LOC", "GPE", "PERSON", "ORG", "DATE", "NORP"]):
11 | self.tag_list = tag_list
12 | self.processes = []
13 |
14 | def serve_ner(self, ner_label, port):
15 | print(ner_label)
16 | # We can actually give everyone the same document. That'll simplify the
17 |         # directory and the update process, and may help the training process.
18 | #filename = "data/{0}.jsonl".format(ner_label)
19 | filename = "data/aljazeera_1.jsonl"
20 | prodigy.serve('ner.teach', "arabic_ner_db", "model-final",
21 | filename, None, None, ner_label, None, "arabic_ner_db",
22 | port=port)
23 |
24 | def serve_ner_manual(self, ner_label, port):
25 | print(ner_label)
26 | # We can actually give everyone the same document. That'll simplify the
27 |         # directory and the update process, and may help the training process.
28 | #filename = "data/{0}.jsonl".format(ner_label)
29 | filename = "data/aljazeera_1.jsonl"
30 | prodigy.serve('ner.manual', "arabic_ner_db", "arabic_model",
31 | filename, None, None, ner_label, "arabic_ner_db",
32 | port=port)
33 |
34 |
35 | def make_prodigies(self):
36 | for n, tag in enumerate(self.tag_list):
37 | thread = Process(target=self.serve_ner_manual, args=(tag, 9010 + n))
38 | #thread = Process(target=self.serve_ner, args=(tag, 9010 + n))
39 | self.processes.append(thread)
40 |
41 | def start_prodigies(self):
42 | print("Starting Prodigy processes...")
43 | for p in self.processes:
44 | p.start()
45 | sleep(1)
46 |
47 | def kill_prodigies(self):
48 | print("Killing Prodigy threads")
49 | for i in self.processes:
50 | try:
51 | i.terminate()
52 | except AttributeError:
53 | print("Process {0} doesn't exist?".format(i))
54 | self.processes = []
55 |
56 | def train_and_restart(self):
57 | print("Re-training model with new annotations...")
58 | batch_train(dataset="arabic_ner_db",
59 | input_model="model-final",
60 | n_iter = 10,
61 | output_model = Path("arabic_model_updated"))
62 | print("Model training complete. Restarting service with new model...")
63 | self.kill_prodigies()
64 | self.make_prodigies()
65 | self.start_prodigies()
66 |
67 | def make_retrain_time(self):
68 | # make a datetime for tomorrow at 4 am
69 | tomorrow = dt.datetime.today() + dt.timedelta(days=1)
70 | self.retrain_time = dt.datetime.combine(tomorrow, dt.time(4, 0))
71 |
72 |
73 | if __name__ == "__main__":
74 | mp = MultiProdigy()
75 | mp.make_retrain_time()
76 | atexit.register(mp.kill_prodigies)
77 | mp.make_prodigies()
78 | mp.start_prodigies()
79 | while True:
80 | sleep(5)
81 | if dt.datetime.now() > mp.retrain_time:
82 | print("Retraining model and scheduling next retraining for tomorrow")
83 | mp.make_retrain_time() # bump to tomorrow
84 | mp.train_and_restart()
85 |
--------------------------------------------------------------------------------
/multiuser_db.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | from multiprocessing import Process
3 | from time import sleep
4 | import atexit
5 | from pathlib import Path
6 |
7 | from prodigy.components import printers
8 | from prodigy.components.loaders import get_stream
9 | from prodigy.core import recipe, recipe_args
10 | from prodigy.util import TASK_HASH_ATTR, log
11 | from datetime import datetime
12 | from pymongo import MongoClient
13 | import pymongo
14 | from bson.json_util import dumps
15 | from bson.objectid import ObjectId
16 | from random import shuffle
17 |
18 | from multiuser_manual_db import manual_custom  # registers the 'manual_custom' recipe
19 | # Config:
20 | # - add list of coders
21 | # - ?? add port per coder?
22 | # - base file name for files
23 | # - recipe, db, model, output
24 |
25 | class MultiProdigy:
26 | def __init__(self,
27 | coder_list,
28 | db_name,
29 | collection_name,
30 | recipe_name,
31 | view_id,
32 | dataset,
33 | label=None,
34 | model="blank:en"):
35 | self.coder_list = coder_list
36 |         self.dataset = dataset  # no trailing comma: a comma here would silently make this a tuple
37 |         self.db_name = db_name
38 | self.collection_name = collection_name
39 | self.processes = []
40 | self.recipe_name=recipe_name
41 | self.view_id=view_id
42 | self.label=label
43 | self.model=model
45 |
46 | print("Using recipe ", self.recipe_name)
47 |
48 | def serve(self, coder, port):
49 | print(coder)
50 |         prodigy.serve(self.recipe_name,
51 |                 self.dataset,          # dataset to save annotations in
52 |                 coder,                 # source slot, repurposed for the coder name
53 |                 self.view_id,          # view ID
54 |                 self.label,            # label(s)
55 |                 self.collection_name,  # api slot, repurposed for the Mongo collection
56 |                 None,                  # loader
57 |                 True,                  # memorize
58 |                 None,                  # exclude
59 |                 port=port)
65 |
66 | def make_prodigies(self):
67 |         for coder_info in self.coder_list:
69 | thread = Process(target=self.serve, kwargs =
70 | {"coder": coder_info['name'],
71 | "port": coder_info['port']})
72 | self.processes.append(thread)
73 |
74 | def start_prodigies(self):
75 | print("Starting Prodigy processes...")
76 | for p in self.processes:
77 | p.start()
78 | sleep(1)
79 |
80 | def kill_prodigies(self):
81 | print("Killing Prodigy threads")
82 | for i in self.processes:
83 | try:
84 | i.terminate()
85 | except AttributeError:
86 | print("Process {0} doesn't exist?".format(i))
87 | self.processes = []
88 |
89 |
90 | if __name__ == "__main__":
91 | mp = MultiProdigy(coder_list = [{"name" : "Andy", "port" : 9010},
92 | {"name" : "Jill", "port" : 9011}],
93 | db_name = "gsr",
94 | collection_name = "protest_apsa_en_prod",
95 | recipe_name="manual_custom",
96 |                     view_id="ner_manual",
97 | dataset="tmp_apsa",
98 | label="NOUN,OBJ")
99 | atexit.register(mp.kill_prodigies)
100 | mp.make_prodigies()
101 | mp.start_prodigies()
102 | while True:
103 | sleep(30 * 60)
104 | print("Restarting Prodigy...")
105 | mp.kill_prodigies()
106 | mp.make_prodigies()
107 | mp.start_prodigies()
108 |
109 | # if datetime.datetime.now() > mp.retrain_time:
110 | # print("Retraining model and scheduling next retraining for tomorrow")
111 | # mp.make_retrain_time() # bump to tomorrow
112 | # mp.train_and_restart()
113 |
114 |
--------------------------------------------------------------------------------
/multiuser_mark.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | from multiprocessing import Process
3 | from time import sleep
4 | from prodigy.recipes.ner import batch_train
5 | import atexit
6 | from pathlib import Path
7 | import datetime as dt
8 |
9 | from prodigy.components import printers
10 | from prodigy.components.loaders import get_stream
11 | from prodigy.core import recipe, recipe_args
12 | from prodigy.util import TASK_HASH_ATTR, log
13 | from datetime import datetime
14 | from collections import Counter
15 |
16 | # It's all going to be run by coder name.
17 |
18 | # Config:
19 | # - add list of coders
20 | # - ?? add port per coder?
21 | # - base file name for files
22 | # - recipe, db, model, output
23 |
24 | @prodigy.recipe('mark_custom',
25 | dataset=recipe_args['dataset'],
26 | source=recipe_args['source'],
27 | api=recipe_args['api'],
28 | loader=recipe_args['loader'],
29 | label=recipe_args['label'],
30 | view_id=recipe_args['view'],
31 | memorize=recipe_args['memorize'],
32 | exclude=recipe_args['exclude'])
33 | def mark_custom(dataset, source=None, view_id=None, label='', api=None,
34 | loader=None, memorize=False, exclude=None):
35 | """
36 | Click through pre-prepared examples, with no model in the loop.
37 | """
38 | log('RECIPE: Starting recipe mark', locals())
39 | stream = list(get_stream(source, api, loader))
40 |
41 | counts = Counter()
42 | memory = {}
43 |
44 | def fill_memory(ctrl):
45 | if memorize:
46 | examples = ctrl.db.get_dataset(dataset)
47 | log("RECIPE: Add {} examples from dataset '{}' to memory"
48 | .format(len(examples), dataset))
49 | for eg in examples:
50 | memory[eg[TASK_HASH_ATTR]] = eg['answer']
51 |
52 | def ask_questions(stream):
53 | for eg in stream:
54 | eg['time_loaded'] = datetime.now().isoformat()
55 | if TASK_HASH_ATTR in eg and eg[TASK_HASH_ATTR] in memory:
56 | answer = memory[eg[TASK_HASH_ATTR]]
57 | counts[answer] += 1
58 | else:
59 | if label:
60 | eg['label'] = label
61 | yield eg
62 |
63 | def recv_answers(answers):
64 | for eg in answers:
65 | counts[eg['answer']] += 1
66 | memory[eg[TASK_HASH_ATTR]] = eg['answer']
67 | eg['time_returned'] = datetime.now().isoformat()
68 |
69 | def print_results(ctrl):
70 | print(printers.answers(counts))
71 |
72 |     def get_progress(session=0, total=0, loss=0):
73 |         progress = sum(counts.values()) / len(stream)  # total answers so far, not distinct answer types
74 |         return progress
75 |
76 | return {
77 | 'view_id': view_id,
78 | 'dataset': dataset,
79 | 'stream': ask_questions(stream),
80 | 'exclude': exclude,
81 | 'update': recv_answers,
82 | 'on_load': fill_memory,
83 | 'on_exit': print_results,
84 | 'config': {'label': label}
85 | }
86 |
87 | class MultiProdigy:
88 | def __init__(self,
89 | coder_list = [{"name" : "Daniel", "port" : 9010},
90 | {"name" : "Youseff", "port" : 9011},
91 | {"name" : "Emad", "port" : 9012},
92 | {"name" : "Rafeef", "port" : 9013},
93 | {"name" : "Mahmoud", "port" : 9014},
94 | {"name" : "Zach", "port" : 9015},
95 | {"name" : "Collin", "port" : 9016},
96 | ]):
97 | self.coder_list = coder_list
98 | self.processes = []
99 |
100 | def serve(self, coder, port):
101 | print(coder)
102 | base = "data/protest_for_classification_"
103 | filename = "{0}{1}.jsonl".format(base, coder)
104 | prodigy.serve('mark_custom', # recipe
105 | "gsr_is_protest", # db
106 | filename, # input file
107 | "classification", # view ID
108 | "PROTEST",
109 | None, # api
110 | None, # loader
111 | True, # memorize
112 | "gsr_is_protest", # exclude
113 | port=port) # port
114 |
115 | def make_prodigies(self):
116 |         for coder_info in self.coder_list:
118 | thread = Process(target=self.serve, args = (coder_info['name'], coder_info['port']))
119 | self.processes.append(thread)
120 |
121 | def start_prodigies(self):
122 | print("Starting Prodigy processes...")
123 | for p in self.processes:
124 | p.start()
125 | sleep(1)
126 |
127 | def kill_prodigies(self):
128 | print("Killing Prodigy threads")
129 | for i in self.processes:
130 | try:
131 | i.terminate()
132 | except AttributeError:
133 | print("Process {0} doesn't exist?".format(i))
134 | self.processes = []
135 |
136 |
137 | if __name__ == "__main__":
138 | mp = MultiProdigy()
139 | #mp.make_retrain_time()
140 | atexit.register(mp.kill_prodigies)
141 | mp.make_prodigies()
142 | mp.start_prodigies()
143 | while True:
144 | sleep(5)
145 | # if dt.datetime.now() > mp.retrain_time:
146 | # print("Retraining model and scheduling next retraining for tomorrow")
147 | # mp.make_retrain_time() # bump to tomorrow
148 | # mp.train_and_restart()
149 |
150 |
--------------------------------------------------------------------------------
/multiuser_db_assault.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | from multiprocessing import Process
3 | from time import sleep
4 | import atexit
5 | from pathlib import Path
6 |
7 | from prodigy.components import printers
8 | from prodigy.components.loaders import get_stream
9 | from prodigy.core import recipe, recipe_args
10 | from prodigy.util import TASK_HASH_ATTR, log
11 | from datetime import datetime
12 | from pymongo import MongoClient
13 | from bson.json_util import dumps
14 | from bson.objectid import ObjectId
15 | from random import shuffle
16 |
17 | # Config:
18 | # - add list of coders
19 | # - ?? add port per coder?
20 | # - base file name for files
21 | # - recipe, db, model, output
22 |
23 | class DBStream:
24 | def __init__(self, active_coder, collection_name = "silver_assault"):
25 | self.active_coder = active_coder
26 | self.coll = setup_mongo(collection_name)
27 | print("Total tasks in collection: ", self.coll.count())
28 |
29 | def get_examples(self):
30 | print("get_examples called")
31 | examples = self.coll.find({"$and" : [
32 | {"seen" : {"$in" : [0,1,2,3]}},
33 | {"coders" : {"$nin" : [self.active_coder]}}]}).limit(200)
34 | examples = list(examples)
35 | print("inside get_examples, this many examples:", len(examples))
36 | for i in examples:
37 | i['_id'] = str(i['_id'])
38 | shuffle(examples)
39 | self.examples = iter(examples)
40 | ## !! Need to prioritize examples with 2 or 1 views.
41 |
42 | def setup_mongo(collection_name, db_name = "gsr"):
43 | client = MongoClient('mongodb://localhost:27017/')
44 | db = client[db_name]
45 | coll = db[collection_name]
46 | return coll
47 |
48 | @prodigy.recipe('mark_custom',
49 | dataset=recipe_args['dataset'],
50 | source=recipe_args['source'],
51 | api=recipe_args['api'],
52 | loader=recipe_args['loader'],
53 | label=recipe_args['label'],
54 | view_id=recipe_args['view'],
55 | memorize=recipe_args['memorize'],
56 | exclude=recipe_args['exclude'])
57 | def mark_custom(dataset, source=None, view_id=None, label='', api=None,
58 | loader=None, memorize=False, exclude=None):
59 | """
60 | Click through pre-prepared examples, with no model in the loop.
61 | """
62 |
63 | log('RECIPE: Starting recipe mark', locals())
64 | coder = source # repurposing input slot
65 | stream_empty = iter([])
66 | stream = DBStream(coder, "silver_assault")
67 | stream.get_examples()
68 | #print("Initial number of examples in queue:", len(list(stream.examples)))
69 | #print("Initial examples in queue:", list(stream.examples))
70 |
71 | def ask_questions(stream_empty):
72 | #print("Hitting 'ask_question', with ", len(list(stream.examples)), " in the queue")
73 | #print(list(stream.examples))
74 | #print(stream.reced)
75 | for eg in stream.examples:
76 | #stream.get_examples()
77 | eg['time_loaded'] = datetime.now().isoformat()
78 |             # ObjectId isn't JSON serializable
79 | eg['_id'] = str(eg['_id'])
80 | yield eg
81 |
82 |
83 | #### Problem with the post-answer update.
84 | ## Not refreshing
85 |
86 | def recv_answers(answers):
87 | for eg in answers:
88 | print("Answer back: ")#, eg)
89 | # Get the example from the DB again in case it's changed
90 | updated_ex = list(stream.coll.find({'_id': ObjectId(eg['_id'])}))
91 | try:
92 | curr_cod = updated_ex[0]['coders']
93 | except KeyError:
94 | curr_cod = []
95 | # add current coder to the list
96 | curr_cod.append(coder)
97 | stream.coll.update_one({"_id": ObjectId(eg['_id'])}, # convert back
98 | {"$set": {"coders": curr_cod,
99 | "seen" : len(curr_cod)}})
100 | eg['time_returned'] = datetime.now().isoformat()
101 | eg['active_coder'] = coder
102 | eg['coders'] = curr_cod
103 | #print("Refreshing stream...")
104 | #stream.get_examples()
105 |
106 |     def print_results(ctrl):
107 |         print("Annotated {} examples".format(len(ctrl.db.get_dataset(dataset))))  # `counts` was never defined in this recipe
108 |
109 |     def get_progress(session=0, total=0, loss=0):
110 |         done = stream.coll.count_documents({"$or" : [
111 |             {"coders" : coder},
112 |             {"seen" : {"$gte": 3}}]})
113 |         total = stream.coll.count_documents({})
114 |         progress = done / total
115 |         return progress
116 |
117 | return {
118 | 'view_id': view_id,
119 | 'dataset': dataset,
120 | 'stream': ask_questions(stream_empty),
121 | 'exclude': exclude,
122 | 'progress' : get_progress,
123 | 'update': recv_answers,
124 | 'on_exit': print_results
125 | #'config': {'label': label}
126 | }
127 |
128 | class MultiProdigy:
129 | def __init__(self,
130 | coder_list = [#{"name" : "Daniel", "port" : 9010},
131 | #{"name" : "Youseff", "port" : 9011},
132 | #{"name" : "Emad", "port" : 9012},
133 | {"name" : "Khaled", "port" : 9013}
134 | #{"name" : "Mahmoud", "port" : 9014},
135 | #{"name" : "Zach", "port" : 9015},
136 | #{"name" : "Collin", "port" : 9016},
137 | ]):
138 | self.coder_list = coder_list
139 | self.processes = []
140 |
141 | def serve(self, coder, port):
142 | print(coder)
143 | #filename = "{0}{1}.jsonl".format(base, coder)
144 | prodigy.serve('mark_custom', # recipe
145 | "silver_assault", # db
146 | coder, # input file, repurposed for coder
147 | "classification", # view ID
148 | "ASSAULT",
149 | None, # api
150 | None, # loader
151 | True, # memorize
152 | None, # exclude
153 | port=port) # port
154 |
155 | def make_prodigies(self):
156 |         for coder_info in self.coder_list:
158 | thread = Process(target=self.serve, args = (coder_info['name'], coder_info['port']))
159 | self.processes.append(thread)
160 |
161 | def start_prodigies(self):
162 | print("Starting Prodigy processes...")
163 | for p in self.processes:
164 | p.start()
165 | sleep(1)
166 |
167 | def kill_prodigies(self):
168 | print("Killing Prodigy threads")
169 | for i in self.processes:
170 | try:
171 | i.terminate()
172 | except AttributeError:
173 | print("Process {0} doesn't exist?".format(i))
174 | self.processes = []
175 |
176 |
177 | if __name__ == "__main__":
178 | mp = MultiProdigy()
179 | atexit.register(mp.kill_prodigies)
180 | mp.make_prodigies()
181 | mp.start_prodigies()
182 | while True:
183 | sleep(60 * 60)
184 | print("Restarting Prodigy...")
185 | mp.kill_prodigies()
186 | mp.make_prodigies()
187 | mp.start_prodigies()
188 |
189 | # if datetime.datetime.now() > mp.retrain_time:
190 | # print("Retraining model and scheduling next retraining for tomorrow")
191 | # mp.make_retrain_time() # bump to tomorrow
192 | # mp.train_and_restart()
193 |
194 |
--------------------------------------------------------------------------------
/multiuser_manual_db.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | from multiprocessing import Process
3 | from time import sleep
4 | import atexit
5 | from pathlib import Path
6 |
7 | from prodigy.components import printers
8 | from prodigy.components.loaders import get_stream
9 | from prodigy.core import recipe, recipe_args
10 | from prodigy.util import TASK_HASH_ATTR, log
11 | from prodigy.components.preprocess import add_tokens
12 | from datetime import datetime
13 | from pymongo import MongoClient
14 | import pymongo
15 | from bson.json_util import dumps
16 | from bson.objectid import ObjectId
17 | from random import shuffle
18 |
19 | # Config:
20 | # - add list of coders
21 | # - ?? add port per coder?
22 | # - base file name for files
23 | # - recipe, db, model, output
24 |
25 | import spacy
26 | nlp = spacy.blank("en")
27 |
28 | def setup_mongo(collection_name, db_name = "gsr"):
29 | client = MongoClient('mongodb://localhost:27017/')
30 | db = client[db_name]
31 | coll = db[collection_name]
32 | return coll
33 |
34 | class DBStream:
35 | """Certain parameters are hard coded, for instance the number of
36 |     annotators to see each example and the Mongo DB name."""
37 | def __init__(self, active_coder, collection_name):
38 | self.active_coder = active_coder
39 | self.coll = setup_mongo(collection_name)
40 | print("Total tasks in collection: ", self.coll.count())
41 |
42 | def get_examples(self):
43 | print("get_examples called")
44 | examples = self.coll.find({"$and" : [
45 | {"seen" : {"$in" : [0,1,2,3]}},
46 | {"coders" : {"$nin" : [self.active_coder]}}]}).sort("seen", pymongo.DESCENDING).limit(200)
47 | examples = list(examples)
48 | print("inside get_examples, this many examples:", len(examples))
49 | for i in examples:
50 | if '_id' in i.keys():
51 | i['_id'] = str(i['_id'])
52 | shuffle(examples) # this, of course, obviates the sorting a few lines above...
53 | self.examples = iter(examples)
54 |
55 |
56 | # This decorator has ideas about what keyword arguments to take in.
57 | # Repurpose some of these to convey other information, which is a bit
58 | # ugly.
59 | @prodigy.recipe('manual_custom',
60 | dataset=recipe_args['dataset'],
61 | source=recipe_args['source'], # use this slot for the coder name
62 | api=recipe_args['api'], # use this one for the collection name
63 | loader=recipe_args['loader'],
64 | label=recipe_args['label'],
65 | view_id=recipe_args['view'],
66 | memorize=recipe_args['memorize'],
67 | exclude=recipe_args['exclude'])
68 | def manual_custom(dataset, source=None, view_id=None, label='', api=None,
69 | loader=None, memorize=False, exclude=None):
70 | """
71 | Click through pre-prepared examples, with no model in the loop.
72 | """
73 |
74 | log('RECIPE: Starting recipe mark', locals())
75 | coder = source # repurposing input slot
76 | stream_empty = iter([])
77 | stream = DBStream(coder, api) # using the api slot for collection name
78 | stream.get_examples()
79 |
80 | def ask_questions(stream):
81 | for eg in stream.examples:
82 | eg['time_loaded'] = datetime.now().isoformat()
83 | eg['mongo_collection'] = api # record where it came from
84 |             # ObjectId isn't JSON serializable
85 | eg['_id'] = str(eg['_id'])
86 | # add tokens. add_tokens expects a list...
87 | ts = add_tokens(nlp, [eg])
88 | #...and returns a generator
89 | eg = next(ts)
90 | yield eg
91 |
92 |
93 | def recv_answers(answers):
94 | for eg in answers:
95 | # Retrieve the example from the DB again to get most up-to-date
96 | # list of coders
97 | updated_ex = list(stream.coll.find({'_id': ObjectId(eg['_id'])}))
98 | try:
99 | curr_cod = updated_ex[0]['coders']
100 | except KeyError:
101 | curr_cod = []
102 | # add current coder to the list
103 | curr_cod.append(coder)
104 | stream.coll.update_one({"_id": ObjectId(eg['_id'])}, # convert back
105 | {"$set": {"coders": curr_cod,
106 | "seen" : len(curr_cod)}})
107 | eg['time_returned'] = datetime.now().isoformat() # record submission time
108 | eg['active_coder'] = coder
109 | eg['coders'] = curr_cod
110 |
111 |
112 | def get_progress(session=0, total=0, loss=0):
113 | return None
114 | #done = stream.coll.count({"$or" : [
115 | # {"coders" : coder},
116 | # {"seen" : {"$gte": 3}}]})
117 | #total = stream.coll.count()
118 | #progress = done / total
119 | #return progress
120 |
121 | return {
122 | 'view_id': view_id,
123 | 'dataset': dataset,
124 | 'stream': ask_questions(stream),
125 | 'exclude': exclude,
126 | 'progress' : get_progress,
127 | 'update': recv_answers,
128 | }
129 |
130 |
131 |
132 | class MultiProdigy:
133 | """These are functions that remain the same regardless of the view ID."""
134 | def __init__(self, coder_list, collection, dataset, view_id = None, label = None):
135 | self.coder_list = coder_list
136 | self.collection = collection
137 | self.dataset = dataset
138 | self.processes = []
139 | self.view_id = view_id
140 | self.label = label
141 |
142 | def start_prodigies(self):
143 | print("Starting Prodigy processes...")
144 | for p in self.processes:
145 | p.start()
146 | sleep(1)
147 |
148 | def kill_prodigies(self):
149 | # Make sure all processes are killed on close
150 | print("Killing Prodigy threads")
151 | for i in self.processes:
152 | try:
153 | i.terminate()
154 | except AttributeError:
155 | print("Process {0} doesn't exist?".format(i))
156 | self.processes = []
157 |
158 |
159 | class MultiProdigyManual(MultiProdigy):
160 |     # Only the recipe-specific pieces live here: serve() and make_prodigies()
161 | def serve(self, coder, port):
162 | print(coder)
163 | prodigy.serve('manual_custom', # recipe
164 | self.dataset, # dataset to save it in
165 | coder, # input file, repurposed for coder
166 | "ner_manual", # view ID
167 | self.label,
168 | self.collection, # api, repurposed to be collection
169 | None, # loader
170 | True, # memorize
171 | None, # exclude
172 | port=port) # port
173 |
174 | def make_prodigies(self):
175 |         for coder_info in self.coder_list:
177 | thread = Process(target=self.serve, args =
178 | (coder_info['name'],
179 | coder_info['port']))
180 | self.processes.append(thread)
181 |
182 |
183 |
184 | if __name__ == "__main__":
185 | mp = MultiProdigyManual(
186 | dataset = "apsa_tmp",
187 | coder_list = [{"name": "Andy", "port" : "9011"}],
188 | collection = "silver_assault")
189 | atexit.register(mp.kill_prodigies)
190 | mp.make_prodigies()
191 | mp.start_prodigies()
192 | while True:
193 | sleep(60 * 60)
194 | print("Restarting Prodigy...")
195 | mp.kill_prodigies()
196 | mp.make_prodigies()
197 | mp.start_prodigies()
198 |
199 |
--------------------------------------------------------------------------------
/multiuser_db_blocks.py:
--------------------------------------------------------------------------------
1 | import prodigy
2 | import spacy
3 | from multiprocessing import Process
4 | from time import sleep
5 | import atexit
6 | from pathlib import Path
7 |
8 | from prodigy.components import printers
9 | from prodigy.components.loaders import get_stream
10 | from prodigy.core import recipe, recipe_args
11 | from prodigy.util import TASK_HASH_ATTR, log
12 | from datetime import datetime
13 | from pymongo import MongoClient
14 | import pymongo
15 | from bson.json_util import dumps
16 | from bson.objectid import ObjectId
17 | from random import shuffle
18 |
19 |
20 | class DBStream:
21 | def __init__(self, active_coder, collection_name):
22 | print("Using collection: {}".format(collection_name))
23 | self.active_coder = active_coder
24 | self.coll = setup_mongo(collection_name)
25 | print("Total tasks in collection: ", self.coll.count_documents({}))
26 |
27 | def get_examples(self):
28 | print("get_examples called")
29 | examples = self.coll.find({"$and" : [
30 | {"assigned_annotators" : {"$in" : [self.active_coder]}}, # check if the task is assigned to the current coder...
31 | {"coders" : {"$nin" : [self.active_coder]}}]}).sort("sent_id", pymongo.ASCENDING) # ...but the current coder hasn't seen it yet
32 | examples = list(examples)
33 | print("inside get_examples, this many examples:", len(examples))
34 | for i in examples:
35 | i['_id'] = str(i['_id']) # this gets created by mongo
36 | i['_task_hash'] = hash(str(i['_id']) + str(self.active_coder))
37 | i['_input_hash'] = hash(str(i['_id']) + str(self.active_coder))
38 | self.examples = iter(examples)
39 | ## !! Need to prioritize examples with 2 or 1 views.
40 |
41 | def setup_mongo(collection_name, db_name = "gsr"):
42 | client = MongoClient('mongodb://localhost:27017/')
43 | db = client[db_name]
44 | coll = db[collection_name]
45 | return coll
46 |
47 | @prodigy.recipe('toi_blocks')
48 | def toi_blocks(dataset, source=None, collection="prod_dec_2020_2"):
49 | log('RECIPE: Starting recipe mark', locals())
50 | coder = source # repurposing input slot
51 | print("Coder from within toi_blocks:", coder)
52 | stream_empty = iter([])
53 | stream = DBStream(coder, collection)
54 | stream.get_examples()
55 | #print("Initial number of examples in queue:", len(list(stream.examples)))
56 | #print("Initial examples in queue:", list(stream.examples))
57 |
58 | def ask_questions(stream_empty):
59 | #print("Hitting 'ask_question', with ", len(list(stream.examples)), " in the queue")
60 | #print(list(stream.examples))
61 | #print(stream.reced)
62 | for eg in stream.examples:
63 | #stream.get_examples()
64 | eg['time_loaded'] = datetime.now().isoformat()
65 | eg['active_coder'] = coder
66 |             # ObjectId isn't JSON serializable
67 | eg['_id'] = str(eg['_id'])
68 | yield eg
69 |
70 |
71 | #### Problem with the post-answer update.
72 | ## Not refreshing
73 |
74 | def recv_answers(answers):
75 | for eg in answers:
76 | print("Answer back: ", coder, datetime.now().isoformat())#, eg)
77 | # Get the example from the DB again in case it's changed
78 | updated_ex = list(stream.coll.find({'_id': ObjectId(eg['_id'])}))
79 | try:
80 | curr_cod = updated_ex[0]['coders']
81 | except KeyError:
82 | curr_cod = []
83 | # add current coder to the list
84 | curr_cod.append(coder)
85 | stream.coll.update_one({"_id": ObjectId(eg['_id'])}, # convert back
86 | {"$set": {"coders": curr_cod,
87 | "seen" : len(curr_cod),
88 | 'time_returned': datetime.now().isoformat(),
89 | 'time_loaded': eg['time_loaded'],
90 | 'active_coder': coder
91 | }})
92 | eg['time_returned'] = datetime.now().isoformat()
93 | eg['seen'] = len(curr_cod)
94 |
95 |     def print_results(ctrl):
96 |         print("Annotated {} examples".format(len(ctrl.db.get_dataset(dataset))))  # `counts` was never defined in this recipe
97 |
98 | def get_progress(*args, **kwargs):
99 | done = stream.coll.count_documents({"coders" : coder})
100 | total = stream.coll.count_documents({})
101 | return done / total
102 |
103 | # We can use the blocks to override certain config and content, and set
104 | # "text": None for the choice interface so it doesn't also render the text
105 | blocks = [
106 | {"view_id": "choice", "text": None},
107 | {"view_id": "text_input", "field_rows": 3, "field_label": "If you found this example difficult or ambiguous please explain why."}
108 | ]
109 |
110 | return {
111 | "dataset": dataset, # the dataset to save annotations to
112 | "view_id": "blocks", # set the view_id to "blocks"
113 | "stream": ask_questions(stream_empty), # the stream of incoming examples
114 | "config": {
115 | "labels": ["RELEVANT"], # the labels for the manual NER interface
116 | "blocks": blocks, # add the blocks to the config
117 | },
118 | 'update': recv_answers,
119 | #"custom_theme": {"cardMaxWidth": "90%"},
120 | }
121 |
122 | # return {
123 | # 'view_id': view_id,
124 | # 'dataset': dataset,
125 | # 'stream': ask_questions(stream_empty),
126 | # 'exclude': exclude,
127 | # "flag": True,
128 | # "custom_theme": {"cardMaxWidth": "90%"},
129 | # 'progress' : get_progress,
130 | # 'on_exit': print_results
131 | # }
132 |
133 | class MultiProdigy:
134 | def __init__(self,
135 | coder_list = [#{"name" : "Andy", "port" : 9010},
136 | # {"name" : "Katie", "port" : 9011},
137 | # {"name" : "Sheikh", "port" : 9012},
138 | #{"name" : 9014, "port" : 9014},
139 | {"name" : 9015, "port" : 9015},
140 | {"name" : 9016, "port" : 9016},
141 | {"name" : 9017, "port" : 9017},
142 | #{"name" : 9018, "port" : 9018},
143 | {"name" : 9019, "port" : 9019},
144 | {"name" : 9020, "port" : 9020},
145 | #{"name" : 9021, "port" : 9021},
146 | {"name" : 9022, "port" : 9022},
147 | {"name" : 9023, "port" : 9023},
148 | #{"name" : 9024, "port" : 9024},
149 | {"name" : 9025, "port" : 9025},
150 | ]):
151 | self.coder_list = coder_list
152 | self.processes = []
153 |
154 | def serve(self, coder, port):
155 | print(coder)
156 | #filename = "{0}{1}.jsonl".format(base, coder)
157 |         prodigy.serve('toi_blocks', # recipe
158 |                "prod_dec_2020_2", # dataset (the Mongo collection defaults to the same name)
159 |                coder, # source slot, repurposed for the coder name
160 |                port=port) # port
161 |
162 | def make_prodigies(self):
163 |         for coder_info in self.coder_list:
165 | thread = Process(target=self.serve, args = (coder_info['name'], coder_info['port']))
166 | self.processes.append(thread)
167 |
168 | def start_prodigies(self):
169 | print("Starting Prodigy processes...")
170 | for p in self.processes:
171 | p.start()
172 | sleep(1)
173 |
174 | def kill_prodigies(self):
175 | print("Killing Prodigy threads")
176 | for i in self.processes:
177 | try:
178 | i.terminate()
179 | except AttributeError:
180 | print("Process {0} doesn't exist?".format(i))
181 | self.processes = []
182 |
183 |
184 | if __name__ == "__main__":
185 | mp = MultiProdigy()
186 | atexit.register(mp.kill_prodigies)
187 | mp.make_prodigies()
188 | mp.start_prodigies()
189 | while True:
190 | sleep(60 * 60)
191 | print("Restarting Prodigy...")
192 | mp.kill_prodigies()
193 | mp.make_prodigies()
194 | mp.start_prodigies()
195 |
196 |
197 |
--------------------------------------------------------------------------------
/Report.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Gold Standard Record Coding Report"
3 | date: "`r paste0(format(Sys.time(), '%d %B %Y, %H:%M:%S', tz='America/New_York', usetz=TRUE), ' (East Coast)')`"
4 | output:
5 | html_document:
6 | self_contained: true
7 | ---
8 |
9 | ```{r setup, include=FALSE}
10 | library(knitr)
11 | opts_chunk$set(fig.width = 5)
12 | opts_chunk$set(fig.height= 4)
13 | opts_chunk$set(warning = FALSE)
14 | opts_chunk$set(message = FALSE)
15 | opts_chunk$set(echo = FALSE)
17 | ```
18 |
19 | ```{r echo = FALSE, message = FALSE, results = "hide"}
20 | library(ggplot2)
21 | library(RColorBrewer)
22 | library(scales)
23 | library(grid)
24 | library(extrafont)
25 |
26 | theme_mcm <- function() {
27 | # Generate the colors for the chart procedurally with RColorBrewer
28 | palette <- brewer.pal("Greys", n=9)
29 | #color.background = "#fcfaea"
30 | #color.background = "#ffffff"
31 | color.background = "white" #palette[2]
32 | color.grid.major = palette[3]
33 | #color.grid.major = palette[3]
34 | color.axis.text = palette[6]
35 | color.axis.title = palette[7]
36 | color.title = palette[8]
37 |
38 | # Begin construction of chart
39 | theme_bw(base_size=12) +
40 |
41 | # Set the entire chart region to a light gray color
42 | theme(panel.background=element_rect(fill=color.background, color=color.background)) +
43 | theme(plot.background=element_rect(fill=color.background, color=color.background)) +
44 | theme(panel.border=element_rect(color=color.background)) +
45 |
46 | # Format the grid
47 | theme(panel.grid.major=element_line(color=color.grid.major,size=.15)) +
48 | theme(panel.grid.minor=element_line(color=color.grid.major,size=.07)) +
49 | theme(panel.grid.minor.y=element_blank()) +
50 | theme(axis.ticks=element_blank()) +
51 |
52 | # Format the legend
53 | theme(legend.background = element_rect(fill=color.background)) +
54 | theme(legend.key = element_rect(fill=color.background)) +
55 | theme(legend.text = element_text(size=9,color=color.axis.title)) +
56 | theme(legend.title = element_text(color=color.axis.title)) +
57 |
58 | # Set title and axis labels, and format these and tick marks
59 | theme(plot.title=element_text(color=color.title, size=12, vjust=1.25)) +
60 | theme(plot.subtitle=element_text(color="#353535", size=11)) + #, vjust=1.25
61 | theme(plot.caption=element_text(color=color.axis.title)) +
62 | theme(axis.text.x=element_text(size=9,color=color.axis.text)) +
63 | theme(axis.text.y=element_text(size=9,color=color.axis.text)) +
64 | theme(axis.title.x=element_text(size=10,color=color.axis.title, vjust=0)) +
65 |     theme(axis.title.y=element_text(size=10,color=color.axis.title, vjust=1.25)) +
66 | theme(plot.caption=element_text(size=7,color=palette[4], vjust = 6)) +
67 |
68 | # Facets
69 | theme(strip.background = element_rect(colour = color.background,
70 | fill = palette[3],
71 | size = 0.5)) +
72 | #theme(strip.text.y = element_text(vjust= -1.75)) +
73 |
74 | # Fonts
75 | theme(text=element_text(family="Tw Cen MT", margin=margin(b=15))) +
76 | theme(plot.subtitle=element_text(family="Tw Cen MT")) +
77 |
78 | # scrunch the titles down closer
79 | theme(plot.title = element_text(margin = margin(1.5))) +
80 | theme(plot.subtitle = element_text(margin = margin(1.5))) +
81 |
82 | #...and move the legend top-right
83 | #theme(legend.margin=margin(-20)) +
84 | #theme(legend.justification = "right") +
85 |
86 | # Plot margins
87 | theme(plot.margin = unit(c(0.35, 0.2, 0.3, 0.35), "cm"))
88 | }
89 |
90 | hex <- c("#cfa81f", "#847a8e", "#ad4738", "#7c812d", "#008f7d",
91 | "#6d472f", "#10355e", "#fc9c8e", "#99b8c6", "#ca5f33")
92 | color <- c("yellow", "purple", "red", "green", "teal",
93 | "brown", "blue", "pink", "sky", "orange")
94 | #ggthemr::ggthemr('dust')
95 | #ggthemr::ggthemr('pale')
96 | theme_set(theme_mcm())
97 | ```
98 |
99 | ```{r}
100 | library(dplyr)
101 | #library(plotly)
102 | ```
103 |
104 |
105 | ```{r}
106 | df <- read.csv("coding_summary.csv")
107 | df$date <- as.Date(df$date)
108 | ```
109 |
110 | ```{r}
111 | gold <- df %>%
112 | group_by(id) %>%
113 | summarize(accept = sum(answer == "accept"),
114 | count = n()) %>%
115 | filter(count >= 2, accept == count) %>%
116 | nrow()
117 | ```
118 |
119 | ```{r}
120 | gold_reject <- df %>%
121 | group_by(id) %>%
122 | summarize(good = sum(answer == "reject") == 3) %>%
123 | filter(good == TRUE) %>%
124 | nrow()
125 | ```
126 |
127 | ```{r}
128 | silver <- df %>%
129 | group_by(id) %>%
130 | summarize(good = sum(answer == "accept") >= 2) %>%
131 | filter(good == TRUE) %>%
132 | nrow()
133 | ```
134 |
135 |
136 | ## Overview
137 |
138 | We've collected `r nrow(df)` total labels.
139 | There are currently `r gold` gold-standard accepts (two or more coders, all accepting), `r silver` silver-standard accepts (at least two accepts), and `r gold_reject` examples with three rejects.
140 |
141 |
142 | ## Statistics
143 |
144 | ```{r}
145 | p <- df %>%
146 | group_by(coder) %>%
147 | summarize(count = n()) %>%
148 | ggplot(., aes(x = coder, y = count)) +
149 | geom_bar(stat = "identity", width = 0.4) +
150 | labs(title = "Sentences coded over all time")
151 | p
152 | ```
153 |
154 |
155 | ```{r}
156 | library(lubridate)
157 | week <- today() - dweeks(1)
158 |
159 |
160 | p <- df %>%
161 | filter(as.Date(date) >= week) %>%
162 | group_by(coder) %>%
163 | summarize(count = n()) %>%
164 | ggplot(., aes(x = coder, y = count)) +
165 | geom_bar(stat = "identity", width = 0.4) +
166 | labs(title = paste0("Sentences coded for the week of ", today() - dweeks(1), " to ", today()))
167 | p
168 | ```
169 |
170 | ```{r}
171 | p <- df %>%
172 | group_by(coder, date) %>%
173 | summarize(count = n()) %>%
174 | ggplot(., aes(x = date, y = coder,
175 | alpha = count)) +
176 | geom_tile() +
177 | labs(title = "Daily contributions per coder",
178 | x = "Date") +
179 | scale_x_date(date_minor_breaks = "1 day")
180 | p
181 | ```
182 |
183 |
184 | ```{r}
185 | library(ggridges)
186 |
187 | coder_means <- df %>%
188 | filter(diff < 300) %>%
189 | group_by(coder) %>%
190 | summarize(avg_time = median(diff))
191 |
192 | p <- df %>%
193 | filter(diff < 300) %>%
194 | ggplot(., aes(x = diff, y = coder)) +
195 | #geom_density_ridges() +
196 | geom_density_ridges(stat = "binline", bins = 100, scale = 0.95) +
197 | geom_point(data = coder_means, aes(x = avg_time, y = coder)) +
198 | geom_vline(xintercept = 60, linetype = 2) +
199 | labs(title = "Distribution of time per coder",
200 | subtitle = "Dots indicate median time, dashed line is 60 seconds",
201 | x = "Seconds per task",
202 | y = NULL)
203 | p
204 | ```
205 |
206 | Excluding all tasks that take more than five minutes:
207 |
208 | ```{r results = "asis"}
209 | time_table <- df %>%
210 | filter(as.Date(date) >= week, diff < 300) %>%
211 | group_by(coder) %>%
212 | summarize(total_time = sum(diff) / 60) %>%
213 | arrange(desc(total_time))
214 |
215 | kable(time_table, caption = "Minutes of coding time in past 7 days",
216 | digits = 1)
217 | ```
218 |
219 | ```{r results = "asis"}
220 | d <- today()
221 | prev_days <- seq(d-13,d,by='day')
222 | sats <- prev_days[weekdays(prev_days)=='Saturday']
223 |
224 | time_table <- df %>%
225 | filter(as.Date(date) >= sats[1], as.Date(date) < sats[2], diff < 300) %>%
226 | group_by(coder) %>%
227 | summarize(total_time = sum(diff) / 60) %>%
228 | arrange(desc(total_time))
229 |
230 | kable(time_table, caption = paste0("Minutes of coding time in previous pay period period (",
231 | paste0(c(sats[1], sats[2]-1), collapse = " to "),
232 | ")"),
233 | digits = 1)
234 | ```
235 |
236 | ```{r results = "asis"}
237 | time_table <- df %>%
238 | filter(diff < 300) %>%
239 | group_by(coder) %>%
240 | summarize(total_time = sum(diff) / 60) %>%
241 | arrange(desc(total_time))
242 |
243 | kable(time_table, caption = "Minutes of coding time (all time)",
244 | digits = 1)
245 | ```
246 |
247 | ```{r}
248 | p <- df %>%
249 | group_by(coder) %>%
250 |   summarize(mean_accept = mean(answer == "accept"), n_labels = n()) %>%
251 |   mutate(se = sqrt(mean_accept * (1 - mean_accept) / n_labels)) %>%
252 |   mutate(upper = mean_accept + 1.96*se,
253 |          lower = mean_accept - 1.96*se) %>%
254 | ggplot(., aes(x = coder, y = mean_accept)) +
255 | geom_bar(stat = "identity", width = 0.4) +
256 | geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2) +
257 | labs(title = "Acceptance rate per coder",
258 | y = "Acceptance Rate",
259 | x = NULL)
260 | p
261 | ```
262 |
263 |
264 | ```{r}
265 | p <- df %>%
266 | group_by(id) %>%
267 | summarize(count = n()) %>%
268 | group_by(count) %>%
269 | summarize(examples = n()) %>%
270 | ggplot(., aes(x = as.factor(count), y = examples)) +
271 | geom_bar(stat = "identity", width = 0.4) +
272 | labs(title = "Distribution of completed examples",
273 | subtitle = "Each example is assigned to three coders",
274 | x = "Count")
275 | p
276 | ```
277 |
278 | ```{r}
279 | p <- df %>%
280 | group_by(id) %>%
281 | mutate(count = n()) %>%
282 | filter(count > 1) %>%
283 | summarize(var = var(answer == "accept")) %>%
284 | ggplot(., aes(x = var)) +
285 | geom_histogram(bins = 50) +
286 | labs(title = "Variance of answers")
287 | p
288 | ```
289 |
290 |
291 |
292 |
293 |
294 |
--------------------------------------------------------------------------------