├── tools
│   ├── __init__.py
│   ├── scripts
│   │   ├── __init__.py
│   │   ├── toolbox.py
│   │   ├── align_text.py
│   │   ├── rdlextra.py
│   │   └── cat_rules.py
│   ├── resources
│   │   ├── en-ptb_map
│   │   └── readme.md
│   └── derivative_word.py
├── seq2seq-train.py
├── README.md
├── data
│   └── fce
│       ├── licence.txt
│       └── readme.txt
├── forecast_token.py
├── bert-main.py
├── candidate_tokens.py
└── json2pair.py

/tools/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tools/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/seq2seq-train.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | import pickle
4 | 
5 | with open("fce-train.p", "rb") as fp:
6 |     data = pickle.load(fp)
7 | 
8 | xx = data[0]
9 | yy = data[1]
10 | 
11 | debug = 1
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## LM-GEC
2 | Build an English grammatical error correction system based on language models.
3 | 
4 | ## Requirements
5 | * python 3.4+
6 | * pytorch 1.2
7 | * pytorch-transformers
8 | * Texar-pytorch
9 | * numpy
10 | 
11 | 
12 | ## TODO
13 | 1. Iteratively select tokens that improve the minimum softmax probability BERT assigns to words in the sentence
14 | 2. Fine-tune GPT-2 on text in reverse word order
15 | 3. First use the BERT model to correct errors
16 | 4. Use a deep bidirectional GPT-2 model to polish the output of BERT
17 | 5. Treat grammatical error correction as a seq2seq problem and train a seq2seq model for GEC
--------------------------------------------------------------------------------
/tools/resources/en-ptb_map:
--------------------------------------------------------------------------------
1 | # SYM
2 | $ SYM
3 | '' PUNCT
4 | , PUNCT
5 | -LRB- PUNCT
6 | -RRB- PUNCT
7 | . PUNCT
8 | : PUNCT
9 | AFX ADJ
10 | CC CONJ
11 | CD NUM
12 | DT DET
13 | EX ADV
14 | FW X
15 | HYPH PUNCT
16 | IN ADP
17 | JJ ADJ
18 | JJR ADJ
19 | JJS ADJ
20 | LS PUNCT
21 | MD VERB
22 | NIL X
23 | NN NOUN
24 | NNP PROPN
25 | NNPS PROPN
26 | NNS NOUN
27 | PDT DET
28 | POS PART
29 | PRP PRON
30 | PRP$ DET
31 | RB ADV
32 | RBR ADV
33 | RBS ADV
34 | RP PART
35 | SYM SYM
36 | TO PART
37 | UH INTJ
38 | VB VERB
39 | VBD VERB
40 | VBG VERB
41 | VBN VERB
42 | VBP VERB
43 | VBZ VERB
44 | WDT DET
45 | WP PRON
46 | WP$ DET
47 | WRB ADV
48 | `` PUNCT
--------------------------------------------------------------------------------
/tools/resources/readme.md:
--------------------------------------------------------------------------------
1 | # Resources
2 | 
3 | ## en-ptb_map
4 | 
5 | en-ptb_map is a mapping file that converts spaCy Penn Treebank (PTB) style part-of-speech tags to Stanford Universal Dependency tags.
6 | 
7 | The mapping file was obtained [here](http://universaldependencies.org/tagset-conversion/en-penn-uposf.html).
8 | 
9 | spaCy includes some custom POS tags that were not part of the original PTB tagset. The authors of spaCy suggested the following mapping for these tags:
10 | 
11 | | PTB-Style | Universal
12 | |-----------|--------
13 | | "" | PUNCT
14 | | ADD | X
15 | | GW | X
16 | | NFP | X
17 | | SP | SPACE
18 | | XX | X
19 | 
20 | ## en_GB-large.txt
21 | 
22 | en_GB-large.txt is a list of valid British English words according to the latest Hunspell dictionary.
23 | 24 | It was obtained [here](https://sourceforge.net/projects/wordlist/files/speller/2017.08.24/). 25 | 26 | The specific file bundled with this release is: wordlist-en_GB-large-2017.08.24.zip. 27 | 28 | -------------------------------------------------------------------------------- /tools/derivative_word.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import gluonnlp 4 | from nltk.stem import WordNetLemmatizer 5 | from nltk.stem.porter import * 6 | from polyglot.text import Text, Word 7 | import spacy 8 | import enchant 9 | import re 10 | import pickle 11 | isEnglish = enchant.Dict("en") 12 | aa = isEnglish.check("hello") 13 | 14 | lemmatizer = WordNetLemmatizer() 15 | 16 | nlp = spacy.load('en_core_web_sm') 17 | 18 | ''' 19 | doc = nlp('The only major thing to note is that lemmatize takes a biggest part of speech parameter, "pos." ') 20 | tokens = [token.text for token in doc] 21 | for tk in tokens: 22 | w1 = lemmatizer.lemmatize(tk, 'v') 23 | w2 = lemmatizer.lemmatize(tk, pos="a") 24 | w3 = lemmatizer.lemmatize(tk) 25 | w = {w1, w2, w3} 26 | ''' 27 | 28 | stemmer = PorterStemmer() 29 | glove_6b50d = gluonnlp.embedding.create('glove', source='glove.6B.50d') 30 | vocab = gluonnlp.Vocab(gluonnlp.data.Counter(glove_6b50d.idx_to_token)) 31 | 32 | pattern = "^[A-Za-z]*[A-Za-z]$" 33 | 34 | roots_dict = dict() 35 | for ww in vocab.idx_to_token: 36 | if re.search(pattern, ww) and len(ww)>2: 37 | root = stemmer.stem(ww) 38 | if ww != root: 39 | print(ww) 40 | #polarity = int(Word(ww, language="en").polarity) 41 | #ttt = {ww: polarity} 42 | if root not in roots_dict: 43 | roots_dict[root] = [] 44 | roots_dict[root].append(ww) 45 | else: 46 | roots_dict[root].append(ww) 47 | 48 | debug = 1 49 | fp = open("stem-words.p", "wb") 50 | pickle.dump(roots_dict, fp) 51 | fp.close() 52 | 53 | fp = open("stem-words.p", "rb") 54 | aaaa = pickle.load(fp) 55 | fp.close() 56 | 57 | debug = 1 -------------------------------------------------------------------------------- /data/fce/licence.txt: -------------------------------------------------------------------------------- 1 | CLC FCE Dataset Licence Agreement 2 | 3 | 1. By downloading this dataset and licence, this licence agreement is 4 | entered into, effective this date, between you, the Licensee, and the 5 | University of Cambridge, the Licensor. 6 | 7 | 2. Copyright of the entire licensed dataset is held by the Licensor. 8 | No ownership or interest in the dataset is transferred to the 9 | Licensee. 10 | 11 | 3. The Licensor hereby grants the Licensee a non-exclusive 12 | non-transferable right to use the licensed dataset for 13 | non-commercial research and educational purposes. 14 | 15 | 4. Non-commercial purposes exclude without limitation any use of the 16 | licensed dataset or information derived from the dataset for or as 17 | part of a product or service which is sold, offered for sale, 18 | licensed, leased or rented. 19 | 20 | 5. The Licensee shall acknowledge use of the licensed dataset in all 21 | publications of research based on it, in whole or in part, through 22 | citation of the following publication: 23 | 24 | Yannakoudakis, Helen and Briscoe, Ted and Medlock, Ben, 25 | A New Dataset and Method for Automatically Grading ESOL Texts, 26 | Proceedings of the 49th Annual Meeting of the Association for 27 | Computational Linguistics: Human Language Technologies. 28 | 29 | 6. The Licensee may publish excerpts of less than 100 words from the 30 | licensed dataset pursuant to clause 3. 
31 | 
32 | 7. The Licensor grants the Licensee this right to use the licensed dataset
33 | "as is". Licensor does not make, and expressly disclaims, any express or
34 | implied warranties, representations or endorsements of any kind
35 | whatsoever.
36 | 
37 | 8. This Agreement shall be governed by and construed in accordance with
38 | the laws of England and the English courts shall have exclusive
39 | jurisdiction.
40 | 
41 | 
--------------------------------------------------------------------------------
/forecast_token.py:
--------------------------------------------------------------------------------
1 | 
2 | from candidate_tokens import get_candidate_tokens
3 | import torch
4 | import numpy as np
5 | from polyglot.text import Word
6 | import spacy
7 | nlp = spacy.load('en_core_web_sm')
8 | 
9 | 
10 | def forecast_token(text, masked_index, tokenizer, model):
11 |     tokenized_text = ['[CLS]']
12 |     doc = nlp(text)
13 |     tokenized_text.extend([token.text for token in doc])
14 |     tokenized_text.append('[SEP]')
15 | 
16 |     synonyms_ = get_candidate_tokens(tokenized_text[masked_index])
17 |     synonyms_ = list(set(synonyms_))
18 | 
19 |     masked_token = tokenized_text[masked_index]
20 |     token_polarity = int(Word(masked_token, language="en").polarity)
21 | 
22 |     synonyms = []  # keep only candidates with the same sentiment polarity as the original token
23 |     for elem in synonyms_:
24 |         if int(Word(elem, language="en").polarity) == token_polarity:
25 |             synonyms.append(elem)
26 | 
27 |     # Mask a token that we will try to predict back with `BertForMaskedLM`
28 |     tokenized_text[masked_index] = '[MASK]'
29 | 
30 |     # Convert tokens to vocabulary indices
31 |     indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
32 | 
33 |     # Convert inputs to PyTorch tensors
34 |     tokens_tensor = torch.tensor([indexed_tokens])
35 | 
36 | 
37 |     # Predict all tokens
38 |     with torch.no_grad():
39 |         outputs = model(tokens_tensor)
40 |         predictions = outputs[0]
41 | 
42 |     token_idxs = [tokenizer.convert_tokens_to_ids([word])[0] for word in synonyms]
43 |     preds = np.array([predictions[0, masked_index, idx] for idx in token_idxs])
44 |     sort_top = preds.argsort()
45 |     #predicted_index = token_idxs[sort_top[-1]]
46 |     #candidate_tokens = [synonyms[sort_top[-1]], synonyms[sort_top[-2]]]  # superseded by the margin-based selection below
47 |     candidate_tokens = []
48 |     for nn in np.arange(len(preds)):
49 |         if abs(preds[nn]-preds[sort_top[-1]])<0.0001:
50 |             candidate_tokens.append(synonyms[nn])
51 | 
52 |     if masked_token in candidate_tokens:  # if the masked token scores within a small margin of the best candidate, treat it as correct
53 |         predicted_token, softmax_prob = masked_token, 100
54 |     else:
55 |         predicted_token, softmax_prob = synonyms[sort_top[-1]], preds[sort_top[-1]]
56 | 
57 |     # Do not change the token if the prediction matches the original token,
58 |     # ignoring upper/lower case.
59 |     if masked_token.lower() == predicted_token.lower():
60 |         predicted_token = masked_token
61 |     return predicted_token, softmax_prob
62 | 
--------------------------------------------------------------------------------
/bert-main.py:
--------------------------------------------------------------------------------
1 | 
2 | # https://www.cl.cam.ac.uk/research/nl/bea2019st/
3 | # Input  Travel by bus is exspensive , bored and annoying .
4 | # Output Travelling by bus is expensive , boring and annoying .
5 | 6 | import numpy as np 7 | import spacy 8 | from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM, \ 9 | RobertaTokenizer, RobertaForMaskedLM, \ 10 | XLNetTokenizer, XLNetPreTrainedModel, \ 11 | XLNetLMHeadModel, \ 12 | XLMPreTrainedModel , XLMModel, XLMWithLMHeadModel 13 | from forecast_token import forecast_token 14 | import re 15 | from googletrans import Translator 16 | translator = Translator() 17 | aa = translator.translate('程开甲,男,汉族,中共党员、九三学社社员,' 18 | '1918年8月生,2018年11月去世,江苏吴江人,' 19 | '原国防科工委科技委常任委员,中国科学院院士。') 20 | 21 | nlp = spacy.load('en_core_web_lg') 22 | 23 | # Load pre-trained model tokenizer (vocabulary) 24 | # bert-large-uncased-whole-word-masking, 25 | tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking') 26 | model = BertForMaskedLM.from_pretrained('bert-large-uncased-whole-word-masking') 27 | #tokenizer = RobertaTokenizer.from_pretrained('roberta-large') 28 | #model = RobertaForMaskedLM.from_pretrained('roberta-large') 29 | #tokenizer = XLNetTokenizer.from_pretrained('xlm-mlm-en-2048') 30 | #model = XLMPreTrainedModel.from_pretrained('xlm-mlm-en-2048') 31 | model.eval() 32 | 33 | # Tokenize input 34 | #text = 'I am writing in order to express my disappointment about your musical show " Over the Rainbow " .' 35 | #text = 'I am writing in order to express my disappointed about your musical show " Over the Rainbow " .' 36 | text = "I saws the show 's advertisement hanging up of a wall in London where I was spending my holiday with some friends . " \ 37 | "I convinced them to go there with me because I had heard good references about your Company and , " \ 38 | "above all , about the main star , Danny Brook ." 39 | doc = nlp(text) 40 | sentences = [sent.text for sent in doc.sents] 41 | for sent in sentences: 42 | sent_doc = nlp(sent) 43 | tokens = [token.text for token in sent_doc] 44 | for masked_index in np.arange(len(sent_doc))+1: 45 | if masked_index>1 and tokens[masked_index-1].istitle(): # deflautly think the word with first letter is upppercase is 46 | f_token, softmax_prob = tokens[masked_index-1], 100 47 | else: 48 | f_token, softmax_prob = forecast_token(sent, masked_index, tokenizer, model) 49 | print('Predicted token is: ', f_token, ' softmax_prob: ', softmax_prob) 50 | 51 | debug = 1 -------------------------------------------------------------------------------- /candidate_tokens.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | from pattern.en import lexeme 5 | from nltk.corpus import wordnet 6 | from nltk.stem.porter import * 7 | from nltk.stem import WordNetLemmatizer 8 | import spacy 9 | 10 | from spacy.lang.en.stop_words import STOP_WORDS 11 | 12 | from mxnet import nd 13 | import gluonnlp 14 | 15 | import pickle 16 | import enchant 17 | from nltk.corpus import words 18 | #import nltk 19 | #nltk.download() 20 | EnglishDict = enchant.Dict("en_US") 21 | 22 | stemmer = PorterStemmer() 23 | 24 | lemmatizer = WordNetLemmatizer() 25 | 26 | fp = open("./tools/stem-words.p", "rb") 27 | stem2words = pickle.load(fp) 28 | fp.close() 29 | 30 | 31 | glove_6b50d = gluonnlp.embedding.create('glove', source='glove.6B.50d') 32 | vocab = gluonnlp.Vocab(gluonnlp.data.Counter(glove_6b50d.idx_to_token)) 33 | vocab.set_embedding(glove_6b50d) 34 | 35 | def norm_vecs_by_row(x): 36 | return x / nd.sqrt(nd.sum(x * x, axis=1) + 1E-10).reshape((-1,1)) 37 | 38 | def get_knn(word, k=2000): 39 | word_vec = vocab.embedding[word].reshape((-1, 1)) 40 | vocab_vecs = 
norm_vecs_by_row(vocab.embedding.idx_to_vec) 41 | dot_prod = nd.dot(vocab_vecs, word_vec) 42 | indices = nd.topk(dot_prod.reshape((len(vocab), )), k=k+1, ret_typ='indices') 43 | indices = [int(i.asscalar()) for i in indices] 44 | # Remove unknown and input tokens. 45 | return vocab.to_tokens(indices[1:]) 46 | 47 | def get_synomyms_token(token): 48 | stem = stemmer.stem(token) 49 | synonyms_ = [token] 50 | if stem in stem2words: 51 | words = stem2words[stem] 52 | synonyms_.extend(words) 53 | 54 | w1 = lemmatizer.lemmatize(token, 'v') 55 | w2 = lemmatizer.lemmatize(token, pos="a") 56 | w3 = lemmatizer.lemmatize(token) 57 | w = {w1, w2, w3} 58 | synonyms_.extend(list(w)) 59 | 60 | #synonyms_ = [token] 61 | 62 | for syn in wordnet.synsets(token): 63 | for l in syn.lemmas(): 64 | synonyms_.append(l.name()) 65 | 66 | synonyms_.extend(lexeme(token)) 67 | synonyms = np.array([elm for elm in set(synonyms_)]) 68 | 69 | return synonyms 70 | 71 | def get_candidate_tokens(token): 72 | #spacy_stopwords = list(spacy.lang.en.stop_words.STOP_WORDS) 73 | spacy_stopwords = list(STOP_WORDS) 74 | if token in spacy_stopwords: 75 | return spacy_stopwords 76 | 77 | result_ = get_knn(token, 20) 78 | result = [] 79 | for ww in result_: 80 | # check the string from KNN is in English dictionary 81 | if EnglishDict.check(ww) or ww in words.words(): 82 | result.append(ww) 83 | 84 | synomyms = get_synomyms_token(token) 85 | result.extend(synomyms) 86 | #result.append('reviewing') 87 | 88 | return result 89 | 90 | 91 | if __name__ == '__main__': 92 | aa = get_candidate_tokens('people') 93 | bb = get_knn('took', 100) 94 | print(bb) 95 | 96 | -------------------------------------------------------------------------------- /data/fce/readme.txt: -------------------------------------------------------------------------------- 1 | Release 2.1 2 | 25th March 2019 3 | 4 | This directory contains the official version of the First Certificate in English (FCE) corpus used in the BEA2019 shared task. 5 | 6 | More details about the FCE corpus can be found in the following paper: 7 | 8 | Helen Yannakoudakis, Ted Briscoe, and Ben Medlock. 2011. A new dataset and method for automatically grading ESOL texts. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies, pages 180–189. 9 | 10 | The original FCE files are available here: https://ilexir.co.uk/datasets/index.html 11 | The raw dataset is not explicitly split into training, development and test sets, and so we recreated this split based on the error detection version of the dataset available at the same link. 12 | 13 | This version of the public FCE is available in two different formats: JSON and M2. 14 | 15 | -- JSON -- 16 | The JSON format is the raw unprocessed version of the corpus. Each line in a JSON file contains the following fields: 17 | id : A unique id for the essay. 18 | l1 : The first language of the author. 19 | age : The age (or age range) of the author. 20 | q : The question number; each author submitted essay answers to 2 different questions. 21 | answer-s : The score awarded to the essay for this particular question. 22 | script-s : The overall score awarded to the author for both questions they answered. 23 | text : The essay as it was originally written by the author. 24 | edits : A list of all the character level edits made to the text by all annotators, of the form: 25 | [[annotator_id, [[char_start_offset, char_end_offset, correction], ...]], ...]. 
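
As an aside for readers of this dump (not part of the original FCE readme): a minimal Python sketch of how one record and its nested "edits" field, as described above, might be read. The file name fce.train.json is an assumption.

    import json

    # Each line of an FCE JSON file is one essay record (file name assumed).
    with open("fce.train.json") as f:
        record = json.loads(f.readline())

    print(record["id"], record["l1"], record["answer-s"])
    # "edits" is [[annotator_id, [[char_start, char_end, correction], ...]], ...]
    for annotator_id, char_edits in record["edits"]:
        for start, end, correction in char_edits:
            print(annotator_id, start, end, repr(record["text"][start:end]), "->", repr(correction))
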
26 | 27 | -- M2 -- 28 | The M2 format is the processed version of the corpus that we recommend for the BEA2019 shared task. 29 | M2 format has been the standard format for annotated GEC files since the first CoNLL shared task in 2013. 30 | 31 | Since it is not easy to convert character level edits in unprocessed text into token level edits in sentences (cf. https://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-894.pdf), we provide a json_to_m2.py script to convert the raw JSON to M2. This script must be placed inside the main directory of the ERRor ANnotation Toolkit (ERRANT) in order to be used. ERRANT is available here: https://github.com/chrisjbryant/errant 32 | 33 | Each M2 file was thus generated in Python 3.5 using the following command: 34 | 35 | python3 errant/json_to_m2.py -out -gold 36 | 37 | This used spacy v1.9.0 and the en_core_web_sm-1.2.0 model. 38 | 39 | Updates 40 | ---------------------- 41 | 42 | -- v2.0 -- 43 | 44 | * Added new JSON files for the FCE if users want the original data in the same format as the W&I+LOCNESS corpus. 45 | 46 | * All punctuation was normalised in the M2 files. It was otherwise arbitrary whether, for example, different apostrophe styles were corrected or not. 47 | 48 | * Fixed a bug in the character to token edit conversion script. 49 | 50 | * Fixed a bug with correction edits nested inside detection edits that led to them being ignored. 51 | 52 | -- v2.1 -- 53 | 54 | * Updated the json_to_m2.py script to handle multiple annotators. 55 | -------------------------------------------------------------------------------- /tools/scripts/toolbox.py: -------------------------------------------------------------------------------- 1 | from operator import itemgetter 2 | 3 | # Load latest Hunspell dictionaries: 4 | def loadDictionary(path): 5 | return set(open(path).read().split()) 6 | 7 | # Load Stanford Universal Tags map file. 8 | def loadTagMap(path): 9 | map_dict = {} 10 | open_file = open(path).readlines() 11 | for line in open_file: 12 | line = line.strip().split("\t") 13 | # Change ADP to PREP; makes it clearer 14 | if line[1].strip() == "ADP": 15 | map_dict[line[0]] = "PREP" 16 | # Also change PROPN to NOUN; we don't need a prop noun tag 17 | elif line[1].strip() == "PROPN": 18 | map_dict[line[0]] = "NOUN" 19 | else: 20 | map_dict[line[0]] = line[1].strip() 21 | # Add some spacy PTB tags not in the original mapping. 22 | map_dict['""'] = "PUNCT" 23 | map_dict["SP"] = "SPACE" 24 | map_dict["ADD"] = "X" 25 | map_dict["GW"] = "X" 26 | map_dict["NFP"] = "X" 27 | map_dict["XX"] = "X" 28 | return map_dict 29 | 30 | # Input: A sentence + edit block in an m2 file. 31 | # Output 1: The original sentence (a list of tokens) 32 | # Output 2: A dictionary; key is coder id, value is a tuple. 33 | # tuple[0] is the corrected sentence (a list of tokens), tuple[1] is the edits. 34 | # Process M2 to extract sentences and edits. 35 | def processM2(info): 36 | info = info.split("\n") 37 | orig_sent = info[0][2:].split() # [2:] ignore the leading "S " 38 | all_edits = info[1:] 39 | # Simplify the edits and group by coder id. 40 | edit_dict = processEdits(all_edits) 41 | out_dict = {} 42 | # Loop through each coder and their edits. 43 | for coder, edits in edit_dict.items(): 44 | # Copy orig_sent. We will apply the edits to it to make cor_sent 45 | cor_sent = orig_sent[:] 46 | gold_edits = [] 47 | offset = 0 48 | # Sort edits by start and end offset only. If they are the same, do not reorder. 
49 | edits = sorted(edits, key=itemgetter(0)) # Sort by start offset 50 | edits = sorted(edits, key=itemgetter(1)) # Sort by end offset 51 | for edit in edits: 52 | # Do not apply noop or Um edits, but save them 53 | if edit[2] in {"noop", "Um"}: 54 | gold_edits.append(edit+[-1,-1]) 55 | continue 56 | orig_start = edit[0] 57 | orig_end = edit[1] 58 | cor_toks = edit[3].split() 59 | # Apply the edit. 60 | cor_sent[orig_start+offset:orig_end+offset] = cor_toks 61 | # Get the cor token start and end positions in cor_sent 62 | cor_start = orig_start+offset 63 | cor_end = cor_start+len(cor_toks) 64 | # Keep track of how this affects orig edit offsets. 65 | offset = offset-(orig_end-orig_start)+len(cor_toks) 66 | # Save the edit with cor_start and cor_end 67 | gold_edits.append(edit+[cor_start]+[cor_end]) 68 | # Save the cor_sent and gold_edits for each annotator in the out_dict. 69 | out_dict[coder] = (cor_sent, gold_edits) 70 | return orig_sent, out_dict 71 | 72 | # Input: A list of edit lines for a sentence in an m2 file. 73 | # Output: An edit dictionary; key is coder id, value is a list of edits. 74 | def processEdits(edits): 75 | edit_dict = {} 76 | for edit in edits: 77 | edit = edit.split("|||") 78 | span = edit[0][2:].split() # [2:] ignore the leading "A " 79 | start = int(span[0]) 80 | end = int(span[1]) 81 | cat = edit[1] 82 | cor = edit[2] 83 | id = edit[-1] 84 | # Save the useful info as a list 85 | proc_edit = [start, end, cat, cor] 86 | # Save the proc edit inside the edit_dict using coder id. 87 | if id in edit_dict.keys(): 88 | edit_dict[id].append(proc_edit) 89 | else: 90 | edit_dict[id] = [proc_edit] 91 | return edit_dict 92 | 93 | # Input 1: A list of token strings in a sentence. 94 | # Input 2: A preloaded Spacy processing object. 95 | # Annotate tokens with POS, lemma and parse info. 96 | def applySpacy(sent, nlp): 97 | # Convert tokens to spacy tokens and POS tag and parse. 98 | sent = nlp.tokenizer.tokens_from_list(sent) 99 | nlp.tagger(sent) 100 | nlp.parser(sent) 101 | return sent 102 | 103 | # Input 1: An edit list. [orig_start, orig_end, cat, cor, cor_start, cor_end] 104 | # Input 2: An original SpaCy sentence. 105 | # Input 3: A corrected SpaCy sentence. 106 | # Output: A minimised edit with duplicate words on both sides removed. 107 | # E.g. [was eaten -> has eaten] becomes [was -> has] 108 | def minimiseEdit(edit, orig, cor): 109 | # edit = [orig_start, orig_end, cat, cor, cor_start, cor_end] 110 | orig_toks = orig[edit[0]:edit[1]] 111 | cor_toks = cor[edit[4]:edit[5]] 112 | # While the first token is the same string in both (and both are not null) 113 | while orig_toks and cor_toks and orig_toks[0].text == cor_toks[0].text: 114 | # Remove that token from the span, and adjust the start offset. 115 | orig_toks = orig_toks[1:] 116 | cor_toks = cor_toks[1:] 117 | edit[0] += 1 118 | edit[4] += 1 119 | # Then do the same from the last token. 120 | while orig_toks and cor_toks and orig_toks[-1].text == cor_toks[-1].text: 121 | # Remove that token from the span, and adjust the start offset. 122 | orig_toks = orig_toks[:-1] 123 | cor_toks = cor_toks[:-1] 124 | edit[1] -= 1 125 | edit[5] -= 1 126 | # If both sides are not null, save the new correction string. 127 | if orig_toks or cor_toks: 128 | edit[3] = " ".join([tok.text for tok in cor_toks]) 129 | return edit 130 | 131 | # Input 1: An edit list = [orig_start, orig_end, cat, cor, cor_start, cor_end] 132 | # Input 2: A coder id for the specific annotator. 133 | # Output: An edit in m2 file format. 
134 | def formatEdit(edit, coder_id=0): 135 | span = " ".join(["A", str(edit[0]), str(edit[1])]) 136 | return "|||".join([span, edit[2], edit[3], "REQUIRED", "-NONE-", str(coder_id)]) -------------------------------------------------------------------------------- /tools/scripts/align_text.py: -------------------------------------------------------------------------------- 1 | from difflib import SequenceMatcher 2 | from itertools import combinations, groupby 3 | from string import punctuation 4 | import re 5 | import spacy.parts_of_speech as POS 6 | import tools.scripts.rdlextra as DL 7 | 8 | # Some global variables 9 | CONTENT_POS = {POS.ADJ, POS.ADV, POS.NOUN, POS.VERB} 10 | 11 | ### FUNCTIONS ### 12 | 13 | def get_opcodes(alignment): 14 | s_start = 0 15 | s_end = 0 16 | t_start = 0 17 | t_end = 0 18 | opcodes = [] 19 | for op in alignment: 20 | if op[0] == "D": # Deletion 21 | s_end += 1 22 | elif op[0] == "I": # Insertion 23 | t_end += 1 24 | elif op[0].startswith("T"): # Transposition 25 | # Extract number of elements involved (default is 2) 26 | k = int(op[1:] or 2) 27 | s_end += k 28 | t_end += k 29 | else: # Match or substitution 30 | s_end += 1 31 | t_end += 1 32 | # Save 33 | opcodes.append((op, s_start, s_end, t_start, t_end)) 34 | # Start from here 35 | s_start = s_end 36 | t_start = t_end 37 | return opcodes 38 | 39 | def merge_edits(edits): 40 | if edits: 41 | return [("X", edits[0][1], edits[-1][2], edits[0][3], edits[-1][4])] 42 | else: 43 | return edits 44 | 45 | # Input 1: Spacy source sentence 46 | # Input 2: Spacy target sentence 47 | # Input 3: The alignment between the 2; [e.g. M, M, S ,S M] 48 | # Output: A list of processed edits that have been merged or split. 49 | def get_edits(source, target, edits): 50 | out_edits = [] 51 | # Start: Split alignment intro groups of M, T and rest. T has a number after it. 52 | for op, group in groupby(edits, lambda x: x[0][0] if x[0][0] in {"M", "T"} else False): 53 | # Convert the generator to a list 54 | group = list(group) 55 | # Ignore M 56 | if op == "M": continue 57 | # Do not merge T 58 | elif op == "T": out_edits.extend(group) 59 | # Further processing required 60 | else: out_edits.extend(process_edits(source, target, group)) 61 | return out_edits 62 | 63 | # Input 1: Spacy source sentence 64 | # Input 2: Spacy target sentence 65 | # Input 3: A list of non-matching alignments: D, I and/or S 66 | # Output: A list of processed edits that have been merged or split. 67 | def process_edits(source, target, edits): 68 | # Return single alignments 69 | if len(edits) <= 1: return edits 70 | # Get the ops for the whole edit sequence 71 | ops = [op[0] for op in edits] 72 | # Merge ops that are all D xor I. (95% of human multi-token edits contain S). 73 | if set(ops) == {"D"} or set(ops) == {"I"}: return merge_edits(edits) 74 | 75 | content = False # True if edit includes a content word 76 | # Get indices of all combinations of start and end ranges in the edits: 012 -> 01, 02, 12 77 | combos = list(combinations(range(0, len(edits)), 2)) 78 | # Sort them starting with largest spans first 79 | combos.sort(key = lambda x: x[1]-x[0], reverse=True) 80 | # Loop through combos 81 | for start, end in combos: 82 | # Ignore ranges that do NOT contain a substitution. 83 | if "S" not in ops[start:end+1]: continue 84 | # Get the tokens in orig and cor. They will never be empty due to above rule. 
85 | s = source[edits[start][1]:edits[end][2]] 86 | t = target[edits[start][3]:edits[end][4]] 87 | # Possessive suffixes merged with previous token: [friends -> friend 's] 88 | if s[-1].tag_ == "POS" or t[-1].tag_ == "POS": 89 | return process_edits(source, target, edits[:end-1]) + merge_edits(edits[end-1:end+1]) + process_edits(source, target, edits[end+1:]) 90 | # Case changes 91 | if s[-1].lower_ == t[-1].lower_: 92 | # Merge first token I or D of arbitrary length: [Cat -> The big cat] 93 | if start == 0 and ((len(s) == 1 and t[0].text[0].isupper()) or (len(t) == 1 and s[0].text[0].isupper())): 94 | return merge_edits(edits[start:end+1]) + process_edits(source, target, edits[end+1:]) 95 | # Merge with previous punctuation: [, we -> . We], [we -> . We] 96 | if (len(s) > 1 and is_punct(s[-2])) or (len(t) > 1 and is_punct(t[-2])): 97 | return process_edits(source, target, edits[:end-1]) + merge_edits(edits[end-1:end+1]) + process_edits(source, target, edits[end+1:]) 98 | # Whitespace/hyphens: [bestfriend -> best friend], [sub - way -> subway] 99 | s_str = re.sub("['-]", "", "".join([tok.lower_ for tok in s])) 100 | t_str = re.sub("['-]", "", "".join([tok.lower_ for tok in t])) 101 | if s_str == t_str: 102 | return process_edits(source, target, edits[:start]) + merge_edits(edits[start:end+1]) + process_edits(source, target, edits[end+1:]) 103 | # POS-based merging: Same POS or infinitive/phrasal verbs: [to eat -> eating], [watch -> look at] 104 | pos_set = set([tok.pos for tok in s]+[tok.pos for tok in t]) 105 | if (len(pos_set) == 1 and len(s) != len(t)) or pos_set == {POS.PART, POS.VERB}: 106 | return process_edits(source, target, edits[:start]) + merge_edits(edits[start:end+1]) + process_edits(source, target, edits[end+1:]) 107 | # Split rules take effect when we get to smallest chunks 108 | if end-start < 2: 109 | # Split adjacent substitutions 110 | if len(s) == len(t) == 2: 111 | return process_edits(source, target, edits[:start+1]) + process_edits(source, target, edits[start+1:]) 112 | # Similar substitutions at start or end 113 | if (ops[start] == "S" and char_cost(s[0].text, t[0].text) < 0.25) or \ 114 | (ops[end] == "S" and char_cost(s[-1].text, t[-1].text) < 0.25): 115 | return process_edits(source, target, edits[:start+1]) + process_edits(source, target, edits[start+1:]) 116 | # Split final determiners 117 | if end == len(edits)-1 and ((ops[-1] in {"D", "S"} and s[-1].pos == POS.DET) or \ 118 | (ops[-1] in {"I", "S"} and t[-1].pos == POS.DET)): 119 | return process_edits(source, target, edits[:-1]) + [edits[-1]] 120 | # Set content word flag 121 | if not pos_set.isdisjoint(CONTENT_POS): content = True 122 | # If all else fails, merge edits that contain content words 123 | if content: return merge_edits(edits) 124 | else: return edits 125 | 126 | # Is the token a content word? 127 | def is_content(A): 128 | return A.pos in CONTENT_POS 129 | 130 | # Check whether token is punctuation 131 | def is_punct(token): 132 | return token.pos == POS.PUNCT or token.text in punctuation 133 | 134 | # all-split: No edits are ever merged. Everything is 1:1, 1:0 or 0:1 only. 135 | def get_edits_split(edits): 136 | new_edits = [] 137 | for edit in edits: 138 | op = edit[0] 139 | if op != "M": 140 | new_edits.append(edit) 141 | return new_edits 142 | 143 | # all-merge: Merge all adjacent edits of any operation type, except M. 
144 | def get_edits_group_all(edits): 145 | new_edits = [] 146 | for op, group in groupby(edits, lambda x: True if x[0] == "M" else False): 147 | if not op: 148 | new_edits.extend(merge_edits(list(group))) 149 | return new_edits 150 | 151 | # all-equal: Merge all edits of the same operation type. 152 | def get_edits_group_type(edits): 153 | new_edits = [] 154 | for op, group in groupby(edits, lambda x: x[0]): 155 | if op != "M": 156 | new_edits.extend(merge_edits(list(group))) 157 | return new_edits 158 | 159 | # Cost is 0 if lemmas are the same, otherwise 0.499. Maximum S cost is 1.999. 160 | # This prevents unintuitive transpositions. 161 | def lemma_cost(A, B): 162 | if A.lemma == B.lemma: 163 | return 0 164 | else: 165 | return 0.499 166 | 167 | # Cost is 0 if POS are the same, else 0.25 if both are content, else 0.5. 168 | # Content words more likely to align to other content words. 169 | def pos_cost(A, B): 170 | if A.pos == B.pos: 171 | return 0 172 | elif is_content(A) and is_content(B): 173 | return 0.25 174 | else: 175 | return 0.5 176 | 177 | # Calculate the cost of character alignment; i.e. char similarity 178 | def char_cost(A, B): 179 | return 1-SequenceMatcher(None, A, B).ratio() 180 | 181 | # If there is a substitution, calculate the more informative cost. 182 | def token_substitution(A, B, A_extra, B_extra): 183 | # If lower case strings are the same, don't bother checking pos etc. 184 | # This helps catch case marking substitution errors. 185 | if A.lower() == B.lower(): 186 | return 0 187 | cost = lemma_cost(A_extra, B_extra) + pos_cost(A_extra, B_extra) + char_cost(A, B) 188 | return cost 189 | 190 | # Change cost of Transpositions to be the same as Levenshtein. 191 | def levTransposition(a,b,c,d): 192 | return float("inf") 193 | 194 | # Change cost of Substitution to be the same as Levenshtein. 195 | def levSubstitution(a,b,c,d): 196 | return 1 197 | 198 | # Input 1: A Spacy annotated original sentence. 199 | # Input 2: A Spacy annotated corrected sentence. 200 | # Input 3: Command line args. 201 | # Output: A list of lists. Each sublist is an edit of the form: 202 | # edit = [orig_start, orig_end, cat, cor, cor_start, cor_end] 203 | def getAutoAlignedEdits(orig, cor, args): 204 | # Get a list of strings from the spacy objects. 205 | orig_toks = [tok.text for tok in orig] 206 | cor_toks = [tok.text for tok in cor] 207 | # Align using Levenshtein. 208 | if args.lev: alignments = DL.WagnerFischer(orig_toks, cor_toks, orig, cor, substitution=levSubstitution, transposition=levTransposition) 209 | # Otherwise, use linguistically enhanced Damerau-Levenshtein 210 | else: alignments = DL.WagnerFischer(orig_toks, cor_toks, orig, cor, substitution=token_substitution) 211 | # Get the alignment with the highest score. There is usually only 1 best in DL due to custom costs. 212 | alignment = next(alignments.alignments(True)) # True uses Depth-first search. 213 | # Convert the alignment into edits; choose merge strategy 214 | if args.merge == "rules": edits = get_edits(orig, cor, get_opcodes(alignment)) 215 | elif args.merge == "all-split": edits = get_edits_split(get_opcodes(alignment)) 216 | elif args.merge == "all-merge": edits = get_edits_group_all(get_opcodes(alignment)) 217 | elif args.merge == "all-equal": edits = get_edits_group_type(get_opcodes(alignment)) 218 | proc_edits = [] 219 | for edit in edits: 220 | orig_start = edit[1] 221 | orig_end = edit[2] 222 | cat = "NA" # Auto edits do not have human types. 
223 | cor_start = edit[3] 224 | cor_end = edit[4] 225 | cor_str = " ".join(cor_toks[cor_start:cor_end]) 226 | proc_edits.append([orig_start, orig_end, cat, cor_str, cor_start, cor_end]) 227 | return proc_edits 228 | -------------------------------------------------------------------------------- /tools/scripts/rdlextra.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016 Mariano Felice and Christopher Bryant 2 | # 3 | # This file contains an implementation of the Damerau-Levenshtein 4 | # algorithm (restricted edit distance version) to align two sentences, 5 | # as described in the following paper: 6 | # 7 | # Mariano Felice, Christopher Bryant and Ted Briscoe. 2016. 8 | # Automatic extraction of learner errors in ESL sentences using 9 | # linguistically enhanced alignments. In Proceedings of the 26th 10 | # International Conference on Computational Linguistics (COLING 2016), 11 | # pp. 825-835, Osaka, Japan. Japanese Association for Natural Language 12 | # Processing. 13 | # 14 | # Please, cite this paper when using this script in your work. 15 | # 16 | # This code is based on an original implementation of the Wagner-Fischer 17 | # algorithm by Kyle Gorman, available at: https://gist.github.com/kylebgorman/8034009 18 | # The original license and description are included below. 19 | # 20 | # This implementation adds support for token transpositions of arbitrary 21 | # length, e.g. A B C --> B C A. 22 | # 23 | # ORIGINAL LICENSE: 24 | # 25 | # Copyright (c) 2013-2016 Kyle Gorman 26 | # 27 | # Permission is hereby granted, free of charge, to any person obtaining a 28 | # copy of this software and associated documentation files (the 29 | # "Software"), to deal in the Software without restriction, including 30 | # without limitation the rights to use, copy, modify, merge, publish, 31 | # distribute, sublicense, and/or sell copies of the Software, and to 32 | # permit persons to whom the Software is furnished to do so, subject to 33 | # the following conditions: 34 | # 35 | # The above copyright notice and this permission notice shall be included 36 | # in all copies or substantial portions of the Software. 37 | # 38 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 39 | # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 40 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 41 | # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 42 | # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 43 | # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 44 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 45 | # 46 | # wagnerfischer.py: efficient computation of Levenshtein distance and 47 | # all optimal alignments with arbitrary edit costs. The algorithm for 48 | # computing the dynamic programming table used has been discovered many 49 | # times, but is described most clearly in: 50 | # 51 | # R.A. Wagner & M.J. Fischer. 1974. The string-to-string correction 52 | # problem. Journal of the ACM, 21(1): 168-173. 53 | # 54 | # Wagner & Fischer also describe an algorithm ("Algorithm Y") to find the 55 | # alignment path (i.e., list of edit operations involved in the optimal 56 | # alignment), but it it is specified such that in fact it only generates 57 | # one such path, whereas many such paths may exist, particularly when 58 | # multiple edit operations have the same cost. 
For example, when all edit 59 | # operations have the same cost, there are two equal-cost alignments of 60 | # "TGAC" and "GCAC": 61 | # 62 | # TGAC TGxAC 63 | # ss== d=i== 64 | # GCAC xGCAC 65 | # 66 | # However, all such paths can be generated efficiently, as follows. First, 67 | # the dynamic programming table "cells" are defined as tuples of (partial 68 | # cost, set of all operations reaching this cell with minimal cost). As a 69 | # result, the completed table can be thought of as an unweighted, directed 70 | # graph (or FSA). The bottom right cell (the one containing the Levenshtein 71 | # distance) is the start state and the origin as end state. The set of arcs 72 | # are the set of operations in each cell as arcs. (Many of the cells of the 73 | # table, those which are not visited by any optimal alignment, are under 74 | # the graph interpretation unconnected vertices, and can be ignored. Every 75 | # path between the bottom right cell and the origin cell is an optimal 76 | # alignment. These paths can be efficiently enumerated using breadth-first 77 | # traversal. The trick here is that elements in deque must not only contain 78 | # indices but also partial paths. Averaging over all such paths, we can 79 | # come up with an estimate of the number of insertions, deletions, and 80 | # substitutions involved as well; in the example above, we say S = 1 and 81 | # D, I = 0.5. 82 | # 83 | # Thanks to Christoph Weidemann (ctw@cogsci.info), who added support for 84 | # arbitrary cost functions. 85 | 86 | 87 | import collections 88 | import doctest 89 | import pprint 90 | 91 | 92 | # Default cost functions. 93 | 94 | def INSERTION(A, A_extra=None, cost=1): 95 | return cost 96 | 97 | def DELETION(A, A_extra=None, cost=1): 98 | return cost 99 | 100 | def SUBSTITUTION(A, B, A_extra=None, B_extra=None, cost=1): 101 | return cost 102 | 103 | def TRANSPOSITION(A, B, A_extra=None, B_extra=None): 104 | # Change to cost=float('inf') to have standard edit distance by default 105 | # A and B should be the same length 106 | cost = len(A) - 1 # or len(B) -1 107 | return cost 108 | 109 | Trace = collections.namedtuple("Trace", ["cost", "ops"]) 110 | 111 | class WagnerFischer(object): 112 | 113 | """ 114 | An object representing a (set of) Levenshtein alignments between two 115 | iterable objects (they need not be strings). The cost of the optimal 116 | alignment is scored in `self.cost`, and all Levenshtein alignments can 117 | be generated using self.alignments()`. 118 | 119 | Basic tests: 120 | 121 | >>> WagnerFischer("god", "gawd").cost 122 | 2 123 | >>> WagnerFischer("sitting", "kitten").cost 124 | 3 125 | >>> WagnerFischer("bana", "banananana").cost 126 | 6 127 | >>> WagnerFischer("bana", "bana").cost 128 | 0 129 | >>> WagnerFischer("banana", "angioplastical").cost 130 | 11 131 | >>> WagnerFischer("angioplastical", "banana").cost 132 | 11 133 | >>> WagnerFischer("Saturday", "Sunday").cost 134 | 3 135 | 136 | IDS tests: 137 | 138 | >>> WagnerFischer("doytauvab", "doyvautab").IDS() == {"S": 2.0} 139 | True 140 | >>> WagnerFischer("kitten", "sitting").IDS() == {"I": 1.0, "S": 2.0} 141 | True 142 | 143 | Detect insertion vs. deletion: 144 | 145 | >>> thesmalldog = "the small dog".split() 146 | >>> thebigdog = "the big dog".split() 147 | >>> bigdog = "big dog".split() 148 | >>> sub_inf = lambda A, B: float("inf") 149 | 150 | # Deletion. 151 | >>> wf = WagnerFischer(thebigdog, bigdog, substitution=sub_inf) 152 | >>> wf.IDS() == {"D": 1.0} 153 | True 154 | 155 | # Insertion. 
156 | >>> wf = WagnerFischer(bigdog, thebigdog, substitution=sub_inf) 157 | >>> wf.IDS() == {"I": 1.0} 158 | True 159 | 160 | # Neither. 161 | >>> wf = WagnerFischer(thebigdog, thesmalldog, substitution=sub_inf) 162 | >>> wf.IDS() == {"I": 1.0, "D": 1.0} 163 | True 164 | """ 165 | 166 | # Initializes pretty printer (shared across all class instances). 167 | pprinter = pprint.PrettyPrinter(width=75) 168 | 169 | def __init__(self, A, B, A_extra=None, B_extra=None, insertion=INSERTION, deletion=DELETION, 170 | substitution=SUBSTITUTION, transposition=TRANSPOSITION): 171 | # Stores cost functions in a dictionary for programmatic access. 172 | self.costs = {"I": insertion, "D": deletion, "S": substitution, "T":transposition} 173 | # Keep lowercased versions for transpositions 174 | Al = [x.lower() for x in A] 175 | Bl = [x.lower() for x in B] 176 | # Initializes table. 177 | self.asz = len(A) 178 | self.bsz = len(B) 179 | self._table = [[None for _ in range(self.bsz + 1)] for 180 | _ in range(self.asz + 1)] 181 | # From now on, all indexing done using self.__getitem__. 182 | ## Fills in edges. 183 | self[0][0] = Trace(0, {"O"}) # Start cell. 184 | for i in range(1, self.asz + 1): 185 | self[i][0] = Trace(self[i - 1][0].cost + self.costs["D"](A[i - 1], A_extra[i - 1] if A_extra else None), 186 | {"D"}) 187 | for j in range(1, self.bsz + 1): 188 | self[0][j] = Trace(self[0][j - 1].cost + self.costs["I"](B[j - 1], B_extra[j - 1] if B_extra else None), 189 | {"I"}) 190 | 191 | ## Fills in rest. 192 | for i in range(len(A)): 193 | for j in range(len(B)): 194 | # Cleans it up in case there are more than one check for match 195 | # first, as it is always the cheapest option. 196 | if A[i] == B[j]: 197 | self[i + 1][j + 1] = Trace(self[i][j].cost, {"M"}) 198 | # Checks for other types. 199 | else: 200 | costD = self[i][j + 1].cost + self.costs["D"](A[i], A_extra[i] if A_extra else None) 201 | costI = self[i + 1][j].cost + self.costs["I"](B[j], B_extra[j] if B_extra else None) 202 | costS = self[i][j].cost + self.costs["S"](A[i], B[j], A_extra[i] if A_extra else None, B_extra[j] if B_extra else None) 203 | costT = float("inf") # We don't know it yet 204 | min_val = min(costI, costD, costS) 205 | 206 | # Multiword transpositions: 207 | # Find a sequence of equal elements in different order 208 | # We only need to check diagonally because we require the same number of elements 209 | k = 1 210 | #while i > 0 and j > 0 and (i - k) >= 0 and (j - k) >= 0 and any(x in ["D", "I", "S"] for x in self[i-k+1][j-k+1].ops): 211 | while i > 0 and j > 0 and (i - k) >= 0 and (j - k) >= 0 and self[i-k+1][j-k+1].cost - self[i-k][j-k].cost > 0: # An operation that has a cost (i.e. I, D or S > 0) 212 | if collections.Counter(Al[i-k:i+1]) == collections.Counter(Bl[j-k:j+1]): 213 | costT = self[i-k][j-k].cost + self.costs["T"](A[i-k:i+1], B[j-k:j+1], A_extra[i-k:i+1] if A_extra else None, B_extra[j-k:j+1] if B_extra else None) 214 | min_val = min(min_val, costT) 215 | break 216 | k += 1 217 | 218 | trace = Trace(min_val, []) # Use a list to preserve the order 219 | # Adds _all_ operations matching minimum value. 220 | if costD == min_val: 221 | trace.ops.append("D") 222 | if costI == min_val: 223 | trace.ops.append("I") 224 | if costS == min_val: 225 | trace.ops.append("S") 226 | if costT == min_val: 227 | trace.ops.append("T" + str(k+1)) 228 | self[i + 1][j + 1] = trace 229 | 230 | # Stores optimum cost as a property. 
231 | self.cost = self[-1][-1].cost 232 | 233 | def __repr__(self): 234 | return self.pprinter.pformat(self._table) 235 | 236 | def __iter__(self): 237 | for row in self._table: 238 | yield row 239 | 240 | def __getitem__(self, i): 241 | """ 242 | Returns the i-th row of the table, which is a list and so 243 | can be indexed. Therefore, e.g., self[2][3] == self._table[2][3] 244 | """ 245 | return self._table[i] 246 | 247 | # Stuff for generating alignments. 248 | 249 | def _stepback(self, i, j, trace, path_back): 250 | """ 251 | Given a cell location (i, j) and a Trace object trace, generate 252 | all traces they point back to in the table 253 | """ 254 | for op in trace.ops: 255 | if op == "M": 256 | yield i - 1, j - 1, self[i - 1][j - 1], path_back + ["M"] 257 | elif op == "I": 258 | yield i, j - 1, self[i][j - 1], path_back + ["I"] 259 | elif op == "D": 260 | yield i - 1, j, self[i - 1][j], path_back + ["D"] 261 | elif op == "S": 262 | yield i - 1, j - 1, self[i - 1][j - 1], path_back + ["S"] 263 | elif op.startswith("T"): 264 | # Extract stepback (default is a transposition of 2 elements) 265 | k = int(op[1:] or 2) 266 | yield i - k, j - k, self[i - k][j - k], path_back + [op] 267 | elif op == "O": 268 | return # Origin cell, so we're done. 269 | else: 270 | raise ValueError("Unknown op {!r}".format(op)) 271 | 272 | def alignments(self, dfirst=False): 273 | """ 274 | Generate all alignments with optimal cost by traversing an 275 | implicit graph on the dynamic programming table. Use 276 | breadth-first traversal by default. 277 | """ 278 | # Each cell of the queue is a tuple of (i, j, trace, path_back) 279 | # where i, j is the current index, trace is the trace object at 280 | # this cell 281 | if dfirst: 282 | return self._dfirst_alignments() 283 | else: 284 | return self._bfirst_alignments() 285 | 286 | def _dfirst_alignments(self): 287 | """ 288 | Generate alignments via depth-first traversal. 289 | """ 290 | stack = list(self._stepback(self.asz, self.bsz, self[-1][-1], [])) 291 | while stack: 292 | (i, j, trace, path_back) = stack.pop() 293 | if trace.ops == {"O"}: 294 | yield path_back[::-1] 295 | continue 296 | stack.extend(self._stepback(i, j, trace, path_back)) 297 | 298 | def _bfirst_alignments(self): 299 | """ 300 | Generate alignments via breadth-first traversal. 301 | """ 302 | # Each cell of the queue is a tuple of (i, j, trace, path_back) 303 | # where i, j is the current index, trace is the trace object at 304 | # this cell, and path_back is a reversed list of edit operations 305 | # which is initialized as an empty list. 306 | queue = collections.deque(self._stepback(self.asz, self.bsz, 307 | self[-1][-1], [])) 308 | while queue: 309 | (i, j, trace, path_back) = queue.popleft() 310 | if trace.ops == {"O"}: 311 | # We have reached the origin, the end of a reverse path, so 312 | # yield the list of edit operations in reverse. 313 | yield path_back[::-1] 314 | continue 315 | queue.extend(self._stepback(i, j, trace, path_back)) 316 | 317 | def IDS(self): 318 | """ 319 | Estimates insertions, deletions, and substitution _count_ (not 320 | costs). Non-integer values arise when there are multiple possible 321 | alignments with the same cost. 322 | """ 323 | npaths = 0 324 | opcounts = collections.Counter() 325 | for alignment in self.alignments(): 326 | # Counts edit types for this path, ignoring "M" (which is free). 327 | opcounts += collections.Counter(op for op in alignment if op != "M") 328 | npaths += 1 329 | # Averages over all paths. 
330 | return collections.Counter({o: c / npaths for (o, c) in 331 | opcounts.items()}) 332 | 333 | 334 | if __name__ == "__main__": 335 | #doctest.testmod() 336 | a = raw_input("A: ").split() 337 | b = raw_input("B: ").split() 338 | al = WagnerFischer(a, b).alignments() 339 | for a in al: 340 | print(a) 341 | 342 | -------------------------------------------------------------------------------- /tools/scripts/cat_rules.py: -------------------------------------------------------------------------------- 1 | from difflib import SequenceMatcher 2 | from string import punctuation 3 | import spacy.parts_of_speech as spos 4 | 5 | # Contractions 6 | conts = {"'d", "'ll", "'m", "n't", "'re", "'s", "'ve"} 7 | # Rare POS tags that make uninformative error categories 8 | rare_tags = {"INTJ", "NUM", "SYM", "X"} 9 | # Special auxiliaries in contractions. 10 | special_aux1 = ({"ca", "can"}, {"sha", "shall"}, {"wo", "will"}) 11 | special_aux2 = {"ca", "sha", "wo"} 12 | # Open class spacy POS tag objects 13 | open_pos = (spos.ADJ, spos.ADV, spos.NOUN, spos.VERB) 14 | # Open class POS tags 15 | open_tags = {"ADJ", "ADV", "NOUN", "VERB"} 16 | # Some dep labels that map to pos tags. 17 | dep_map = { "acomp": "ADJ", 18 | "amod": "ADJ", 19 | "advmod": "ADV", 20 | "det": "DET", 21 | "prep": "PREP", 22 | "prt": "PART", 23 | "punct": "PUNCT" } 24 | 25 | # Input 1: An edit list. [orig_start, orig_end, cat, cor, cor_start, cor_end] 26 | # Input 2: An original SpaCy sentence. 27 | # Input 3: A corrected SpaCy sentence. 28 | # Input 4: A set of valid GB English words. 29 | # Input 5: A dictionary to map PTB tags to Stanford Universal Dependency tags. 30 | # Input 6: A preloaded spacy processing object. 31 | # Input 7: The Lancaster stemmer in NLTK. 32 | # Output: The input edit with new error tag, in M2 edit format. 33 | def autoTypeEdit(edit, orig_sent, cor_sent, gb_spell, tag_map, nlp, stemmer): 34 | # Get the tokens in the edit. 35 | orig_toks = orig_sent[edit[0]:edit[1]] 36 | cor_toks = cor_sent[edit[4]:edit[5]] 37 | # Nothing to nothing is a detected, but not corrected edit. 38 | if not orig_toks and not cor_toks: 39 | return "UNK" 40 | # Missing 41 | elif not orig_toks and cor_toks: 42 | op = "M:" 43 | cat = getOneSidedType(cor_toks, tag_map) 44 | # Unnecessary 45 | elif orig_toks and not cor_toks: 46 | op = "U:" 47 | cat = getOneSidedType(orig_toks, tag_map) 48 | # Replacement and special cases 49 | else: 50 | # Same to same is a detected, but not corrected edit. 51 | if orig_toks.text == cor_toks.text: 52 | return "UNK" 53 | # Special: Orthographic errors at the end of multi-token edits are ignored. 54 | # E.g. [Doctor -> The doctor], [The doctor -> Dcotor], [, since -> . Since] 55 | # Classify the edit as if the last token weren't there. 56 | elif orig_toks[-1].lower_ == cor_toks[-1].lower_ and \ 57 | (len(orig_toks) > 1 or len(cor_toks) > 1): 58 | min_edit = edit[:] 59 | min_edit[1] -= 1 60 | min_edit[5] -= 1 61 | return autoTypeEdit(min_edit, orig_sent, cor_sent, gb_spell, tag_map, nlp, stemmer) 62 | # Replacement 63 | else: 64 | op = "R:" 65 | cat = getTwoSidedType(orig_toks, cor_toks, gb_spell, tag_map, nlp, stemmer) 66 | return op+cat 67 | 68 | # Input 1: Spacy tokens 69 | # Input 2: A map dict from PTB to universal dependency pos tags. 70 | # Output: A list of token, pos and dep tag strings. 
71 | def getEditInfo(toks, tag_map): 72 | str = [] 73 | pos = [] 74 | dep = [] 75 | for tok in toks: 76 | str.append(tok.text) 77 | pos.append(tag_map[tok.tag_]) 78 | dep.append(tok.dep_) 79 | return str, pos, dep 80 | 81 | # Input 1: Spacy tokens. 82 | # Input 2: A map dict from PTB to universal dependency pos tags. 83 | # Output: An error type string. 84 | # When one side of the edit is null, we can only use the other side. 85 | def getOneSidedType(toks, tag_map): 86 | # Extract strings, pos tags and parse info from the toks. 87 | str_list, pos_list, dep_list = getEditInfo(toks, tag_map) 88 | 89 | # Special cases. 90 | if len(toks) == 1: 91 | # Possessive noun suffixes; e.g. ' -> 's 92 | if toks[0].tag_ == "POS": 93 | return "NOUN:POSS" 94 | # Contraction. Rule must come after possessive. 95 | if toks[0].lower_ in conts: 96 | return "CONTR" 97 | # Infinitival "to" is treated as part of a verb form. 98 | if toks[0].lower_ == "to" and toks[0].pos_ == "PART" and toks[0].dep_ != "prep": 99 | return "VERB:FORM" 100 | # Auxiliary verbs. 101 | if set(dep_list).issubset({"aux", "auxpass"}): 102 | return "VERB:TENSE" 103 | # POS-based tags. Ignores rare, uninformative categories. 104 | if len(set(pos_list)) == 1 and pos_list[0] not in rare_tags: 105 | return pos_list[0] 106 | # More POS-based tags using special dependency labels. 107 | if len(set(dep_list)) == 1 and dep_list[0] in dep_map.keys(): 108 | return dep_map[dep_list[0]] 109 | # To-infinitives and phrasal verbs. 110 | if set(pos_list) == {"PART", "VERB"}: 111 | return "VERB" 112 | # Tricky cases 113 | else: 114 | return "OTHER" 115 | 116 | # Input 1: Original text spacy tokens. 117 | # Input 2: Corrected text spacy tokens. 118 | # Input 3: A set of valid GB English words. 119 | # Input 4: A map from PTB to universal dependency pos tags. 120 | # Input 5: A preloaded spacy processing object. 121 | # Input 6: The Lancaster stemmer in NLTK. 122 | # Output: An error type string. 123 | def getTwoSidedType(orig_toks, cor_toks, gb_spell, tag_map, nlp, stemmer): 124 | # Extract strings, pos tags and parse info from the toks. 125 | orig_str, orig_pos, orig_dep = getEditInfo(orig_toks, tag_map) 126 | cor_str, cor_pos, cor_dep = getEditInfo(cor_toks, tag_map) 127 | 128 | # Orthography; i.e. whitespace and/or case errors. 129 | if onlyOrthChange(orig_str, cor_str): 130 | return "ORTH" 131 | # Word Order; only matches exact reordering. 132 | if exactReordering(orig_str, cor_str): 133 | return "WO" 134 | 135 | # 1:1 replacements (very common) 136 | if len(orig_str) == len(cor_str) == 1: 137 | # 1. SPECIAL CASES 138 | # Possessive noun suffixes; e.g. ' -> 's 139 | if orig_toks[0].tag_ == "POS" or cor_toks[0].tag_ == "POS": 140 | return "NOUN:POSS" 141 | # Contraction. Rule must come after possessive. 142 | if (orig_str[0].lower() in conts or cor_str[0].lower() in conts) and orig_pos == cor_pos: 143 | return "CONTR" 144 | # Special auxiliaries in contractions (1); e.g. ca -> can 145 | if set(orig_str[0].lower()+cor_str[0].lower()) in special_aux1: 146 | return "CONTR" 147 | # Special auxiliaries in contractions (2); e.g. ca -> could 148 | if orig_str[0].lower() in special_aux2 or cor_str[0].lower() in special_aux2: 149 | return "VERB:TENSE" 150 | # Special: "was" and "were" are the only past tense SVA. 151 | if {orig_str[0].lower(), cor_str[0].lower()} == {"was", "were"}: 152 | return "VERB:SVA" 153 | 154 | # 2. SPELLING AND INFLECTION 155 | # Only check alphabetical strings on the original side. 
156 | # Spelling errors take precendece over POS errors so this rule is ordered. 157 | if orig_str[0].isalpha(): 158 | # Check a GB English dict for both orig and lower case. 159 | # "cat" is in the dict, but "Cat" is not. 160 | if orig_str[0] not in gb_spell and orig_str[0].lower() not in gb_spell: 161 | # Check if both sides have a common lemma 162 | if sameLemma(orig_toks[0], cor_toks[0], nlp): 163 | # Inflection; Usually count vs mass nouns or e.g. got vs getted 164 | if orig_pos == cor_pos and orig_pos[0] in {"NOUN", "VERB"}: 165 | return orig_pos[0]+":INFL" 166 | # Unknown morphology; i.e. we cannot be more specific. 167 | else: 168 | return "MORPH" 169 | # Use string similarity to detect true spelling errors. 170 | else: 171 | char_ratio = SequenceMatcher(None, orig_str[0], cor_str[0]).ratio() 172 | # Ratio > 0.5 means both side share at least half the same chars. 173 | # WARNING: THIS IS AN APPROXIMATION. 174 | if char_ratio > 0.5: 175 | return "SPELL" 176 | # If ratio is <= 0.5, this may be a spelling+other error; e.g. tolk -> say 177 | else: 178 | # If POS is the same, this takes precedence over spelling. 179 | if orig_pos == cor_pos and orig_pos[0] not in rare_tags: 180 | return orig_pos[0] 181 | # Tricky cases. 182 | else: 183 | return "OTHER" 184 | 185 | # 3. MORPHOLOGY 186 | # Only ADJ, ADV, NOUN and VERB with same lemma can have inflectional changes. 187 | if sameLemma(orig_toks[0], cor_toks[0], nlp) and \ 188 | orig_pos[0] in open_tags and cor_pos[0] in open_tags: 189 | # Same POS on both sides 190 | if orig_pos == cor_pos: 191 | # Adjective form; e.g. comparatives 192 | if orig_pos[0] == "ADJ": 193 | return "ADJ:FORM" 194 | # Noun number 195 | if orig_pos[0] == "NOUN": 196 | return "NOUN:NUM" 197 | # Verbs - various types 198 | if orig_pos[0] == "VERB": 199 | # NOTE: These rules are carefully ordered. 200 | # Use the dep parse to find some form errors. 201 | # Main verbs preceded by aux cannot be tense or SVA. 202 | if precededByAux(orig_toks, cor_toks): 203 | return "VERB:FORM" 204 | # Use fine PTB tags to find various errors. 205 | # FORM errors normally involve VBG or VBN. 206 | if orig_toks[0].tag_ in {"VBG", "VBN"} or cor_toks[0].tag_ in {"VBG", "VBN"}: 207 | return "VERB:FORM" 208 | # Of what's left, TENSE errors normally involved VBD. 209 | if orig_toks[0].tag_ == "VBD" or cor_toks[0].tag_ == "VBD": 210 | return "VERB:TENSE" 211 | # Of what's left, SVA errors normally involve VBZ. 212 | if orig_toks[0].tag_ == "VBZ" or cor_toks[0].tag_ == "VBZ": 213 | return "VERB:SVA" 214 | # Any remaining aux verbs are called TENSE. 215 | if orig_dep[0].startswith("aux") and cor_dep[0].startswith("aux"): 216 | return "VERB:TENSE" 217 | # Use dep labels to find some more ADJ:FORM 218 | if set(orig_dep+cor_dep).issubset({"acomp", "amod"}): 219 | return "ADJ:FORM" 220 | # Adj to plural noun is usually a noun number error; e.g. musical -> musicals. 221 | if orig_pos[0] == "ADJ" and cor_toks[0].tag_ == "NNS": 222 | return "NOUN:NUM" 223 | # For remaining verb errors (rare), rely on cor_pos 224 | if cor_toks[0].tag_ in {"VBG", "VBN"}: 225 | return "VERB:FORM" 226 | # Cor VBD = TENSE 227 | if cor_toks[0].tag_ == "VBD": 228 | return "VERB:TENSE" 229 | # Cor VBZ = SVA 230 | if cor_toks[0].tag_ == "VBZ": 231 | return "VERB:SVA" 232 | # Tricky cases that all have the same lemma. 233 | else: 234 | return "MORPH" 235 | # Derivational morphology. 
236 | if stemmer.stem(orig_str[0]) == stemmer.stem(cor_str[0]) and \ 237 | orig_pos[0] in open_tags and cor_pos[0] in open_tags: 238 | return "MORPH" 239 | 240 | # 4. GENERAL 241 | # Auxiliaries with different lemmas 242 | if orig_dep[0].startswith("aux") and cor_dep[0].startswith("aux"): 243 | return "VERB:TENSE" 244 | # POS-based tags. Some of these are context-sensitive misspellings. 245 | if orig_pos == cor_pos and orig_pos[0] not in rare_tags: 246 | return orig_pos[0] 247 | # Some dep labels map to POS-based tags. 248 | if orig_dep == cor_dep and orig_dep[0] in dep_map.keys(): 249 | return dep_map[orig_dep[0]] 250 | # Phrasal verb particles. 251 | if set(orig_pos+cor_pos) == {"PART", "PREP"} or set(orig_dep+cor_dep) == {"prt", "prep"}: 252 | return "PART" 253 | # Can use dep labels to resolve DET + PRON combinations. 254 | if set(orig_pos+cor_pos) == {"DET", "PRON"}: 255 | # DET cannot be a subject or object. 256 | if cor_dep[0] in {"nsubj", "nsubjpass", "dobj", "pobj"}: 257 | return "PRON" 258 | # "poss" indicates possessive determiner 259 | if cor_dep[0] == "poss": 260 | return "DET" 261 | # Tricky cases. 262 | else: 263 | return "OTHER" 264 | 265 | # Multi-token replacements (uncommon) 266 | # All auxiliaries 267 | if set(orig_dep+cor_dep).issubset({"aux", "auxpass"}): 268 | return "VERB:TENSE" 269 | # All same POS 270 | if len(set(orig_pos+cor_pos)) == 1: 271 | # Final verbs with the same lemma are tense; e.g. eat -> has eaten 272 | if orig_pos[0] == "VERB" and sameLemma(orig_toks[-1], cor_toks[-1], nlp): 273 | return "VERB:TENSE" 274 | # POS-based tags. 275 | elif orig_pos[0] not in rare_tags: 276 | return orig_pos[0] 277 | # All same special dep labels. 278 | if len(set(orig_dep+cor_dep)) == 1 and orig_dep[0] in dep_map.keys(): 279 | return dep_map[orig_dep[0]] 280 | # Infinitives, gerunds, phrasal verbs. 281 | if set(orig_pos+cor_pos) == {"PART", "VERB"}: 282 | # Final verbs with the same lemma are form; e.g. to eat -> eating 283 | if sameLemma(orig_toks[-1], cor_toks[-1], nlp): 284 | return "VERB:FORM" 285 | # Remaining edits are often verb; e.g. to eat -> consuming, look at -> see 286 | else: 287 | return "VERB" 288 | # Possessive nouns; e.g. friends -> friend 's 289 | if (orig_pos == ["NOUN", "PART"] or cor_pos == ["NOUN", "PART"]) and \ 290 | sameLemma(orig_toks[0], cor_toks[0], nlp): 291 | return "NOUN:POSS" 292 | # Adjective forms with "most" and "more"; e.g. more free -> freer 293 | if (orig_str[0].lower() in {"most", "more"} or cor_str[0].lower() in {"most", "more"}) and \ 294 | sameLemma(orig_toks[-1], cor_toks[-1], nlp) and len(orig_str) <= 2 and len(cor_str) <= 2: 295 | return "ADJ:FORM" 296 | 297 | # Tricky cases. 298 | else: 299 | return "OTHER" 300 | 301 | # Input 1: A list of original token strings 302 | # Input 2: A list of corrected token strings 303 | # Output: Boolean; the difference between the inputs is only whitespace or case. 304 | def onlyOrthChange(orig_str, cor_str): 305 | orig_join = "".join(orig_str).lower() 306 | cor_join = "".join(cor_str).lower() 307 | if orig_join == cor_join: 308 | return True 309 | return False 310 | 311 | # Input 1: A list of original token strings 312 | # Input 2: A list of corrected token strings 313 | # Output: Boolean; the tokens are exactly the same but in a different order. 314 | def exactReordering(orig_str, cor_str): 315 | # Sorting lets us keep duplicates.
316 | orig_set = sorted([tok.lower() for tok in orig_str]) 317 | cor_set = sorted([tok.lower() for tok in cor_str]) 318 | if orig_set == cor_set: 319 | return True 320 | return False 321 | 322 | # Input 1: An original text spacy token. 323 | # Input 2: A corrected text spacy token. 324 | # Input 3: A spaCy processing object. 325 | # Output: Boolean; the tokens have the same lemma. 326 | # Spacy only finds lemma for its predicted POS tag. Sometimes these are wrong, 327 | # so we also consider alternative POS tags to improve the chance of a match. 328 | def sameLemma(orig_tok, cor_tok, nlp): 329 | orig_lemmas = [] 330 | cor_lemmas = [] 331 | for pos in open_pos: 332 | # Pass the lower cased form of the word for lemmatization; improves accuracy. 333 | orig_lemmas.append(nlp.vocab.morphology.lemmatize(pos, orig_tok.lower, nlp.vocab.morphology.tag_map)) 334 | cor_lemmas.append(nlp.vocab.morphology.lemmatize(pos, cor_tok.lower, nlp.vocab.morphology.tag_map)) 335 | if set(orig_lemmas).intersection(set(cor_lemmas)): 336 | return True 337 | return False 338 | 339 | # Input 1: Original text spacy tokens. 340 | # Input 2: Corrected text spacy tokens. 341 | # Output: Boolean; both tokens have a dependent auxiliary verb. 342 | def precededByAux(orig_tok, cor_tok): 343 | # If the toks are aux, we need to check if they are the first aux. 344 | if orig_tok[0].dep_.startswith("aux") and cor_tok[0].dep_.startswith("aux"): 345 | # Find the parent verb 346 | orig_head = orig_tok[0].head 347 | cor_head = cor_tok[0].head 348 | # Find the children of the parent 349 | orig_children = orig_head.children 350 | cor_children = cor_head.children 351 | # Check the orig children. 352 | for orig_child in orig_children: 353 | # Look at the first aux... 354 | if orig_child.dep_.startswith("aux"): 355 | # Check if the string matches orig_tok 356 | if orig_child.text != orig_tok[0].text: 357 | # If it doesn't, orig_tok is not the first aux so check the cor children 358 | for cor_child in cor_children: 359 | # Find the first aux in cor... 360 | if cor_child.dep_.startswith("aux"): 361 | # If that doesn't match cor_tok either, then cor_tok also isn't the first aux. 362 | if cor_child.text != cor_tok[0].text: 363 | # Therefore, neither orig nor cor is the first aux. 364 | return True 365 | # Break after the first cor aux 366 | break 367 | # Break after the first orig aux. 368 | break 369 | # Otherwise, the toks are main verbs so we need to look for any aux. 370 | else: 371 | orig_deps = [orig_dep.dep_ for orig_dep in orig_tok[0].children] 372 | cor_deps = [cor_dep.dep_ for cor_dep in cor_tok[0].children] 373 | if "aux" in orig_deps or "auxpass" in orig_deps: 374 | if "aux" in cor_deps or "auxpass" in cor_deps: 375 | return True 376 | return False -------------------------------------------------------------------------------- /json2pair.py: -------------------------------------------------------------------------------- 1 | 2 | # This code is adapted from the FCE json_to_m2.py script. 3 | # Make original paragraph <---> corrected paragraph pairs for training and testing.
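For reference, main() below expects one JSON object per line, with a "text" field holding the essay and an "edits" field holding per-annotator character edits. A hedged sketch of that shape, following how main() consumes it (the sentence and offsets are invented for illustration):

    # One line of the --input_json file, as consumed by main():
    # line["text"] is the essay text and line["edits"] is a list of
    # [coder, edits] pairs, where each edit is [start, end, cor] in
    # character offsets and cor is None for detection-only edits.
    example_line = {
        "text": "I likes apples.\nThey is tasty.",
        "edits": [
            [0, [[2, 7, "like"], [21, 23, "are"]]],   # annotator 0 and its edits
        ],
    }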
4 | 5 | import json 6 | import tools.scripts.align_text as align_text 7 | import tools.scripts.cat_rules as cat_rules 8 | import tools.scripts.toolbox as toolbox 9 | import spacy 10 | from nltk.stem.lancaster import LancasterStemmer 11 | import re 12 | from string import punctuation 13 | from bisect import bisect 14 | import argparse 15 | 16 | 17 | 18 | # Punctuation normalisation dictionary 19 | norm_dict = {"’": "'", 20 | "´": "'", 21 | "‘": "'", 22 | "′": "'", 23 | "`": "'", 24 | '“': '"', 25 | '”': '"', 26 | '˝': '"', 27 | '¨': '"', 28 | '„': '"', 29 | '『': '"', 30 | '』': '"', 31 | '–': '-', 32 | '—': '-', 33 | '―': '-', 34 | '¬': '-', 35 | '、': ',', 36 | ',': ',', 37 | ':': ':', 38 | ';': ';', 39 | '?': '?', 40 | '!': '!', 41 | 'ِ': ' ', 42 | '\u200b': ' '} 43 | norm_dict = {ord(k): v for k, v in norm_dict.items()} 44 | 45 | # Load Tokenizer and other resources 46 | nlp = spacy.load("en_core_web_lg") 47 | # Lancaster Stemmer 48 | stemmer = LancasterStemmer() 49 | # GB English word list (inc -ise and -ize) 50 | gb_spell = toolbox.loadDictionary("tools/resources/en_GB-large.txt") 51 | # Part of speech map file 52 | tag_map = toolbox.loadTagMap("tools/resources/en-ptb_map") 53 | 54 | 55 | #Input 1: An essay string. 56 | # Input 2: A list of character edits in the essay 57 | # Input 3: A string normalisation dictionary for unusual punctuation etc. 58 | # Output: A list of paragraph strings and their edits [(para, edits), ...] 59 | def getParas(text, edits, norm_dict): 60 | para_info = [] 61 | # Loop through all sequences between newlines 62 | for para in re.finditer("[^\n]+", text): 63 | para_edits = [] 64 | # Keep track of correction spans (not detection spans) 65 | cor_spans = [] 66 | # Loop through the edits: [start, end, cor, ] 67 | for edit in edits: 68 | # Find edits that fall inside this paragraph 69 | if edit[0] >= para.start(0) and edit[1] <= para.end(0): 70 | # Adjust offsets and add C or D type for correction or detection 71 | new_edit = [edit[0]-para.start(0), edit[1]-para.start(0), "C", edit[2]] 72 | if edit[2] == None: new_edit[2] = "D" 73 | # Normalise the string if its a correction edit 74 | if new_edit[2] == "C": 75 | new_edit[3] = edit[2].translate(norm_dict) 76 | # Save the span in cor_spans 77 | cor_spans.append(new_edit[:2]) 78 | # Save the edit 79 | para_edits.append(new_edit) 80 | # Activate this switch to see the cross paragraph edits that are ignored, if any. 81 | # elif edit[0] >= para.start(0) and edit[0] <= para.end(0) and edit[1] > para.end(0): 82 | # print(text) 83 | # print(edit) 84 | # Remove overlapping detection edits from the list (for FCE only) 85 | new_para_edits = [] 86 | # Loop through the new normalised edits again 87 | for edit in para_edits: 88 | # Find detection edits 89 | if edit[2] == "D": 90 | # Boolean if the edit overlaps with a correction 91 | overlap = False 92 | # Loop through cor_spans 93 | for start, end in cor_spans: 94 | # Check whether there are any correction edits inside this detection edit. 95 | if (start != end and start >= edit[0] and end <= edit[1]) or \ 96 | (start == end and start > edit[0] and end < edit[1]): overlap = True 97 | # If there is an overlap, ignore the detection edit 98 | if overlap: continue 99 | new_para_edits.append(edit) 100 | # Save the para and the para edits 101 | para_info.append((para.group(0), new_para_edits)) 102 | return para_info 103 | 104 | 105 | 106 | # Input 1: An untokenized paragraph string. 107 | # Input 2: A list of character edits in the input string. 
108 | # Output 1: The same as Input 1, except unnecessary whitespace has been removed. 109 | # Output 2: The same as Input 2, except character edit spans have been updated. 110 | def cleanPara(para, edits): 111 | # Replace all types of whitespace with a space 112 | para = re.sub(r"\s", " ", para) 113 | # Find any sequence of 2 adjacent whitespace characters 114 | # NOTE: Matching only 2 at a time lets us preserve edits between multiple whitespace. 115 | match = re.search("  ", para) 116 | # While there is a match... 117 | while match: 118 | # Find the index where the whitespace starts. 119 | ws_start = match.start() 120 | # Remove 1 of the whitespace chars. 121 | para = para[:ws_start] + para[ws_start+1:] 122 | # Update affected edits that start after ws_start 123 | for edit in edits: 124 | # edit = [start, end, ...] 125 | if edit[0] > ws_start: 126 | edit[0] -= 1 127 | if edit[1] > ws_start: 128 | edit[1] -= 1 129 | # Try matching again 130 | match = re.search("  ", para) 131 | # Remove leading whitespace, if any. 132 | if para.startswith(" "): 133 | para = para.lstrip() 134 | # Subtract 1 from all edits. 135 | for edit in edits: 136 | # edit = [start, end, ...] 137 | # "max" used to prevent negative index 138 | edit[0] = max(edit[0] - 1, 0) 139 | edit[1] = max(edit[1] - 1, 0) 140 | # Remove leading/trailing whitespace from character edit spans 141 | for edit in edits: 142 | # Ignore insertions 143 | if edit[0] == edit[1]: continue 144 | # Get the orig text 145 | orig = para[edit[0]:edit[1]] 146 | # Remove leading whitespace and update span 147 | if orig.startswith(" "): edit[0] += 1 148 | if orig.endswith(" "): edit[1] -= 1 149 | # Return para and new edit spans. 150 | return para, edits 151 | 152 | # Input: A spacy paragraph 153 | # Output: A list of character start and end positions for each token in the input. 154 | def getAllTokStartsAndEnds(spacy_doc): 155 | tok_starts = [] 156 | tok_ends = [] 157 | for tok in spacy_doc: 158 | tok_starts.append(tok.idx) 159 | tok_ends.append(tok.idx + len(tok.text)) 160 | return tok_starts, tok_ends 161 | 162 | # Input 1: A spacy paragraph 163 | # Input 2: A list of character edits in the input string. 164 | # Input 3: A spacy processing object 165 | # Output: A list of token edits that map to exact tokens. 166 | def getTokenEdits(para, edits, nlp): 167 | # Get the character start and end offsets of all tokens in the para. 168 | tok_starts, tok_ends = getAllTokStartsAndEnds(para) 169 | prev_tok_end = 0 170 | overlap_edit_ids = [] 171 | # edit = [start, end, cat, cor] 172 | for edit in edits: 173 | # Set cor to orig string if this is a detection edit 174 | if edit[3] == None: edit[3] = para.text[edit[0]:edit[1]] 175 | # Convert the character spans to token spans. 176 | span = convertCharToTok(edit[0], edit[1], tok_starts, tok_ends) 177 | # If chars do not map cleanly to tokens, extra processing is needed. 178 | if len(span) == 4: 179 | # Sometimes token expansion results in overlapping edits. Keep track of these. 180 | if span[0] < prev_tok_end: 181 | overlap_edit_ids.append(edits.index(edit)) 182 | continue 183 | # When span len is 4, span[2] and [3] are the new char spans. 184 | # Use these to expand the edit to match token boundaries. 185 | left = para.text[span[2]:edit[0]] 186 | right = para.text[edit[1]:span[3]] 187 | # Add this new info to cor.
188 | edit[3] = (left+edit[3]+right).strip() 189 | # Keep track of prev_tok_end 190 | prev_tok_end = span[1] 191 | # Change char span to tok span 192 | edit[0] = span[0] 193 | edit[1] = span[1] 194 | # Tokenise correction edits 195 | if edit[2] == "C": edit[3] = " ".join([tok.text for tok in nlp(edit[3].strip())]) 196 | # Set detection edits equal to the tokenised original 197 | elif edit[2] == "D": edit[3] = " ".join([tok.text for tok in para[edit[0]:edit[1]]]) 198 | # Finally remove any overlap token edits from the edit list (rare) 199 | for id in sorted(overlap_edit_ids, reverse=True): 200 | del edits[id] 201 | return edits 202 | 203 | 204 | # Input 1: A SpaCy original paragraph Doc object. 205 | # Input 2: A list of edits in that paragraph. 206 | # Output: A list of dictionaries. Each dict has 3 keys: orig, cor, edits 207 | # Sentences are split according to orig only. Edits map orig to cor. 208 | def getSents(orig, edits): 209 | sent_list = [] 210 | # Make sure spacy sentences end in punctuation where possible. 211 | orig_sents = [] 212 | start = 0 213 | for sent in orig.sents: 214 | # Only save sentence boundaries that end with punctuation or are paragraph final. 215 | if sent[-1].text[-1] in punctuation or sent.end == len(orig): 216 | orig_sents.append(orig[start:sent.end]) 217 | start = sent.end 218 | # If orig is 1 sentence, just return. 219 | if len(orig_sents) == 1: 220 | # Sents are list of tokens. Edits have cor spans added. 221 | orig, cor, edits = prepareSentEditsOutput(orig, edits) 222 | out_dict = {"orig": orig, 223 | "cor": cor, 224 | "edits": edits} 225 | sent_list.append(out_dict) 226 | # Otherwise, we need to split up the paragraph. 227 | else: 228 | # Keep track of processed edits (assumes ordered edit list) 229 | proc = 0 230 | # Keep track of diff between orig and cor sent based on applied edits. 231 | cor_offset = 0 232 | # Loop through the original sentences. 233 | for sent_id, orig_sent in enumerate(orig_sents): 234 | # Store valid edits here 235 | sent_edits = [] 236 | # Loop through unprocessed edits 237 | for edit in edits[proc:]: 238 | # edit = [orig_start, orig_end, cat, cor] 239 | # If edit starts inside the current sentence but ends outside it... 240 | if orig_sent.start <= edit[0] < orig_sent.end and edit[1] > orig_sent.end: 241 | # We cannot handle cross orig_sent edits, so just ignore them. 242 | # Update cor_offset and proc_cnt 243 | cor_offset = cor_offset-(edit[1]-edit[0])+len(edit[3].split()) 244 | proc += 1 245 | # If the edit starts before the last token and ends inside the sentence... 246 | elif orig_sent.start <= edit[0] < orig_sent.end and edit[1] <= orig_sent.end: 247 | # It definitely belongs to this sentence, so save it. 248 | # Update the token spans to reflect the new boundary 249 | edit[0] -= orig_sent.start # Orig_start 250 | edit[1] -= orig_sent.start # Orig_end 251 | # Update cor_offset and proc_cnt 252 | cor_offset = cor_offset-(edit[1]-edit[0])+len(edit[3].split()) 253 | proc += 1 254 | # Save the edit 255 | sent_edits.append(edit) 256 | # If the edit starts and ends after the last token.. 257 | elif edit[0] == edit[1] == orig_sent.end: 258 | # It could ambiguously belong to this, or the next sentence. 259 | # If this is the last sentence, the cor is null, or the last char in cor 260 | # is punct, then the edit belongs to the current sent. 
261 | if sent_id == len(orig_sents)-1 or not edit[3] or edit[3][-1] in punctuation: 262 | # Update the token spans to reflect the new boundary 263 | edit[0] -= orig_sent.start # Orig_start 264 | edit[1] -= orig_sent.start # Orig_end 265 | # Update cor_offset and proc_cnt 266 | cor_offset = cor_offset-(edit[1]-edit[0])+len(edit[3].split()) 267 | proc += 1 268 | # Save the edit 269 | sent_edits.append(edit) 270 | # In all other cases, edits likely belong to a different sentence. 271 | # Sents are list of tokens. Edits have cor spans added. 272 | orig_sent, cor_sent, sent_edits = prepareSentEditsOutput(orig_sent, sent_edits) 273 | # Save orig sent and edits 274 | out_dict = {"orig": orig_sent, 275 | "cor": cor_sent, 276 | "edits": sent_edits} 277 | sent_list.append(out_dict) 278 | return sent_list 279 | 280 | 281 | # Input 1: A tokenized original sentence. 282 | # Input 2: The edits in that sentence. 283 | # Output 1: The tokenized corrected sentence from these edits. 284 | # Output 2: The edits, now containing the tok span of cor_str in cor_sent. 285 | def prepareSentEditsOutput(orig, edits): 286 | orig = [tok.text for tok in orig] 287 | cor = orig[:] 288 | offset = 0 289 | for edit in edits: 290 | # edit = [orig_start, orig_end, cat, cor] 291 | cor_toks = edit[3].split() 292 | cor[edit[0]+offset:edit[1]+offset] = cor_toks 293 | cor_start = edit[0]+offset 294 | cor_end = cor_start+len(cor_toks) 295 | offset = offset-(edit[1]-edit[0])+len(cor_toks) 296 | # Save cor offset 297 | edit.extend([cor_start, cor_end]) 298 | return orig, cor, edits 299 | 300 | 301 | 302 | # Input 1: A char start position 303 | # Input 2: A char end position 304 | # Input 3: All the char token start positions in the paragraph 305 | # Input 4: All the char token end positions in the paragraph 306 | # Output: The char start and end position now in terms of tokens. 307 | def convertCharToTok(start, end, all_starts, all_ends): 308 | # If the start and end span is the same, the edit is an insertion. 309 | if start == end: 310 | # Special case: Pre-First token edits. 311 | if not start or start <= all_starts[0]: 312 | return [0, 0] 313 | # Special case: Post-Last token edits. 314 | elif start >= all_ends[-1]: 315 | return [len(all_starts), len(all_starts)] 316 | # General case 1: Edit starts at the beginning of a token. 317 | elif start in all_starts: 318 | return [all_starts.index(start), all_starts.index(start)] 319 | # General case 2: Edit starts at the end of a token. 320 | elif start in all_ends: 321 | return [all_ends.index(start)+1, all_ends.index(start)+1] 322 | # Problem case: Edit starts inside 1 token. 323 | else: 324 | # Expand character span to nearest token boundary. 325 | if start not in all_starts: 326 | start = all_starts[bisect(all_starts, start)-1] 327 | if end not in all_ends: 328 | end = all_ends[bisect(all_ends, end)] 329 | # Keep the new character spans as well 330 | return [all_starts.index(start), all_ends.index(end)+1, start, end] 331 | # Character spans match complete token spans. 332 | elif start in all_starts and end in all_ends: 333 | return [all_starts.index(start), all_ends.index(end)+1] 334 | # Character spans do NOT match complete token spans. 335 | else: 336 | # Expand character span to nearest token boundary. 337 | if start not in all_starts: 338 | start = all_starts[bisect(all_starts, start)-1] 339 | if end not in all_ends: 340 | nearest = bisect(all_ends, end) 341 | # Sometimes the end is a char after the last token. 342 | # In this case, just use the last tok boundary. 
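A small worked example of the span widening performed in this branch of convertCharToTok, using toy token boundaries rather than real FCE data:

    from bisect import bisect

    # Toy paragraph ["This", "is", "good"]: token character boundaries.
    all_starts, all_ends = [0, 5, 8], [4, 7, 12]
    start, end = 9, 11          # an edit that falls inside the token "good"

    # The same widening as convertCharToTok: snap to the nearest token boundary.
    start = all_starts[bisect(all_starts, start) - 1]    # -> 8
    end = all_ends[bisect(all_ends, end)]                # -> 12
    print([all_starts.index(start), all_ends.index(end) + 1, start, end])   # [2, 3, 8, 12]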
343 | if nearest >= len(all_ends): 344 | end = all_ends[-1] 345 | else: 346 | end = all_ends[bisect(all_ends, end)] 347 | # Keep the new character spans as well 348 | return [all_starts.index(start), all_ends.index(end)+1, start, end] 349 | 350 | ######################################### 351 | parser = argparse.ArgumentParser() 352 | parser.add_argument('--input_json', type=str, default="data/fce/json/fce-dev.json", 353 | help="input json file for GEC.") 354 | args = parser.parse_args() 355 | 356 | 357 | def main(): 358 | orig_sents = [] 359 | correct_sents = [] 360 | iii = 1 361 | with open(args.input_json) as data: 362 | for line in data: 363 | line = json.loads(line) 364 | print('iii: ', iii) 365 | iii += 1 366 | # Normalise certain punctuation in the text 367 | text = line["text"].translate(norm_dict) 368 | 369 | # Store the sentences and edits for all annotators here 370 | #coder_dict = {} 371 | # Loop through the annotator ids and their edits 372 | # Loop through the annotator ids and their edits 373 | for coder, edits in line["edits"]: 374 | # Add the coder to the coder_dict if needed 375 | #if coder not in coder_dict: coder_dict[coder] = [] 376 | # Split the essay into paragraphs and update and normalise the char edits 377 | para_info = getParas(text, edits, norm_dict) 378 | # Loop through the paragraphs and edits 379 | for orig_para, para_edits in para_info: 380 | # Remove unnecessary whitespace from para and update char edits 381 | orig_para, para_edits = cleanPara(orig_para, para_edits) 382 | if not orig_para: continue # Ignore empty paras 383 | # Annotate orig_para with spacy 384 | orig_para = nlp(orig_para) 385 | # Convert character edits to token edits 386 | para_edits = getTokenEdits(orig_para, para_edits, nlp) 387 | # Split the paragraph into sentences and update tok edits 388 | sents = getSents(orig_para, para_edits) 389 | orig_sents.extend([sent['orig'] for sent in sents]) 390 | correct_sents.extend([sent['cor'] for sent in sents]) 391 | # Save the sents in the coder_dict 392 | #coder_dict[coder].extend(sents) 393 | 394 | orig_file_name = args.input_json.split('.')[0] + '.orig' 395 | cor_file_name = args.input_json.split('.')[0] + '.corct' 396 | 397 | with open(orig_file_name, "w") as fp: 398 | for line in orig_sents: 399 | fp.write(' '.join(line)) 400 | fp.write("\n") 401 | 402 | with open(cor_file_name, "w") as fp: 403 | for line in correct_sents: 404 | fp.write(' '.join(line)) 405 | fp.write("\n") 406 | 407 | #result = [orig_sents, correct_sents] 408 | #with open("fce-train.p", "wb") as fp: 409 | # pickle.dump(result, fp) 410 | 411 | if __name__ == '__main__': 412 | main() 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | --------------------------------------------------------------------------------
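A hedged usage sketch for json2pair.py as a whole. With the default argument it reads the FCE JSON release (assumed to be present at the default path, which is not created by this script) and writes two aligned plain-text files next to it, one tokenised sentence per line:

    # Run from the repository root (shell):
    #   python json2pair.py --input_json data/fce/json/fce-dev.json
    # This writes data/fce/json/fce-dev.orig and data/fce/json/fce-dev.corct.

    # Reading the sentence pairs back, e.g. to build seq2seq training data:
    with open("data/fce/json/fce-dev.orig") as f_orig, \
         open("data/fce/json/fce-dev.corct") as f_cor:
        for orig_line, cor_line in zip(f_orig, f_cor):
            orig_toks = orig_line.split()
            cor_toks = cor_line.split()
            # each (orig_toks, cor_toks) pair is a source/target example for a GEC model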