├── tools
│   ├── __init__.py
│   ├── scripts
│   │   ├── __init__.py
│   │   ├── toolbox.py
│   │   ├── align_text.py
│   │   ├── rdlextra.py
│   │   └── cat_rules.py
│   ├── resources
│   │   ├── en-ptb_map
│   │   └── readme.md
│   └── derivative_word.py
├── seq2seq-train.py
├── README.md
├── data
│   └── fce
│       ├── licence.txt
│       └── readme.txt
├── forecast_token.py
├── bert-main.py
├── candidate_tokens.py
└── json2pair.py

/tools/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tools/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/seq2seq-train.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | import pickle
4 | 
5 | with open("fce-train.p", "rb") as fp:
6 |     data = pickle.load(fp)
7 | 
8 | xx = data[0]
9 | yy = data[1]
10 | 
11 | debug = 1
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## LM-GEC
2 | Build an English grammatical error correction system based on language models.
3 | 
4 | ## Requirements
5 | * python 3.4+
6 | * pytorch 1.2
7 | * pytorch-transformers
8 | * Texar-pytorch
9 | * numpy
10 | 
11 | 
12 | ## TODO
13 | 1. Iteratively select tokens that improve the minimum softmax probability BERT assigns to words in the sentence
14 | 2. Fine-tune GPT-2 on text in reverse word order
15 | 3. First use the BERT model to correct errors
16 | 4. Use a deep bidirectional GPT-2 model to polish the output of BERT
17 | 5. Treat grammatical error correction as a seq2seq problem and train a seq2seq model for GEC
--------------------------------------------------------------------------------
/tools/resources/en-ptb_map:
--------------------------------------------------------------------------------
1 | # SYM
2 | $ SYM
3 | '' PUNCT
4 | , PUNCT
5 | -LRB- PUNCT
6 | -RRB- PUNCT
7 | . PUNCT
8 | : PUNCT
9 | AFX ADJ
10 | CC CONJ
11 | CD NUM
12 | DT DET
13 | EX ADV
14 | FW X
15 | HYPH PUNCT
16 | IN ADP
17 | JJ ADJ
18 | JJR ADJ
19 | JJS ADJ
20 | LS PUNCT
21 | MD VERB
22 | NIL X
23 | NN NOUN
24 | NNP PROPN
25 | NNPS PROPN
26 | NNS NOUN
27 | PDT DET
28 | POS PART
29 | PRP PRON
30 | PRP$ DET
31 | RB ADV
32 | RBR ADV
33 | RBS ADV
34 | RP PART
35 | SYM SYM
36 | TO PART
37 | UH INTJ
38 | VB VERB
39 | VBD VERB
40 | VBG VERB
41 | VBN VERB
42 | VBP VERB
43 | VBZ VERB
44 | WDT DET
45 | WP PRON
46 | WP$ DET
47 | WRB ADV
48 | `` PUNCT
--------------------------------------------------------------------------------
/tools/resources/readme.md:
--------------------------------------------------------------------------------
1 | # Resources
2 | 
3 | ## en-ptb_map
4 | 
5 | en-ptb_map is a mapping file that converts spaCy Penn Treebank (PTB) style part-of-speech tags to Stanford Universal Dependency tags.
6 | 
7 | The mapping file was obtained [here](http://universaldependencies.org/tagset-conversion/en-penn-uposf.html).
8 | 
9 | spaCy includes some custom POS tags that were not part of the original PTB tagset. The authors of spaCy suggested the following mapping for these tags:
10 | 
11 | | PTB-Style | Universal
12 | |-----------|--------
13 | | "" | PUNCT
14 | | ADD | X
15 | | GW | X
16 | | NFP | X
17 | | SP | SPACE
18 | | XX | X
19 | 
20 | ## en_GB-large.txt
21 | 
22 | en_GB-large.txt is a list of valid British English words according to the latest Hunspell dictionary.
23 | 24 | It was obtained [here](https://sourceforge.net/projects/wordlist/files/speller/2017.08.24/). 25 | 26 | The specific file bundled with this release is: wordlist-en_GB-large-2017.08.24.zip. 27 | 28 | -------------------------------------------------------------------------------- /tools/derivative_word.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import gluonnlp 4 | from nltk.stem import WordNetLemmatizer 5 | from nltk.stem.porter import * 6 | from polyglot.text import Text, Word 7 | import spacy 8 | import enchant 9 | import re 10 | import pickle 11 | isEnglish = enchant.Dict("en") 12 | aa = isEnglish.check("hello") 13 | 14 | lemmatizer = WordNetLemmatizer() 15 | 16 | nlp = spacy.load('en_core_web_sm') 17 | 18 | ''' 19 | doc = nlp('The only major thing to note is that lemmatize takes a biggest part of speech parameter, "pos." ') 20 | tokens = [token.text for token in doc] 21 | for tk in tokens: 22 | w1 = lemmatizer.lemmatize(tk, 'v') 23 | w2 = lemmatizer.lemmatize(tk, pos="a") 24 | w3 = lemmatizer.lemmatize(tk) 25 | w = {w1, w2, w3} 26 | ''' 27 | 28 | stemmer = PorterStemmer() 29 | glove_6b50d = gluonnlp.embedding.create('glove', source='glove.6B.50d') 30 | vocab = gluonnlp.Vocab(gluonnlp.data.Counter(glove_6b50d.idx_to_token)) 31 | 32 | pattern = "^[A-Za-z]*[A-Za-z]$" 33 | 34 | roots_dict = dict() 35 | for ww in vocab.idx_to_token: 36 | if re.search(pattern, ww) and len(ww)>2: 37 | root = stemmer.stem(ww) 38 | if ww != root: 39 | print(ww) 40 | #polarity = int(Word(ww, language="en").polarity) 41 | #ttt = {ww: polarity} 42 | if root not in roots_dict: 43 | roots_dict[root] = [] 44 | roots_dict[root].append(ww) 45 | else: 46 | roots_dict[root].append(ww) 47 | 48 | debug = 1 49 | fp = open("stem-words.p", "wb") 50 | pickle.dump(roots_dict, fp) 51 | fp.close() 52 | 53 | fp = open("stem-words.p", "rb") 54 | aaaa = pickle.load(fp) 55 | fp.close() 56 | 57 | debug = 1 -------------------------------------------------------------------------------- /data/fce/licence.txt: -------------------------------------------------------------------------------- 1 | CLC FCE Dataset Licence Agreement 2 | 3 | 1. By downloading this dataset and licence, this licence agreement is 4 | entered into, effective this date, between you, the Licensee, and the 5 | University of Cambridge, the Licensor. 6 | 7 | 2. Copyright of the entire licensed dataset is held by the Licensor. 8 | No ownership or interest in the dataset is transferred to the 9 | Licensee. 10 | 11 | 3. The Licensor hereby grants the Licensee a non-exclusive 12 | non-transferable right to use the licensed dataset for 13 | non-commercial research and educational purposes. 14 | 15 | 4. Non-commercial purposes exclude without limitation any use of the 16 | licensed dataset or information derived from the dataset for or as 17 | part of a product or service which is sold, offered for sale, 18 | licensed, leased or rented. 19 | 20 | 5. The Licensee shall acknowledge use of the licensed dataset in all 21 | publications of research based on it, in whole or in part, through 22 | citation of the following publication: 23 | 24 | Yannakoudakis, Helen and Briscoe, Ted and Medlock, Ben, 25 | A New Dataset and Method for Automatically Grading ESOL Texts, 26 | Proceedings of the 49th Annual Meeting of the Association for 27 | Computational Linguistics: Human Language Technologies. 28 | 29 | 6. The Licensee may publish excerpts of less than 100 words from the 30 | licensed dataset pursuant to clause 3. 
31 | 
32 | 7. The Licensor grants the Licensee this right to use the licensed dataset
33 | "as is". Licensor does not make, and expressly disclaims, any express or
34 | implied warranties, representations or endorsements of any kind
35 | whatsoever.
36 | 
37 | 8. This Agreement shall be governed by and construed in accordance with
38 | the laws of England and the English courts shall have exclusive
39 | jurisdiction.
40 | 
41 | 
--------------------------------------------------------------------------------
/forecast_token.py:
--------------------------------------------------------------------------------
1 | 
2 | from candidate_tokens import get_candidate_tokens
3 | import torch
4 | import numpy as np
5 | from polyglot.text import Word
6 | import spacy
7 | nlp = spacy.load('en_core_web_sm')
8 | 
9 | 
10 | def forecast_token(text, masked_index, tokenizer, model):
11 |     tokenized_text = ['[CLS]']
12 |     doc = nlp(text)
13 |     tokenized_text.extend([token.text for token in doc])
14 |     tokenized_text.append('[SEP]')
15 | 
16 |     synonyms_ = get_candidate_tokens(tokenized_text[masked_index])
17 |     synonyms_ = list(set(synonyms_))
18 | 
19 |     masked_token = tokenized_text[masked_index]
20 |     token_polarity = int(Word(masked_token, language="en").polarity)
21 | 
22 |     synonyms = []  # keep only candidates with the same sentiment polarity as the original token
23 |     for elem in synonyms_:
24 |         if int(Word(elem, language="en").polarity) == token_polarity:
25 |             synonyms.append(elem)
26 | 
27 |     # Mask a token that we will try to predict back with `BertForMaskedLM`
28 |     tokenized_text[masked_index] = '[MASK]'
29 | 
30 |     # Convert tokens to vocabulary indices
31 |     indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
32 | 
33 |     # Convert inputs to PyTorch tensors
34 |     tokens_tensor = torch.tensor([indexed_tokens])
35 | 
36 | 
37 |     # Predict all tokens
38 |     with torch.no_grad():
39 |         outputs = model(tokens_tensor)
40 |         predictions = outputs[0]
41 | 
42 |     token_idxs = [tokenizer.convert_tokens_to_ids([word])[0] for word in synonyms]
43 |     preds = np.array([predictions[0, masked_index, idx] for idx in token_idxs])
44 |     sort_top = preds.argsort()
45 |     #predicted_index = token_idxs[sort_top[-1]]
46 |     #candidate_tokens = [synonyms[sort_top[-1]], synonyms[sort_top[-2]]]  # superseded by the margin-based selection below
47 |     candidate_tokens = []
48 |     for nn in np.arange(len(preds)):
49 |         if abs(preds[nn]-preds[sort_top[-1]])<0.0001:
50 |             candidate_tokens.append(synonyms[nn])
51 | 
52 |     if masked_token in candidate_tokens:  # if the masked token scores within a small margin of the best candidate, treat it as correct
53 |         predicted_token, softmax_prob = masked_token, 100
54 |     else:
55 |         predicted_token, softmax_prob = synonyms[sort_top[-1]], preds[sort_top[-1]]
56 | 
57 |     # Do not change the token if the prediction matches the original token,
58 |     # ignoring upper/lower case.
59 |     if masked_token.lower() == predicted_token.lower():
60 |         predicted_token = masked_token
61 |     return predicted_token, softmax_prob
62 | 
--------------------------------------------------------------------------------
/bert-main.py:
--------------------------------------------------------------------------------
1 | 
2 | # https://www.cl.cam.ac.uk/research/nl/bea2019st/
3 | # Input  Travel by bus is exspensive , bored and annoying .
4 | # Output Travelling by bus is expensive , boring and annoying .
5 | 6 | import numpy as np 7 | import spacy 8 | from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM, \ 9 | RobertaTokenizer, RobertaForMaskedLM, \ 10 | XLNetTokenizer, XLNetPreTrainedModel, \ 11 | XLNetLMHeadModel, \ 12 | XLMPreTrainedModel , XLMModel, XLMWithLMHeadModel 13 | from forecast_token import forecast_token 14 | import re 15 | from googletrans import Translator 16 | translator = Translator() 17 | aa = translator.translate('程开甲,男,汉族,中共党员、九三学社社员,' 18 | '1918年8月生,2018年11月去世,江苏吴江人,' 19 | '原国防科工委科技委常任委员,中国科学院院士。') 20 | 21 | nlp = spacy.load('en_core_web_lg') 22 | 23 | # Load pre-trained model tokenizer (vocabulary) 24 | # bert-large-uncased-whole-word-masking, 25 | tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking') 26 | model = BertForMaskedLM.from_pretrained('bert-large-uncased-whole-word-masking') 27 | #tokenizer = RobertaTokenizer.from_pretrained('roberta-large') 28 | #model = RobertaForMaskedLM.from_pretrained('roberta-large') 29 | #tokenizer = XLNetTokenizer.from_pretrained('xlm-mlm-en-2048') 30 | #model = XLMPreTrainedModel.from_pretrained('xlm-mlm-en-2048') 31 | model.eval() 32 | 33 | # Tokenize input 34 | #text = 'I am writing in order to express my disappointment about your musical show " Over the Rainbow " .' 35 | #text = 'I am writing in order to express my disappointed about your musical show " Over the Rainbow " .' 36 | text = "I saws the show 's advertisement hanging up of a wall in London where I was spending my holiday with some friends . " \ 37 | "I convinced them to go there with me because I had heard good references about your Company and , " \ 38 | "above all , about the main star , Danny Brook ." 39 | doc = nlp(text) 40 | sentences = [sent.text for sent in doc.sents] 41 | for sent in sentences: 42 | sent_doc = nlp(sent) 43 | tokens = [token.text for token in sent_doc] 44 | for masked_index in np.arange(len(sent_doc))+1: 45 | if masked_index>1 and tokens[masked_index-1].istitle(): # deflautly think the word with first letter is upppercase is 46 | f_token, softmax_prob = tokens[masked_index-1], 100 47 | else: 48 | f_token, softmax_prob = forecast_token(sent, masked_index, tokenizer, model) 49 | print('Predicted token is: ', f_token, ' softmax_prob: ', softmax_prob) 50 | 51 | debug = 1 -------------------------------------------------------------------------------- /candidate_tokens.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | from pattern.en import lexeme 5 | from nltk.corpus import wordnet 6 | from nltk.stem.porter import * 7 | from nltk.stem import WordNetLemmatizer 8 | import spacy 9 | 10 | from spacy.lang.en.stop_words import STOP_WORDS 11 | 12 | from mxnet import nd 13 | import gluonnlp 14 | 15 | import pickle 16 | import enchant 17 | from nltk.corpus import words 18 | #import nltk 19 | #nltk.download() 20 | EnglishDict = enchant.Dict("en_US") 21 | 22 | stemmer = PorterStemmer() 23 | 24 | lemmatizer = WordNetLemmatizer() 25 | 26 | fp = open("./tools/stem-words.p", "rb") 27 | stem2words = pickle.load(fp) 28 | fp.close() 29 | 30 | 31 | glove_6b50d = gluonnlp.embedding.create('glove', source='glove.6B.50d') 32 | vocab = gluonnlp.Vocab(gluonnlp.data.Counter(glove_6b50d.idx_to_token)) 33 | vocab.set_embedding(glove_6b50d) 34 | 35 | def norm_vecs_by_row(x): 36 | return x / nd.sqrt(nd.sum(x * x, axis=1) + 1E-10).reshape((-1,1)) 37 | 38 | def get_knn(word, k=2000): 39 | word_vec = vocab.embedding[word].reshape((-1, 1)) 40 | vocab_vecs = 
norm_vecs_by_row(vocab.embedding.idx_to_vec) 41 | dot_prod = nd.dot(vocab_vecs, word_vec) 42 | indices = nd.topk(dot_prod.reshape((len(vocab), )), k=k+1, ret_typ='indices') 43 | indices = [int(i.asscalar()) for i in indices] 44 | # Remove unknown and input tokens. 45 | return vocab.to_tokens(indices[1:]) 46 | 47 | def get_synomyms_token(token): 48 | stem = stemmer.stem(token) 49 | synonyms_ = [token] 50 | if stem in stem2words: 51 | words = stem2words[stem] 52 | synonyms_.extend(words) 53 | 54 | w1 = lemmatizer.lemmatize(token, 'v') 55 | w2 = lemmatizer.lemmatize(token, pos="a") 56 | w3 = lemmatizer.lemmatize(token) 57 | w = {w1, w2, w3} 58 | synonyms_.extend(list(w)) 59 | 60 | #synonyms_ = [token] 61 | 62 | for syn in wordnet.synsets(token): 63 | for l in syn.lemmas(): 64 | synonyms_.append(l.name()) 65 | 66 | synonyms_.extend(lexeme(token)) 67 | synonyms = np.array([elm for elm in set(synonyms_)]) 68 | 69 | return synonyms 70 | 71 | def get_candidate_tokens(token): 72 | #spacy_stopwords = list(spacy.lang.en.stop_words.STOP_WORDS) 73 | spacy_stopwords = list(STOP_WORDS) 74 | if token in spacy_stopwords: 75 | return spacy_stopwords 76 | 77 | result_ = get_knn(token, 20) 78 | result = [] 79 | for ww in result_: 80 | # check the string from KNN is in English dictionary 81 | if EnglishDict.check(ww) or ww in words.words(): 82 | result.append(ww) 83 | 84 | synomyms = get_synomyms_token(token) 85 | result.extend(synomyms) 86 | #result.append('reviewing') 87 | 88 | return result 89 | 90 | 91 | if __name__ == '__main__': 92 | aa = get_candidate_tokens('people') 93 | bb = get_knn('took', 100) 94 | print(bb) 95 | 96 | -------------------------------------------------------------------------------- /data/fce/readme.txt: -------------------------------------------------------------------------------- 1 | Release 2.1 2 | 25th March 2019 3 | 4 | This directory contains the official version of the First Certificate in English (FCE) corpus used in the BEA2019 shared task. 5 | 6 | More details about the FCE corpus can be found in the following paper: 7 | 8 | Helen Yannakoudakis, Ted Briscoe, and Ben Medlock. 2011. A new dataset and method for automatically grading ESOL texts. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies, pages 180–189. 9 | 10 | The original FCE files are available here: https://ilexir.co.uk/datasets/index.html 11 | The raw dataset is not explicitly split into training, development and test sets, and so we recreated this split based on the error detection version of the dataset available at the same link. 12 | 13 | This version of the public FCE is available in two different formats: JSON and M2. 14 | 15 | -- JSON -- 16 | The JSON format is the raw unprocessed version of the corpus. Each line in a JSON file contains the following fields: 17 | id : A unique id for the essay. 18 | l1 : The first language of the author. 19 | age : The age (or age range) of the author. 20 | q : The question number; each author submitted essay answers to 2 different questions. 21 | answer-s : The score awarded to the essay for this particular question. 22 | script-s : The overall score awarded to the author for both questions they answered. 23 | text : The essay as it was originally written by the author. 24 | edits : A list of all the character level edits made to the text by all annotators, of the form: 25 | [[annotator_id, [[char_start_offset, char_end_offset, correction], ...]], ...]. 
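
As an aside for readers of this dump (not part of the original FCE readme): a minimal Python sketch of how one record and its nested "edits" field, as described above, might be read. The file name fce.train.json is an assumption.

    import json

    # Each line of an FCE JSON file is one essay record (file name assumed).
    with open("fce.train.json") as f:
        record = json.loads(f.readline())

    print(record["id"], record["l1"], record["answer-s"])
    # "edits" is [[annotator_id, [[char_start, char_end, correction], ...]], ...]
    for annotator_id, char_edits in record["edits"]:
        for start, end, correction in char_edits:
            print(annotator_id, start, end, repr(record["text"][start:end]), "->", repr(correction))
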
26 | 27 | -- M2 -- 28 | The M2 format is the processed version of the corpus that we recommend for the BEA2019 shared task. 29 | M2 format has been the standard format for annotated GEC files since the first CoNLL shared task in 2013. 30 | 31 | Since it is not easy to convert character level edits in unprocessed text into token level edits in sentences (cf. https://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-894.pdf), we provide a json_to_m2.py script to convert the raw JSON to M2. This script must be placed inside the main directory of the ERRor ANnotation Toolkit (ERRANT) in order to be used. ERRANT is available here: https://github.com/chrisjbryant/errant 32 | 33 | Each M2 file was thus generated in Python 3.5 using the following command: 34 | 35 | python3 errant/json_to_m2.py -out -gold 36 | 37 | This used spacy v1.9.0 and the en_core_web_sm-1.2.0 model. 38 | 39 | Updates 40 | ---------------------- 41 | 42 | -- v2.0 -- 43 | 44 | * Added new JSON files for the FCE if users want the original data in the same format as the W&I+LOCNESS corpus. 45 | 46 | * All punctuation was normalised in the M2 files. It was otherwise arbitrary whether, for example, different apostrophe styles were corrected or not. 47 | 48 | * Fixed a bug in the character to token edit conversion script. 49 | 50 | * Fixed a bug with correction edits nested inside detection edits that led to them being ignored. 51 | 52 | -- v2.1 -- 53 | 54 | * Updated the json_to_m2.py script to handle multiple annotators. 55 | -------------------------------------------------------------------------------- /tools/scripts/toolbox.py: -------------------------------------------------------------------------------- 1 | from operator import itemgetter 2 | 3 | # Load latest Hunspell dictionaries: 4 | def loadDictionary(path): 5 | return set(open(path).read().split()) 6 | 7 | # Load Stanford Universal Tags map file. 8 | def loadTagMap(path): 9 | map_dict = {} 10 | open_file = open(path).readlines() 11 | for line in open_file: 12 | line = line.strip().split("\t") 13 | # Change ADP to PREP; makes it clearer 14 | if line[1].strip() == "ADP": 15 | map_dict[line[0]] = "PREP" 16 | # Also change PROPN to NOUN; we don't need a prop noun tag 17 | elif line[1].strip() == "PROPN": 18 | map_dict[line[0]] = "NOUN" 19 | else: 20 | map_dict[line[0]] = line[1].strip() 21 | # Add some spacy PTB tags not in the original mapping. 22 | map_dict['""'] = "PUNCT" 23 | map_dict["SP"] = "SPACE" 24 | map_dict["ADD"] = "X" 25 | map_dict["GW"] = "X" 26 | map_dict["NFP"] = "X" 27 | map_dict["XX"] = "X" 28 | return map_dict 29 | 30 | # Input: A sentence + edit block in an m2 file. 31 | # Output 1: The original sentence (a list of tokens) 32 | # Output 2: A dictionary; key is coder id, value is a tuple. 33 | # tuple[0] is the corrected sentence (a list of tokens), tuple[1] is the edits. 34 | # Process M2 to extract sentences and edits. 35 | def processM2(info): 36 | info = info.split("\n") 37 | orig_sent = info[0][2:].split() # [2:] ignore the leading "S " 38 | all_edits = info[1:] 39 | # Simplify the edits and group by coder id. 40 | edit_dict = processEdits(all_edits) 41 | out_dict = {} 42 | # Loop through each coder and their edits. 43 | for coder, edits in edit_dict.items(): 44 | # Copy orig_sent. We will apply the edits to it to make cor_sent 45 | cor_sent = orig_sent[:] 46 | gold_edits = [] 47 | offset = 0 48 | # Sort edits by start and end offset only. If they are the same, do not reorder. 
49 | edits = sorted(edits, key=itemgetter(0)) # Sort by start offset 50 | edits = sorted(edits, key=itemgetter(1)) # Sort by end offset 51 | for edit in edits: 52 | # Do not apply noop or Um edits, but save them 53 | if edit[2] in {"noop", "Um"}: 54 | gold_edits.append(edit+[-1,-1]) 55 | continue 56 | orig_start = edit[0] 57 | orig_end = edit[1] 58 | cor_toks = edit[3].split() 59 | # Apply the edit. 60 | cor_sent[orig_start+offset:orig_end+offset] = cor_toks 61 | # Get the cor token start and end positions in cor_sent 62 | cor_start = orig_start+offset 63 | cor_end = cor_start+len(cor_toks) 64 | # Keep track of how this affects orig edit offsets. 65 | offset = offset-(orig_end-orig_start)+len(cor_toks) 66 | # Save the edit with cor_start and cor_end 67 | gold_edits.append(edit+[cor_start]+[cor_end]) 68 | # Save the cor_sent and gold_edits for each annotator in the out_dict. 69 | out_dict[coder] = (cor_sent, gold_edits) 70 | return orig_sent, out_dict 71 | 72 | # Input: A list of edit lines for a sentence in an m2 file. 73 | # Output: An edit dictionary; key is coder id, value is a list of edits. 74 | def processEdits(edits): 75 | edit_dict = {} 76 | for edit in edits: 77 | edit = edit.split("|||") 78 | span = edit[0][2:].split() # [2:] ignore the leading "A " 79 | start = int(span[0]) 80 | end = int(span[1]) 81 | cat = edit[1] 82 | cor = edit[2] 83 | id = edit[-1] 84 | # Save the useful info as a list 85 | proc_edit = [start, end, cat, cor] 86 | # Save the proc edit inside the edit_dict using coder id. 87 | if id in edit_dict.keys(): 88 | edit_dict[id].append(proc_edit) 89 | else: 90 | edit_dict[id] = [proc_edit] 91 | return edit_dict 92 | 93 | # Input 1: A list of token strings in a sentence. 94 | # Input 2: A preloaded Spacy processing object. 95 | # Annotate tokens with POS, lemma and parse info. 96 | def applySpacy(sent, nlp): 97 | # Convert tokens to spacy tokens and POS tag and parse. 98 | sent = nlp.tokenizer.tokens_from_list(sent) 99 | nlp.tagger(sent) 100 | nlp.parser(sent) 101 | return sent 102 | 103 | # Input 1: An edit list. [orig_start, orig_end, cat, cor, cor_start, cor_end] 104 | # Input 2: An original SpaCy sentence. 105 | # Input 3: A corrected SpaCy sentence. 106 | # Output: A minimised edit with duplicate words on both sides removed. 107 | # E.g. [was eaten -> has eaten] becomes [was -> has] 108 | def minimiseEdit(edit, orig, cor): 109 | # edit = [orig_start, orig_end, cat, cor, cor_start, cor_end] 110 | orig_toks = orig[edit[0]:edit[1]] 111 | cor_toks = cor[edit[4]:edit[5]] 112 | # While the first token is the same string in both (and both are not null) 113 | while orig_toks and cor_toks and orig_toks[0].text == cor_toks[0].text: 114 | # Remove that token from the span, and adjust the start offset. 115 | orig_toks = orig_toks[1:] 116 | cor_toks = cor_toks[1:] 117 | edit[0] += 1 118 | edit[4] += 1 119 | # Then do the same from the last token. 120 | while orig_toks and cor_toks and orig_toks[-1].text == cor_toks[-1].text: 121 | # Remove that token from the span, and adjust the start offset. 122 | orig_toks = orig_toks[:-1] 123 | cor_toks = cor_toks[:-1] 124 | edit[1] -= 1 125 | edit[5] -= 1 126 | # If both sides are not null, save the new correction string. 127 | if orig_toks or cor_toks: 128 | edit[3] = " ".join([tok.text for tok in cor_toks]) 129 | return edit 130 | 131 | # Input 1: An edit list = [orig_start, orig_end, cat, cor, cor_start, cor_end] 132 | # Input 2: A coder id for the specific annotator. 133 | # Output: An edit in m2 file format. 
134 | def formatEdit(edit, coder_id=0): 135 | span = " ".join(["A", str(edit[0]), str(edit[1])]) 136 | return "|||".join([span, edit[2], edit[3], "REQUIRED", "-NONE-", str(coder_id)]) -------------------------------------------------------------------------------- /tools/scripts/align_text.py: -------------------------------------------------------------------------------- 1 | from difflib import SequenceMatcher 2 | from itertools import combinations, groupby 3 | from string import punctuation 4 | import re 5 | import spacy.parts_of_speech as POS 6 | import tools.scripts.rdlextra as DL 7 | 8 | # Some global variables 9 | CONTENT_POS = {POS.ADJ, POS.ADV, POS.NOUN, POS.VERB} 10 | 11 | ### FUNCTIONS ### 12 | 13 | def get_opcodes(alignment): 14 | s_start = 0 15 | s_end = 0 16 | t_start = 0 17 | t_end = 0 18 | opcodes = [] 19 | for op in alignment: 20 | if op[0] == "D": # Deletion 21 | s_end += 1 22 | elif op[0] == "I": # Insertion 23 | t_end += 1 24 | elif op[0].startswith("T"): # Transposition 25 | # Extract number of elements involved (default is 2) 26 | k = int(op[1:] or 2) 27 | s_end += k 28 | t_end += k 29 | else: # Match or substitution 30 | s_end += 1 31 | t_end += 1 32 | # Save 33 | opcodes.append((op, s_start, s_end, t_start, t_end)) 34 | # Start from here 35 | s_start = s_end 36 | t_start = t_end 37 | return opcodes 38 | 39 | def merge_edits(edits): 40 | if edits: 41 | return [("X", edits[0][1], edits[-1][2], edits[0][3], edits[-1][4])] 42 | else: 43 | return edits 44 | 45 | # Input 1: Spacy source sentence 46 | # Input 2: Spacy target sentence 47 | # Input 3: The alignment between the 2; [e.g. M, M, S ,S M] 48 | # Output: A list of processed edits that have been merged or split. 49 | def get_edits(source, target, edits): 50 | out_edits = [] 51 | # Start: Split alignment intro groups of M, T and rest. T has a number after it. 52 | for op, group in groupby(edits, lambda x: x[0][0] if x[0][0] in {"M", "T"} else False): 53 | # Convert the generator to a list 54 | group = list(group) 55 | # Ignore M 56 | if op == "M": continue 57 | # Do not merge T 58 | elif op == "T": out_edits.extend(group) 59 | # Further processing required 60 | else: out_edits.extend(process_edits(source, target, group)) 61 | return out_edits 62 | 63 | # Input 1: Spacy source sentence 64 | # Input 2: Spacy target sentence 65 | # Input 3: A list of non-matching alignments: D, I and/or S 66 | # Output: A list of processed edits that have been merged or split. 67 | def process_edits(source, target, edits): 68 | # Return single alignments 69 | if len(edits) <= 1: return edits 70 | # Get the ops for the whole edit sequence 71 | ops = [op[0] for op in edits] 72 | # Merge ops that are all D xor I. (95% of human multi-token edits contain S). 73 | if set(ops) == {"D"} or set(ops) == {"I"}: return merge_edits(edits) 74 | 75 | content = False # True if edit includes a content word 76 | # Get indices of all combinations of start and end ranges in the edits: 012 -> 01, 02, 12 77 | combos = list(combinations(range(0, len(edits)), 2)) 78 | # Sort them starting with largest spans first 79 | combos.sort(key = lambda x: x[1]-x[0], reverse=True) 80 | # Loop through combos 81 | for start, end in combos: 82 | # Ignore ranges that do NOT contain a substitution. 83 | if "S" not in ops[start:end+1]: continue 84 | # Get the tokens in orig and cor. They will never be empty due to above rule. 
85 | s = source[edits[start][1]:edits[end][2]] 86 | t = target[edits[start][3]:edits[end][4]] 87 | # Possessive suffixes merged with previous token: [friends -> friend 's] 88 | if s[-1].tag_ == "POS" or t[-1].tag_ == "POS": 89 | return process_edits(source, target, edits[:end-1]) + merge_edits(edits[end-1:end+1]) + process_edits(source, target, edits[end+1:]) 90 | # Case changes 91 | if s[-1].lower_ == t[-1].lower_: 92 | # Merge first token I or D of arbitrary length: [Cat -> The big cat] 93 | if start == 0 and ((len(s) == 1 and t[0].text[0].isupper()) or (len(t) == 1 and s[0].text[0].isupper())): 94 | return merge_edits(edits[start:end+1]) + process_edits(source, target, edits[end+1:]) 95 | # Merge with previous punctuation: [, we -> . We], [we -> . We] 96 | if (len(s) > 1 and is_punct(s[-2])) or (len(t) > 1 and is_punct(t[-2])): 97 | return process_edits(source, target, edits[:end-1]) + merge_edits(edits[end-1:end+1]) + process_edits(source, target, edits[end+1:]) 98 | # Whitespace/hyphens: [bestfriend -> best friend], [sub - way -> subway] 99 | s_str = re.sub("['-]", "", "".join([tok.lower_ for tok in s])) 100 | t_str = re.sub("['-]", "", "".join([tok.lower_ for tok in t])) 101 | if s_str == t_str: 102 | return process_edits(source, target, edits[:start]) + merge_edits(edits[start:end+1]) + process_edits(source, target, edits[end+1:]) 103 | # POS-based merging: Same POS or infinitive/phrasal verbs: [to eat -> eating], [watch -> look at] 104 | pos_set = set([tok.pos for tok in s]+[tok.pos for tok in t]) 105 | if (len(pos_set) == 1 and len(s) != len(t)) or pos_set == {POS.PART, POS.VERB}: 106 | return process_edits(source, target, edits[:start]) + merge_edits(edits[start:end+1]) + process_edits(source, target, edits[end+1:]) 107 | # Split rules take effect when we get to smallest chunks 108 | if end-start < 2: 109 | # Split adjacent substitutions 110 | if len(s) == len(t) == 2: 111 | return process_edits(source, target, edits[:start+1]) + process_edits(source, target, edits[start+1:]) 112 | # Similar substitutions at start or end 113 | if (ops[start] == "S" and char_cost(s[0].text, t[0].text) < 0.25) or \ 114 | (ops[end] == "S" and char_cost(s[-1].text, t[-1].text) < 0.25): 115 | return process_edits(source, target, edits[:start+1]) + process_edits(source, target, edits[start+1:]) 116 | # Split final determiners 117 | if end == len(edits)-1 and ((ops[-1] in {"D", "S"} and s[-1].pos == POS.DET) or \ 118 | (ops[-1] in {"I", "S"} and t[-1].pos == POS.DET)): 119 | return process_edits(source, target, edits[:-1]) + [edits[-1]] 120 | # Set content word flag 121 | if not pos_set.isdisjoint(CONTENT_POS): content = True 122 | # If all else fails, merge edits that contain content words 123 | if content: return merge_edits(edits) 124 | else: return edits 125 | 126 | # Is the token a content word? 127 | def is_content(A): 128 | return A.pos in CONTENT_POS 129 | 130 | # Check whether token is punctuation 131 | def is_punct(token): 132 | return token.pos == POS.PUNCT or token.text in punctuation 133 | 134 | # all-split: No edits are ever merged. Everything is 1:1, 1:0 or 0:1 only. 135 | def get_edits_split(edits): 136 | new_edits = [] 137 | for edit in edits: 138 | op = edit[0] 139 | if op != "M": 140 | new_edits.append(edit) 141 | return new_edits 142 | 143 | # all-merge: Merge all adjacent edits of any operation type, except M. 
144 | def get_edits_group_all(edits): 145 | new_edits = [] 146 | for op, group in groupby(edits, lambda x: True if x[0] == "M" else False): 147 | if not op: 148 | new_edits.extend(merge_edits(list(group))) 149 | return new_edits 150 | 151 | # all-equal: Merge all edits of the same operation type. 152 | def get_edits_group_type(edits): 153 | new_edits = [] 154 | for op, group in groupby(edits, lambda x: x[0]): 155 | if op != "M": 156 | new_edits.extend(merge_edits(list(group))) 157 | return new_edits 158 | 159 | # Cost is 0 if lemmas are the same, otherwise 0.499. Maximum S cost is 1.999. 160 | # This prevents unintuitive transpositions. 161 | def lemma_cost(A, B): 162 | if A.lemma == B.lemma: 163 | return 0 164 | else: 165 | return 0.499 166 | 167 | # Cost is 0 if POS are the same, else 0.25 if both are content, else 0.5. 168 | # Content words more likely to align to other content words. 169 | def pos_cost(A, B): 170 | if A.pos == B.pos: 171 | return 0 172 | elif is_content(A) and is_content(B): 173 | return 0.25 174 | else: 175 | return 0.5 176 | 177 | # Calculate the cost of character alignment; i.e. char similarity 178 | def char_cost(A, B): 179 | return 1-SequenceMatcher(None, A, B).ratio() 180 | 181 | # If there is a substitution, calculate the more informative cost. 182 | def token_substitution(A, B, A_extra, B_extra): 183 | # If lower case strings are the same, don't bother checking pos etc. 184 | # This helps catch case marking substitution errors. 185 | if A.lower() == B.lower(): 186 | return 0 187 | cost = lemma_cost(A_extra, B_extra) + pos_cost(A_extra, B_extra) + char_cost(A, B) 188 | return cost 189 | 190 | # Change cost of Transpositions to be the same as Levenshtein. 191 | def levTransposition(a,b,c,d): 192 | return float("inf") 193 | 194 | # Change cost of Substitution to be the same as Levenshtein. 195 | def levSubstitution(a,b,c,d): 196 | return 1 197 | 198 | # Input 1: A Spacy annotated original sentence. 199 | # Input 2: A Spacy annotated corrected sentence. 200 | # Input 3: Command line args. 201 | # Output: A list of lists. Each sublist is an edit of the form: 202 | # edit = [orig_start, orig_end, cat, cor, cor_start, cor_end] 203 | def getAutoAlignedEdits(orig, cor, args): 204 | # Get a list of strings from the spacy objects. 205 | orig_toks = [tok.text for tok in orig] 206 | cor_toks = [tok.text for tok in cor] 207 | # Align using Levenshtein. 208 | if args.lev: alignments = DL.WagnerFischer(orig_toks, cor_toks, orig, cor, substitution=levSubstitution, transposition=levTransposition) 209 | # Otherwise, use linguistically enhanced Damerau-Levenshtein 210 | else: alignments = DL.WagnerFischer(orig_toks, cor_toks, orig, cor, substitution=token_substitution) 211 | # Get the alignment with the highest score. There is usually only 1 best in DL due to custom costs. 212 | alignment = next(alignments.alignments(True)) # True uses Depth-first search. 213 | # Convert the alignment into edits; choose merge strategy 214 | if args.merge == "rules": edits = get_edits(orig, cor, get_opcodes(alignment)) 215 | elif args.merge == "all-split": edits = get_edits_split(get_opcodes(alignment)) 216 | elif args.merge == "all-merge": edits = get_edits_group_all(get_opcodes(alignment)) 217 | elif args.merge == "all-equal": edits = get_edits_group_type(get_opcodes(alignment)) 218 | proc_edits = [] 219 | for edit in edits: 220 | orig_start = edit[1] 221 | orig_end = edit[2] 222 | cat = "NA" # Auto edits do not have human types. 
223 | cor_start = edit[3] 224 | cor_end = edit[4] 225 | cor_str = " ".join(cor_toks[cor_start:cor_end]) 226 | proc_edits.append([orig_start, orig_end, cat, cor_str, cor_start, cor_end]) 227 | return proc_edits 228 | -------------------------------------------------------------------------------- /tools/scripts/rdlextra.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016 Mariano Felice and Christopher Bryant 2 | # 3 | # This file contains an implementation of the Damerau-Levenshtein 4 | # algorithm (restricted edit distance version) to align two sentences, 5 | # as described in the following paper: 6 | # 7 | # Mariano Felice, Christopher Bryant and Ted Briscoe. 2016. 8 | # Automatic extraction of learner errors in ESL sentences using 9 | # linguistically enhanced alignments. In Proceedings of the 26th 10 | # International Conference on Computational Linguistics (COLING 2016), 11 | # pp. 825-835, Osaka, Japan. Japanese Association for Natural Language 12 | # Processing. 13 | # 14 | # Please, cite this paper when using this script in your work. 15 | # 16 | # This code is based on an original implementation of the Wagner-Fischer 17 | # algorithm by Kyle Gorman, available at: https://gist.github.com/kylebgorman/8034009 18 | # The original license and description are included below. 19 | # 20 | # This implementation adds support for token transpositions of arbitrary 21 | # length, e.g. A B C --> B C A. 22 | # 23 | # ORIGINAL LICENSE: 24 | # 25 | # Copyright (c) 2013-2016 Kyle Gorman 26 | # 27 | # Permission is hereby granted, free of charge, to any person obtaining a 28 | # copy of this software and associated documentation files (the 29 | # "Software"), to deal in the Software without restriction, including 30 | # without limitation the rights to use, copy, modify, merge, publish, 31 | # distribute, sublicense, and/or sell copies of the Software, and to 32 | # permit persons to whom the Software is furnished to do so, subject to 33 | # the following conditions: 34 | # 35 | # The above copyright notice and this permission notice shall be included 36 | # in all copies or substantial portions of the Software. 37 | # 38 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 39 | # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 40 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 41 | # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 42 | # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 43 | # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 44 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 45 | # 46 | # wagnerfischer.py: efficient computation of Levenshtein distance and 47 | # all optimal alignments with arbitrary edit costs. The algorithm for 48 | # computing the dynamic programming table used has been discovered many 49 | # times, but is described most clearly in: 50 | # 51 | # R.A. Wagner & M.J. Fischer. 1974. The string-to-string correction 52 | # problem. Journal of the ACM, 21(1): 168-173. 53 | # 54 | # Wagner & Fischer also describe an algorithm ("Algorithm Y") to find the 55 | # alignment path (i.e., list of edit operations involved in the optimal 56 | # alignment), but it it is specified such that in fact it only generates 57 | # one such path, whereas many such paths may exist, particularly when 58 | # multiple edit operations have the same cost. 
For example, when all edit 59 | # operations have the same cost, there are two equal-cost alignments of 60 | # "TGAC" and "GCAC": 61 | # 62 | # TGAC TGxAC 63 | # ss== d=i== 64 | # GCAC xGCAC 65 | # 66 | # However, all such paths can be generated efficiently, as follows. First, 67 | # the dynamic programming table "cells" are defined as tuples of (partial 68 | # cost, set of all operations reaching this cell with minimal cost). As a 69 | # result, the completed table can be thought of as an unweighted, directed 70 | # graph (or FSA). The bottom right cell (the one containing the Levenshtein 71 | # distance) is the start state and the origin as end state. The set of arcs 72 | # are the set of operations in each cell as arcs. (Many of the cells of the 73 | # table, those which are not visited by any optimal alignment, are under 74 | # the graph interpretation unconnected vertices, and can be ignored. Every 75 | # path between the bottom right cell and the origin cell is an optimal 76 | # alignment. These paths can be efficiently enumerated using breadth-first 77 | # traversal. The trick here is that elements in deque must not only contain 78 | # indices but also partial paths. Averaging over all such paths, we can 79 | # come up with an estimate of the number of insertions, deletions, and 80 | # substitutions involved as well; in the example above, we say S = 1 and 81 | # D, I = 0.5. 82 | # 83 | # Thanks to Christoph Weidemann (ctw@cogsci.info), who added support for 84 | # arbitrary cost functions. 85 | 86 | 87 | import collections 88 | import doctest 89 | import pprint 90 | 91 | 92 | # Default cost functions. 93 | 94 | def INSERTION(A, A_extra=None, cost=1): 95 | return cost 96 | 97 | def DELETION(A, A_extra=None, cost=1): 98 | return cost 99 | 100 | def SUBSTITUTION(A, B, A_extra=None, B_extra=None, cost=1): 101 | return cost 102 | 103 | def TRANSPOSITION(A, B, A_extra=None, B_extra=None): 104 | # Change to cost=float('inf') to have standard edit distance by default 105 | # A and B should be the same length 106 | cost = len(A) - 1 # or len(B) -1 107 | return cost 108 | 109 | Trace = collections.namedtuple("Trace", ["cost", "ops"]) 110 | 111 | class WagnerFischer(object): 112 | 113 | """ 114 | An object representing a (set of) Levenshtein alignments between two 115 | iterable objects (they need not be strings). The cost of the optimal 116 | alignment is scored in `self.cost`, and all Levenshtein alignments can 117 | be generated using self.alignments()`. 118 | 119 | Basic tests: 120 | 121 | >>> WagnerFischer("god", "gawd").cost 122 | 2 123 | >>> WagnerFischer("sitting", "kitten").cost 124 | 3 125 | >>> WagnerFischer("bana", "banananana").cost 126 | 6 127 | >>> WagnerFischer("bana", "bana").cost 128 | 0 129 | >>> WagnerFischer("banana", "angioplastical").cost 130 | 11 131 | >>> WagnerFischer("angioplastical", "banana").cost 132 | 11 133 | >>> WagnerFischer("Saturday", "Sunday").cost 134 | 3 135 | 136 | IDS tests: 137 | 138 | >>> WagnerFischer("doytauvab", "doyvautab").IDS() == {"S": 2.0} 139 | True 140 | >>> WagnerFischer("kitten", "sitting").IDS() == {"I": 1.0, "S": 2.0} 141 | True 142 | 143 | Detect insertion vs. deletion: 144 | 145 | >>> thesmalldog = "the small dog".split() 146 | >>> thebigdog = "the big dog".split() 147 | >>> bigdog = "big dog".split() 148 | >>> sub_inf = lambda A, B: float("inf") 149 | 150 | # Deletion. 151 | >>> wf = WagnerFischer(thebigdog, bigdog, substitution=sub_inf) 152 | >>> wf.IDS() == {"D": 1.0} 153 | True 154 | 155 | # Insertion. 
156 | >>> wf = WagnerFischer(bigdog, thebigdog, substitution=sub_inf) 157 | >>> wf.IDS() == {"I": 1.0} 158 | True 159 | 160 | # Neither. 161 | >>> wf = WagnerFischer(thebigdog, thesmalldog, substitution=sub_inf) 162 | >>> wf.IDS() == {"I": 1.0, "D": 1.0} 163 | True 164 | """ 165 | 166 | # Initializes pretty printer (shared across all class instances). 167 | pprinter = pprint.PrettyPrinter(width=75) 168 | 169 | def __init__(self, A, B, A_extra=None, B_extra=None, insertion=INSERTION, deletion=DELETION, 170 | substitution=SUBSTITUTION, transposition=TRANSPOSITION): 171 | # Stores cost functions in a dictionary for programmatic access. 172 | self.costs = {"I": insertion, "D": deletion, "S": substitution, "T":transposition} 173 | # Keep lowercased versions for transpositions 174 | Al = [x.lower() for x in A] 175 | Bl = [x.lower() for x in B] 176 | # Initializes table. 177 | self.asz = len(A) 178 | self.bsz = len(B) 179 | self._table = [[None for _ in range(self.bsz + 1)] for 180 | _ in range(self.asz + 1)] 181 | # From now on, all indexing done using self.__getitem__. 182 | ## Fills in edges. 183 | self[0][0] = Trace(0, {"O"}) # Start cell. 184 | for i in range(1, self.asz + 1): 185 | self[i][0] = Trace(self[i - 1][0].cost + self.costs["D"](A[i - 1], A_extra[i - 1] if A_extra else None), 186 | {"D"}) 187 | for j in range(1, self.bsz + 1): 188 | self[0][j] = Trace(self[0][j - 1].cost + self.costs["I"](B[j - 1], B_extra[j - 1] if B_extra else None), 189 | {"I"}) 190 | 191 | ## Fills in rest. 192 | for i in range(len(A)): 193 | for j in range(len(B)): 194 | # Cleans it up in case there are more than one check for match 195 | # first, as it is always the cheapest option. 196 | if A[i] == B[j]: 197 | self[i + 1][j + 1] = Trace(self[i][j].cost, {"M"}) 198 | # Checks for other types. 199 | else: 200 | costD = self[i][j + 1].cost + self.costs["D"](A[i], A_extra[i] if A_extra else None) 201 | costI = self[i + 1][j].cost + self.costs["I"](B[j], B_extra[j] if B_extra else None) 202 | costS = self[i][j].cost + self.costs["S"](A[i], B[j], A_extra[i] if A_extra else None, B_extra[j] if B_extra else None) 203 | costT = float("inf") # We don't know it yet 204 | min_val = min(costI, costD, costS) 205 | 206 | # Multiword transpositions: 207 | # Find a sequence of equal elements in different order 208 | # We only need to check diagonally because we require the same number of elements 209 | k = 1 210 | #while i > 0 and j > 0 and (i - k) >= 0 and (j - k) >= 0 and any(x in ["D", "I", "S"] for x in self[i-k+1][j-k+1].ops): 211 | while i > 0 and j > 0 and (i - k) >= 0 and (j - k) >= 0 and self[i-k+1][j-k+1].cost - self[i-k][j-k].cost > 0: # An operation that has a cost (i.e. I, D or S > 0) 212 | if collections.Counter(Al[i-k:i+1]) == collections.Counter(Bl[j-k:j+1]): 213 | costT = self[i-k][j-k].cost + self.costs["T"](A[i-k:i+1], B[j-k:j+1], A_extra[i-k:i+1] if A_extra else None, B_extra[j-k:j+1] if B_extra else None) 214 | min_val = min(min_val, costT) 215 | break 216 | k += 1 217 | 218 | trace = Trace(min_val, []) # Use a list to preserve the order 219 | # Adds _all_ operations matching minimum value. 220 | if costD == min_val: 221 | trace.ops.append("D") 222 | if costI == min_val: 223 | trace.ops.append("I") 224 | if costS == min_val: 225 | trace.ops.append("S") 226 | if costT == min_val: 227 | trace.ops.append("T" + str(k+1)) 228 | self[i + 1][j + 1] = trace 229 | 230 | # Stores optimum cost as a property. 
231 | self.cost = self[-1][-1].cost 232 | 233 | def __repr__(self): 234 | return self.pprinter.pformat(self._table) 235 | 236 | def __iter__(self): 237 | for row in self._table: 238 | yield row 239 | 240 | def __getitem__(self, i): 241 | """ 242 | Returns the i-th row of the table, which is a list and so 243 | can be indexed. Therefore, e.g., self[2][3] == self._table[2][3] 244 | """ 245 | return self._table[i] 246 | 247 | # Stuff for generating alignments. 248 | 249 | def _stepback(self, i, j, trace, path_back): 250 | """ 251 | Given a cell location (i, j) and a Trace object trace, generate 252 | all traces they point back to in the table 253 | """ 254 | for op in trace.ops: 255 | if op == "M": 256 | yield i - 1, j - 1, self[i - 1][j - 1], path_back + ["M"] 257 | elif op == "I": 258 | yield i, j - 1, self[i][j - 1], path_back + ["I"] 259 | elif op == "D": 260 | yield i - 1, j, self[i - 1][j], path_back + ["D"] 261 | elif op == "S": 262 | yield i - 1, j - 1, self[i - 1][j - 1], path_back + ["S"] 263 | elif op.startswith("T"): 264 | # Extract stepback (default is a transposition of 2 elements) 265 | k = int(op[1:] or 2) 266 | yield i - k, j - k, self[i - k][j - k], path_back + [op] 267 | elif op == "O": 268 | return # Origin cell, so we're done. 269 | else: 270 | raise ValueError("Unknown op {!r}".format(op)) 271 | 272 | def alignments(self, dfirst=False): 273 | """ 274 | Generate all alignments with optimal cost by traversing an 275 | implicit graph on the dynamic programming table. Use 276 | breadth-first traversal by default. 277 | """ 278 | # Each cell of the queue is a tuple of (i, j, trace, path_back) 279 | # where i, j is the current index, trace is the trace object at 280 | # this cell 281 | if dfirst: 282 | return self._dfirst_alignments() 283 | else: 284 | return self._bfirst_alignments() 285 | 286 | def _dfirst_alignments(self): 287 | """ 288 | Generate alignments via depth-first traversal. 289 | """ 290 | stack = list(self._stepback(self.asz, self.bsz, self[-1][-1], [])) 291 | while stack: 292 | (i, j, trace, path_back) = stack.pop() 293 | if trace.ops == {"O"}: 294 | yield path_back[::-1] 295 | continue 296 | stack.extend(self._stepback(i, j, trace, path_back)) 297 | 298 | def _bfirst_alignments(self): 299 | """ 300 | Generate alignments via breadth-first traversal. 301 | """ 302 | # Each cell of the queue is a tuple of (i, j, trace, path_back) 303 | # where i, j is the current index, trace is the trace object at 304 | # this cell, and path_back is a reversed list of edit operations 305 | # which is initialized as an empty list. 306 | queue = collections.deque(self._stepback(self.asz, self.bsz, 307 | self[-1][-1], [])) 308 | while queue: 309 | (i, j, trace, path_back) = queue.popleft() 310 | if trace.ops == {"O"}: 311 | # We have reached the origin, the end of a reverse path, so 312 | # yield the list of edit operations in reverse. 313 | yield path_back[::-1] 314 | continue 315 | queue.extend(self._stepback(i, j, trace, path_back)) 316 | 317 | def IDS(self): 318 | """ 319 | Estimates insertions, deletions, and substitution _count_ (not 320 | costs). Non-integer values arise when there are multiple possible 321 | alignments with the same cost. 322 | """ 323 | npaths = 0 324 | opcounts = collections.Counter() 325 | for alignment in self.alignments(): 326 | # Counts edit types for this path, ignoring "M" (which is free). 327 | opcounts += collections.Counter(op for op in alignment if op != "M") 328 | npaths += 1 329 | # Averages over all paths. 
330 | return collections.Counter({o: c / npaths for (o, c) in 331 | opcounts.items()}) 332 | 333 | 334 | if __name__ == "__main__": 335 | #doctest.testmod() 336 | a = raw_input("A: ").split() 337 | b = raw_input("B: ").split() 338 | al = WagnerFischer(a, b).alignments() 339 | for a in al: 340 | print(a) 341 | 342 | -------------------------------------------------------------------------------- /tools/scripts/cat_rules.py: -------------------------------------------------------------------------------- 1 | from difflib import SequenceMatcher 2 | from string import punctuation 3 | import spacy.parts_of_speech as spos 4 | 5 | # Contractions 6 | conts = {"'d", "'ll", "'m", "n't", "'re", "'s", "'ve"} 7 | # Rare POS tags that make uninformative error categories 8 | rare_tags = {"INTJ", "NUM", "SYM", "X"} 9 | # Special auxiliaries in contractions. 10 | special_aux1 = ({"ca", "can"}, {"sha", "shall"}, {"wo", "will"}) 11 | special_aux2 = {"ca", "sha", "wo"} 12 | # Open class spacy POS tag objects 13 | open_pos = (spos.ADJ, spos.ADV, spos.NOUN, spos.VERB) 14 | # Open class POS tags 15 | open_tags = {"ADJ", "ADV", "NOUN", "VERB"} 16 | # Some dep labels that map to pos tags. 17 | dep_map = { "acomp": "ADJ", 18 | "amod": "ADJ", 19 | "advmod": "ADV", 20 | "det": "DET", 21 | "prep": "PREP", 22 | "prt": "PART", 23 | "punct": "PUNCT" } 24 | 25 | # Input 1: An edit list. [orig_start, orig_end, cat, cor, cor_start, cor_end] 26 | # Input 2: An original SpaCy sentence. 27 | # Input 3: A corrected SpaCy sentence. 28 | # Input 4: A set of valid GB English words. 29 | # Input 5: A dictionary to map PTB tags to Stanford Universal Dependency tags. 30 | # Input 6: A preloaded spacy processing object. 31 | # Input 7: The Lancaster stemmer in NLTK. 32 | # Output: The input edit with new error tag, in M2 edit format. 33 | def autoTypeEdit(edit, orig_sent, cor_sent, gb_spell, tag_map, nlp, stemmer): 34 | # Get the tokens in the edit. 35 | orig_toks = orig_sent[edit[0]:edit[1]] 36 | cor_toks = cor_sent[edit[4]:edit[5]] 37 | # Nothing to nothing is a detected, but not corrected edit. 38 | if not orig_toks and not cor_toks: 39 | return "UNK" 40 | # Missing 41 | elif not orig_toks and cor_toks: 42 | op = "M:" 43 | cat = getOneSidedType(cor_toks, tag_map) 44 | # Unnecessary 45 | elif orig_toks and not cor_toks: 46 | op = "U:" 47 | cat = getOneSidedType(orig_toks, tag_map) 48 | # Replacement and special cases 49 | else: 50 | # Same to same is a detected, but not corrected edit. 51 | if orig_toks.text == cor_toks.text: 52 | return "UNK" 53 | # Special: Orthographic errors at the end of multi-token edits are ignored. 54 | # E.g. [Doctor -> The doctor], [The doctor -> Dcotor], [, since -> . Since] 55 | # Classify the edit as if the last token weren't there. 56 | elif orig_toks[-1].lower_ == cor_toks[-1].lower_ and \ 57 | (len(orig_toks) > 1 or len(cor_toks) > 1): 58 | min_edit = edit[:] 59 | min_edit[1] -= 1 60 | min_edit[5] -= 1 61 | return autoTypeEdit(min_edit, orig_sent, cor_sent, gb_spell, tag_map, nlp, stemmer) 62 | # Replacement 63 | else: 64 | op = "R:" 65 | cat = getTwoSidedType(orig_toks, cor_toks, gb_spell, tag_map, nlp, stemmer) 66 | return op+cat 67 | 68 | # Input 1: Spacy tokens 69 | # Input 2: A map dict from PTB to universal dependency pos tags. 70 | # Output: A list of token, pos and dep tag strings. 
71 | def getEditInfo(toks, tag_map): 72 | str = [] 73 | pos = [] 74 | dep = [] 75 | for tok in toks: 76 | str.append(tok.text) 77 | pos.append(tag_map[tok.tag_]) 78 | dep.append(tok.dep_) 79 | return str, pos, dep 80 | 81 | # Input 1: Spacy tokens. 82 | # Input 2: A map dict from PTB to universal dependency pos tags. 83 | # Output: An error type string. 84 | # When one side of the edit is null, we can only use the other side. 85 | def getOneSidedType(toks, tag_map): 86 | # Extract strings, pos tags and parse info from the toks. 87 | str_list, pos_list, dep_list = getEditInfo(toks, tag_map) 88 | 89 | # Special cases. 90 | if len(toks) == 1: 91 | # Possessive noun suffixes; e.g. ' -> 's 92 | if toks[0].tag_ == "POS": 93 | return "NOUN:POSS" 94 | # Contraction. Rule must come after possessive. 95 | if toks[0].lower_ in conts: 96 | return "CONTR" 97 | # Infinitival "to" is treated as part of a verb form. 98 | if toks[0].lower_ == "to" and toks[0].pos_ == "PART" and toks[0].dep_ != "prep": 99 | return "VERB:FORM" 100 | # Auxiliary verbs. 101 | if set(dep_list).issubset({"aux", "auxpass"}): 102 | return "VERB:TENSE" 103 | # POS-based tags. Ignores rare, uninformative categories. 104 | if len(set(pos_list)) == 1 and pos_list[0] not in rare_tags: 105 | return pos_list[0] 106 | # More POS-based tags using special dependency labels. 107 | if len(set(dep_list)) == 1 and dep_list[0] in dep_map.keys(): 108 | return dep_map[dep_list[0]] 109 | # To-infinitives and phrasal verbs. 110 | if set(pos_list) == {"PART", "VERB"}: 111 | return "VERB" 112 | # Tricky cases 113 | else: 114 | return "OTHER" 115 | 116 | # Input 1: Original text spacy tokens. 117 | # Input 2: Corrected text spacy tokens. 118 | # Input 3: A set of valid GB English words. 119 | # Input 4: A map from PTB to universal dependency pos tags. 120 | # Input 5: A preloaded spacy processing object. 121 | # Input 6: The Lancaster stemmer in NLTK. 122 | # Output: An error type string. 123 | def getTwoSidedType(orig_toks, cor_toks, gb_spell, tag_map, nlp, stemmer): 124 | # Extract strings, pos tags and parse info from the toks. 125 | orig_str, orig_pos, orig_dep = getEditInfo(orig_toks, tag_map) 126 | cor_str, cor_pos, cor_dep = getEditInfo(cor_toks, tag_map) 127 | 128 | # Orthography; i.e. whitespace and/or case errors. 129 | if onlyOrthChange(orig_str, cor_str): 130 | return "ORTH" 131 | # Word Order; only matches exact reordering. 132 | if exactReordering(orig_str, cor_str): 133 | return "WO" 134 | 135 | # 1:1 replacements (very common) 136 | if len(orig_str) == len(cor_str) == 1: 137 | # 1. SPECIAL CASES 138 | # Possessive noun suffixes; e.g. ' -> 's 139 | if orig_toks[0].tag_ == "POS" or cor_toks[0].tag_ == "POS": 140 | return "NOUN:POSS" 141 | # Contraction. Rule must come after possessive. 142 | if (orig_str[0].lower() in conts or cor_str[0].lower() in conts) and orig_pos == cor_pos: 143 | return "CONTR" 144 | # Special auxiliaries in contractions (1); e.g. ca -> can 145 | if set(orig_str[0].lower()+cor_str[0].lower()) in special_aux1: 146 | return "CONTR" 147 | # Special auxiliaries in contractions (2); e.g. ca -> could 148 | if orig_str[0].lower() in special_aux2 or cor_str[0].lower() in special_aux2: 149 | return "VERB:TENSE" 150 | # Special: "was" and "were" are the only past tense SVA. 151 | if {orig_str[0].lower(), cor_str[0].lower()} == {"was", "were"}: 152 | return "VERB:SVA" 153 | 154 | # 2. SPELLING AND INFLECTION 155 | # Only check alphabetical strings on the original side. 
156 | # Spelling errors take precendece over POS errors so this rule is ordered. 157 | if orig_str[0].isalpha(): 158 | # Check a GB English dict for both orig and lower case. 159 | # "cat" is in the dict, but "Cat" is not. 160 | if orig_str[0] not in gb_spell and orig_str[0].lower() not in gb_spell: 161 | # Check if both sides have a common lemma 162 | if sameLemma(orig_toks[0], cor_toks[0], nlp): 163 | # Inflection; Usually count vs mass nouns or e.g. got vs getted 164 | if orig_pos == cor_pos and orig_pos[0] in {"NOUN", "VERB"}: 165 | return orig_pos[0]+":INFL" 166 | # Unknown morphology; i.e. we cannot be more specific. 167 | else: 168 | return "MORPH" 169 | # Use string similarity to detect true spelling errors. 170 | else: 171 | char_ratio = SequenceMatcher(None, orig_str[0], cor_str[0]).ratio() 172 | # Ratio > 0.5 means both side share at least half the same chars. 173 | # WARNING: THIS IS AN APPROXIMATION. 174 | if char_ratio > 0.5: 175 | return "SPELL" 176 | # If ratio is <= 0.5, this may be a spelling+other error; e.g. tolk -> say 177 | else: 178 | # If POS is the same, this takes precedence over spelling. 179 | if orig_pos == cor_pos and orig_pos[0] not in rare_tags: 180 | return orig_pos[0] 181 | # Tricky cases. 182 | else: 183 | return "OTHER" 184 | 185 | # 3. MORPHOLOGY 186 | # Only ADJ, ADV, NOUN and VERB with same lemma can have inflectional changes. 187 | if sameLemma(orig_toks[0], cor_toks[0], nlp) and \ 188 | orig_pos[0] in open_tags and cor_pos[0] in open_tags: 189 | # Same POS on both sides 190 | if orig_pos == cor_pos: 191 | # Adjective form; e.g. comparatives 192 | if orig_pos[0] == "ADJ": 193 | return "ADJ:FORM" 194 | # Noun number 195 | if orig_pos[0] == "NOUN": 196 | return "NOUN:NUM" 197 | # Verbs - various types 198 | if orig_pos[0] == "VERB": 199 | # NOTE: These rules are carefully ordered. 200 | # Use the dep parse to find some form errors. 201 | # Main verbs preceded by aux cannot be tense or SVA. 202 | if precededByAux(orig_toks, cor_toks): 203 | return "VERB:FORM" 204 | # Use fine PTB tags to find various errors. 205 | # FORM errors normally involve VBG or VBN. 206 | if orig_toks[0].tag_ in {"VBG", "VBN"} or cor_toks[0].tag_ in {"VBG", "VBN"}: 207 | return "VERB:FORM" 208 | # Of what's left, TENSE errors normally involved VBD. 209 | if orig_toks[0].tag_ == "VBD" or cor_toks[0].tag_ == "VBD": 210 | return "VERB:TENSE" 211 | # Of what's left, SVA errors normally involve VBZ. 212 | if orig_toks[0].tag_ == "VBZ" or cor_toks[0].tag_ == "VBZ": 213 | return "VERB:SVA" 214 | # Any remaining aux verbs are called TENSE. 215 | if orig_dep[0].startswith("aux") and cor_dep[0].startswith("aux"): 216 | return "VERB:TENSE" 217 | # Use dep labels to find some more ADJ:FORM 218 | if set(orig_dep+cor_dep).issubset({"acomp", "amod"}): 219 | return "ADJ:FORM" 220 | # Adj to plural noun is usually a noun number error; e.g. musical -> musicals. 221 | if orig_pos[0] == "ADJ" and cor_toks[0].tag_ == "NNS": 222 | return "NOUN:NUM" 223 | # For remaining verb errors (rare), rely on cor_pos 224 | if cor_toks[0].tag_ in {"VBG", "VBN"}: 225 | return "VERB:FORM" 226 | # Cor VBD = TENSE 227 | if cor_toks[0].tag_ == "VBD": 228 | return "VERB:TENSE" 229 | # Cor VBZ = SVA 230 | if cor_toks[0].tag_ == "VBZ": 231 | return "VERB:SVA" 232 | # Tricky cases that all have the same lemma. 233 | else: 234 | return "MORPH" 235 | # Derivational morphology. 
236 | if stemmer.stem(orig_str[0]) == stemmer.stem(cor_str[0]) and \ 237 | orig_pos[0] in open_tags and cor_pos[0] in open_tags: 238 | return "MORPH" 239 | 240 | # 4. GENERAL 241 | # Auxiliaries with different lemmas 242 | if orig_dep[0].startswith("aux") and cor_dep[0].startswith("aux"): 243 | return "VERB:TENSE" 244 | # POS-based tags. Some of these are context-sensitive misspellings. 245 | if orig_pos == cor_pos and orig_pos[0] not in rare_tags: 246 | return orig_pos[0] 247 | # Some dep labels map to POS-based tags. 248 | if orig_dep == cor_dep and orig_dep[0] in dep_map.keys(): 249 | return dep_map[orig_dep[0]] 250 | # Phrasal verb particles. 251 | if set(orig_pos+cor_pos) == {"PART", "PREP"} or set(orig_dep+cor_dep) == {"prt", "prep"}: 252 | return "PART" 253 | # Can use dep labels to resolve DET + PRON combinations. 254 | if set(orig_pos+cor_pos) == {"DET", "PRON"}: 255 | # DET cannot be a subject or object. 256 | if cor_dep[0] in {"nsubj", "nsubjpass", "dobj", "pobj"}: 257 | return "PRON" 258 | # "poss" indicates possessive determiner 259 | if cor_dep[0] == "poss": 260 | return "DET" 261 | # Tricky cases. 262 | else: 263 | return "OTHER" 264 | 265 | # Multi-token replacements (uncommon) 266 | # All auxiliaries 267 | if set(orig_dep+cor_dep).issubset({"aux", "auxpass"}): 268 | return "VERB:TENSE" 269 | # All same POS 270 | if len(set(orig_pos+cor_pos)) == 1: 271 | # Final verbs with the same lemma are tense; e.g. eat -> has eaten 272 | if orig_pos[0] == "VERB" and sameLemma(orig_toks[-1], cor_toks[-1], nlp): 273 | return "VERB:TENSE" 274 | # POS-based tags. 275 | elif orig_pos[0] not in rare_tags: 276 | return orig_pos[0] 277 | # All same special dep labels. 278 | if len(set(orig_dep+cor_dep)) == 1 and orig_dep[0] in dep_map.keys(): 279 | return dep_map[orig_dep[0]] 280 | # Infinitives, gerunds, phrasal verbs. 281 | if set(orig_pos+cor_pos) == {"PART", "VERB"}: 282 | # Final verbs with the same lemma are form; e.g. to eat -> eating 283 | if sameLemma(orig_toks[-1], cor_toks[-1], nlp): 284 | return "VERB:FORM" 285 | # Remaining edits are often verb; e.g. to eat -> consuming, look at -> see 286 | else: 287 | return "VERB" 288 | # Possessive nouns; e.g. friends -> friend 's 289 | if (orig_pos == ["NOUN", "PART"] or cor_pos == ["NOUN", "PART"]) and \ 290 | sameLemma(orig_toks[0], cor_toks[0], nlp): 291 | return "NOUN:POSS" 292 | # Adjective forms with "most" and "more"; e.g. more free -> freer 293 | if (orig_str[0].lower() in {"most", "more"} or cor_str[0].lower() in {"most", "more"}) and \ 294 | sameLemma(orig_toks[-1], cor_toks[-1], nlp) and len(orig_str) <= 2 and len(cor_str) <= 2: 295 | return "ADJ:FORM" 296 | 297 | # Tricky cases. 298 | else: 299 | return "OTHER" 300 | 301 | # Input 1: A list of original token strings 302 | # Input 2: A list of corrected token strings 303 | # Output: Boolean; the difference between the inputs is only whitespace or case. 304 | def onlyOrthChange(orig_str, cor_str): 305 | orig_join = "".join(orig_str).lower() 306 | cor_join = "".join(cor_str).lower() 307 | if orig_join == cor_join: 308 | return True 309 | return False 310 | 311 | # Input 1: A list of original token strings 312 | # Input 2: A list of corrected token strings 313 | # Output: Boolean; the tokens are exactly the same but in a different order. 314 | def exactReordering(orig_str, cor_str): 315 | # Sorting lets us keep duplicates.
316 | orig_set = sorted([tok.lower() for tok in orig_str]) 317 | cor_set = sorted([tok.lower() for tok in cor_str]) 318 | if orig_set == cor_set: 319 | return True 320 | return False 321 | 322 | # Input 1: An original text spacy token. 323 | # Input 2: A corrected text spacy token. 324 | # Input 3: A spaCy processing object. 325 | # Output: Boolean; the tokens have the same lemma. 326 | # Spacy only finds lemma for its predicted POS tag. Sometimes these are wrong, 327 | # so we also consider alternative POS tags to improve the chance of a match. 328 | def sameLemma(orig_tok, cor_tok, nlp): 329 | orig_lemmas = [] 330 | cor_lemmas = [] 331 | for pos in open_pos: 332 | # Pass the lower cased form of the word for lemmatization; improves accuracy. 333 | orig_lemmas.append(nlp.vocab.morphology.lemmatize(pos, orig_tok.lower, nlp.vocab.morphology.tag_map)) 334 | cor_lemmas.append(nlp.vocab.morphology.lemmatize(pos, cor_tok.lower, nlp.vocab.morphology.tag_map)) 335 | if set(orig_lemmas).intersection(set(cor_lemmas)): 336 | return True 337 | return False 338 | 339 | # Input 1: Original text spacy tokens. 340 | # Input 2: Corrected text spacy tokens. 341 | # Output: Boolean; both tokens have a dependent auxiliary verb. 342 | def precededByAux(orig_tok, cor_tok): 343 | # If the toks are aux, we need to check if they are the first aux. 344 | if orig_tok[0].dep_.startswith("aux") and cor_tok[0].dep_.startswith("aux"): 345 | # Find the parent verb 346 | orig_head = orig_tok[0].head 347 | cor_head = cor_tok[0].head 348 | # Find the children of the parent 349 | orig_children = orig_head.children 350 | cor_children = cor_head.children 351 | # Check the orig children. 352 | for orig_child in orig_children: 353 | # Look at the first aux... 354 | if orig_child.dep_.startswith("aux"): 355 | # Check if the string matches orig_tok 356 | if orig_child.text != orig_tok[0].text: 357 | # If it doesn't, orig_tok is not the first aux so check the cor children 358 | for cor_child in cor_children: 359 | # Find the first aux in cor... 360 | if cor_child.dep_.startswith("aux"): 361 | # If that doesn't match cor_tok either, then cor_tok also isn't the first aux. 362 | if cor_child.text != cor_tok[0].text: 363 | # Therefore, neither orig nor cor is the first aux. 364 | return True 365 | # Break after the first cor aux 366 | break 367 | # Break after the first orig aux. 368 | break 369 | # Otherwise, the toks are main verbs so we need to look for any aux. 370 | else: 371 | orig_deps = [orig_dep.dep_ for orig_dep in orig_tok[0].children] 372 | cor_deps = [cor_dep.dep_ for cor_dep in cor_tok[0].children] 373 | if "aux" in orig_deps or "auxpass" in orig_deps: 374 | if "aux" in cor_deps or "auxpass" in cor_deps: 375 | return True 376 | return False -------------------------------------------------------------------------------- /json2pair.py: -------------------------------------------------------------------------------- 1 | 2 | # This code is adapted from the FCE json_to_m2.py script. 3 | # Make original paragraph <---> corrected paragraph pairs for training and testing.
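For reference, main() below expects one JSON object per line, with a "text" field holding the essay and an "edits" field holding per-annotator character edits. A hedged sketch of that shape, following how main() consumes it (the sentence and offsets are invented for illustration):

    # One line of the --input_json file, as consumed by main():
    # line["text"] is the essay text and line["edits"] is a list of
    # [coder, edits] pairs, where each edit is [start, end, cor] in
    # character offsets and cor is None for detection-only edits.
    example_line = {
        "text": "I likes apples.\nThey is tasty.",
        "edits": [
            [0, [[2, 7, "like"], [21, 23, "are"]]],   # annotator 0 and its edits
        ],
    }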
4 | 5 | import json 6 | import tools.scripts.align_text as align_text 7 | import tools.scripts.cat_rules as cat_rules 8 | import tools.scripts.toolbox as toolbox 9 | import spacy 10 | from nltk.stem.lancaster import LancasterStemmer 11 | import re 12 | from string import punctuation 13 | from bisect import bisect 14 | import argparse 15 | 16 | 17 | 18 | # Punctuation normalisation dictionary 19 | norm_dict = {"’": "'", 20 | "´": "'", 21 | "‘": "'", 22 | "′": "'", 23 | "`": "'", 24 | '“': '"', 25 | '”': '"', 26 | '˝': '"', 27 | '¨': '"', 28 | '„': '"', 29 | '『': '"', 30 | '』': '"', 31 | '–': '-', 32 | '—': '-', 33 | '―': '-', 34 | '¬': '-', 35 | '、': ',', 36 | ',': ',', 37 | ':': ':', 38 | ';': ';', 39 | '?': '?', 40 | '!': '!', 41 | 'ِ': ' ', 42 | '\u200b': ' '} 43 | norm_dict = {ord(k): v for k, v in norm_dict.items()} 44 | 45 | # Load Tokenizer and other resources 46 | nlp = spacy.load("en_core_web_lg") 47 | # Lancaster Stemmer 48 | stemmer = LancasterStemmer() 49 | # GB English word list (inc -ise and -ize) 50 | gb_spell = toolbox.loadDictionary("tools/resources/en_GB-large.txt") 51 | # Part of speech map file 52 | tag_map = toolbox.loadTagMap("tools/resources/en-ptb_map") 53 | 54 | 55 | #Input 1: An essay string. 56 | # Input 2: A list of character edits in the essay 57 | # Input 3: A string normalisation dictionary for unusual punctuation etc. 58 | # Output: A list of paragraph strings and their edits [(para, edits), ...] 59 | def getParas(text, edits, norm_dict): 60 | para_info = [] 61 | # Loop through all sequences between newlines 62 | for para in re.finditer("[^\n]+", text): 63 | para_edits = [] 64 | # Keep track of correction spans (not detection spans) 65 | cor_spans = [] 66 | # Loop through the edits: [start, end, cor, ] 67 | for edit in edits: 68 | # Find edits that fall inside this paragraph 69 | if edit[0] >= para.start(0) and edit[1] <= para.end(0): 70 | # Adjust offsets and add C or D type for correction or detection 71 | new_edit = [edit[0]-para.start(0), edit[1]-para.start(0), "C", edit[2]] 72 | if edit[2] == None: new_edit[2] = "D" 73 | # Normalise the string if its a correction edit 74 | if new_edit[2] == "C": 75 | new_edit[3] = edit[2].translate(norm_dict) 76 | # Save the span in cor_spans 77 | cor_spans.append(new_edit[:2]) 78 | # Save the edit 79 | para_edits.append(new_edit) 80 | # Activate this switch to see the cross paragraph edits that are ignored, if any. 81 | # elif edit[0] >= para.start(0) and edit[0] <= para.end(0) and edit[1] > para.end(0): 82 | # print(text) 83 | # print(edit) 84 | # Remove overlapping detection edits from the list (for FCE only) 85 | new_para_edits = [] 86 | # Loop through the new normalised edits again 87 | for edit in para_edits: 88 | # Find detection edits 89 | if edit[2] == "D": 90 | # Boolean if the edit overlaps with a correction 91 | overlap = False 92 | # Loop through cor_spans 93 | for start, end in cor_spans: 94 | # Check whether there are any correction edits inside this detection edit. 95 | if (start != end and start >= edit[0] and end <= edit[1]) or \ 96 | (start == end and start > edit[0] and end < edit[1]): overlap = True 97 | # If there is an overlap, ignore the detection edit 98 | if overlap: continue 99 | new_para_edits.append(edit) 100 | # Save the para and the para edits 101 | para_info.append((para.group(0), new_para_edits)) 102 | return para_info 103 | 104 | 105 | 106 | # Input 1: An untokenized paragraph string. 107 | # Input 2: A list of character edits in the input string. 
108 | # Output 1: The same as Input 1, except unnecessary whitespace has been removed. 109 | # Output 2: The same as Input 2, except character edit spans have been updated. 110 | def cleanPara(para, edits): 111 | # Replace all types of whitespace with a space 112 | para = re.sub(r"\s", " ", para) 113 | # Find any sequence of 2 adjacent whitespace characters 114 | # NOTE: Matching only 2 at a time lets us preserve edits between multiple whitespace. 115 | match = re.search("  ", para) 116 | # While there is a match... 117 | while match: 118 | # Find the index where the whitespace starts. 119 | ws_start = match.start() 120 | # Remove 1 of the whitespace chars. 121 | para = para[:ws_start] + para[ws_start+1:] 122 | # Update affected edits that start after ws_start 123 | for edit in edits: 124 | # edit = [start, end, ...] 125 | if edit[0] > ws_start: 126 | edit[0] -= 1 127 | if edit[1] > ws_start: 128 | edit[1] -= 1 129 | # Try matching again 130 | match = re.search("  ", para) 131 | # Remove leading whitespace, if any. 132 | if para.startswith(" "): 133 | para = para.lstrip() 134 | # Subtract 1 from all edits. 135 | for edit in edits: 136 | # edit = [start, end, ...] 137 | # "max" used to prevent negative index 138 | edit[0] = max(edit[0] - 1, 0) 139 | edit[1] = max(edit[1] - 1, 0) 140 | # Remove leading/trailing whitespace from character edit spans 141 | for edit in edits: 142 | # Ignore insertions 143 | if edit[0] == edit[1]: continue 144 | # Get the orig text 145 | orig = para[edit[0]:edit[1]] 146 | # Remove leading whitespace and update span 147 | if orig.startswith(" "): edit[0] += 1 148 | if orig.endswith(" "): edit[1] -= 1 149 | # Return para and new edit spans. 150 | return para, edits 151 | 152 | # Input: A spacy paragraph 153 | # Output: A list of character start and end positions for each token in the input. 154 | def getAllTokStartsAndEnds(spacy_doc): 155 | tok_starts = [] 156 | tok_ends = [] 157 | for tok in spacy_doc: 158 | tok_starts.append(tok.idx) 159 | tok_ends.append(tok.idx + len(tok.text)) 160 | return tok_starts, tok_ends 161 | 162 | # Input 1: A spacy paragraph 163 | # Input 2: A list of character edits in the input string. 164 | # Input 3: A spacy processing object 165 | # Output: A list of token edits that map to exact tokens. 166 | def getTokenEdits(para, edits, nlp): 167 | # Get the character start and end offsets of all tokens in the para. 168 | tok_starts, tok_ends = getAllTokStartsAndEnds(para) 169 | prev_tok_end = 0 170 | overlap_edit_ids = [] 171 | # edit = [start, end, cat, cor] 172 | for edit in edits: 173 | # Set cor to orig string if this is a detection edit 174 | if edit[3] == None: edit[3] = para.text[edit[0]:edit[1]] 175 | # Convert the character spans to token spans. 176 | span = convertCharToTok(edit[0], edit[1], tok_starts, tok_ends) 177 | # If chars do not map cleanly to tokens, extra processing is needed. 178 | if len(span) == 4: 179 | # Sometimes token expansion results in overlapping edits. Keep track of these. 180 | if span[0] < prev_tok_end: 181 | overlap_edit_ids.append(edits.index(edit)) 182 | continue 183 | # When span len is 4, span[2] and [3] are the new char spans. 184 | # Use these to expand the edit to match token boundaries. 185 | left = para.text[span[2]:edit[0]] 186 | right = para.text[edit[1]:span[3]] 187 | # Add this new info to cor.
188 | edit[3] = (left+edit[3]+right).strip() 189 | # Keep track of prev_tok_end 190 | prev_tok_end = span[1] 191 | # Change char span to tok span 192 | edit[0] = span[0] 193 | edit[1] = span[1] 194 | # Tokenise correction edits 195 | if edit[2] == "C": edit[3] = " ".join([tok.text for tok in nlp(edit[3].strip())]) 196 | # Set detection edits equal to the tokenised original 197 | elif edit[2] == "D": edit[3] = " ".join([tok.text for tok in para[edit[0]:edit[1]]]) 198 | # Finally remove any overlap token edits from the edit list (rare) 199 | for id in sorted(overlap_edit_ids, reverse=True): 200 | del edits[id] 201 | return edits 202 | 203 | 204 | # Input 1: A SpaCy original paragraph Doc object. 205 | # Input 2: A list of edits in that paragraph. 206 | # Output: A list of dictionaries. Each dict has 3 keys: orig, cor, edits 207 | # Sentences are split according to orig only. Edits map orig to cor. 208 | def getSents(orig, edits): 209 | sent_list = [] 210 | # Make sure spacy sentences end in punctuation where possible. 211 | orig_sents = [] 212 | start = 0 213 | for sent in orig.sents: 214 | # Only save sentence boundaries that end with punctuation or are paragraph final. 215 | if sent[-1].text[-1] in punctuation or sent.end == len(orig): 216 | orig_sents.append(orig[start:sent.end]) 217 | start = sent.end 218 | # If orig is 1 sentence, just return. 219 | if len(orig_sents) == 1: 220 | # Sents are list of tokens. Edits have cor spans added. 221 | orig, cor, edits = prepareSentEditsOutput(orig, edits) 222 | out_dict = {"orig": orig, 223 | "cor": cor, 224 | "edits": edits} 225 | sent_list.append(out_dict) 226 | # Otherwise, we need to split up the paragraph. 227 | else: 228 | # Keep track of processed edits (assumes ordered edit list) 229 | proc = 0 230 | # Keep track of diff between orig and cor sent based on applied edits. 231 | cor_offset = 0 232 | # Loop through the original sentences. 233 | for sent_id, orig_sent in enumerate(orig_sents): 234 | # Store valid edits here 235 | sent_edits = [] 236 | # Loop through unprocessed edits 237 | for edit in edits[proc:]: 238 | # edit = [orig_start, orig_end, cat, cor] 239 | # If edit starts inside the current sentence but ends outside it... 240 | if orig_sent.start <= edit[0] < orig_sent.end and edit[1] > orig_sent.end: 241 | # We cannot handle cross orig_sent edits, so just ignore them. 242 | # Update cor_offset and proc_cnt 243 | cor_offset = cor_offset-(edit[1]-edit[0])+len(edit[3].split()) 244 | proc += 1 245 | # If the edit starts before the last token and ends inside the sentence... 246 | elif orig_sent.start <= edit[0] < orig_sent.end and edit[1] <= orig_sent.end: 247 | # It definitely belongs to this sentence, so save it. 248 | # Update the token spans to reflect the new boundary 249 | edit[0] -= orig_sent.start # Orig_start 250 | edit[1] -= orig_sent.start # Orig_end 251 | # Update cor_offset and proc_cnt 252 | cor_offset = cor_offset-(edit[1]-edit[0])+len(edit[3].split()) 253 | proc += 1 254 | # Save the edit 255 | sent_edits.append(edit) 256 | # If the edit starts and ends after the last token.. 257 | elif edit[0] == edit[1] == orig_sent.end: 258 | # It could ambiguously belong to this, or the next sentence. 259 | # If this is the last sentence, the cor is null, or the last char in cor 260 | # is punct, then the edit belongs to the current sent. 
261 | if sent_id == len(orig_sents)-1 or not edit[3] or edit[3][-1] in punctuation: 262 | # Update the token spans to reflect the new boundary 263 | edit[0] -= orig_sent.start # Orig_start 264 | edit[1] -= orig_sent.start # Orig_end 265 | # Update cor_offset and proc_cnt 266 | cor_offset = cor_offset-(edit[1]-edit[0])+len(edit[3].split()) 267 | proc += 1 268 | # Save the edit 269 | sent_edits.append(edit) 270 | # In all other cases, edits likely belong to a different sentence. 271 | # Sents are list of tokens. Edits have cor spans added. 272 | orig_sent, cor_sent, sent_edits = prepareSentEditsOutput(orig_sent, sent_edits) 273 | # Save orig sent and edits 274 | out_dict = {"orig": orig_sent, 275 | "cor": cor_sent, 276 | "edits": sent_edits} 277 | sent_list.append(out_dict) 278 | return sent_list 279 | 280 | 281 | # Input 1: A tokenized original sentence. 282 | # Input 2: The edits in that sentence. 283 | # Output 1: The tokenized corrected sentence from these edits. 284 | # Output 2: The edits, now containing the tok span of cor_str in cor_sent. 285 | def prepareSentEditsOutput(orig, edits): 286 | orig = [tok.text for tok in orig] 287 | cor = orig[:] 288 | offset = 0 289 | for edit in edits: 290 | # edit = [orig_start, orig_end, cat, cor] 291 | cor_toks = edit[3].split() 292 | cor[edit[0]+offset:edit[1]+offset] = cor_toks 293 | cor_start = edit[0]+offset 294 | cor_end = cor_start+len(cor_toks) 295 | offset = offset-(edit[1]-edit[0])+len(cor_toks) 296 | # Save cor offset 297 | edit.extend([cor_start, cor_end]) 298 | return orig, cor, edits 299 | 300 | 301 | 302 | # Input 1: A char start position 303 | # Input 2: A char end position 304 | # Input 3: All the char token start positions in the paragraph 305 | # Input 4: All the char token end positions in the paragraph 306 | # Output: The char start and end position now in terms of tokens. 307 | def convertCharToTok(start, end, all_starts, all_ends): 308 | # If the start and end span is the same, the edit is an insertion. 309 | if start == end: 310 | # Special case: Pre-First token edits. 311 | if not start or start <= all_starts[0]: 312 | return [0, 0] 313 | # Special case: Post-Last token edits. 314 | elif start >= all_ends[-1]: 315 | return [len(all_starts), len(all_starts)] 316 | # General case 1: Edit starts at the beginning of a token. 317 | elif start in all_starts: 318 | return [all_starts.index(start), all_starts.index(start)] 319 | # General case 2: Edit starts at the end of a token. 320 | elif start in all_ends: 321 | return [all_ends.index(start)+1, all_ends.index(start)+1] 322 | # Problem case: Edit starts inside 1 token. 323 | else: 324 | # Expand character span to nearest token boundary. 325 | if start not in all_starts: 326 | start = all_starts[bisect(all_starts, start)-1] 327 | if end not in all_ends: 328 | end = all_ends[bisect(all_ends, end)] 329 | # Keep the new character spans as well 330 | return [all_starts.index(start), all_ends.index(end)+1, start, end] 331 | # Character spans match complete token spans. 332 | elif start in all_starts and end in all_ends: 333 | return [all_starts.index(start), all_ends.index(end)+1] 334 | # Character spans do NOT match complete token spans. 335 | else: 336 | # Expand character span to nearest token boundary. 337 | if start not in all_starts: 338 | start = all_starts[bisect(all_starts, start)-1] 339 | if end not in all_ends: 340 | nearest = bisect(all_ends, end) 341 | # Sometimes the end is a char after the last token. 342 | # In this case, just use the last tok boundary. 
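A small worked example of the span widening performed in this branch of convertCharToTok, using toy token boundaries rather than real FCE data:

    from bisect import bisect

    # Toy paragraph ["This", "is", "good"]: token character boundaries.
    all_starts, all_ends = [0, 5, 8], [4, 7, 12]
    start, end = 9, 11          # an edit that falls inside the token "good"

    # The same widening as convertCharToTok: snap to the nearest token boundary.
    start = all_starts[bisect(all_starts, start) - 1]    # -> 8
    end = all_ends[bisect(all_ends, end)]                # -> 12
    print([all_starts.index(start), all_ends.index(end) + 1, start, end])   # [2, 3, 8, 12]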
343 | if nearest >= len(all_ends): 344 | end = all_ends[-1] 345 | else: 346 | end = all_ends[bisect(all_ends, end)] 347 | # Keep the new character spans as well 348 | return [all_starts.index(start), all_ends.index(end)+1, start, end] 349 | 350 | ######################################### 351 | parser = argparse.ArgumentParser() 352 | parser.add_argument('--input_json', type=str, default="data/fce/json/fce-dev.json", 353 | help="input json file for GEC.") 354 | args = parser.parse_args() 355 | 356 | 357 | def main(): 358 | orig_sents = [] 359 | correct_sents = [] 360 | iii = 1 361 | with open(args.input_json) as data: 362 | for line in data: 363 | line = json.loads(line) 364 | print('iii: ', iii) 365 | iii += 1 366 | # Normalise certain punctuation in the text 367 | text = line["text"].translate(norm_dict) 368 | 369 | # Store the sentences and edits for all annotators here 370 | #coder_dict = {} 371 | # Loop through the annotator ids and their edits 372 | # Loop through the annotator ids and their edits 373 | for coder, edits in line["edits"]: 374 | # Add the coder to the coder_dict if needed 375 | #if coder not in coder_dict: coder_dict[coder] = [] 376 | # Split the essay into paragraphs and update and normalise the char edits 377 | para_info = getParas(text, edits, norm_dict) 378 | # Loop through the paragraphs and edits 379 | for orig_para, para_edits in para_info: 380 | # Remove unnecessary whitespace from para and update char edits 381 | orig_para, para_edits = cleanPara(orig_para, para_edits) 382 | if not orig_para: continue # Ignore empty paras 383 | # Annotate orig_para with spacy 384 | orig_para = nlp(orig_para) 385 | # Convert character edits to token edits 386 | para_edits = getTokenEdits(orig_para, para_edits, nlp) 387 | # Split the paragraph into sentences and update tok edits 388 | sents = getSents(orig_para, para_edits) 389 | orig_sents.extend([sent['orig'] for sent in sents]) 390 | correct_sents.extend([sent['cor'] for sent in sents]) 391 | # Save the sents in the coder_dict 392 | #coder_dict[coder].extend(sents) 393 | 394 | orig_file_name = args.input_json.split('.')[0] + '.orig' 395 | cor_file_name = args.input_json.split('.')[0] + '.corct' 396 | 397 | with open(orig_file_name, "w") as fp: 398 | for line in orig_sents: 399 | fp.write(' '.join(line)) 400 | fp.write("\n") 401 | 402 | with open(cor_file_name, "w") as fp: 403 | for line in correct_sents: 404 | fp.write(' '.join(line)) 405 | fp.write("\n") 406 | 407 | #result = [orig_sents, correct_sents] 408 | #with open("fce-train.p", "wb") as fp: 409 | # pickle.dump(result, fp) 410 | 411 | if __name__ == '__main__': 412 | main() 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | --------------------------------------------------------------------------------
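A hedged usage sketch for json2pair.py as a whole. With the default argument it reads the FCE JSON release (assumed to be present at the default path, which is not created by this script) and writes two aligned plain-text files next to it, one tokenised sentence per line:

    # Run from the repository root (shell):
    #   python json2pair.py --input_json data/fce/json/fce-dev.json
    # This writes data/fce/json/fce-dev.orig and data/fce/json/fce-dev.corct.

    # Reading the sentence pairs back, e.g. to build seq2seq training data:
    with open("data/fce/json/fce-dev.orig") as f_orig, \
         open("data/fce/json/fce-dev.corct") as f_cor:
        for orig_line, cor_line in zip(f_orig, f_cor):
            orig_toks = orig_line.split()
            cor_toks = cor_line.split()
            # each (orig_toks, cor_toks) pair is a source/target example for a GEC model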