├── .gitignore ├── README.md ├── config.py ├── data ├── __init__.py ├── conll14st-test-data │ ├── README │ ├── alt │ │ ├── alternative-teama.sgml │ │ ├── alternative-teamb.sgml │ │ ├── alternative-teamc.sgml │ │ └── official-2014.combined-withalt.m2 │ ├── noalt │ │ ├── official-2014.0.conll.ann │ │ ├── official-2014.0.m2 │ │ ├── official-2014.0.sgml │ │ ├── official-2014.1.conll.ann │ │ ├── official-2014.1.m2 │ │ ├── official-2014.1.sgml │ │ └── official-2014.combined.m2 │ └── scripts │ │ ├── README │ │ ├── iparser.py │ │ ├── nucle_doc.py │ │ ├── nuclesgmlparser.py │ │ ├── parser_feature.py │ │ ├── preprocess.py │ │ ├── preprocesscombine.py │ │ └── preprocesswithalt.py ├── eval.txt ├── release2.3.1 │ ├── README │ ├── m2scorer │ │ ├── LICENSE │ │ ├── README │ │ ├── example │ │ │ ├── README │ │ │ ├── source_gold │ │ │ └── system │ │ ├── m2scorer │ │ └── scripts │ │ │ ├── Tokenizer.py │ │ │ ├── combiner.py │ │ │ ├── convert_hoo.py │ │ │ ├── convert_nucle.py │ │ │ ├── levenshtein.py │ │ │ ├── m2scorer.py │ │ │ ├── nucle_doc.py │ │ │ ├── nuclesgmlparser.py │ │ │ ├── test.sgml │ │ │ ├── token_offsets.py │ │ │ └── util.py │ ├── original │ │ ├── data │ │ │ ├── official-preprocessed.conll │ │ │ ├── official-preprocessed.conll.ann │ │ │ ├── official-preprocessed.m2 │ │ │ └── official.sgml │ │ └── data_5types │ │ │ ├── official-preprocessed.5types.conll.ann │ │ │ ├── official-preprocessed.5types.m2 │ │ │ └── official.5types.sgml │ ├── revised │ │ ├── data │ │ │ ├── official-preprocessed.conll.ann │ │ │ ├── official-preprocessed.m2 │ │ │ └── official.sgml │ │ └── data_5types │ │ │ ├── alternatives.NTHU.sgml │ │ │ ├── alternatives.STEL.sgml │ │ │ ├── alternatives.TOR.sgml │ │ │ ├── alternatives.UIUC.sgml │ │ │ ├── alternatives.UMC.sgml │ │ │ ├── combined.5types.m2 │ │ │ ├── official-preprocessed.5types.conll.ann │ │ │ ├── official-preprocessed.5types.m2 │ │ │ └── official.5types.sgml │ └── scripts │ │ ├── README │ │ ├── iparser.py │ │ ├── nucle_doc.py │ │ ├── nuclesgmlparser.py │ │ ├── parser_feature.py │ │ ├── preprocess.py │ │ └── preprocesswithalt.py └── train.txt ├── eval.py ├── model.py ├── preprocess.py ├── seq2seq ├── __init__.py ├── seq2seq.py └── seq2seq_test.py ├── tensorboard_logger.py ├── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | checkpoints/ 3 | *.pyc 4 | .DS_Store 5 | graphs/ 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Text Corrector 2 | > Work in Progress 3 | ## Introduction 4 | This project aims to make a text corrector for English learners using deep neural net. This project is implemented in Pytorch 5 | - First trial: seq2seq w/ attention + nucle 2.3 dataset 6 | - Next.. 7 | - more data, data augmentations, ... 8 | - tweeking nets like Bytenet or Transformer 9 | - beam search 10 | 11 | ## Datasets 12 | [CoNLL-2013 Shared Task: Grammatical Error Correction](http://www.comp.nus.edu.sg/~nlp/conll13st.html) 13 | [Overview Paper](http://www.comp.nus.edu.sg/~nlp/conll13st/CoNLLST01.pdf) 14 | [Datasets](http://www.comp.nus.edu.sg/~nlp/conll13st/release2.3.1.tar.gz) 15 | [Participant Papers](http://aclweb.org/anthology/W/W13/#3600) 16 | 17 | [CoNLL-2014 Shared Task: Grammatical Error Correction](http://www.comp.nus.edu.sg/~nlp/conll14st.html) 18 | [Overview Paper](http://www.comp.nus.edu.sg/~nlp/conll14st.html) 19 | [Datasets] Needs a license form. 
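To make the "first trial" architecture concrete, below is a minimal, hypothetical PyTorch sketch of a seq2seq corrector with dot-product attention; the class name, dimensions, and smoke test are illustrative only and are not taken from `model.py` or `seq2seq/seq2seq.py`.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class Seq2SeqCorrector(nn.Module):
    """Illustrative encoder-decoder with dot-product attention (not the repo's actual model)."""
    def __init__(self, vocab_size, emb_dim=256, hid_dim=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.encoder = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.decoder = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.out = nn.Linear(hid_dim * 2, vocab_size)

    def forward(self, src, tgt):
        # src, tgt: (batch, seq_len) LongTensors of token ids; tgt is the teacher-forced input
        enc_out, hidden = self.encoder(self.embedding(src))      # enc_out: (B, S, H)
        dec_out, _ = self.decoder(self.embedding(tgt), hidden)   # dec_out: (B, T, H)
        scores = torch.bmm(dec_out, enc_out.transpose(1, 2))     # (B, T, S) attention scores
        context = torch.bmm(F.softmax(scores, dim=-1), enc_out)  # (B, T, H) attended context
        return self.out(torch.cat([dec_out, context], dim=-1))   # (B, T, vocab_size) logits

# smoke test with random token ids
model = Seq2SeqCorrector(vocab_size=1000)
src = torch.randint(0, 1000, (4, 12))   # "erroneous" source sentences
tgt = torch.randint(0, 1000, (4, 12))   # corrected target sentences
loss = F.cross_entropy(model(src, tgt).reshape(-1, 1000), tgt.reshape(-1))
```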
20 | 21 | ## References 22 | https://github.com/atpaino/deep-text-corrector 23 | 24 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | class Config: 2 | use_cuda = True 3 | max_seq_length = 100 4 | train_data_path = './data/train.txt' 5 | eval_data_path = './data/eval.txt' -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andabi/deep-text-corrector/69bd711e65cc42364becba5efd99b8d4f8ab0aab/data/__init__.py -------------------------------------------------------------------------------- /data/conll14st-test-data/README: -------------------------------------------------------------------------------- 1 | CoNLL-2014 Official Test Data 2 | Release 3.2 3 | 22 Apr 2014 4 | 5 | This README file describes the test data for the CoNLL-2014 Shared 6 | Task: Grammatical Error Correction. 7 | 8 | The package is distributed freely with the following copyright 9 | 10 | Copyright (C) 2014 Hwee Tou Ng, Siew Mei Wu, Ted Briscoe, 11 | Christian Hadiwinoto, Raymond Hendy Susanto, 12 | Christopher Bryant 13 | 14 | Any questions regarding the test data should be directed to 15 | Hwee Tou Ng at: nght@comp.nus.edu.sg 16 | 17 | 18 | 1. Directory Structure and Contents 19 | =================================== 20 | 21 | The top-level directory has two subdirectories, namely 22 | 23 | - noalt/ : the annotated test data without alternatives contributed 24 | by the participants 25 | - alt/ : the annotated test data with moderated participants' 26 | alternative annotations 27 | - scripts/ : the scripts used to preprocess the test data inside the 28 | two subdirectories 29 | 30 | 31 | 2. Data Format 32 | ============== 33 | 34 | The corpus is distributed in a simple SGML format. All annotations 35 | come in a "stand-off" format. The start position and end position of 36 | an annotation are given by paragraph and character offsets. 37 | Paragraphs are enclosed in

<P> ... </P>
tags. Paragraphs and characters 38 | are counted starting from zero. Each annotation includes the following 39 | fields: the error category, the correction, and optionally a 40 | comment. If the correction replaces the original text at the given 41 | location, it should fix the grammatical error. 42 | 43 | Example: 44 | 45 | 46 | 47 |

<DOC nid="...">
<TEXT>
<P>
People with close blood relationship generally ...
</P>
<P>
Focus on the negative side of the annouance ...
</P>
...
</TEXT>
<ANNOTATION teacher_id="...">
<MISTAKE start_par="..." start_off="..." end_par="..." end_off="...">
<TYPE>Nn</TYPE>
<CORRECTION>relationships</CORRECTION>
</MISTAKE>
...
</ANNOTATION>
</DOC>
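As an illustrative sketch (the function name is hypothetical, not part of this distribution), an annotation in this stand-off format can be applied to its paragraph using only the zero-based offsets described above:

    # Illustrative only: apply a single <MISTAKE> annotation to its paragraph.
    # Handles the common case where the annotation stays within one paragraph.
    def apply_mistake(paragraphs, start_par, start_off, end_par, end_off, correction):
        assert start_par == end_par
        p = paragraphs[start_par]
        return p[:start_off] + correction + p[end_off:]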
63 | 64 | ... 65 | 66 | Below is a complete list of the error categories in the noalt/ and alt/ 67 | subdirectories: 68 | 69 | ERROR TAG ERROR CATEGORY 70 | --------------------------- 71 | Vt Verb tense 72 | Vm Verb modal 73 | V0 Missing verb 74 | Vform Verb form 75 | SVA Subject-verb-agreement 76 | ArtOrDet Article or Determiner 77 | Nn Noun number 78 | Npos Noun possesive 79 | Pform Pronoun form 80 | Pref Pronoun reference 81 | Prep Preposition 82 | Wci Wrong collocation/idiom 83 | Wa Acronyms 84 | Wform Word form 85 | Wtone Tone 86 | Srun Runons, comma splice 87 | Smod Dangling modifier 88 | Spar Parallelism 89 | Sfrag Fragment 90 | Ssub Subordinate clause 91 | WOinc Incorrect sentence form 92 | WOadv Adverb/adjective position 93 | Trans Link word/phrases 94 | Mec Punctuation, capitalization, spelling, typos 95 | Rloc- Local redundancy 96 | Cit Citation 97 | Others Other errors 98 | Um Unclear meaning (cannot be corrected) 99 | 100 | The official annotation file contains all the default annotations to 101 | make the whole text correct. Meanwhile, each of the alternative 102 | annotation files contains only annotations for sentences that can be 103 | corrected in a different way, i.e. sentences that have alternative 104 | annotations. If according to an alternative, a sentence can remain 105 | unchanged, a special tag "noop" is used for that particular sentence. 106 | 107 | 108 | 3. Updates included in version 2.1 109 | ================================== 110 | 111 | The major change made in version 2.1 is to map the past error 112 | categories Wcip and Rloc to Prep, Wci, ArtOrDet, and Rloc-. 113 | 114 | In the original data, there is no explicit preposition error 115 | category. Instead, preposition errors are part of the Wcip (Wrong 116 | collocation/idiom/preposition) and Rloc (local redundancy) error 117 | categories. In addition, redundant article or determiner errors are 118 | part of the Rloc error category. 119 | 120 | 121 | 4. Updates included in version 2.2 122 | ================================== 123 | 124 | - Fixed the bug on expanding an error annotation involving part of a 125 | token to the full token. 126 | 127 | - Other miscellaneous corrections were made. 128 | 129 | 130 | 5. Updates included in version 2.3 131 | ================================== 132 | 133 | - Fixed the bug involving tokenization of punctuation symbols in the 134 | correction string. 135 | 136 | - Fixed the tokenization example in the README file of the M^2 scorer 137 | to reflect the real tokenization to be used and removed irrelevant 138 | codes from the scorer package. 139 | 140 | 141 | 6. Updates included in version 3.0 142 | ================================== 143 | 144 | - Resolved overlapping annotations in the NUCLE corpus to make them 145 | non-overlapping. 146 | 147 | - Corrected some minor mistakes in error annotations. 148 | 149 | 150 | 7. Updates included in version 3.1 151 | ================================== 152 | 153 | - Removed duplicate annotations in the NUCLE corpus with the same span 154 | and correction string but different error type so as to keep only one of 155 | those annotations. This fix only affects 0.1% of all annotations. 156 | 157 | - Fixed end-of-paragraph annotations so that the end offset of such 158 | annotations is the last character position in the paragraph. This fix 159 | only affects 0.7% of all annotations. 160 | 161 | - Corrected some minor mistakes in error annotations. 
162 | 163 | - Inclusion of the CoNLL-2013 test data, with all the known problems 164 | described above fixed. Participating teams in the CoNLL-2014 shared 165 | task can make use of the CoNLL-2013 test data in training and 166 | developing their systems if they wish to do so. 167 | 168 | - Fixed a minor bug in the M2 scorer that caused duplicate insertion 169 | edits to receive high scores. 170 | 171 | 172 | 8. Updates included in version 3.2 173 | ================================== 174 | 175 | - Fixed the preprocessing script such that a gold edit that inserts an 176 | empty string is not included in the token-level gold edit and scorer 177 | answer files. 178 | 179 | - Removed one edit that inserted an empty string from the CoNLL-2014 180 | test data. Also removed such instances from the NUCLE training data. 181 | 182 | - Fixed a bug in the M2 scorer arising from scoring against gold edits 183 | from multiple annotators. Specifically, the bug sometimes caused 184 | incorrect scores to be reported when scoring against the gold edits 185 | of subsequent annotators (other than the first annotator). 186 | 187 | - Fixed a bug in the M2 scorer that caused erroneous scores to be 188 | reported when dealing with insertion edits followed by deletion edits 189 | (or vice versa). 190 | -------------------------------------------------------------------------------- /data/conll14st-test-data/scripts/README: -------------------------------------------------------------------------------- 1 | ==================================================== 2 | 3 | CoNLL-2014 Shared Task: Grammatical Error Correction 4 | 5 | Description of Data Preprocessing Scripts 6 | 7 | 22 Apr 2014 Version 3.2 8 | ==================================================== 9 | 10 | 11 | Table of Contents 12 | ================= 13 | 14 | 1. General 15 | 2. Pre-requisites 16 | 3. Usage 17 | 18 | 1. General 19 | ========== 20 | 21 | This README file describes the usage of scripts for preprocessing the NUCLE version 3.2 corpus. 22 | 23 | Quickstart: 24 | 25 | a. Regenerate the preprocessed files with full syntactic information: 26 | % python preprocess.py -o nucle.sgml conllFileName annFileName m2FileName 27 | 28 | b. Get tokenized annotations without syntactic information: 29 | % python preprocess.py -l nucle.sgml conllFileName annFileName m2FileName 30 | 31 | c. Get combined gold-standard answer file (without alternative): 32 | % python preprocesscombine.py gold1.sgml gold2.sgml combinedAnswer 33 | 34 | d. Get combined gold-standard answer file (with alternative answers): 35 | % python preprocesswithalt.py essay.sgml 2 gold1.sgml gold2.sgml alt1.sgml alt2.sgml alt3.sgml combinedAnsWithAlt 36 | 37 | where 38 | nucle.sgml - input SGML file 39 | conllFileName - output file that contains pre-processed sentences in CoNLL format. 40 | annFileName - output file that contains standoff error annotations. 41 | m2FileName - output file that contains error annotations in the M2 scorer format. 42 | gold1.sgml - input SGML file that contains the gold edits of the first annotator. 43 | gold2.sgml - input SGML file that contains the gold edits of the second annotator. 44 | combinedAnswer - output file that contains gold edits in the M2 scorer format 45 | combining the gold edits of the first and second annotator. 46 | alt1.sgml - input SGML file that contains alternative edits by the first team. 47 | alt2.sgml - input SGML file that contains alternative edits by the second team. 
48 | alt3.sgml - input SGML file that contains alternative edits by the third team. 49 | combinedAnsWithAlt - output file that contains gold edits in the M2 scorer format 50 | combining the gold edits of the first and second annotator. 51 | 52 | 2. Pre-requisites 53 | ================= 54 | 55 | + Python (2.6.4, other versions >= 2.6.4, < 3.0 might work but are not tested) 56 | + nltk (http://www.nltk.org, version 2.0b7, needed for sentence splitting and word tokenization) 57 | + Stanford parser (version 2.0.1, http://nlp.stanford.edu/software/stanford-parser-2012-03-09.tgz) 58 | 59 | If you only use the scripts to generate error annotations needed by the M2 scorer, Stanford parser is not required. 60 | Otherwise, "stanford-parser-2012-03-09" need to be in the same directory as "scripts". 61 | 62 | 3. Usage 63 | ======== 64 | 65 | Preprocessing the data from single annotation 66 | 67 | Usage: python preprocess.py OPTIONS sgmlFileName conllFileName annotationFileName m2FileName 68 | 69 | Where 70 | sgmlFileName - NUCLE SGML file 71 | conllFileName - output file name for pre-processed sentences in CoNLL format (e.g., conll14st-preprocessed.conll). 72 | annotationFileName - output file name for error annotations (e.g., conll14st-preprocessed.conll.ann). 73 | m2FileName - output file name in the M2 scorer format (e.g., conll14st-preprocessed.conll.m2). 74 | 75 | OPTIONS 76 | -o - output will contain POS tags and parse tree info (i.e., the same as the released preprocessed file, runs slowly). 77 | -l - output will NOT contain POS tags and parse tree info (runs quickly). 78 | 79 | 80 | Combining alternative annotations by multiple annotators 81 | 82 | Usage: python preprocesscombine.py sgmlFileName1 ... sgmlFileNameN m2FileName 83 | 84 | Where 85 | sgmlFileName1 - test data SGML file containing the gold edits of annotator 1 86 | sgmlFileNameN - test data SGML file containing the gold edits of annotator N 87 | m2FileName - output file in the M2 scorer format, containing annotations by N annotators. 88 | 89 | e.g., python preprocesscombine.py official-2014.0.sgml official-2014.1.sgml official-2014.combined.m2 90 | 91 | will generate official-2014.combined.m2 from alternative annotations by 2 annotators. 92 | 93 | 94 | Combining alternative annotations by multiple main annotators with annotations proposed by participants 95 | 96 | Usage: python preprocesswithalt.py essaySgmlFile M annotSgmlFile1 ... annotSgmlFileM alt1SgmlFileName ... 
altNSgmlFileName combM2FileName 97 | 98 | Where 99 | essaySgmlFile - test data SGML file containing essay body, not necessarily annotations 100 | M - number of main annotations 101 | annotSgmlFile1 - test data SGML file containing the gold edits of main annotator 1 102 | annotSgmlFileM - test data SGML file containing the gold edits of main annotator M 103 | alt1SgmlFileName - the alternative annotation SGML file proposed by team 1 (first), containing only annotations that differ from the main annotation 104 | altNSgmlFileName - the alternative annotation SGML file proposed by team N (last), containing only annotations that differ from the main annotation 105 | combM2FileName - output file name in the M2 scorer format, containing combination of main and alternative annotations 106 | 107 | e.g., python preprocesswithalt official-2014.0.sgml 2 official-2014.0.sgml official-2014.1.sgml alternative-teama.sgml alternative-teamb.sgml alternative-teamc.sgml official-2014.combined-withalt.m2 108 | -------------------------------------------------------------------------------- /data/conll14st-test-data/scripts/iparser.py: -------------------------------------------------------------------------------- 1 | # iparser.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 12 | 13 | import os 14 | import sys 15 | 16 | class stanfordparser: 17 | 18 | def __init__(self): 19 | pass 20 | 21 | def parse_batch(self, sentenceDumpedFileName, parsingDumpedFileName): 22 | 23 | if os.path.exists('../stanford-parser-2012-03-09') == False: 24 | print >> sys.stderr, 'can not find Stanford parser directory' 25 | sys.exit(1) 26 | 27 | # tokenized 28 | cmd = r'java -server -mx4096m -cp "../stanford-parser-2012-03-09/*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser -retainTMPSubcategories -sentences newline -tokenized -escaper edu.stanford.nlp.process.PTBEscapingProcessor -outputFormat "wordsAndTags, penn, typedDependencies" -outputFormatOptions "basicDependencies" edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ' + sentenceDumpedFileName 29 | 30 | r = os.popen(cmd).read().strip().decode('utf-8') 31 | f = open(parsingDumpedFileName, 'w') 32 | f.write(r.encode('utf-8')) 33 | f.close() 34 | 35 | rlist = r.replace('\n\n\n', '\n\n\n\n').split('\n\n') 36 | return rlist 37 | -------------------------------------------------------------------------------- /data/conll14st-test-data/scripts/nucle_doc.py: -------------------------------------------------------------------------------- 1 | # nucle_doc.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 
12 | 13 | import os 14 | import sys 15 | from nltk import word_tokenize 16 | 17 | class nucle_doc: 18 | def __init__(self): 19 | self.docattrs = None 20 | 21 | self.matric = '' 22 | self.email = '' 23 | self.nationality = '' 24 | self.firstLanguage = '' 25 | self.schoolLanguage = '' 26 | self.englishTests = '' 27 | 28 | self.paragraphs = [] 29 | self.annotation = [] 30 | self.mistakes = [] 31 | 32 | self.sentences = [] 33 | 34 | def buildSentence(self, sentstr, dpnode, constituentstr, poslist, chunklist): 35 | self.sentences[-1].append(nucle_sent(sentstr, dpnode, constituentstr, poslist, chunklist)) 36 | 37 | def addSentence(self, sent): 38 | self.sentences[-1].append(sent) 39 | 40 | def findMistake(self, par, pos): 41 | for m in self.mistakes: 42 | if par == m['start_par'] and pos >= m['start_off'] and pos < m['end_off']: 43 | return m 44 | return None 45 | 46 | 47 | class nucle_sent: 48 | def __init__(self, sentstr, dpnode, constituentstr, poslist, chunklist): 49 | self.sentstr = sentstr 50 | self.words = word_tokenize(sentstr) 51 | self.dpnodes = dpnode 52 | self.constituentstr = constituentstr 53 | self.constituentlist = [] 54 | self.poslist = poslist 55 | self.chunklist = chunklist 56 | 57 | def buildConstituentList(self): 58 | 59 | s = self.constituentstr.strip().replace('\n', '').replace(' ', '') 60 | r = [] 61 | i = 0 62 | while i < len(s): 63 | j = i 64 | while j < len(s) and s[j] != ')': 65 | j += 1 66 | k = j 67 | while k < len(s) and s[k] == ')': 68 | k += 1 69 | 70 | nodeWholeStr = s[i:k] 71 | lastLRBIndex = nodeWholeStr.rfind('(') 72 | nodeStr = nodeWholeStr[:lastLRBIndex] + '*' + s[j+1:k] 73 | 74 | r.append(nodeStr) 75 | i = k 76 | 77 | if len(r) != len(self.words): 78 | print >> sys.stderr, 'Error in buiding constituent tree bits: different length with words.' 79 | print >> sys.stderr, len(r), len(self.words) 80 | print >> sys.stderr, ' '.join(r).encode('utf-8') 81 | print >> sys.stderr, words 82 | sys.exit(1) 83 | 84 | self.constituentlist = r 85 | 86 | 87 | 88 | def setDpNode(self, dpnode): 89 | self.dpnodes = dpnode 90 | 91 | def setPOSList(self, poslist): 92 | self.poslist = poslist 93 | 94 | def setConstituentStr(self, constituentstr): 95 | self.constituentstr = constituentstr 96 | 97 | def setConstituentList(self, constituentlist): 98 | self.constituentlist = constituentlist 99 | 100 | def setWords(self, words): 101 | self.words = words 102 | 103 | def setChunkList(self, chunklist): 104 | self.chunklist = chunklist 105 | 106 | def getDpNode(self): 107 | return self.dpnodes 108 | 109 | def getPOSList(self): 110 | return self.poslist 111 | 112 | def getConstituentStr(self): 113 | return self.constituentstr 114 | 115 | def getConstituentList(self): 116 | return self.constituentlist 117 | 118 | def getWords(self): 119 | return self.words 120 | 121 | def getChunkList(self): 122 | return self.chunklist 123 | 124 | def getConllFormat(self, doc, paragraphIndex, sentIndex): 125 | 126 | table = [] 127 | 128 | dpnodes = self.getDpNode() 129 | poslist = self.getPOSList() 130 | #chunklist = self.getChunkList() 131 | words = self.getWords() 132 | constituentlist = self.getConstituentList() 133 | 134 | if len(poslist) == 0: 135 | hasParseInfo = 0 136 | else: 137 | hasParseInfo = 1 138 | 139 | if len(words) != len(poslist) and len(poslist) != 0: 140 | print >> sys.stderr, 'Error in buiding Conll Format: different length stanford parser postags and words.' 
141 | print >> sys.stderr, 'len words:', len(words), words 142 | print >> sys.stderr, 'len poslist:', len(poslist), poslist 143 | sys.exit(1) 144 | 145 | for wdindex in xrange(len(words)): 146 | 147 | word = words[wdindex] 148 | 149 | row = [] 150 | row.append(doc.docattrs[0][1]) #docinfo 151 | row.append(paragraphIndex) #paragraph index 152 | row.append(sentIndex) #paragraph index 153 | row.append(wdindex) #word index 154 | row.append(word) #word 155 | 156 | #row.append(chunknode.label) #chunk 157 | if hasParseInfo == 1: 158 | 159 | posword = poslist[wdindex] 160 | splitp = posword.rfind('/') 161 | pos = posword[splitp+1 : ].strip() 162 | 163 | #chunknode = chunklist[wdindex] 164 | 165 | constituentnode = constituentlist[wdindex] 166 | 167 | dpnode = None 168 | for d in dpnodes: 169 | if d.index == wdindex: 170 | dpnode = d 171 | break 172 | 173 | row.append(pos) #POS 174 | if dpnode == None: 175 | row.append('-') 176 | row.append('-') 177 | else: 178 | row.append(dpnode.parent_index) #dp parent 179 | row.append(dpnode.grammarrole) #dp label 180 | row.append(constituentnode) #constituent 181 | 182 | table.append(row) 183 | 184 | return table 185 | 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /data/conll14st-test-data/scripts/nuclesgmlparser.py: -------------------------------------------------------------------------------- 1 | # nuclesgmlparser.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 12 | 13 | from sgmllib import SGMLParser 14 | from nucle_doc import nucle_doc 15 | 16 | 17 | class nuclesgmlparser(SGMLParser): 18 | def __init__(self): 19 | SGMLParser.__init__(self) 20 | self.docs = [] 21 | 22 | def reset(self): 23 | self.docs = [] 24 | self.data = [] 25 | SGMLParser.reset(self) 26 | 27 | def unknow_starttag(self, tag, attrs): 28 | pass 29 | 30 | def unknow_endtag(self): 31 | pass 32 | 33 | def start_doc(self, attrs): 34 | self.docs.append(nucle_doc()) 35 | self.docs[-1].docattrs = attrs 36 | 37 | def end_doc(self): 38 | pass 39 | 40 | def start_matric(self, attrs): 41 | pass 42 | 43 | def end_matric(self): 44 | self.docs[-1].matric = ''.join(self.data) 45 | self.data = [] 46 | pass 47 | 48 | def start_email(self, attrs): 49 | pass 50 | 51 | def end_email(self): 52 | self.docs[-1].email = ''.join(self.data) 53 | self.data = [] 54 | pass 55 | 56 | def start_nationality(self, attrs): 57 | pass 58 | 59 | def end_nationality(self): 60 | self.docs[-1].nationality = ''.join(self.data) 61 | self.data = [] 62 | pass 63 | 64 | def start_first_language(self, attrs): 65 | pass 66 | 67 | def end_first_language(self): 68 | self.docs[-1].firstLanguage = ''.join(self.data) 69 | self.data = [] 70 | pass 71 | 72 | def start_school_language(self, attrs): 73 | pass 74 | 75 | def end_school_language(self): 76 | self.docs[-1].schoolLanguage = ''.join(self.data) 77 | self.data = [] 78 | pass 79 | 80 | def start_english_tests(self, attrs): 81 | pass 82 | 83 | def end_english_tests(self): 84 | self.docs[-1].englishTests = ''.join(self.data) 85 | self.data = [] 86 | pass 87 | 88 | 89 | def start_text(self, attrs): 90 | pass 91 | 92 | def end_text(self): 93 | pass 94 | 95 | def start_title(self, attrs): 96 | pass 97 | 98 | def end_title(self): 99 | self.docs[-1].paragraphs.append(''.join(self.data)) 
100 | self.data = [] 101 | pass 102 | 103 | 104 | def start_p(self, attrs): 105 | pass 106 | 107 | def end_p(self): 108 | self.docs[-1].paragraphs.append(''.join(self.data)) 109 | self.data = [] 110 | pass 111 | 112 | 113 | def start_annotation(self, attrs): 114 | self.docs[-1].annotation.append(attrs) 115 | 116 | def end_annotation(self): 117 | pass 118 | 119 | def start_mistake(self, attrs): 120 | d = {} 121 | for t in attrs: 122 | d[t[0]] = int(t[1]) 123 | self.docs[-1].mistakes.append(d) 124 | pass 125 | 126 | def end_mistake(self): 127 | pass 128 | 129 | def start_type(self, attrs): 130 | pass 131 | 132 | def end_type(self): 133 | self.docs[-1].mistakes[-1]['type'] = ''.join(self.data) 134 | self.data = [] 135 | 136 | def start_correction(self, attrs): 137 | pass 138 | 139 | def end_correction(self): 140 | self.docs[-1].mistakes[-1]['correction'] = ''.join(self.data) 141 | self.data = [] 142 | 143 | def start_comment(self, attrs): 144 | pass 145 | 146 | def end_comment(self): 147 | self.docs[-1].mistakes[-1]['comment'] = ''.join( self.data) 148 | self.data = [] 149 | 150 | 151 | def handle_charref(self, ref): 152 | self.data.append('&' + ref) 153 | 154 | def handle_entityref(self, ref): 155 | self.data.append('&' + ref) 156 | 157 | def handle_data(self, text): 158 | if text.strip() == '': 159 | self.data.append('') 160 | return 161 | else: 162 | if text.startswith('\n'): 163 | text = text[1:] 164 | if text.endswith('\n'): 165 | text = text[:-1] 166 | self.data.append(text) 167 | 168 | 169 | -------------------------------------------------------------------------------- /data/conll14st-test-data/scripts/parser_feature.py: -------------------------------------------------------------------------------- 1 | # parser_feature.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 
12 | 13 | 14 | 15 | import iparser 16 | 17 | class stanpartreenode: 18 | def __init__(self, strnode): 19 | 20 | if strnode == '': 21 | self.grammarrole = '' 22 | self.parent_index = -1 23 | self.index = -1 24 | self.parent_word = '' 25 | self.word = '' 26 | self.POS = '' 27 | return 28 | 29 | groleend = strnode.find('(') 30 | self.grammarrole = strnode[ : groleend] 31 | content = strnode[groleend + 1: len(strnode)-1] 32 | dadAndme = content.partition(', ') 33 | dad = dadAndme[0] 34 | me = dadAndme[2] 35 | dadsep = dad.rfind('-') 36 | mesep = me.rfind('-') 37 | self.parent_index = int(dad[dadsep + 1 : ]) - 1 38 | self.parent_word = dad[0 : dadsep] 39 | self.index = int(me[mesep + 1 : ]) - 1 40 | self.word = me[0 : mesep] 41 | self.POS = '' 42 | 43 | 44 | def DependTree_Batch(sentenceDumpedFileName, parsingDumpedFileName): 45 | 46 | sparser = iparser.stanfordparser() 47 | results = sparser.parse_batch(sentenceDumpedFileName, parsingDumpedFileName) 48 | nodeslist = [] 49 | 50 | k = 0 51 | while k < len(results): 52 | PoSlist = results[k].split(' ') 53 | constituentstr = results[k+1] 54 | table = results[k+2].split('\n') 55 | nodes = [] 56 | for i in range(0, len(table)): 57 | nodes.append( stanpartreenode(table[i]) ) 58 | nodeslist.append((nodes, constituentstr, PoSlist)) 59 | k += 3 60 | return nodeslist 61 | 62 | def DependTree_Batch_Parsefile(parsingDumpedFileName): 63 | 64 | f = open(parsingDumpedFileName, 'r') 65 | results = f.read().decode('utf-8').replace('\n\n\n', '\n\n\n\n').split('\n\n') 66 | f.close() 67 | nodeslist = [] 68 | 69 | k = 0 70 | while k < len(results): 71 | PoSlist = results[k].split(' ') 72 | constituentstr = results[k+1] 73 | table = results[k+2].split('\n') 74 | 75 | nodes = [] 76 | for i in range(0, len(table)): 77 | nodes.append( stanpartreenode(table[i]) ) 78 | nodeslist.append((nodes, constituentstr, PoSlist)) 79 | k += 3 80 | return nodeslist 81 | -------------------------------------------------------------------------------- /data/conll14st-test-data/scripts/preprocesscombine.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # preprocesscombine.py 4 | # 5 | # Author: Christian Hadiwinoto 6 | # National University of Singapore (NUS) 7 | # Date: 22 Apr 2014 8 | # Version: 1.0 9 | # 10 | # Contact: chrhad@comp.nus.edu.sg 11 | # 12 | # This script is distributed to support the CoNLL-2013 Shared Task. 13 | # It is free for research and educational purposes. 14 | # 15 | # Usage: python preprocesswithalt.py essaySgmlFileName mainSgmlFileName alt1SgmlFileName ... altNSgmlFileName m2FileName 16 | # 17 | 18 | 19 | import parser_feature 20 | from nuclesgmlparser import nuclesgmlparser 21 | from nucle_doc import * 22 | import nltk.data 23 | from nltk import word_tokenize 24 | from operator import itemgetter 25 | import cPickle as pickle 26 | import re 27 | import sys 28 | import os 29 | 30 | getEditKey = itemgetter(0, 1, 2, 3, 4) 31 | 32 | sentenceTokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 33 | sentenceDumpedFile = 'sentence_file' 34 | docsDumpedFileName = 'docs' 35 | parsingDumpedFileName = 'parse_file' 36 | 37 | def readNUCLE(fn): 38 | 39 | f = open(fn, 'r') 40 | parser = nuclesgmlparser() 41 | filestr = f.read() 42 | filestr = filestr.decode('utf-8') 43 | 44 | #Fix Reference tag 45 | p = re.compile(r'(\n

\n.*\n)

') 46 | filestr = p.sub(r'\1

', filestr) 47 | 48 | parser.feed(filestr) 49 | f.close() 50 | parser.close() 51 | 52 | return parser.docs 53 | 54 | def sentenceSplit(docs): 55 | 56 | sentenceTokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 57 | for doc in docs: 58 | for par in doc.paragraphs: 59 | doc.sentences.append([]) 60 | for s in sentenceTokenizer.tokenize(par): 61 | doc.buildSentence(s, [], '', [], []) 62 | return docs 63 | 64 | def compareTwoEditLists(editList1, editList2): 65 | # must be sorted 66 | if editList1 == [] and editList2 == []: 67 | return True 68 | elif editList1 == [] or editList2 == []: 69 | return False 70 | elif getEditKey(editList1[0]) != getEditKey(editList2[0]): 71 | return False 72 | else: 73 | return compareTwoEditLists(editList1[1:], editList2[1:]) 74 | 75 | def moderateAnnotations(contestDocs, annotBoard, origDocSet): 76 | # moderate annotation in "contesting" docs with already stated mistakes 77 | #mistakeStrSet = {} 78 | for doc in contestDocs: 79 | #mistakeStr = '' 80 | nid = int(doc.docattrs[0][1]) # nid of current document 81 | tid = doc.annotation[0][0][1] # teacher id 82 | 83 | if not annotBoard.has_key(nid): # create placeholder 84 | annotBoard[nid] = {} 85 | 86 | origDoc = origDocSet[nid] 87 | for pid in xrange(len(origDoc.sentences)): 88 | slist = origDoc.sentences[pid] 89 | if not annotBoard[nid].has_key(pid): 90 | annotBoard[nid][pid] = {} 91 | for sentid in xrange(len(slist)): 92 | sent = slist[sentid] 93 | if not annotBoard[nid][pid].has_key(sentid): 94 | annotBoard[nid][pid][sentid] = [] 95 | editSet = [] 96 | 97 | # enumerate mistakes 98 | sentoffset = origDoc.paragraphs[pid].index(sent.sentstr) 99 | editNum = 0 100 | for m in doc.mistakes: 101 | if m['start_par'] != pid or \ 102 | m['start_par'] != m['end_par'] or \ 103 | m['start_off'] < sentoffset or \ 104 | m['start_off'] >= sentoffset + len(sent.sentstr) or \ 105 | m['end_off'] sentoffset + len(sent.sentstr): 107 | continue 108 | 109 | #if m['type'] != 'noop': 110 | editSet.append((m['start_par'], m['end_par'], m['start_off'], m['end_off'], m['correction'], m['type'])) 111 | #editNum += 1 112 | #else: 113 | #editSet.append((m['start_par'], m['end_par'], m['start_off'], m['end_off'], sent.sentstr, m['type'])) 114 | 115 | editSet = sorted(editSet, key=itemgetter(0, 1, 2, 3)) 116 | 117 | # find the same annotation 118 | foundMatch = False 119 | i = 0 120 | boardEdits = annotBoard[nid][pid][sentid] 121 | while i < len(boardEdits) and not foundMatch: 122 | if compareTwoEditLists(editSet, boardEdits[i]): 123 | foundMatch = True 124 | else: 125 | i+=1 126 | 127 | if not foundMatch: 128 | annotBoard[nid][pid][sentid].append(editSet) 129 | 130 | return annotBoard 131 | 132 | def createM2File(origDocs, mistakesBoard, m2FileName): 133 | 134 | fm2 = open(m2FileName, 'w') 135 | 136 | for doc in origDocs: 137 | nid = int(doc.docattrs[0][1]) # nid of current document 138 | for slistIndex in xrange(len(doc.sentences)): 139 | slist = doc.sentences[slistIndex] 140 | for sentid in xrange(len(slist)): 141 | 142 | sent = slist[sentid] 143 | 144 | # m2 format annotation string list 145 | m2AnnotationList = [] 146 | 147 | # build colums 148 | table = sent.getConllFormat(doc, slistIndex, sentid) 149 | tokenizedSentStr = ' '.join(sent.getWords()) 150 | 151 | #Add annotation info 152 | sentoffset = doc.paragraphs[slistIndex].index(sent.sentstr) 153 | 154 | i = 0 155 | board = mistakesBoard[nid][slistIndex][sentid] 156 | for mistakesList in board: 157 | if len(mistakesList) == 0 and len(board) > 1: 158 | m2AnnotationList.append('A 
-1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||' + str(i) + '\n') 159 | i += 1 160 | 161 | for tuple in mistakesList: 162 | m = {} 163 | m['start_par'] = tuple[0] 164 | m['end_par'] = tuple[1] 165 | m['start_off'] = tuple[2] 166 | m['end_off'] = tuple[3] 167 | m['correction'] = tuple[4] 168 | m['type'] = tuple[5] 169 | 170 | if m['start_par'] != slistIndex or \ 171 | m['start_par'] != m['end_par'] or \ 172 | m['start_off'] < sentoffset or \ 173 | m['start_off'] >= sentoffset + len(sent.sentstr) or \ 174 | m['end_off'] sentoffset + len(sent.sentstr): 176 | continue 177 | 178 | wordsoffset = 0 179 | wdstart = 0 180 | 181 | startInWord = 0 182 | headText = '' 183 | endInWord = 0 184 | tailText = '' 185 | 186 | words = sent.getWords() 187 | while wdstart < len(words): 188 | 189 | word = words[wdstart] 190 | nextstart = sent.sentstr.find(word, wordsoffset) 191 | 192 | if nextstart == -1: 193 | # may not find word, due to relpacement 194 | print >> sys.stderr, "Warning in building conll format: can not find word" 195 | print >> sys.stderr, word.encode('utf-8') 196 | wordsoffset += 1 197 | else: 198 | wordsoffset = nextstart 199 | 200 | if wordsoffset >= m['start_off']-sentoffset: 201 | break 202 | elif wordsoffset + len(word) > m['start_off']-sentoffset: 203 | # annotation starts at the middle of a word 204 | startInWord = 1 205 | headText = sent.sentstr[wordsoffset: m['start_off']-sentoffset] 206 | break 207 | 208 | wordsoffset += len(word) 209 | wdstart += 1 210 | 211 | if wdstart == len(words): 212 | print >> sys.stderr, 'Warning in building conll format: start_off overflow' 213 | print >> sys.stderr, m, sent.sentstr.encode('utf-8') 214 | continue 215 | 216 | 217 | wdend = wdstart 218 | while wdend < len(words): 219 | 220 | word = words[wdend] 221 | 222 | nextstart = sent.sentstr.find(word, wordsoffset) 223 | 224 | if nextstart == -1: 225 | print >> sys.stderr, "Warning in building conll format: can not find word" 226 | print >> sys.stderr, word.encode('utf-8') 227 | wordsoffset += 1 228 | else: 229 | wordsoffset = nextstart 230 | 231 | if wordsoffset >= m['end_off']-sentoffset: 232 | # annotation ends at the middle of a word 233 | if wordsoffset - len(words[wdend-1]) - 1 < m['end_off']-sentoffset: 234 | endInWord = 1 235 | tailText = sent.sentstr[m['end_off']-sentoffset : wordsoffset].strip() 236 | break 237 | 238 | wordsoffset += len(word) 239 | wdend += 1 240 | 241 | 242 | correctionTokenizedStr = tokenizeCorrectionStr(headText + m['correction'] + tailText, wdstart, wdend, words) 243 | correctionTokenizedStr, wdstart, wdend = shrinkCorrectionStr(correctionTokenizedStr, wdstart, wdend, words) 244 | 245 | token_start = wdstart #if m['type'] != 'noop' else -1 246 | token_end = wdend #if m['type'] != 'noop' else -1 247 | correction_final = correctionTokenizedStr.replace('\n', '') #if m['type'] != 'noop' else '-NONE-' 248 | if wdstart == wdend and len(correction_final) == 0: 249 | continue 250 | 251 | # build annotation string for .conll.m2 file 252 | m2AnnotationStr = 'A ' 253 | m2AnnotationStr += str(token_start) + ' ' 254 | m2AnnotationStr += str(token_end) + '|||' 255 | m2AnnotationStr += m['type'] + '|||' 256 | m2AnnotationStr += correction_final + '|||' 257 | m2AnnotationStr += 'REQUIRED|||-NONE-|||' + str(i) + '\n' 258 | 259 | m2AnnotationList.append(m2AnnotationStr) 260 | 261 | if len(mistakesList) > 0: # only if mistakeList contains tuples 262 | i += 1 263 | 264 | # write .conll.m2 file 265 | m2AnnotationSent = 'S ' + tokenizedSentStr + '\n' 266 | m2AnnotationSent += 
''.join(m2AnnotationList) + '\n' 267 | fm2.write(m2AnnotationSent.encode('utf-8')) 268 | 269 | fm2.close() 270 | 271 | 272 | def tokenizeCorrectionStr(correctionStr, wdstart, wdend, words): 273 | 274 | correctionTokenizedStr = '' 275 | pseudoSent = correctionStr 276 | 277 | if wdstart != 0: 278 | pseudoSent = words[wdstart-1] + ' ' + pseudoSent 279 | 280 | if wdend < len(words) - 1: 281 | pseudoSent = pseudoSent + ' ' + words[wdend] 282 | elif wdend == len(words) - 1: 283 | pseudoSent = pseudoSent + words[wdend] 284 | 285 | 286 | pseudoWordsList = [] 287 | sentList = sentenceTokenizer.tokenize(pseudoSent) 288 | for sent in sentList: 289 | pseudoWordsList += word_tokenize(sent) 290 | 291 | start = 0 292 | if wdstart != 0: 293 | s = '' 294 | for i in xrange(len(pseudoWordsList)): 295 | s += pseudoWordsList[i] 296 | if s == words[wdstart-1]: 297 | start = i + 1 298 | break 299 | if start == 0: 300 | print >> sys.stderr, 'Can not find words[wdstart-1]' 301 | 302 | else: 303 | start = 0 304 | 305 | end = len(pseudoWordsList) 306 | if wdend != len(words): 307 | 308 | s = '' 309 | for i in xrange(len(pseudoWordsList)): 310 | s = pseudoWordsList[len(pseudoWordsList) - i - 1] + s 311 | if s == words[wdend]: 312 | end = len(pseudoWordsList) - i - 1 313 | break 314 | if end == len(pseudoWordsList): 315 | print >> sys.stderr, 'Can not find words[wdend]' 316 | 317 | else: 318 | end = len(pseudoWordsList) 319 | 320 | correctionTokenizedStr = ' '.join(pseudoWordsList[start:end]) 321 | 322 | return correctionTokenizedStr 323 | 324 | 325 | def shrinkCorrectionStr(correctionTokenizedStr, wdstart, wdend, words): 326 | 327 | correctionWords = correctionTokenizedStr.split(' ') 328 | originalWords = words[wdstart: wdend] 329 | wdstartNew = wdstart 330 | wdendNew = wdend 331 | cstart = 0 332 | cend = len(correctionWords) 333 | 334 | i = 0 335 | while i < len(originalWords) and i < len(correctionWords): 336 | if correctionWords[i] == originalWords[i]: 337 | i += 1 338 | wdstartNew = i + wdstart 339 | cstart = i 340 | else: 341 | break 342 | 343 | i = 1 344 | while i <= len(originalWords) - cstart and i <= len(correctionWords) - cstart: 345 | if correctionWords[len(correctionWords)-i] == originalWords[len(originalWords)-i]: 346 | wdendNew = wdend - i 347 | cend = len(correctionWords) - i 348 | i += 1 349 | else: 350 | break 351 | 352 | return ' '.join(correctionWords[cstart:cend]), wdstartNew, wdendNew 353 | 354 | if __name__ == '__main__': 355 | 356 | ''' usage: 357 | 358 | %python preprocesscombine.py alternativesgmlfile1 ... alternativesgmlfileN combinedm2file 359 | output an m2 file containing a collection of the main annotation and all alternative annotations. 
360 | 361 | In most cases completesgmlfile and mainsgmlfile are identical 362 | ''' 363 | 364 | # Load original complete SGML for reference 365 | origDocs = sentenceSplit(readNUCLE(sys.argv[1])) 366 | 367 | origDocSet = {} 368 | for doc in origDocs: 369 | nid = int(doc.docattrs[0][1]) 370 | origDocSet[nid] = doc 371 | 372 | docsList = [] 373 | for i in range(1, len(sys.argv) - 1): 374 | print >> sys.stderr, 'Storing', i 375 | docs = sentenceSplit(readNUCLE(sys.argv[i])) 376 | docsList.append(docs) 377 | 378 | board = {} 379 | for docs in docsList: 380 | board = moderateAnnotations(docs, board, origDocSet) 381 | 382 | createM2File(origDocs, board, sys.argv[len(sys.argv)-1]) 383 | 384 | pass 385 | 386 | -------------------------------------------------------------------------------- /data/conll14st-test-data/scripts/preprocesswithalt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # preprocesswithalt.py 4 | # 5 | # Author: Christian Hadiwinoto 6 | # National University of Singapore (NUS) 7 | # Date: 22 Apr 2014 8 | # Version: 1.0 9 | # 10 | # Contact: chrhad@comp.nus.edu.sg 11 | # 12 | # This script is distributed to support the CoNLL-2013 Shared Task. 13 | # It is free for research and educational purposes. 14 | # 15 | # Usage: python preprocesswithalt.py essaySgmlFileName M mainSgmlFileName alt1SgmlFileName ... altNSgmlFileName m2FileName 16 | # 17 | 18 | 19 | import parser_feature 20 | from nuclesgmlparser import nuclesgmlparser 21 | from nucle_doc import * 22 | import nltk.data 23 | from nltk import word_tokenize 24 | from operator import itemgetter 25 | import cPickle as pickle 26 | import re 27 | import sys 28 | import os 29 | 30 | getEditKey = itemgetter(0, 1, 2, 3, 4) 31 | 32 | sentenceTokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 33 | sentenceDumpedFile = 'sentence_file' 34 | docsDumpedFileName = 'docs' 35 | parsingDumpedFileName = 'parse_file' 36 | 37 | def readNUCLE(fn): 38 | 39 | f = open(fn, 'r') 40 | parser = nuclesgmlparser() 41 | filestr = f.read() 42 | filestr = filestr.decode('utf-8') 43 | 44 | #Fix Reference tag 45 | p = re.compile(r'(\n

\n.*\n)

') 46 | filestr = p.sub(r'\1

', filestr) 47 | 48 | parser.feed(filestr) 49 | f.close() 50 | parser.close() 51 | 52 | return parser.docs 53 | 54 | def sentenceSplit(docs): 55 | 56 | sentenceTokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 57 | for doc in docs: 58 | for par in doc.paragraphs: 59 | doc.sentences.append([]) 60 | for s in sentenceTokenizer.tokenize(par): 61 | doc.buildSentence(s, [], '', [], []) 62 | return docs 63 | 64 | def compareTwoEditLists(editList1, editList2): 65 | # must be sorted 66 | if editList1 == [] and editList2 == []: 67 | return True 68 | elif editList1 == [] or editList2 == []: 69 | return False 70 | elif getEditKey(editList1[0]) != getEditKey(editList2[0]): 71 | return False 72 | else: 73 | return compareTwoEditLists(editList1[1:], editList2[1:]) 74 | 75 | def moderateAnnotations(contestDocs, annotBoard, origDocSet): 76 | # moderate annotation in "contesting" docs with already stated mistakes 77 | #mistakeStrSet = {} 78 | for doc in contestDocs: 79 | #mistakeStr = '' 80 | nid = int(doc.docattrs[0][1]) # nid of current document 81 | tid = doc.annotation[0][0][1] # teacher id 82 | 83 | if not annotBoard.has_key(nid): # create placeholder 84 | annotBoard[nid] = {} 85 | 86 | origDoc = origDocSet[nid] 87 | for pid in xrange(len(origDoc.sentences)): 88 | slist = origDoc.sentences[pid] 89 | if not annotBoard[nid].has_key(pid): 90 | annotBoard[nid][pid] = {} 91 | for sentid in xrange(len(slist)): 92 | sent = slist[sentid] 93 | if not annotBoard[nid][pid].has_key(sentid): 94 | annotBoard[nid][pid][sentid] = [] 95 | editSet = [] 96 | 97 | # enumerate mistakes 98 | sentoffset = origDoc.paragraphs[pid].index(sent.sentstr) 99 | editNum = 0 100 | for m in doc.mistakes: 101 | if m['start_par'] != pid or \ 102 | m['start_par'] != m['end_par'] or \ 103 | m['start_off'] < sentoffset or \ 104 | m['start_off'] >= sentoffset + len(sent.sentstr) or \ 105 | m['end_off'] sentoffset + len(sent.sentstr): 107 | continue 108 | 109 | #if m['type'] != 'noop': 110 | editSet.append((m['start_par'], m['end_par'], m['start_off'], m['end_off'], m['correction'], m['type'])) 111 | #editNum += 1 112 | #else: 113 | #editSet.append((m['start_par'], m['end_par'], m['start_off'], m['end_off'], sent.sentstr, m['type'])) 114 | 115 | editSet = sorted(editSet, key=itemgetter(0, 1, 2, 3)) 116 | 117 | # find the same annotation 118 | foundMatch = False 119 | i = 0 120 | boardEdits = annotBoard[nid][pid][sentid] 121 | while i < len(boardEdits) and not foundMatch: 122 | if compareTwoEditLists(editSet, boardEdits[i]): 123 | foundMatch = True 124 | else: 125 | i+=1 126 | 127 | if not foundMatch: 128 | annotBoard[nid][pid][sentid].append(editSet) 129 | 130 | return annotBoard 131 | 132 | def moderateAnnotationsAlt(contestDocs, annotBoard, origDocSet): 133 | # moderate annotation in "contesting" docs with already stated mistakes 134 | # for alternative answers (with explicit NOOP) 135 | mistakeStrSet = {} 136 | for doc in contestDocs: 137 | mistakeStr = '' 138 | nid = int(doc.docattrs[0][1]) # nid of current document 139 | tid = doc.annotation[0][0][1] # teacher id 140 | 141 | if not annotBoard.has_key(nid): # create placeholder 142 | annotBoard[nid] = {} 143 | 144 | origDoc = origDocSet[nid] 145 | for pid in xrange(len(origDoc.sentences)): 146 | slist = origDoc.sentences[pid] 147 | if not annotBoard[nid].has_key(pid): 148 | annotBoard[nid][pid] = {} 149 | for sentid in xrange(len(slist)): 150 | sent = slist[sentid] 151 | if not annotBoard[nid][pid].has_key(sentid): 152 | annotBoard[nid][pid][sentid] = [] 153 | editSet = [] 154 
| 155 | # enumerate mistakes 156 | sentoffset = origDoc.paragraphs[pid].index(sent.sentstr) 157 | editNum = 0 158 | for m in doc.mistakes: 159 | if m['start_par'] != pid or \ 160 | m['start_par'] != m['end_par'] or \ 161 | m['start_off'] < sentoffset or \ 162 | m['start_off'] >= sentoffset + len(sent.sentstr) or \ 163 | m['end_off'] sentoffset + len(sent.sentstr): 165 | continue 166 | 167 | if m['type'] != 'noop': 168 | editSet.append((m['start_par'], m['end_par'], m['start_off'], m['end_off'], m['correction'], m['type'])) 169 | editNum += 1 170 | else: 171 | editSet.append((m['start_par'], m['end_par'], m['start_off'], m['end_off'], sent.sentstr, m['type'])) 172 | 173 | # as empty alternative edit means agreement to main annotation edit 174 | if len(editSet) == 0: 175 | continue 176 | 177 | editSet = sorted(editSet, key=itemgetter(0, 1, 2, 3)) 178 | 179 | # find the same annotation 180 | foundMatch = False 181 | i = 0 182 | boardEdits = annotBoard[nid][pid][sentid] 183 | while i < len(boardEdits) and not foundMatch: 184 | if compareTwoEditLists(editSet, boardEdits[i]): 185 | foundMatch = True 186 | else: 187 | i+=1 188 | 189 | if not foundMatch: 190 | annotBoard[nid][pid][sentid].append(editSet) 191 | 192 | return annotBoard 193 | 194 | def createM2File(origDocs, mistakesBoard, m2FileName): 195 | 196 | fm2 = open(m2FileName, 'w') 197 | 198 | for doc in origDocs: 199 | nid = int(doc.docattrs[0][1]) # nid of current document 200 | for slistIndex in xrange(len(doc.sentences)): 201 | slist = doc.sentences[slistIndex] 202 | for sentid in xrange(len(slist)): 203 | 204 | sent = slist[sentid] 205 | 206 | # m2 format annotation string list 207 | m2AnnotationList = [] 208 | 209 | # build colums 210 | table = sent.getConllFormat(doc, slistIndex, sentid) 211 | tokenizedSentStr = ' '.join(sent.getWords()) 212 | 213 | #Add annotation info 214 | sentoffset = doc.paragraphs[slistIndex].index(sent.sentstr) 215 | 216 | i = 0 217 | board = mistakesBoard[nid][slistIndex][sentid] 218 | for mistakesList in board: 219 | if len(mistakesList) == 0 and len(board) > 1: 220 | m2AnnotationList.append('A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||' + str(i) + '\n') 221 | i += 1 222 | 223 | for tuple in mistakesList: 224 | m = {} 225 | m['start_par'] = tuple[0] 226 | m['end_par'] = tuple[1] 227 | m['start_off'] = tuple[2] 228 | m['end_off'] = tuple[3] 229 | m['correction'] = tuple[4] 230 | m['type'] = tuple[5] 231 | 232 | if m['start_par'] != slistIndex or \ 233 | m['start_par'] != m['end_par'] or \ 234 | m['start_off'] < sentoffset or \ 235 | m['start_off'] >= sentoffset + len(sent.sentstr) or \ 236 | m['end_off'] sentoffset + len(sent.sentstr): 238 | continue 239 | 240 | wordsoffset = 0 241 | wdstart = 0 242 | 243 | startInWord = 0 244 | headText = '' 245 | endInWord = 0 246 | tailText = '' 247 | 248 | words = sent.getWords() 249 | while wdstart < len(words): 250 | 251 | word = words[wdstart] 252 | nextstart = sent.sentstr.find(word, wordsoffset) 253 | 254 | if nextstart == -1: 255 | # may not find word, due to relpacement 256 | print >> sys.stderr, "Warning in building conll format: can not find word" 257 | print >> sys.stderr, word.encode('utf-8') 258 | wordsoffset += 1 259 | else: 260 | wordsoffset = nextstart 261 | 262 | if wordsoffset >= m['start_off']-sentoffset: 263 | break 264 | elif wordsoffset + len(word) > m['start_off']-sentoffset: 265 | # annotation starts at the middle of a word 266 | startInWord = 1 267 | headText = sent.sentstr[wordsoffset: m['start_off']-sentoffset] 268 | break 269 | 270 | 
wordsoffset += len(word) 271 | wdstart += 1 272 | 273 | if wdstart == len(words): 274 | print >> sys.stderr, 'Warning in building conll format: start_off overflow' 275 | print >> sys.stderr, m, sent.sentstr.encode('utf-8') 276 | continue 277 | 278 | 279 | wdend = wdstart 280 | while wdend < len(words): 281 | 282 | word = words[wdend] 283 | 284 | nextstart = sent.sentstr.find(word, wordsoffset) 285 | 286 | if nextstart == -1: 287 | print >> sys.stderr, "Warning in building conll format: can not find word" 288 | print >> sys.stderr, word.encode('utf-8') 289 | wordsoffset += 1 290 | else: 291 | wordsoffset = nextstart 292 | 293 | if wordsoffset >= m['end_off']-sentoffset: 294 | # annotation ends at the middle of a word 295 | if wordsoffset - len(words[wdend-1]) - 1 < m['end_off']-sentoffset: 296 | endInWord = 1 297 | tailText = sent.sentstr[m['end_off']-sentoffset : wordsoffset].strip() 298 | break 299 | 300 | wordsoffset += len(word) 301 | wdend += 1 302 | 303 | 304 | correctionTokenizedStr = tokenizeCorrectionStr(headText + m['correction'] + tailText, wdstart, wdend, words) 305 | correctionTokenizedStr, wdstart, wdend = shrinkCorrectionStr(correctionTokenizedStr, wdstart, wdend, words) 306 | 307 | token_start = wdstart if m['type'] != 'noop' else -1 308 | token_end = wdend if m['type'] != 'noop' else -1 309 | correction_final = correctionTokenizedStr.replace('\n', '') if m['type'] != 'noop' else '-NONE-' 310 | 311 | # build annotation string for .conll.m2 file 312 | m2AnnotationStr = 'A ' 313 | m2AnnotationStr += str(token_start) + ' ' 314 | m2AnnotationStr += str(token_end) + '|||' 315 | m2AnnotationStr += m['type'] + '|||' 316 | m2AnnotationStr += correction_final + '|||' 317 | m2AnnotationStr += 'REQUIRED|||-NONE-|||' + str(i) + '\n' 318 | 319 | m2AnnotationList.append(m2AnnotationStr) 320 | 321 | if len(mistakesList) > 0: # only if mistakeList contains tuples 322 | i += 1 323 | 324 | # write .conll.m2 file 325 | m2AnnotationSent = 'S ' + tokenizedSentStr + '\n' 326 | m2AnnotationSent += ''.join(m2AnnotationList) + '\n' 327 | fm2.write(m2AnnotationSent.encode('utf-8')) 328 | 329 | fm2.close() 330 | 331 | 332 | def tokenizeCorrectionStr(correctionStr, wdstart, wdend, words): 333 | 334 | correctionTokenizedStr = '' 335 | pseudoSent = correctionStr 336 | 337 | if wdstart != 0: 338 | pseudoSent = words[wdstart-1] + ' ' + pseudoSent 339 | 340 | if wdend < len(words) - 1: 341 | pseudoSent = pseudoSent + ' ' + words[wdend] 342 | elif wdend == len(words) - 1: 343 | pseudoSent = pseudoSent + words[wdend] 344 | 345 | 346 | pseudoWordsList = [] 347 | sentList = sentenceTokenizer.tokenize(pseudoSent) 348 | for sent in sentList: 349 | pseudoWordsList += word_tokenize(sent) 350 | 351 | start = 0 352 | if wdstart != 0: 353 | s = '' 354 | for i in xrange(len(pseudoWordsList)): 355 | s += pseudoWordsList[i] 356 | if s == words[wdstart-1]: 357 | start = i + 1 358 | break 359 | if start == 0: 360 | print >> sys.stderr, 'Can not find words[wdstart-1]' 361 | 362 | else: 363 | start = 0 364 | 365 | end = len(pseudoWordsList) 366 | if wdend != len(words): 367 | 368 | s = '' 369 | for i in xrange(len(pseudoWordsList)): 370 | s = pseudoWordsList[len(pseudoWordsList) - i - 1] + s 371 | if s == words[wdend]: 372 | end = len(pseudoWordsList) - i - 1 373 | break 374 | if end == len(pseudoWordsList): 375 | print >> sys.stderr, 'Can not find words[wdend]' 376 | 377 | else: 378 | end = len(pseudoWordsList) 379 | 380 | correctionTokenizedStr = ' '.join(pseudoWordsList[start:end]) 381 | 382 | return 
correctionTokenizedStr 383 | 384 | 385 | def shrinkCorrectionStr(correctionTokenizedStr, wdstart, wdend, words): 386 | 387 | correctionWords = correctionTokenizedStr.split(' ') 388 | originalWords = words[wdstart: wdend] 389 | wdstartNew = wdstart 390 | wdendNew = wdend 391 | cstart = 0 392 | cend = len(correctionWords) 393 | 394 | i = 0 395 | while i < len(originalWords) and i < len(correctionWords): 396 | if correctionWords[i] == originalWords[i]: 397 | i += 1 398 | wdstartNew = i + wdstart 399 | cstart = i 400 | else: 401 | break 402 | 403 | i = 1 404 | while i <= len(originalWords) - cstart and i <= len(correctionWords) - cstart: 405 | if correctionWords[len(correctionWords)-i] == originalWords[len(originalWords)-i]: 406 | wdendNew = wdend - i 407 | cend = len(correctionWords) - i 408 | i += 1 409 | else: 410 | break 411 | 412 | return ' '.join(correctionWords[cstart:cend]), wdstartNew, wdendNew 413 | 414 | if __name__ == '__main__': 415 | 416 | ''' usage: 417 | 418 | %python preprocesswithalt.py essaySgmlfile M mainsgmlfile1 ... mainsgmlfileM alternativesgmlfile1 ... alternativesgmlfileN combinedm2file 419 | output an m2 file containing a collection of M main annotations and N alternative annotations. 420 | 421 | In most cases essaySgmlfile and mainsgmlfile are identical 422 | ''' 423 | 424 | # Load original complete SGML for reference 425 | origDocs = sentenceSplit(readNUCLE(sys.argv[1])) 426 | 427 | origDocSet = {} 428 | for doc in origDocs: 429 | nid = int(doc.docattrs[0][1]) 430 | origDocSet[nid] = doc 431 | 432 | nummain = int(sys.argv[2]) 433 | 434 | # Store main annotations 435 | docsList = [] 436 | altDocsList = [] 437 | for i in range(0, nummain): 438 | print >> sys.stderr, 'Storing main annotation', (i+1) 439 | docs = sentenceSplit(readNUCLE(sys.argv[i+3])) 440 | docsList.append(docs) 441 | 442 | board = {} 443 | for docs in docsList: 444 | board = moderateAnnotations(docs, board, origDocSet) 445 | 446 | # store alternative annotations 447 | for i in range(3 + nummain, len(sys.argv) - 1): 448 | print >> sys.stderr, 'Storing alternative annotation', (i+1) 449 | altdocs = sentenceSplit(readNUCLE(sys.argv[i])) 450 | altDocsList.append(altdocs) 451 | 452 | for altdocs in altDocsList: 453 | board = moderateAnnotationsAlt(altdocs, board, origDocSet) 454 | 455 | createM2File(origDocs, board, sys.argv[len(sys.argv)-1]) 456 | 457 | pass 458 | 459 | -------------------------------------------------------------------------------- /data/release2.3.1/README: -------------------------------------------------------------------------------- 1 | Release 2.3.1 2 | 24 May 2013 3 | 4 | This README file describes the test data and scoring procedure for the 5 | CoNLL-2013 Shared Task: Grammatical Error Correction. 6 | 7 | The package is distributed freely with the following copyright 8 | Copyright (C) 2013 Hwee Tou Ng, Joel Tetreault, Siew Mei Wu, 9 | Yuanbin Wu, Christian Hadiwinoto 10 | 11 | Any questions regarding the test data should be directed to 12 | Hwee Tou Ng at: nght@comp.nus.edu.sg 13 | 14 | 15 | 1. 
Directory Structure and Contents 16 | =================================== 17 | 18 | The top-level directory has four subdirectories, namely 19 | 20 | - original/ : the test data with the original official annotations 21 | before adding alternatives contributed by the participants 22 | - revised/ : the moderated participants' alternative annotations and 23 | the revised official annotation for the test data 24 | - scripts/ : the scripts used to preprocess the test data inside the 25 | original/ and revised/ subdirectories 26 | - m2scorer/ : the official scoring software for the shared task 27 | 28 | Each of the original/ and revised/ subdirectories contains data/ 29 | subdirectory, which includes annotations for all the error types, and 30 | data_5types/ subdirectory, which includes annotations for only the 5 31 | types concerned in the shared task. 32 | 33 | 34 | 2. Data Format 35 | ============== 36 | 37 | The corpus is distributed in a simple SGML format. All annotations 38 | come in a "stand-off" format. The start position and end position of 39 | an annotation are given by paragraph and character offsets. 40 | Paragraphs are enclosed in

<P> ... </P>

tags. Paragraphs and characters 41 | are counted starting from zero. Each annotation includes the following 42 | fields: the error category, the correction, and optionally a 43 | comment. If the correction replaces the original text at the given 44 | location, it should fix the grammatical error. 45 | 46 | Example: 47 | 48 | 49 | 50 |

51 | In modern digital world, ... 52 |

53 |

54 | Surveillance technology such as ... 55 |

56 | ... 57 |
58 | 59 | 60 | ArtOrDet 61 | the modern 62 | 63 | ... 64 | 65 |
66 | 67 | 68 | ... 69 | 70 | Below is a complete list of the error categories in the data/ 71 | subdirectory under the original/ and revised/ subdirectories: 72 | 73 | ERROR TAG ERROR CATEGORY 74 | --------------------------- 75 | Vt Verb tense 76 | Vm Verb modal 77 | V0 Missing verb 78 | Vform Verb form 79 | SVA Subject-verb-agreement 80 | ArtOrDet Article or Determiner 81 | Nn Noun number 82 | Npos Noun possesive 83 | Pform Pronoun form 84 | Pref Pronoun reference 85 | Prep Preposition 86 | Wci Wrong collocation/idiom 87 | Wa Acronyms 88 | Wform Word form 89 | Wtone Tone 90 | Srun Runons, comma splice 91 | Smod Dangling modifier 92 | Spar Parallelism 93 | Sfrag Fragment 94 | Ssub Subordinate clause 95 | WOinc Incorrect sentence form 96 | WOadv Adverb/adjective position 97 | Trans Link word/phrases 98 | Mec Punctuation, capitalization, spelling, typos 99 | Rloc- Local redundancy 100 | Cit Citation 101 | Others Other errors 102 | Um Unclear meaning (cannot be corrected) 103 | 104 | Below is a list of the error categories in the data_5types/ 105 | subdirectory under the original/ and revised/ subdirectories: 106 | 107 | ERROR TAG ERROR CATEGORY 108 | --------------------------- 109 | Vform Verb form 110 | SVA Subject-verb-agreement 111 | ArtOrDet Article or Determiner 112 | Nn Noun number 113 | Prep Preposition 114 | 115 | The official annotation file contains all the default annotations to 116 | make the whole text correct. Meanwhile, each of the alternative 117 | annotation files contains only annotations for sentences that can be 118 | corrected in a different way, i.e. sentences that have alternative 119 | annotations. If according to an alternative, a sentence can remain 120 | unchanged, a special tag "noop" is used for that particular sentence. 121 | 122 | 123 | 3. Updates included in version 2.1 124 | ================================== 125 | 126 | The major change made in version 2.1 is to map the past error 127 | categories Wcip and Rloc to Prep, Wci, ArtOrDet, and Rloc-. 128 | 129 | In the original data, there is no explicit preposition error 130 | category. Instead, preposition errors are part of the Wcip (Wrong 131 | collocation/idiom/preposition) and Rloc (local redundancy) error 132 | categories. In addition, redundant article or determiner errors are 133 | part of the Rloc error category. 134 | 135 | 136 | 4. Updates included in version 2.2 137 | ================================== 138 | 139 | - Fixed the bug on expanding an error annotation involving part of a 140 | token to the full token. 141 | 142 | - Other miscellaneous corrections were made. 143 | 144 | 145 | 5. Updates included in version 2.3 146 | ================================== 147 | 148 | - Fixed the bug involving tokenization of punctuation symbols in the 149 | correction string. 150 | 151 | - Fixed the tokenization example in the README file of the M^2 scorer 152 | to reflect the real tokenization to be used and removed irrelevant 153 | codes from the scorer package. 154 | 155 | 156 | 6. Updates included in version 2.3.1 157 | ==================================== 158 | 159 | - Enhanced the capability of the M^2 scorer to be able to handle 160 | multiple alternative sets of gold edits. 161 | 162 | - Fixed the preprocess.py script to keep the annotation span minimal, 163 | i.e. by excluding the beginning and/or end tokens that co-exist in 164 | the original and correction string. 
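A minimal sketch of the span-minimisation described above, assuming tokenised
input; the helper name and the sample tokens are invented for illustration,
but the trimming follows the same idea as the shrinkCorrectionStr routine in
the conll14st preprocessing scripts:

def shrink_edit(sentence_tokens, correction_tokens, start, end):
    # Drop leading tokens shared by the edit span and its correction.
    while start < end and correction_tokens and \
          sentence_tokens[start] == correction_tokens[0]:
        start += 1
        correction_tokens = correction_tokens[1:]
    # Drop trailing tokens shared by the edit span and its correction.
    while start < end and correction_tokens and \
          sentence_tokens[end - 1] == correction_tokens[-1]:
        end -= 1
        correction_tokens = correction_tokens[:-1]
    return start, end, ' '.join(correction_tokens)

# shrink_edit(['I', 'ate', 'a', 'apple', '.'], ['an', 'apple'], 2, 4)
# returns (2, 3, 'an'): only the changed token, "a" -> "an", stays in the edit.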
165 | 166 | 167 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/README: -------------------------------------------------------------------------------- 1 | Release 2.3.1 2 | Revision: 24 May 2013 3 | 4 | This README file describes the NUS MaxMatch (M^2) scorer. 5 | Copyright (C) 2013 Daniel Dahlmeier, Hwee Tou Ng and Christian Hadiwinoto 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or (at 10 | your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, but 13 | WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with this program. If not, see . 19 | 20 | 21 | If you are using the NUS M^2 scorer in your work, please include a 22 | citation of the following paper: 23 | 24 | Daniel Dahlmeier and Hwee Tou Ng. 2012. Better Evaluation for 25 | Grammatical Error Correction. In Proceedings of the 2012 Conference of 26 | the North American Chapter of the Association for Computational 27 | Linguistics: Human Language Technologies 28 | 29 | Any questions regarding the NUS M^2 scorer should be directed to 30 | Daniel Dahlmeier(danielhe@comp.nus.edu.sg). 31 | 32 | 33 | Contents 34 | ======== 35 | 0. Quickstart 36 | 1. Pre-requisites 37 | 2. Using the scorer 38 | 2.1 System output format 39 | 2.2 Scorer's gold standard format 40 | 3. Converting the CoNLL-2013 data format 41 | 4. Revisions 42 | 43 | 44 | 0. Quickstart 45 | ============= 46 | ./m2scorer [-v] SYSTEM SOURCE_GOLD 47 | 48 | SYSTEM = the system output in sentence-per-line plain text. 49 | SOURCE_GOLD = the source sentences with gold edits. 50 | 51 | 52 | 1. Pre-requisites 53 | ================= 54 | The following dependencies have to be installed to use the M^2 scorer. 55 | 56 | + Python (>= 2.6.4, < 3.0, older versions might work but are not tested) 57 | + nltk (http://www.nltk.org, needed for sentence splitting) 58 | 59 | 60 | 61 | 2. Using the scorer 62 | =================== 63 | Usage: m2scorer [OPTIONS] SYSTEM SOURCE_GOLD 64 | where 65 | SYSTEM - system output, one sentence per line 66 | SOURCE_GOLD - source sentences with gold token edits 67 | 68 | OPTIONS 69 | -v --verbose - print verbose output 70 | --very_verbose - print lots of verbose output 71 | --max_unchanged_words N - Maximum unchanged words when extracting edits. Default 2. 72 | --ignore_whitespace_casing - Ignore edits that only affect whitespace and casing. Default no. 73 | 74 | 75 | 2.1 System output format 76 | ======================== 77 | SYSTEM = File that contains the output of the error correction 78 | system. The sentences should be in tokenized plain text, sentence-per-line 79 | format. 80 | 81 | Format: 82 | 83 | 84 | ... 85 | 86 | Examples of tokenization: 87 | ------------------------- 88 | Original : He said, "We shouldn't go to the place. It'll kill one of us." 89 | Tokenized : He said , " We shouldn 't go to the place . It 'll kill one of us . " 90 | 91 | Sample output: 92 | -------------- 93 | ===> system <=== 94 | A cat sat on the mat . 95 | The Dog . 
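A minimal sketch of producing output in this format, assuming Python 2 (as for
the scorer itself) and that the bundled scripts/Tokenizer.py is importable; the
file names are placeholders:

from Tokenizer import PTBTokenizer   # shipped under m2scorer/scripts/

tokenizer = PTBTokenizer()
fout = open('system', 'w')
for line in open('corrected.txt'):   # one corrected sentence per line
    tokens = tokenizer.tokenize(line.decode('utf8').strip())
    fout.write(' '.join(tokens).encode('utf8') + '\n')
fout.close()

The same conversion can be run from the command line with
python scripts/Tokenizer.py < corrected.txt > system.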
96 | 97 | 98 | 2.2 Scorer's gold standard format 99 | ================================= 100 | SOURCE_GOLD = source sentences (i.e. input to the error correction 101 | system) and the gold annotation in TOKEN offsets (starting from zero). 102 | 103 | Format: 104 | S 105 | A |||||||||||||| 106 | A |||||||||||||| 107 | 108 | S 109 | A |||||||||||||| 110 | 111 | 112 | Notes: 113 | ------ 114 | - Each source sentence should appear on a single line starting with "S " 115 | - Each source sentence is followed by zero or more annotations. 116 | - Each annotation is on a separate line starting with "A ". 117 | - Sentences are separated by one or more empty lines. 118 | - The source sentences need to be tokenized in the same way as the system output. 119 | - Start and end offset for annotations are in token offsets (starting from zero). 120 | - The gold edits can include one or more possible correction strings. Multiple corrections should be separate by '||'. 121 | - The error type, required field and comment are not used for scoring at the moment. You can put dummy values there. 122 | - The annotator ID is used to identify a distinct annotation set by which system edits will be evaluated. 123 | - Each distinct annotation set, identified by an annotator ID, is an alternative 124 | - If one sentence has multiple annotator IDs, score will be computed for each annotator. 125 | - If one of the multiple annotation alternatives is no edit at all, an edit with type 'noop' or with offsets '-1 -1' must be specified. 126 | - The final score for the sentence will use the set of edits by an annotation set maximizing the score. 127 | 128 | 129 | Example: 130 | -------- 131 | ===> source_gold <=== 132 | S The cat sat at mat . 133 | A 3 4|||Prep|||on|||REQUIRED|||-NONE-|||0 134 | A 4 4|||ArtOrDet|||the||a|||REQUIRED|||-NONE-|||0 135 | 136 | S The dog . 137 | A 1 2|||NN|||dogs|||REQUIRED|||-NONE-|||0 138 | A -1 -1|||noop|||-NONE-|||-NONE-|||-NONE-|||1 139 | 140 | S Giant otters is an apex predator . 141 | A 2 3|||SVA|||are|||REQUIRED|||-NONE-|||0 142 | A 3 4|||ArtOrDet|||-NONE-|||REQUIRED|||-NONE-|||0 143 | A 5 6|||NN|||predators|||REQUIRED|||-NONE-|||0 144 | A 1 2|||NN|||otter|||REQUIRED|||-NONE-|||1 145 | 146 | 147 | 148 | ===> system <=== 149 | A cat sat on the mat . 150 | The dog . 151 | Giant otters are apex predator . 152 | 153 | ./m2scorer system source_gold 154 | Precision : 0.8 155 | Recall : 0.8 156 | F1 : 0.8 157 | 158 | For sentence #1, the system makes two valid edits {(at-> on), 159 | (\epsilon -> the)} and one unnecessary edit (The -> A). 160 | 161 | For sentence #2, despite missing one gold edit (dog -> dogs) according 162 | to annotation set 0, the system misses nothing according to set 1. 163 | 164 | For sentence #3, according to annotation set 0, the system makes two 165 | valid edits {(is -> are), (an -> \epsilon)} and misses one edit 166 | (predator -> predators); however according to set 1, the system makes 167 | two unnecessary edits {(is -> are), (an -> \epsilon)}. 168 | 169 | By the case above, there are four valid edits, one unnecessary edit 170 | and one missing edit. Therefore precision is 4/5 = 0.8. Similarly for 171 | recall and F1 measure. 172 | 173 | 174 | 3. 
Converting the CoNLL-2013 data format 175 | ======================================== 176 | The data format used in the M^2 scorer differs from the format used in 177 | the CoNLL-2013 shared task (http://www.comp.nus.edu.sg/~nlp/conll13st.html) 178 | in two aspects: 179 | - sentence-level edits 180 | - token edit offsets 181 | 182 | To convert source files and gold edits from the CoNLL-2013 format into 183 | the M^2 format, run the preprocessing script bundled with the CoNLL-2013 184 | training data. 185 | 186 | 187 | 4. Revision Note: Alternative Edits 188 | =================================== 189 | 190 | In this release, there is a major modification which enables scoring 191 | with multiple sets of gold edits. On every sentence, the system output 192 | will be scored against every available set of gold edits for the 193 | sentence and the edits set that maximizes the F1 score of the sentence 194 | is chosen. 195 | 196 | This modification was carried out by Christian Hadiwinoto, 2013. 197 | 198 | 199 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/example/README: -------------------------------------------------------------------------------- 1 | (execute these examples from the m2scorer top-level directory) 2 | 3 | 4 | ./m2scorer example/system_output.txt example/source_gold 5 | 6 | 7 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/example/source_gold: -------------------------------------------------------------------------------- 1 | S The cat sat at mat . 2 | A 3 4|||Prep|||on|||REQUIRED|||-NONE-|||0 3 | A 4 4|||ArtOrDet|||the||a|||REQUIRED|||-NONE-|||0 4 | 5 | S The dog . 6 | A 1 2|||NN|||dogs|||REQUIRED|||-NONE-|||0 7 | A -1 -1|||noop|||-NONE-|||-NONE-|||-NONE-|||1 8 | 9 | S Giant otters is an apex predator . 10 | A 2 3|||SVA|||are|||REQUIRED|||-NONE-|||0 11 | A 3 4|||ArtOrDet|||-NONE-|||REQUIRED|||-NONE-|||0 12 | A 5 6|||NN|||predators|||REQUIRED|||-NONE-|||0 13 | A 1 2|||NN|||otter|||REQUIRED|||-NONE-|||1 14 | 15 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/example/system: -------------------------------------------------------------------------------- 1 | A cat sat on the mat . 2 | The dog . 3 | Giant otters are apex predator . 4 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/m2scorer: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This file is part of the NUS M2 scorer. 4 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | 9 | # The NUS M2 scorer is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 
16 | 17 | # file: m2scorer.py 18 | # 19 | # score a system's output against a gold reference 20 | # 21 | # Usage: m2scorer.py [OPTIONS] proposed_sentences source_gold 22 | # where 23 | # proposed_sentences - system output, sentence per line 24 | # source_gold - source sentences with gold token edits 25 | # OPTIONS 26 | # -v --verbose - print verbose output 27 | # --very_verbose - print lots of verbose output 28 | # --max_unchanged_words N - Maximum unchanged words when extracting edits. Default 2." 29 | # --ignore_whitespace_casing - Ignore edits that only affect whitespace and caseing. Default no." 30 | # 31 | 32 | import sys 33 | import levenshtein 34 | from getopt import getopt 35 | from util import paragraphs 36 | from util import smart_open 37 | 38 | 39 | 40 | def load_annotation(gold_file): 41 | source_sentences = [] 42 | gold_edits = [] 43 | fgold = smart_open(gold_file, 'r') 44 | puffer = fgold.read() 45 | fgold.close() 46 | puffer = puffer.decode('utf8') 47 | for item in paragraphs(puffer.splitlines(True)): 48 | item = item.splitlines(False) 49 | sentence = [line[2:].strip() for line in item if line.startswith('S ')] 50 | assert sentence != [] 51 | annotations = {} 52 | for line in item[1:]: 53 | if line.startswith('I ') or line.startswith('S '): 54 | continue 55 | assert line.startswith('A ') 56 | line = line[2:] 57 | fields = line.split('|||') 58 | start_offset = int(fields[0].split()[0]) 59 | end_offset = int(fields[0].split()[1]) 60 | etype = fields[1] 61 | if etype == 'noop': 62 | start_offset = -1 63 | end_offset = -1 64 | corrections = [c.strip() if c != '-NONE-' else '' for c in fields[2].split('||')] 65 | # NOTE: start and end are *token* offsets 66 | original = ' '.join(' '.join(sentence).split()[start_offset:end_offset]) 67 | annotator = int(fields[5]) 68 | if annotator not in annotations.keys(): 69 | annotations[annotator] = [] 70 | annotations[annotator].append((start_offset, end_offset, original, corrections)) 71 | tok_offset = 0 72 | for this_sentence in sentence: 73 | tok_offset += len(this_sentence.split()) 74 | source_sentences.append(this_sentence) 75 | this_edits = {} 76 | for annotator, annotation in annotations.iteritems(): 77 | this_edits[annotator] = [edit for edit in annotation if edit[0] <= tok_offset and edit[1] <= tok_offset and edit[0] >= 0 and edit[1] >= 0] 78 | if len(this_edits) == 0: 79 | this_edits[0] = [] 80 | gold_edits.append(this_edits) 81 | return (source_sentences, gold_edits) 82 | 83 | 84 | def print_usage(): 85 | print >> sys.stderr, "Usage: m2scorer.py [OPTIONS] proposed_sentences gold_source" 86 | print >> sys.stderr, "where" 87 | print >> sys.stderr, " proposed_sentences - system output, sentence per line" 88 | print >> sys.stderr, " source_gold - source sentences with gold token edits" 89 | print >> sys.stderr, "OPTIONS" 90 | print >> sys.stderr, " -v --verbose - print verbose output" 91 | print >> sys.stderr, " --very_verbose - print lots of verbose output" 92 | print >> sys.stderr, " --max_unchanged_words N - Maximum unchanged words when extraction edit. Default 2." 93 | print >> sys.stderr, " --ignore_whitespace_casing - Ignore edits that only affect whitespace and caseing. Default no." 
94 | 95 | 96 | 97 | max_unchanged_words=2 98 | ignore_whitespace_casing= False 99 | verbose = False 100 | very_verbose = False 101 | opts, args = getopt(sys.argv[1:], "v", ["max_unchanged_words=", "verbose", "ignore_whitespace_casing", "very_verbose"]) 102 | for o, v in opts: 103 | if o in ('-v', '--verbose'): 104 | verbose = True 105 | elif o == '--very_verbose': 106 | very_verbose = True 107 | elif o == '--max_unchanged_words': 108 | max_unchanged_words = int(v) 109 | elif o == '--ignore_whitespace_casing': 110 | ignore_whitespace_casing = True 111 | else: 112 | print >> sys.stderr, "Unknown option :", o 113 | print_usage() 114 | sys.exit(-1) 115 | 116 | # starting point 117 | if len(args) != 2: 118 | print_usage() 119 | sys.exit(-1) 120 | 121 | system_file = args[0] 122 | gold_file = args[1] 123 | 124 | # load source sentences and gold edits 125 | source_sentences, gold_edits = load_annotation(gold_file) 126 | 127 | # load system hypotheses 128 | fin = smart_open(system_file, 'r') 129 | system_sentences = [line.decode("utf8").strip() for line in fin.readlines()] 130 | fin.close() 131 | 132 | p, r, f1 = levenshtein.batch_multi_pre_rec_f1(system_sentences, source_sentences, gold_edits, max_unchanged_words, ignore_whitespace_casing, verbose, very_verbose) 133 | 134 | print "Precision : %.4f" % p 135 | print "Recall : %.4f" % r 136 | print "F1 : %.4f" % f1 137 | 138 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/scripts/Tokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: iso-8859-15 -*- 3 | 4 | # This file is part of the NUS M2 scorer. 5 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | 10 | # The NUS M2 scorer is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | 15 | # You should have received a copy of the GNU General Public License 16 | # along with this program. If not, see . 17 | 18 | # file: Tokenizer.py 19 | # 20 | # A Penn Treebank tokenizer reimplemented based on the MOSES implementation. 
21 | # 22 | # usage : %prog < input > output 23 | 24 | 25 | import re 26 | import sys 27 | 28 | 29 | class DummyTokenizer(object): 30 | 31 | def tokenize(self, text): 32 | return text.split() 33 | 34 | 35 | 36 | class PTBTokenizer(object): 37 | 38 | def __init__(self, language="en"): 39 | self.language = language 40 | self.nonbreaking_prefixes = {} 41 | self.nonbreaking_prefixes_numeric = {} 42 | self.nonbreaking_prefixes["en"] = ''' A B C D E F G H I J K L M N O P Q R S T U V W X Y Z 43 | Adj Adm Adv Asst Bart Bldg Brig Bros Capt Cmdr Col Comdr Con Corp Cpl DR Dr Drs Ens 44 | Gen Gov Hon Hr Hosp Insp Lt MM MR MRS MS Maj Messrs Mlle Mme Mr Mrs Ms Msgr Op Ord 45 | Pfc Ph Prof Pvt Rep Reps Res Rev Rt Sen Sens Sfc Sgt Sr St Supt Surg 46 | v vs i.e rev e.g Nos Nr'''.split() 47 | self.nonbreaking_prefixes_numeric["en"] = '''No Art pp'''.split() 48 | self.special_chars = re.compile(r"([^\w\s\.\'\`\,\-\"\|\/])", flags=re.UNICODE) 49 | 50 | def tokenize(self, text, ptb=False): 51 | text = text.strip() 52 | text = " " + text + " " 53 | 54 | # Separate all "other" punctuation 55 | 56 | text = re.sub(self.special_chars, r' \1 ', text) 57 | text = re.sub(r";", r' ; ', text) 58 | text = re.sub(r":", r' : ', text) 59 | 60 | # replace the pipe character 61 | text = re.sub(r"\|", r' -PIPE- ', text) 62 | 63 | # split internal slash, keep others 64 | text = re.sub(r"(\S)/(\S)", r'\1 / \2', text) 65 | 66 | # PTB tokenization 67 | if ptb: 68 | text = re.sub(r"\(", r' -LRB- ', text) 69 | text = re.sub(r"\)", r' -RRB- ', text) 70 | text = re.sub(r"\[", r' -LSB- ', text) 71 | text = re.sub(r"\]", r' -RSB- ', text) 72 | text = re.sub(r"\{", r' -LCB- ', text) 73 | text = re.sub(r"\}", r' -RCB- ', text) 74 | 75 | text = re.sub(r"\"\s*$", r" '' ", text) 76 | text = re.sub(r"^\s*\"", r' `` ', text) 77 | text = re.sub(r"(\S)\"\s", r"\1 '' ", text) 78 | text = re.sub(r"\s\"(\S)", r" `` \1", text) 79 | text = re.sub(r"(\S)\"", r"\1 '' ", text) 80 | text = re.sub(r"\"(\S)", r" `` \1", text) 81 | text = re.sub(r"'\s*$", r" ' ", text) 82 | text = re.sub(r"^\s*'", r" ` ", text) 83 | text = re.sub(r"(\S)'\s", r"\1 ' ", text) 84 | text = re.sub(r"\s'(\S)", r" ` \1", text) 85 | 86 | text = re.sub(r"'ll", r" -CONTRACT-ll", text) 87 | text = re.sub(r"'re", r" -CONTRACT-re", text) 88 | text = re.sub(r"'ve", r" -CONTRACT-ve", text) 89 | text = re.sub(r"n't", r" n-CONTRACT-t", text) 90 | text = re.sub(r"'LL", r" -CONTRACT-LL", text) 91 | text = re.sub(r"'RE", r" -CONTRACT-RE", text) 92 | text = re.sub(r"'VE", r" -CONTRACT-VE", text) 93 | text = re.sub(r"N'T", r" N-CONTRACT-T", text) 94 | text = re.sub(r"cannot", r"can not", text) 95 | text = re.sub(r"Cannot", r"Can not", text) 96 | 97 | # multidots stay together 98 | text = re.sub(r"\.([\.]+)", r" DOTMULTI\1", text) 99 | while re.search("DOTMULTI\.", text): 100 | text = re.sub(r"DOTMULTI\.([^\.])", r"DOTDOTMULTI \1", text) 101 | text = re.sub(r"DOTMULTI\.", r"DOTDOTMULTI", text) 102 | 103 | # multidashes stay together 104 | text = re.sub(r"\-([\-]+)", r" DASHMULTI\1", text) 105 | while re.search("DASHMULTI\-", text): 106 | text = re.sub(r"DASHMULTI\-([^\-])", r"DASHDASHMULTI \1", text) 107 | text = re.sub(r"DASHMULTI\-", r"DASHDASHMULTI", text) 108 | 109 | # Separate ',' except if within number. 110 | text = re.sub(r"(\D),(\D)", r'\1 , \2', text) 111 | # Separate ',' pre and post number. 
112 | text = re.sub(r"(\d),(\D)", r'\1 , \2', text) 113 | text = re.sub(r"(\D),(\d)", r'\1 , \2', text) 114 | 115 | if self.language == "en": 116 | text = re.sub(r"([^a-zA-Z])'([^a-zA-Z])", r"\1 ' \2", text) 117 | text = re.sub(r"(\W)'([a-zA-Z])", r"\1 ' \2", text) 118 | text = re.sub(r"([a-zA-Z])'([^a-zA-Z])", r"\1 ' \2", text) 119 | text = re.sub(r"([a-zA-Z])'([a-zA-Z])", r"\1 '\2", text) 120 | text = re.sub(r"(\d)'(s)", r"\1 '\2", text) 121 | text = re.sub(r" '\s+s ", r" 's ", text) 122 | text = re.sub(r" '\s+s ", r" 's ", text) 123 | elif self.language == "fr": 124 | text = re.sub(r"([^a-zA-Z])'([^a-zA-Z])", r"\1 ' \2", text) 125 | text = re.sub(r"([^a-zA-Z])'([a-zA-Z])", r"\1 ' \2", text) 126 | text = re.sub(r"([a-zA-Z])'([^a-zA-Z])", r"\1 ' \2", text) 127 | text = re.sub(r"([a-zA-Z])'([a-zA-Z])", r"\1' \2", text) 128 | else: 129 | text = re.sub(r"'", r" ' ") 130 | 131 | # re-combine single quotes 132 | text = re.sub(r"' '", r"''", text) 133 | 134 | words = text.split() 135 | text = '' 136 | for i, word in enumerate(words): 137 | m = re.match("^(\S+)\.$", word) 138 | if m: 139 | pre = m.group(1) 140 | if ((re.search("\.", pre) and re.search("[a-zA-Z]", pre)) or \ 141 | (pre in self.nonbreaking_prefixes[self.language]) or \ 142 | ((i < len(words)-1) and re.match("^\d+", words[i+1]))): 143 | pass # do nothing 144 | elif ((pre in self.nonbreaking_prefixes_numeric[self.language] ) and \ 145 | (i < len(words)-1) and re.match("\d+", words[i+1])): 146 | pass # do nothing 147 | else: 148 | word = pre + " ." 149 | 150 | text += word + " " 151 | text = re.sub(r"'\s+'", r"''", text) 152 | 153 | # restore multidots 154 | while re.search("DOTDOTMULTI", text): 155 | text = re.sub(r"DOTDOTMULTI", r"DOTMULTI.", text) 156 | text = re.sub(r"DOTMULTI", r".", text) 157 | 158 | # restore multidashes 159 | while re.search("DASHDASHMULTI", text): 160 | text = re.sub(r"DASHDASHMULTI", r"DASHMULTI-", text) 161 | text = re.sub(r"DASHMULTI", r"-", text) 162 | text = re.sub(r"-CONTRACT-", r"'", text) 163 | 164 | return text.split() 165 | 166 | 167 | def tokenize_all(self,sentences, ptb=False): 168 | return [self.tokenize(t, ptb) for t in sentences] 169 | 170 | # starting point 171 | if __name__ == "__main__": 172 | tokenizer = PTBTokenizer() 173 | for line in sys.stdin: 174 | line = line.decode("utf8") 175 | tokens = tokenizer.tokenize(line.strip()) 176 | out = ' '.join(tokens) 177 | print out.encode("utf8") 178 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/scripts/combiner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This file is part of the NUS M2 scorer. 4 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | 9 | # The NUS M2 scorer is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 
16 | 17 | # file: m2scorer.py 18 | # 19 | # score a system's output against a gold reference 20 | # 21 | # Usage: m2scorer.py [OPTIONS] proposed_sentences source_gold 22 | # where 23 | # proposed_sentences - system output, sentence per line 24 | # source_gold - source sentences with gold token edits 25 | # OPTIONS 26 | # -v --verbose - print verbose output 27 | # --very_verbose - print lots of verbose output 28 | # --max_unchanged_words N - Maximum unchanged words when extracting edits. Default 2." 29 | # --ignore_whitespace_casing - Ignore edits that only affect whitespace and caseing. Default no." 30 | # 31 | 32 | import sys 33 | import levenshtein 34 | from getopt import getopt 35 | from util import paragraphs 36 | from util import smart_open 37 | 38 | 39 | 40 | def load_annotation(gold_file): 41 | source_sentences = [] 42 | gold_edits = [] 43 | fgold = smart_open(gold_file, 'r') 44 | puffer = fgold.read() 45 | fgold.close() 46 | puffer = puffer.decode('utf8') 47 | for item in paragraphs(puffer.splitlines(True)): 48 | item = item.splitlines(False) 49 | sentence = [line[2:].strip() for line in item if line.startswith('S ')] 50 | assert sentence != [] 51 | annotations = {} 52 | for line in item[1:]: 53 | if line.startswith('I ') or line.startswith('S '): 54 | continue 55 | assert line.startswith('A ') 56 | line = line[2:] 57 | fields = line.split('|||') 58 | start_offset = int(fields[0].split()[0]) 59 | end_offset = int(fields[0].split()[1]) 60 | etype = fields[1] 61 | if etype == 'noop': 62 | start_offset = -1 63 | end_offset = -1 64 | corrections = [c.strip() if c != '-NONE-' else '' for c in fields[2].split('||')] 65 | # NOTE: start and end are *token* offsets 66 | original = ' '.join(' '.join(sentence).split()[start_offset:end_offset]) 67 | annotator = int(fields[5]) 68 | if annotator not in annotations.keys(): 69 | annotations[annotator] = [] 70 | annotations[annotator].append((start_offset, end_offset, original, corrections)) 71 | tok_offset = 0 72 | for this_sentence in sentence: 73 | tok_offset += len(this_sentence.split()) 74 | source_sentences.append(this_sentence) 75 | this_edits = {} 76 | for annotator, annotation in annotations.iteritems(): 77 | this_edits[annotator] = [edit for edit in annotation if edit[0] <= tok_offset and edit[1] <= tok_offset and edit[0] >= 0 and edit[1] >= 0] 78 | if len(this_edits) == 0: 79 | this_edits[0] = [] 80 | gold_edits.append(this_edits) 81 | return (source_sentences, gold_edits) 82 | 83 | 84 | def print_usage(): 85 | print >> sys.stderr, "Usage: m2scorer.py [OPTIONS] proposed_sentences gold_source" 86 | print >> sys.stderr, "where" 87 | print >> sys.stderr, " proposed_sentences - system output, sentence per line" 88 | print >> sys.stderr, " source_gold - source sentences with gold token edits" 89 | print >> sys.stderr, "OPTIONS" 90 | print >> sys.stderr, " -v --verbose - print verbose output" 91 | print >> sys.stderr, " --very_verbose - print lots of verbose output" 92 | print >> sys.stderr, " --max_unchanged_words N - Maximum unchanged words when extraction edit. Default 2." 93 | print >> sys.stderr, " --ignore_whitespace_casing - Ignore edits that only affect whitespace and caseing. Default no." 
94 | 95 | 96 | 97 | max_unchanged_words=2 98 | ignore_whitespace_casing= False 99 | verbose = False 100 | very_verbose = False 101 | opts, args = getopt(sys.argv[1:], "v", ["max_unchanged_words=", "verbose", "ignore_whitespace_casing", "very_verbose"]) 102 | for o, v in opts: 103 | if o in ('-v', '--verbose'): 104 | verbose = True 105 | elif o == '--very_verbose': 106 | very_verbose = True 107 | elif o == '--max_unchanged_words': 108 | max_unchanged_words = int(v) 109 | elif o == '--ignore_whitespace_casing': 110 | ignore_whitespace_casing = True 111 | else: 112 | print >> sys.stderr, "Unknown option :", o 113 | print_usage() 114 | sys.exit(-1) 115 | 116 | 117 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/scripts/convert_hoo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This file is part of the NUS M2 scorer. 4 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | 9 | # The NUS M2 scorer is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 16 | 17 | # file: convert_hoo.py 18 | # 19 | # convert source xml file and gold annotation to 20 | # merged file with sentence-per-line sentences and 21 | # annotation. 
22 | # 23 | # usage : %prog [-p] source.xml [gold.xml] > output 24 | 25 | from Tokenizer import PTBTokenizer 26 | import xml.dom.minidom 27 | import sys 28 | import re 29 | import getopt 30 | from util import fix_cp1252codes 31 | 32 | 33 | ## global variables 34 | tokenizer = PTBTokenizer() 35 | 36 | def slice_paragraph(text): 37 | yield (0,len(text),text) 38 | def slice_tokenize(text): 39 | import nltk 40 | sentence_spliter = nltk.data.load('tokenizers/punkt/english.pickle') 41 | last_break = 0 42 | for match in sentence_spliter._lang_vars.period_context_re().finditer(text): 43 | context = match.group() + match.group('after_tok') 44 | if sentence_spliter.text_contains_sentbreak(context): 45 | yield (last_break, match.end(), text[last_break:match.end()]) 46 | if match.group('next_tok'): 47 | # next sentence starts after whitespace 48 | last_break = match.start('next_tok') 49 | else: 50 | # next sentence starts at following punctuation 51 | last_break = match.end() 52 | yield (last_break, len(text), text[last_break:len(text)]) 53 | 54 | def get_text(node): 55 | # get text data from xml tag 56 | buffer = '' 57 | for t in node.childNodes: 58 | if t.nodeType == t.TEXT_NODE: 59 | buffer += t.data 60 | return buffer 61 | 62 | def has_empty(node): 63 | # check if node has tag child 64 | return len(node.getElementsByTagName('empty')) > 0 65 | 66 | def get_textbody(sdom): 67 | parts = [] 68 | for b in sdom.getElementsByTagName('BODY'): 69 | for pa in b.getElementsByTagName('PART'): 70 | part_id = pa.attributes["id"].value 71 | buffer = [] 72 | for p in pa.getElementsByTagName('P'): 73 | buffer.append(get_text(p)) 74 | parts.append((buffer, part_id)) 75 | return parts 76 | 77 | def get_edits(gdom): 78 | edits = [] 79 | for es in gdom.getElementsByTagName('edits'): 80 | for e in es.getElementsByTagName('edit'): 81 | start = int(e.attributes["start"].value) 82 | end = int(e.attributes["end"].value) 83 | part = e.attributes["part"].value 84 | etype = e.attributes["type"].value 85 | o = e.getElementsByTagName('original')[0] 86 | if len(o.getElementsByTagName('empty')) > 0: 87 | original = '' 88 | else: 89 | original = get_text(o).strip() 90 | corrections = [] 91 | optional = False 92 | for cs in e.getElementsByTagName('corrections'): 93 | for c in cs.getElementsByTagName('correction'): 94 | if len(c.getElementsByTagName('empty')) > 0: 95 | corrections.append('') 96 | else: 97 | correction = get_text(c).strip() 98 | if correction == '': 99 | optional = True 100 | else: 101 | corrections.append(correction) 102 | edits.append([start, end, part, etype, original, corrections, optional]) 103 | return edits 104 | 105 | 106 | # starting point 107 | if __name__ == "__main__": 108 | opts, args = getopt.getopt(sys.argv[1:], "p") 109 | paragraph = False 110 | for o,a in opts : 111 | if o == "-p" : 112 | paragraph = True 113 | 114 | if len(args) < 1 or len(args) > 2: 115 | print >> sys.stderr, "usage: %prog [-p] source.xml [gold.xml] > output" 116 | sys.exit(-1) 117 | fsource = args[0] 118 | gold = 0 119 | if len(args) == 2: 120 | fgold = args[1] 121 | gold = 1 122 | 123 | 124 | # parse xml files 125 | source_dom = xml.dom.minidom.parse(fsource) 126 | if gold : 127 | gold_dom = xml.dom.minidom.parse(fgold) 128 | 129 | 130 | # read the xml 131 | parts = get_textbody(source_dom) 132 | if gold : 133 | edits = get_edits(gold_dom) 134 | 135 | # sentence split 136 | slice = slice_tokenize 137 | if paragraph : 138 | slice = slice_paragraph 139 | for part, part_no in parts: 140 | offset = 0 141 | for p in part: 142 | for 
s_start, s_end, s in slice(p): 143 | if s.strip() == '': 144 | continue 145 | print "S", s.encode('utf8') 146 | if gold : 147 | this_edits = [e for e in edits if e[0] >= offset + s_start 148 | and e[1] < offset + s_end and e[2] == part_no] 149 | for e in this_edits: 150 | start = e[0] - (offset + s_start) 151 | end = e[1] - (offset + s_start) 152 | etype = e[3] 153 | cor = "||".join(e[5]) 154 | req = "REQUIRED" if e[6] == False else "OPTIONAL" 155 | out = "A %d %d|||%s|||%s|||%s|||-NONE-|||0" % (start, end, etype, cor, req) 156 | print out.encode('utf8') 157 | print "" 158 | offset += s_end 159 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/scripts/convert_nucle.py: -------------------------------------------------------------------------------- 1 | # convert_nucle.py 2 | # 3 | # Author: Christian Hadiwinoto 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Contact: chrhad@comp.nus.edu.sg 7 | # 8 | # Version: 1.0 9 | # 10 | # Original: Yuanbin Wu 11 | # National University of Singapore (NUS) 12 | # Contact: wuyb@comp.nus.edu.sg 13 | # 14 | # This script is distributed to support the CoNLL-2013 Shared Task. 15 | # It is free for research and educational purposes. 16 | # 17 | # Usage: python convert_nucle.py sgmlFile > m2File 18 | 19 | from nuclesgmlparser import nuclesgmlparser 20 | from nucle_doc import * 21 | import nltk.data 22 | import re 23 | import sys 24 | import getopt 25 | 26 | class PreProcessor: 27 | 28 | def __init__(self): 29 | 30 | self.sentenceTokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 31 | self.sentenceDumpedFile = 'sentence_file' 32 | self.docsDumpedFileName = 'docs' 33 | 34 | def readNUCLE(self, fn): 35 | 36 | f = open(fn, 'r') 37 | parser = nuclesgmlparser() 38 | filestr = f.read() 39 | filestr = filestr.decode('utf-8') 40 | 41 | #Fix Reference tag 42 | p = re.compile(r'(\n

\n.*\n)

') 43 | filestr = p.sub(r'\1

', filestr) 44 | 45 | parser.feed(filestr) 46 | f.close() 47 | parser.close() 48 | 49 | return parser.docs 50 | 51 | 52 | def sentenceSplit(self, docs): 53 | 54 | for doc in docs: 55 | for par in doc.paragraphs: 56 | doc.sentences.append([]) 57 | for s in self.sentenceTokenizer.tokenize(par): 58 | doc.buildSentence(s, [], '', [], []) 59 | return docs 60 | 61 | 62 | def m2FileGeneration(self, docs): 63 | 64 | for doc in docs: 65 | for slistIndex in xrange(len(doc.sentences)): 66 | slist = doc.sentences[slistIndex] 67 | for sentid in xrange(len(slist)): 68 | 69 | sent = slist[sentid] 70 | 71 | # annotation string list 72 | annotationList = [] 73 | 74 | # m2 format annotation string list 75 | m2AnnotationList = [] 76 | 77 | # build colums 78 | table = sent.getConllFormat(doc, slistIndex, sentid) 79 | tokenizedSentStr = ' '.join(sent.getWords()) 80 | 81 | #Add annotation info 82 | sentoffset = doc.paragraphs[slistIndex].index(sent.sentstr) 83 | for m in doc.mistakes: 84 | 85 | if m['start_par'] != slistIndex or \ 86 | m['start_par'] != m['end_par'] or \ 87 | m['start_off'] < sentoffset or \ 88 | m['start_off'] >= sentoffset + len(sent.sentstr) or \ 89 | m['end_off'] sentoffset + len(sent.sentstr): 91 | continue 92 | 93 | wordsoffset = 0 94 | wdstart = 0 95 | 96 | startInWord = 0 97 | headText = '' 98 | endInWord = 0 99 | tailText = '' 100 | 101 | words = sent.getWords() 102 | while wdstart < len(words): 103 | 104 | word = words[wdstart] 105 | nextstart = sent.sentstr.find(word, wordsoffset) 106 | 107 | if nextstart == -1: 108 | # may not find word, due to relpacement 109 | print >> sys.stderr, "Warning: can not find word" 110 | print >> sys.stderr, word.encode('utf-8') 111 | wordsoffset += 1 112 | else: 113 | wordsoffset = nextstart 114 | 115 | if wordsoffset >= m['start_off']-sentoffset: 116 | break 117 | elif wordsoffset + len(word) > m['start_off']-sentoffset: 118 | # annotation starts at the middle of a word 119 | startInWord = 1 120 | headText = sent.sentstr[wordsoffset: m['start_off']-sentoffset] 121 | break 122 | 123 | wordsoffset += len(word) 124 | wdstart += 1 125 | 126 | if wdstart == len(words): 127 | print >> sys.stderr, 'Warning in building conll format: start_off overflow' 128 | print >> sys.stderr, m, sent.sentstr.encode('utf-8') 129 | continue 130 | 131 | wdend = wdstart 132 | while wdend < len(words): 133 | 134 | word = words[wdend] 135 | 136 | nextstart = sent.sentstr.find(word, wordsoffset) 137 | 138 | if nextstart == -1: 139 | print >> sys.stderr, "Warning in building conll format: can not find word" 140 | print >> sys.stderr, word.encode('utf-8') 141 | wordsoffset += 1 142 | else: 143 | wordsoffset = nextstart 144 | 145 | if wordsoffset >= m['end_off']-sentoffset: 146 | # annotation ends at the middle of a word 147 | if wordsoffset - len(word) < m['end_off']-sentoffset: 148 | endInWord = 1 149 | tailText = sent.sentstr[m['end_off']-sentoffset : wordsoffset].strip() 150 | break 151 | 152 | wordsoffset += len(word) 153 | wdend += 1 154 | 155 | # build annotation string for .conll.m2 file 156 | m2AnnotationStr = 'A ' 157 | m2AnnotationStr += str(wdstart) + ' ' 158 | m2AnnotationStr += str(wdend) + '|||' 159 | m2AnnotationStr += m['type'] + '|||' 160 | m2AnnotationStr += m['correction'].replace('\n', '') + '|||' 161 | m2AnnotationStr += 'REQUIRED|||-NONE-|||0\n' 162 | 163 | m2AnnotationList.append(m2AnnotationStr) 164 | 165 | # write .conll.m2 file 166 | if len(m2AnnotationList) != 0: 167 | m2AnnotationSent = 'S ' + tokenizedSentStr + '\n' 168 | m2AnnotationSent += 
''.join(m2AnnotationList) + '\n' 169 | sys.stdout.write(m2AnnotationSent.encode('utf-8')) 170 | 171 | 172 | def usage_release(): 173 | print '\nUsage: python preprocess_nmt.py sgmlFile > outputFile \n\n' 174 | 175 | if __name__ == '__main__': 176 | opts, args = getopt.getopt(sys.argv[1:], "") 177 | 178 | if len(args) != 1: 179 | usage_release() 180 | sys.exit(2) 181 | 182 | ppr = PreProcessor() 183 | debug = False 184 | 185 | sgmlFileName = args[0] 186 | 187 | docs = ppr.sentenceSplit(ppr.readNUCLE(sgmlFileName)) 188 | ppr.m2FileGeneration(docs) 189 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/scripts/m2scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This file is part of the NUS M2 scorer. 4 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | 9 | # The NUS M2 scorer is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 16 | 17 | # file: m2scorer.py 18 | # 19 | # score a system's output against a gold reference 20 | # 21 | # Usage: m2scorer.py [OPTIONS] proposed_sentences source_gold 22 | # where 23 | # proposed_sentences - system output, sentence per line 24 | # source_gold - source sentences with gold token edits 25 | # OPTIONS 26 | # -v --verbose - print verbose output 27 | # --very_verbose - print lots of verbose output 28 | # --max_unchanged_words N - Maximum unchanged words when extracting edits. Default 2." 29 | # --ignore_whitespace_casing - Ignore edits that only affect whitespace and caseing. Default no." 
30 | # 31 | 32 | import sys 33 | import levenshtein 34 | from getopt import getopt 35 | from util import paragraphs 36 | from util import smart_open 37 | 38 | 39 | 40 | def load_annotation(gold_file): 41 | source_sentences = [] 42 | gold_edits = [] 43 | fgold = smart_open(gold_file, 'r') 44 | puffer = fgold.read() 45 | fgold.close() 46 | puffer = puffer.decode('utf8') 47 | for item in paragraphs(puffer.splitlines(True)): 48 | item = item.splitlines(False) 49 | sentence = [line[2:].strip() for line in item if line.startswith('S ')] 50 | assert sentence != [] 51 | annotations = {} 52 | for line in item[1:]: 53 | if line.startswith('I ') or line.startswith('S '): 54 | continue 55 | assert line.startswith('A ') 56 | line = line[2:] 57 | fields = line.split('|||') 58 | start_offset = int(fields[0].split()[0]) 59 | end_offset = int(fields[0].split()[1]) 60 | etype = fields[1] 61 | if etype == 'noop': 62 | start_offset = -1 63 | end_offset = -1 64 | corrections = [c.strip() if c != '-NONE-' else '' for c in fields[2].split('||')] 65 | # NOTE: start and end are *token* offsets 66 | original = ' '.join(' '.join(sentence).split()[start_offset:end_offset]) 67 | annotator = int(fields[5]) 68 | if annotator not in annotations.keys(): 69 | annotations[annotator] = [] 70 | annotations[annotator].append((start_offset, end_offset, original, corrections)) 71 | tok_offset = 0 72 | for this_sentence in sentence: 73 | tok_offset += len(this_sentence.split()) 74 | source_sentences.append(this_sentence) 75 | this_edits = {} 76 | for annotator, annotation in annotations.iteritems(): 77 | this_edits[annotator] = [edit for edit in annotation if edit[0] <= tok_offset and edit[1] <= tok_offset and edit[0] >= 0 and edit[1] >= 0] 78 | if len(this_edits) == 0: 79 | this_edits[0] = [] 80 | gold_edits.append(this_edits) 81 | return (source_sentences, gold_edits) 82 | 83 | 84 | def print_usage(): 85 | print >> sys.stderr, "Usage: m2scorer.py [OPTIONS] proposed_sentences gold_source" 86 | print >> sys.stderr, "where" 87 | print >> sys.stderr, " proposed_sentences - system output, sentence per line" 88 | print >> sys.stderr, " source_gold - source sentences with gold token edits" 89 | print >> sys.stderr, "OPTIONS" 90 | print >> sys.stderr, " -v --verbose - print verbose output" 91 | print >> sys.stderr, " --very_verbose - print lots of verbose output" 92 | print >> sys.stderr, " --max_unchanged_words N - Maximum unchanged words when extraction edit. Default 2." 93 | print >> sys.stderr, " --ignore_whitespace_casing - Ignore edits that only affect whitespace and caseing. Default no." 
94 | 95 | 96 | 97 | max_unchanged_words=2 98 | ignore_whitespace_casing= False 99 | verbose = False 100 | very_verbose = False 101 | opts, args = getopt(sys.argv[1:], "v", ["max_unchanged_words=", "verbose", "ignore_whitespace_casing", "very_verbose"]) 102 | for o, v in opts: 103 | if o in ('-v', '--verbose'): 104 | verbose = True 105 | elif o == '--very_verbose': 106 | very_verbose = True 107 | elif o == '--max_unchanged_words': 108 | max_unchanged_words = int(v) 109 | elif o == '--ignore_whitespace_casing': 110 | ignore_whitespace_casing = True 111 | else: 112 | print >> sys.stderr, "Unknown option :", o 113 | print_usage() 114 | sys.exit(-1) 115 | 116 | # starting point 117 | if len(args) != 2: 118 | print_usage() 119 | sys.exit(-1) 120 | 121 | system_file = args[0] 122 | gold_file = args[1] 123 | 124 | # load source sentences and gold edits 125 | source_sentences, gold_edits = load_annotation(gold_file) 126 | 127 | # load system hypotheses 128 | fin = smart_open(system_file, 'r') 129 | system_sentences = [line.decode("utf8").strip() for line in fin.readlines()] 130 | fin.close() 131 | 132 | p, r, f1 = levenshtein.batch_multi_pre_rec_f1(system_sentences, source_sentences, gold_edits, max_unchanged_words, ignore_whitespace_casing, verbose, very_verbose) 133 | 134 | print "Precision : %.4f" % p 135 | print "Recall : %.4f" % r 136 | print "F1 : %.4f" % f1 137 | 138 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/scripts/nucle_doc.py: -------------------------------------------------------------------------------- 1 | # nucle_doc.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 
12 | 13 | import os 14 | import sys 15 | from nltk import word_tokenize 16 | 17 | class nucle_doc: 18 | def __init__(self): 19 | self.docattrs = None 20 | 21 | self.matric = '' 22 | self.email = '' 23 | self.nationality = '' 24 | self.firstLanguage = '' 25 | self.schoolLanguage = '' 26 | self.englishTests = '' 27 | 28 | self.paragraphs = [] 29 | self.annotation = [] 30 | self.mistakes = [] 31 | 32 | self.sentences = [] 33 | 34 | def buildSentence(self, sentstr, dpnode, constituentstr, poslist, chunklist): 35 | self.sentences[-1].append(nucle_sent(sentstr, dpnode, constituentstr, poslist, chunklist)) 36 | 37 | def addSentence(self, sent): 38 | self.sentences[-1].append(sent) 39 | 40 | def findMistake(self, par, pos): 41 | for m in self.mistakes: 42 | if par == m['start_par'] and pos >= m['start_off'] and pos < m['end_off']: 43 | return m 44 | return None 45 | 46 | 47 | class nucle_sent: 48 | def __init__(self, sentstr, dpnode, constituentstr, poslist, chunklist): 49 | self.sentstr = sentstr 50 | self.words = word_tokenize(sentstr) 51 | self.dpnodes = dpnode 52 | self.constituentstr = constituentstr 53 | self.constituentlist = [] 54 | self.poslist = poslist 55 | self.chunklist = chunklist 56 | 57 | def buildConstituentList(self): 58 | 59 | s = self.constituentstr.strip().replace('\n', '').replace(' ', '') 60 | r = [] 61 | i = 0 62 | while i < len(s): 63 | j = i 64 | while j < len(s) and s[j] != ')': 65 | j += 1 66 | k = j 67 | while k < len(s) and s[k] == ')': 68 | k += 1 69 | 70 | nodeWholeStr = s[i:k] 71 | lastLRBIndex = nodeWholeStr.rfind('(') 72 | nodeStr = nodeWholeStr[:lastLRBIndex] + '*' + s[j+1:k] 73 | 74 | r.append(nodeStr) 75 | i = k 76 | 77 | if len(r) != len(self.words): 78 | print >> sys.stderr, 'Error in buiding constituent tree bits: different length with words.' 79 | print >> sys.stderr, len(r), len(self.words) 80 | print >> sys.stderr, ' '.join(r).encode('utf-8') 81 | print >> sys.stderr, words 82 | sys.exit(1) 83 | 84 | self.constituentlist = r 85 | 86 | 87 | 88 | def setDpNode(self, dpnode): 89 | self.dpnodes = dpnode 90 | 91 | def setPOSList(self, poslist): 92 | self.poslist = poslist 93 | 94 | def setConstituentStr(self, constituentstr): 95 | self.constituentstr = constituentstr 96 | 97 | def setConstituentList(self, constituentlist): 98 | self.constituentlist = constituentlist 99 | 100 | def setWords(self, words): 101 | self.words = words 102 | 103 | def setChunkList(self, chunklist): 104 | self.chunklist = chunklist 105 | 106 | def getDpNode(self): 107 | return self.dpnodes 108 | 109 | def getPOSList(self): 110 | return self.poslist 111 | 112 | def getConstituentStr(self): 113 | return self.constituentstr 114 | 115 | def getConstituentList(self): 116 | return self.constituentlist 117 | 118 | def getWords(self): 119 | return self.words 120 | 121 | def getChunkList(self): 122 | return self.chunklist 123 | 124 | def getConllFormat(self, doc, paragraphIndex, sentIndex): 125 | 126 | table = [] 127 | 128 | dpnodes = self.getDpNode() 129 | poslist = self.getPOSList() 130 | #chunklist = self.getChunkList() 131 | words = self.getWords() 132 | constituentlist = self.getConstituentList() 133 | 134 | if len(poslist) == 0: 135 | hasParseInfo = 0 136 | else: 137 | hasParseInfo = 1 138 | 139 | if len(words) != len(poslist) and len(poslist) != 0: 140 | print >> sys.stderr, 'Error in buiding Conll Format: different length stanford parser postags and words.' 
141 | print >> sys.stderr, 'len words:', len(words), words 142 | print >> sys.stderr, 'len poslist:', len(poslist), poslist 143 | sys.exit(1) 144 | 145 | for wdindex in xrange(len(words)): 146 | 147 | word = words[wdindex] 148 | 149 | row = [] 150 | row.append(doc.docattrs[0][1]) #docinfo 151 | row.append(paragraphIndex) #paragraph index 152 | row.append(sentIndex) #paragraph index 153 | row.append(wdindex) #word index 154 | row.append(word) #word 155 | 156 | #row.append(chunknode.label) #chunk 157 | if hasParseInfo == 1: 158 | 159 | posword = poslist[wdindex] 160 | splitp = posword.rfind('/') 161 | pos = posword[splitp+1 : ].strip() 162 | 163 | #chunknode = chunklist[wdindex] 164 | 165 | constituentnode = constituentlist[wdindex] 166 | 167 | dpnode = None 168 | for d in dpnodes: 169 | if d.index == wdindex: 170 | dpnode = d 171 | break 172 | 173 | row.append(pos) #POS 174 | if dpnode == None: 175 | row.append('-') 176 | row.append('-') 177 | else: 178 | row.append(dpnode.parent_index) #dp parent 179 | row.append(dpnode.grammarrole) #dp label 180 | row.append(constituentnode) #constituent 181 | 182 | table.append(row) 183 | 184 | return table 185 | 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/scripts/nuclesgmlparser.py: -------------------------------------------------------------------------------- 1 | # nuclesgmlparser.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 12 | 13 | from sgmllib import SGMLParser 14 | from nucle_doc import nucle_doc 15 | 16 | 17 | class nuclesgmlparser(SGMLParser): 18 | def __init__(self): 19 | SGMLParser.__init__(self) 20 | self.docs = [] 21 | 22 | def reset(self): 23 | self.docs = [] 24 | self.data = [] 25 | SGMLParser.reset(self) 26 | 27 | def unknow_starttag(self, tag, attrs): 28 | pass 29 | 30 | def unknow_endtag(self): 31 | pass 32 | 33 | def start_doc(self, attrs): 34 | self.docs.append(nucle_doc()) 35 | self.docs[-1].docattrs = attrs 36 | 37 | def end_doc(self): 38 | pass 39 | 40 | def start_matric(self, attrs): 41 | pass 42 | 43 | def end_matric(self): 44 | self.docs[-1].matric = ''.join(self.data) 45 | self.data = [] 46 | pass 47 | 48 | def start_email(self, attrs): 49 | pass 50 | 51 | def end_email(self): 52 | self.docs[-1].email = ''.join(self.data) 53 | self.data = [] 54 | pass 55 | 56 | def start_nationality(self, attrs): 57 | pass 58 | 59 | def end_nationality(self): 60 | self.docs[-1].nationality = ''.join(self.data) 61 | self.data = [] 62 | pass 63 | 64 | def start_first_language(self, attrs): 65 | pass 66 | 67 | def end_first_language(self): 68 | self.docs[-1].firstLanguage = ''.join(self.data) 69 | self.data = [] 70 | pass 71 | 72 | def start_school_language(self, attrs): 73 | pass 74 | 75 | def end_school_language(self): 76 | self.docs[-1].schoolLanguage = ''.join(self.data) 77 | self.data = [] 78 | pass 79 | 80 | def start_english_tests(self, attrs): 81 | pass 82 | 83 | def end_english_tests(self): 84 | self.docs[-1].englishTests = ''.join(self.data) 85 | self.data = [] 86 | pass 87 | 88 | 89 | def start_text(self, attrs): 90 | pass 91 | 92 | def end_text(self): 93 | pass 94 | 95 | def start_title(self, attrs): 96 | pass 97 | 98 | def end_title(self): 99 | self.docs[-1].paragraphs.append(''.join(self.data)) 
100 | self.data = [] 101 | pass 102 | 103 | 104 | def start_p(self, attrs): 105 | pass 106 | 107 | def end_p(self): 108 | self.docs[-1].paragraphs.append(''.join(self.data)) 109 | self.data = [] 110 | pass 111 | 112 | 113 | def start_annotation(self, attrs): 114 | self.docs[-1].annotation.append(attrs) 115 | 116 | def end_annotation(self): 117 | pass 118 | 119 | def start_mistake(self, attrs): 120 | d = {} 121 | for t in attrs: 122 | d[t[0]] = int(t[1]) 123 | self.docs[-1].mistakes.append(d) 124 | pass 125 | 126 | def end_mistake(self): 127 | pass 128 | 129 | def start_type(self, attrs): 130 | pass 131 | 132 | def end_type(self): 133 | self.docs[-1].mistakes[-1]['type'] = ''.join(self.data) 134 | self.data = [] 135 | 136 | def start_correction(self, attrs): 137 | pass 138 | 139 | def end_correction(self): 140 | self.docs[-1].mistakes[-1]['correction'] = ''.join(self.data) 141 | self.data = [] 142 | 143 | def start_comment(self, attrs): 144 | pass 145 | 146 | def end_comment(self): 147 | self.docs[-1].mistakes[-1]['comment'] = ''.join( self.data) 148 | self.data = [] 149 | 150 | 151 | def handle_charref(self, ref): 152 | self.data.append('&' + ref) 153 | 154 | def handle_entityref(self, ref): 155 | self.data.append('&' + ref) 156 | 157 | def handle_data(self, text): 158 | if text.strip() == '': 159 | self.data.append('') 160 | return 161 | else: 162 | if text.startswith('\n'): 163 | text = text[1:] 164 | if text.endswith('\n'): 165 | text = text[:-1] 166 | self.data.append(text) 167 | 168 | 169 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/scripts/test.sgml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | CREATING A HABITABLE ENVIRONMENT 5 | 6 |

7 | Humans have many basic needs and one of them is to have an environment that can sustain their lives. Our current population is 6 billion people and it is still growing exponentially. This will, if not already, caused problems as there are very limited spaces for us. The solution can be obtain by using technology to achieve a better usage of space that we have and resolve the problems in lands that inhospitable such as desserts and swamps. 8 |

9 |

10 | Some countries are having difficulties in managing a place to live for their citizen as they tend to get overpopulated. This caused problem like the appearance of slums which most of the time is not safe due to the unhealthy environment. The only way to satisfy the increasing demands of space is by achieving a better usage of the land like designing taller building so it can accommodate more number of people with the same spaces. It is also important to create a better material that can support the buildings despite any natural disaster like earthquakes. A good example is Japan where there are a lot of tall condominiums despite the large number of earthquakes happening in there. Besides a better usage of lands, a better sanitation is also needed because a huge number of people need a clean environment to maintain their heath. For example, countries in Africa can accommodate more people if they can manage to design a better sanitation system. 11 |

12 |

13 | Countries with a lot of inhospitable space need not only to achieve a better space usage, but also to reforms the land to make it livable and technology can help it in a number of ways depending on the trouble the lands have. For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert. Dubai will be a good example for this as previously the country got almost no natural water and they use irrigation to bring natural water to the country. Another example is Netherlands, whose most of his lands is a swamp under sea level, have used a good irrigation system to counter their problem and to make their land habitable. 14 |

15 |

16 | As the number of people grows, the need of habitable environment is unquestionably essential. In this era, Engineering designs can help to provide more habitable accommodation by designing a stronger material so it's possible to create a taller and safer building, a better and efficient sanitation system to prevent disease, and also by designing a way to change the condition of the inhabitable environment. 17 |

18 |
19 | 20 | 21 | Vform 22 | cause 23 | 24 | 25 | Nn 26 | space 27 | 28 | 29 | SVA 30 | is 31 | 32 | 33 | Vform 34 | obtained 35 | 36 | 37 | Prep 38 | of 39 | 40 | 41 | V0 42 | that are inhospitable 43 | 44 | 45 | Nn 46 | citizens 47 | 48 | 49 | Others 50 | managing to get 51 | missing words 52 | 53 | 54 | Vt 55 | has caused 56 | 57 | 58 | Nn 59 | problems 60 | 61 | 62 | Wci 63 | formation and growth 64 | 65 | 66 | Vform 67 | are 68 | 69 | 70 | Others 71 | safe to live in 72 | missing words 73 | 74 | 75 | ArtOrDet 76 | their 77 | 78 | 79 | Prep 80 | for 81 | 82 | 83 | ArtOrDet 84 | 85 | 86 | 87 | Wci 88 | a greater 89 | 90 | 91 | Pref 92 | the same area of land 93 | 94 | 95 | Rloc- 96 | 97 | 98 | 99 | Wci 100 | use 101 | 102 | 103 | ArtOrDet 104 | 105 | 106 | 107 | ArtOrDet 108 | 109 | 110 | 111 | Wci 112 | during 113 | 114 | 115 | Nn 116 | disasters 117 | 118 | 119 | Mec 120 | Japan, 121 | 122 | 123 | Prep 124 | 125 | 126 | 127 | Nn 128 | land 129 | 130 | 131 | ArtOrDet 132 | 133 | 134 | 135 | Vt 136 | will need 137 | 138 | 139 | Mec 140 | health 141 | 142 | 143 | Vform 144 | reform 145 | 146 | 147 | Wci 148 | restore the land to a livable state 149 | 150 | 151 | Pref 152 | 153 | 154 | 155 | Wci 156 | quality 157 | 158 | 159 | Wci 160 | quality of the land 161 | 162 | 163 | Nn 164 | desert 165 | 166 | 167 | Wci 168 | transform 169 | 170 | 171 | Vform 172 | use 173 | 174 | 175 | Trans 176 | But 177 | 178 | 179 | Mec 180 | water. 181 | 182 | 183 | Pref 184 | it 185 | 186 | 187 | SVA 188 | uses 189 | 190 | 191 | Pref 192 | where 193 | 194 | 195 | Pref 196 | the 197 | 198 | 199 | Nn 200 | land 201 | 202 | 203 | ArtOrDet 204 | 205 | 206 | 207 | Sfrag 208 | level. It 209 | 210 | 211 | SVA 212 | has 213 | 214 | 215 | Prep 216 | for 217 | 218 | 219 | Mec 220 | engineering 221 | 222 | 223 | Wci 224 | increasing 225 | 226 | 227 | Wci 228 | environment 229 | 230 | 231 | ArtOrDet 232 | 233 | 234 | 235 | Wci 236 | build 237 | 238 | 239 | ArtOrDet 240 | taller and safer buildings 241 | 242 | 243 | Rloc- 244 | 245 | 246 | 247 | Mec 248 | and 249 | 250 | 251 | Rloc- 252 | changing 253 | 254 | 255 | Wci 256 | otherwise uninhabitable 257 | 258 | 259 |
260 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/scripts/token_offsets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This file is part of the NUS M2 scorer. 4 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | 9 | # The NUS M2 scorer is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 16 | 17 | # file: token_offsets.py 18 | # convert character to token offsets, tokenize sentence 19 | # 20 | # usage: %prog < input > output 21 | # 22 | 23 | 24 | import sys 25 | import re 26 | import os 27 | from util import * 28 | from Tokenizer import PTBTokenizer 29 | 30 | 31 | assert len(sys.argv) == 1 32 | 33 | 34 | # main 35 | # loop over sentences cum annotation 36 | tokenizer = PTBTokenizer() 37 | sentence = '' 38 | for line in sys.stdin: 39 | line = line.decode("utf8").strip() 40 | if line.startswith("S "): 41 | sentence = line[2:] 42 | sentence_tok = "S " + ' '.join(tokenizer.tokenize(sentence)) 43 | print sentence_tok.encode("utf8") 44 | elif line.startswith("A "): 45 | fields = line[2:].split('|||') 46 | start_end = fields[0] 47 | char_start, char_end = [int(a) for a in start_end.split()] 48 | # calculate token offsets 49 | prefix = sentence[:char_start] 50 | tok_start = len(tokenizer.tokenize(prefix)) 51 | postfix = sentence[:char_end] 52 | tok_end = len(tokenizer.tokenize(postfix)) 53 | start_end = str(tok_start) + " " + str(tok_end) 54 | fields[0] = start_end 55 | # tokenize corrections, remove trailing whitespace 56 | corrections = [(' '.join(tokenizer.tokenize(c))).strip() for c in fields[2].split('||')] 57 | fields[2] = '||'.join(corrections) 58 | annotation = "A " + '|||'.join(fields) 59 | print annotation.encode("utf8") 60 | else: 61 | print line.encode("utf8") 62 | 63 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/scripts/util.py: -------------------------------------------------------------------------------- 1 | # This file is part of the NUS M2 scorer. 2 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 3 | # it under the terms of the GNU General Public License as published by 4 | # the Free Software Foundation, either version 3 of the License, or 5 | # (at your option) any later version. 6 | 7 | # The NUS M2 scorer is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | # GNU General Public License for more details. 11 | 12 | # You should have received a copy of the GNU General Public License 13 | # along with this program. If not, see . 14 | 15 | # file: util.py 16 | # 17 | 18 | import operator 19 | import random 20 | import math 21 | import re 22 | 23 | def smart_open(fname, mode = 'r'): 24 | if fname.endswith('.gz'): 25 | import gzip 26 | # Using max compression (9) by default seems to be slow. 
27 | # Let's try using the fastest. 28 | return gzip.open(fname, mode, 1) 29 | else: 30 | return open(fname, mode) 31 | 32 | 33 | def randint(b, a=0): 34 | return random.randint(a,b) 35 | 36 | def uniq(seq, idfun=None): 37 | # order preserving 38 | if idfun is None: 39 | def idfun(x): return x 40 | seen = {} 41 | result = [] 42 | for item in seq: 43 | marker = idfun(item) 44 | # in old Python versions: 45 | # if seen.has_key(marker) 46 | # but in new ones: 47 | if marker in seen: continue 48 | seen[marker] = 1 49 | result.append(item) 50 | return result 51 | 52 | 53 | def sort_dict(myDict, byValue=False, reverse=False): 54 | if byValue: 55 | items = myDict.items() 56 | items.sort(key = operator.itemgetter(1), reverse=reverse) 57 | else: 58 | items = sorted(myDict.items()) 59 | return items 60 | 61 | def max_dict(myDict, byValue=False): 62 | if byValue: 63 | skey=lambda x:x[1] 64 | else: 65 | skey=lambda x:x[0] 66 | return max(myDict.items(), key=skey) 67 | 68 | 69 | def min_dict(myDict, byValue=False): 70 | if byValue: 71 | skey=lambda x:x[1] 72 | else: 73 | skey=lambda x:x[0] 74 | return min(myDict.items(), key=skey) 75 | 76 | def paragraphs(lines, is_separator=lambda x : x == '\n', joiner=''.join): 77 | paragraph = [] 78 | for line in lines: 79 | if is_separator(line): 80 | if paragraph: 81 | yield joiner(paragraph) 82 | paragraph = [] 83 | else: 84 | paragraph.append(line) 85 | if paragraph: 86 | yield joiner(paragraph) 87 | 88 | 89 | def isASCII(word): 90 | try: 91 | word = word.decode("ascii") 92 | return True 93 | except UnicodeEncodeError : 94 | return False 95 | except UnicodeDecodeError: 96 | return False 97 | 98 | 99 | def intersect(x, y): 100 | return [z for z in x if z in y] 101 | 102 | 103 | 104 | # Mapping Windows CP1252 Gremlins to Unicode 105 | # from http://effbot.org/zone/unicode-gremlins.htm 106 | cp1252 = { 107 | # from http://www.microsoft.com/typography/unicode/1252.htm 108 | u"\x80": u"\u20AC", # EURO SIGN 109 | u"\x82": u"\u201A", # SINGLE LOW-9 QUOTATION MARK 110 | u"\x83": u"\u0192", # LATIN SMALL LETTER F WITH HOOK 111 | u"\x84": u"\u201E", # DOUBLE LOW-9 QUOTATION MARK 112 | u"\x85": u"\u2026", # HORIZONTAL ELLIPSIS 113 | u"\x86": u"\u2020", # DAGGER 114 | u"\x87": u"\u2021", # DOUBLE DAGGER 115 | u"\x88": u"\u02C6", # MODIFIER LETTER CIRCUMFLEX ACCENT 116 | u"\x89": u"\u2030", # PER MILLE SIGN 117 | u"\x8A": u"\u0160", # LATIN CAPITAL LETTER S WITH CARON 118 | u"\x8B": u"\u2039", # SINGLE LEFT-POINTING ANGLE QUOTATION MARK 119 | u"\x8C": u"\u0152", # LATIN CAPITAL LIGATURE OE 120 | u"\x8E": u"\u017D", # LATIN CAPITAL LETTER Z WITH CARON 121 | u"\x91": u"\u2018", # LEFT SINGLE QUOTATION MARK 122 | u"\x92": u"\u2019", # RIGHT SINGLE QUOTATION MARK 123 | u"\x93": u"\u201C", # LEFT DOUBLE QUOTATION MARK 124 | u"\x94": u"\u201D", # RIGHT DOUBLE QUOTATION MARK 125 | u"\x95": u"\u2022", # BULLET 126 | u"\x96": u"\u2013", # EN DASH 127 | u"\x97": u"\u2014", # EM DASH 128 | u"\x98": u"\u02DC", # SMALL TILDE 129 | u"\x99": u"\u2122", # TRADE MARK SIGN 130 | u"\x9A": u"\u0161", # LATIN SMALL LETTER S WITH CARON 131 | u"\x9B": u"\u203A", # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 132 | u"\x9C": u"\u0153", # LATIN SMALL LIGATURE OE 133 | u"\x9E": u"\u017E", # LATIN SMALL LETTER Z WITH CARON 134 | u"\x9F": u"\u0178", # LATIN CAPITAL LETTER Y WITH DIAERESIS 135 | } 136 | 137 | def fix_cp1252codes(text): 138 | # map cp1252 gremlins to real unicode characters 139 | if re.search(u"[\x80-\x9f]", text): 140 | def fixup(m): 141 | s = m.group(0) 142 | return cp1252.get(s, s) 143 | 
if isinstance(text, type("")): 144 | # make sure we have a unicode string 145 | text = unicode(text, "iso-8859-1") 146 | text = re.sub(u"[\x80-\x9f]", fixup, text) 147 | return text 148 | 149 | def clean_utf8(text): 150 | return filter(lambda x : x > '\x1f' and x < '\x7f', text) 151 | 152 | def pairs(iterable, overlapping=False): 153 | iterator = iterable.__iter__() 154 | token = iterator.next() 155 | i = 0 156 | for lookahead in iterator: 157 | if overlapping or i % 2 == 0: 158 | yield (token, lookahead) 159 | token = lookahead 160 | i += 1 161 | if i % 2 == 0: 162 | yield (token, None) 163 | 164 | def frange(start, end=None, inc=None): 165 | "A range function, that does accept float increments..." 166 | 167 | if end == None: 168 | end = start + 0.0 169 | start = 0.0 170 | 171 | if inc == None: 172 | inc = 1.0 173 | 174 | L = [] 175 | while 1: 176 | next = start + len(L) * inc 177 | if inc > 0 and next >= end: 178 | break 179 | elif inc < 0 and next <= end: 180 | break 181 | L.append(next) 182 | 183 | return L 184 | 185 | def softmax(values): 186 | a = max(values) 187 | Z = 0.0 188 | for v in values: 189 | Z += math.exp(v - a) 190 | sm = [math.exp(v-a) / Z for v in values] 191 | return sm 192 | -------------------------------------------------------------------------------- /data/release2.3.1/revised/data_5types/alternatives.NTHU.sgml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Prep 5 | of 6 | 7 | 8 | Nn 9 | disagreement 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | Nn 18 | purposes 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | SVA 27 | are 28 | 29 | 30 | Vform 31 | was 32 | 33 | 34 | Vform 35 | mentioned 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | Nn 44 | trends 45 | 46 | 47 | ArtOrDet 48 | of the 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | noop 57 | However, human beings are not animals or any other products, they have their own thoughts and freedom. 
58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | Nn 66 | freedoms 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | Vform 75 | get 76 | 77 | 78 | Prep 79 | home 80 | 81 | 82 | Nn 83 | places 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | ArtOrDet 92 | the news 93 | 94 | 95 | Nn 96 | thousands 97 | 98 | 99 | Prep 100 | for 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | Nn 109 | mind 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | Vform 118 | offend 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | Prep 127 | of 128 | 129 | 130 | Nn 131 | telephones 132 | 133 | 134 | Prep 135 | for 136 | 137 | 138 | Nn 139 | lives 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | SVA 148 | invades 149 | 150 | 151 | ArtOrDet 152 | the whole 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | Vform 161 | go 162 | 163 | 164 | ArtOrDet 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | ArtOrDet 174 | 175 | 176 | 177 | Nn 178 | expectancy 179 | 180 | 181 | Nn 182 | teeth 183 | 184 | 185 | Nn 186 | teeth 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | ArtOrDet 195 | through the 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | ArtOrDet 204 | Rising 205 | 206 | 207 | Nn 208 | expetancy 209 | 210 | 211 | SVA 212 | proves 213 | 214 | 215 | SVA 216 | has 217 | 218 | 219 | Prep 220 | of 221 | 222 | 223 | Prep 224 | history of 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | ArtOrDet 233 | An ageing 234 | 235 | 236 | ArtOrDet 237 | a larger 238 | 239 | 240 | ArtOrDet 241 | An ageing 242 | 243 | 244 | SVA 245 | need 246 | 247 | 248 | Nn 249 | equipment 250 | 251 | 252 | Nn 253 | centres 254 | 255 | 256 | SVA 257 | are 258 | 259 | 260 | ArtOrDet 261 | An increasing 262 | 263 | 264 | ArtOrDet 265 | 266 | 267 | 268 | ArtOrDet 269 | the food 270 | 271 | 272 | Nn 273 | expectancy 274 | 275 | 276 | Prep 277 | of 278 | 279 | 280 | ArtOrDet 281 | the governments 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | Vform 290 | Comparing 291 | 292 | 293 | ArtOrDet 294 | 295 | 296 | 297 | SVA 298 | are 299 | 300 | 301 | noop 302 | However, these natural resources are not renewable. 
303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | ArtOrDet 311 | the 312 | 313 | 314 | Nn 315 | employees 316 | 317 | 318 | SVA 319 | start 320 | 321 | 322 | SVA 323 | rely 324 | 325 | 326 | Nn 327 | supplements 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | Prep 336 | in 337 | 338 | 339 | Prep 340 | by themselves 341 | 342 | 343 | Prep 344 | to a 345 | 346 | 347 | Nn 348 | way 349 | 350 | 351 | Prep 352 | 353 | 354 | 355 | ArtOrDet 356 | 357 | 358 | 359 | Prep 360 | 361 | 362 | 363 | Nn 364 | standards 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | Nn 373 | diseases 374 | 375 | 376 | ArtOrDet 377 | like the 378 | 379 | 380 | Nn 381 | Death 382 | 383 | 384 | Nn 385 | lives 386 | 387 | 388 | Nn 389 | expectancy 390 | 391 | 392 | SVA 393 | become 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | Prep 402 | 403 | 404 | 405 | ArtOrDet 406 | the people 407 | 408 | 409 | Prep 410 | 411 | 412 | 413 | ArtOrDet 414 | 415 | 416 | 417 | Nn 418 | problems 419 | 420 | 421 | ArtOrDet 422 | 423 | 424 | 425 | ArtOrDet 426 | 427 | 428 | 429 | Nn 430 | contributions 431 | 432 | 433 | ArtOrDet 434 | 435 | 436 | 437 | ArtOrDet 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | ArtOrDet 447 | 448 | 449 | 450 | Nn 451 | services 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | Nn 460 | expectancy 461 | 462 | 463 | ArtOrDet 464 | 465 | 466 | 467 | Prep 468 | in 469 | 470 | 471 | Nn 472 | expectancy 473 | 474 | 475 | ArtOrDet 476 | a large 477 | 478 | 479 | Nn 480 | beings 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | ArtOrDet 489 | 490 | 491 | 492 | Nn 493 | technologies 494 | 495 | 496 | SVA 497 | save 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | ArtOrDet 506 | history of 507 | 508 | 509 | ArtOrDet 510 | of the 511 | 512 | 513 | ArtOrDet 514 | society 515 | 516 | 517 | Prep 518 | by 519 | 520 | 521 | SVA 522 | is 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | Nn 531 | our bodies 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | Nn 540 | lose their lives 541 | 542 | 543 | Vform 544 | led 545 | 546 | 547 | 548 | 549 | -------------------------------------------------------------------------------- /data/release2.3.1/scripts/README: -------------------------------------------------------------------------------- 1 | ==================================================== 2 | 3 | CoNLL-2013 Shared Task: Grammatical Error Correction 4 | 5 | Description of Data Preprocessing Scripts 6 | 7 | Created May 23, 2013 Version 2.3.1 8 | ==================================================== 9 | 10 | 11 | Table of Contents 12 | ================= 13 | 14 | 1. General 15 | 2. Pre-requisites 16 | 3. Usage 17 | 18 | 1. General 19 | ========== 20 | 21 | This README file describes the usage of scripts for preprocessing the CoNLL-2013 official test data. 22 | 23 | Quickstart: 24 | 25 | a. Regenerate the preprocessed files with full syntactic information: 26 | % python preprocess.py -o official.sgml conllFileName annFileName m2FileName 27 | 28 | b. Get tokenized annotations without syntactic information: 29 | % python preprocess.py -l official.sgml conllFileName annFileName m2FileName 30 | 31 | Where 32 | conllFileName - output file that contains pre-processed sentences in CoNLL format. 33 | annFileName - output file that contains standoff error annotations. 34 | m2FileName - output file that contains error annotations in the M2 scorer format. 35 | 36 | c. 
Creating gold-standard answers including the official and alternative annotations: 37 | % python preprocesswithalt.py official.5types.sgml official.5types.sgml alternatives.UIUC.sgml alternatives.UMC.sgml alternatives.NTHU.sgml alternatives.STEL.sgml alternatives.TOR.sgml m2FileName 38 | 39 | Where 40 | m2FileName - output file containing combined official and alternative annotations. 41 | 42 | Note: The repeated official.5types.sgml is deliberate since it is the program requirement. 43 | 44 | 2. Pre-requisites 45 | ================= 46 | 47 | + Python (2.6.4, other versions >= 2.6.4, < 3.0 might work but are not tested) 48 | + nltk (http://www.nltk.org, version 2.0b7, needed for sentence splitting and word tokenization, other versions might work) 49 | + Stanford parser (version 2.0.1, http://nlp.stanford.edu/software/stanford-parser-2012-03-09.tgz) 50 | 51 | Directories: 52 | stanford-parser-2012-03-09/ 53 | scripts/ 54 | 55 | If you only use the scripts to generate error annotations needed by the M2 scorer, Stanford parser is not required. 56 | Otherwise, "stanford-parser-2012-03-09" need to be in the same directory as "scripts". 57 | 58 | 3. Usage 59 | ======== 60 | 61 | Preprocessing the main official test data: 62 | 63 | Usage: python preprocess.py OPTIONS sgmlFileName conllFileName annotationFileName m2FileName 64 | 65 | Where 66 | sgmlFileName - NUCLE SGML file 67 | conllFileName - output file name for pre-processed sentences in CoNLL format (e.g., conll13st-preprocessed.conll). 68 | annotationFileName - output file name for error annotations (e.g., conll13st-preprocessed.conll.ann). 69 | m2FileName - output file name in the M2 scorer format (e.g., conll13st-preprocessed.conll.m2). 70 | 71 | OPTIONS 72 | -o - output will contain POS tags and parse tree info (i.e., the same as the released preprocessed file, runs slowly). 73 | -l - output will NOT contain POS tags and parse tree info (runs quickly). 74 | 75 | Getting the combined M^2 gold-standard answer: 76 | 77 | Usage: python preprocesswithalt.py essaySgmlFileName mainSgmlFileName alt1SgmlFileName ... altNSgmlFileName m2FileName 78 | 79 | Where 80 | essaySgmlFile - official test data SGML file containing essay body, not necessarily annotations 81 | mainAnnotSgmlFile - official test data SGML file containing the main annotations, not necessarily essay body 82 | alt1SgmlFileName - the first alternative annotations SGML file, containing only annotations that differ from the main annotation 83 | altNSgmlFileName - the last alternative annotations SGML file, containing only annotations that differ from the main annotation 84 | combM2FileName - output file name in the M2 scorer format, containing combination of main and alternative annotations 85 | -------------------------------------------------------------------------------- /data/release2.3.1/scripts/iparser.py: -------------------------------------------------------------------------------- 1 | # iparser.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 
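# Note on this module: parse_batch (below) shells out to the Stanford parser as an
# external Java process over a file of pre-tokenized sentences (one per line),
# writes the raw parser output to parsingDumpedFileName, and returns that output
# split on blank lines, so each input sentence contributes three consecutive
# blocks: the POS-tagged words ("wordsAndTags"), the Penn constituent tree
# ("penn"), and the typed dependencies. parser_feature.DependTree_Batch consumes
# these blocks three at a time.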
12 | 13 | import os 14 | import sys 15 | 16 | class stanfordparser: 17 | 18 | def __init__(self): 19 | pass 20 | 21 | def parse_batch(self, sentenceDumpedFileName, parsingDumpedFileName): 22 | 23 | if os.path.exists('../stanford-parser-2012-03-09') == False: 24 | print >> sys.stderr, 'can not find Stanford parser directory' 25 | sys.exit(1) 26 | 27 | # tokenized 28 | cmd = r'java -server -mx4096m -cp "../stanford-parser-2012-03-09/*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser -retainTMPSubcategories -sentences newline -tokenized -escaper edu.stanford.nlp.process.PTBEscapingProcessor -outputFormat "wordsAndTags, penn, typedDependencies" -outputFormatOptions "basicDependencies" edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ' + sentenceDumpedFileName 29 | 30 | r = os.popen(cmd).read().strip().decode('utf-8') 31 | f = open(parsingDumpedFileName, 'w') 32 | f.write(r.encode('utf-8')) 33 | f.close() 34 | 35 | rlist = r.replace('\n\n\n', '\n\n\n\n').split('\n\n') 36 | return rlist 37 | -------------------------------------------------------------------------------- /data/release2.3.1/scripts/nucle_doc.py: -------------------------------------------------------------------------------- 1 | # nucle_doc.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 12 | 13 | import os 14 | import sys 15 | from nltk import word_tokenize 16 | 17 | class nucle_doc: 18 | def __init__(self): 19 | self.docattrs = None 20 | 21 | self.matric = '' 22 | self.email = '' 23 | self.nationality = '' 24 | self.firstLanguage = '' 25 | self.schoolLanguage = '' 26 | self.englishTests = '' 27 | 28 | self.paragraphs = [] 29 | self.annotation = [] 30 | self.mistakes = [] 31 | 32 | self.sentences = [] 33 | 34 | def buildSentence(self, sentstr, dpnode, constituentstr, poslist, chunklist): 35 | self.sentences[-1].append(nucle_sent(sentstr, dpnode, constituentstr, poslist, chunklist)) 36 | 37 | def addSentence(self, sent): 38 | self.sentences[-1].append(sent) 39 | 40 | def findMistake(self, par, pos): 41 | for m in self.mistakes: 42 | if par == m['start_par'] and pos >= m['start_off'] and pos < m['end_off']: 43 | return m 44 | return None 45 | 46 | 47 | class nucle_sent: 48 | def __init__(self, sentstr, dpnode, constituentstr, poslist, chunklist): 49 | self.sentstr = sentstr 50 | self.words = word_tokenize(sentstr) 51 | self.dpnodes = dpnode 52 | self.constituentstr = constituentstr 53 | self.constituentlist = [] 54 | self.poslist = poslist 55 | self.chunklist = chunklist 56 | 57 | def buildConstituentList(self): 58 | 59 | s = self.constituentstr.strip().replace('\n', '').replace(' ', '') 60 | r = [] 61 | i = 0 62 | while i < len(s): 63 | j = i 64 | while j < len(s) and s[j] != ')': 65 | j += 1 66 | k = j 67 | while k < len(s) and s[k] == ')': 68 | k += 1 69 | 70 | nodeWholeStr = s[i:k] 71 | lastLRBIndex = nodeWholeStr.rfind('(') 72 | nodeStr = nodeWholeStr[:lastLRBIndex] + '*' + s[j+1:k] 73 | 74 | r.append(nodeStr) 75 | i = k 76 | 77 | if len(r) != len(self.words): 78 | print >> sys.stderr, 'Error in buiding constituent tree bits: different length with words.' 
79 | print >> sys.stderr, len(r), len(self.words) 80 | print >> sys.stderr, ' '.join(r).encode('utf-8') 81 | print >> sys.stderr, words 82 | sys.exit(1) 83 | 84 | self.constituentlist = r 85 | 86 | 87 | 88 | def setDpNode(self, dpnode): 89 | self.dpnodes = dpnode 90 | 91 | def setPOSList(self, poslist): 92 | self.poslist = poslist 93 | 94 | def setConstituentStr(self, constituentstr): 95 | self.constituentstr = constituentstr 96 | 97 | def setConstituentList(self, constituentlist): 98 | self.constituentlist = constituentlist 99 | 100 | def setWords(self, words): 101 | self.words = words 102 | 103 | def setChunkList(self, chunklist): 104 | self.chunklist = chunklist 105 | 106 | def getDpNode(self): 107 | return self.dpnodes 108 | 109 | def getPOSList(self): 110 | return self.poslist 111 | 112 | def getConstituentStr(self): 113 | return self.constituentstr 114 | 115 | def getConstituentList(self): 116 | return self.constituentlist 117 | 118 | def getWords(self): 119 | return self.words 120 | 121 | def getChunkList(self): 122 | return self.chunklist 123 | 124 | def getConllFormat(self, doc, paragraphIndex, sentIndex): 125 | 126 | table = [] 127 | 128 | dpnodes = self.getDpNode() 129 | poslist = self.getPOSList() 130 | #chunklist = self.getChunkList() 131 | words = self.getWords() 132 | constituentlist = self.getConstituentList() 133 | 134 | if len(poslist) == 0: 135 | hasParseInfo = 0 136 | else: 137 | hasParseInfo = 1 138 | 139 | if len(words) != len(poslist) and len(poslist) != 0: 140 | print >> sys.stderr, 'Error in buiding Conll Format: different length stanford parser postags and words.' 141 | print >> sys.stderr, 'len words:', len(words), words 142 | print >> sys.stderr, 'len poslist:', len(poslist), poslist 143 | sys.exit(1) 144 | 145 | for wdindex in xrange(len(words)): 146 | 147 | word = words[wdindex] 148 | 149 | row = [] 150 | row.append(doc.docattrs[0][1]) #docinfo 151 | row.append(paragraphIndex) #paragraph index 152 | row.append(sentIndex) #paragraph index 153 | row.append(wdindex) #word index 154 | row.append(word) #word 155 | 156 | #row.append(chunknode.label) #chunk 157 | if hasParseInfo == 1: 158 | 159 | posword = poslist[wdindex] 160 | splitp = posword.rfind('/') 161 | pos = posword[splitp+1 : ].strip() 162 | 163 | #chunknode = chunklist[wdindex] 164 | 165 | constituentnode = constituentlist[wdindex] 166 | 167 | dpnode = None 168 | for d in dpnodes: 169 | if d.index == wdindex: 170 | dpnode = d 171 | break 172 | 173 | row.append(pos) #POS 174 | if dpnode == None: 175 | row.append('-') 176 | row.append('-') 177 | else: 178 | row.append(dpnode.parent_index) #dp parent 179 | row.append(dpnode.grammarrole) #dp label 180 | row.append(constituentnode) #constituent 181 | 182 | table.append(row) 183 | 184 | return table 185 | 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /data/release2.3.1/scripts/nuclesgmlparser.py: -------------------------------------------------------------------------------- 1 | # nuclesgmlparser.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 
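# Note on this module: the parser below subclasses the standard-library
# sgmllib.SGMLParser, which exists only in Python 2 (sgmllib was removed in
# Python 3.0). Together with the print-statement syntax used in the sibling
# scripts, this means the preprocessing pipeline must be run under a Python 2
# interpreter, consistent with the prerequisites listed in the scripts README.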
12 | 13 | from sgmllib import SGMLParser 14 | from nucle_doc import nucle_doc 15 | 16 | 17 | class nuclesgmlparser(SGMLParser): 18 | def __init__(self): 19 | SGMLParser.__init__(self) 20 | self.docs = [] 21 | 22 | def reset(self): 23 | self.docs = [] 24 | self.data = [] 25 | SGMLParser.reset(self) 26 | 27 | def unknow_starttag(self, tag, attrs): 28 | pass 29 | 30 | def unknow_endtag(self): 31 | pass 32 | 33 | def start_doc(self, attrs): 34 | self.docs.append(nucle_doc()) 35 | self.docs[-1].docattrs = attrs 36 | 37 | def end_doc(self): 38 | pass 39 | 40 | def start_matric(self, attrs): 41 | pass 42 | 43 | def end_matric(self): 44 | self.docs[-1].matric = ''.join(self.data) 45 | self.data = [] 46 | pass 47 | 48 | def start_email(self, attrs): 49 | pass 50 | 51 | def end_email(self): 52 | self.docs[-1].email = ''.join(self.data) 53 | self.data = [] 54 | pass 55 | 56 | def start_nationality(self, attrs): 57 | pass 58 | 59 | def end_nationality(self): 60 | self.docs[-1].nationality = ''.join(self.data) 61 | self.data = [] 62 | pass 63 | 64 | def start_first_language(self, attrs): 65 | pass 66 | 67 | def end_first_language(self): 68 | self.docs[-1].firstLanguage = ''.join(self.data) 69 | self.data = [] 70 | pass 71 | 72 | def start_school_language(self, attrs): 73 | pass 74 | 75 | def end_school_language(self): 76 | self.docs[-1].schoolLanguage = ''.join(self.data) 77 | self.data = [] 78 | pass 79 | 80 | def start_english_tests(self, attrs): 81 | pass 82 | 83 | def end_english_tests(self): 84 | self.docs[-1].englishTests = ''.join(self.data) 85 | self.data = [] 86 | pass 87 | 88 | 89 | def start_text(self, attrs): 90 | pass 91 | 92 | def end_text(self): 93 | pass 94 | 95 | def start_title(self, attrs): 96 | pass 97 | 98 | def end_title(self): 99 | self.docs[-1].paragraphs.append(''.join(self.data)) 100 | self.data = [] 101 | pass 102 | 103 | 104 | def start_p(self, attrs): 105 | pass 106 | 107 | def end_p(self): 108 | self.docs[-1].paragraphs.append(''.join(self.data)) 109 | self.data = [] 110 | pass 111 | 112 | 113 | def start_annotation(self, attrs): 114 | self.docs[-1].annotation.append(attrs) 115 | 116 | def end_annotation(self): 117 | pass 118 | 119 | def start_mistake(self, attrs): 120 | d = {} 121 | for t in attrs: 122 | d[t[0]] = int(t[1]) 123 | self.docs[-1].mistakes.append(d) 124 | pass 125 | 126 | def end_mistake(self): 127 | pass 128 | 129 | def start_type(self, attrs): 130 | pass 131 | 132 | def end_type(self): 133 | self.docs[-1].mistakes[-1]['type'] = ''.join(self.data) 134 | self.data = [] 135 | 136 | def start_correction(self, attrs): 137 | pass 138 | 139 | def end_correction(self): 140 | self.docs[-1].mistakes[-1]['correction'] = ''.join(self.data) 141 | self.data = [] 142 | 143 | def start_comment(self, attrs): 144 | pass 145 | 146 | def end_comment(self): 147 | self.docs[-1].mistakes[-1]['comment'] = ''.join( self.data) 148 | self.data = [] 149 | 150 | 151 | def handle_charref(self, ref): 152 | self.data.append('&' + ref) 153 | 154 | def handle_entityref(self, ref): 155 | self.data.append('&' + ref) 156 | 157 | def handle_data(self, text): 158 | if text.strip() == '': 159 | self.data.append('') 160 | return 161 | else: 162 | if text.startswith('\n'): 163 | text = text[1:] 164 | if text.endswith('\n'): 165 | text = text[:-1] 166 | self.data.append(text) 167 | 168 | 169 | -------------------------------------------------------------------------------- /data/release2.3.1/scripts/parser_feature.py: -------------------------------------------------------------------------------- 1 
| # parser_feature.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 12 | 13 | 14 | 15 | import iparser 16 | 17 | class stanpartreenode: 18 | def __init__(self, strnode): 19 | 20 | if strnode == '': 21 | self.grammarrole = '' 22 | self.parent_index = -1 23 | self.index = -1 24 | self.parent_word = '' 25 | self.word = '' 26 | self.POS = '' 27 | return 28 | 29 | groleend = strnode.find('(') 30 | self.grammarrole = strnode[ : groleend] 31 | content = strnode[groleend + 1: len(strnode)-1] 32 | dadAndme = content.partition(', ') 33 | dad = dadAndme[0] 34 | me = dadAndme[2] 35 | dadsep = dad.rfind('-') 36 | mesep = me.rfind('-') 37 | self.parent_index = int(dad[dadsep + 1 : ]) - 1 38 | self.parent_word = dad[0 : dadsep] 39 | self.index = int(me[mesep + 1 : ]) - 1 40 | self.word = me[0 : mesep] 41 | self.POS = '' 42 | 43 | 44 | def DependTree_Batch(sentenceDumpedFileName, parsingDumpedFileName): 45 | 46 | sparser = iparser.stanfordparser() 47 | results = sparser.parse_batch(sentenceDumpedFileName, parsingDumpedFileName) 48 | nodeslist = [] 49 | 50 | k = 0 51 | while k < len(results): 52 | PoSlist = results[k].split(' ') 53 | constituentstr = results[k+1] 54 | table = results[k+2].split('\n') 55 | nodes = [] 56 | for i in range(0, len(table)): 57 | nodes.append( stanpartreenode(table[i]) ) 58 | nodeslist.append((nodes, constituentstr, PoSlist)) 59 | k += 3 60 | return nodeslist 61 | 62 | def DependTree_Batch_Parsefile(parsingDumpedFileName): 63 | 64 | f = open(parsingDumpedFileName, 'r') 65 | results = f.read().decode('utf-8').replace('\n\n\n', '\n\n\n\n').split('\n\n') 66 | f.close() 67 | nodeslist = [] 68 | 69 | k = 0 70 | while k < len(results): 71 | PoSlist = results[k].split(' ') 72 | constituentstr = results[k+1] 73 | table = results[k+2].split('\n') 74 | 75 | nodes = [] 76 | for i in range(0, len(table)): 77 | nodes.append( stanpartreenode(table[i]) ) 78 | nodeslist.append((nodes, constituentstr, PoSlist)) 79 | k += 3 80 | return nodeslist 81 | -------------------------------------------------------------------------------- /data/release2.3.1/scripts/preprocesswithalt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # preprocessors.py 4 | # 5 | # Author: Christian Hadiwinoto 6 | # National University of Singapore (NUS) 7 | # Date: 24 May 2013 8 | # Version: 1.0 9 | # 10 | # Contact: chrhad@comp.nus.edu.sg 11 | # 12 | # This script is distributed to support the CoNLL-2013 Shared Task. 13 | # It is free for research and educational purposes. 14 | # 15 | # Usage: python preprocesswithalt.py essaySgmlFileName mainSgmlFileName alt1SgmlFileName ... 
altNSgmlFileName m2FileName 16 | # 17 | 18 | 19 | import parser_feature 20 | from nuclesgmlparser import nuclesgmlparser 21 | from nucle_doc import * 22 | import nltk.data 23 | from nltk import word_tokenize 24 | from operator import itemgetter 25 | import cPickle as pickle 26 | import re 27 | import sys 28 | import os 29 | 30 | getEditKey = itemgetter(0, 1, 2, 3, 4) 31 | 32 | sentenceTokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 33 | sentenceDumpedFile = 'sentence_file' 34 | docsDumpedFileName = 'docs' 35 | parsingDumpedFileName = 'parse_file' 36 | 37 | def readNUCLE(fn): 38 | 39 | f = open(fn, 'r') 40 | parser = nuclesgmlparser() 41 | filestr = f.read() 42 | filestr = filestr.decode('utf-8') 43 | 44 | #Fix Reference tag 45 | p = re.compile(r'(\n

\n.*\n)

') 46 | filestr = p.sub(r'\1

', filestr) 47 | 48 | parser.feed(filestr) 49 | f.close() 50 | parser.close() 51 | 52 | return parser.docs 53 | 54 | def sentenceSplit(docs): 55 | 56 | sentenceTokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 57 | for doc in docs: 58 | for par in doc.paragraphs: 59 | doc.sentences.append([]) 60 | for s in sentenceTokenizer.tokenize(par): 61 | doc.buildSentence(s, [], '', [], []) 62 | return docs 63 | 64 | def compareTwoEditLists(editList1, editList2): 65 | # must be sorted 66 | if editList1 == [] and editList2 == []: 67 | return True 68 | elif editList1 == [] or editList2 == []: 69 | return False 70 | elif getEditKey(editList1[0]) != getEditKey(editList2[0]): 71 | return False 72 | else: 73 | return compareTwoEditLists(editList1[1:], editList2[1:]) 74 | 75 | def moderateAnnotations(contestDocs, annotBoard, origDocSet): 76 | # moderate annotation in "contesting" docs with already stated mistakes 77 | mistakeStrSet = {} 78 | for doc in contestDocs: 79 | mistakeStr = '' 80 | nid = int(doc.docattrs[0][1]) # nid of current document 81 | tid = doc.annotation[0][0][1] # teacher id 82 | 83 | if not annotBoard.has_key(nid): # create placeholder 84 | annotBoard[nid] = {} 85 | 86 | origDoc = origDocSet[nid] 87 | for pid in xrange(len(origDoc.sentences)): 88 | slist = origDoc.sentences[pid] 89 | if not annotBoard[nid].has_key(pid): 90 | annotBoard[nid][pid] = {} 91 | for sentid in xrange(len(slist)): 92 | sent = slist[sentid] 93 | if not annotBoard[nid][pid].has_key(sentid): 94 | annotBoard[nid][pid][sentid] = [] 95 | editSet = [] 96 | 97 | # enumerate mistakes 98 | sentoffset = origDoc.paragraphs[pid].index(sent.sentstr) 99 | editNum = 0 100 | for m in doc.mistakes: 101 | if m['start_par'] != pid or \ 102 | m['start_par'] != m['end_par'] or \ 103 | m['start_off'] < sentoffset or \ 104 | m['start_off'] >= sentoffset + len(sent.sentstr) or \ 105 | m['end_off'] sentoffset + len(sent.sentstr): 107 | continue 108 | 109 | if m['type'] != 'noop': 110 | editSet.append((m['start_par'], m['end_par'], m['start_off'], m['end_off'], m['correction'], m['type'])) 111 | editNum += 1 112 | else: 113 | editSet.append((m['start_par'], m['end_par'], m['start_off'], m['end_off'], sent.sentstr, m['type'])) 114 | 115 | editSet = sorted(editSet, key=itemgetter(0, 1, 2, 3)) 116 | 117 | # find the same annotation 118 | foundMatch = False 119 | i = 0 120 | boardEdits = annotBoard[nid][pid][sentid] 121 | while i < len(boardEdits) and not foundMatch: 122 | if compareTwoEditLists(editSet, boardEdits[i]): 123 | foundMatch = True 124 | else: 125 | i+=1 126 | 127 | if not foundMatch: 128 | annotBoard[nid][pid][sentid].append(editSet) 129 | 130 | return annotBoard 131 | 132 | def createM2File(origDocs, mistakesBoard, m2FileName): 133 | 134 | fm2 = open(m2FileName, 'w') 135 | 136 | for doc in origDocs: 137 | nid = int(doc.docattrs[0][1]) # nid of current document 138 | for slistIndex in xrange(len(doc.sentences)): 139 | slist = doc.sentences[slistIndex] 140 | for sentid in xrange(len(slist)): 141 | 142 | sent = slist[sentid] 143 | 144 | # m2 format annotation string list 145 | m2AnnotationList = [] 146 | 147 | # build colums 148 | table = sent.getConllFormat(doc, slistIndex, sentid) 149 | tokenizedSentStr = ' '.join(sent.getWords()) 150 | 151 | #Add annotation info 152 | sentoffset = doc.paragraphs[slistIndex].index(sent.sentstr) 153 | 154 | i = 0 155 | board = mistakesBoard[nid][slistIndex][sentid] 156 | for mistakesList in board: 157 | if i == 0 and len(mistakesList) == 0 and len(board) > 1: # the 0-th is empty 158 | 
m2AnnotationList.append('A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\n') 159 | i += 1 160 | 161 | for tuple in mistakesList: 162 | m = {} 163 | m['start_par'] = tuple[0] 164 | m['end_par'] = tuple[1] 165 | m['start_off'] = tuple[2] 166 | m['end_off'] = tuple[3] 167 | m['correction'] = tuple[4] 168 | m['type'] = tuple[5] 169 | 170 | if m['start_par'] != slistIndex or \ 171 | m['start_par'] != m['end_par'] or \ 172 | m['start_off'] < sentoffset or \ 173 | m['start_off'] >= sentoffset + len(sent.sentstr) or \ 174 | m['end_off'] sentoffset + len(sent.sentstr): 176 | continue 177 | 178 | wordsoffset = 0 179 | wdstart = 0 180 | 181 | startInWord = 0 182 | headText = '' 183 | endInWord = 0 184 | tailText = '' 185 | 186 | words = sent.getWords() 187 | while wdstart < len(words): 188 | 189 | word = words[wdstart] 190 | nextstart = sent.sentstr.find(word, wordsoffset) 191 | 192 | if nextstart == -1: 193 | # may not find word, due to relpacement 194 | print >> sys.stderr, "Warning in building conll format: can not find word" 195 | print >> sys.stderr, word.encode('utf-8') 196 | wordsoffset += 1 197 | else: 198 | wordsoffset = nextstart 199 | 200 | if wordsoffset >= m['start_off']-sentoffset: 201 | break 202 | elif wordsoffset + len(word) > m['start_off']-sentoffset: 203 | # annotation starts at the middle of a word 204 | startInWord = 1 205 | headText = sent.sentstr[wordsoffset: m['start_off']-sentoffset] 206 | break 207 | 208 | wordsoffset += len(word) 209 | wdstart += 1 210 | 211 | if wdstart == len(words): 212 | print >> sys.stderr, 'Warning in building conll format: start_off overflow' 213 | print >> sys.stderr, m, sent.sentstr.encode('utf-8') 214 | continue 215 | 216 | 217 | wdend = wdstart 218 | while wdend < len(words): 219 | 220 | word = words[wdend] 221 | 222 | nextstart = sent.sentstr.find(word, wordsoffset) 223 | 224 | if nextstart == -1: 225 | print >> sys.stderr, "Warning in building conll format: can not find word" 226 | print >> sys.stderr, word.encode('utf-8') 227 | wordsoffset += 1 228 | else: 229 | wordsoffset = nextstart 230 | 231 | if wordsoffset >= m['end_off']-sentoffset: 232 | # annotation ends at the middle of a word 233 | if wordsoffset - len(words[wdend-1]) - 1 < m['end_off']-sentoffset: 234 | endInWord = 1 235 | tailText = sent.sentstr[m['end_off']-sentoffset : wordsoffset].strip() 236 | break 237 | 238 | wordsoffset += len(word) 239 | wdend += 1 240 | 241 | 242 | correctionTokenizedStr = tokenizeCorrectionStr(headText + m['correction'] + tailText, wdstart, wdend, words) 243 | correctionTokenizedStr, wdstart, wdend = shrinkCorrectionStr(correctionTokenizedStr, wdstart, wdend, words) 244 | 245 | token_start = wdstart if m['type'] != 'noop' else -1 246 | token_end = wdend if m['type'] != 'noop' else -1 247 | correction_final = correctionTokenizedStr.replace('\n', '') if m['type'] != 'noop' else '-NONE-' 248 | 249 | # build annotation string for .conll.m2 file 250 | m2AnnotationStr = 'A ' 251 | m2AnnotationStr += str(token_start) + ' ' 252 | m2AnnotationStr += str(token_end) + '|||' 253 | m2AnnotationStr += m['type'] + '|||' 254 | m2AnnotationStr += correction_final + '|||' 255 | m2AnnotationStr += 'REQUIRED|||-NONE-|||' + str(i) + '\n' 256 | 257 | m2AnnotationList.append(m2AnnotationStr) 258 | 259 | if len(mistakesList) > 0: # only if mistakeList contains tuples 260 | i += 1 261 | 262 | # write .conll.m2 file 263 | m2AnnotationSent = 'S ' + tokenizedSentStr + '\n' 264 | m2AnnotationSent += ''.join(m2AnnotationList) + '\n' 265 | fm2.write(m2AnnotationSent.encode('utf-8')) 
266 | 267 | fm2.close() 268 | 269 | 270 | def tokenizeCorrectionStr(correctionStr, wdstart, wdend, words): 271 | 272 | correctionTokenizedStr = '' 273 | pseudoSent = correctionStr 274 | 275 | if wdstart != 0: 276 | pseudoSent = words[wdstart-1] + ' ' + pseudoSent 277 | 278 | if wdend < len(words) - 1: 279 | pseudoSent = pseudoSent + ' ' + words[wdend] 280 | elif wdend == len(words) - 1: 281 | pseudoSent = pseudoSent + words[wdend] 282 | 283 | 284 | pseudoWordsList = [] 285 | sentList = sentenceTokenizer.tokenize(pseudoSent) 286 | for sent in sentList: 287 | pseudoWordsList += word_tokenize(sent) 288 | 289 | start = 0 290 | if wdstart != 0: 291 | s = '' 292 | for i in xrange(len(pseudoWordsList)): 293 | s += pseudoWordsList[i] 294 | if s == words[wdstart-1]: 295 | start = i + 1 296 | break 297 | if start == 0: 298 | print >> sys.stderr, 'Can not find words[wdstart-1]' 299 | 300 | else: 301 | start = 0 302 | 303 | end = len(pseudoWordsList) 304 | if wdend != len(words): 305 | 306 | s = '' 307 | for i in xrange(len(pseudoWordsList)): 308 | s = pseudoWordsList[len(pseudoWordsList) - i - 1] + s 309 | if s == words[wdend]: 310 | end = len(pseudoWordsList) - i - 1 311 | break 312 | if end == len(pseudoWordsList): 313 | print >> sys.stderr, 'Can not find words[wdend]' 314 | 315 | else: 316 | end = len(pseudoWordsList) 317 | 318 | correctionTokenizedStr = ' '.join(pseudoWordsList[start:end]) 319 | 320 | return correctionTokenizedStr 321 | 322 | 323 | def shrinkCorrectionStr(correctionTokenizedStr, wdstart, wdend, words): 324 | 325 | correctionWords = correctionTokenizedStr.split(' ') 326 | originalWords = words[wdstart: wdend] 327 | wdstartNew = wdstart 328 | wdendNew = wdend 329 | cstart = 0 330 | cend = len(correctionWords) 331 | 332 | i = 0 333 | while i < len(originalWords) and i < len(correctionWords): 334 | if correctionWords[i] == originalWords[i]: 335 | i += 1 336 | wdstartNew = i + wdstart 337 | cstart = i 338 | else: 339 | break 340 | 341 | i = 1 342 | while i <= len(originalWords) - cstart and i <= len(correctionWords) - cstart: 343 | if correctionWords[len(correctionWords)-i] == originalWords[len(originalWords)-i]: 344 | wdendNew = wdend - i 345 | cend = len(correctionWords) - i 346 | i += 1 347 | else: 348 | break 349 | 350 | return ' '.join(correctionWords[cstart:cend]), wdstartNew, wdendNew 351 | 352 | if __name__ == '__main__': 353 | 354 | ''' usage: 355 | 356 | %python preprocesswithalt.py completesgmlfile mainsgmlfile alternativesgmlfile1 ... alternativesgmlfileN combinedm2file 357 | output an m2 file containing a collection of the main annotation and all alternative annotations. 
358 | 359 | In most cases completesgmlfile and mainsgmlfile are identical 360 | ''' 361 | 362 | # Load original complete SGML for reference 363 | origDocs = sentenceSplit(readNUCLE(sys.argv[1])) 364 | 365 | origDocSet = {} 366 | for doc in origDocs: 367 | nid = int(doc.docattrs[0][1]) 368 | origDocSet[nid] = doc 369 | 370 | docsList = [] 371 | for i in range(2, len(sys.argv) - 1): 372 | docs = sentenceSplit(readNUCLE(sys.argv[i])) 373 | docsList.append(docs) 374 | 375 | board = {} 376 | for docs in docsList: 377 | board = moderateAnnotations(docs, board, origDocSet) 378 | 379 | createM2File(origDocs, board, sys.argv[len(sys.argv)-1]) 380 | 381 | pass 382 | 383 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #!/usr/bin/env python 3 | 4 | from model import * 5 | from preprocess import * 6 | from config import Config 7 | 8 | 9 | def evaluate(input_variable, len_inputs): 10 | batch_size, input_length = input_variable.size() 11 | 12 | # Run through encoder 13 | encoder_hidden = encoder.init_hidden(batch_size) 14 | encoder_outputs, encoder_hidden = encoder(input_variable, len_inputs, encoder_hidden) 15 | 16 | # Create starting vectors for decoder 17 | decoder_input = Variable(torch.LongTensor([[SOS_token] for _ in range(batch_size)])) # SOS 18 | decoder_context = Variable(torch.zeros(batch_size, decoder.hidden_size)) 19 | decoder_hidden = encoder_hidden 20 | if Config.use_cuda: 21 | decoder_input = decoder_input.cuda() 22 | decoder_context = decoder_context.cuda() 23 | 24 | decoded_output = torch.zeros(batch_size, Config.max_seq_length, out=torch.LongTensor(batch_size, Config.max_seq_length)) 25 | decoder_attentions = torch.zeros(batch_size, input_length, input_length) 26 | 27 | # Run through decoder 28 | for di in range(Config.max_seq_length): 29 | decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_context, 30 | decoder_hidden, encoder_outputs) 31 | # decoder_attentions[:, di, :decoder_attention.size(2)] += decoder_attention.squeeze(0).squeeze(0).cpu().data 32 | 33 | # Choose top word from output 34 | _, top_index = decoder_output.data.topk(1) 35 | 36 | decoded_output[:, di] = top_index 37 | 38 | # Next input is chosen word 39 | decoder_input = Variable(top_index) 40 | if Config.use_cuda: decoder_input = decoder_input.cuda() 41 | 42 | return decoded_output #, decoder_attentions[:, di + 1, :len(encoder_outputs)] 43 | 44 | 45 | def corpus_bleu_single_ref(r, h): 46 | from nltk.translate.bleu_score import corpus_bleu 47 | r = np.expand_dims(r, axis=1) 48 | return corpus_bleu(r, h) 49 | 50 | 51 | def corpus_wer(r, h): 52 | from utils import wer 53 | return np.mean(map(lambda (a, b): wer(a, b), zip(r, h))) 54 | 55 | 56 | def eval_examples(sources, preds, targets, num=3): 57 | str = '' 58 | for i in range(num): 59 | source = word_dict.indexes_to_sentence(sources[i]) 60 | pred = word_dict.indexes_to_sentence(preds[i]) 61 | target = word_dict.indexes_to_sentence(targets[i]) 62 | str += '#{}\nSource:\t{}\nPred:\t{}\nTarget:\t{}\n\n'.format(i, source, pred, target) 63 | return str 64 | 65 | _, eval_corpus, word_dict = build_corpus() 66 | encoder, decoder = get_model(word_dict.n_words) 67 | 68 | inputs, targets, len_inputs, _ = eval_corpus.next_batch(100) 69 | input_variable = Variable(torch.LongTensor(inputs), requires_grad=False) 70 | if Config.use_cuda: 71 | input_variable = input_variable.cuda() 72 
| 73 | output_tensor = evaluate(input_variable, len_inputs) 74 | preds = output_tensor.cpu().numpy().tolist() 75 | 76 | print('\nWER:{}\nBLEU:{}\n'.format(corpus_wer(targets, inputs), corpus_bleu_single_ref(targets, inputs))) 77 | print('\nWER:{}\nBLEU:{}\n'.format(corpus_wer(targets, preds), corpus_bleu_single_ref(targets, preds))) 78 | print('\n{}'.format(eval_examples(inputs, preds, targets))) -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #!/usr/bin/env python 3 | 4 | import glob 5 | import os 6 | 7 | import torch.optim as optim 8 | 9 | from seq2seq.seq2seq import * 10 | 11 | 12 | def save_state(encoder, decoder, encoder_optim, decoder_optim, step, path='checkpoints/model'): 13 | state = {'step': step, 14 | 'encoder': encoder.state_dict(), 15 | 'decoder': decoder.state_dict(), 16 | 'encoder_optim': encoder_optim.state_dict(), 17 | 'decoder_optim': decoder_optim.state_dict()} 18 | filename = path + '-' + str(step) 19 | torch.save(state, filename) 20 | 21 | 22 | def load_state(step=None, path='checkpoints/model'): 23 | state = {} 24 | file_list = glob.glob(path + '*') 25 | if file_list: 26 | if step: 27 | filename = path + '-' + str(step) 28 | else: 29 | filename = max(file_list, key=os.path.getctime) 30 | 31 | state = torch.load(filename) 32 | return state 33 | 34 | 35 | def get_model(n_classes, state=None, step=None, load=True): 36 | encoder = EncoderRNN(n_classes, hidden_size, n_layers) 37 | decoder = AttnDecoderRNN(attn_model, hidden_size, n_classes, n_layers, dropout_p=dropout_p) 38 | if Config.use_cuda: 39 | encoder.cuda() 40 | decoder.cuda() 41 | 42 | if load: 43 | if not state: 44 | state = load_state(step) 45 | if state: 46 | encoder.load_state_dict(state['encoder']) 47 | decoder.load_state_dict(state['decoder']) 48 | 49 | return encoder, decoder 50 | 51 | 52 | def get_optimizer(encoder, decoder, step=None, state=None, lr=0.0001): 53 | encoder_optimizer = optim.Adam(encoder.parameters(), lr=lr) 54 | decoder_optimizer = optim.Adam(decoder.parameters(), lr=lr) 55 | 56 | if not state: 57 | state = load_state(step) 58 | if state: 59 | encoder_optimizer.load_state_dict(state['encoder_optim']) 60 | decoder_optimizer.load_state_dict(state['decoder_optim']) 61 | 62 | return encoder_optimizer, decoder_optimizer -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #!/usr/bin/env python 3 | 4 | import random 5 | import numpy as np 6 | from config import Config 7 | 8 | PAD_token = 0 9 | SOS_token = 1 10 | EOS_token = 2 11 | 12 | 13 | class WordDict: 14 | def __init__(self): 15 | self.word2index = {} 16 | self.word2count = {} 17 | self.index2word = {PAD_token: "", SOS_token: "", EOS_token: ""} 18 | self.n_words = 3 # Count PAD, SOS and EOS 19 | 20 | def add_indexes(self, sentence): 21 | for word in sentence.split(' '): 22 | self.add_index(word) 23 | 24 | def add_index(self, word): 25 | if word not in self.word2index: 26 | self.word2index[word] = self.n_words 27 | self.word2count[word] = 1 28 | self.index2word[self.n_words] = word 29 | self.n_words += 1 30 | else: 31 | self.word2count[word] += 1 32 | 33 | def sentence_to_indexes(self, sentence, max_length): 34 | indexes = [self.word2index[word] for word in sentence.split(' ')][:max_length - 1] 35 | indexes.append(EOS_token) 36 | 
n_indexes = len(indexes) 37 | indexes.extend([PAD_token for _ in range(max_length - len(indexes))]) 38 | return indexes, n_indexes 39 | 40 | def indexes_to_sentence(self, indexes): 41 | indexes = filter(lambda i: i != PAD_token, indexes) 42 | indexes = map(lambda i: self.index2word[i], indexes) 43 | return ' '.join(indexes) 44 | 45 | 46 | class Corpus: 47 | def __init__(self, dict, max_length, path): 48 | self.max_length = max_length 49 | self.lines = self.filter_raw_string(open(path).read()).split('\n') 50 | self.pairs = [[s for s in l.split('\t')] for l in self.lines] 51 | self.dict = dict 52 | for pair in self.pairs: 53 | self.dict.add_indexes(pair[0]) 54 | self.dict.add_indexes(pair[1]) 55 | 56 | def filter_raw_string(self, str): 57 | return str.strip().translate(None, '<>') 58 | 59 | def next_batch(self, batch_size=100): 60 | pairs = np.array(random.sample(self.pairs, batch_size)) 61 | input_lens = [self.dict.sentence_to_indexes(s, self.max_length) for s in pairs[:, 0]] 62 | target_lens = [self.dict.sentence_to_indexes(s, self.max_length) for s in pairs[:, 1]] 63 | input_lens, target_lens = zip(*sorted(zip(input_lens, target_lens), key=lambda p: p[0][1], reverse=True)) 64 | inputs = map(lambda i: i[0], input_lens) 65 | len_inputs = map(lambda i: i[1], input_lens) 66 | targets = map(lambda i: i[0], target_lens) 67 | len_targets = map(lambda i: i[1], target_lens) 68 | return inputs, targets, len_inputs, len_targets 69 | 70 | 71 | def build_corpus(): 72 | word_dict = WordDict() 73 | train_corpus = Corpus(word_dict, Config.max_seq_length, Config.train_data_path) 74 | eval_corpus = Corpus(word_dict, Config.max_seq_length, Config.eval_data_path) 75 | return train_corpus, eval_corpus, word_dict -------------------------------------------------------------------------------- /seq2seq/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andabi/deep-text-corrector/69bd711e65cc42364becba5efd99b8d4f8ab0aab/seq2seq/__init__.py -------------------------------------------------------------------------------- /seq2seq/seq2seq.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #!/usr/bin/env python 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | from torch.nn.utils.rnn import pack_padded_sequence as pack, pad_packed_sequence as unpack 9 | from config import Config 10 | 11 | attn_model = 'general' 12 | hidden_size = 500 13 | n_layers = 2 14 | dropout_p = 0.05 15 | 16 | SOS_token = 0 17 | EOS_token = 1 18 | 19 | 20 | class EncoderRNN(nn.Module): 21 | def __init__(self, input_size, hidden_size, n_layers=1): 22 | super(EncoderRNN, self).__init__() 23 | 24 | self.input_size = input_size 25 | self.hidden_size = hidden_size 26 | self.n_layers = n_layers 27 | 28 | self.embedding = nn.Embedding(input_size, hidden_size) 29 | self.gru = nn.GRU(hidden_size, hidden_size, n_layers, batch_first=True) 30 | 31 | def init_hidden(self, batch_size): 32 | hidden = Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size)) 33 | if Config.use_cuda: hidden = hidden.cuda() 34 | return hidden 35 | 36 | def forward(self, input_seq, len_inputs, hidden): 37 | # input_seq.size() = (B, S), hidden.size() = (L, B, H), embedded.size() = (B, S, H), output.size() = (B, S, H) 38 | # batch_size, seq_len = input_sequence.size() 39 | embedded = self.embedding(input_seq) 40 | output, hidden = self.gru(pack(embedded, 
len_inputs, batch_first=True), hidden) 41 | output, _ = unpack(output, batch_first=True) 42 | return output, hidden 43 | 44 | 45 | class AttnDecoderRNN(nn.Module): 46 | def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout_p=0.1): 47 | super(AttnDecoderRNN, self).__init__() 48 | 49 | # Keep parameters for reference 50 | self.attn_model = attn_model 51 | self.hidden_size = hidden_size 52 | self.output_size = output_size 53 | self.n_layers = n_layers 54 | self.dropout_p = dropout_p 55 | 56 | # Define layers 57 | self.embedding = nn.Embedding(output_size, hidden_size) 58 | self.gru = nn.GRU(hidden_size * 2, hidden_size, n_layers, dropout=dropout_p, batch_first=True) 59 | self.out = nn.Linear(hidden_size * 2, output_size) 60 | 61 | # Choose attention model 62 | if attn_model != 'none': 63 | self.attn = Attn(attn_model, hidden_size) 64 | 65 | def forward(self, input, last_context, last_hidden, encoder_outputs): 66 | # input.size() = (B, 1), last_context.size() = (B, H), last_hidden.size() = (L, B, H), encoder_outputs.size() = (B, S, H) 67 | # word_embedded.size() = (B, 1, H) 68 | # print input.size() 69 | word_embedded = self.embedding(input) 70 | 71 | # rnn_input.size() = (B, 1, 2H), rnn_output.size() = (B, 1, H) 72 | # print word_embedded.size(), last_context.unsqueeze(1).size() 73 | rnn_input = torch.cat((word_embedded, last_context.unsqueeze(1)), -1) 74 | rnn_output, hidden = self.gru(rnn_input, last_hidden) 75 | rnn_output = rnn_output.squeeze(1) # B x S=1 x H -> B x H 76 | 77 | # atten_weights.size() = (B, S) 78 | attn_weights = self.attn(rnn_output, encoder_outputs) 79 | context = attn_weights.unsqueeze(1).bmm(encoder_outputs).squeeze(1) # B x H 80 | 81 | # TODO tanh? 82 | # Final output layer (next word prediction) using the RNN hidden state and context vector 83 | output = self.out(torch.cat((rnn_output, context), -1)) # B x V 84 | 85 | # Return final output, hidden state, and attention weights (for visualization) 86 | # output.size() = (B, V) 87 | return output, context, hidden, attn_weights 88 | 89 | 90 | class Attn(nn.Module): 91 | def __init__(self, method, hidden_size): 92 | super(Attn, self).__init__() 93 | 94 | self.method = method 95 | self.hidden_size = hidden_size 96 | 97 | if self.method == 'general': 98 | self.attn = nn.Linear(self.hidden_size, hidden_size) 99 | 100 | # elif self.method == 'concat': 101 | # self.attn = nn.Linear(self.hidden_size * 2, hidden_size) 102 | # self.other = nn.Parameter(torch.FloatTensor(1, hidden_size)) 103 | 104 | def forward(self, hidden, encoder_outputs): 105 | # hidden.size() = (B, H), encoder_outputs.size() = (B, S, H) 106 | batch_size, encoder_outputs_len, _ = encoder_outputs.size() 107 | 108 | # Create variable to store attention energies 109 | # attn_energies.size() = (B, S) 110 | attn_energies = Variable(torch.zeros((batch_size, encoder_outputs_len))) # B x S 111 | if Config.use_cuda: attn_energies = attn_energies.cuda() 112 | 113 | # Calculate energies for each encoder output 114 | # attn_energies.size() = (B, S) 115 | for i in range(encoder_outputs_len): 116 | attn_energies[:, i] = self.score(hidden, encoder_outputs[:, i]) 117 | # print attn_energies[:, i] 118 | 119 | # Normalize energies to weights in range 0 to 1 120 | return F.softmax(attn_energies) 121 | 122 | def score(self, hidden, encoder_output): 123 | 124 | # print hidden.size(), encoder_output.size() 125 | if self.method == 'dot': 126 | energy = hidden.unsqueeze(1).bmm(encoder_output.unsqueeze(2)) # dot product 127 | return energy 128 | 129 | elif 
self.method == 'general': 130 | energy = self.attn(encoder_output) 131 | energy = hidden.unsqueeze(1).bmm(energy.unsqueeze(2)) 132 | return energy 133 | 134 | # TODO 135 | # elif self.method == 'concat': 136 | # energy = self.attn(torch.cat((hidden, encoder_output), -1)) 137 | # energy = self.other.unsqueeze(1).bmm(energy.unsqueeze(2)) 138 | # return energy 139 | 140 | 141 | # class BahdanauAttnDecoderRNN(nn.Module): 142 | # def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=0.1): 143 | # super(AttnDecoderRNN, self).__init__() 144 | # 145 | # # Define parameters 146 | # self.hidden_size = hidden_size 147 | # self.output_size = output_size 148 | # self.n_layers = n_layers 149 | # self.dropout_p = dropout_p 150 | # self.max_length = max_length 151 | # 152 | # # Define layers 153 | # self.embedding = nn.Embedding(output_size, hidden_size) 154 | # self.dropout = nn.Dropout(dropout_p) 155 | # self.attn = GeneralAttn(hidden_size) 156 | # self.gru = nn.GRU(hidden_size * 2, hidden_size, n_layers, dropout=dropout_p) 157 | # self.out = nn.Linear(hidden_size, output_size) 158 | # 159 | # def forward(self, word_input, last_hidden, encoder_outputs): 160 | # # Note that we will only be running forward for a single decoder time step, but will use all encoder outputs 161 | # 162 | # # Get the embedding of the current input word (last output word) 163 | # word_embedded = self.embedding(word_input).view(1, 1, -1) # S=1 x B x N 164 | # word_embedded = self.dropout(word_embedded) 165 | # 166 | # # Calculate attention weights and apply to encoder outputs 167 | # attn_weights = self.attn(last_hidden[-1], encoder_outputs) 168 | # context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x 1 x N 169 | # 170 | # # Combine embedded input word and attended context, run through RNN 171 | # rnn_input = torch.cat((word_embedded, context), 2) 172 | # output, hidden = self.gru(rnn_input, last_hidden) 173 | # 174 | # # Final output layer 175 | # output = output.squeeze(0) # B x N 176 | # output = F.log_softmax(self.out(torch.cat((output, context), 1))) 177 | # 178 | # # Return final output, hidden state, and attention weights (for visualization) 179 | # return output, hidden, attn_weights -------------------------------------------------------------------------------- /seq2seq/seq2seq_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #!/usr/bin/env python 3 | 4 | from seq2seq import * 5 | 6 | encoder_test = EncoderRNN(10, 10, 2) 7 | decoder_test = AttnDecoderRNN('general', 10, 10, 2) 8 | print(encoder_test) 9 | print(decoder_test) 10 | 11 | encoder_hidden = encoder_test.init_hidden() 12 | word_input = Variable(torch.LongTensor([1, 2, 3])) 13 | if Config.use_cuda: 14 | encoder_test.cuda() 15 | word_input = word_input.cuda() 16 | encoder_outputs, encoder_hidden = encoder_test(word_input, encoder_hidden) 17 | 18 | word_inputs = Variable(torch.LongTensor([1, 2, 3])) 19 | decoder_attns = torch.zeros(1, 3, 3) 20 | decoder_hidden = encoder_hidden 21 | decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size)) 22 | 23 | if Config.use_cuda: 24 | decoder_test.cuda() 25 | word_inputs = word_inputs.cuda() 26 | decoder_context = decoder_context.cuda() 27 | 28 | for i in range(3): 29 | decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[i], decoder_context, decoder_hidden, encoder_outputs) 30 | print(decoder_output.size(), decoder_hidden.size(), decoder_attn.size()) 31 | decoder_attns[0, i] = 
decoder_attn.squeeze(0).cpu().data -------------------------------------------------------------------------------- /tensorboard_logger.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #!/usr/bin/env python 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | import scipy.misc 7 | 8 | try: 9 | from StringIO import StringIO # Python 2.7 10 | except ImportError: 11 | from io import BytesIO # Python 3.x 12 | 13 | 14 | class Logger(object): 15 | def __init__(self, log_dir): 16 | """Create a summary writer logging to log_dir.""" 17 | self.writer = tf.summary.FileWriter(log_dir) 18 | 19 | def scalar_summary(self, tag, value, step): 20 | """Log a scalar variable.""" 21 | summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]) 22 | self.writer.add_summary(summary, step) 23 | 24 | def image_summary(self, tag, images, step): 25 | """Log a list of images.""" 26 | 27 | img_summaries = [] 28 | for i, img in enumerate(images): 29 | # Write the image to a string 30 | try: 31 | s = StringIO() 32 | except: 33 | s = BytesIO() 34 | scipy.misc.toimage(img).save(s, format="png") 35 | 36 | # Create an Image object 37 | img_sum = tf.Summary.Image(encoded_image_string=s.getvalue(), 38 | height=img.shape[0], 39 | width=img.shape[1]) 40 | # Create a Summary value 41 | img_summaries.append(tf.Summary.Value(tag='%s/%d' % (tag, i), image=img_sum)) 42 | 43 | # Create and write Summary 44 | summary = tf.Summary(value=img_summaries) 45 | self.writer.add_summary(summary, step) 46 | 47 | def histo_summary(self, tag, values, step, bins=1000): 48 | """Log a histogram of the tensor of values.""" 49 | 50 | # Create a histogram using numpy 51 | counts, bin_edges = np.histogram(values, bins=bins) 52 | 53 | # Fill the fields of the histogram proto 54 | hist = tf.HistogramProto() 55 | hist.min = float(np.min(values)) 56 | hist.max = float(np.max(values)) 57 | hist.num = int(np.prod(values.shape)) 58 | hist.sum = float(np.sum(values)) 59 | hist.sum_squares = float(np.sum(values ** 2)) 60 | 61 | # Drop the start of the first bin 62 | bin_edges = bin_edges[1:] 63 | 64 | # Add bin edges and counts 65 | for edge in bin_edges: 66 | hist.bucket_limit.append(edge) 67 | for c in counts: 68 | hist.bucket.append(c) 69 | 70 | # Create and write Summary 71 | summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)]) 72 | self.writer.add_summary(summary, step) 73 | self.writer.flush() -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #!/usr/bin/env python 3 | 4 | import torch.optim 5 | from torch.nn.utils.rnn import pad_packed_sequence as unpack 6 | 7 | from model import * 8 | from preprocess import * 9 | from utils import * 10 | from tensorboard_logger import Logger 11 | 12 | final_steps = 50000 13 | print_every = 1 14 | save_every = 500 15 | learning_rate = 0.0001 16 | teacher_forcing_ratio = 0.5 17 | clip = 5.0 18 | 19 | 20 | def sequence_mask(sequence_length, max_len=None): 21 | if max_len is None: 22 | max_len = sequence_length.data.max() 23 | batch_size = sequence_length.size(0) 24 | seq_range = torch.range(0, max_len - 1).long() 25 | seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) 26 | seq_range_expand = Variable(seq_range_expand) 27 | if sequence_length.is_cuda: 28 | seq_range_expand = seq_range_expand.cuda() 29 | seq_length_expand = 
(sequence_length.unsqueeze(1) 30 | .expand_as(seq_range_expand)) 31 | return seq_range_expand < seq_length_expand 32 | 33 | 34 | # logits: (B, S, V) 35 | # targets: (B, S) 36 | # lengths: (B,) 37 | def masked_cross_entropy(logits, targets, lengths): 38 | batch_size, seq_len, n_classes = logits.size() 39 | assert (batch_size, seq_len) == targets.size() 40 | 41 | # mask = Variable(torch.LongTensor([[1 for _ in range(l)] for l in lengths.data])) 42 | # mask = mask.resize_as(targets) 43 | mask = sequence_mask(sequence_length=lengths, max_len=targets.size(1)) 44 | 45 | # logits_flat: (batch * max_len, num_classes) 46 | logits_flat = logits.view(-1, logits.size(-1)) 47 | # log_probs_flat: (batch * max_len, num_classes) 48 | log_probs_flat = F.log_softmax(logits_flat) 49 | # target_flat: (batch * max_len, 1) 50 | target_flat = targets.view(-1, 1) 51 | # losses_flat: (batch * max_len, 1) 52 | losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat) 53 | # losses: (batch, max_len) 54 | losses = losses_flat.view(*targets.size()) * mask.float() 55 | return losses.sum() / lengths.float().sum() 56 | 57 | 58 | def train(input_batch, len_inputs, target_batch, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion): 59 | # Zero gradients of both optimizers 60 | encoder_optimizer.zero_grad() 61 | decoder_optimizer.zero_grad() 62 | 63 | # Get size of input and target sentences 64 | # batch_size, input_length = input_batch.size() 65 | batch_size, target_length = target_batch.size() 66 | 67 | # TODO: receive the target lengths as a parameter (padded/packed sequence) and delete the line below 68 | length_targets = Variable(torch.LongTensor(map(lambda s: len(s), target_batch))).cuda() 69 | 70 | # Run words through encoder 71 | encoder_hidden = encoder.init_hidden(batch_size) 72 | encoder_outputs, encoder_hidden = encoder(input_batch, len_inputs, encoder_hidden) 73 | 74 | # Prepare input and output variables 75 | decoder_input = Variable(torch.LongTensor([[SOS_token] for _ in range(batch_size)])) 76 | decoder_context = Variable(torch.zeros(batch_size, decoder.hidden_size)) 77 | decoder_hidden = encoder_hidden # Use last hidden state from encoder to start decoder 78 | decoder_outputs = Variable(torch.FloatTensor(batch_size, target_length, decoder.output_size).zero_()) 79 | 80 | if Config.use_cuda: 81 | decoder_input = decoder_input.cuda() 82 | decoder_context = decoder_context.cuda() 83 | decoder_outputs = decoder_outputs.cuda() 84 | 85 | # Choose whether to use teacher forcing 86 | if random.random() < teacher_forcing_ratio: 87 | # Teacher forcing: Use the ground-truth target as the next input 88 | for di in range(target_length): 89 | decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_context, 90 | decoder_hidden, 91 | encoder_outputs) 92 | decoder_outputs[:, di] = decoder_output 93 | decoder_input = target_batch[:, di].unsqueeze(1) # Next target is next input 94 | else: 95 | for di in range(target_length): 96 | decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_context, 97 | decoder_hidden, 98 | encoder_outputs) 99 | decoder_outputs[:, di] = decoder_output 100 | # Get most likely word index (highest value) from output 101 | _, top_index = decoder_output.data.topk(1) 102 | decoder_input = Variable(top_index) # Chosen word is next input 103 | if Config.use_cuda: decoder_input = decoder_input.cuda() 104 | 105 | # Stop at end of sentence (not necessary when using known targets) 106 | # TODO 107 | # if ni == EOS_token: break 108 | 109 | loss
= masked_cross_entropy(decoder_outputs, target_batch, length_targets) 110 | 111 | # Backpropagation 112 | loss.backward() 113 | torch.nn.utils.clip_grad_norm(encoder.parameters(), clip) 114 | torch.nn.utils.clip_grad_norm(decoder.parameters(), clip) 115 | encoder_optimizer.step() 116 | decoder_optimizer.step() 117 | 118 | return loss.data[0] / target_length 119 | 120 | 121 | # Get train corpus and word_dict 122 | train_corpus, _, word_dict = build_corpus() 123 | 124 | # Build models, optimizers and load states 125 | state = load_state() 126 | step = 1 127 | if state: 128 | step = state['step'] + 1 129 | encoder, decoder = get_model(word_dict.n_words, state=state) 130 | encoder_optimizer, decoder_optimizer = get_optimizer(encoder, decoder, lr=learning_rate, state=state) 131 | 132 | # Define loss function 133 | criterion = nn.NLLLoss() 134 | 135 | # Keep track of time elapsed and running averages 136 | start = time.time() 137 | 138 | # Set configuration for using Tensorboard 139 | logger = Logger('graphs') 140 | 141 | for step in range(step, final_steps + 1): 142 | 143 | # Get training data for this cycle 144 | inputs, targets, len_inputs, len_targets = train_corpus.next_batch() 145 | input_variable = Variable(torch.LongTensor(inputs), requires_grad=False) 146 | target_variable = Variable(torch.LongTensor(targets), requires_grad=False) 147 | 148 | if Config.use_cuda: 149 | input_variable = input_variable.cuda() 150 | target_variable = target_variable.cuda() 151 | 152 | # Run the train function 153 | loss = train(input_variable, len_inputs, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion) 154 | 155 | # Keep track of loss 156 | logger.scalar_summary('loss', loss, step) 157 | 158 | if step % print_every == 0: 159 | print('%s: %s (%d %d%%)' % (step, time_since(start, 1. 
* step / final_steps), step, step / final_steps * 100)) 160 | 161 | if step % save_every == 0: 162 | save_state(encoder, decoder, encoder_optimizer, decoder_optimizer, step) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #!/usr/bin/env python 3 | 4 | import math 5 | import time 6 | import numpy as np 7 | 8 | 9 | def as_minutes(s): 10 | m = math.floor(s / 60) 11 | s -= m * 60 12 | return '%dm %ds' % (m, s) 13 | 14 | 15 | def time_since(since, percent): 16 | s = now() - since 17 | es = s / (percent) 18 | rs = es - s 19 | return '%s (- %s)' % (as_minutes(s), as_minutes(rs)) 20 | 21 | 22 | def now(): 23 | return time.time() 24 | 25 | 26 | # r = reference, h = hypothesis 27 | def wer(r, h): 28 | # initialisation 29 | d = np.zeros((len(r)+1)*(len(h)+1), dtype=np.uint8) 30 | d = d.reshape((len(r)+1, len(h)+1)) 31 | for i in range(len(r)+1): 32 | for j in range(len(h)+1): 33 | if i == 0: 34 | d[0][j] = j 35 | elif j == 0: 36 | d[i][0] = i 37 | 38 | # computation 39 | for i in range(1, len(r)+1): 40 | for j in range(1, len(h)+1): 41 | if r[i-1] == h[j-1]: 42 | d[i][j] = d[i-1][j-1] 43 | else: 44 | substitution = d[i-1][j-1] + 1 45 | insertion = d[i][j-1] + 1 46 | deletion = d[i-1][j] + 1 47 | d[i][j] = min(substitution, insertion, deletion) 48 | 49 | return d[len(r)][len(h)] / float(len(r)) 50 | 51 | 52 | # def show_attention(input_sentence, output_words, attentions): 53 | # # Set up figure with colorbar 54 | # fig = plt.figure() 55 | # ax = fig.add_subplot(111) 56 | # cax = ax.matshow(attentions.numpy(), cmap='bone') 57 | # fig.colorbar(cax) 58 | # 59 | # # Set up axes 60 | # ax.set_xticklabels([''] + input_sentence.split(' ') + [''], rotation=90) 61 | # ax.set_yticklabels([''] + output_words) 62 | # 63 | # # Show label at every tick 64 | # ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) 65 | # ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) 66 | # 67 | # show_plot_visdom() 68 | # plt.show() 69 | # plt.close() 70 | # 71 | # 72 | # def evaluate_and_show_attention(input_sentence, target_sentence=None): 73 | # output_words, attentions = evaluate(input_sentence) 74 | # output_sentence = ' '.join(output_words) 75 | # print('>', input_sentence) 76 | # if target_sentence is not None: 77 | # print('=', target_sentence) 78 | # print('<', output_sentence) 79 | # 80 | # show_attention(input_sentence, output_words, attentions) 81 | # 82 | # # Show input, target, output text in visdom 83 | # win = 'evaluted (%s)' % hostname 84 | # text = '
<br>> %s<br>= %s<br>< %s<br>' % (input_sentence, target_sentence, output_sentence) 85 | # vis.text(text, win=win, opts={'title': win}) 86 | --------------------------------------------------------------------------------
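
For readers tracing the `Attn` class in seq2seq/seq2seq.py, the sketch below shows what its per-position loop computes for the 'general' method, vectorised over all encoder positions. It is a standalone illustration with made-up shapes and local names (`decoder_state`, `W`, etc. are not the repo's API), not the project's implementation.

```python
import torch
import torch.nn.functional as F

B, S, H = 2, 4, 8                       # batch size, source length, hidden size
decoder_state = torch.randn(B, H)       # plays the role of rnn_output in AttnDecoderRNN
encoder_outputs = torch.randn(B, S, H)  # encoder output for every source position
W = torch.nn.Linear(H, H)               # corresponds to self.attn in Attn('general', H)

# energy[b, s] = decoder_state[b] . (W encoder_outputs[b, s]), then softmax over s
energies = torch.einsum('bh,bsh->bs', decoder_state, W(encoder_outputs))
weights = F.softmax(energies, dim=1)    # (B, S): attention distribution over source tokens
context = torch.bmm(weights.unsqueeze(1), encoder_outputs).squeeze(1)  # (B, H) context vector
print(weights.sum(dim=1))               # each row sums to 1
```

The context vector produced this way is what `AttnDecoderRNN.forward` concatenates with the GRU output before the final linear layer.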
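
The key idea in train.py's `masked_cross_entropy` is that loss terms at padded target positions are zeroed out before averaging over the true token count. The snippet below is a self-contained sketch of that idea on a toy batch; it deliberately does not import train.py (which starts a training run at import time), and all names in it are local to the example.

```python
import torch
import torch.nn.functional as F

def masked_ce(logits, targets, lengths):
    # logits: (B, S, V), targets: (B, S), lengths: (B,) true lengths before padding
    B, S, V = logits.size()
    positions = torch.arange(S).unsqueeze(0).expand(B, S)
    mask = positions < lengths.unsqueeze(1)             # True for real tokens, False for PAD
    log_probs = F.log_softmax(logits.view(-1, V), dim=-1)
    nll = -log_probs.gather(1, targets.view(-1, 1)).view(B, S)
    return (nll * mask.float()).sum() / lengths.float().sum()

# Toy batch: two sequences with true lengths 3 and 1, padded to length 3.
logits = torch.randn(2, 3, 5)
targets = torch.tensor([[1, 2, 3], [4, 0, 0]])
lengths = torch.tensor([3, 1])
print(masked_ce(logits, targets, lengths))  # padded positions contribute nothing to the loss
```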
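
utils.py's `wer` computes word error rate as edit distance over reference length, given two token lists. A quick usage check (run from the repository root; utils.py has no import-time side effects):

```python
from utils import wer

reference = "he goes to school every day".split()
hypothesis = "he go to the school every day".split()
print(wer(reference, hypothesis))  # 2 edits (1 substitution + 1 insertion) / 6 reference words = 0.33...
```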