├── .gitignore ├── README.md ├── config.py ├── data ├── __init__.py ├── conll14st-test-data │ ├── README │ ├── alt │ │ ├── alternative-teama.sgml │ │ ├── alternative-teamb.sgml │ │ ├── alternative-teamc.sgml │ │ └── official-2014.combined-withalt.m2 │ ├── noalt │ │ ├── official-2014.0.conll.ann │ │ ├── official-2014.0.m2 │ │ ├── official-2014.0.sgml │ │ ├── official-2014.1.conll.ann │ │ ├── official-2014.1.m2 │ │ ├── official-2014.1.sgml │ │ └── official-2014.combined.m2 │ └── scripts │ │ ├── README │ │ ├── iparser.py │ │ ├── nucle_doc.py │ │ ├── nuclesgmlparser.py │ │ ├── parser_feature.py │ │ ├── preprocess.py │ │ ├── preprocesscombine.py │ │ └── preprocesswithalt.py ├── eval.txt ├── release2.3.1 │ ├── README │ ├── m2scorer │ │ ├── LICENSE │ │ ├── README │ │ ├── example │ │ │ ├── README │ │ │ ├── source_gold │ │ │ └── system │ │ ├── m2scorer │ │ └── scripts │ │ │ ├── Tokenizer.py │ │ │ ├── combiner.py │ │ │ ├── convert_hoo.py │ │ │ ├── convert_nucle.py │ │ │ ├── levenshtein.py │ │ │ ├── m2scorer.py │ │ │ ├── nucle_doc.py │ │ │ ├── nuclesgmlparser.py │ │ │ ├── test.sgml │ │ │ ├── token_offsets.py │ │ │ └── util.py │ ├── original │ │ ├── data │ │ │ ├── official-preprocessed.conll │ │ │ ├── official-preprocessed.conll.ann │ │ │ ├── official-preprocessed.m2 │ │ │ └── official.sgml │ │ └── data_5types │ │ │ ├── official-preprocessed.5types.conll.ann │ │ │ ├── official-preprocessed.5types.m2 │ │ │ └── official.5types.sgml │ ├── revised │ │ ├── data │ │ │ ├── official-preprocessed.conll.ann │ │ │ ├── official-preprocessed.m2 │ │ │ └── official.sgml │ │ └── data_5types │ │ │ ├── alternatives.NTHU.sgml │ │ │ ├── alternatives.STEL.sgml │ │ │ ├── alternatives.TOR.sgml │ │ │ ├── alternatives.UIUC.sgml │ │ │ ├── alternatives.UMC.sgml │ │ │ ├── combined.5types.m2 │ │ │ ├── official-preprocessed.5types.conll.ann │ │ │ ├── official-preprocessed.5types.m2 │ │ │ └── official.5types.sgml │ └── scripts │ │ ├── README │ │ ├── iparser.py │ │ ├── nucle_doc.py │ │ ├── nuclesgmlparser.py │ │ ├── parser_feature.py │ │ ├── preprocess.py │ │ └── preprocesswithalt.py └── train.txt ├── eval.py ├── model.py ├── preprocess.py ├── seq2seq ├── __init__.py ├── seq2seq.py └── seq2seq_test.py ├── tensorboard_logger.py ├── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | checkpoints/ 3 | *.pyc 4 | .DS_Store 5 | graphs/ 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Text Corrector 2 | > Work in Progress 3 | ## Introduction 4 | This project aims to make a text corrector for English learners using deep neural net. This project is implemented in Pytorch 5 | - First trial: seq2seq w/ attention + nucle 2.3 dataset 6 | - Next.. 7 | - more data, data augmentations, ... 8 | - tweeking nets like Bytenet or Transformer 9 | - beam search 10 | 11 | ## Datasets 12 | [CoNLL-2013 Shared Task: Grammatical Error Correction](http://www.comp.nus.edu.sg/~nlp/conll13st.html) 13 | [Overview Paper](http://www.comp.nus.edu.sg/~nlp/conll13st/CoNLLST01.pdf) 14 | [Datasets](http://www.comp.nus.edu.sg/~nlp/conll13st/release2.3.1.tar.gz) 15 | [Participant Papers](http://aclweb.org/anthology/W/W13/#3600) 16 | 17 | [CoNLL-2014 Shared Task: Grammatical Error Correction](http://www.comp.nus.edu.sg/~nlp/conll14st.html) 18 | [Overview Paper](http://www.comp.nus.edu.sg/~nlp/conll14st.html) 19 | [Datasets] Needs a license form. 
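To make the "first trial" architecture concrete, below is a minimal, hypothetical PyTorch sketch of a seq2seq corrector with dot-product attention; the class name, dimensions, and smoke test are illustrative only and are not taken from `model.py` or `seq2seq/seq2seq.py`.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class Seq2SeqCorrector(nn.Module):
    """Illustrative encoder-decoder with dot-product attention (not the repo's actual model)."""
    def __init__(self, vocab_size, emb_dim=256, hid_dim=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.encoder = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.decoder = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.out = nn.Linear(hid_dim * 2, vocab_size)

    def forward(self, src, tgt):
        # src, tgt: (batch, seq_len) LongTensors of token ids; tgt is the teacher-forced input
        enc_out, hidden = self.encoder(self.embedding(src))      # enc_out: (B, S, H)
        dec_out, _ = self.decoder(self.embedding(tgt), hidden)   # dec_out: (B, T, H)
        scores = torch.bmm(dec_out, enc_out.transpose(1, 2))     # (B, T, S) attention scores
        context = torch.bmm(F.softmax(scores, dim=-1), enc_out)  # (B, T, H) attended context
        return self.out(torch.cat([dec_out, context], dim=-1))   # (B, T, vocab_size) logits

# smoke test with random token ids
model = Seq2SeqCorrector(vocab_size=1000)
src = torch.randint(0, 1000, (4, 12))   # "erroneous" source sentences
tgt = torch.randint(0, 1000, (4, 12))   # corrected target sentences
loss = F.cross_entropy(model(src, tgt).reshape(-1, 1000), tgt.reshape(-1))
```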
20 | 21 | ## References 22 | https://github.com/atpaino/deep-text-corrector 23 | 24 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | class Config: 2 | use_cuda = True 3 | max_seq_length = 100 4 | train_data_path = './data/train.txt' 5 | eval_data_path = './data/eval.txt' -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andabi/deep-text-corrector/69bd711e65cc42364becba5efd99b8d4f8ab0aab/data/__init__.py -------------------------------------------------------------------------------- /data/conll14st-test-data/README: -------------------------------------------------------------------------------- 1 | CoNLL-2014 Official Test Data 2 | Release 3.2 3 | 22 Apr 2014 4 | 5 | This README file describes the test data for the CoNLL-2014 Shared 6 | Task: Grammatical Error Correction. 7 | 8 | The package is distributed freely with the following copyright 9 | 10 | Copyright (C) 2014 Hwee Tou Ng, Siew Mei Wu, Ted Briscoe, 11 | Christian Hadiwinoto, Raymond Hendy Susanto, 12 | Christopher Bryant 13 | 14 | Any questions regarding the test data should be directed to 15 | Hwee Tou Ng at: nght@comp.nus.edu.sg 16 | 17 | 18 | 1. Directory Structure and Contents 19 | =================================== 20 | 21 | The top-level directory has two subdirectories, namely 22 | 23 | - noalt/ : the annotated test data without alternatives contributed 24 | by the participants 25 | - alt/ : the annotated test data with moderated participants' 26 | alternative annotations 27 | - scripts/ : the scripts used to preprocess the test data inside the 28 | two subdirectories 29 | 30 | 31 | 2. Data Format 32 | ============== 33 | 34 | The corpus is distributed in a simple SGML format. All annotations 35 | come in a "stand-off" format. The start position and end position of 36 | an annotation are given by paragraph and character offsets. 37 | Paragraphs are enclosed in

<P> ... </P>
tags. Paragraphs and characters 38 | are counted starting from zero. Each annotation includes the following 39 | fields: the error category, the correction, and optionally a 40 | comment. If the correction replaces the original text at the given 41 | location, it should fix the grammatical error. 42 | 43 | Example: 44 | 45 | 46 | 47 |

<DOC nid="...">
<TEXT>
<P>
People with close blood relationship generally ...
</P>
<P>
Focus on the negative side of the annouance ...
</P>
...
</TEXT>
<ANNOTATION teacher_id="...">
<MISTAKE start_par="..." start_off="..." end_par="..." end_off="...">
<TYPE>Nn</TYPE>
<CORRECTION>relationships</CORRECTION>
</MISTAKE>
...
</ANNOTATION>
</DOC>
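As an illustrative sketch (the function name is hypothetical, not part of this distribution), an annotation in this stand-off format can be applied to its paragraph using only the zero-based offsets described above:

    # Illustrative only: apply a single <MISTAKE> annotation to its paragraph.
    # Handles the common case where the annotation stays within one paragraph.
    def apply_mistake(paragraphs, start_par, start_off, end_par, end_off, correction):
        assert start_par == end_par
        p = paragraphs[start_par]
        return p[:start_off] + correction + p[end_off:]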
63 | 64 | ... 65 | 66 | Below is a complete list of the error categories in the noalt/ and alt/ 67 | subdirectories: 68 | 69 | ERROR TAG ERROR CATEGORY 70 | --------------------------- 71 | Vt Verb tense 72 | Vm Verb modal 73 | V0 Missing verb 74 | Vform Verb form 75 | SVA Subject-verb-agreement 76 | ArtOrDet Article or Determiner 77 | Nn Noun number 78 | Npos Noun possesive 79 | Pform Pronoun form 80 | Pref Pronoun reference 81 | Prep Preposition 82 | Wci Wrong collocation/idiom 83 | Wa Acronyms 84 | Wform Word form 85 | Wtone Tone 86 | Srun Runons, comma splice 87 | Smod Dangling modifier 88 | Spar Parallelism 89 | Sfrag Fragment 90 | Ssub Subordinate clause 91 | WOinc Incorrect sentence form 92 | WOadv Adverb/adjective position 93 | Trans Link word/phrases 94 | Mec Punctuation, capitalization, spelling, typos 95 | Rloc- Local redundancy 96 | Cit Citation 97 | Others Other errors 98 | Um Unclear meaning (cannot be corrected) 99 | 100 | The official annotation file contains all the default annotations to 101 | make the whole text correct. Meanwhile, each of the alternative 102 | annotation files contains only annotations for sentences that can be 103 | corrected in a different way, i.e. sentences that have alternative 104 | annotations. If according to an alternative, a sentence can remain 105 | unchanged, a special tag "noop" is used for that particular sentence. 106 | 107 | 108 | 3. Updates included in version 2.1 109 | ================================== 110 | 111 | The major change made in version 2.1 is to map the past error 112 | categories Wcip and Rloc to Prep, Wci, ArtOrDet, and Rloc-. 113 | 114 | In the original data, there is no explicit preposition error 115 | category. Instead, preposition errors are part of the Wcip (Wrong 116 | collocation/idiom/preposition) and Rloc (local redundancy) error 117 | categories. In addition, redundant article or determiner errors are 118 | part of the Rloc error category. 119 | 120 | 121 | 4. Updates included in version 2.2 122 | ================================== 123 | 124 | - Fixed the bug on expanding an error annotation involving part of a 125 | token to the full token. 126 | 127 | - Other miscellaneous corrections were made. 128 | 129 | 130 | 5. Updates included in version 2.3 131 | ================================== 132 | 133 | - Fixed the bug involving tokenization of punctuation symbols in the 134 | correction string. 135 | 136 | - Fixed the tokenization example in the README file of the M^2 scorer 137 | to reflect the real tokenization to be used and removed irrelevant 138 | codes from the scorer package. 139 | 140 | 141 | 6. Updates included in version 3.0 142 | ================================== 143 | 144 | - Resolved overlapping annotations in the NUCLE corpus to make them 145 | non-overlapping. 146 | 147 | - Corrected some minor mistakes in error annotations. 148 | 149 | 150 | 7. Updates included in version 3.1 151 | ================================== 152 | 153 | - Removed duplicate annotations in the NUCLE corpus with the same span 154 | and correction string but different error type so as to keep only one of 155 | those annotations. This fix only affects 0.1% of all annotations. 156 | 157 | - Fixed end-of-paragraph annotations so that the end offset of such 158 | annotations is the last character position in the paragraph. This fix 159 | only affects 0.7% of all annotations. 160 | 161 | - Corrected some minor mistakes in error annotations. 
162 | 163 | - Inclusion of the CoNLL-2013 test data, with all the known problems 164 | described above fixed. Participating teams in the CoNLL-2014 shared 165 | task can make use of the CoNLL-2013 test data in training and 166 | developing their systems if they wish to do so. 167 | 168 | - Fixed a minor bug in the M2 scorer that caused duplicate insertion 169 | edits to receive high scores. 170 | 171 | 172 | 8. Updates included in version 3.2 173 | ================================== 174 | 175 | - Fixed the preprocessing script such that a gold edit that inserts an 176 | empty string is not included in the token-level gold edit and scorer 177 | answer files. 178 | 179 | - Removed one edit that inserted an empty string from the CoNLL-2014 180 | test data. Also removed such instances from the NUCLE training data. 181 | 182 | - Fixed a bug in the M2 scorer arising from scoring against gold edits 183 | from multiple annotators. Specifically, the bug sometimes caused 184 | incorrect scores to be reported when scoring against the gold edits 185 | of subsequent annotators (other than the first annotator). 186 | 187 | - Fixed a bug in the M2 scorer that caused erroneous scores to be 188 | reported when dealing with insertion edits followed by deletion edits 189 | (or vice versa). 190 | -------------------------------------------------------------------------------- /data/conll14st-test-data/scripts/README: -------------------------------------------------------------------------------- 1 | ==================================================== 2 | 3 | CoNLL-2014 Shared Task: Grammatical Error Correction 4 | 5 | Description of Data Preprocessing Scripts 6 | 7 | 22 Apr 2014 Version 3.2 8 | ==================================================== 9 | 10 | 11 | Table of Contents 12 | ================= 13 | 14 | 1. General 15 | 2. Pre-requisites 16 | 3. Usage 17 | 18 | 1. General 19 | ========== 20 | 21 | This README file describes the usage of scripts for preprocessing the NUCLE version 3.2 corpus. 22 | 23 | Quickstart: 24 | 25 | a. Regenerate the preprocessed files with full syntactic information: 26 | % python preprocess.py -o nucle.sgml conllFileName annFileName m2FileName 27 | 28 | b. Get tokenized annotations without syntactic information: 29 | % python preprocess.py -l nucle.sgml conllFileName annFileName m2FileName 30 | 31 | c. Get combined gold-standard answer file (without alternative): 32 | % python preprocesscombine.py gold1.sgml gold2.sgml combinedAnswer 33 | 34 | d. Get combined gold-standard answer file (with alternative answers): 35 | % python preprocesswithalt.py essay.sgml 2 gold1.sgml gold2.sgml alt1.sgml alt2.sgml alt3.sgml combinedAnsWithAlt 36 | 37 | where 38 | nucle.sgml - input SGML file 39 | conllFileName - output file that contains pre-processed sentences in CoNLL format. 40 | annFileName - output file that contains standoff error annotations. 41 | m2FileName - output file that contains error annotations in the M2 scorer format. 42 | gold1.sgml - input SGML file that contains the gold edits of the first annotator. 43 | gold2.sgml - input SGML file that contains the gold edits of the second annotator. 44 | combinedAnswer - output file that contains gold edits in the M2 scorer format 45 | combining the gold edits of the first and second annotator. 46 | alt1.sgml - input SGML file that contains alternative edits by the first team. 47 | alt2.sgml - input SGML file that contains alternative edits by the second team. 
48 | alt3.sgml - input SGML file that contains alternative edits by the third team. 49 | combinedAnsWithAlt - output file that contains gold edits in the M2 scorer format 50 | combining the gold edits of the first and second annotator. 51 | 52 | 2. Pre-requisites 53 | ================= 54 | 55 | + Python (2.6.4, other versions >= 2.6.4, < 3.0 might work but are not tested) 56 | + nltk (http://www.nltk.org, version 2.0b7, needed for sentence splitting and word tokenization) 57 | + Stanford parser (version 2.0.1, http://nlp.stanford.edu/software/stanford-parser-2012-03-09.tgz) 58 | 59 | If you only use the scripts to generate error annotations needed by the M2 scorer, Stanford parser is not required. 60 | Otherwise, "stanford-parser-2012-03-09" need to be in the same directory as "scripts". 61 | 62 | 3. Usage 63 | ======== 64 | 65 | Preprocessing the data from single annotation 66 | 67 | Usage: python preprocess.py OPTIONS sgmlFileName conllFileName annotationFileName m2FileName 68 | 69 | Where 70 | sgmlFileName - NUCLE SGML file 71 | conllFileName - output file name for pre-processed sentences in CoNLL format (e.g., conll14st-preprocessed.conll). 72 | annotationFileName - output file name for error annotations (e.g., conll14st-preprocessed.conll.ann). 73 | m2FileName - output file name in the M2 scorer format (e.g., conll14st-preprocessed.conll.m2). 74 | 75 | OPTIONS 76 | -o - output will contain POS tags and parse tree info (i.e., the same as the released preprocessed file, runs slowly). 77 | -l - output will NOT contain POS tags and parse tree info (runs quickly). 78 | 79 | 80 | Combining alternative annotations by multiple annotators 81 | 82 | Usage: python preprocesscombine.py sgmlFileName1 ... sgmlFileNameN m2FileName 83 | 84 | Where 85 | sgmlFileName1 - test data SGML file containing the gold edits of annotator 1 86 | sgmlFileNameN - test data SGML file containing the gold edits of annotator N 87 | m2FileName - output file in the M2 scorer format, containing annotations by N annotators. 88 | 89 | e.g., python preprocesscombine.py official-2014.0.sgml official-2014.1.sgml official-2014.combined.m2 90 | 91 | will generate official-2014.combined.m2 from alternative annotations by 2 annotators. 92 | 93 | 94 | Combining alternative annotations by multiple main annotators with annotations proposed by participants 95 | 96 | Usage: python preprocesswithalt.py essaySgmlFile M annotSgmlFile1 ... annotSgmlFileM alt1SgmlFileName ... 
altNSgmlFileName combM2FileName 97 | 98 | Where 99 | essaySgmlFile - test data SGML file containing essay body, not necessarily annotations 100 | M - number of main annotations 101 | annotSgmlFile1 - test data SGML file containing the gold edits of main annotator 1 102 | annotSgmlFileM - test data SGML file containing the gold edits of main annotator M 103 | alt1SgmlFileName - the alternative annotation SGML file proposed by team 1 (first), containing only annotations that differ from the main annotation 104 | altNSgmlFileName - the alternative annotation SGML file proposed by team N (last), containing only annotations that differ from the main annotation 105 | combM2FileName - output file name in the M2 scorer format, containing combination of main and alternative annotations 106 | 107 | e.g., python preprocesswithalt official-2014.0.sgml 2 official-2014.0.sgml official-2014.1.sgml alternative-teama.sgml alternative-teamb.sgml alternative-teamc.sgml official-2014.combined-withalt.m2 108 | -------------------------------------------------------------------------------- /data/conll14st-test-data/scripts/iparser.py: -------------------------------------------------------------------------------- 1 | # iparser.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 12 | 13 | import os 14 | import sys 15 | 16 | class stanfordparser: 17 | 18 | def __init__(self): 19 | pass 20 | 21 | def parse_batch(self, sentenceDumpedFileName, parsingDumpedFileName): 22 | 23 | if os.path.exists('../stanford-parser-2012-03-09') == False: 24 | print >> sys.stderr, 'can not find Stanford parser directory' 25 | sys.exit(1) 26 | 27 | # tokenized 28 | cmd = r'java -server -mx4096m -cp "../stanford-parser-2012-03-09/*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser -retainTMPSubcategories -sentences newline -tokenized -escaper edu.stanford.nlp.process.PTBEscapingProcessor -outputFormat "wordsAndTags, penn, typedDependencies" -outputFormatOptions "basicDependencies" edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ' + sentenceDumpedFileName 29 | 30 | r = os.popen(cmd).read().strip().decode('utf-8') 31 | f = open(parsingDumpedFileName, 'w') 32 | f.write(r.encode('utf-8')) 33 | f.close() 34 | 35 | rlist = r.replace('\n\n\n', '\n\n\n\n').split('\n\n') 36 | return rlist 37 | -------------------------------------------------------------------------------- /data/conll14st-test-data/scripts/nucle_doc.py: -------------------------------------------------------------------------------- 1 | # nucle_doc.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 
12 | 13 | import os 14 | import sys 15 | from nltk import word_tokenize 16 | 17 | class nucle_doc: 18 | def __init__(self): 19 | self.docattrs = None 20 | 21 | self.matric = '' 22 | self.email = '' 23 | self.nationality = '' 24 | self.firstLanguage = '' 25 | self.schoolLanguage = '' 26 | self.englishTests = '' 27 | 28 | self.paragraphs = [] 29 | self.annotation = [] 30 | self.mistakes = [] 31 | 32 | self.sentences = [] 33 | 34 | def buildSentence(self, sentstr, dpnode, constituentstr, poslist, chunklist): 35 | self.sentences[-1].append(nucle_sent(sentstr, dpnode, constituentstr, poslist, chunklist)) 36 | 37 | def addSentence(self, sent): 38 | self.sentences[-1].append(sent) 39 | 40 | def findMistake(self, par, pos): 41 | for m in self.mistakes: 42 | if par == m['start_par'] and pos >= m['start_off'] and pos < m['end_off']: 43 | return m 44 | return None 45 | 46 | 47 | class nucle_sent: 48 | def __init__(self, sentstr, dpnode, constituentstr, poslist, chunklist): 49 | self.sentstr = sentstr 50 | self.words = word_tokenize(sentstr) 51 | self.dpnodes = dpnode 52 | self.constituentstr = constituentstr 53 | self.constituentlist = [] 54 | self.poslist = poslist 55 | self.chunklist = chunklist 56 | 57 | def buildConstituentList(self): 58 | 59 | s = self.constituentstr.strip().replace('\n', '').replace(' ', '') 60 | r = [] 61 | i = 0 62 | while i < len(s): 63 | j = i 64 | while j < len(s) and s[j] != ')': 65 | j += 1 66 | k = j 67 | while k < len(s) and s[k] == ')': 68 | k += 1 69 | 70 | nodeWholeStr = s[i:k] 71 | lastLRBIndex = nodeWholeStr.rfind('(') 72 | nodeStr = nodeWholeStr[:lastLRBIndex] + '*' + s[j+1:k] 73 | 74 | r.append(nodeStr) 75 | i = k 76 | 77 | if len(r) != len(self.words): 78 | print >> sys.stderr, 'Error in buiding constituent tree bits: different length with words.' 79 | print >> sys.stderr, len(r), len(self.words) 80 | print >> sys.stderr, ' '.join(r).encode('utf-8') 81 | print >> sys.stderr, words 82 | sys.exit(1) 83 | 84 | self.constituentlist = r 85 | 86 | 87 | 88 | def setDpNode(self, dpnode): 89 | self.dpnodes = dpnode 90 | 91 | def setPOSList(self, poslist): 92 | self.poslist = poslist 93 | 94 | def setConstituentStr(self, constituentstr): 95 | self.constituentstr = constituentstr 96 | 97 | def setConstituentList(self, constituentlist): 98 | self.constituentlist = constituentlist 99 | 100 | def setWords(self, words): 101 | self.words = words 102 | 103 | def setChunkList(self, chunklist): 104 | self.chunklist = chunklist 105 | 106 | def getDpNode(self): 107 | return self.dpnodes 108 | 109 | def getPOSList(self): 110 | return self.poslist 111 | 112 | def getConstituentStr(self): 113 | return self.constituentstr 114 | 115 | def getConstituentList(self): 116 | return self.constituentlist 117 | 118 | def getWords(self): 119 | return self.words 120 | 121 | def getChunkList(self): 122 | return self.chunklist 123 | 124 | def getConllFormat(self, doc, paragraphIndex, sentIndex): 125 | 126 | table = [] 127 | 128 | dpnodes = self.getDpNode() 129 | poslist = self.getPOSList() 130 | #chunklist = self.getChunkList() 131 | words = self.getWords() 132 | constituentlist = self.getConstituentList() 133 | 134 | if len(poslist) == 0: 135 | hasParseInfo = 0 136 | else: 137 | hasParseInfo = 1 138 | 139 | if len(words) != len(poslist) and len(poslist) != 0: 140 | print >> sys.stderr, 'Error in buiding Conll Format: different length stanford parser postags and words.' 
141 | print >> sys.stderr, 'len words:', len(words), words 142 | print >> sys.stderr, 'len poslist:', len(poslist), poslist 143 | sys.exit(1) 144 | 145 | for wdindex in xrange(len(words)): 146 | 147 | word = words[wdindex] 148 | 149 | row = [] 150 | row.append(doc.docattrs[0][1]) #docinfo 151 | row.append(paragraphIndex) #paragraph index 152 | row.append(sentIndex) #paragraph index 153 | row.append(wdindex) #word index 154 | row.append(word) #word 155 | 156 | #row.append(chunknode.label) #chunk 157 | if hasParseInfo == 1: 158 | 159 | posword = poslist[wdindex] 160 | splitp = posword.rfind('/') 161 | pos = posword[splitp+1 : ].strip() 162 | 163 | #chunknode = chunklist[wdindex] 164 | 165 | constituentnode = constituentlist[wdindex] 166 | 167 | dpnode = None 168 | for d in dpnodes: 169 | if d.index == wdindex: 170 | dpnode = d 171 | break 172 | 173 | row.append(pos) #POS 174 | if dpnode == None: 175 | row.append('-') 176 | row.append('-') 177 | else: 178 | row.append(dpnode.parent_index) #dp parent 179 | row.append(dpnode.grammarrole) #dp label 180 | row.append(constituentnode) #constituent 181 | 182 | table.append(row) 183 | 184 | return table 185 | 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /data/conll14st-test-data/scripts/nuclesgmlparser.py: -------------------------------------------------------------------------------- 1 | # nuclesgmlparser.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 12 | 13 | from sgmllib import SGMLParser 14 | from nucle_doc import nucle_doc 15 | 16 | 17 | class nuclesgmlparser(SGMLParser): 18 | def __init__(self): 19 | SGMLParser.__init__(self) 20 | self.docs = [] 21 | 22 | def reset(self): 23 | self.docs = [] 24 | self.data = [] 25 | SGMLParser.reset(self) 26 | 27 | def unknow_starttag(self, tag, attrs): 28 | pass 29 | 30 | def unknow_endtag(self): 31 | pass 32 | 33 | def start_doc(self, attrs): 34 | self.docs.append(nucle_doc()) 35 | self.docs[-1].docattrs = attrs 36 | 37 | def end_doc(self): 38 | pass 39 | 40 | def start_matric(self, attrs): 41 | pass 42 | 43 | def end_matric(self): 44 | self.docs[-1].matric = ''.join(self.data) 45 | self.data = [] 46 | pass 47 | 48 | def start_email(self, attrs): 49 | pass 50 | 51 | def end_email(self): 52 | self.docs[-1].email = ''.join(self.data) 53 | self.data = [] 54 | pass 55 | 56 | def start_nationality(self, attrs): 57 | pass 58 | 59 | def end_nationality(self): 60 | self.docs[-1].nationality = ''.join(self.data) 61 | self.data = [] 62 | pass 63 | 64 | def start_first_language(self, attrs): 65 | pass 66 | 67 | def end_first_language(self): 68 | self.docs[-1].firstLanguage = ''.join(self.data) 69 | self.data = [] 70 | pass 71 | 72 | def start_school_language(self, attrs): 73 | pass 74 | 75 | def end_school_language(self): 76 | self.docs[-1].schoolLanguage = ''.join(self.data) 77 | self.data = [] 78 | pass 79 | 80 | def start_english_tests(self, attrs): 81 | pass 82 | 83 | def end_english_tests(self): 84 | self.docs[-1].englishTests = ''.join(self.data) 85 | self.data = [] 86 | pass 87 | 88 | 89 | def start_text(self, attrs): 90 | pass 91 | 92 | def end_text(self): 93 | pass 94 | 95 | def start_title(self, attrs): 96 | pass 97 | 98 | def end_title(self): 99 | self.docs[-1].paragraphs.append(''.join(self.data)) 
100 | self.data = [] 101 | pass 102 | 103 | 104 | def start_p(self, attrs): 105 | pass 106 | 107 | def end_p(self): 108 | self.docs[-1].paragraphs.append(''.join(self.data)) 109 | self.data = [] 110 | pass 111 | 112 | 113 | def start_annotation(self, attrs): 114 | self.docs[-1].annotation.append(attrs) 115 | 116 | def end_annotation(self): 117 | pass 118 | 119 | def start_mistake(self, attrs): 120 | d = {} 121 | for t in attrs: 122 | d[t[0]] = int(t[1]) 123 | self.docs[-1].mistakes.append(d) 124 | pass 125 | 126 | def end_mistake(self): 127 | pass 128 | 129 | def start_type(self, attrs): 130 | pass 131 | 132 | def end_type(self): 133 | self.docs[-1].mistakes[-1]['type'] = ''.join(self.data) 134 | self.data = [] 135 | 136 | def start_correction(self, attrs): 137 | pass 138 | 139 | def end_correction(self): 140 | self.docs[-1].mistakes[-1]['correction'] = ''.join(self.data) 141 | self.data = [] 142 | 143 | def start_comment(self, attrs): 144 | pass 145 | 146 | def end_comment(self): 147 | self.docs[-1].mistakes[-1]['comment'] = ''.join( self.data) 148 | self.data = [] 149 | 150 | 151 | def handle_charref(self, ref): 152 | self.data.append('&' + ref) 153 | 154 | def handle_entityref(self, ref): 155 | self.data.append('&' + ref) 156 | 157 | def handle_data(self, text): 158 | if text.strip() == '': 159 | self.data.append('') 160 | return 161 | else: 162 | if text.startswith('\n'): 163 | text = text[1:] 164 | if text.endswith('\n'): 165 | text = text[:-1] 166 | self.data.append(text) 167 | 168 | 169 | -------------------------------------------------------------------------------- /data/conll14st-test-data/scripts/parser_feature.py: -------------------------------------------------------------------------------- 1 | # parser_feature.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 
12 | 13 | 14 | 15 | import iparser 16 | 17 | class stanpartreenode: 18 | def __init__(self, strnode): 19 | 20 | if strnode == '': 21 | self.grammarrole = '' 22 | self.parent_index = -1 23 | self.index = -1 24 | self.parent_word = '' 25 | self.word = '' 26 | self.POS = '' 27 | return 28 | 29 | groleend = strnode.find('(') 30 | self.grammarrole = strnode[ : groleend] 31 | content = strnode[groleend + 1: len(strnode)-1] 32 | dadAndme = content.partition(', ') 33 | dad = dadAndme[0] 34 | me = dadAndme[2] 35 | dadsep = dad.rfind('-') 36 | mesep = me.rfind('-') 37 | self.parent_index = int(dad[dadsep + 1 : ]) - 1 38 | self.parent_word = dad[0 : dadsep] 39 | self.index = int(me[mesep + 1 : ]) - 1 40 | self.word = me[0 : mesep] 41 | self.POS = '' 42 | 43 | 44 | def DependTree_Batch(sentenceDumpedFileName, parsingDumpedFileName): 45 | 46 | sparser = iparser.stanfordparser() 47 | results = sparser.parse_batch(sentenceDumpedFileName, parsingDumpedFileName) 48 | nodeslist = [] 49 | 50 | k = 0 51 | while k < len(results): 52 | PoSlist = results[k].split(' ') 53 | constituentstr = results[k+1] 54 | table = results[k+2].split('\n') 55 | nodes = [] 56 | for i in range(0, len(table)): 57 | nodes.append( stanpartreenode(table[i]) ) 58 | nodeslist.append((nodes, constituentstr, PoSlist)) 59 | k += 3 60 | return nodeslist 61 | 62 | def DependTree_Batch_Parsefile(parsingDumpedFileName): 63 | 64 | f = open(parsingDumpedFileName, 'r') 65 | results = f.read().decode('utf-8').replace('\n\n\n', '\n\n\n\n').split('\n\n') 66 | f.close() 67 | nodeslist = [] 68 | 69 | k = 0 70 | while k < len(results): 71 | PoSlist = results[k].split(' ') 72 | constituentstr = results[k+1] 73 | table = results[k+2].split('\n') 74 | 75 | nodes = [] 76 | for i in range(0, len(table)): 77 | nodes.append( stanpartreenode(table[i]) ) 78 | nodeslist.append((nodes, constituentstr, PoSlist)) 79 | k += 3 80 | return nodeslist 81 | -------------------------------------------------------------------------------- /data/conll14st-test-data/scripts/preprocesscombine.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # preprocesscombine.py 4 | # 5 | # Author: Christian Hadiwinoto 6 | # National University of Singapore (NUS) 7 | # Date: 22 Apr 2014 8 | # Version: 1.0 9 | # 10 | # Contact: chrhad@comp.nus.edu.sg 11 | # 12 | # This script is distributed to support the CoNLL-2013 Shared Task. 13 | # It is free for research and educational purposes. 14 | # 15 | # Usage: python preprocesswithalt.py essaySgmlFileName mainSgmlFileName alt1SgmlFileName ... altNSgmlFileName m2FileName 16 | # 17 | 18 | 19 | import parser_feature 20 | from nuclesgmlparser import nuclesgmlparser 21 | from nucle_doc import * 22 | import nltk.data 23 | from nltk import word_tokenize 24 | from operator import itemgetter 25 | import cPickle as pickle 26 | import re 27 | import sys 28 | import os 29 | 30 | getEditKey = itemgetter(0, 1, 2, 3, 4) 31 | 32 | sentenceTokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 33 | sentenceDumpedFile = 'sentence_file' 34 | docsDumpedFileName = 'docs' 35 | parsingDumpedFileName = 'parse_file' 36 | 37 | def readNUCLE(fn): 38 | 39 | f = open(fn, 'r') 40 | parser = nuclesgmlparser() 41 | filestr = f.read() 42 | filestr = filestr.decode('utf-8') 43 | 44 | #Fix Reference tag 45 | p = re.compile(r'(\n

\n.*\n)

') 46 | filestr = p.sub(r'\1

', filestr) 47 | 48 | parser.feed(filestr) 49 | f.close() 50 | parser.close() 51 | 52 | return parser.docs 53 | 54 | def sentenceSplit(docs): 55 | 56 | sentenceTokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 57 | for doc in docs: 58 | for par in doc.paragraphs: 59 | doc.sentences.append([]) 60 | for s in sentenceTokenizer.tokenize(par): 61 | doc.buildSentence(s, [], '', [], []) 62 | return docs 63 | 64 | def compareTwoEditLists(editList1, editList2): 65 | # must be sorted 66 | if editList1 == [] and editList2 == []: 67 | return True 68 | elif editList1 == [] or editList2 == []: 69 | return False 70 | elif getEditKey(editList1[0]) != getEditKey(editList2[0]): 71 | return False 72 | else: 73 | return compareTwoEditLists(editList1[1:], editList2[1:]) 74 | 75 | def moderateAnnotations(contestDocs, annotBoard, origDocSet): 76 | # moderate annotation in "contesting" docs with already stated mistakes 77 | #mistakeStrSet = {} 78 | for doc in contestDocs: 79 | #mistakeStr = '' 80 | nid = int(doc.docattrs[0][1]) # nid of current document 81 | tid = doc.annotation[0][0][1] # teacher id 82 | 83 | if not annotBoard.has_key(nid): # create placeholder 84 | annotBoard[nid] = {} 85 | 86 | origDoc = origDocSet[nid] 87 | for pid in xrange(len(origDoc.sentences)): 88 | slist = origDoc.sentences[pid] 89 | if not annotBoard[nid].has_key(pid): 90 | annotBoard[nid][pid] = {} 91 | for sentid in xrange(len(slist)): 92 | sent = slist[sentid] 93 | if not annotBoard[nid][pid].has_key(sentid): 94 | annotBoard[nid][pid][sentid] = [] 95 | editSet = [] 96 | 97 | # enumerate mistakes 98 | sentoffset = origDoc.paragraphs[pid].index(sent.sentstr) 99 | editNum = 0 100 | for m in doc.mistakes: 101 | if m['start_par'] != pid or \ 102 | m['start_par'] != m['end_par'] or \ 103 | m['start_off'] < sentoffset or \ 104 | m['start_off'] >= sentoffset + len(sent.sentstr) or \ 105 | m['end_off'] sentoffset + len(sent.sentstr): 107 | continue 108 | 109 | #if m['type'] != 'noop': 110 | editSet.append((m['start_par'], m['end_par'], m['start_off'], m['end_off'], m['correction'], m['type'])) 111 | #editNum += 1 112 | #else: 113 | #editSet.append((m['start_par'], m['end_par'], m['start_off'], m['end_off'], sent.sentstr, m['type'])) 114 | 115 | editSet = sorted(editSet, key=itemgetter(0, 1, 2, 3)) 116 | 117 | # find the same annotation 118 | foundMatch = False 119 | i = 0 120 | boardEdits = annotBoard[nid][pid][sentid] 121 | while i < len(boardEdits) and not foundMatch: 122 | if compareTwoEditLists(editSet, boardEdits[i]): 123 | foundMatch = True 124 | else: 125 | i+=1 126 | 127 | if not foundMatch: 128 | annotBoard[nid][pid][sentid].append(editSet) 129 | 130 | return annotBoard 131 | 132 | def createM2File(origDocs, mistakesBoard, m2FileName): 133 | 134 | fm2 = open(m2FileName, 'w') 135 | 136 | for doc in origDocs: 137 | nid = int(doc.docattrs[0][1]) # nid of current document 138 | for slistIndex in xrange(len(doc.sentences)): 139 | slist = doc.sentences[slistIndex] 140 | for sentid in xrange(len(slist)): 141 | 142 | sent = slist[sentid] 143 | 144 | # m2 format annotation string list 145 | m2AnnotationList = [] 146 | 147 | # build colums 148 | table = sent.getConllFormat(doc, slistIndex, sentid) 149 | tokenizedSentStr = ' '.join(sent.getWords()) 150 | 151 | #Add annotation info 152 | sentoffset = doc.paragraphs[slistIndex].index(sent.sentstr) 153 | 154 | i = 0 155 | board = mistakesBoard[nid][slistIndex][sentid] 156 | for mistakesList in board: 157 | if len(mistakesList) == 0 and len(board) > 1: 158 | m2AnnotationList.append('A 
-1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||' + str(i) + '\n') 159 | i += 1 160 | 161 | for tuple in mistakesList: 162 | m = {} 163 | m['start_par'] = tuple[0] 164 | m['end_par'] = tuple[1] 165 | m['start_off'] = tuple[2] 166 | m['end_off'] = tuple[3] 167 | m['correction'] = tuple[4] 168 | m['type'] = tuple[5] 169 | 170 | if m['start_par'] != slistIndex or \ 171 | m['start_par'] != m['end_par'] or \ 172 | m['start_off'] < sentoffset or \ 173 | m['start_off'] >= sentoffset + len(sent.sentstr) or \ 174 | m['end_off'] sentoffset + len(sent.sentstr): 176 | continue 177 | 178 | wordsoffset = 0 179 | wdstart = 0 180 | 181 | startInWord = 0 182 | headText = '' 183 | endInWord = 0 184 | tailText = '' 185 | 186 | words = sent.getWords() 187 | while wdstart < len(words): 188 | 189 | word = words[wdstart] 190 | nextstart = sent.sentstr.find(word, wordsoffset) 191 | 192 | if nextstart == -1: 193 | # may not find word, due to relpacement 194 | print >> sys.stderr, "Warning in building conll format: can not find word" 195 | print >> sys.stderr, word.encode('utf-8') 196 | wordsoffset += 1 197 | else: 198 | wordsoffset = nextstart 199 | 200 | if wordsoffset >= m['start_off']-sentoffset: 201 | break 202 | elif wordsoffset + len(word) > m['start_off']-sentoffset: 203 | # annotation starts at the middle of a word 204 | startInWord = 1 205 | headText = sent.sentstr[wordsoffset: m['start_off']-sentoffset] 206 | break 207 | 208 | wordsoffset += len(word) 209 | wdstart += 1 210 | 211 | if wdstart == len(words): 212 | print >> sys.stderr, 'Warning in building conll format: start_off overflow' 213 | print >> sys.stderr, m, sent.sentstr.encode('utf-8') 214 | continue 215 | 216 | 217 | wdend = wdstart 218 | while wdend < len(words): 219 | 220 | word = words[wdend] 221 | 222 | nextstart = sent.sentstr.find(word, wordsoffset) 223 | 224 | if nextstart == -1: 225 | print >> sys.stderr, "Warning in building conll format: can not find word" 226 | print >> sys.stderr, word.encode('utf-8') 227 | wordsoffset += 1 228 | else: 229 | wordsoffset = nextstart 230 | 231 | if wordsoffset >= m['end_off']-sentoffset: 232 | # annotation ends at the middle of a word 233 | if wordsoffset - len(words[wdend-1]) - 1 < m['end_off']-sentoffset: 234 | endInWord = 1 235 | tailText = sent.sentstr[m['end_off']-sentoffset : wordsoffset].strip() 236 | break 237 | 238 | wordsoffset += len(word) 239 | wdend += 1 240 | 241 | 242 | correctionTokenizedStr = tokenizeCorrectionStr(headText + m['correction'] + tailText, wdstart, wdend, words) 243 | correctionTokenizedStr, wdstart, wdend = shrinkCorrectionStr(correctionTokenizedStr, wdstart, wdend, words) 244 | 245 | token_start = wdstart #if m['type'] != 'noop' else -1 246 | token_end = wdend #if m['type'] != 'noop' else -1 247 | correction_final = correctionTokenizedStr.replace('\n', '') #if m['type'] != 'noop' else '-NONE-' 248 | if wdstart == wdend and len(correction_final) == 0: 249 | continue 250 | 251 | # build annotation string for .conll.m2 file 252 | m2AnnotationStr = 'A ' 253 | m2AnnotationStr += str(token_start) + ' ' 254 | m2AnnotationStr += str(token_end) + '|||' 255 | m2AnnotationStr += m['type'] + '|||' 256 | m2AnnotationStr += correction_final + '|||' 257 | m2AnnotationStr += 'REQUIRED|||-NONE-|||' + str(i) + '\n' 258 | 259 | m2AnnotationList.append(m2AnnotationStr) 260 | 261 | if len(mistakesList) > 0: # only if mistakeList contains tuples 262 | i += 1 263 | 264 | # write .conll.m2 file 265 | m2AnnotationSent = 'S ' + tokenizedSentStr + '\n' 266 | m2AnnotationSent += 
''.join(m2AnnotationList) + '\n' 267 | fm2.write(m2AnnotationSent.encode('utf-8')) 268 | 269 | fm2.close() 270 | 271 | 272 | def tokenizeCorrectionStr(correctionStr, wdstart, wdend, words): 273 | 274 | correctionTokenizedStr = '' 275 | pseudoSent = correctionStr 276 | 277 | if wdstart != 0: 278 | pseudoSent = words[wdstart-1] + ' ' + pseudoSent 279 | 280 | if wdend < len(words) - 1: 281 | pseudoSent = pseudoSent + ' ' + words[wdend] 282 | elif wdend == len(words) - 1: 283 | pseudoSent = pseudoSent + words[wdend] 284 | 285 | 286 | pseudoWordsList = [] 287 | sentList = sentenceTokenizer.tokenize(pseudoSent) 288 | for sent in sentList: 289 | pseudoWordsList += word_tokenize(sent) 290 | 291 | start = 0 292 | if wdstart != 0: 293 | s = '' 294 | for i in xrange(len(pseudoWordsList)): 295 | s += pseudoWordsList[i] 296 | if s == words[wdstart-1]: 297 | start = i + 1 298 | break 299 | if start == 0: 300 | print >> sys.stderr, 'Can not find words[wdstart-1]' 301 | 302 | else: 303 | start = 0 304 | 305 | end = len(pseudoWordsList) 306 | if wdend != len(words): 307 | 308 | s = '' 309 | for i in xrange(len(pseudoWordsList)): 310 | s = pseudoWordsList[len(pseudoWordsList) - i - 1] + s 311 | if s == words[wdend]: 312 | end = len(pseudoWordsList) - i - 1 313 | break 314 | if end == len(pseudoWordsList): 315 | print >> sys.stderr, 'Can not find words[wdend]' 316 | 317 | else: 318 | end = len(pseudoWordsList) 319 | 320 | correctionTokenizedStr = ' '.join(pseudoWordsList[start:end]) 321 | 322 | return correctionTokenizedStr 323 | 324 | 325 | def shrinkCorrectionStr(correctionTokenizedStr, wdstart, wdend, words): 326 | 327 | correctionWords = correctionTokenizedStr.split(' ') 328 | originalWords = words[wdstart: wdend] 329 | wdstartNew = wdstart 330 | wdendNew = wdend 331 | cstart = 0 332 | cend = len(correctionWords) 333 | 334 | i = 0 335 | while i < len(originalWords) and i < len(correctionWords): 336 | if correctionWords[i] == originalWords[i]: 337 | i += 1 338 | wdstartNew = i + wdstart 339 | cstart = i 340 | else: 341 | break 342 | 343 | i = 1 344 | while i <= len(originalWords) - cstart and i <= len(correctionWords) - cstart: 345 | if correctionWords[len(correctionWords)-i] == originalWords[len(originalWords)-i]: 346 | wdendNew = wdend - i 347 | cend = len(correctionWords) - i 348 | i += 1 349 | else: 350 | break 351 | 352 | return ' '.join(correctionWords[cstart:cend]), wdstartNew, wdendNew 353 | 354 | if __name__ == '__main__': 355 | 356 | ''' usage: 357 | 358 | %python preprocesscombine.py alternativesgmlfile1 ... alternativesgmlfileN combinedm2file 359 | output an m2 file containing a collection of the main annotation and all alternative annotations. 
360 | 361 | In most cases completesgmlfile and mainsgmlfile are identical 362 | ''' 363 | 364 | # Load original complete SGML for reference 365 | origDocs = sentenceSplit(readNUCLE(sys.argv[1])) 366 | 367 | origDocSet = {} 368 | for doc in origDocs: 369 | nid = int(doc.docattrs[0][1]) 370 | origDocSet[nid] = doc 371 | 372 | docsList = [] 373 | for i in range(1, len(sys.argv) - 1): 374 | print >> sys.stderr, 'Storing', i 375 | docs = sentenceSplit(readNUCLE(sys.argv[i])) 376 | docsList.append(docs) 377 | 378 | board = {} 379 | for docs in docsList: 380 | board = moderateAnnotations(docs, board, origDocSet) 381 | 382 | createM2File(origDocs, board, sys.argv[len(sys.argv)-1]) 383 | 384 | pass 385 | 386 | -------------------------------------------------------------------------------- /data/conll14st-test-data/scripts/preprocesswithalt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # preprocesswithalt.py 4 | # 5 | # Author: Christian Hadiwinoto 6 | # National University of Singapore (NUS) 7 | # Date: 22 Apr 2014 8 | # Version: 1.0 9 | # 10 | # Contact: chrhad@comp.nus.edu.sg 11 | # 12 | # This script is distributed to support the CoNLL-2013 Shared Task. 13 | # It is free for research and educational purposes. 14 | # 15 | # Usage: python preprocesswithalt.py essaySgmlFileName M mainSgmlFileName alt1SgmlFileName ... altNSgmlFileName m2FileName 16 | # 17 | 18 | 19 | import parser_feature 20 | from nuclesgmlparser import nuclesgmlparser 21 | from nucle_doc import * 22 | import nltk.data 23 | from nltk import word_tokenize 24 | from operator import itemgetter 25 | import cPickle as pickle 26 | import re 27 | import sys 28 | import os 29 | 30 | getEditKey = itemgetter(0, 1, 2, 3, 4) 31 | 32 | sentenceTokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 33 | sentenceDumpedFile = 'sentence_file' 34 | docsDumpedFileName = 'docs' 35 | parsingDumpedFileName = 'parse_file' 36 | 37 | def readNUCLE(fn): 38 | 39 | f = open(fn, 'r') 40 | parser = nuclesgmlparser() 41 | filestr = f.read() 42 | filestr = filestr.decode('utf-8') 43 | 44 | #Fix Reference tag 45 | p = re.compile(r'(\n

\n.*\n)

') 46 | filestr = p.sub(r'\1

', filestr) 47 | 48 | parser.feed(filestr) 49 | f.close() 50 | parser.close() 51 | 52 | return parser.docs 53 | 54 | def sentenceSplit(docs): 55 | 56 | sentenceTokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 57 | for doc in docs: 58 | for par in doc.paragraphs: 59 | doc.sentences.append([]) 60 | for s in sentenceTokenizer.tokenize(par): 61 | doc.buildSentence(s, [], '', [], []) 62 | return docs 63 | 64 | def compareTwoEditLists(editList1, editList2): 65 | # must be sorted 66 | if editList1 == [] and editList2 == []: 67 | return True 68 | elif editList1 == [] or editList2 == []: 69 | return False 70 | elif getEditKey(editList1[0]) != getEditKey(editList2[0]): 71 | return False 72 | else: 73 | return compareTwoEditLists(editList1[1:], editList2[1:]) 74 | 75 | def moderateAnnotations(contestDocs, annotBoard, origDocSet): 76 | # moderate annotation in "contesting" docs with already stated mistakes 77 | #mistakeStrSet = {} 78 | for doc in contestDocs: 79 | #mistakeStr = '' 80 | nid = int(doc.docattrs[0][1]) # nid of current document 81 | tid = doc.annotation[0][0][1] # teacher id 82 | 83 | if not annotBoard.has_key(nid): # create placeholder 84 | annotBoard[nid] = {} 85 | 86 | origDoc = origDocSet[nid] 87 | for pid in xrange(len(origDoc.sentences)): 88 | slist = origDoc.sentences[pid] 89 | if not annotBoard[nid].has_key(pid): 90 | annotBoard[nid][pid] = {} 91 | for sentid in xrange(len(slist)): 92 | sent = slist[sentid] 93 | if not annotBoard[nid][pid].has_key(sentid): 94 | annotBoard[nid][pid][sentid] = [] 95 | editSet = [] 96 | 97 | # enumerate mistakes 98 | sentoffset = origDoc.paragraphs[pid].index(sent.sentstr) 99 | editNum = 0 100 | for m in doc.mistakes: 101 | if m['start_par'] != pid or \ 102 | m['start_par'] != m['end_par'] or \ 103 | m['start_off'] < sentoffset or \ 104 | m['start_off'] >= sentoffset + len(sent.sentstr) or \ 105 | m['end_off'] sentoffset + len(sent.sentstr): 107 | continue 108 | 109 | #if m['type'] != 'noop': 110 | editSet.append((m['start_par'], m['end_par'], m['start_off'], m['end_off'], m['correction'], m['type'])) 111 | #editNum += 1 112 | #else: 113 | #editSet.append((m['start_par'], m['end_par'], m['start_off'], m['end_off'], sent.sentstr, m['type'])) 114 | 115 | editSet = sorted(editSet, key=itemgetter(0, 1, 2, 3)) 116 | 117 | # find the same annotation 118 | foundMatch = False 119 | i = 0 120 | boardEdits = annotBoard[nid][pid][sentid] 121 | while i < len(boardEdits) and not foundMatch: 122 | if compareTwoEditLists(editSet, boardEdits[i]): 123 | foundMatch = True 124 | else: 125 | i+=1 126 | 127 | if not foundMatch: 128 | annotBoard[nid][pid][sentid].append(editSet) 129 | 130 | return annotBoard 131 | 132 | def moderateAnnotationsAlt(contestDocs, annotBoard, origDocSet): 133 | # moderate annotation in "contesting" docs with already stated mistakes 134 | # for alternative answers (with explicit NOOP) 135 | mistakeStrSet = {} 136 | for doc in contestDocs: 137 | mistakeStr = '' 138 | nid = int(doc.docattrs[0][1]) # nid of current document 139 | tid = doc.annotation[0][0][1] # teacher id 140 | 141 | if not annotBoard.has_key(nid): # create placeholder 142 | annotBoard[nid] = {} 143 | 144 | origDoc = origDocSet[nid] 145 | for pid in xrange(len(origDoc.sentences)): 146 | slist = origDoc.sentences[pid] 147 | if not annotBoard[nid].has_key(pid): 148 | annotBoard[nid][pid] = {} 149 | for sentid in xrange(len(slist)): 150 | sent = slist[sentid] 151 | if not annotBoard[nid][pid].has_key(sentid): 152 | annotBoard[nid][pid][sentid] = [] 153 | editSet = [] 154 
| 155 | # enumerate mistakes 156 | sentoffset = origDoc.paragraphs[pid].index(sent.sentstr) 157 | editNum = 0 158 | for m in doc.mistakes: 159 | if m['start_par'] != pid or \ 160 | m['start_par'] != m['end_par'] or \ 161 | m['start_off'] < sentoffset or \ 162 | m['start_off'] >= sentoffset + len(sent.sentstr) or \ 163 | m['end_off'] sentoffset + len(sent.sentstr): 165 | continue 166 | 167 | if m['type'] != 'noop': 168 | editSet.append((m['start_par'], m['end_par'], m['start_off'], m['end_off'], m['correction'], m['type'])) 169 | editNum += 1 170 | else: 171 | editSet.append((m['start_par'], m['end_par'], m['start_off'], m['end_off'], sent.sentstr, m['type'])) 172 | 173 | # as empty alternative edit means agreement to main annotation edit 174 | if len(editSet) == 0: 175 | continue 176 | 177 | editSet = sorted(editSet, key=itemgetter(0, 1, 2, 3)) 178 | 179 | # find the same annotation 180 | foundMatch = False 181 | i = 0 182 | boardEdits = annotBoard[nid][pid][sentid] 183 | while i < len(boardEdits) and not foundMatch: 184 | if compareTwoEditLists(editSet, boardEdits[i]): 185 | foundMatch = True 186 | else: 187 | i+=1 188 | 189 | if not foundMatch: 190 | annotBoard[nid][pid][sentid].append(editSet) 191 | 192 | return annotBoard 193 | 194 | def createM2File(origDocs, mistakesBoard, m2FileName): 195 | 196 | fm2 = open(m2FileName, 'w') 197 | 198 | for doc in origDocs: 199 | nid = int(doc.docattrs[0][1]) # nid of current document 200 | for slistIndex in xrange(len(doc.sentences)): 201 | slist = doc.sentences[slistIndex] 202 | for sentid in xrange(len(slist)): 203 | 204 | sent = slist[sentid] 205 | 206 | # m2 format annotation string list 207 | m2AnnotationList = [] 208 | 209 | # build colums 210 | table = sent.getConllFormat(doc, slistIndex, sentid) 211 | tokenizedSentStr = ' '.join(sent.getWords()) 212 | 213 | #Add annotation info 214 | sentoffset = doc.paragraphs[slistIndex].index(sent.sentstr) 215 | 216 | i = 0 217 | board = mistakesBoard[nid][slistIndex][sentid] 218 | for mistakesList in board: 219 | if len(mistakesList) == 0 and len(board) > 1: 220 | m2AnnotationList.append('A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||' + str(i) + '\n') 221 | i += 1 222 | 223 | for tuple in mistakesList: 224 | m = {} 225 | m['start_par'] = tuple[0] 226 | m['end_par'] = tuple[1] 227 | m['start_off'] = tuple[2] 228 | m['end_off'] = tuple[3] 229 | m['correction'] = tuple[4] 230 | m['type'] = tuple[5] 231 | 232 | if m['start_par'] != slistIndex or \ 233 | m['start_par'] != m['end_par'] or \ 234 | m['start_off'] < sentoffset or \ 235 | m['start_off'] >= sentoffset + len(sent.sentstr) or \ 236 | m['end_off'] sentoffset + len(sent.sentstr): 238 | continue 239 | 240 | wordsoffset = 0 241 | wdstart = 0 242 | 243 | startInWord = 0 244 | headText = '' 245 | endInWord = 0 246 | tailText = '' 247 | 248 | words = sent.getWords() 249 | while wdstart < len(words): 250 | 251 | word = words[wdstart] 252 | nextstart = sent.sentstr.find(word, wordsoffset) 253 | 254 | if nextstart == -1: 255 | # may not find word, due to relpacement 256 | print >> sys.stderr, "Warning in building conll format: can not find word" 257 | print >> sys.stderr, word.encode('utf-8') 258 | wordsoffset += 1 259 | else: 260 | wordsoffset = nextstart 261 | 262 | if wordsoffset >= m['start_off']-sentoffset: 263 | break 264 | elif wordsoffset + len(word) > m['start_off']-sentoffset: 265 | # annotation starts at the middle of a word 266 | startInWord = 1 267 | headText = sent.sentstr[wordsoffset: m['start_off']-sentoffset] 268 | break 269 | 270 | 
wordsoffset += len(word) 271 | wdstart += 1 272 | 273 | if wdstart == len(words): 274 | print >> sys.stderr, 'Warning in building conll format: start_off overflow' 275 | print >> sys.stderr, m, sent.sentstr.encode('utf-8') 276 | continue 277 | 278 | 279 | wdend = wdstart 280 | while wdend < len(words): 281 | 282 | word = words[wdend] 283 | 284 | nextstart = sent.sentstr.find(word, wordsoffset) 285 | 286 | if nextstart == -1: 287 | print >> sys.stderr, "Warning in building conll format: can not find word" 288 | print >> sys.stderr, word.encode('utf-8') 289 | wordsoffset += 1 290 | else: 291 | wordsoffset = nextstart 292 | 293 | if wordsoffset >= m['end_off']-sentoffset: 294 | # annotation ends at the middle of a word 295 | if wordsoffset - len(words[wdend-1]) - 1 < m['end_off']-sentoffset: 296 | endInWord = 1 297 | tailText = sent.sentstr[m['end_off']-sentoffset : wordsoffset].strip() 298 | break 299 | 300 | wordsoffset += len(word) 301 | wdend += 1 302 | 303 | 304 | correctionTokenizedStr = tokenizeCorrectionStr(headText + m['correction'] + tailText, wdstart, wdend, words) 305 | correctionTokenizedStr, wdstart, wdend = shrinkCorrectionStr(correctionTokenizedStr, wdstart, wdend, words) 306 | 307 | token_start = wdstart if m['type'] != 'noop' else -1 308 | token_end = wdend if m['type'] != 'noop' else -1 309 | correction_final = correctionTokenizedStr.replace('\n', '') if m['type'] != 'noop' else '-NONE-' 310 | 311 | # build annotation string for .conll.m2 file 312 | m2AnnotationStr = 'A ' 313 | m2AnnotationStr += str(token_start) + ' ' 314 | m2AnnotationStr += str(token_end) + '|||' 315 | m2AnnotationStr += m['type'] + '|||' 316 | m2AnnotationStr += correction_final + '|||' 317 | m2AnnotationStr += 'REQUIRED|||-NONE-|||' + str(i) + '\n' 318 | 319 | m2AnnotationList.append(m2AnnotationStr) 320 | 321 | if len(mistakesList) > 0: # only if mistakeList contains tuples 322 | i += 1 323 | 324 | # write .conll.m2 file 325 | m2AnnotationSent = 'S ' + tokenizedSentStr + '\n' 326 | m2AnnotationSent += ''.join(m2AnnotationList) + '\n' 327 | fm2.write(m2AnnotationSent.encode('utf-8')) 328 | 329 | fm2.close() 330 | 331 | 332 | def tokenizeCorrectionStr(correctionStr, wdstart, wdend, words): 333 | 334 | correctionTokenizedStr = '' 335 | pseudoSent = correctionStr 336 | 337 | if wdstart != 0: 338 | pseudoSent = words[wdstart-1] + ' ' + pseudoSent 339 | 340 | if wdend < len(words) - 1: 341 | pseudoSent = pseudoSent + ' ' + words[wdend] 342 | elif wdend == len(words) - 1: 343 | pseudoSent = pseudoSent + words[wdend] 344 | 345 | 346 | pseudoWordsList = [] 347 | sentList = sentenceTokenizer.tokenize(pseudoSent) 348 | for sent in sentList: 349 | pseudoWordsList += word_tokenize(sent) 350 | 351 | start = 0 352 | if wdstart != 0: 353 | s = '' 354 | for i in xrange(len(pseudoWordsList)): 355 | s += pseudoWordsList[i] 356 | if s == words[wdstart-1]: 357 | start = i + 1 358 | break 359 | if start == 0: 360 | print >> sys.stderr, 'Can not find words[wdstart-1]' 361 | 362 | else: 363 | start = 0 364 | 365 | end = len(pseudoWordsList) 366 | if wdend != len(words): 367 | 368 | s = '' 369 | for i in xrange(len(pseudoWordsList)): 370 | s = pseudoWordsList[len(pseudoWordsList) - i - 1] + s 371 | if s == words[wdend]: 372 | end = len(pseudoWordsList) - i - 1 373 | break 374 | if end == len(pseudoWordsList): 375 | print >> sys.stderr, 'Can not find words[wdend]' 376 | 377 | else: 378 | end = len(pseudoWordsList) 379 | 380 | correctionTokenizedStr = ' '.join(pseudoWordsList[start:end]) 381 | 382 | return 
correctionTokenizedStr 383 | 384 | 385 | def shrinkCorrectionStr(correctionTokenizedStr, wdstart, wdend, words): 386 | 387 | correctionWords = correctionTokenizedStr.split(' ') 388 | originalWords = words[wdstart: wdend] 389 | wdstartNew = wdstart 390 | wdendNew = wdend 391 | cstart = 0 392 | cend = len(correctionWords) 393 | 394 | i = 0 395 | while i < len(originalWords) and i < len(correctionWords): 396 | if correctionWords[i] == originalWords[i]: 397 | i += 1 398 | wdstartNew = i + wdstart 399 | cstart = i 400 | else: 401 | break 402 | 403 | i = 1 404 | while i <= len(originalWords) - cstart and i <= len(correctionWords) - cstart: 405 | if correctionWords[len(correctionWords)-i] == originalWords[len(originalWords)-i]: 406 | wdendNew = wdend - i 407 | cend = len(correctionWords) - i 408 | i += 1 409 | else: 410 | break 411 | 412 | return ' '.join(correctionWords[cstart:cend]), wdstartNew, wdendNew 413 | 414 | if __name__ == '__main__': 415 | 416 | ''' usage: 417 | 418 | %python preprocesswithalt.py essaySgmlfile M mainsgmlfile1 ... mainsgmlfileM alternativesgmlfile1 ... alternativesgmlfileN combinedm2file 419 | output an m2 file containing a collection of M main annotations and N alternative annotations. 420 | 421 | In most cases essaySgmlfile and mainsgmlfile are identical 422 | ''' 423 | 424 | # Load original complete SGML for reference 425 | origDocs = sentenceSplit(readNUCLE(sys.argv[1])) 426 | 427 | origDocSet = {} 428 | for doc in origDocs: 429 | nid = int(doc.docattrs[0][1]) 430 | origDocSet[nid] = doc 431 | 432 | nummain = int(sys.argv[2]) 433 | 434 | # Store main annotations 435 | docsList = [] 436 | altDocsList = [] 437 | for i in range(0, nummain): 438 | print >> sys.stderr, 'Storing main annotation', (i+1) 439 | docs = sentenceSplit(readNUCLE(sys.argv[i+3])) 440 | docsList.append(docs) 441 | 442 | board = {} 443 | for docs in docsList: 444 | board = moderateAnnotations(docs, board, origDocSet) 445 | 446 | # store alternative annotations 447 | for i in range(3 + nummain, len(sys.argv) - 1): 448 | print >> sys.stderr, 'Storing alternative annotation', (i+1) 449 | altdocs = sentenceSplit(readNUCLE(sys.argv[i])) 450 | altDocsList.append(altdocs) 451 | 452 | for altdocs in altDocsList: 453 | board = moderateAnnotationsAlt(altdocs, board, origDocSet) 454 | 455 | createM2File(origDocs, board, sys.argv[len(sys.argv)-1]) 456 | 457 | pass 458 | 459 | -------------------------------------------------------------------------------- /data/release2.3.1/README: -------------------------------------------------------------------------------- 1 | Release 2.3.1 2 | 24 May 2013 3 | 4 | This README file describes the test data and scoring procedure for the 5 | CoNLL-2013 Shared Task: Grammatical Error Correction. 6 | 7 | The package is distributed freely with the following copyright 8 | Copyright (C) 2013 Hwee Tou Ng, Joel Tetreault, Siew Mei Wu, 9 | Yuanbin Wu, Christian Hadiwinoto 10 | 11 | Any questions regarding the test data should be directed to 12 | Hwee Tou Ng at: nght@comp.nus.edu.sg 13 | 14 | 15 | 1. 
Directory Structure and Contents 16 | =================================== 17 | 18 | The top-level directory has four subdirectories, namely 19 | 20 | - original/ : the test data with the original official annotations 21 | before adding alternatives contributed by the participants 22 | - revised/ : the moderated participants' alternative annotations and 23 | the revised official annotation for the test data 24 | - scripts/ : the scripts used to preprocess the test data inside the 25 | original/ and revised/ subdirectories 26 | - m2scorer/ : the official scoring software for the shared task 27 | 28 | Each of the original/ and revised/ subdirectories contains data/ 29 | subdirectory, which includes annotations for all the error types, and 30 | data_5types/ subdirectory, which includes annotations for only the 5 31 | types concerned in the shared task. 32 | 33 | 34 | 2. Data Format 35 | ============== 36 | 37 | The corpus is distributed in a simple SGML format. All annotations 38 | come in a "stand-off" format. The start position and end position of 39 | an annotation are given by paragraph and character offsets. 40 | Paragraphs are enclosed in

<P> ... </P>

tags. Paragraphs and characters 41 | are counted starting from zero. Each annotation includes the following 42 | fields: the error category, the correction, and optionally a 43 | comment. If the correction replaces the original text at the given 44 | location, it should fix the grammatical error. 45 | 46 | Example: 47 | 48 | 49 | 50 |

51 | In modern digital world, ... 52 |

53 |

54 | Surveillance technology such as ... 55 |

56 | ... 57 |
58 | 59 | 60 | ArtOrDet 61 | the modern 62 | 63 | ... 64 | 65 |
66 | 67 | 68 | ... 69 | 70 | Below is a complete list of the error categories in the data/ 71 | subdirectory under the original/ and revised/ subdirectories: 72 | 73 | ERROR TAG ERROR CATEGORY 74 | --------------------------- 75 | Vt Verb tense 76 | Vm Verb modal 77 | V0 Missing verb 78 | Vform Verb form 79 | SVA Subject-verb-agreement 80 | ArtOrDet Article or Determiner 81 | Nn Noun number 82 | Npos Noun possesive 83 | Pform Pronoun form 84 | Pref Pronoun reference 85 | Prep Preposition 86 | Wci Wrong collocation/idiom 87 | Wa Acronyms 88 | Wform Word form 89 | Wtone Tone 90 | Srun Runons, comma splice 91 | Smod Dangling modifier 92 | Spar Parallelism 93 | Sfrag Fragment 94 | Ssub Subordinate clause 95 | WOinc Incorrect sentence form 96 | WOadv Adverb/adjective position 97 | Trans Link word/phrases 98 | Mec Punctuation, capitalization, spelling, typos 99 | Rloc- Local redundancy 100 | Cit Citation 101 | Others Other errors 102 | Um Unclear meaning (cannot be corrected) 103 | 104 | Below is a list of the error categories in the data_5types/ 105 | subdirectory under the original/ and revised/ subdirectories: 106 | 107 | ERROR TAG ERROR CATEGORY 108 | --------------------------- 109 | Vform Verb form 110 | SVA Subject-verb-agreement 111 | ArtOrDet Article or Determiner 112 | Nn Noun number 113 | Prep Preposition 114 | 115 | The official annotation file contains all the default annotations to 116 | make the whole text correct. Meanwhile, each of the alternative 117 | annotation files contains only annotations for sentences that can be 118 | corrected in a different way, i.e. sentences that have alternative 119 | annotations. If according to an alternative, a sentence can remain 120 | unchanged, a special tag "noop" is used for that particular sentence. 121 | 122 | 123 | 3. Updates included in version 2.1 124 | ================================== 125 | 126 | The major change made in version 2.1 is to map the past error 127 | categories Wcip and Rloc to Prep, Wci, ArtOrDet, and Rloc-. 128 | 129 | In the original data, there is no explicit preposition error 130 | category. Instead, preposition errors are part of the Wcip (Wrong 131 | collocation/idiom/preposition) and Rloc (local redundancy) error 132 | categories. In addition, redundant article or determiner errors are 133 | part of the Rloc error category. 134 | 135 | 136 | 4. Updates included in version 2.2 137 | ================================== 138 | 139 | - Fixed the bug on expanding an error annotation involving part of a 140 | token to the full token. 141 | 142 | - Other miscellaneous corrections were made. 143 | 144 | 145 | 5. Updates included in version 2.3 146 | ================================== 147 | 148 | - Fixed the bug involving tokenization of punctuation symbols in the 149 | correction string. 150 | 151 | - Fixed the tokenization example in the README file of the M^2 scorer 152 | to reflect the real tokenization to be used and removed irrelevant 153 | codes from the scorer package. 154 | 155 | 156 | 6. Updates included in version 2.3.1 157 | ==================================== 158 | 159 | - Enhanced the capability of the M^2 scorer to be able to handle 160 | multiple alternative sets of gold edits. 161 | 162 | - Fixed the preprocess.py script to keep the annotation span minimal, 163 | i.e. by excluding the beginning and/or end tokens that co-exist in 164 | the original and correction string. 
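A minimal sketch of the span-minimisation described above, assuming tokenised
input; the helper name and the sample tokens are invented for illustration,
but the trimming follows the same idea as the shrinkCorrectionStr routine in
the conll14st preprocessing scripts:

def shrink_edit(sentence_tokens, correction_tokens, start, end):
    # Drop leading tokens shared by the edit span and its correction.
    while start < end and correction_tokens and \
          sentence_tokens[start] == correction_tokens[0]:
        start += 1
        correction_tokens = correction_tokens[1:]
    # Drop trailing tokens shared by the edit span and its correction.
    while start < end and correction_tokens and \
          sentence_tokens[end - 1] == correction_tokens[-1]:
        end -= 1
        correction_tokens = correction_tokens[:-1]
    return start, end, ' '.join(correction_tokens)

# shrink_edit(['I', 'ate', 'a', 'apple', '.'], ['an', 'apple'], 2, 4)
# returns (2, 3, 'an'): only the changed token, "a" -> "an", stays in the edit.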
165 | 166 | 167 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/README: -------------------------------------------------------------------------------- 1 | Release 2.3.1 2 | Revision: 24 May 2013 3 | 4 | This README file describes the NUS MaxMatch (M^2) scorer. 5 | Copyright (C) 2013 Daniel Dahlmeier, Hwee Tou Ng and Christian Hadiwinoto 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or (at 10 | your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, but 13 | WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with this program. If not, see . 19 | 20 | 21 | If you are using the NUS M^2 scorer in your work, please include a 22 | citation of the following paper: 23 | 24 | Daniel Dahlmeier and Hwee Tou Ng. 2012. Better Evaluation for 25 | Grammatical Error Correction. In Proceedings of the 2012 Conference of 26 | the North American Chapter of the Association for Computational 27 | Linguistics: Human Language Technologies 28 | 29 | Any questions regarding the NUS M^2 scorer should be directed to 30 | Daniel Dahlmeier(danielhe@comp.nus.edu.sg). 31 | 32 | 33 | Contents 34 | ======== 35 | 0. Quickstart 36 | 1. Pre-requisites 37 | 2. Using the scorer 38 | 2.1 System output format 39 | 2.2 Scorer's gold standard format 40 | 3. Converting the CoNLL-2013 data format 41 | 4. Revisions 42 | 43 | 44 | 0. Quickstart 45 | ============= 46 | ./m2scorer [-v] SYSTEM SOURCE_GOLD 47 | 48 | SYSTEM = the system output in sentence-per-line plain text. 49 | SOURCE_GOLD = the source sentences with gold edits. 50 | 51 | 52 | 1. Pre-requisites 53 | ================= 54 | The following dependencies have to be installed to use the M^2 scorer. 55 | 56 | + Python (>= 2.6.4, < 3.0, older versions might work but are not tested) 57 | + nltk (http://www.nltk.org, needed for sentence splitting) 58 | 59 | 60 | 61 | 2. Using the scorer 62 | =================== 63 | Usage: m2scorer [OPTIONS] SYSTEM SOURCE_GOLD 64 | where 65 | SYSTEM - system output, one sentence per line 66 | SOURCE_GOLD - source sentences with gold token edits 67 | 68 | OPTIONS 69 | -v --verbose - print verbose output 70 | --very_verbose - print lots of verbose output 71 | --max_unchanged_words N - Maximum unchanged words when extracting edits. Default 2. 72 | --ignore_whitespace_casing - Ignore edits that only affect whitespace and casing. Default no. 73 | 74 | 75 | 2.1 System output format 76 | ======================== 77 | SYSTEM = File that contains the output of the error correction 78 | system. The sentences should be in tokenized plain text, sentence-per-line 79 | format. 80 | 81 | Format: 82 | 83 | 84 | ... 85 | 86 | Examples of tokenization: 87 | ------------------------- 88 | Original : He said, "We shouldn't go to the place. It'll kill one of us." 89 | Tokenized : He said , " We shouldn 't go to the place . It 'll kill one of us . " 90 | 91 | Sample output: 92 | -------------- 93 | ===> system <=== 94 | A cat sat on the mat . 95 | The Dog . 
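A minimal sketch of producing output in this format, assuming Python 2 (as for
the scorer itself) and that the bundled scripts/Tokenizer.py is importable; the
file names are placeholders:

from Tokenizer import PTBTokenizer   # shipped under m2scorer/scripts/

tokenizer = PTBTokenizer()
fout = open('system', 'w')
for line in open('corrected.txt'):   # one corrected sentence per line
    tokens = tokenizer.tokenize(line.decode('utf8').strip())
    fout.write(' '.join(tokens).encode('utf8') + '\n')
fout.close()

The same conversion can be run from the command line with
python scripts/Tokenizer.py < corrected.txt > system.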
96 | 97 | 98 | 2.2 Scorer's gold standard format 99 | ================================= 100 | SOURCE_GOLD = source sentences (i.e. input to the error correction 101 | system) and the gold annotation in TOKEN offsets (starting from zero). 102 | 103 | Format: 104 | S 105 | A |||||||||||||| 106 | A |||||||||||||| 107 | 108 | S 109 | A |||||||||||||| 110 | 111 | 112 | Notes: 113 | ------ 114 | - Each source sentence should appear on a single line starting with "S " 115 | - Each source sentence is followed by zero or more annotations. 116 | - Each annotation is on a separate line starting with "A ". 117 | - Sentences are separated by one or more empty lines. 118 | - The source sentences need to be tokenized in the same way as the system output. 119 | - Start and end offset for annotations are in token offsets (starting from zero). 120 | - The gold edits can include one or more possible correction strings. Multiple corrections should be separate by '||'. 121 | - The error type, required field and comment are not used for scoring at the moment. You can put dummy values there. 122 | - The annotator ID is used to identify a distinct annotation set by which system edits will be evaluated. 123 | - Each distinct annotation set, identified by an annotator ID, is an alternative 124 | - If one sentence has multiple annotator IDs, score will be computed for each annotator. 125 | - If one of the multiple annotation alternatives is no edit at all, an edit with type 'noop' or with offsets '-1 -1' must be specified. 126 | - The final score for the sentence will use the set of edits by an annotation set maximizing the score. 127 | 128 | 129 | Example: 130 | -------- 131 | ===> source_gold <=== 132 | S The cat sat at mat . 133 | A 3 4|||Prep|||on|||REQUIRED|||-NONE-|||0 134 | A 4 4|||ArtOrDet|||the||a|||REQUIRED|||-NONE-|||0 135 | 136 | S The dog . 137 | A 1 2|||NN|||dogs|||REQUIRED|||-NONE-|||0 138 | A -1 -1|||noop|||-NONE-|||-NONE-|||-NONE-|||1 139 | 140 | S Giant otters is an apex predator . 141 | A 2 3|||SVA|||are|||REQUIRED|||-NONE-|||0 142 | A 3 4|||ArtOrDet|||-NONE-|||REQUIRED|||-NONE-|||0 143 | A 5 6|||NN|||predators|||REQUIRED|||-NONE-|||0 144 | A 1 2|||NN|||otter|||REQUIRED|||-NONE-|||1 145 | 146 | 147 | 148 | ===> system <=== 149 | A cat sat on the mat . 150 | The dog . 151 | Giant otters are apex predator . 152 | 153 | ./m2scorer system source_gold 154 | Precision : 0.8 155 | Recall : 0.8 156 | F1 : 0.8 157 | 158 | For sentence #1, the system makes two valid edits {(at-> on), 159 | (\epsilon -> the)} and one unnecessary edit (The -> A). 160 | 161 | For sentence #2, despite missing one gold edit (dog -> dogs) according 162 | to annotation set 0, the system misses nothing according to set 1. 163 | 164 | For sentence #3, according to annotation set 0, the system makes two 165 | valid edits {(is -> are), (an -> \epsilon)} and misses one edit 166 | (predator -> predators); however according to set 1, the system makes 167 | two unnecessary edits {(is -> are), (an -> \epsilon)}. 168 | 169 | By the case above, there are four valid edits, one unnecessary edit 170 | and one missing edit. Therefore precision is 4/5 = 0.8. Similarly for 171 | recall and F1 measure. 172 | 173 | 174 | 3. 
Converting the CoNLL-2013 data format 175 | ======================================== 176 | The data format used in the M^2 scorer differs from the format used in 177 | the CoNLL-2013 shared task (http://www.comp.nus.edu.sg/~nlp/conll13st.html) 178 | in two aspects: 179 | - sentence-level edits 180 | - token edit offsets 181 | 182 | To convert source files and gold edits from the CoNLL-2013 format into 183 | the M^2 format, run the preprocessing script bundled with the CoNLL-2013 184 | training data. 185 | 186 | 187 | 4. Revision Note: Alternative Edits 188 | =================================== 189 | 190 | In this release, there is a major modification which enables scoring 191 | with multiple sets of gold edits. On every sentence, the system output 192 | will be scored against every available set of gold edits for the 193 | sentence and the edits set that maximizes the F1 score of the sentence 194 | is chosen. 195 | 196 | This modification was carried out by Christian Hadiwinoto, 2013. 197 | 198 | 199 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/example/README: -------------------------------------------------------------------------------- 1 | (execute these examples from the m2scorer top-level directory) 2 | 3 | 4 | ./m2scorer example/system_output.txt example/source_gold 5 | 6 | 7 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/example/source_gold: -------------------------------------------------------------------------------- 1 | S The cat sat at mat . 2 | A 3 4|||Prep|||on|||REQUIRED|||-NONE-|||0 3 | A 4 4|||ArtOrDet|||the||a|||REQUIRED|||-NONE-|||0 4 | 5 | S The dog . 6 | A 1 2|||NN|||dogs|||REQUIRED|||-NONE-|||0 7 | A -1 -1|||noop|||-NONE-|||-NONE-|||-NONE-|||1 8 | 9 | S Giant otters is an apex predator . 10 | A 2 3|||SVA|||are|||REQUIRED|||-NONE-|||0 11 | A 3 4|||ArtOrDet|||-NONE-|||REQUIRED|||-NONE-|||0 12 | A 5 6|||NN|||predators|||REQUIRED|||-NONE-|||0 13 | A 1 2|||NN|||otter|||REQUIRED|||-NONE-|||1 14 | 15 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/example/system: -------------------------------------------------------------------------------- 1 | A cat sat on the mat . 2 | The dog . 3 | Giant otters are apex predator . 4 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/m2scorer: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This file is part of the NUS M2 scorer. 4 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | 9 | # The NUS M2 scorer is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 
16 | 17 | # file: m2scorer.py 18 | # 19 | # score a system's output against a gold reference 20 | # 21 | # Usage: m2scorer.py [OPTIONS] proposed_sentences source_gold 22 | # where 23 | # proposed_sentences - system output, sentence per line 24 | # source_gold - source sentences with gold token edits 25 | # OPTIONS 26 | # -v --verbose - print verbose output 27 | # --very_verbose - print lots of verbose output 28 | # --max_unchanged_words N - Maximum unchanged words when extracting edits. Default 2." 29 | # --ignore_whitespace_casing - Ignore edits that only affect whitespace and caseing. Default no." 30 | # 31 | 32 | import sys 33 | import levenshtein 34 | from getopt import getopt 35 | from util import paragraphs 36 | from util import smart_open 37 | 38 | 39 | 40 | def load_annotation(gold_file): 41 | source_sentences = [] 42 | gold_edits = [] 43 | fgold = smart_open(gold_file, 'r') 44 | puffer = fgold.read() 45 | fgold.close() 46 | puffer = puffer.decode('utf8') 47 | for item in paragraphs(puffer.splitlines(True)): 48 | item = item.splitlines(False) 49 | sentence = [line[2:].strip() for line in item if line.startswith('S ')] 50 | assert sentence != [] 51 | annotations = {} 52 | for line in item[1:]: 53 | if line.startswith('I ') or line.startswith('S '): 54 | continue 55 | assert line.startswith('A ') 56 | line = line[2:] 57 | fields = line.split('|||') 58 | start_offset = int(fields[0].split()[0]) 59 | end_offset = int(fields[0].split()[1]) 60 | etype = fields[1] 61 | if etype == 'noop': 62 | start_offset = -1 63 | end_offset = -1 64 | corrections = [c.strip() if c != '-NONE-' else '' for c in fields[2].split('||')] 65 | # NOTE: start and end are *token* offsets 66 | original = ' '.join(' '.join(sentence).split()[start_offset:end_offset]) 67 | annotator = int(fields[5]) 68 | if annotator not in annotations.keys(): 69 | annotations[annotator] = [] 70 | annotations[annotator].append((start_offset, end_offset, original, corrections)) 71 | tok_offset = 0 72 | for this_sentence in sentence: 73 | tok_offset += len(this_sentence.split()) 74 | source_sentences.append(this_sentence) 75 | this_edits = {} 76 | for annotator, annotation in annotations.iteritems(): 77 | this_edits[annotator] = [edit for edit in annotation if edit[0] <= tok_offset and edit[1] <= tok_offset and edit[0] >= 0 and edit[1] >= 0] 78 | if len(this_edits) == 0: 79 | this_edits[0] = [] 80 | gold_edits.append(this_edits) 81 | return (source_sentences, gold_edits) 82 | 83 | 84 | def print_usage(): 85 | print >> sys.stderr, "Usage: m2scorer.py [OPTIONS] proposed_sentences gold_source" 86 | print >> sys.stderr, "where" 87 | print >> sys.stderr, " proposed_sentences - system output, sentence per line" 88 | print >> sys.stderr, " source_gold - source sentences with gold token edits" 89 | print >> sys.stderr, "OPTIONS" 90 | print >> sys.stderr, " -v --verbose - print verbose output" 91 | print >> sys.stderr, " --very_verbose - print lots of verbose output" 92 | print >> sys.stderr, " --max_unchanged_words N - Maximum unchanged words when extraction edit. Default 2." 93 | print >> sys.stderr, " --ignore_whitespace_casing - Ignore edits that only affect whitespace and caseing. Default no." 
94 | 95 | 96 | 97 | max_unchanged_words=2 98 | ignore_whitespace_casing= False 99 | verbose = False 100 | very_verbose = False 101 | opts, args = getopt(sys.argv[1:], "v", ["max_unchanged_words=", "verbose", "ignore_whitespace_casing", "very_verbose"]) 102 | for o, v in opts: 103 | if o in ('-v', '--verbose'): 104 | verbose = True 105 | elif o == '--very_verbose': 106 | very_verbose = True 107 | elif o == '--max_unchanged_words': 108 | max_unchanged_words = int(v) 109 | elif o == '--ignore_whitespace_casing': 110 | ignore_whitespace_casing = True 111 | else: 112 | print >> sys.stderr, "Unknown option :", o 113 | print_usage() 114 | sys.exit(-1) 115 | 116 | # starting point 117 | if len(args) != 2: 118 | print_usage() 119 | sys.exit(-1) 120 | 121 | system_file = args[0] 122 | gold_file = args[1] 123 | 124 | # load source sentences and gold edits 125 | source_sentences, gold_edits = load_annotation(gold_file) 126 | 127 | # load system hypotheses 128 | fin = smart_open(system_file, 'r') 129 | system_sentences = [line.decode("utf8").strip() for line in fin.readlines()] 130 | fin.close() 131 | 132 | p, r, f1 = levenshtein.batch_multi_pre_rec_f1(system_sentences, source_sentences, gold_edits, max_unchanged_words, ignore_whitespace_casing, verbose, very_verbose) 133 | 134 | print "Precision : %.4f" % p 135 | print "Recall : %.4f" % r 136 | print "F1 : %.4f" % f1 137 | 138 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/scripts/Tokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: iso-8859-15 -*- 3 | 4 | # This file is part of the NUS M2 scorer. 5 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | 10 | # The NUS M2 scorer is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | 15 | # You should have received a copy of the GNU General Public License 16 | # along with this program. If not, see . 17 | 18 | # file: Tokenizer.py 19 | # 20 | # A Penn Treebank tokenizer reimplemented based on the MOSES implementation. 
21 | # 22 | # usage : %prog < input > output 23 | 24 | 25 | import re 26 | import sys 27 | 28 | 29 | class DummyTokenizer(object): 30 | 31 | def tokenize(self, text): 32 | return text.split() 33 | 34 | 35 | 36 | class PTBTokenizer(object): 37 | 38 | def __init__(self, language="en"): 39 | self.language = language 40 | self.nonbreaking_prefixes = {} 41 | self.nonbreaking_prefixes_numeric = {} 42 | self.nonbreaking_prefixes["en"] = ''' A B C D E F G H I J K L M N O P Q R S T U V W X Y Z 43 | Adj Adm Adv Asst Bart Bldg Brig Bros Capt Cmdr Col Comdr Con Corp Cpl DR Dr Drs Ens 44 | Gen Gov Hon Hr Hosp Insp Lt MM MR MRS MS Maj Messrs Mlle Mme Mr Mrs Ms Msgr Op Ord 45 | Pfc Ph Prof Pvt Rep Reps Res Rev Rt Sen Sens Sfc Sgt Sr St Supt Surg 46 | v vs i.e rev e.g Nos Nr'''.split() 47 | self.nonbreaking_prefixes_numeric["en"] = '''No Art pp'''.split() 48 | self.special_chars = re.compile(r"([^\w\s\.\'\`\,\-\"\|\/])", flags=re.UNICODE) 49 | 50 | def tokenize(self, text, ptb=False): 51 | text = text.strip() 52 | text = " " + text + " " 53 | 54 | # Separate all "other" punctuation 55 | 56 | text = re.sub(self.special_chars, r' \1 ', text) 57 | text = re.sub(r";", r' ; ', text) 58 | text = re.sub(r":", r' : ', text) 59 | 60 | # replace the pipe character 61 | text = re.sub(r"\|", r' -PIPE- ', text) 62 | 63 | # split internal slash, keep others 64 | text = re.sub(r"(\S)/(\S)", r'\1 / \2', text) 65 | 66 | # PTB tokenization 67 | if ptb: 68 | text = re.sub(r"\(", r' -LRB- ', text) 69 | text = re.sub(r"\)", r' -RRB- ', text) 70 | text = re.sub(r"\[", r' -LSB- ', text) 71 | text = re.sub(r"\]", r' -RSB- ', text) 72 | text = re.sub(r"\{", r' -LCB- ', text) 73 | text = re.sub(r"\}", r' -RCB- ', text) 74 | 75 | text = re.sub(r"\"\s*$", r" '' ", text) 76 | text = re.sub(r"^\s*\"", r' `` ', text) 77 | text = re.sub(r"(\S)\"\s", r"\1 '' ", text) 78 | text = re.sub(r"\s\"(\S)", r" `` \1", text) 79 | text = re.sub(r"(\S)\"", r"\1 '' ", text) 80 | text = re.sub(r"\"(\S)", r" `` \1", text) 81 | text = re.sub(r"'\s*$", r" ' ", text) 82 | text = re.sub(r"^\s*'", r" ` ", text) 83 | text = re.sub(r"(\S)'\s", r"\1 ' ", text) 84 | text = re.sub(r"\s'(\S)", r" ` \1", text) 85 | 86 | text = re.sub(r"'ll", r" -CONTRACT-ll", text) 87 | text = re.sub(r"'re", r" -CONTRACT-re", text) 88 | text = re.sub(r"'ve", r" -CONTRACT-ve", text) 89 | text = re.sub(r"n't", r" n-CONTRACT-t", text) 90 | text = re.sub(r"'LL", r" -CONTRACT-LL", text) 91 | text = re.sub(r"'RE", r" -CONTRACT-RE", text) 92 | text = re.sub(r"'VE", r" -CONTRACT-VE", text) 93 | text = re.sub(r"N'T", r" N-CONTRACT-T", text) 94 | text = re.sub(r"cannot", r"can not", text) 95 | text = re.sub(r"Cannot", r"Can not", text) 96 | 97 | # multidots stay together 98 | text = re.sub(r"\.([\.]+)", r" DOTMULTI\1", text) 99 | while re.search("DOTMULTI\.", text): 100 | text = re.sub(r"DOTMULTI\.([^\.])", r"DOTDOTMULTI \1", text) 101 | text = re.sub(r"DOTMULTI\.", r"DOTDOTMULTI", text) 102 | 103 | # multidashes stay together 104 | text = re.sub(r"\-([\-]+)", r" DASHMULTI\1", text) 105 | while re.search("DASHMULTI\-", text): 106 | text = re.sub(r"DASHMULTI\-([^\-])", r"DASHDASHMULTI \1", text) 107 | text = re.sub(r"DASHMULTI\-", r"DASHDASHMULTI", text) 108 | 109 | # Separate ',' except if within number. 110 | text = re.sub(r"(\D),(\D)", r'\1 , \2', text) 111 | # Separate ',' pre and post number. 
112 | text = re.sub(r"(\d),(\D)", r'\1 , \2', text) 113 | text = re.sub(r"(\D),(\d)", r'\1 , \2', text) 114 | 115 | if self.language == "en": 116 | text = re.sub(r"([^a-zA-Z])'([^a-zA-Z])", r"\1 ' \2", text) 117 | text = re.sub(r"(\W)'([a-zA-Z])", r"\1 ' \2", text) 118 | text = re.sub(r"([a-zA-Z])'([^a-zA-Z])", r"\1 ' \2", text) 119 | text = re.sub(r"([a-zA-Z])'([a-zA-Z])", r"\1 '\2", text) 120 | text = re.sub(r"(\d)'(s)", r"\1 '\2", text) 121 | text = re.sub(r" '\s+s ", r" 's ", text) 122 | text = re.sub(r" '\s+s ", r" 's ", text) 123 | elif self.language == "fr": 124 | text = re.sub(r"([^a-zA-Z])'([^a-zA-Z])", r"\1 ' \2", text) 125 | text = re.sub(r"([^a-zA-Z])'([a-zA-Z])", r"\1 ' \2", text) 126 | text = re.sub(r"([a-zA-Z])'([^a-zA-Z])", r"\1 ' \2", text) 127 | text = re.sub(r"([a-zA-Z])'([a-zA-Z])", r"\1' \2", text) 128 | else: 129 | text = re.sub(r"'", r" ' ") 130 | 131 | # re-combine single quotes 132 | text = re.sub(r"' '", r"''", text) 133 | 134 | words = text.split() 135 | text = '' 136 | for i, word in enumerate(words): 137 | m = re.match("^(\S+)\.$", word) 138 | if m: 139 | pre = m.group(1) 140 | if ((re.search("\.", pre) and re.search("[a-zA-Z]", pre)) or \ 141 | (pre in self.nonbreaking_prefixes[self.language]) or \ 142 | ((i < len(words)-1) and re.match("^\d+", words[i+1]))): 143 | pass # do nothing 144 | elif ((pre in self.nonbreaking_prefixes_numeric[self.language] ) and \ 145 | (i < len(words)-1) and re.match("\d+", words[i+1])): 146 | pass # do nothing 147 | else: 148 | word = pre + " ." 149 | 150 | text += word + " " 151 | text = re.sub(r"'\s+'", r"''", text) 152 | 153 | # restore multidots 154 | while re.search("DOTDOTMULTI", text): 155 | text = re.sub(r"DOTDOTMULTI", r"DOTMULTI.", text) 156 | text = re.sub(r"DOTMULTI", r".", text) 157 | 158 | # restore multidashes 159 | while re.search("DASHDASHMULTI", text): 160 | text = re.sub(r"DASHDASHMULTI", r"DASHMULTI-", text) 161 | text = re.sub(r"DASHMULTI", r"-", text) 162 | text = re.sub(r"-CONTRACT-", r"'", text) 163 | 164 | return text.split() 165 | 166 | 167 | def tokenize_all(self,sentences, ptb=False): 168 | return [self.tokenize(t, ptb) for t in sentences] 169 | 170 | # starting point 171 | if __name__ == "__main__": 172 | tokenizer = PTBTokenizer() 173 | for line in sys.stdin: 174 | line = line.decode("utf8") 175 | tokens = tokenizer.tokenize(line.strip()) 176 | out = ' '.join(tokens) 177 | print out.encode("utf8") 178 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/scripts/combiner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This file is part of the NUS M2 scorer. 4 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | 9 | # The NUS M2 scorer is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 
16 | 17 | # file: m2scorer.py 18 | # 19 | # score a system's output against a gold reference 20 | # 21 | # Usage: m2scorer.py [OPTIONS] proposed_sentences source_gold 22 | # where 23 | # proposed_sentences - system output, sentence per line 24 | # source_gold - source sentences with gold token edits 25 | # OPTIONS 26 | # -v --verbose - print verbose output 27 | # --very_verbose - print lots of verbose output 28 | # --max_unchanged_words N - Maximum unchanged words when extracting edits. Default 2." 29 | # --ignore_whitespace_casing - Ignore edits that only affect whitespace and caseing. Default no." 30 | # 31 | 32 | import sys 33 | import levenshtein 34 | from getopt import getopt 35 | from util import paragraphs 36 | from util import smart_open 37 | 38 | 39 | 40 | def load_annotation(gold_file): 41 | source_sentences = [] 42 | gold_edits = [] 43 | fgold = smart_open(gold_file, 'r') 44 | puffer = fgold.read() 45 | fgold.close() 46 | puffer = puffer.decode('utf8') 47 | for item in paragraphs(puffer.splitlines(True)): 48 | item = item.splitlines(False) 49 | sentence = [line[2:].strip() for line in item if line.startswith('S ')] 50 | assert sentence != [] 51 | annotations = {} 52 | for line in item[1:]: 53 | if line.startswith('I ') or line.startswith('S '): 54 | continue 55 | assert line.startswith('A ') 56 | line = line[2:] 57 | fields = line.split('|||') 58 | start_offset = int(fields[0].split()[0]) 59 | end_offset = int(fields[0].split()[1]) 60 | etype = fields[1] 61 | if etype == 'noop': 62 | start_offset = -1 63 | end_offset = -1 64 | corrections = [c.strip() if c != '-NONE-' else '' for c in fields[2].split('||')] 65 | # NOTE: start and end are *token* offsets 66 | original = ' '.join(' '.join(sentence).split()[start_offset:end_offset]) 67 | annotator = int(fields[5]) 68 | if annotator not in annotations.keys(): 69 | annotations[annotator] = [] 70 | annotations[annotator].append((start_offset, end_offset, original, corrections)) 71 | tok_offset = 0 72 | for this_sentence in sentence: 73 | tok_offset += len(this_sentence.split()) 74 | source_sentences.append(this_sentence) 75 | this_edits = {} 76 | for annotator, annotation in annotations.iteritems(): 77 | this_edits[annotator] = [edit for edit in annotation if edit[0] <= tok_offset and edit[1] <= tok_offset and edit[0] >= 0 and edit[1] >= 0] 78 | if len(this_edits) == 0: 79 | this_edits[0] = [] 80 | gold_edits.append(this_edits) 81 | return (source_sentences, gold_edits) 82 | 83 | 84 | def print_usage(): 85 | print >> sys.stderr, "Usage: m2scorer.py [OPTIONS] proposed_sentences gold_source" 86 | print >> sys.stderr, "where" 87 | print >> sys.stderr, " proposed_sentences - system output, sentence per line" 88 | print >> sys.stderr, " source_gold - source sentences with gold token edits" 89 | print >> sys.stderr, "OPTIONS" 90 | print >> sys.stderr, " -v --verbose - print verbose output" 91 | print >> sys.stderr, " --very_verbose - print lots of verbose output" 92 | print >> sys.stderr, " --max_unchanged_words N - Maximum unchanged words when extraction edit. Default 2." 93 | print >> sys.stderr, " --ignore_whitespace_casing - Ignore edits that only affect whitespace and caseing. Default no." 
94 | 95 | 96 | 97 | max_unchanged_words=2 98 | ignore_whitespace_casing= False 99 | verbose = False 100 | very_verbose = False 101 | opts, args = getopt(sys.argv[1:], "v", ["max_unchanged_words=", "verbose", "ignore_whitespace_casing", "very_verbose"]) 102 | for o, v in opts: 103 | if o in ('-v', '--verbose'): 104 | verbose = True 105 | elif o == '--very_verbose': 106 | very_verbose = True 107 | elif o == '--max_unchanged_words': 108 | max_unchanged_words = int(v) 109 | elif o == '--ignore_whitespace_casing': 110 | ignore_whitespace_casing = True 111 | else: 112 | print >> sys.stderr, "Unknown option :", o 113 | print_usage() 114 | sys.exit(-1) 115 | 116 | 117 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/scripts/convert_hoo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This file is part of the NUS M2 scorer. 4 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | 9 | # The NUS M2 scorer is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 16 | 17 | # file: convert_hoo.py 18 | # 19 | # convert source xml file and gold annotation to 20 | # merged file with sentence-per-line sentences and 21 | # annotation. 
22 | # 23 | # usage : %prog [-p] source.xml [gold.xml] > output 24 | 25 | from Tokenizer import PTBTokenizer 26 | import xml.dom.minidom 27 | import sys 28 | import re 29 | import getopt 30 | from util import fix_cp1252codes 31 | 32 | 33 | ## global variables 34 | tokenizer = PTBTokenizer() 35 | 36 | def slice_paragraph(text): 37 | yield (0,len(text),text) 38 | def slice_tokenize(text): 39 | import nltk 40 | sentence_spliter = nltk.data.load('tokenizers/punkt/english.pickle') 41 | last_break = 0 42 | for match in sentence_spliter._lang_vars.period_context_re().finditer(text): 43 | context = match.group() + match.group('after_tok') 44 | if sentence_spliter.text_contains_sentbreak(context): 45 | yield (last_break, match.end(), text[last_break:match.end()]) 46 | if match.group('next_tok'): 47 | # next sentence starts after whitespace 48 | last_break = match.start('next_tok') 49 | else: 50 | # next sentence starts at following punctuation 51 | last_break = match.end() 52 | yield (last_break, len(text), text[last_break:len(text)]) 53 | 54 | def get_text(node): 55 | # get text data from xml tag 56 | buffer = '' 57 | for t in node.childNodes: 58 | if t.nodeType == t.TEXT_NODE: 59 | buffer += t.data 60 | return buffer 61 | 62 | def has_empty(node): 63 | # check if node has tag child 64 | return len(node.getElementsByTagName('empty')) > 0 65 | 66 | def get_textbody(sdom): 67 | parts = [] 68 | for b in sdom.getElementsByTagName('BODY'): 69 | for pa in b.getElementsByTagName('PART'): 70 | part_id = pa.attributes["id"].value 71 | buffer = [] 72 | for p in pa.getElementsByTagName('P'): 73 | buffer.append(get_text(p)) 74 | parts.append((buffer, part_id)) 75 | return parts 76 | 77 | def get_edits(gdom): 78 | edits = [] 79 | for es in gdom.getElementsByTagName('edits'): 80 | for e in es.getElementsByTagName('edit'): 81 | start = int(e.attributes["start"].value) 82 | end = int(e.attributes["end"].value) 83 | part = e.attributes["part"].value 84 | etype = e.attributes["type"].value 85 | o = e.getElementsByTagName('original')[0] 86 | if len(o.getElementsByTagName('empty')) > 0: 87 | original = '' 88 | else: 89 | original = get_text(o).strip() 90 | corrections = [] 91 | optional = False 92 | for cs in e.getElementsByTagName('corrections'): 93 | for c in cs.getElementsByTagName('correction'): 94 | if len(c.getElementsByTagName('empty')) > 0: 95 | corrections.append('') 96 | else: 97 | correction = get_text(c).strip() 98 | if correction == '': 99 | optional = True 100 | else: 101 | corrections.append(correction) 102 | edits.append([start, end, part, etype, original, corrections, optional]) 103 | return edits 104 | 105 | 106 | # starting point 107 | if __name__ == "__main__": 108 | opts, args = getopt.getopt(sys.argv[1:], "p") 109 | paragraph = False 110 | for o,a in opts : 111 | if o == "-p" : 112 | paragraph = True 113 | 114 | if len(args) < 1 or len(args) > 2: 115 | print >> sys.stderr, "usage: %prog [-p] source.xml [gold.xml] > output" 116 | sys.exit(-1) 117 | fsource = args[0] 118 | gold = 0 119 | if len(args) == 2: 120 | fgold = args[1] 121 | gold = 1 122 | 123 | 124 | # parse xml files 125 | source_dom = xml.dom.minidom.parse(fsource) 126 | if gold : 127 | gold_dom = xml.dom.minidom.parse(fgold) 128 | 129 | 130 | # read the xml 131 | parts = get_textbody(source_dom) 132 | if gold : 133 | edits = get_edits(gold_dom) 134 | 135 | # sentence split 136 | slice = slice_tokenize 137 | if paragraph : 138 | slice = slice_paragraph 139 | for part, part_no in parts: 140 | offset = 0 141 | for p in part: 142 | for 
s_start, s_end, s in slice(p): 143 | if s.strip() == '': 144 | continue 145 | print "S", s.encode('utf8') 146 | if gold : 147 | this_edits = [e for e in edits if e[0] >= offset + s_start 148 | and e[1] < offset + s_end and e[2] == part_no] 149 | for e in this_edits: 150 | start = e[0] - (offset + s_start) 151 | end = e[1] - (offset + s_start) 152 | etype = e[3] 153 | cor = "||".join(e[5]) 154 | req = "REQUIRED" if e[6] == False else "OPTIONAL" 155 | out = "A %d %d|||%s|||%s|||%s|||-NONE-|||0" % (start, end, etype, cor, req) 156 | print out.encode('utf8') 157 | print "" 158 | offset += s_end 159 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/scripts/convert_nucle.py: -------------------------------------------------------------------------------- 1 | # convert_nucle.py 2 | # 3 | # Author: Christian Hadiwinoto 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Contact: chrhad@comp.nus.edu.sg 7 | # 8 | # Version: 1.0 9 | # 10 | # Original: Yuanbin Wu 11 | # National University of Singapore (NUS) 12 | # Contact: wuyb@comp.nus.edu.sg 13 | # 14 | # This script is distributed to support the CoNLL-2013 Shared Task. 15 | # It is free for research and educational purposes. 16 | # 17 | # Usage: python convert_nucle.py sgmlFile > m2File 18 | 19 | from nuclesgmlparser import nuclesgmlparser 20 | from nucle_doc import * 21 | import nltk.data 22 | import re 23 | import sys 24 | import getopt 25 | 26 | class PreProcessor: 27 | 28 | def __init__(self): 29 | 30 | self.sentenceTokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 31 | self.sentenceDumpedFile = 'sentence_file' 32 | self.docsDumpedFileName = 'docs' 33 | 34 | def readNUCLE(self, fn): 35 | 36 | f = open(fn, 'r') 37 | parser = nuclesgmlparser() 38 | filestr = f.read() 39 | filestr = filestr.decode('utf-8') 40 | 41 | #Fix Reference tag 42 | p = re.compile(r'(\n

\n.*\n)

') 43 | filestr = p.sub(r'\1

', filestr) 44 | 45 | parser.feed(filestr) 46 | f.close() 47 | parser.close() 48 | 49 | return parser.docs 50 | 51 | 52 | def sentenceSplit(self, docs): 53 | 54 | for doc in docs: 55 | for par in doc.paragraphs: 56 | doc.sentences.append([]) 57 | for s in self.sentenceTokenizer.tokenize(par): 58 | doc.buildSentence(s, [], '', [], []) 59 | return docs 60 | 61 | 62 | def m2FileGeneration(self, docs): 63 | 64 | for doc in docs: 65 | for slistIndex in xrange(len(doc.sentences)): 66 | slist = doc.sentences[slistIndex] 67 | for sentid in xrange(len(slist)): 68 | 69 | sent = slist[sentid] 70 | 71 | # annotation string list 72 | annotationList = [] 73 | 74 | # m2 format annotation string list 75 | m2AnnotationList = [] 76 | 77 | # build colums 78 | table = sent.getConllFormat(doc, slistIndex, sentid) 79 | tokenizedSentStr = ' '.join(sent.getWords()) 80 | 81 | #Add annotation info 82 | sentoffset = doc.paragraphs[slistIndex].index(sent.sentstr) 83 | for m in doc.mistakes: 84 | 85 | if m['start_par'] != slistIndex or \ 86 | m['start_par'] != m['end_par'] or \ 87 | m['start_off'] < sentoffset or \ 88 | m['start_off'] >= sentoffset + len(sent.sentstr) or \ 89 | m['end_off'] sentoffset + len(sent.sentstr): 91 | continue 92 | 93 | wordsoffset = 0 94 | wdstart = 0 95 | 96 | startInWord = 0 97 | headText = '' 98 | endInWord = 0 99 | tailText = '' 100 | 101 | words = sent.getWords() 102 | while wdstart < len(words): 103 | 104 | word = words[wdstart] 105 | nextstart = sent.sentstr.find(word, wordsoffset) 106 | 107 | if nextstart == -1: 108 | # may not find word, due to relpacement 109 | print >> sys.stderr, "Warning: can not find word" 110 | print >> sys.stderr, word.encode('utf-8') 111 | wordsoffset += 1 112 | else: 113 | wordsoffset = nextstart 114 | 115 | if wordsoffset >= m['start_off']-sentoffset: 116 | break 117 | elif wordsoffset + len(word) > m['start_off']-sentoffset: 118 | # annotation starts at the middle of a word 119 | startInWord = 1 120 | headText = sent.sentstr[wordsoffset: m['start_off']-sentoffset] 121 | break 122 | 123 | wordsoffset += len(word) 124 | wdstart += 1 125 | 126 | if wdstart == len(words): 127 | print >> sys.stderr, 'Warning in building conll format: start_off overflow' 128 | print >> sys.stderr, m, sent.sentstr.encode('utf-8') 129 | continue 130 | 131 | wdend = wdstart 132 | while wdend < len(words): 133 | 134 | word = words[wdend] 135 | 136 | nextstart = sent.sentstr.find(word, wordsoffset) 137 | 138 | if nextstart == -1: 139 | print >> sys.stderr, "Warning in building conll format: can not find word" 140 | print >> sys.stderr, word.encode('utf-8') 141 | wordsoffset += 1 142 | else: 143 | wordsoffset = nextstart 144 | 145 | if wordsoffset >= m['end_off']-sentoffset: 146 | # annotation ends at the middle of a word 147 | if wordsoffset - len(word) < m['end_off']-sentoffset: 148 | endInWord = 1 149 | tailText = sent.sentstr[m['end_off']-sentoffset : wordsoffset].strip() 150 | break 151 | 152 | wordsoffset += len(word) 153 | wdend += 1 154 | 155 | # build annotation string for .conll.m2 file 156 | m2AnnotationStr = 'A ' 157 | m2AnnotationStr += str(wdstart) + ' ' 158 | m2AnnotationStr += str(wdend) + '|||' 159 | m2AnnotationStr += m['type'] + '|||' 160 | m2AnnotationStr += m['correction'].replace('\n', '') + '|||' 161 | m2AnnotationStr += 'REQUIRED|||-NONE-|||0\n' 162 | 163 | m2AnnotationList.append(m2AnnotationStr) 164 | 165 | # write .conll.m2 file 166 | if len(m2AnnotationList) != 0: 167 | m2AnnotationSent = 'S ' + tokenizedSentStr + '\n' 168 | m2AnnotationSent += 
''.join(m2AnnotationList) + '\n' 169 | sys.stdout.write(m2AnnotationSent.encode('utf-8')) 170 | 171 | 172 | def usage_release(): 173 | print '\nUsage: python preprocess_nmt.py sgmlFile > outputFile \n\n' 174 | 175 | if __name__ == '__main__': 176 | opts, args = getopt.getopt(sys.argv[1:], "") 177 | 178 | if len(args) != 1: 179 | usage_release() 180 | sys.exit(2) 181 | 182 | ppr = PreProcessor() 183 | debug = False 184 | 185 | sgmlFileName = args[0] 186 | 187 | docs = ppr.sentenceSplit(ppr.readNUCLE(sgmlFileName)) 188 | ppr.m2FileGeneration(docs) 189 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/scripts/m2scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This file is part of the NUS M2 scorer. 4 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | 9 | # The NUS M2 scorer is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 16 | 17 | # file: m2scorer.py 18 | # 19 | # score a system's output against a gold reference 20 | # 21 | # Usage: m2scorer.py [OPTIONS] proposed_sentences source_gold 22 | # where 23 | # proposed_sentences - system output, sentence per line 24 | # source_gold - source sentences with gold token edits 25 | # OPTIONS 26 | # -v --verbose - print verbose output 27 | # --very_verbose - print lots of verbose output 28 | # --max_unchanged_words N - Maximum unchanged words when extracting edits. Default 2." 29 | # --ignore_whitespace_casing - Ignore edits that only affect whitespace and caseing. Default no." 
30 | # 31 | 32 | import sys 33 | import levenshtein 34 | from getopt import getopt 35 | from util import paragraphs 36 | from util import smart_open 37 | 38 | 39 | 40 | def load_annotation(gold_file): 41 | source_sentences = [] 42 | gold_edits = [] 43 | fgold = smart_open(gold_file, 'r') 44 | puffer = fgold.read() 45 | fgold.close() 46 | puffer = puffer.decode('utf8') 47 | for item in paragraphs(puffer.splitlines(True)): 48 | item = item.splitlines(False) 49 | sentence = [line[2:].strip() for line in item if line.startswith('S ')] 50 | assert sentence != [] 51 | annotations = {} 52 | for line in item[1:]: 53 | if line.startswith('I ') or line.startswith('S '): 54 | continue 55 | assert line.startswith('A ') 56 | line = line[2:] 57 | fields = line.split('|||') 58 | start_offset = int(fields[0].split()[0]) 59 | end_offset = int(fields[0].split()[1]) 60 | etype = fields[1] 61 | if etype == 'noop': 62 | start_offset = -1 63 | end_offset = -1 64 | corrections = [c.strip() if c != '-NONE-' else '' for c in fields[2].split('||')] 65 | # NOTE: start and end are *token* offsets 66 | original = ' '.join(' '.join(sentence).split()[start_offset:end_offset]) 67 | annotator = int(fields[5]) 68 | if annotator not in annotations.keys(): 69 | annotations[annotator] = [] 70 | annotations[annotator].append((start_offset, end_offset, original, corrections)) 71 | tok_offset = 0 72 | for this_sentence in sentence: 73 | tok_offset += len(this_sentence.split()) 74 | source_sentences.append(this_sentence) 75 | this_edits = {} 76 | for annotator, annotation in annotations.iteritems(): 77 | this_edits[annotator] = [edit for edit in annotation if edit[0] <= tok_offset and edit[1] <= tok_offset and edit[0] >= 0 and edit[1] >= 0] 78 | if len(this_edits) == 0: 79 | this_edits[0] = [] 80 | gold_edits.append(this_edits) 81 | return (source_sentences, gold_edits) 82 | 83 | 84 | def print_usage(): 85 | print >> sys.stderr, "Usage: m2scorer.py [OPTIONS] proposed_sentences gold_source" 86 | print >> sys.stderr, "where" 87 | print >> sys.stderr, " proposed_sentences - system output, sentence per line" 88 | print >> sys.stderr, " source_gold - source sentences with gold token edits" 89 | print >> sys.stderr, "OPTIONS" 90 | print >> sys.stderr, " -v --verbose - print verbose output" 91 | print >> sys.stderr, " --very_verbose - print lots of verbose output" 92 | print >> sys.stderr, " --max_unchanged_words N - Maximum unchanged words when extraction edit. Default 2." 93 | print >> sys.stderr, " --ignore_whitespace_casing - Ignore edits that only affect whitespace and caseing. Default no." 
94 | 95 | 96 | 97 | max_unchanged_words=2 98 | ignore_whitespace_casing= False 99 | verbose = False 100 | very_verbose = False 101 | opts, args = getopt(sys.argv[1:], "v", ["max_unchanged_words=", "verbose", "ignore_whitespace_casing", "very_verbose"]) 102 | for o, v in opts: 103 | if o in ('-v', '--verbose'): 104 | verbose = True 105 | elif o == '--very_verbose': 106 | very_verbose = True 107 | elif o == '--max_unchanged_words': 108 | max_unchanged_words = int(v) 109 | elif o == '--ignore_whitespace_casing': 110 | ignore_whitespace_casing = True 111 | else: 112 | print >> sys.stderr, "Unknown option :", o 113 | print_usage() 114 | sys.exit(-1) 115 | 116 | # starting point 117 | if len(args) != 2: 118 | print_usage() 119 | sys.exit(-1) 120 | 121 | system_file = args[0] 122 | gold_file = args[1] 123 | 124 | # load source sentences and gold edits 125 | source_sentences, gold_edits = load_annotation(gold_file) 126 | 127 | # load system hypotheses 128 | fin = smart_open(system_file, 'r') 129 | system_sentences = [line.decode("utf8").strip() for line in fin.readlines()] 130 | fin.close() 131 | 132 | p, r, f1 = levenshtein.batch_multi_pre_rec_f1(system_sentences, source_sentences, gold_edits, max_unchanged_words, ignore_whitespace_casing, verbose, very_verbose) 133 | 134 | print "Precision : %.4f" % p 135 | print "Recall : %.4f" % r 136 | print "F1 : %.4f" % f1 137 | 138 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/scripts/nucle_doc.py: -------------------------------------------------------------------------------- 1 | # nucle_doc.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 
12 | 13 | import os 14 | import sys 15 | from nltk import word_tokenize 16 | 17 | class nucle_doc: 18 | def __init__(self): 19 | self.docattrs = None 20 | 21 | self.matric = '' 22 | self.email = '' 23 | self.nationality = '' 24 | self.firstLanguage = '' 25 | self.schoolLanguage = '' 26 | self.englishTests = '' 27 | 28 | self.paragraphs = [] 29 | self.annotation = [] 30 | self.mistakes = [] 31 | 32 | self.sentences = [] 33 | 34 | def buildSentence(self, sentstr, dpnode, constituentstr, poslist, chunklist): 35 | self.sentences[-1].append(nucle_sent(sentstr, dpnode, constituentstr, poslist, chunklist)) 36 | 37 | def addSentence(self, sent): 38 | self.sentences[-1].append(sent) 39 | 40 | def findMistake(self, par, pos): 41 | for m in self.mistakes: 42 | if par == m['start_par'] and pos >= m['start_off'] and pos < m['end_off']: 43 | return m 44 | return None 45 | 46 | 47 | class nucle_sent: 48 | def __init__(self, sentstr, dpnode, constituentstr, poslist, chunklist): 49 | self.sentstr = sentstr 50 | self.words = word_tokenize(sentstr) 51 | self.dpnodes = dpnode 52 | self.constituentstr = constituentstr 53 | self.constituentlist = [] 54 | self.poslist = poslist 55 | self.chunklist = chunklist 56 | 57 | def buildConstituentList(self): 58 | 59 | s = self.constituentstr.strip().replace('\n', '').replace(' ', '') 60 | r = [] 61 | i = 0 62 | while i < len(s): 63 | j = i 64 | while j < len(s) and s[j] != ')': 65 | j += 1 66 | k = j 67 | while k < len(s) and s[k] == ')': 68 | k += 1 69 | 70 | nodeWholeStr = s[i:k] 71 | lastLRBIndex = nodeWholeStr.rfind('(') 72 | nodeStr = nodeWholeStr[:lastLRBIndex] + '*' + s[j+1:k] 73 | 74 | r.append(nodeStr) 75 | i = k 76 | 77 | if len(r) != len(self.words): 78 | print >> sys.stderr, 'Error in buiding constituent tree bits: different length with words.' 79 | print >> sys.stderr, len(r), len(self.words) 80 | print >> sys.stderr, ' '.join(r).encode('utf-8') 81 | print >> sys.stderr, words 82 | sys.exit(1) 83 | 84 | self.constituentlist = r 85 | 86 | 87 | 88 | def setDpNode(self, dpnode): 89 | self.dpnodes = dpnode 90 | 91 | def setPOSList(self, poslist): 92 | self.poslist = poslist 93 | 94 | def setConstituentStr(self, constituentstr): 95 | self.constituentstr = constituentstr 96 | 97 | def setConstituentList(self, constituentlist): 98 | self.constituentlist = constituentlist 99 | 100 | def setWords(self, words): 101 | self.words = words 102 | 103 | def setChunkList(self, chunklist): 104 | self.chunklist = chunklist 105 | 106 | def getDpNode(self): 107 | return self.dpnodes 108 | 109 | def getPOSList(self): 110 | return self.poslist 111 | 112 | def getConstituentStr(self): 113 | return self.constituentstr 114 | 115 | def getConstituentList(self): 116 | return self.constituentlist 117 | 118 | def getWords(self): 119 | return self.words 120 | 121 | def getChunkList(self): 122 | return self.chunklist 123 | 124 | def getConllFormat(self, doc, paragraphIndex, sentIndex): 125 | 126 | table = [] 127 | 128 | dpnodes = self.getDpNode() 129 | poslist = self.getPOSList() 130 | #chunklist = self.getChunkList() 131 | words = self.getWords() 132 | constituentlist = self.getConstituentList() 133 | 134 | if len(poslist) == 0: 135 | hasParseInfo = 0 136 | else: 137 | hasParseInfo = 1 138 | 139 | if len(words) != len(poslist) and len(poslist) != 0: 140 | print >> sys.stderr, 'Error in buiding Conll Format: different length stanford parser postags and words.' 
141 | print >> sys.stderr, 'len words:', len(words), words 142 | print >> sys.stderr, 'len poslist:', len(poslist), poslist 143 | sys.exit(1) 144 | 145 | for wdindex in xrange(len(words)): 146 | 147 | word = words[wdindex] 148 | 149 | row = [] 150 | row.append(doc.docattrs[0][1]) #docinfo 151 | row.append(paragraphIndex) #paragraph index 152 | row.append(sentIndex) #paragraph index 153 | row.append(wdindex) #word index 154 | row.append(word) #word 155 | 156 | #row.append(chunknode.label) #chunk 157 | if hasParseInfo == 1: 158 | 159 | posword = poslist[wdindex] 160 | splitp = posword.rfind('/') 161 | pos = posword[splitp+1 : ].strip() 162 | 163 | #chunknode = chunklist[wdindex] 164 | 165 | constituentnode = constituentlist[wdindex] 166 | 167 | dpnode = None 168 | for d in dpnodes: 169 | if d.index == wdindex: 170 | dpnode = d 171 | break 172 | 173 | row.append(pos) #POS 174 | if dpnode == None: 175 | row.append('-') 176 | row.append('-') 177 | else: 178 | row.append(dpnode.parent_index) #dp parent 179 | row.append(dpnode.grammarrole) #dp label 180 | row.append(constituentnode) #constituent 181 | 182 | table.append(row) 183 | 184 | return table 185 | 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/scripts/nuclesgmlparser.py: -------------------------------------------------------------------------------- 1 | # nuclesgmlparser.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 12 | 13 | from sgmllib import SGMLParser 14 | from nucle_doc import nucle_doc 15 | 16 | 17 | class nuclesgmlparser(SGMLParser): 18 | def __init__(self): 19 | SGMLParser.__init__(self) 20 | self.docs = [] 21 | 22 | def reset(self): 23 | self.docs = [] 24 | self.data = [] 25 | SGMLParser.reset(self) 26 | 27 | def unknow_starttag(self, tag, attrs): 28 | pass 29 | 30 | def unknow_endtag(self): 31 | pass 32 | 33 | def start_doc(self, attrs): 34 | self.docs.append(nucle_doc()) 35 | self.docs[-1].docattrs = attrs 36 | 37 | def end_doc(self): 38 | pass 39 | 40 | def start_matric(self, attrs): 41 | pass 42 | 43 | def end_matric(self): 44 | self.docs[-1].matric = ''.join(self.data) 45 | self.data = [] 46 | pass 47 | 48 | def start_email(self, attrs): 49 | pass 50 | 51 | def end_email(self): 52 | self.docs[-1].email = ''.join(self.data) 53 | self.data = [] 54 | pass 55 | 56 | def start_nationality(self, attrs): 57 | pass 58 | 59 | def end_nationality(self): 60 | self.docs[-1].nationality = ''.join(self.data) 61 | self.data = [] 62 | pass 63 | 64 | def start_first_language(self, attrs): 65 | pass 66 | 67 | def end_first_language(self): 68 | self.docs[-1].firstLanguage = ''.join(self.data) 69 | self.data = [] 70 | pass 71 | 72 | def start_school_language(self, attrs): 73 | pass 74 | 75 | def end_school_language(self): 76 | self.docs[-1].schoolLanguage = ''.join(self.data) 77 | self.data = [] 78 | pass 79 | 80 | def start_english_tests(self, attrs): 81 | pass 82 | 83 | def end_english_tests(self): 84 | self.docs[-1].englishTests = ''.join(self.data) 85 | self.data = [] 86 | pass 87 | 88 | 89 | def start_text(self, attrs): 90 | pass 91 | 92 | def end_text(self): 93 | pass 94 | 95 | def start_title(self, attrs): 96 | pass 97 | 98 | def end_title(self): 99 | self.docs[-1].paragraphs.append(''.join(self.data)) 
100 | self.data = [] 101 | pass 102 | 103 | 104 | def start_p(self, attrs): 105 | pass 106 | 107 | def end_p(self): 108 | self.docs[-1].paragraphs.append(''.join(self.data)) 109 | self.data = [] 110 | pass 111 | 112 | 113 | def start_annotation(self, attrs): 114 | self.docs[-1].annotation.append(attrs) 115 | 116 | def end_annotation(self): 117 | pass 118 | 119 | def start_mistake(self, attrs): 120 | d = {} 121 | for t in attrs: 122 | d[t[0]] = int(t[1]) 123 | self.docs[-1].mistakes.append(d) 124 | pass 125 | 126 | def end_mistake(self): 127 | pass 128 | 129 | def start_type(self, attrs): 130 | pass 131 | 132 | def end_type(self): 133 | self.docs[-1].mistakes[-1]['type'] = ''.join(self.data) 134 | self.data = [] 135 | 136 | def start_correction(self, attrs): 137 | pass 138 | 139 | def end_correction(self): 140 | self.docs[-1].mistakes[-1]['correction'] = ''.join(self.data) 141 | self.data = [] 142 | 143 | def start_comment(self, attrs): 144 | pass 145 | 146 | def end_comment(self): 147 | self.docs[-1].mistakes[-1]['comment'] = ''.join( self.data) 148 | self.data = [] 149 | 150 | 151 | def handle_charref(self, ref): 152 | self.data.append('&' + ref) 153 | 154 | def handle_entityref(self, ref): 155 | self.data.append('&' + ref) 156 | 157 | def handle_data(self, text): 158 | if text.strip() == '': 159 | self.data.append('') 160 | return 161 | else: 162 | if text.startswith('\n'): 163 | text = text[1:] 164 | if text.endswith('\n'): 165 | text = text[:-1] 166 | self.data.append(text) 167 | 168 | 169 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/scripts/test.sgml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | CREATING A HABITABLE ENVIRONMENT 5 | 6 |

7 | Humans have many basic needs and one of them is to have an environment that can sustain their lives. Our current population is 6 billion people and it is still growing exponentially. This will, if not already, caused problems as there are very limited spaces for us. The solution can be obtain by using technology to achieve a better usage of space that we have and resolve the problems in lands that inhospitable such as desserts and swamps. 8 |

9 |

10 | Some countries are having difficulties in managing a place to live for their citizen as they tend to get overpopulated. This caused problem like the appearance of slums which most of the time is not safe due to the unhealthy environment. The only way to satisfy the increasing demands of space is by achieving a better usage of the land like designing taller building so it can accommodate more number of people with the same spaces. It is also important to create a better material that can support the buildings despite any natural disaster like earthquakes. A good example is Japan where there are a lot of tall condominiums despite the large number of earthquakes happening in there. Besides a better usage of lands, a better sanitation is also needed because a huge number of people need a clean environment to maintain their heath. For example, countries in Africa can accommodate more people if they can manage to design a better sanitation system. 11 |

12 |

13 | Countries with a lot of inhospitable space need not only to achieve a better space usage, but also to reforms the land to make it livable and technology can help it in a number of ways depending on the trouble the lands have. For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert. Dubai will be a good example for this as previously the country got almost no natural water and they use irrigation to bring natural water to the country. Another example is Netherlands, whose most of his lands is a swamp under sea level, have used a good irrigation system to counter their problem and to make their land habitable. 14 |

15 |

16 | As the number of people grows, the need of habitable environment is unquestionably essential. In this era, Engineering designs can help to provide more habitable accommodation by designing a stronger material so it's possible to create a taller and safer building, a better and efficient sanitation system to prevent disease, and also by designing a way to change the condition of the inhabitable environment. 17 |

18 |
19 | 20 | 21 | Vform 22 | cause 23 | 24 | 25 | Nn 26 | space 27 | 28 | 29 | SVA 30 | is 31 | 32 | 33 | Vform 34 | obtained 35 | 36 | 37 | Prep 38 | of 39 | 40 | 41 | V0 42 | that are inhospitable 43 | 44 | 45 | Nn 46 | citizens 47 | 48 | 49 | Others 50 | managing to get 51 | missing words 52 | 53 | 54 | Vt 55 | has caused 56 | 57 | 58 | Nn 59 | problems 60 | 61 | 62 | Wci 63 | formation and growth 64 | 65 | 66 | Vform 67 | are 68 | 69 | 70 | Others 71 | safe to live in 72 | missing words 73 | 74 | 75 | ArtOrDet 76 | their 77 | 78 | 79 | Prep 80 | for 81 | 82 | 83 | ArtOrDet 84 | 85 | 86 | 87 | Wci 88 | a greater 89 | 90 | 91 | Pref 92 | the same area of land 93 | 94 | 95 | Rloc- 96 | 97 | 98 | 99 | Wci 100 | use 101 | 102 | 103 | ArtOrDet 104 | 105 | 106 | 107 | ArtOrDet 108 | 109 | 110 | 111 | Wci 112 | during 113 | 114 | 115 | Nn 116 | disasters 117 | 118 | 119 | Mec 120 | Japan, 121 | 122 | 123 | Prep 124 | 125 | 126 | 127 | Nn 128 | land 129 | 130 | 131 | ArtOrDet 132 | 133 | 134 | 135 | Vt 136 | will need 137 | 138 | 139 | Mec 140 | health 141 | 142 | 143 | Vform 144 | reform 145 | 146 | 147 | Wci 148 | restore the land to a livable state 149 | 150 | 151 | Pref 152 | 153 | 154 | 155 | Wci 156 | quality 157 | 158 | 159 | Wci 160 | quality of the land 161 | 162 | 163 | Nn 164 | desert 165 | 166 | 167 | Wci 168 | transform 169 | 170 | 171 | Vform 172 | use 173 | 174 | 175 | Trans 176 | But 177 | 178 | 179 | Mec 180 | water. 181 | 182 | 183 | Pref 184 | it 185 | 186 | 187 | SVA 188 | uses 189 | 190 | 191 | Pref 192 | where 193 | 194 | 195 | Pref 196 | the 197 | 198 | 199 | Nn 200 | land 201 | 202 | 203 | ArtOrDet 204 | 205 | 206 | 207 | Sfrag 208 | level. It 209 | 210 | 211 | SVA 212 | has 213 | 214 | 215 | Prep 216 | for 217 | 218 | 219 | Mec 220 | engineering 221 | 222 | 223 | Wci 224 | increasing 225 | 226 | 227 | Wci 228 | environment 229 | 230 | 231 | ArtOrDet 232 | 233 | 234 | 235 | Wci 236 | build 237 | 238 | 239 | ArtOrDet 240 | taller and safer buildings 241 | 242 | 243 | Rloc- 244 | 245 | 246 | 247 | Mec 248 | and 249 | 250 | 251 | Rloc- 252 | changing 253 | 254 | 255 | Wci 256 | otherwise uninhabitable 257 | 258 | 259 |
260 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/scripts/token_offsets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This file is part of the NUS M2 scorer. 4 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | 9 | # The NUS M2 scorer is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 16 | 17 | # file: token_offsets.py 18 | # convert character to token offsets, tokenize sentence 19 | # 20 | # usage: %prog < input > output 21 | # 22 | 23 | 24 | import sys 25 | import re 26 | import os 27 | from util import * 28 | from Tokenizer import PTBTokenizer 29 | 30 | 31 | assert len(sys.argv) == 1 32 | 33 | 34 | # main 35 | # loop over sentences cum annotation 36 | tokenizer = PTBTokenizer() 37 | sentence = '' 38 | for line in sys.stdin: 39 | line = line.decode("utf8").strip() 40 | if line.startswith("S "): 41 | sentence = line[2:] 42 | sentence_tok = "S " + ' '.join(tokenizer.tokenize(sentence)) 43 | print sentence_tok.encode("utf8") 44 | elif line.startswith("A "): 45 | fields = line[2:].split('|||') 46 | start_end = fields[0] 47 | char_start, char_end = [int(a) for a in start_end.split()] 48 | # calculate token offsets 49 | prefix = sentence[:char_start] 50 | tok_start = len(tokenizer.tokenize(prefix)) 51 | postfix = sentence[:char_end] 52 | tok_end = len(tokenizer.tokenize(postfix)) 53 | start_end = str(tok_start) + " " + str(tok_end) 54 | fields[0] = start_end 55 | # tokenize corrections, remove trailing whitespace 56 | corrections = [(' '.join(tokenizer.tokenize(c))).strip() for c in fields[2].split('||')] 57 | fields[2] = '||'.join(corrections) 58 | annotation = "A " + '|||'.join(fields) 59 | print annotation.encode("utf8") 60 | else: 61 | print line.encode("utf8") 62 | 63 | -------------------------------------------------------------------------------- /data/release2.3.1/m2scorer/scripts/util.py: -------------------------------------------------------------------------------- 1 | # This file is part of the NUS M2 scorer. 2 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 3 | # it under the terms of the GNU General Public License as published by 4 | # the Free Software Foundation, either version 3 of the License, or 5 | # (at your option) any later version. 6 | 7 | # The NUS M2 scorer is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | # GNU General Public License for more details. 11 | 12 | # You should have received a copy of the GNU General Public License 13 | # along with this program. If not, see . 14 | 15 | # file: util.py 16 | # 17 | 18 | import operator 19 | import random 20 | import math 21 | import re 22 | 23 | def smart_open(fname, mode = 'r'): 24 | if fname.endswith('.gz'): 25 | import gzip 26 | # Using max compression (9) by default seems to be slow. 
27 | # Let's try using the fastest. 28 | return gzip.open(fname, mode, 1) 29 | else: 30 | return open(fname, mode) 31 | 32 | 33 | def randint(b, a=0): 34 | return random.randint(a,b) 35 | 36 | def uniq(seq, idfun=None): 37 | # order preserving 38 | if idfun is None: 39 | def idfun(x): return x 40 | seen = {} 41 | result = [] 42 | for item in seq: 43 | marker = idfun(item) 44 | # in old Python versions: 45 | # if seen.has_key(marker) 46 | # but in new ones: 47 | if marker in seen: continue 48 | seen[marker] = 1 49 | result.append(item) 50 | return result 51 | 52 | 53 | def sort_dict(myDict, byValue=False, reverse=False): 54 | if byValue: 55 | items = myDict.items() 56 | items.sort(key = operator.itemgetter(1), reverse=reverse) 57 | else: 58 | items = sorted(myDict.items()) 59 | return items 60 | 61 | def max_dict(myDict, byValue=False): 62 | if byValue: 63 | skey=lambda x:x[1] 64 | else: 65 | skey=lambda x:x[0] 66 | return max(myDict.items(), key=skey) 67 | 68 | 69 | def min_dict(myDict, byValue=False): 70 | if byValue: 71 | skey=lambda x:x[1] 72 | else: 73 | skey=lambda x:x[0] 74 | return min(myDict.items(), key=skey) 75 | 76 | def paragraphs(lines, is_separator=lambda x : x == '\n', joiner=''.join): 77 | paragraph = [] 78 | for line in lines: 79 | if is_separator(line): 80 | if paragraph: 81 | yield joiner(paragraph) 82 | paragraph = [] 83 | else: 84 | paragraph.append(line) 85 | if paragraph: 86 | yield joiner(paragraph) 87 | 88 | 89 | def isASCII(word): 90 | try: 91 | word = word.decode("ascii") 92 | return True 93 | except UnicodeEncodeError : 94 | return False 95 | except UnicodeDecodeError: 96 | return False 97 | 98 | 99 | def intersect(x, y): 100 | return [z for z in x if z in y] 101 | 102 | 103 | 104 | # Mapping Windows CP1252 Gremlins to Unicode 105 | # from http://effbot.org/zone/unicode-gremlins.htm 106 | cp1252 = { 107 | # from http://www.microsoft.com/typography/unicode/1252.htm 108 | u"\x80": u"\u20AC", # EURO SIGN 109 | u"\x82": u"\u201A", # SINGLE LOW-9 QUOTATION MARK 110 | u"\x83": u"\u0192", # LATIN SMALL LETTER F WITH HOOK 111 | u"\x84": u"\u201E", # DOUBLE LOW-9 QUOTATION MARK 112 | u"\x85": u"\u2026", # HORIZONTAL ELLIPSIS 113 | u"\x86": u"\u2020", # DAGGER 114 | u"\x87": u"\u2021", # DOUBLE DAGGER 115 | u"\x88": u"\u02C6", # MODIFIER LETTER CIRCUMFLEX ACCENT 116 | u"\x89": u"\u2030", # PER MILLE SIGN 117 | u"\x8A": u"\u0160", # LATIN CAPITAL LETTER S WITH CARON 118 | u"\x8B": u"\u2039", # SINGLE LEFT-POINTING ANGLE QUOTATION MARK 119 | u"\x8C": u"\u0152", # LATIN CAPITAL LIGATURE OE 120 | u"\x8E": u"\u017D", # LATIN CAPITAL LETTER Z WITH CARON 121 | u"\x91": u"\u2018", # LEFT SINGLE QUOTATION MARK 122 | u"\x92": u"\u2019", # RIGHT SINGLE QUOTATION MARK 123 | u"\x93": u"\u201C", # LEFT DOUBLE QUOTATION MARK 124 | u"\x94": u"\u201D", # RIGHT DOUBLE QUOTATION MARK 125 | u"\x95": u"\u2022", # BULLET 126 | u"\x96": u"\u2013", # EN DASH 127 | u"\x97": u"\u2014", # EM DASH 128 | u"\x98": u"\u02DC", # SMALL TILDE 129 | u"\x99": u"\u2122", # TRADE MARK SIGN 130 | u"\x9A": u"\u0161", # LATIN SMALL LETTER S WITH CARON 131 | u"\x9B": u"\u203A", # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 132 | u"\x9C": u"\u0153", # LATIN SMALL LIGATURE OE 133 | u"\x9E": u"\u017E", # LATIN SMALL LETTER Z WITH CARON 134 | u"\x9F": u"\u0178", # LATIN CAPITAL LETTER Y WITH DIAERESIS 135 | } 136 | 137 | def fix_cp1252codes(text): 138 | # map cp1252 gremlins to real unicode characters 139 | if re.search(u"[\x80-\x9f]", text): 140 | def fixup(m): 141 | s = m.group(0) 142 | return cp1252.get(s, s) 143 | 
if isinstance(text, type("")): 144 | # make sure we have a unicode string 145 | text = unicode(text, "iso-8859-1") 146 | text = re.sub(u"[\x80-\x9f]", fixup, text) 147 | return text 148 | 149 | def clean_utf8(text): 150 | return filter(lambda x : x > '\x1f' and x < '\x7f', text) 151 | 152 | def pairs(iterable, overlapping=False): 153 | iterator = iterable.__iter__() 154 | token = iterator.next() 155 | i = 0 156 | for lookahead in iterator: 157 | if overlapping or i % 2 == 0: 158 | yield (token, lookahead) 159 | token = lookahead 160 | i += 1 161 | if i % 2 == 0: 162 | yield (token, None) 163 | 164 | def frange(start, end=None, inc=None): 165 | "A range function, that does accept float increments..." 166 | 167 | if end == None: 168 | end = start + 0.0 169 | start = 0.0 170 | 171 | if inc == None: 172 | inc = 1.0 173 | 174 | L = [] 175 | while 1: 176 | next = start + len(L) * inc 177 | if inc > 0 and next >= end: 178 | break 179 | elif inc < 0 and next <= end: 180 | break 181 | L.append(next) 182 | 183 | return L 184 | 185 | def softmax(values): 186 | a = max(values) 187 | Z = 0.0 188 | for v in values: 189 | Z += math.exp(v - a) 190 | sm = [math.exp(v-a) / Z for v in values] 191 | return sm 192 | -------------------------------------------------------------------------------- /data/release2.3.1/revised/data_5types/alternatives.NTHU.sgml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Prep 5 | of 6 | 7 | 8 | Nn 9 | disagreement 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | Nn 18 | purposes 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | SVA 27 | are 28 | 29 | 30 | Vform 31 | was 32 | 33 | 34 | Vform 35 | mentioned 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | Nn 44 | trends 45 | 46 | 47 | ArtOrDet 48 | of the 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | noop 57 | However, human beings are not animals or any other products, they have their own thoughts and freedom. 
58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | Nn 66 | freedoms 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | Vform 75 | get 76 | 77 | 78 | Prep 79 | home 80 | 81 | 82 | Nn 83 | places 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | ArtOrDet 92 | the news 93 | 94 | 95 | Nn 96 | thousands 97 | 98 | 99 | Prep 100 | for 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | Nn 109 | mind 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | Vform 118 | offend 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | Prep 127 | of 128 | 129 | 130 | Nn 131 | telephones 132 | 133 | 134 | Prep 135 | for 136 | 137 | 138 | Nn 139 | lives 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | SVA 148 | invades 149 | 150 | 151 | ArtOrDet 152 | the whole 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | Vform 161 | go 162 | 163 | 164 | ArtOrDet 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | ArtOrDet 174 | 175 | 176 | 177 | Nn 178 | expectancy 179 | 180 | 181 | Nn 182 | teeth 183 | 184 | 185 | Nn 186 | teeth 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | ArtOrDet 195 | through the 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | ArtOrDet 204 | Rising 205 | 206 | 207 | Nn 208 | expetancy 209 | 210 | 211 | SVA 212 | proves 213 | 214 | 215 | SVA 216 | has 217 | 218 | 219 | Prep 220 | of 221 | 222 | 223 | Prep 224 | history of 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | ArtOrDet 233 | An ageing 234 | 235 | 236 | ArtOrDet 237 | a larger 238 | 239 | 240 | ArtOrDet 241 | An ageing 242 | 243 | 244 | SVA 245 | need 246 | 247 | 248 | Nn 249 | equipment 250 | 251 | 252 | Nn 253 | centres 254 | 255 | 256 | SVA 257 | are 258 | 259 | 260 | ArtOrDet 261 | An increasing 262 | 263 | 264 | ArtOrDet 265 | 266 | 267 | 268 | ArtOrDet 269 | the food 270 | 271 | 272 | Nn 273 | expectancy 274 | 275 | 276 | Prep 277 | of 278 | 279 | 280 | ArtOrDet 281 | the governments 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | Vform 290 | Comparing 291 | 292 | 293 | ArtOrDet 294 | 295 | 296 | 297 | SVA 298 | are 299 | 300 | 301 | noop 302 | However, these natural resources are not renewable. 
303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | ArtOrDet 311 | the 312 | 313 | 314 | Nn 315 | employees 316 | 317 | 318 | SVA 319 | start 320 | 321 | 322 | SVA 323 | rely 324 | 325 | 326 | Nn 327 | supplements 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | Prep 336 | in 337 | 338 | 339 | Prep 340 | by themselves 341 | 342 | 343 | Prep 344 | to a 345 | 346 | 347 | Nn 348 | way 349 | 350 | 351 | Prep 352 | 353 | 354 | 355 | ArtOrDet 356 | 357 | 358 | 359 | Prep 360 | 361 | 362 | 363 | Nn 364 | standards 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | Nn 373 | diseases 374 | 375 | 376 | ArtOrDet 377 | like the 378 | 379 | 380 | Nn 381 | Death 382 | 383 | 384 | Nn 385 | lives 386 | 387 | 388 | Nn 389 | expectancy 390 | 391 | 392 | SVA 393 | become 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | Prep 402 | 403 | 404 | 405 | ArtOrDet 406 | the people 407 | 408 | 409 | Prep 410 | 411 | 412 | 413 | ArtOrDet 414 | 415 | 416 | 417 | Nn 418 | problems 419 | 420 | 421 | ArtOrDet 422 | 423 | 424 | 425 | ArtOrDet 426 | 427 | 428 | 429 | Nn 430 | contributions 431 | 432 | 433 | ArtOrDet 434 | 435 | 436 | 437 | ArtOrDet 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | ArtOrDet 447 | 448 | 449 | 450 | Nn 451 | services 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | Nn 460 | expectancy 461 | 462 | 463 | ArtOrDet 464 | 465 | 466 | 467 | Prep 468 | in 469 | 470 | 471 | Nn 472 | expectancy 473 | 474 | 475 | ArtOrDet 476 | a large 477 | 478 | 479 | Nn 480 | beings 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | ArtOrDet 489 | 490 | 491 | 492 | Nn 493 | technologies 494 | 495 | 496 | SVA 497 | save 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | ArtOrDet 506 | history of 507 | 508 | 509 | ArtOrDet 510 | of the 511 | 512 | 513 | ArtOrDet 514 | society 515 | 516 | 517 | Prep 518 | by 519 | 520 | 521 | SVA 522 | is 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | Nn 531 | our bodies 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | Nn 540 | lose their lives 541 | 542 | 543 | Vform 544 | led 545 | 546 | 547 | 548 | 549 | -------------------------------------------------------------------------------- /data/release2.3.1/scripts/README: -------------------------------------------------------------------------------- 1 | ==================================================== 2 | 3 | CoNLL-2013 Shared Task: Grammatical Error Correction 4 | 5 | Description of Data Preprocessing Scripts 6 | 7 | Created May 23, 2013 Version 2.3.1 8 | ==================================================== 9 | 10 | 11 | Table of Contents 12 | ================= 13 | 14 | 1. General 15 | 2. Pre-requisites 16 | 3. Usage 17 | 18 | 1. General 19 | ========== 20 | 21 | This README file describes the usage of scripts for preprocessing the CoNLL-2013 official test data. 22 | 23 | Quickstart: 24 | 25 | a. Regenerate the preprocessed files with full syntactic information: 26 | % python preprocess.py -o official.sgml conllFileName annFileName m2FileName 27 | 28 | b. Get tokenized annotations without syntactic information: 29 | % python preprocess.py -l official.sgml conllFileName annFileName m2FileName 30 | 31 | Where 32 | conllFileName - output file that contains pre-processed sentences in CoNLL format. 33 | annFileName - output file that contains standoff error annotations. 34 | m2FileName - output file that contains error annotations in the M2 scorer format. 35 | 36 | c. 
Creating gold-standard answers including the official and alternative annotations: 37 | % python preprocesswithalt.py official.5types.sgml official.5types.sgml alternatives.UIUC.sgml alternatives.UMC.sgml alternatives.NTHU.sgml alternatives.STEL.sgml alternatives.TOR.sgml m2FileName 38 | 39 | Where 40 | m2FileName - output file containing combined official and alternative annotations. 41 | 42 | Note: The repeated official.5types.sgml is deliberate since it is the program requirement. 43 | 44 | 2. Pre-requisites 45 | ================= 46 | 47 | + Python (2.6.4, other versions >= 2.6.4, < 3.0 might work but are not tested) 48 | + nltk (http://www.nltk.org, version 2.0b7, needed for sentence splitting and word tokenization, other versions might work) 49 | + Stanford parser (version 2.0.1, http://nlp.stanford.edu/software/stanford-parser-2012-03-09.tgz) 50 | 51 | Directories: 52 | stanford-parser-2012-03-09/ 53 | scripts/ 54 | 55 | If you only use the scripts to generate error annotations needed by the M2 scorer, Stanford parser is not required. 56 | Otherwise, "stanford-parser-2012-03-09" need to be in the same directory as "scripts". 57 | 58 | 3. Usage 59 | ======== 60 | 61 | Preprocessing the main official test data: 62 | 63 | Usage: python preprocess.py OPTIONS sgmlFileName conllFileName annotationFileName m2FileName 64 | 65 | Where 66 | sgmlFileName - NUCLE SGML file 67 | conllFileName - output file name for pre-processed sentences in CoNLL format (e.g., conll13st-preprocessed.conll). 68 | annotationFileName - output file name for error annotations (e.g., conll13st-preprocessed.conll.ann). 69 | m2FileName - output file name in the M2 scorer format (e.g., conll13st-preprocessed.conll.m2). 70 | 71 | OPTIONS 72 | -o - output will contain POS tags and parse tree info (i.e., the same as the released preprocessed file, runs slowly). 73 | -l - output will NOT contain POS tags and parse tree info (runs quickly). 74 | 75 | Getting the combined M^2 gold-standard answer: 76 | 77 | Usage: python preprocesswithalt.py essaySgmlFileName mainSgmlFileName alt1SgmlFileName ... altNSgmlFileName m2FileName 78 | 79 | Where 80 | essaySgmlFile - official test data SGML file containing essay body, not necessarily annotations 81 | mainAnnotSgmlFile - official test data SGML file containing the main annotations, not necessarily essay body 82 | alt1SgmlFileName - the first alternative annotations SGML file, containing only annotations that differ from the main annotation 83 | altNSgmlFileName - the last alternative annotations SGML file, containing only annotations that differ from the main annotation 84 | combM2FileName - output file name in the M2 scorer format, containing combination of main and alternative annotations 85 | -------------------------------------------------------------------------------- /data/release2.3.1/scripts/iparser.py: -------------------------------------------------------------------------------- 1 | # iparser.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 
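# Note on this module: parse_batch (below) shells out to the Stanford parser as an
# external Java process over a file of pre-tokenized sentences (one per line),
# writes the raw parser output to parsingDumpedFileName, and returns that output
# split on blank lines, so each input sentence contributes three consecutive
# blocks: the POS-tagged words ("wordsAndTags"), the Penn constituent tree
# ("penn"), and the typed dependencies. parser_feature.DependTree_Batch consumes
# these blocks three at a time.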
12 | 13 | import os 14 | import sys 15 | 16 | class stanfordparser: 17 | 18 | def __init__(self): 19 | pass 20 | 21 | def parse_batch(self, sentenceDumpedFileName, parsingDumpedFileName): 22 | 23 | if os.path.exists('../stanford-parser-2012-03-09') == False: 24 | print >> sys.stderr, 'can not find Stanford parser directory' 25 | sys.exit(1) 26 | 27 | # tokenized 28 | cmd = r'java -server -mx4096m -cp "../stanford-parser-2012-03-09/*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser -retainTMPSubcategories -sentences newline -tokenized -escaper edu.stanford.nlp.process.PTBEscapingProcessor -outputFormat "wordsAndTags, penn, typedDependencies" -outputFormatOptions "basicDependencies" edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ' + sentenceDumpedFileName 29 | 30 | r = os.popen(cmd).read().strip().decode('utf-8') 31 | f = open(parsingDumpedFileName, 'w') 32 | f.write(r.encode('utf-8')) 33 | f.close() 34 | 35 | rlist = r.replace('\n\n\n', '\n\n\n\n').split('\n\n') 36 | return rlist 37 | -------------------------------------------------------------------------------- /data/release2.3.1/scripts/nucle_doc.py: -------------------------------------------------------------------------------- 1 | # nucle_doc.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 12 | 13 | import os 14 | import sys 15 | from nltk import word_tokenize 16 | 17 | class nucle_doc: 18 | def __init__(self): 19 | self.docattrs = None 20 | 21 | self.matric = '' 22 | self.email = '' 23 | self.nationality = '' 24 | self.firstLanguage = '' 25 | self.schoolLanguage = '' 26 | self.englishTests = '' 27 | 28 | self.paragraphs = [] 29 | self.annotation = [] 30 | self.mistakes = [] 31 | 32 | self.sentences = [] 33 | 34 | def buildSentence(self, sentstr, dpnode, constituentstr, poslist, chunklist): 35 | self.sentences[-1].append(nucle_sent(sentstr, dpnode, constituentstr, poslist, chunklist)) 36 | 37 | def addSentence(self, sent): 38 | self.sentences[-1].append(sent) 39 | 40 | def findMistake(self, par, pos): 41 | for m in self.mistakes: 42 | if par == m['start_par'] and pos >= m['start_off'] and pos < m['end_off']: 43 | return m 44 | return None 45 | 46 | 47 | class nucle_sent: 48 | def __init__(self, sentstr, dpnode, constituentstr, poslist, chunklist): 49 | self.sentstr = sentstr 50 | self.words = word_tokenize(sentstr) 51 | self.dpnodes = dpnode 52 | self.constituentstr = constituentstr 53 | self.constituentlist = [] 54 | self.poslist = poslist 55 | self.chunklist = chunklist 56 | 57 | def buildConstituentList(self): 58 | 59 | s = self.constituentstr.strip().replace('\n', '').replace(' ', '') 60 | r = [] 61 | i = 0 62 | while i < len(s): 63 | j = i 64 | while j < len(s) and s[j] != ')': 65 | j += 1 66 | k = j 67 | while k < len(s) and s[k] == ')': 68 | k += 1 69 | 70 | nodeWholeStr = s[i:k] 71 | lastLRBIndex = nodeWholeStr.rfind('(') 72 | nodeStr = nodeWholeStr[:lastLRBIndex] + '*' + s[j+1:k] 73 | 74 | r.append(nodeStr) 75 | i = k 76 | 77 | if len(r) != len(self.words): 78 | print >> sys.stderr, 'Error in buiding constituent tree bits: different length with words.' 
79 | print >> sys.stderr, len(r), len(self.words) 80 | print >> sys.stderr, ' '.join(r).encode('utf-8') 81 | print >> sys.stderr, words 82 | sys.exit(1) 83 | 84 | self.constituentlist = r 85 | 86 | 87 | 88 | def setDpNode(self, dpnode): 89 | self.dpnodes = dpnode 90 | 91 | def setPOSList(self, poslist): 92 | self.poslist = poslist 93 | 94 | def setConstituentStr(self, constituentstr): 95 | self.constituentstr = constituentstr 96 | 97 | def setConstituentList(self, constituentlist): 98 | self.constituentlist = constituentlist 99 | 100 | def setWords(self, words): 101 | self.words = words 102 | 103 | def setChunkList(self, chunklist): 104 | self.chunklist = chunklist 105 | 106 | def getDpNode(self): 107 | return self.dpnodes 108 | 109 | def getPOSList(self): 110 | return self.poslist 111 | 112 | def getConstituentStr(self): 113 | return self.constituentstr 114 | 115 | def getConstituentList(self): 116 | return self.constituentlist 117 | 118 | def getWords(self): 119 | return self.words 120 | 121 | def getChunkList(self): 122 | return self.chunklist 123 | 124 | def getConllFormat(self, doc, paragraphIndex, sentIndex): 125 | 126 | table = [] 127 | 128 | dpnodes = self.getDpNode() 129 | poslist = self.getPOSList() 130 | #chunklist = self.getChunkList() 131 | words = self.getWords() 132 | constituentlist = self.getConstituentList() 133 | 134 | if len(poslist) == 0: 135 | hasParseInfo = 0 136 | else: 137 | hasParseInfo = 1 138 | 139 | if len(words) != len(poslist) and len(poslist) != 0: 140 | print >> sys.stderr, 'Error in buiding Conll Format: different length stanford parser postags and words.' 141 | print >> sys.stderr, 'len words:', len(words), words 142 | print >> sys.stderr, 'len poslist:', len(poslist), poslist 143 | sys.exit(1) 144 | 145 | for wdindex in xrange(len(words)): 146 | 147 | word = words[wdindex] 148 | 149 | row = [] 150 | row.append(doc.docattrs[0][1]) #docinfo 151 | row.append(paragraphIndex) #paragraph index 152 | row.append(sentIndex) #paragraph index 153 | row.append(wdindex) #word index 154 | row.append(word) #word 155 | 156 | #row.append(chunknode.label) #chunk 157 | if hasParseInfo == 1: 158 | 159 | posword = poslist[wdindex] 160 | splitp = posword.rfind('/') 161 | pos = posword[splitp+1 : ].strip() 162 | 163 | #chunknode = chunklist[wdindex] 164 | 165 | constituentnode = constituentlist[wdindex] 166 | 167 | dpnode = None 168 | for d in dpnodes: 169 | if d.index == wdindex: 170 | dpnode = d 171 | break 172 | 173 | row.append(pos) #POS 174 | if dpnode == None: 175 | row.append('-') 176 | row.append('-') 177 | else: 178 | row.append(dpnode.parent_index) #dp parent 179 | row.append(dpnode.grammarrole) #dp label 180 | row.append(constituentnode) #constituent 181 | 182 | table.append(row) 183 | 184 | return table 185 | 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /data/release2.3.1/scripts/nuclesgmlparser.py: -------------------------------------------------------------------------------- 1 | # nuclesgmlparser.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 
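# Note on this module: the parser below subclasses the standard-library
# sgmllib.SGMLParser, which exists only in Python 2 (sgmllib was removed in
# Python 3.0). Together with the print-statement syntax used in the sibling
# scripts, this means the preprocessing pipeline must be run under a Python 2
# interpreter, consistent with the prerequisites listed in the scripts README.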
12 | 13 | from sgmllib import SGMLParser 14 | from nucle_doc import nucle_doc 15 | 16 | 17 | class nuclesgmlparser(SGMLParser): 18 | def __init__(self): 19 | SGMLParser.__init__(self) 20 | self.docs = [] 21 | 22 | def reset(self): 23 | self.docs = [] 24 | self.data = [] 25 | SGMLParser.reset(self) 26 | 27 | def unknow_starttag(self, tag, attrs): 28 | pass 29 | 30 | def unknow_endtag(self): 31 | pass 32 | 33 | def start_doc(self, attrs): 34 | self.docs.append(nucle_doc()) 35 | self.docs[-1].docattrs = attrs 36 | 37 | def end_doc(self): 38 | pass 39 | 40 | def start_matric(self, attrs): 41 | pass 42 | 43 | def end_matric(self): 44 | self.docs[-1].matric = ''.join(self.data) 45 | self.data = [] 46 | pass 47 | 48 | def start_email(self, attrs): 49 | pass 50 | 51 | def end_email(self): 52 | self.docs[-1].email = ''.join(self.data) 53 | self.data = [] 54 | pass 55 | 56 | def start_nationality(self, attrs): 57 | pass 58 | 59 | def end_nationality(self): 60 | self.docs[-1].nationality = ''.join(self.data) 61 | self.data = [] 62 | pass 63 | 64 | def start_first_language(self, attrs): 65 | pass 66 | 67 | def end_first_language(self): 68 | self.docs[-1].firstLanguage = ''.join(self.data) 69 | self.data = [] 70 | pass 71 | 72 | def start_school_language(self, attrs): 73 | pass 74 | 75 | def end_school_language(self): 76 | self.docs[-1].schoolLanguage = ''.join(self.data) 77 | self.data = [] 78 | pass 79 | 80 | def start_english_tests(self, attrs): 81 | pass 82 | 83 | def end_english_tests(self): 84 | self.docs[-1].englishTests = ''.join(self.data) 85 | self.data = [] 86 | pass 87 | 88 | 89 | def start_text(self, attrs): 90 | pass 91 | 92 | def end_text(self): 93 | pass 94 | 95 | def start_title(self, attrs): 96 | pass 97 | 98 | def end_title(self): 99 | self.docs[-1].paragraphs.append(''.join(self.data)) 100 | self.data = [] 101 | pass 102 | 103 | 104 | def start_p(self, attrs): 105 | pass 106 | 107 | def end_p(self): 108 | self.docs[-1].paragraphs.append(''.join(self.data)) 109 | self.data = [] 110 | pass 111 | 112 | 113 | def start_annotation(self, attrs): 114 | self.docs[-1].annotation.append(attrs) 115 | 116 | def end_annotation(self): 117 | pass 118 | 119 | def start_mistake(self, attrs): 120 | d = {} 121 | for t in attrs: 122 | d[t[0]] = int(t[1]) 123 | self.docs[-1].mistakes.append(d) 124 | pass 125 | 126 | def end_mistake(self): 127 | pass 128 | 129 | def start_type(self, attrs): 130 | pass 131 | 132 | def end_type(self): 133 | self.docs[-1].mistakes[-1]['type'] = ''.join(self.data) 134 | self.data = [] 135 | 136 | def start_correction(self, attrs): 137 | pass 138 | 139 | def end_correction(self): 140 | self.docs[-1].mistakes[-1]['correction'] = ''.join(self.data) 141 | self.data = [] 142 | 143 | def start_comment(self, attrs): 144 | pass 145 | 146 | def end_comment(self): 147 | self.docs[-1].mistakes[-1]['comment'] = ''.join( self.data) 148 | self.data = [] 149 | 150 | 151 | def handle_charref(self, ref): 152 | self.data.append('&' + ref) 153 | 154 | def handle_entityref(self, ref): 155 | self.data.append('&' + ref) 156 | 157 | def handle_data(self, text): 158 | if text.strip() == '': 159 | self.data.append('') 160 | return 161 | else: 162 | if text.startswith('\n'): 163 | text = text[1:] 164 | if text.endswith('\n'): 165 | text = text[:-1] 166 | self.data.append(text) 167 | 168 | 169 | -------------------------------------------------------------------------------- /data/release2.3.1/scripts/parser_feature.py: -------------------------------------------------------------------------------- 1 
| # parser_feature.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 12 | 13 | 14 | 15 | import iparser 16 | 17 | class stanpartreenode: 18 | def __init__(self, strnode): 19 | 20 | if strnode == '': 21 | self.grammarrole = '' 22 | self.parent_index = -1 23 | self.index = -1 24 | self.parent_word = '' 25 | self.word = '' 26 | self.POS = '' 27 | return 28 | 29 | groleend = strnode.find('(') 30 | self.grammarrole = strnode[ : groleend] 31 | content = strnode[groleend + 1: len(strnode)-1] 32 | dadAndme = content.partition(', ') 33 | dad = dadAndme[0] 34 | me = dadAndme[2] 35 | dadsep = dad.rfind('-') 36 | mesep = me.rfind('-') 37 | self.parent_index = int(dad[dadsep + 1 : ]) - 1 38 | self.parent_word = dad[0 : dadsep] 39 | self.index = int(me[mesep + 1 : ]) - 1 40 | self.word = me[0 : mesep] 41 | self.POS = '' 42 | 43 | 44 | def DependTree_Batch(sentenceDumpedFileName, parsingDumpedFileName): 45 | 46 | sparser = iparser.stanfordparser() 47 | results = sparser.parse_batch(sentenceDumpedFileName, parsingDumpedFileName) 48 | nodeslist = [] 49 | 50 | k = 0 51 | while k < len(results): 52 | PoSlist = results[k].split(' ') 53 | constituentstr = results[k+1] 54 | table = results[k+2].split('\n') 55 | nodes = [] 56 | for i in range(0, len(table)): 57 | nodes.append( stanpartreenode(table[i]) ) 58 | nodeslist.append((nodes, constituentstr, PoSlist)) 59 | k += 3 60 | return nodeslist 61 | 62 | def DependTree_Batch_Parsefile(parsingDumpedFileName): 63 | 64 | f = open(parsingDumpedFileName, 'r') 65 | results = f.read().decode('utf-8').replace('\n\n\n', '\n\n\n\n').split('\n\n') 66 | f.close() 67 | nodeslist = [] 68 | 69 | k = 0 70 | while k < len(results): 71 | PoSlist = results[k].split(' ') 72 | constituentstr = results[k+1] 73 | table = results[k+2].split('\n') 74 | 75 | nodes = [] 76 | for i in range(0, len(table)): 77 | nodes.append( stanpartreenode(table[i]) ) 78 | nodeslist.append((nodes, constituentstr, PoSlist)) 79 | k += 3 80 | return nodeslist 81 | -------------------------------------------------------------------------------- /data/release2.3.1/scripts/preprocesswithalt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # preprocessors.py 4 | # 5 | # Author: Christian Hadiwinoto 6 | # National University of Singapore (NUS) 7 | # Date: 24 May 2013 8 | # Version: 1.0 9 | # 10 | # Contact: chrhad@comp.nus.edu.sg 11 | # 12 | # This script is distributed to support the CoNLL-2013 Shared Task. 13 | # It is free for research and educational purposes. 14 | # 15 | # Usage: python preprocesswithalt.py essaySgmlFileName mainSgmlFileName alt1SgmlFileName ... 
altNSgmlFileName m2FileName 16 | # 17 | 18 | 19 | import parser_feature 20 | from nuclesgmlparser import nuclesgmlparser 21 | from nucle_doc import * 22 | import nltk.data 23 | from nltk import word_tokenize 24 | from operator import itemgetter 25 | import cPickle as pickle 26 | import re 27 | import sys 28 | import os 29 | 30 | getEditKey = itemgetter(0, 1, 2, 3, 4) 31 | 32 | sentenceTokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 33 | sentenceDumpedFile = 'sentence_file' 34 | docsDumpedFileName = 'docs' 35 | parsingDumpedFileName = 'parse_file' 36 | 37 | def readNUCLE(fn): 38 | 39 | f = open(fn, 'r') 40 | parser = nuclesgmlparser() 41 | filestr = f.read() 42 | filestr = filestr.decode('utf-8') 43 | 44 | #Fix Reference tag 45 | p = re.compile(r'(\n

\n.*\n)

') 46 | filestr = p.sub(r'\1

', filestr) 47 | 48 | parser.feed(filestr) 49 | f.close() 50 | parser.close() 51 | 52 | return parser.docs 53 | 54 | def sentenceSplit(docs): 55 | 56 | sentenceTokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 57 | for doc in docs: 58 | for par in doc.paragraphs: 59 | doc.sentences.append([]) 60 | for s in sentenceTokenizer.tokenize(par): 61 | doc.buildSentence(s, [], '', [], []) 62 | return docs 63 | 64 | def compareTwoEditLists(editList1, editList2): 65 | # must be sorted 66 | if editList1 == [] and editList2 == []: 67 | return True 68 | elif editList1 == [] or editList2 == []: 69 | return False 70 | elif getEditKey(editList1[0]) != getEditKey(editList2[0]): 71 | return False 72 | else: 73 | return compareTwoEditLists(editList1[1:], editList2[1:]) 74 | 75 | def moderateAnnotations(contestDocs, annotBoard, origDocSet): 76 | # moderate annotation in "contesting" docs with already stated mistakes 77 | mistakeStrSet = {} 78 | for doc in contestDocs: 79 | mistakeStr = '' 80 | nid = int(doc.docattrs[0][1]) # nid of current document 81 | tid = doc.annotation[0][0][1] # teacher id 82 | 83 | if not annotBoard.has_key(nid): # create placeholder 84 | annotBoard[nid] = {} 85 | 86 | origDoc = origDocSet[nid] 87 | for pid in xrange(len(origDoc.sentences)): 88 | slist = origDoc.sentences[pid] 89 | if not annotBoard[nid].has_key(pid): 90 | annotBoard[nid][pid] = {} 91 | for sentid in xrange(len(slist)): 92 | sent = slist[sentid] 93 | if not annotBoard[nid][pid].has_key(sentid): 94 | annotBoard[nid][pid][sentid] = [] 95 | editSet = [] 96 | 97 | # enumerate mistakes 98 | sentoffset = origDoc.paragraphs[pid].index(sent.sentstr) 99 | editNum = 0 100 | for m in doc.mistakes: 101 | if m['start_par'] != pid or \ 102 | m['start_par'] != m['end_par'] or \ 103 | m['start_off'] < sentoffset or \ 104 | m['start_off'] >= sentoffset + len(sent.sentstr) or \ 105 | m['end_off'] sentoffset + len(sent.sentstr): 107 | continue 108 | 109 | if m['type'] != 'noop': 110 | editSet.append((m['start_par'], m['end_par'], m['start_off'], m['end_off'], m['correction'], m['type'])) 111 | editNum += 1 112 | else: 113 | editSet.append((m['start_par'], m['end_par'], m['start_off'], m['end_off'], sent.sentstr, m['type'])) 114 | 115 | editSet = sorted(editSet, key=itemgetter(0, 1, 2, 3)) 116 | 117 | # find the same annotation 118 | foundMatch = False 119 | i = 0 120 | boardEdits = annotBoard[nid][pid][sentid] 121 | while i < len(boardEdits) and not foundMatch: 122 | if compareTwoEditLists(editSet, boardEdits[i]): 123 | foundMatch = True 124 | else: 125 | i+=1 126 | 127 | if not foundMatch: 128 | annotBoard[nid][pid][sentid].append(editSet) 129 | 130 | return annotBoard 131 | 132 | def createM2File(origDocs, mistakesBoard, m2FileName): 133 | 134 | fm2 = open(m2FileName, 'w') 135 | 136 | for doc in origDocs: 137 | nid = int(doc.docattrs[0][1]) # nid of current document 138 | for slistIndex in xrange(len(doc.sentences)): 139 | slist = doc.sentences[slistIndex] 140 | for sentid in xrange(len(slist)): 141 | 142 | sent = slist[sentid] 143 | 144 | # m2 format annotation string list 145 | m2AnnotationList = [] 146 | 147 | # build colums 148 | table = sent.getConllFormat(doc, slistIndex, sentid) 149 | tokenizedSentStr = ' '.join(sent.getWords()) 150 | 151 | #Add annotation info 152 | sentoffset = doc.paragraphs[slistIndex].index(sent.sentstr) 153 | 154 | i = 0 155 | board = mistakesBoard[nid][slistIndex][sentid] 156 | for mistakesList in board: 157 | if i == 0 and len(mistakesList) == 0 and len(board) > 1: # the 0-th is empty 158 | 
m2AnnotationList.append('A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\n') 159 | i += 1 160 | 161 | for tuple in mistakesList: 162 | m = {} 163 | m['start_par'] = tuple[0] 164 | m['end_par'] = tuple[1] 165 | m['start_off'] = tuple[2] 166 | m['end_off'] = tuple[3] 167 | m['correction'] = tuple[4] 168 | m['type'] = tuple[5] 169 | 170 | if m['start_par'] != slistIndex or \ 171 | m['start_par'] != m['end_par'] or \ 172 | m['start_off'] < sentoffset or \ 173 | m['start_off'] >= sentoffset + len(sent.sentstr) or \ 174 | m['end_off'] sentoffset + len(sent.sentstr): 176 | continue 177 | 178 | wordsoffset = 0 179 | wdstart = 0 180 | 181 | startInWord = 0 182 | headText = '' 183 | endInWord = 0 184 | tailText = '' 185 | 186 | words = sent.getWords() 187 | while wdstart < len(words): 188 | 189 | word = words[wdstart] 190 | nextstart = sent.sentstr.find(word, wordsoffset) 191 | 192 | if nextstart == -1: 193 | # may not find word, due to relpacement 194 | print >> sys.stderr, "Warning in building conll format: can not find word" 195 | print >> sys.stderr, word.encode('utf-8') 196 | wordsoffset += 1 197 | else: 198 | wordsoffset = nextstart 199 | 200 | if wordsoffset >= m['start_off']-sentoffset: 201 | break 202 | elif wordsoffset + len(word) > m['start_off']-sentoffset: 203 | # annotation starts at the middle of a word 204 | startInWord = 1 205 | headText = sent.sentstr[wordsoffset: m['start_off']-sentoffset] 206 | break 207 | 208 | wordsoffset += len(word) 209 | wdstart += 1 210 | 211 | if wdstart == len(words): 212 | print >> sys.stderr, 'Warning in building conll format: start_off overflow' 213 | print >> sys.stderr, m, sent.sentstr.encode('utf-8') 214 | continue 215 | 216 | 217 | wdend = wdstart 218 | while wdend < len(words): 219 | 220 | word = words[wdend] 221 | 222 | nextstart = sent.sentstr.find(word, wordsoffset) 223 | 224 | if nextstart == -1: 225 | print >> sys.stderr, "Warning in building conll format: can not find word" 226 | print >> sys.stderr, word.encode('utf-8') 227 | wordsoffset += 1 228 | else: 229 | wordsoffset = nextstart 230 | 231 | if wordsoffset >= m['end_off']-sentoffset: 232 | # annotation ends at the middle of a word 233 | if wordsoffset - len(words[wdend-1]) - 1 < m['end_off']-sentoffset: 234 | endInWord = 1 235 | tailText = sent.sentstr[m['end_off']-sentoffset : wordsoffset].strip() 236 | break 237 | 238 | wordsoffset += len(word) 239 | wdend += 1 240 | 241 | 242 | correctionTokenizedStr = tokenizeCorrectionStr(headText + m['correction'] + tailText, wdstart, wdend, words) 243 | correctionTokenizedStr, wdstart, wdend = shrinkCorrectionStr(correctionTokenizedStr, wdstart, wdend, words) 244 | 245 | token_start = wdstart if m['type'] != 'noop' else -1 246 | token_end = wdend if m['type'] != 'noop' else -1 247 | correction_final = correctionTokenizedStr.replace('\n', '') if m['type'] != 'noop' else '-NONE-' 248 | 249 | # build annotation string for .conll.m2 file 250 | m2AnnotationStr = 'A ' 251 | m2AnnotationStr += str(token_start) + ' ' 252 | m2AnnotationStr += str(token_end) + '|||' 253 | m2AnnotationStr += m['type'] + '|||' 254 | m2AnnotationStr += correction_final + '|||' 255 | m2AnnotationStr += 'REQUIRED|||-NONE-|||' + str(i) + '\n' 256 | 257 | m2AnnotationList.append(m2AnnotationStr) 258 | 259 | if len(mistakesList) > 0: # only if mistakeList contains tuples 260 | i += 1 261 | 262 | # write .conll.m2 file 263 | m2AnnotationSent = 'S ' + tokenizedSentStr + '\n' 264 | m2AnnotationSent += ''.join(m2AnnotationList) + '\n' 265 | fm2.write(m2AnnotationSent.encode('utf-8')) 
266 | 267 | fm2.close() 268 | 269 | 270 | def tokenizeCorrectionStr(correctionStr, wdstart, wdend, words): 271 | 272 | correctionTokenizedStr = '' 273 | pseudoSent = correctionStr 274 | 275 | if wdstart != 0: 276 | pseudoSent = words[wdstart-1] + ' ' + pseudoSent 277 | 278 | if wdend < len(words) - 1: 279 | pseudoSent = pseudoSent + ' ' + words[wdend] 280 | elif wdend == len(words) - 1: 281 | pseudoSent = pseudoSent + words[wdend] 282 | 283 | 284 | pseudoWordsList = [] 285 | sentList = sentenceTokenizer.tokenize(pseudoSent) 286 | for sent in sentList: 287 | pseudoWordsList += word_tokenize(sent) 288 | 289 | start = 0 290 | if wdstart != 0: 291 | s = '' 292 | for i in xrange(len(pseudoWordsList)): 293 | s += pseudoWordsList[i] 294 | if s == words[wdstart-1]: 295 | start = i + 1 296 | break 297 | if start == 0: 298 | print >> sys.stderr, 'Can not find words[wdstart-1]' 299 | 300 | else: 301 | start = 0 302 | 303 | end = len(pseudoWordsList) 304 | if wdend != len(words): 305 | 306 | s = '' 307 | for i in xrange(len(pseudoWordsList)): 308 | s = pseudoWordsList[len(pseudoWordsList) - i - 1] + s 309 | if s == words[wdend]: 310 | end = len(pseudoWordsList) - i - 1 311 | break 312 | if end == len(pseudoWordsList): 313 | print >> sys.stderr, 'Can not find words[wdend]' 314 | 315 | else: 316 | end = len(pseudoWordsList) 317 | 318 | correctionTokenizedStr = ' '.join(pseudoWordsList[start:end]) 319 | 320 | return correctionTokenizedStr 321 | 322 | 323 | def shrinkCorrectionStr(correctionTokenizedStr, wdstart, wdend, words): 324 | 325 | correctionWords = correctionTokenizedStr.split(' ') 326 | originalWords = words[wdstart: wdend] 327 | wdstartNew = wdstart 328 | wdendNew = wdend 329 | cstart = 0 330 | cend = len(correctionWords) 331 | 332 | i = 0 333 | while i < len(originalWords) and i < len(correctionWords): 334 | if correctionWords[i] == originalWords[i]: 335 | i += 1 336 | wdstartNew = i + wdstart 337 | cstart = i 338 | else: 339 | break 340 | 341 | i = 1 342 | while i <= len(originalWords) - cstart and i <= len(correctionWords) - cstart: 343 | if correctionWords[len(correctionWords)-i] == originalWords[len(originalWords)-i]: 344 | wdendNew = wdend - i 345 | cend = len(correctionWords) - i 346 | i += 1 347 | else: 348 | break 349 | 350 | return ' '.join(correctionWords[cstart:cend]), wdstartNew, wdendNew 351 | 352 | if __name__ == '__main__': 353 | 354 | ''' usage: 355 | 356 | %python preprocesswithalt.py completesgmlfile mainsgmlfile alternativesgmlfile1 ... alternativesgmlfileN combinedm2file 357 | output an m2 file containing a collection of the main annotation and all alternative annotations. 
358 | 359 | In most cases completesgmlfile and mainsgmlfile are identical 360 | ''' 361 | 362 | # Load original complete SGML for reference 363 | origDocs = sentenceSplit(readNUCLE(sys.argv[1])) 364 | 365 | origDocSet = {} 366 | for doc in origDocs: 367 | nid = int(doc.docattrs[0][1]) 368 | origDocSet[nid] = doc 369 | 370 | docsList = [] 371 | for i in range(2, len(sys.argv) - 1): 372 | docs = sentenceSplit(readNUCLE(sys.argv[i])) 373 | docsList.append(docs) 374 | 375 | board = {} 376 | for docs in docsList: 377 | board = moderateAnnotations(docs, board, origDocSet) 378 | 379 | createM2File(origDocs, board, sys.argv[len(sys.argv)-1]) 380 | 381 | pass 382 | 383 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #!/usr/bin/env python 3 | 4 | from model import * 5 | from preprocess import * 6 | from config import Config 7 | 8 | 9 | def evaluate(input_variable, len_inputs): 10 | batch_size, input_length = input_variable.size() 11 | 12 | # Run through encoder 13 | encoder_hidden = encoder.init_hidden(batch_size) 14 | encoder_outputs, encoder_hidden = encoder(input_variable, len_inputs, encoder_hidden) 15 | 16 | # Create starting vectors for decoder 17 | decoder_input = Variable(torch.LongTensor([[SOS_token] for _ in range(batch_size)])) # SOS 18 | decoder_context = Variable(torch.zeros(batch_size, decoder.hidden_size)) 19 | decoder_hidden = encoder_hidden 20 | if Config.use_cuda: 21 | decoder_input = decoder_input.cuda() 22 | decoder_context = decoder_context.cuda() 23 | 24 | decoded_output = torch.zeros(batch_size, Config.max_seq_length, out=torch.LongTensor(batch_size, Config.max_seq_length)) 25 | decoder_attentions = torch.zeros(batch_size, input_length, input_length) 26 | 27 | # Run through decoder 28 | for di in range(Config.max_seq_length): 29 | decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_context, 30 | decoder_hidden, encoder_outputs) 31 | # decoder_attentions[:, di, :decoder_attention.size(2)] += decoder_attention.squeeze(0).squeeze(0).cpu().data 32 | 33 | # Choose top word from output 34 | _, top_index = decoder_output.data.topk(1) 35 | 36 | decoded_output[:, di] = top_index 37 | 38 | # Next input is chosen word 39 | decoder_input = Variable(top_index) 40 | if Config.use_cuda: decoder_input = decoder_input.cuda() 41 | 42 | return decoded_output #, decoder_attentions[:, di + 1, :len(encoder_outputs)] 43 | 44 | 45 | def corpus_bleu_single_ref(r, h): 46 | from nltk.translate.bleu_score import corpus_bleu 47 | r = np.expand_dims(r, axis=1) 48 | return corpus_bleu(r, h) 49 | 50 | 51 | def corpus_wer(r, h): 52 | from utils import wer 53 | return np.mean(map(lambda (a, b): wer(a, b), zip(r, h))) 54 | 55 | 56 | def eval_examples(sources, preds, targets, num=3): 57 | str = '' 58 | for i in range(num): 59 | source = word_dict.indexes_to_sentence(sources[i]) 60 | pred = word_dict.indexes_to_sentence(preds[i]) 61 | target = word_dict.indexes_to_sentence(targets[i]) 62 | str += '#{}\nSource:\t{}\nPred:\t{}\nTarget:\t{}\n\n'.format(i, source, pred, target) 63 | return str 64 | 65 | _, eval_corpus, word_dict = build_corpus() 66 | encoder, decoder = get_model(word_dict.n_words) 67 | 68 | inputs, targets, len_inputs, _ = eval_corpus.next_batch(100) 69 | input_variable = Variable(torch.LongTensor(inputs), requires_grad=False) 70 | if Config.use_cuda: 71 | input_variable = input_variable.cuda() 72 
| 73 | output_tensor = evaluate(input_variable, len_inputs) 74 | preds = output_tensor.cpu().numpy().tolist() 75 | 76 | print('\nWER:{}\nBLEU:{}\n'.format(corpus_wer(targets, inputs), corpus_bleu_single_ref(targets, inputs))) 77 | print('\nWER:{}\nBLEU:{}\n'.format(corpus_wer(targets, preds), corpus_bleu_single_ref(targets, preds))) 78 | print('\n{}'.format(eval_examples(inputs, preds, targets))) -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #!/usr/bin/env python 3 | 4 | import glob 5 | import os 6 | 7 | import torch.optim as optim 8 | 9 | from seq2seq.seq2seq import * 10 | 11 | 12 | def save_state(encoder, decoder, encoder_optim, decoder_optim, step, path='checkpoints/model'): 13 | state = {'step': step, 14 | 'encoder': encoder.state_dict(), 15 | 'decoder': decoder.state_dict(), 16 | 'encoder_optim': encoder_optim.state_dict(), 17 | 'decoder_optim': decoder_optim.state_dict()} 18 | filename = path + '-' + str(step) 19 | torch.save(state, filename) 20 | 21 | 22 | def load_state(step=None, path='checkpoints/model'): 23 | state = {} 24 | file_list = glob.glob(path + '*') 25 | if file_list: 26 | if step: 27 | filename = path + '-' + str(step) 28 | else: 29 | filename = max(file_list, key=os.path.getctime) 30 | 31 | state = torch.load(filename) 32 | return state 33 | 34 | 35 | def get_model(n_classes, state=None, step=None, load=True): 36 | encoder = EncoderRNN(n_classes, hidden_size, n_layers) 37 | decoder = AttnDecoderRNN(attn_model, hidden_size, n_classes, n_layers, dropout_p=dropout_p) 38 | if Config.use_cuda: 39 | encoder.cuda() 40 | decoder.cuda() 41 | 42 | if load: 43 | if not state: 44 | state = load_state(step) 45 | if state: 46 | encoder.load_state_dict(state['encoder']) 47 | decoder.load_state_dict(state['decoder']) 48 | 49 | return encoder, decoder 50 | 51 | 52 | def get_optimizer(encoder, decoder, step=None, state=None, lr=0.0001): 53 | encoder_optimizer = optim.Adam(encoder.parameters(), lr=lr) 54 | decoder_optimizer = optim.Adam(decoder.parameters(), lr=lr) 55 | 56 | if not state: 57 | state = load_state(step) 58 | if state: 59 | encoder_optimizer.load_state_dict(state['encoder_optim']) 60 | decoder_optimizer.load_state_dict(state['decoder_optim']) 61 | 62 | return encoder_optimizer, decoder_optimizer -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #!/usr/bin/env python 3 | 4 | import random 5 | import numpy as np 6 | from config import Config 7 | 8 | PAD_token = 0 9 | SOS_token = 1 10 | EOS_token = 2 11 | 12 | 13 | class WordDict: 14 | def __init__(self): 15 | self.word2index = {} 16 | self.word2count = {} 17 | self.index2word = {PAD_token: "", SOS_token: "", EOS_token: ""} 18 | self.n_words = 3 # Count PAD, SOS and EOS 19 | 20 | def add_indexes(self, sentence): 21 | for word in sentence.split(' '): 22 | self.add_index(word) 23 | 24 | def add_index(self, word): 25 | if word not in self.word2index: 26 | self.word2index[word] = self.n_words 27 | self.word2count[word] = 1 28 | self.index2word[self.n_words] = word 29 | self.n_words += 1 30 | else: 31 | self.word2count[word] += 1 32 | 33 | def sentence_to_indexes(self, sentence, max_length): 34 | indexes = [self.word2index[word] for word in sentence.split(' ')][:max_length - 1] 35 | indexes.append(EOS_token) 36 | 
n_indexes = len(indexes) 37 | indexes.extend([PAD_token for _ in range(max_length - len(indexes))]) 38 | return indexes, n_indexes 39 | 40 | def indexes_to_sentence(self, indexes): 41 | indexes = filter(lambda i: i != PAD_token, indexes) 42 | indexes = map(lambda i: self.index2word[i], indexes) 43 | return ' '.join(indexes) 44 | 45 | 46 | class Corpus: 47 | def __init__(self, dict, max_length, path): 48 | self.max_length = max_length 49 | self.lines = self.filter_raw_string(open(path).read()).split('\n') 50 | self.pairs = [[s for s in l.split('\t')] for l in self.lines] 51 | self.dict = dict 52 | for pair in self.pairs: 53 | self.dict.add_indexes(pair[0]) 54 | self.dict.add_indexes(pair[1]) 55 | 56 | def filter_raw_string(self, str): 57 | return str.strip().translate(None, '<>') 58 | 59 | def next_batch(self, batch_size=100): 60 | pairs = np.array(random.sample(self.pairs, batch_size)) 61 | input_lens = [self.dict.sentence_to_indexes(s, self.max_length) for s in pairs[:, 0]] 62 | target_lens = [self.dict.sentence_to_indexes(s, self.max_length) for s in pairs[:, 1]] 63 | input_lens, target_lens = zip(*sorted(zip(input_lens, target_lens), key=lambda p: p[0][1], reverse=True)) 64 | inputs = map(lambda i: i[0], input_lens) 65 | len_inputs = map(lambda i: i[1], input_lens) 66 | targets = map(lambda i: i[0], target_lens) 67 | len_targets = map(lambda i: i[1], target_lens) 68 | return inputs, targets, len_inputs, len_targets 69 | 70 | 71 | def build_corpus(): 72 | word_dict = WordDict() 73 | train_corpus = Corpus(word_dict, Config.max_seq_length, Config.train_data_path) 74 | eval_corpus = Corpus(word_dict, Config.max_seq_length, Config.eval_data_path) 75 | return train_corpus, eval_corpus, word_dict -------------------------------------------------------------------------------- /seq2seq/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andabi/deep-text-corrector/69bd711e65cc42364becba5efd99b8d4f8ab0aab/seq2seq/__init__.py -------------------------------------------------------------------------------- /seq2seq/seq2seq.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #!/usr/bin/env python 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | from torch.nn.utils.rnn import pack_padded_sequence as pack, pad_packed_sequence as unpack 9 | from config import Config 10 | 11 | attn_model = 'general' 12 | hidden_size = 500 13 | n_layers = 2 14 | dropout_p = 0.05 15 | 16 | SOS_token = 0 17 | EOS_token = 1 18 | 19 | 20 | class EncoderRNN(nn.Module): 21 | def __init__(self, input_size, hidden_size, n_layers=1): 22 | super(EncoderRNN, self).__init__() 23 | 24 | self.input_size = input_size 25 | self.hidden_size = hidden_size 26 | self.n_layers = n_layers 27 | 28 | self.embedding = nn.Embedding(input_size, hidden_size) 29 | self.gru = nn.GRU(hidden_size, hidden_size, n_layers, batch_first=True) 30 | 31 | def init_hidden(self, batch_size): 32 | hidden = Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size)) 33 | if Config.use_cuda: hidden = hidden.cuda() 34 | return hidden 35 | 36 | def forward(self, input_seq, len_inputs, hidden): 37 | # input_seq.size() = (B, S), hidden.size() = (L, B, H), embedded.size() = (B, S, H), output.size() = (B, S, H) 38 | # batch_size, seq_len = input_sequence.size() 39 | embedded = self.embedding(input_seq) 40 | output, hidden = self.gru(pack(embedded, 
len_inputs, batch_first=True), hidden) 41 | output, _ = unpack(output, batch_first=True) 42 | return output, hidden 43 | 44 | 45 | class AttnDecoderRNN(nn.Module): 46 | def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout_p=0.1): 47 | super(AttnDecoderRNN, self).__init__() 48 | 49 | # Keep parameters for reference 50 | self.attn_model = attn_model 51 | self.hidden_size = hidden_size 52 | self.output_size = output_size 53 | self.n_layers = n_layers 54 | self.dropout_p = dropout_p 55 | 56 | # Define layers 57 | self.embedding = nn.Embedding(output_size, hidden_size) 58 | self.gru = nn.GRU(hidden_size * 2, hidden_size, n_layers, dropout=dropout_p, batch_first=True) 59 | self.out = nn.Linear(hidden_size * 2, output_size) 60 | 61 | # Choose attention model 62 | if attn_model != 'none': 63 | self.attn = Attn(attn_model, hidden_size) 64 | 65 | def forward(self, input, last_context, last_hidden, encoder_outputs): 66 | # input.size() = (B, 1), last_context.size() = (B, H), last_hidden.size() = (L, B, H), encoder_outputs.size() = (B, S, H) 67 | # word_embedded.size() = (B, 1, H) 68 | # print input.size() 69 | word_embedded = self.embedding(input) 70 | 71 | # rnn_input.size() = (B, 1, 2H), rnn_output.size() = (B, 1, H) 72 | # print word_embedded.size(), last_context.unsqueeze(1).size() 73 | rnn_input = torch.cat((word_embedded, last_context.unsqueeze(1)), -1) 74 | rnn_output, hidden = self.gru(rnn_input, last_hidden) 75 | rnn_output = rnn_output.squeeze(1) # B x S=1 x H -> B x H 76 | 77 | # atten_weights.size() = (B, S) 78 | attn_weights = self.attn(rnn_output, encoder_outputs) 79 | context = attn_weights.unsqueeze(1).bmm(encoder_outputs).squeeze(1) # B x H 80 | 81 | # TODO tanh? 82 | # Final output layer (next word prediction) using the RNN hidden state and context vector 83 | output = self.out(torch.cat((rnn_output, context), -1)) # B x V 84 | 85 | # Return final output, hidden state, and attention weights (for visualization) 86 | # output.size() = (B, V) 87 | return output, context, hidden, attn_weights 88 | 89 | 90 | class Attn(nn.Module): 91 | def __init__(self, method, hidden_size): 92 | super(Attn, self).__init__() 93 | 94 | self.method = method 95 | self.hidden_size = hidden_size 96 | 97 | if self.method == 'general': 98 | self.attn = nn.Linear(self.hidden_size, hidden_size) 99 | 100 | # elif self.method == 'concat': 101 | # self.attn = nn.Linear(self.hidden_size * 2, hidden_size) 102 | # self.other = nn.Parameter(torch.FloatTensor(1, hidden_size)) 103 | 104 | def forward(self, hidden, encoder_outputs): 105 | # hidden.size() = (B, H), encoder_outputs.size() = (B, S, H) 106 | batch_size, encoder_outputs_len, _ = encoder_outputs.size() 107 | 108 | # Create variable to store attention energies 109 | # attn_energies.size() = (B, S) 110 | attn_energies = Variable(torch.zeros((batch_size, encoder_outputs_len))) # B x S 111 | if Config.use_cuda: attn_energies = attn_energies.cuda() 112 | 113 | # Calculate energies for each encoder output 114 | # attn_energies.size() = (B, S) 115 | for i in range(encoder_outputs_len): 116 | attn_energies[:, i] = self.score(hidden, encoder_outputs[:, i]) 117 | # print attn_energies[:, i] 118 | 119 | # Normalize energies to weights in range 0 to 1 120 | return F.softmax(attn_energies) 121 | 122 | def score(self, hidden, encoder_output): 123 | 124 | # print hidden.size(), encoder_output.size() 125 | if self.method == 'dot': 126 | energy = hidden.unsqueeze(1).bmm(encoder_output.unsqueeze(2)) # dot product 127 | return energy 128 | 129 | elif 
self.method == 'general': 130 | energy = self.attn(encoder_output) 131 | energy = hidden.unsqueeze(1).bmm(energy.unsqueeze(2)) 132 | return energy 133 | 134 | # TODO 135 | # elif self.method == 'concat': 136 | # energy = self.attn(torch.cat((hidden, encoder_output), -1)) 137 | # energy = self.other.unsqueeze(1).bmm(energy.unsqueeze(2)) 138 | # return energy 139 | 140 | 141 | # class BahdanauAttnDecoderRNN(nn.Module): 142 | # def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=0.1): 143 | # super(AttnDecoderRNN, self).__init__() 144 | # 145 | # # Define parameters 146 | # self.hidden_size = hidden_size 147 | # self.output_size = output_size 148 | # self.n_layers = n_layers 149 | # self.dropout_p = dropout_p 150 | # self.max_length = max_length 151 | # 152 | # # Define layers 153 | # self.embedding = nn.Embedding(output_size, hidden_size) 154 | # self.dropout = nn.Dropout(dropout_p) 155 | # self.attn = GeneralAttn(hidden_size) 156 | # self.gru = nn.GRU(hidden_size * 2, hidden_size, n_layers, dropout=dropout_p) 157 | # self.out = nn.Linear(hidden_size, output_size) 158 | # 159 | # def forward(self, word_input, last_hidden, encoder_outputs): 160 | # # Note that we will only be running forward for a single decoder time step, but will use all encoder outputs 161 | # 162 | # # Get the embedding of the current input word (last output word) 163 | # word_embedded = self.embedding(word_input).view(1, 1, -1) # S=1 x B x N 164 | # word_embedded = self.dropout(word_embedded) 165 | # 166 | # # Calculate attention weights and apply to encoder outputs 167 | # attn_weights = self.attn(last_hidden[-1], encoder_outputs) 168 | # context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x 1 x N 169 | # 170 | # # Combine embedded input word and attended context, run through RNN 171 | # rnn_input = torch.cat((word_embedded, context), 2) 172 | # output, hidden = self.gru(rnn_input, last_hidden) 173 | # 174 | # # Final output layer 175 | # output = output.squeeze(0) # B x N 176 | # output = F.log_softmax(self.out(torch.cat((output, context), 1))) 177 | # 178 | # # Return final output, hidden state, and attention weights (for visualization) 179 | # return output, hidden, attn_weights -------------------------------------------------------------------------------- /seq2seq/seq2seq_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #!/usr/bin/env python 3 | 4 | from seq2seq import * 5 | 6 | encoder_test = EncoderRNN(10, 10, 2) 7 | decoder_test = AttnDecoderRNN('general', 10, 10, 2) 8 | print(encoder_test) 9 | print(decoder_test) 10 | 11 | encoder_hidden = encoder_test.init_hidden() 12 | word_input = Variable(torch.LongTensor([1, 2, 3])) 13 | if Config.use_cuda: 14 | encoder_test.cuda() 15 | word_input = word_input.cuda() 16 | encoder_outputs, encoder_hidden = encoder_test(word_input, encoder_hidden) 17 | 18 | word_inputs = Variable(torch.LongTensor([1, 2, 3])) 19 | decoder_attns = torch.zeros(1, 3, 3) 20 | decoder_hidden = encoder_hidden 21 | decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size)) 22 | 23 | if Config.use_cuda: 24 | decoder_test.cuda() 25 | word_inputs = word_inputs.cuda() 26 | decoder_context = decoder_context.cuda() 27 | 28 | for i in range(3): 29 | decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[i], decoder_context, decoder_hidden, encoder_outputs) 30 | print(decoder_output.size(), decoder_hidden.size(), decoder_attn.size()) 31 | decoder_attns[0, i] = 
decoder_attn.squeeze(0).cpu().data -------------------------------------------------------------------------------- /tensorboard_logger.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #!/usr/bin/env python 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | import scipy.misc 7 | 8 | try: 9 | from StringIO import StringIO # Python 2.7 10 | except ImportError: 11 | from io import BytesIO # Python 3.x 12 | 13 | 14 | class Logger(object): 15 | def __init__(self, log_dir): 16 | """Create a summary writer logging to log_dir.""" 17 | self.writer = tf.summary.FileWriter(log_dir) 18 | 19 | def scalar_summary(self, tag, value, step): 20 | """Log a scalar variable.""" 21 | summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]) 22 | self.writer.add_summary(summary, step) 23 | 24 | def image_summary(self, tag, images, step): 25 | """Log a list of images.""" 26 | 27 | img_summaries = [] 28 | for i, img in enumerate(images): 29 | # Write the image to a string 30 | try: 31 | s = StringIO() 32 | except: 33 | s = BytesIO() 34 | scipy.misc.toimage(img).save(s, format="png") 35 | 36 | # Create an Image object 37 | img_sum = tf.Summary.Image(encoded_image_string=s.getvalue(), 38 | height=img.shape[0], 39 | width=img.shape[1]) 40 | # Create a Summary value 41 | img_summaries.append(tf.Summary.Value(tag='%s/%d' % (tag, i), image=img_sum)) 42 | 43 | # Create and write Summary 44 | summary = tf.Summary(value=img_summaries) 45 | self.writer.add_summary(summary, step) 46 | 47 | def histo_summary(self, tag, values, step, bins=1000): 48 | """Log a histogram of the tensor of values.""" 49 | 50 | # Create a histogram using numpy 51 | counts, bin_edges = np.histogram(values, bins=bins) 52 | 53 | # Fill the fields of the histogram proto 54 | hist = tf.HistogramProto() 55 | hist.min = float(np.min(values)) 56 | hist.max = float(np.max(values)) 57 | hist.num = int(np.prod(values.shape)) 58 | hist.sum = float(np.sum(values)) 59 | hist.sum_squares = float(np.sum(values ** 2)) 60 | 61 | # Drop the start of the first bin 62 | bin_edges = bin_edges[1:] 63 | 64 | # Add bin edges and counts 65 | for edge in bin_edges: 66 | hist.bucket_limit.append(edge) 67 | for c in counts: 68 | hist.bucket.append(c) 69 | 70 | # Create and write Summary 71 | summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)]) 72 | self.writer.add_summary(summary, step) 73 | self.writer.flush() -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #!/usr/bin/env python 3 | 4 | import torch.optim 5 | from torch.nn.utils.rnn import pad_packed_sequence as unpack 6 | 7 | from model import * 8 | from preprocess import * 9 | from utils import * 10 | from tensorboard_logger import Logger 11 | 12 | final_steps = 50000 13 | print_every = 1 14 | save_every = 500 15 | learning_rate = 0.0001 16 | teacher_forcing_ratio = 0.5 17 | clip = 5.0 18 | 19 | 20 | def sequence_mask(sequence_length, max_len=None): 21 | if max_len is None: 22 | max_len = sequence_length.data.max() 23 | batch_size = sequence_length.size(0) 24 | seq_range = torch.range(0, max_len - 1).long() 25 | seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) 26 | seq_range_expand = Variable(seq_range_expand) 27 | if sequence_length.is_cuda: 28 | seq_range_expand = seq_range_expand.cuda() 29 | seq_length_expand = 
(sequence_length.unsqueeze(1) 30 | .expand_as(seq_range_expand)) 31 | return seq_range_expand < seq_length_expand 32 | 33 | 34 | # logits: (B, S, V) 35 | # targets: (B, S) 36 | # lengths: (B,) 37 | def masked_cross_entropy(logits, targets, lengths): 38 | batch_size, seq_len, n_classes = logits.size() 39 | assert (batch_size, seq_len) == targets.size() 40 | 41 | # mask = Variable(torch.LongTensor([[1 for _ in range(l)] for l in lengths.data])) 42 | # mask = mask.resize_as(targets) 43 | mask = sequence_mask(sequence_length=lengths, max_len=targets.size(1)) 44 | 45 | # logits_flat: (batch * max_len, num_classes) 46 | logits_flat = logits.view(-1, logits.size(-1)) 47 | # log_probs_flat: (batch * max_len, num_classes) 48 | log_probs_flat = F.log_softmax(logits_flat) 49 | # target_flat: (batch * max_len, 1) 50 | target_flat = targets.view(-1, 1) 51 | # losses_flat: (batch * max_len, 1) 52 | losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat) 53 | # losses: (batch, max_len) 54 | losses = losses_flat.view(*targets.size()) * mask.float() 55 | return losses.sum() / lengths.float().sum() 56 | 57 | 58 | def train(input_batch, len_inputs, target_batch, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion): 59 | # Zero gradients of both optimizers 60 | encoder_optimizer.zero_grad() 61 | decoder_optimizer.zero_grad() 62 | 63 | # Get size of input and target sentences 64 | # batch_size, input_length = input_batch.size() 65 | batch_size, target_length = target_batch.size() 66 | 67 | # TODO: receive the target lengths as a parameter (padded/packed sequence) and delete the line below 68 | length_targets = Variable(torch.LongTensor(map(lambda s: len(s), target_batch))).cuda() 69 | 70 | # Run words through encoder 71 | encoder_hidden = encoder.init_hidden(batch_size) 72 | encoder_outputs, encoder_hidden = encoder(input_batch, len_inputs, encoder_hidden) 73 | 74 | # Prepare input and output variables 75 | decoder_input = Variable(torch.LongTensor([[SOS_token] for _ in range(batch_size)])) 76 | decoder_context = Variable(torch.zeros(batch_size, decoder.hidden_size)) 77 | decoder_hidden = encoder_hidden # Use last hidden state from encoder to start decoder 78 | decoder_outputs = Variable(torch.FloatTensor(batch_size, target_length, decoder.output_size).zero_()) 79 | 80 | if Config.use_cuda: 81 | decoder_input = decoder_input.cuda() 82 | decoder_context = decoder_context.cuda() 83 | decoder_outputs = decoder_outputs.cuda() 84 | 85 | # Choose whether to use teacher forcing 86 | if random.random() < teacher_forcing_ratio: 87 | # Teacher forcing: Use the ground-truth target as the next input 88 | for di in range(target_length): 89 | decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_context, 90 | decoder_hidden, 91 | encoder_outputs) 92 | decoder_outputs[:, di] = decoder_output 93 | decoder_input = target_batch[:, di].unsqueeze(1) # Next target is next input 94 | else: 95 | for di in range(target_length): 96 | decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_context, 97 | decoder_hidden, 98 | encoder_outputs) 99 | decoder_outputs[:, di] = decoder_output 100 | # Get most likely word index (highest value) from output 101 | _, top_index = decoder_output.data.topk(1) 102 | decoder_input = Variable(top_index) # Chosen word is next input 103 | if Config.use_cuda: decoder_input = decoder_input.cuda() 104 | 105 | # Stop at end of sentence (not necessary when using known targets) 106 | # TODO 107 | # if ni == EOS_token: break 108 | 109 | loss
= masked_cross_entropy(decoder_outputs, target_batch, length_targets) 110 | 111 | # Backpropagation 112 | loss.backward() 113 | torch.nn.utils.clip_grad_norm(encoder.parameters(), clip) 114 | torch.nn.utils.clip_grad_norm(decoder.parameters(), clip) 115 | encoder_optimizer.step() 116 | decoder_optimizer.step() 117 | 118 | return loss.data[0] / target_length 119 | 120 | 121 | # Get train corpus and word_dict 122 | train_corpus, _, word_dict = build_corpus() 123 | 124 | # Build models, optimizers and load states 125 | state = load_state() 126 | step = 1 127 | if state: 128 | step = state['step'] + 1 129 | encoder, decoder = get_model(word_dict.n_words, state=state) 130 | encoder_optimizer, decoder_optimizer = get_optimizer(encoder, decoder, lr=learning_rate, state=state) 131 | 132 | # Define loss function 133 | criterion = nn.NLLLoss() 134 | 135 | # Keep track of time elapsed and running averages 136 | start = time.time() 137 | 138 | # Set configuration for using Tensorboard 139 | logger = Logger('graphs') 140 | 141 | for step in range(step, final_steps + 1): 142 | 143 | # Get training data for this cycle 144 | inputs, targets, len_inputs, len_targets = train_corpus.next_batch() 145 | input_variable = Variable(torch.LongTensor(inputs), requires_grad=False) 146 | target_variable = Variable(torch.LongTensor(targets), requires_grad=False) 147 | 148 | if Config.use_cuda: 149 | input_variable = input_variable.cuda() 150 | target_variable = target_variable.cuda() 151 | 152 | # Run the train function 153 | loss = train(input_variable, len_inputs, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion) 154 | 155 | # Keep track of loss 156 | logger.scalar_summary('loss', loss, step) 157 | 158 | if step % print_every == 0: 159 | print('%s: %s (%d %d%%)' % (step, time_since(start, 1. 
* step / final_steps), step, step / final_steps * 100)) 160 | 161 | if step % save_every == 0: 162 | save_state(encoder, decoder, encoder_optimizer, decoder_optimizer, step) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #!/usr/bin/env python 3 | 4 | import math 5 | import time 6 | import numpy as np 7 | 8 | 9 | def as_minutes(s): 10 | m = math.floor(s / 60) 11 | s -= m * 60 12 | return '%dm %ds' % (m, s) 13 | 14 | 15 | def time_since(since, percent): 16 | s = now() - since 17 | es = s / (percent) 18 | rs = es - s 19 | return '%s (- %s)' % (as_minutes(s), as_minutes(rs)) 20 | 21 | 22 | def now(): 23 | return time.time() 24 | 25 | 26 | # r = reference, h = hypothesis 27 | def wer(r, h): 28 | # initialisation 29 | d = np.zeros((len(r)+1)*(len(h)+1), dtype=np.uint8) 30 | d = d.reshape((len(r)+1, len(h)+1)) 31 | for i in range(len(r)+1): 32 | for j in range(len(h)+1): 33 | if i == 0: 34 | d[0][j] = j 35 | elif j == 0: 36 | d[i][0] = i 37 | 38 | # computation 39 | for i in range(1, len(r)+1): 40 | for j in range(1, len(h)+1): 41 | if r[i-1] == h[j-1]: 42 | d[i][j] = d[i-1][j-1] 43 | else: 44 | substitution = d[i-1][j-1] + 1 45 | insertion = d[i][j-1] + 1 46 | deletion = d[i-1][j] + 1 47 | d[i][j] = min(substitution, insertion, deletion) 48 | 49 | return d[len(r)][len(h)] / float(len(r)) 50 | 51 | 52 | # def show_attention(input_sentence, output_words, attentions): 53 | # # Set up figure with colorbar 54 | # fig = plt.figure() 55 | # ax = fig.add_subplot(111) 56 | # cax = ax.matshow(attentions.numpy(), cmap='bone') 57 | # fig.colorbar(cax) 58 | # 59 | # # Set up axes 60 | # ax.set_xticklabels([''] + input_sentence.split(' ') + [''], rotation=90) 61 | # ax.set_yticklabels([''] + output_words) 62 | # 63 | # # Show label at every tick 64 | # ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) 65 | # ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) 66 | # 67 | # show_plot_visdom() 68 | # plt.show() 69 | # plt.close() 70 | # 71 | # 72 | # def evaluate_and_show_attention(input_sentence, target_sentence=None): 73 | # output_words, attentions = evaluate(input_sentence) 74 | # output_sentence = ' '.join(output_words) 75 | # print('>', input_sentence) 76 | # if target_sentence is not None: 77 | # print('=', target_sentence) 78 | # print('<', output_sentence) 79 | # 80 | # show_attention(input_sentence, output_words, attentions) 81 | # 82 | # # Show input, target, output text in visdom 83 | # win = 'evaluted (%s)' % hostname 84 | # text = '
<br>> %s<br>= %s<br>< %s<br>' % (input_sentence, target_sentence, output_sentence) 85 | # vis.text(text, win=win, opts={'title': win}) 86 | --------------------------------------------------------------------------------
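
For readers tracing the `Attn` class in seq2seq/seq2seq.py, the sketch below shows what its per-position loop computes for the 'general' method, vectorised over all encoder positions. It is a standalone illustration with made-up shapes and local names (`decoder_state`, `W`, etc. are not the repo's API), not the project's implementation.

```python
import torch
import torch.nn.functional as F

B, S, H = 2, 4, 8                       # batch size, source length, hidden size
decoder_state = torch.randn(B, H)       # plays the role of rnn_output in AttnDecoderRNN
encoder_outputs = torch.randn(B, S, H)  # encoder output for every source position
W = torch.nn.Linear(H, H)               # corresponds to self.attn in Attn('general', H)

# energy[b, s] = decoder_state[b] . (W encoder_outputs[b, s]), then softmax over s
energies = torch.einsum('bh,bsh->bs', decoder_state, W(encoder_outputs))
weights = F.softmax(energies, dim=1)    # (B, S): attention distribution over source tokens
context = torch.bmm(weights.unsqueeze(1), encoder_outputs).squeeze(1)  # (B, H) context vector
print(weights.sum(dim=1))               # each row sums to 1
```

The context vector produced this way is what `AttnDecoderRNN.forward` concatenates with the GRU output before the final linear layer.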
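
The key idea in train.py's `masked_cross_entropy` is that loss terms at padded target positions are zeroed out before averaging over the true token count. The snippet below is a self-contained sketch of that idea on a toy batch; it deliberately does not import train.py (which starts a training run at import time), and all names in it are local to the example.

```python
import torch
import torch.nn.functional as F

def masked_ce(logits, targets, lengths):
    # logits: (B, S, V), targets: (B, S), lengths: (B,) true lengths before padding
    B, S, V = logits.size()
    positions = torch.arange(S).unsqueeze(0).expand(B, S)
    mask = positions < lengths.unsqueeze(1)             # True for real tokens, False for PAD
    log_probs = F.log_softmax(logits.view(-1, V), dim=-1)
    nll = -log_probs.gather(1, targets.view(-1, 1)).view(B, S)
    return (nll * mask.float()).sum() / lengths.float().sum()

# Toy batch: two sequences with true lengths 3 and 1, padded to length 3.
logits = torch.randn(2, 3, 5)
targets = torch.tensor([[1, 2, 3], [4, 0, 0]])
lengths = torch.tensor([3, 1])
print(masked_ce(logits, targets, lengths))  # padded positions contribute nothing to the loss
```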
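
utils.py's `wer` computes word error rate as edit distance over reference length, given two token lists. A quick usage check (run from the repository root; utils.py has no import-time side effects):

```python
from utils import wer

reference = "he goes to school every day".split()
hypothesis = "he go to the school every day".split()
print(wer(reference, hypothesis))  # 2 edits (1 substitution + 1 insertion) / 6 reference words = 0.33...
```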