├── .gitignore ├── README.md ├── config.py ├── data ├── __init__.py ├── conll14st-test-data │ ├── README │ ├── alt │ │ ├── alternative-teama.sgml │ │ ├── alternative-teamb.sgml │ │ ├── alternative-teamc.sgml │ │ └── official-2014.combined-withalt.m2 │ ├── noalt │ │ ├── official-2014.0.conll.ann │ │ ├── official-2014.0.m2 │ │ ├── official-2014.0.sgml │ │ ├── official-2014.1.conll.ann │ │ ├── official-2014.1.m2 │ │ ├── official-2014.1.sgml │ │ └── official-2014.combined.m2 │ └── scripts │ │ ├── README │ │ ├── iparser.py │ │ ├── nucle_doc.py │ │ ├── nuclesgmlparser.py │ │ ├── parser_feature.py │ │ ├── preprocess.py │ │ ├── preprocesscombine.py │ │ └── preprocesswithalt.py ├── eval.txt ├── release2.3.1 │ ├── README │ ├── m2scorer │ │ ├── LICENSE │ │ ├── README │ │ ├── example │ │ │ ├── README │ │ │ ├── source_gold │ │ │ └── system │ │ ├── m2scorer │ │ └── scripts │ │ │ ├── Tokenizer.py │ │ │ ├── combiner.py │ │ │ ├── convert_hoo.py │ │ │ ├── convert_nucle.py │ │ │ ├── levenshtein.py │ │ │ ├── m2scorer.py │ │ │ ├── nucle_doc.py │ │ │ ├── nuclesgmlparser.py │ │ │ ├── test.sgml │ │ │ ├── token_offsets.py │ │ │ └── util.py │ ├── original │ │ ├── data │ │ │ ├── official-preprocessed.conll │ │ │ ├── official-preprocessed.conll.ann │ │ │ ├── official-preprocessed.m2 │ │ │ └── official.sgml │ │ └── data_5types │ │ │ ├── official-preprocessed.5types.conll.ann │ │ │ ├── official-preprocessed.5types.m2 │ │ │ └── official.5types.sgml │ ├── revised │ │ ├── data │ │ │ ├── official-preprocessed.conll.ann │ │ │ ├── official-preprocessed.m2 │ │ │ └── official.sgml │ │ └── data_5types │ │ │ ├── alternatives.NTHU.sgml │ │ │ ├── alternatives.STEL.sgml │ │ │ ├── alternatives.TOR.sgml │ │ │ ├── alternatives.UIUC.sgml │ │ │ ├── alternatives.UMC.sgml │ │ │ ├── combined.5types.m2 │ │ │ ├── official-preprocessed.5types.conll.ann │ │ │ ├── official-preprocessed.5types.m2 │ │ │ └── official.5types.sgml │ └── scripts │ │ ├── README │ │ ├── iparser.py │ │ ├── nucle_doc.py │ │ ├── nuclesgmlparser.py │ │ ├── parser_feature.py │ │ ├── preprocess.py │ │ └── preprocesswithalt.py └── train.txt ├── eval.py ├── model.py ├── preprocess.py ├── seq2seq ├── __init__.py ├── seq2seq.py └── seq2seq_test.py ├── tensorboard_logger.py ├── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | checkpoints/ 3 | *.pyc 4 | .DS_Store 5 | graphs/ 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Text Corrector 2 | > Work in Progress 3 | ## Introduction 4 | This project aims to build a text corrector for English learners using a deep neural network. It is implemented in PyTorch. 5 | - First trial: seq2seq w/ attention + NUCLE 2.3 dataset 6 | - Next: 7 | - more data, data augmentation, ... 8 | - tweaking nets like ByteNet or Transformer 9 | - beam search 10 | 11 | ## Datasets 12 | [CoNLL-2013 Shared Task: Grammatical Error Correction](http://www.comp.nus.edu.sg/~nlp/conll13st.html) 13 | [Overview Paper](http://www.comp.nus.edu.sg/~nlp/conll13st/CoNLLST01.pdf) 14 | [Datasets](http://www.comp.nus.edu.sg/~nlp/conll13st/release2.3.1.tar.gz) 15 | [Participant Papers](http://aclweb.org/anthology/W/W13/#3600) 16 | 17 | [CoNLL-2014 Shared Task: Grammatical Error Correction](http://www.comp.nus.edu.sg/~nlp/conll14st.html) 18 | [Overview Paper](http://www.comp.nus.edu.sg/~nlp/conll14st.html) 19 | [Datasets] Needs a license form.
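The corrector is trained on (erroneous, corrected) sentence pairs, and the CoNLL/NUCLE releases above distribute their gold annotations in M2 format. Purely as an illustration — this helper is not part of the repository, and its name is made up — one M2 sentence block could be turned into such a pair like this:

```python
# Illustrative sketch (not repository code): build one (source, target) training
# pair from a single sentence block of an M2 file. A block starts with
# "S <tokenized sentence>" followed by "A start end|||type|||correction|||..."
# annotation lines; this sketch assumes a single annotator per block.
def m2_block_to_pair(m2_block):
    lines = [l for l in m2_block.strip().split("\n") if l]
    tokens = lines[0][2:].split()                 # drop the leading "S "
    edits = []
    for line in lines[1:]:
        fields = line[2:].split("|||")            # drop the leading "A "
        start, end = map(int, fields[0].split())
        if fields[1] == "noop":                   # sentence needs no correction
            continue
        edits.append((start, end, fields[2].split()))
    corrected = list(tokens)
    for start, end, repl in sorted(edits, reverse=True):  # apply right-to-left so offsets stay valid
        corrected[start:end] = repl
    return " ".join(tokens), " ".join(corrected)

src, tgt = m2_block_to_pair(
    "S This are a sentence .\n"
    "A 1 2|||Vform|||is|||REQUIRED|||-NONE-|||0"
)
# src == "This are a sentence ."   (model input)
# tgt == "This is a sentence ."    (training target)
```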
20 | 21 | ## References 22 | https://github.com/atpaino/deep-text-corrector 23 | 24 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | class Config: 2 | use_cuda = True 3 | max_seq_length = 100 4 | train_data_path = './data/train.txt' 5 | eval_data_path = './data/eval.txt' -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andabi/deep-text-corrector/69bd711e65cc42364becba5efd99b8d4f8ab0aab/data/__init__.py -------------------------------------------------------------------------------- /data/conll14st-test-data/README: -------------------------------------------------------------------------------- 1 | CoNLL-2014 Official Test Data 2 | Release 3.2 3 | 22 Apr 2014 4 | 5 | This README file describes the test data for the CoNLL-2014 Shared 6 | Task: Grammatical Error Correction. 7 | 8 | The package is distributed freely with the following copyright 9 | 10 | Copyright (C) 2014 Hwee Tou Ng, Siew Mei Wu, Ted Briscoe, 11 | Christian Hadiwinoto, Raymond Hendy Susanto, 12 | Christopher Bryant 13 | 14 | Any questions regarding the test data should be directed to 15 | Hwee Tou Ng at: nght@comp.nus.edu.sg 16 | 17 | 18 | 1. Directory Structure and Contents 19 | =================================== 20 | 21 | The top-level directory has two subdirectories, namely 22 | 23 | - noalt/ : the annotated test data without alternatives contributed 24 | by the participants 25 | - alt/ : the annotated test data with moderated participants' 26 | alternative annotations 27 | - scripts/ : the scripts used to preprocess the test data inside the 28 | two subdirectories 29 | 30 | 31 | 2. Data Format 32 | ============== 33 | 34 | The corpus is distributed in a simple SGML format. All annotations 35 | come in a "stand-off" format. The start position and end position of 36 | an annotation are given by paragraph and character offsets. 37 | Paragraphs are enclosed in
<P> ... </P>
tags. Paragraphs and characters 38 | are counted starting from zero. Each annotation includes the following 39 | fields: the error category, the correction, and optionally a 40 | comment. If the correction replaces the original text at the given 41 | location, it should fix the grammatical error. 42 | 43 | Example: 44 | 45 |48 | People with close blood relationship generally ... 49 |
50 |51 | Focus on the negative side of the annouance ... 52 |
53 | ... 54 |\n.*\n)
') 46 | filestr = p.sub(r'\1
', filestr) 47 | 48 | parser.feed(filestr) 49 | f.close() 50 | parser.close() 51 | 52 | return parser.docs 53 | 54 | def sentenceSplit(docs): 55 | 56 | sentenceTokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 57 | for doc in docs: 58 | for par in doc.paragraphs: 59 | doc.sentences.append([]) 60 | for s in sentenceTokenizer.tokenize(par): 61 | doc.buildSentence(s, [], '', [], []) 62 | return docs 63 | 64 | def compareTwoEditLists(editList1, editList2): 65 | # must be sorted 66 | if editList1 == [] and editList2 == []: 67 | return True 68 | elif editList1 == [] or editList2 == []: 69 | return False 70 | elif getEditKey(editList1[0]) != getEditKey(editList2[0]): 71 | return False 72 | else: 73 | return compareTwoEditLists(editList1[1:], editList2[1:]) 74 | 75 | def moderateAnnotations(contestDocs, annotBoard, origDocSet): 76 | # moderate annotation in "contesting" docs with already stated mistakes 77 | #mistakeStrSet = {} 78 | for doc in contestDocs: 79 | #mistakeStr = '' 80 | nid = int(doc.docattrs[0][1]) # nid of current document 81 | tid = doc.annotation[0][0][1] # teacher id 82 | 83 | if not annotBoard.has_key(nid): # create placeholder 84 | annotBoard[nid] = {} 85 | 86 | origDoc = origDocSet[nid] 87 | for pid in xrange(len(origDoc.sentences)): 88 | slist = origDoc.sentences[pid] 89 | if not annotBoard[nid].has_key(pid): 90 | annotBoard[nid][pid] = {} 91 | for sentid in xrange(len(slist)): 92 | sent = slist[sentid] 93 | if not annotBoard[nid][pid].has_key(sentid): 94 | annotBoard[nid][pid][sentid] = [] 95 | editSet = [] 96 | 97 | # enumerate mistakes 98 | sentoffset = origDoc.paragraphs[pid].index(sent.sentstr) 99 | editNum = 0 100 | for m in doc.mistakes: 101 | if m['start_par'] != pid or \ 102 | m['start_par'] != m['end_par'] or \ 103 | m['start_off'] < sentoffset or \ 104 | m['start_off'] >= sentoffset + len(sent.sentstr) or \ 105 | m['end_off']\n.*\n)
') 46 | filestr = p.sub(r'\1
', filestr) 47 | 48 | parser.feed(filestr) 49 | f.close() 50 | parser.close() 51 | 52 | return parser.docs 53 | 54 | def sentenceSplit(docs): 55 | 56 | sentenceTokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 57 | for doc in docs: 58 | for par in doc.paragraphs: 59 | doc.sentences.append([]) 60 | for s in sentenceTokenizer.tokenize(par): 61 | doc.buildSentence(s, [], '', [], []) 62 | return docs 63 | 64 | def compareTwoEditLists(editList1, editList2): 65 | # must be sorted 66 | if editList1 == [] and editList2 == []: 67 | return True 68 | elif editList1 == [] or editList2 == []: 69 | return False 70 | elif getEditKey(editList1[0]) != getEditKey(editList2[0]): 71 | return False 72 | else: 73 | return compareTwoEditLists(editList1[1:], editList2[1:]) 74 | 75 | def moderateAnnotations(contestDocs, annotBoard, origDocSet): 76 | # moderate annotation in "contesting" docs with already stated mistakes 77 | #mistakeStrSet = {} 78 | for doc in contestDocs: 79 | #mistakeStr = '' 80 | nid = int(doc.docattrs[0][1]) # nid of current document 81 | tid = doc.annotation[0][0][1] # teacher id 82 | 83 | if not annotBoard.has_key(nid): # create placeholder 84 | annotBoard[nid] = {} 85 | 86 | origDoc = origDocSet[nid] 87 | for pid in xrange(len(origDoc.sentences)): 88 | slist = origDoc.sentences[pid] 89 | if not annotBoard[nid].has_key(pid): 90 | annotBoard[nid][pid] = {} 91 | for sentid in xrange(len(slist)): 92 | sent = slist[sentid] 93 | if not annotBoard[nid][pid].has_key(sentid): 94 | annotBoard[nid][pid][sentid] = [] 95 | editSet = [] 96 | 97 | # enumerate mistakes 98 | sentoffset = origDoc.paragraphs[pid].index(sent.sentstr) 99 | editNum = 0 100 | for m in doc.mistakes: 101 | if m['start_par'] != pid or \ 102 | m['start_par'] != m['end_par'] or \ 103 | m['start_off'] < sentoffset or \ 104 | m['start_off'] >= sentoffset + len(sent.sentstr) or \ 105 | m['end_off']...
tags. Paragraphs and characters 41 | are counted starting from zero. Each annotation includes the following 42 | fields: the error category, the correction, and optionally a 43 | comment. If the correction replaces the original text at the given 44 | location, it should fix the grammatical error. 45 | 46 | Example: 47 | 48 |51 | In modern digital world, ... 52 |
53 |54 | Surveillance technology such as ... 55 |
56 | ... 57 |\n.*\n)
') 43 | filestr = p.sub(r'\1
', filestr) 44 | 45 | parser.feed(filestr) 46 | f.close() 47 | parser.close() 48 | 49 | return parser.docs 50 | 51 | 52 | def sentenceSplit(self, docs): 53 | 54 | for doc in docs: 55 | for par in doc.paragraphs: 56 | doc.sentences.append([]) 57 | for s in self.sentenceTokenizer.tokenize(par): 58 | doc.buildSentence(s, [], '', [], []) 59 | return docs 60 | 61 | 62 | def m2FileGeneration(self, docs): 63 | 64 | for doc in docs: 65 | for slistIndex in xrange(len(doc.sentences)): 66 | slist = doc.sentences[slistIndex] 67 | for sentid in xrange(len(slist)): 68 | 69 | sent = slist[sentid] 70 | 71 | # annotation string list 72 | annotationList = [] 73 | 74 | # m2 format annotation string list 75 | m2AnnotationList = [] 76 | 77 | # build colums 78 | table = sent.getConllFormat(doc, slistIndex, sentid) 79 | tokenizedSentStr = ' '.join(sent.getWords()) 80 | 81 | #Add annotation info 82 | sentoffset = doc.paragraphs[slistIndex].index(sent.sentstr) 83 | for m in doc.mistakes: 84 | 85 | if m['start_par'] != slistIndex or \ 86 | m['start_par'] != m['end_par'] or \ 87 | m['start_off'] < sentoffset or \ 88 | m['start_off'] >= sentoffset + len(sent.sentstr) or \ 89 | m['end_off']7 | Humans have many basic needs and one of them is to have an environment that can sustain their lives. Our current population is 6 billion people and it is still growing exponentially. This will, if not already, caused problems as there are very limited spaces for us. The solution can be obtain by using technology to achieve a better usage of space that we have and resolve the problems in lands that inhospitable such as desserts and swamps. 8 |
9 |10 | Some countries are having difficulties in managing a place to live for their citizen as they tend to get overpopulated. This caused problem like the appearance of slums which most of the time is not safe due to the unhealthy environment. The only way to satisfy the increasing demands of space is by achieving a better usage of the land like designing taller building so it can accommodate more number of people with the same spaces. It is also important to create a better material that can support the buildings despite any natural disaster like earthquakes. A good example is Japan where there are a lot of tall condominiums despite the large number of earthquakes happening in there. Besides a better usage of lands, a better sanitation is also needed because a huge number of people need a clean environment to maintain their heath. For example, countries in Africa can accommodate more people if they can manage to design a better sanitation system. 11 |
12 |13 | Countries with a lot of inhospitable space need not only to achieve a better space usage, but also to reforms the land to make it livable and technology can help it in a number of ways depending on the trouble the lands have. For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert. Dubai will be a good example for this as previously the country got almost no natural water and they use irrigation to bring natural water to the country. Another example is Netherlands, whose most of his lands is a swamp under sea level, have used a good irrigation system to counter their problem and to make their land habitable. 14 |
15 |16 | As the number of people grows, the need of habitable environment is unquestionably essential. In this era, Engineering designs can help to provide more habitable accommodation by designing a stronger material so it's possible to create a taller and safer building, a better and efficient sanitation system to prevent disease, and also by designing a way to change the condition of the inhabitable environment. 17 |
18 |\n.*\n)
') 46 | filestr = p.sub(r'\1
', filestr) 47 | 48 | parser.feed(filestr) 49 | f.close() 50 | parser.close() 51 | 52 | return parser.docs 53 | 54 | def sentenceSplit(docs): 55 | 56 | sentenceTokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 57 | for doc in docs: 58 | for par in doc.paragraphs: 59 | doc.sentences.append([]) 60 | for s in sentenceTokenizer.tokenize(par): 61 | doc.buildSentence(s, [], '', [], []) 62 | return docs 63 | 64 | def compareTwoEditLists(editList1, editList2): 65 | # must be sorted 66 | if editList1 == [] and editList2 == []: 67 | return True 68 | elif editList1 == [] or editList2 == []: 69 | return False 70 | elif getEditKey(editList1[0]) != getEditKey(editList2[0]): 71 | return False 72 | else: 73 | return compareTwoEditLists(editList1[1:], editList2[1:]) 74 | 75 | def moderateAnnotations(contestDocs, annotBoard, origDocSet): 76 | # moderate annotation in "contesting" docs with already stated mistakes 77 | mistakeStrSet = {} 78 | for doc in contestDocs: 79 | mistakeStr = '' 80 | nid = int(doc.docattrs[0][1]) # nid of current document 81 | tid = doc.annotation[0][0][1] # teacher id 82 | 83 | if not annotBoard.has_key(nid): # create placeholder 84 | annotBoard[nid] = {} 85 | 86 | origDoc = origDocSet[nid] 87 | for pid in xrange(len(origDoc.sentences)): 88 | slist = origDoc.sentences[pid] 89 | if not annotBoard[nid].has_key(pid): 90 | annotBoard[nid][pid] = {} 91 | for sentid in xrange(len(slist)): 92 | sent = slist[sentid] 93 | if not annotBoard[nid][pid].has_key(sentid): 94 | annotBoard[nid][pid][sentid] = [] 95 | editSet = [] 96 | 97 | # enumerate mistakes 98 | sentoffset = origDoc.paragraphs[pid].index(sent.sentstr) 99 | editNum = 0 100 | for m in doc.mistakes: 101 | if m['start_par'] != pid or \ 102 | m['start_par'] != m['end_par'] or \ 103 | m['start_off'] < sentoffset or \ 104 | m['start_off'] >= sentoffset + len(sent.sentstr) or \ 105 | m['end_off']> %s
= %s
< %s
' % (input_sentence, target_sentence, output_sentence) 85 | # vis.text(text, win=win, opts={'title': win}) 86 | --------------------------------------------------------------------------------
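For reference, the README above describes the first-trial model as a sequence-to-sequence network with attention implemented in PyTorch (see model.py and seq2seq/seq2seq.py in the tree). The following is only a minimal sketch of that general architecture — it is not the code in those files, and every name and hyperparameter here is invented for illustration:

```python
# Minimal sketch of a seq2seq corrector with dot-product (Luong-style) attention.
# NOT the model defined in model.py / seq2seq/seq2seq.py; names and sizes are
# illustrative, and teacher forcing / token shifting is simplified away.
import torch
import torch.nn as nn
import torch.nn.functional as F

class AttnSeq2Seq(nn.Module):
    def __init__(self, vocab_size, emb_dim=128, hid_dim=256):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim)
        self.encoder = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.decoder = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.out = nn.Linear(hid_dim * 2, vocab_size)

    def forward(self, src, tgt):
        # src, tgt: (batch, seq_len) tensors of token ids
        enc_out, hidden = self.encoder(self.embed(src))          # (B, S, H)
        dec_out, _ = self.decoder(self.embed(tgt), hidden)       # (B, T, H)
        scores = torch.bmm(dec_out, enc_out.transpose(1, 2))     # (B, T, S) attention scores
        context = torch.bmm(F.softmax(scores, dim=-1), enc_out)  # (B, T, H) context vectors
        return self.out(torch.cat([dec_out, context], dim=-1))   # (B, T, vocab) logits

model = AttnSeq2Seq(vocab_size=10000)
src = torch.randint(0, 10000, (2, 12))  # a batch of 2 "noisy" input sentences
tgt = torch.randint(0, 10000, (2, 12))  # their corrected versions (teacher-forced)
logits = model(src, tgt)
loss = F.cross_entropy(logits.reshape(-1, 10000), tgt.reshape(-1))
```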