├── demo ├── demo.src ├── demo.trg ├── demo.para └── demo.m2 ├── README.md └── conv_m2.py /demo/demo.src: -------------------------------------------------------------------------------- 1 | Because I intend to take examination in Septenber . 2 | 那 时候 人们 没 太 喜欢 生活 这里 。 3 | -------------------------------------------------------------------------------- /demo/demo.trg: -------------------------------------------------------------------------------- 1 | This is because I intend to take an examination in September . 2 | 那 时候 人们 不 太 喜欢 这里 的 生活 。 3 | -------------------------------------------------------------------------------- /demo/demo.para: -------------------------------------------------------------------------------- 1 | S Because I intend to take examination in Septenber . 2 | T0 This is because I intend to take an examination in September . 3 | 4 | S 那 时候 人们 没 太 喜欢 生活 这里 。 5 | T0 那 时候 人们 不 太 喜欢 这里 的 生活 。 6 | T1 那 时候 人们 不 太 喜欢 生活 在 这里 。 7 | 8 | -------------------------------------------------------------------------------- /demo/demo.m2: -------------------------------------------------------------------------------- 1 | S Because I intend to take examination in Septenber . 2 | A 0 1|||M:OTHER|||This is because|||REQUIRED|||-NONE-|||0 3 | A 5 5|||M:DET|||an|||REQUIRED|||-NONE-|||0 4 | A 7 8|||R:SPELL|||September|||REQUIRED|||-NONE-|||0 5 | 6 | S 那 时候 人们 没 太 喜欢 生活 这里 。 7 | A 3 4|||R:NOUN|||不|||REQUIRED|||-NONE-|||0 8 | A 6 7|||U:NOUN||||||REQUIRED|||-NONE-|||0 9 | A 8 8|||M:NOUN|||的 生活|||REQUIRED|||-NONE-|||0 10 | A 3 4|||R:NOUN|||不|||REQUIRED|||-NONE-|||1 11 | A 7 7|||M:NOUN|||在|||REQUIRED|||-NONE-|||1 12 | 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # M2Convertor 2 | Convert Standard M2 format to parallel sentences. 
# -*- coding:UTF-8 -*-
# @Author: Xuezhi Fang
# @Date: 2020-06-19
# @Email: jasonfang3900@gmail.com
"""Convert a standard M2 file into parallel sentences.

For the M2 format and its usage details, see
https://github.com/chrisjbryant/errant.

Prerequisites
    Python >= 3.6 (the script uses f-strings).

Usage
    python conv_m2.py -f <m2_file> -p <output_prefix>

Outputs
    <output_prefix>.src   source sentences, one per line.
    <output_prefix>.trg   target sentences, one per line; only the first
                          annotator's correction (editor 0) is written.
    <output_prefix>.para  parallel sentences. Lines beginning with "S "
                          are source sentences; lines beginning with "T0",
                          "T1", ... are the target sentences produced by
                          each annotator, in the order the annotators
                          appear in the input M2 file. Sentence groups are
                          separated by a blank line.

The files in the ``demo`` directory show the expected input and output.
"""

import argparse
import re

# Runs of spaces left behind by deleted/replaced tokens are collapsed to one.
_MULTI_SPACE = re.compile(" +")


class M2Processor():
    """Apply the annotations of one M2 sentence block to its source sentence.

    Args:
        src_sent: the source sentence (the ``S`` line without its ``S ``
            prefix); tokens are separated by single spaces.
        edit_lines: the annotation lines of the block (the ``A`` lines
            without their ``A `` prefix).
    """

    def __init__(self, src_sent, edit_lines):
        self.src_sent = src_sent
        self.edit_lines = edit_lines
        # editor id (str) -> list of {"span", "op", "res"} dicts
        self.edits = {}
        self.trg_sents = []

    def conv_edit(self, line):
        """Parse one annotation line.

        Returns:
            ``(editor, edit_tag, edit_span, edit_res)`` where ``edit_tag``
            is ``"ADD"``, ``"DEL"`` or ``"REP"``, or ``None`` when the span
            starts at -1 (a "noop" annotation that changes nothing).
        """
        fields = line.strip().split("|||")
        span_tokens = fields[0].split(" ")
        edit_span = (int(span_tokens[0]), int(span_tokens[1]))
        edit_res = fields[2]
        editor = fields[-1]
        if edit_span[0] == -1:
            return None
        if edit_span[0] == edit_span[1]:
            # An empty span means text is inserted before that position.
            edit_tag = "ADD"
        elif edit_res == "-NONE-" or edit_res == "":
            edit_tag = "DEL"
        else:
            edit_tag = "REP"
        return editor, edit_tag, edit_span, edit_res

    def get_edits(self):
        """Group the parsed annotations by editor into ``self.edits``.

        The mapping is rebuilt from scratch on every call, so calling this
        (or ``get_para``) repeatedly does not duplicate edits.
        """
        self.edits = {}
        for line in self.edit_lines:
            if not line:
                continue
            # Register the editor even for a noop annotation: an annotator
            # who judged the sentence correct must still get a target
            # sentence, otherwise later annotators shift into its slot.
            editor = line.strip().split("|||")[-1]
            editor_edits = self.edits.setdefault(editor, [])
            edit_item = self.conv_edit(line)
            if not edit_item:
                continue
            _, edit_tag, edit_span, edit_res = edit_item
            editor_edits.append({"span": edit_span, "op": edit_tag, "res": edit_res})

    def get_para(self):
        """Build one corrected sentence per editor.

        Returns:
            A list of target sentences in the order the editors first
            appear in the annotation lines. When the block has no
            annotation lines at all, the list holds just the source
            sentence.
        """
        self.get_edits()
        self.trg_sents = []
        if not self.edits:
            return [self.src_sent]
        for editor_edits in self.edits.values():
            if not editor_edits:
                # Only noop annotations: the target is the untouched source.
                self.trg_sents.append(self.src_sent)
                continue
            sent = self.src_sent.split(" ")
            for edit_item in editor_edits:
                start, end = edit_item["span"]
                edit_tag, trg_tokens = edit_item["op"], edit_item["res"]
                # Edits are applied in place so that token indices stay
                # aligned with the source: removed tokens become a single
                # space and inserted text is glued onto a neighbouring token.
                if edit_tag == "DEL":
                    sent[start:end] = [" "] * (end - start)
                elif edit_tag == "ADD":
                    if start != 0:
                        sent[start - 1] += " " + trg_tokens
                    else:
                        sent[0] = trg_tokens + " " + sent[0]
                elif edit_tag == "REP":
                    width = len(sent[start:end])
                    sent[start:end] = [trg_tokens] + [" "] * (width - 1)
            joined = " ".join(sent).strip()
            self.trg_sents.append(_MULTI_SPACE.sub(" ", joined))
        return self.trg_sents


def read_file(path=None):
    """Yield ``(src_sent, edit_lines)`` for every sentence block of an M2 file.

    Args:
        path: path of the M2 file. Defaults to the ``-f`` command line
            argument (module global ``args``) when omitted.

    Yields:
        The source sentence without its ``S `` prefix and a fresh list of
        its annotation lines without their ``A `` prefix.

    A block ends at a blank line, at the next ``S`` line, or at the end of
    the file, so the last block is not lost when the file lacks a trailing
    blank line. Repeated blank lines do not produce extra blocks, and
    annotation lines that precede the first ``S`` line are discarded.
    """
    if path is None:
        path = args.f
    src_sent = None
    edit_lines = []
    with open(path, "r", encoding="utf8") as fr:
        for raw_line in fr:
            line = raw_line.strip()
            if line.startswith("S ") or line == "S":
                if src_sent is not None:
                    # Previous block was not terminated by a blank line.
                    yield src_sent, edit_lines
                src_sent = line[2:]
                # A new list per block: callers may keep what was yielded.
                edit_lines = []
            elif line.startswith("A "):
                edit_lines.append(line[2:])
            elif not line and src_sent is not None:
                yield src_sent, edit_lines
                src_sent, edit_lines = None, []
    if src_sent is not None:
        yield src_sent, edit_lines


def main(m2_path=None, out_prefix=None):
    """Convert an M2 file into ``.src``, ``.trg`` and ``.para`` files.

    Args:
        m2_path: path of the input M2 file. Defaults to the ``-f`` command
            line argument (module global ``args``).
        out_prefix: prefix of the three output files. Defaults to the
            ``-p`` command line argument (module global ``args``).
    """
    if m2_path is None:
        m2_path = args.f
    if out_prefix is None:
        out_prefix = args.p
    # The with-statement closes all three files even if conversion fails.
    with open(f"{out_prefix}.src", "w", encoding="utf8") as fw_src, \
            open(f"{out_prefix}.trg", "w", encoding="utf8") as fw_trg, \
            open(f"{out_prefix}.para", "w", encoding="utf8") as fw_para:
        for src_sent, edit_lines in read_file(m2_path):
            trg_sents = M2Processor(src_sent, edit_lines).get_para()
            fw_para.write(f"S {src_sent}\n")
            for index, sent in enumerate(trg_sents):
                fw_para.write(f"T{index} {sent}\n")
            fw_para.write("\n")
            fw_src.write(src_sent + "\n")
            # Only the first annotator's correction goes to the .trg file.
            fw_trg.write(trg_sents[0] + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", required=True, help="m2 file")
    parser.add_argument("-p", required=True, help="output prefix")
    args = parser.parse_args()
    main()