├── demo
    ├── demo.src
    ├── demo.trg
    ├── demo.para
    └── demo.m2
├── README.md
└── conv_m2.py


/demo/demo.src:
--------------------------------------------------------------------------------
1 | Because I intend to take examination in Septenber .
2 | 那 时候 人们 没 太 喜欢 生活 这里 。
3 | 


--------------------------------------------------------------------------------
/demo/demo.trg:
--------------------------------------------------------------------------------
1 | This is because I intend to take an examination in September .
2 | 那 时候 人们 不 太 喜欢 这里 的 生活 。
3 | 


--------------------------------------------------------------------------------
/demo/demo.para:
--------------------------------------------------------------------------------
1 | S Because I intend to take examination in Septenber .
2 | T0 This is because I intend to take an examination in September .
3 | 
4 | S 那 时候 人们 没 太 喜欢 生活 这里 。
5 | T0 那 时候 人们 不 太 喜欢 这里 的 生活 。
6 | T1 那 时候 人们 不 太 喜欢 生活 在 这里 。
7 | 
8 | 


--------------------------------------------------------------------------------
/demo/demo.m2:
--------------------------------------------------------------------------------
 1 | S Because I intend to take examination in Septenber .
 2 | A 0 1|||M:OTHER|||This is because|||REQUIRED|||-NONE-|||0
 3 | A 5 5|||M:DET|||an|||REQUIRED|||-NONE-|||0
 4 | A 7 8|||R:SPELL|||September|||REQUIRED|||-NONE-|||0
 5 | 
 6 | S 那 时候 人们 没 太 喜欢 生活 这里 。
 7 | A 3 4|||R:NOUN|||不|||REQUIRED|||-NONE-|||0
 8 | A 6 7|||U:NOUN||||||REQUIRED|||-NONE-|||0
 9 | A 8 8|||M:NOUN|||的 生活|||REQUIRED|||-NONE-|||0
10 | A 3 4|||R:NOUN|||不|||REQUIRED|||-NONE-|||1
11 | A 7 7|||M:NOUN|||在|||REQUIRED|||-NONE-|||1
12 | 
13 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # M2Convertor
 2 | Convert Standard M2 format to parallel sentences.
 3 | For M2 format and its usage details, please see https://github.com/chrisjbryant/errant for more information.
 4 | 
 5 | ## Prerequisites
  6 | Python >= 3.6 (the script uses f-strings)
 7 | 
 8 | ## Usage
 9 | python conv_m2.py -f <m2_file> -p <output_prefix>
10 | 
11 | ## Outputs
12 | `*.src`: file of source sentences
13 | 
14 | `*.trg`: file of target sentences, only editor 0's annotation is contained.
15 | 
16 | `*.para`: file which contains parallel sentences.
17 | 
 18 | > For *.para, lines beginning with "S " are source sentences, while lines beginning with "T0" are target sentences annotated by editor 0; the number next to "T" indicates the editor number in the input m2 file.
19 | 
20 | > You can check the files in demo to understand how it works.
21 | 


--------------------------------------------------------------------------------
/conv_m2.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:UTF-8 -*-
  2 | # @Author: Xuezhi Fang
  3 | # @Date: 2020-06-19
  4 | # @Email: jasonfang3900@gmail.com
  5 | 
  6 | import argparse
  7 | import re
  8 | 
  9 | 
 10 | class M2Processor():
 11 |     def __init__(self, src_sent, edit_lines):
 12 |         self.src_sent = src_sent
 13 |         self.edit_lines = edit_lines
 14 |         self.edits = {}
 15 |         self.trg_sents = []
 16 |         
 17 |     def conv_edit(self, line):
 18 |         line = line.strip().split("|||")
 19 |         edit_span = line[0].split(" ")
 20 |         edit_span = (int(edit_span[0]), int(edit_span[1]))
 21 |         edit_res = line[2]
 22 |         editor = line[-1]
 23 |         if edit_span[0] == -1:
 24 |             return None
 25 |         if edit_span[0] == edit_span[1]:
 26 |             edit_tag = "ADD"
 27 |         elif edit_res == "-NONE-" or edit_res == "":
 28 |             edit_tag = "DEL"
 29 |         else:
 30 |             edit_tag = "REP"
 31 |         return editor, edit_tag, edit_span, edit_res
 32 |     
 33 |     def get_edits(self):
 34 |         for line in self.edit_lines:
 35 |             if line:
 36 |                 edit_item = self.conv_edit(line)
 37 |                 if not edit_item:
 38 |                     continue
 39 |                 editor, edit_tag, edit_span, edit_res = edit_item
 40 |                 if editor not in self.edits:
 41 |                     self.edits[editor] = []
 42 |                 self.edits[editor].append({"span": edit_span, "op": edit_tag, "res": edit_res})
 43 |                 
 44 |     def get_para(self):
 45 |         self.get_edits()
 46 |         if self.edits:
 47 |             for editor in self.edits:
 48 |                 sent = self.src_sent.split(" ")
 49 |                 for edit_item in self.edits[editor]:
 50 |                     edit_span, edit_tag, trg_tokens = edit_item["span"], edit_item["op"], edit_item["res"]
 51 |                     if edit_tag == "DEL":
 52 |                         sent[edit_span[0]:edit_span[1]] = [" " for _ in range(edit_span[1] - edit_span[0])]
 53 |                     else:
 54 |                         if edit_tag == "ADD":
 55 |                             if edit_span[0] != 0:
 56 |                                 sent[edit_span[0]-1] += " " + trg_tokens
 57 |                             else:
 58 |                                 sent[edit_span[0]] = trg_tokens + " " + sent[edit_span[0]]
 59 |                         elif edit_tag == "REP":
 60 |                             src_tokens_len = len(sent[edit_span[0]:edit_span[1]])
 61 |                             sent[edit_span[0]:edit_span[1]] = [trg_tokens] + [" " for _ in range(src_tokens_len-1)]
 62 |                 sent = " ".join(sent).strip()
 63 |                 res_sent = re.sub(" +", " ", sent)
 64 |                 self.trg_sents.append(res_sent)
 65 |             return self.trg_sents
 66 |         else:
 67 |             return [self.src_sent]
 68 | 
 69 |     
 70 | def read_file():
 71 |     src_sent = None
 72 |     edit_lines = []
 73 |     with open(args.f, "r", encoding="utf8") as fr:
 74 |         for line in fr:
 75 |             if line:
 76 |                 line = line.strip()
 77 |                 if line.startswith("S "):
 78 |                     src_sent = line.replace("S ", "", 1)
 79 |                 elif line.startswith("A "):
 80 |                     edit_lines.append(line.replace("A ", "", 1))
 81 |                 elif line == "":
 82 |                     yield src_sent, edit_lines
 83 |                     edit_lines.clear()
 84 | 
 85 | 
 86 | def main():
 87 |     counter = 0
 88 |     fw_src = open(f"{args.p}.src", "w", encoding="utf8")
 89 |     fw_trg = open(f"{args.p}.trg", "w", encoding="utf8")
 90 |     fw_para = open(f"{args.p}.para", "w", encoding="utf8")
 91 |     for src_sent, edit_lines in read_file():
 92 |         counter += 1
 93 |         m2_item = M2Processor(src_sent, edit_lines)
 94 |         trg_sents = m2_item.get_para()
 95 |         fw_para.write(f"S {src_sent}\n")
 96 |         prefix_counter = 0
 97 |         for sent in trg_sents:
 98 |             fw_para.write(f"T{prefix_counter} {sent}\n")
 99 |             prefix_counter += 1
100 |         fw_para.write("\n")
101 |         fw_src.write(src_sent+"\n")
102 |         fw_trg.write(trg_sents[0]+"\n")
103 |     fw_src.close()
104 |     fw_trg.close()
105 |     fw_para.close()
106 |  
107 | 
if __name__ == "__main__":
    # `args` is intentionally a module-level name: read_file() and main()
    # read args.f and args.p as globals.
    parser = argparse.ArgumentParser()
    # Both options are mandatory. Without them the script would try to open
    # None as the input and write files named "None.src" and so on.
    parser.add_argument("-f", required=True, help="m2 file")
    parser.add_argument("-p", required=True, help="output prefix")
    args = parser.parse_args()
    main()
114 | 


--------------------------------------------------------------------------------