├── ntumc
│   ├── corpus
│   │   └── __init__.py
│   ├── __init__.py
│   ├── toolkit
│   │   ├── __init__.py
│   │   ├── jpn.py
│   │   ├── ind.py
│   │   ├── preprocess.py
│   │   ├── kor.py
│   │   ├── vie.py
│   │   ├── cmn.py
│   │   ├── nafer.py
│   │   ├── minisegmenter.py
│   │   └── gale_church.py
│   └── external
│       └── mini-segmenter
│           └── minisegmenter.py
└── README.md

/ntumc/corpus/__init__.py:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ntumc/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import toolkit
#import corpus
--------------------------------------------------------------------------------
/ntumc/toolkit/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from preprocess import chinese, japanese, korean, vietnamese
--------------------------------------------------------------------------------
/ntumc/toolkit/jpn.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os, io, subprocess
import sys; reload(sys); sys.setdefaultencoding('utf-8')

from operator import itemgetter

class Mecab():
    def __init__(self):
        pass

    def tokenize(self, text, batch=False):
        # Batch mode is not supported here; the keyword is accepted so that
        # preprocess.tokenize() can call every wrapper uniformly.
        cmd = unicode("".join(['echo "', text.decode('utf8'),
                               '" | mecab -O wakati']))
        return os.popen(cmd).read().strip().decode('utf8').split()

    def pos_tag(self, text, batch=False):
        if isinstance(text, list):
            text = " ".join(text).decode('utf8')
        cmd = unicode("".join(['echo "', text,
                               '" | mecab -Ochasen']))
        # Keep the surface form (field 0) and the POS (field 3) of each
        # ChaSen-format row; the final line is MeCab's EOS marker, hence [:-1].
        return [itemgetter(0,3)(unicode(i.strip()).split())
                for i in os.popen(cmd).readlines()[:-1]]
--------------------------------------------------------------------------------
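A minimal usage sketch for the wrapper above (Python 2, assuming MeCab and its default IPADIC dictionary are installed and `mecab` is on the PATH; the outputs shown are illustrative and depend on the installed dictionary):

```python
# -*- coding: utf-8 -*-
# Hypothetical usage of the Mecab wrapper; output depends on the dictionary.
from ntumc.toolkit.jpn import Mecab

mecab = Mecab()
tokens = mecab.tokenize("太郎は花子を見た。")
print " ".join(tokens)              # e.g. 太郎 は 花子 を 見 た 。
for surface, pos in mecab.pos_tag(tokens):
    print surface, pos              # e.g. 太郎 名詞-固有名詞-人名-名
```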
/ntumc/toolkit/ind.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os, io, subprocess
import sys; reload(sys); sys.setdefaultencoding('utf-8')

from nltk import word_tokenize

class Indotag():
    def __init__(self):
        # Path to the CRF++ `crf_test` binary (not a directory, despite the name).
        self.crf_dir = '/home/alvas/git/NTU-MC/ntumc/external/CRF++-0.58/crf_test'

    def tokenize(self, text, batch=False):
        return word_tokenize(text)

    def pos_tag(self, text, batch=False):
        if isinstance(text, list):
            text = " ".join(text).strip()
        # Write the text to a temp file.
        os.popen("".join(['echo "', text, '" > tmp.txt']))
        # Reshape to one token per line with a blank line after each sentence,
        # which is the input format CRF++ expects.
        os.popen("sed '$ !s/$/\\n/;s/ /\\n/g' tmp.txt > tmp.crf.txt")
        # Skip CRF++'s blank sentence-separator lines in the output.
        return [tuple(line.strip().split('\t')) for line in
                os.popen(" ".join([self.crf_dir,
                                   '-m model.id tmp.crf.txt'])).readlines()
                if line.strip()]
--------------------------------------------------------------------------------
/ntumc/toolkit/preprocess.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os, io, subprocess, time
import sys; reload(sys); sys.setdefaultencoding('utf-8')

from nltk.tokenize import word_tokenize
from nltk import pos_tag as nltk_pos_tag

import cmn, kor, jpn, vie, ind

chinese = cmn.StanfordNLP()
korean = kor.Postech()
japanese = jpn.Mecab()
vietnamese = vie.Jvntextpro()
indonesian = ind.Indotag()

lang2lib = {'jpn':japanese, 'cmn':chinese,
            'vie':vietnamese, 'kor':korean,
            'ind':indonesian}

def tokenize(text, lang, batch=False):
    if lang in ['eng', 'ind']:
        # Return a token list, consistent with the other languages.
        return word_tokenize(text)
    elif lang in lang2lib:
        return lang2lib[lang].tokenize(text, batch=batch)
    else:
        return text.split()

def pos_tag(text, lang, batch=False):
    if lang == 'eng':
        return nltk_pos_tag(word_tokenize(text))
    if lang in lang2lib:
        return lang2lib[lang].pos_tag(text, batch=batch)
--------------------------------------------------------------------------------
/ntumc/toolkit/kor.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os, io, subprocess, time
import sys; reload(sys); sys.setdefaultencoding('utf-8')

class Postech():
    def __init__(self, sejong_dir='/home/alvas/git/NTU-MC/ntumc/external/sejong/'):
        self.sejong_dir = sejong_dir

    def utf8_to_euck(self, text):
        # The Sejong tagger only reads EUC-KR, so characters outside that
        # charset are mapped to ASCII look-alikes first.
        text = unicode(text.decode('utf-8'))
        text = text.replace(u'\xa0', u' ')
        text = text.replace(u'\xe7', u'c')    # ç -> c
        text = text.replace(u'\xe9', u'e')    # é -> e
        text = text.replace(u'\u2013', u'-')  # – -> -
        text = text.replace(u'\xa9', u'(c)')  # © -> (c)
        return text.encode('euc-kr').strip()

    def sejong(self, text):
        text = self.utf8_to_euck(text)
        sejong_dir = self.sejong_dir
        with io.open(sejong_dir+'input.txt', 'wb') as fout:
            fout.write(text)

        cmd = "".join(['wine start /Unix ', sejong_dir, 'sjTaggerInteg.exe'])
        os.popen(cmd)
        # `wine start` returns immediately; crude wait for the tagger to finish.
        time.sleep(2)

        with io.open(sejong_dir+'output.txt', 'r', encoding='euc-kr') as fin:
            sejongtext = fin.read().strip().encode('utf8').decode('utf8')

        return sejongtext

    def tokenize(self, text, batch=False):
        sejongtext = self.sejong(text)
        text = " ".join([i.split(r'/')[0] for i in sejongtext.split()])
        return text.decode('utf8').split()

    def pos_tag(self, text, batch=False):
        if isinstance(text, list):
            text = " ".join(text)
        sejongtext = self.sejong(text)
        tagged_text = [tuple(i.split(r'/')) for i in sejongtext.split()]
        return tagged_text

    def batch_pos_tag(self, text):
        pass
--------------------------------------------------------------------------------
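`preprocess.py` is the toolkit's façade: `tokenize()` and `pos_tag()` dispatch on ISO 639-3 language codes to the wrappers in this package. A sketch of the intended call pattern (it only runs if the external tools are installed at the hard-coded paths in cmn.py, kor.py, vie.py and ind.py):

```python
# -*- coding: utf-8 -*-
# Sketch of driving the dispatcher; every non-English language shells out
# to an external tool, so the hard-coded install paths above must exist.
from ntumc.toolkit import preprocess

print preprocess.tokenize("This is a test.", "eng")    # NLTK word_tokenize
print preprocess.tokenize(u"我在新加坡读书", "cmn")    # Stanford segmenter
print preprocess.pos_tag("anjing itu lapar", "ind")    # CRF++ Indonesian tagger
print preprocess.pos_tag(u"太郎は学生です", "jpn")     # MeCab/ChaSen tags
```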
/ntumc/toolkit/vie.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os, io, subprocess
import sys; reload(sys); sys.setdefaultencoding('utf-8')

from nltk.tokenize import word_tokenize

class Jvntextpro():
    def __init__(self, jvn_dir='/home/alvas/JVnTextPro-v.2.0/'):
        self.jvn_dir = jvn_dir

        self.segmenter_cmd = "".join(['java -mx512M -cp ',
                                      jvn_dir, '/bin:',
                                      jvn_dir + '/libs/args4j.jar:',
                                      jvn_dir + '/libs/lbfgs.jar ',
                                      'jvnsegmenter.WordSegmenting ',
                                      '-modeldir ', jvn_dir,
                                      '/models/jvnsegmenter ',
                                      '-inputfile tmp.txt',
                                      ' -outputfile tmp.txt.wseg'])

        self.tagger_cmd = "".join(['java -mx512M -cp ',
                                   jvn_dir, '/bin:',
                                   jvn_dir + '/libs/args4j.jar:',
                                   jvn_dir + '/libs/lbfgs.jar ',
                                   'jvnpostag.POSTagging ',
                                   '-tagger maxent ',
                                   '-modeldir ', jvn_dir,
                                   '/models/jvnpostag/maxent ',
                                   '-inputfile tmp.txt',
                                   ' -outputfile tmp.txt.pos'])

    def tokenize(self, text, batch=False):
        # Write the text to a temp file.
        os.popen("".join(['echo "', text, '" > tmp.txt']))
        # Run the segmenter.
        os.popen(self.segmenter_cmd)
        # Read from the output file.
        text = io.open('tmp.txt.wseg', 'r', encoding='utf8').read().strip()
        return word_tokenize(text)

    def pos_tag(self, text, batch=False):
        # Join pre-tokenized input back into a string; a plain string passes
        # through unchanged (joining a string would iterate its characters).
        if isinstance(text, list):
            text = " ".join(text)
        # Write the text to a temp file.
        os.popen("".join(['echo "', text, '" > tmp.txt']))
        # Run the tagger.
        os.popen(self.tagger_cmd)
        # Read from the output file.
        jvntext = io.open('tmp.txt.pos', 'r', encoding='utf8').read().strip()
        tagged_text = [tuple(i.split(r'/')) for i in jvntext.split()]

        return tagged_text
--------------------------------------------------------------------------------
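Every wrapper in this toolkit hands text to the shell via `echo "..." > tmp.txt`, which silently breaks on double quotes, backticks and `$` in the input. A quoting-safe sketch (the `write_tmp` helper is hypothetical, not part of the toolkit): write the temp file directly, then invoke the external command unchanged:

```python
# -*- coding: utf-8 -*-
# Hypothetical quoting-safe replacement for os.popen('echo "..." > tmp.txt'):
# write tmp.txt directly so shell metacharacters in the text survive intact.
import io
import subprocess

def write_tmp(text, path='tmp.txt'):
    with io.open(path, 'w', encoding='utf8') as fout:
        fout.write(unicode(text) + u'\n')

write_tmp(u'Anh ấy nói: "xin chào!"')  # quotes survive intact
# Then run the external tool as before, e.g.:
# subprocess.call(Jvntextpro().segmenter_cmd, shell=True)
```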
/README.md:
--------------------------------------------------------------------------------
NTU-MC
======

This is a legacy repository for the STB subcorpora of the Nanyang Technological University - Multilingual Corpus (NTU-MC) project. New editions of the NTU-MC are maintained by the [NTU Computational Linguistics Lab](http://compling.hss.ntu.edu.sg/ntumc/).


Spin-offs
========

* [NTU-MC Toolkit](http://www.aclweb.org/anthology/C/C14/C14-2019.pdf): An annotation toolkit for multilingual text (supports Arabic, Chinese, Japanese, Korean, Indonesian, Vietnamese and English)
* [GaChalign](https://github.com/alvations/gachalign): A Python implementation of the Gale-Church sentence-level aligner with variable parameters
* [Mini-segmenter](https://code.google.com/p/mini-segmenter/): A dictionary-based Chinese segmenter
* [Indotag](): An implementation of the Pisceldo et al. (2010) Indonesian part-of-speech tagger, trained on the 1M-word corpus from the Pan Asia Networking Localization Project


Changelog
==========

* NTU-MC v5.1 (26.08.14): Added the NTU-MC Toolkit.
* [NTU-MC v5.0](https://drive.google.com/drive/folders/1ResffV1GXLCK6Dc-0ZxFeBRS9CtnMS71?usp=sharing) (29.04.13): Better cleaning, with titles.
* NTU-MC v4.1 (08.04.13): Scheduled release.
* NTU-MC v4.0 (27.01.13): Re-cleaned and re-tagged from scratch.
* NTU-MC v3.0 (01.05.12): Scheduled release for IJALP.
* NTU-MC v2.0 (20.08.11): Cleaned and sentence-aligned.
* NTU-MC v1.0 (01.05.11): Foundation text.


References
==========

Please cite the following when using the data/scripts from the NTU-MC:

```
@inproceedings{ntumc2011,
  author    = {Liling Tan and Francis Bond},
  title     = {Building and Annotating the Linguistically Diverse NTU-MC
               (NTU-Multilingual Corpus)},
  booktitle = {PACLIC},
  year      = {2011},
  pages     = {362-371},
  ee        = {http://www.aclweb.org/anthology/Y11-1038},
}
```

* Liling Tan. 2011. [Building the foundation text for Nanyang Technological University - Multilingual Corpus (NTU-MC)](http://dr.ntu.edu.sg/bitstream/handle/10220/7790/Liling%20Tan.pdf). Bachelor's Final Year Project. Nanyang Technological University: Singapore.

* Liling Tan and Francis Bond. 2012. [Building and annotating the linguistically diverse NTU-MC (NTU-multilingual corpus)](http://www.colips.org/journal/volume22/22.4.2.NTU-MC%20Tan%20final.pdf). International Journal of Asian Language Processing, 22(4):161–174.

* Liling Tan and Francis Bond. 2014. [NTU-MC Toolkit: Annotating a Linguistically Diverse Corpus](http://www.aclweb.org/anthology/C/C14/C14-2019.pdf). In Proceedings of the 25th International Conference on Computational Linguistics (COLING 2014). Dublin, Ireland.


**Other References**:
--------------------------------------------------------------------------------
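The tree above lists `toolkit/gale_church.py`, whose contents are not included in this dump; the GaChalign spin-off in the README parameterizes the same method. For reference, a minimal sketch of the Gale and Church (1993) length-based match cost (the function names are mine; `c` and `s2` are the paper's defaults, the very "variable parameters" GaChalign exposes):

```python
# -*- coding: utf-8 -*-
# Minimal sketch of the Gale-Church (1993) length-based alignment cost.
# delta measures how far the target length strays from c times the source
# length; the cost is the negative log of the two-sided normal tail.
import math

def norm_cdf(z):
    """Standard normal CDF via the error function."""
    return 0.5 * (1 + math.erf(z / math.sqrt(2)))

def gale_church_cost(len1, len2, c=1.0, s2=6.8):
    """Cost that two segments of len1/len2 characters align; lower is better.
    (Omits the match-type prior term of the full method.)"""
    if len1 == 0 and len2 == 0:
        return 0.0
    delta = (len2 - c * len1) / math.sqrt(max(len1, 1) * s2)
    prob = 2 * (1 - norm_cdf(abs(delta)))
    return -math.log(prob) if prob > 0 else float('inf')

print gale_church_cost(100, 103)   # near-equal lengths -> low cost
print gale_church_cost(100, 250)   # mismatched lengths -> high cost
```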
/ntumc/toolkit/cmn.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os, io, subprocess
import sys; reload(sys); sys.setdefaultencoding('utf-8')

class StanfordNLP():
    def __init__(self,
                 stanford_segdir='/home/alvas/stanford-segmenter-2014-06-16',
                 stanford_posdir='/home/alvas/stanford-postagger-full-2014-06-16'):
        self.stanford_segdir = stanford_segdir
        self.segmenter_cmd = " ".join(["bash", stanford_segdir+'/segment.sh',
                                       "ctb tmp.txt UTF8 0"])

        self.stanford_posdir = stanford_posdir

        self.tagger_cmd = " ".join(['java', '-cp',
                                    stanford_posdir+'/stanford-postagger.jar',
                                    'edu.stanford.nlp.tagger.maxent.MaxentTagger',
                                    '-model', stanford_posdir+'/models/chinese-nodistsim.tagger',
                                    '-textFile tmp.txt'])

    def tokenize(self, text, batch=False):
        # Write the text to a temp file; in batch mode, one sentence per line.
        if batch:
            os.popen("".join(['echo -e "', '\n'.join(text), '" > tmp.txt']))
        else:
            os.popen("".join(['echo "', text, '" > tmp.txt']))
        # Run the segmenter.
        text, err = subprocess.Popen(self.segmenter_cmd,
                                     shell=True, stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE).communicate()
        # Read from the subprocess output.
        text = text.decode('utf8').strip()
        if batch:
            return [i.strip().split() for i in text.split('\n')]
        else:
            return text.split()

    def pos_tag(self, text, batch=False):
        if batch:
            text = [' '.join(i) if isinstance(i, list) else i for i in text]
            os.popen("".join(['echo -e "', '\n'.join(text), '" > tmp.txt']))
        else:
            if isinstance(text, list):
                text = " ".join(text)
            # Write the text to a temp file.
            os.popen("".join(['echo "', text, '" > tmp.txt']))
        # Run the tagger.
        text, err = subprocess.Popen(self.tagger_cmd,
                                     shell=True, stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE).communicate()
        # Read from the subprocess output (already decoded; no second decode).
        text = text.decode('utf8').strip()
        if batch:
            return [[tuple(i.split(r'#')) for i in t.split()]
                    for t in text.split('\n')]
        else:
            return [tuple(i.split(r'#')) for i in text.split()]
--------------------------------------------------------------------------------
/ntumc/toolkit/nafer.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os, io, subprocess, time
import sys; reload(sys); sys.setdefaultencoding('utf-8')

from preprocess import tokenize, pos_tag

def text2naf(text, sentid, thisparaid, wordid):
    """
    Converts a tokenized sentence into the NAF text layer, one <wf>
    element per word, e.g. <wf id="w1" sent="1" para="1">John</wf>.
    (The markup was stripped in this dump; the attribute layout below is
    reconstructed from the variables in scope, assuming NAF conventions.)
    """
    textnaf = []
    stridx = 0
    for wid, word in enumerate(text.split(), start=1):
        line = '\t' + '<wf id="w' + str(wordid+wid) + \
               '" sent="' + str(sentid) + \
               '" para="' + str(thisparaid) + \
               '" offset="' + str(stridx) + \
               '" length="' + str(len(word)) + '">' + word + '</wf>'
        textnaf.append(line)
        stridx += len(word) + 1
    return "\n".join(textnaf), wordid + wid

def term2naf(tokens, tags, wordid):
    # Builds the NAF terms layer: one <term> per token whose <span> points
    # back at the corresponding <wf> in the text layer. (Markup stripped in
    # this dump; reconstructed here under the same NAF assumptions as above.)
    termnaf = []
    wid = 1
    for token, tag in zip(tokens, tags):
        line = '\t<term id="t' + str(wordid+wid) + '" pos="' + tag + '">\n'
        line += '\t\t<span><target id="w' + str(wordid+wid) + '"/></span>\n'
        line += '\t' + '</term>'
        termnaf.append(line)
        wid += 1
    return "\n".join(termnaf)

indir = '/home/alvas/git/NTU-MC/ntumc-v5/subcorpora/yoursing/cleanest/'
langs = os.walk(indir).next()[1]

for lang in langs:
    if lang == "eng" or lang == "cmn":
        continue
    langdir = indir + lang + '/'
    fout = io.open('ntumc-' + lang + '.naf', 'wb')
    for filename in sorted(os.walk(langdir).next()[2]):
        if filename.endswith('~'):
            continue
        webpage = ''
        title = ''
        wordid = 1
        textlayer = []
        textlayer.append('')
        textlayer.append('')

        termlayer = []
        termlayer.append('')
        termlayer.append('')

        lines = []

        for line in io.open(langdir + filename, 'r', encoding='utf8'):
            line = line.strip()
            if line.startswith('#M'):
                webpage = line.split('\t')[1].replace('