├── ntumc
│   ├── corpus
│   │   └── __init__.py
│   ├── __init__.py
│   ├── toolkit
│   │   ├── __init__.py
│   │   ├── jpn.py
│   │   ├── ind.py
│   │   ├── preprocess.py
│   │   ├── kor.py
│   │   ├── vie.py
│   │   ├── cmn.py
│   │   ├── nafer.py
│   │   ├── minisegmenter.py
│   │   └── gale_church.py
│   └── external
│       └── mini-segmenter
│           └── minisegmenter.py
└── README.md
/ntumc/corpus/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/ntumc/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import toolkit
4 | #import corpus
--------------------------------------------------------------------------------
/ntumc/toolkit/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from preprocess import chinese, japanese, korean, vietnamese
4 |
--------------------------------------------------------------------------------
/ntumc/toolkit/jpn.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os, io, subprocess
4 | import sys; reload(sys); sys.setdefaultencoding('utf-8')
5 |
6 | from operator import itemgetter
7 |
8 | class Mecab():
9 |     def __init__(self):
10 |         pass
11 |     def tokenize(self, text):
12 |         # Pipe the text through MeCab's wakati mode, which prints the
13 |         # sentence back with tokens separated by spaces.
14 |         cmd = unicode("".join(['echo "', text.decode('utf8'),
15 |                                '" | mecab -O wakati']))
16 |         return os.popen(cmd).read().strip().decode('utf8').split()
17 |
18 |     def pos_tag(self, text):
19 |         if isinstance(text, list):
20 |             text = " ".join(text).decode('utf8')
21 |         # ChaSen output is one token per line; field 0 is the surface
22 |         # form and field 3 the part-of-speech tag (the trailing EOS
23 |         # line is dropped).
24 |         cmd = unicode("".join(['echo "', text,
25 |                                '" | mecab -Ochasen']))
26 |         return [itemgetter(0,3)(unicode(i.strip()).split())
27 |                 for i in os.popen(cmd).readlines()[:-1]]
--------------------------------------------------------------------------------
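A minimal usage sketch for the `Mecab` wrapper above, assuming the `ntumc` package is importable and the `mecab` command-line tool (with a dictionary) is installed and on the PATH; the sentence is illustrative:

```python
# -*- coding: utf-8 -*-
# Sketch only: requires the `mecab` binary on the PATH.
from ntumc.toolkit.jpn import Mecab

mecab = Mecab()
tokens = mecab.tokenize('すもももももももものうち')  # list of unicode tokens
tagged = mecab.pos_tag(tokens)                       # list of (surface, POS) pairs
```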
/ntumc/toolkit/ind.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os, io, subprocess
4 | import sys; reload(sys); sys.setdefaultencoding('utf-8')
5 |
6 | from nltk import word_tokenize
7 |
8 | class Indotag():
9 |     def __init__(self):
10 |         self.crf_dir = '/home/alvas/git/NTU-MC/ntumc/external/CRF++-0.58/crf_test'
11 |     def tokenize(self, text):
12 |         return word_tokenize(text)
13 |     def pos_tag(self, text):
14 |         if isinstance(text, list):
15 |             text = " ".join(text).strip()
16 |         # Write the text to a temp file.
17 |         os.popen("".join(['echo "', text, '" > tmp.txt']))
18 |         # Reshape to one token per line, as CRF++ expects.
19 |         os.popen("sed '$ !s/$/\\n/;s/ /\\n/g' tmp.txt > tmp.crf.txt")
20 |         return [tuple(line.strip().split('\t')) for line in
21 |                 os.popen(" ".join([self.crf_dir,
22 |                                    '-m model.id tmp.crf.txt'])).readlines()]
--------------------------------------------------------------------------------
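A usage sketch for `Indotag`, assuming CRF++ is built at the hard-coded `crf_dir` path and the `model.id` model file sits in the working directory:

```python
# Sketch only: assumes crf_test and model.id are available as configured above.
from ntumc.toolkit.ind import Indotag

indotag = Indotag()
tokens = indotag.tokenize('Saya suka makan nasi goreng.')
tagged = indotag.pos_tag(tokens)  # list of (token, tag) tuples from CRF++
```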
/ntumc/toolkit/preprocess.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os, io, subprocess, time
4 | import sys; reload(sys); sys.setdefaultencoding('utf-8')
5 |
6 | from nltk.tokenize import word_tokenize
7 | from nltk import pos_tag as nltk_pos_tag
8 |
9 | import cmn, kor, jpn, vie, ind
10 |
11 | chinese = cmn.StanfordNLP()
12 | korean = kor.Postech()
13 | japanese = jpn.Mecab()
14 | vietnamese = vie.Jvntextpro()
15 | indonesian = ind.Indotag()
16 |
17 | lang2lib = {'jpn':japanese, 'cmn':chinese,
18 |             'vie':vietnamese, 'kor':korean,
19 |             'ind':indonesian}
20 |
21 | def tokenize(text, lang, batch=False):
22 |     if lang in ['eng', 'ind']:
23 |         return " ".join(word_tokenize(text))
24 |     elif lang == 'cmn':
25 |         # Only the Stanford wrapper supports batch mode.
26 |         return lang2lib[lang].tokenize(text, batch=batch)
27 |     elif lang in lang2lib:
28 |         return lang2lib[lang].tokenize(text)
29 |     else:
30 |         return text.split()
31 |
32 | def pos_tag(text, lang, batch=False):
33 |     if lang == 'eng':
34 |         return nltk_pos_tag(word_tokenize(text))
35 |     if lang == 'cmn':
36 |         return lang2lib[lang].pos_tag(text, batch=batch)
37 |     if lang in lang2lib:
38 |         return lang2lib[lang].pos_tag(text)
39 |
--------------------------------------------------------------------------------
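The two functions above dispatch on ISO 639-3 language codes through the `lang2lib` table; a minimal sketch, assuming the external tools for the requested languages are installed:

```python
# Sketch only: each non-English language needs its external tool installed.
from ntumc.toolkit import preprocess

preprocess.tokenize('This is a test.', 'eng')  # space-joined tokens (string)
preprocess.pos_tag('This is a test.', 'eng')   # [(word, tag), ...] via NLTK
preprocess.pos_tag(u'这是一个测试', 'cmn')      # dispatched to the Stanford wrapper
```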
/ntumc/toolkit/kor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os, io, subprocess, time
4 | import sys; reload(sys); sys.setdefaultencoding('utf-8')
5 |
6 | class Postech():
7 |     def __init__(self, sejong_dir='/home/alvas/git/NTU-MC/ntumc/external/sejong/'):
8 |         self.sejong_dir = sejong_dir
9 |
10 |     def utf8_to_euck(self, text):
11 |         # The Sejong tagger only reads EUC-KR, so substitute characters
12 |         # that have no EUC-KR encoding.
13 |         text = unicode(text.decode('utf-8'))
14 |         text = text.replace(u'\xa0', u' ')
15 |         text = text.replace(u'\xe7', u'c') # ç -> c
16 |         text = text.replace(u'\xe9', u'e') # é -> e
17 |         text = text.replace(u'\u2013', u'-') # – -> -
18 |         text = text.replace(u'\xa9', '(c)') # © -> (c)
19 |         return text.encode('euc-kr').strip()
20 |
21 |     def sejong(self, text):
22 |         text = self.utf8_to_euck(text)
23 |         sejong_dir = self.sejong_dir
24 |         with io.open(sejong_dir+'input.txt', 'wb') as fout:
25 |             fout.write(text)
26 |
27 |         # Run the Windows tagger under Wine and give it time to finish.
28 |         cmd = "".join(['wine start /Unix ', sejong_dir, 'sjTaggerInteg.exe'])
29 |         os.popen(cmd)
30 |         time.sleep(2)
31 |
32 |         with io.open(sejong_dir+'output.txt', 'r', encoding='euc-kr') as fin:
33 |             sejongtext = fin.read().strip()
34 |
35 |         return sejongtext
36 |
37 |     def tokenize(self, text):
38 |         sejongtext = self.sejong(text)
39 |         # Drop the /TAG suffix from each token.
40 |         text = " ".join([i.split(r'/')[0] for i in sejongtext.split()])
41 |         return text.split()
42 |
43 |     def pos_tag(self, text):
44 |         if isinstance(text, list):
45 |             text = " ".join(text)
46 |         sejongtext = self.sejong(text)
47 |         tagged_text = [tuple(i.split(r'/')) for i in sejongtext.split()]
48 |         return tagged_text
49 |
50 |     def batch_pos_tag(self, text):
51 |         pass
52 |
--------------------------------------------------------------------------------
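A usage sketch for `Postech`, assuming Wine is installed and the Sejong tagger (`sjTaggerInteg.exe`) lives in the configured `sejong_dir`; the fixed two-second sleep means this wrapper is not safe to call concurrently:

```python
# -*- coding: utf-8 -*-
# Sketch only: needs Wine plus the Sejong tagger at the configured path.
from ntumc.toolkit.kor import Postech

postech = Postech()
tokens = postech.tokenize('나는 밥을 먹었다')
tagged = postech.pos_tag(tokens)  # [(token, TAG), ...] in the Sejong tagset
```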
/ntumc/toolkit/vie.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os, io, subprocess
4 | import sys; reload(sys); sys.setdefaultencoding('utf-8')
5 |
6 | from nltk.tokenize import word_tokenize
7 |
8 | class Jvntextpro():
9 |     def __init__(self, jvn_dir='/home/alvas/JVnTextPro-v.2.0/'):
10 |         self.jvn_dir = jvn_dir
11 |
12 |         self.segmenter_cmd = "".join(['java -mx512M -cp ',
13 |                                       jvn_dir, '/bin:',
14 |                                       jvn_dir + '/libs/args4j.jar:',
15 |                                       jvn_dir + '/libs/lbfgs.jar ',
16 |                                       'jvnsegmenter.WordSegmenting ',
17 |                                       '-modeldir ', jvn_dir,
18 |                                       '/models/jvnsegmenter ',
19 |                                       '-inputfile tmp.txt',
20 |                                       ' -outputfile tmp.txt.wseg'])
21 |
22 |         self.tagger_cmd = "".join(['java -mx512M -cp ',
23 |                                    jvn_dir, '/bin:',
24 |                                    jvn_dir + '/libs/args4j.jar:',
25 |                                    jvn_dir + '/libs/lbfgs.jar ',
26 |                                    'jvnpostag.POSTagging ',
27 |                                    '-tagger maxent ',
28 |                                    '-modeldir ', jvn_dir,
29 |                                    '/models/jvnpostag/maxent ',
30 |                                    '-inputfile tmp.txt',
31 |                                    ' -outputfile tmp.txt.pos'])
32 |
33 |     def tokenize(self, text):
34 |         # Write the text to a temp file.
35 |         os.popen("".join(['echo "', text, '" > tmp.txt']))
36 |         # Run the segmenter.
37 |         os.popen(self.segmenter_cmd)
38 |         # Read from the output file.
39 |         text = io.open('tmp.txt.wseg', 'r', encoding='utf8').read().strip()
40 |         return word_tokenize(text)
41 |
42 |     def pos_tag(self, text):
43 |         if isinstance(text, list):
44 |             text = " ".join(text)
45 |         # Write the text to a temp file.
46 |         os.popen("".join(['echo "', text, '" > tmp.txt']))
47 |         # Run the tagger.
48 |         os.popen(self.tagger_cmd)
49 |         # Read from the output file.
50 |         jvntext = io.open('tmp.txt.pos', 'r', encoding='utf8').read().strip()
51 |         tagged_text = [tuple(i.split(r'/')) for i in jvntext.split()]
52 |
53 |         return tagged_text
--------------------------------------------------------------------------------
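A usage sketch for `Jvntextpro`, assuming JVnTextPro v2.0 is unpacked at the configured `jvn_dir` and `java` is on the PATH; both methods round-trip through `tmp.txt` in the current working directory:

```python
# -*- coding: utf-8 -*-
# Sketch only: assumes a JVnTextPro v2.0 installation at jvn_dir.
from ntumc.toolkit.vie import Jvntextpro

jvn = Jvntextpro()
tokens = jvn.tokenize(u'Tôi là sinh viên đại học')
tagged = jvn.pos_tag(tokens)  # [(word, tag), ...] from the maxent tagger
```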
/README.md:
--------------------------------------------------------------------------------
1 | NTU-MC
2 | ======
3 |
4 | This is a legacy repository for the STB subcorpora of the Nanyang Technological University - Multilingual Corpus (NTU-MC) project. New editions of the NTU-MC are maintained by the [NTU Computational Linguistics Lab](http://compling.hss.ntu.edu.sg/ntumc/).
5 |
6 |
7 | Spin-offs
8 | ========
9 |
10 | * [NTU-MC Toolkit](http://www.aclweb.org/anthology/C/C14/C14-2019.pdf): An annotation toolkit for multilingual text (supports Arabic, Chinese, Japanese, Korean, Indonesian, Vietnamese and English).
11 | * [GaChalign](https://github.com/alvations/gachalign): A Python implementation of the Gale-Church sentence-level aligner with variable parameters (a sketch of its length-based cost follows this README).
12 | * [Mini-segmenter](https://code.google.com/p/mini-segmenter/): A dictionary-based Chinese segmenter.
13 | * [Indotag](): An implementation of the Pisceldo et al. (2010) part-of-speech tagger for Bahasa Indonesia, using a 1M-word corpus from the Pan Asia Networking Localization Project.
14 |
15 |
16 | Changelog
17 | ==========
18 |
19 |
20 | * NTU-MC v5.1 (26.08.14): Added the NTU-MC Toolkit.
21 | * [NTU-MC v5.0](https://drive.google.com/drive/folders/1ResffV1GXLCK6Dc-0ZxFeBRS9CtnMS71?usp=sharing) (29.04.13): Better cleaning with titles.
22 | * NTU-MC v4.1 (08.04.13): Scheduled release.
23 | * NTU-MC v4.0 (27.01.13): Re-cleaned and retagged from scratch.
24 | * NTU-MC v3.0 (01.05.12): Scheduled release for IJALP.
25 | * NTU-MC v2.0 (20.08.11): Cleaned and sentence-aligned.
26 | * NTU-MC v1.0 (01.05.11): Foundation text.
27 |
28 |
29 |
30 | References
31 | ==========
32 |
33 | Please cite the following when using the data/scripts from the NTU-MC:
34 |
35 | ```
36 | @inproceedings{ntumc2011,
37 | author = {Liling Tan and
38 | Francis Bond},
39 | title = {Building and Annotating the Linguistically Diverse NTU-MC
40 | (NTU-Multilingual Corpus)},
41 | booktitle = {PACLIC},
42 | year = {2011},
43 |   pages     = {362--371},
44 | ee = {http://www.aclweb.org/anthology/Y11-1038},
45 | }
46 | ```
47 |
48 | * Liling Tan. 2011. [Building the foundation text for Nanyang Technological University - Multilingual Corpus (NTU-MC)](http://dr.ntu.edu.sg/bitstream/handle/10220/7790/Liling%20Tan.pdf). Bachelor's Final Year Project. Nanyang Technological University: Singapore.
49 |
50 | * Liling Tan and Francis Bond. 2012. [Building and annotating the linguistically diverse NTU-MC (NTU-multilingual corpus)](http://www.colips.org/journal/volume22/22.4.2.NTU-MC%20Tan%20final.pdf). International Journal of Asian Language Processing, 22(4):161–174.
51 |
52 | * Liling Tan and Francis Bond. 2014. [NTU-MC Toolkit: Annotating a Linguistically Diverse Corpus](http://www.aclweb.org/anthology/C/C14/C14-2019.pdf). In Proceedings of the 25th International Conference on Computational Linguistics (COLING 2014). Dublin, Ireland.
53 |
54 |
55 | **Other References**:
56 |
--------------------------------------------------------------------------------
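The GaChalign entry in the README above refers to the Gale-Church length-based alignment cost; here is a minimal sketch of that cost function, using the constants reported in Gale & Church (1993) (GaChalign makes them configurable; the helper names `norm_cdf` and `length_cost` are illustrative, not GaChalign's API):

```python
import math

# Gale-Church length cost: c is the expected target/source length ratio,
# s2 the variance of that ratio (values from Gale & Church, 1993).
C, S2 = 1.0, 6.8

def norm_cdf(z):
    # Standard normal cumulative distribution via the error function.
    return 0.5 * (1 + math.erf(z / math.sqrt(2)))

def length_cost(len1, len2, c=C, s2=S2):
    # Negative log probability that chunks of len1 and len2 characters
    # are mutual translations, judged purely by length.
    if len1 == 0 and len2 == 0:
        return 0.0
    mean = (len1 + len2 / c) / 2.0
    delta = (len2 - len1 * c) / math.sqrt(mean * s2)
    p = 2 * (1 - norm_cdf(abs(delta)))  # two-tailed deviation probability
    return -math.log(p) if p > 0 else float('inf')
```

The aligner itself runs dynamic programming over sentence pairs, adding to this cost a prior for the match type (1-1, 1-0, 2-1, and so on) and picking the cheapest path.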
/ntumc/toolkit/cmn.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os, io, subprocess
4 | import sys; reload(sys); sys.setdefaultencoding('utf-8')
5 |
6 | class StanfordNLP():
7 |     def __init__(self,
8 |                  stanford_segdir='/home/alvas/stanford-segmenter-2014-06-16',
9 |                  stanford_posdir='/home/alvas/stanford-postagger-full-2014-06-16'):
10 |         self.stanford_segdir = stanford_segdir
11 |         self.segmenter_cmd = " ".join(["bash", stanford_segdir+'/segment.sh',
12 |                                        "ctb tmp.txt UTF8 0"])
13 |
14 |         self.stanford_posdir = stanford_posdir
15 |
16 |         self.tagger_cmd = " ".join(['java', '-cp',
17 |                                     stanford_posdir+'/stanford-postagger.jar',
18 |                                     'edu.stanford.nlp.tagger.maxent.MaxentTagger',
19 |                                     '-model', stanford_posdir+'/models/chinese-nodistsim.tagger',
20 |                                     '-textFile tmp.txt'])
21 |
22 |
23 |     def tokenize(self, text, batch=False):
24 |         # Write the text to a temp file, one sentence per line in batch mode.
25 |         if batch:
26 |             os.popen("".join(['echo -e "', '\n'.join(text), '" > tmp.txt']))
27 |         else:
28 |             os.popen("".join(['echo "', text, '" > tmp.txt']))
29 |         # Run the segmenter.
30 |         text, err = subprocess.Popen(self.segmenter_cmd,
31 |                                      shell=True, stdout=subprocess.PIPE,
32 |                                      stderr=subprocess.PIPE).communicate()
33 |         # Read from the subprocess output.
34 |         text = text.decode('utf8').strip()
35 |         if batch:
36 |             return [i.strip().split() for i in text.split('\n')]
37 |         else:
38 |             return text.split()
39 |
40 |
41 |     def pos_tag(self, text, batch=False):
42 |         if batch:
43 |             text = [' '.join(i) if isinstance(i, list) else i for i in text]
44 |             os.popen("".join(['echo -e "', '\n'.join(text), '" > tmp.txt']))
45 |         else:
46 |             if isinstance(text, list):
47 |                 text = " ".join(text)
48 |             # Write the text to a temp file.
49 |             os.popen("".join(['echo "', text, '" > tmp.txt']))
50 |         # Run the tagger.
51 |         text, err = subprocess.Popen(self.tagger_cmd,
52 |                                      shell=True, stdout=subprocess.PIPE,
53 |                                      stderr=subprocess.PIPE).communicate()
54 |         # Read from the subprocess output; tokens come out as word#TAG.
55 |         text = text.decode('utf8').strip()
56 |         if batch:
57 |             return [[tuple(i.split(r'#')) for i in t.split()]
58 |                     for t in text.split('\n')]
59 |         else:
60 |             return [tuple(i.split(r'#')) for i in text.split()]
61 |
62 |
--------------------------------------------------------------------------------
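A usage sketch for `StanfordNLP`, assuming the Stanford segmenter and POS tagger distributions are unpacked at the configured paths; `batch=True` pushes one sentence per line through a single JVM start-up:

```python
# -*- coding: utf-8 -*-
# Sketch only: assumes local Stanford segmenter/tagger installations.
from ntumc.toolkit.cmn import StanfordNLP

stanford = StanfordNLP()
tokens = stanford.tokenize(u'这是一个测试')   # one segmented sentence
tagged = stanford.pos_tag(tokens)             # [(word, tag), ...]
batched = stanford.pos_tag([u'这是一个测试', u'你好世界'], batch=True)
```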
/ntumc/toolkit/nafer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os, io, subprocess, time
4 | import sys; reload(sys); sys.setdefaultencoding('utf-8')
5 |
6 | from preprocess import tokenize, pos_tag
7 |
8 | def text2naf(text, sentid, thisparaid, wordid):
9 |     """
10 |     Converts a tokenized sentence into NAF <wf> (word form) elements,
11 |     one per token, e.g. for the word "John":
12 |     <wf id="w1" sent="1" para="1" offset="0" length="4">John</wf>
13 |     """
14 |     textnaf = []
15 |     stridx = 0
16 |     for wid, word in enumerate(text.split(), start=1):
17 |         # NAF <wf> element: word id, sentence id, paragraph id,
18 |         # character offset and token length.
19 |         line = '\t<wf id="w' + str(wordid+wid) + '" sent="' + str(sentid) + \
20 |                '" para="' + str(thisparaid) + '" offset="' + str(stridx) + \
21 |                '" length="' + str(len(word)) + '">' + word + '</wf>'
22 |         textnaf.append(line)
23 |         stridx+=len(word)+1
24 |     return "\n".join(textnaf), wordid+wid
25 |
26 | def term2naf(tokens, tags, wordid):
27 |     termnaf = []
28 |     wid = 1
29 |     for token, tag in zip(tokens, tags):
30 |         # NAF <term> element pointing back to its <wf> through a <span>.
31 |         line = '\t<term id="t' + str(wordid+wid) + '" pos="' + tag + '">\n'
32 |         line+= '\t\t<span><target id="w' + str(wordid+wid) + '"/></span>\n'
33 |         line+= '\t' + '</term>'
34 |         termnaf.append(line)
35 |         wid+=1
36 |     return "\n".join(termnaf)
37 |
38 | indir = '/home/alvas/git/NTU-MC/ntumc-v5/subcorpora/yoursing/cleanest/'
39 | langs = os.walk(indir).next()[1]
40 |
41 | for lang in langs:
42 |     if lang == "eng" or lang == "cmn":
43 |         continue
44 |     langdir = indir+lang+'/'
45 |     fout = io.open('ntumc-'+lang+'.naf', 'wb')
46 |     for filename in sorted(os.walk(langdir).next()[2]):
47 |         if filename.endswith('~'):
48 |             continue
49 |         webpage = ''
50 |         title = ''
51 |         wordid = 1
52 |         textlayer = []
53 |         # Open the NAF text layer.
54 |         textlayer.append('<text>')
55 |
56 |         termlayer = []
57 |         # Open the NAF terms layer.
58 |         termlayer.append('<terms>')
59 |
60 |         lines = []
61 |
62 |         for line in io.open(langdir+filename, 'r', encoding='utf8'):
63 |             line = line.strip()
64 |             if line.startswith('#M'):
65 |                 webpage = line.split('\t')[1].replace('