├── ntumc
│   ├── corpus
│   │   └── __init__.py
│   ├── __init__.py
│   ├── toolkit
│   │   ├── __init__.py
│   │   ├── jpn.py
│   │   ├── ind.py
│   │   ├── preprocess.py
│   │   ├── kor.py
│   │   ├── vie.py
│   │   ├── cmn.py
│   │   ├── nafer.py
│   │   ├── minisegmenter.py
│   │   └── gale_church.py
│   └── external
│       └── mini-segmenter
│           └── minisegmenter.py
└── README.md

/ntumc/corpus/__init__.py:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ntumc/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import toolkit
#import corpus
--------------------------------------------------------------------------------
/ntumc/toolkit/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from preprocess import chinese, japanese, korean, vietnamese
--------------------------------------------------------------------------------
/ntumc/toolkit/jpn.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os, io, subprocess
import sys; reload(sys); sys.setdefaultencoding('utf-8')

from operator import itemgetter

class Mecab():
    def __init__(self):
        pass

    def tokenize(self, text, batch=False):
        # Batch mode is not supported here; the keyword is accepted so that
        # preprocess.tokenize() can call every wrapper uniformly.
        cmd = unicode("".join(['echo "', text.decode('utf8'),
                               '" | mecab -O wakati']))
        return os.popen(cmd).read().strip().decode('utf8').split()

    def pos_tag(self, text, batch=False):
        if isinstance(text, list):
            text = " ".join(text).decode('utf8')
        cmd = unicode("".join(['echo "', text,
                               '" | mecab -Ochasen']))
        # Keep the surface form (field 0) and the POS (field 3) of each
        # ChaSen-format row; the final line is MeCab's EOS marker, hence [:-1].
        return [itemgetter(0,3)(unicode(i.strip()).split())
                for i in os.popen(cmd).readlines()[:-1]]
--------------------------------------------------------------------------------
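A minimal usage sketch for the wrapper above (Python 2, assuming MeCab and its default IPADIC dictionary are installed and `mecab` is on the PATH; the outputs shown are illustrative and depend on the installed dictionary):

```python
# -*- coding: utf-8 -*-
# Hypothetical usage of the Mecab wrapper; output depends on the dictionary.
from ntumc.toolkit.jpn import Mecab

mecab = Mecab()
tokens = mecab.tokenize("太郎は花子を見た。")
print " ".join(tokens)              # e.g. 太郎 は 花子 を 見 た 。
for surface, pos in mecab.pos_tag(tokens):
    print surface, pos              # e.g. 太郎 名詞-固有名詞-人名-名
```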
/ntumc/toolkit/ind.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os, io, subprocess
import sys; reload(sys); sys.setdefaultencoding('utf-8')

from nltk import word_tokenize

class Indotag():
    def __init__(self):
        # Path to the CRF++ `crf_test` binary (not a directory, despite the name).
        self.crf_dir = '/home/alvas/git/NTU-MC/ntumc/external/CRF++-0.58/crf_test'

    def tokenize(self, text, batch=False):
        return word_tokenize(text)

    def pos_tag(self, text, batch=False):
        if isinstance(text, list):
            text = " ".join(text).strip()
        # Write the text to a temp file.
        os.popen("".join(['echo "', text, '" > tmp.txt']))
        # Reshape to one token per line with a blank line after each sentence,
        # which is the input format CRF++ expects.
        os.popen("sed '$ !s/$/\\n/;s/ /\\n/g' tmp.txt > tmp.crf.txt")
        # Skip CRF++'s blank sentence-separator lines in the output.
        return [tuple(line.strip().split('\t')) for line in
                os.popen(" ".join([self.crf_dir,
                                   '-m model.id tmp.crf.txt'])).readlines()
                if line.strip()]
--------------------------------------------------------------------------------
/ntumc/toolkit/preprocess.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os, io, subprocess, time
import sys; reload(sys); sys.setdefaultencoding('utf-8')

from nltk.tokenize import word_tokenize
from nltk import pos_tag as nltk_pos_tag

import cmn, kor, jpn, vie, ind

chinese = cmn.StanfordNLP()
korean = kor.Postech()
japanese = jpn.Mecab()
vietnamese = vie.Jvntextpro()
indonesian = ind.Indotag()

lang2lib = {'jpn':japanese, 'cmn':chinese,
            'vie':vietnamese, 'kor':korean,
            'ind':indonesian}

def tokenize(text, lang, batch=False):
    if lang in ['eng', 'ind']:
        # Return a token list, consistent with the other languages.
        return word_tokenize(text)
    elif lang in lang2lib:
        return lang2lib[lang].tokenize(text, batch=batch)
    else:
        return text.split()

def pos_tag(text, lang, batch=False):
    if lang == 'eng':
        return nltk_pos_tag(word_tokenize(text))
    if lang in lang2lib:
        return lang2lib[lang].pos_tag(text, batch=batch)
--------------------------------------------------------------------------------
/ntumc/toolkit/kor.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os, io, subprocess, time
import sys; reload(sys); sys.setdefaultencoding('utf-8')

class Postech():
    def __init__(self, sejong_dir='/home/alvas/git/NTU-MC/ntumc/external/sejong/'):
        self.sejong_dir = sejong_dir

    def utf8_to_euck(self, text):
        # The Sejong tagger only reads EUC-KR, so characters outside that
        # charset are mapped to ASCII look-alikes first.
        text = unicode(text.decode('utf-8'))
        text = text.replace(u'\xa0', u' ')
        text = text.replace(u'\xe7', u'c')    # ç -> c
        text = text.replace(u'\xe9', u'e')    # é -> e
        text = text.replace(u'\u2013', u'-')  # – -> -
        text = text.replace(u'\xa9', u'(c)')  # © -> (c)
        return text.encode('euc-kr').strip()

    def sejong(self, text):
        text = self.utf8_to_euck(text)
        sejong_dir = self.sejong_dir
        with io.open(sejong_dir+'input.txt', 'wb') as fout:
            fout.write(text)

        cmd = "".join(['wine start /Unix ', sejong_dir, 'sjTaggerInteg.exe'])
        os.popen(cmd)
        # `wine start` returns immediately; crude wait for the tagger to finish.
        time.sleep(2)

        with io.open(sejong_dir+'output.txt', 'r', encoding='euc-kr') as fin:
            sejongtext = fin.read().strip().encode('utf8').decode('utf8')

        return sejongtext

    def tokenize(self, text, batch=False):
        sejongtext = self.sejong(text)
        text = " ".join([i.split(r'/')[0] for i in sejongtext.split()])
        return text.decode('utf8').split()

    def pos_tag(self, text, batch=False):
        if isinstance(text, list):
            text = " ".join(text)
        sejongtext = self.sejong(text)
        tagged_text = [tuple(i.split(r'/')) for i in sejongtext.split()]
        return tagged_text

    def batch_pos_tag(self, text):
        pass
--------------------------------------------------------------------------------
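`preprocess.py` is the toolkit's façade: `tokenize()` and `pos_tag()` dispatch on ISO 639-3 language codes to the wrappers in this package. A sketch of the intended call pattern (it only runs if the external tools are installed at the hard-coded paths in cmn.py, kor.py, vie.py and ind.py):

```python
# -*- coding: utf-8 -*-
# Sketch of driving the dispatcher; every non-English language shells out
# to an external tool, so the hard-coded install paths above must exist.
from ntumc.toolkit import preprocess

print preprocess.tokenize("This is a test.", "eng")    # NLTK word_tokenize
print preprocess.tokenize(u"我在新加坡读书", "cmn")    # Stanford segmenter
print preprocess.pos_tag("anjing itu lapar", "ind")    # CRF++ Indonesian tagger
print preprocess.pos_tag(u"太郎は学生です", "jpn")     # MeCab/ChaSen tags
```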
/ntumc/toolkit/vie.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os, io, subprocess
import sys; reload(sys); sys.setdefaultencoding('utf-8')

from nltk.tokenize import word_tokenize

class Jvntextpro():
    def __init__(self, jvn_dir='/home/alvas/JVnTextPro-v.2.0/'):
        self.jvn_dir = jvn_dir

        self.segmenter_cmd = "".join(['java -mx512M -cp ',
                                      jvn_dir, '/bin:',
                                      jvn_dir + '/libs/args4j.jar:',
                                      jvn_dir + '/libs/lbfgs.jar ',
                                      'jvnsegmenter.WordSegmenting ',
                                      '-modeldir ', jvn_dir,
                                      '/models/jvnsegmenter ',
                                      '-inputfile tmp.txt',
                                      ' -outputfile tmp.txt.wseg'])

        self.tagger_cmd = "".join(['java -mx512M -cp ',
                                   jvn_dir, '/bin:',
                                   jvn_dir + '/libs/args4j.jar:',
                                   jvn_dir + '/libs/lbfgs.jar ',
                                   'jvnpostag.POSTagging ',
                                   '-tagger maxent ',
                                   '-modeldir ', jvn_dir,
                                   '/models/jvnpostag/maxent ',
                                   '-inputfile tmp.txt',
                                   ' -outputfile tmp.txt.pos'])

    def tokenize(self, text, batch=False):
        # Write the text to a temp file.
        os.popen("".join(['echo "', text, '" > tmp.txt']))
        # Run the segmenter.
        os.popen(self.segmenter_cmd)
        # Read from the output file.
        text = io.open('tmp.txt.wseg', 'r', encoding='utf8').read().strip()
        return word_tokenize(text)

    def pos_tag(self, text, batch=False):
        # Join pre-tokenized input back into a string; a plain string passes
        # through unchanged (joining a string would iterate its characters).
        if isinstance(text, list):
            text = " ".join(text)
        # Write the text to a temp file.
        os.popen("".join(['echo "', text, '" > tmp.txt']))
        # Run the tagger.
        os.popen(self.tagger_cmd)
        # Read from the output file.
        jvntext = io.open('tmp.txt.pos', 'r', encoding='utf8').read().strip()
        tagged_text = [tuple(i.split(r'/')) for i in jvntext.split()]

        return tagged_text
--------------------------------------------------------------------------------
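Every wrapper in this toolkit hands text to the shell via `echo "..." > tmp.txt`, which silently breaks on double quotes, backticks and `$` in the input. A quoting-safe sketch (the `write_tmp` helper is hypothetical, not part of the toolkit): write the temp file directly, then invoke the external command unchanged:

```python
# -*- coding: utf-8 -*-
# Hypothetical quoting-safe replacement for os.popen('echo "..." > tmp.txt'):
# write tmp.txt directly so shell metacharacters in the text survive intact.
import io
import subprocess

def write_tmp(text, path='tmp.txt'):
    with io.open(path, 'w', encoding='utf8') as fout:
        fout.write(unicode(text) + u'\n')

write_tmp(u'Anh ấy nói: "xin chào!"')  # quotes survive intact
# Then run the external tool as before, e.g.:
# subprocess.call(Jvntextpro().segmenter_cmd, shell=True)
```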
/README.md:
--------------------------------------------------------------------------------
NTU-MC
======

This is a legacy repository for the STB subcorpora of the Nanyang Technological University - Multilingual Corpus (NTU-MC) project. New editions of the NTU-MC are maintained by the [NTU Computational Linguistics Lab](http://compling.hss.ntu.edu.sg/ntumc/).


Spin-offs
========

* [NTU-MC Toolkit](http://www.aclweb.org/anthology/C/C14/C14-2019.pdf): An annotation toolkit for multilingual text (supports Arabic, Chinese, Japanese, Korean, Indonesian, Vietnamese and English)
* [GaChalign](https://github.com/alvations/gachalign): A Python implementation of the Gale-Church sentence-level aligner with variable parameters
* [Mini-segmenter](https://code.google.com/p/mini-segmenter/): A dictionary-based Chinese segmenter
* [Indotag](): An implementation of the Pisceldo et al. (2010) Indonesian part-of-speech tagger, trained on the 1M-word corpus from the Pan Asia Networking Localization Project


Changelog
==========

* NTU-MC v5.1 (26.08.14): Added the NTU-MC Toolkit.
* [NTU-MC v5.0](https://drive.google.com/drive/folders/1ResffV1GXLCK6Dc-0ZxFeBRS9CtnMS71?usp=sharing) (29.04.13): Better cleaning, with titles.
* NTU-MC v4.1 (08.04.13): Scheduled release.
* NTU-MC v4.0 (27.01.13): Re-cleaned and re-tagged from scratch.
* NTU-MC v3.0 (01.05.12): Scheduled release for IJALP.
* NTU-MC v2.0 (20.08.11): Cleaned and sentence-aligned.
* NTU-MC v1.0 (01.05.11): Foundation text.


References
==========

Please cite the following when using the data/scripts from the NTU-MC:

```
@inproceedings{ntumc2011,
  author    = {Liling Tan and Francis Bond},
  title     = {Building and Annotating the Linguistically Diverse NTU-MC
               (NTU-Multilingual Corpus)},
  booktitle = {PACLIC},
  year      = {2011},
  pages     = {362-371},
  ee        = {http://www.aclweb.org/anthology/Y11-1038},
}
```

* Liling Tan. 2011. [Building the foundation text for Nanyang Technological University - Multilingual Corpus (NTU-MC)](http://dr.ntu.edu.sg/bitstream/handle/10220/7790/Liling%20Tan.pdf). Bachelor's Final Year Project. Nanyang Technological University: Singapore.

* Liling Tan and Francis Bond. 2012. [Building and annotating the linguistically diverse NTU-MC (NTU-multilingual corpus)](http://www.colips.org/journal/volume22/22.4.2.NTU-MC%20Tan%20final.pdf). International Journal of Asian Language Processing, 22(4):161–174.

* Liling Tan and Francis Bond. 2014. [NTU-MC Toolkit: Annotating a Linguistically Diverse Corpus](http://www.aclweb.org/anthology/C/C14/C14-2019.pdf). In Proceedings of the 25th International Conference on Computational Linguistics (COLING 2014). Dublin, Ireland.


**Other References**:
--------------------------------------------------------------------------------
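The tree above lists `toolkit/gale_church.py`, whose contents are not included in this dump; the GaChalign spin-off in the README parameterizes the same method. For reference, a minimal sketch of the Gale and Church (1993) length-based match cost (the function names are mine; `c` and `s2` are the paper's defaults, the very "variable parameters" GaChalign exposes):

```python
# -*- coding: utf-8 -*-
# Minimal sketch of the Gale-Church (1993) length-based alignment cost.
# delta measures how far the target length strays from c times the source
# length; the cost is the negative log of the two-sided normal tail.
import math

def norm_cdf(z):
    """Standard normal CDF via the error function."""
    return 0.5 * (1 + math.erf(z / math.sqrt(2)))

def gale_church_cost(len1, len2, c=1.0, s2=6.8):
    """Cost that two segments of len1/len2 characters align; lower is better.
    (Omits the match-type prior term of the full method.)"""
    if len1 == 0 and len2 == 0:
        return 0.0
    delta = (len2 - c * len1) / math.sqrt(max(len1, 1) * s2)
    prob = 2 * (1 - norm_cdf(abs(delta)))
    return -math.log(prob) if prob > 0 else float('inf')

print gale_church_cost(100, 103)   # near-equal lengths -> low cost
print gale_church_cost(100, 250)   # mismatched lengths -> high cost
```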
/ntumc/toolkit/cmn.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os, io, subprocess
import sys; reload(sys); sys.setdefaultencoding('utf-8')

class StanfordNLP():
    def __init__(self,
                 stanford_segdir='/home/alvas/stanford-segmenter-2014-06-16',
                 stanford_posdir='/home/alvas/stanford-postagger-full-2014-06-16'):
        self.stanford_segdir = stanford_segdir
        self.segmenter_cmd = " ".join(["bash", stanford_segdir+'/segment.sh',
                                       "ctb tmp.txt UTF8 0"])

        self.stanford_posdir = stanford_posdir

        self.tagger_cmd = " ".join(['java', '-cp',
                                    stanford_posdir+'/stanford-postagger.jar',
                                    'edu.stanford.nlp.tagger.maxent.MaxentTagger',
                                    '-model', stanford_posdir+'/models/chinese-nodistsim.tagger',
                                    '-textFile tmp.txt'])

    def tokenize(self, text, batch=False):
        # Write the text to a temp file; in batch mode, one sentence per line.
        if batch:
            os.popen("".join(['echo -e "', '\n'.join(text), '" > tmp.txt']))
        else:
            os.popen("".join(['echo "', text, '" > tmp.txt']))
        # Run the segmenter.
        text, err = subprocess.Popen(self.segmenter_cmd,
                                     shell=True, stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE).communicate()
        # Read from the subprocess output.
        text = text.decode('utf8').strip()
        if batch:
            return [i.strip().split() for i in text.split('\n')]
        else:
            return text.split()

    def pos_tag(self, text, batch=False):
        if batch:
            text = [' '.join(i) if isinstance(i, list) else i for i in text]
            os.popen("".join(['echo -e "', '\n'.join(text), '" > tmp.txt']))
        else:
            if isinstance(text, list):
                text = " ".join(text)
            # Write the text to a temp file.
            os.popen("".join(['echo "', text, '" > tmp.txt']))
        # Run the tagger.
        text, err = subprocess.Popen(self.tagger_cmd,
                                     shell=True, stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE).communicate()
        # Read from the subprocess output (already decoded; no second decode).
        text = text.decode('utf8').strip()
        if batch:
            return [[tuple(i.split(r'#')) for i in t.split()]
                    for t in text.split('\n')]
        else:
            return [tuple(i.split(r'#')) for i in text.split()]
--------------------------------------------------------------------------------
/ntumc/toolkit/nafer.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os, io, subprocess, time
import sys; reload(sys); sys.setdefaultencoding('utf-8')

from preprocess import tokenize, pos_tag

def text2naf(text, sentid, thisparaid, wordid):
    """
    Converts a tokenized sentence into the NAF text layer, one <wf>
    element per word, e.g. <wf id="w1" sent="1" para="1">John</wf>.
    (The markup was stripped in this dump; the attribute layout below is
    reconstructed from the variables in scope, assuming NAF conventions.)
    """
    textnaf = []
    stridx = 0
    for wid, word in enumerate(text.split(), start=1):
        line = '\t' + '<wf id="w' + str(wordid+wid) + \
               '" sent="' + str(sentid) + \
               '" para="' + str(thisparaid) + \
               '" offset="' + str(stridx) + \
               '" length="' + str(len(word)) + '">' + word + '</wf>'
        textnaf.append(line)
        stridx += len(word) + 1
    return "\n".join(textnaf), wordid + wid

def term2naf(tokens, tags, wordid):
    # Builds the NAF terms layer: one <term> per token whose <span> points
    # back at the corresponding <wf> in the text layer. (Markup stripped in
    # this dump; reconstructed here under the same NAF assumptions as above.)
    termnaf = []
    wid = 1
    for token, tag in zip(tokens, tags):
        line = '\t<term id="t' + str(wordid+wid) + '" pos="' + tag + '">\n'
        line += '\t\t<span><target id="w' + str(wordid+wid) + '"/></span>\n'
        line += '\t' + '</term>'
        termnaf.append(line)
        wid += 1
    return "\n".join(termnaf)

indir = '/home/alvas/git/NTU-MC/ntumc-v5/subcorpora/yoursing/cleanest/'
langs = os.walk(indir).next()[1]

for lang in langs:
    if lang == "eng" or lang == "cmn":
        continue
    langdir = indir + lang + '/'
    fout = io.open('ntumc-' + lang + '.naf', 'wb')
    for filename in sorted(os.walk(langdir).next()[2]):
        if filename.endswith('~'):
            continue
        webpage = ''
        title = ''
        wordid = 1
        textlayer = []
        textlayer.append('')
        textlayer.append('')

        termlayer = []
        termlayer.append('')
        termlayer.append('')

        lines = []

        for line in io.open(langdir + filename, 'r', encoding='utf8'):
            line = line.strip()
            if line.startswith('#M'):
                webpage = line.split('\t')[1].replace('