├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── Vietnamese-Word-Segmentation-Python.iml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── MANIFEST.in
├── README.md
├── __init__.py
├── pyproject.toml
├── setup.py
├── src
│   ├── DataPreprocessor.py
│   ├── FWObject.py
│   ├── Model.RDR
│   ├── Node.py
│   ├── RDRSegmenter.py
│   ├── Tokenizer.py
│   ├── Utils.py
│   ├── VnVocab.txt
│   ├── Vocabulary.py
│   ├── WordTag.py
│   └── __init__.py
├── test.py
├── train
│   ├── RDRsegmenter.py
│   ├── Readme.md
│   ├── SCRDRlearner
│   │   ├── Node.py
│   │   ├── Node.pyc
│   │   ├── Object.py
│   │   ├── Object.pyc
│   │   ├── SCRDRTree.py
│   │   ├── SCRDRTree.pyc
│   │   ├── SCRDRTreeLearner.py
│   │   ├── SCRDRTreeLearner.pyc
│   │   ├── __init__.py
│   │   └── __init__.pyc
│   ├── Train_gold.txt
│   ├── Train_gold.txt.BI
│   ├── Train_gold.txt.RAW.Init
│   ├── Train_gold.txt.RAW.Init.RDR
│   └── Utility
│       ├── Config.py
│       ├── Config.pyc
│       ├── Eval.py
│       ├── LexiconCreator.py
│       ├── Utils.py
│       ├── __init__.py
│       └── __init__.pyc
├── vws.egg-info
│   ├── PKG-INFO
│   ├── SOURCES.txt
│   ├── dependency_links.txt
│   └── top_level.txt
└── vws
    ├── DataPreprocessor.py
    ├── FWObject.py
    ├── Model.RDR
    ├── Node.py
    ├── RDRSegmenter.py
    ├── Tokenizer.py
    ├── Utils.py
    ├── VnVocab.txt
    ├── Vocabulary.py
    ├── WordTag.py
    ├── __init__.py
    ├── __pycache__
    │   ├── FWObject.cpython-38.pyc
    │   ├── Node.cpython-38.pyc
    │   ├── RDRSegmenter.cpython-38.pyc
    │   ├── Tokenizer.cpython-38.pyc
    │   ├── Utils.cpython-38.pyc
    │   ├── Vocabulary.cpython-38.pyc
    │   ├── WordTag.cpython-38.pyc
    │   └── __init__.cpython-38.pyc
    └── vws.egg-info
        ├── PKG-INFO
        ├── SOURCES.txt
        ├── dependency_links.txt
        └── top_level.txt
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | venv/
2 | build/
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
--------------------------------------------------------------------------------
/.idea/Vietnamese-Word-Segmentation-Python.iml:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 15 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 | 2 | 3 | 6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 6 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include Model.RDR
3 | include VnVocab.txt
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # word_segmenter
2 | ## Note:
3 | ### This codebase is a rewrite of RDRsegmenter (https://github.com/datquocnguyen/RDRsegmenter) in Python, intended to make Vietnamese NLP tools more convenient to use and to customize.
4 | The implementation of RDRsegmenter, as described in [our paper](http://www.lrec-conf.org/proceedings/lrec2018/summaries/55.html):
5 |
6 |     @InProceedings{NguyenNVDJ2018,
7 |     author={Dat Quoc Nguyen and Dai Quoc Nguyen and Thanh Vu and Mark Dras and Mark Johnson},
8 |     title={{A Fast and Accurate Vietnamese Word Segmenter}},
9 |     booktitle={Proceedings of the 11th International Conference on Language Resources and Evaluation (LREC 2018)},
10 |     pages={2582--2587},
11 |     year={2018}
12 |     }
13 |
14 | **Please CITE** our paper whenever RDRsegmenter is used to produce published results or incorporated into other software.
15 |
16 | Translator: Vinh Pham
17 |
18 | ## Usage
19 | **Requires Python 3.** Install with either of:
20 | - python setup.py install
21 | - python -m pip install .
22 |
23 | ## Example
24 | ```
25 | >>> from vws import RDRSegmenter, Tokenizer
26 | >>> rdrsegment = RDRSegmenter.RDRSegmenter()
27 | >>> tokenizer = Tokenizer.Tokenizer()
28 | >>> output = rdrsegment.segmentRawSentences(tokenizer,"Lượng khách Thái bắt đầu gia tăng từ đầu năm 2005. Bên cạnh đó, kể từ tháng 10-2005 đến nay, từ khi được phép của VN, các đoàn caravan của Thái Lan cũng đã ồ ạt đổ vào VN.")
29 | >>> print(output)
30 | ```
31 | Output:
32 | ```
33 | >>> Lượng khách Thái bắt_đầu gia_tăng từ đầu năm 2005. Bên cạnh đó, kể từ tháng 10-2005 đến nay, từ khi được phép của VN, các đoàn caravan của Thái_Lan cũng đã ồ_ạt đổ vào VN.
34 | ```
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | from vws.RDRSegmenter import RDRSegmenter
2 | from vws.Tokenizer import Tokenizer
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 |     "setuptools>=42",
4 |     "wheel"
5 | ]
6 | build-backend = "setuptools.build_meta"
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | with open("README.md", "r", encoding="utf-8") as fh:
4 |     long_description = fh.read()
5 |
6 | setuptools.setup(
7 |     name="vws",
8 |     version="0.0.1",
9 |     author="vinhpx",
10 |     author_email="phamxuanvinh023@gmail.com",
11 |     description="A Python implementation of the RDRsegmenter Vietnamese word segmenter",
12 |     long_description=long_description,
13 |     long_description_content_type="text/markdown",
14 |     url="https://github.com/Sudo-VP/Vietnamese-Word-Segmentation-Python",
15 |     project_urls={
16 |         "Bug Tracker": "https://github.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/issues",
17 |     },
18 |     classifiers=[
19 |         "Programming Language :: Python :: 3",
20 |         "License :: OSI Approved :: MIT License",
21 |         "Operating System :: OS Independent",
22 |     ],
23 |     package_dir={"": "."},
24 |     packages=setuptools.find_packages(where="."),
25 |     package_data={'': ['Model.RDR','VnVocab.txt']},
26 |     include_package_data=True,
27 |     python_requires=">=3.6",
28 | )
--------------------------------------------------------------------------------
/src/DataPreprocessor.py:
--------------------------------------------------------------------------------
1 | from RDRSegmenter import RDRSegmenter
2 | from Utils import Utils
3 | import sys
4 |
5 |
6 | class DataPreprocessor:
7 |     def __init__(self):
8 |         self.initialSegmenter = RDRSegmenter()
9 |         self.utils = Utils()
10 |
11 |     def getStringInitialSegmentation(self, strs: str):
12 |         sb = []
13 |         line = strs.strip()
14 |         if len(line) == 0:
15 |             return "\n"
16 |
17 |         wordtags = self.initialSegmenter.getInitialSegmentation(line)
18 |
19 |         size = len(wordtags)
20 |         for i in range(0, size):
21 |             if wordtags[i].tag == "B":
22 |                 sb.append(wordtags[i].form + "/B ")
23 |             else:
24 |                 sb.append(wordtags[i].form + "/I ")
25 |         return ''.join(sb).strip()
26 |
27 |     def getCorpusInitialSegmentation(self, inFilePath: str):
28 |         with open(inFilePath, 'r', encoding="utf8") as buffer:
29 |             with open(inFilePath + ".RAW.Init", 'a', encoding='utf8') as bwInit:
30 |                 with open(inFilePath + ".BI", 'a', encoding='utf8') as bw:
31 |                     for line in buffer:
32 |                         if line and line != '\n':
33 |                             lineStr = line
34 |                             for regex in self.utils.NORMALIZER_KEYS:
35 |                                 if regex in lineStr:
36 |                                     lineStr = lineStr.replace(regex, self.utils.NORMALIZER[regex])
37 |
38 |                             sb = []
39 |
40 |                             words = lineStr.split()
41 |                             for word in words:
42 |                                 syllabels = word.split("_")
43 |                                 bw.write(syllabels[0] + "/B ")
44 |                                 sb.append(syllabels[0] + " ")
45 |                                 for i in range(1, len(syllabels)):
46 |                                     bw.write(syllabels[i] + "/I ")
47 |                                     sb.append(syllabels[i] + " ")
48 |                             bw.write("\n")
49 |
50 |                             bwInit.write(self.getStringInitialSegmentation(''.join(sb)) + "\n")  # one initially segmented sentence per line
51 |
52 |
53 | if __name__ == '__main__':
54 |     segmenter = DataPreprocessor()
55 |     segmenter.getCorpusInitialSegmentation(sys.argv[1])  # the corpus path is the first command-line argument
--------------------------------------------------------------------------------
/src/FWObject.py:
--------------------------------------------------------------------------------
1 | class FWObject:
2 |     def __init__(self, check: bool):
3 |         self._context = [None] * 10
4 |         self.check = check
5 |         if self.check:
6 |             for i in range(0, 10, 2):  # blank out all ten word/tag context slots
7 |                 self._context[i] = ""
8 |                 self._context[i + 1] = ""
9 |     @property
10 |     def context(self):
11 |         return self._context
12 |     # setting the values
13 |     @context.setter
14 |     def context(self, value):
15 |         self._context = value
--------------------------------------------------------------------------------
/src/Node.py:
--------------------------------------------------------------------------------
1 | from FWObject import FWObject
2 |
3 | class Node:
4 |     def __init__(self, inCondition: FWObject, inConclusion: str, inFatherNode, inExceptNode,
5 |                  inIfnotNode, inDepth: int):
6 |         self.condition = inCondition
7 |         self.conclusion = inConclusion
8 |         self.fatherNode = inFatherNode
9 |         self.exceptNode = inExceptNode
10 |         self.ifnotNode = inIfnotNode
11 |         self.depth = inDepth
12 |     def setIfnotNode(self, node):
13 |         self.ifnotNode = node
14 |
15 |     def setExceptNode(self, node):
16 |         self.exceptNode = node
17 |
18 |     def setFatherNode(self, node):
19 |         self.fatherNode = node
20 |     def countNodes(self) -> int:
21 |         count = 1
22 |         if self.exceptNode is not None:
23 |             count += self.exceptNode.countNodes()
24 |         if self.ifnotNode is not None:
25 |             count += self.ifnotNode.countNodes()
26 |         return count
27 |     def satisfy(self, object: FWObject):
28 |         check = True
29 |         for i in range(0, 10):
30 |             key = self.condition.context[i]
31 |             if key is not None:
32 |                 if key != object.context[i]:
33 |                     check = False
34 |                     break
35 |         return check
--------------------------------------------------------------------------------
/src/RDRSegmenter.py:
--------------------------------------------------------------------------------
1 | from Node import Node
2 | from Utils import Utils
3 | from
FWObject import FWObject 4 | from WordTag import WordTag 5 | from Vocabulary import Vocabulary 6 | from Tokenizer import Tokenizer 7 | import time 8 | 9 | utils = Utils() 10 | 11 | class RDRSegmenter: 12 | def __init__(self): 13 | self._root = None 14 | try: 15 | self.constructTreeFromRulesFile("Model.RDR") 16 | except IOError as e: 17 | raise e 18 | @property 19 | def root(self): 20 | return self._root 21 | @root.setter 22 | def root(self,value:Node): 23 | self._root = value 24 | def constructTreeFromRulesFile(self, rulesFilePath:str): 25 | 26 | self.root = Node(FWObject(False), "NN", None, None, None, 0) 27 | 28 | currentNode = self.root 29 | currentDepth = 0 30 | with open(rulesFilePath,'r',encoding='utf8') as rulesFile: 31 | for indexFileRule,line in enumerate(rulesFile): 32 | depth = 0 33 | for i in range(0,6): 34 | if line[i] == '\t': 35 | depth += 1 36 | else: 37 | break 38 | if indexFileRule==0: 39 | continue 40 | line = line.strip() 41 | if len(line) == 0: 42 | continue 43 | 44 | if "cc:" in line: 45 | continue 46 | # print(line.split(" : ")[0].strip()) 47 | condition = utils.getCondition(line.split(" : ")[0].strip()) 48 | conclusion = utils.getConcreteValue(line.split(" : ")[1].strip()) 49 | 50 | node = Node(condition, conclusion, None, None, None, depth) 51 | 52 | if depth > currentDepth: 53 | currentNode.setExceptNode(node) 54 | else: 55 | if depth == currentDepth: 56 | currentNode.setIfnotNode(node) 57 | else: 58 | while currentNode.depth != depth: 59 | currentNode = currentNode.fatherNode 60 | currentNode.setIfnotNode(node) 61 | node.setFatherNode(currentNode) 62 | 63 | currentNode = node 64 | currentDepth = depth 65 | 66 | def findFiredNode(self,object:FWObject)->Node: 67 | currentN = self._root 68 | firedN = None 69 | while True: 70 | if currentN.satisfy(object): 71 | firedN = currentN 72 | if currentN.exceptNode == None : 73 | break 74 | else : 75 | currentN = currentN.exceptNode 76 | else: 77 | if currentN.ifnotNode == None: 78 | break 79 | else : 80 | currentN = currentN.ifnotNode 81 | return firedN 82 | def allIsLetter(self,strs:str)->bool: 83 | 84 | for char in strs: 85 | if char.isalpha() ==False: 86 | return False 87 | return True 88 | def allIsUpper(self,strs:str)->bool: 89 | 90 | for char in strs: 91 | if char.isupper() ==False: 92 | return False 93 | return True 94 | def getInitialSegmentation(self,sentence:str)->list: 95 | wordtags = [] 96 | vocab = Vocabulary() 97 | for regex in utils.NORMALIZER_KEYS: 98 | if regex in sentence: 99 | sentence = sentence.replace(regex, utils.NORMALIZER[regex]) 100 | tokens = sentence.split() 101 | lowerTokens = sentence.lower().split() 102 | senLength = len(tokens) 103 | i = 0 104 | while i < senLength : 105 | token = tokens[i] 106 | if self.allIsLetter(token) : 107 | if token[0].islower() and (i + 1) < senLength: 108 | if tokens[i + 1][0].isupper(): 109 | wordtags.append(WordTag(token, "B")) 110 | i+=1 111 | continue 112 | isSingleSyllabel = True 113 | for j in range(min(i + 4, senLength), i + 1,-1): 114 | word = " ".join(lowerTokens[i: j]) 115 | if word in vocab.VN_DICT or word in vocab.VN_LOCATIONS or word in vocab.COUNTRY_L_NAME: 116 | wordtags.append(WordTag(token, "B")) 117 | for k in range(i+1,j): 118 | wordtags.append(WordTag(tokens[k], "I")) 119 | 120 | i = j - 1 121 | isSingleSyllabel = False 122 | break 123 | 124 | if isSingleSyllabel : 125 | lowercasedToken = lowerTokens[i] 126 | 127 | if lowercasedToken in vocab.VN_FIRST_SENT_WORDS \ 128 | or token[0].islower() \ 129 | or self.allIsUpper(token) \ 130 | or 
lowercasedToken in vocab.COUNTRY_S_NAME \
131 |                         or lowercasedToken in vocab.WORLD_COMPANY:
132 |
133 |                     wordtags.append(WordTag(token, "B"))
134 |                     i += 1
135 |                     continue
136 |                 ilower = i + 1
137 |                 while ilower < min(i + 4, senLength):  # scan ahead so ilower ends one past the last name-like token
138 |                     ntoken = tokens[ilower]
139 |                     if ntoken.islower() or not self.allIsLetter(ntoken) \
140 |                             or ntoken == "LBKT" or ntoken == "RBKT":
141 |                         break
142 |                     ilower += 1
143 |
144 |                 if ilower > i + 1:
145 |                     isNotMiddleName = True
146 |                     if lowercasedToken in vocab.VN_MIDDLE_NAMES and i >= 1:
147 |                         prevT = tokens[i - 1]
148 |                         if prevT[0].isupper():
149 |                             if prevT.lower() in vocab.VN_FAMILY_NAMES:
150 |                                 wordtags.append(WordTag(token, "I"))
151 |                                 isNotMiddleName = False
152 |                     if isNotMiddleName:
153 |                         wordtags.append(WordTag(token, "B"))
154 |                     for k in range(i + 1, ilower):
155 |                         wordtags.append(WordTag(tokens[k], "I"))
156 |
157 |                     i = ilower - 1
158 |                 else:
159 |                     wordtags.append(WordTag(token, "B"))
160 |             else:
161 |                 wordtags.append(WordTag(token, "B"))
162 |             i += 1
163 |         return wordtags
164 |
165 |     def segmentTokenizedString(self, strs: str) -> str:
166 |         sb = ""
167 |         line = ' '.join(strs).strip() if isinstance(strs, list) else strs.strip()  # joinSentences() returns a list of sentences
168 |         if len(line) == 0:
169 |             return "\n"
170 |
171 |         wordtags = self.getInitialSegmentation(line)
172 |         size = len(wordtags)
173 |         for i in range(0, size):
174 |             object = utils.getObject(wordtags, size, i)
175 |             firedNode = self.findFiredNode(object)
176 |             if firedNode.depth > 0:
177 |                 if firedNode.conclusion == "B":
178 |                     sb = sb + " " + wordtags[i].form
179 |                 else:
180 |                     sb = sb + "_" + wordtags[i].form
181 |             else:
182 |                 if wordtags[i].tag == "B":
183 |                     sb = sb + " " + wordtags[i].form
184 |                 else:
185 |                     sb = sb + "_" + wordtags[i].form
186 |         return sb.strip()
187 |
188 |     # def segmentRawString(self,strs:str)->str:
189 |     #     return self.segmentTokenizedString(" ".join(Tokenizer.tokenize(strs)))
190 |     def segmentRawSentences(self, tokenizer: Tokenizer, strs: str):
191 |         sentence = tokenizer.joinSentences(tokenizer.tokenize(strs))
192 |         return self.segmentTokenizedString(sentence)
193 |
194 |
195 | # if __name__ == "__main__":
196 | #     rdrsegment = RDRSegmenter()
197 | #     tokenizer = Tokenizer()
198 | #     t=time.time()
199 | #     output = rdrsegment.segmentRawSentences(tokenizer,"Lượng khách Thái bắt đầu gia tăng từ đầu năm 2005. Bên cạnh đó, kể từ tháng 10-2005 đến nay, từ khi được phép của VN, các đoàn caravan của Thái Lan cũng đã ồ ạt đổ vào VN.")
200 | #     print(output,time.time()-t)
201 |
--------------------------------------------------------------------------------
/src/Tokenizer.py:
--------------------------------------------------------------------------------
1 | import string
2 | import re
3 | # from enum import Enum
4 |
5 | class Tokenizer:
6 |     def __init__(self):
7 |         self.name = 'Tokenizer'
8 |     def hasPunctuation(self, strs: str):
9 |         for char in strs:
10 |             # print(char)
11 |             if not char.isalpha():
12 |                 # print(char)
13 |                 return True
14 |         return False
15 |
16 |     def tokenize(self, s):
17 |         if s is None or s.strip() == "":
18 |             return []
19 |
20 |         tempTokens = s.strip().split()
21 |         # print(tempTokens)
22 |         if len(tempTokens) == 0:
23 |             return []
24 |
25 |         tokens = []
26 |         for token in tempTokens:
27 |             # print(len(token))
28 |
29 |             if len(token) == 1 or self.hasPunctuation(token):
30 |                 tokens.append(token)
31 |                 continue
32 |
33 |             if token.endswith(","):
34 |
35 |                 for t in self.tokenize(token[0:len(token) - 1]):
36 |                     tokens.append(t)
37 |                 tokens.append(",")
38 |                 continue
39 |             if token in StringUtils().VN_abbreviation:
40 |                 tokens.append(token)
41 |                 continue
42 |
43 |             if token.endswith(".") and token[len(token) - 2].isalpha():
44 |                 if (len(token) == 2 and token[len(token) - 2].isupper()) or re.search(Regex().SHORT_NAME, token):
45 |                     tokens.append(token)
46 |                     continue
47 |                 for t in self.tokenize(token[0:len(token) - 1]):
48 |                     tokens.append(t)
49 |                 tokens.append(".")
50 |                 continue
51 |
52 |             if token in StringUtils().VN_exception:
53 |                 tokens.append(token)
54 |                 continue
55 |
56 |             tokenContainsAbb = False
57 |             for e in StringUtils().VN_abbreviation:
58 |                 try:
59 |                     i = token.index(e)
60 |                 except ValueError:
61 |                     continue
62 |
63 |                 tokenContainsAbb = True
64 |                 tokens = self.recursive(tokens, token, i, i + len(e))
65 |                 break
66 |             if tokenContainsAbb:
67 |                 continue
68 |
69 |             tokenContainsExp = False
70 |             for e in StringUtils().VN_exception:
71 |                 try:
72 |                     i = token.index(e)
73 |                 except ValueError:
74 |                     continue
75 |
76 |                 tokenContainsExp = True
77 |                 tokens = self.recursive(tokens, token, i, i + len(e))
78 |                 break
79 |             if tokenContainsExp:
80 |                 continue
81 |
82 |             regexObj = Regex()
83 |             regexes = regexObj.getRegexList()
84 |             matching = False
85 |             for regex in regexes:
86 |                 # print(regex,token)
87 |                 if re.search(regex, token):
88 |                     tokens.append(token)
89 |                     matching = True
90 |                     break
91 |             if matching:
92 |                 continue
93 |
94 |             for i in range(0, len(regexes)):
95 |                 pattern = re.compile(regexes[i])
96 |                 matcher = pattern.search(token)
97 |                 if matcher:
98 |                     if i == regexObj.getRegexIndex("url"):
99 |                         elements = token.split(".")
100 |                         hasURL = True
101 |                         for ele in elements:
102 |                             if len(ele) == 1 and ele[0].isupper():
103 |                                 hasURL = False
104 |                                 break
105 |                             for j in range(0, len(ele)):
106 |                                 if ord(ele[j]) >= 128:
107 |                                     hasURL = False
108 |                                     break
109 |                         if hasURL:
110 |                             tokens = self.recursive(tokens, token, matcher.start(), matcher.end())
111 |                         else:
112 |                             continue
113 |
114 |                     else:
115 |                         if i == regexObj.getRegexIndex("month"):
116 |                             start = matcher.start()
117 |
118 |                             hasLetter = False
119 |
120 |                             for j in range(0, start):
121 |                                 if token[j].isalpha():
122 |                                     tokens = self.recursive(tokens, token, matcher.start(), matcher.end())
123 |                                     hasLetter = True
124 |                                     break
125 |
126 |
127 |                             if not hasLetter:
128 |                                 tokens.append(token)
129 |
130 |                         else:
131 |                             tokens = self.recursive(tokens, token, matcher.start(), matcher.end())
132 |
133 |                     matching = True
134 |                     break
135 |
136 |             if matching:
137 |                 continue
138 |             else:
139 |                 tokens.append(token)
140 |
141 |         return tokens
142 |
143 |     def recursive(self, tokens, token, beginMatch, endMatch):
144 |         if beginMatch > 0:
145 |             for t in self.tokenize(token[0:beginMatch]):
146 |                 tokens.append(t)
147 |         for t in self.tokenize(token[beginMatch:endMatch]):
148 |             tokens.append(t)
149 |
150 |         if endMatch < len(token):
151 |             for t in self.tokenize(token[endMatch:]):
152 |                 tokens.append(t)
153 |
154 |         return tokens
155 |
156 |     def joinSentences(self, tokens):
157 |         sentences = []
158 |         sentence = []
159 |         for i in range(0, len(tokens)):
160 |             token = tokens[i]
161 |             nextToken = None
162 |             if i != len(tokens) - 1:
163 |                 nextToken = tokens[i + 1]
164 |             beforeToken = None
165 |             if i > 0:
166 |                 beforeToken = tokens[i - 1]
167 |
168 |             # print(token)
169 |             sentence.append(token)
170 |
171 |             if i == len(tokens) - 1:
172 |                 sentences.append(self.joinSentence(sentence))
173 |                 return sentences
174 |
175 |             if i < len(tokens) - 2 and token == StringConst().COLON:
176 |                 if nextToken.isnumeric() and (tokens[i + 2] == StringConst().STOP
177 |                                               or tokens[i + 2] == StringConst().COMMA):
178 |                     sentences.append(self.joinSentence(sentence))
179 |                     sentence = []
180 |                     continue
181 |
182 |
183 |             if re.match(Regex().EOS_PUNCTUATION, token):
184 |
185 |                 if nextToken == "\"" or nextToken == "''":
186 |                     count = 0
187 |                     for senToken in sentence:
188 |                         if senToken == "\"" or senToken == "''":
189 |                             count += 1
190 |                     if count % 2 == 1:
191 |                         continue
192 |
193 |                 if StringUtils().isBrace(nextToken) or nextToken == "" or nextToken[0].islower() \
194 |                         or nextToken == StringConst().COMMA or nextToken[0].isnumeric():
195 |                     continue
196 |
197 |                 if len(sentence) == 2 and token == StringConst().STOP:
198 |                     if beforeToken[0].isnumeric():
199 |                         continue
200 |                     if beforeToken[0].islower():
201 |                         continue
202 |                     if beforeToken[0].isupper():
203 |                         if len(beforeToken) == 1:
204 |                             continue
205 |
206 |
207 |                 sentences.append(self.joinSentence(sentence))
208 |                 sentence = []
209 |         return sentences
210 |
211 |     def joinSentence(self, tokens):
212 |         sent = []
213 |         stringConst = StringConst()
214 |         length = len(tokens)
215 |         token = ""
216 |         for i in range(0, length):
217 |             token = tokens[i]
218 |             if token == "" or token is None or token == stringConst.SPACE:
219 |                 continue
220 |             sent.append(token)
221 |             if i < length - 1:
222 |                 sent.append(stringConst.SPACE)
223 |         return ''.join(sent).strip()
224 |
225 | class StringConst:
226 |     @property
227 |     def BOS(self):
228 |         return ""
229 |     @property
230 |     def EOS(self):
231 |         return ""
232 |     @property
233 |     def SPACE(self):
234 |         return " "
235 |     @property
236 |     def COMMA(self):
237 |         return ","
238 |     @property
239 |     def STOP(self):
240 |         return "."
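# A minimal usage sketch for the Tokenizer class above (editorial addition; it
# assumes this module is importable as Tokenizer.py): tokenize() splits raw text
# into tokens and joinSentences() regroups the tokens into sentence strings.
#
#     from Tokenizer import Tokenizer
#     tk = Tokenizer()
#     tokens = tk.tokenize("Ông Nam đến Hà Nội ngày 20/10/2005. Ông ở lại hai ngày.")
#     for sent in tk.joinSentences(tokens):
#         print(sent)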
241 | @property 242 | def COLON(self): 243 | return ":" 244 | @property 245 | def UNDERSCORE(self): 246 | return "_" 247 | 248 | class StringUtils: 249 | def __init__(self): 250 | self._VN_abbreviation={"M.City"} 251 | self._VN_abbreviation.add("V.I.P") 252 | self._VN_abbreviation.add("PGS.Ts") 253 | self._VN_abbreviation.add("MRS.") 254 | self._VN_abbreviation.add("Mrs.") 255 | self._VN_abbreviation.add("Man.United") 256 | self._VN_abbreviation.add("Mr.") 257 | self._VN_abbreviation.add("SHB.ĐN") 258 | self._VN_abbreviation.add("Gs.Bs") 259 | self._VN_abbreviation.add("U.S.A") 260 | self._VN_abbreviation.add("TMN.CSG") 261 | self._VN_abbreviation.add("Kts.Ts") 262 | self._VN_abbreviation.add("R.Madrid") 263 | self._VN_abbreviation.add("Tp.") 264 | self._VN_abbreviation.add("T.Ư") 265 | self._VN_abbreviation.add("D.C") 266 | self._VN_abbreviation.add("Gs.Tskh") 267 | self._VN_abbreviation.add("PGS.KTS") 268 | self._VN_abbreviation.add("GS.BS") 269 | self._VN_abbreviation.add("KTS.TS") 270 | self._VN_abbreviation.add("PGS-TS") 271 | self._VN_abbreviation.add("Co.") 272 | self._VN_abbreviation.add("S.H.E") 273 | self._VN_abbreviation.add("Ths.Bs") 274 | self._VN_abbreviation.add("T&T.HN") 275 | self._VN_abbreviation.add("MR.") 276 | self._VN_abbreviation.add("Ms.") 277 | self._VN_abbreviation.add("T.T.P") 278 | self._VN_abbreviation.add("TT.") 279 | self._VN_abbreviation.add("TP.") 280 | self._VN_abbreviation.add("ĐH.QGHN") 281 | self._VN_abbreviation.add("Gs.Kts") 282 | self._VN_abbreviation.add("Man.Utd") 283 | self._VN_abbreviation.add("GD-ĐT") 284 | self._VN_abbreviation.add("T.W") 285 | self._VN_abbreviation.add("Corp.") 286 | self._VN_abbreviation.add("ĐT.LA") 287 | self._VN_abbreviation.add("Dr.") 288 | self._VN_abbreviation.add("T&T") 289 | self._VN_abbreviation.add("HN.ACB") 290 | self._VN_abbreviation.add("GS.KTS") 291 | self._VN_abbreviation.add("MS.") 292 | self._VN_abbreviation.add("Prof.") 293 | self._VN_abbreviation.add("GS.TS") 294 | self._VN_abbreviation.add("PGs.Ts") 295 | self._VN_abbreviation.add("PGS.BS") 296 | self._VN_abbreviation.add("BT.") 297 | self._VN_abbreviation.add("Ltd.") 298 | self._VN_abbreviation.add("ThS.BS") 299 | self._VN_abbreviation.add("Gs.Ts") 300 | self._VN_abbreviation.add("SL.NA") 301 | self._VN_abbreviation.add("Th.S") 302 | self._VN_abbreviation.add("Gs.Vs") 303 | self._VN_abbreviation.add("PGs.Bs") 304 | self._VN_abbreviation.add("T.O.P") 305 | self._VN_abbreviation.add("PGS.TS") 306 | self._VN_abbreviation.add("HN.T&T") 307 | self._VN_abbreviation.add("SG.XT") 308 | self._VN_abbreviation.add("O.T.C") 309 | self._VN_abbreviation.add("TS.BS") 310 | self._VN_abbreviation.add("Yahoo!") 311 | self._VN_abbreviation.add("Man.City") 312 | self._VN_abbreviation.add("MISS.") 313 | self._VN_abbreviation.add("HA.GL") 314 | self._VN_abbreviation.add("GS.Ts") 315 | self._VN_abbreviation.add("TBT.") 316 | self._VN_abbreviation.add("GS.VS") 317 | self._VN_abbreviation.add("GS.TSKH") 318 | self._VN_abbreviation.add("Ts.Bs") 319 | self._VN_abbreviation.add("M.U") 320 | self._VN_abbreviation.add("Gs.TSKH") 321 | self._VN_abbreviation.add("U.S") 322 | self._VN_abbreviation.add("Miss.") 323 | self._VN_abbreviation.add("GD.ĐT") 324 | self._VN_abbreviation.add("PGs.Kts") 325 | self._VN_abbreviation.add("St.") 326 | self._VN_abbreviation.add("Ng.") 327 | self._VN_abbreviation.add("Inc.") 328 | self._VN_abbreviation.add("Th.") 329 | self._VN_abbreviation.add("N.O.V.A") 330 | 331 | self._VN_exception={"Wi-fi"} 332 | self._VN_exception.add("17+") 333 | 
self._VN_exception.add("km/h") 334 | self._VN_exception.add("M7") 335 | self._VN_exception.add("M8") 336 | self._VN_exception.add("21+") 337 | self._VN_exception.add("G3") 338 | self._VN_exception.add("M9") 339 | self._VN_exception.add("G4") 340 | self._VN_exception.add("km3") 341 | self._VN_exception.add("m/s") 342 | self._VN_exception.add("km2") 343 | self._VN_exception.add("5g") 344 | self._VN_exception.add("4G") 345 | self._VN_exception.add("8K") 346 | self._VN_exception.add("3g") 347 | self._VN_exception.add("E9") 348 | self._VN_exception.add("U21") 349 | self._VN_exception.add("4K") 350 | self._VN_exception.add("U23") 351 | self._VN_exception.add("Z1") 352 | self._VN_exception.add("Z2") 353 | self._VN_exception.add("Z3") 354 | self._VN_exception.add("Z4") 355 | self._VN_exception.add("Z5") 356 | self._VN_exception.add("Jong-un") 357 | self._VN_exception.add("u19") 358 | self._VN_exception.add("5s") 359 | self._VN_exception.add("wi-fi") 360 | self._VN_exception.add("18+") 361 | self._VN_exception.add("Wi-Fi") 362 | self._VN_exception.add("m2") 363 | self._VN_exception.add("16+") 364 | self._VN_exception.add("m3") 365 | self._VN_exception.add("V-League") 366 | self._VN_exception.add("Geun-hye") 367 | self._VN_exception.add("5G") 368 | self._VN_exception.add("4g") 369 | self._VN_exception.add("Z3+") 370 | self._VN_exception.add("3G") 371 | self._VN_exception.add("km/s") 372 | self._VN_exception.add("6+") 373 | self._VN_exception.add("u21") 374 | self._VN_exception.add("WI-FI") 375 | self._VN_exception.add("u23") 376 | self._VN_exception.add("U19") 377 | self._VN_exception.add("6s") 378 | self._VN_exception.add("4s") 379 | 380 | def isBrace(self,string): 381 | if string=="”" or string=="�" or string=="'" or string==")" \ 382 | or string=="}" or string=="]": 383 | return True 384 | return False 385 | @property 386 | def VN_abbreviation(self): 387 | return self._VN_abbreviation 388 | 389 | @property 390 | def VN_exception(self): 391 | return self._VN_exception 392 | class Regex: 393 | def __init__(self): 394 | self._regexes = None 395 | self._regexIndex=None 396 | @property 397 | def ELLIPSIS(self): 398 | return "\\.{2,}" 399 | 400 | @property 401 | def EMAIL(self): 402 | return "([\\w\\d_\\.-]+)@(([\\d\\w-]+)\\.)*([\\d\\w-]+)" 403 | 404 | @property 405 | def FULL_DATE(self): 406 | return "(0?[1-9]|[12][0-9]|3[01])(\\/|-|\\.)(1[0-2]|(0?[1-9]))((\\/|-|\\.)\\d{4})" 407 | 408 | @property 409 | def MONTH(self): 410 | return "(1[0-2]|(0?[1-9]))(\\/)\\d{4}" 411 | 412 | @property 413 | def DATE(self): 414 | return "(0?[1-9]|[12][0-9]|3[01])(\\/)(1[0-2]|(0?[1-9]))" 415 | 416 | @property 417 | def TIME(self): 418 | return "(\\d\\d:\\d\\d:\\d\\d)|((0?\\d|1\\d|2[0-3])(:|h)(0?\\d|[1-5]\\d)(’|'|p|ph)?)" 419 | 420 | @property 421 | def MONEY(self): 422 | return "\\\p{Sc}\\d+([\\.,]\\d+)*|\\d+([\\.,]\\d+)*\\\p{Sc}" 423 | 424 | @property 425 | def PHONE_NUMBER(self): 426 | return "(\\(?\\+\\d{1,2}\\)?[\\s\\.-]?)?\\d{2,}[\\s\\.-]?\\d{3,}[\\s\\.-]?\\d{3,}" 427 | 428 | @property 429 | def URL(self): 430 | return "(((https?|ftp):\\/\\/|www\\.)[^\\s/$.?#].[^\\s]*)|(https?:\\/\\/)?(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*)" 431 | 432 | @property 433 | def NUMBER(self): 434 | return "[-+]?\\d+([\\.,]\\d+)*%?\\\p{Sc}?" 
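# Editorial note on the patterns in this Regex class: MONEY, NUMBER,
# NUMBERS_EXPRESSION and SHORT_NAME use Unicode property classes (\p{Sc} for
# currency symbols, \p{L} for letters), which Python's built-in re module does
# not support; the extra backslash in the literals ("\\\p{Sc}") also escapes the
# class away entirely. The third-party `regex` package does understand \p{...}.
# A minimal sketch, assuming `pip install regex`:
#
#     import regex
#     assert regex.search(r"\p{Sc}\d+([\.,]\d+)*", "$100.50")      # currency amount
#     assert regex.search(r"[\p{L}]+([\.\-][\p{L}]+)+", "Hà-Nội")  # hyphenated short name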
435 | 436 | @property 437 | def PUNCTUATION(self): 438 | return ",|\\.|:|\\?|!||-|_|\"|'|“|”|\\\or\\(|\\)|\\[|\\]|\\{|\\}|⟨|⟩|«|»|\\\\|\\/|\\‘|\\’|\\“|\\â€�|…|…|‘|’|·" 439 | 440 | @property 441 | def SPECIAL_CHAR(self): 442 | return "\\~|\\@|\\#|\\^|\\&|\\*|\\+|\\-|\\–|<|>|\\|" 443 | 444 | @property 445 | def EOS_PUNCTUATION(self): 446 | return "(\\.+|\\?|!|…)" 447 | 448 | @property 449 | def NUMBERS_EXPRESSION(self): 450 | return "[-+]?\\d+([\\.,]\\d+)*%?\\\p{Sc}?" + "([\\+\\-\\*\\/]" + "[-+]?\\d+([\\.,]\\d+)*%?\\\p{Sc}?" + ")*" 451 | 452 | @property 453 | def SHORT_NAME(self): 454 | return "([\\\p{L}]+([\\.\\-][\\\p{L}]+)+)|([\\\p{L}]+-\\d+)" 455 | 456 | @property 457 | def ALLCAP(self): 458 | return "[A-Z]+\\.[A-Z]+" 459 | @property 460 | def regexes(self): 461 | return self._regexes 462 | @regexes.setter 463 | def regexes(self,value): 464 | self._regexes = value 465 | @property 466 | def regexIndex(self): 467 | return self._regexIndex 468 | @regexIndex.setter 469 | def regexIndex(self,value): 470 | self._regexIndex = value 471 | 472 | def getRegexList(self): 473 | regex_ = Regex() 474 | if self._regexes == None: 475 | self._regexes = [] 476 | self._regexIndex = [] 477 | 478 | self._regexes.append(regex_.ELLIPSIS) 479 | self._regexIndex.append("ELLIPSIS") 480 | 481 | self._regexes.append(regex_.EMAIL) 482 | self._regexIndex.append("EMAIL") 483 | 484 | self._regexes.append(regex_.URL) 485 | self._regexIndex.append("URL") 486 | 487 | self._regexes.append(regex_.FULL_DATE) 488 | self._regexIndex.append("FULL_DATE") 489 | 490 | self._regexes.append(regex_.MONTH) 491 | self._regexIndex.append("MONTH") 492 | 493 | self._regexes.append(regex_.DATE) 494 | self._regexIndex.append("DATE") 495 | 496 | self._regexes.append(regex_.TIME) 497 | self._regexIndex.append("TIME") 498 | 499 | self._regexes.append(regex_.MONEY) 500 | self._regexIndex.append("MONEY") 501 | 502 | self._regexes.append(regex_.PHONE_NUMBER) 503 | self._regexIndex.append("PHONE_NUMBER") 504 | 505 | self._regexes.append(regex_.SHORT_NAME) 506 | self._regexIndex.append("SHORT_NAME") 507 | 508 | self._regexes.append(regex_.NUMBERS_EXPRESSION) 509 | self._regexIndex.append("NUMBERS_EXPRESSION") 510 | 511 | self._regexes.append(regex_.NUMBER) 512 | self._regexIndex.append("NUMBER") 513 | 514 | self._regexes.append(regex_.PUNCTUATION) 515 | self._regexIndex.append("PUNCTUATION") 516 | 517 | self._regexes.append(regex_.SPECIAL_CHAR) 518 | self._regexIndex.append("SPECIAL_CHAR") 519 | 520 | self._regexes.append(regex_.ALLCAP) 521 | self._regexIndex.append("ALLCAP") 522 | 523 | return self._regexes 524 | 525 | def getRegexIndex(self,regex): 526 | return self._regexIndex.index(regex.upper()) 527 | -------------------------------------------------------------------------------- /src/Utils.py: -------------------------------------------------------------------------------- 1 | import FWObject 2 | import Node 3 | import WordTag 4 | class Utils: 5 | def __init__(self): 6 | self._NORMALIZER = {} 7 | self._NORMALIZER["òa"]= "oà" 8 | self._NORMALIZER["óa"]="oá" 9 | self._NORMALIZER["ỏa"]="oả" 10 | self._NORMALIZER["õa"]="oã" 11 | self._NORMALIZER["ọa"]="oạ" 12 | self._NORMALIZER["òe"]= "oè" 13 | self._NORMALIZER["óe"]="oé" 14 | self._NORMALIZER["ỏe"]= "oẻ" 15 | self._NORMALIZER["õe"]= "oẽ" 16 | self._NORMALIZER["ọe"]= "oẹ" 17 | self._NORMALIZER["ùy"]= "uỳ" 18 | self._NORMALIZER["úy"]= "uý" 19 | self._NORMALIZER["ủy"]= "uỷ" 20 | self._NORMALIZER["ũy"]= "uỹ" 21 | self._NORMALIZER["ụy"]= "uỵ" 22 | self._NORMALIZER["Ủy"]= "Uỷ" 23 | def 
getCondition(self,strCondition:str)->FWObject: 24 | condition = FWObject.FWObject(False) 25 | # print( strCondition.split(" and ")) 26 | for rule in strCondition.split(" and "): 27 | rule = rule.strip() 28 | # print(rule) 29 | 30 | # print(rule.index(".")+1,rule.index(" ")) 31 | key = rule[rule.index(".") + 1: rule.index(" ")] 32 | value = self.getConcreteValue(rule) 33 | 34 | if key == "prevWord2": 35 | condition.context[4] = value 36 | else: 37 | if key == "prevTag2" : 38 | condition.context[5] = value 39 | else: 40 | if key == "prevWord1": 41 | condition.context[2] = value 42 | else: 43 | if key == "prevTag1": 44 | condition.context[3] = value 45 | else: 46 | if key == "word": 47 | condition.context[1] = value 48 | else: 49 | if key == "tag": 50 | condition.context[0] = value 51 | else: 52 | if key == "nextWord1": 53 | condition.context[6] = value 54 | else: 55 | if key == "nextTag1": 56 | condition.context[7] = value 57 | else: 58 | if key == "nextWord2": 59 | condition.context[8] = value 60 | else: 61 | if key == "nextTag2": 62 | condition.context[9] = value 63 | 64 | 65 | return condition 66 | 67 | def getObject(self,wordtags:list, size:int, index:int)->FWObject: 68 | object = FWObject.FWObject(True) 69 | 70 | if index > 1: 71 | object.context[4] = wordtags[index-2].word 72 | object.context[5] = wordtags[index-2].tag 73 | 74 | if index > 0: 75 | object.context[2] = wordtags[index-1].word 76 | object.context[3] = wordtags[index-1].tag 77 | 78 | currentWord = wordtags[index].word 79 | currentTag = wordtags[index].tag 80 | 81 | object.context[1] = currentWord 82 | object.context[0] = currentTag 83 | 84 | if index < size - 1: 85 | object.context[6] = wordtags[index+1].word 86 | object.context[7] = wordtags[index+1].tag 87 | 88 | if index < size - 2: 89 | object.context[8] = wordtags[index+2].word 90 | object.context[9] = wordtags[index+2].tag 91 | 92 | return object 93 | 94 | def getConcreteValue(self,strs:str)->str: 95 | if "\"\"" in strs: 96 | if "Word" in strs: 97 | return "" 98 | else: 99 | return "" 100 | conclusion = strs[strs.index("\"") + 1: len(strs) - 1] 101 | return conclusion 102 | @property 103 | def NORMALIZER(self): 104 | return self._NORMALIZER 105 | @property 106 | def NORMALIZER_KEYS(self): 107 | return self._NORMALIZER.keys() 108 | 109 | -------------------------------------------------------------------------------- /src/WordTag.py: -------------------------------------------------------------------------------- 1 | class WordTag: 2 | def __init__(self,iword,itag): 3 | self._word = iword.lower() 4 | self._form = iword 5 | self._tag = itag 6 | 7 | @property 8 | def word(self): 9 | return self._word 10 | @property 11 | def form(self): 12 | return self._form 13 | @property 14 | def tag(self): 15 | return self._tag 16 | @word.setter 17 | def word(self,value): 18 | self._word =value 19 | @form.setter 20 | def form(self,value): 21 | self._form =value 22 | @tag.setter 23 | def tag(self,value): 24 | self._tag =value 25 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import *
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | from vws import RDRSegmenter
--------------------------------------------------------------------------------
/train/RDRsegmenter.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # NOTE: this training script targets Python 2 (print statements, `except ..., e` syntax); see train/Readme.md
3 | import os
4 | import sys
5 | os.chdir("../")
6 | sys.setrecursionlimit(100000)
7 | sys.path.append(os.path.abspath(""))
8 | os.chdir("./train")
9 |
10 | from multiprocessing import Pool
11 | from SCRDRlearner.Object import FWObject, getWordTag
12 | from SCRDRlearner.SCRDRTree import SCRDRTree
13 | from SCRDRlearner.SCRDRTreeLearner import SCRDRTreeLearner
14 | from Utility.Config import NUMBER_OF_PROCESSES, THRESHOLD
15 |
16 | def printHelp():
17 |     print "\n===== Train ====="
18 |     print '\ntrain$ python RDRsegmenter.py train PATH-TO-GOLD-SEGMENTATION-TRAINING-CORPUS.BI PATH-TO-TRAINING-CORPUS.RAW.Init'
19 |     print '\nExample: \n\ntrain$ python RDRsegmenter.py train Train_gold.txt.BI Train_gold.txt.RAW.Init'
20 |
21 | def run(args = sys.argv[1:]):
22 |     if (len(args) == 0):
23 |         printHelp()
24 |     elif args[0].lower() == "train":
25 |         try:
26 |             print "\n===== Start ====="
27 |             print '\nLearn a tree model of rules from %s and %s ' % (args[1], args[2])
28 |             rdrTree = SCRDRTreeLearner(THRESHOLD[0], THRESHOLD[1])
29 |             rdrTree.learnRDRTree(args[2], args[1])
30 |             print "\nWrite the learned tree model to file ", args[2] + ".RDR"
31 |             rdrTree.writeToFile(args[2] + ".RDR")  # the model is written next to the .RAW.Init corpus
32 |             print '\nDone!'
33 |         except Exception, e:
34 |             print "\nERROR ==> ", e
35 |             printHelp()
36 |     else:
37 |         printHelp()
38 | if __name__ == "__main__":
39 |     run()
--------------------------------------------------------------------------------
/train/Readme.md:
--------------------------------------------------------------------------------
1 | # A Fast and Accurate Vietnamese Word Segmenter
2 |
3 | The implementation of RDRsegmenter (**training code**), as described in [our paper](https://www.aclweb.org/anthology/L18-1410):
4 |
5 |     @InProceedings{NguyenNVDJ2018,
6 |     author={Dat Quoc Nguyen and Dai Quoc Nguyen and Thanh Vu and Mark Dras and Mark Johnson},
7 |     title={{A Fast and Accurate Vietnamese Word Segmenter}},
8 |     booktitle={Proceedings of the 11th International Conference on Language Resources and Evaluation (LREC 2018)},
9 |     pages={2582--2587},
10 |     year={2018}
11 |     }
12 |
13 | **Please CITE** our paper whenever RDRsegmenter is used to produce published results or incorporated into other software.
14 |
15 | RDRsegmenter has also been incorporated into our Java NLP annotation pipeline [VnCoreNLP](https://github.com/vncorenlp/VnCoreNLP) for Vietnamese. VnCoreNLP provides rich linguistic annotations through key NLP components of word segmentation, POS tagging, named entity recognition and dependency parsing.
16 |
17 | ## Usage
18 |
19 | It is assumed that Java 1.8+ and Python 2.7 are already set up to run from the command line or terminal.
To train RDRsegmenter using a gold word segmentation corpus: 20 | 21 | // #1: Compile a preprocessor for data conversion: 22 | RDRsegmenter$ javac -encoding UTF-8 DataPreprocessor.java 23 | // #2: Convert the gold corpus into BI-based format, and initially segment its corresponding raw text: 24 | RDRsegmenter$ java DataPreprocessor PATH-TO-GOLD-SEGMENTATION-TRAINING-CORPUS 25 | #3: Train RDRsegmenter using .BI and .RAW.Init files produced by the preprocessor: 26 | train$ python RDRsegmenter.py train PATH-TO-GOLD-SEGMENTATION-TRAINING-CORPUS.BI PATH-TO-TRAINING-CORPUS.RAW.Init 27 | 28 | 29 | #### Examples: 30 | 31 | RDRsegmenter$ java DataPreprocessor train/Train_gold.txt 32 | RDRsegmenter$ cd train 33 | train$ python RDRsegmenter.py train Train_gold.txt.BI Train_gold.txt.RAW.Init -------------------------------------------------------------------------------- /train/SCRDRlearner/Node.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | class Node: 4 | """ 5 | A class to represent the nodes in SCRDR tree 6 | """ 7 | 8 | def __init__(self, condition, conclusion, father = None, exceptChild = None, elseChild = None, cornerstoneCases = [], depth = 0): 9 | self.condition = condition 10 | self.conclusion = conclusion 11 | self.exceptChild = exceptChild 12 | self.elseChild = elseChild 13 | self.cornerstoneCases = cornerstoneCases 14 | self.father = father 15 | self.depth = depth 16 | 17 | def satisfied(self, object): 18 | return eval(self.condition) 19 | 20 | def executeConclusion(self, object): 21 | exec(self.conclusion) 22 | 23 | def appendCornerstoneCase(self, object): 24 | self.cornerstoneCases.append(object) 25 | 26 | def check(self, object): 27 | if self.satisfied(object): 28 | self.executeConclusion(object) 29 | if self.exceptChild != None: 30 | self.exceptChild.check(object) 31 | else: 32 | if self.elseChild != None: 33 | self.elseChild.check(object) 34 | 35 | def checkDepth(self, object, length): 36 | if self.depth <= length: 37 | if self.satisfied(object): 38 | self.executeConclusion(object) 39 | if self.exceptChild != None: 40 | self.exceptChild.checkDepth(object, length) 41 | else: 42 | if self.elseChild != None: 43 | self.elseChild.checkDepth(object, length) 44 | 45 | def findRealFather(self): 46 | node = self 47 | fatherNode = node.father 48 | while True and fatherNode != None: 49 | if fatherNode.exceptChild == node: 50 | break 51 | node = fatherNode 52 | fatherNode = node.father 53 | return fatherNode 54 | 55 | def addElseChild(self, node): 56 | fatherNode = self.findRealFather() 57 | for object in fatherNode.cornerstoneCases: 58 | if node.satisfied(object): 59 | print "The new rule fires the cornerstone cases of its father node!!!" 60 | self.findRealFather().cornerstoneCases.remove(object) 61 | self.elseChild = node 62 | return True 63 | 64 | def addExceptChild(self, node): 65 | for object in self.cornerstoneCases: 66 | if node.satisfied(object): 67 | print "The new rule fires the cornerstone cases of its father node!!!" 
68 | self.cornerstoneCases.remove(object) 69 | self.exceptChild = node 70 | return True 71 | 72 | def writeToFileWithSeenCases(self, out, depth): 73 | space = tabStr(depth) 74 | out.write(space + self.condition + " : " + self.conclusion + "\n") 75 | for case in self.cornerstoneCases: 76 | out.write(" " + space + "cc: " + case.toStr() + "\n") 77 | if self.exceptChild != None: 78 | self.exceptChild.writeToFile(out, depth + 1) 79 | if self.elseChild != None: 80 | self.elseChild.writeToFile(out, depth) 81 | 82 | def writeToFile(self, out, depth): 83 | space = tabStr(depth) 84 | out.write(space + self.condition + " : " + self.conclusion + "\n") 85 | if self.exceptChild != None: 86 | self.exceptChild.writeToFile(out, depth + 1) 87 | if self.elseChild != None: 88 | self.elseChild.writeToFile(out, depth) 89 | 90 | def tabStr(length): 91 | return "".join(["\t"] * length) 92 | -------------------------------------------------------------------------------- /train/SCRDRlearner/Node.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/d3ba299b69f12b75f5b78d97d17043dff8eb52f3/train/SCRDRlearner/Node.pyc -------------------------------------------------------------------------------- /train/SCRDRlearner/Object.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | class Object: 4 | attributes = ["word", 5 | "tag", 6 | "prevWord2", 7 | "prevWord1", 8 | "nextWord1", 9 | "nextWord2", 10 | "prevTag2", 11 | "prevTag1", 12 | "nextTag1", 13 | "nextTag2", 14 | "suffixL2", 15 | "suffixL3", 16 | "suffixL4"] 17 | code = "def __init__(self" 18 | for att in attributes: 19 | code = code + ", " + att + " = None" 20 | code = code + "):\n" 21 | for att in attributes: 22 | code = code + " self." + att + "=" + att + "\n" 23 | 24 | exec(code) 25 | 26 | def toStr(self): 27 | res = "(" 28 | for att in Object.attributes: 29 | boo = eval("isinstance(self. " + att + ", str)") 30 | if not boo: 31 | res = res + str(eval("self." + att)) 32 | else: 33 | res = res + "\"" + str(eval("self." 
+ att)) + "\"" 34 | 35 | if att != Object.attributes[len(Object.attributes) - 1]: 36 | res = res + "," 37 | res += ")" 38 | return res 39 | 40 | def getWordTag(wordTag): 41 | if wordTag == "///": 42 | return "/", "/" 43 | index = wordTag.rfind("/") 44 | word = wordTag[:index].strip() 45 | tag = wordTag[index + 1:].strip() 46 | return word, tag 47 | 48 | def getObject(wordTags, index):#Sequence of "Word/Tag" 49 | word, tag = getWordTag(wordTags[index]) 50 | word = word.decode("utf-8").lower().encode("utf-8") 51 | 52 | preWord1 = preTag1 = preWord2 = preTag2 = "" 53 | nextWord1 = nextTag1 = nextWord2 = nextTag2 = "" 54 | suffixL2 = suffixL3 = suffixL4 = "" 55 | 56 | decodedW = word.decode("utf-8") 57 | if len(decodedW) >= 4: 58 | suffixL3 = decodedW[-3:].encode("utf-8") 59 | suffixL2 = decodedW[-2:].encode("utf-8") 60 | if len(decodedW) >= 5: 61 | suffixL4 = decodedW[-4:].encode("utf-8") 62 | 63 | if index > 0: 64 | preWord1, preTag1 = getWordTag(wordTags[index - 1]) 65 | preWord1 = preWord1.decode("utf-8").lower().encode("utf-8") 66 | if index > 1: 67 | preWord2, preTag2 = getWordTag(wordTags[index - 2]) 68 | preWord2 = preWord2.decode("utf-8").lower().encode("utf-8") 69 | if index < len(wordTags) - 1: 70 | nextWord1, nextTag1 = getWordTag(wordTags[index + 1]) 71 | nextWord1 = nextWord1.decode("utf-8").lower().encode("utf-8") 72 | if index < len(wordTags) - 2: 73 | nextWord2, nextTag2 = getWordTag(wordTags[index + 2]) 74 | nextWord2 = nextWord2.decode("utf-8").lower().encode("utf-8") 75 | 76 | return Object(word, tag, preWord2, preWord1, nextWord1, nextWord2, preTag2, preTag1, nextTag1, nextTag2, suffixL2, suffixL3, suffixL4) 77 | 78 | def getObjectDictionary(initializedCorpus, goldStandardCorpus): 79 | goldStandardSens = open(goldStandardCorpus, "r").readlines() 80 | initializedSens = open(initializedCorpus, "r").readlines() 81 | 82 | objects = {} 83 | 84 | j = 0 85 | for i in xrange(len(initializedSens)): 86 | init = initializedSens[i].strip() 87 | if len(init) == 0: 88 | continue 89 | 90 | while j < len(goldStandardSens) and goldStandardSens[j].strip() == "": 91 | j += 1 92 | 93 | if j >= len(goldStandardSens): 94 | continue 95 | 96 | gold = goldStandardSens[j].strip() 97 | j += 1 98 | initWordTags = init.replace("“", "''").replace("”", "''").replace("\"", "''").split() 99 | goldWordTags = gold.replace("“", "''").replace("”", "''").replace("\"", "''").split() 100 | 101 | for k in xrange(len(initWordTags)): 102 | 103 | initWord, initTag = getWordTag(initWordTags[k]) 104 | goldWord, correctTag = getWordTag(goldWordTags[k]) 105 | if initWord != goldWord: 106 | print "\nERROR ==> Raw texts extracted from the gold standard corpus and the initialized corpus are not the same!" 
107 | return None 108 | 109 | if initTag not in objects.keys(): 110 | objects[initTag] = {} 111 | objects[initTag][initTag] = [] 112 | 113 | if correctTag not in objects[initTag].keys(): 114 | objects[initTag][correctTag] = [] 115 | 116 | objects[initTag][correctTag].append(getObject(initWordTags, k)) 117 | 118 | return objects 119 | 120 | class FWObject: 121 | """ 122 | RDRPOSTaggerV1.1: new implementation scheme 123 | RDRPOSTaggerV1.2: add suffixes 124 | """ 125 | 126 | def __init__(self, check = False): 127 | self.context = [None, None, None, None, None, None, None, None, None, None, None, None, None] 128 | if(check == True): 129 | i = 0 130 | while (i < 10): 131 | self.context[i] = "" 132 | self.context[i + 1] = "" 133 | i = i + 2 134 | self.context[10] = ""# suffix 135 | self.context[11] = "" 136 | self.context[12] = "" 137 | self.notNoneIds = [] 138 | 139 | @staticmethod 140 | def getFWObject(startWordTags, index): 141 | object = FWObject(True) 142 | word, tag = getWordTag(startWordTags[index]) 143 | object.context[4] = word.decode("utf-8").lower().encode("utf-8") 144 | object.context[5] = tag 145 | 146 | decodedW = word.decode("utf-8") 147 | if len(decodedW) >= 4: 148 | object.context[10] = decodedW[-2:].encode("utf-8") 149 | object.context[11] = decodedW[-3:].encode("utf-8") 150 | if len(decodedW) >= 5: 151 | object.context[12] = decodedW[-4:].encode("utf-8") 152 | 153 | if index > 0: 154 | preWord1, preTag1 = getWordTag(startWordTags[index - 1]) 155 | object.context[2] = preWord1.decode("utf-8").lower().encode("utf-8") 156 | object.context[3] = preTag1 157 | 158 | if index > 1: 159 | preWord2, preTag2 = getWordTag(startWordTags[index - 2]) 160 | object.context[0] = preWord2.decode("utf-8").lower().encode("utf-8") 161 | object.context[1] = preTag2 162 | 163 | if index < len(startWordTags) - 1: 164 | nextWord1, nextTag1 = getWordTag(startWordTags[index + 1]) 165 | object.context[6] = nextWord1.decode("utf-8").lower().encode("utf-8") 166 | object.context[7] = nextTag1 167 | 168 | if index < len(startWordTags) - 2: 169 | nextWord2, nextTag2 = getWordTag(startWordTags[index + 2]) 170 | object.context[8] = nextWord2.decode("utf-8").lower().encode("utf-8") 171 | object.context[9] = nextTag2 172 | 173 | return object 174 | 175 | # def isSatisfied(self, fwObject): 176 | # for i in xrange(13): 177 | # key = self.context[i] 178 | # if (key is not None): 179 | # if key != fwObject.context[i]: 180 | # return False 181 | # return True 182 | -------------------------------------------------------------------------------- /train/SCRDRlearner/Object.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/d3ba299b69f12b75f5b78d97d17043dff8eb52f3/train/SCRDRlearner/Object.pyc -------------------------------------------------------------------------------- /train/SCRDRlearner/SCRDRTree.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from Node import Node 4 | from Object import FWObject 5 | 6 | class SCRDRTree: 7 | """ 8 | Single Classification Ripple Down Rules tree for Part-of-Speech and morphological tagging 9 | """ 10 | 11 | def __init__(self, root = None): 12 | self.root = root 13 | 14 | def findDepthNode(self, node, depth): 15 | while node.depth != depth: 16 | node = node.father 17 | return node 18 | 19 | def classify(self, object): 20 | self.root.check(object) 21 | 22 | def writeToFileWithSeenCases(self, outFile): 23 
| out = open(outFile, "w") 24 | self.root.writeToFileWithSeenCases(out, 0) 25 | out.close() 26 | 27 | def writeToFile(self, outFile): 28 | out = open(outFile, "w") 29 | self.root.writeToFile(out, 0) 30 | out.close() 31 | 32 | #Build tree from file containing rules using FWObject 33 | def constructSCRDRtreeFromRDRfile(self, rulesFilePath): 34 | 35 | self.root = Node(FWObject(False), "NN", None, None, None, [], 0) 36 | currentNode = self.root 37 | currentDepth = 0 38 | 39 | rulesFile = open(rulesFilePath, "r") 40 | lines = rulesFile.readlines() 41 | 42 | for i in xrange(1, len(lines)): 43 | line = lines[i] 44 | depth = 0 45 | for c in line: 46 | if c == '\t': 47 | depth = depth + 1 48 | else: 49 | break 50 | 51 | line = line.strip() 52 | if len(line) == 0: 53 | continue 54 | 55 | temp = line.find("cc") 56 | if temp == 0: 57 | continue 58 | 59 | condition = getCondition(line.split(" : ", 1)[0].strip()) 60 | conclusion = getConcreteValue(line.split(" : ", 1)[1].strip()) 61 | 62 | node = Node(condition, conclusion, None, None, None, [], depth) 63 | 64 | if depth > currentDepth: 65 | currentNode.exceptChild = node 66 | elif depth == currentDepth: 67 | currentNode.elseChild = node 68 | else: 69 | while currentNode.depth != depth: 70 | currentNode = currentNode.father 71 | currentNode.elseChild = node 72 | 73 | node.father = currentNode 74 | currentNode = node 75 | currentDepth = depth 76 | 77 | def findFiredNode(self, fwObject): 78 | currentNode = self.root 79 | firedNode = None 80 | obContext = fwObject.context 81 | while True: 82 | #Check whether object satisfying the current node's condition 83 | cnContext = currentNode.condition.context 84 | notNoneIds = currentNode.condition.notNoneIds 85 | satisfied = True 86 | for i in notNoneIds: 87 | if cnContext[i] != obContext[i]: 88 | satisfied = False 89 | break 90 | 91 | if(satisfied): 92 | firedNode = currentNode 93 | exChild = currentNode.exceptChild 94 | if exChild is None: 95 | break 96 | else: 97 | currentNode = exChild 98 | else: 99 | elChild = currentNode.elseChild 100 | if elChild is None: 101 | break 102 | else: 103 | currentNode = elChild 104 | return firedNode 105 | 106 | # def findFiredNodeInDepth(self, fwObject, depth): 107 | # currentNode = self.root 108 | # firedNode = None 109 | # while True: 110 | # if(currentNode.condition.isSatisfied(fwObject)): 111 | # firedNode = currentNode 112 | # if currentNode.exceptChild is None: 113 | # break 114 | # else: 115 | # currentNode = currentNode.exceptChild 116 | # else: 117 | # if currentNode.elseChild is None: 118 | # break 119 | # else: 120 | # currentNode = currentNode.elseChild 121 | # if currentNode.depth > depth: 122 | # break 123 | # return firedNode 124 | # 125 | # #Count number of nodes in exception-structure levels 126 | # def countNodes(self, inDepth): 127 | # currentNode = self.root 128 | # nodeQueue = [] 129 | # nodeQueue.append(currentNode) 130 | # count = 0 131 | # while len(nodeQueue) > 0: 132 | # currentNode = nodeQueue[0] 133 | # #Current node's depth is smaller than a given threshold 134 | # if currentNode.depth <= inDepth: 135 | # count += 1 136 | # if currentNode.exceptChild is not None: 137 | # nodeQueue.append(currentNode.exceptChild) 138 | # if currentNode.elseChild is not None: 139 | # nodeQueue.append(currentNode.elseChild) 140 | # nodeQueue = nodeQueue[1:] 141 | # return count 142 | 143 | def getConcreteValue(str): 144 | if str.find('""') > 0: 145 | if str.find("Word") > 0: 146 | return "" 147 | elif str.find("suffixL") > 0: 148 | return "" 149 | else: 150 | return "" 
151 | return str[str.find("\"") + 1 : len(str) - 1] 152 | 153 | def getCondition(strCondition): 154 | condition = FWObject(False) 155 | for rule in strCondition.split(" and "): 156 | rule = rule.strip() 157 | key = rule[rule.find(".") + 1 : rule.find(" ")] 158 | value = getConcreteValue(rule) 159 | 160 | if key == "prevWord2": 161 | condition.context[0] = value 162 | elif key == "prevTag2": 163 | condition.context[1] = value 164 | elif key == "prevWord1": 165 | condition.context[2] = value 166 | elif key == "prevTag1": 167 | condition.context[3] = value 168 | elif key == "word": 169 | condition.context[4] = value 170 | elif key == "tag": 171 | condition.context[5] = value 172 | elif key == "nextWord1": 173 | condition.context[6] = value 174 | elif key == "nextTag1": 175 | condition.context[7] = value 176 | elif key == "nextWord2": 177 | condition.context[8] = value 178 | elif key == "nextTag2": 179 | condition.context[9] = value 180 | elif key == "suffixL2": 181 | condition.context[10] = value 182 | elif key == "suffixL3": 183 | condition.context[11] = value 184 | elif key == "suffixL4": 185 | condition.context[12] = value 186 | for i in xrange(13): 187 | if condition.context[i] is not None: 188 | condition.notNoneIds.append(i) 189 | return condition 190 | 191 | if __name__ == "__main__": 192 | pass 193 | -------------------------------------------------------------------------------- /train/SCRDRlearner/SCRDRTree.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/d3ba299b69f12b75f5b78d97d17043dff8eb52f3/train/SCRDRlearner/SCRDRTree.pyc -------------------------------------------------------------------------------- /train/SCRDRlearner/SCRDRTreeLearner.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from Node import Node 4 | from Object import getObjectDictionary 5 | from SCRDRTree import SCRDRTree 6 | 7 | #Generate concrete rules based on input object of 5-word window context object 8 | def generateRules(object): 9 | # 1. Current word 10 | rule1 = "object.word == \"" + object.word + "\"" 11 | # 2. Next 1st word 12 | rule2 = "object.nextWord1 == \"" + object.nextWord1 + "\"" 13 | # 3. Next 2nd word 14 | rule3 = "object.nextWord2 == \"" + object.nextWord2 + "\"" 15 | # 4. Previous 1st word 16 | rule4 = "object.prevWord1 == \"" + object.prevWord1 + "\"" 17 | # 5. Previous 2nd word 18 | rule5 = "object.prevWord2 == \"" + object.prevWord2 + "\"" 19 | 20 | # 6. Current word and next 1st word 21 | rule6 = rule1 + " and " + rule2 22 | # 7. Previous 1st word and current word 23 | rule7 = rule4 + " and " + rule1 24 | # 11. Previous 1st word and next 1st word 25 | rule11 = rule4 + " and " + rule2 26 | # 29. Next 1st word and next 2nd word 27 | #rule29 = rule2 + " and " + rule3 28 | # 30. Previous 2nd word and previous 1st word 29 | #rule30 = rule5 + " and " + rule4 30 | # 19. Current word and next 2nd word 31 | rule19 = rule1 + " and " + rule3 32 | # 20. Previous 2nd word and current word 33 | rule20 = rule5 + " and " + rule1 34 | 35 | # 8. Current word, next 1st word and next 2nd word 36 | rule8 = rule6 + " and " + rule3 37 | # 9. Previous 2nd word, previous 1st word and current word 38 | rule9 = rule5 + " and " + rule7 39 | # 10. Previous 1st word, current word and next 1st word 40 | rule10 = rule4 + " and " + rule6 41 | 42 | 43 | # 12. 
Next 1st tag 44 | rule12 = "object.nextTag1 == \"" + object.nextTag1 + "\"" 45 | # 13. Next 2nd tag 46 | rule13 = "object.nextTag2 == \"" + object.nextTag2 + "\"" 47 | # 14. Previous 1st tag 48 | rule14 = "object.prevTag1 == \"" + object.prevTag1 + "\"" 49 | # 15. Previous 2nd tag 50 | rule15 = "object.prevTag2 == \"" + object.prevTag2 + "\"" 51 | # 16. Next 1st tag and next 2nd tag 52 | rule16 = rule12 + " and " + rule13 53 | # 17. Previous 2nd tag and previous 1st tag 54 | rule17 = rule15 + " and " + rule14 55 | # 18. Previous 1st tag and next 1st tag 56 | rule18 = rule14 + " and " + rule12 57 | 58 | # 21. Current word and next 1st tag 59 | rule21 = rule1 + " and " + rule12 60 | # 22. Current word and previous 1st tag 61 | rule22 = rule14 + " and " + rule1 62 | # 23. Previous 1st tag, current word and next 1st tag 63 | rule23 = rule14 + " and " + rule21 64 | # 24. Current word and 2 next tags. 65 | rule24 = rule1 + " and " + rule16 66 | # 25. 2 previous tags and current word 67 | rule25 = rule17 + " and " + rule1 68 | # 26. 2-character suffix 69 | #rule26 = "object.suffixL2 == \"" + object.suffixL2 + "\"" 70 | # 27. 3-character suffix 71 | #rule27 = "object.suffixL3 == \"" + object.suffixL3 + "\"" 72 | # 28. 4-character suffix 73 | #rule28 = "object.suffixL4 == \"" + object.suffixL4 + "\"" 74 | 75 | rules = [] 76 | rules.append(rule1) 77 | rules.append(rule2) 78 | rules.append(rule3) 79 | rules.append(rule4) 80 | rules.append(rule5) 81 | rules.append(rule6) 82 | rules.append(rule7) 83 | rules.append(rule8) 84 | rules.append(rule9) 85 | rules.append(rule10) 86 | rules.append(rule11) 87 | rules.append(rule12) 88 | rules.append(rule13) 89 | rules.append(rule14) 90 | rules.append(rule15) 91 | rules.append(rule16) 92 | rules.append(rule17) 93 | rules.append(rule18) 94 | rules.append(rule19) 95 | rules.append(rule20) 96 | rules.append(rule21) 97 | rules.append(rule22) 98 | rules.append(rule23) 99 | rules.append(rule24) 100 | rules.append(rule25) 101 | #rules.append(rule26) 102 | #rules.append(rule27) 103 | #rules.append(rule28) 104 | #rules.append(rule29) 105 | #rules.append(rule30) 106 | 107 | rules = set(rules) 108 | return rules 109 | 110 | 111 | def countMatching(objects, ruleNotIn): 112 | counts = {} 113 | matchedObjects = {} 114 | for object in objects: 115 | rules = generateRules(object) 116 | for rule in rules: 117 | if rule in ruleNotIn: 118 | continue 119 | counts[rule] = counts.setdefault(rule, 0) + 1 120 | matchedObjects.setdefault(rule, []).append(object) 121 | return counts, matchedObjects 122 | 123 | def satisfy(object, rule): 124 | return eval(rule) 125 | 126 | def fire(rule, cornerstoneCases): 127 | for object in cornerstoneCases: 128 | if satisfy(object, rule): 129 | return True 130 | return False 131 | 132 | def generateRulesFromObjectSet(objects): 133 | res = [] 134 | for object in objects: 135 | rules = generateRules(object) 136 | res += rules 137 | return res 138 | 139 | class SCRDRTreeLearner(SCRDRTree): 140 | def __init__(self, iThreshold = 2, mThreshold = 2): 141 | self.improvedThreshold = iThreshold 142 | self.matchedThreshold = mThreshold 143 | 144 | #For layer-2 exception structure 145 | def findMostImprovingRuleForTag(self, startTag, correctTag, correctCounts, wrongObjects): 146 | impCounts, affectedObjects = countMatching(wrongObjects, []) 147 | 148 | maxImp = -1000000 149 | bestRule = "" 150 | for rule in impCounts: 151 | temp = impCounts[rule] 152 | if rule in correctCounts: 153 | temp -= correctCounts[rule] 154 | 155 | if temp > maxImp: 156 | maxImp = temp 157 
| bestRule = rule 158 | 159 | if maxImp == -1000000: 160 | affectedObjects[bestRule] = [] 161 | 162 | return bestRule, maxImp, affectedObjects[bestRule] 163 | 164 | def findMostEfficientRule(self, startTag, objects, correctCounts): 165 | maxImp = -1000000 166 | rule = "" 167 | correctTag = "" 168 | cornerstoneCases = [] 169 | 170 | for tag in objects: 171 | if tag == startTag: 172 | continue 173 | if len(objects[tag]) <= maxImp or len(objects[tag]) < self.improvedThreshold: 174 | continue 175 | 176 | ruleTemp, imp, affectedObjects = self.findMostImprovingRuleForTag(startTag, correctTag, correctCounts, objects[tag]) 177 | if imp >= self.improvedThreshold and imp > maxImp: 178 | maxImp = imp 179 | rule = ruleTemp 180 | correctTag = tag 181 | cornerstoneCases = affectedObjects 182 | 183 | needToCorrectObjects = {} 184 | errorRaisingObjects = [] 185 | if maxImp > -1000000: 186 | for tag in objects: 187 | if tag != correctTag: 188 | for object in objects[tag]: 189 | if satisfy(object, rule): 190 | needToCorrectObjects.setdefault(tag, []).append(object) 191 | if tag == startTag: 192 | errorRaisingObjects.append(object) 193 | 194 | return rule, correctTag, maxImp, cornerstoneCases, needToCorrectObjects, errorRaisingObjects 195 | 196 | def findMostMatchingRule(self, matchingCounts): 197 | correctTag = "" 198 | bestRule = "" 199 | maxCount = -1000000 200 | 201 | for tag in matchingCounts: 202 | for rule in matchingCounts[tag]: 203 | if matchingCounts[tag][rule] >= self.matchedThreshold and matchingCounts[tag][rule] > maxCount: 204 | maxCount = matchingCounts[tag][rule] 205 | bestRule = rule 206 | correctTag = tag 207 | 208 | return bestRule, correctTag 209 | 210 | def buildNodeForObjectSet(self, objects, root): 211 | cornerstoneCaseRules = generateRulesFromObjectSet(root.cornerstoneCases) 212 | 213 | matchingCounts = {} 214 | matchingObjects = {} 215 | for tag in objects: 216 | matchingCounts[tag], matchingObjects[tag] = countMatching(objects[tag], cornerstoneCaseRules) 217 | 218 | total = 0 219 | for tag in objects: 220 | total += len(objects[tag]) 221 | 222 | currentNode = root 223 | elseChild = False 224 | while True: 225 | rule, correctTag = self.findMostMatchingRule(matchingCounts) 226 | 227 | if rule == "": 228 | break 229 | 230 | cornerstoneCases = matchingObjects[correctTag][rule] 231 | 232 | needToCorrectObjects = {} 233 | for tag in objects: 234 | if rule in matchingObjects[tag]: 235 | if tag != correctTag: 236 | needToCorrectObjects[tag] = matchingObjects[tag][rule] 237 | for object in matchingObjects[tag][rule]: 238 | rules = generateRules(object) 239 | for rule1 in rules: 240 | if rule1 not in matchingCounts[tag]: 241 | continue 242 | matchingCounts[tag][rule1] -= 1 243 | 244 | node = Node(rule, "object.conclusion = \"" + correctTag + "\"", currentNode, None, None, cornerstoneCases) 245 | 246 | if not elseChild: 247 | currentNode.exceptChild = node 248 | elseChild = True 249 | else: 250 | currentNode.elseChild = node 251 | 252 | currentNode = node 253 | self.buildNodeForObjectSet(needToCorrectObjects, currentNode) 254 | 255 | def learnRDRTree(self, initializedCorpus, goldStandardCorpus): 256 | self.root = Node("True", "object.conclusion = \"NN\"", None, None, None, [], 0) 257 | 258 | objects = getObjectDictionary(initializedCorpus, goldStandardCorpus) 259 | 260 | currentNode = self.root 261 | for initializedTag in objects: 262 | print("\n===> Building exception rules for tag %s" % initializedTag) 263 | correctCounts = {} 264 | for object in objects[initializedTag][initializedTag]: 265 | 
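# Editor's note: this loop pre-counts, for every object the initial segmenter
# already tags correctly, how often each candidate rule matches it.
# findMostEfficientRule() subtracts these counts from the number of wrong
# objects a rule would fix, so a rule is only selected on net improvement;
# e.g. a rule matching 5 wrong cases and 2 correct ones scores 5 - 2 = 3,
# and is kept only if that score reaches improvedThreshold.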
rules = generateRules(object) 266 | for rule in rules: 267 | correctCounts[rule] = correctCounts.setdefault(rule, 0) + 1 268 | 269 | node = Node("object.tag == \"" + initializedTag + "\"", "object.conclusion = \"" + initializedTag + "\"", self.root, None, None, [], 1) 270 | 271 | if self.root.exceptChild == None: 272 | self.root.exceptChild = node 273 | else: 274 | currentNode.elseChild = node 275 | 276 | currentNode = node 277 | objectSet = objects[initializedTag] 278 | 279 | elseChild = False 280 | currentNode1 = currentNode 281 | while True: 282 | rule, correctTag, imp, cornerstoneCases, needToCorrectObjects, errorRaisingObjects = self.findMostEfficientRule(initializedTag, objectSet, correctCounts) 283 | if imp < self.improvedThreshold: 284 | break 285 | 286 | node = Node(rule, "object.conclusion = \"" + correctTag + "\"", currentNode, None, None, cornerstoneCases, 2) 287 | 288 | if not elseChild: 289 | currentNode1.exceptChild = node 290 | elseChild = True 291 | else: 292 | currentNode1.elseChild = node 293 | 294 | currentNode1 = node 295 | 296 | for object in cornerstoneCases: 297 | objectSet[correctTag].remove(object) 298 | 299 | for tag in needToCorrectObjects: 300 | for object in needToCorrectObjects[tag]: 301 | objectSet[tag].remove(object) 302 | 303 | for object in errorRaisingObjects: 304 | rules = generateRules(object) 305 | for rule in rules: 306 | correctCounts[rule] -= 1 307 | 308 | self.buildNodeForObjectSet(needToCorrectObjects, currentNode1) 309 | -------------------------------------------------------------------------------- /train/SCRDRlearner/SCRDRTreeLearner.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/d3ba299b69f12b75f5b78d97d17043dff8eb52f3/train/SCRDRlearner/SCRDRTreeLearner.pyc -------------------------------------------------------------------------------- /train/SCRDRlearner/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/d3ba299b69f12b75f5b78d97d17043dff8eb52f3/train/SCRDRlearner/__init__.py -------------------------------------------------------------------------------- /train/SCRDRlearner/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/d3ba299b69f12b75f5b78d97d17043dff8eb52f3/train/SCRDRlearner/__init__.pyc -------------------------------------------------------------------------------- /train/Train_gold.txt.RAW.Init.RDR: -------------------------------------------------------------------------------- 1 | True : object.conclusion = "NN" 2 | object.tag == "I" : object.conclusion = "I" 3 | object.prevWord1 == "cho" : object.conclusion = "B" 4 | object.word == "phép" : object.conclusion = "I" 5 | object.word == "ra" : object.conclusion = "B" 6 | object.prevWord1 == "ngoài" and object.word == "ra" : object.conclusion = "I" 7 | object.prevWord1 == "thật" : object.conclusion = "I" 8 | object.prevWord1 == "xem" : object.conclusion = "I" 9 | object.prevWord1 == "lẽ" and object.word == "ra" : object.conclusion = "I" 10 | object.prevWord1 == "đến" : object.conclusion = "B" 11 | object.prevTag1 == "B" and object.word == "nỗi" and object.nextTag1 == "B" : object.conclusion = "I" 12 | object.word == "khi" and object.nextTag1 == "B" : object.conclusion = "B" 13 | object.prevWord1 == "có" and object.word == 
"khi" : object.conclusion = "I" 14 | object.prevTag1 == "B" and object.word == "nhà" : object.conclusion = "B" 15 | object.prevWord2 == "," and object.prevWord1 == "người" and object.word == "nhà" : object.conclusion = "I" 16 | object.prevWord1 == "con" : object.conclusion = "B" 17 | object.word == "tin" : object.conclusion = "I" 18 | object.word == "số" : object.conclusion = "I" 19 | object.prevTag1 == "B" and object.word == "người" : object.conclusion = "I" 20 | object.word == "cái" and object.nextTag1 == "B" : object.conclusion = "I" 21 | object.prevTag1 == "B" and object.word == "vào" and object.nextTag1 == "B" : object.conclusion = "B" 22 | object.prevTag1 == "B" and object.word == "có" and object.nextTag1 == "B" : object.conclusion = "B" 23 | object.prevTag1 == "B" and object.word == "đó" : object.conclusion = "B" 24 | object.prevWord2 == "" and object.prevWord1 == "do" and object.word == "đó" : object.conclusion = "I" 25 | object.prevWord1 == "đâu" and object.word == "đó" : object.conclusion = "I" 26 | object.prevWord1 == "quận" : object.conclusion = "B" 27 | object.prevTag1 == "B" and object.word == "huyện" and object.nextTag1 == "B" : object.conclusion = "I" 28 | object.prevWord1 == "năm" : object.conclusion = "B" 29 | object.prevTag1 == "B" and object.word == "ngoái" : object.conclusion = "I" 30 | object.word == "tháng" and object.nextTag1 == "B" : object.conclusion = "I" 31 | object.prevTag1 == "B" and object.word == "như" : object.conclusion = "B" 32 | object.prevWord1 == "hình" : object.conclusion = "I" 33 | object.word == "cao" : object.conclusion = "B" 34 | object.prevWord1 == "văn" : object.conclusion = "I" 35 | object.prevWord1 == "vùng" and object.word == "cao" : object.conclusion = "I" 36 | object.prevWord1 == "bên" : object.conclusion = "B" 37 | object.word == "người" : object.conclusion = "B" 38 | object.prevTag1 == "B" and object.word == "không" and object.nextTag1 == "B" : object.conclusion = "B" 39 | object.word == "không" and object.nextWord1 == "ai" : object.conclusion = "I" 40 | object.prevWord2 == "với" and object.prevWord1 == "khoảng" and object.word == "không" : object.conclusion = "I" 41 | object.prevTag1 == "B" and object.word == "nhau" and object.nextTag1 == "B" : object.conclusion = "B" 42 | object.prevTag1 == "B" and object.word == "phải" : object.conclusion = "B" 43 | object.word == "từ" and object.nextTag1 == "B" and object.nextTag2 == "B" : object.conclusion = "B" 44 | object.prevTag2 == "B" and object.prevTag1 == "B" and object.word == "lại" : object.conclusion = "B" 45 | object.prevWord1 == "trở" : object.conclusion = "I" 46 | object.prevWord1 == "đi" : object.conclusion = "I" 47 | object.prevWord1 == "đi" : object.conclusion = "B" 48 | object.prevWord2 == "''" and object.word == "đêm" : object.conclusion = "I" 49 | object.word == "khách" and object.nextTag1 == "B" and object.nextTag2 == "I" : object.conclusion = "I" 50 | object.prevWord1 == "phó" : object.conclusion = "B" 51 | object.prevTag1 == "B" and object.word == "ở" and object.nextTag1 == "B" : object.conclusion = "B" 52 | object.prevTag1 == "B" and object.word == "hề" and object.nextTag1 == "B" : object.conclusion = "B" 53 | object.prevWord1 == "lúc" : object.conclusion = "B" 54 | object.word == "sau" : object.conclusion = "B" 55 | object.prevTag1 == "B" and object.word == "trên" : object.conclusion = "B" 56 | object.prevWord1 == "so" and object.word == "với" : object.conclusion = "B" 57 | object.prevWord1 == "các" : object.conclusion = "B" 58 | object.prevWord1 == "tháng" : object.conclusion 
= "B" 59 | object.prevWord1 == "có" and object.word == "một" : object.conclusion = "B" 60 | object.prevWord1 == "có" and object.word == "một" and object.nextWord1 == "không" : object.conclusion = "I" 61 | object.prevTag1 == "B" and object.word == "về" : object.conclusion = "B" 62 | object.prevTag1 == "B" and object.word == "gì" and object.nextTag1 == "B" : object.conclusion = "B" 63 | object.word == "ngày" : object.conclusion = "B" 64 | object.prevTag1 == "B" and object.word == "đi" and object.nextTag1 == "B" : object.conclusion = "B" 65 | object.prevWord1 == "bọn" : object.conclusion = "B" 66 | object.word == "khơi" : object.conclusion = "B" 67 | object.prevTag2 == "I" and object.prevTag1 == "B" and object.word == "là" : object.conclusion = "B" 68 | object.prevTag1 == "B" and object.word == "lúc" and object.nextTag1 == "B" : object.conclusion = "B" 69 | object.prevTag2 == "B" and object.prevTag1 == "B" and object.word == "lên" : object.conclusion = "B" 70 | object.prevTag1 == "B" and object.word == "cần" and object.nextTag1 == "B" : object.conclusion = "B" 71 | object.prevWord1 == "lấy" : object.conclusion = "B" 72 | object.prevWord1 == "cuộc" and object.word == "họp" : object.conclusion = "B" 73 | object.word == "nhưng" and object.nextTag1 == "B" : object.conclusion = "B" 74 | object.word == "cho" and object.nextTag1 == "B" : object.conclusion = "B" 75 | object.prevTag1 == "B" and object.word == "điều" : object.conclusion = "B" 76 | object.prevWord1 == "người" and object.word == "ta" : object.conclusion = "B" 77 | object.prevWord1 == "của" : object.conclusion = "B" 78 | object.prevWord1 == "cô" and object.word == "gái" : object.conclusion = "B" 79 | object.prevTag1 == "B" and object.word == "hai" and object.nextTag1 == "B" : object.conclusion = "B" 80 | object.prevTag1 == "B" and object.word == "lần" and object.nextTag1 == "B" : object.conclusion = "B" 81 | object.prevWord1 == "câu" and object.word == "hỏi" : object.conclusion = "B" 82 | object.word == "đường" and object.nextTag1 == "B" and object.nextTag2 == "I" : object.conclusion = "B" 83 | object.word == "ai" : object.conclusion = "B" 84 | object.prevWord1 == "đứng" : object.conclusion = "B" 85 | object.prevWord1 == "đứng" and object.word == "tên" and object.nextWord1 == "đăng" : object.conclusion = "I" 86 | object.prevWord1 == "chữa" and object.word == "bệnh" : object.conclusion = "B" 87 | object.prevTag2 == "B" and object.prevTag1 == "B" and object.word == "theo" : object.conclusion = "B" 88 | object.prevWord1 == "có" and object.word == "ý" : object.conclusion = "B" 89 | object.prevWord1 == "mùa" : object.conclusion = "B" 90 | object.word == "qua" and object.nextTag1 == "B" and object.nextTag2 == "I" : object.conclusion = "B" 91 | object.prevWord1 == "có" and object.word == "nghĩa" : object.conclusion = "B" 92 | object.prevWord1 == "về" : object.conclusion = "B" 93 | object.word == "thấy" and object.nextTag1 == "B" : object.conclusion = "B" 94 | object.prevWord1 == "cảm" : object.conclusion = "I" 95 | object.word == "cơm" : object.conclusion = "B" 96 | object.prevWord1 == "vì" and object.word == "sao" : object.conclusion = "B" 97 | object.prevWord1 == "một" and object.word == "thời" : object.conclusion = "B" 98 | object.prevWord1 == "chợ" and object.word == "mới" : object.conclusion = "B" 99 | object.prevWord1 == "có" and object.word == "công" : object.conclusion = "B" 100 | object.prevWord1 == "còn" and object.word == "lại" : object.conclusion = "B" 101 | object.prevWord1 == "buổi" : object.conclusion = "B" 102 | object.prevWord1 
== "cá" and object.nextWord1 == "ngọt" : object.conclusion = "B" 103 | object.prevWord2 == "nạo" and object.word == "thai" : object.conclusion = "B" 104 | object.prevWord1 == "cùng" and object.word == "với" : object.conclusion = "B" 105 | object.prevWord1 == "gần" and object.word == "đây" : object.conclusion = "B" 106 | object.word == "lễ" and object.nextTag1 == "B" and object.nextTag2 == "B" : object.conclusion = "B" 107 | object.prevTag1 == "B" and object.word == "dậy" and object.nextTag1 == "B" : object.conclusion = "B" 108 | object.prevWord1 == "cánh" : object.conclusion = "B" 109 | object.prevTag1 == "B" and object.word == "nêu" and object.nextTag1 == "B" : object.conclusion = "B" 110 | object.prevTag1 == "B" and object.word == "cái" : object.conclusion = "B" 111 | object.prevTag1 == "B" and object.word == "thép" : object.conclusion = "B" 112 | object.prevWord1 == "trong" and object.word == "vòng" : object.conclusion = "B" 113 | object.prevTag1 == "B" and object.word == "nợ" : object.conclusion = "B" 114 | object.prevWord1 == "chặn" and object.word == "đường" : object.conclusion = "B" 115 | object.word == "vợ" : object.conclusion = "B" 116 | object.prevWord1 == "nhà" and object.word == "chồng" : object.conclusion = "B" 117 | object.prevWord1 == "máy" and object.word == "điện" : object.conclusion = "B" 118 | object.prevWord1 == "tuyến" and object.word == "đường" : object.conclusion = "B" 119 | object.prevWord1 == "vụ" and object.word == "án" : object.conclusion = "B" 120 | object.prevWord1 == "từ" and object.word == "đầu" : object.conclusion = "B" 121 | object.prevWord1 == "bóp" : object.conclusion = "B" 122 | object.prevWord1 == "ngay" and object.word == "cả" : object.conclusion = "B" 123 | object.prevWord1 == "chụp" : object.conclusion = "B" 124 | object.prevWord1 == "niềm" : object.conclusion = "B" 125 | object.prevWord1 == "lứa" : object.conclusion = "B" 126 | object.prevWord1 == "nổ" and object.word == "máy" : object.conclusion = "B" 127 | object.prevWord1 == "chiều" and object.word == "nay" : object.conclusion = "B" 128 | object.prevTag1 == "B" and object.word == "vn" and object.nextTag1 == "B" : object.conclusion = "B" 129 | object.prevTag1 == "B" and object.word == "lão" : object.conclusion = "B" 130 | object.word == "thuốc" and object.nextTag1 == "B" : object.conclusion = "B" 131 | object.prevTag1 == "B" and object.word == "ba" and object.nextTag1 == "B" : object.conclusion = "B" 132 | object.prevWord1 == "vài" and object.word == "ba" : object.conclusion = "I" 133 | object.prevWord1 == "mái" : object.conclusion = "B" 134 | object.prevWord1 == "đưa" : object.conclusion = "B" 135 | object.prevTag2 == "B" and object.prevTag1 == "B" and object.word == "tháng" : object.conclusion = "B" 136 | object.prevWord1 == "trứng" and object.word == "gà" : object.conclusion = "B" 137 | object.word == "bé" and object.nextTag1 == "B" : object.conclusion = "B" 138 | object.prevTag2 == "I" : object.conclusion = "I" 139 | object.word == "đến" : object.conclusion = "B" 140 | object.prevWord1 == "cục" and object.word == "trưởng" : object.conclusion = "B" 141 | object.prevWord1 == "đẫm" and object.word == "máu" : object.conclusion = "B" 142 | object.prevWord1 == "hồi" and object.word == "trước" : object.conclusion = "B" 143 | object.prevWord1 == "băng" and object.word == "cướp" : object.conclusion = "B" 144 | object.prevWord1 == "trưởng" and object.word == "phòng" : object.conclusion = "B" 145 | object.prevWord1 == "không" and object.word == "phép" : object.conclusion = "B" 146 | object.prevWord1 == 
"quán" and object.word == "nước" : object.conclusion = "B" 147 | object.prevWord1 == "dập" : object.conclusion = "B" 148 | object.prevWord1 == "bỏ" and object.word == "trốn" : object.conclusion = "B" 149 | object.prevWord1 == "có" and object.word == "con" : object.conclusion = "B" 150 | object.prevWord1 == "vượt" and object.word == "qua" : object.conclusion = "B" 151 | object.prevWord1 == "gầm" : object.conclusion = "B" 152 | object.prevWord1 == "làm" and object.word == "chủ" : object.conclusion = "B" 153 | object.prevWord1 == "tờ" and object.word == "báo" : object.conclusion = "B" 154 | object.word == "đồng" and object.nextWord1 == "hồ" : object.conclusion = "B" 155 | object.prevWord1 == "mang" : object.conclusion = "B" 156 | object.prevWord1 == "nghe" and object.word == "tiếng" : object.conclusion = "B" 157 | object.prevWord1 == "ra" and object.word == "sao" : object.conclusion = "B" 158 | object.prevWord1 == "giày" and object.word == "an" and object.nextWord1 == "giang" : object.conclusion = "B" 159 | object.word == "lãi" and object.nextTag1 == "B" : object.conclusion = "B" 160 | object.prevWord1 == "như" and object.word == "thế" and object.nextWord1 == "này" : object.conclusion = "B" 161 | object.word == "ruột" and object.nextTag1 == "B" : object.conclusion = "B" 162 | object.prevWord1 == "ngọn" and object.word == "lửa" : object.conclusion = "B" 163 | object.prevWord1 == "có" and object.word == "lợi" : object.conclusion = "B" 164 | object.prevWord1 == "giấy" and object.word == "chứng" : object.conclusion = "B" 165 | object.prevWord2 == "toà" and object.word == "nhân" : object.conclusion = "B" 166 | object.word == "hộ" and object.nextTag1 == "I" : object.conclusion = "B" 167 | object.word == "gia" and object.nextTag1 == "I" : object.conclusion = "B" 168 | object.nextWord1 == "khát" : object.conclusion = "B" 169 | object.prevWord1 == "mức" and object.word == "lương" : object.conclusion = "B" 170 | object.prevWord1 == "sẽ" : object.conclusion = "B" 171 | object.prevWord1 == "bắt" and object.word == "sống" : object.conclusion = "B" 172 | object.word == "xong" and object.nextTag1 == "B" : object.conclusion = "B" 173 | object.prevWord1 == "khi" and object.word == "nào" : object.conclusion = "B" 174 | object.word == "hôm" : object.conclusion = "B" 175 | object.prevWord1 == "tiêu" and object.word == "độc" : object.conclusion = "B" 176 | object.word == "rõ" : object.conclusion = "B" 177 | object.prevTag1 == "B" and object.word == "sớm" : object.conclusion = "B" 178 | object.prevWord1 == "tiến" and object.word == "tới" : object.conclusion = "B" 179 | object.prevWord1 == "viện" and object.nextWord1 == "viện" : object.conclusion = "B" 180 | object.prevTag1 == "B" and object.word == "nhìn" : object.conclusion = "B" 181 | object.word == "lại" and object.nextWord2 == "''" : object.conclusion = "B" 182 | object.word == "hàng" and object.nextWord1 == "hoá" : object.conclusion = "B" 183 | object.word == "công" and object.nextWord1 == "khai" : object.conclusion = "B" 184 | object.word == "giá" and object.nextWord1 == "trị" : object.conclusion = "B" 185 | object.prevTag1 == "B" and object.word == "khỏi" and object.nextTag1 == "B" : object.conclusion = "B" 186 | object.prevWord1 == "cơn" : object.conclusion = "B" 187 | object.prevWord1 == "áp" and object.word == "sát" : object.conclusion = "B" 188 | object.prevWord1 == "mảnh" and object.word == "đất" : object.conclusion = "B" 189 | object.prevWord1 == "gật" and object.word == "đầu" : object.conclusion = "B" 190 | object.prevWord1 == "trước" and object.word 
== "mặt" : object.conclusion = "B" 191 | object.prevWord1 == "đau" and object.word == "bụng" : object.conclusion = "B" 192 | object.prevTag2 == "B" and object.prevTag1 == "B" and object.word == "vai" : object.conclusion = "B" 193 | object.prevWord1 == "có" and object.word == "số" : object.conclusion = "B" 194 | object.prevWord1 == "có" and object.word == "cơ" : object.conclusion = "B" 195 | object.prevWord1 == "vẻ" : object.conclusion = "B" 196 | object.prevWord1 == "mực" and object.word == "nước" : object.conclusion = "B" 197 | object.prevWord1 == "gây" and object.word == "bệnh" : object.conclusion = "B" 198 | object.word == "năm" and object.nextTag1 == "B" and object.nextTag2 == "B" : object.conclusion = "B" 199 | object.word == "năm" and object.nextWord1 == "và" and object.nextWord2 == "thạnh" : object.conclusion = "I" 200 | object.prevWord1 == "ngã" : object.conclusion = "B" 201 | object.prevWord1 == "được" : object.conclusion = "B" 202 | object.prevWord2 == "không" and object.prevWord1 == "được" and object.word == "việc" : object.conclusion = "I" 203 | object.prevWord1 == "chiều" and object.word == "dài" : object.conclusion = "B" 204 | object.prevWord1 == "xin" and object.word == "việc" : object.conclusion = "B" 205 | object.word == "biến" and object.nextWord2 == "của" : object.conclusion = "B" 206 | object.prevWord2 == "thứ" : object.conclusion = "B" 207 | object.word == "làm" and object.nextTag1 == "B" and object.nextTag2 == "I" : object.conclusion = "B" 208 | object.prevWord1 == "vụ" and object.word == "trưởng" : object.conclusion = "B" 209 | object.prevWord2 == "" and object.prevWord1 == "ông" and object.word == "tự" : object.conclusion = "B" 210 | object.word == "cổ" and object.nextTag1 == "B" and object.nextTag2 == "B" : object.conclusion = "B" 211 | object.prevWord1 == "đấu" and object.word == "súng" : object.conclusion = "B" 212 | object.prevWord1 == "lại" : object.conclusion = "B" 213 | object.prevWord1 == "ở" : object.conclusion = "B" 214 | object.prevWord1 == "mùi" and object.word == "hôi" : object.conclusion = "B" 215 | object.tag == "B" : object.conclusion = "B" 216 | object.prevWord1 == "người" and object.word == "dân" : object.conclusion = "I" 217 | object.word == "dân" and object.nextWord1 == "tộc" : object.conclusion = "B" 218 | object.prevWord1 == "a" : object.conclusion = "I" 219 | object.prevWord1 == "bùng" and object.word == "phát" : object.conclusion = "I" 220 | object.word == "thể" and object.nextTag1 == "B" : object.conclusion = "I" 221 | object.prevWord1 == "võ" : object.conclusion = "I" 222 | object.prevWord1 == "không" and object.word == "chỉ" : object.conclusion = "I" 223 | object.prevWord1 == "cúm" and object.word == "gia" : object.conclusion = "I" 224 | object.prevWord1 == "lúc" and object.word == "nào" : object.conclusion = "I" 225 | object.prevWord1 == "thế" and object.word == "nào" : object.conclusion = "I" 226 | object.word == "ton" : object.conclusion = "I" 227 | object.word == "văn" and object.nextTag1 == "B" : object.conclusion = "I" 228 | object.prevWord1 == "cuộc" and object.word == "chơi" : object.conclusion = "I" 229 | object.prevWord1 == "hiểm" and object.word == "y" and object.nextWord1 == "tế" : object.conclusion = "I" 230 | object.prevTag1 == "I" and object.word == "ngọt" and object.nextTag1 == "B" : object.conclusion = "I" 231 | object.prevWord1 == "nạo" and object.word == "phá" : object.conclusion = "I" 232 | object.word == "hưng" : object.conclusion = "I" 233 | object.word == "bướu" and object.nextTag1 == "B" : object.conclusion = "I" 
234 | object.prevWord2 == "một" and object.word == "gian" : object.conclusion = "I" 235 | object.prevTag2 == "I" and object.prevTag1 == "B" and object.word == "chánh" : object.conclusion = "I" 236 | object.prevWord1 == "phục" and object.word == "dựng" : object.conclusion = "I" 237 | object.prevWord1 == "bất" : object.conclusion = "I" 238 | object.prevWord1 == "cả" and object.word == "cấm" : object.conclusion = "I" 239 | object.prevWord1 == "tà" : object.conclusion = "I" 240 | object.prevWord1 == "dệt" and object.word == "may" : object.conclusion = "I" 241 | object.prevWord1 == "châu" and object.word == "phi" : object.conclusion = "I" 242 | object.word == "hoá" and object.nextTag1 == "B" : object.conclusion = "I" 243 | object.prevWord1 == "nhà" and object.word == "đất" : object.conclusion = "I" 244 | object.prevWord1 == "nhà" and object.nextWord1 == "loại" : object.conclusion = "I" 245 | object.prevWord1 == "tây" and object.word == "nguyên" : object.conclusion = "I" 246 | object.prevTag1 == "I" and object.word == "trang" and object.nextTag1 == "B" : object.conclusion = "I" 247 | object.prevWord1 == "châu" and object.word == "âu" : object.conclusion = "I" 248 | object.word == "dah" and object.nextWord1 == "wen" : object.conclusion = "I" 249 | object.prevTag1 == "I" and object.word == "định" and object.nextTag1 == "B" : object.conclusion = "I" 250 | object.prevTag1 == "I" and object.word == "ty" : object.conclusion = "I" 251 | object.prevWord2 == "vốn" and object.prevWord1 == "nhà" and object.word == "nước" : object.conclusion = "I" 252 | object.word == "phẩm" and object.nextTag1 == "B" : object.conclusion = "I" 253 | object.prevWord2 == "đông" and object.prevWord1 == "nam" and object.word == "á" : object.conclusion = "I" 254 | object.word == "dũ" and object.nextTag1 == "B" : object.conclusion = "I" 255 | object.prevWord1 == "trưởng" and object.word == "phòng" : object.conclusion = "I" 256 | object.prevWord1 == "bày" and object.word == "bán" : object.conclusion = "I" 257 | object.prevWord1 == "lan" and object.word == "anh" : object.conclusion = "I" 258 | object.prevWord1 == "chợ" and object.word == "rẫy" : object.conclusion = "I" 259 | object.prevWord1 == "cực" and object.word == "kỳ" : object.conclusion = "I" 260 | object.word == "trang" and object.nextTag1 == "" and object.nextTag2 == "" : object.conclusion = "I" 261 | object.prevWord1 == "giá" and object.word == "trị" : object.conclusion = "I" 262 | object.prevWord1 == "mũ" and object.word == "bảo" : object.conclusion = "I" 263 | object.prevWord1 == "cầu" and object.word == "thang" : object.conclusion = "I" 264 | object.prevWord1 == "nhà" and object.word == "đầu" and object.nextWord1 == "tư" : object.conclusion = "I" 265 | object.prevTag2 == "" and object.prevTag1 == "B" and object.word == "huy" : object.conclusion = "I" 266 | object.prevTag1 == "I" and object.word == "ngoạn" : object.conclusion = "I" 267 | object.prevTag1 == "I" and object.word == "an" and object.nextTag1 == "B" : object.conclusion = "I" 268 | object.word == "thuận" and object.nextTag1 == "B" and object.nextTag2 == "B" : object.conclusion = "I" 269 | object.word == "vương" and object.nextTag1 == "B" : object.conclusion = "I" 270 | object.prevWord1 == "tân" and object.word == "nhuận" and object.nextWord1 == "đông" : object.conclusion = "I" 271 | object.prevTag1 == "I" and object.word == "kiến" and object.nextTag1 == "B" : object.conclusion = "I" 272 | object.prevWord2 == "khoản" and object.prevWord1 == "tạm" and object.word == "thu" : object.conclusion = "I" 273 | 
object.word == "quỳnh" : object.conclusion = "I" 274 | object.prevWord1 == "khám" and object.word == "chữa" and object.nextWord1 == "bệnh" : object.conclusion = "I" 275 | object.word == "viên" and object.nextTag1 == "B" and object.nextTag2 == "I" : object.conclusion = "I" 276 | object.prevWord1 == "bộ" and object.word == "ngành" : object.conclusion = "I" 277 | object.prevWord1 == "sông" and object.word == "suối" : object.conclusion = "I" 278 | object.prevWord1 == "bích" : object.conclusion = "I" 279 | object.prevWord1 == "khô" and object.word == "hạn" : object.conclusion = "I" 280 | object.prevWord2 == "có" and object.word == "nghĩa" : object.conclusion = "I" 281 | object.prevWord1 == "lê" and object.word == "thanh" and object.nextWord1 == "hà" : object.conclusion = "I" 282 | object.prevWord1 == "phòng" and object.word == "tránh" : object.conclusion = "I" 283 | object.prevWord1 == "đâm" and object.word == "chém" : object.conclusion = "I" 284 | -------------------------------------------------------------------------------- /train/Utility/Config.py: -------------------------------------------------------------------------------- 1 | #Change the value of NUMBER_OF_PROCESSES to obtain faster tagging process! 2 | NUMBER_OF_PROCESSES = 2 3 | 4 | THRESHOLD = (3, 2) -------------------------------------------------------------------------------- /train/Utility/Config.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/d3ba299b69f12b75f5b78d97d17043dff8eb52f3/train/Utility/Config.pyc -------------------------------------------------------------------------------- /train/Utility/Eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | os.chdir("../") 6 | sys.setrecursionlimit(100000) 7 | sys.path.append(os.path.abspath("")) 8 | os.chdir("./Utility") 9 | 10 | from Utility.Utils import getWordTag, readDictionary 11 | 12 | def computeAccuracy(goldStandardCorpus, taggedCorpus): 13 | tagged = open(taggedCorpus, "r").read().split() 14 | goldStandard = open(goldStandardCorpus, "r").read().split() 15 | if len(tagged) != len(goldStandard): 16 | print "The numbers of word tokens in %s and %s are not equal!" % (goldStandardCorpus, taggedCorpus) 17 | return 0 18 | numwords = 0 19 | count = 0 20 | for i in xrange(len(tagged)): 21 | numwords += 1 22 | word1, tag1 = getWordTag(tagged[i]) 23 | word2, tag2 = getWordTag(goldStandard[i]) 24 | if word1 != word2 and word1 != "''" and word2 != "''": 25 | print "Words are not the same in gold standard and tagged corpora, at the index", i 26 | return 0 27 | 28 | if tag1.lower() == tag2.lower(): 29 | count += 1 30 | #else: 31 | # print i, word1, tag1, tag2 32 | 33 | return count * 100.0 / numwords 34 | 35 | def computeAccuracies(fullDictFile, goldStandardCorpus, taggedCorpus): 36 | """ 37 | Return known-word accuracy, unknown-word accuracy and the overall accuracy 38 | """ 39 | tagged = open(taggedCorpus, "r").read().split() 40 | goldStandard = open(goldStandardCorpus, "r").read().split() 41 | if len(tagged) != len(goldStandard): 42 | print "The numbers of word tokens in %s and %s are not equal!" 
43 | return 0 44 | 45 | fullDICT = readDictionary(fullDictFile) 46 | 47 | numwords = count = 0 48 | countKN = countUNKN = 0 49 | countCorrectKN = countCorrectUNKN = 0 50 | 51 | for i in range(len(tagged)): 52 | numwords += 1 53 | word1, tag1 = getWordTag(tagged[i]) 54 | word2, tag2 = getWordTag(goldStandard[i]) 55 | if word1 != word2 and word1 != "''" and word2 != "''": 56 | print("Words are not the same in the gold standard and tagged corpora, at index", i) 57 | return 0 58 | 59 | if tag1.lower() == tag2.lower(): 60 | count += 1 61 | 62 | if word1 in fullDICT: 63 | countKN += 1 64 | if tag1.lower() == tag2.lower(): 65 | countCorrectKN += 1 66 | else: 67 | countUNKN += 1 68 | if tag1.lower() == tag2.lower(): 69 | countCorrectUNKN += 1 70 | 71 | if countUNKN == 0: 72 | return countCorrectKN * 100.0 / countKN, 0.0, count * 100.0 / numwords 73 | else: 74 | return countCorrectKN * 100.0 / countKN, countCorrectUNKN * 100.0 / countUNKN, count * 100.0 / numwords 75 | 76 | if __name__ == "__main__": 77 | print(computeAccuracy(sys.argv[1], sys.argv[2]), "%") 78 | pass 79 | 80 | -------------------------------------------------------------------------------- /train/Utility/LexiconCreator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | os.chdir("../") 6 | sys.setrecursionlimit(100000) 7 | sys.path.append(os.path.abspath("")) 8 | os.chdir("./Utility") 9 | 10 | import re 11 | from Utility.Utils import getWordTag 12 | 13 | def add2WordTagFreqDict(word, tag, inDict): 14 | if word not in inDict: 15 | inDict[word] = {} 16 | inDict[word][tag] = 1 17 | else: 18 | if tag not in inDict[word]: 19 | inDict[word][tag] = 1 20 | else: 21 | inDict[word][tag] += 1 22 | 23 | def createLexicon(corpusFilePath, fullLexicon): 24 | if fullLexicon not in ['full', 'short']: 25 | print("The second parameter must be the string 'full' or 'short'!") 26 | print("No lexicon is generated!!!")
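# Editor's note: an illustrative invocation, assuming a gold-standard corpus
# of word/tag tokens such as train/Train_gold.txt (relative path hypothetical,
# since this module chdirs into ./Utility on import):
#
#   createLexicon("../Train_gold.txt", "full")   # writes ../Train_gold.txt.DICT
#   createLexicon("../Train_gold.txt", "short")  # writes ../Train_gold.txt.sDict
#
# The 'short' variant excludes word types that occur only once in the corpus.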
27 | return 28 | #elif fullLexicon == 'full': 29 | # print "Creating a full .DICT lexicon from the gold standard training corpus", corpusFilePath 30 | #else: 31 | # print "Creating a short .sDict lexicon which excludes word types appearing 1 time in the gold standard training corpus" 32 | 33 | lines = open(corpusFilePath, "r").readlines() 34 | wordTagCounter = {} 35 | for i in range(len(lines)): 36 | # print i 37 | pairs = lines[i].strip().replace("“", "''").replace("”", "''").replace("\"", "''").split() 38 | for pair in pairs: 39 | word, tag = getWordTag(pair) 40 | if (len(word) >= (len(pair) - 1)) or (len(tag) >= (len(pair) - 1)): 41 | print("Incorrectly formatted sentence " + str(i+1) + " at:", pair) 42 | else: 43 | add2WordTagFreqDict(word, tag, wordTagCounter) 44 | 45 | from operator import itemgetter 46 | dictionary = {} 47 | suffixDictCounter = {} 48 | 49 | tagCounter_Alphabet = {} 50 | tagCounter_CapitalizedWord = {} 51 | tagCounter_Numeric = {} 52 | 53 | for word in wordTagCounter: 54 | tagFreq4Word = wordTagCounter[word] 55 | pairs = list(tagFreq4Word.items()) 56 | pairs.sort(key = itemgetter(1), reverse = True) 57 | tag = pairs[0][0] 58 | 59 | decodedWord = word # str is already Unicode in Python 3 60 | isCapital = decodedWord[0].isupper() 61 | 62 | if fullLexicon == 'full': 63 | dictionary[word] = tag 64 | else: # Get the lexicon without 1-time-occurrence word types 65 | if (len(pairs) == 1 and pairs[0][1] > 1) or len(pairs) > 1: 66 | dictionary[word] = tag 67 | 68 | if re.search(r"[0-9]+", word) is not None: 69 | if tag not in tagCounter_Numeric: 70 | tagCounter_Numeric[tag] = 1 71 | else: 72 | tagCounter_Numeric[tag] += 1 73 | else: 74 | if isCapital: 75 | if tag not in tagCounter_CapitalizedWord: 76 | tagCounter_CapitalizedWord[tag] = 1 77 | else: 78 | tagCounter_CapitalizedWord[tag] += 1 79 | else: 80 | if tag not in tagCounter_Alphabet: 81 | tagCounter_Alphabet[tag] = 1 82 | else: 83 | tagCounter_Alphabet[tag] += 1 84 | 85 | if len(decodedWord) >= 4: 86 | suffix = ".*" + decodedWord[-3:] 87 | add2WordTagFreqDict(suffix, tag, suffixDictCounter) 88 | suffix = ".*" + decodedWord[-2:] 89 | add2WordTagFreqDict(suffix, tag, suffixDictCounter) 90 | if len(decodedWord) >= 5: 91 | suffix = ".*" + decodedWord[-4:] 92 | add2WordTagFreqDict(suffix, tag, suffixDictCounter) 93 | if len(decodedWord) >= 6: 94 | suffix = ".*" + decodedWord[-5:] 95 | add2WordTagFreqDict(suffix, tag, suffixDictCounter) 96 | 97 | from collections import OrderedDict 98 | dictionary = OrderedDict(sorted(dictionary.items(), key = itemgetter(0))) 99 | 100 | # Get the most frequent tag in the lexicon to label unknown words and numbers 101 | tagCounter_Alphabet = OrderedDict(sorted(tagCounter_Alphabet.items(), key = itemgetter(1), reverse = True)) 102 | tagCounter_CapitalizedWord = OrderedDict(sorted(tagCounter_CapitalizedWord.items(), key = itemgetter(1), reverse = True)) 103 | tagCounter_Numeric = OrderedDict(sorted(tagCounter_Numeric.items(), key = itemgetter(1), reverse = True)) 104 | tag4UnknWord = list(tagCounter_Alphabet.keys())[0] 105 | tag4UnknCapitalizedWord = tag4UnknWord 106 | tag4UnknNum = tag4UnknWord 107 | if len(tagCounter_CapitalizedWord) > 0: 108 | tag4UnknCapitalizedWord = list(tagCounter_CapitalizedWord.keys())[0] 109 | if len(tagCounter_Numeric) > 0: 110 | tag4UnknNum = list(tagCounter_Numeric.keys())[0] 111 | 112 | # Write to file 113 | fileSuffix = ".sDict" 114 | if fullLexicon == 'full': 115 | fileSuffix = ".DICT" 116 | fileOut = open(corpusFilePath + fileSuffix, "w")
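# Editor's note: a sketch of the lexicon layout produced below, with invented
# entries. Three TAG4UNKN-* fallback lines come first, then one "word tag"
# line per lexicon entry, then ".*suffix tag" lines whose required frequency
# rises as the suffix gets shorter:
#
#   TAG4UNKN-WORD B
#   TAG4UNKN-CAPITAL B
#   TAG4UNKN-NUM B
#   của B
#   .*ng B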
fileSuffix, "w") 117 | 118 | 119 | fileOut.write("TAG4UNKN-WORD " + tag4UnknWord + "\n") 120 | fileOut.write("TAG4UNKN-CAPITAL " + tag4UnknCapitalizedWord + "\n") 121 | fileOut.write("TAG4UNKN-NUM " + tag4UnknNum + "\n") 122 | for key in dictionary: 123 | fileOut.write(key + " " + dictionary[key] + "\n") 124 | 125 | for suffix in suffixDictCounter: 126 | tagFreq4Suffix = suffixDictCounter[suffix] 127 | pairs = tagFreq4Suffix.items() 128 | pairs.sort(key = itemgetter(1), reverse = True) 129 | tag = pairs[0][0] 130 | freq = pairs[0][1] 131 | if len(suffix) == 7 and freq >= 2: 132 | fileOut.write(suffix + " " + tag + "\n") 133 | if len(suffix) == 6 and freq >= 3: 134 | fileOut.write(suffix + " " + tag + "\n") 135 | if len(suffix) == 5 and freq >= 4: 136 | fileOut.write(suffix + " " + tag + "\n") 137 | if len(suffix) == 4 and freq >= 5: 138 | fileOut.write(suffix + " " + tag + "\n") 139 | 140 | fileOut.close() 141 | -------------------------------------------------------------------------------- /train/Utility/Utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | def getWordTag(wordTag): 4 | if wordTag == "///": 5 | return "/", "/" 6 | index = wordTag.rfind("/") 7 | word = wordTag[:index].strip() 8 | tag = wordTag[index + 1:].strip() 9 | return word, tag 10 | 11 | def getRawText(inputFile, outFile): 12 | out = open(outFile, "w") 13 | sents = open(inputFile, "r").readlines() 14 | for sent in sents: 15 | wordTags = sent.strip().split() 16 | for wordTag in wordTags: 17 | word, tag = getWordTag(wordTag) 18 | out.write(word + " ") 19 | out.write("\n") 20 | out.close() 21 | 22 | def readDictionary(inputFile): 23 | dictionary = {} 24 | lines = open(inputFile, "r").readlines() 25 | for line in lines: 26 | wordtag = line.strip().split() 27 | dictionary[wordtag[0]] = wordtag[1] 28 | return dictionary 29 | 30 | -------------------------------------------------------------------------------- /train/Utility/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/d3ba299b69f12b75f5b78d97d17043dff8eb52f3/train/Utility/__init__.py -------------------------------------------------------------------------------- /train/Utility/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/d3ba299b69f12b75f5b78d97d17043dff8eb52f3/train/Utility/__init__.pyc -------------------------------------------------------------------------------- /vws.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: vws 3 | Version: 0.0.1 4 | Summary: A small example package 5 | Home-page: https://github.com/Sudo-VP/Vietnamese-Word-Segmentation-Python 6 | Author: vinhpx 7 | Author-email: phamxuanvinh023@gmail.com 8 | License: UNKNOWN 9 | Project-URL: Bug Tracker, https://github.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/issues 10 | Platform: UNKNOWN 11 | Classifier: Programming Language :: Python :: 3 12 | Classifier: License :: OSI Approved :: MIT License 13 | Classifier: Operating System :: OS Independent 14 | Requires-Python: >=3.6 15 | Description-Content-Type: text/markdown 16 | 17 | # word_segmenter 18 | ## Chú ý: 19 | ### Bộ mã này được viết lại từ bộ RDRSegmenter: https://github.com/datquocnguyen/RDRsegmenter bằng Python với mục đích thuận tiện hơn 
20 | The implementation of RDRsegmenter, as described in [our paper](http://www.lrec-conf.org/proceedings/lrec2018/summaries/55.html): 21 | 22 | @InProceedings{NguyenNVDJ2018, 23 | author={Dat Quoc Nguyen and Dai Quoc Nguyen and Thanh Vu and Mark Dras and Mark Johnson}, 24 | title={{A Fast and Accurate Vietnamese Word Segmenter}}, 25 | booktitle={Proceedings of the 11th International Conference on Language Resources and Evaluation (LREC 2018)}, 26 | pages={2582--2587}, 27 | year={2018} 28 | } 29 | 30 | **Please CITE** our paper whenever RDRsegmenter is used to produce published results or incorporated into other software. 31 | 32 | Translator: Vinh Pham 33 | 34 | ## Usage 35 | **Requires Python 3** 36 | - python setup.py install 37 | - python -m pip install . 38 | 39 | ## Example 40 | ``` 41 | >>> from vws import RDRSegmenter, Tokenizer 42 | >>> rdrsegment = RDRSegmenter.RDRSegmenter() 43 | >>> tokenizer = Tokenizer.Tokenizer() 44 | >>> output = rdrsegment.segmentRawSentences(tokenizer,"Lượng khách Thái bắt đầu gia tăng từ đầu năm 2005. Bên cạnh đó, kể từ tháng 10-2005 đến nay, từ khi được phép của VN, các đoàn caravan của Thái Lan cũng đã ồ ạt đổ vào VN.") 45 | >>> print(output) 46 | ``` 47 | Output: 48 | ``` 49 | >>> Lượng khách Thái bắt_đầu gia_tăng từ đầu năm 2005. Bên cạnh đó, kể từ tháng 10-2005 đến nay, từ khi được phép của VN, các đoàn caravan của Thái_Lan cũng đã ồ_ạt đổ vào VN. 50 | ``` 51 | 52 | -------------------------------------------------------------------------------- /vws.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | MANIFEST.in 2 | README.md 3 | pyproject.toml 4 | setup.py 5 | ./vws/DataPreprocessor.py 6 | ./vws/FWObject.py 7 | ./vws/Model.RDR 8 | ./vws/Node.py 9 | ./vws/RDRSegmenter.py 10 | ./vws/Tokenizer.py 11 | ./vws/Utils.py 12 | ./vws/VnVocab.txt 13 | ./vws/Vocabulary.py 14 | ./vws/WordTag.py 15 | ./vws/__init__.py 16 | vws.egg-info/PKG-INFO 17 | vws.egg-info/SOURCES.txt 18 | vws.egg-info/dependency_links.txt 19 | vws.egg-info/top_level.txt -------------------------------------------------------------------------------- /vws.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /vws.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | vws 2 | -------------------------------------------------------------------------------- /vws/DataPreprocessor.py: -------------------------------------------------------------------------------- 1 | from vws.RDRSegmenter import RDRSegmenter 2 | from vws.Utils import Utils 3 | import sys 4 | import os 5 | 6 | class DataPreprocessor: 7 | def __init__(self): 8 | self.initialSegmenter = RDRSegmenter(vocab_path=os.path.join(os.path.dirname(__file__), 'VnVocab.txt')) # hypothetical fix: locate the bundled VnVocab.txt instead of a hardcoded absolute path 9 | self.utils = Utils() 10 | 11 | def getStringInitialSegmentation(self, strs: str): 12 | sb = [] 13 | line = strs.strip() 14 | if len(line) == 0: 15 | return "\n" 16 | 17 | wordtags = self.initialSegmenter.getInitialSegmentation(line) 18 | size = len(wordtags) 19 | for i in range(0, size): 20 | if wordtags[i].tag == "B": 21 | sb.append(wordtags[i].form + "/B ") 22 | else: 23 | sb.append(wordtags[i].form + "/I ") 24 | return ''.join(sb).strip() 25 | 26 | def getCorpusInitialSegmentation(self, inFilePath: str): 27 | with open(inFilePath, 'r', encoding="utf8") 
as buffer: 28 | with open(inFilePath + ".RAW.Init", 'a', encoding='utf8') as bwInit: 29 | with open(inFilePath + ".BI", 'a', encoding='utf8') as bw: 30 | for line in buffer: 31 | if line != "" and line != None and line != '\n': 32 | lineStr = line 33 | for regex in self.utils.NORMALIZER_KEYS: 34 | if regex in lineStr: 35 | lineStr = lineStr.replace(regex, self.utils.NORMALIZER[regex]) 36 | 37 | sb = [] 38 | 39 | words = lineStr.split() 40 | for word in words: 41 | syllabels = word.split("_") 42 | bw.write(syllabels[0] + "/B ") 43 | sb.append(syllabels[0] + " ") 44 | for i in range(1, len(syllabels)): 45 | bw.write(syllabels[i] + "/I ") 46 | sb.append(syllabels[i] + " ") 47 | bw.write("\n") 48 | 49 | bwInit.write(self.getStringInitialSegmentation(''.join(sb)) + "\n") 50 | 51 | 52 | if __name__ == '__main__': 53 | segmenter = DataPreprocessor() 54 | segmenter.getCorpusInitialSegmentation(sys.argv[1]) 55 | -------------------------------------------------------------------------------- /vws/FWObject.py: -------------------------------------------------------------------------------- 1 | class FWObject: 2 | def __init__(self,check:bool): 3 | self._context = [None]*10 4 | self.check = check 5 | if self.check==True: 6 | for i in range(0,10,2): 7 | self.context.append("") 8 | self.context.append("") 9 | @property 10 | def context(self): 11 | return self._context 12 | # setting the values 13 | @context.setter 14 | def context(self, value): 15 | self._context = value 16 | -------------------------------------------------------------------------------- /vws/Model.RDR: -------------------------------------------------------------------------------- 1 | True : object.conclusion = "NN" 2 | object.tag == "I" : object.conclusion = "I" 3 | object.prevWord1 == "cho" : object.conclusion = "B" 4 | object.word == "phép" : object.conclusion = "I" 5 | object.word == "ra" : object.conclusion = "B" 6 | object.prevWord1 == "ngoài" and object.word == "ra" : object.conclusion = "I" 7 | object.prevWord1 == "thật" : object.conclusion = "I" 8 | object.prevWord1 == "xem" : object.conclusion = "I" 9 | object.prevWord1 == "lẽ" and object.word == "ra" : object.conclusion = "I" 10 | object.prevWord1 == "đến" : object.conclusion = "B" 11 | object.prevTag1 == "B" and object.word == "nỗi" and object.nextTag1 == "B" : object.conclusion = "I" 12 | object.word == "khi" and object.nextTag1 == "B" : object.conclusion = "B" 13 | object.prevWord1 == "có" and object.word == "khi" : object.conclusion = "I" 14 | object.prevTag1 == "B" and object.word == "nhà" : object.conclusion = "B" 15 | object.prevWord2 == "," and object.prevWord1 == "người" and object.word == "nhà" : object.conclusion = "I" 16 | object.prevWord1 == "con" : object.conclusion = "B" 17 | object.word == "tin" : object.conclusion = "I" 18 | object.word == "số" : object.conclusion = "I" 19 | object.prevTag1 == "B" and object.word == "người" : object.conclusion = "I" 20 | object.word == "cái" and object.nextTag1 == "B" : object.conclusion = "I" 21 | object.prevTag1 == "B" and object.word == "vào" and object.nextTag1 == "B" : object.conclusion = "B" 22 | object.prevTag1 == "B" and object.word == "có" and object.nextTag1 == "B" : object.conclusion = "B" 23 | object.prevTag1 == "B" and object.word == "đó" : object.conclusion = "B" 24 | object.prevWord2 == "" and object.prevWord1 == "do" and object.word == "đó" : object.conclusion = "I" 25 | object.prevWord1 == "đâu" and object.word == "đó" : object.conclusion = "I" 26 | object.prevWord1 == "quận" : object.conclusion = "B" 27 
| object.prevTag1 == "B" and object.word == "huyện" and object.nextTag1 == "B" : object.conclusion = "I" 28 | object.prevWord1 == "năm" : object.conclusion = "B" 29 | object.prevTag1 == "B" and object.word == "ngoái" : object.conclusion = "I" 30 | object.word == "tháng" and object.nextTag1 == "B" : object.conclusion = "I" 31 | object.prevTag1 == "B" and object.word == "như" : object.conclusion = "B" 32 | object.prevWord1 == "hình" : object.conclusion = "I" 33 | object.word == "cao" : object.conclusion = "B" 34 | object.prevWord1 == "văn" : object.conclusion = "I" 35 | object.prevWord1 == "vùng" and object.word == "cao" : object.conclusion = "I" 36 | object.prevWord1 == "bên" : object.conclusion = "B" 37 | object.word == "người" : object.conclusion = "B" 38 | object.prevTag1 == "B" and object.word == "không" and object.nextTag1 == "B" : object.conclusion = "B" 39 | object.word == "không" and object.nextWord1 == "ai" : object.conclusion = "I" 40 | object.prevWord2 == "với" and object.prevWord1 == "khoảng" and object.word == "không" : object.conclusion = "I" 41 | object.prevTag1 == "B" and object.word == "nhau" and object.nextTag1 == "B" : object.conclusion = "B" 42 | object.prevTag1 == "B" and object.word == "phải" : object.conclusion = "B" 43 | object.word == "từ" and object.nextTag1 == "B" and object.nextTag2 == "B" : object.conclusion = "B" 44 | object.prevTag2 == "B" and object.prevTag1 == "B" and object.word == "lại" : object.conclusion = "B" 45 | object.prevWord1 == "trở" : object.conclusion = "I" 46 | object.prevWord1 == "đi" : object.conclusion = "I" 47 | object.prevWord1 == "đi" : object.conclusion = "B" 48 | object.prevWord2 == "''" and object.word == "đêm" : object.conclusion = "I" 49 | object.word == "khách" and object.nextTag1 == "B" and object.nextTag2 == "I" : object.conclusion = "I" 50 | object.prevWord1 == "phó" : object.conclusion = "B" 51 | object.prevTag1 == "B" and object.word == "ở" and object.nextTag1 == "B" : object.conclusion = "B" 52 | object.prevTag1 == "B" and object.word == "hề" and object.nextTag1 == "B" : object.conclusion = "B" 53 | object.prevWord1 == "lúc" : object.conclusion = "B" 54 | object.word == "sau" : object.conclusion = "B" 55 | object.prevTag1 == "B" and object.word == "trên" : object.conclusion = "B" 56 | object.prevWord1 == "so" and object.word == "với" : object.conclusion = "B" 57 | object.prevWord1 == "các" : object.conclusion = "B" 58 | object.prevWord1 == "tháng" : object.conclusion = "B" 59 | object.prevWord1 == "có" and object.word == "một" : object.conclusion = "B" 60 | object.prevWord1 == "có" and object.word == "một" and object.nextWord1 == "không" : object.conclusion = "I" 61 | object.prevTag1 == "B" and object.word == "về" : object.conclusion = "B" 62 | object.prevTag1 == "B" and object.word == "gì" and object.nextTag1 == "B" : object.conclusion = "B" 63 | object.word == "ngày" : object.conclusion = "B" 64 | object.prevTag1 == "B" and object.word == "đi" and object.nextTag1 == "B" : object.conclusion = "B" 65 | object.prevWord1 == "bọn" : object.conclusion = "B" 66 | object.word == "khơi" : object.conclusion = "B" 67 | object.prevTag2 == "I" and object.prevTag1 == "B" and object.word == "là" : object.conclusion = "B" 68 | object.prevTag1 == "B" and object.word == "lúc" and object.nextTag1 == "B" : object.conclusion = "B" 69 | object.prevTag2 == "B" and object.prevTag1 == "B" and object.word == "lên" : object.conclusion = "B" 70 | object.prevTag1 == "B" and object.word == "cần" and object.nextTag1 == "B" : object.conclusion = "B" 71 | 
object.prevWord1 == "lấy" : object.conclusion = "B" 72 | object.prevWord1 == "cuộc" and object.word == "họp" : object.conclusion = "B" 73 | object.word == "nhưng" and object.nextTag1 == "B" : object.conclusion = "B" 74 | object.word == "cho" and object.nextTag1 == "B" : object.conclusion = "B" 75 | object.prevTag1 == "B" and object.word == "điều" : object.conclusion = "B" 76 | object.prevWord1 == "người" and object.word == "ta" : object.conclusion = "B" 77 | object.prevWord1 == "của" : object.conclusion = "B" 78 | object.prevWord1 == "cô" and object.word == "gái" : object.conclusion = "B" 79 | object.prevTag1 == "B" and object.word == "hai" and object.nextTag1 == "B" : object.conclusion = "B" 80 | object.prevTag1 == "B" and object.word == "lần" and object.nextTag1 == "B" : object.conclusion = "B" 81 | object.prevWord1 == "câu" and object.word == "hỏi" : object.conclusion = "B" 82 | object.word == "đường" and object.nextTag1 == "B" and object.nextTag2 == "I" : object.conclusion = "B" 83 | object.word == "ai" : object.conclusion = "B" 84 | object.prevWord1 == "đứng" : object.conclusion = "B" 85 | object.prevWord1 == "đứng" and object.word == "tên" and object.nextWord1 == "đăng" : object.conclusion = "I" 86 | object.prevWord1 == "chữa" and object.word == "bệnh" : object.conclusion = "B" 87 | object.prevTag2 == "B" and object.prevTag1 == "B" and object.word == "theo" : object.conclusion = "B" 88 | object.prevWord1 == "có" and object.word == "ý" : object.conclusion = "B" 89 | object.prevWord1 == "mùa" : object.conclusion = "B" 90 | object.word == "qua" and object.nextTag1 == "B" and object.nextTag2 == "I" : object.conclusion = "B" 91 | object.prevWord1 == "có" and object.word == "nghĩa" : object.conclusion = "B" 92 | object.prevWord1 == "về" : object.conclusion = "B" 93 | object.word == "thấy" and object.nextTag1 == "B" : object.conclusion = "B" 94 | object.prevWord1 == "cảm" : object.conclusion = "I" 95 | object.word == "cơm" : object.conclusion = "B" 96 | object.prevWord1 == "vì" and object.word == "sao" : object.conclusion = "B" 97 | object.prevWord1 == "một" and object.word == "thời" : object.conclusion = "B" 98 | object.prevWord1 == "chợ" and object.word == "mới" : object.conclusion = "B" 99 | object.prevWord1 == "có" and object.word == "công" : object.conclusion = "B" 100 | object.prevWord1 == "còn" and object.word == "lại" : object.conclusion = "B" 101 | object.prevWord1 == "buổi" : object.conclusion = "B" 102 | object.prevWord1 == "cá" and object.nextWord1 == "ngọt" : object.conclusion = "B" 103 | object.prevWord2 == "nạo" and object.word == "thai" : object.conclusion = "B" 104 | object.prevWord1 == "cùng" and object.word == "với" : object.conclusion = "B" 105 | object.prevWord1 == "gần" and object.word == "đây" : object.conclusion = "B" 106 | object.word == "lễ" and object.nextTag1 == "B" and object.nextTag2 == "B" : object.conclusion = "B" 107 | object.prevTag1 == "B" and object.word == "dậy" and object.nextTag1 == "B" : object.conclusion = "B" 108 | object.prevWord1 == "cánh" : object.conclusion = "B" 109 | object.prevTag1 == "B" and object.word == "nêu" and object.nextTag1 == "B" : object.conclusion = "B" 110 | object.prevTag1 == "B" and object.word == "cái" : object.conclusion = "B" 111 | object.prevTag1 == "B" and object.word == "thép" : object.conclusion = "B" 112 | object.prevWord1 == "trong" and object.word == "vòng" : object.conclusion = "B" 113 | object.prevTag1 == "B" and object.word == "nợ" : object.conclusion = "B" 114 | object.prevWord1 == "chặn" and object.word == 
"đường" : object.conclusion = "B" 115 | object.word == "vợ" : object.conclusion = "B" 116 | object.prevWord1 == "nhà" and object.word == "chồng" : object.conclusion = "B" 117 | object.prevWord1 == "máy" and object.word == "điện" : object.conclusion = "B" 118 | object.prevWord1 == "tuyến" and object.word == "đường" : object.conclusion = "B" 119 | object.prevWord1 == "vụ" and object.word == "án" : object.conclusion = "B" 120 | object.prevWord1 == "từ" and object.word == "đầu" : object.conclusion = "B" 121 | object.prevWord1 == "bóp" : object.conclusion = "B" 122 | object.prevWord1 == "ngay" and object.word == "cả" : object.conclusion = "B" 123 | object.prevWord1 == "chụp" : object.conclusion = "B" 124 | object.prevWord1 == "niềm" : object.conclusion = "B" 125 | object.prevWord1 == "lứa" : object.conclusion = "B" 126 | object.prevWord1 == "nổ" and object.word == "máy" : object.conclusion = "B" 127 | object.prevWord1 == "chiều" and object.word == "nay" : object.conclusion = "B" 128 | object.prevTag1 == "B" and object.word == "vn" and object.nextTag1 == "B" : object.conclusion = "B" 129 | object.prevTag1 == "B" and object.word == "lão" : object.conclusion = "B" 130 | object.word == "thuốc" and object.nextTag1 == "B" : object.conclusion = "B" 131 | object.prevTag1 == "B" and object.word == "ba" and object.nextTag1 == "B" : object.conclusion = "B" 132 | object.prevWord1 == "vài" and object.word == "ba" : object.conclusion = "I" 133 | object.prevWord1 == "mái" : object.conclusion = "B" 134 | object.prevWord1 == "đưa" : object.conclusion = "B" 135 | object.prevTag2 == "B" and object.prevTag1 == "B" and object.word == "tháng" : object.conclusion = "B" 136 | object.prevWord1 == "trứng" and object.word == "gà" : object.conclusion = "B" 137 | object.word == "bé" and object.nextTag1 == "B" : object.conclusion = "B" 138 | object.prevTag2 == "I" : object.conclusion = "I" 139 | object.word == "đến" : object.conclusion = "B" 140 | object.prevWord1 == "cục" and object.word == "trưởng" : object.conclusion = "B" 141 | object.prevWord1 == "đẫm" and object.word == "máu" : object.conclusion = "B" 142 | object.prevWord1 == "hồi" and object.word == "trước" : object.conclusion = "B" 143 | object.prevWord1 == "băng" and object.word == "cướp" : object.conclusion = "B" 144 | object.prevWord1 == "trưởng" and object.word == "phòng" : object.conclusion = "B" 145 | object.prevWord1 == "không" and object.word == "phép" : object.conclusion = "B" 146 | object.prevWord1 == "quán" and object.word == "nước" : object.conclusion = "B" 147 | object.prevWord1 == "dập" : object.conclusion = "B" 148 | object.prevWord1 == "bỏ" and object.word == "trốn" : object.conclusion = "B" 149 | object.prevWord1 == "có" and object.word == "con" : object.conclusion = "B" 150 | object.prevWord1 == "vượt" and object.word == "qua" : object.conclusion = "B" 151 | object.prevWord1 == "gầm" : object.conclusion = "B" 152 | object.prevWord1 == "làm" and object.word == "chủ" : object.conclusion = "B" 153 | object.prevWord1 == "tờ" and object.word == "báo" : object.conclusion = "B" 154 | object.word == "đồng" and object.nextWord1 == "hồ" : object.conclusion = "B" 155 | object.prevWord1 == "mang" : object.conclusion = "B" 156 | object.prevWord1 == "nghe" and object.word == "tiếng" : object.conclusion = "B" 157 | object.prevWord1 == "ra" and object.word == "sao" : object.conclusion = "B" 158 | object.prevWord1 == "giày" and object.word == "an" and object.nextWord1 == "giang" : object.conclusion = "B" 159 | object.word == "lãi" and object.nextTag1 == "B" : 
object.conclusion = "B" 160 | object.prevWord1 == "như" and object.word == "thế" and object.nextWord1 == "này" : object.conclusion = "B" 161 | object.word == "ruột" and object.nextTag1 == "B" : object.conclusion = "B" 162 | object.prevWord1 == "ngọn" and object.word == "lửa" : object.conclusion = "B" 163 | object.prevWord1 == "có" and object.word == "lợi" : object.conclusion = "B" 164 | object.prevWord1 == "giấy" and object.word == "chứng" : object.conclusion = "B" 165 | object.prevWord2 == "toà" and object.word == "nhân" : object.conclusion = "B" 166 | object.word == "hộ" and object.nextTag1 == "I" : object.conclusion = "B" 167 | object.word == "gia" and object.nextTag1 == "I" : object.conclusion = "B" 168 | object.nextWord1 == "khát" : object.conclusion = "B" 169 | object.prevWord1 == "mức" and object.word == "lương" : object.conclusion = "B" 170 | object.prevWord1 == "sẽ" : object.conclusion = "B" 171 | object.prevWord1 == "bắt" and object.word == "sống" : object.conclusion = "B" 172 | object.word == "xong" and object.nextTag1 == "B" : object.conclusion = "B" 173 | object.prevWord1 == "khi" and object.word == "nào" : object.conclusion = "B" 174 | object.word == "hôm" : object.conclusion = "B" 175 | object.prevWord1 == "tiêu" and object.word == "độc" : object.conclusion = "B" 176 | object.word == "rõ" : object.conclusion = "B" 177 | object.prevTag1 == "B" and object.word == "sớm" : object.conclusion = "B" 178 | object.prevWord1 == "tiến" and object.word == "tới" : object.conclusion = "B" 179 | object.prevWord1 == "viện" and object.nextWord1 == "viện" : object.conclusion = "B" 180 | object.prevTag1 == "B" and object.word == "nhìn" : object.conclusion = "B" 181 | object.word == "lại" and object.nextWord2 == "''" : object.conclusion = "B" 182 | object.word == "hàng" and object.nextWord1 == "hoá" : object.conclusion = "B" 183 | object.word == "công" and object.nextWord1 == "khai" : object.conclusion = "B" 184 | object.word == "giá" and object.nextWord1 == "trị" : object.conclusion = "B" 185 | object.prevTag1 == "B" and object.word == "khỏi" and object.nextTag1 == "B" : object.conclusion = "B" 186 | object.prevWord1 == "cơn" : object.conclusion = "B" 187 | object.prevWord1 == "áp" and object.word == "sát" : object.conclusion = "B" 188 | object.prevWord1 == "mảnh" and object.word == "đất" : object.conclusion = "B" 189 | object.prevWord1 == "gật" and object.word == "đầu" : object.conclusion = "B" 190 | object.prevWord1 == "trước" and object.word == "mặt" : object.conclusion = "B" 191 | object.prevWord1 == "đau" and object.word == "bụng" : object.conclusion = "B" 192 | object.prevTag2 == "B" and object.prevTag1 == "B" and object.word == "vai" : object.conclusion = "B" 193 | object.prevWord1 == "có" and object.word == "số" : object.conclusion = "B" 194 | object.prevWord1 == "có" and object.word == "cơ" : object.conclusion = "B" 195 | object.prevWord1 == "vẻ" : object.conclusion = "B" 196 | object.prevWord1 == "mực" and object.word == "nước" : object.conclusion = "B" 197 | object.prevWord1 == "gây" and object.word == "bệnh" : object.conclusion = "B" 198 | object.word == "năm" and object.nextTag1 == "B" and object.nextTag2 == "B" : object.conclusion = "B" 199 | object.word == "năm" and object.nextWord1 == "và" and object.nextWord2 == "thạnh" : object.conclusion = "I" 200 | object.prevWord1 == "ngã" : object.conclusion = "B" 201 | object.prevWord1 == "được" : object.conclusion = "B" 202 | object.prevWord2 == "không" and object.prevWord1 == "được" and object.word == "việc" : object.conclusion = "I" 
203 | object.prevWord1 == "chiều" and object.word == "dài" : object.conclusion = "B" 204 | object.prevWord1 == "xin" and object.word == "việc" : object.conclusion = "B" 205 | object.word == "biến" and object.nextWord2 == "của" : object.conclusion = "B" 206 | object.prevWord2 == "thứ" : object.conclusion = "B" 207 | object.word == "làm" and object.nextTag1 == "B" and object.nextTag2 == "I" : object.conclusion = "B" 208 | object.prevWord1 == "vụ" and object.word == "trưởng" : object.conclusion = "B" 209 | object.prevWord2 == "" and object.prevWord1 == "ông" and object.word == "tự" : object.conclusion = "B" 210 | object.word == "cổ" and object.nextTag1 == "B" and object.nextTag2 == "B" : object.conclusion = "B" 211 | object.prevWord1 == "đấu" and object.word == "súng" : object.conclusion = "B" 212 | object.prevWord1 == "lại" : object.conclusion = "B" 213 | object.prevWord1 == "ở" : object.conclusion = "B" 214 | object.prevWord1 == "mùi" and object.word == "hôi" : object.conclusion = "B" 215 | object.tag == "B" : object.conclusion = "B" 216 | object.prevWord1 == "người" and object.word == "dân" : object.conclusion = "I" 217 | object.word == "dân" and object.nextWord1 == "tộc" : object.conclusion = "B" 218 | object.prevWord1 == "a" : object.conclusion = "I" 219 | object.prevWord1 == "bùng" and object.word == "phát" : object.conclusion = "I" 220 | object.word == "thể" and object.nextTag1 == "B" : object.conclusion = "I" 221 | object.prevWord1 == "võ" : object.conclusion = "I" 222 | object.prevWord1 == "không" and object.word == "chỉ" : object.conclusion = "I" 223 | object.prevWord1 == "cúm" and object.word == "gia" : object.conclusion = "I" 224 | object.prevWord1 == "lúc" and object.word == "nào" : object.conclusion = "I" 225 | object.prevWord1 == "thế" and object.word == "nào" : object.conclusion = "I" 226 | object.word == "ton" : object.conclusion = "I" 227 | object.word == "văn" and object.nextTag1 == "B" : object.conclusion = "I" 228 | object.prevWord1 == "cuộc" and object.word == "chơi" : object.conclusion = "I" 229 | object.prevWord1 == "hiểm" and object.word == "y" and object.nextWord1 == "tế" : object.conclusion = "I" 230 | object.prevTag1 == "I" and object.word == "ngọt" and object.nextTag1 == "B" : object.conclusion = "I" 231 | object.prevWord1 == "nạo" and object.word == "phá" : object.conclusion = "I" 232 | object.word == "hưng" : object.conclusion = "I" 233 | object.word == "bướu" and object.nextTag1 == "B" : object.conclusion = "I" 234 | object.prevWord2 == "một" and object.word == "gian" : object.conclusion = "I" 235 | object.prevTag2 == "I" and object.prevTag1 == "B" and object.word == "chánh" : object.conclusion = "I" 236 | object.prevWord1 == "phục" and object.word == "dựng" : object.conclusion = "I" 237 | object.prevWord1 == "bất" : object.conclusion = "I" 238 | object.prevWord1 == "cả" and object.word == "cấm" : object.conclusion = "I" 239 | object.prevWord1 == "tà" : object.conclusion = "I" 240 | object.prevWord1 == "dệt" and object.word == "may" : object.conclusion = "I" 241 | object.prevWord1 == "châu" and object.word == "phi" : object.conclusion = "I" 242 | object.word == "hoá" and object.nextTag1 == "B" : object.conclusion = "I" 243 | object.prevWord1 == "nhà" and object.word == "đất" : object.conclusion = "I" 244 | object.prevWord1 == "nhà" and object.nextWord1 == "loại" : object.conclusion = "I" 245 | object.prevWord1 == "tây" and object.word == "nguyên" : object.conclusion = "I" 246 | object.prevTag1 == "I" and object.word == "trang" and object.nextTag1 == "B" : 
object.conclusion = "I" 247 | object.prevWord1 == "châu" and object.word == "âu" : object.conclusion = "I" 248 | object.word == "dah" and object.nextWord1 == "wen" : object.conclusion = "I" 249 | object.prevTag1 == "I" and object.word == "định" and object.nextTag1 == "B" : object.conclusion = "I" 250 | object.prevTag1 == "I" and object.word == "ty" : object.conclusion = "I" 251 | object.prevWord2 == "vốn" and object.prevWord1 == "nhà" and object.word == "nước" : object.conclusion = "I" 252 | object.word == "phẩm" and object.nextTag1 == "B" : object.conclusion = "I" 253 | object.prevWord2 == "đông" and object.prevWord1 == "nam" and object.word == "á" : object.conclusion = "I" 254 | object.word == "dũ" and object.nextTag1 == "B" : object.conclusion = "I" 255 | object.prevWord1 == "trưởng" and object.word == "phòng" : object.conclusion = "I" 256 | object.prevWord1 == "bày" and object.word == "bán" : object.conclusion = "I" 257 | object.prevWord1 == "lan" and object.word == "anh" : object.conclusion = "I" 258 | object.prevWord1 == "chợ" and object.word == "rẫy" : object.conclusion = "I" 259 | object.prevWord1 == "cực" and object.word == "kỳ" : object.conclusion = "I" 260 | object.word == "trang" and object.nextTag1 == "" and object.nextTag2 == "" : object.conclusion = "I" 261 | object.prevWord1 == "giá" and object.word == "trị" : object.conclusion = "I" 262 | object.prevWord1 == "mũ" and object.word == "bảo" : object.conclusion = "I" 263 | object.prevWord1 == "cầu" and object.word == "thang" : object.conclusion = "I" 264 | object.prevWord1 == "nhà" and object.word == "đầu" and object.nextWord1 == "tư" : object.conclusion = "I" 265 | object.prevTag2 == "" and object.prevTag1 == "B" and object.word == "huy" : object.conclusion = "I" 266 | object.prevTag1 == "I" and object.word == "ngoạn" : object.conclusion = "I" 267 | object.prevTag1 == "I" and object.word == "an" and object.nextTag1 == "B" : object.conclusion = "I" 268 | object.word == "thuận" and object.nextTag1 == "B" and object.nextTag2 == "B" : object.conclusion = "I" 269 | object.word == "vương" and object.nextTag1 == "B" : object.conclusion = "I" 270 | object.prevWord1 == "tân" and object.word == "nhuận" and object.nextWord1 == "đông" : object.conclusion = "I" 271 | object.prevTag1 == "I" and object.word == "kiến" and object.nextTag1 == "B" : object.conclusion = "I" 272 | object.prevWord2 == "khoản" and object.prevWord1 == "tạm" and object.word == "thu" : object.conclusion = "I" 273 | object.word == "quỳnh" : object.conclusion = "I" 274 | object.prevWord1 == "khám" and object.word == "chữa" and object.nextWord1 == "bệnh" : object.conclusion = "I" 275 | object.word == "viên" and object.nextTag1 == "B" and object.nextTag2 == "I" : object.conclusion = "I" 276 | object.prevWord1 == "bộ" and object.word == "ngành" : object.conclusion = "I" 277 | object.prevWord1 == "sông" and object.word == "suối" : object.conclusion = "I" 278 | object.prevWord1 == "bích" : object.conclusion = "I" 279 | object.prevWord1 == "khô" and object.word == "hạn" : object.conclusion = "I" 280 | object.prevWord2 == "có" and object.word == "nghĩa" : object.conclusion = "I" 281 | object.prevWord1 == "lê" and object.word == "thanh" and object.nextWord1 == "hà" : object.conclusion = "I" 282 | object.prevWord1 == "phòng" and object.word == "tránh" : object.conclusion = "I" 283 | object.prevWord1 == "đâm" and object.word == "chém" : object.conclusion = "I" 284 | -------------------------------------------------------------------------------- /vws/Node.py: 
--------------------------------------------------------------------------------
1 | from vws.FWObject import FWObject  # import the class itself so the type hints below resolve to it
2 | 
3 | class Node:
4 |     def __init__(self, inCondition: FWObject, inConclusion: str, inFatherNode, inExceptNode,
5 |                  inIfnotNode, inDepth: int):
6 |         self.condition = inCondition
7 |         self.conclusion = inConclusion
8 |         self.fatherNode = inFatherNode
9 |         self.exceptNode = inExceptNode
10 |         self.ifnotNode = inIfnotNode
11 |         self.depth = inDepth
12 |     def setIfnotNode(self, node):
13 |         self.ifnotNode = node
14 | 
15 |     def setExceptNode(self, node):
16 |         self.exceptNode = node
17 | 
18 |     def setFatherNode(self, node):
19 |         self.fatherNode = node
20 |     def countNodes(self) -> int:
21 |         count = 1
22 |         if self.exceptNode is not None:
23 |             count += self.exceptNode.countNodes()
24 |         if self.ifnotNode is not None:
25 |             count += self.ifnotNode.countNodes()
26 |         return count
27 |     def satisfy(self, object: FWObject) -> bool:
28 |         check = True
29 |         for i in range(0, 10):
30 |             key = self.condition.context[i]
31 |             if key is not None:
32 |                 if key != object.context[i]:
33 |                     check = False
34 |                     break
35 |         return check
--------------------------------------------------------------------------------
/vws/RDRSegmenter.py:
--------------------------------------------------------------------------------
1 | from vws.Node import Node
2 | from vws.Utils import Utils
3 | from vws.FWObject import FWObject
4 | from vws.WordTag import WordTag
5 | from vws.Vocabulary import Vocabulary
6 | from vws.Tokenizer import Tokenizer
7 | import time, os
8 | 
9 | utils = Utils()
10 | 
11 | class RDRSegmenter:
12 |     def __init__(self, model_path=None, vocab_path=None):
13 |         self._root = None
14 |         self.vocab_path = vocab_path
15 |         try:
16 |             if model_path is None:
17 |                 self.constructTreeFromRulesFile(os.path.join(os.path.dirname(__file__), "Model.RDR"))
18 |             else:
19 |                 self.constructTreeFromRulesFile(model_path)
20 | 
21 |         except IOError:
22 |             raise
23 |     @property
24 |     def root(self):
25 |         return self._root
26 |     @root.setter
27 |     def root(self, value: Node):
28 |         self._root = value
29 |     def constructTreeFromRulesFile(self, rulesFilePath: str):
30 |         self.root = Node(FWObject(False), "NN", None, None, None, 0)
31 | 
32 |         currentNode = self.root
33 |         currentDepth = 0
34 |         with open(rulesFilePath, 'r', encoding='utf8') as rulesFile:
35 |             for indexFileRule, line in enumerate(rulesFile):
36 |                 depth = 0
37 |                 for i in range(0, min(6, len(line))):  # guard against lines shorter than six characters
38 |                     if line[i] == '\t':
39 |                         depth += 1
40 |                     else:
41 |                         break
42 |                 if indexFileRule == 0:
43 |                     continue
44 |                 line = line.strip()
45 |                 if len(line) == 0:
46 |                     continue
47 | 
48 |                 if "cc:" in line:
49 |                     continue
50 |                 # print(line.split(" : ")[0].strip())
51 |                 condition = utils.getCondition(line.split(" : ")[0].strip())
52 |                 conclusion = utils.getConcreteValue(line.split(" : ")[1].strip())
53 | 
54 |                 node = Node(condition, conclusion, None, None, None, depth)
55 | 
56 |                 if depth > currentDepth:
57 |                     currentNode.setExceptNode(node)
58 |                 else:
59 |                     if depth == currentDepth:
60 |                         currentNode.setIfnotNode(node)
61 |                     else:
62 |                         while currentNode.depth != depth:
63 |                             currentNode = currentNode.fatherNode
64 |                         currentNode.setIfnotNode(node)
65 |                 node.setFatherNode(currentNode)
66 | 
67 |                 currentNode = node
68 |                 currentDepth = depth
69 | 
70 |     def findFiredNode(self, object: FWObject) -> Node:
71 |         currentN = self._root
72 |         firedN = None
73 |         while True:
74 |             if currentN.satisfy(object):
75 |                 firedN = currentN
76 |                 if currentN.exceptNode is None:
77 |                     break
78 |                 else:
79 |                     currentN = currentN.exceptNode
80 |             else:
81 |                 if currentN.ifnotNode is None:
82 |                     break
83 |                 else:
84 |                     currentN = currentN.ifnotNode
85 |         return firedN
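    # NOTE (added): the traversal above is the standard SCRDR lookup over the
    # tree loaded from Model.RDR. A rule such as
    #     object.prevWord1 == "năm" : object.conclusion = "B"
    # is parsed into a Node whose condition is matched against the 10-slot
    # context window of the current syllable. A more deeply indented rule in
    # Model.RDR becomes that node's exceptNode (an exception refining it),
    # while a rule at the same depth becomes its ifnotNode (the next
    # alternative). The node that fires is the deepest one whose condition
    # held on the path, so specific exceptions override general rules.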
86 |     def allIsLetter(self, strs: str) -> bool:
87 | 
88 |         for char in strs:
89 |             if not char.isalpha():
90 |                 return False
91 |         return True
92 |     def allIsUpper(self, strs: str) -> bool:
93 | 
94 |         for char in strs:
95 |             if not char.isupper():
96 |                 return False
97 |         return True
98 |     def getInitialSegmentation(self, sentence: str) -> list:
99 |         wordtags = []
100 |         if self.vocab_path is None:
101 |             vocab = Vocabulary()
102 |         else:
103 |             vocab = Vocabulary(self.vocab_path)
104 |         for regex in utils.NORMALIZER_KEYS:
105 |             if regex in sentence:
106 |                 sentence = sentence.replace(regex, utils.NORMALIZER[regex])
107 |         tokens = sentence.split()
108 |         lowerTokens = sentence.lower().split()
109 |         senLength = len(tokens)
110 |         i = 0
111 |         while i < senLength:
112 |             token = tokens[i]
113 |             if self.allIsLetter(token):
114 |                 if token[0].islower() and (i + 1) < senLength:
115 |                     if tokens[i + 1][0].isupper():
116 |                         wordtags.append(WordTag(token, "B"))
117 |                         i += 1
118 |                         continue
119 |                 isSingleSyllable = True
120 |                 for j in range(min(i + 4, senLength), i + 1, -1):
121 |                     word = " ".join(lowerTokens[i: j])
122 |                     if word in vocab.VN_DICT or word in vocab.VN_LOCATIONS or word in vocab.COUNTRY_L_NAME:
123 |                         wordtags.append(WordTag(token, "B"))
124 |                         for k in range(i + 1, j):
125 |                             wordtags.append(WordTag(tokens[k], "I"))
126 | 
127 |                         i = j - 1
128 |                         isSingleSyllable = False
129 |                         break
130 | 
131 |                 if isSingleSyllable:
132 |                     lowercasedToken = lowerTokens[i]
133 | 
134 |                     if lowercasedToken in vocab.VN_FIRST_SENT_WORDS \
135 |                             or token[0].islower() \
136 |                             or self.allIsUpper(token) \
137 |                             or lowercasedToken in vocab.COUNTRY_S_NAME \
138 |                             or lowercasedToken in vocab.WORLD_COMPANY:
139 | 
140 |                         wordtags.append(WordTag(token, "B"))
141 |                         i += 1
142 |                         continue
143 |                     ilower = i + 1
144 |                     while ilower < min(i + 4, senLength):  # advance over a run of capitalized letter-only tokens
145 |                         ntoken = tokens[ilower]
146 |                         if ntoken.islower() \
147 |                                 or not self.allIsLetter(ntoken) \
148 |                                 or ntoken == "LBKT" or ntoken == "RBKT":
149 |                             break
150 |                         ilower += 1  # a for-loop here would stop one token short of the window edge
151 |                     if ilower > i + 1:
152 |                         isNotMiddleName = True
153 |                         if lowercasedToken in vocab.VN_MIDDLE_NAMES and i >= 1:
154 |                             prevT = tokens[i - 1]
155 |                             if prevT[0].isupper():
156 |                                 if prevT.lower() in vocab.VN_FAMILY_NAMES:
157 |                                     wordtags.append(WordTag(token, "I"))
158 |                                     isNotMiddleName = False
159 |                         if isNotMiddleName:
160 |                             wordtags.append(WordTag(token, "B"))
161 |                         for k in range(i + 1, ilower):
162 |                             wordtags.append(WordTag(tokens[k], "I"))
163 | 
164 |                         i = ilower - 1
165 |                     else:
166 |                         wordtags.append(WordTag(token, "B"))
167 |             else:
168 |                 wordtags.append(WordTag(token, "B"))
169 |             i += 1
170 |         return wordtags
171 | 
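    # NOTE (added): getInitialSegmentation produces the first-pass B/I tags
    # ("B" opens a word, "I" continues one): syllables are matched greedily
    # against the longest vocabulary entry of at most four syllables, with
    # extra handling for runs of capitalized tokens and for Vietnamese
    # middle/family name pairs. segmentTokenizedString below then lets the
    # Model.RDR rule tree overturn individual tags before joining "I"-tagged
    # syllables to the previous syllable with underscores.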
172 |     def segmentTokenizedString(self, strs) -> str:  # accepts a string or a list of sentence strings
173 |         sb = ""
174 |         line = ''.join(strs).strip()
175 |         if len(line) == 0:
176 |             return "\n"
177 | 
178 |         wordtags = self.getInitialSegmentation(line)
179 |         size = len(wordtags)
180 |         for i in range(0, size):
181 |             object = utils.getObject(wordtags, size, i)
182 |             firedNode = self.findFiredNode(object)
183 |             if firedNode.depth > 0:
184 |                 if firedNode.conclusion == "B":
185 |                     sb = sb + " " + wordtags[i].form
186 |                 else:
187 |                     sb = sb + "_" + wordtags[i].form
188 |             else:
189 |                 if wordtags[i].tag == "B":
190 |                     sb = sb + " " + wordtags[i].form
191 |                 else:
192 |                     sb = sb + "_" + wordtags[i].form
193 |         return sb.strip()
194 | 
195 |     # def segmentRawString(self,strs:str)->str:
196 |     #     return self.segmentTokenizedString(" ".join(Tokenizer.tokenize(strs)))
197 |     def segmentRawSentences(self, tokenizer: Tokenizer, strs: str):
198 |         sentence = tokenizer.joinSentences(tokenizer.tokenize(strs))
199 |         return self.segmentTokenizedString(sentence)
200 | 
201 | 
202 | # if __name__ == "__main__":
203 | #     rdrsegment = RDRSegmenter()
204 | #     tokenizer = Tokenizer()
205 | #     t=time.time()
206 | #     output = rdrsegment.segmentRawSentences(tokenizer,"Lượng khách Thái bắt đầu gia tăng từ đầu năm 2005. Bên cạnh đó, kể từ tháng 10-2005 đến nay, từ khi được phép của VN, các đoàn caravan của Thái Lan cũng đã ồ ạt đổ vào VN.")
207 | #     print(output,time.time()-t)
208 | 
--------------------------------------------------------------------------------
/vws/Tokenizer.py:
--------------------------------------------------------------------------------
1 | import string
2 | import re
3 | # from enum import Enum
4 | 
5 | class Tokenizer:
6 |     def __init__(self):
7 |         self.name = 'Tokenizer'
8 |     def hasPunctuation(self, strs: str):
9 |         for char in strs:
10 |             # print(char)
11 |             if not char.isalpha():
12 |                 # print(char)
13 |                 return True
14 |         return False
15 | 
16 |     def tokenize(self, s):
17 |         if s is None or s.strip() == "":
18 |             return []
19 | 
20 |         tempTokens = s.strip().split()
21 |         # print(tempTokens)
22 |         if len(tempTokens) == 0:
23 |             return []
24 | 
25 |         tokens = []
26 |         for token in tempTokens:
27 |             # print(len(token))
28 | 
29 |             if len(token) == 1 or self.hasPunctuation(token):
30 |                 tokens.append(token)
31 |                 continue
32 | 
33 |             if token.endswith(","):
34 | 
35 |                 for t in self.tokenize(token[0:len(token) - 1]):
36 |                     tokens.append(t)
37 |                 tokens.append(",")
38 |                 continue
39 |             if token in StringUtils().VN_abbreviation:
40 |                 tokens.append(token)
41 |                 continue
42 | 
43 |             if token.endswith(".") and token[len(token) - 2].isalpha():
44 |                 if (len(token) == 2 and token[len(token) - 2].isupper()) or re.fullmatch(Regex().SHORT_NAME, token):
45 |                     tokens.append(token)
46 |                     continue
47 |                 for t in self.tokenize(token[0:len(token) - 1]):
48 |                     tokens.append(t)
49 |                 tokens.append(".")
50 |                 continue
51 | 
52 |             if token in StringUtils().VN_exception:
53 |                 tokens.append(token)
54 |                 continue
55 | 
56 |             tokenContainsAbb = False
57 |             for e in StringUtils().VN_abbreviation:
58 |                 try:
59 |                     i = token.index(e)
60 |                 except ValueError:
61 |                     continue
62 | 
63 |                 tokenContainsAbb = True
64 |                 tokens = self.recursive(tokens, token, i, i + len(e))
65 |                 break
66 |             if tokenContainsAbb:
67 |                 continue
68 | 
69 |             tokenContainsExp = False
70 |             for e in StringUtils().VN_exception:
71 |                 try:
72 |                     i = token.index(e)
73 |                 except ValueError:
74 |                     continue
75 | 
76 |                 tokenContainsExp = True
77 |                 tokens = self.recursive(tokens, token, i, i + len(e))
78 |                 break
79 |             if tokenContainsExp:
80 |                 continue
81 | 
82 |             regexHolder = Regex()  # keep one instance so getRegexIndex sees the list built by getRegexList
83 |             regexes = regexHolder.getRegexList()
84 |             matching = False
85 |             for regex in regexes:
86 |                 # print(regex,token)
87 |                 if re.fullmatch(regex, token):  # the Java original used String.matches, i.e. a full match
88 |                     tokens.append(token)
89 |                     matching = True
90 |                     break
91 |             if matching:
92 |                 continue
93 | 
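            # NOTE (added): the loop above keeps a token whole when it fully
            # matches one of the ordered patterns; the loop below instead
            # searches inside mixed tokens and, via self.recursive, splits the
            # token at the matched span and re-tokenizes each piece. Two cases
            # get special care: a URL match is rejected when a dot-separated
            # element looks like an abbreviation initial or contains a
            # non-ASCII character, and a month match only splits the token
            # when letters precede it.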
94 |             for i in range(0, len(regexes)):
95 |                 pattern = re.compile(regexes[i])
96 |                 matcher = pattern.search(token)
97 |                 if matcher:
98 |                     if i == regexHolder.getRegexIndex("url"):
99 |                         elements = token.split(".")
100 |                         hasURL = True
101 |                         for ele in elements:
102 |                             if len(ele) == 1 and ele[0].isupper():
103 |                                 hasURL = False
104 |                                 break
105 |                             for j in range(0, len(ele)):
106 |                                 if ord(ele[j]) >= 128:
107 |                                     hasURL = False
108 |                                     break
109 |                         if hasURL:
110 |                             tokens = self.recursive(tokens, token, matcher.start(), matcher.end())
111 |                         else:
112 |                             continue
113 | 
114 |                     else:
115 |                         if i == regexHolder.getRegexIndex("month"):
116 |                             start = matcher.start()
117 | 
118 |                             hasLetter = False
119 | 
120 |                             for j in range(0, start):
121 |                                 if token[j].isalpha():
122 |                                     tokens = self.recursive(tokens, token, matcher.start(), matcher.end())
123 |                                     hasLetter = True
124 |                                     break
125 | 
126 | 
127 |                             if not hasLetter:
128 |                                 tokens.append(token)
129 | 
130 |                         else:
131 |                             tokens = self.recursive(tokens, token, matcher.start(), matcher.end())
132 | 
133 |                     matching = True
134 |                     break
135 | 
136 |             if matching:
137 |                 continue
138 |             else:
139 |                 tokens.append(token)
140 | 
141 |         return tokens
142 | 
143 |     def recursive(self, tokens, token, beginMatch, endMatch):
144 |         if beginMatch > 0:
145 |             for t in self.tokenize(token[0:beginMatch]):
146 |                 tokens.append(t)
147 |         for t in self.tokenize(token[beginMatch:endMatch]):
148 |             tokens.append(t)
149 | 
150 |         if endMatch < len(token):
151 |             for t in self.tokenize(token[endMatch:]):
152 |                 tokens.append(t)
153 | 
154 |         return tokens
155 | 
156 |     def joinSentences(self, tokens):
157 |         sentences, sentence = [], []
158 |         sc = StringConst()
159 |         for i in range(0, len(tokens)):
160 |             token = tokens[i]
161 |             nextToken = None
162 |             if i != len(tokens) - 1:
163 |                 nextToken = tokens[i + 1]
164 |             beforeToken = None
165 |             if i > 0:
166 |                 beforeToken = tokens[i - 1]
167 | 
168 |             # print(token)
169 |             sentence.append(token)
170 | 
171 |             if i == len(tokens) - 1:
172 |                 sentences.append(self.joinSentence(sentence))
173 |                 return sentences
174 | 
175 |             if i < len(tokens) - 2 and token == sc.COLON:
176 |                 if nextToken.isnumeric() and (tokens[i + 2] == sc.STOP
177 |                                               or tokens[i + 2] == sc.COMMA):
178 |                     sentences.append(self.joinSentence(sentence))
179 |                     sentence = []
180 |                     continue
181 | 
182 | 
183 |             if re.match(Regex().EOS_PUNCTUATION, token):
184 | 
185 |                 if nextToken == "\"" or nextToken == "''":
186 |                     count = 0
187 |                     for senToken in sentence:
188 |                         if senToken == "\"" or senToken == "''":
189 |                             count += 1
190 |                     if count % 2 == 1:
191 |                         continue
192 | 
193 |                 if StringUtils().isBrace(nextToken) or nextToken == "" or nextToken[0].islower() \
194 |                         or nextToken == sc.COMMA or nextToken[0].isnumeric():
195 |                     continue
196 | 
197 |                 if len(sentence) == 2 and token == sc.STOP:
198 |                     if beforeToken[0].isnumeric():
199 |                         continue
200 |                     if beforeToken[0].islower():
201 |                         continue
202 |                     if beforeToken[0].isupper():
203 |                         if len(beforeToken) == 1:
204 |                             continue
205 | 
206 | 
207 |                 sentences.append(self.joinSentence(sentence))
208 |                 sentence = []
209 |         return sentences
210 | 
211 |     def joinSentence(self, tokens):
212 |         sent = []
213 |         stringConst = StringConst()
214 |         length = len(tokens)
215 |         token = ""
216 |         for i in range(0, length):
217 |             token = tokens[i]
218 |             if token == "" or token is None or token == stringConst.SPACE:
219 |                 continue
220 |             sent.append(token)
221 |             if i < length - 1:
222 |                 sent.append(stringConst.SPACE)
223 |         return ''.join(sent).strip()
224 | 
225 | class StringConst:
226 |     @property
227 |     def BOS(self):
228 |         return ""
229 |     @property
230 |     def EOS(self):
231 |         return ""
232 |     @property
233 |     def SPACE(self):
234 |         return " "
235 |     @property
236 |     def COMMA(self):
237 |         return ","
238 |     @property
239 |     def STOP(self):
240 |         return "."
241 |     @property
242 |     def COLON(self):
243 |         return ":"
244 |     @property
245 |     def UNDERSCORE(self):
246 |         return "_"
247 | 
248 | class StringUtils:
249 |     def __init__(self):
250 |         self._VN_abbreviation = {"M.City"}
251 |         self._VN_abbreviation.add("V.I.P")
252 |         self._VN_abbreviation.add("PGS.Ts")
253 |         self._VN_abbreviation.add("MRS.")
254 |         self._VN_abbreviation.add("Mrs.")
255 |         self._VN_abbreviation.add("Man.United")
256 |         self._VN_abbreviation.add("Mr.")
257 |         self._VN_abbreviation.add("SHB.ĐN")
258 |         self._VN_abbreviation.add("Gs.Bs")
259 |         self._VN_abbreviation.add("U.S.A")
260 |         self._VN_abbreviation.add("TMN.CSG")
261 |         self._VN_abbreviation.add("Kts.Ts")
262 |         self._VN_abbreviation.add("R.Madrid")
263 |         self._VN_abbreviation.add("Tp.")
264 |         self._VN_abbreviation.add("T.Ư")
265 |         self._VN_abbreviation.add("D.C")
266 |         self._VN_abbreviation.add("Gs.Tskh")
267 |         self._VN_abbreviation.add("PGS.KTS")
268 |         self._VN_abbreviation.add("GS.BS")
269 |         self._VN_abbreviation.add("KTS.TS")
270 |         self._VN_abbreviation.add("PGS-TS")
271 |         self._VN_abbreviation.add("Co.")
272 |         self._VN_abbreviation.add("S.H.E")
273 |         self._VN_abbreviation.add("Ths.Bs")
274 |         self._VN_abbreviation.add("T&T.HN")
275 |         self._VN_abbreviation.add("MR.")
276 |         self._VN_abbreviation.add("Ms.")
277 |         self._VN_abbreviation.add("T.T.P")
278 |         self._VN_abbreviation.add("TT.")
279 |         self._VN_abbreviation.add("TP.")
280 |         self._VN_abbreviation.add("ĐH.QGHN")
281 |         self._VN_abbreviation.add("Gs.Kts")
282 |         self._VN_abbreviation.add("Man.Utd")
283 |         self._VN_abbreviation.add("GD-ĐT")
284 |         self._VN_abbreviation.add("T.W")
285 |         self._VN_abbreviation.add("Corp.")
286 |         self._VN_abbreviation.add("ĐT.LA")
287 |         self._VN_abbreviation.add("Dr.")
288 |         self._VN_abbreviation.add("T&T")
289 |         self._VN_abbreviation.add("HN.ACB")
290 |         self._VN_abbreviation.add("GS.KTS")
291 |         self._VN_abbreviation.add("MS.")
292 |         self._VN_abbreviation.add("Prof.")
293 |         self._VN_abbreviation.add("GS.TS")
294 |         self._VN_abbreviation.add("PGs.Ts")
295 |         self._VN_abbreviation.add("PGS.BS")
296 |         self._VN_abbreviation.add("BT.")
297 |         self._VN_abbreviation.add("Ltd.")
298 |         self._VN_abbreviation.add("ThS.BS")
299 |         self._VN_abbreviation.add("Gs.Ts")
300 |         self._VN_abbreviation.add("SL.NA")
301 |         self._VN_abbreviation.add("Th.S")
302 |         self._VN_abbreviation.add("Gs.Vs")
303 |         self._VN_abbreviation.add("PGs.Bs")
304 |         self._VN_abbreviation.add("T.O.P")
305 |         self._VN_abbreviation.add("PGS.TS")
306 |         self._VN_abbreviation.add("HN.T&T")
307 |         self._VN_abbreviation.add("SG.XT")
308 |         self._VN_abbreviation.add("O.T.C")
309 |         self._VN_abbreviation.add("TS.BS")
310 |         self._VN_abbreviation.add("Yahoo!")
311 |         self._VN_abbreviation.add("Man.City")
312 |         self._VN_abbreviation.add("MISS.")
313 |         self._VN_abbreviation.add("HA.GL")
314 |         self._VN_abbreviation.add("GS.Ts")
315 |         self._VN_abbreviation.add("TBT.")
316 |         self._VN_abbreviation.add("GS.VS")
317 |         self._VN_abbreviation.add("GS.TSKH")
318 |         self._VN_abbreviation.add("Ts.Bs")
319 |         self._VN_abbreviation.add("M.U")
320 |         self._VN_abbreviation.add("Gs.TSKH")
321 |         self._VN_abbreviation.add("U.S")
322 |         self._VN_abbreviation.add("Miss.")
323 |         self._VN_abbreviation.add("GD.ĐT")
324 |         self._VN_abbreviation.add("PGs.Kts")
325 |         self._VN_abbreviation.add("St.")
326 |         self._VN_abbreviation.add("Ng.")
327 |         self._VN_abbreviation.add("Inc.")
328 |         self._VN_abbreviation.add("Th.")
329 |         self._VN_abbreviation.add("N.O.V.A")
330 | 
331 |         self._VN_exception = {"Wi-fi"}
332 |         self._VN_exception.add("17+")
333 |         self._VN_exception.add("km/h")
334 |         self._VN_exception.add("M7")
335 |         self._VN_exception.add("M8")
336 |         self._VN_exception.add("21+")
337 |         self._VN_exception.add("G3")
338 |         self._VN_exception.add("M9")
339 |         self._VN_exception.add("G4")
340 |         self._VN_exception.add("km3")
341 |         self._VN_exception.add("m/s")
342 |         self._VN_exception.add("km2")
343 |         self._VN_exception.add("5g")
344 |         self._VN_exception.add("4G")
345 |         self._VN_exception.add("8K")
346 |         self._VN_exception.add("3g")
347 |         self._VN_exception.add("E9")
348 |         self._VN_exception.add("U21")
349 |         self._VN_exception.add("4K")
350 |         self._VN_exception.add("U23")
351 |         self._VN_exception.add("Z1")
352 |         self._VN_exception.add("Z2")
353 |         self._VN_exception.add("Z3")
354 |         self._VN_exception.add("Z4")
355 |         self._VN_exception.add("Z5")
356 |         self._VN_exception.add("Jong-un")
357 |         self._VN_exception.add("u19")
358 |         self._VN_exception.add("5s")
359 |         self._VN_exception.add("wi-fi")
360 |         self._VN_exception.add("18+")
361 |         self._VN_exception.add("Wi-Fi")
362 |         self._VN_exception.add("m2")
363 |         self._VN_exception.add("16+")
364 |         self._VN_exception.add("m3")
365 |         self._VN_exception.add("V-League")
366 |         self._VN_exception.add("Geun-hye")
367 |         self._VN_exception.add("5G")
368 |         self._VN_exception.add("4g")
369 |         self._VN_exception.add("Z3+")
370 |         self._VN_exception.add("3G")
371 |         self._VN_exception.add("km/s")
372 |         self._VN_exception.add("6+")
373 |         self._VN_exception.add("u21")
374 |         self._VN_exception.add("WI-FI")
375 |         self._VN_exception.add("u23")
376 |         self._VN_exception.add("U19")
377 |         self._VN_exception.add("6s")
378 |         self._VN_exception.add("4s")
379 | 
380 |     def isBrace(self, string):
381 |         if string == "”" or string == "’" or string == "'" or string == ")" \
382 |                 or string == "}" or string == "]":  # "’" restores a U+FFFD mojibake character; closing quote assumed
383 |             return True
384 |         return False
385 |     @property
386 |     def VN_abbreviation(self):
387 |         return self._VN_abbreviation
388 | 
389 |     @property
390 |     def VN_exception(self):
391 |         return self._VN_exception
392 | class Regex:
393 |     def __init__(self):
394 |         self._regexes = None
395 |         self._regexIndex = None
396 |     @property
397 |     def ELLIPSIS(self):
398 |         return "\\.{2,}"
399 | 
400 |     @property
401 |     def EMAIL(self):
402 |         return "([\\w\\d_\\.-]+)@(([\\d\\w-]+)\\.)*([\\d\\w-]+)"
403 | 
404 |     @property
405 |     def FULL_DATE(self):
406 |         return "(0?[1-9]|[12][0-9]|3[01])(\\/|-|\\.)(1[0-2]|(0?[1-9]))((\\/|-|\\.)\\d{4})"
407 | 
408 |     @property
409 |     def MONTH(self):
410 |         return "(1[0-2]|(0?[1-9]))(\\/)\\d{4}"
411 | 
412 |     @property
413 |     def DATE(self):
414 |         return "(0?[1-9]|[12][0-9]|3[01])(\\/)(1[0-2]|(0?[1-9]))"
415 | 
416 |     @property
417 |     def TIME(self):
418 |         return "(\\d\\d:\\d\\d:\\d\\d)|((0?\\d|1\\d|2[0-3])(:|h)(0?\\d|[1-5]\\d)(’|'|p|ph)?)"
419 | 
420 |     @property
421 |     def MONEY(self):
422 |         return "[$€£¥₫]\\d+([\\.,]\\d+)*|\\d+([\\.,]\\d+)*[$€£¥₫]"  # explicit class standing in for Java's \p{Sc}, which re does not support
423 | 
424 |     @property
425 |     def PHONE_NUMBER(self):
426 |         return "(\\(?\\+\\d{1,2}\\)?[\\s\\.-]?)?\\d{2,}[\\s\\.-]?\\d{3,}[\\s\\.-]?\\d{3,}"
427 | 
428 |     @property
429 |     def URL(self):
430 |         return "(((https?|ftp):\\/\\/|www\\.)[^\\s/$.?#].[^\\s]*)|(https?:\\/\\/)?(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*)"
431 | 
432 |     @property
433 |     def NUMBER(self):
434 |         return "[-+]?\\d+([\\.,]\\d+)*%?[$€£¥₫]?"  # same currency-class substitution as MONEY
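    # NOTE (added): the Java original expressed MONEY, NUMBER, SHORT_NAME and
    # NUMBERS_EXPRESSION with Unicode property classes such as \p{Sc}
    # (currency symbols) and \p{L} (letters), which the standard-library re
    # module does not support. The patterns here substitute an explicit
    # currency class and the [^\W\d_] letter idiom. A minimal alternative
    # sketch, assuming the third-party `regex` package (pip install regex)
    # rather than anything this module actually depends on:
    #
    #     >>> import regex
    #     >>> regex.findall(r"\p{Sc}\d+", "giá $100 hay €50")
    #     ['$100', '€50']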
435 | 
436 |     @property
437 |     def PUNCTUATION(self):
438 |         return ",|\\.|:|\\?|!|;|-|_|\"|'|“|”|\\||\\(|\\)|\\[|\\]|\\{|\\}|⟨|⟩|«|»|\\\\|\\/|\\‘|\\’|\\“|\\”|…|‘|’|·"  # mis-encoded characters restored; ";" assumed for the empty "||" alternative
439 | 
440 |     @property
441 |     def SPECIAL_CHAR(self):
442 |         return "\\~|\\@|\\#|\\^|\\&|\\*|\\+|\\-|\\–|<|>|\\|"
443 | 
444 |     @property
445 |     def EOS_PUNCTUATION(self):
446 |         return "(\\.+|\\?|!|…)"
447 | 
448 |     @property
449 |     def NUMBERS_EXPRESSION(self):
450 |         return "[-+]?\\d+([\\.,]\\d+)*%?[$€£¥₫]?" + "([\\+\\-\\*\\/]" + "[-+]?\\d+([\\.,]\\d+)*%?[$€£¥₫]?" + ")*"
451 | 
452 |     @property
453 |     def SHORT_NAME(self):
454 |         return "([^\\W\\d_]+([\\.\\-][^\\W\\d_]+)+)|([^\\W\\d_]+-\\d+)"  # [^\W\d_] matches any Unicode letter under re
455 | 
456 |     @property
457 |     def ALLCAP(self):
458 |         return "[A-Z]+\\.[A-Z]+"
459 |     @property
460 |     def regexes(self):
461 |         return self._regexes
462 |     @regexes.setter
463 |     def regexes(self, value):
464 |         self._regexes = value
465 |     @property
466 |     def regexIndex(self):
467 |         return self._regexIndex
468 |     @regexIndex.setter
469 |     def regexIndex(self, value):
470 |         self._regexIndex = value
471 | 
472 |     def getRegexList(self):
473 |         # build the ordered pattern list once and cache it on this instance
474 |         if self._regexes is None:
475 |             self._regexes = []
476 |             self._regexIndex = []
477 | 
478 |             self._regexes.append(self.ELLIPSIS)
479 |             self._regexIndex.append("ELLIPSIS")
480 | 
481 |             self._regexes.append(self.EMAIL)
482 |             self._regexIndex.append("EMAIL")
483 | 
484 |             self._regexes.append(self.URL)
485 |             self._regexIndex.append("URL")
486 | 
487 |             self._regexes.append(self.FULL_DATE)
488 |             self._regexIndex.append("FULL_DATE")
489 | 
490 |             self._regexes.append(self.MONTH)
491 |             self._regexIndex.append("MONTH")
492 | 
493 |             self._regexes.append(self.DATE)
494 |             self._regexIndex.append("DATE")
495 | 
496 |             self._regexes.append(self.TIME)
497 |             self._regexIndex.append("TIME")
498 | 
499 |             self._regexes.append(self.MONEY)
500 |             self._regexIndex.append("MONEY")
501 | 
502 |             self._regexes.append(self.PHONE_NUMBER)
503 |             self._regexIndex.append("PHONE_NUMBER")
504 | 
505 |             self._regexes.append(self.SHORT_NAME)
506 |             self._regexIndex.append("SHORT_NAME")
507 | 
508 |             self._regexes.append(self.NUMBERS_EXPRESSION)
509 |             self._regexIndex.append("NUMBERS_EXPRESSION")
510 | 
511 |             self._regexes.append(self.NUMBER)
512 |             self._regexIndex.append("NUMBER")
513 | 
514 |             self._regexes.append(self.PUNCTUATION)
515 |             self._regexIndex.append("PUNCTUATION")
516 | 
517 |             self._regexes.append(self.SPECIAL_CHAR)
518 |             self._regexIndex.append("SPECIAL_CHAR")
519 | 
520 |             self._regexes.append(self.ALLCAP)
521 |             self._regexIndex.append("ALLCAP")
522 | 
523 |         return self._regexes
524 | 
525 |     def getRegexIndex(self, regex):
526 |         return self._regexIndex.index(regex.upper())
527 | 
--------------------------------------------------------------------------------
/vws/Utils.py:
--------------------------------------------------------------------------------
1 | from vws.FWObject import FWObject
2 | from vws.Node import Node
3 | from vws.WordTag import WordTag
4 | class Utils:
5 |     def __init__(self):
6 |         self._NORMALIZER = {}
7 |         self._NORMALIZER["òa"] = "oà"
8 |         self._NORMALIZER["óa"] = "oá"
9 |         self._NORMALIZER["ỏa"] = "oả"
10 |         self._NORMALIZER["õa"] = "oã"
11 |         self._NORMALIZER["ọa"] = "oạ"
12 |         self._NORMALIZER["òe"] = "oè"
13 |         self._NORMALIZER["óe"] = "oé"
14 |         self._NORMALIZER["ỏe"] = "oẻ"
15 |         self._NORMALIZER["õe"] = "oẽ"
16 |         self._NORMALIZER["ọe"] = "oẹ"
17 |         self._NORMALIZER["ùy"] = "uỳ"
18 |         self._NORMALIZER["úy"] = "uý"
19 |         self._NORMALIZER["ủy"] = "uỷ"
20 |         self._NORMALIZER["ũy"] = "uỹ"
21 |         self._NORMALIZER["ụy"] = "uỵ"
22 |         self._NORMALIZER["Ủy"] = "Uỷ"
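    # NOTE (added): getCondition and getObject below share one 10-slot context
    # layout: 0=tag, 1=word, 2=prevWord1, 3=prevTag1, 4=prevWord2, 5=prevTag2,
    # 6=nextWord1, 7=nextTag1, 8=nextWord2, 9=nextTag2. A Model.RDR condition
    # fills only the slots it mentions; Node.satisfy then compares exactly
    # those slots against the object built for the current syllable.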
self._NORMALIZER["Ủy"]= "Uỷ" 23 | def getCondition(self,strCondition:str)->FWObject: 24 | condition = FWObject(False) 25 | # print( strCondition.split(" and ")) 26 | for rule in strCondition.split(" and "): 27 | rule = rule.strip() 28 | # print(rule) 29 | 30 | # print(rule.index(".")+1,rule.index(" ")) 31 | key = rule[rule.index(".") + 1: rule.index(" ")] 32 | value = self.getConcreteValue(rule) 33 | 34 | if key == "prevWord2": 35 | condition.context[4] = value 36 | else: 37 | if key == "prevTag2" : 38 | condition.context[5] = value 39 | else: 40 | if key == "prevWord1": 41 | condition.context[2] = value 42 | else: 43 | if key == "prevTag1": 44 | condition.context[3] = value 45 | else: 46 | if key == "word": 47 | condition.context[1] = value 48 | else: 49 | if key == "tag": 50 | condition.context[0] = value 51 | else: 52 | if key == "nextWord1": 53 | condition.context[6] = value 54 | else: 55 | if key == "nextTag1": 56 | condition.context[7] = value 57 | else: 58 | if key == "nextWord2": 59 | condition.context[8] = value 60 | else: 61 | if key == "nextTag2": 62 | condition.context[9] = value 63 | 64 | 65 | return condition 66 | 67 | def getObject(self,wordtags:list, size:int, index:int)->FWObject: 68 | object = FWObject(True) 69 | 70 | if index > 1: 71 | object.context[4] = wordtags[index-2].word 72 | object.context[5] = wordtags[index-2].tag 73 | 74 | if index > 0: 75 | object.context[2] = wordtags[index-1].word 76 | object.context[3] = wordtags[index-1].tag 77 | 78 | currentWord = wordtags[index].word 79 | currentTag = wordtags[index].tag 80 | 81 | object.context[1] = currentWord 82 | object.context[0] = currentTag 83 | 84 | if index < size - 1: 85 | object.context[6] = wordtags[index+1].word 86 | object.context[7] = wordtags[index+1].tag 87 | 88 | if index < size - 2: 89 | object.context[8] = wordtags[index+2].word 90 | object.context[9] = wordtags[index+2].tag 91 | 92 | return object 93 | 94 | def getConcreteValue(self,strs:str)->str: 95 | if "\"\"" in strs: 96 | if "Word" in strs: 97 | return "" 98 | else: 99 | return "" 100 | conclusion = strs[strs.index("\"") + 1: len(strs) - 1] 101 | return conclusion 102 | @property 103 | def NORMALIZER(self): 104 | return self._NORMALIZER 105 | @property 106 | def NORMALIZER_KEYS(self): 107 | return self._NORMALIZER.keys() 108 | 109 | -------------------------------------------------------------------------------- /vws/WordTag.py: -------------------------------------------------------------------------------- 1 | class WordTag: 2 | def __init__(self,iword,itag): 3 | self._word = iword.lower() 4 | self._form = iword 5 | self._tag = itag 6 | 7 | @property 8 | def word(self): 9 | return self._word 10 | @property 11 | def form(self): 12 | return self._form 13 | @property 14 | def tag(self): 15 | return self._tag 16 | @word.setter 17 | def word(self,value): 18 | self._word =value 19 | @form.setter 20 | def form(self,value): 21 | self._form =value 22 | @tag.setter 23 | def tag(self,value): 24 | self._tag =value 25 | -------------------------------------------------------------------------------- /vws/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/d3ba299b69f12b75f5b78d97d17043dff8eb52f3/vws/__init__.py -------------------------------------------------------------------------------- /vws/__pycache__/FWObject.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/d3ba299b69f12b75f5b78d97d17043dff8eb52f3/vws/__pycache__/FWObject.cpython-38.pyc
--------------------------------------------------------------------------------
/vws/__pycache__/Node.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/d3ba299b69f12b75f5b78d97d17043dff8eb52f3/vws/__pycache__/Node.cpython-38.pyc
--------------------------------------------------------------------------------
/vws/__pycache__/RDRSegmenter.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/d3ba299b69f12b75f5b78d97d17043dff8eb52f3/vws/__pycache__/RDRSegmenter.cpython-38.pyc
--------------------------------------------------------------------------------
/vws/__pycache__/Tokenizer.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/d3ba299b69f12b75f5b78d97d17043dff8eb52f3/vws/__pycache__/Tokenizer.cpython-38.pyc
--------------------------------------------------------------------------------
/vws/__pycache__/Utils.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/d3ba299b69f12b75f5b78d97d17043dff8eb52f3/vws/__pycache__/Utils.cpython-38.pyc
--------------------------------------------------------------------------------
/vws/__pycache__/Vocabulary.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/d3ba299b69f12b75f5b78d97d17043dff8eb52f3/vws/__pycache__/Vocabulary.cpython-38.pyc
--------------------------------------------------------------------------------
/vws/__pycache__/WordTag.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/d3ba299b69f12b75f5b78d97d17043dff8eb52f3/vws/__pycache__/WordTag.cpython-38.pyc
--------------------------------------------------------------------------------
/vws/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/d3ba299b69f12b75f5b78d97d17043dff8eb52f3/vws/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/vws/vws.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 2.1
2 | Name: vws
3 | Version: 0.0.1
4 | Summary: A small example package
5 | Home-page: https://github.com/Sudo-VP/Vietnamese-Word-Segmentation-Python
6 | Author: vinhpx
7 | Author-email: phamxuanvinh023@gmail.com
8 | License: UNKNOWN
9 | Project-URL: Bug Tracker, https://github.com/Sudo-VP/Vietnamese-Word-Segmentation-Python/issues
10 | Platform: UNKNOWN
11 | Classifier: Programming Language :: Python :: 3
12 | Classifier: License :: OSI Approved :: MIT License
13 | Classifier: Operating System :: OS Independent
14 | Requires-Python: >=3.6
15 | Description-Content-Type: text/markdown
16 | 
17 | # word_segmenter
18 | ## Note:
19 | ### This codebase is a Python rewrite of RDRSegmenter: https://github.com/datquocnguyen/RDRsegmenter, intended to make Vietnamese NLP tools more convenient to use and customize
20 | The implementation of RDRsegmenter, as described in [our paper](http://www.lrec-conf.org/proceedings/lrec2018/summaries/55.html):
21 | 
22 |     @InProceedings{NguyenNVDJ2018,
23 |     author={Dat Quoc Nguyen and Dai Quoc Nguyen and Thanh Vu and Mark Dras and Mark Johnson},
24 |     title={{A Fast and Accurate Vietnamese Word Segmenter}},
25 |     booktitle={Proceedings of the 11th International Conference on Language Resources and Evaluation (LREC 2018)},
26 |     pages={2582--2587},
27 |     year={2018}
28 |     }
29 | 
30 | **Please CITE** our paper whenever RDRsegmenter is used to produce published results or incorporated into other software.
31 | 
32 | Translator: Vinh Pham
33 | 
34 | ## Usage
35 | **Requires Python 3**
36 | - python setup.py install
37 | - python -m pip install .
38 | 
39 | ## Example
40 | ```
41 | >>> from vws import RDRSegmenter, Tokenizer
42 | >>> rdrsegment = RDRSegmenter.RDRSegmenter()
43 | >>> tokenizer = Tokenizer.Tokenizer()
44 | >>> output = rdrsegment.segmentRawSentences(tokenizer,"Lượng khách Thái bắt đầu gia tăng từ đầu năm 2005. Bên cạnh đó, kể từ tháng 10-2005 đến nay, từ khi được phép của VN, các đoàn caravan của Thái Lan cũng đã ồ ạt đổ vào VN.")
45 | >>> print(output)
46 | ```
47 | Output:
48 | ```
49 | >>> Lượng khách Thái bắt_đầu gia_tăng từ đầu năm 2005. Bên cạnh đó, kể từ tháng 10-2005 đến nay, từ khi được phép của VN, các đoàn caravan của Thái_Lan cũng đã ồ_ạt đổ vào VN.
50 | ```
51 | 
52 | 
--------------------------------------------------------------------------------
/vws/vws.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | README.md
2 | pyproject.toml
3 | setup.py
4 | vws/vws.egg-info/PKG-INFO
5 | vws/vws.egg-info/SOURCES.txt
6 | vws/vws.egg-info/dependency_links.txt
7 | vws/vws.egg-info/top_level.txt
--------------------------------------------------------------------------------
/vws/vws.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/vws/vws.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | vws
2 | 
--------------------------------------------------------------------------------