├── Parser
│   ├── scripts
│   │   ├── java
│   │   │   └── .gitkeep
│   │   └── python
│   │       ├── preprocessing.py
│   │       ├── nltk_stanford_parser_demo.py
│   │       ├── stanford_parser_trainer.py
│   │       └── stanford_parser_demo.py
│   ├── StanfordDependenciesManual-bookmark.pdf
│   ├── treebanks
│   │   ├── README.md
│   │   └── xml2ptb.py
│   └── README.md
├── README.md
└── CoreNLP
    └── README.md

/Parser/scripts/java/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/Parser/StanfordDependenciesManual-bookmark.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liu-nlper/Stanford-NLP-Usage/HEAD/Parser/StanfordDependenciesManual-bookmark.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Stanford-NLP-Usage

Notes on how to use the [Stanford NLP](https://nlp.stanford.edu/software/) tools.

## 1. Stanford Core NLP

Usage of [Stanford Core NLP](https://github.com/liu-nlper/Stanford-NLP-Usage/tree/master/CoreNLP).

## 2. Stanford Parser

Usage of [Stanford Parser](https://github.com/liu-nlper/Stanford-NLP-Usage/tree/master/Parser).

Updating...
--------------------------------------------------------------------------------
/Parser/scripts/python/preprocessing.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
Escape the brackets in a sentence into the form the Stanford parser expects
(-LRB- / -RRB-).
"""

replace_map = {
    '(': '-LRB-', ')': '-RRB-',
    '[': '-LRB-', ']': '-RRB-',
    '{': '-LRB-', '}': '-RRB-'}


def preprocessing_sentence(sentence):
    """
    Replace the brackets in the original sentence.

    Args:
        sentence: str, original sentence
    Returns:
        sentence: str, processed sentence
    """
    chars = list(sentence)
    for i, c in enumerate(chars):
        if c in replace_map:
            chars[i] = replace_map[c]
    return ''.join(chars)


def demo():
    sentence = 'the { quick } brown ( fox ) jumps [ over ] the lazy dog.'
    print(preprocessing_sentence(sentence))


if __name__ == '__main__':
    demo()
--------------------------------------------------------------------------------
/CoreNLP/README.md:
--------------------------------------------------------------------------------
# Stanford Core NLP

[Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/) bundles word segmentation, POS tagging, NER, coreference, parsing and more in a single toolkit. This page collects how to use it on Chinese text.

## 1. Downloading CoreNLP

CoreNLP download page: [https://nlp.stanford.edu/software/corenlp-backup-download.html](https://nlp.stanford.edu/software/corenlp-backup-download.html)

**Step 1**: Download `stanford-corenlp-full-xxxx-xx-xx.zip`, where `xxxx-xx-xx` is the release date; the unpacked directory is referred to as `ROOT_CORENLP` below.

**Step 2**: Download the Chinese model package `stanford-chinese-corenlp-xxxx-xx-xx-models.jar` and place it inside `ROOT_CORENLP`.

## 2. Using CoreNLP

Python example:

```python
import subprocess

command = 'java -mx{0} -Djava.ext.dirs={1} edu.stanford.nlp.pipeline.StanfordCoreNLP ' + \
    '-language Chinese -encoding utf-8 -props StanfordCoreNLP-chinese.properties ' + \
    '-annotators tokenize,ssplit,pos,lemma,ner,depparse -ssplit.eolonly ' + \
    '-file {2} -outputFormat conllu -outputDirectory {3}'

MAX_MEM = '2g'
ROOT_CORENLP = 'your_corenlp_path'
PATH_SENT = 'your_file_path'
ROOT_CONLLU = 'your_result_root_path'

command = command.format(MAX_MEM, ROOT_CORENLP, PATH_SENT, ROOT_CONLLU)

return_code = subprocess.call(command, shell=True)
```

Where:

- `MAX_MEM`: maximum JVM heap size;
- `ROOT_CORENLP`: the directory CoreNLP was unpacked into;
- `PATH_SENT`: the file to process, one sentence per line;
- `ROOT_CONLLU`: output root directory; the result is written to `ROOT_CONLLU/PATH_SENT.conllu`.
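
Once the run finishes, the CoNLL-U output can be read back from Python. A minimal sketch — the path below assumes the input file was named `input.txt`, so the result is `input.txt.conllu` under `ROOT_CONLLU`; adjust it to your own setup:

```python
# Illustrative path: ROOT_CONLLU + the input file name + '.conllu'
path_conllu = 'your_result_root_path/input.txt.conllu'

sentences, current = [], []
with open(path_conllu, encoding='utf-8') as f:
    for line in f:
        line = line.rstrip('\n')
        if not line:                  # a blank line ends a sentence
            if current:
                sentences.append(current)
                current = []
        elif not line.startswith('#'):
            current.append(line.split('\t'))  # 10 tab-separated CoNLL-U columns
    if current:
        sentences.append(current)

for token in sentences[0]:
    # ID, FORM, UPOS, HEAD, DEPREL
    print(token[0], token[1], token[3], token[6], token[7])
```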
--------------------------------------------------------------------------------
/Parser/treebanks/README.md:
--------------------------------------------------------------------------------
# Treebank Corpus


# 1. GENIA treebank

GENIA Project homepage: http://www.geniaproject.org/

GENIA treebank download: [download](http://www.nactem.ac.uk/GENIA/current/GENIA-corpus/Treebank/GENIA_treebank_v1.tar.gz)

## 1.1 Directory structure

    .
    ├──GENIA_treebank_v1_xml/    # original GENIA treebank xml files
    │  └──xxx.xml                # treebank files in xml format
    ├──GENIA_treebank/           # converted treebank files
    │  ├──esc_char.map           # special-character mapping table
    │  └──GENIA_SP.ptb           # ptb-format file
    ├──xml2ptb.py                # xml-to-ptb conversion script
    └──README.md                 # README.md

## 1.2 Converting xml to ptb

Unpack the downloaded `tar.gz` into `./GENIA_treebank_v1_xml` and run the following script.

    $ python3 xml2ptb.py

When it finishes (~6 s), `GENIA_SP.ptb` and `esc_char.map` are written to `./GENIA_treebank/`; `GENIA_SP.ptb` is the annotated treebank in `penn treebank` format.

## 1.3 Corpus statistics

The GENIA treebank annotates 18,541 sentences; 111 of them fail the conversion to ptb format (unbalanced brackets), leaving 18,430 valid trees.


# 2. Other treebanks

## 2.1 English

### 2.1.1 Penn English Treebank

**Not freely available**: http://catalog.ldc.upenn.edu/ldc99t42

NLTK ships a 10% sample of the Penn Treebank, accessible via:

    >>> from nltk.corpus import treebank
    >>> treebank.parsed_sents()
    >>> len(treebank.parsed_sents())  # 3914

### 2.1.2 QuestionBank

https://nlp.stanford.edu/data/QuestionBank-Stanford.shtml

### 2.1.3 Sentiment Treebank

https://nlp.stanford.edu/sentiment/index.html

## 2.2 Chinese

### 2.2.1 Penn Chinese Treebank

**Not freely available**: https://catalog.ldc.upenn.edu/ldc2013t21

### 2.2.2 Sinica Treebank

A Traditional Chinese treebank provided by Academia Sinica, with 5,346 parsed sentences; it can be accessed through the NLTK interface.

    >>> from nltk.corpus import sinica_treebank
    >>> sinica_treebank.parsed_sents()
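
These corpora can also be dumped to a one-tree-per-line PTB file (the same layout `xml2ptb.py` produces), which is handy if you want to feed a sample into the training script under `../scripts/python/`. A small sketch using the Sinica sample; `sinica.ptb` is just an illustrative output name:

    >>> from nltk.corpus import sinica_treebank
    >>> with open('sinica.ptb', 'w', encoding='utf-8') as f:
    ...     for tree in sinica_treebank.parsed_sents():
    ...         f.write(' '.join(str(tree).split()) + '\n')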
## 2.3 More...

https://en.wikipedia.org/wiki/Treebank#Syntactic_treebanks
--------------------------------------------------------------------------------
/Parser/scripts/python/nltk_stanford_parser_demo.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
For more information see: nltk/parse/stanford.py
"""

from nltk.parse.stanford import StanfordParser


def demo():
    # path to stanford-parser.jar
    path_to_jar = 'path to jar'

    # path to stanford-parser-x.x.x-models.jar, where `x.x.x` is the version number
    path_to_models_jar = 'path to model jar'

    # path to the model, e.g. model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz';
    # to parse Chinese, replace `englishPCFG.ser.gz` with the corresponding Chinese model
    model_path = 'path to model'

    parser = StanfordParser(
        path_to_jar=path_to_jar,
        path_to_models_jar=path_to_models_jar,
        model_path=model_path,
        verbose=False,
        java_options='-mx1000m',
        corenlp_options=''
    )

    # parse a single sentence
    # Use StanfordParser to parse a sentence. Takes a sentence as a string;
    # before parsing, it will be automatically tokenized and tagged by
    # the Stanford Parser.
    parser.raw_parse('the quick brown fox jumps over the lazy dog.')

    # parse several sentences
    sentences = [
        'the quick brown fox jumps over the lazy dog.',
        'the quick brown fox jumps over the lazy dog.']
    parser.raw_parse_sents(sentences)

    # parse a sentence that is already tokenized and tagged
    # Use StanfordParser to parse a sentence. Takes a sentence as a list
    # of (word, tag) tuples; the sentence must have already been tokenized
    # and tagged.
    tagged_sent = [
        ("The", "DT"), ("quick", "JJ"), ("brown", "JJ"), ("fox", "NN"),
        ("jumped", "VBD"), ("over", "IN"), ("the", "DT"), ("lazy", "JJ"),
        ("dog", "NN"), (".", ".")]
    parser.tagged_parse(tagged_sent)

    # parse a batch of tokenized and tagged sentences
    tagged_sents = [tagged_sent, tagged_sent, tagged_sent]
    parser.tagged_parse_sents(tagged_sents)


if __name__ == '__main__':
    demo()
--------------------------------------------------------------------------------
/Parser/scripts/python/stanford_parser_trainer.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
Retrain the Stanford parser.
"""
import subprocess


class StanfordParserTrainer(object):

    def __init__(self, path_stanford, parser_jar='edu.stanford.nlp.parser.lexparser.LexicalizedParser',
                 max_mem='4g'):
        """
        Args:
            path_stanford: parent directory of stanford-parser.jar
            parser_jar: 'edu.stanford.nlp.parser.lexparser.LexicalizedParser'
            max_mem: maximum JVM heap size, default '4g'
        """
        self._path_stanford = path_stanford
        self._parser_jar = parser_jar
        self._max_mem = max_mem

    def train(self, path_train, path_model, model='goodFactored',
              path_test=None, path_test_result=None):
        """
        Train a model.

        Args:
            path_train: path to the ptb-format training file
            path_model: path where the trained model is saved
            model: training recipe, e.g. 'goodPCFG' or 'goodFactored'
            path_test: path to the test file, default is None
            path_test_result: path where the test output is written
        """
        print('training model...', end='')
        # -goodPCFG, -goodFactored, -ijcai03
        command = 'java -mx%s -Djava.ext.dirs=%s %s -evals "factDA,factCB,tsv" -%s ' + \
                  '-saveToSerializedFile %s -train %s 0'
        command %= (self._max_mem, self._path_stanford, self._parser_jar,
                    model, path_model, path_train)
        if path_test:
            assert path_test and path_test_result  # not None
            command += (' -testTreebank %s' % path_test)
        process = subprocess.Popen(
            command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        if not path_test:
            while True:
                line = process.stdout.readline()
                if not line:
                    break
                line = line.decode('utf-8')
                print(line, end='')
            print('done!')
            return
        with open(path_test_result, 'w', encoding='utf-8') as file_w:
            while True:
                line = process.stdout.readline()
                if not line:
                    break
                line = line.decode('utf-8')
                print(line, end='')
                file_w.write(line)
        print('done!')
        return


def train_model_demo():
    """
    Train a model.

    Paths used below:
        path_train: path to the training data
        path_model: path where the model is saved
        model: goodPCFG or goodFactored
    """
    path_stanford = './stanford-parser-full-2016-10-31'
    parser_jar = 'edu.stanford.nlp.parser.lexparser.LexicalizedParser'
    stanford_parser = StanfordParserTrainer(path_stanford, parser_jar)

    path_train = 'path_to_train_ptb'            # training file (ptb format)
    path_test = 'path_to_test_ptb'              # test file (may be None)
    path_test_result = 'path_to_test_result'    # test output (required when path_test is given)
    path_model = 'your_model.tar.gz'            # where to save the model
    model = 'goodFactored'                      # which training recipe to use

    stanford_parser.train(path_train, path_model, model=model,
                          path_test=path_test, path_test_result=path_test_result)


if __name__ == '__main__':
    train_model_demo()
--------------------------------------------------------------------------------
/Parser/treebanks/xml2ptb.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
Convert the xml annotation format to PTB format.

18,541 trees in total; 111 of them are malformed, leaving 18,430 valid trees.
"""
import re
import os
import sys
from time import time
from collections import defaultdict
# from nltk.tree import Tree
# from bs4 import BeautifulSoup


# maps
token_map_dict = {
    'COMMA': ',', 'PERIOD': '.',
    'LRB': '-LRB-', 'RRB': '-RRB-',
    'LQT': '``', 'RQT': '\'\'',
    'COLON': ':'
}
esc_char_map = defaultdict(set)
replace_map = {
    '(': '-LRB-', ')': '-RRB-',
    '[': '-LRB-', ']': '-RRB-',
    '{': '-LRB-', '}': '-RRB-'
}


# patterns (the tags follow the GENIA treebank xml markup)
pattern_sent = re.compile('<sentence .*?</sentence>', re.S)
pattern_tok = re.compile('<tok cat="(.*?)"[^>]*>(.*?)</tok>')
pattern_cat = re.compile('<(sentence id|cons cat)="(.*?)".*?>')
pattern_close = re.compile('</cons>|</sentence>')
pattern_space = re.compile('\s+')
pattern_cor = re.compile('\([a-zA-Z]+ \)')


def replace_pair(token):
    """
    Replace the brackets inside a token.
    """
    token = list(token)
    for i in range(len(token)):
        c = token[i]
        if c in replace_map:
            token[i] = replace_map[c]
    return ''.join(token)


def check_bracket(sentence):
    """
    Check whether the brackets are balanced.
    Return:
        bool
    """
    brackets = []
    for c in sentence:
        if c == '(':
            brackets.append(c)
        elif c == ')':
            if not len(brackets):
                return False
            brackets.pop()
    return not bool(len(brackets))


def handle_sentence(sentence):
    # replace all tokens
    token_iter = pattern_tok.finditer(sentence)
    token_rev = []
    for item in token_iter:
        start, end = item.start(), item.end()
        cat, name = item.groups(0)[:]
        cat_m = cat
        name = replace_pair(pattern_space.sub('', name))
        if cat in token_map_dict:
            cat_m = token_map_dict[cat]
        esc_char_map[cat_m].add(name)
        token_rev.append([start, end, cat_m, name])
    sentence = list(sentence)
    # rewrite in reverse order so that earlier offsets stay valid
    for item in token_rev[::-1]:
        start, end, cat, name = item[:]
        sub_str = ' (%s %s)' % (cat, name)
        sentence[start:end] = list(sub_str)
    # replace '</cons>' and '</sentence>' with ')'
    sentence = pattern_close.sub(')', ''.join(sentence))
    # replace '<cons cat="NP">' with '(NP ' (and '<sentence id=...>' with '(S ')
    cat_iter = pattern_cat.finditer(sentence)
    cat_signs = []
    for item in cat_iter:
        start, end = item.start(), item.end()
        tag, name = item.groups(0)[:]
        if tag == 'sentence id':
            name = 'S'
        cat_signs.append([start, end, name])
    sentence = list(sentence)
    for item in cat_signs[::-1]:
        start, end, name = item[:]
        sub_str = '(%s ' % name
        sentence[start:end] = list(sub_str)
    sentence = pattern_space.sub(' ', ''.join(sentence))
    sentence = pattern_cor.sub('', sentence)
    if not check_bracket(sentence):  # check the format
        return None
    # try:  # check
    #     Tree.fromstring(sentence)
    # except Exception as e:
    #     return None
    return sentence


total_count = 0


def handle_article(name, file_ptb_w):
    """
    Args:
        name: path of the xml file to process
        file_ptb_w: output file handle for the ptb file
    """
    with open(name, 'r', encoding='utf-8') as file_r:
        text = file_r.read()
    sentences = pattern_sent.findall(text)
    error_count, useful_count = 0, 0
    for sentence in sentences:
        sentence_h = handle_sentence(sentence)
        if not sentence_h:  # 111 sentences are malformed in total
            # print(sentence)
            error_count += 1
            continue
        useful_count += 1
        # if total_count == 478:
        #     print(sentence)
        #     print(sentence_h)
        #     exit()
        sentence_h = pattern_cor.sub('', sentence_h)  # why
        file_ptb_w.write('%s\n' % sentence_h)
    return useful_count, error_count


def main():
    root_xml = './GENIA_treebank_v1_xml/'
    root_ptb = './GENIA_treebank/'
    if not os.path.exists(root_ptb):
        os.mkdir(root_ptb)
    path_ptb = root_ptb + 'GENIA_SP.ptb'
    file_ptb_w = open(path_ptb, 'w', encoding='utf-8')
    file_list = os.listdir(root_xml)
    useful_count, error_count = 0, 0
    for i, name in enumerate(file_list):
        sys.stdout.write('processing xml files: {0}\r'.format(i+1))
        sys.stdout.flush()
        useful_count_, error_count_ = handle_article(root_xml + name, file_ptb_w)
        useful_count += useful_count_
        error_count += error_count_
    sys.stdout.write('processing xml files: {0}\n'.format(i+1))
    file_ptb_w.close()
    # character replacement map
    with open(root_ptb+'esc_char.map', 'w', encoding='utf-8') as file_w:
        for key in esc_char_map:
            char_list = esc_char_map[key]
            for c in char_list:
                file_w.write('%s\t%s\n' % (c, key))

    print('useful count: {0}'.format(useful_count))
    print('error count: {0}'.format(error_count))
    print('result has been saved to: {0}'.format(path_ptb))


if __name__ == '__main__':
    t0 = time()

    main()
    print('done in {0:.1f}s!'.format(time()-t0))
--------------------------------------------------------------------------------
/Parser/README.md:
--------------------------------------------------------------------------------
# Stanford Parser Usage

Notes on some ways to use the Stanford Parser; the version used here is 3.9.1.

**Official manual**: `./StanfordDependenciesManual-bookmark.pdf`

**Latest release**: [download](https://nlp.stanford.edu/software/lex-parser.shtml#Download)

**Official FAQ**: [FAQ](https://nlp.stanford.edu/software/parser-faq.html)
## 1. Using the pretrained models

The scripts mentioned below ship with the official download unless a path is given explicitly; those with a path live under `./scripts`.

### 1.1 Models provided by Stanford

The Stanford Parser ships with pretrained models; Tables 1 and 2 list the Chinese and English ones. The `Mixed [Chinese|English]` models are trained on mixed Chinese/English annotated corpora, the `wsj` models on the Wall Street Journal corpus, and the `xinhua` models on the Chinese Xinhua News corpus.

Among the Chinese models, only `xinhuaFactoredSegmenting.ser.gz` can parse unsegmented text directly; all the others expect pre-segmented input.

Background on the different model types: [PCFG parser](https://nlp.stanford.edu/~manning/papers/unlexicalized-parsing.pdf), [Factored parser](https://nlp.stanford.edu/~manning/papers/lex-parser.pdf)

**Table 1.** Chinese models

| Corpus | PCFG | Factored | FactoredSegmenting |
| ------------- | ------------- | ------------- | -------------|
| mixed Chinese | chinesePCFG.ser.gz | chineseFactored.ser.gz | |
| xinhua | xinhuaPCFG.ser.gz | xinhuaFactored.ser.gz | xinhuaFactoredSegmenting.ser.gz |

**Table 2.** English models

| Corpus | PCFG | Factored | RNN |
| ------------- | ------------- | ------------- | ------------- |
| mixed English | englishPCFG.ser.gz | englishFactored.ser.gz | englishRNN.ser.gz |
| wsj | wsjPCFG.ser.gz | wsjFactored.ser.gz | wsjRNN.ser.gz |

### 1.2 Ways to use a pretrained model

Stanford provides several ways to call the parser and interfaces for several programming languages; only four of them are collected here: `GUI`, `command line`, `Java` and `Python`.

**Note**: when parsing already tokenized sentences, the brackets must be escaped before running the Stanford Parser; see [./scripts/python/preprocessing.py](./scripts/python/preprocessing.py).

#### 1.2.1 GUI

- On Linux run `lexparser-gui.sh`
- On Windows run `lexparser-gui.bat`

#### 1.2.2 Command line

- Scripts `lexparser.sh` / `lexparser.bat` use the English model;

- Scripts `lexparser_lang.sh` / `lexparser_lang.bat` let you specify the language;

- See the official manual for the relevant options.

#### 1.2.3 Java

See `ParserDemo.java`, `ParserDemo2.java` and `DependencyParserDemo.java`.

#### 1.2.4 Python

- **NLTK interface**: see [./scripts/python/nltk_stanford_parser_demo.py](./scripts/python/nltk_stanford_parser_demo.py).

- **Calling the jar from Python**: see [./scripts/python/stanford_parser_demo.py](./scripts/python/stanford_parser_demo.py).

## 2. Output formats

See the `In practice` section of the official manual for details.

### 2.1 penn or dependency format?

**edu.stanford.nlp.parser.lexparser.LexicalizedParser**

To get different output formats, change `-outputFormat`:

- `penn` format: `-outputFormat "penn"`
- `dependency` format: `-outputFormat "typedDependencies"`
- both formats at once: `-outputFormat "penn,typedDependencies"`

Example command:

    # output both penn and dependency format
    java -mx200m edu.stanford.nlp.parser.lexparser.LexicalizedParser
    -retainTmpSubcategories -originalDependencies -outputFormat
    "penn,typedDependencies" -outputFormatOptions "basicDependencies"
    englishPCFG.ser.gz file.txt

### 2.2 Different flavours of dependencies

The Stanford Parser outputs `collapsed dependencies` by default; to get another representation, set `-outputFormatOptions` to one of:

- `basicDependencies`: Basic dependencies.
- `collapsedDependencies`: Collapsed dependencies (not necessarily a tree structure)
- `CCPropagatedDependencies`: Collapsed dependencies with propagation of conjunct dependencies (not necessarily a tree structure). **This representation is the default, if no option is specified.**
- `treeDependencies`: Collapsed dependencies that preserve a tree structure.
- `nonCollapsedDependencies`: Non-collapsed dependencies: basic dependencies as well as the extra ones which do not preserve a tree structure.
- `nonCollapsedDependenciesSeparated`: Non-collapsed dependencies where the basic dependencies are separated from the extra ones (by "======").
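
For instance, to request basic (rather than CC-propagated) dependencies from Python, the same kind of `subprocess` call used in the scripts of this repository works; the parser directory, model and file names below are placeholders:

    import subprocess

    command = (
        'java -mx2g -Djava.ext.dirs=./stanford-parser-full-2018-02-27 '
        'edu.stanford.nlp.parser.lexparser.LexicalizedParser '
        '-outputFormat "typedDependencies" -outputFormatOptions "basicDependencies" '
        '-sentences newline englishPCFG.ser.gz input.txt > input.dep')
    subprocess.call(command, shell=True)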
### 2.3 Converting penn format to other formats

**edu.stanford.nlp.trees.EnglishGrammaticalStructure**

If you already have a file in `penn treebank` format and need to convert it to `dependency` format, use this class.

Options:

- `-basic`: basic dependencies
- `-collapsed`: collapsed dependencies (not necessarily a tree structure)
- `-CCprocessed`: collapsed dependencies with propagation of conjunct dependencies (not necessarily a tree structure)
- `-collapsedTree`: collapsed dependencies that preserve a tree structure
- `-nonCollapsed`: non-collapsed dependencies: basic dependencies as well as the extra ones which do not preserve a tree structure
- `-conllx`: dependencies printed out in CoNLL X (CoNLL 2006) format
- `-originalDependencies`: output the original Stanford Dependencies instead of the new Universal Dependencies.

Example command:

    # convert penn format to dependency format; `-keepPunct` keeps the punctuation
    java edu.stanford.nlp.trees.EnglishGrammaticalStructure -treeFile
    file.tree -collapsedTree -CCprocessed -keepPunct

## 3. Retraining the parser

### 3.1 Command line

Script: `lexparser-lang-train-test.sh`. If you have treebank-annotated data, this script retrains a parsing model on it.

### 3.2 Python

#### 3.2.1 PCFG and Factored

See [./scripts/python/stanford_parser_trainer.py](./scripts/python/stanford_parser_trainer.py), or the official FAQ: [Can I train the parser?](https://nlp.stanford.edu/software/parser-faq.html#d)

To use the newly trained model, see 1.2.1-1.2.4.

#### 3.2.2 RNN

Official FAQ: [How do I train the RNN parser?](https://nlp.stanford.edu/software/parser-faq.html#rnn)

## 4. Treebank corpora

See [./treebanks/README.md](./treebanks/README.md).

## 5. Evaluation

### 5.1 CONLL-U Format

**CoNLL-U format description**: http://universaldependencies.org/docs/format.html

**Evaluation script download**: http://universaldependencies.org/conll17/evaluation.html
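
For a quick sanity check before reaching for the official script, attachment scores can also be computed directly from two CoNLL files. A rough sketch that only compares the HEAD and DEPREL columns and assumes the gold and system files are tokenized identically (`gold.conllu` and `system.conllu` are placeholder names):

    def conll_rows(path):
        # yield (HEAD, DEPREL) for every token line of a CoNLL-X/CoNLL-U file
        with open(path, encoding='utf-8') as f:
            for line in f:
                line = line.rstrip('\n')
                # skip blank lines, comments and multi-word token ranges
                if not line or line.startswith('#') or '-' in line.split('\t')[0]:
                    continue
                cols = line.split('\t')
                yield cols[6], cols[7]

    total = uas = las = 0
    for (g_head, g_rel), (s_head, s_rel) in zip(conll_rows('gold.conllu'),
                                                conll_rows('system.conllu')):
        total += 1
        if g_head == s_head:
            uas += 1
            if g_rel == s_rel:
                las += 1
    print('UAS: %.4f, LAS: %.4f' % (uas / total, las / total))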
--------------------------------------------------------------------------------
/Parser/scripts/python/stanford_parser_demo.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
stanford parser

class MyStanfordParser

    Parameters:
        path_stanford: parent directory of stanford-parser.jar
        parser_jar: 'edu.stanford.nlp.parser.lexparser.LexicalizedParser'
        max_mem: maximum JVM heap size, default '4g'

    Methods:

        predict: parse a file, producing ptb-format output

        predict_dep: parse a file, producing dependency-format output

        ptb2conll: convert ptb output to conll format
"""
import subprocess


def read_lines(path):
    lines = []
    file_r = open(path, 'r', encoding='utf-8')
    for line in file_r.readlines():
        line = line.strip()
        if line:
            lines.append(line)
    return lines


class MyStanfordParser(object):

    def __init__(self, path_stanford, parser_jar='edu.stanford.nlp.parser.lexparser.LexicalizedParser',
                 max_mem='4g'):
        """
        Args:
            path_stanford: parent directory of stanford-parser.jar
            parser_jar: 'edu.stanford.nlp.parser.lexparser.LexicalizedParser'
            max_mem: maximum JVM heap size, default '4g'
        """
        self._path_stanford = path_stanford
        self._parser_jar = parser_jar
        self._max_mem = max_mem

    def predict(self, path_model, path_data, path_result_ptb=None, path_result_conll=None):
        """
        parse sentences, penn format

        Args:
            path_model: path to the model file
            path_data: path to the file to parse
            path_result_ptb: where to write the ptb result
            path_result_conll: where to write the conll conversion

        Notes:
            1. with the '-tokenized' option the input is not tokenized again,
               but the brackets must already be escaped;
            2. the '-sentences newline' option parses one sentence per line
        """
        print('predict...', end='')
        command = 'java -mx%s -Djava.ext.dirs=%s %s -tokenized -retainTmpSubcategories ' + \
                  '-originalDependencies -outputFormat "penn" -outputFormatOptions ' + \
                  '"basicDependencies" -sentences newline %s %s'
        command %= (self._max_mem, self._path_stanford, self._parser_jar,
                    path_model, path_data)
        if path_result_ptb:
            command += ' > %s' % path_result_ptb
        return_code = subprocess.call(command, shell=True)
        print('done!')

        if path_result_conll:  # convert to conll format
            print('transform to conll format...', end='')
            self.ptb2conll(path_result_ptb, path_result_conll)
            print('done!')
        return return_code

    def predict_dep(self, path_model, path_data, path_result_ptb=None, path_result_conll=None):
        """
        parse sentences, get the dependencies

        Args:
            path_model: path to the model file
            path_data: path to the file to parse
            path_result_ptb: where to write the result
            path_result_conll: where to write the conll conversion

        Notes:
            1. with the '-tokenized' option the input is not tokenized again,
               but the brackets must already be escaped;
            2. the '-sentences newline' option parses one sentence per line
        """
        print('predict...', end='')
        command = 'java -mx%s -Djava.ext.dirs=%s %s -tokenized -retainTmpSubcategories ' + \
                  '-originalDependencies -outputFormat "typedDependencies" -outputFormatOptions ' + \
                  '"basicDependencies" -sentences newline %s %s'
        command %= (self._max_mem, self._path_stanford, self._parser_jar,
                    path_model, path_data)
        if path_result_ptb:
            command += ' > %s' % path_result_ptb
        return_code = subprocess.call(command, shell=True)
        print('done!')

        if path_result_conll:  # convert to conll format
            print('transform to conll format...', end='')
            self.ptb2conll(path_result_ptb, path_result_conll)
            print('done!')
        return return_code

    def ptb2conll(self, path_ptb, path_conll):
        """
        Convert ptb to conll format (see the official manual for other formats).
        Args:
            path_ptb: path to the ptb result
            path_conll: where to write the conll result
        """
        egs = 'edu.stanford.nlp.trees.EnglishGrammaticalStructure'
        command = 'java -mx%s -Djava.ext.dirs=%s %s -treeFile %s -conllx ' + \
                  '-basic -retainNPTmpSubcategories -makeCopulaHead -keepPunct > %s'
        command %= (self._max_mem, self._path_stanford, egs, path_ptb, path_conll)
        return subprocess.call(command, shell=True)


def replace_pair(token, replace_map):
    """
    Replace the brackets inside a token.
    """
    token = list(token)
    for i in range(len(token)):
        c = token[i]
        if c in replace_map:
            token[i] = replace_map[c]
    return ''.join(token)


def preprocessing_sentence(path_ori, path_result):
    """
    Escape the special brackets in the original sentences.
    Args:
        path_ori: str, path to the original file
        path_result: str, path to the processed file
    """
    replace_map = {
        '(': '-LRB-', ')': '-RRB-',
        '[': '-LRB-', ']': '-RRB-',
        '{': '-LRB-', '}': '-RRB-'}
    lines = read_lines(path_ori)  # could be changed to read line by line for large files
    file_w = open(path_result, 'w', encoding='utf-8')
    for line in lines:
        new_line = replace_pair(line, replace_map)
        file_w.write('%s\n' % new_line)
    file_w.close()


def parser_ptb_demo():
    """
    Parse the source text, ptb format.

    Note: with the '-tokenized' option (i.e. the input is already tokenized),
    the brackets in the sentences must be escaped first.
    """
    path_model = 'path_to_your_model'  # an official model or one you trained yourself

    # parse txt (change to your own paths)
    path_stanford = './stanford-parser-full-2018-02-27'
    parser_jar = 'edu.stanford.nlp.parser.lexparser.LexicalizedParser'
    stanford_parser = MyStanfordParser(path_stanford, parser_jar)

    path_txt = 'your.txt'             # file to parse
    path_result_ptb = 'your.ptb'      # ptb output path
    path_result_conll = 'your.conll'  # conll output path (may be None)
    stanford_parser.predict(path_model, path_txt, path_result_ptb, path_result_conll)


def parser_dep_demo():
    """
    Parse the source text, dependency format.

    Note: with the '-tokenized' option (i.e. the input is already tokenized),
    the brackets in the sentences must be escaped first.
    """
    path_model = 'path_to_your_model'  # an official model or one you trained yourself

    # parse txt (change to your own paths)
    path_stanford = './stanford-parser-full-2018-02-27'
    parser_jar = 'edu.stanford.nlp.parser.lexparser.LexicalizedParser'
    stanford_parser = MyStanfordParser(path_stanford, parser_jar)

    path_txt = 'your.txt'             # file to parse
    path_result_dep = 'your.dep'      # dependency output path
    path_result_conll = 'your.conll'  # conll output path (may be None)
    stanford_parser.predict_dep(path_model, path_txt, path_result_dep, path_result_conll)


if __name__ == '__main__':
    # Escape the brackets in the sentences; this is required when the '-tokenized'
    # option is used, i.e. when the input is already tokenized.
    # Skip this step if the file has already been processed.
    path_ori = 'path_to_ori_file'
    path_preprocess = 'path_to_preprocess_file'
    preprocessing_sentence(path_ori, path_preprocess)

    # parse sentences, ptb-format output
    parser_ptb_demo()

    # parse sentences, dependency-format output
    parser_dep_demo()
--------------------------------------------------------------------------------