├── Parser
│   ├── scripts
│   │   ├── java
│   │   │   └── .gitkeep
│   │   └── python
│   │       ├── preprocessing.py
│   │       ├── nltk_stanford_parser_demo.py
│   │       ├── stanford_parser_trainer.py
│   │       └── stanford_parser_demo.py
│   ├── StanfordDependenciesManual-bookmark.pdf
│   ├── treebanks
│   │   ├── README.md
│   │   └── xml2ptb.py
│   └── README.md
├── README.md
└── CoreNLP
    └── README.md

/Parser/scripts/java/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/Parser/StanfordDependenciesManual-bookmark.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liu-nlper/Stanford-NLP-Usage/HEAD/Parser/StanfordDependenciesManual-bookmark.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Stanford-NLP-Usage

Notes on how to use the [Stanford NLP](https://nlp.stanford.edu/software/) tools.

## 1. Stanford Core NLP

Usage of [Stanford Core NLP](https://github.com/liu-nlper/Stanford-NLP-Usage/tree/master/CoreNLP).

## 2. Stanford Parser

Usage of [Stanford Parser](https://github.com/liu-nlper/Stanford-NLP-Usage/tree/master/Parser).

Updating...
--------------------------------------------------------------------------------
/Parser/scripts/python/preprocessing.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
Escape the brackets in a sentence into the form the Stanford parser expects
(-LRB- / -RRB-).
"""

replace_map = {
    '(': '-LRB-', ')': '-RRB-',
    '[': '-LRB-', ']': '-RRB-',
    '{': '-LRB-', '}': '-RRB-'}


def preprocessing_sentence(sentence):
    """
    Replace the brackets in the original sentence.

    Args:
        sentence: str, original sentence
    Returns:
        sentence: str, processed sentence
    """
    chars = list(sentence)
    for i, c in enumerate(chars):
        if c in replace_map:
            chars[i] = replace_map[c]
    return ''.join(chars)


def demo():
    sentence = 'the { quick } brown ( fox ) jumps [ over ] the lazy dog.'
    print(preprocessing_sentence(sentence))


if __name__ == '__main__':
    demo()
--------------------------------------------------------------------------------
/CoreNLP/README.md:
--------------------------------------------------------------------------------
# Stanford Core NLP

[Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/) bundles word segmentation, POS tagging, NER, coreference, parsing and more in a single toolkit. This page collects how to use it on Chinese text.

## 1. Downloading CoreNLP

CoreNLP download page: [https://nlp.stanford.edu/software/corenlp-backup-download.html](https://nlp.stanford.edu/software/corenlp-backup-download.html)

**Step 1**: Download `stanford-corenlp-full-xxxx-xx-xx.zip`, where `xxxx-xx-xx` is the release date; the unpacked directory is referred to as `ROOT_CORENLP` below.

**Step 2**: Download the Chinese model package `stanford-chinese-corenlp-xxxx-xx-xx-models.jar` and place it inside `ROOT_CORENLP`.

## 2. Using CoreNLP

Python example:

```python
import subprocess

command = 'java -mx{0} -Djava.ext.dirs={1} edu.stanford.nlp.pipeline.StanfordCoreNLP ' + \
    '-language Chinese -encoding utf-8 -props StanfordCoreNLP-chinese.properties ' + \
    '-annotators tokenize,ssplit,pos,lemma,ner,depparse -ssplit.eolonly ' + \
    '-file {2} -outputFormat conllu -outputDirectory {3}'

MAX_MEM = '2g'
ROOT_CORENLP = 'your_corenlp_path'
PATH_SENT = 'your_file_path'
ROOT_CONLLU = 'your_result_root_path'

command = command.format(MAX_MEM, ROOT_CORENLP, PATH_SENT, ROOT_CONLLU)

return_code = subprocess.call(command, shell=True)
```

Where:

- `MAX_MEM`: maximum JVM heap size;
- `ROOT_CORENLP`: the directory CoreNLP was unpacked into;
- `PATH_SENT`: the file to process, one sentence per line;
- `ROOT_CONLLU`: output root directory; the result is written to `ROOT_CONLLU/PATH_SENT.conllu`.
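
Once the run finishes, the CoNLL-U output can be read back from Python. A minimal sketch — the path below assumes the input file was named `input.txt`, so the result is `input.txt.conllu` under `ROOT_CONLLU`; adjust it to your own setup:

```python
# Illustrative path: ROOT_CONLLU + the input file name + '.conllu'
path_conllu = 'your_result_root_path/input.txt.conllu'

sentences, current = [], []
with open(path_conllu, encoding='utf-8') as f:
    for line in f:
        line = line.rstrip('\n')
        if not line:                  # a blank line ends a sentence
            if current:
                sentences.append(current)
                current = []
        elif not line.startswith('#'):
            current.append(line.split('\t'))  # 10 tab-separated CoNLL-U columns
    if current:
        sentences.append(current)

for token in sentences[0]:
    # ID, FORM, UPOS, HEAD, DEPREL
    print(token[0], token[1], token[3], token[6], token[7])
```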
--------------------------------------------------------------------------------
/Parser/treebanks/README.md:
--------------------------------------------------------------------------------
# Treebank Corpus


# 1. GENIA treebank

GENIA Project homepage: http://www.geniaproject.org/

GENIA treebank download: [download](http://www.nactem.ac.uk/GENIA/current/GENIA-corpus/Treebank/GENIA_treebank_v1.tar.gz)

## 1.1 Directory structure

    .
    ├──GENIA_treebank_v1_xml/    # original GENIA treebank xml files
    │  └──xxx.xml                # treebank files in xml format
    ├──GENIA_treebank/           # converted treebank files
    │  ├──esc_char.map           # special-character mapping table
    │  └──GENIA_SP.ptb           # ptb-format file
    ├──xml2ptb.py                # xml-to-ptb conversion script
    └──README.md                 # README.md

## 1.2 Converting xml to ptb

Unpack the downloaded `tar.gz` into `./GENIA_treebank_v1_xml` and run the following script.

    $ python3 xml2ptb.py

When it finishes (~6 s), `GENIA_SP.ptb` and `esc_char.map` are written to `./GENIA_treebank/`; `GENIA_SP.ptb` is the annotated treebank in `penn treebank` format.

## 1.3 Corpus statistics

The GENIA treebank annotates 18,541 sentences; 111 of them fail the conversion to ptb format (unbalanced brackets), leaving 18,430 valid trees.


# 2. Other treebanks

## 2.1 English

### 2.1.1 Penn English Treebank

**Not freely available**: http://catalog.ldc.upenn.edu/ldc99t42

NLTK ships a 10% sample of the Penn Treebank, accessible via:

    >>> from nltk.corpus import treebank
    >>> treebank.parsed_sents()
    >>> len(treebank.parsed_sents())  # 3914

### 2.1.2 QuestionBank

https://nlp.stanford.edu/data/QuestionBank-Stanford.shtml

### 2.1.3 Sentiment Treebank

https://nlp.stanford.edu/sentiment/index.html

## 2.2 Chinese

### 2.2.1 Penn Chinese Treebank

**Not freely available**: https://catalog.ldc.upenn.edu/ldc2013t21

### 2.2.2 Sinica Treebank

A Traditional Chinese treebank provided by Academia Sinica, with 5,346 parsed sentences; it can be accessed through the NLTK interface.

    >>> from nltk.corpus import sinica_treebank
    >>> sinica_treebank.parsed_sents()
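
These corpora can also be dumped to a one-tree-per-line PTB file (the same layout `xml2ptb.py` produces), which is handy if you want to feed a sample into the training script under `../scripts/python/`. A small sketch using the Sinica sample; `sinica.ptb` is just an illustrative output name:

    >>> from nltk.corpus import sinica_treebank
    >>> with open('sinica.ptb', 'w', encoding='utf-8') as f:
    ...     for tree in sinica_treebank.parsed_sents():
    ...         f.write(' '.join(str(tree).split()) + '\n')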
## 2.3 More...

https://en.wikipedia.org/wiki/Treebank#Syntactic_treebanks
--------------------------------------------------------------------------------
/Parser/scripts/python/nltk_stanford_parser_demo.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
For more information see: nltk/parse/stanford.py
"""

from nltk.parse.stanford import StanfordParser


def demo():
    # path to stanford-parser.jar
    path_to_jar = 'path to jar'

    # path to stanford-parser-x.x.x-models.jar, where `x.x.x` is the version number
    path_to_models_jar = 'path to model jar'

    # path to the model, e.g. model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz';
    # to parse Chinese, replace `englishPCFG.ser.gz` with the corresponding Chinese model
    model_path = 'path to model'

    parser = StanfordParser(
        path_to_jar=path_to_jar,
        path_to_models_jar=path_to_models_jar,
        model_path=model_path,
        verbose=False,
        java_options='-mx1000m',
        corenlp_options=''
    )

    # parse a single sentence
    # Use StanfordParser to parse a sentence. Takes a sentence as a string;
    # before parsing, it will be automatically tokenized and tagged by
    # the Stanford Parser.
    parser.raw_parse('the quick brown fox jumps over the lazy dog.')

    # parse several sentences
    sentences = [
        'the quick brown fox jumps over the lazy dog.',
        'the quick brown fox jumps over the lazy dog.']
    parser.raw_parse_sents(sentences)

    # parse a sentence that is already tokenized and tagged
    # Use StanfordParser to parse a sentence. Takes a sentence as a list
    # of (word, tag) tuples; the sentence must have already been tokenized
    # and tagged.
    tagged_sent = [
        ("The", "DT"), ("quick", "JJ"), ("brown", "JJ"), ("fox", "NN"),
        ("jumped", "VBD"), ("over", "IN"), ("the", "DT"), ("lazy", "JJ"),
        ("dog", "NN"), (".", ".")]
    parser.tagged_parse(tagged_sent)

    # parse a batch of tokenized and tagged sentences
    tagged_sents = [tagged_sent, tagged_sent, tagged_sent]
    parser.tagged_parse_sents(tagged_sents)


if __name__ == '__main__':
    demo()
--------------------------------------------------------------------------------
/Parser/scripts/python/stanford_parser_trainer.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
Retrain the Stanford parser.
"""
import subprocess


class StanfordParserTrainer(object):

    def __init__(self, path_stanford, parser_jar='edu.stanford.nlp.parser.lexparser.LexicalizedParser',
                 max_mem='4g'):
        """
        Args:
            path_stanford: parent directory of stanford-parser.jar
            parser_jar: 'edu.stanford.nlp.parser.lexparser.LexicalizedParser'
            max_mem: maximum JVM heap size, default '4g'
        """
        self._path_stanford = path_stanford
        self._parser_jar = parser_jar
        self._max_mem = max_mem

    def train(self, path_train, path_model, model='goodFactored',
              path_test=None, path_test_result=None):
        """
        Train a model.

        Args:
            path_train: path to the ptb-format training file
            path_model: path where the trained model is saved
            model: training recipe, e.g. 'goodPCFG' or 'goodFactored'
            path_test: path to the test file, default is None
            path_test_result: path where the test output is written
        """
        print('training model...', end='')
        # -goodPCFG, -goodFactored, -ijcai03
        command = 'java -mx%s -Djava.ext.dirs=%s %s -evals "factDA,factCB,tsv" -%s ' + \
                  '-saveToSerializedFile %s -train %s 0'
        command %= (self._max_mem, self._path_stanford, self._parser_jar,
                    model, path_model, path_train)
        if path_test:
            assert path_test and path_test_result  # not None
            command += (' -testTreebank %s' % path_test)
        process = subprocess.Popen(
            command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        if not path_test:
            while True:
                line = process.stdout.readline()
                if not line:
                    break
                line = line.decode('utf-8')
                print(line, end='')
            print('done!')
            return
        with open(path_test_result, 'w', encoding='utf-8') as file_w:
            while True:
                line = process.stdout.readline()
                if not line:
                    break
                line = line.decode('utf-8')
                print(line, end='')
                file_w.write(line)
        print('done!')
        return


def train_model_demo():
    """
    Train a model.

    Paths used below:
        path_train: path to the training data
        path_model: path where the model is saved
        model: goodPCFG or goodFactored
    """
    path_stanford = './stanford-parser-full-2016-10-31'
    parser_jar = 'edu.stanford.nlp.parser.lexparser.LexicalizedParser'
    stanford_parser = StanfordParserTrainer(path_stanford, parser_jar)

    path_train = 'path_to_train_ptb'            # training file (ptb format)
    path_test = 'path_to_test_ptb'              # test file (may be None)
    path_test_result = 'path_to_test_result'    # test output (required when path_test is given)
    path_model = 'your_model.tar.gz'            # where to save the model
    model = 'goodFactored'                      # which training recipe to use

    stanford_parser.train(path_train, path_model, model=model,
                          path_test=path_test, path_test_result=path_test_result)


if __name__ == '__main__':
    train_model_demo()
--------------------------------------------------------------------------------
/Parser/treebanks/xml2ptb.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
Convert the xml annotation format to PTB format.

18,541 trees in total; 111 of them are malformed, leaving 18,430 valid trees.
"""
import re
import os
import sys
from time import time
from collections import defaultdict
# from nltk.tree import Tree
# from bs4 import BeautifulSoup


# maps
token_map_dict = {
    'COMMA': ',', 'PERIOD': '.',
    'LRB': '-LRB-', 'RRB': '-RRB-',
    'LQT': '``', 'RQT': '\'\'',
    'COLON': ':'
}
esc_char_map = defaultdict(set)
replace_map = {
    '(': '-LRB-', ')': '-RRB-',
    '[': '-LRB-', ']': '-RRB-',
    '{': '-LRB-', '}': '-RRB-'
}


# patterns (the tags follow the GENIA treebank xml markup)
pattern_sent = re.compile('<sentence .*?</sentence>', re.S)
pattern_tok = re.compile('<tok cat="(.*?)"[^>]*>(.*?)</tok>')
pattern_cat = re.compile('<(sentence id|cons cat)="(.*?)".*?>')
pattern_close = re.compile('</cons>|</sentence>')
pattern_space = re.compile('\s+')
pattern_cor = re.compile('\([a-zA-Z]+ \)')


def replace_pair(token):
    """
    Replace the brackets inside a token.
    """
    token = list(token)
    for i in range(len(token)):
        c = token[i]
        if c in replace_map:
            token[i] = replace_map[c]
    return ''.join(token)


def check_bracket(sentence):
    """
    Check whether the brackets are balanced.
    Return:
        bool
    """
    brackets = []
    for c in sentence:
        if c == '(':
            brackets.append(c)
        elif c == ')':
            if not len(brackets):
                return False
            brackets.pop()
    return not bool(len(brackets))


def handle_sentence(sentence):
    # replace all tokens
    token_iter = pattern_tok.finditer(sentence)
    token_rev = []
    for item in token_iter:
        start, end = item.start(), item.end()
        cat, name = item.groups(0)[:]
        cat_m = cat
        name = replace_pair(pattern_space.sub('', name))
        if cat in token_map_dict:
            cat_m = token_map_dict[cat]
        esc_char_map[cat_m].add(name)
        token_rev.append([start, end, cat_m, name])
    sentence = list(sentence)
    # rewrite in reverse order so that earlier offsets stay valid
    for item in token_rev[::-1]:
        start, end, cat, name = item[:]
        sub_str = ' (%s %s)' % (cat, name)
        sentence[start:end] = list(sub_str)
    # replace '</cons>' and '</sentence>' with ')'
    sentence = pattern_close.sub(')', ''.join(sentence))
    # replace '<cons cat="NP">' with '(NP ' (and '<sentence id=...>' with '(S ')
    cat_iter = pattern_cat.finditer(sentence)
    cat_signs = []
    for item in cat_iter:
        start, end = item.start(), item.end()
        tag, name = item.groups(0)[:]
        if tag == 'sentence id':
            name = 'S'
        cat_signs.append([start, end, name])
    sentence = list(sentence)
    for item in cat_signs[::-1]:
        start, end, name = item[:]
        sub_str = '(%s ' % name
        sentence[start:end] = list(sub_str)
    sentence = pattern_space.sub(' ', ''.join(sentence))
    sentence = pattern_cor.sub('', sentence)
    if not check_bracket(sentence):  # check the format
        return None
    # try:  # check
    #     Tree.fromstring(sentence)
    # except Exception as e:
    #     return None
    return sentence


total_count = 0


def handle_article(name, file_ptb_w):
    """
    Args:
        name: path of the xml file to process
        file_ptb_w: output file handle for the ptb file
    """
    with open(name, 'r', encoding='utf-8') as file_r:
        text = file_r.read()
    sentences = pattern_sent.findall(text)
    error_count, useful_count = 0, 0
    for sentence in sentences:
        sentence_h = handle_sentence(sentence)
        if not sentence_h:  # 111 sentences are malformed in total
            # print(sentence)
            error_count += 1
            continue
        useful_count += 1
        # if total_count == 478:
        #     print(sentence)
        #     print(sentence_h)
        #     exit()
        sentence_h = pattern_cor.sub('', sentence_h)  # why
        file_ptb_w.write('%s\n' % sentence_h)
    return useful_count, error_count


def main():
    root_xml = './GENIA_treebank_v1_xml/'
    root_ptb = './GENIA_treebank/'
    if not os.path.exists(root_ptb):
        os.mkdir(root_ptb)
    path_ptb = root_ptb + 'GENIA_SP.ptb'
    file_ptb_w = open(path_ptb, 'w', encoding='utf-8')
    file_list = os.listdir(root_xml)
    useful_count, error_count = 0, 0
    for i, name in enumerate(file_list):
        sys.stdout.write('processing xml files: {0}\r'.format(i+1))
        sys.stdout.flush()
        useful_count_, error_count_ = handle_article(root_xml + name, file_ptb_w)
        useful_count += useful_count_
        error_count += error_count_
    sys.stdout.write('processing xml files: {0}\n'.format(i+1))
    file_ptb_w.close()
    # character replacement map
    with open(root_ptb+'esc_char.map', 'w', encoding='utf-8') as file_w:
        for key in esc_char_map:
            char_list = esc_char_map[key]
            for c in char_list:
                file_w.write('%s\t%s\n' % (c, key))

    print('useful count: {0}'.format(useful_count))
    print('error count: {0}'.format(error_count))
    print('result has been saved to: {0}'.format(path_ptb))


if __name__ == '__main__':
    t0 = time()

    main()
    print('done in {0:.1f}s!'.format(time()-t0))
--------------------------------------------------------------------------------
/Parser/README.md:
--------------------------------------------------------------------------------
# Stanford Parser Usage

Notes on some ways to use the Stanford Parser; the version used here is 3.9.1.

**Official manual**: `./StanfordDependenciesManual-bookmark.pdf`

**Latest release**: [download](https://nlp.stanford.edu/software/lex-parser.shtml#Download)

**Official FAQ**: [FAQ](https://nlp.stanford.edu/software/parser-faq.html)
## 1. Using the pretrained models

The scripts mentioned below ship with the official download unless a path is given explicitly; those with a path live under `./scripts`.

### 1.1 Models provided by Stanford

The Stanford Parser ships with pretrained models; Tables 1 and 2 list the Chinese and English ones. The `Mixed [Chinese|English]` models are trained on mixed Chinese/English annotated corpora, the `wsj` models on the Wall Street Journal corpus, and the `xinhua` models on the Chinese Xinhua News corpus.

Among the Chinese models, only `xinhuaFactoredSegmenting.ser.gz` can parse unsegmented text directly; all the others expect pre-segmented input.

Background on the different model types: [PCFG parser](https://nlp.stanford.edu/~manning/papers/unlexicalized-parsing.pdf), [Factored parser](https://nlp.stanford.edu/~manning/papers/lex-parser.pdf)

**Table 1.** Chinese models

| Corpus | PCFG | Factored | FactoredSegmenting |
| ------------- | ------------- | ------------- | -------------|
| mixed Chinese | chinesePCFG.ser.gz | chineseFactored.ser.gz | |
| xinhua | xinhuaPCFG.ser.gz | xinhuaFactored.ser.gz | xinhuaFactoredSegmenting.ser.gz |

**Table 2.** English models

| Corpus | PCFG | Factored | RNN |
| ------------- | ------------- | ------------- | ------------- |
| mixed English | englishPCFG.ser.gz | englishFactored.ser.gz | englishRNN.ser.gz |
| wsj | wsjPCFG.ser.gz | wsjFactored.ser.gz | wsjRNN.ser.gz |

### 1.2 Ways to use a pretrained model

Stanford provides several ways to call the parser and interfaces for several programming languages; only four of them are collected here: `GUI`, `command line`, `Java` and `Python`.

**Note**: when parsing already tokenized sentences, the brackets must be escaped before running the Stanford Parser; see [./scripts/python/preprocessing.py](./scripts/python/preprocessing.py).

#### 1.2.1 GUI

- On Linux run `lexparser-gui.sh`
- On Windows run `lexparser-gui.bat`

#### 1.2.2 Command line

- Scripts `lexparser.sh` / `lexparser.bat` use the English model;

- Scripts `lexparser_lang.sh` / `lexparser_lang.bat` let you specify the language;

- See the official manual for the relevant options.

#### 1.2.3 Java

See `ParserDemo.java`, `ParserDemo2.java` and `DependencyParserDemo.java`.

#### 1.2.4 Python

- **NLTK interface**: see [./scripts/python/nltk_stanford_parser_demo.py](./scripts/python/nltk_stanford_parser_demo.py).

- **Calling the jar from Python**: see [./scripts/python/stanford_parser_demo.py](./scripts/python/stanford_parser_demo.py).

## 2. Output formats

See the `In practice` section of the official manual for details.

### 2.1 penn or dependency format?

**edu.stanford.nlp.parser.lexparser.LexicalizedParser**

To get different output formats, change `-outputFormat`:

- `penn` format: `-outputFormat "penn"`
- `dependency` format: `-outputFormat "typedDependencies"`
- both formats at once: `-outputFormat "penn,typedDependencies"`

Example command:

    # output both penn and dependency format
    java -mx200m edu.stanford.nlp.parser.lexparser.LexicalizedParser
    -retainTmpSubcategories -originalDependencies -outputFormat
    "penn,typedDependencies" -outputFormatOptions "basicDependencies"
    englishPCFG.ser.gz file.txt

### 2.2 Different flavours of dependencies

The Stanford Parser outputs `collapsed dependencies` by default; to get another representation, set `-outputFormatOptions` to one of:

- `basicDependencies`: Basic dependencies.
- `collapsedDependencies`: Collapsed dependencies (not necessarily a tree structure)
- `CCPropagatedDependencies`: Collapsed dependencies with propagation of conjunct dependencies (not necessarily a tree structure). **This representation is the default, if no option is specified.**
- `treeDependencies`: Collapsed dependencies that preserve a tree structure.
- `nonCollapsedDependencies`: Non-collapsed dependencies: basic dependencies as well as the extra ones which do not preserve a tree structure.
- `nonCollapsedDependenciesSeparated`: Non-collapsed dependencies where the basic dependencies are separated from the extra ones (by "======").
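
For instance, to request basic (rather than CC-propagated) dependencies from Python, the same kind of `subprocess` call used in the scripts of this repository works; the parser directory, model and file names below are placeholders:

    import subprocess

    command = (
        'java -mx2g -Djava.ext.dirs=./stanford-parser-full-2018-02-27 '
        'edu.stanford.nlp.parser.lexparser.LexicalizedParser '
        '-outputFormat "typedDependencies" -outputFormatOptions "basicDependencies" '
        '-sentences newline englishPCFG.ser.gz input.txt > input.dep')
    subprocess.call(command, shell=True)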
### 2.3 Converting penn format to other formats

**edu.stanford.nlp.trees.EnglishGrammaticalStructure**

If you already have a file in `penn treebank` format and need to convert it to `dependency` format, use this class.

Options:

- `-basic`: basic dependencies
- `-collapsed`: collapsed dependencies (not necessarily a tree structure)
- `-CCprocessed`: collapsed dependencies with propagation of conjunct dependencies (not necessarily a tree structure)
- `-collapsedTree`: collapsed dependencies that preserve a tree structure
- `-nonCollapsed`: non-collapsed dependencies: basic dependencies as well as the extra ones which do not preserve a tree structure
- `-conllx`: dependencies printed out in CoNLL X (CoNLL 2006) format
- `-originalDependencies`: output the original Stanford Dependencies instead of the new Universal Dependencies.

Example command:

    # convert penn format to dependency format; `-keepPunct` keeps the punctuation
    java edu.stanford.nlp.trees.EnglishGrammaticalStructure -treeFile
    file.tree -collapsedTree -CCprocessed -keepPunct

## 3. Retraining the parser

### 3.1 Command line

Script: `lexparser-lang-train-test.sh`. If you have treebank-annotated data, this script retrains a parsing model on it.

### 3.2 Python

#### 3.2.1 PCFG and Factored

See [./scripts/python/stanford_parser_trainer.py](./scripts/python/stanford_parser_trainer.py), or the official FAQ: [Can I train the parser?](https://nlp.stanford.edu/software/parser-faq.html#d)

To use the newly trained model, see 1.2.1-1.2.4.

#### 3.2.2 RNN

Official FAQ: [How do I train the RNN parser?](https://nlp.stanford.edu/software/parser-faq.html#rnn)

## 4. Treebank corpora

See [./treebanks/README.md](./treebanks/README.md).

## 5. Evaluation

### 5.1 CONLL-U Format

**CoNLL-U format description**: http://universaldependencies.org/docs/format.html

**Evaluation script download**: http://universaldependencies.org/conll17/evaluation.html
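
For a quick sanity check before reaching for the official script, attachment scores can also be computed directly from two CoNLL files. A rough sketch that only compares the HEAD and DEPREL columns and assumes the gold and system files are tokenized identically (`gold.conllu` and `system.conllu` are placeholder names):

    def conll_rows(path):
        # yield (HEAD, DEPREL) for every token line of a CoNLL-X/CoNLL-U file
        with open(path, encoding='utf-8') as f:
            for line in f:
                line = line.rstrip('\n')
                # skip blank lines, comments and multi-word token ranges
                if not line or line.startswith('#') or '-' in line.split('\t')[0]:
                    continue
                cols = line.split('\t')
                yield cols[6], cols[7]

    total = uas = las = 0
    for (g_head, g_rel), (s_head, s_rel) in zip(conll_rows('gold.conllu'),
                                                conll_rows('system.conllu')):
        total += 1
        if g_head == s_head:
            uas += 1
            if g_rel == s_rel:
                las += 1
    print('UAS: %.4f, LAS: %.4f' % (uas / total, las / total))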
--------------------------------------------------------------------------------
/Parser/scripts/python/stanford_parser_demo.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
stanford parser

class MyStanfordParser

    Parameters:
        path_stanford: parent directory of stanford-parser.jar
        parser_jar: 'edu.stanford.nlp.parser.lexparser.LexicalizedParser'
        max_mem: maximum JVM heap size, default '4g'

    Methods:

        predict: parse a file, producing ptb-format output

        predict_dep: parse a file, producing dependency-format output

        ptb2conll: convert ptb output to conll format
"""
import subprocess


def read_lines(path):
    lines = []
    file_r = open(path, 'r', encoding='utf-8')
    for line in file_r.readlines():
        line = line.strip()
        if line:
            lines.append(line)
    return lines


class MyStanfordParser(object):

    def __init__(self, path_stanford, parser_jar='edu.stanford.nlp.parser.lexparser.LexicalizedParser',
                 max_mem='4g'):
        """
        Args:
            path_stanford: parent directory of stanford-parser.jar
            parser_jar: 'edu.stanford.nlp.parser.lexparser.LexicalizedParser'
            max_mem: maximum JVM heap size, default '4g'
        """
        self._path_stanford = path_stanford
        self._parser_jar = parser_jar
        self._max_mem = max_mem

    def predict(self, path_model, path_data, path_result_ptb=None, path_result_conll=None):
        """
        parse sentences, penn format

        Args:
            path_model: path to the model file
            path_data: path to the file to parse
            path_result_ptb: where to write the ptb result
            path_result_conll: where to write the conll conversion

        Notes:
            1. with the '-tokenized' option the input is not tokenized again,
               but the brackets must already be escaped;
            2. the '-sentences newline' option parses one sentence per line
        """
        print('predict...', end='')
        command = 'java -mx%s -Djava.ext.dirs=%s %s -tokenized -retainTmpSubcategories ' + \
                  '-originalDependencies -outputFormat "penn" -outputFormatOptions ' + \
                  '"basicDependencies" -sentences newline %s %s'
        command %= (self._max_mem, self._path_stanford, self._parser_jar,
                    path_model, path_data)
        if path_result_ptb:
            command += ' > %s' % path_result_ptb
        return_code = subprocess.call(command, shell=True)
        print('done!')

        if path_result_conll:  # convert to conll format
            print('transform to conll format...', end='')
            self.ptb2conll(path_result_ptb, path_result_conll)
            print('done!')
        return return_code

    def predict_dep(self, path_model, path_data, path_result_ptb=None, path_result_conll=None):
        """
        parse sentences, get the dependencies

        Args:
            path_model: path to the model file
            path_data: path to the file to parse
            path_result_ptb: where to write the result
            path_result_conll: where to write the conll conversion

        Notes:
            1. with the '-tokenized' option the input is not tokenized again,
               but the brackets must already be escaped;
            2. the '-sentences newline' option parses one sentence per line
        """
        print('predict...', end='')
        command = 'java -mx%s -Djava.ext.dirs=%s %s -tokenized -retainTmpSubcategories ' + \
                  '-originalDependencies -outputFormat "typedDependencies" -outputFormatOptions ' + \
                  '"basicDependencies" -sentences newline %s %s'
        command %= (self._max_mem, self._path_stanford, self._parser_jar,
                    path_model, path_data)
        if path_result_ptb:
            command += ' > %s' % path_result_ptb
        return_code = subprocess.call(command, shell=True)
        print('done!')

        if path_result_conll:  # convert to conll format
            print('transform to conll format...', end='')
            self.ptb2conll(path_result_ptb, path_result_conll)
            print('done!')
        return return_code

    def ptb2conll(self, path_ptb, path_conll):
        """
        Convert ptb to conll format (see the official manual for other formats).
        Args:
            path_ptb: path to the ptb result
            path_conll: where to write the conll result
        """
        egs = 'edu.stanford.nlp.trees.EnglishGrammaticalStructure'
        command = 'java -mx%s -Djava.ext.dirs=%s %s -treeFile %s -conllx ' + \
                  '-basic -retainNPTmpSubcategories -makeCopulaHead -keepPunct > %s'
        command %= (self._max_mem, self._path_stanford, egs, path_ptb, path_conll)
        return subprocess.call(command, shell=True)


def replace_pair(token, replace_map):
    """
    Replace the brackets inside a token.
    """
    token = list(token)
    for i in range(len(token)):
        c = token[i]
        if c in replace_map:
            token[i] = replace_map[c]
    return ''.join(token)


def preprocessing_sentence(path_ori, path_result):
    """
    Escape the special brackets in the original sentences.
    Args:
        path_ori: str, path to the original file
        path_result: str, path to the processed file
    """
    replace_map = {
        '(': '-LRB-', ')': '-RRB-',
        '[': '-LRB-', ']': '-RRB-',
        '{': '-LRB-', '}': '-RRB-'}
    lines = read_lines(path_ori)  # could be changed to read line by line for large files
    file_w = open(path_result, 'w', encoding='utf-8')
    for line in lines:
        new_line = replace_pair(line, replace_map)
        file_w.write('%s\n' % new_line)
    file_w.close()


def parser_ptb_demo():
    """
    Parse the source text, ptb format.

    Note: with the '-tokenized' option (i.e. the input is already tokenized),
    the brackets in the sentences must be escaped first.
    """
    path_model = 'path_to_your_model'  # an official model or one you trained yourself

    # parse txt (change to your own paths)
    path_stanford = './stanford-parser-full-2018-02-27'
    parser_jar = 'edu.stanford.nlp.parser.lexparser.LexicalizedParser'
    stanford_parser = MyStanfordParser(path_stanford, parser_jar)

    path_txt = 'your.txt'             # file to parse
    path_result_ptb = 'your.ptb'      # ptb output path
    path_result_conll = 'your.conll'  # conll output path (may be None)
    stanford_parser.predict(path_model, path_txt, path_result_ptb, path_result_conll)


def parser_dep_demo():
    """
    Parse the source text, dependency format.

    Note: with the '-tokenized' option (i.e. the input is already tokenized),
    the brackets in the sentences must be escaped first.
    """
    path_model = 'path_to_your_model'  # an official model or one you trained yourself

    # parse txt (change to your own paths)
    path_stanford = './stanford-parser-full-2018-02-27'
    parser_jar = 'edu.stanford.nlp.parser.lexparser.LexicalizedParser'
    stanford_parser = MyStanfordParser(path_stanford, parser_jar)

    path_txt = 'your.txt'             # file to parse
    path_result_dep = 'your.dep'      # dependency output path
    path_result_conll = 'your.conll'  # conll output path (may be None)
    stanford_parser.predict_dep(path_model, path_txt, path_result_dep, path_result_conll)


if __name__ == '__main__':
    # Escape the brackets in the sentences; this is required when the '-tokenized'
    # option is used, i.e. when the input is already tokenized.
    # Skip this step if the file has already been processed.
    path_ori = 'path_to_ori_file'
    path_preprocess = 'path_to_preprocess_file'
    preprocessing_sentence(path_ori, path_preprocess)

    # parse sentences, ptb-format output
    parser_ptb_demo()

    # parse sentences, dependency-format output
    parser_dep_demo()
--------------------------------------------------------------------------------