├── .DS_Store
├── .gitignore
├── .vscode
│   ├── launch.json
│   └── settings.json
├── StanfordDependTree.py
├── StanfordSentTree.py
├── jiebaSeg.py
├── ltpEntity.py
├── ltpPOS.py
├── ltpParser.py
├── ltpSeg.py
├── ltpSementic.py
├── ner_test
├── postest
├── result
├── stanford.py
├── stanford.pyc
├── stanfordNER.py
├── stanfordPOS.py
└── userdict

--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cgnerds/nlpBookStudy/fa603bad297fb01994922f529730210dc5bcc2bf/.DS_Store

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
stanford.pyc

--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python: Current File",
            "type": "python",
            "request": "launch",
            "program": "${file}"
        },
        {
            "name": "Python: Attach",
            "type": "python",
            "request": "attach",
            "localRoot": "${workspaceFolder}",
            "remoteRoot": "${workspaceFolder}",
            "port": 3000,
            "secret": "my_secret",
            "host": "localhost"
        },
        {
            "name": "Python: Terminal (integrated)",
            "type": "python",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal"
        },
        {
            "name": "Python: Terminal (external)",
            "type": "python",
            "request": "launch",
            "program": "${file}",
            "console": "externalTerminal"
        },
        {
            "name": "Python: Django",
            "type": "python",
            "request": "launch",
            "program": "${workspaceFolder}/manage.py",
            "args": [
                "runserver",
                "--noreload",
                "--nothreading"
            ],
            "debugOptions": [
                "RedirectOutput",
                "Django"
            ]
        },
        {
            "name": "Python: Flask (0.11.x or later)",
            "type": "python",
            "request": "launch",
            "module": "flask",
            "env": {
                "FLASK_APP": "${workspaceFolder}/app.py"
            },
            "args": [
                "run",
                "--no-debugger",
                "--no-reload"
            ]
        },
        {
            "name": "Python: Module",
            "type": "python",
            "request": "launch",
            "module": "module.name"
        },
        {
            "name": "Python: Pyramid",
            "type": "python",
            "request": "launch",
            "args": [
                "${workspaceFolder}/development.ini"
            ],
            "debugOptions": [
                "RedirectOutput",
                "Pyramid"
            ]
        },
        {
            "name": "Python: Watson",
            "type": "python",
            "request": "launch",
            "program": "${workspaceFolder}/console.py",
            "args": [
                "dev",
                "runserver",
                "--noreload=True"
            ]
        },
        {
            "name": "Python: All debug Options",
            "type": "python",
            "request": "launch",
            "pythonPath": "${config:python.pythonPath}",
            "program": "${file}",
            "module": "module.name",
            "env": {
                "VAR1": "1",
                "VAR2": "2"
            },
            "envFile": "${workspaceFolder}/.env",
            "args": [
                "arg1",
                "arg2"
            ],
            "debugOptions": [
                "RedirectOutput"
            ]
        }
    ]
}

--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
{
    "python.pythonPath": "/usr/local/bin/python"
}

--------------------------------------------------------------------------------
/StanfordDependTree.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
from stanford import *

# Set up a UTF-8 output environment (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')

# Configure the Stanford parser
root = '../stanford-corenlp/'
jarpath = root + "stanford-parser.jar"
modelpath = root + "models/lexparser/chinesePCFG.ser.gz"
opttype = 'typedDependencies'  # either 'penn' or 'typedDependencies'
parser = StanfordParser(modelpath, jarpath, opttype)
result = parser.parse("罗马尼亚 的 首都 是 布加勒斯特 。")
print result

--------------------------------------------------------------------------------
/StanfordSentTree.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
from nltk.tree import Tree  # NLTK tree structure
from stanford import *

# Set up a UTF-8 output environment (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')

# Configure the Stanford parser
root = '../stanford-corenlp/'
jarpath = root + "stanford-parser.jar"
modelpath = root + "models/lexparser/chinesePCFG.ser.gz"
opttype = 'penn'  # Penn Treebank bracketed format
parser = StanfordParser(modelpath, jarpath, opttype)
result = parser.parse("罗马尼亚 的 首都 是 布加勒斯特 。")
print result
tree = Tree.fromstring(result)
tree.draw()

--------------------------------------------------------------------------------
/jiebaSeg.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
import jieba  # jieba Chinese segmentation library

# Set up a UTF-8 output environment (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')

# jieba segmentation -- full mode
jieba.load_userdict("userdict")
sent = "在包含问题的所有解的解空间树中,按照深度优先搜索的策略,从根节点出发深度探索解空间树。"
wordlist = jieba.cut(sent, cut_all=True)
print " | ".join(wordlist)

# jieba segmentation -- accurate mode (the default)
wordlist = jieba.cut(sent)  # cut_all=False
print " | ".join(wordlist)

# jieba segmentation -- search-engine mode
wordlist = jieba.cut_for_search(sent)
print " | ".join(wordlist)
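Besides loading a whole dictionary file with load_userdict, jieba can also register terms one at a time at runtime. A minimal sketch under the same Python 2 environment as the scripts above; the word, frequency, and tag mirror a userdict line and are illustrative:

# -*- coding: utf-8 -*-
# Minimal sketch: registering a single user term at runtime with jieba.add_word,
# as an alternative to jieba.load_userdict("userdict"). Illustrative values.
import jieba

jieba.add_word("解空间树", freq=5, tag="n")  # word / frequency / POS, like a userdict line
print " | ".join(jieba.cut("在解空间树中按照深度优先搜索的策略"))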
--------------------------------------------------------------------------------
/ltpEntity.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
from pyltp import *

# Set up a UTF-8 output environment (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')

sent = "欧洲 东部 的 罗马尼亚 , 首都 是 布加勒斯特 , 也 是 一 座 世界性 的 城市 。"
words = sent.split(" ")
postagger = Postagger()
postagger.load("../ltp3.4/pos.model")  # load the part-of-speech tagging model
postags = postagger.postag(words)

recognizer = NamedEntityRecognizer()
recognizer.load("../ltp3.4/ner.model")  # load the named-entity recognition model
netags = recognizer.recognize(words, postags)

for word, postag, netag in zip(words, postags, netags):
    print word + "/" + postag + "/" + netag,
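The per-word tags that ltpEntity.py prints still have to be merged into whole entities. A small sketch, assuming the B/I/E/S-plus-O tagging scheme that LTP 3.x uses for named entities; the sample result in the trailing comment is illustrative, not captured program output:

# Sketch: merge LTP per-word NE tags (O / S-Xxx / B-Xxx / I-Xxx / E-Xxx)
# into (entity, type) pairs. Assumes LTP 3.x's NE tagging scheme.
def merge_entities(words, netags):
    entities, buf = [], []
    for word, tag in zip(words, netags):
        if tag == "O":                 # not part of any entity
            continue
        pos, etype = tag.split("-")
        if pos == "S":                 # single-word entity
            entities.append((word, etype))
        elif pos == "B":               # entity starts
            buf = [word]
        elif pos == "I":               # entity continues
            buf.append(word)
        else:                          # "E": entity ends
            buf.append(word)
            entities.append(("".join(buf), etype))
            buf = []
    return entities

# e.g. merge_entities(words, netags) might yield [("罗马尼亚", "Ns"), ("布加勒斯特", "Ns")]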
--------------------------------------------------------------------------------
/ltpPOS.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
from pyltp import *

# Set up a UTF-8 output environment (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')

pos_path = "../ltp3.4/pos.model"

sent = "在 包含 问题 的 所有 解 的 解空间树 中 , 按照 深度优先 搜索 的 策略 ,从 根节点 出发 深度 搜索 解空间树 。"
words = sent.split(" ")
postagger = Postagger()   # instantiate the POS tagger
postagger.load(pos_path)  # load the POS tagging model
postags = postagger.postag(words)
for word, postag in zip(words, postags):
    print word + "/" + postag,

--------------------------------------------------------------------------------
/ltpParser.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
from nltk.parse import DependencyGraph  # NLTK dependency-graph structure
from pyltp import *  # LTP toolkit

reload(sys)
sys.setdefaultencoding('utf-8')  # Set up a UTF-8 output environment (Python 2)

words = "罗马尼亚 的 首都 是 布加勒斯特 。".split(" ")  # example sentence

postagger = Postagger()  # first run part-of-speech tagging on the sentence
postagger.load("../ltp3.4/pos.model")
postags = postagger.postag(words)

parser = Parser()  # feed both the words and the POS tags into the parser
parser.load("../ltp3.4/parser.model")
arcs = parser.parse(words, postags)
arclen = len(arcs)
conll = ""
for i in xrange(arclen):  # build a CoNLL-style data structure
    if arcs[i].head == 0:
        arcs[i].relation = "ROOT"
    conll += "\t" + words[i] + "(" + postags[i] + ")" + "\t" + postags[i] + "\t" + str(arcs[i].head) + "\t" + arcs[i].relation + "\n"

print conll

conlltree = DependencyGraph(conll)  # convert to a dependency graph
tree = conlltree.tree()  # build the tree structure
tree.draw()  # display the resulting tree

--------------------------------------------------------------------------------
/ltpSeg.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
from pyltp import Segmentor

reload(sys)
sys.setdefaultencoding('utf-8')

# Post-processing rules: re-merge terms the segmenter splits apart
postdict = {"解 | 空间": "解空间", "深度 | 优先": "深度优先"}

model_path = "../ltp3.4/cws.model"
user_dict = "../ltp3.4/userdict"

segmentor = Segmentor()
# segmentor.load(model_path)                        # load without a user dictionary
segmentor.load_with_lexicon(model_path, user_dict)  # load with a user dictionary

words = segmentor.segment("在包含问题的所有解的解空间树中,按照深度优先搜索的策略,从根节点出发深度搜索解空间树。")
seg_sent = " | ".join(words)
for key in postdict:
    seg_sent = seg_sent.replace(key, postdict[key])
print seg_sent

segmentor.release()

--------------------------------------------------------------------------------
/ltpSementic.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
import os
from pyltp import *

# Set up a UTF-8 output environment (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')

MODELDIR = "../ltp3.4/"
sentence = "欧洲东部的罗马尼亚,首都是布加勒斯特,也是一座世界性的城市。"
segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
words = segmentor.segment(sentence)
wordlist = list(words)  # materialize the generator into a list

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)

# Semantic role labeling (note: pyltp itself spells the class "Sementic")
labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "pisrl.model"))
roles = labeller.label(words, postags, arcs)

# Print the labeling results
for role in roles:
    print 'rel:', wordlist[role.index]  # the predicate
    for arg in role.arguments:
        if arg.range.start != arg.range.end:
            print arg.name, ' '.join(wordlist[arg.range.start:arg.range.end])
        else:
            print arg.name, wordlist[arg.range.start]
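ltpSementic.py loads five native models and never frees them. Each pyltp component exposes a release() method (ltpSeg.py already calls it on its segmentor), so a cleanup sketch for the end of that script would be:

# Sketch: free the native LTP models once labeling is done.
labeller.release()
recognizer.release()
parser.release()
postagger.release()
segmentor.release()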
--------------------------------------------------------------------------------
/ner_test:
--------------------------------------------------------------------------------
欧洲/LOCATION 东部/O 的/O 罗马尼亚/GPE ,/O 首都/O 是/O 布加勒斯特/GPE ,/O 也/O 是/O 一/O 座/O 世界性/O 的/O 城市/O 。/O

--------------------------------------------------------------------------------
/postest:
--------------------------------------------------------------------------------
在 包含 问题 的 所有 解 的 解空间树 中 , 按照 深度优先 搜索 的 策略 ,从 根节点 出发 深度 搜索 解空间树 。

--------------------------------------------------------------------------------
/result:
--------------------------------------------------------------------------------
在/P 包含/VV 问题/NN 的/DEC 所有/DT 解/VV 的/DEC 解空间树/NN 中/LC ,/PU 按照/P 深度优先/NN 搜索/NN 的/DEC 策略/NN ,从/NN 根节点/NN 出发/VV 深度/JJ 搜索/NN 解空间树/VV 。/PU

--------------------------------------------------------------------------------
/stanford.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import os

# Wrappers around the CoreNLP 3.9.1 jars and the Chinese model packages
class StanfordCoreNLP():  # parent class of all the Stanford NLP wrappers
    def __init__(self, jarpath, modelpath):
        self.tempsrcpath = "tempsrc"  # path of the temporary input file
        self.jarpath = jarpath
        self.modelpath = modelpath

    def savefile(self, path, sent):  # write the input sentence to the temporary file
        fp = open(path, "wb")
        fp.write(sent)
        fp.close()

    def delfile(self, path):  # delete the temporary file
        os.remove(path)

class StanfordPOSTagger(StanfordCoreNLP):  # part-of-speech tagging subclass
    def __init__(self, jarpath, modelpath):
        StanfordCoreNLP.__init__(self, jarpath, modelpath)
        self.classifier = "edu.stanford.nlp.tagger.maxent.MaxentTagger"  # main POS-tagging class
        self.delimiter = "/"  # separator between token and tag
        self.__buildcmd()

    def __buildcmd(self):  # build the command line
        self.cmdline = 'java -mx1g -cp "' + self.jarpath + '" ' + self.classifier + ' -model "' + self.modelpath + '" -tagSeparator ' + self.delimiter
        print self.cmdline

    def tag(self, sent):  # tag a sentence
        self.savefile(self.tempsrcpath, sent)
        tagtxt = os.popen(self.cmdline + " -textFile " + self.tempsrcpath, 'r').read()  # capture the output in a variable
        self.delfile(self.tempsrcpath)
        return tagtxt

    def tagfile(self, inputpath, outpath):  # tag a whole file
        os.system(self.cmdline + ' -textFile ' + inputpath + ' > ' + outpath)

class StanfordNERTagger(StanfordCoreNLP):  # named-entity recognition subclass
    def __init__(self, jarpath, modelpath):
        StanfordCoreNLP.__init__(self, jarpath, modelpath)
        self.classifier = "edu.stanford.nlp.ie.crf.CRFClassifier"
        self.__buildcmd()

    # build the command line
    def __buildcmd(self):
        self.cmdline = 'java -mx1g -cp "' + self.jarpath + '" ' + self.classifier + ' -loadClassifier "' + self.modelpath + '"'
        print self.cmdline

    # tag a sentence
    def tag(self, sent):
        self.savefile(self.tempsrcpath, sent)
        tagtxt = os.popen(self.cmdline + ' -textFile ' + self.tempsrcpath, 'r').read()  # capture the output in a variable
        self.delfile(self.tempsrcpath)
        return tagtxt

    # tag a sentence and write the result to a file
    def tagfile(self, sent, outpath):
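stanford.py builds its commands by string concatenation and runs them through os.popen/os.system, which breaks as soon as a jar or model path contains a space and offers no error handling. A hedged alternative sketch using the standard subprocess module with an argument list; the class and flag names mirror StanfordPOSTagger above, and the function name is illustrative:

# -*- coding: utf-8 -*-
# Sketch: capture tagger output via subprocess instead of os.popen.
import subprocess

def run_tagger(jarpath, modelpath, textfile):
    cmd = ["java", "-mx1g", "-cp", jarpath,
           "edu.stanford.nlp.tagger.maxent.MaxentTagger",
           "-model", modelpath, "-tagSeparator", "/",
           "-textFile", textfile]
    return subprocess.check_output(cmd)  # raises CalledProcessError if java fails

Because no shell is involved, spaces in paths need no quoting, and a non-zero exit status surfaces as an exception instead of silently producing empty output.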
        self.savefile(self.tempsrcpath, sent)
        os.system(self.cmdline + ' -textFile ' + self.tempsrcpath + ' > ' + outpath)
        self.delfile(self.tempsrcpath)

class StanfordParser(StanfordCoreNLP):  # syntactic parsing subclass
    def __init__(self, modelpath, jarpath, opttype):
        StanfordCoreNLP.__init__(self, jarpath, modelpath)
        self.classifier = "edu.stanford.nlp.parser.lexparser.LexicalizedParser"
        self.opttype = opttype  # output format, e.g. 'penn' or 'typedDependencies'
        self.__buildcmd()

    # build the command line
    def __buildcmd(self):
        self.cmdline = 'java -mx500m -cp "' + self.jarpath + '" ' + self.classifier + ' -outputFormat "' + self.opttype + '" ' + self.modelpath + ' '
        print self.cmdline

    # parse a sentence
    def parse(self, sent):
        self.savefile(self.tempsrcpath, sent)
        tagtxt = os.popen(self.cmdline + self.tempsrcpath, "r").read()  # capture the output in a variable
        self.delfile(self.tempsrcpath)
        return tagtxt

    # parse a sentence and write the result to a file
    def tagfile(self, sent, outpath):
        self.savefile(self.tempsrcpath, sent)
        os.system(self.cmdline + self.tempsrcpath + ' > ' + outpath)
        self.delfile(self.tempsrcpath)

--------------------------------------------------------------------------------
/stanford.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cgnerds/nlpBookStudy/fa603bad297fb01994922f529730210dc5bcc2bf/stanford.pyc

--------------------------------------------------------------------------------
/stanfordNER.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
from stanford import StanfordNERTagger

# Set up a UTF-8 output environment (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')

root = '../stanford-corenlp/'
jarpath = root + "stanford-ner.jar"
modelpath = root + "models/ner/chinese.misc.distsim.crf.ser.gz"

st = StanfordNERTagger(jarpath, modelpath)
seg_sent = "欧洲 东部 的 罗马尼亚 , 首都 是 布加勒斯特 , 也 是 一 座 世界性 的 城市 。"
st.tagfile(seg_sent, "ner_test")  # tagfile returns nothing; it writes the result to "ner_test"
taglist = st.tag(seg_sent)        # tag returns the tagged sentence as a string
print taglist

--------------------------------------------------------------------------------
/stanfordPOS.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
from stanford import StanfordPOSTagger

# Set up a UTF-8 output environment (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')

root = '../stanford-corenlp/'
jarpath = root + "stanford-postagger.jar"
modelpath = root + "models/pos-tagger/chinese-distsim/chinese-distsim.tagger"

st = StanfordPOSTagger(jarpath, modelpath)
seg_sent = "在 包含 问题 的 所有 解 的 解空间树 中 , 按照 深度优先 搜索 的 策略 ,从 根节点 出发 深度 搜索 解空间树 。"
postest = "postest"  # pre-segmented input file
result = "result"    # output file
taglist = st.tag(seg_sent)
print taglist
# Presumably how the "result" file was produced from "postest":
# st.tagfile(postest, result)

--------------------------------------------------------------------------------
/userdict:
--------------------------------------------------------------------------------
解空间 5 n
解空间树 5 n
根节点 5 n
深度优先 5 n
--------------------------------------------------------------------------------
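Each line of userdict follows jieba's user-dictionary format: a word, an optional frequency, and an optional POS tag, separated by spaces. For example, the line

解空间树 5 n

tells the jieba.load_userdict("userdict") call in jiebaSeg.py to treat 解空间树 as a single noun with frequency 5 instead of splitting it into smaller words.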