├── .DS_Store
├── .gitignore
├── .vscode
│   ├── launch.json
│   └── settings.json
├── StanfordDependTree.py
├── StanfordSentTree.py
├── jiebaSeg.py
├── ltpEntity.py
├── ltpPOS.py
├── ltpParser.py
├── ltpSeg.py
├── ltpSementic.py
├── ner_test
├── postest
├── result
├── stanford.py
├── stanford.pyc
├── stanfordNER.py
├── stanfordPOS.py
└── userdict

--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cgnerds/nlpBookStudy/fa603bad297fb01994922f529730210dc5bcc2bf/.DS_Store

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
stanford.pyc

--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python: Current File",
            "type": "python",
            "request": "launch",
            "program": "${file}"
        },
        {
            "name": "Python: Attach",
            "type": "python",
            "request": "attach",
            "localRoot": "${workspaceFolder}",
            "remoteRoot": "${workspaceFolder}",
            "port": 3000,
            "secret": "my_secret",
            "host": "localhost"
        },
        {
            "name": "Python: Terminal (integrated)",
            "type": "python",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal"
        },
        {
            "name": "Python: Terminal (external)",
            "type": "python",
            "request": "launch",
            "program": "${file}",
            "console": "externalTerminal"
        },
        {
            "name": "Python: Django",
            "type": "python",
            "request": "launch",
            "program": "${workspaceFolder}/manage.py",
            "args": [
                "runserver",
                "--noreload",
                "--nothreading"
            ],
            "debugOptions": [
                "RedirectOutput",
                "Django"
            ]
        },
        {
            "name": "Python: Flask (0.11.x or later)",
            "type": "python",
            "request": "launch",
            "module": "flask",
            "env": {
                "FLASK_APP": "${workspaceFolder}/app.py"
            },
            "args": [
                "run",
                "--no-debugger",
                "--no-reload"
            ]
        },
        {
            "name": "Python: Module",
            "type": "python",
            "request": "launch",
            "module": "module.name"
        },
        {
            "name": "Python: Pyramid",
            "type": "python",
            "request": "launch",
            "args": [
                "${workspaceFolder}/development.ini"
            ],
            "debugOptions": [
                "RedirectOutput",
                "Pyramid"
            ]
        },
        {
            "name": "Python: Watson",
            "type": "python",
            "request": "launch",
            "program": "${workspaceFolder}/console.py",
            "args": [
                "dev",
                "runserver",
                "--noreload=True"
            ]
        },
        {
            "name": "Python: All debug Options",
            "type": "python",
            "request": "launch",
            "pythonPath": "${config:python.pythonPath}",
            "program": "${file}",
            "module": "module.name",
            "env": {
                "VAR1": "1",
                "VAR2": "2"
            },
            "envFile": "${workspaceFolder}/.env",
            "args": [
                "arg1",
                "arg2"
            ],
            "debugOptions": [
                "RedirectOutput"
            ]
        }
    ]
}

--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
{
    "python.pythonPath": "/usr/local/bin/python"
}

--------------------------------------------------------------------------------
/StanfordDependTree.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
from stanford import *

# Set up a UTF-8 output environment (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')

# Configure the Stanford parser
root = '../stanford-corenlp/'
jarpath = root + "stanford-parser.jar"
modelpath = root + "models/lexparser/chinesePCFG.ser.gz"
opttype = 'typedDependencies'  # either 'penn' or 'typedDependencies'
parser = StanfordParser(modelpath, jarpath, opttype)
result = parser.parse("罗马尼亚 的 首都 是 布加勒斯特 。")
print result

--------------------------------------------------------------------------------
/StanfordSentTree.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
from nltk.tree import Tree  # NLTK tree structure
from stanford import *

# Set up a UTF-8 output environment (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')

# Configure the Stanford parser
root = '../stanford-corenlp/'
jarpath = root + "stanford-parser.jar"
modelpath = root + "models/lexparser/chinesePCFG.ser.gz"
opttype = 'penn'  # Penn Treebank bracketed format
parser = StanfordParser(modelpath, jarpath, opttype)
result = parser.parse("罗马尼亚 的 首都 是 布加勒斯特 。")
print result
tree = Tree.fromstring(result)
tree.draw()

--------------------------------------------------------------------------------
/jiebaSeg.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
import jieba  # jieba Chinese segmentation library

# Set up a UTF-8 output environment (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')

# jieba segmentation -- full mode
jieba.load_userdict("userdict")
sent = "在包含问题的所有解的解空间树中,按照深度优先搜索的策略,从根节点出发深度探索解空间树。"
wordlist = jieba.cut(sent, cut_all=True)
print " | ".join(wordlist)

# jieba segmentation -- accurate mode (the default)
wordlist = jieba.cut(sent)  # cut_all=False
print " | ".join(wordlist)

# jieba segmentation -- search-engine mode
wordlist = jieba.cut_for_search(sent)
print " | ".join(wordlist)
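Besides loading a whole dictionary file with load_userdict, jieba can also register terms one at a time at runtime. A minimal sketch under the same Python 2 environment as the scripts above; the word, frequency, and tag mirror a userdict line and are illustrative:

# -*- coding: utf-8 -*-
# Minimal sketch: registering a single user term at runtime with jieba.add_word,
# as an alternative to jieba.load_userdict("userdict"). Illustrative values.
import jieba

jieba.add_word("解空间树", freq=5, tag="n")  # word / frequency / POS, like a userdict line
print " | ".join(jieba.cut("在解空间树中按照深度优先搜索的策略"))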
--------------------------------------------------------------------------------
/ltpEntity.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
from pyltp import *

# Set up a UTF-8 output environment (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')

sent = "欧洲 东部 的 罗马尼亚 , 首都 是 布加勒斯特 , 也 是 一 座 世界性 的 城市 。"
words = sent.split(" ")
postagger = Postagger()
postagger.load("../ltp3.4/pos.model")  # load the part-of-speech tagging model
postags = postagger.postag(words)

recognizer = NamedEntityRecognizer()
recognizer.load("../ltp3.4/ner.model")  # load the named-entity recognition model
netags = recognizer.recognize(words, postags)

for word, postag, netag in zip(words, postags, netags):
    print word + "/" + postag + "/" + netag,
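The per-word tags that ltpEntity.py prints still have to be merged into whole entities. A small sketch, assuming the B/I/E/S-plus-O tagging scheme that LTP 3.x uses for named entities; the sample result in the trailing comment is illustrative, not captured program output:

# Sketch: merge LTP per-word NE tags (O / S-Xxx / B-Xxx / I-Xxx / E-Xxx)
# into (entity, type) pairs. Assumes LTP 3.x's NE tagging scheme.
def merge_entities(words, netags):
    entities, buf = [], []
    for word, tag in zip(words, netags):
        if tag == "O":                 # not part of any entity
            continue
        pos, etype = tag.split("-")
        if pos == "S":                 # single-word entity
            entities.append((word, etype))
        elif pos == "B":               # entity starts
            buf = [word]
        elif pos == "I":               # entity continues
            buf.append(word)
        else:                          # "E": entity ends
            buf.append(word)
            entities.append(("".join(buf), etype))
            buf = []
    return entities

# e.g. merge_entities(words, netags) might yield [("罗马尼亚", "Ns"), ("布加勒斯特", "Ns")]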
--------------------------------------------------------------------------------
/ltpPOS.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
from pyltp import *

# Set up a UTF-8 output environment (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')

pos_path = "../ltp3.4/pos.model"

sent = "在 包含 问题 的 所有 解 的 解空间树 中 , 按照 深度优先 搜索 的 策略 ,从 根节点 出发 深度 搜索 解空间树 。"
words = sent.split(" ")
postagger = Postagger()   # instantiate the POS tagger
postagger.load(pos_path)  # load the POS tagging model
postags = postagger.postag(words)
for word, postag in zip(words, postags):
    print word + "/" + postag,

--------------------------------------------------------------------------------
/ltpParser.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
from nltk.parse import DependencyGraph  # NLTK dependency-graph structure
from pyltp import *  # LTP toolkit

reload(sys)
sys.setdefaultencoding('utf-8')  # Set up a UTF-8 output environment (Python 2)

words = "罗马尼亚 的 首都 是 布加勒斯特 。".split(" ")  # example sentence

postagger = Postagger()  # first run part-of-speech tagging on the sentence
postagger.load("../ltp3.4/pos.model")
postags = postagger.postag(words)

parser = Parser()  # feed both the words and the POS tags into the parser
parser.load("../ltp3.4/parser.model")
arcs = parser.parse(words, postags)
arclen = len(arcs)
conll = ""
for i in xrange(arclen):  # build a CoNLL-style data structure
    if arcs[i].head == 0:
        arcs[i].relation = "ROOT"
    conll += "\t" + words[i] + "(" + postags[i] + ")" + "\t" + postags[i] + "\t" + str(arcs[i].head) + "\t" + arcs[i].relation + "\n"

print conll

conlltree = DependencyGraph(conll)  # convert to a dependency graph
tree = conlltree.tree()  # build the tree structure
tree.draw()  # display the resulting tree

--------------------------------------------------------------------------------
/ltpSeg.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
from pyltp import Segmentor

reload(sys)
sys.setdefaultencoding('utf-8')

# Post-processing rules: re-merge terms the segmenter splits apart
postdict = {"解 | 空间": "解空间", "深度 | 优先": "深度优先"}

model_path = "../ltp3.4/cws.model"
user_dict = "../ltp3.4/userdict"

segmentor = Segmentor()
# segmentor.load(model_path)                        # load without a user dictionary
segmentor.load_with_lexicon(model_path, user_dict)  # load with a user dictionary

words = segmentor.segment("在包含问题的所有解的解空间树中,按照深度优先搜索的策略,从根节点出发深度搜索解空间树。")
seg_sent = " | ".join(words)
for key in postdict:
    seg_sent = seg_sent.replace(key, postdict[key])
print seg_sent

segmentor.release()

--------------------------------------------------------------------------------
/ltpSementic.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
import os
from pyltp import *

# Set up a UTF-8 output environment (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')

MODELDIR = "../ltp3.4/"
sentence = "欧洲东部的罗马尼亚,首都是布加勒斯特,也是一座世界性的城市。"
segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
words = segmentor.segment(sentence)
wordlist = list(words)  # materialize the generator into a list

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)

# Semantic role labeling (note: pyltp itself spells the class "Sementic")
labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "pisrl.model"))
roles = labeller.label(words, postags, arcs)

# Print the labeling results
for role in roles:
    print 'rel:', wordlist[role.index]  # the predicate
    for arg in role.arguments:
        if arg.range.start != arg.range.end:
            print arg.name, ' '.join(wordlist[arg.range.start:arg.range.end])
        else:
            print arg.name, wordlist[arg.range.start]
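ltpSementic.py loads five native models and never frees them. Each pyltp component exposes a release() method (ltpSeg.py already calls it on its segmentor), so a cleanup sketch for the end of that script would be:

# Sketch: free the native LTP models once labeling is done.
labeller.release()
recognizer.release()
parser.release()
postagger.release()
segmentor.release()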
--------------------------------------------------------------------------------
/ner_test:
--------------------------------------------------------------------------------
欧洲/LOCATION 东部/O 的/O 罗马尼亚/GPE ,/O 首都/O 是/O 布加勒斯特/GPE ,/O 也/O 是/O 一/O 座/O 世界性/O 的/O 城市/O 。/O

--------------------------------------------------------------------------------
/postest:
--------------------------------------------------------------------------------
在 包含 问题 的 所有 解 的 解空间树 中 , 按照 深度优先 搜索 的 策略 ,从 根节点 出发 深度 搜索 解空间树 。

--------------------------------------------------------------------------------
/result:
--------------------------------------------------------------------------------
在/P 包含/VV 问题/NN 的/DEC 所有/DT 解/VV 的/DEC 解空间树/NN 中/LC ,/PU 按照/P 深度优先/NN 搜索/NN 的/DEC 策略/NN ,从/NN 根节点/NN 出发/VV 深度/JJ 搜索/NN 解空间树/VV 。/PU

--------------------------------------------------------------------------------
/stanford.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import os

# Wrappers around the CoreNLP 3.9.1 jars and the Chinese model packages
class StanfordCoreNLP():  # parent class of all the Stanford NLP wrappers
    def __init__(self, jarpath, modelpath):
        self.tempsrcpath = "tempsrc"  # path of the temporary input file
        self.jarpath = jarpath
        self.modelpath = modelpath

    def savefile(self, path, sent):  # write the input sentence to the temporary file
        fp = open(path, "wb")
        fp.write(sent)
        fp.close()

    def delfile(self, path):  # delete the temporary file
        os.remove(path)

class StanfordPOSTagger(StanfordCoreNLP):  # part-of-speech tagging subclass
    def __init__(self, jarpath, modelpath):
        StanfordCoreNLP.__init__(self, jarpath, modelpath)
        self.classifier = "edu.stanford.nlp.tagger.maxent.MaxentTagger"  # main POS-tagging class
        self.delimiter = "/"  # separator between token and tag
        self.__buildcmd()

    def __buildcmd(self):  # build the command line
        self.cmdline = 'java -mx1g -cp "' + self.jarpath + '" ' + self.classifier + ' -model "' + self.modelpath + '" -tagSeparator ' + self.delimiter
        print self.cmdline

    def tag(self, sent):  # tag a sentence
        self.savefile(self.tempsrcpath, sent)
        tagtxt = os.popen(self.cmdline + " -textFile " + self.tempsrcpath, 'r').read()  # capture the output in a variable
        self.delfile(self.tempsrcpath)
        return tagtxt

    def tagfile(self, inputpath, outpath):  # tag a whole file
        os.system(self.cmdline + ' -textFile ' + inputpath + ' > ' + outpath)

class StanfordNERTagger(StanfordCoreNLP):  # named-entity recognition subclass
    def __init__(self, jarpath, modelpath):
        StanfordCoreNLP.__init__(self, jarpath, modelpath)
        self.classifier = "edu.stanford.nlp.ie.crf.CRFClassifier"
        self.__buildcmd()

    # build the command line
    def __buildcmd(self):
        self.cmdline = 'java -mx1g -cp "' + self.jarpath + '" ' + self.classifier + ' -loadClassifier "' + self.modelpath + '"'
        print self.cmdline

    # tag a sentence
    def tag(self, sent):
        self.savefile(self.tempsrcpath, sent)
        tagtxt = os.popen(self.cmdline + ' -textFile ' + self.tempsrcpath, 'r').read()  # capture the output in a variable
        self.delfile(self.tempsrcpath)
        return tagtxt

    # tag a sentence and write the result to a file
    def tagfile(self, sent, outpath):
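stanford.py builds its commands by string concatenation and runs them through os.popen/os.system, which breaks as soon as a jar or model path contains a space and offers no error handling. A hedged alternative sketch using the standard subprocess module with an argument list; the class and flag names mirror StanfordPOSTagger above, and the function name is illustrative:

# -*- coding: utf-8 -*-
# Sketch: capture tagger output via subprocess instead of os.popen.
import subprocess

def run_tagger(jarpath, modelpath, textfile):
    cmd = ["java", "-mx1g", "-cp", jarpath,
           "edu.stanford.nlp.tagger.maxent.MaxentTagger",
           "-model", modelpath, "-tagSeparator", "/",
           "-textFile", textfile]
    return subprocess.check_output(cmd)  # raises CalledProcessError if java fails

Because no shell is involved, spaces in paths need no quoting, and a non-zero exit status surfaces as an exception instead of silently producing empty output.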
        self.savefile(self.tempsrcpath, sent)
        os.system(self.cmdline + ' -textFile ' + self.tempsrcpath + ' > ' + outpath)
        self.delfile(self.tempsrcpath)

class StanfordParser(StanfordCoreNLP):  # syntactic parsing subclass
    def __init__(self, modelpath, jarpath, opttype):
        StanfordCoreNLP.__init__(self, jarpath, modelpath)
        self.classifier = "edu.stanford.nlp.parser.lexparser.LexicalizedParser"
        self.opttype = opttype  # output format, e.g. 'penn' or 'typedDependencies'
        self.__buildcmd()

    # build the command line
    def __buildcmd(self):
        self.cmdline = 'java -mx500m -cp "' + self.jarpath + '" ' + self.classifier + ' -outputFormat "' + self.opttype + '" ' + self.modelpath + ' '
        print self.cmdline

    # parse a sentence
    def parse(self, sent):
        self.savefile(self.tempsrcpath, sent)
        tagtxt = os.popen(self.cmdline + self.tempsrcpath, "r").read()  # capture the output in a variable
        self.delfile(self.tempsrcpath)
        return tagtxt

    # parse a sentence and write the result to a file
    def tagfile(self, sent, outpath):
        self.savefile(self.tempsrcpath, sent)
        os.system(self.cmdline + self.tempsrcpath + ' > ' + outpath)
        self.delfile(self.tempsrcpath)

--------------------------------------------------------------------------------
/stanford.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cgnerds/nlpBookStudy/fa603bad297fb01994922f529730210dc5bcc2bf/stanford.pyc

--------------------------------------------------------------------------------
/stanfordNER.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
from stanford import StanfordNERTagger

# Set up a UTF-8 output environment (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')

root = '../stanford-corenlp/'
jarpath = root + "stanford-ner.jar"
modelpath = root + "models/ner/chinese.misc.distsim.crf.ser.gz"

st = StanfordNERTagger(jarpath, modelpath)
seg_sent = "欧洲 东部 的 罗马尼亚 , 首都 是 布加勒斯特 , 也 是 一 座 世界性 的 城市 。"
st.tagfile(seg_sent, "ner_test")  # tagfile returns nothing; it writes the result to "ner_test"
taglist = st.tag(seg_sent)        # tag returns the tagged sentence as a string
print taglist

--------------------------------------------------------------------------------
/stanfordPOS.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
from stanford import StanfordPOSTagger

# Set up a UTF-8 output environment (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')

root = '../stanford-corenlp/'
jarpath = root + "stanford-postagger.jar"
modelpath = root + "models/pos-tagger/chinese-distsim/chinese-distsim.tagger"

st = StanfordPOSTagger(jarpath, modelpath)
seg_sent = "在 包含 问题 的 所有 解 的 解空间树 中 , 按照 深度优先 搜索 的 策略 ,从 根节点 出发 深度 搜索 解空间树 。"
postest = "postest"  # pre-segmented input file
result = "result"    # output file
taglist = st.tag(seg_sent)
print taglist
# Presumably how the "result" file was produced from "postest":
# st.tagfile(postest, result)

--------------------------------------------------------------------------------
/userdict:
--------------------------------------------------------------------------------
解空间 5 n
解空间树 5 n
根节点 5 n
深度优先 5 n
--------------------------------------------------------------------------------
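Each line of userdict follows jieba's user-dictionary format: a word, an optional frequency, and an optional POS tag, separated by spaces. For example, the line

解空间树 5 n

tells the jieba.load_userdict("userdict") call in jiebaSeg.py to treat 解空间树 as a single noun with frequency 5 instead of splitting it into smaller words.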