├── DataStudy.py
├── README.md
└── SVMEvaluate.py

/DataStudy.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-
import os
import jieba
import jieba.analyse
import numpy

#############################################################################
############## Tokenize each abstract and save its keywords ################
WordCast = open('Cnki_label _abstruct2.csv', 'r')  # CnkiLabelAbstruct.txt
ReadLines = WordCast.readlines()
JiebaCast = open('BitCnkiNOLabelAbstractJieba.txt', 'w')
stopwords = set(line.rstrip() for line in open('StopWords.txt'))
for ReadLine in ReadLines:
    # args: source text, number of top keywords, return weights or not, allowed POS tags
    AbstractPart = jieba.analyse.extract_tags(ReadLine, topK=50, withWeight=False, allowPOS=())
    segs = [word for word in AbstractPart if word not in stopwords]
    for seg in segs:
        JiebaCast.write('%s\t' % seg)
    JiebaCast.write('\n')
JiebaCast.close()

#############################################################################
#################### Generate one vector per abstract ######################
WordCast = open('DictionaryText.txt', 'r')  # load the semantic dictionary
ReadLines = WordCast.readlines()
LabelOnly1 = open('OnlyTable.txt', 'r')  # load the labels
LabelOnly = LabelOnly1.readlines()
JiebaWordCast = open('BitCnkiNOLabelAbstractJieba.txt', 'r')  # tokenized abstracts from the step above
JiebaReadLines = JiebaWordCast.readlines()
DictionaryWriteNumber = 0
# vectors are written out in chunks of 2000 abstracts per file
VectorEachAbstract = open(os.path.join('VectorEachAbstract', '%s.txt' % DictionaryWriteNumber), 'w')
DictionaryNumber = 0

DictionaryTranslate = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8}
for JiebaText in JiebaReadLines:
    VectorAbstract = JiebaText.split('\t')
    # boolean model: one slot per dictionary word, plus a final slot for the label
    TextVector = numpy.zeros([len(ReadLines) + 1])
    for Vector in VectorAbstract:
        VectorPar = 0
        for ReadLine in ReadLines:
            if ReadLine.strip('\n') == Vector:
                TextVector[VectorPar] = 1
                break
            VectorPar += 1
    # map the IPC section letter (A-H) to its numeric code in the last slot
    for SectionLetter, SectionCode in DictionaryTranslate.items():
        if SectionLetter in LabelOnly[DictionaryNumber]:
            TextVector[len(ReadLines)] = SectionCode
    print(DictionaryNumber)
    DictionaryNumber += 1
    for Vector1 in TextVector:
        VectorEachAbstract.write('%s\t' % Vector1)
    VectorEachAbstract.write('\n')
    if DictionaryNumber % 2000 == 0:
        DictionaryWriteNumber += 1
        VectorEachAbstract.close()
        VectorEachAbstract = open(os.path.join('VectorEachAbstract', '%s.txt' % DictionaryWriteNumber), 'w')

WordCast.close()
LabelOnly1.close()
JiebaWordCast.close()
VectorEachAbstract.close()

#############################################################################
################## Label extraction: A B C D E F G H #######################
WordCast = open('Cnki_label_abstruct.txt', 'r')  # CnkiLabelAbstruct.txt
ReadLines = WordCast.readlines()
LabelSelect = []
WriteLines = open('LabelOnly.txt', 'w')
for ReadLine in ReadLines:
    # strip the trailing newline and surrounding spaces
    Data3 = ReadLine.strip('\n').strip(' ')
    LineNumpy = Data3.split('\t')
    # keep only the first three characters of the label field
    LineNumpy[0] = LineNumpy[0][0:3]
    LabelSelect.append(LineNumpy)
    print(LineNumpy[0])
    WriteLines.write('%s\n' % LineNumpy[0])
WriteLines.close()

#############################################################################
###### Method 1: tokenize directly and tag the words (kept for reference) ##
# WordCast = open('test5.txt', 'r')
# ReadLines = WordCast.read()
# DictionaryJieba = jieba.cut(ReadLines)

#############################################################################
###### Method 2: TF-IDF, used here as a form of dimensionality reduction ###
WordCast = open('Cnki_label _abstruct2.csv', 'r')  # CnkiLabelAbstruct.txt
ReadLines = WordCast.read()
# args: source text, number of top keywords, return weights or not, allowed POS tags
keywords = jieba.analyse.extract_tags(ReadLines, topK=10000, withWeight=False, allowPOS=())

print(keywords)
segs = [word for word in keywords if word not in stopwords]
DictionaryText = []
WriteLines = open('DictionaryText.txt', 'w')
for seg in segs:
    WriteLines.write('%s\n' % seg)
    DictionaryText.append(seg)
    print(seg)
WriteLines.close()
--------------------------------------------------------------------------------
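The nested dictionary scan above rechecks every dictionary line for every token, which is slow against a 10,000-word dictionary. A minimal sketch of the same boolean-model lookup with a word-to-index map built once — abstract_to_vector is a hypothetical helper, and the sketch assumes DictionaryText.txt holds one dictionary word per line, as the script writes it:

import numpy

# build word -> slot index once, instead of rescanning the dictionary per token
with open('DictionaryText.txt') as f:
    WordIndex = {line.strip('\n'): i for i, line in enumerate(f)}

def abstract_to_vector(tokens, label_code):
    # boolean model: 1 where a dictionary word occurs, the numeric label in the last slot
    vec = numpy.zeros(len(WordIndex) + 1)
    for token in tokens:
        if token in WordIndex:
            vec[WordIndex[token]] = 1
    vec[len(WordIndex)] = label_code
    return vec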
/README.md:
--------------------------------------------------------------------------------
# CNKI_Patent_SVM

Text classification is the task of automatically assigning a document to a category in a given taxonomy, based on its content. Exploiting the regular structure of CNKI URLs, a Scrapy crawler collected more than 700,000 invention patents published in 2014; data cleaning then left more than 600,000 labeled records. TF-IDF term extraction was run over these texts, the 3,000 highest-ranked terms were kept to form a semantic dictionary, and stop words were added by inspection. Each abstract is then reduced to its TF-IDF keywords and compared against the semantic dictionary under the boolean model to produce a text vector, and the labels are converted to integers. With 90% of the texts as the training set and 10% as the test set, a supervised SVM classifies the patents into 8 classes, the IPC sections A-H: human necessities; performing operations and transporting; chemistry and metallurgy; textiles and paper; fixed constructions; mechanical engineering; physics; and electricity.
--------------------------------------------------------------------------------
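As a rough illustration of this pipeline — not the repository's code — the boolean model and the split/train steps might be sketched with scikit-learn's built-ins; the dictionary, documents, and labels below are toy stand-ins for the CNKI data:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

dictionary = ['polymer', 'engine', 'circuit', 'fabric']           # semantic dictionary (toy)
docs = ['engine circuit', 'polymer fabric', 'circuit', 'engine']  # keyword-extracted abstracts (toy)
labels = [6, 3, 8, 6]                                             # numeric IPC section codes (toy)

# binary=True with a fixed vocabulary reproduces the boolean model against the dictionary
vectorizer = CountVectorizer(binary=True, vocabulary=dictionary)
X = vectorizer.transform(docs)
X_train, X_test, y_train, y_test = train_test_split(X, labels, train_size=0.9, random_state=1)
clf = SVC(kernel='rbf', decision_function_shape='ovo').fit(X_train, y_train)
print(clf.score(X_test, y_test))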
/SVMEvaluate.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-
import os
from sklearn import svm
from sklearn.model_selection import train_test_split

Label = []
PartVector = []
for SingleFunction in os.listdir('SVMTestPart'):  # read every vector file in the directory
    print(SingleFunction)
    fr = open(os.path.join('SVMTestPart', SingleFunction))
    TotalLineCost = fr.readlines()
    fr.close()
    for LineCost in TotalLineCost:
        NumberLineCost = LineCost.strip().split('\t')
        # keep only classes 2, 4 and 8 for this experiment; the label is the last field
        if '2' in NumberLineCost[-1] or '4' in NumberLineCost[-1] or '8' in NumberLineCost[-1]:
            # truncate to the first 501 fields (dimensionality reduction) and convert to floats
            PartVector.append([float(x) for x in NumberLineCost[:501]])
            Label.append(NumberLineCost[-1])

# For background on SVM in scikit-learn see http://www.cnblogs.com/luyaoblog/p/6775342.html
# train_test_split randomly divides the samples into training and test sets:
#   its arguments are the feature set, the targets, train_size/test_size (a float is a
#   proportion, an integer an absolute count) and random_state, the random seed; reusing
#   the same seed reproduces the same split, while 0 or no seed gives a new split each run.
Vector = PartVector
VectorTrain, VectorTest, LabelTrain, LabelTest = train_test_split(Vector, Label, random_state=1, train_size=0.8)

# With kernel='linear', a larger C fits the training data better but can overfit (default C=1).
# With kernel='rbf' (the default), a smaller gamma gives a smoother decision boundary; a larger
#   gamma gives a more fragmented boundary that fits the training data better but can overfit.
# decision_function_shape='ovr' separates each class from all the rest (one-vs-rest);
# decision_function_shape='ovo' builds pairwise binary classifiers (one-vs-one) and
#   combines them to simulate the multiclass decision.
clf = svm.SVC(C=0.9, kernel='rbf', gamma=100, decision_function_shape='ovo')
clf.fit(VectorTrain, LabelTrain)
print(clf.score(VectorTrain, LabelTrain))  # training accuracy
print(clf.score(VectorTest, LabelTest))    # test accuracy
--------------------------------------------------------------------------------
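The script reports only overall accuracy on the training and test sets. If per-class figures are wanted, scikit-learn's metrics module can supply them — a minimal sketch, assuming clf, VectorTest and LabelTest from SVMEvaluate.py are still in scope:

from sklearn.metrics import classification_report, confusion_matrix

LabelPredict = clf.predict(VectorTest)
print(classification_report(LabelTest, LabelPredict))  # per-class precision, recall and F1
print(confusion_matrix(LabelTest, LabelPredict))       # shows which classes get confused with which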