├── DataStudy.py
├── README.md
└── SVMEvaluate.py

/DataStudy.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-
import os
import jieba
import jieba.analyse
import numpy

#############################################################################
############## Tokenize each abstract and save its keywords ################
WordCast = open('Cnki_label _abstruct2.csv', 'r')  # CnkiLabelAbstruct.txt
ReadLines = WordCast.readlines()
JiebaCast = open('BitCnkiNOLabelAbstractJieba.txt', 'w')
stopwords = set(line.rstrip() for line in open('StopWords.txt'))
for ReadLine in ReadLines:
    # args: source text, number of top keywords, return weights or not, allowed POS tags
    AbstractPart = jieba.analyse.extract_tags(ReadLine, topK=50, withWeight=False, allowPOS=())
    segs = [word for word in AbstractPart if word not in stopwords]
    for seg in segs:
        JiebaCast.write('%s\t' % seg)
    JiebaCast.write('\n')
JiebaCast.close()

#############################################################################
#################### Generate one vector per abstract ######################
WordCast = open('DictionaryText.txt', 'r')  # load the semantic dictionary
ReadLines = WordCast.readlines()
LabelOnly1 = open('OnlyTable.txt', 'r')  # load the labels
LabelOnly = LabelOnly1.readlines()
JiebaWordCast = open('BitCnkiNOLabelAbstractJieba.txt', 'r')  # tokenized abstracts from the step above
JiebaReadLines = JiebaWordCast.readlines()
DictionaryWriteNumber = 0
# vectors are written out in chunks of 2000 abstracts per file
VectorEachAbstract = open(os.path.join('VectorEachAbstract', '%s.txt' % DictionaryWriteNumber), 'w')
DictionaryNumber = 0

DictionaryTranslate = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8}
for JiebaText in JiebaReadLines:
    VectorAbstract = JiebaText.split('\t')
    # boolean model: one slot per dictionary word, plus a final slot for the label
    TextVector = numpy.zeros([len(ReadLines) + 1])
    for Vector in VectorAbstract:
        VectorPar = 0
        for ReadLine in ReadLines:
            if ReadLine.strip('\n') == Vector:
                TextVector[VectorPar] = 1
                break
            VectorPar += 1
    # map the IPC section letter (A-H) to its numeric code in the last slot
    for SectionLetter, SectionCode in DictionaryTranslate.items():
        if SectionLetter in LabelOnly[DictionaryNumber]:
            TextVector[len(ReadLines)] = SectionCode
    print(DictionaryNumber)
    DictionaryNumber += 1
    for Vector1 in TextVector:
        VectorEachAbstract.write('%s\t' % Vector1)
    VectorEachAbstract.write('\n')
    if DictionaryNumber % 2000 == 0:
        DictionaryWriteNumber += 1
        VectorEachAbstract.close()
        VectorEachAbstract = open(os.path.join('VectorEachAbstract', '%s.txt' % DictionaryWriteNumber), 'w')

WordCast.close()
LabelOnly1.close()
JiebaWordCast.close()
VectorEachAbstract.close()

#############################################################################
################## Label extraction: A B C D E F G H #######################
WordCast = open('Cnki_label_abstruct.txt', 'r')  # CnkiLabelAbstruct.txt
ReadLines = WordCast.readlines()
LabelSelect = []
WriteLines = open('LabelOnly.txt', 'w')
for ReadLine in ReadLines:
    # strip the trailing newline and surrounding spaces
    Data3 = ReadLine.strip('\n').strip(' ')
    LineNumpy = Data3.split('\t')
    # keep only the first three characters of the label field
    LineNumpy[0] = LineNumpy[0][0:3]
    LabelSelect.append(LineNumpy)
    print(LineNumpy[0])
    WriteLines.write('%s\n' % LineNumpy[0])
WriteLines.close()

#############################################################################
###### Method 1: tokenize directly and tag the words (kept for reference) ##
# WordCast = open('test5.txt', 'r')
# ReadLines = WordCast.read()
# DictionaryJieba = jieba.cut(ReadLines)

#############################################################################
###### Method 2: TF-IDF, used here as a form of dimensionality reduction ###
WordCast = open('Cnki_label _abstruct2.csv', 'r')  # CnkiLabelAbstruct.txt
ReadLines = WordCast.read()
# args: source text, number of top keywords, return weights or not, allowed POS tags
keywords = jieba.analyse.extract_tags(ReadLines, topK=10000, withWeight=False, allowPOS=())

print(keywords)
segs = [word for word in keywords if word not in stopwords]
DictionaryText = []
WriteLines = open('DictionaryText.txt', 'w')
for seg in segs:
    WriteLines.write('%s\n' % seg)
    DictionaryText.append(seg)
    print(seg)
WriteLines.close()
--------------------------------------------------------------------------------
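The nested dictionary scan above rechecks every dictionary line for every token, which is slow against a 10,000-word dictionary. A minimal sketch of the same boolean-model lookup with a word-to-index map built once — abstract_to_vector is a hypothetical helper, and the sketch assumes DictionaryText.txt holds one dictionary word per line, as the script writes it:

import numpy

# build word -> slot index once, instead of rescanning the dictionary per token
with open('DictionaryText.txt') as f:
    WordIndex = {line.strip('\n'): i for i, line in enumerate(f)}

def abstract_to_vector(tokens, label_code):
    # boolean model: 1 where a dictionary word occurs, the numeric label in the last slot
    vec = numpy.zeros(len(WordIndex) + 1)
    for token in tokens:
        if token in WordIndex:
            vec[WordIndex[token]] = 1
    vec[len(WordIndex)] = label_code
    return vec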
/README.md:
--------------------------------------------------------------------------------
# CNKI_Patent_SVM

Text classification is the task of automatically assigning a document to a category in a given taxonomy, based on its content. Exploiting the regular structure of CNKI URLs, a Scrapy crawler collected more than 700,000 invention patents published in 2014; data cleaning then left more than 600,000 labeled records. TF-IDF term extraction was run over these texts, the 3,000 highest-ranked terms were kept to form a semantic dictionary, and stop words were added by inspection. Each abstract is then reduced to its TF-IDF keywords and compared against the semantic dictionary under the boolean model to produce a text vector, and the labels are converted to integers. With 90% of the texts as the training set and 10% as the test set, a supervised SVM classifies the patents into 8 classes, the IPC sections A-H: human necessities; performing operations and transporting; chemistry and metallurgy; textiles and paper; fixed constructions; mechanical engineering; physics; and electricity.
--------------------------------------------------------------------------------
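As a rough illustration of this pipeline — not the repository's code — the boolean model and the split/train steps might be sketched with scikit-learn's built-ins; the dictionary, documents, and labels below are toy stand-ins for the CNKI data:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

dictionary = ['polymer', 'engine', 'circuit', 'fabric']           # semantic dictionary (toy)
docs = ['engine circuit', 'polymer fabric', 'circuit', 'engine']  # keyword-extracted abstracts (toy)
labels = [6, 3, 8, 6]                                             # numeric IPC section codes (toy)

# binary=True with a fixed vocabulary reproduces the boolean model against the dictionary
vectorizer = CountVectorizer(binary=True, vocabulary=dictionary)
X = vectorizer.transform(docs)
X_train, X_test, y_train, y_test = train_test_split(X, labels, train_size=0.9, random_state=1)
clf = SVC(kernel='rbf', decision_function_shape='ovo').fit(X_train, y_train)
print(clf.score(X_test, y_test))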
/SVMEvaluate.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-
import os
from sklearn import svm
from sklearn.model_selection import train_test_split

Label = []
PartVector = []
for SingleFunction in os.listdir('SVMTestPart'):  # read every vector file in the directory
    print(SingleFunction)
    fr = open(os.path.join('SVMTestPart', SingleFunction))
    TotalLineCost = fr.readlines()
    fr.close()
    for LineCost in TotalLineCost:
        NumberLineCost = LineCost.strip().split('\t')
        # keep only classes 2, 4 and 8 for this experiment; the label is the last field
        if '2' in NumberLineCost[-1] or '4' in NumberLineCost[-1] or '8' in NumberLineCost[-1]:
            # truncate to the first 501 fields (dimensionality reduction) and convert to floats
            PartVector.append([float(x) for x in NumberLineCost[:501]])
            Label.append(NumberLineCost[-1])

# For background on SVM in scikit-learn see http://www.cnblogs.com/luyaoblog/p/6775342.html
# train_test_split randomly divides the samples into training and test sets:
#   its arguments are the feature set, the targets, train_size/test_size (a float is a
#   proportion, an integer an absolute count) and random_state, the random seed; reusing
#   the same seed reproduces the same split, while 0 or no seed gives a new split each run.
Vector = PartVector
VectorTrain, VectorTest, LabelTrain, LabelTest = train_test_split(Vector, Label, random_state=1, train_size=0.8)

# With kernel='linear', a larger C fits the training data better but can overfit (default C=1).
# With kernel='rbf' (the default), a smaller gamma gives a smoother decision boundary; a larger
#   gamma gives a more fragmented boundary that fits the training data better but can overfit.
# decision_function_shape='ovr' separates each class from all the rest (one-vs-rest);
# decision_function_shape='ovo' builds pairwise binary classifiers (one-vs-one) and
#   combines them to simulate the multiclass decision.
clf = svm.SVC(C=0.9, kernel='rbf', gamma=100, decision_function_shape='ovo')
clf.fit(VectorTrain, LabelTrain)
print(clf.score(VectorTrain, LabelTrain))  # training accuracy
print(clf.score(VectorTest, LabelTest))    # test accuracy
--------------------------------------------------------------------------------
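The script reports only overall accuracy on the training and test sets. If per-class figures are wanted, scikit-learn's metrics module can supply them — a minimal sketch, assuming clf, VectorTest and LabelTest from SVMEvaluate.py are still in scope:

from sklearn.metrics import classification_report, confusion_matrix

LabelPredict = clf.predict(VectorTest)
print(classification_report(LabelTest, LabelPredict))  # per-class precision, recall and F1
print(confusion_matrix(LabelTest, LabelPredict))       # shows which classes get confused with which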