├── MultiLabelMovie.py ├── PrepareData.py ├── README.md ├── analysis_data.py ├── labels.txt └── score.txt /MultiLabelMovie.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | __author__ = 'Jeremy' 3 | from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | from sklearn.svm import LinearSVC 6 | from sklearn.feature_extraction.text import TfidfTransformer 7 | from sklearn.multiclass import OneVsRestClassifier 8 | from sklearn.metrics import average_precision_score, recall_score, precision_score, f1_score 9 | from sklearn.cross_validation import train_test_split 10 | 11 | from PrepareData import * 12 | 13 | 14 | def createLB(): 15 | f_labels = open("labels.txt","r").readlines() 16 | f_labels = [line.rstrip() for line in f_labels] 17 | lb = LabelEncoder() 18 | lb.fit(f_labels) 19 | return lb 20 | 21 | def createMLB(): 22 | labels_set = get_labels_set() 23 | mlb = MultiLabelBinarizer() 24 | mlb.fit(labels_set) 25 | return mlb 26 | 27 | def load_movie_data(): 28 | fr = open("labels_summary.txt","r") 29 | x_data, y_data = [], [] 30 | lb = createLB() 31 | mlb = MultiLabelBinarizer() 32 | label_set = get_labels_set() 33 | for line in fr.readlines(): 34 | line = line.rstrip() 35 | line_datas = line.split("--") 36 | summary = line_datas[-1] 37 | labels = line_datas[-2].split(' ') 38 | labels = [item for item in labels if item in label_set] 39 | if len(labels) == 0: 40 | continue 41 | labels = lb.transform(labels) 42 | x_data.append(summary) 43 | y_data.append(labels) 44 | y_data = mlb.fit_transform(y_data) 45 | return x_data, y_data, mlb, lb 46 | 47 | 48 | def tfidf_transformer(x_train, y_train): 49 | countVec = CountVectorizer() 50 | transformer = TfidfTransformer(norm="l2") 51 | matrix = countVec.fit_transform(x_train) 52 | tfidf_matrix = transformer.fit_transform(matrix) 53 | return transformer, countVec, tfidf_matrix 54 | 55 | def SVCClassifier(x_train, y_train): 56 | classifier = OneVsRestClassifier(LinearSVC(random_state=0)).fit(x_train, y_train) 57 | return classifier 58 | 59 | def statistic_result(y_test, y_score, lb): 60 | score_dict = {} 61 | n_class = y_score.shape[1] 62 | for i in range(n_class): 63 | precision = precision_score(y_test[:,i], y_score[:,i]) 64 | recall = recall_score(y_test[:,i], y_score[:,i]) 65 | f1_value = f1_score(y_test[:,i], y_score[:,i]) 66 | label = lb.classes_[i] 67 | score_dict[label] = (precision, recall, f1_value) 68 | return score_dict 69 | 70 | 71 | if __name__=="__main__": 72 | f=open("score.csv","w") 73 | x_data, y_data, mlb, lb = load_movie_data() 74 | x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25) 75 | tfidfTrans, countTrans, x_train = tfidf_transformer(x_train, y_train) 76 | x_test = tfidfTrans.transform(countTrans.transform(x_test)) 77 | classifier = SVCClassifier(x_train, y_train) 78 | y_score = classifier.predict(x_test) 79 | score_dict = statistic_result(y_test, y_score, lb) 80 | for key,value in score_dict.items(): 81 | line = key + ",%s,%s,%s" % value 82 | f.write(line) 83 | f.write("\n") 84 | f.close() 85 | -------------------------------------------------------------------------------- /PrepareData.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | __author__ = 'Jeremy' 3 | 4 | 5 | def get_labels_set(): 6 | fr = open("labels.txt","r") 7 | labels_set = set() 8 | 9 | for label in fr.readlines(): 10 | label = label.rstrip() 11 | labels_set.add(label) 12 | 13 | fr.close() 14 | return labels_set 15 | 16 | 17 | def print_label_result(mlb, lb, y_score): 18 | for label_set in mlb.inverse_transform(y_score): 19 | if len(label_set) == 0: 20 | print u"其他" 21 | continue 22 | label = lb.inverse_transform(label_set) 23 | for i in label: 24 | print i, 25 | print 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #### 内容简述 2 | 这只是对多标签分类的一个练手的代码 3 | 4 | 数据爬取自豆瓣,收集了近两万部电影的标签和简介内容 5 | 6 | 使用的是scikit中的分类方法。中文分词使用的结巴分词。 7 | 8 | #### 文件内容 9 | 1. labels_summary.txt 保存的是电影数据,包含标签、和简介 10 | 2. labels.txt 保存的是数据中出现的labels,对于冷门的标签进行了提出 11 | 3. MultiLabelMovie.py 包含了构建VSM模型、分类器等,是项目的主要代码 12 | -------------------------------------------------------------------------------- /analysis_data.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | __author__ = 'Jeremy' 4 | 5 | 6 | def get_labels_set(): 7 | fr = open("labels_summary.txt","r") 8 | labels_dict = {} 9 | for line in fr.readlines(): 10 | line = line.rstrip() 11 | line_datas = line.split("--") 12 | labels = line_datas[-2].split(' ') 13 | for label in labels: 14 | if label not in labels_dict.keys(): 15 | labels_dict.setdefault(label, 1) 16 | else: 17 | labels_dict[label] +=1 18 | 19 | for key, value in labels_dict.items(): 20 | print key, value 21 | print "Done" 22 | 23 | if __name__ == "__main__": 24 | get_labels_set() -------------------------------------------------------------------------------- /labels.txt: -------------------------------------------------------------------------------- 1 | 剧情 2 | 喜剧 3 | 爱情 4 | 惊悚 5 | 动作 6 | 犯罪 7 | 恐怖 8 | 动画 9 | 冒险 10 | 短片 11 | 悬疑 12 | 奇幻 13 | 科幻 14 | 家庭 15 | 纪录片 16 | 战争 17 | 音乐 18 | 传记 19 | 历史 20 | 同性 21 | 运动 22 | 歌舞 23 | 西部 24 | 儿童 25 | 古装 26 | 情色 27 | 武侠 28 | -------------------------------------------------------------------------------- /score.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NobodyWHU/MultiLabelClassification/5a820fd1e504993b4948116d8357b8b36bec0777/score.txt --------------------------------------------------------------------------------