├── LICENSE ├── README.md ├── project ├── 1-process_data │ ├── 1-README.txt │ ├── CSVtolabel.py │ ├── jieba_cut_fliter.py │ └── toCSV.py ├── 2-word2vec │ ├── 1-README.txt │ ├── class_w2v.py │ ├── main.py │ └── preprocess.py ├── 3-main │ ├── README.txt │ ├── STFIWF.py │ ├── classify.py │ ├── main.py │ └── preprocess.py └── README.txt └── 答辩ppt -.ppt /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 陈潇凯 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 2016CCF_BDCI_Sougou 2 | 【The Right团队-源码以及PPT分享】2016CCF大数据与计算智能大赛:精准营销中搜狗用户画像挖掘 3 | 4 | 具体详见我的博客: 5 | [传送门](http://coderskychen.cn/2016/12/28/%E3%80%90%E5%B9%B2%E8%B4%A7%E5%88%86%E4%BA%AB%E3%80%912016CCF%E5%A4%A7%E6%95%B0%E6%8D%AE%E4%B8%8E%E8%AE%A1%E7%AE%97%E6%99%BA%E8%83%BD%E5%A4%A7%E8%B5%9B-%E6%90%9C%E7%8B%97%E7%94%A8%E6%88%B7%E7%94%BB%E5%83%8F%E6%8C%96%E6%8E%98/) 6 | 7 | 复赛数据下载链接: 8 | http://pan.baidu.com/s/1mi9DjIg 9 | 密码:g8i9 10 | 11 | 初识python,代码写的很粗糙,多多包涵~ 12 | -------------------------------------------------------------------------------- /project/1-process_data/1-README.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderSkyChen/2016CCF_BDCI_Sougou/d49c94be714265adbad27ece0d84658264b1a3d4/project/1-process_data/1-README.txt -------------------------------------------------------------------------------- /project/1-process_data/CSVtolabel.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | 根据上一步骤得到的CSV文件,将搜索文本以及三个属性剥离,保存为相应的文件 4 | 注意路径 5 | """ 6 | import pandas as pd 7 | 8 | #path of the train and test files 9 | trainname = 'user_tag_query.10W.TRAIN.csv' 10 | testname = 'user_tag_query.10W.TEST.csv' 11 | 12 | data = pd.read_csv(trainname) 13 | print data.info() 14 | 15 | #generate three labels for age/gender/education 16 | data.age.to_csv("train_age.csv", index=False) 17 | data.Gender.to_csv("train_gender.csv", index=False) 18 | data.Education.to_csv("train_education.csv", index=False) 19 | #generate trainfile's text file 20 | data.QueryList.to_csv("train_querylist.csv", index=False) 21 | 22 | data = pd.read_csv(testname) 23 | print data.info() 24 | #generate testfile's text file 25 | 
data.QueryList.to_csv("test_querylist.csv", index=False) 26 | 27 | 28 | -------------------------------------------------------------------------------- /project/1-process_data/jieba_cut_fliter.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | 调用jieba分词,完成搜索文本的分词。同时只保留n,v,j三种词性。 4 | 注意输入的文件为纯文本格式,最好一个用户的搜索历史为一行。 5 | 注意路径 6 | """ 7 | import pandas as pd 8 | import jieba.analyse 9 | import time 10 | import jieba 11 | import jieba.posseg 12 | import os, sys 13 | reload(sys) 14 | sys.setdefaultencoding('utf8') 15 | def input(trainname): 16 | traindata = [] 17 | with open(trainname, 'rb') as f: 18 | line = f.readline() 19 | count = 0 20 | while line: 21 | try: 22 | traindata.append(line) 23 | count += 1 24 | except: 25 | print "error:", line, count 26 | line=f.readline() 27 | return traindata 28 | start = time.clock() 29 | 30 | filepath = 'train.csv' 31 | QueryList = input(filepath) 32 | 33 | writepath = 'writefile.csv' 34 | csvfile = open(writepath, 'w') 35 | #parallel:speed up 36 | jieba.enable_parallel() 37 | POS = {} 38 | for i in range(len(QueryList)): 39 | s = [] 40 | str = "" 41 | words = jieba.posseg.cut(QueryList[i])# 带有词性的精确分词模式 42 | allowPOS = ['n','v','j'] 43 | for word, flag in words: 44 | POS[flag]=POS.get(flag,0)+1 45 | if (flag[0] in allowPOS) and len(word)>=2: 46 | str += word + " " 47 | s.append(str.encode('utf8')) 48 | csvfile.write(" ".join(s)+'\n') 49 | csvfile.close() 50 | print POS 51 | 52 | end = time.clock() 53 | print "total time: %f s" % (end - start) 54 | 55 | 56 | # seg_list = jieba.cut("陶喆下载", cut_all=False) 57 | # print("Default Mode: " + "/ ".join(seg_list)) # 默认模式 58 | # 59 | # words = jieba.posseg.cut("陶喆下载") 60 | # for word, flag in words: 61 | # print('%s %s' % (word, flag)) 62 | -------------------------------------------------------------------------------- /project/1-process_data/toCSV.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | add变量表示了原始文件的路径,TRAIN/TEST 4 | csvfile表示了生成文件的信息 5 | 主要功能:把原始文件转为UTF-8格式 6 | 注意路径 7 | """ 8 | import csv 9 | 10 | add = 'user_tag_query.10W.TRAIN' #path of the original train file 11 | 12 | csvfile = file(add + '.csv', 'wb')# the path of the generated train file 13 | writer = csv.writer(csvfile) 14 | writer.writerow(['ID', 'age', 'Gender', 'Education', 'QueryList']) 15 | with open(add, 'r') as f: 16 | for line in f: 17 | line.strip() 18 | data = line.split("\t") 19 | writedata = [data[0], data[1], data[2], data[3]] 20 | querystr = '' 21 | data[-1]=data[-1][:-1] 22 | for d in data[4:]: 23 | try: 24 | querystr += d.decode('GB18030').encode('utf8') + '\t' 25 | except: 26 | print data[0],querystr 27 | querystr = querystr[:-1] 28 | writedata.append(querystr) 29 | writer.writerow(writedata) 30 | 31 | add = 'user_tag_query.10W.TEST'#path of the original test file 32 | 33 | csvfile = file(add + '.csv', 'wb')# the path of the generated test file 34 | writer = csv.writer(csvfile) 35 | writer.writerow(['ID', 'QueryList']) 36 | with open(add, 'r') as f: 37 | for line in f: 38 | data = line.split("\t") 39 | writedata = [data[0]] 40 | querystr = '' 41 | data[-1]=data[-1][:-1] 42 | for d in data[1:]: 43 | try: 44 | querystr += d.decode('GB18030').encode('utf8') + '\t' 45 | except: 46 | print data[0],querystr 47 | querystr = querystr[:-1] 48 | writedata.append(querystr) 49 | writer.writerow(writedata) 50 | 51 | -------------------------------------------------------------------------------- 
/project/2-word2vec/1-README.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderSkyChen/2016CCF_BDCI_Sougou/d49c94be714265adbad27ece0d84658264b1a3d4/project/2-word2vec/1-README.txt -------------------------------------------------------------------------------- /project/2-word2vec/class_w2v.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from sklearn.cross_validation import KFold, StratifiedKFold 3 | from gensim.models import word2vec 4 | import xgboost as xgb 5 | import numpy as np 6 | from sklearn.linear_model import SGDClassifier, LogisticRegression 7 | from sklearn.svm import LinearSVC,SVC 8 | from sklearn.ensemble import VotingClassifier 9 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB 10 | from sklearn.preprocessing import MinMaxScaler,StandardScaler 11 | class w2v(): 12 | def __init__(self,size=300): 13 | random_rate = 8240 14 | self.size=size 15 | self.svc= SVC(C=1, random_state=random_rate) 16 | self.LR = LogisticRegression(C=1.0, max_iter=100, class_weight='balanced', random_state=random_rate, n_jobs=-1) 17 | self.clf = LinearSVC(random_state=random_rate) 18 | 19 | def fit(self, X, Y, T): 20 | """ 21 | train and predict 22 | """ 23 | print 'fitting..' 24 | self.LR.fit(X, Y) 25 | res = self.LR.predict(T) 26 | return res 27 | 28 | def validation(self,X,Y,kind): 29 | """ 30 | 31 | 使用2-fold进行验证 32 | """ 33 | print 'validating...' 34 | fold_n=2 35 | folds = list(StratifiedKFold(Y, n_folds=fold_n, random_state=0)) 36 | score=np.zeros(fold_n) 37 | for j, (train_idx, test_idx) in enumerate(folds): 38 | print j + 1, '-fold' 39 | X_train = X[train_idx] 40 | y_train = Y[train_idx] 41 | X_test = X[test_idx] 42 | y_test = Y[test_idx] 43 | 44 | res = self.fit(X_train, y_train, X_test) 45 | cur = sum(y_test == res) * 1.0 / len(res) 46 | score[j] = cur 47 | print score, score.mean() 48 | return score.mean() 49 | 50 | def train_w2v(self, filename): 51 | """ 52 | 训练wv模型 53 | :param filename:path 54 | :return:none 55 | """ 56 | sentences = word2vec.LineSentence(filename) # 加载语料,要求语料为“一行一文本”的格式 57 | print '正在训练w2v 针对语料:',str(filename) 58 | print 'size is: ',self.size 59 | model = word2vec.Word2Vec(sentences, size=self.size, window=100,workers=48) # 训练模型; 注意参数window 对结果有影响 一般5-100 60 | savepath = '20w_size_win100_' + str(self.size)+'.model' # 保存model的路径 61 | print '训练完毕,已保存: ', savepath, 62 | model.save(savepath) 63 | def load_trainsform(self,X): 64 | """ 65 | 载入模型,并且生成wv向量 66 | :param X:读入的文档,list 67 | :return:np.array 68 | """ 69 | print '载入模型中' 70 | model = word2vec.Word2Vec.load('20w_size_win100_300.model') #填写你的路径 71 | print '加载成功' 72 | res=np.zeros((len(X),self.size)) 73 | print '生成w2v向量中..' 
74 | for i,line in enumerate(X): 75 | line=line.decode('utf-8') 76 | terms=line.split() 77 | count=0 78 | for j,term in enumerate(terms): 79 | try:#---try失败说明X中有单词不在model中,训练的时候model的模型是min_count的 忽略了一部分单词 80 | count += 1 81 | res[i]+=np.array(model[term]) 82 | except: 83 | 1 == 1 84 | if count!=0: 85 | res[i]=res[i]/float(count) # 求均值 86 | return res 87 | 88 | -------------------------------------------------------------------------------- /project/2-word2vec/main.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import class_w2v 3 | import preprocess 4 | import numpy as np 5 | import csv 6 | 7 | def input(trainname): 8 | """ 9 | load file 10 | :param trainname:path 11 | :return: list 12 | """ 13 | traindata = [] 14 | with open(trainname, 'rb') as f: 15 | reader = csv.reader(f) 16 | count = 0 17 | for line in reader: 18 | try: 19 | traindata.append(line[0]) 20 | count += 1 21 | except: 22 | print "error:", line, count 23 | traindata.append(" ") 24 | return traindata 25 | if __name__ == '__main__': 26 | """ 27 | 使用方法:先训练wv的model,然后再生成wv的向量,最后可以使用2-fold验证效果 28 | 主要目的:生成WV向量,提供给下一个步骤:特征融合。 29 | 注意路径 30 | """ 31 | print '---------w2v----------' 32 | # order = 'train w2v model' 33 | # order='getvec' 34 | order = 'test' 35 | 36 | print 'order is', order 37 | 38 | classob = class_w2v.w2v(300) 39 | 40 | if order == 'train w2v model': #训练WV的model 41 | totalname = 'jieba_total_cut.csv' #纯文本文件路径 42 | classob.train_w2v(totalname) 43 | exit() 44 | elif order == 'getvec': #利用生成的model得到文档的WV的向量,使用求和平均法 45 | trainname = 'jieba_train_cut.csv' 46 | testname = 'jieba_test_cut.csv' 47 | traindata = input(trainname) 48 | testdata = input(testname) 49 | 50 | res1 = classob.load_trainsform(traindata) 51 | res2 = classob.load_trainsform(testdata) 52 | print res1.shape,res2.shape 53 | np.save('wv300_win100.train.npy', res1)#保存生成的向量 54 | np.save('wv300_win100.test.npy', res2) 55 | exit() 56 | 57 | 58 | #以下为测试wv向量,即仅仅使用wv向量做这个比赛,目的在于寻找最好参数的WV向量 59 | print '载入所有的w2v向量中..' 60 | w2vtrain = np.load('wv300_win100.train.npy') 61 | w2vtest = np.load('wv300_win100.test.npy') 62 | 63 | #防止出现非法值 64 | if np.any((np.isnan(w2vtrain))): 65 | print 'nan to num!' 66 | w2vtrain = np.nan_to_num(w2vtrain) 67 | 68 | if np.any((np.isnan(w2vtest))): 69 | print 'nan to num!' 70 | w2vtest = np.nan_to_num(w2vtest) 71 | 72 | #载入label文件 73 | label_genderfile_path = 'train_gender.csv' 74 | label_agefile_path = 'train_age.csv' 75 | label_edufile_path = 'train_education.csv' 76 | genderdata = np.loadtxt(open(label_genderfile_path, 'r')).astype(int) 77 | agedata = np.loadtxt(open(label_agefile_path, 'r')).astype(int) 78 | educationdata = np.loadtxt(open(label_edufile_path, 'r')).astype(int) 79 | 80 | print '预处理中..' 81 | preprocessob = preprocess.preprocess() 82 | gender_traindatas, genderlabel = preprocessob.removezero(w2vtrain, genderdata) 83 | age_traindatas, agelabel = preprocessob.removezero(w2vtrain, agedata) 84 | edu_traindatas, edulabel = preprocessob.removezero(w2vtrain, educationdata) 85 | # ------------------------------------------------------ 86 | 87 | if order == 'test': #使用2-fold进行验证 88 | res1 = classob.validation(gender_traindatas, genderlabel, kind='gender') 89 | res2 = classob.validation(age_traindatas, agelabel, kind='age') 90 | res3 = classob.validation(edu_traindatas, edulabel, kind='edu') 91 | print 'avg is:', (res1+res2+res3)/3.0 92 | else: 93 | print 'error!' 
94 | exit() 95 | 96 | -------------------------------------------------------------------------------- /project/2-word2vec/preprocess.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import numpy as np 3 | class preprocess(): 4 | # 主要功能:去除缺失值 5 | def removezero(self, x, y): 6 | nozero = np.nonzero(y) 7 | y = y[nozero] 8 | x = np.array(x) 9 | x = x[nozero] 10 | return x, y 11 | -------------------------------------------------------------------------------- /project/3-main/README.txt: -------------------------------------------------------------------------------- 1 | 主要功能:该部分为本次比赛的核心,包括了特征词加权、模型集成stacking和W2V的融合等等。 2 | 3 | 环境说明: 4 | python2.7_64bit:以及必要的sklearn、numpy等工具包 5 | 6 | 文件说明: 7 | main.py: 主调函数 8 | preprocess.py:预处理类,主要是去除缺失值 9 | classify.py: 实现主要功能的类文件,完成预测和交叉验证。 10 | STFIWF.py: S-TFIWF加权的实现,被classify调用。该类基于sklearn.feature_extraction.text 我们根据提出的公式对IDF以及TF的部分进行了修改,具体在1093-1176行 11 | 12 | 13 | -------------------------------------------------------------------------------- /project/3-main/STFIWF.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # coding=utf-8 3 | 4 | from __future__ import unicode_literals 5 | 6 | import array 7 | from collections import Mapping, defaultdict 8 | import numbers 9 | from operator import itemgetter 10 | import re 11 | import unicodedata 12 | 13 | import numpy as np 14 | import scipy.sparse as sp 15 | 16 | from sklearn.base import BaseEstimator, TransformerMixin 17 | from sklearn.externals import six 18 | from sklearn.externals.six.moves import xrange 19 | from sklearn.preprocessing import normalize 20 | from sklearn.feature_extraction.hashing import FeatureHasher 21 | from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS 22 | from sklearn.utils import deprecated 23 | from sklearn.utils.fixes import frombuffer_empty, bincount 24 | from sklearn.utils.validation import check_is_fitted 25 | from sklearn import preprocessing 26 | 27 | __all__ = ['CountVectorizer', 28 | 'ENGLISH_STOP_WORDS', 29 | 'TfidfTransformer', 30 | 'TfidfVectorizer', 31 | 'strip_accents_ascii', 32 | 'strip_accents_unicode', 33 | 'strip_tags'] 34 | 35 | def strip_accents_unicode(s): 36 | """Transform accentuated unicode symbols into their simple counterpart 37 | 38 | Warning: the python-level loop and join operations make this 39 | implementation 20 times slower than the strip_accents_ascii basic 40 | normalization. 41 | 42 | See also 43 | -------- 44 | strip_accents_ascii 45 | Remove accentuated char for any unicode symbol that has a direct 46 | ASCII equivalent. 47 | """ 48 | normalized = unicodedata.normalize('NFKD', s) 49 | if normalized == s: 50 | return s 51 | else: 52 | return ''.join([c for c in normalized if not unicodedata.combining(c)]) 53 | 54 | def strip_accents_ascii(s): 55 | """Transform accentuated unicode symbols into ascii or nothing 56 | 57 | Warning: this solution is only suited for languages that have a direct 58 | transliteration to ASCII symbols. 59 | 60 | See also 61 | -------- 62 | strip_accents_unicode 63 | Remove accentuated char for any unicode symbol. 64 | """ 65 | nkfd_form = unicodedata.normalize('NFKD', s) 66 | return nkfd_form.encode('ASCII', 'ignore').decode('ASCII') 67 | 68 | def strip_tags(s): 69 | """Basic regexp based HTML / XML tag stripper function 70 | 71 | For serious HTML/XML preprocessing you should rather use an external 72 | library such as lxml or BeautifulSoup. 
73 | """ 74 | return re.compile(r"<([^>]+)>", flags=re.UNICODE).sub(" ", s) 75 | 76 | def _check_stop_list(stop): 77 | if stop == "english": 78 | return ENGLISH_STOP_WORDS 79 | elif isinstance(stop, six.string_types): 80 | raise ValueError("not a built-in stop list: %s" % stop) 81 | elif stop is None: 82 | return None 83 | else: # assume it's a collection 84 | return frozenset(stop) 85 | 86 | class VectorizerMixin(object): 87 | """Provides common code for text vectorizers (tokenization logic).""" 88 | 89 | _white_spaces = re.compile(r"\s\s+") 90 | 91 | def decode(self, doc): 92 | """Decode the input into a string of unicode symbols 93 | 94 | The decoding strategy depends on the vectorizer parameters. 95 | """ 96 | if self.input == 'filename': 97 | with open(doc, 'rb') as fh: 98 | doc = fh.read() 99 | 100 | elif self.input == 'file': 101 | doc = doc.read() 102 | 103 | if isinstance(doc, bytes): 104 | doc = doc.decode(self.encoding, self.decode_error) 105 | 106 | if doc is np.nan: 107 | raise ValueError("np.nan is an invalid document, expected byte or " 108 | "unicode string.") 109 | 110 | return doc 111 | 112 | def _word_ngrams(self, tokens, stop_words=None): 113 | """Turn tokens into a sequence of n-grams after stop words filtering""" 114 | # handle stop words 115 | if stop_words is not None: 116 | tokens = [w for w in tokens if w not in stop_words] 117 | 118 | # handle token n-grams 119 | min_n, max_n = self.ngram_range 120 | if max_n != 1: 121 | original_tokens = tokens 122 | tokens = [] 123 | n_original_tokens = len(original_tokens) 124 | for n in xrange(min_n, 125 | min(max_n + 1, n_original_tokens + 1)): 126 | for i in xrange(n_original_tokens - n + 1): 127 | tokens.append(" ".join(original_tokens[i: i + n])) 128 | 129 | return tokens 130 | 131 | def _char_ngrams(self, text_document): 132 | """Tokenize text_document into a sequence of character n-grams""" 133 | # normalize white spaces 134 | text_document = self._white_spaces.sub(" ", text_document) 135 | 136 | text_len = len(text_document) 137 | ngrams = [] 138 | min_n, max_n = self.ngram_range 139 | for n in xrange(min_n, min(max_n + 1, text_len + 1)): 140 | for i in xrange(text_len - n + 1): 141 | ngrams.append(text_document[i: i + n]) 142 | return ngrams 143 | 144 | def _char_wb_ngrams(self, text_document): 145 | """Whitespace sensitive char-n-gram tokenization. 146 | 147 | Tokenize text_document into a sequence of character n-grams 148 | excluding any whitespace (operating only inside word boundaries)""" 149 | # normalize white spaces 150 | text_document = self._white_spaces.sub(" ", text_document) 151 | 152 | min_n, max_n = self.ngram_range 153 | ngrams = [] 154 | for w in text_document.split(): 155 | w = ' ' + w + ' ' 156 | w_len = len(w) 157 | for n in xrange(min_n, max_n + 1): 158 | offset = 0 159 | ngrams.append(w[offset:offset + n]) 160 | while offset + n < w_len: 161 | offset += 1 162 | ngrams.append(w[offset:offset + n]) 163 | if offset == 0: # count a short word (w_len < n) only once 164 | break 165 | return ngrams 166 | 167 | def build_preprocessor(self): 168 | """Return a function to preprocess the text before tokenization""" 169 | if self.preprocessor is not None: 170 | return self.preprocessor 171 | 172 | # unfortunately python functools package does not have an efficient 173 | # `compose` function that would have allowed us to chain a dynamic 174 | # number of functions. 
However the cost of a lambda call is a few 175 | # hundreds of nanoseconds which is negligible when compared to the 176 | # cost of tokenizing a string of 1000 chars for instance. 177 | noop = lambda x: x 178 | 179 | # accent stripping 180 | if not self.strip_accents: 181 | strip_accents = noop 182 | elif callable(self.strip_accents): 183 | strip_accents = self.strip_accents 184 | elif self.strip_accents == 'ascii': 185 | strip_accents = strip_accents_ascii 186 | elif self.strip_accents == 'unicode': 187 | strip_accents = strip_accents_unicode 188 | else: 189 | raise ValueError('Invalid value for "strip_accents": %s' % 190 | self.strip_accents) 191 | 192 | if self.lowercase: 193 | return lambda x: strip_accents(x.lower()) 194 | else: 195 | return strip_accents 196 | 197 | def build_tokenizer(self): 198 | """Return a function that splits a string into a sequence of tokens""" 199 | if self.tokenizer is not None: 200 | return self.tokenizer 201 | token_pattern = re.compile(self.token_pattern) 202 | return lambda doc: token_pattern.findall(doc) 203 | 204 | def get_stop_words(self): 205 | """Build or fetch the effective stop words list""" 206 | return _check_stop_list(self.stop_words) 207 | 208 | def build_analyzer(self): 209 | """Return a callable that handles preprocessing and tokenization""" 210 | if callable(self.analyzer): 211 | return self.analyzer 212 | 213 | preprocess = self.build_preprocessor() 214 | 215 | if self.analyzer == 'char': 216 | return lambda doc: self._char_ngrams(preprocess(self.decode(doc))) 217 | 218 | elif self.analyzer == 'char_wb': 219 | return lambda doc: self._char_wb_ngrams( 220 | preprocess(self.decode(doc))) 221 | 222 | elif self.analyzer == 'word': 223 | stop_words = self.get_stop_words() 224 | tokenize = self.build_tokenizer() 225 | 226 | return lambda doc: self._word_ngrams( 227 | tokenize(preprocess(self.decode(doc))), stop_words) 228 | 229 | else: 230 | raise ValueError('%s is not a valid tokenization scheme/analyzer' % 231 | self.analyzer) 232 | 233 | def _validate_vocabulary(self): 234 | vocabulary = self.vocabulary 235 | if vocabulary is not None: 236 | if isinstance(vocabulary, set): 237 | vocabulary = sorted(vocabulary) 238 | if not isinstance(vocabulary, Mapping): 239 | vocab = {} 240 | for i, t in enumerate(vocabulary): 241 | if vocab.setdefault(t, i) != i: 242 | msg = "Duplicate term in vocabulary: %r" % t 243 | raise ValueError(msg) 244 | vocabulary = vocab 245 | else: 246 | indices = set(six.itervalues(vocabulary)) 247 | if len(indices) != len(vocabulary): 248 | raise ValueError("Vocabulary contains repeated indices.") 249 | for i in xrange(len(vocabulary)): 250 | if i not in indices: 251 | msg = ("Vocabulary of size %d doesn't contain index " 252 | "%d." % (len(vocabulary), i)) 253 | raise ValueError(msg) 254 | if not vocabulary: 255 | raise ValueError("empty vocabulary passed to fit") 256 | self.fixed_vocabulary_ = True 257 | self.vocabulary_ = dict(vocabulary) 258 | else: 259 | self.fixed_vocabulary_ = False 260 | 261 | def _check_vocabulary(self): 262 | """Check if vocabulary is empty or missing (not fit-ed)""" 263 | msg = "%(name)s - Vocabulary wasn't fitted." 
264 | check_is_fitted(self, 'vocabulary_', msg=msg), 265 | 266 | if len(self.vocabulary_) == 0: 267 | raise ValueError("Vocabulary is empty") 268 | 269 | class HashingVectorizer(BaseEstimator, VectorizerMixin): 270 | """Convert a collection of text documents to a matrix of token occurrences 271 | 272 | It turns a collection of text documents into a scipy.sparse matrix holding 273 | token occurrence counts (or binary occurrence information), possibly 274 | normalized as token frequencies if norm='l1' or projected on the euclidean 275 | unit sphere if norm='l2'. 276 | 277 | This text vectorizer implementation uses the hashing trick to find the 278 | token string name to feature integer index mapping. 279 | 280 | This strategy has several advantages: 281 | 282 | - it is very low memory scalable to large datasets as there is no need to 283 | store a vocabulary dictionary in memory 284 | 285 | - it is fast to pickle and un-pickle as it holds no state besides the 286 | constructor parameters 287 | 288 | - it can be used in a streaming (partial fit) or parallel pipeline as there 289 | is no state computed during fit. 290 | 291 | There are also a couple of cons (vs using a CountVectorizer with an 292 | in-memory vocabulary): 293 | 294 | - there is no way to compute the inverse transform (from feature indices to 295 | string feature names) which can be a problem when trying to introspect 296 | which features are most important to a model. 297 | 298 | - there can be collisions: distinct tokens can be mapped to the same 299 | feature index. However in practice this is rarely an issue if n_features 300 | is large enough (e.g. 2 ** 18 for text classification problems). 301 | 302 | - no IDF weighting as this would render the transformer stateful. 303 | 304 | The hash function employed is the signed 32-bit version of Murmurhash3. 305 | 306 | Read more in the :ref:`User Guide `. 307 | 308 | Parameters 309 | ---------- 310 | 311 | input : string {'filename', 'file', 'content'} 312 | If 'filename', the sequence passed as an argument to fit is 313 | expected to be a list of filenames that need reading to fetch 314 | the raw content to analyze. 315 | 316 | If 'file', the sequence items must have a 'read' method (file-like 317 | object) that is called to fetch the bytes in memory. 318 | 319 | Otherwise the input is expected to be the sequence strings or 320 | bytes items are expected to be analyzed directly. 321 | 322 | encoding : string, default='utf-8' 323 | If bytes or files are given to analyze, this encoding is used to 324 | decode. 325 | 326 | decode_error : {'strict', 'ignore', 'replace'} 327 | Instruction on what to do if a byte sequence is given to analyze that 328 | contains characters not of the given `encoding`. By default, it is 329 | 'strict', meaning that a UnicodeDecodeError will be raised. Other 330 | values are 'ignore' and 'replace'. 331 | 332 | strip_accents : {'ascii', 'unicode', None} 333 | Remove accents during the preprocessing step. 334 | 'ascii' is a fast method that only works on characters that have 335 | an direct ASCII mapping. 336 | 'unicode' is a slightly slower method that works on any characters. 337 | None (default) does nothing. 338 | 339 | analyzer : string, {'word', 'char', 'char_wb'} or callable 340 | Whether the feature should be made of word or character n-grams. 341 | Option 'char_wb' creates character n-grams only from text inside 342 | word boundaries. 
343 | 344 | If a callable is passed it is used to extract the sequence of features 345 | out of the raw, unprocessed input. 346 | 347 | preprocessor : callable or None (default) 348 | Override the preprocessing (string transformation) stage while 349 | preserving the tokenizing and n-grams generation steps. 350 | 351 | tokenizer : callable or None (default) 352 | Override the string tokenization step while preserving the 353 | preprocessing and n-grams generation steps. 354 | Only applies if ``analyzer == 'word'``. 355 | 356 | ngram_range : tuple (min_n, max_n), default=(1, 1) 357 | The lower and upper boundary of the range of n-values for different 358 | n-grams to be extracted. All values of n such that min_n <= n <= max_n 359 | will be used. 360 | 361 | stop_words : string {'english'}, list, or None (default) 362 | If 'english', a built-in stop word list for English is used. 363 | 364 | If a list, that list is assumed to contain stop words, all of which 365 | will be removed from the resulting tokens. 366 | Only applies if ``analyzer == 'word'``. 367 | 368 | lowercase : boolean, default=True 369 | Convert all characters to lowercase before tokenizing. 370 | 371 | token_pattern : string 372 | Regular expression denoting what constitutes a "token", only used 373 | if ``analyzer == 'word'``. The default regexp selects tokens of 2 374 | or more alphanumeric characters (punctuation is completely ignored 375 | and always treated as a token separator). 376 | 377 | n_features : integer, default=(2 ** 20) 378 | The number of features (columns) in the output matrices. Small numbers 379 | of features are likely to cause hash collisions, but large numbers 380 | will cause larger coefficient dimensions in linear learners. 381 | 382 | norm : 'l1', 'l2' or None, optional 383 | Norm used to normalize term vectors. None for no normalization. 384 | 385 | binary: boolean, default=False. 386 | If True, all non zero counts are set to 1. This is useful for discrete 387 | probabilistic models that model binary events rather than integer 388 | counts. 389 | 390 | dtype: type, optional 391 | Type of the matrix returned by fit_transform() or transform(). 392 | 393 | non_negative : boolean, default=False 394 | Whether output matrices should contain non-negative values only; 395 | effectively calls abs on the matrix prior to returning it. 396 | When True, output values can be interpreted as frequencies. 397 | When False, output values will have expected value zero. 
398 | 399 | See also 400 | -------- 401 | CountVectorizer, TfidfVectorizer 402 | 403 | """ 404 | 405 | def __init__(self, input='content', encoding='utf-8', 406 | decode_error='strict', strip_accents=None, 407 | lowercase=True, preprocessor=None, tokenizer=None, 408 | stop_words=None, token_pattern=r"(?u)\b\w\w+\b", 409 | ngram_range=(1, 1), analyzer='word', n_features=(2 ** 20), 410 | binary=False, norm='l2', non_negative=False, 411 | dtype=np.float64): 412 | self.input = input 413 | self.encoding = encoding 414 | self.decode_error = decode_error 415 | self.strip_accents = strip_accents 416 | self.preprocessor = preprocessor 417 | self.tokenizer = tokenizer 418 | self.analyzer = analyzer 419 | self.lowercase = lowercase 420 | self.token_pattern = token_pattern 421 | self.stop_words = stop_words 422 | self.n_features = n_features 423 | self.ngram_range = ngram_range 424 | self.binary = binary 425 | self.norm = norm 426 | self.non_negative = non_negative 427 | self.dtype = dtype 428 | 429 | def partial_fit(self, X, y=None): 430 | """Does nothing: this transformer is stateless. 431 | 432 | This method is just there to mark the fact that this transformer 433 | can work in a streaming setup. 434 | 435 | """ 436 | return self 437 | 438 | def fit(self, X, y=None): 439 | """Does nothing: this transformer is stateless.""" 440 | # triggers a parameter validation 441 | self._get_hasher().fit(X, y=y) 442 | return self 443 | 444 | def transform(self, X, y=None): 445 | """Transform a sequence of documents to a document-term matrix. 446 | 447 | Parameters 448 | ---------- 449 | X : iterable over raw text documents, length = n_samples 450 | Samples. Each sample must be a text document (either bytes or 451 | unicode strings, file name or file object depending on the 452 | constructor argument) which will be tokenized and hashed. 453 | 454 | y : (ignored) 455 | 456 | Returns 457 | ------- 458 | X : scipy.sparse matrix, shape = (n_samples, self.n_features) 459 | Document-term matrix. 460 | 461 | """ 462 | analyzer = self.build_analyzer() 463 | X = self._get_hasher().transform(analyzer(doc) for doc in X) 464 | if self.binary: 465 | X.data.fill(1) 466 | if self.norm is not None: 467 | X = normalize(X, norm=self.norm, copy=False) 468 | return X 469 | 470 | # Alias transform to fit_transform for convenience 471 | fit_transform = transform 472 | 473 | def _get_hasher(self): 474 | return FeatureHasher(n_features=self.n_features, 475 | input_type='string', dtype=self.dtype, 476 | non_negative=self.non_negative) 477 | 478 | def _document_frequency(X): 479 | """Count the number of non-zero values for each feature in sparse X.""" 480 | 481 | if sp.isspmatrix_csr(X): 482 | # return np.sum(X,axis=0) 483 | return bincount(X.indices, minlength=X.shape[1]) 484 | 485 | else: 486 | 487 | return np.diff(sp.csc_matrix(X, copy=False).indptr) 488 | 489 | class CountVectorizer(BaseEstimator, VectorizerMixin): 490 | """Convert a collection of text documents to a matrix of token counts 491 | 492 | This implementation produces a sparse representation of the counts using 493 | scipy.sparse.coo_matrix. 494 | 495 | If you do not provide an a-priori dictionary and you do not use an analyzer 496 | that does some kind of feature selection then the number of features will 497 | be equal to the vocabulary size found by analyzing the data. 498 | 499 | Read more in the :ref:`User Guide `. 
500 | 501 | Parameters 502 | ---------- 503 | input : string {'filename', 'file', 'content'} 504 | If 'filename', the sequence passed as an argument to fit is 505 | expected to be a list of filenames that need reading to fetch 506 | the raw content to analyze. 507 | 508 | If 'file', the sequence items must have a 'read' method (file-like 509 | object) that is called to fetch the bytes in memory. 510 | 511 | Otherwise the input is expected to be the sequence strings or 512 | bytes items are expected to be analyzed directly. 513 | 514 | encoding : string, 'utf-8' by default. 515 | If bytes or files are given to analyze, this encoding is used to 516 | decode. 517 | 518 | decode_error : {'strict', 'ignore', 'replace'} 519 | Instruction on what to do if a byte sequence is given to analyze that 520 | contains characters not of the given `encoding`. By default, it is 521 | 'strict', meaning that a UnicodeDecodeError will be raised. Other 522 | values are 'ignore' and 'replace'. 523 | 524 | strip_accents : {'ascii', 'unicode', None} 525 | Remove accents during the preprocessing step. 526 | 'ascii' is a fast method that only works on characters that have 527 | an direct ASCII mapping. 528 | 'unicode' is a slightly slower method that works on any characters. 529 | None (default) does nothing. 530 | 531 | analyzer : string, {'word', 'char', 'char_wb'} or callable 532 | Whether the feature should be made of word or character n-grams. 533 | Option 'char_wb' creates character n-grams only from text inside 534 | word boundaries. 535 | 536 | If a callable is passed it is used to extract the sequence of features 537 | out of the raw, unprocessed input. 538 | 539 | preprocessor : callable or None (default) 540 | Override the preprocessing (string transformation) stage while 541 | preserving the tokenizing and n-grams generation steps. 542 | 543 | tokenizer : callable or None (default) 544 | Override the string tokenization step while preserving the 545 | preprocessing and n-grams generation steps. 546 | Only applies if ``analyzer == 'word'``. 547 | 548 | ngram_range : tuple (min_n, max_n) 549 | The lower and upper boundary of the range of n-values for different 550 | n-grams to be extracted. All values of n such that min_n <= n <= max_n 551 | will be used. 552 | 553 | stop_words : string {'english'}, list, or None (default) 554 | If 'english', a built-in stop word list for English is used. 555 | 556 | If a list, that list is assumed to contain stop words, all of which 557 | will be removed from the resulting tokens. 558 | Only applies if ``analyzer == 'word'``. 559 | 560 | If None, no stop words will be used. max_df can be set to a value 561 | in the range [0.7, 1.0) to automatically detect and filter stop 562 | words based on intra corpus document frequency of terms. 563 | 564 | lowercase : boolean, True by default 565 | Convert all characters to lowercase before tokenizing. 566 | 567 | token_pattern : string 568 | Regular expression denoting what constitutes a "token", only used 569 | if ``analyzer == 'word'``. The default regexp select tokens of 2 570 | or more alphanumeric characters (punctuation is completely ignored 571 | and always treated as a token separator). 572 | 573 | max_df : float in range [0.0, 1.0] or int, default=1.0 574 | When building the vocabulary ignore terms that have a document 575 | frequency strictly higher than the given threshold (corpus-specific 576 | stop words). 577 | If float, the parameter represents a proportion of documents, integer 578 | absolute counts. 
579 | This parameter is ignored if vocabulary is not None. 580 | 581 | min_df : float in range [0.0, 1.0] or int, default=1 582 | When building the vocabulary ignore terms that have a document 583 | frequency strictly lower than the given threshold. This value is also 584 | called cut-off in the literature. 585 | If float, the parameter represents a proportion of documents, integer 586 | absolute counts. 587 | This parameter is ignored if vocabulary is not None. 588 | 589 | max_features : int or None, default=None 590 | If not None, build a vocabulary that only consider the top 591 | max_features ordered by term frequency across the corpus. 592 | 593 | This parameter is ignored if vocabulary is not None. 594 | 595 | vocabulary : Mapping or iterable, optional 596 | Either a Mapping (e.g., a dict) where keys are terms and values are 597 | indices in the feature matrix, or an iterable over terms. If not 598 | given, a vocabulary is determined from the input documents. Indices 599 | in the mapping should not be repeated and should not have any gap 600 | between 0 and the largest index. 601 | 602 | binary : boolean, default=False 603 | If True, all non zero counts are set to 1. This is useful for discrete 604 | probabilistic models that model binary events rather than integer 605 | counts. 606 | 607 | dtype : type, optional 608 | Type of the matrix returned by fit_transform() or transform(). 609 | 610 | Attributes 611 | ---------- 612 | vocabulary_ : dict 613 | A mapping of terms to feature indices. 614 | 615 | stop_words_ : set 616 | Terms that were ignored because they either: 617 | 618 | - occurred in too many documents (`max_df`) 619 | - occurred in too few documents (`min_df`) 620 | - were cut off by feature selection (`max_features`). 621 | 622 | This is only available if no vocabulary was given. 623 | 624 | See also 625 | -------- 626 | HashingVectorizer, TfidfVectorizer 627 | 628 | Notes 629 | ----- 630 | The ``stop_words_`` attribute can get large and increase the model size 631 | when pickling. This attribute is provided only for introspection and can 632 | be safely removed using delattr or set to None before pickling. 
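    This copy of the scikit-learn class additionally provides ``transform2`` /
    ``_count_vocab_2``, which build within-document term frequencies (each count
    divided by the document's total in-vocabulary token count) instead of raw
    counts, and ``get_term_topic``, which loads a per-term value from a
    'word_topic.txt' file and min-max scales it over the vocabulary.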
633 | """ 634 | 635 | def __init__(self, input='content', encoding='utf-8', 636 | decode_error='strict', strip_accents=None, 637 | lowercase=True, preprocessor=None, tokenizer=None, 638 | stop_words=None, token_pattern=r"(?u)\b\w\w+\b", 639 | ngram_range=(1, 1), analyzer='word', 640 | max_df=1.0, min_df=1, max_features=None, 641 | vocabulary=None, binary=False, dtype=np.int64): 642 | self.input = input 643 | self.encoding = encoding 644 | self.decode_error = decode_error 645 | self.strip_accents = strip_accents 646 | self.preprocessor = preprocessor 647 | self.tokenizer = tokenizer 648 | self.analyzer = analyzer 649 | self.lowercase = lowercase 650 | self.token_pattern = token_pattern 651 | self.stop_words = stop_words 652 | self.max_df = max_df 653 | self.min_df = min_df 654 | if max_df < 0 or min_df < 0: 655 | raise ValueError("negative value for max_df or min_df") 656 | self.max_features = max_features 657 | if max_features is not None: 658 | if (not isinstance(max_features, numbers.Integral) or 659 | max_features <= 0): 660 | raise ValueError( 661 | "max_features=%r, neither a positive integer nor None" 662 | % max_features) 663 | self.ngram_range = ngram_range 664 | self.vocabulary = vocabulary 665 | self.binary = binary 666 | self.dtype = dtype 667 | 668 | def _sort_features(self, X, vocabulary): 669 | """Sort features by name 670 | 671 | Returns a reordered matrix and modifies the vocabulary in place 672 | """ 673 | sorted_features = sorted(six.iteritems(vocabulary)) 674 | map_index = np.empty(len(sorted_features), dtype=np.int32) 675 | for new_val, (term, old_val) in enumerate(sorted_features): 676 | vocabulary[term] = new_val 677 | map_index[old_val] = new_val 678 | 679 | X.indices = map_index.take(X.indices, mode='clip') 680 | return X 681 | 682 | def _limit_features(self, X, vocabulary, high=None, low=None, 683 | limit=None): 684 | """Remove too rare or too common features. 685 | 686 | Prune features that are non zero in more samples than high or less 687 | documents than low, modifying the vocabulary, and restricting it to 688 | at most the limit most frequent. 689 | 690 | This does not prune samples with zero features. 691 | """ 692 | if high is None and low is None and limit is None: 693 | return X, set() 694 | 695 | # Calculate a mask based on document frequencies 696 | dfs = _document_frequency(X) 697 | tfs = np.asarray(X.sum(axis=0)).ravel() 698 | mask = np.ones(len(dfs), dtype=bool) 699 | if high is not None: 700 | mask &= dfs <= high 701 | if low is not None: 702 | mask &= dfs >= low 703 | if limit is not None and mask.sum() > limit: 704 | mask_inds = (-tfs[mask]).argsort()[:limit] 705 | new_mask = np.zeros(len(dfs), dtype=bool) 706 | new_mask[np.where(mask)[0][mask_inds]] = True 707 | mask = new_mask 708 | 709 | new_indices = np.cumsum(mask) - 1 # maps old indices to new 710 | removed_terms = set() 711 | for term, old_index in list(six.iteritems(vocabulary)): 712 | if mask[old_index]: 713 | vocabulary[term] = new_indices[old_index] 714 | else: 715 | del vocabulary[term] 716 | removed_terms.add(term) 717 | kept_indices = np.where(mask)[0] 718 | if len(kept_indices) == 0: 719 | raise ValueError("After pruning, no terms remain. 
Try a lower" 720 | " min_df or a higher max_df.") 721 | return X[:, kept_indices], removed_terms 722 | 723 | def _count_vocab(self, raw_documents, fixed_vocab): 724 | """Create sparse feature matrix, and vocabulary where fixed_vocab=False 725 | """ 726 | if fixed_vocab: 727 | vocabulary = self.vocabulary_ 728 | else: 729 | # Add a new value when a new vocabulary item is seen 730 | vocabulary = defaultdict() 731 | vocabulary.default_factory = vocabulary.__len__ 732 | 733 | analyze = self.build_analyzer() 734 | j_indices = [] 735 | indptr = _make_int_array() 736 | values = _make_int_array() 737 | indptr.append(0) 738 | for doc in raw_documents: 739 | feature_counter = {} 740 | for feature in analyze(doc): 741 | try: 742 | feature_idx = vocabulary[feature] 743 | if feature_idx not in feature_counter: 744 | feature_counter[feature_idx] = 1 745 | else: 746 | feature_counter[feature_idx] += 1 747 | except KeyError: 748 | # Ignore out-of-vocabulary items for fixed_vocab=True 749 | continue 750 | 751 | j_indices.extend(feature_counter.keys()) 752 | values.extend(feature_counter.values()) 753 | indptr.append(len(j_indices)) 754 | 755 | if not fixed_vocab: 756 | # disable defaultdict behaviour 757 | vocabulary = dict(vocabulary) 758 | if not vocabulary: 759 | raise ValueError("empty vocabulary; perhaps the documents only" 760 | " contain stop words") 761 | 762 | j_indices = np.asarray(j_indices, dtype=np.intc) 763 | indptr = np.frombuffer(indptr, dtype=np.intc) 764 | values = frombuffer_empty(values, dtype=np.intc) 765 | 766 | X = sp.csr_matrix((values, j_indices, indptr), 767 | shape=(len(indptr) - 1, len(vocabulary)), 768 | dtype=self.dtype) 769 | X.sort_indices() 770 | return vocabulary, X 771 | 772 | def _count_vocab_2(self, raw_documents, fixed_vocab): 773 | """Create sparse feature matrix, and vocabulary where fixed_vocab=False 774 | """ 775 | if fixed_vocab: 776 | vocabulary = self.vocabulary_ 777 | else: 778 | # Add a new value when a new vocabulary item is seen 779 | vocabulary = defaultdict() 780 | vocabulary.default_factory = vocabulary.__len__ 781 | 782 | analyze = self.build_analyzer() 783 | j_indices = [] 784 | indptr = _make_int_array() 785 | # values = _make_int_array() 786 | values = array.array(str("f")) 787 | indptr.append(0) 788 | for doc in raw_documents: 789 | feature_counter = {} 790 | for feature in analyze(doc): 791 | try: 792 | feature_idx = vocabulary[feature] 793 | if feature_idx not in feature_counter: 794 | feature_counter[feature_idx] = 1 795 | else: 796 | feature_counter[feature_idx] += 1 797 | except KeyError: 798 | # Ignore out-of-vocabulary items for fixed_vocab=True 799 | continue 800 | 801 | j_indices.extend(feature_counter.keys()) 802 | values.extend([i * 1.0 / sum(feature_counter.values()) for i in feature_counter.values()]) 803 | indptr.append(len(j_indices)) 804 | 805 | if not fixed_vocab: 806 | # disable defaultdict behaviour 807 | vocabulary = dict(vocabulary) 808 | if not vocabulary: 809 | raise ValueError("empty vocabulary; perhaps the documents only" 810 | " contain stop words") 811 | 812 | j_indices = np.asarray(j_indices, dtype=np.intc) 813 | indptr = np.frombuffer(indptr, dtype=np.intc) 814 | values = frombuffer_empty(values, dtype=np.float32) 815 | 816 | X = sp.csr_matrix((values, j_indices, indptr), 817 | shape=(len(indptr) - 1, len(vocabulary))) 818 | X.sort_indices() 819 | return vocabulary, X 820 | 821 | def fit(self, raw_documents, y=None): 822 | """Learn a vocabulary dictionary of all tokens in the raw documents. 
823 | 824 | Parameters 825 | ---------- 826 | raw_documents : iterable 827 | An iterable which yields either str, unicode or file objects. 828 | 829 | Returns 830 | ------- 831 | self 832 | """ 833 | self.fit_transform(raw_documents) 834 | return self 835 | 836 | def fit_transform(self, raw_documents, y=None): 837 | """Learn the vocabulary dictionary and return term-document matrix. 838 | 839 | This is equivalent to fit followed by transform, but more efficiently 840 | implemented. 841 | 842 | Parameters 843 | ---------- 844 | raw_documents : iterable 845 | An iterable which yields either str, unicode or file objects. 846 | 847 | Returns 848 | ------- 849 | X : array, [n_samples, n_features] 850 | Document-term matrix. 851 | """ 852 | # We intentionally don't call the transform method to make 853 | # fit_transform overridable without unwanted side effects in 854 | # TfidfVectorizer. 855 | self._validate_vocabulary() 856 | max_df = self.max_df 857 | min_df = self.min_df 858 | max_features = self.max_features 859 | 860 | vocabulary, X = self._count_vocab(raw_documents, 861 | self.fixed_vocabulary_) 862 | 863 | if self.binary: 864 | X.data.fill(1) 865 | 866 | if not self.fixed_vocabulary_: 867 | X = self._sort_features(X, vocabulary) 868 | 869 | n_doc = X.shape[0] 870 | max_doc_count = (max_df 871 | if isinstance(max_df, numbers.Integral) 872 | else max_df * n_doc) 873 | min_doc_count = (min_df 874 | if isinstance(min_df, numbers.Integral) 875 | else min_df * n_doc) 876 | if max_doc_count < min_doc_count: 877 | raise ValueError( 878 | "max_df corresponds to < documents than min_df") 879 | X, self.stop_words_ = self._limit_features(X, vocabulary, 880 | max_doc_count, 881 | min_doc_count, 882 | max_features) 883 | 884 | self.vocabulary_ = vocabulary 885 | 886 | return X 887 | 888 | def transform(self, raw_documents): 889 | """Transform documents to document-term matrix. 890 | 891 | Extract token counts out of raw text documents using the vocabulary 892 | fitted with fit or the one provided to the constructor. 893 | 894 | Parameters 895 | ---------- 896 | raw_documents : iterable 897 | An iterable which yields either str, unicode or file objects. 898 | 899 | Returns 900 | ------- 901 | X : sparse matrix, [n_samples, n_features] 902 | Document-term matrix. 903 | """ 904 | if not hasattr(self, 'vocabulary_'): 905 | self._validate_vocabulary() 906 | 907 | self._check_vocabulary() 908 | 909 | # use the same matrix-building strategy as fit_transform 910 | _, X = self._count_vocab(raw_documents, fixed_vocab=True) 911 | if self.binary: 912 | X.data.fill(1) 913 | return X 914 | 915 | def get_term_topic(self, X): 916 | n_features = X.shape[1] 917 | id2word = self.vocabulary_ 918 | word2topic = {} 919 | 920 | with open('word_topic.txt', 'r') as f: 921 | for line in f: 922 | strs = line.decode('utf-8').strip('\n').split('\t') 923 | word2topic[strs[0]] = strs[2] 924 | 925 | topic = np.zeros((len(id2word),)) 926 | 927 | for i, key in enumerate(id2word): 928 | if key in word2topic: 929 | topic[id2word[key]] = word2topic[key] 930 | else: 931 | print key 932 | 933 | topic = preprocessing.MinMaxScaler().fit_transform(topic) 934 | # topic = sp.spdiags(topic, diags=0, m=n_features, 935 | # n=n_features, format='csr') 936 | return topic 937 | 938 | def transform2(self, raw_documents): 939 | """Transform documents to document-term matrix. 940 | 941 | Extract token counts out of raw text documents using the vocabulary 942 | fitted with fit or the one provided to the constructor. 
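        Unlike ``transform``, the entries are not raw counts: each count is divided
        by the document's total in-vocabulary token count (see ``_count_vocab_2``).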
943 | 944 | Parameters 945 | ---------- 946 | raw_documents : iterable 947 | An iterable which yields either str, unicode or file objects. 948 | 949 | Returns 950 | ------- 951 | X : sparse matrix, [n_samples, n_features] 952 | Document-term matrix. 953 | """ 954 | if not hasattr(self, 'vocabulary_'): 955 | self._validate_vocabulary() 956 | 957 | self._check_vocabulary() 958 | 959 | # use the same matrix-building strategy as fit_transform 960 | _, X = self._count_vocab_2(raw_documents, fixed_vocab=True) 961 | if self.binary: 962 | X.data.fill(1) 963 | return X 964 | 965 | def inverse_transform(self, X): 966 | """Return terms per document with nonzero entries in X. 967 | 968 | Parameters 969 | ---------- 970 | X : {array, sparse matrix}, shape = [n_samples, n_features] 971 | 972 | Returns 973 | ------- 974 | X_inv : list of arrays, len = n_samples 975 | List of arrays of terms. 976 | """ 977 | self._check_vocabulary() 978 | 979 | if sp.issparse(X): 980 | # We need CSR format for fast row manipulations. 981 | X = X.tocsr() 982 | else: 983 | # We need to convert X to a matrix, so that the indexing 984 | # returns 2D objects 985 | X = np.asmatrix(X) 986 | n_samples = X.shape[0] 987 | 988 | terms = np.array(list(self.vocabulary_.keys())) 989 | indices = np.array(list(self.vocabulary_.values())) 990 | inverse_vocabulary = terms[np.argsort(indices)] 991 | 992 | return [inverse_vocabulary[X[i, :].nonzero()[1]].ravel() 993 | for i in range(n_samples)] 994 | 995 | def get_feature_names(self): 996 | """Array mapping from feature integer indices to feature name""" 997 | self._check_vocabulary() 998 | 999 | return [t for t, i in sorted(six.iteritems(self.vocabulary_), 1000 | key=itemgetter(1))] 1001 | 1002 | 1003 | def _make_int_array(): 1004 | """Construct an array.array of a type suitable for scipy.sparse indices.""" 1005 | return array.array(str("i")) 1006 | 1007 | class TfidfTransformer(BaseEstimator, TransformerMixin): 1008 | """Transform a count matrix to a normalized tf or tf-idf representation 1009 | 1010 | Tf means term-frequency while tf-idf means term-frequency times inverse 1011 | document-frequency. This is a common term weighting scheme in information 1012 | retrieval, that has also found good use in document classification. 1013 | 1014 | The goal of using tf-idf instead of the raw frequencies of occurrence of a 1015 | token in a given document is to scale down the impact of tokens that occur 1016 | very frequently in a given corpus and that are hence empirically less 1017 | informative than features that occur in a small fraction of the training 1018 | corpus. 1019 | 1020 | The formula that is used to compute the tf-idf of term t is 1021 | tf-idf(d, t) = tf(t) * idf(d, t), and the idf is computed as 1022 | idf(d, t) = log [ n / df(d, t) ] + 1 (if ``smooth_idf=False``), 1023 | where n is the total number of documents and df(d, t) is the 1024 | document frequency; the document frequency is the number of documents d 1025 | that contain term t. The effect of adding "1" to the idf in the equation 1026 | above is that terms with zero idf, i.e., terms that occur in all documents 1027 | in a training set, will not be entirely ignored. 1028 | (Note that the idf formula above differs from the standard 1029 | textbook notation that defines the idf as 1030 | idf(d, t) = log [ n / (df(d, t) + 1) ]). 
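    This copy departs from the stock scikit-learn transformer to implement the
    S-TFIWF weighting described in the 3-main README. ``fit`` additionally takes
    the label vector ``y``: for every term it computes the class-conditional
    relative frequencies, measures how much they deviate from their class average,
    and multiplies the idf by that normalized deviation. The idf itself is
    idf(t) = log [ sum_t' df(t') / df(t) ], i.e. the total of all document
    frequencies over the term's own document frequency, and ``transform`` replaces
    the raw tf with sqrt(tf) before applying the idf and the final normalization
    (see the ``fit``/``transform`` code below, the range the README points to,
    lines 1093-1176).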
1031 | 1032 | If ``smooth_idf=True`` (the default), the constant "1" is added to the 1033 | numerator and denominator of the idf as if an extra document was seen 1034 | containing every term in the collection exactly once, which prevents 1035 | zero divisions: idf(d, t) = log [ (1 + n) / 1 + df(d, t) ] + 1. 1036 | 1037 | Furthermore, the formulas used to compute tf and idf depend 1038 | on parameter settings that correspond to the SMART notation used in IR 1039 | as follows: 1040 | 1041 | Tf is "n" (natural) by default, "l" (logarithmic) when 1042 | ``sublinear_tf=True``. 1043 | Idf is "t" when use_idf is given, "n" (none) otherwise. 1044 | Normalization is "c" (cosine) when ``norm='l2'``, "n" (none) 1045 | when ``norm=None``. 1046 | 1047 | Read more in the :ref:`User Guide `. 1048 | 1049 | Parameters 1050 | ---------- 1051 | norm : 'l1', 'l2' or None, optional 1052 | Norm used to normalize term vectors. None for no normalization. 1053 | 1054 | use_idf : boolean, default=True 1055 | Enable inverse-document-frequency reweighting. 1056 | 1057 | smooth_idf : boolean, default=True 1058 | Smooth idf weights by adding one to document frequencies, as if an 1059 | extra document was seen containing every term in the collection 1060 | exactly once. Prevents zero divisions. 1061 | 1062 | sublinear_tf : boolean, default=False 1063 | Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf). 1064 | 1065 | References 1066 | ---------- 1067 | 1068 | .. [Yates2011] `R. Baeza-Yates and B. Ribeiro-Neto (2011). Modern 1069 | Information Retrieval. Addison Wesley, pp. 68-74.` 1070 | 1071 | .. [MRS2008] `C.D. Manning, P. Raghavan and H. Schütze (2008). 1072 | Introduction to Information Retrieval. Cambridge University 1073 | Press, pp. 118-120.` 1074 | """ 1075 | 1076 | def __init__(self, norm='l2', use_idf=True, smooth_idf=True, 1077 | sublinear_tf=False): 1078 | self.norm = norm 1079 | self.use_idf = use_idf 1080 | self.smooth_idf = smooth_idf 1081 | self.sublinear_tf = sublinear_tf 1082 | 1083 | def fit(self, X, y, termTopic=None): 1084 | """Learn the idf vector (global term weights) 1085 | 1086 | Parameters 1087 | ---------- 1088 | X : sparse matrix, [n_samples, n_features] 1089 | a matrix of term/token counts 1090 | """ 1091 | # todo http://nlpr-web.ia.ac.cn/cip/proceedings/klchen.pdf 1092 | # compute the normalized var 1093 | if y is not None: 1094 | aX = X 1095 | m = len(np.unique(y)) 1096 | p = np.zeros((m, aX.shape[1])) 1097 | 1098 | for j in range(np.min(y), m + np.min(y)): 1099 | w = aX[y == j, :] 1100 | tij = np.sum(w, axis=0) 1101 | lj = np.sum(tij) 1102 | p[j - np.min(y), :] = tij * 1.0 / lj 1103 | 1104 | ave_p = np.sum(p, axis=0) * 1.0 / m 1105 | 1106 | new_var = np.sqrt(np.sqrt(np.sum((p - ave_p) ** 2, axis=0)) * 1.0 / np.sum(p, axis=0)) 1107 | 1108 | if not sp.issparse(X): 1109 | X = sp.csc_matrix(X) 1110 | 1111 | if self.use_idf: 1112 | n_samples, n_features = X.shape 1113 | 1114 | df = _document_frequency(X) 1115 | # the number of all words 1116 | whole_df = np.sum(df) 1117 | 1118 | # perform idf smoothing if required 1119 | df += int(self.smooth_idf) 1120 | n_samples += int(self.smooth_idf) 1121 | 1122 | idf = np.log(whole_df * 1.0 / df * 1.0) 1123 | 1124 | idf = idf * new_var 1125 | 1126 | self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, 1127 | n=n_features, format='csr') 1128 | 1129 | return self 1130 | 1131 | def transform(self, X, copy=True): 1132 | """Transform a count matrix to a tf or tf-idf representation 1133 | 1134 | Parameters 1135 | ---------- 1136 | X : sparse 
matrix, [n_samples, n_features] 1137 | a matrix of term/token counts 1138 | 1139 | copy : boolean, default True 1140 | Whether to copy X and operate on the copy or perform in-place 1141 | operations. 1142 | 1143 | Returns 1144 | ------- 1145 | vectors : sparse matrix, [n_samples, n_features] 1146 | """ 1147 | 1148 | if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float): 1149 | # preserve float family dtype 1150 | X = sp.csr_matrix(X, copy=copy) 1151 | else: 1152 | # convert counts or binary occurrences to floats 1153 | X = sp.csr_matrix(X, dtype=np.float64, copy=copy) 1154 | 1155 | n_samples, n_features = X.shape 1156 | 1157 | if self.sublinear_tf: 1158 | np.log(X.data, X.data) 1159 | X.data += 1 1160 | 1161 | if self.use_idf: 1162 | check_is_fitted(self, '_idf_diag', 'idf vector is not fitted') 1163 | 1164 | expected_n_features = self._idf_diag.shape[0] 1165 | if n_features != expected_n_features: 1166 | raise ValueError("Input has n_features=%d while the model" 1167 | " has been trained with n_features=%d" % ( 1168 | n_features, expected_n_features)) 1169 | # *= doesn't work 1170 | 1171 | X = np.sqrt(X) * self._idf_diag 1172 | 1173 | if self.norm: 1174 | X = normalize(X, norm=self.norm, copy=False) 1175 | 1176 | return X 1177 | 1178 | @property 1179 | def idf_(self): 1180 | if hasattr(self, "_idf_diag"): 1181 | return np.ravel(self._idf_diag.sum(axis=0)) 1182 | else: 1183 | return None 1184 | 1185 | class TfidfVectorizer(CountVectorizer): 1186 | """Convert a collection of raw documents to a matrix of TF-IDF features. 1187 | 1188 | Equivalent to CountVectorizer followed by TfidfTransformer. 1189 | 1190 | Read more in the :ref:`User Guide `. 1191 | 1192 | Parameters 1193 | ---------- 1194 | input : string {'filename', 'file', 'content'} 1195 | If 'filename', the sequence passed as an argument to fit is 1196 | expected to be a list of filenames that need reading to fetch 1197 | the raw content to analyze. 1198 | 1199 | If 'file', the sequence items must have a 'read' method (file-like 1200 | object) that is called to fetch the bytes in memory. 1201 | 1202 | Otherwise the input is expected to be the sequence strings or 1203 | bytes items are expected to be analyzed directly. 1204 | 1205 | encoding : string, 'utf-8' by default. 1206 | If bytes or files are given to analyze, this encoding is used to 1207 | decode. 1208 | 1209 | decode_error : {'strict', 'ignore', 'replace'} 1210 | Instruction on what to do if a byte sequence is given to analyze that 1211 | contains characters not of the given `encoding`. By default, it is 1212 | 'strict', meaning that a UnicodeDecodeError will be raised. Other 1213 | values are 'ignore' and 'replace'. 1214 | 1215 | strip_accents : {'ascii', 'unicode', None} 1216 | Remove accents during the preprocessing step. 1217 | 'ascii' is a fast method that only works on characters that have 1218 | an direct ASCII mapping. 1219 | 'unicode' is a slightly slower method that works on any characters. 1220 | None (default) does nothing. 1221 | 1222 | analyzer : string, {'word', 'char'} or callable 1223 | Whether the feature should be made of word or character n-grams. 1224 | 1225 | If a callable is passed it is used to extract the sequence of features 1226 | out of the raw, unprocessed input. 1227 | 1228 | preprocessor : callable or None (default) 1229 | Override the preprocessing (string transformation) stage while 1230 | preserving the tokenizing and n-grams generation steps. 
1231 | 1232 | tokenizer : callable or None (default) 1233 | Override the string tokenization step while preserving the 1234 | preprocessing and n-grams generation steps. 1235 | Only applies if ``analyzer == 'word'``. 1236 | 1237 | ngram_range : tuple (min_n, max_n) 1238 | The lower and upper boundary of the range of n-values for different 1239 | n-grams to be extracted. All values of n such that min_n <= n <= max_n 1240 | will be used. 1241 | 1242 | stop_words : string {'english'}, list, or None (default) 1243 | If a string, it is passed to _check_stop_list and the appropriate stop 1244 | list is returned. 'english' is currently the only supported string 1245 | value. 1246 | 1247 | If a list, that list is assumed to contain stop words, all of which 1248 | will be removed from the resulting tokens. 1249 | Only applies if ``analyzer == 'word'``. 1250 | 1251 | If None, no stop words will be used. max_df can be set to a value 1252 | in the range [0.7, 1.0) to automatically detect and filter stop 1253 | words based on intra corpus document frequency of terms. 1254 | 1255 | lowercase : boolean, default True 1256 | Convert all characters to lowercase before tokenizing. 1257 | 1258 | token_pattern : string 1259 | Regular expression denoting what constitutes a "token", only used 1260 | if ``analyzer == 'word'``. The default regexp selects tokens of 2 1261 | or more alphanumeric characters (punctuation is completely ignored 1262 | and always treated as a token separator). 1263 | 1264 | max_df : float in range [0.0, 1.0] or int, default=1.0 1265 | When building the vocabulary ignore terms that have a document 1266 | frequency strictly higher than the given threshold (corpus-specific 1267 | stop words). 1268 | If float, the parameter represents a proportion of documents, integer 1269 | absolute counts. 1270 | This parameter is ignored if vocabulary is not None. 1271 | 1272 | min_df : float in range [0.0, 1.0] or int, default=1 1273 | When building the vocabulary ignore terms that have a document 1274 | frequency strictly lower than the given threshold. This value is also 1275 | called cut-off in the literature. 1276 | If float, the parameter represents a proportion of documents, integer 1277 | absolute counts. 1278 | This parameter is ignored if vocabulary is not None. 1279 | 1280 | max_features : int or None, default=None 1281 | If not None, build a vocabulary that only consider the top 1282 | max_features ordered by term frequency across the corpus. 1283 | 1284 | This parameter is ignored if vocabulary is not None. 1285 | 1286 | vocabulary : Mapping or iterable, optional 1287 | Either a Mapping (e.g., a dict) where keys are terms and values are 1288 | indices in the feature matrix, or an iterable over terms. If not 1289 | given, a vocabulary is determined from the input documents. 1290 | 1291 | binary : boolean, default=False 1292 | If True, all non-zero term counts are set to 1. This does not mean 1293 | outputs will have only 0/1 values, only that the tf term in tf-idf 1294 | is binary. (Set idf and normalization to False to get 0/1 outputs.) 1295 | 1296 | dtype : type, optional 1297 | Type of the matrix returned by fit_transform() or transform(). 1298 | 1299 | norm : 'l1', 'l2' or None, optional 1300 | Norm used to normalize term vectors. None for no normalization. 1301 | 1302 | use_idf : boolean, default=True 1303 | Enable inverse-document-frequency reweighting. 
1304 | 1305 | smooth_idf : boolean, default=True 1306 | Smooth idf weights by adding one to document frequencies, as if an 1307 | extra document was seen containing every term in the collection 1308 | exactly once. Prevents zero divisions. 1309 | 1310 | sublinear_tf : boolean, default=False 1311 | Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf). 1312 | 1313 | Attributes 1314 | ---------- 1315 | vocabulary_ : dict 1316 | A mapping of terms to feature indices. 1317 | 1318 | idf_ : array, shape = [n_features], or None 1319 | The learned idf vector (global term weights) 1320 | when ``use_idf`` is set to True, None otherwise. 1321 | 1322 | stop_words_ : set 1323 | Terms that were ignored because they either: 1324 | 1325 | - occurred in too many documents (`max_df`) 1326 | - occurred in too few documents (`min_df`) 1327 | - were cut off by feature selection (`max_features`). 1328 | 1329 | This is only available if no vocabulary was given. 1330 | 1331 | See also 1332 | -------- 1333 | CountVectorizer 1334 | Tokenize the documents and count the occurrences of token and return 1335 | them as a sparse matrix 1336 | 1337 | TfidfTransformer 1338 | Apply Term Frequency Inverse Document Frequency normalization to a 1339 | sparse matrix of occurrence counts. 1340 | 1341 | Notes 1342 | ----- 1343 | The ``stop_words_`` attribute can get large and increase the model size 1344 | when pickling. This attribute is provided only for introspection and can 1345 | be safely removed using delattr or set to None before pickling. 1346 | """ 1347 | 1348 | def __init__(self, input='content', encoding='utf-8', 1349 | decode_error='strict', strip_accents=None, lowercase=True, 1350 | preprocessor=None, tokenizer=None, analyzer='word', 1351 | stop_words=None, token_pattern=r"(?u)\b\w\w+\b", 1352 | ngram_range=(1, 1), max_df=1.0, min_df=1, 1353 | max_features=None, vocabulary=None, binary=False, 1354 | dtype=np.int64, norm='l2', use_idf=True, smooth_idf=True, 1355 | sublinear_tf=False): 1356 | super(TfidfVectorizer, self).__init__( 1357 | input=input, encoding=encoding, decode_error=decode_error, 1358 | strip_accents=strip_accents, lowercase=lowercase, 1359 | preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer, 1360 | stop_words=stop_words, token_pattern=token_pattern, 1361 | ngram_range=ngram_range, max_df=max_df, min_df=min_df, 1362 | max_features=max_features, vocabulary=vocabulary, binary=binary, 1363 | dtype=dtype) 1364 | 1365 | self._tfidf = TfidfTransformer(norm=norm, use_idf=use_idf, 1366 | smooth_idf=smooth_idf, 1367 | sublinear_tf=sublinear_tf) 1368 | 1369 | # Broadcast the TF-IDF parameters to the underlying transformer instance 1370 | # for easy grid search and repr 1371 | 1372 | @property 1373 | def norm(self): 1374 | return self._tfidf.norm 1375 | 1376 | @norm.setter 1377 | def norm(self, value): 1378 | self._tfidf.norm = value 1379 | 1380 | @property 1381 | def use_idf(self): 1382 | return self._tfidf.use_idf 1383 | 1384 | @use_idf.setter 1385 | def use_idf(self, value): 1386 | self._tfidf.use_idf = value 1387 | 1388 | @property 1389 | def smooth_idf(self): 1390 | return self._tfidf.smooth_idf 1391 | 1392 | @smooth_idf.setter 1393 | def smooth_idf(self, value): 1394 | self._tfidf.smooth_idf = value 1395 | 1396 | @property 1397 | def sublinear_tf(self): 1398 | return self._tfidf.sublinear_tf 1399 | 1400 | @sublinear_tf.setter 1401 | def sublinear_tf(self, value): 1402 | self._tfidf.sublinear_tf = value 1403 | 1404 | @property 1405 | def idf_(self): 1406 | return self._tfidf.idf_ 1407 
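    # Usage sketch (hypothetical variable names). Unlike the stock scikit-learn
    # TfidfVectorizer, fit() here also takes the label vector y, which is passed
    # down to TfidfTransformer.fit to compute the supervised variance weight:
    #
    #   vec = TfidfVectorizer(use_idf=True, sublinear_tf=False, max_features=50000, binary=True)
    #   vec.fit(train_texts, train_labels)
    #   X_train = vec.transform(train_texts)
    #   X_test = vec.transform(test_texts)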
| 1408 | def fit(self, raw_documents, y=None): 1409 | """Learn vocabulary and idf from training set. 1410 | 1411 | Parameters 1412 | ---------- 1413 | raw_documents : iterable 1414 | an iterable which yields either str, unicode or file objects 1415 | 1416 | Returns 1417 | ------- 1418 | self : TfidfVectorizer 1419 | """ 1420 | X = super(TfidfVectorizer, self).fit_transform(raw_documents) 1421 | 1422 | # termTopic = super(TfidfVectorizer, self).get_term_topic(X) 1423 | 1424 | self._tfidf.fit(X, y, None) 1425 | 1426 | return self 1427 | 1428 | def fit_transform(self, raw_documents, y=None): 1429 | """Learn vocabulary and idf, return term-document matrix. 1430 | 1431 | This is equivalent to fit followed by transform, but more efficiently 1432 | implemented. 1433 | 1434 | Parameters 1435 | ---------- 1436 | raw_documents : iterable 1437 | an iterable which yields either str, unicode or file objects 1438 | 1439 | Returns 1440 | ------- 1441 | X : sparse matrix, [n_samples, n_features] 1442 | Tf-idf-weighted document-term matrix. 1443 | """ 1444 | X = super(TfidfVectorizer, self).fit_transform(raw_documents) 1445 | self._tfidf.fit(X, y, None) 1446 | # X is already a transformed view of raw_documents so 1447 | # we set copy to False 1448 | return self._tfidf.transform(X, copy=False) 1449 | 1450 | def transform(self, raw_documents, copy=True): 1451 | """Transform documents to document-term matrix. 1452 | 1453 | Uses the vocabulary and document frequencies (df) learned by fit (or 1454 | fit_transform). 1455 | 1456 | Parameters 1457 | ---------- 1458 | raw_documents : iterable 1459 | an iterable which yields either str, unicode or file objects 1460 | 1461 | copy : boolean, default True 1462 | Whether to copy X and operate on the copy or perform in-place 1463 | operations. 1464 | 1465 | Returns 1466 | ------- 1467 | X : sparse matrix, [n_samples, n_features] 1468 | Tf-idf-weighted document-term matrix. 
1469 | """ 1470 | check_is_fitted(self, '_tfidf', 'The tfidf vector is not fitted') 1471 | 1472 | X = super(TfidfVectorizer, self).transform(raw_documents) 1473 | 1474 | return self._tfidf.transform(X, copy=False) 1475 | -------------------------------------------------------------------------------- /project/3-main/classify.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import multiprocessing,Queue 3 | from sklearn.cross_validation import KFold, StratifiedKFold 4 | import xgboost as xgb 5 | from STFIWF import TfidfVectorizer 6 | import numpy as np 7 | from sklearn.linear_model import SGDClassifier, LogisticRegression,RidgeClassifier,PassiveAggressiveClassifier,Lasso,HuberRegressor 8 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB 9 | from sklearn.ensemble import VotingClassifier,RandomForestClassifier,gradient_boosting 10 | from sklearn.ensemble.bagging import BaggingClassifier 11 | from sklearn.ensemble.weight_boosting import AdaBoostClassifier 12 | from sklearn.svm import LinearSVC, SVC 13 | from sklearn.preprocessing import MinMaxScaler,StandardScaler,MaxAbsScaler 14 | class term(object): 15 | def __init__(self): 16 | random_rate = 8240 17 | clf1 = SGDClassifier( 18 | alpha=5e-05, 19 | average=False, 20 | class_weight='balanced', 21 | loss='log', 22 | n_iter=30, 23 | penalty='l2', n_jobs=-1, random_state=random_rate) 24 | clf2 = MultinomialNB(alpha=0.1) 25 | clf3 = LinearSVC(C=0.1, random_state=random_rate) 26 | clf4 = LogisticRegression(C=1.0,n_jobs=-1, max_iter=100, class_weight='balanced', random_state=random_rate) 27 | clf5 = BernoulliNB(alpha=0.1) 28 | clf6 = VotingClassifier(estimators=[('sgd', clf1), 29 | ('mb', clf2), 30 | ('bb', clf3), 31 | ('lf', clf4), 32 | ('bnb', clf5)], voting='hard') 33 | clf7 = SGDClassifier( 34 | alpha=5e-05, 35 | average=False, 36 | class_weight='balanced', 37 | loss='log', 38 | n_iter=30, 39 | penalty='l1', n_jobs=-1, random_state=random_rate) 40 | clf8 = LinearSVC(C=0.9, random_state=random_rate) 41 | clf9 = LogisticRegression(C=0.5, n_jobs=-1, max_iter=100, class_weight='balanced', random_state=random_rate) 42 | clf10 = MultinomialNB(alpha=0.9) 43 | clf11 = BernoulliNB(alpha=0.9) 44 | clf12 = LogisticRegression(C=0.2, n_jobs=-1, max_iter=100, class_weight='balanced', random_state=random_rate,penalty='l1') 45 | clf13 = LogisticRegression(C=0.8, n_jobs=-1, max_iter=100, class_weight='balanced', random_state=random_rate,penalty='l1') 46 | clf14 = RidgeClassifier(alpha=8) 47 | clf15 = PassiveAggressiveClassifier(C=0.01, loss='squared_hinge', n_iter=20, n_jobs=-1) 48 | clf16 = RidgeClassifier(alpha=2) 49 | clf17 = PassiveAggressiveClassifier(C=0.5, loss='squared_hinge', n_iter=30, n_jobs=-1) 50 | clf18 = LinearSVC(C=0.5, random_state=random_rate) 51 | clf19 = MultinomialNB(alpha=0.5) 52 | clf20 = BernoulliNB(alpha=0.5) 53 | clf21 = Lasso(alpha=0.1, max_iter=20, random_state=random_rate) 54 | clf22 = Lasso(alpha=0.9, max_iter=30, random_state=random_rate) 55 | clf23 = PassiveAggressiveClassifier(C=0.1, loss='hinge', n_iter=30, n_jobs=-1, random_state=random_rate) 56 | clf24 = PassiveAggressiveClassifier(C=0.9, loss='hinge', n_iter=30, n_jobs=-1, random_state=random_rate) 57 | clf25 = HuberRegressor(max_iter=30) 58 | 59 | basemodel = [ 60 | ['sgd', clf1], 61 | ['nb', clf2], 62 | ['lsvc1', clf3], 63 | ['LR1', clf4], 64 | ['bb',clf5], 65 | ['vote', clf6], 66 | ['sgdl1', clf7], 67 | ['lsvc2', clf8], 68 | ['LR2', clf9], 69 | ['nb2', clf10], 70 | ['bb2', clf11], 71 | ['LR3', clf12], 72 | 
['LR4', clf13], 73 | ['rc1', clf14], 74 | ['pac1', clf15], 75 | ['rc2', clf16], 76 | ['pac2', clf17], 77 | ['lsvc3', clf18], 78 | ['nb3', clf19], 79 | ['bb3', clf20], 80 | ['lr5', clf21], 81 | ['lr6', clf22], 82 | ['rc3', clf23], 83 | ['pac3', clf24], 84 | ['hub', clf25], 85 | ] 86 | ##################################### 87 | clf_svc = SVC(C=1,random_state=random_rate,cache_size=1000) 88 | 89 | self.base_models = basemodel 90 | self.LR=clf4 91 | self.svc = clf_svc 92 | 93 | def stacking(self,X,Y,T,wv_X,wv_T,kind): 94 | """ 95 | ensemble model:stacking 96 | 97 | """ 98 | print 'fitting..' 99 | models = self.base_models 100 | folds = list(KFold(len(Y), n_folds=5, random_state=0)) 101 | S_train = np.zeros((X.shape[0], len(models))) 102 | S_test = np.zeros((T.shape[0], len(models))) 103 | 104 | for i, bm in enumerate(models): 105 | clf = bm[1] 106 | 107 | S_test_i = np.zeros((T.shape[0], len(folds))) 108 | for j, (train_idx, test_idx) in enumerate(folds): 109 | X_train = X[train_idx] 110 | y_train = Y[train_idx] 111 | X_holdout = X[test_idx] 112 | 113 | clf.fit(X_train, y_train) 114 | y_pred = clf.predict(X_holdout)[:] 115 | S_train[test_idx, i] = y_pred 116 | S_test_i[:, j] = clf.predict(T)[:] 117 | 118 | S_test[:, i] = S_test_i.mean(1) 119 | 120 | print S_train.shape,S_test.shape 121 | 122 | S_train = np.concatenate((S_train,wv_X),axis=1) 123 | S_test = np.concatenate((S_test, wv_T), axis=1) 124 | 125 | print S_train.shape,S_test.shape 126 | 127 | print 'scalering..' 128 | min_max_scaler = StandardScaler() 129 | S_train = min_max_scaler.fit_transform(S_train) 130 | S_test = min_max_scaler.fit_transform(S_test) 131 | print 'scalering over!' 132 | self.svc.fit(S_train, Y) 133 | yp= self.svc.predict(S_test)[:] 134 | return yp 135 | 136 | def validation(self, X, Y, wv_X, kind): 137 | """ 138 | 2-fold validation 139 | :param X: train text 140 | :param Y: train label 141 | :param wv_X: train wv_vec 142 | :param kind: age/gender/education 143 | :return: mean score of 2-fold validation 144 | """ 145 | print '向量化中...' 146 | X=np.array(X) 147 | fold_n=2 148 | folds = list(StratifiedKFold(Y, n_folds=fold_n, shuffle=False,random_state=0)) 149 | score = np.zeros(fold_n) 150 | for j, (train_idx, test_idx) in enumerate(folds): 151 | print j+1,'-fold' 152 | 153 | X_train = X[train_idx] 154 | y_train = Y[train_idx] 155 | X_test = X[test_idx] 156 | y_test = Y[test_idx] 157 | 158 | wv_X_train =wv_X[train_idx] 159 | wv_X_test = wv_X[test_idx] 160 | 161 | vec = TfidfVectorizer(use_idf=True,sublinear_tf=False, max_features=50000, binary=True) 162 | vec.fit(X_train, y_train) 163 | X_train = vec.transform(X_train) 164 | X_test = vec.transform(X_test) 165 | 166 | print 'shape',X_train.shape 167 | 168 | ypre = self.stacking(X_train,y_train,X_test,wv_X_train,wv_X_test,kind) 169 | cur = sum(y_test == ypre) * 1.0 / len(ypre) 170 | score[j] = cur 171 | 172 | print score 173 | print score.mean(),kind 174 | return score.mean() 175 | 176 | def predict(self,X,Y,T,wv_X,wv_T,kind): 177 | """ 178 | train and predict 179 | :param X: train text 180 | :param Y: train label 181 | :param T: test text 182 | :param wv_X: train wv 183 | :param wv_T: test wv 184 | :param kind: age/gender/education 185 | :return: array like ,predict of "kind" 186 | """ 187 | print 'predicting..向量化中...' 
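        # Fit the supervised tf-idf (STFIWF) vectorizer on all labelled training
        # text using the labels Y, then project both the train and test text into
        # that space; with binary=True the tf part is 0/1 before the idf reweighting.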
188 | vec = TfidfVectorizer(use_idf=True, sublinear_tf=False, max_features=60000, binary=True) 189 | 190 | vec.fit(X, Y) 191 | X = vec.transform(X) 192 | T = vec.transform(T) 193 | 194 | print 'train size',X.shape,T.shape 195 | res = self.stacking(X, Y, T, wv_X, wv_T, kind) 196 | return res 197 | 198 | 199 | -------------------------------------------------------------------------------- /project/3-main/main.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import classify 3 | import preprocess 4 | import pandas as pd 5 | import numpy as np 6 | import csv 7 | import codecs 8 | import multiprocessing 9 | import time 10 | 11 | 12 | def input(trainname): 13 | """ 14 | load the text file 15 | :param trainname: path of the input file 16 | :return:list 17 | """ 18 | traindata = [] 19 | with open(trainname, 'rb') as f: 20 | reader = csv.reader(f) 21 | count = 0 22 | for line in reader: 23 | try: 24 | traindata.append(line[0]) 25 | count += 1 26 | except: 27 | print "error:", line, count 28 | traindata.append("1 ") 29 | return traindata 30 | def output(filename, ID, age, gender, education): 31 | """ 32 | generate the submit file 33 | :param filename: path of the submit file 34 | :param ID: user ID 35 | :param age:predicted age 36 | :param gender:predicted gender 37 | :param education:predicted education 38 | :return:submit file 39 | """ 40 | print ID.shape, age.shape, gender.shape, education.shape 41 | with codecs.open(filename, 'w', encoding='gbk') as f: 42 | count=0 43 | for i in range(len(ID)): 44 | # if count>=1000: 45 | # break 46 | f.write(str(ID[i]) + ' ' + str(age[i]) + ' ' + str(gender[i]) + ' ' + str(education[i]) + '\n') 47 | count+=1 48 | if __name__ == '__main__': 49 | """ 50 | the main function 51 | 注意路径 52 | """ 53 | start=time.time() 54 | # order='predict' #execute predict function 55 | order='test' #execute 2-fold validation function 56 | print 'orderis ', order 57 | print '----------start----------' 58 | 59 | #loading 60 | trainname = 'jieba_train_cut.csv' 61 | testname = 'jieba_test_cut.csv' 62 | traindata = input(trainname) 63 | testdata = input(testname) 64 | label_genderfile_path = 'train_gender.csv' 65 | label_agefile_path = 'train_age.csv' 66 | label_edufile_path = 'train_education.csv' 67 | genderdata = np.loadtxt(open(label_genderfile_path, 'r')).astype(int) 68 | agedata = np.loadtxt(open(label_agefile_path, 'r')).astype(int) 69 | educationdata = np.loadtxt(open(label_edufile_path, 'r')).astype(int) 70 | 71 | # --------------------------------- 72 | print '预处理中..' 
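    # removezero() keeps only the samples whose label is non-zero; in the Sogou
    # data a 0 means the age/gender/education tag is missing, so each of the
    # three tasks gets its own filtered training set and word2vec feature matrix.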
73 | preprocessob = preprocess.preprocess() 74 | 75 | #remove label missed samples 76 | gender_traindatas, genderlabel = preprocessob.removezero(traindata, genderdata) 77 | age_traindatas, agelabel = preprocessob.removezero(traindata, agedata) 78 | edu_traindatas, edulabel = preprocessob.removezero(traindata, educationdata) 79 | 80 | # 填写你的wv向量路径 81 | w2vtrain = np.load('wv300_win100.train.npy') 82 | w2vtest = np.load('wv300_win100.test.npy') 83 | 84 | wv_gender_traindatas, genderlabel = preprocessob.removezero(w2vtrain, genderdata) 85 | wv_age_traindatas, agelabel = preprocessob.removezero(w2vtrain, agedata) 86 | wv_edu_traindatas, edulabel = preprocessob.removezero(w2vtrain, educationdata) 87 | 88 | if order=='test': 89 | termob1 = classify.term() 90 | termob2 = classify.term() 91 | termob3 = classify.term() 92 | p1 = multiprocessing.Process(target=termob1.validation, 93 | args=(gender_traindatas, genderlabel, wv_gender_traindatas, 'gender',)) 94 | p2=multiprocessing.Process(target=termob2.validation,args=(age_traindatas, agelabel, wv_age_traindatas, 'age',)) 95 | p3=multiprocessing.Process(target=termob3.validation,args=(edu_traindatas, edulabel, wv_edu_traindatas, 'edu',)) 96 | 97 | p1.start() 98 | p2.start() 99 | p3.start() 100 | 101 | p1.join() 102 | p2.join() 103 | p3.join() 104 | elif order=='predict': 105 | termob = classify.term() 106 | gender=termob.predict(gender_traindatas, genderlabel, testdata, wv_gender_traindatas, w2vtest, 'gender') 107 | age=termob.predict(age_traindatas, agelabel, testdata, wv_age_traindatas, w2vtest, 'age') 108 | edu=termob.predict(edu_traindatas, edulabel, testdata, wv_edu_traindatas, w2vtest, 'edu') 109 | ID = pd.read_csv('user_tag_query.10W.TEST.csv').ID 110 | output('submit.csv', ID, age, gender, edu) 111 | 112 | end=time.time() 113 | print 'total time is', end-start 114 | -------------------------------------------------------------------------------- /project/3-main/preprocess.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import numpy as np 3 | class preprocess(): 4 | 5 | def removezero(self, x, y): 6 | nozero = np.nonzero(y) 7 | y = y[nozero] 8 | x = np.array(x) 9 | x = x[nozero] 10 | return x, y -------------------------------------------------------------------------------- /project/README.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderSkyChen/2016CCF_BDCI_Sougou/d49c94be714265adbad27ece0d84658264b1a3d4/project/README.txt -------------------------------------------------------------------------------- /答辩ppt -.ppt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderSkyChen/2016CCF_BDCI_Sougou/d49c94be714265adbad27ece0d84658264b1a3d4/答辩ppt -.ppt --------------------------------------------------------------------------------
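For reference, a minimal NumPy sketch (toy dense counts, hypothetical values, idf smoothing omitted) of the supervised class-variance term weight that STFIWF.TfidfTransformer.fit computes before scaling the idf:

# coding=utf-8
# Toy illustration of the class-variance term weight used in STFIWF.py
# (dense toy data; the real transformer works on scipy sparse matrices
# and applies optional idf smoothing).
import numpy as np

X = np.array([[2, 0, 1],          # term counts: 4 documents x 3 terms
              [1, 1, 0],
              [0, 3, 1],
              [0, 2, 2]])
y = np.array([1, 1, 2, 2])        # class label of each document (labels start at 1)

m = len(np.unique(y))
p = np.zeros((m, X.shape[1]))
for j in range(m):
    w = X[y == (j + y.min()), :]
    p[j, :] = np.sum(w, axis=0) * 1.0 / np.sum(w)   # class-conditional term distribution

ave_p = np.sum(p, axis=0) * 1.0 / m
# terms whose distribution differs strongly between classes get a larger weight
new_var = np.sqrt(np.sqrt(np.sum((p - ave_p) ** 2, axis=0)) * 1.0 / np.sum(p, axis=0))

df = (X != 0).sum(axis=0)         # document frequency of each term
whole_df = df.sum()               # total number of (document, term) pairs
idf = np.log(whole_df * 1.0 / df) * new_var
print(idf)

Terms whose class-conditional frequency barely varies across the age/gender/education classes get a weight near zero, so they contribute little to the rescaled idf, while class-discriminative terms are boosted.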