├── LICENSE ├── README.md ├── project ├── 1-process_data │ ├── 1-README.txt │ ├── CSVtolabel.py │ ├── jieba_cut_fliter.py │ └── toCSV.py ├── 2-word2vec │ ├── 1-README.txt │ ├── class_w2v.py │ ├── main.py │ └── preprocess.py ├── 3-main │ ├── README.txt │ ├── STFIWF.py │ ├── classify.py │ ├── main.py │ └── preprocess.py └── README.txt └── 答辩ppt -.ppt /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 陈潇凯 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 2016CCF_BDCI_Sougou 2 | 【The Right团队-源码以及PPT分享】2016CCF大数据与计算智能大赛:精准营销中搜狗用户画像挖掘 3 | 4 | 具体详见我的博客: 5 | [传送门](http://coderskychen.cn/2016/12/28/%E3%80%90%E5%B9%B2%E8%B4%A7%E5%88%86%E4%BA%AB%E3%80%912016CCF%E5%A4%A7%E6%95%B0%E6%8D%AE%E4%B8%8E%E8%AE%A1%E7%AE%97%E6%99%BA%E8%83%BD%E5%A4%A7%E8%B5%9B-%E6%90%9C%E7%8B%97%E7%94%A8%E6%88%B7%E7%94%BB%E5%83%8F%E6%8C%96%E6%8E%98/) 6 | 7 | 复赛数据下载链接: 8 | http://pan.baidu.com/s/1mi9DjIg 9 | 密码:g8i9 10 | 11 | 初识python,代码写的很粗糙,多多包涵~ 12 | -------------------------------------------------------------------------------- /project/1-process_data/1-README.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderSkyChen/2016CCF_BDCI_Sougou/d49c94be714265adbad27ece0d84658264b1a3d4/project/1-process_data/1-README.txt -------------------------------------------------------------------------------- /project/1-process_data/CSVtolabel.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | 根据上一步骤得到的CSV文件,将搜索文本以及三个属性剥离,保存为相应的文件 4 | 注意路径 5 | """ 6 | import pandas as pd 7 | 8 | #path of the train and test files 9 | trainname = 'user_tag_query.10W.TRAIN.csv' 10 | testname = 'user_tag_query.10W.TEST.csv' 11 | 12 | data = pd.read_csv(trainname) 13 | print data.info() 14 | 15 | #generate three labels for age/gender/education 16 | data.age.to_csv("train_age.csv", index=False) 17 | data.Gender.to_csv("train_gender.csv", index=False) 18 | data.Education.to_csv("train_education.csv", index=False) 19 | #generate trainfile's text file 20 | data.QueryList.to_csv("train_querylist.csv", index=False) 21 | 22 | data = pd.read_csv(testname) 23 | print data.info() 24 | #generate testfile's text file 25 | 
data.QueryList.to_csv("test_querylist.csv", index=False) 26 | 27 | 28 | -------------------------------------------------------------------------------- /project/1-process_data/jieba_cut_fliter.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | 调用jieba分词,完成搜索文本的分词。同时只保留n,v,j三种词性。 4 | 注意输入的文件为纯文本格式,最好一个用户的搜索历史为一行。 5 | 注意路径 6 | """ 7 | import pandas as pd 8 | import jieba.analyse 9 | import time 10 | import jieba 11 | import jieba.posseg 12 | import os, sys 13 | reload(sys) 14 | sys.setdefaultencoding('utf8') 15 | def input(trainname): 16 | traindata = [] 17 | with open(trainname, 'rb') as f: 18 | line = f.readline() 19 | count = 0 20 | while line: 21 | try: 22 | traindata.append(line) 23 | count += 1 24 | except: 25 | print "error:", line, count 26 | line=f.readline() 27 | return traindata 28 | start = time.clock() 29 | 30 | filepath = 'train.csv' 31 | QueryList = input(filepath) 32 | 33 | writepath = 'writefile.csv' 34 | csvfile = open(writepath, 'w') 35 | #parallel:speed up 36 | jieba.enable_parallel() 37 | POS = {} 38 | for i in range(len(QueryList)): 39 | s = [] 40 | str = "" 41 | words = jieba.posseg.cut(QueryList[i])# 带有词性的精确分词模式 42 | allowPOS = ['n','v','j'] 43 | for word, flag in words: 44 | POS[flag]=POS.get(flag,0)+1 45 | if (flag[0] in allowPOS) and len(word)>=2: 46 | str += word + " " 47 | s.append(str.encode('utf8')) 48 | csvfile.write(" ".join(s)+'\n') 49 | csvfile.close() 50 | print POS 51 | 52 | end = time.clock() 53 | print "total time: %f s" % (end - start) 54 | 55 | 56 | # seg_list = jieba.cut("陶喆下载", cut_all=False) 57 | # print("Default Mode: " + "/ ".join(seg_list)) # 默认模式 58 | # 59 | # words = jieba.posseg.cut("陶喆下载") 60 | # for word, flag in words: 61 | # print('%s %s' % (word, flag)) 62 | -------------------------------------------------------------------------------- /project/1-process_data/toCSV.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | add变量表示了原始文件的路径,TRAIN/TEST 4 | csvfile表示了生成文件的信息 5 | 主要功能:把原始文件转为UTF-8格式 6 | 注意路径 7 | """ 8 | import csv 9 | 10 | add = 'user_tag_query.10W.TRAIN' #path of the original train file 11 | 12 | csvfile = file(add + '.csv', 'wb')# the path of the generated train file 13 | writer = csv.writer(csvfile) 14 | writer.writerow(['ID', 'age', 'Gender', 'Education', 'QueryList']) 15 | with open(add, 'r') as f: 16 | for line in f: 17 | line.strip() 18 | data = line.split("\t") 19 | writedata = [data[0], data[1], data[2], data[3]] 20 | querystr = '' 21 | data[-1]=data[-1][:-1] 22 | for d in data[4:]: 23 | try: 24 | querystr += d.decode('GB18030').encode('utf8') + '\t' 25 | except: 26 | print data[0],querystr 27 | querystr = querystr[:-1] 28 | writedata.append(querystr) 29 | writer.writerow(writedata) 30 | 31 | add = 'user_tag_query.10W.TEST'#path of the original test file 32 | 33 | csvfile = file(add + '.csv', 'wb')# the path of the generated test file 34 | writer = csv.writer(csvfile) 35 | writer.writerow(['ID', 'QueryList']) 36 | with open(add, 'r') as f: 37 | for line in f: 38 | data = line.split("\t") 39 | writedata = [data[0]] 40 | querystr = '' 41 | data[-1]=data[-1][:-1] 42 | for d in data[1:]: 43 | try: 44 | querystr += d.decode('GB18030').encode('utf8') + '\t' 45 | except: 46 | print data[0],querystr 47 | querystr = querystr[:-1] 48 | writedata.append(querystr) 49 | writer.writerow(writedata) 50 | 51 | -------------------------------------------------------------------------------- 
/project/2-word2vec/1-README.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderSkyChen/2016CCF_BDCI_Sougou/d49c94be714265adbad27ece0d84658264b1a3d4/project/2-word2vec/1-README.txt -------------------------------------------------------------------------------- /project/2-word2vec/class_w2v.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from sklearn.cross_validation import KFold, StratifiedKFold 3 | from gensim.models import word2vec 4 | import xgboost as xgb 5 | import numpy as np 6 | from sklearn.linear_model import SGDClassifier, LogisticRegression 7 | from sklearn.svm import LinearSVC,SVC 8 | from sklearn.ensemble import VotingClassifier 9 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB 10 | from sklearn.preprocessing import MinMaxScaler,StandardScaler 11 | class w2v(): 12 | def __init__(self,size=300): 13 | random_rate = 8240 14 | self.size=size 15 | self.svc= SVC(C=1, random_state=random_rate) 16 | self.LR = LogisticRegression(C=1.0, max_iter=100, class_weight='balanced', random_state=random_rate, n_jobs=-1) 17 | self.clf = LinearSVC(random_state=random_rate) 18 | 19 | def fit(self, X, Y, T): 20 | """ 21 | train and predict 22 | """ 23 | print 'fitting..' 24 | self.LR.fit(X, Y) 25 | res = self.LR.predict(T) 26 | return res 27 | 28 | def validation(self,X,Y,kind): 29 | """ 30 | 31 | 使用2-fold进行验证 32 | """ 33 | print 'validating...' 34 | fold_n=2 35 | folds = list(StratifiedKFold(Y, n_folds=fold_n, random_state=0)) 36 | score=np.zeros(fold_n) 37 | for j, (train_idx, test_idx) in enumerate(folds): 38 | print j + 1, '-fold' 39 | X_train = X[train_idx] 40 | y_train = Y[train_idx] 41 | X_test = X[test_idx] 42 | y_test = Y[test_idx] 43 | 44 | res = self.fit(X_train, y_train, X_test) 45 | cur = sum(y_test == res) * 1.0 / len(res) 46 | score[j] = cur 47 | print score, score.mean() 48 | return score.mean() 49 | 50 | def train_w2v(self, filename): 51 | """ 52 | 训练wv模型 53 | :param filename:path 54 | :return:none 55 | """ 56 | sentences = word2vec.LineSentence(filename) # 加载语料,要求语料为“一行一文本”的格式 57 | print '正在训练w2v 针对语料:',str(filename) 58 | print 'size is: ',self.size 59 | model = word2vec.Word2Vec(sentences, size=self.size, window=100,workers=48) # 训练模型; 注意参数window 对结果有影响 一般5-100 60 | savepath = '20w_size_win100_' + str(self.size)+'.model' # 保存model的路径 61 | print '训练完毕,已保存: ', savepath, 62 | model.save(savepath) 63 | def load_trainsform(self,X): 64 | """ 65 | 载入模型,并且生成wv向量 66 | :param X:读入的文档,list 67 | :return:np.array 68 | """ 69 | print '载入模型中' 70 | model = word2vec.Word2Vec.load('20w_size_win100_300.model') #填写你的路径 71 | print '加载成功' 72 | res=np.zeros((len(X),self.size)) 73 | print '生成w2v向量中..' 
74 | for i,line in enumerate(X): 75 | line=line.decode('utf-8') 76 | terms=line.split() 77 | count=0 78 | for j,term in enumerate(terms): 79 | try:#---try失败说明X中有单词不在model中,训练的时候model的模型是min_count的 忽略了一部分单词 80 | count += 1 81 | res[i]+=np.array(model[term]) 82 | except: 83 | 1 == 1 84 | if count!=0: 85 | res[i]=res[i]/float(count) # 求均值 86 | return res 87 | 88 | -------------------------------------------------------------------------------- /project/2-word2vec/main.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import class_w2v 3 | import preprocess 4 | import numpy as np 5 | import csv 6 | 7 | def input(trainname): 8 | """ 9 | load file 10 | :param trainname:path 11 | :return: list 12 | """ 13 | traindata = [] 14 | with open(trainname, 'rb') as f: 15 | reader = csv.reader(f) 16 | count = 0 17 | for line in reader: 18 | try: 19 | traindata.append(line[0]) 20 | count += 1 21 | except: 22 | print "error:", line, count 23 | traindata.append(" ") 24 | return traindata 25 | if __name__ == '__main__': 26 | """ 27 | 使用方法:先训练wv的model,然后再生成wv的向量,最后可以使用2-fold验证效果 28 | 主要目的:生成WV向量,提供给下一个步骤:特征融合。 29 | 注意路径 30 | """ 31 | print '---------w2v----------' 32 | # order = 'train w2v model' 33 | # order='getvec' 34 | order = 'test' 35 | 36 | print 'order is', order 37 | 38 | classob = class_w2v.w2v(300) 39 | 40 | if order == 'train w2v model': #训练WV的model 41 | totalname = 'jieba_total_cut.csv' #纯文本文件路径 42 | classob.train_w2v(totalname) 43 | exit() 44 | elif order == 'getvec': #利用生成的model得到文档的WV的向量,使用求和平均法 45 | trainname = 'jieba_train_cut.csv' 46 | testname = 'jieba_test_cut.csv' 47 | traindata = input(trainname) 48 | testdata = input(testname) 49 | 50 | res1 = classob.load_trainsform(traindata) 51 | res2 = classob.load_trainsform(testdata) 52 | print res1.shape,res2.shape 53 | np.save('wv300_win100.train.npy', res1)#保存生成的向量 54 | np.save('wv300_win100.test.npy', res2) 55 | exit() 56 | 57 | 58 | #以下为测试wv向量,即仅仅使用wv向量做这个比赛,目的在于寻找最好参数的WV向量 59 | print '载入所有的w2v向量中..' 60 | w2vtrain = np.load('wv300_win100.train.npy') 61 | w2vtest = np.load('wv300_win100.test.npy') 62 | 63 | #防止出现非法值 64 | if np.any((np.isnan(w2vtrain))): 65 | print 'nan to num!' 66 | w2vtrain = np.nan_to_num(w2vtrain) 67 | 68 | if np.any((np.isnan(w2vtest))): 69 | print 'nan to num!' 70 | w2vtest = np.nan_to_num(w2vtest) 71 | 72 | #载入label文件 73 | label_genderfile_path = 'train_gender.csv' 74 | label_agefile_path = 'train_age.csv' 75 | label_edufile_path = 'train_education.csv' 76 | genderdata = np.loadtxt(open(label_genderfile_path, 'r')).astype(int) 77 | agedata = np.loadtxt(open(label_agefile_path, 'r')).astype(int) 78 | educationdata = np.loadtxt(open(label_edufile_path, 'r')).astype(int) 79 | 80 | print '预处理中..' 81 | preprocessob = preprocess.preprocess() 82 | gender_traindatas, genderlabel = preprocessob.removezero(w2vtrain, genderdata) 83 | age_traindatas, agelabel = preprocessob.removezero(w2vtrain, agedata) 84 | edu_traindatas, edulabel = preprocessob.removezero(w2vtrain, educationdata) 85 | # ------------------------------------------------------ 86 | 87 | if order == 'test': #使用2-fold进行验证 88 | res1 = classob.validation(gender_traindatas, genderlabel, kind='gender') 89 | res2 = classob.validation(age_traindatas, agelabel, kind='age') 90 | res3 = classob.validation(edu_traindatas, edulabel, kind='edu') 91 | print 'avg is:', (res1+res2+res3)/3.0 92 | else: 93 | print 'error!' 
94 | exit() 95 | 96 | -------------------------------------------------------------------------------- /project/2-word2vec/preprocess.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import numpy as np 3 | class preprocess(): 4 | # 主要功能:去除缺失值 5 | def removezero(self, x, y): 6 | nozero = np.nonzero(y) 7 | y = y[nozero] 8 | x = np.array(x) 9 | x = x[nozero] 10 | return x, y 11 | -------------------------------------------------------------------------------- /project/3-main/README.txt: -------------------------------------------------------------------------------- 1 | 主要功能:该部分为本次比赛的核心,包括了特征词加权、模型集成stacking和W2V的融合等等。 2 | 3 | 环境说明: 4 | python2.7_64bit:以及必要的sklearn、numpy等工具包 5 | 6 | 文件说明: 7 | main.py: 主调函数 8 | preprocess.py:预处理类,主要是去除缺失值 9 | classify.py: 实现主要功能的类文件,完成预测和交叉验证。 10 | STFIWF.py: S-TFIWF加权的实现,被classify调用。该类基于sklearn.feature_extraction.text 我们根据提出的公式对IDF以及TF的部分进行了修改,具体在1093-1176行 11 | 12 | 13 | -------------------------------------------------------------------------------- /project/3-main/STFIWF.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # coding=utf-8 3 | 4 | from __future__ import unicode_literals 5 | 6 | import array 7 | from collections import Mapping, defaultdict 8 | import numbers 9 | from operator import itemgetter 10 | import re 11 | import unicodedata 12 | 13 | import numpy as np 14 | import scipy.sparse as sp 15 | 16 | from sklearn.base import BaseEstimator, TransformerMixin 17 | from sklearn.externals import six 18 | from sklearn.externals.six.moves import xrange 19 | from sklearn.preprocessing import normalize 20 | from sklearn.feature_extraction.hashing import FeatureHasher 21 | from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS 22 | from sklearn.utils import deprecated 23 | from sklearn.utils.fixes import frombuffer_empty, bincount 24 | from sklearn.utils.validation import check_is_fitted 25 | from sklearn import preprocessing 26 | 27 | __all__ = ['CountVectorizer', 28 | 'ENGLISH_STOP_WORDS', 29 | 'TfidfTransformer', 30 | 'TfidfVectorizer', 31 | 'strip_accents_ascii', 32 | 'strip_accents_unicode', 33 | 'strip_tags'] 34 | 35 | def strip_accents_unicode(s): 36 | """Transform accentuated unicode symbols into their simple counterpart 37 | 38 | Warning: the python-level loop and join operations make this 39 | implementation 20 times slower than the strip_accents_ascii basic 40 | normalization. 41 | 42 | See also 43 | -------- 44 | strip_accents_ascii 45 | Remove accentuated char for any unicode symbol that has a direct 46 | ASCII equivalent. 47 | """ 48 | normalized = unicodedata.normalize('NFKD', s) 49 | if normalized == s: 50 | return s 51 | else: 52 | return ''.join([c for c in normalized if not unicodedata.combining(c)]) 53 | 54 | def strip_accents_ascii(s): 55 | """Transform accentuated unicode symbols into ascii or nothing 56 | 57 | Warning: this solution is only suited for languages that have a direct 58 | transliteration to ASCII symbols. 59 | 60 | See also 61 | -------- 62 | strip_accents_unicode 63 | Remove accentuated char for any unicode symbol. 64 | """ 65 | nkfd_form = unicodedata.normalize('NFKD', s) 66 | return nkfd_form.encode('ASCII', 'ignore').decode('ASCII') 67 | 68 | def strip_tags(s): 69 | """Basic regexp based HTML / XML tag stripper function 70 | 71 | For serious HTML/XML preprocessing you should rather use an external 72 | library such as lxml or BeautifulSoup. 
73 | """ 74 | return re.compile(r"<([^>]+)>", flags=re.UNICODE).sub(" ", s) 75 | 76 | def _check_stop_list(stop): 77 | if stop == "english": 78 | return ENGLISH_STOP_WORDS 79 | elif isinstance(stop, six.string_types): 80 | raise ValueError("not a built-in stop list: %s" % stop) 81 | elif stop is None: 82 | return None 83 | else: # assume it's a collection 84 | return frozenset(stop) 85 | 86 | class VectorizerMixin(object): 87 | """Provides common code for text vectorizers (tokenization logic).""" 88 | 89 | _white_spaces = re.compile(r"\s\s+") 90 | 91 | def decode(self, doc): 92 | """Decode the input into a string of unicode symbols 93 | 94 | The decoding strategy depends on the vectorizer parameters. 95 | """ 96 | if self.input == 'filename': 97 | with open(doc, 'rb') as fh: 98 | doc = fh.read() 99 | 100 | elif self.input == 'file': 101 | doc = doc.read() 102 | 103 | if isinstance(doc, bytes): 104 | doc = doc.decode(self.encoding, self.decode_error) 105 | 106 | if doc is np.nan: 107 | raise ValueError("np.nan is an invalid document, expected byte or " 108 | "unicode string.") 109 | 110 | return doc 111 | 112 | def _word_ngrams(self, tokens, stop_words=None): 113 | """Turn tokens into a sequence of n-grams after stop words filtering""" 114 | # handle stop words 115 | if stop_words is not None: 116 | tokens = [w for w in tokens if w not in stop_words] 117 | 118 | # handle token n-grams 119 | min_n, max_n = self.ngram_range 120 | if max_n != 1: 121 | original_tokens = tokens 122 | tokens = [] 123 | n_original_tokens = len(original_tokens) 124 | for n in xrange(min_n, 125 | min(max_n + 1, n_original_tokens + 1)): 126 | for i in xrange(n_original_tokens - n + 1): 127 | tokens.append(" ".join(original_tokens[i: i + n])) 128 | 129 | return tokens 130 | 131 | def _char_ngrams(self, text_document): 132 | """Tokenize text_document into a sequence of character n-grams""" 133 | # normalize white spaces 134 | text_document = self._white_spaces.sub(" ", text_document) 135 | 136 | text_len = len(text_document) 137 | ngrams = [] 138 | min_n, max_n = self.ngram_range 139 | for n in xrange(min_n, min(max_n + 1, text_len + 1)): 140 | for i in xrange(text_len - n + 1): 141 | ngrams.append(text_document[i: i + n]) 142 | return ngrams 143 | 144 | def _char_wb_ngrams(self, text_document): 145 | """Whitespace sensitive char-n-gram tokenization. 146 | 147 | Tokenize text_document into a sequence of character n-grams 148 | excluding any whitespace (operating only inside word boundaries)""" 149 | # normalize white spaces 150 | text_document = self._white_spaces.sub(" ", text_document) 151 | 152 | min_n, max_n = self.ngram_range 153 | ngrams = [] 154 | for w in text_document.split(): 155 | w = ' ' + w + ' ' 156 | w_len = len(w) 157 | for n in xrange(min_n, max_n + 1): 158 | offset = 0 159 | ngrams.append(w[offset:offset + n]) 160 | while offset + n < w_len: 161 | offset += 1 162 | ngrams.append(w[offset:offset + n]) 163 | if offset == 0: # count a short word (w_len < n) only once 164 | break 165 | return ngrams 166 | 167 | def build_preprocessor(self): 168 | """Return a function to preprocess the text before tokenization""" 169 | if self.preprocessor is not None: 170 | return self.preprocessor 171 | 172 | # unfortunately python functools package does not have an efficient 173 | # `compose` function that would have allowed us to chain a dynamic 174 | # number of functions. 
However the cost of a lambda call is a few 175 | # hundreds of nanoseconds which is negligible when compared to the 176 | # cost of tokenizing a string of 1000 chars for instance. 177 | noop = lambda x: x 178 | 179 | # accent stripping 180 | if not self.strip_accents: 181 | strip_accents = noop 182 | elif callable(self.strip_accents): 183 | strip_accents = self.strip_accents 184 | elif self.strip_accents == 'ascii': 185 | strip_accents = strip_accents_ascii 186 | elif self.strip_accents == 'unicode': 187 | strip_accents = strip_accents_unicode 188 | else: 189 | raise ValueError('Invalid value for "strip_accents": %s' % 190 | self.strip_accents) 191 | 192 | if self.lowercase: 193 | return lambda x: strip_accents(x.lower()) 194 | else: 195 | return strip_accents 196 | 197 | def build_tokenizer(self): 198 | """Return a function that splits a string into a sequence of tokens""" 199 | if self.tokenizer is not None: 200 | return self.tokenizer 201 | token_pattern = re.compile(self.token_pattern) 202 | return lambda doc: token_pattern.findall(doc) 203 | 204 | def get_stop_words(self): 205 | """Build or fetch the effective stop words list""" 206 | return _check_stop_list(self.stop_words) 207 | 208 | def build_analyzer(self): 209 | """Return a callable that handles preprocessing and tokenization""" 210 | if callable(self.analyzer): 211 | return self.analyzer 212 | 213 | preprocess = self.build_preprocessor() 214 | 215 | if self.analyzer == 'char': 216 | return lambda doc: self._char_ngrams(preprocess(self.decode(doc))) 217 | 218 | elif self.analyzer == 'char_wb': 219 | return lambda doc: self._char_wb_ngrams( 220 | preprocess(self.decode(doc))) 221 | 222 | elif self.analyzer == 'word': 223 | stop_words = self.get_stop_words() 224 | tokenize = self.build_tokenizer() 225 | 226 | return lambda doc: self._word_ngrams( 227 | tokenize(preprocess(self.decode(doc))), stop_words) 228 | 229 | else: 230 | raise ValueError('%s is not a valid tokenization scheme/analyzer' % 231 | self.analyzer) 232 | 233 | def _validate_vocabulary(self): 234 | vocabulary = self.vocabulary 235 | if vocabulary is not None: 236 | if isinstance(vocabulary, set): 237 | vocabulary = sorted(vocabulary) 238 | if not isinstance(vocabulary, Mapping): 239 | vocab = {} 240 | for i, t in enumerate(vocabulary): 241 | if vocab.setdefault(t, i) != i: 242 | msg = "Duplicate term in vocabulary: %r" % t 243 | raise ValueError(msg) 244 | vocabulary = vocab 245 | else: 246 | indices = set(six.itervalues(vocabulary)) 247 | if len(indices) != len(vocabulary): 248 | raise ValueError("Vocabulary contains repeated indices.") 249 | for i in xrange(len(vocabulary)): 250 | if i not in indices: 251 | msg = ("Vocabulary of size %d doesn't contain index " 252 | "%d." % (len(vocabulary), i)) 253 | raise ValueError(msg) 254 | if not vocabulary: 255 | raise ValueError("empty vocabulary passed to fit") 256 | self.fixed_vocabulary_ = True 257 | self.vocabulary_ = dict(vocabulary) 258 | else: 259 | self.fixed_vocabulary_ = False 260 | 261 | def _check_vocabulary(self): 262 | """Check if vocabulary is empty or missing (not fit-ed)""" 263 | msg = "%(name)s - Vocabulary wasn't fitted." 
264 | check_is_fitted(self, 'vocabulary_', msg=msg), 265 | 266 | if len(self.vocabulary_) == 0: 267 | raise ValueError("Vocabulary is empty") 268 | 269 | class HashingVectorizer(BaseEstimator, VectorizerMixin): 270 | """Convert a collection of text documents to a matrix of token occurrences 271 | 272 | It turns a collection of text documents into a scipy.sparse matrix holding 273 | token occurrence counts (or binary occurrence information), possibly 274 | normalized as token frequencies if norm='l1' or projected on the euclidean 275 | unit sphere if norm='l2'. 276 | 277 | This text vectorizer implementation uses the hashing trick to find the 278 | token string name to feature integer index mapping. 279 | 280 | This strategy has several advantages: 281 | 282 | - it is very low memory scalable to large datasets as there is no need to 283 | store a vocabulary dictionary in memory 284 | 285 | - it is fast to pickle and un-pickle as it holds no state besides the 286 | constructor parameters 287 | 288 | - it can be used in a streaming (partial fit) or parallel pipeline as there 289 | is no state computed during fit. 290 | 291 | There are also a couple of cons (vs using a CountVectorizer with an 292 | in-memory vocabulary): 293 | 294 | - there is no way to compute the inverse transform (from feature indices to 295 | string feature names) which can be a problem when trying to introspect 296 | which features are most important to a model. 297 | 298 | - there can be collisions: distinct tokens can be mapped to the same 299 | feature index. However in practice this is rarely an issue if n_features 300 | is large enough (e.g. 2 ** 18 for text classification problems). 301 | 302 | - no IDF weighting as this would render the transformer stateful. 303 | 304 | The hash function employed is the signed 32-bit version of Murmurhash3. 305 | 306 | Read more in the :ref:`User Guide `. 307 | 308 | Parameters 309 | ---------- 310 | 311 | input : string {'filename', 'file', 'content'} 312 | If 'filename', the sequence passed as an argument to fit is 313 | expected to be a list of filenames that need reading to fetch 314 | the raw content to analyze. 315 | 316 | If 'file', the sequence items must have a 'read' method (file-like 317 | object) that is called to fetch the bytes in memory. 318 | 319 | Otherwise the input is expected to be the sequence strings or 320 | bytes items are expected to be analyzed directly. 321 | 322 | encoding : string, default='utf-8' 323 | If bytes or files are given to analyze, this encoding is used to 324 | decode. 325 | 326 | decode_error : {'strict', 'ignore', 'replace'} 327 | Instruction on what to do if a byte sequence is given to analyze that 328 | contains characters not of the given `encoding`. By default, it is 329 | 'strict', meaning that a UnicodeDecodeError will be raised. Other 330 | values are 'ignore' and 'replace'. 331 | 332 | strip_accents : {'ascii', 'unicode', None} 333 | Remove accents during the preprocessing step. 334 | 'ascii' is a fast method that only works on characters that have 335 | an direct ASCII mapping. 336 | 'unicode' is a slightly slower method that works on any characters. 337 | None (default) does nothing. 338 | 339 | analyzer : string, {'word', 'char', 'char_wb'} or callable 340 | Whether the feature should be made of word or character n-grams. 341 | Option 'char_wb' creates character n-grams only from text inside 342 | word boundaries. 
343 | 344 | If a callable is passed it is used to extract the sequence of features 345 | out of the raw, unprocessed input. 346 | 347 | preprocessor : callable or None (default) 348 | Override the preprocessing (string transformation) stage while 349 | preserving the tokenizing and n-grams generation steps. 350 | 351 | tokenizer : callable or None (default) 352 | Override the string tokenization step while preserving the 353 | preprocessing and n-grams generation steps. 354 | Only applies if ``analyzer == 'word'``. 355 | 356 | ngram_range : tuple (min_n, max_n), default=(1, 1) 357 | The lower and upper boundary of the range of n-values for different 358 | n-grams to be extracted. All values of n such that min_n <= n <= max_n 359 | will be used. 360 | 361 | stop_words : string {'english'}, list, or None (default) 362 | If 'english', a built-in stop word list for English is used. 363 | 364 | If a list, that list is assumed to contain stop words, all of which 365 | will be removed from the resulting tokens. 366 | Only applies if ``analyzer == 'word'``. 367 | 368 | lowercase : boolean, default=True 369 | Convert all characters to lowercase before tokenizing. 370 | 371 | token_pattern : string 372 | Regular expression denoting what constitutes a "token", only used 373 | if ``analyzer == 'word'``. The default regexp selects tokens of 2 374 | or more alphanumeric characters (punctuation is completely ignored 375 | and always treated as a token separator). 376 | 377 | n_features : integer, default=(2 ** 20) 378 | The number of features (columns) in the output matrices. Small numbers 379 | of features are likely to cause hash collisions, but large numbers 380 | will cause larger coefficient dimensions in linear learners. 381 | 382 | norm : 'l1', 'l2' or None, optional 383 | Norm used to normalize term vectors. None for no normalization. 384 | 385 | binary: boolean, default=False. 386 | If True, all non zero counts are set to 1. This is useful for discrete 387 | probabilistic models that model binary events rather than integer 388 | counts. 389 | 390 | dtype: type, optional 391 | Type of the matrix returned by fit_transform() or transform(). 392 | 393 | non_negative : boolean, default=False 394 | Whether output matrices should contain non-negative values only; 395 | effectively calls abs on the matrix prior to returning it. 396 | When True, output values can be interpreted as frequencies. 397 | When False, output values will have expected value zero. 
398 | 399 | See also 400 | -------- 401 | CountVectorizer, TfidfVectorizer 402 | 403 | """ 404 | 405 | def __init__(self, input='content', encoding='utf-8', 406 | decode_error='strict', strip_accents=None, 407 | lowercase=True, preprocessor=None, tokenizer=None, 408 | stop_words=None, token_pattern=r"(?u)\b\w\w+\b", 409 | ngram_range=(1, 1), analyzer='word', n_features=(2 ** 20), 410 | binary=False, norm='l2', non_negative=False, 411 | dtype=np.float64): 412 | self.input = input 413 | self.encoding = encoding 414 | self.decode_error = decode_error 415 | self.strip_accents = strip_accents 416 | self.preprocessor = preprocessor 417 | self.tokenizer = tokenizer 418 | self.analyzer = analyzer 419 | self.lowercase = lowercase 420 | self.token_pattern = token_pattern 421 | self.stop_words = stop_words 422 | self.n_features = n_features 423 | self.ngram_range = ngram_range 424 | self.binary = binary 425 | self.norm = norm 426 | self.non_negative = non_negative 427 | self.dtype = dtype 428 | 429 | def partial_fit(self, X, y=None): 430 | """Does nothing: this transformer is stateless. 431 | 432 | This method is just there to mark the fact that this transformer 433 | can work in a streaming setup. 434 | 435 | """ 436 | return self 437 | 438 | def fit(self, X, y=None): 439 | """Does nothing: this transformer is stateless.""" 440 | # triggers a parameter validation 441 | self._get_hasher().fit(X, y=y) 442 | return self 443 | 444 | def transform(self, X, y=None): 445 | """Transform a sequence of documents to a document-term matrix. 446 | 447 | Parameters 448 | ---------- 449 | X : iterable over raw text documents, length = n_samples 450 | Samples. Each sample must be a text document (either bytes or 451 | unicode strings, file name or file object depending on the 452 | constructor argument) which will be tokenized and hashed. 453 | 454 | y : (ignored) 455 | 456 | Returns 457 | ------- 458 | X : scipy.sparse matrix, shape = (n_samples, self.n_features) 459 | Document-term matrix. 460 | 461 | """ 462 | analyzer = self.build_analyzer() 463 | X = self._get_hasher().transform(analyzer(doc) for doc in X) 464 | if self.binary: 465 | X.data.fill(1) 466 | if self.norm is not None: 467 | X = normalize(X, norm=self.norm, copy=False) 468 | return X 469 | 470 | # Alias transform to fit_transform for convenience 471 | fit_transform = transform 472 | 473 | def _get_hasher(self): 474 | return FeatureHasher(n_features=self.n_features, 475 | input_type='string', dtype=self.dtype, 476 | non_negative=self.non_negative) 477 | 478 | def _document_frequency(X): 479 | """Count the number of non-zero values for each feature in sparse X.""" 480 | 481 | if sp.isspmatrix_csr(X): 482 | # return np.sum(X,axis=0) 483 | return bincount(X.indices, minlength=X.shape[1]) 484 | 485 | else: 486 | 487 | return np.diff(sp.csc_matrix(X, copy=False).indptr) 488 | 489 | class CountVectorizer(BaseEstimator, VectorizerMixin): 490 | """Convert a collection of text documents to a matrix of token counts 491 | 492 | This implementation produces a sparse representation of the counts using 493 | scipy.sparse.coo_matrix. 494 | 495 | If you do not provide an a-priori dictionary and you do not use an analyzer 496 | that does some kind of feature selection then the number of features will 497 | be equal to the vocabulary size found by analyzing the data. 498 | 499 | Read more in the :ref:`User Guide `. 
500 | 501 | Parameters 502 | ---------- 503 | input : string {'filename', 'file', 'content'} 504 | If 'filename', the sequence passed as an argument to fit is 505 | expected to be a list of filenames that need reading to fetch 506 | the raw content to analyze. 507 | 508 | If 'file', the sequence items must have a 'read' method (file-like 509 | object) that is called to fetch the bytes in memory. 510 | 511 | Otherwise the input is expected to be the sequence strings or 512 | bytes items are expected to be analyzed directly. 513 | 514 | encoding : string, 'utf-8' by default. 515 | If bytes or files are given to analyze, this encoding is used to 516 | decode. 517 | 518 | decode_error : {'strict', 'ignore', 'replace'} 519 | Instruction on what to do if a byte sequence is given to analyze that 520 | contains characters not of the given `encoding`. By default, it is 521 | 'strict', meaning that a UnicodeDecodeError will be raised. Other 522 | values are 'ignore' and 'replace'. 523 | 524 | strip_accents : {'ascii', 'unicode', None} 525 | Remove accents during the preprocessing step. 526 | 'ascii' is a fast method that only works on characters that have 527 | an direct ASCII mapping. 528 | 'unicode' is a slightly slower method that works on any characters. 529 | None (default) does nothing. 530 | 531 | analyzer : string, {'word', 'char', 'char_wb'} or callable 532 | Whether the feature should be made of word or character n-grams. 533 | Option 'char_wb' creates character n-grams only from text inside 534 | word boundaries. 535 | 536 | If a callable is passed it is used to extract the sequence of features 537 | out of the raw, unprocessed input. 538 | 539 | preprocessor : callable or None (default) 540 | Override the preprocessing (string transformation) stage while 541 | preserving the tokenizing and n-grams generation steps. 542 | 543 | tokenizer : callable or None (default) 544 | Override the string tokenization step while preserving the 545 | preprocessing and n-grams generation steps. 546 | Only applies if ``analyzer == 'word'``. 547 | 548 | ngram_range : tuple (min_n, max_n) 549 | The lower and upper boundary of the range of n-values for different 550 | n-grams to be extracted. All values of n such that min_n <= n <= max_n 551 | will be used. 552 | 553 | stop_words : string {'english'}, list, or None (default) 554 | If 'english', a built-in stop word list for English is used. 555 | 556 | If a list, that list is assumed to contain stop words, all of which 557 | will be removed from the resulting tokens. 558 | Only applies if ``analyzer == 'word'``. 559 | 560 | If None, no stop words will be used. max_df can be set to a value 561 | in the range [0.7, 1.0) to automatically detect and filter stop 562 | words based on intra corpus document frequency of terms. 563 | 564 | lowercase : boolean, True by default 565 | Convert all characters to lowercase before tokenizing. 566 | 567 | token_pattern : string 568 | Regular expression denoting what constitutes a "token", only used 569 | if ``analyzer == 'word'``. The default regexp select tokens of 2 570 | or more alphanumeric characters (punctuation is completely ignored 571 | and always treated as a token separator). 572 | 573 | max_df : float in range [0.0, 1.0] or int, default=1.0 574 | When building the vocabulary ignore terms that have a document 575 | frequency strictly higher than the given threshold (corpus-specific 576 | stop words). 577 | If float, the parameter represents a proportion of documents, integer 578 | absolute counts. 
579 | This parameter is ignored if vocabulary is not None. 580 | 581 | min_df : float in range [0.0, 1.0] or int, default=1 582 | When building the vocabulary ignore terms that have a document 583 | frequency strictly lower than the given threshold. This value is also 584 | called cut-off in the literature. 585 | If float, the parameter represents a proportion of documents, integer 586 | absolute counts. 587 | This parameter is ignored if vocabulary is not None. 588 | 589 | max_features : int or None, default=None 590 | If not None, build a vocabulary that only consider the top 591 | max_features ordered by term frequency across the corpus. 592 | 593 | This parameter is ignored if vocabulary is not None. 594 | 595 | vocabulary : Mapping or iterable, optional 596 | Either a Mapping (e.g., a dict) where keys are terms and values are 597 | indices in the feature matrix, or an iterable over terms. If not 598 | given, a vocabulary is determined from the input documents. Indices 599 | in the mapping should not be repeated and should not have any gap 600 | between 0 and the largest index. 601 | 602 | binary : boolean, default=False 603 | If True, all non zero counts are set to 1. This is useful for discrete 604 | probabilistic models that model binary events rather than integer 605 | counts. 606 | 607 | dtype : type, optional 608 | Type of the matrix returned by fit_transform() or transform(). 609 | 610 | Attributes 611 | ---------- 612 | vocabulary_ : dict 613 | A mapping of terms to feature indices. 614 | 615 | stop_words_ : set 616 | Terms that were ignored because they either: 617 | 618 | - occurred in too many documents (`max_df`) 619 | - occurred in too few documents (`min_df`) 620 | - were cut off by feature selection (`max_features`). 621 | 622 | This is only available if no vocabulary was given. 623 | 624 | See also 625 | -------- 626 | HashingVectorizer, TfidfVectorizer 627 | 628 | Notes 629 | ----- 630 | The ``stop_words_`` attribute can get large and increase the model size 631 | when pickling. This attribute is provided only for introspection and can 632 | be safely removed using delattr or set to None before pickling. 
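    This copy of the scikit-learn class additionally provides ``transform2`` /
    ``_count_vocab_2``, which build within-document term frequencies (each count
    divided by the document's total in-vocabulary token count) instead of raw
    counts, and ``get_term_topic``, which loads a per-term value from a
    'word_topic.txt' file and min-max scales it over the vocabulary.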
633 | """ 634 | 635 | def __init__(self, input='content', encoding='utf-8', 636 | decode_error='strict', strip_accents=None, 637 | lowercase=True, preprocessor=None, tokenizer=None, 638 | stop_words=None, token_pattern=r"(?u)\b\w\w+\b", 639 | ngram_range=(1, 1), analyzer='word', 640 | max_df=1.0, min_df=1, max_features=None, 641 | vocabulary=None, binary=False, dtype=np.int64): 642 | self.input = input 643 | self.encoding = encoding 644 | self.decode_error = decode_error 645 | self.strip_accents = strip_accents 646 | self.preprocessor = preprocessor 647 | self.tokenizer = tokenizer 648 | self.analyzer = analyzer 649 | self.lowercase = lowercase 650 | self.token_pattern = token_pattern 651 | self.stop_words = stop_words 652 | self.max_df = max_df 653 | self.min_df = min_df 654 | if max_df < 0 or min_df < 0: 655 | raise ValueError("negative value for max_df or min_df") 656 | self.max_features = max_features 657 | if max_features is not None: 658 | if (not isinstance(max_features, numbers.Integral) or 659 | max_features <= 0): 660 | raise ValueError( 661 | "max_features=%r, neither a positive integer nor None" 662 | % max_features) 663 | self.ngram_range = ngram_range 664 | self.vocabulary = vocabulary 665 | self.binary = binary 666 | self.dtype = dtype 667 | 668 | def _sort_features(self, X, vocabulary): 669 | """Sort features by name 670 | 671 | Returns a reordered matrix and modifies the vocabulary in place 672 | """ 673 | sorted_features = sorted(six.iteritems(vocabulary)) 674 | map_index = np.empty(len(sorted_features), dtype=np.int32) 675 | for new_val, (term, old_val) in enumerate(sorted_features): 676 | vocabulary[term] = new_val 677 | map_index[old_val] = new_val 678 | 679 | X.indices = map_index.take(X.indices, mode='clip') 680 | return X 681 | 682 | def _limit_features(self, X, vocabulary, high=None, low=None, 683 | limit=None): 684 | """Remove too rare or too common features. 685 | 686 | Prune features that are non zero in more samples than high or less 687 | documents than low, modifying the vocabulary, and restricting it to 688 | at most the limit most frequent. 689 | 690 | This does not prune samples with zero features. 691 | """ 692 | if high is None and low is None and limit is None: 693 | return X, set() 694 | 695 | # Calculate a mask based on document frequencies 696 | dfs = _document_frequency(X) 697 | tfs = np.asarray(X.sum(axis=0)).ravel() 698 | mask = np.ones(len(dfs), dtype=bool) 699 | if high is not None: 700 | mask &= dfs <= high 701 | if low is not None: 702 | mask &= dfs >= low 703 | if limit is not None and mask.sum() > limit: 704 | mask_inds = (-tfs[mask]).argsort()[:limit] 705 | new_mask = np.zeros(len(dfs), dtype=bool) 706 | new_mask[np.where(mask)[0][mask_inds]] = True 707 | mask = new_mask 708 | 709 | new_indices = np.cumsum(mask) - 1 # maps old indices to new 710 | removed_terms = set() 711 | for term, old_index in list(six.iteritems(vocabulary)): 712 | if mask[old_index]: 713 | vocabulary[term] = new_indices[old_index] 714 | else: 715 | del vocabulary[term] 716 | removed_terms.add(term) 717 | kept_indices = np.where(mask)[0] 718 | if len(kept_indices) == 0: 719 | raise ValueError("After pruning, no terms remain. 
Try a lower" 720 | " min_df or a higher max_df.") 721 | return X[:, kept_indices], removed_terms 722 | 723 | def _count_vocab(self, raw_documents, fixed_vocab): 724 | """Create sparse feature matrix, and vocabulary where fixed_vocab=False 725 | """ 726 | if fixed_vocab: 727 | vocabulary = self.vocabulary_ 728 | else: 729 | # Add a new value when a new vocabulary item is seen 730 | vocabulary = defaultdict() 731 | vocabulary.default_factory = vocabulary.__len__ 732 | 733 | analyze = self.build_analyzer() 734 | j_indices = [] 735 | indptr = _make_int_array() 736 | values = _make_int_array() 737 | indptr.append(0) 738 | for doc in raw_documents: 739 | feature_counter = {} 740 | for feature in analyze(doc): 741 | try: 742 | feature_idx = vocabulary[feature] 743 | if feature_idx not in feature_counter: 744 | feature_counter[feature_idx] = 1 745 | else: 746 | feature_counter[feature_idx] += 1 747 | except KeyError: 748 | # Ignore out-of-vocabulary items for fixed_vocab=True 749 | continue 750 | 751 | j_indices.extend(feature_counter.keys()) 752 | values.extend(feature_counter.values()) 753 | indptr.append(len(j_indices)) 754 | 755 | if not fixed_vocab: 756 | # disable defaultdict behaviour 757 | vocabulary = dict(vocabulary) 758 | if not vocabulary: 759 | raise ValueError("empty vocabulary; perhaps the documents only" 760 | " contain stop words") 761 | 762 | j_indices = np.asarray(j_indices, dtype=np.intc) 763 | indptr = np.frombuffer(indptr, dtype=np.intc) 764 | values = frombuffer_empty(values, dtype=np.intc) 765 | 766 | X = sp.csr_matrix((values, j_indices, indptr), 767 | shape=(len(indptr) - 1, len(vocabulary)), 768 | dtype=self.dtype) 769 | X.sort_indices() 770 | return vocabulary, X 771 | 772 | def _count_vocab_2(self, raw_documents, fixed_vocab): 773 | """Create sparse feature matrix, and vocabulary where fixed_vocab=False 774 | """ 775 | if fixed_vocab: 776 | vocabulary = self.vocabulary_ 777 | else: 778 | # Add a new value when a new vocabulary item is seen 779 | vocabulary = defaultdict() 780 | vocabulary.default_factory = vocabulary.__len__ 781 | 782 | analyze = self.build_analyzer() 783 | j_indices = [] 784 | indptr = _make_int_array() 785 | # values = _make_int_array() 786 | values = array.array(str("f")) 787 | indptr.append(0) 788 | for doc in raw_documents: 789 | feature_counter = {} 790 | for feature in analyze(doc): 791 | try: 792 | feature_idx = vocabulary[feature] 793 | if feature_idx not in feature_counter: 794 | feature_counter[feature_idx] = 1 795 | else: 796 | feature_counter[feature_idx] += 1 797 | except KeyError: 798 | # Ignore out-of-vocabulary items for fixed_vocab=True 799 | continue 800 | 801 | j_indices.extend(feature_counter.keys()) 802 | values.extend([i * 1.0 / sum(feature_counter.values()) for i in feature_counter.values()]) 803 | indptr.append(len(j_indices)) 804 | 805 | if not fixed_vocab: 806 | # disable defaultdict behaviour 807 | vocabulary = dict(vocabulary) 808 | if not vocabulary: 809 | raise ValueError("empty vocabulary; perhaps the documents only" 810 | " contain stop words") 811 | 812 | j_indices = np.asarray(j_indices, dtype=np.intc) 813 | indptr = np.frombuffer(indptr, dtype=np.intc) 814 | values = frombuffer_empty(values, dtype=np.float32) 815 | 816 | X = sp.csr_matrix((values, j_indices, indptr), 817 | shape=(len(indptr) - 1, len(vocabulary))) 818 | X.sort_indices() 819 | return vocabulary, X 820 | 821 | def fit(self, raw_documents, y=None): 822 | """Learn a vocabulary dictionary of all tokens in the raw documents. 
823 | 824 | Parameters 825 | ---------- 826 | raw_documents : iterable 827 | An iterable which yields either str, unicode or file objects. 828 | 829 | Returns 830 | ------- 831 | self 832 | """ 833 | self.fit_transform(raw_documents) 834 | return self 835 | 836 | def fit_transform(self, raw_documents, y=None): 837 | """Learn the vocabulary dictionary and return term-document matrix. 838 | 839 | This is equivalent to fit followed by transform, but more efficiently 840 | implemented. 841 | 842 | Parameters 843 | ---------- 844 | raw_documents : iterable 845 | An iterable which yields either str, unicode or file objects. 846 | 847 | Returns 848 | ------- 849 | X : array, [n_samples, n_features] 850 | Document-term matrix. 851 | """ 852 | # We intentionally don't call the transform method to make 853 | # fit_transform overridable without unwanted side effects in 854 | # TfidfVectorizer. 855 | self._validate_vocabulary() 856 | max_df = self.max_df 857 | min_df = self.min_df 858 | max_features = self.max_features 859 | 860 | vocabulary, X = self._count_vocab(raw_documents, 861 | self.fixed_vocabulary_) 862 | 863 | if self.binary: 864 | X.data.fill(1) 865 | 866 | if not self.fixed_vocabulary_: 867 | X = self._sort_features(X, vocabulary) 868 | 869 | n_doc = X.shape[0] 870 | max_doc_count = (max_df 871 | if isinstance(max_df, numbers.Integral) 872 | else max_df * n_doc) 873 | min_doc_count = (min_df 874 | if isinstance(min_df, numbers.Integral) 875 | else min_df * n_doc) 876 | if max_doc_count < min_doc_count: 877 | raise ValueError( 878 | "max_df corresponds to < documents than min_df") 879 | X, self.stop_words_ = self._limit_features(X, vocabulary, 880 | max_doc_count, 881 | min_doc_count, 882 | max_features) 883 | 884 | self.vocabulary_ = vocabulary 885 | 886 | return X 887 | 888 | def transform(self, raw_documents): 889 | """Transform documents to document-term matrix. 890 | 891 | Extract token counts out of raw text documents using the vocabulary 892 | fitted with fit or the one provided to the constructor. 893 | 894 | Parameters 895 | ---------- 896 | raw_documents : iterable 897 | An iterable which yields either str, unicode or file objects. 898 | 899 | Returns 900 | ------- 901 | X : sparse matrix, [n_samples, n_features] 902 | Document-term matrix. 903 | """ 904 | if not hasattr(self, 'vocabulary_'): 905 | self._validate_vocabulary() 906 | 907 | self._check_vocabulary() 908 | 909 | # use the same matrix-building strategy as fit_transform 910 | _, X = self._count_vocab(raw_documents, fixed_vocab=True) 911 | if self.binary: 912 | X.data.fill(1) 913 | return X 914 | 915 | def get_term_topic(self, X): 916 | n_features = X.shape[1] 917 | id2word = self.vocabulary_ 918 | word2topic = {} 919 | 920 | with open('word_topic.txt', 'r') as f: 921 | for line in f: 922 | strs = line.decode('utf-8').strip('\n').split('\t') 923 | word2topic[strs[0]] = strs[2] 924 | 925 | topic = np.zeros((len(id2word),)) 926 | 927 | for i, key in enumerate(id2word): 928 | if key in word2topic: 929 | topic[id2word[key]] = word2topic[key] 930 | else: 931 | print key 932 | 933 | topic = preprocessing.MinMaxScaler().fit_transform(topic) 934 | # topic = sp.spdiags(topic, diags=0, m=n_features, 935 | # n=n_features, format='csr') 936 | return topic 937 | 938 | def transform2(self, raw_documents): 939 | """Transform documents to document-term matrix. 940 | 941 | Extract token counts out of raw text documents using the vocabulary 942 | fitted with fit or the one provided to the constructor. 
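        Unlike ``transform``, the entries are not raw counts: each count is divided
        by the document's total in-vocabulary token count (see ``_count_vocab_2``).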
943 | 944 | Parameters 945 | ---------- 946 | raw_documents : iterable 947 | An iterable which yields either str, unicode or file objects. 948 | 949 | Returns 950 | ------- 951 | X : sparse matrix, [n_samples, n_features] 952 | Document-term matrix. 953 | """ 954 | if not hasattr(self, 'vocabulary_'): 955 | self._validate_vocabulary() 956 | 957 | self._check_vocabulary() 958 | 959 | # use the same matrix-building strategy as fit_transform 960 | _, X = self._count_vocab_2(raw_documents, fixed_vocab=True) 961 | if self.binary: 962 | X.data.fill(1) 963 | return X 964 | 965 | def inverse_transform(self, X): 966 | """Return terms per document with nonzero entries in X. 967 | 968 | Parameters 969 | ---------- 970 | X : {array, sparse matrix}, shape = [n_samples, n_features] 971 | 972 | Returns 973 | ------- 974 | X_inv : list of arrays, len = n_samples 975 | List of arrays of terms. 976 | """ 977 | self._check_vocabulary() 978 | 979 | if sp.issparse(X): 980 | # We need CSR format for fast row manipulations. 981 | X = X.tocsr() 982 | else: 983 | # We need to convert X to a matrix, so that the indexing 984 | # returns 2D objects 985 | X = np.asmatrix(X) 986 | n_samples = X.shape[0] 987 | 988 | terms = np.array(list(self.vocabulary_.keys())) 989 | indices = np.array(list(self.vocabulary_.values())) 990 | inverse_vocabulary = terms[np.argsort(indices)] 991 | 992 | return [inverse_vocabulary[X[i, :].nonzero()[1]].ravel() 993 | for i in range(n_samples)] 994 | 995 | def get_feature_names(self): 996 | """Array mapping from feature integer indices to feature name""" 997 | self._check_vocabulary() 998 | 999 | return [t for t, i in sorted(six.iteritems(self.vocabulary_), 1000 | key=itemgetter(1))] 1001 | 1002 | 1003 | def _make_int_array(): 1004 | """Construct an array.array of a type suitable for scipy.sparse indices.""" 1005 | return array.array(str("i")) 1006 | 1007 | class TfidfTransformer(BaseEstimator, TransformerMixin): 1008 | """Transform a count matrix to a normalized tf or tf-idf representation 1009 | 1010 | Tf means term-frequency while tf-idf means term-frequency times inverse 1011 | document-frequency. This is a common term weighting scheme in information 1012 | retrieval, that has also found good use in document classification. 1013 | 1014 | The goal of using tf-idf instead of the raw frequencies of occurrence of a 1015 | token in a given document is to scale down the impact of tokens that occur 1016 | very frequently in a given corpus and that are hence empirically less 1017 | informative than features that occur in a small fraction of the training 1018 | corpus. 1019 | 1020 | The formula that is used to compute the tf-idf of term t is 1021 | tf-idf(d, t) = tf(t) * idf(d, t), and the idf is computed as 1022 | idf(d, t) = log [ n / df(d, t) ] + 1 (if ``smooth_idf=False``), 1023 | where n is the total number of documents and df(d, t) is the 1024 | document frequency; the document frequency is the number of documents d 1025 | that contain term t. The effect of adding "1" to the idf in the equation 1026 | above is that terms with zero idf, i.e., terms that occur in all documents 1027 | in a training set, will not be entirely ignored. 1028 | (Note that the idf formula above differs from the standard 1029 | textbook notation that defines the idf as 1030 | idf(d, t) = log [ n / (df(d, t) + 1) ]). 
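    This copy departs from the stock scikit-learn transformer to implement the
    S-TFIWF weighting described in the 3-main README. ``fit`` additionally takes
    the label vector ``y``: for every term it computes the class-conditional
    relative frequencies, measures how much they deviate from their class average,
    and multiplies the idf by that normalized deviation. The idf itself is
    idf(t) = log [ sum_t' df(t') / df(t) ], i.e. the total of all document
    frequencies over the term's own document frequency, and ``transform`` replaces
    the raw tf with sqrt(tf) before applying the idf and the final normalization
    (see the ``fit``/``transform`` code below, the range the README points to,
    lines 1093-1176).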
1031 | 1032 | If ``smooth_idf=True`` (the default), the constant "1" is added to the 1033 | numerator and denominator of the idf as if an extra document was seen 1034 | containing every term in the collection exactly once, which prevents 1035 | zero divisions: idf(d, t) = log [ (1 + n) / 1 + df(d, t) ] + 1. 1036 | 1037 | Furthermore, the formulas used to compute tf and idf depend 1038 | on parameter settings that correspond to the SMART notation used in IR 1039 | as follows: 1040 | 1041 | Tf is "n" (natural) by default, "l" (logarithmic) when 1042 | ``sublinear_tf=True``. 1043 | Idf is "t" when use_idf is given, "n" (none) otherwise. 1044 | Normalization is "c" (cosine) when ``norm='l2'``, "n" (none) 1045 | when ``norm=None``. 1046 | 1047 | Read more in the :ref:`User Guide `. 1048 | 1049 | Parameters 1050 | ---------- 1051 | norm : 'l1', 'l2' or None, optional 1052 | Norm used to normalize term vectors. None for no normalization. 1053 | 1054 | use_idf : boolean, default=True 1055 | Enable inverse-document-frequency reweighting. 1056 | 1057 | smooth_idf : boolean, default=True 1058 | Smooth idf weights by adding one to document frequencies, as if an 1059 | extra document was seen containing every term in the collection 1060 | exactly once. Prevents zero divisions. 1061 | 1062 | sublinear_tf : boolean, default=False 1063 | Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf). 1064 | 1065 | References 1066 | ---------- 1067 | 1068 | .. [Yates2011] `R. Baeza-Yates and B. Ribeiro-Neto (2011). Modern 1069 | Information Retrieval. Addison Wesley, pp. 68-74.` 1070 | 1071 | .. [MRS2008] `C.D. Manning, P. Raghavan and H. Schütze (2008). 1072 | Introduction to Information Retrieval. Cambridge University 1073 | Press, pp. 118-120.` 1074 | """ 1075 | 1076 | def __init__(self, norm='l2', use_idf=True, smooth_idf=True, 1077 | sublinear_tf=False): 1078 | self.norm = norm 1079 | self.use_idf = use_idf 1080 | self.smooth_idf = smooth_idf 1081 | self.sublinear_tf = sublinear_tf 1082 | 1083 | def fit(self, X, y, termTopic=None): 1084 | """Learn the idf vector (global term weights) 1085 | 1086 | Parameters 1087 | ---------- 1088 | X : sparse matrix, [n_samples, n_features] 1089 | a matrix of term/token counts 1090 | """ 1091 | # todo http://nlpr-web.ia.ac.cn/cip/proceedings/klchen.pdf 1092 | # compute the normalized var 1093 | if y is not None: 1094 | aX = X 1095 | m = len(np.unique(y)) 1096 | p = np.zeros((m, aX.shape[1])) 1097 | 1098 | for j in range(np.min(y), m + np.min(y)): 1099 | w = aX[y == j, :] 1100 | tij = np.sum(w, axis=0) 1101 | lj = np.sum(tij) 1102 | p[j - np.min(y), :] = tij * 1.0 / lj 1103 | 1104 | ave_p = np.sum(p, axis=0) * 1.0 / m 1105 | 1106 | new_var = np.sqrt(np.sqrt(np.sum((p - ave_p) ** 2, axis=0)) * 1.0 / np.sum(p, axis=0)) 1107 | 1108 | if not sp.issparse(X): 1109 | X = sp.csc_matrix(X) 1110 | 1111 | if self.use_idf: 1112 | n_samples, n_features = X.shape 1113 | 1114 | df = _document_frequency(X) 1115 | # the number of all words 1116 | whole_df = np.sum(df) 1117 | 1118 | # perform idf smoothing if required 1119 | df += int(self.smooth_idf) 1120 | n_samples += int(self.smooth_idf) 1121 | 1122 | idf = np.log(whole_df * 1.0 / df * 1.0) 1123 | 1124 | idf = idf * new_var 1125 | 1126 | self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, 1127 | n=n_features, format='csr') 1128 | 1129 | return self 1130 | 1131 | def transform(self, X, copy=True): 1132 | """Transform a count matrix to a tf or tf-idf representation 1133 | 1134 | Parameters 1135 | ---------- 1136 | X : sparse 
matrix, [n_samples, n_features] 1137 | a matrix of term/token counts 1138 | 1139 | copy : boolean, default True 1140 | Whether to copy X and operate on the copy or perform in-place 1141 | operations. 1142 | 1143 | Returns 1144 | ------- 1145 | vectors : sparse matrix, [n_samples, n_features] 1146 | """ 1147 | 1148 | if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float): 1149 | # preserve float family dtype 1150 | X = sp.csr_matrix(X, copy=copy) 1151 | else: 1152 | # convert counts or binary occurrences to floats 1153 | X = sp.csr_matrix(X, dtype=np.float64, copy=copy) 1154 | 1155 | n_samples, n_features = X.shape 1156 | 1157 | if self.sublinear_tf: 1158 | np.log(X.data, X.data) 1159 | X.data += 1 1160 | 1161 | if self.use_idf: 1162 | check_is_fitted(self, '_idf_diag', 'idf vector is not fitted') 1163 | 1164 | expected_n_features = self._idf_diag.shape[0] 1165 | if n_features != expected_n_features: 1166 | raise ValueError("Input has n_features=%d while the model" 1167 | " has been trained with n_features=%d" % ( 1168 | n_features, expected_n_features)) 1169 | # *= doesn't work 1170 | 1171 | X = np.sqrt(X) * self._idf_diag 1172 | 1173 | if self.norm: 1174 | X = normalize(X, norm=self.norm, copy=False) 1175 | 1176 | return X 1177 | 1178 | @property 1179 | def idf_(self): 1180 | if hasattr(self, "_idf_diag"): 1181 | return np.ravel(self._idf_diag.sum(axis=0)) 1182 | else: 1183 | return None 1184 | 1185 | class TfidfVectorizer(CountVectorizer): 1186 | """Convert a collection of raw documents to a matrix of TF-IDF features. 1187 | 1188 | Equivalent to CountVectorizer followed by TfidfTransformer. 1189 | 1190 | Read more in the :ref:`User Guide `. 1191 | 1192 | Parameters 1193 | ---------- 1194 | input : string {'filename', 'file', 'content'} 1195 | If 'filename', the sequence passed as an argument to fit is 1196 | expected to be a list of filenames that need reading to fetch 1197 | the raw content to analyze. 1198 | 1199 | If 'file', the sequence items must have a 'read' method (file-like 1200 | object) that is called to fetch the bytes in memory. 1201 | 1202 | Otherwise the input is expected to be the sequence strings or 1203 | bytes items are expected to be analyzed directly. 1204 | 1205 | encoding : string, 'utf-8' by default. 1206 | If bytes or files are given to analyze, this encoding is used to 1207 | decode. 1208 | 1209 | decode_error : {'strict', 'ignore', 'replace'} 1210 | Instruction on what to do if a byte sequence is given to analyze that 1211 | contains characters not of the given `encoding`. By default, it is 1212 | 'strict', meaning that a UnicodeDecodeError will be raised. Other 1213 | values are 'ignore' and 'replace'. 1214 | 1215 | strip_accents : {'ascii', 'unicode', None} 1216 | Remove accents during the preprocessing step. 1217 | 'ascii' is a fast method that only works on characters that have 1218 | an direct ASCII mapping. 1219 | 'unicode' is a slightly slower method that works on any characters. 1220 | None (default) does nothing. 1221 | 1222 | analyzer : string, {'word', 'char'} or callable 1223 | Whether the feature should be made of word or character n-grams. 1224 | 1225 | If a callable is passed it is used to extract the sequence of features 1226 | out of the raw, unprocessed input. 1227 | 1228 | preprocessor : callable or None (default) 1229 | Override the preprocessing (string transformation) stage while 1230 | preserving the tokenizing and n-grams generation steps. 
1231 | 1232 | tokenizer : callable or None (default) 1233 | Override the string tokenization step while preserving the 1234 | preprocessing and n-grams generation steps. 1235 | Only applies if ``analyzer == 'word'``. 1236 | 1237 | ngram_range : tuple (min_n, max_n) 1238 | The lower and upper boundary of the range of n-values for different 1239 | n-grams to be extracted. All values of n such that min_n <= n <= max_n 1240 | will be used. 1241 | 1242 | stop_words : string {'english'}, list, or None (default) 1243 | If a string, it is passed to _check_stop_list and the appropriate stop 1244 | list is returned. 'english' is currently the only supported string 1245 | value. 1246 | 1247 | If a list, that list is assumed to contain stop words, all of which 1248 | will be removed from the resulting tokens. 1249 | Only applies if ``analyzer == 'word'``. 1250 | 1251 | If None, no stop words will be used. max_df can be set to a value 1252 | in the range [0.7, 1.0) to automatically detect and filter stop 1253 | words based on intra corpus document frequency of terms. 1254 | 1255 | lowercase : boolean, default True 1256 | Convert all characters to lowercase before tokenizing. 1257 | 1258 | token_pattern : string 1259 | Regular expression denoting what constitutes a "token", only used 1260 | if ``analyzer == 'word'``. The default regexp selects tokens of 2 1261 | or more alphanumeric characters (punctuation is completely ignored 1262 | and always treated as a token separator). 1263 | 1264 | max_df : float in range [0.0, 1.0] or int, default=1.0 1265 | When building the vocabulary ignore terms that have a document 1266 | frequency strictly higher than the given threshold (corpus-specific 1267 | stop words). 1268 | If float, the parameter represents a proportion of documents, integer 1269 | absolute counts. 1270 | This parameter is ignored if vocabulary is not None. 1271 | 1272 | min_df : float in range [0.0, 1.0] or int, default=1 1273 | When building the vocabulary ignore terms that have a document 1274 | frequency strictly lower than the given threshold. This value is also 1275 | called cut-off in the literature. 1276 | If float, the parameter represents a proportion of documents, integer 1277 | absolute counts. 1278 | This parameter is ignored if vocabulary is not None. 1279 | 1280 | max_features : int or None, default=None 1281 | If not None, build a vocabulary that only consider the top 1282 | max_features ordered by term frequency across the corpus. 1283 | 1284 | This parameter is ignored if vocabulary is not None. 1285 | 1286 | vocabulary : Mapping or iterable, optional 1287 | Either a Mapping (e.g., a dict) where keys are terms and values are 1288 | indices in the feature matrix, or an iterable over terms. If not 1289 | given, a vocabulary is determined from the input documents. 1290 | 1291 | binary : boolean, default=False 1292 | If True, all non-zero term counts are set to 1. This does not mean 1293 | outputs will have only 0/1 values, only that the tf term in tf-idf 1294 | is binary. (Set idf and normalization to False to get 0/1 outputs.) 1295 | 1296 | dtype : type, optional 1297 | Type of the matrix returned by fit_transform() or transform(). 1298 | 1299 | norm : 'l1', 'l2' or None, optional 1300 | Norm used to normalize term vectors. None for no normalization. 1301 | 1302 | use_idf : boolean, default=True 1303 | Enable inverse-document-frequency reweighting. 
1304 | 1305 | smooth_idf : boolean, default=True 1306 | Smooth idf weights by adding one to document frequencies, as if an 1307 | extra document was seen containing every term in the collection 1308 | exactly once. Prevents zero divisions. 1309 | 1310 | sublinear_tf : boolean, default=False 1311 | Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf). 1312 | 1313 | Attributes 1314 | ---------- 1315 | vocabulary_ : dict 1316 | A mapping of terms to feature indices. 1317 | 1318 | idf_ : array, shape = [n_features], or None 1319 | The learned idf vector (global term weights) 1320 | when ``use_idf`` is set to True, None otherwise. 1321 | 1322 | stop_words_ : set 1323 | Terms that were ignored because they either: 1324 | 1325 | - occurred in too many documents (`max_df`) 1326 | - occurred in too few documents (`min_df`) 1327 | - were cut off by feature selection (`max_features`). 1328 | 1329 | This is only available if no vocabulary was given. 1330 | 1331 | See also 1332 | -------- 1333 | CountVectorizer 1334 | Tokenize the documents and count the occurrences of token and return 1335 | them as a sparse matrix 1336 | 1337 | TfidfTransformer 1338 | Apply Term Frequency Inverse Document Frequency normalization to a 1339 | sparse matrix of occurrence counts. 1340 | 1341 | Notes 1342 | ----- 1343 | The ``stop_words_`` attribute can get large and increase the model size 1344 | when pickling. This attribute is provided only for introspection and can 1345 | be safely removed using delattr or set to None before pickling. 1346 | """ 1347 | 1348 | def __init__(self, input='content', encoding='utf-8', 1349 | decode_error='strict', strip_accents=None, lowercase=True, 1350 | preprocessor=None, tokenizer=None, analyzer='word', 1351 | stop_words=None, token_pattern=r"(?u)\b\w\w+\b", 1352 | ngram_range=(1, 1), max_df=1.0, min_df=1, 1353 | max_features=None, vocabulary=None, binary=False, 1354 | dtype=np.int64, norm='l2', use_idf=True, smooth_idf=True, 1355 | sublinear_tf=False): 1356 | super(TfidfVectorizer, self).__init__( 1357 | input=input, encoding=encoding, decode_error=decode_error, 1358 | strip_accents=strip_accents, lowercase=lowercase, 1359 | preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer, 1360 | stop_words=stop_words, token_pattern=token_pattern, 1361 | ngram_range=ngram_range, max_df=max_df, min_df=min_df, 1362 | max_features=max_features, vocabulary=vocabulary, binary=binary, 1363 | dtype=dtype) 1364 | 1365 | self._tfidf = TfidfTransformer(norm=norm, use_idf=use_idf, 1366 | smooth_idf=smooth_idf, 1367 | sublinear_tf=sublinear_tf) 1368 | 1369 | # Broadcast the TF-IDF parameters to the underlying transformer instance 1370 | # for easy grid search and repr 1371 | 1372 | @property 1373 | def norm(self): 1374 | return self._tfidf.norm 1375 | 1376 | @norm.setter 1377 | def norm(self, value): 1378 | self._tfidf.norm = value 1379 | 1380 | @property 1381 | def use_idf(self): 1382 | return self._tfidf.use_idf 1383 | 1384 | @use_idf.setter 1385 | def use_idf(self, value): 1386 | self._tfidf.use_idf = value 1387 | 1388 | @property 1389 | def smooth_idf(self): 1390 | return self._tfidf.smooth_idf 1391 | 1392 | @smooth_idf.setter 1393 | def smooth_idf(self, value): 1394 | self._tfidf.smooth_idf = value 1395 | 1396 | @property 1397 | def sublinear_tf(self): 1398 | return self._tfidf.sublinear_tf 1399 | 1400 | @sublinear_tf.setter 1401 | def sublinear_tf(self, value): 1402 | self._tfidf.sublinear_tf = value 1403 | 1404 | @property 1405 | def idf_(self): 1406 | return self._tfidf.idf_ 1407 
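    # Usage sketch (hypothetical variable names). Unlike the stock scikit-learn
    # TfidfVectorizer, fit() here also takes the label vector y, which is passed
    # down to TfidfTransformer.fit to compute the supervised variance weight:
    #
    #   vec = TfidfVectorizer(use_idf=True, sublinear_tf=False, max_features=50000, binary=True)
    #   vec.fit(train_texts, train_labels)
    #   X_train = vec.transform(train_texts)
    #   X_test = vec.transform(test_texts)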
| 1408 | def fit(self, raw_documents, y=None): 1409 | """Learn vocabulary and idf from training set. 1410 | 1411 | Parameters 1412 | ---------- 1413 | raw_documents : iterable 1414 | an iterable which yields either str, unicode or file objects 1415 | 1416 | Returns 1417 | ------- 1418 | self : TfidfVectorizer 1419 | """ 1420 | X = super(TfidfVectorizer, self).fit_transform(raw_documents) 1421 | 1422 | # termTopic = super(TfidfVectorizer, self).get_term_topic(X) 1423 | 1424 | self._tfidf.fit(X, y, None) 1425 | 1426 | return self 1427 | 1428 | def fit_transform(self, raw_documents, y=None): 1429 | """Learn vocabulary and idf, return term-document matrix. 1430 | 1431 | This is equivalent to fit followed by transform, but more efficiently 1432 | implemented. 1433 | 1434 | Parameters 1435 | ---------- 1436 | raw_documents : iterable 1437 | an iterable which yields either str, unicode or file objects 1438 | 1439 | Returns 1440 | ------- 1441 | X : sparse matrix, [n_samples, n_features] 1442 | Tf-idf-weighted document-term matrix. 1443 | """ 1444 | X = super(TfidfVectorizer, self).fit_transform(raw_documents) 1445 | self._tfidf.fit(X, y, None) 1446 | # X is already a transformed view of raw_documents so 1447 | # we set copy to False 1448 | return self._tfidf.transform(X, copy=False) 1449 | 1450 | def transform(self, raw_documents, copy=True): 1451 | """Transform documents to document-term matrix. 1452 | 1453 | Uses the vocabulary and document frequencies (df) learned by fit (or 1454 | fit_transform). 1455 | 1456 | Parameters 1457 | ---------- 1458 | raw_documents : iterable 1459 | an iterable which yields either str, unicode or file objects 1460 | 1461 | copy : boolean, default True 1462 | Whether to copy X and operate on the copy or perform in-place 1463 | operations. 1464 | 1465 | Returns 1466 | ------- 1467 | X : sparse matrix, [n_samples, n_features] 1468 | Tf-idf-weighted document-term matrix. 
1469 | """ 1470 | check_is_fitted(self, '_tfidf', 'The tfidf vector is not fitted') 1471 | 1472 | X = super(TfidfVectorizer, self).transform(raw_documents) 1473 | 1474 | return self._tfidf.transform(X, copy=False) 1475 | -------------------------------------------------------------------------------- /project/3-main/classify.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import multiprocessing,Queue 3 | from sklearn.cross_validation import KFold, StratifiedKFold 4 | import xgboost as xgb 5 | from STFIWF import TfidfVectorizer 6 | import numpy as np 7 | from sklearn.linear_model import SGDClassifier, LogisticRegression,RidgeClassifier,PassiveAggressiveClassifier,Lasso,HuberRegressor 8 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB 9 | from sklearn.ensemble import VotingClassifier,RandomForestClassifier,gradient_boosting 10 | from sklearn.ensemble.bagging import BaggingClassifier 11 | from sklearn.ensemble.weight_boosting import AdaBoostClassifier 12 | from sklearn.svm import LinearSVC, SVC 13 | from sklearn.preprocessing import MinMaxScaler,StandardScaler,MaxAbsScaler 14 | class term(object): 15 | def __init__(self): 16 | random_rate = 8240 17 | clf1 = SGDClassifier( 18 | alpha=5e-05, 19 | average=False, 20 | class_weight='balanced', 21 | loss='log', 22 | n_iter=30, 23 | penalty='l2', n_jobs=-1, random_state=random_rate) 24 | clf2 = MultinomialNB(alpha=0.1) 25 | clf3 = LinearSVC(C=0.1, random_state=random_rate) 26 | clf4 = LogisticRegression(C=1.0,n_jobs=-1, max_iter=100, class_weight='balanced', random_state=random_rate) 27 | clf5 = BernoulliNB(alpha=0.1) 28 | clf6 = VotingClassifier(estimators=[('sgd', clf1), 29 | ('mb', clf2), 30 | ('bb', clf3), 31 | ('lf', clf4), 32 | ('bnb', clf5)], voting='hard') 33 | clf7 = SGDClassifier( 34 | alpha=5e-05, 35 | average=False, 36 | class_weight='balanced', 37 | loss='log', 38 | n_iter=30, 39 | penalty='l1', n_jobs=-1, random_state=random_rate) 40 | clf8 = LinearSVC(C=0.9, random_state=random_rate) 41 | clf9 = LogisticRegression(C=0.5, n_jobs=-1, max_iter=100, class_weight='balanced', random_state=random_rate) 42 | clf10 = MultinomialNB(alpha=0.9) 43 | clf11 = BernoulliNB(alpha=0.9) 44 | clf12 = LogisticRegression(C=0.2, n_jobs=-1, max_iter=100, class_weight='balanced', random_state=random_rate,penalty='l1') 45 | clf13 = LogisticRegression(C=0.8, n_jobs=-1, max_iter=100, class_weight='balanced', random_state=random_rate,penalty='l1') 46 | clf14 = RidgeClassifier(alpha=8) 47 | clf15 = PassiveAggressiveClassifier(C=0.01, loss='squared_hinge', n_iter=20, n_jobs=-1) 48 | clf16 = RidgeClassifier(alpha=2) 49 | clf17 = PassiveAggressiveClassifier(C=0.5, loss='squared_hinge', n_iter=30, n_jobs=-1) 50 | clf18 = LinearSVC(C=0.5, random_state=random_rate) 51 | clf19 = MultinomialNB(alpha=0.5) 52 | clf20 = BernoulliNB(alpha=0.5) 53 | clf21 = Lasso(alpha=0.1, max_iter=20, random_state=random_rate) 54 | clf22 = Lasso(alpha=0.9, max_iter=30, random_state=random_rate) 55 | clf23 = PassiveAggressiveClassifier(C=0.1, loss='hinge', n_iter=30, n_jobs=-1, random_state=random_rate) 56 | clf24 = PassiveAggressiveClassifier(C=0.9, loss='hinge', n_iter=30, n_jobs=-1, random_state=random_rate) 57 | clf25 = HuberRegressor(max_iter=30) 58 | 59 | basemodel = [ 60 | ['sgd', clf1], 61 | ['nb', clf2], 62 | ['lsvc1', clf3], 63 | ['LR1', clf4], 64 | ['bb',clf5], 65 | ['vote', clf6], 66 | ['sgdl1', clf7], 67 | ['lsvc2', clf8], 68 | ['LR2', clf9], 69 | ['nb2', clf10], 70 | ['bb2', clf11], 71 | ['LR3', clf12], 72 | 
['LR4', clf13], 73 | ['rc1', clf14], 74 | ['pac1', clf15], 75 | ['rc2', clf16], 76 | ['pac2', clf17], 77 | ['lsvc3', clf18], 78 | ['nb3', clf19], 79 | ['bb3', clf20], 80 | ['lr5', clf21], 81 | ['lr6', clf22], 82 | ['rc3', clf23], 83 | ['pac3', clf24], 84 | ['hub', clf25], 85 | ] 86 | ##################################### 87 | clf_svc = SVC(C=1,random_state=random_rate,cache_size=1000) 88 | 89 | self.base_models = basemodel 90 | self.LR=clf4 91 | self.svc = clf_svc 92 | 93 | def stacking(self,X,Y,T,wv_X,wv_T,kind): 94 | """ 95 | ensemble model:stacking 96 | 97 | """ 98 | print 'fitting..' 99 | models = self.base_models 100 | folds = list(KFold(len(Y), n_folds=5, random_state=0)) 101 | S_train = np.zeros((X.shape[0], len(models))) 102 | S_test = np.zeros((T.shape[0], len(models))) 103 | 104 | for i, bm in enumerate(models): 105 | clf = bm[1] 106 | 107 | S_test_i = np.zeros((T.shape[0], len(folds))) 108 | for j, (train_idx, test_idx) in enumerate(folds): 109 | X_train = X[train_idx] 110 | y_train = Y[train_idx] 111 | X_holdout = X[test_idx] 112 | 113 | clf.fit(X_train, y_train) 114 | y_pred = clf.predict(X_holdout)[:] 115 | S_train[test_idx, i] = y_pred 116 | S_test_i[:, j] = clf.predict(T)[:] 117 | 118 | S_test[:, i] = S_test_i.mean(1) 119 | 120 | print S_train.shape,S_test.shape 121 | 122 | S_train = np.concatenate((S_train,wv_X),axis=1) 123 | S_test = np.concatenate((S_test, wv_T), axis=1) 124 | 125 | print S_train.shape,S_test.shape 126 | 127 | print 'scalering..' 128 | min_max_scaler = StandardScaler() 129 | S_train = min_max_scaler.fit_transform(S_train) 130 | S_test = min_max_scaler.fit_transform(S_test) 131 | print 'scalering over!' 132 | self.svc.fit(S_train, Y) 133 | yp= self.svc.predict(S_test)[:] 134 | return yp 135 | 136 | def validation(self, X, Y, wv_X, kind): 137 | """ 138 | 2-fold validation 139 | :param X: train text 140 | :param Y: train label 141 | :param wv_X: train wv_vec 142 | :param kind: age/gender/education 143 | :return: mean score of 2-fold validation 144 | """ 145 | print '向量化中...' 146 | X=np.array(X) 147 | fold_n=2 148 | folds = list(StratifiedKFold(Y, n_folds=fold_n, shuffle=False,random_state=0)) 149 | score = np.zeros(fold_n) 150 | for j, (train_idx, test_idx) in enumerate(folds): 151 | print j+1,'-fold' 152 | 153 | X_train = X[train_idx] 154 | y_train = Y[train_idx] 155 | X_test = X[test_idx] 156 | y_test = Y[test_idx] 157 | 158 | wv_X_train =wv_X[train_idx] 159 | wv_X_test = wv_X[test_idx] 160 | 161 | vec = TfidfVectorizer(use_idf=True,sublinear_tf=False, max_features=50000, binary=True) 162 | vec.fit(X_train, y_train) 163 | X_train = vec.transform(X_train) 164 | X_test = vec.transform(X_test) 165 | 166 | print 'shape',X_train.shape 167 | 168 | ypre = self.stacking(X_train,y_train,X_test,wv_X_train,wv_X_test,kind) 169 | cur = sum(y_test == ypre) * 1.0 / len(ypre) 170 | score[j] = cur 171 | 172 | print score 173 | print score.mean(),kind 174 | return score.mean() 175 | 176 | def predict(self,X,Y,T,wv_X,wv_T,kind): 177 | """ 178 | train and predict 179 | :param X: train text 180 | :param Y: train label 181 | :param T: test text 182 | :param wv_X: train wv 183 | :param wv_T: test wv 184 | :param kind: age/gender/education 185 | :return: array like ,predict of "kind" 186 | """ 187 | print 'predicting..向量化中...' 
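        # Fit the supervised tf-idf (STFIWF) vectorizer on all labelled training
        # text using the labels Y, then project both the train and test text into
        # that space; with binary=True the tf part is 0/1 before the idf reweighting.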
188 | vec = TfidfVectorizer(use_idf=True, sublinear_tf=False, max_features=60000, binary=True) 189 | 190 | vec.fit(X, Y) 191 | X = vec.transform(X) 192 | T = vec.transform(T) 193 | 194 | print 'train size',X.shape,T.shape 195 | res = self.stacking(X, Y, T, wv_X, wv_T, kind) 196 | return res 197 | 198 | 199 | -------------------------------------------------------------------------------- /project/3-main/main.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import classify 3 | import preprocess 4 | import pandas as pd 5 | import numpy as np 6 | import csv 7 | import codecs 8 | import multiprocessing 9 | import time 10 | 11 | 12 | def input(trainname): 13 | """ 14 | load the text file 15 | :param trainname: path of the input file 16 | :return:list 17 | """ 18 | traindata = [] 19 | with open(trainname, 'rb') as f: 20 | reader = csv.reader(f) 21 | count = 0 22 | for line in reader: 23 | try: 24 | traindata.append(line[0]) 25 | count += 1 26 | except: 27 | print "error:", line, count 28 | traindata.append("1 ") 29 | return traindata 30 | def output(filename, ID, age, gender, education): 31 | """ 32 | generate the submit file 33 | :param filename: path of the submit file 34 | :param ID: user ID 35 | :param age:predicted age 36 | :param gender:predicted gender 37 | :param education:predicted education 38 | :return:submit file 39 | """ 40 | print ID.shape, age.shape, gender.shape, education.shape 41 | with codecs.open(filename, 'w', encoding='gbk') as f: 42 | count=0 43 | for i in range(len(ID)): 44 | # if count>=1000: 45 | # break 46 | f.write(str(ID[i]) + ' ' + str(age[i]) + ' ' + str(gender[i]) + ' ' + str(education[i]) + '\n') 47 | count+=1 48 | if __name__ == '__main__': 49 | """ 50 | the main function 51 | 注意路径 52 | """ 53 | start=time.time() 54 | # order='predict' #execute predict function 55 | order='test' #execute 2-fold validation function 56 | print 'orderis ', order 57 | print '----------start----------' 58 | 59 | #loading 60 | trainname = 'jieba_train_cut.csv' 61 | testname = 'jieba_test_cut.csv' 62 | traindata = input(trainname) 63 | testdata = input(testname) 64 | label_genderfile_path = 'train_gender.csv' 65 | label_agefile_path = 'train_age.csv' 66 | label_edufile_path = 'train_education.csv' 67 | genderdata = np.loadtxt(open(label_genderfile_path, 'r')).astype(int) 68 | agedata = np.loadtxt(open(label_agefile_path, 'r')).astype(int) 69 | educationdata = np.loadtxt(open(label_edufile_path, 'r')).astype(int) 70 | 71 | # --------------------------------- 72 | print '预处理中..' 
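    # removezero() keeps only the samples whose label is non-zero; in the Sogou
    # data a 0 means the age/gender/education tag is missing, so each of the
    # three tasks gets its own filtered training set and word2vec feature matrix.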
73 | preprocessob = preprocess.preprocess() 74 | 75 | #remove label missed samples 76 | gender_traindatas, genderlabel = preprocessob.removezero(traindata, genderdata) 77 | age_traindatas, agelabel = preprocessob.removezero(traindata, agedata) 78 | edu_traindatas, edulabel = preprocessob.removezero(traindata, educationdata) 79 | 80 | # 填写你的wv向量路径 81 | w2vtrain = np.load('wv300_win100.train.npy') 82 | w2vtest = np.load('wv300_win100.test.npy') 83 | 84 | wv_gender_traindatas, genderlabel = preprocessob.removezero(w2vtrain, genderdata) 85 | wv_age_traindatas, agelabel = preprocessob.removezero(w2vtrain, agedata) 86 | wv_edu_traindatas, edulabel = preprocessob.removezero(w2vtrain, educationdata) 87 | 88 | if order=='test': 89 | termob1 = classify.term() 90 | termob2 = classify.term() 91 | termob3 = classify.term() 92 | p1 = multiprocessing.Process(target=termob1.validation, 93 | args=(gender_traindatas, genderlabel, wv_gender_traindatas, 'gender',)) 94 | p2=multiprocessing.Process(target=termob2.validation,args=(age_traindatas, agelabel, wv_age_traindatas, 'age',)) 95 | p3=multiprocessing.Process(target=termob3.validation,args=(edu_traindatas, edulabel, wv_edu_traindatas, 'edu',)) 96 | 97 | p1.start() 98 | p2.start() 99 | p3.start() 100 | 101 | p1.join() 102 | p2.join() 103 | p3.join() 104 | elif order=='predict': 105 | termob = classify.term() 106 | gender=termob.predict(gender_traindatas, genderlabel, testdata, wv_gender_traindatas, w2vtest, 'gender') 107 | age=termob.predict(age_traindatas, agelabel, testdata, wv_age_traindatas, w2vtest, 'age') 108 | edu=termob.predict(edu_traindatas, edulabel, testdata, wv_edu_traindatas, w2vtest, 'edu') 109 | ID = pd.read_csv('user_tag_query.10W.TEST.csv').ID 110 | output('submit.csv', ID, age, gender, edu) 111 | 112 | end=time.time() 113 | print 'total time is', end-start 114 | -------------------------------------------------------------------------------- /project/3-main/preprocess.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import numpy as np 3 | class preprocess(): 4 | 5 | def removezero(self, x, y): 6 | nozero = np.nonzero(y) 7 | y = y[nozero] 8 | x = np.array(x) 9 | x = x[nozero] 10 | return x, y -------------------------------------------------------------------------------- /project/README.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderSkyChen/2016CCF_BDCI_Sougou/d49c94be714265adbad27ece0d84658264b1a3d4/project/README.txt -------------------------------------------------------------------------------- /答辩ppt -.ppt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderSkyChen/2016CCF_BDCI_Sougou/d49c94be714265adbad27ece0d84658264b1a3d4/答辩ppt -.ppt --------------------------------------------------------------------------------
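For reference, a minimal NumPy sketch (toy dense counts, hypothetical values, idf smoothing omitted) of the supervised class-variance term weight that STFIWF.TfidfTransformer.fit computes before scaling the idf:

# coding=utf-8
# Toy illustration of the class-variance term weight used in STFIWF.py
# (dense toy data; the real transformer works on scipy sparse matrices
# and applies optional idf smoothing).
import numpy as np

X = np.array([[2, 0, 1],          # term counts: 4 documents x 3 terms
              [1, 1, 0],
              [0, 3, 1],
              [0, 2, 2]])
y = np.array([1, 1, 2, 2])        # class label of each document (labels start at 1)

m = len(np.unique(y))
p = np.zeros((m, X.shape[1]))
for j in range(m):
    w = X[y == (j + y.min()), :]
    p[j, :] = np.sum(w, axis=0) * 1.0 / np.sum(w)   # class-conditional term distribution

ave_p = np.sum(p, axis=0) * 1.0 / m
# terms whose distribution differs strongly between classes get a larger weight
new_var = np.sqrt(np.sqrt(np.sum((p - ave_p) ** 2, axis=0)) * 1.0 / np.sum(p, axis=0))

df = (X != 0).sum(axis=0)         # document frequency of each term
whole_df = df.sum()               # total number of (document, term) pairs
idf = np.log(whole_df * 1.0 / df) * new_var
print(idf)

Terms whose class-conditional frequency barely varies across the age/gender/education classes get a weight near zero, so they contribute little to the rescaled idf, while class-discriminative terms are boosted.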