├── README.md
├── fastbin.py
└── vocab.txt

/README.md:
--------------------------------------------------------------------------------
# use-pretrained-word2vec
--------------------------------------------------------------------------------

/fastbin.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-
import numpy as np
import tensorflow as tf
'''This script only does minimal preprocessing of word2vec embeddings; to use
them in a more complex model, adapt the code to your actual setup.'''

class Wordlist(object):
    def __init__(self, filename, maxn=100000):
        with open(filename) as f:
            lines = [line.split() for line in f.readlines()[:maxn]]
        self.size = len(lines)
        # skip blank lines; map each word (the first token on its line) to its line index
        self.voc = dict((line[0], i) for i, line in enumerate(lines) if line)

    def getID(self, word):
        # index 0 doubles as the unknown-word index
        return self.voc.get(word, 0)

def get_W(word_vecs, k=300):
    """
    Build the word matrix: W[i] is the vector for the word indexed by i.
    Row 0 is left as all zeros for padding/unknown words.
    """
    vocab_size = len(word_vecs)
    word_idx_map = dict()
    W = np.zeros(shape=(vocab_size + 1, k), dtype='float32')
    for i, word in enumerate(word_vecs, start=1):
        W[i] = word_vecs[word]
        word_idx_map[word] = i
    return W, word_idx_map

def load_bin_vec(fname, vocab):
    """
    Loads 300-dimensional word vectors from the Google (Mikolov) word2vec
    binary file, keeping only the words that appear in `vocab`.
    """
    word_vecs = {}
    pury_word_vec = []
    with open(fname, "rb") as f:
        header = f.readline()
        print('header', header)
        vocab_size, layer1_size = map(int, header.split())
        print('vocab_size:', vocab_size, 'layer1_size:', layer1_size)
        binary_len = np.dtype('float32').itemsize * layer1_size
        for _ in range(vocab_size):
            # each word is stored as raw bytes terminated by a space
            word = []
            while True:
                ch = f.read(1)
                if ch == b' ':
                    word = b''.join(word).decode('utf-8', errors='replace')
                    break
                if ch != b'\n':
                    word.append(ch)
            if word in vocab:
                word_vecs[word] = np.frombuffer(f.read(binary_len), dtype='float32')
                pury_word_vec.append(word_vecs[word])
            else:
                f.read(binary_len)  # skip the vector of an out-of-vocab word
    # np.savetxt('googleembedding.txt', pury_word_vec)
    return word_vecs, pury_word_vec

def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    """
    For words that occur in at least min_df documents, create a separate
    word vector. 0.25 is chosen so the unknown vectors have (approximately)
    the same variance as the pretrained ones.
    """
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
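# A minimal sketch of how `word_idx_map` from get_W could be used to turn a
# tokenized document into the index sequence that tf.nn.embedding_lookup
# expects; `doc_to_ids` and `max_len` are hypothetical names, not part of the
# original pipeline. Unknown tokens fall through to the all-zero row 0 of W.
def doc_to_ids(tokens, word_idx_map, max_len=50):
    ids = [word_idx_map.get(tok, 0) for tok in tokens]
    return (ids + [0] * max_len)[:max_len]  # pad or truncate to max_len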
if __name__ == "__main__":
    w2v_file = "GoogleNews-vectors-negative300.bin"  # the Google News word2vec bin file
    print("loading data...")
    vocab = Wordlist('vocab.txt')  # the vocabulary your own dataset needs
    w2v, pury_word2vec = load_bin_vec(w2v_file, vocab.voc)
    # note: vocab.voc maps word -> index here, not word -> document frequency
    add_unknown_words(w2v, vocab.voc)
    W, word_idx_map = get_W(w2v)

    '''a simple embedding_lookup example'''
    Wa = tf.Variable(W)
    embedding_input = tf.nn.embedding_lookup(Wa, [0, 1, 2])  # replace [0, 1, 2] with real document indices

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        embedded = sess.run(embedding_input)
        # print(np.shape(embedded))
--------------------------------------------------------------------------------

/vocab.txt:
--------------------------------------------------------------------------------
unk

in
for
that
is
on
##
The
with
said
was
the
at
not
as
it
be
from
by
are
I
have
he
will
has
####
his
an
this
or
their
--------------------------------------------------------------------------------