├── README.md ├── build_lm.sh ├── build_word2vec.py ├── c2d.py ├── dist_lda.py ├── dist_lsi.py ├── document_to_corpus.py ├── env.sh ├── fetch_url.go ├── hello.go ├── images ├── Bernoulli_distribution_estimation_1.png ├── Bernoulli_distribution_estimation_2.png ├── Bernoulli_distribution_estimation_3.png ├── Bernoulli_distribution_estimation_4.png ├── DNC.png ├── GLU.png ├── MindMeld_MessagingInterfacesDemystified.pdf ├── SGNS_1.png ├── SGNS_2.png ├── additive_multiplicative_attention.png ├── aho-corasick.png ├── allocation-weighting.png ├── alpha_beta.png ├── attention_def1.png ├── attention_def2.png ├── attention_def3.png ├── backward-beta.jpeg ├── binomial_estimation_1.png ├── binomial_estimation_2.png ├── binomial_estimation_3.png ├── binomial_estimation_4.png ├── bm.jpg ├── brat_sejong.png ├── cmp_1.png ├── cmp_10.png ├── cmp_11.png ├── cmp_12.png ├── cmp_13.png ├── cmp_14.png ├── cmp_15.png ├── cmp_2.png ├── cmp_3.png ├── cmp_4.png ├── cmp_5.png ├── cmp_6.png ├── cmp_7.png ├── cmp_8.png ├── cmp_9.png ├── cnn_embedding.png ├── content-based-addressing.png ├── conv_1.jpeg ├── cross_entropy_loss.png ├── cross_entropy_loss_many_output.png ├── deptree.png ├── distribution_function.png ├── du_1.png ├── du_2.png ├── entropy_1.jpg ├── entropy_10.jpg ├── entropy_11.jpg ├── entropy_12.jpg ├── entropy_2.jpg ├── entropy_3.jpg ├── entropy_4.jpg ├── entropy_5.jpg ├── entropy_6.jpg ├── entropy_7.jpg ├── entropy_8.jpg ├── entropy_9.jpg ├── expectation.png ├── forward-alpha.jpeg ├── forward_backward_var.png ├── four_equation.png ├── hierarchical_attention.png ├── hmm_1.png ├── hmm_2.png ├── hmm_3.png ├── hmm_4.png ├── hmm_5.png ├── hmm_6.png ├── hmm_7.png ├── kmp.jpg ├── layer_norm_timesteps.png ├── me_1.png ├── me_2.png ├── ml_1.png ├── ml_2.png ├── ml_3.png ├── ml_4.png ├── mult_head_self_attention.png ├── multi_dimensional_self_attention.png ├── multi_headed_attention_1.png ├── multi_headed_attention_2.png ├── multinomial_estimation_1.png ├── multinomial_estimation_2.png ├── multinomial_estimation_3.png ├── ner_attention.jpg ├── ner_attention_math1.jpg ├── ner_attention_math2.jpg ├── ngram_cnn_highway_1.png ├── ngram_cnn_highway_2.png ├── nn_1.jpeg ├── nn_2.jpeg ├── nn_3.jpeg ├── nn_4.jpeg ├── nn_5.jpeg ├── ntm-addressing.png ├── ntm-content-addressing.png ├── ntm-interface-vector.png ├── ntm-interpolation.png ├── ntm-lstm.png ├── ntm-pseudocode.png ├── ntm-sharpen.png ├── ntm-shift.png ├── ntm-test.png ├── ntm-train.png ├── ntm.png ├── p-value.png ├── partition.png ├── pstree.png ├── re_attention_1.png ├── re_attention_2.png ├── read-vector.png ├── regularization.jpeg ├── retention-vector.png ├── scaled_dot_product_attention.png ├── sejong_entry.png ├── self-attention-map.png ├── self-attention.png ├── self_attention_with_fnn.png ├── seq2seq_attention_machanism.jpg ├── seq2seq_attention_machanism.png ├── seq2seq_autoencoder.jpeg ├── time_invariant_self_attention.png ├── time_invariant_self_attention_full.png ├── transformer_model.png ├── traversal_london.png ├── url_sejong.png ├── usage-vector.png ├── variance.png ├── vbox_port.png ├── viterbi.png ├── wor2vec_visualizer.png ├── word2vec_1.jpeg ├── word2vec_2.jpeg ├── word2vec_3.jpeg ├── word2vec_4.jpeg ├── word2vec_5.jpeg ├── workbench_fatal.png ├── write-operation.png └── write-weight-vector.png ├── keras_mlp.py ├── make_bdb.py ├── make_leveldb.py ├── make_lmdb.c ├── make_lmdb.py ├── multiplexing.go ├── ngram.cc ├── queue.go ├── search_bdb.py ├── search_leveldb.py ├── search_lmdb.c ├── search_lmdb.py ├── search_word2vec.py ├── similarity.py ├── 
stack.go ├── test_numpy.py ├── test_theano.py ├── transform.py └── wordcount_spark.py
/README.md:
--------------------------------------------------------------------------------
1 | ### WIKI
2 | - Natural Language Processing
3 | - Development
4 | - Algorithm
5 | - Machine Learning
6 | - details in [Wiki](https://github.com/dsindex/blog/wiki)
7 | 
8 | ### Sources
9 | - sources referenced by the wiki
10 | 
11 | ### Pretty Viewer
12 | - [dsindex.github.io](http://dsindex.github.io/)
13 | 
--------------------------------------------------------------------------------
/build_lm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | set -o nounset
4 | set -o errexit
5 | 
6 | VERBOSE_MODE=0
7 | 
8 | function error_handler()
9 | {
10 | local STATUS=${1:-1}
11 | [ ${VERBOSE_MODE} == 0 ] && exit ${STATUS}
12 | echo "Exits abnormally at line "`caller 0`
13 | exit ${STATUS}
14 | }
15 | trap "error_handler" ERR
16 | 
17 | PROGNAME=`basename ${BASH_SOURCE}`
18 | DRY_RUN_MODE=0
19 | 
20 | function print_usage_and_exit()
21 | {
22 | set +x
23 | local STATUS=$1
24 | echo "Usage: ${PROGNAME} [-v] [-v] [--dry-run] [-h] [--help]"
25 | echo ""
26 | echo " Options -"
27 | echo " -v enables verbose mode 1"
28 | echo " -v -v enables verbose mode 2"
29 | echo " --dry-run show what would have been dumped"
30 | echo " -h, --help shows this help message"
31 | exit ${STATUS:-0}
32 | }
33 | 
34 | function debug()
35 | {
36 | if [ "$VERBOSE_MODE" != 0 ]; then
37 | echo $@
38 | fi
39 | }
40 | 
41 | GETOPT=`getopt -o vh --long dry-run,help -n "${PROGNAME}" -- "$@"`
42 | if [ $? != 0 ] ; then print_usage_and_exit 1; fi
43 | 
44 | eval set -- "${GETOPT}"
45 | 
46 | while true
47 | do case "$1" in
48 | -v) let VERBOSE_MODE+=1; shift;;
49 | --dry-run) DRY_RUN_MODE=1; shift;;
50 | -h|--help) print_usage_and_exit 0;;
51 | --) shift; break;;
52 | *) echo "Internal error!"; exit 1;;
53 | esac
54 | done
55 | 
56 | if (( VERBOSE_MODE > 1 )); then
57 | set -x
58 | fi
59 | 
60 | 
61 | # end of template area.
62 | # -----------------------------------------------------------------------------
63 | if [ ${#} != 0 ]; then print_usage_and_exit 1; fi
64 | 
65 | # current dir of this script
66 | CDIR=$(readlink -f $(dirname $(readlink -f ${BASH_SOURCE[0]})))
67 | 
68 | [[ -f ${CDIR}/env.sh ]] && . ${CDIR}/env.sh || exit
69 | 
70 | # -----------------------------------------------------------------------------
71 | # functions
72 | 
73 | 
74 | 
75 | # end functions
76 | # -----------------------------------------------------------------------------
77 | 
78 | # -----------------------------------------------------------------------------
79 | # main
80 | 
81 | make_calmness
82 | child_verbose=""
83 | if (( VERBOSE_MODE > 1 )); then
84 | revert_calmness
85 | child_verbose="-v -v"
86 | fi
87 | 
88 | ${IRSTLM}/dict -InputFile=${DOC} -OutputFile=${DICT} -Freq=yes -sort=no
89 | ${IRSTLM}/split-dict.pl --input ${DICT} --output ${DICT}.
--parts ${SPLIT} 90 | for subdict in `ls ${DICT}.*` 91 | do 92 | filename=$(basename "$subdict") 93 | extension="${filename##*.}" 94 | ${IRSTLM}/ngt -InputFile=${DOC} -FilterDict=${filename} -NgramSize=${NGRAM_SIZE} -OutputFile=${NGRAM}.${extension} -OutputGoogleFormat=yes 95 | done 96 | 97 | for subngram in `ls ${NGRAM}.*` 98 | do 99 | filename=$(basename "$subngram") 100 | extension="${filename##*.}" 101 | ${IRSTLM}/build-sublm.pl --size ${NGRAM_SIZE} --ngrams ${subngram} --sublm ${LM}.${extension} 102 | done 103 | 104 | ${IRSTLM}/merge-sublm.pl --size ${NGRAM_SIZE} --sublm ${LM} -lm ${iARPA}.gz 105 | 106 | function optional { 107 | ${IRSTLM}/quantize-lm ${iARPA} ${qARPA} 108 | } 109 | 110 | gunzip ${iARPA}.gz 111 | ${IRSTLM}/compile-lm --text=yes ${iARPA} ${ARPA} 112 | 113 | ${KENLM}/build_binary -s -i -w mmap ${ARPA} ${ARPA}.mmap 114 | 115 | close_fd 116 | 117 | # end main 118 | # ----------------------------------------------------------------------------- 119 | -------------------------------------------------------------------------------- /build_word2vec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | ''' 5 | read http://radimrehurek.com/gensim/models/word2vec.html 6 | here is test code 7 | ''' 8 | 9 | import os 10 | import sys 11 | reload(sys) 12 | sys.setdefaultencoding('utf-8') 13 | import re 14 | from optparse import OptionParser 15 | import time 16 | from gensim.models import word2vec,phrases 17 | import logging 18 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 19 | 20 | def build_model(corpus_path, detect_phrase=False) : 21 | startTime = time.time() 22 | 23 | sentences = word2vec.LineSentence(corpus_path) 24 | if detect_phrase : 25 | bigram_transformer = phrases.Phrases(sentences) 26 | model = word2vec.Word2Vec(bigram_transformer[sentences], size=100, alpha=0.025, window=5, min_count=5, sample=1e-5, workers=4, sg=1) 27 | else : 28 | model = word2vec.Word2Vec(sentences, size=100, alpha=0.025, window=5, min_count=5, sample=1e-5, workers=4, sg=1) 29 | # no more training 30 | model.init_sims(replace=True) 31 | durationTime = time.time() - startTime 32 | sys.stderr.write("duration time = %f\n" % durationTime) 33 | return model 34 | 35 | def save_model(model, model_path) : 36 | model.save(model_path) 37 | 38 | ''' 39 | python2.7 build_word2vec.py -c corpus.txt -m corpus.txt.model 40 | ''' 41 | if __name__ == '__main__': 42 | 43 | parser = OptionParser() 44 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 45 | parser.add_option("-c", "--corpus", dest="corpus",help="corpus path", metavar="CORPUS") 46 | parser.add_option("-m", "--model", dest="model",help="model path, output file", metavar="MODEL") 47 | (options, args) = parser.parse_args() 48 | 49 | if options.verbose == 1 : VERBOSE = 1 50 | 51 | corpus_path = options.corpus 52 | if corpus_path == None : 53 | parser.print_help() 54 | sys.exit(1) 55 | 56 | model_path = options.model 57 | if model_path == None : 58 | parser.print_help() 59 | sys.exit(1) 60 | 61 | model = build_model(corpus_path) 62 | save_model(model, model_path) 63 | -------------------------------------------------------------------------------- /c2d.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | import os 5 | from optparse import OptionParser 6 | 7 | # global variable 8 | 
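# c2d.py : reads Sejong-treebank-style constituency parses from stdin
# (blank-line-separated blocks: a '; ...' sentence line followed by a
# parenthesized tree) and prints each one back as a constituent tree (mode 0),
# a dependency tree (mode 1), or an embedded phrase/clause structure (mode 2).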
VERBOSE = 0
9 | 
10 | import sys
11 | reload(sys)
12 | sys.setdefaultencoding('utf-8')
13 | 
14 | # -------------------------------------------------------------------------
15 | # build tree
16 | # -------------------------------------------------------------------------
17 | def next_paren(tokens, i) :
18 | '''
19 | starting from tokens[i], search for the position of the next '(' or ')'
20 | returns -1 if not found
21 | '''
22 | j = 0
23 | found = False
24 | for token in tokens[i:] :
25 | if token == '(' or token == ')' :
26 | found = True
27 | break
28 | j += 1
29 | if found : return i + j
30 | return -1
31 | 
32 | def node_string(node, enable_eoj=True) :
33 | if node['leaf'] :
34 | if enable_eoj :
35 | return '(' + node['label'] + ' ' + node['eoj'] + '/' + str(node['eoj_idx']) + ' ' + node['morphs'] + ')'
36 | else :
37 | return '(' + node['label'] + ' ' + node['morphs'] + ')'
38 | else :
39 | return '(' + node['label'] + ')'
40 | 
41 | def create_node(tokens, i, j) :
42 | '''
43 | the span i ~ j holds the label and morphs
44 | i + 1 = j : label
45 | ex) '( NP ('
46 | i j
47 | i + 1 < j : label,morphs
48 | ex) '( NP_MOD 프랑스/NNP+의/JKG )'
49 | i j
50 | '''
51 | node = {'lchild':{}, 'rchild':{}, 'parent':{}, 'sibling':{}}
52 | if i + 1 == j :
53 | node['label'] = tokens[i]
54 | node['leaf'] = False
55 | return node
56 | elif i + 1 < j :
57 | node['label'] = tokens[i]
58 | node['morphs'] = tokens[i+1]
59 | node['leaf'] = True
60 | node['nleaf'] = {}
61 | node['pleaf'] = {}
62 | return node
63 | else :
64 | return None
65 | 
66 | def make_edge(top, node) :
67 | if not top['lchild'] : # link to left child
68 | top['lchild'] = node
69 | node['parent'] = top
70 | if VERBOSE : print node_string(top) + '-[left]->' + node_string(node)
71 | elif not top['rchild'] : # link to right child
72 | top['rchild'] = node
73 | node['parent'] = top
74 | top['lchild']['sibling'] = node
75 | if VERBOSE : print node_string(top) + '-[right]->' + node_string(node)
76 | else :
77 | return False
78 | return True
79 | 
80 | def make_leaf_edge(node, history, depth=0) :
81 | '''
82 | connect next/prev links between the leaves of the tree,
83 | i.e., set node['nleaf'] and node['pleaf']
84 | '''
85 | if node['leaf'] :
86 | length = len(history)
87 | if length != 0 :
88 | prev = history[-1]
89 | prev['nleaf'] = node
90 | node['pleaf'] = prev
91 | history.append(node)
92 | 
93 | if node['lchild'] :
94 | make_leaf_edge(node['lchild'], history, depth+1)
95 | if node['rchild'] :
96 | make_leaf_edge(node['rchild'], history, depth+1)
97 | 
98 | def build_tree(sent, tokens) :
99 | '''
100 | sent = ; 프랑스의 세계적인 의상 디자이너 엠마누엘 웅가로가 실내 장식용 직물 디자이너로 나섰다.
101 | tokens = ( S ( NP_SBJ ( NP ( NP_MOD 프랑스/NNP+의/JKG ) \
102 | ( NP ( VNP_MOD 세계/NNG+적/XSN+이/VCP+ᆫ/ETM ) ( NP ( NP 의상/NNG ) ( NP 디자이너/NNG ) ) ) ) \
103 | ( NP_SBJ ( NP 엠마누엘/NNP ) ( NP_SBJ 웅가로/NNP+가/JKS ) ) ) \
104 | ( VP ( NP_AJT ( NP ( NP ( NP 실내/NNG ) ( NP 장식/NNG+용/XSN ) ) ( NP 직물/NNG ) ) \
105 | ( NP_AJT 디자이너/NNG+로/JKB ) ) ( VP 나서/VV+었/EP+다/EF+./SF ) ) )
106 | '''
107 | err = ' '.join(tokens)
108 | root = {'lchild':{}, 'rchild':{}, 'parent':{}, 'sibling':{}, 'leaf':False, 'label':'ROOT'}
109 | stack = []
110 | stack.append(root)
111 | max = len(tokens)
112 | i = 0
113 | eoj_idx = 1
114 | eoj_max = len(sent)
115 | while i < max :
116 | token = tokens[i]
117 | if token == '(' : # create node and push
118 | j = next_paren(tokens, i+1)
119 | if j == -1 or i+1 == j :
120 | sys.stderr.write("ill-formed parentheses[1] : %s\n" % (err))
121 | return None
122 | node = create_node(tokens, i+1, j)
123 | if not node : return None
124 | # assign eoj/eoj_idx to leaf node
125 | if node['leaf'] :
126 | if eoj_idx >= eoj_max :
127 | sys.stderr.write("not aligned sentence %s : %s\n" % (' '.join(sent), err))
128 | return None
129 | node['eoj'] = sent[eoj_idx]
130 | node['eoj_idx'] = eoj_idx
131 | eoj_idx += 1
132 | if VERBOSE : print node_string(node)
133 | # push to stack
134 | stack.append(node)
135 | if token == ')' :
136 | # pop and make edge
137 | if len(stack) == 0 :
138 | sys.stderr.write("ill-formed parentheses[2] : %s\n" % (err))
139 | return None
140 | node = stack.pop()
141 | if len(stack) == 0 :
142 | sys.stderr.write("ill-formed parentheses[3] : %s\n" % (err))
143 | return None
144 | top = stack[-1]
145 | if not make_edge(top, node) :
146 | sys.stderr.write("can't make edge : %s\n" % (err))
147 | return None
148 | i += 1
149 | 
150 | if len(stack) == 1 and stack[-1]['label'] == 'ROOT' :
151 | history = []
152 | make_leaf_edge(root['lchild'], history, depth=0)
153 | return root
154 | else :
155 | sys.stderr.write("build failure : %s\n" % (err))
156 | return None
157 | # -------------------------------------------------------------------------
158 | 
159 | # -------------------------------------------------------------------------
160 | # preprocessing
161 | # -------------------------------------------------------------------------
162 | def modify_illformed_1(tokens) :
163 | # ex) '( NP ( NP ( NP ( NP+포로/NNG )'
164 | # the token right after '(' should be a label; if it contains '+', split only at the first '+' (label before it, morphs after it)
165 | n_tokens = []
166 | max = len(tokens)
167 | i = 0
168 | while i < max :
169 | token = tokens[i]
170 | if token == '(' :
171 | n_tokens.append(token)
172 | if '+' in tokens[i+1] :
173 | t_list = tokens[i+1].split('+')
174 | n_tokens.append(t_list[0]) # label
175 | n_tokens.append(''.join(t_list[1:])) # morphs
176 | i += 1
177 | else :
178 | n_tokens.append(token)
179 | i += 1
180 | return n_tokens
181 | 
182 | def tokenize(bucket) :
183 | '''
184 | * convert the input into whitespace-separated tokens so it is easier to handle.
185 | e.g.) bucket
186 | ; 프랑스의 세계적인 의상 디자이너 엠마누엘 웅가로가 실내 장식용 직물 디자이너로 나섰다.
187 | (S (NP_SBJ (NP (NP_MOD 프랑스/NNP + 의/JKG)
188 | (NP (VNP_MOD 세계/NNG + 적/XSN + 이/VCP + ᆫ/ETM)
189 | (NP (NP 의상/NNG)
190 | (NP 디자이너/NNG))))
191 | (NP_SBJ (NP 엠마누엘/NNP)
192 | (NP_SBJ 웅가로/NNP + 가/JKS)))
193 | (VP (NP_AJT (NP (NP (NP 실내/NNG)
194 | (NP 장식/NNG + 용/XSN))
195 | (NP 직물/NNG))
196 | (NP_AJT 디자이너/NNG + 로/JKB))
197 | (VP 나서/VV + 었/EP + 다/EF + ./SF)))
198 | '''
199 | sent = bucket[0].split()
200 | if sent[0] != ';' : return None,None
201 | paren_parse = ' '.join([s.strip('\t').replace('\t',' ') for s in bucket[1:]])
202 | paren_parse = paren_parse.replace(' + ','+')
203 | paren_parse = paren_parse.replace('(/','^[/').replace(')/','^]/')
204 | paren_parse = paren_parse.replace('(',' ( ').replace(')',' ) ')
205 | paren_parse = paren_parse.replace('^[/','(/').replace('^]/',')/')
206 | paren_parse = paren_parse.replace('+ ','+')
207 | tokens = paren_parse.split()
208 | tokens = modify_illformed_1(tokens)
209 | 
210 | if VERBOSE : print ' '.join(tokens)
211 | return sent, tokens
212 | # -------------------------------------------------------------------------
213 | 
214 | # -------------------------------------------------------------------------
215 | # tree traversal
216 | # -------------------------------------------------------------------------
217 | def tree2tokens(node, tokens, depth=0) :
218 | '''
219 | the input was tokenized before being turned into a tree;
220 | here we go the other way and rebuild that tokenization from the tree.
221 | '''
222 | if node['leaf'] :
223 | tokens.append('(')
224 | tokens.append(node['label'])
225 | tokens.append(node['morphs'])
226 | tokens.append(')')
227 | else :
228 | tokens.append('(')
229 | tokens.append(node['label'])
230 | 
231 | if node['lchild'] :
232 | tree2tokens(node['lchild'], tokens, depth=depth+1)
233 | if not node['rchild'] :
234 | tokens.append(')') # closed
235 | if node['rchild'] :
236 | tree2tokens(node['rchild'], tokens, depth=depth+1)
237 | tokens.append(')') # closed
238 | 
239 | def modify_morphs(morphs) :
240 | try :
241 | t_morphs = morphs.replace('++/','+\t/') # + -> tab
242 | t_morphs = t_morphs.replace('+',' + ')
243 | t_morphs = t_morphs.replace('\t','+') # tab -> +
244 | except :
245 | return morphs
246 | return t_morphs
247 | 
248 | def tree2con(node, tokens, history, depth=0) :
249 | '''
250 | once the input has been turned into a tree, this prints the tree
251 | back out in the same form as the input (constituent / phrase structure).
252 | '''
253 | if depth == 0 : prev_node = None
254 | else : prev_node = history[-1]
255 | if prev_node and prev_node['leaf'] : # if the node printed just before was a leaf
256 | tokens.append('\n')
257 | for i in xrange(depth) :
258 | tokens.append('\t')
259 | 
260 | if node['leaf'] :
261 | tokens.append('(' + node['label'] + ' ' + modify_morphs(node['morphs']) + ')')
262 | else :
263 | tokens.append('(' + node['label'] + '\t')
264 | history.append(node)
265 | 
266 | if node['lchild'] :
267 | tree2con(node['lchild'], tokens, history, depth+1)
268 | if not node['rchild'] :
269 | tokens.append(')') # closed
270 | if node['rchild'] :
271 | tree2con(node['rchild'], tokens, history, depth+1)
272 | tokens.append(')') # closed
273 | 
274 | def is_vx(gov_node) :
275 | morphs = gov_node['morphs']
276 | tokens = morphs.split('+')
277 | if '/VX' in tokens[0] : return True
278 | # verbs that are not VX but behave like VX, ex) '지니게 되다'
279 | if '되/' in tokens[0] :
280 | pleaf = None
281 | if gov_node['pleaf'] : pleaf = gov_node['pleaf']
282 | if pleaf :
283 | morphs = pleaf['morphs']
284 | tokens = morphs.split('+')
285 | if '게/EC' in tokens[-1] : return True
286 | if '면/EC' in tokens[-1] : return True
287 | if '아도/EC' in tokens[-1] : return True
288 | if '않/' in tokens[0] :
289 | pleaf = None
290 | if gov_node['pleaf'] : pleaf = gov_node['pleaf']
291 | if pleaf :
292 | morphs = pleaf['morphs']
293 | tokens = morphs.split('+')
294 | if '지/EC' in tokens[-1] : return True
295 | return False
296 | 
297 | def is_vnp(morphs) :
298 | tokens = morphs.split('+')
299 | if len(tokens) <= 2 : return False
300 | if '/NNB' in tokens[0] and '/VCP' in tokens[1] : return True
301 | return False
302 | 
303 | def is_va(morphs) :
304 | tokens = morphs.split('+')
305 | # also covers cases mistagged as '/VV'
306 | if '있/VA' in tokens[0] or \
307 | '있/VV' in tokens[0] or \
308 | '없/VA' in tokens[0] or \
309 | '없/VV' in tokens[0] or \
310 | '같/VA' in tokens[0] : return True
311 | else : return False
312 | 
313 | def is_nnb(morphs) :
314 | tokens = morphs.split('+')
315 | if '/NNB' in tokens[0] : return True
316 | return False
317 | 
318 | def is_etm(morphs) :
319 | tokens = morphs.split('+')
320 | if 'ᆫ/ETM' in tokens[-1] : return True
321 | if '는/ETM' in tokens[-1] : return True
322 | if 'ᆯ/ETM' in tokens[-1] : return True
323 | if '을/ETM' in tokens[-1] : return True
324 | if '를/ETM' in tokens[-1] : return True
325 | return False
326 | 
327 | def check_vx_rule(gov_node) :
328 | if not gov_node['parent'] : return False
329 | if not gov_node['parent']['lchild'] : return False
330 | if not is_vx(gov_node) : return False
331 | return True
332 | 
333 | def check_vnp_rule(gov_node) :
334 | if not gov_node['parent'] : return False
335 | if not gov_node['parent']['lchild'] : return False
336 | # check for the 'VNP 것/NNB + 이/VCP + 다/EF' pattern
337 | if not is_vnp(gov_node['morphs']) : return False
338 | return True
339 | 
340 | def check_va_rule(gov_node) :
341 | if not gov_node['parent'] : return False
342 | if not gov_node['parent']['lchild'] : return False
343 | # check for the 'ㄹ NNB 있다/없다/같다' pattern
344 | # for 'NNB' it is enough that the eojeol starts with NNB, i.e., forms like '~ㄹ 수가 없다' are also allowed
345 | if is_va(gov_node['morphs']) :
346 | pleaf = None
347 | if gov_node['pleaf'] : pleaf = gov_node['pleaf']
348 | if pleaf and is_nnb(pleaf['morphs']) :
349 | ppleaf = None
350 | if pleaf['pleaf'] :
351 | ppleaf = pleaf['pleaf']
352 | if ppleaf and is_etm(ppleaf['morphs']) :
353 | return True
354 | return False
355 | 
356 | def find_for_vx_rule(node, gov_node) :
357 | found = None
358 | t_next = gov_node['parent']
359 | while t_next :
360 | # the new governor must not come before, or be the same as, the current node
361 | if t_next['leaf'] and ('VP' in t_next['label'] or 'VNP' in t_next['label']) and t_next['eoj_idx'] > node['eoj_idx'] :
362 | found = t_next
363 | break
364 | if t_next['lchild'] :
365 | if 'S' in t_next['lchild']['label'] or 'VP' in t_next['lchild']['label'] or 'VNP' in t_next['lchild']['label'] :
366 | t_next = t_next['lchild']
367 | continue
368 | if t_next['rchild'] :
369 | if 'VP' in t_next['rchild']['label'] or 'VNP' in t_next['rchild']['label'] :
370 | t_next = t_next['rchild']
371 | continue
372 | t_next = t_next['lchild']
373 | return found
374 | 
375 | def find_for_vnp_rule(node, gov_node) :
376 | found = None
377 | t_next = gov_node['parent']
378 | while t_next :
379 | # the new governor must not come before, or be the same as, the current node
380 | if t_next['leaf'] and ('VP' in t_next['label'] or 'VNP' in t_next['label']) and t_next['eoj_idx'] > node['eoj_idx'] :
381 | # and the new governor must not be too far from the old one
382 | if abs(gov_node['eoj_idx'] - t_next['eoj_idx']) <= 3 :
383 | found = t_next
384 | break
385 | if t_next['lchild'] :
386 | if 'S' in t_next['lchild']['label'] or 'VP' in t_next['lchild']['label'] or 'VNP' in t_next['lchild']['label'] :
387 | t_next = t_next['lchild']
388 | continue
389 | if t_next['rchild'] :
390 | if 'VP' in t_next['rchild']['label'] or 'VNP' in t_next['rchild']['label'] :
391 | t_next = t_next['rchild']
392 | continue
393 | t_next = t_next['lchild']
394 | return found
395 | 
396 | def find_for_va_rule(node, gov_node, search_mode=1) :
397 | found = None
398 | if search_mode == 2 : # when the search has to start from parent->parent
399 | t_next = gov_node['parent']
400 | if t_next and t_next['parent'] :
401 | t_next = t_next['parent']
402 | else : # the usual case
403 | t_next = gov_node['parent']
404 | while t_next :
405 | # the new governor must not come before, or be the same as, the current node
406 | if t_next['leaf'] and ('VP' in t_next['label'] or 'VNP' in t_next['label']) and t_next['eoj_idx'] > node['eoj_idx'] :
407 | # and the new governor must not be too far from the old one
408 | if abs(gov_node['eoj_idx'] - t_next['eoj_idx']) <= 3 :
409 | found = t_next
410 | break
411 | t_next = t_next['lchild']
412 | return found
413 | 
414 | def find_gov(node) :
415 | '''
416 | * node = leaf node
417 | 
418 | 1. head final rule
419 | - follow parent links from the current node;
420 | at the first node that has a right child,
421 | descend along its right children down to a leaf node
422 | 2. VX rule
423 | - if the governor is an auxiliary verb, replace it with the main verb.
424 | - verbs that are not auxiliaries but behave like them are handled similarly. ex) '지니게 되다'
425 | 3. VNP rule
426 | - if the governor has the form 'VNP 것/NNB + 이/VCP + 다/EF', replace it with the preceding verb.
427 | 4. VA rule
428 | - if the governor is '있/VA, 없/VA, 같/VA' and an 'ㄹ NNB' pattern precedes it, replace it with the preceding verb.
429 | the node['pleaf'] links are used for this.
430 | '''
431 | # find the first node that has a right child,
432 | # using the sibling links.
433 | next = node
434 | found = None
435 | while next :
436 | if next['sibling'] :
437 | found = next['sibling']['parent']
438 | break
439 | next = next['parent']
440 | 
441 | gov_node = None
442 | if found :
443 | # follow right children down to a leaf node
444 | next = found
445 | while next :
446 | if next['leaf'] :
447 | gov_node = next
448 | # -----------------------------------------------------------------
449 | # if gov_node satisfies the vx rule, follow parent->lchild.
450 | if check_vx_rule(gov_node) :
451 | new_gov_node = find_for_vx_rule(node, gov_node)
452 | if new_gov_node : gov_node = new_gov_node
453 | # if gov_node satisfies the vnp rule, follow parent->lchild.
454 | if check_vnp_rule(gov_node) :
455 | new_gov_node = find_for_vnp_rule(node, gov_node)
456 | if new_gov_node :
457 | gov_node = new_gov_node
458 | # if the new governor is '있다,없다,같다',
459 | # run it through check_va_rule once more.
460 | if check_va_rule(gov_node) :
461 | new_gov_node = find_for_va_rule(node, gov_node, search_mode=2)
462 | if new_gov_node : gov_node = new_gov_node
463 | # if gov_node satisfies the va rule, follow parent->lchild.
464 | if check_va_rule(gov_node) :
465 | new_gov_node = find_for_va_rule(node, gov_node, search_mode=1)
466 | if new_gov_node : gov_node = new_gov_node
467 | # -----------------------------------------------------------------
468 | break
469 | next = next['rchild']
470 | if gov_node :
471 | return gov_node['eoj_idx']
472 | return 0
473 | 
474 | 
475 | def tree2dep(node, depth=0) :
476 | '''
477 | extract the dependency structure from the tree.
478 | '''
479 | if node['leaf'] :
480 | eoj_idx = node['eoj_idx']
481 | eoj = node['eoj']
482 | morphs = modify_morphs(node['morphs'])
483 | label = node['label']
484 | gov = find_gov(node)
485 | out = [eoj_idx, eoj, morphs, label, gov]
486 | print '\t'.join([str(e) for e in out])
487 | if node['lchild'] :
488 | tree2dep(node['lchild'], depth+1)
489 | if node['rchild'] :
490 | tree2dep(node['rchild'], depth+1)
491 | 
492 | def find_ep(node) :
493 | '''
494 | follow parent links to the first node that is not VP_MOD, VNP_MOD, or S_MOD;
495 | that node's leftmost leaf = ep begin
496 | that node's rightmost leaf = ep end
497 | '''
498 | next = node
499 | found = None
500 | while next :
501 | if next['label'] not in ['VP_MOD','VNP_MOD','S_MOD'] :
502 | found = next
503 | break
504 | next = next['parent']
505 | 
506 | left_ep = None
507 | right_ep = None
508 | if found :
509 | # follow left children down to a leaf node
510 | next = found
511 | while next :
512 | if next['leaf'] :
513 | left_ep = next
514 | break
515 | next = next['lchild']
516 | # follow right children down to a leaf node
517 | next = found
518 | while next :
519 | if next['leaf'] :
520 | right_ep = next
521 | break
522 | next = next['rchild']
523 | if left_ep and right_ep :
524 | return left_ep['eoj_idx'], right_ep['eoj_idx']
525 | return 0,0
526 | 
527 | def is_ec(morphs) :
528 | tokens = morphs.split('+')
529 | if '/EC' in tokens[-1] : return True
530 | if '/SP' in tokens[-1] and len(tokens) >= 2 and '/EC' in tokens[-2] : return True
531 | return False
532 | 
533 | def find_sp(node) :
534 | '''
535 | follow parent links to the first node that is not VP, S, or VNP_CMP;
536 | however, the current node must be its parent's right child.
537 | for the node just before the stop,
538 | its leftmost leaf = sp begin
539 | '''
540 | next = node
541 | prev = None
542 | found = None
543 | while next :
544 | if next['label'] not in ['VP','S','VNP_CMP'] :
545 | found = prev
546 | break
547 | if next['sibling'] :
548 | found = next
549 | break
550 | prev = next
551 | next = next['parent']
552 | 
553 | left_sp = None
554 | if found :
555 | # follow left children down to a leaf node
556 | next = found
557 | while next :
558 | if next['leaf'] :
559 | left_sp = next
560 | break
561 | next = next['lchild']
562 | if left_sp :
563 | return left_sp['eoj_idx']
564 | return 0
565 | 
566 | def tree2embedded(node, depth=0) :
567 | '''
568 | extract the embedded phrase/clause structure from the tree.
569 | '''
570 | if node['leaf'] :
571 | eoj_idx = node['eoj_idx']
572 | eoj = node['eoj']
573 | morphs = modify_morphs(node['morphs'])
574 | label = node['label']
575 | gov = find_gov(node)
576 | ep_begin = 0
577 | ep_end = 0
578 | if label in ['VP_MOD','VNP_MOD'] :
579 | ep_begin,ep_end = find_ep(node)
580 | sp_begin = 0
581 | sp_end = 0
582 | if label in ['VP','VNP','VNP_CMP'] and is_ec(node['morphs']) :
583 | sp_begin = find_sp(node)
584 | if sp_begin != 0 :
585 | sp_end = eoj_idx
586 | if sp_begin == sp_end : # meaningless when they are the same
587 | sp_begin = 0
588 | sp_end = 0
589 | out = [eoj_idx, eoj, morphs, label, gov, ep_begin, ep_end, sp_begin, sp_end]
590 | print '\t'.join([str(e) for e in out])
591 | if node['lchild'] :
592 | tree2embedded(node['lchild'], depth+1)
593 | if node['rchild'] :
594 | tree2embedded(node['rchild'], depth+1)
595 | # -------------------------------------------------------------------------
596 | 
597 | def spill(bucket, mode) :
598 | 
599 | # --------------------------------------------------------------
600 | # ill-formed filtering and build tree
601 | sent, tokens = tokenize(bucket)
602 | if not sent : return False
603 | tree = build_tree(sent, tokens)
604 | if not tree : return False
605 | # begin with tree['lchild'] (excluding ROOT)
606 | t_tokens = []
607 | tree2tokens(tree['lchild'], t_tokens, depth=0)
608 | if tokens != t_tokens :
609 | sys.stderr.write("input parentheses != tree2tokens\n")
610 | sys.stderr.write("input = %s\n" % (' '.join(tokens)))
611 | sys.stderr.write("tree2tokens = %s\n" % (' '.join(t_tokens)))
612 | return False
613 | # --------------------------------------------------------------
614 | 
615 | if mode == 0 : # print constituent tree
616 | print ' '.join(sent)
617 | t_tokens = []
618 | history = []
619 | tree2con(tree['lchild'], t_tokens, history, depth=0)
620 | print ''.join(t_tokens).strip()
621 | if mode == 1 : # print dependency tree
622 | tree2dep(tree['lchild'], depth=0)
623 | if mode == 2 : # print embedded phrase/clause tagged tree
624 | tree2embedded(tree['lchild'], depth=0)
625 | 
626 | print '\n',
627 | return True
628 | 
629 | if __name__ == '__main__':
630 | 
631 | parser = OptionParser()
632 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode")
633 | parser.add_option("-m", "--mode", dest="mode", help="mode : 0(constituent), 1(dependency), 2(embedded phrase/clause)", metavar="mode")
634 | (options, args) = parser.parse_args()
635 | 
636 | if options.verbose : VERBOSE = 1
637 | 
638 | mode = options.mode
639 | if mode == None : mode = 0
640 | else : mode = int(mode)
641 | 
642 | bucket = []
643 | while 1:
644 | try:
645 | line = sys.stdin.readline()
646 | except KeyboardInterrupt:
647 | break
648 | if not line:
649 | break
650 | line = line.strip()
651 | 
652 | if not line and len(bucket) >= 1 :
653 | ret =
spill(bucket, mode) 654 | bucket = [] 655 | continue 656 | 657 | if line : bucket.append(line) 658 | 659 | if len(bucket) != 0 : 660 | ret = spill(bucket, mode) 661 | 662 | -------------------------------------------------------------------------------- /dist_lda.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | ''' 5 | read http://radimrehurek.com/gensim/dist_lda.html 6 | here is test code 7 | ''' 8 | 9 | import os 10 | import sys 11 | reload(sys) 12 | sys.setdefaultencoding('utf-8') 13 | import re 14 | from optparse import OptionParser 15 | from gensim import corpora, models, similarities, matutils 16 | import logging 17 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 18 | 19 | def construct_dictionary(documents_path, filter=None) : 20 | # collect statistics about all tokens 21 | dictionary = corpora.Dictionary(line.lower().split() for line in open(documents_path)) 22 | 23 | if filter : 24 | # remove stop words and words that appear only once 25 | stoplist = set('for a of the and to in'.split()) 26 | stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id] 27 | once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1] 28 | dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once 29 | dictionary.compactify() # remove gaps in id sequence after words that were removed 30 | 31 | return dictionary 32 | 33 | def save_dictionary(dictionary, dictionary_path) : 34 | dictionary.save(dictionary_path) 35 | 36 | def load_dictionary(dictionary_path) : 37 | dictionary = corpora.Dictionary().load(dictionary_path,mmap='r') 38 | return dictionary 39 | 40 | def save_corpus(corpus, corpus_path, format=None) : 41 | if format == 'svmlight' : # Joachim’s SVMlight format 42 | corpora.SvmLightCorpus.serialize(corpus_path, corpus) 43 | if format == 'lda-c' : # Blei’s LDA-C format 44 | corpora.BleiCorpus.serialize(corpus_path, corpus) 45 | if format == 'low' : # GibbsLDA++ format 46 | corpora.LowCorpus.serialize(corpus_path, corpus) 47 | if not format : # Matrix Market format 48 | corpora.MmCorpus.serialize(corpus_path, corpus) 49 | 50 | def load_corpus(corpus_path) : 51 | corpus = corpora.MmCorpus(corpus_path) 52 | return corpus 53 | 54 | def corpus_to_tfidf(corpus) : 55 | tfidf = models.TfidfModel(corpus, normalize=True) # step 1 -- initialize a model 56 | ''' 57 | corpus_tfidf = tfidf[corpus] 58 | for doc in corpus_tfidf: 59 | print doc 60 | ''' 61 | return tfidf 62 | 63 | def save_tfidf(tfidf, tfidf_path) : 64 | tfidf.save(tfidf_path) 65 | 66 | def load_tfidf(tfidf_path) : 67 | tfidf = models.TfidfModel.load(tfidf_path) 68 | return tfidf 69 | 70 | def corpus_to_lsi(corpus, tfidf, dictionary, topic_number) : 71 | corpus_tfidf = tfidf[corpus] 72 | lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topic_number) # initialize an LSI transformation 73 | ''' 74 | corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi 75 | lsi.print_topics(3) 76 | for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly 77 | print doc 78 | ''' 79 | return lsi 80 | 81 | def save_lsi(lsi, lsi_path) : 82 | lsi.save(lsi_path) 83 | 84 | def load_lsi(lsi_path) : 85 | lsi = models.LsiModel.load(lsi_path) 86 | return lsi 87 | 88 | def corpus_to_lda(corpus, dictionary, 
topic_number) : 89 | lda = models.LdaModel(corpus, id2word=dictionary, num_topics=topic_number) 90 | return lda 91 | 92 | def save_lda(lda, lda_path) : 93 | lda.save(lda_path) 94 | 95 | def load_lda(lda_path) : 96 | lda = models.LdaModel.load(lda_path) 97 | return lda 98 | 99 | def corpus_to_lsi_dist(corpus, dictionary, topic_number) : 100 | lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=topic_number, chunksize=10000, distributed=True) 101 | return lsi 102 | 103 | def corpus_to_lda_dist(corpus, dictionary, topic_number) : 104 | lda = models.LdaModel(corpus, id2word=dictionary, num_topics=topic_number, update_every=1, chunksize=10000, passes=1, distributed=True) 105 | return lda 106 | 107 | ''' 108 | python2.7 dist_lda.py --dictionary=document.txt.dict --corpus=document.txt.mm --lda=document.txt.lda 109 | ''' 110 | if __name__ == '__main__': 111 | 112 | parser = OptionParser() 113 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 114 | parser.add_option("-d", "--dictionary", dest="dictionary",help="dictionary", metavar="DICT") 115 | parser.add_option("-c", "--corpus", dest="corpus",help="corpus", metavar="CORPUS") 116 | parser.add_option("-a", "--lda", dest="lda",help="lda, output file", metavar="LDA") 117 | (options, args) = parser.parse_args() 118 | 119 | if options.verbose == 1 : VERBOSE = 1 120 | 121 | dictionary_path = options.dictionary 122 | if dictionary_path == None : 123 | parser.print_help() 124 | sys.exit(1) 125 | 126 | corpus_path = options.corpus 127 | if corpus_path == None : 128 | parser.print_help() 129 | sys.exit(1) 130 | 131 | lda_path = options.lda 132 | if lda_path == None : 133 | parser.print_help() 134 | sys.exit(1) 135 | 136 | dictionary = load_dictionary(dictionary_path) 137 | corpus = load_corpus(corpus_path) 138 | 139 | lda = corpus_to_lda_dist(corpus, dictionary, 200) 140 | save_lda(lda, lda_path) 141 | -------------------------------------------------------------------------------- /dist_lsi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | ''' 5 | read http://radimrehurek.com/gensim/dist_lsi.html 6 | here is test code 7 | ''' 8 | 9 | import os 10 | import sys 11 | reload(sys) 12 | sys.setdefaultencoding('utf-8') 13 | import re 14 | from optparse import OptionParser 15 | from gensim import corpora, models, similarities, matutils 16 | import logging 17 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 18 | 19 | def construct_dictionary(documents_path, filter=None) : 20 | # collect statistics about all tokens 21 | dictionary = corpora.Dictionary(line.lower().split() for line in open(documents_path)) 22 | 23 | if filter : 24 | # remove stop words and words that appear only once 25 | stoplist = set('for a of the and to in'.split()) 26 | stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id] 27 | once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1] 28 | dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once 29 | dictionary.compactify() # remove gaps in id sequence after words that were removed 30 | 31 | return dictionary 32 | 33 | def save_dictionary(dictionary, dictionary_path) : 34 | dictionary.save(dictionary_path) 35 | 36 | def load_dictionary(dictionary_path) : 37 | dictionary = corpora.Dictionary().load(dictionary_path,mmap='r') 38 | 
return dictionary 39 | 40 | def save_corpus(corpus, corpus_path, format=None) : 41 | if format == 'svmlight' : # Joachim’s SVMlight format 42 | corpora.SvmLightCorpus.serialize(corpus_path, corpus) 43 | if format == 'lda-c' : # Blei’s LDA-C format 44 | corpora.BleiCorpus.serialize(corpus_path, corpus) 45 | if format == 'low' : # GibbsLDA++ format 46 | corpora.LowCorpus.serialize(corpus_path, corpus) 47 | if not format : # Matrix Market format 48 | corpora.MmCorpus.serialize(corpus_path, corpus) 49 | 50 | def load_corpus(corpus_path) : 51 | corpus = corpora.MmCorpus(corpus_path) 52 | return corpus 53 | 54 | def corpus_to_tfidf(corpus) : 55 | tfidf = models.TfidfModel(corpus, normalize=True) # step 1 -- initialize a model 56 | ''' 57 | corpus_tfidf = tfidf[corpus] 58 | for doc in corpus_tfidf: 59 | print doc 60 | ''' 61 | return tfidf 62 | 63 | def save_tfidf(tfidf, tfidf_path) : 64 | tfidf.save(tfidf_path) 65 | 66 | def load_tfidf(tfidf_path) : 67 | tfidf = models.TfidfModel.load(tfidf_path) 68 | return tfidf 69 | 70 | def corpus_to_lsi(corpus, tfidf, dictionary, topic_number) : 71 | corpus_tfidf = tfidf[corpus] 72 | lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topic_number) # initialize an LSI transformation 73 | ''' 74 | corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi 75 | lsi.print_topics(3) 76 | for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly 77 | print doc 78 | ''' 79 | return lsi 80 | 81 | def save_lsi(lsi, lsi_path) : 82 | lsi.save(lsi_path) 83 | 84 | def load_lsi(lsi_path) : 85 | lsi = models.LsiModel.load(lsi_path) 86 | return lsi 87 | 88 | def corpus_to_lda(corpus, dictionary, topic_number) : 89 | lda = models.LdaModel(corpus, id2word=dictionary, num_topics=topic_number) 90 | return lda 91 | 92 | def save_lda(lda, lda_path) : 93 | lda.save(lda_path) 94 | 95 | def load_lda(lda_path) : 96 | lda = models.LdaModel.load(lda_path) 97 | return lda 98 | 99 | def corpus_to_lsi_dist(corpus, dictionary, topic_number) : 100 | lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=topic_number, chunksize=10000, distributed=True) 101 | return lsi 102 | 103 | ''' 104 | python2.7 dist_lsi.py --dictionary=document.txt.dict --corpus=document.txt.mm --lsi=document.txt.lsi 105 | ''' 106 | if __name__ == '__main__': 107 | 108 | parser = OptionParser() 109 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 110 | parser.add_option("-d", "--dictionary", dest="dictionary",help="dictionary", metavar="DICT") 111 | parser.add_option("-c", "--corpus", dest="corpus",help="corpus", metavar="CORPUS") 112 | parser.add_option("-l", "--lsi", dest="lsi",help="lsi, output file", metavar="LSI") 113 | (options, args) = parser.parse_args() 114 | 115 | if options.verbose == 1 : VERBOSE = 1 116 | 117 | dictionary_path = options.dictionary 118 | if dictionary_path == None : 119 | parser.print_help() 120 | sys.exit(1) 121 | 122 | corpus_path = options.corpus 123 | if corpus_path == None : 124 | parser.print_help() 125 | sys.exit(1) 126 | 127 | lsi_path = options.lsi 128 | if lsi_path == None : 129 | parser.print_help() 130 | sys.exit(1) 131 | 132 | dictionary = load_dictionary(dictionary_path) 133 | corpus = load_corpus(corpus_path) 134 | 135 | lsi = corpus_to_lsi_dist(corpus, dictionary, 200) 136 | save_lsi(lsi, lsi_path) 137 | -------------------------------------------------------------------------------- 
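Note: the gensim scripts above import the similarities module but never use it. A minimal query sketch follows — the file name and paths are illustrative, assuming the outputs of document_to_corpus.py and dist_lsi.py:

# query_lsi.py: rank documents against a query in LSI space (sketch)
from gensim import corpora, models, similarities

dictionary = corpora.Dictionary().load('document.txt.dict')
corpus = corpora.MmCorpus('document.txt.mm')
lsi = models.LsiModel.load('document.txt.lsi')

index = similarities.MatrixSimilarity(lsi[corpus]) # index every document in LSI space
query_bow = dictionary.doc2bow('some query text'.lower().split())
sims = index[lsi[query_bow]] # cosine similarity against every document
print sorted(enumerate(sims), key=lambda x: -x[1])[:10] # top 10 (doc_id, score) pairs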
/document_to_corpus.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding: utf8 -*-
3 | 
4 | '''
5 | read http://radimrehurek.com/gensim/tut1.html
6 | here is test code
7 | '''
8 | 
9 | import os
10 | import sys
11 | reload(sys)
12 | sys.setdefaultencoding('utf-8')
13 | import re
14 | from optparse import OptionParser
15 | from gensim import corpora, models, similarities, matutils
16 | import logging
17 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
18 | import numpy
19 | import scipy
20 | 
21 | def construct_dictionary(documents_path, filter=None) :
22 | # collect statistics about all tokens
23 | dictionary = corpora.Dictionary(line.lower().split() for line in open(documents_path))
24 | 
25 | if filter :
26 | # remove stop words and words that appear only once
27 | stoplist = set('for a of the and to in'.split())
28 | stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
29 | once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1]
30 | dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once
31 | dictionary.compactify() # remove gaps in id sequence after words that were removed
32 | 
33 | return dictionary
34 | 
35 | def save_dictionary(dictionary, dictionary_path) :
36 | dictionary.save(dictionary_path)
37 | 
38 | def load_dictionary(dictionary_path) :
39 | dictionary = corpora.Dictionary().load(dictionary_path,mmap='r')
40 | return dictionary
41 | 
42 | def save_corpus(corpus, corpus_path, format=None) :
43 | if format == 'svmlight' : # Joachim’s SVMlight format
44 | corpora.SvmLightCorpus.serialize(corpus_path, corpus)
45 | if format == 'lda-c' : # Blei’s LDA-C format
46 | corpora.BleiCorpus.serialize(corpus_path, corpus)
47 | if format == 'low' : # GibbsLDA++ format
48 | corpora.LowCorpus.serialize(corpus_path, corpus)
49 | if not format : # Matrix Market format
50 | corpora.MmCorpus.serialize(corpus_path, corpus)
51 | 
52 | def load_corpus(corpus_path) :
53 | corpus = corpora.MmCorpus(corpus_path)
54 | return corpus
55 | 
56 | def corpus_to_dense(corpus, dictionary) :
57 | num_terms = len(dictionary.token2id)
58 | numpy_matrix = matutils.corpus2dense(corpus, num_terms)
59 | return numpy_matrix
60 | 
61 | def dense_to_corpus(numpy_matrix) :
62 | corpus = matutils.Dense2Corpus(numpy_matrix)
63 | return corpus
64 | 
65 | def corpus_to_sparse(corpus) :
66 | scipy_csc_matrix = matutils.corpus2csc(corpus)
67 | return scipy_csc_matrix
68 | 
69 | def sparse_to_corpus(scipy_csc_matrix) :
70 | corpus = matutils.Sparse2Corpus(scipy_csc_matrix)
71 | return corpus
72 | 
73 | '''
74 | python2.7 document_to_corpus.py -d documents.txt < documents.txt
75 | '''
76 | if __name__ == '__main__':
77 | 
78 | parser = OptionParser()
79 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode")
80 | parser.add_option("-d", "--documents", dest="documents",help="documents", metavar="DOCS")
81 | (options, args) = parser.parse_args()
82 | 
83 | if options.verbose == 1 : VERBOSE = 1
84 | 
85 | documents_path = options.documents
86 | if documents_path == None :
87 | parser.print_help()
88 | sys.exit(1)
89 | 
90 | dictionary = construct_dictionary(documents_path)
91 | 
92 | corpus = []
93 | linecount = 0
94 | while 1 :
95 | try : line = sys.stdin.readline()
96 | except KeyboardInterrupt : break
97 | if not line : break
98 | try : line = line.strip()
99 | 
except : continue
100 | if not line : continue
101 | linecount += 1
102 | if linecount % 1000 == 0 :
103 | sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n")
104 | 
105 | vector = dictionary.doc2bow(line.lower().split())
106 | '''
107 | for id,tf in vector :
108 | print dictionary.get(id) + "\t" + str(tf)
109 | '''
110 | corpus.append(vector)
111 | 
112 | dictionary_path = documents_path + '.dict'
113 | save_dictionary(dictionary, dictionary_path)
114 | corpus_path = documents_path + '.mm'
115 | save_corpus(corpus, corpus_path)
116 | 
--------------------------------------------------------------------------------
/env.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | set -o errexit
4 | 
5 | export LC_ALL=ko_KR.UTF-8
6 | export LANG=ko_KR.UTF-8
7 | 
8 | # directory
9 | ## current dir of this script
10 | CDIR=$(readlink -f $(dirname $(readlink -f ${BASH_SOURCE[0]})))
11 | PDIR=$(readlink -f $(dirname $(readlink -f ${BASH_SOURCE[0]}))/..)
12 | 
13 | IRSTLM=/usr/local/irstlm/bin
14 | DOC=../doc.txt
15 | DICT=dict
16 | NGRAM=ngram
17 | LM=lm
18 | iARPA=iarpa_lm
19 | qARPA=qarpa_lm
20 | ARPA=arpa_lm
21 | SPLIT=8
22 | NGRAM_SIZE=2
23 | KENLM=../package/kenlm/bin
24 | 
25 | # command setting
26 | python='/usr/local/bin/python2.7'
27 | pig='pig'
28 | hls='hadoop fs -ls'
29 | hget='hadoop fs -get'
30 | hmkdir='hadoop fs -mkdir'
31 | hrm='hadoop fs -rm -skipTrash'
32 | hrmr='hadoop fs -rm -r -skipTrash'
33 | hmv='hadoop fs -mv'
34 | hcp='hadoop fs -cp'
35 | hcat='hadoop fs -cat'
36 | hput='hadoop fs -copyFromLocal'
37 | htest='hadoop fs -test -e'
38 | htestd='hadoop fs -test -d'
39 | hmerge='hadoop fs -getmerge'
40 | hdu='hadoop fs -du'
41 | 
42 | # functions
43 | 
44 | function make_calmness()
45 | {
46 | exec 3>&2 # save 2 to 3
47 | exec 2> /dev/null
48 | }
49 | 
50 | function revert_calmness()
51 | {
52 | exec 2>&3 # restore 2 from previous saved 3(originally 2)
53 | }
54 | 
55 | function close_fd()
56 | {
57 | exec 3>&-
58 | }
59 | 
60 | function jumpto
61 | {
62 | label=$1
63 | cmd=$(sed -n "/$label:/{:a;n;p;ba};" $0 | grep -v ':$')
64 | eval "$cmd"
65 | exit
66 | }
67 | 
--------------------------------------------------------------------------------
/fetch_url.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | // NOTE: the Query type, the API constant, and the utils package (utils.HttpGet)
4 | // are assumed to be defined elsewhere in this project.
5 | import (
6 | "net/url"
7 | "encoding/json"
8 | "log"
9 | )
10 | 
11 | func fetch(q *Query) bool {
12 | ok, res := utils.HttpGet(API + url.QueryEscape(q.query))
13 | if !ok {
14 | return false
15 | }
16 | var f map[string]interface{}
17 | if err := json.Unmarshal([]byte(res), &f); err != nil {
18 | return false
19 | }
20 | m := f["output"].(map[string]interface{})
21 | if val := m["val"]; val != "" {
22 | q.result = val.(string)
23 | log.Printf("[FETCH] %s", q.result)
24 | return true
25 | }
26 | return false
27 | }
28 | 
--------------------------------------------------------------------------------
/hello.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | import (
4 | "fmt"
5 | "math"
6 | "math/cmplx"
7 | "math/rand"
8 | "runtime"
9 | "time"
10 | )
11 | 
12 | func add(x int, y int) int {
13 | return x + y
14 | }
15 | 
16 | func swap(x, y string) (string, string) {
17 | return y, x
18 | }
19 | 
20 | func split(sum int) (x, y int) {
21 | x = sum * 4 / 9
22 | y = sum - x
23 | return
24 | }
25 | 
26 | func variable_test() {
27 | fmt.Println("Welcome to the playground!")
28 | fmt.Println("The time is", time.Now())
29 | fmt.Println("My favorite
number is", rand.Intn(10))
30 | fmt.Printf("Now you have %g problems.\n", math.Nextafter(2, 4)) // Printf, not Println: Println does not interpret format verbs
31 | fmt.Println(math.Pi)
32 | fmt.Println(add(42, 13))
33 | a, b := swap("hello", "world")
34 | fmt.Println(a, b)
35 | fmt.Println(split(17))
36 | 
37 | //var c, python, java bool
38 | var i, j int = 1, 2
39 | var c, python, java = true, false, "no!"
40 | k := 3
41 | fmt.Println(i, j, k, c, python, java)
42 | 
43 | var (
44 | ToBe bool = false
45 | MaxInt uint64 = 1<<64 - 1
46 | z complex128 = cmplx.Sqrt(-5 + 12i)
47 | )
48 | const f = "%T(%v)\n"
49 | fmt.Printf(f, ToBe, ToBe)
50 | fmt.Printf(f, MaxInt, MaxInt)
51 | fmt.Printf(f, z, z)
52 | 
53 | var m int
54 | var n float64
55 | var e bool
56 | var s string
57 | fmt.Printf("%v %v %v %q\n", m, n, e, s)
58 | }
59 | 
60 | func needInt(x int) int {
61 | return x*10 + 1
62 | }
63 | 
64 | func needFloat(x float64) float64 {
65 | return x * 0.1
66 | }
67 | 
68 | func const_test() {
69 | var x, y int = 3, 4
70 | var f float64 = math.Sqrt(float64(x*x + y*y))
71 | var z int = int(f)
72 | fmt.Printf("%d %d %f %d\n", x, y, f, z)
73 | fmt.Printf("f is of type %T\n", f)
74 | 
75 | const Pi = 3.14
76 | const World = "世界"
77 | fmt.Println("Hello", World)
78 | fmt.Println("Happy", Pi, "Day")
79 | const Truth = true
80 | fmt.Println("Go rules?", Truth)
81 | const (
82 | Big = 1 << 100
83 | Small = Big >> 99
84 | )
85 | fmt.Println(needInt(Small))
86 | fmt.Println(needFloat(Small))
87 | fmt.Println(needFloat(Big))
88 | }
89 | 
90 | func for_test() {
91 | sum := 0
92 | for i := 0; i < 10; i++ {
93 | sum += i
94 | }
95 | fmt.Println(sum)
96 | }
97 | 
98 | func pow(x, n, lim float64) float64 {
99 | if v := math.Pow(x, n); v < lim {
100 | return v
101 | } else {
102 | fmt.Printf("%g >= %g\n", v, lim)
103 | }
104 | // can't use v here, though
105 | return lim
106 | }
107 | 
108 | func if_test() {
109 | fmt.Println(
110 | pow(3, 2, 10),
111 | pow(3, 3, 20),
112 | )
113 | }
114 | 
115 | func switch_test() {
116 | fmt.Print("Go runs on ")
117 | switch os := runtime.GOOS; os {
118 | case "darwin":
119 | fmt.Println("OS X.")
120 | case "linux":
121 | fmt.Println("Linux.")
122 | default:
123 | // freebsd, openbsd,
124 | // plan9, windows...
125 | fmt.Printf("%s.", os)
126 | }
127 | fmt.Println("When's Saturday?")
128 | today := time.Now().Weekday()
129 | fmt.Println("today is", today)
130 | fmt.Println("today + 2 is", today+2)
131 | if today == time.Saturday-2 {
132 | fmt.Println("Saturday - 2 is today")
133 | }
134 | switch time.Saturday {
135 | case today + 0:
136 | fmt.Println("Today.")
137 | case today + 1:
138 | fmt.Println("Tomorrow.")
139 | case today + 2:
140 | fmt.Println("In two days.")
141 | default:
142 | fmt.Println("Too far away.")
143 | }
144 | t := time.Now()
145 | switch {
146 | case t.Hour() < 12:
147 | fmt.Println("Good morning!")
148 | case t.Hour() < 17:
149 | fmt.Println("Good afternoon.")
150 | default:
151 | fmt.Println("Good evening.")
152 | }
153 | }
154 | 
155 | func defer_test_1() {
156 | defer fmt.Println("world!") // 2
157 | fmt.Println("hello") // 1
158 | }
159 | 
160 | func defer_test_2() {
161 | defer_test_1()
162 | 
163 | fmt.Println("counting")
164 | 
165 | for i := 0; i < 10; i++ {
166 | defer fmt.Println(i) // reverse order
167 | }
168 | 
169 | fmt.Println("done")
170 | }
171 | 
172 | func pointer_test() {
173 | 
174 | i, j := 42, 2701
175 | 
176 | p := &i // point to i
177 | fmt.Println(*p) // read i through the pointer
178 | *p = 21 // set i through the pointer
179 | fmt.Println(i) // see the new value of i
180 | 
181 | p = &j // point to j
182 | *p = *p / 37 // divide j through the pointer
183 | fmt.Println(j) // see the new value of j
184 | }
185 | 
186 | type Vertex struct {
187 | X int
188 | Y int
189 | }
190 | 
191 | func struct_test() {
192 | //var v Vertex = Vertex{1, 2}
193 | v := Vertex{1, 2}
194 | p := &v
195 | fmt.Println(p)
196 | p.X = 1e9 // 1000000000
197 | fmt.Printf("%d %d\n", p.X, p.Y)
198 | fmt.Printf("%d %d\n", v.X, v.Y)
199 | 
200 | var (
201 | v1 = Vertex{1, 2} // has type Vertex
202 | v2 = Vertex{X: 1} // Y:0 is implicit
203 | v3 = Vertex{} // X:0 and Y:0
204 | q = &Vertex{1, 2} // has type *Vertex
205 | )
206 | fmt.Println(v1, q, v2, v3)
207 | }
208 | 
209 | func array_test() {
210 | var a [2]string // [n]T, static size of array
211 | a[0] = "Hello"
212 | a[1] = "World"
213 | fmt.Println(a[0], a[1])
214 | fmt.Println(a)
215 | 
216 | var v = [3]int{1, 2, 3}
217 | fmt.Println(v)
218 | }
219 | 
220 | func slice_test_1() {
221 | s := []int{2, 3, 5, 7, 11, 13} // initialize slice
222 | fmt.Println("s ==", s)
223 | 
224 | for i := 0; i < len(s); i++ {
225 | fmt.Printf("s[%d] == %d\n", i, s[i])
226 | }
227 | 
228 | w := []int{2, 3, 5, 7, 11, 13}
229 | fmt.Println("w ==", w)
230 | fmt.Println("w[1:4] ==", w[1:4]) // index 1 ~ index 4-1
231 | 
232 | // missing low index implies 0
233 | fmt.Println("w[:3] ==", w[:3]) // index 0 ~ index 3-1
234 | 
235 | // missing high index implies len(s)
236 | fmt.Println("w[4:] ==", w[4:]) // index 4 ~ end
237 | }
238 | 
239 | func printSlice(s string, x []int) {
240 | fmt.Printf("%s len=%d cap=%d %v\n",
241 | s, len(x), cap(x), x)
242 | }
243 | 
244 | func slice_test_2() {
245 | // slice is reference to an array
246 | // for example
247 | // var v = [3]int{1,2,3} is an array, its type is [3]int
248 | // var v = []int{1,2,3} is a slice
249 | a := make([]int, 5) // slice refers to int array, size 5, cap 5(==size), zero initialized
250 | printSlice("a", a)
251 | b := make([]int, 0, 5) // slice refers to int array, size 0, cap 5
252 | printSlice("b", b)
253 | 
254 | c := b[:2] // size = 2, cap 5 is copied <---- XXX why?
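// (answer: slicing never copies the elements -- c shares b's backing array, and
// capacity is counted from the slice's first element to the end of that array,
// so cap(c) is still 5; d := c[2:5] below starts 2 elements in, hence cap(d) == 3)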
255 | printSlice("c", c)
256 | d := c[2:5] // size = 3, cap 3
257 | printSlice("d", d)
258 | 
259 | var z []int
260 | fmt.Println(z, len(z), cap(z))
261 | if z == nil { // nil slice
262 | fmt.Println("nil!")
263 | }
264 | }
265 | 
266 | 
--------------------------------------------------------------------------------
/images/Bernoulli_distribution_estimation_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/Bernoulli_distribution_estimation_1.png
--------------------------------------------------------------------------------
/images/Bernoulli_distribution_estimation_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/Bernoulli_distribution_estimation_2.png
--------------------------------------------------------------------------------
/images/Bernoulli_distribution_estimation_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/Bernoulli_distribution_estimation_3.png
--------------------------------------------------------------------------------
/images/Bernoulli_distribution_estimation_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/Bernoulli_distribution_estimation_4.png
--------------------------------------------------------------------------------
/images/DNC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/DNC.png
--------------------------------------------------------------------------------
/images/GLU.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/GLU.png
--------------------------------------------------------------------------------
/images/MindMeld_MessagingInterfacesDemystified.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/MindMeld_MessagingInterfacesDemystified.pdf
--------------------------------------------------------------------------------
/images/SGNS_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/SGNS_1.png
--------------------------------------------------------------------------------
/images/SGNS_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/SGNS_2.png
--------------------------------------------------------------------------------
/images/additive_multiplicative_attention.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/additive_multiplicative_attention.png
--------------------------------------------------------------------------------
/images/aho-corasick.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/aho-corasick.png -------------------------------------------------------------------------------- /images/allocation-weighting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/allocation-weighting.png -------------------------------------------------------------------------------- /images/alpha_beta.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/alpha_beta.png -------------------------------------------------------------------------------- /images/attention_def1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/attention_def1.png -------------------------------------------------------------------------------- /images/attention_def2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/attention_def2.png -------------------------------------------------------------------------------- /images/attention_def3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/attention_def3.png -------------------------------------------------------------------------------- /images/backward-beta.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/backward-beta.jpeg -------------------------------------------------------------------------------- /images/binomial_estimation_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/binomial_estimation_1.png -------------------------------------------------------------------------------- /images/binomial_estimation_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/binomial_estimation_2.png -------------------------------------------------------------------------------- /images/binomial_estimation_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/binomial_estimation_3.png -------------------------------------------------------------------------------- /images/binomial_estimation_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/binomial_estimation_4.png -------------------------------------------------------------------------------- /images/bm.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/bm.jpg -------------------------------------------------------------------------------- /images/brat_sejong.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/brat_sejong.png -------------------------------------------------------------------------------- /images/cmp_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_1.png -------------------------------------------------------------------------------- /images/cmp_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_10.png -------------------------------------------------------------------------------- /images/cmp_11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_11.png -------------------------------------------------------------------------------- /images/cmp_12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_12.png -------------------------------------------------------------------------------- /images/cmp_13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_13.png -------------------------------------------------------------------------------- /images/cmp_14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_14.png -------------------------------------------------------------------------------- /images/cmp_15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_15.png -------------------------------------------------------------------------------- /images/cmp_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_2.png -------------------------------------------------------------------------------- /images/cmp_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_3.png -------------------------------------------------------------------------------- /images/cmp_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_4.png -------------------------------------------------------------------------------- /images/cmp_5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_5.png -------------------------------------------------------------------------------- /images/cmp_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_6.png -------------------------------------------------------------------------------- /images/cmp_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_7.png -------------------------------------------------------------------------------- /images/cmp_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_8.png -------------------------------------------------------------------------------- /images/cmp_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_9.png -------------------------------------------------------------------------------- /images/cnn_embedding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cnn_embedding.png -------------------------------------------------------------------------------- /images/content-based-addressing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/content-based-addressing.png -------------------------------------------------------------------------------- /images/conv_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/conv_1.jpeg -------------------------------------------------------------------------------- /images/cross_entropy_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cross_entropy_loss.png -------------------------------------------------------------------------------- /images/cross_entropy_loss_many_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cross_entropy_loss_many_output.png -------------------------------------------------------------------------------- /images/deptree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/deptree.png -------------------------------------------------------------------------------- /images/distribution_function.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/distribution_function.png -------------------------------------------------------------------------------- /images/du_1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/du_1.png -------------------------------------------------------------------------------- /images/du_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/du_2.png -------------------------------------------------------------------------------- /images/entropy_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_1.jpg -------------------------------------------------------------------------------- /images/entropy_10.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_10.jpg -------------------------------------------------------------------------------- /images/entropy_11.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_11.jpg -------------------------------------------------------------------------------- /images/entropy_12.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_12.jpg -------------------------------------------------------------------------------- /images/entropy_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_2.jpg -------------------------------------------------------------------------------- /images/entropy_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_3.jpg -------------------------------------------------------------------------------- /images/entropy_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_4.jpg -------------------------------------------------------------------------------- /images/entropy_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_5.jpg -------------------------------------------------------------------------------- /images/entropy_6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_6.jpg -------------------------------------------------------------------------------- /images/entropy_7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_7.jpg -------------------------------------------------------------------------------- /images/entropy_8.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_8.jpg -------------------------------------------------------------------------------- /images/entropy_9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_9.jpg -------------------------------------------------------------------------------- /images/expectation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/expectation.png -------------------------------------------------------------------------------- /images/forward-alpha.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/forward-alpha.jpeg -------------------------------------------------------------------------------- /images/forward_backward_var.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/forward_backward_var.png -------------------------------------------------------------------------------- /images/four_equation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/four_equation.png -------------------------------------------------------------------------------- /images/hierarchical_attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/hierarchical_attention.png -------------------------------------------------------------------------------- /images/hmm_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/hmm_1.png -------------------------------------------------------------------------------- /images/hmm_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/hmm_2.png -------------------------------------------------------------------------------- /images/hmm_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/hmm_3.png -------------------------------------------------------------------------------- /images/hmm_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/hmm_4.png -------------------------------------------------------------------------------- /images/hmm_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/hmm_5.png -------------------------------------------------------------------------------- /images/hmm_6.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/hmm_6.png -------------------------------------------------------------------------------- /images/hmm_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/hmm_7.png -------------------------------------------------------------------------------- /images/kmp.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/kmp.jpg -------------------------------------------------------------------------------- /images/layer_norm_timesteps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/layer_norm_timesteps.png -------------------------------------------------------------------------------- /images/me_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/me_1.png -------------------------------------------------------------------------------- /images/me_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/me_2.png -------------------------------------------------------------------------------- /images/ml_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ml_1.png -------------------------------------------------------------------------------- /images/ml_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ml_2.png -------------------------------------------------------------------------------- /images/ml_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ml_3.png -------------------------------------------------------------------------------- /images/ml_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ml_4.png -------------------------------------------------------------------------------- /images/mult_head_self_attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/mult_head_self_attention.png -------------------------------------------------------------------------------- /images/multi_dimensional_self_attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/multi_dimensional_self_attention.png -------------------------------------------------------------------------------- 
/images/multi_headed_attention_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/multi_headed_attention_1.png -------------------------------------------------------------------------------- /images/multi_headed_attention_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/multi_headed_attention_2.png -------------------------------------------------------------------------------- /images/multinomial_estimation_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/multinomial_estimation_1.png -------------------------------------------------------------------------------- /images/multinomial_estimation_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/multinomial_estimation_2.png -------------------------------------------------------------------------------- /images/multinomial_estimation_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/multinomial_estimation_3.png -------------------------------------------------------------------------------- /images/ner_attention.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ner_attention.jpg -------------------------------------------------------------------------------- /images/ner_attention_math1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ner_attention_math1.jpg -------------------------------------------------------------------------------- /images/ner_attention_math2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ner_attention_math2.jpg -------------------------------------------------------------------------------- /images/ngram_cnn_highway_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ngram_cnn_highway_1.png -------------------------------------------------------------------------------- /images/ngram_cnn_highway_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ngram_cnn_highway_2.png -------------------------------------------------------------------------------- /images/nn_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/nn_1.jpeg -------------------------------------------------------------------------------- /images/nn_2.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/nn_2.jpeg -------------------------------------------------------------------------------- /images/nn_3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/nn_3.jpeg -------------------------------------------------------------------------------- /images/nn_4.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/nn_4.jpeg -------------------------------------------------------------------------------- /images/nn_5.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/nn_5.jpeg -------------------------------------------------------------------------------- /images/ntm-addressing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm-addressing.png -------------------------------------------------------------------------------- /images/ntm-content-addressing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm-content-addressing.png -------------------------------------------------------------------------------- /images/ntm-interface-vector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm-interface-vector.png -------------------------------------------------------------------------------- /images/ntm-interpolation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm-interpolation.png -------------------------------------------------------------------------------- /images/ntm-lstm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm-lstm.png -------------------------------------------------------------------------------- /images/ntm-pseudocode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm-pseudocode.png -------------------------------------------------------------------------------- /images/ntm-sharpen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm-sharpen.png -------------------------------------------------------------------------------- /images/ntm-shift.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm-shift.png 
-------------------------------------------------------------------------------- /images/ntm-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm-test.png -------------------------------------------------------------------------------- /images/ntm-train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm-train.png -------------------------------------------------------------------------------- /images/ntm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm.png -------------------------------------------------------------------------------- /images/p-value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/p-value.png -------------------------------------------------------------------------------- /images/partition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/partition.png -------------------------------------------------------------------------------- /images/pstree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/pstree.png -------------------------------------------------------------------------------- /images/re_attention_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/re_attention_1.png -------------------------------------------------------------------------------- /images/re_attention_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/re_attention_2.png -------------------------------------------------------------------------------- /images/read-vector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/read-vector.png -------------------------------------------------------------------------------- /images/regularization.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/regularization.jpeg -------------------------------------------------------------------------------- /images/retention-vector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/retention-vector.png -------------------------------------------------------------------------------- /images/scaled_dot_product_attention.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/scaled_dot_product_attention.png -------------------------------------------------------------------------------- /images/sejong_entry.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/sejong_entry.png -------------------------------------------------------------------------------- /images/self-attention-map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/self-attention-map.png -------------------------------------------------------------------------------- /images/self-attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/self-attention.png -------------------------------------------------------------------------------- /images/self_attention_with_fnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/self_attention_with_fnn.png -------------------------------------------------------------------------------- /images/seq2seq_attention_machanism.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/seq2seq_attention_machanism.jpg -------------------------------------------------------------------------------- /images/seq2seq_attention_machanism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/seq2seq_attention_machanism.png -------------------------------------------------------------------------------- /images/seq2seq_autoencoder.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/seq2seq_autoencoder.jpeg -------------------------------------------------------------------------------- /images/time_invariant_self_attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/time_invariant_self_attention.png -------------------------------------------------------------------------------- /images/time_invariant_self_attention_full.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/time_invariant_self_attention_full.png -------------------------------------------------------------------------------- /images/transformer_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/transformer_model.png -------------------------------------------------------------------------------- /images/traversal_london.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/traversal_london.png -------------------------------------------------------------------------------- /images/url_sejong.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/url_sejong.png -------------------------------------------------------------------------------- /images/usage-vector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/usage-vector.png -------------------------------------------------------------------------------- /images/variance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/variance.png -------------------------------------------------------------------------------- /images/vbox_port.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/vbox_port.png -------------------------------------------------------------------------------- /images/viterbi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/viterbi.png -------------------------------------------------------------------------------- /images/wor2vec_visualizer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/wor2vec_visualizer.png -------------------------------------------------------------------------------- /images/word2vec_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/word2vec_1.jpeg -------------------------------------------------------------------------------- /images/word2vec_2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/word2vec_2.jpeg -------------------------------------------------------------------------------- /images/word2vec_3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/word2vec_3.jpeg -------------------------------------------------------------------------------- /images/word2vec_4.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/word2vec_4.jpeg -------------------------------------------------------------------------------- /images/word2vec_5.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/word2vec_5.jpeg -------------------------------------------------------------------------------- /images/workbench_fatal.png: 
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/workbench_fatal.png -------------------------------------------------------------------------------- /images/write-operation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/write-operation.png -------------------------------------------------------------------------------- /images/write-weight-vector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/write-weight-vector.png -------------------------------------------------------------------------------- /keras_mlp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from keras.models import Sequential 4 | from keras.layers.core import Dense, Dropout, Activation 5 | from keras.optimizers import SGD 6 | import numpy as np 7 | 8 | ## model configuration 9 | model = Sequential() 10 | # Dense(64) is a fully-connected layer with 64 hidden units. 11 | # in the first layer, you must specify the expected input data shape: 12 | # here, 20-dimensional vectors. 13 | model.add(Dense(64, input_dim=20, init='uniform')) 14 | model.add(Activation('tanh')) 15 | model.add(Dropout(0.5)) 16 | model.add(Dense(64, init='uniform')) 17 | model.add(Activation('tanh')) 18 | model.add(Dropout(0.5)) 19 | model.add(Dense(10, init='uniform')) 20 | model.add(Activation('softmax')) 21 | sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True) 22 | model.compile(loss='categorical_crossentropy', 23 | optimizer=sgd) 24 | 25 | ## generate train/test data 26 | X_train = [] 27 | y_train = [] 28 | for i in xrange(10000) : 29 | x = np.random.uniform(-1, 1, size=20) 30 | y = np.eye(10)[i % 10] # one-hot label: categorical_crossentropy expects one-hot targets, not class indices 31 | X_train.append(x) 32 | y_train.append(y) 33 | X_train = np.array(X_train) 34 | y_train = np.array(y_train) 35 | print "X_train shape = " + str(X_train.shape) 36 | print "y_train shape = " + str(y_train.shape) 37 | X_test = [] 38 | y_test = [] 39 | for i in xrange(1000) : 40 | x = np.random.uniform(-1, 1, size=20) 41 | y = np.eye(10)[i % 10] # one-hot label 42 | X_test.append(x) 43 | y_test.append(y) 44 | X_test = np.array(X_test) 45 | y_test = np.array(y_test) 46 | print "X_test shape = " + str(X_test.shape) 47 | print "y_test shape = " + str(y_test.shape) 48 | 49 | ## training and evaluation 50 | model.fit(X_train, y_train, 51 | nb_epoch=20, 52 | batch_size=100, 53 | show_accuracy=True) 54 | score = model.evaluate(X_test, y_test, batch_size=100, show_accuracy=True) 55 | print('Test score:', score[0]) 56 | print('Test accuracy:', score[1]) 57 | -------------------------------------------------------------------------------- /make_bdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | import os 5 | import sys 6 | import re 7 | from optparse import OptionParser 8 | import time 9 | from bsddb3 import db 10 | 11 | if __name__ == '__main__': 12 | 13 | parser = OptionParser() 14 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 15 | parser.add_option("-d", "--dir", dest="dir",help="home directory", metavar="DIR") 16 | parser.add_option("-b", "--bdb", dest="bdbfile",help="bdb file name", metavar="BDB")
17 | (options, args) = parser.parse_args() 18 | 19 | if options.verbose == 1 : VERBOSE = 1 20 | 21 | dir_path = options.dir 22 | if dir_path == None : 23 | parser.print_help() 24 | sys.exit(1) 25 | 26 | bdb_file = options.bdbfile 27 | if bdb_file == None : 28 | parser.print_help() 29 | sys.exit(1) 30 | 31 | startTime = time.time() 32 | 33 | dbenv = db.DBEnv() 34 | if dbenv.open(dir_path, db.DB_CREATE | db.DB_INIT_MPOOL) : 35 | sys.stderr.write("DBEnv.open() fail\n") 36 | sys.exit(1) 37 | d = db.DB(dbenv) 38 | if d.open(bdb_file, db.DB_BTREE, db.DB_CREATE | db.DB_TRUNCATE, 0666) : 39 | sys.stderr.write("DB.open() fail\n") 40 | sys.exit(1) 41 | 42 | linecount = 0 43 | while 1 : 44 | try : line = sys.stdin.readline() 45 | except KeyboardInterrupt : break 46 | if not line : break 47 | try : line = line.strip() 48 | except : continue 49 | if not line : continue 50 | linecount += 1 51 | if linecount % 1000 == 0 : 52 | sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n") 53 | 54 | key,value = line.split('\t',1) 55 | if not key or not value : continue 56 | 57 | d.put(key,value) 58 | 59 | d.close() 60 | dbenv.close() 61 | 62 | durationTime = time.time() - startTime 63 | sys.stderr.write("duration time = %f\n" % durationTime) 64 | -------------------------------------------------------------------------------- /make_leveldb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | import os 5 | import sys 6 | import re 7 | from optparse import OptionParser 8 | import time 9 | import leveldb 10 | 11 | if __name__ == '__main__': 12 | 13 | parser = OptionParser() 14 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 15 | parser.add_option("-d", "--db", dest="dbdir",help="db dir path", metavar="DB") 16 | (options, args) = parser.parse_args() 17 | 18 | if options.verbose == 1 : VERBOSE = 1 19 | 20 | db_dir = options.dbdir 21 | if db_dir == None : 22 | parser.print_help() 23 | sys.exit(1) 24 | 25 | startTime = time.time() 26 | 27 | db = leveldb.LevelDB(db_dir) 28 | 29 | linecount = 0 30 | while 1 : 31 | try : line = sys.stdin.readline() 32 | except KeyboardInterrupt : break 33 | if not line : break 34 | try : line = line.strip() 35 | except : continue 36 | if not line : continue 37 | linecount += 1 38 | if linecount % 1000 == 0 : 39 | sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n") 40 | 41 | key,value = line.split('\t',1) 42 | if not key or not value : continue 43 | 44 | db.Put(key,value) 45 | 46 | durationTime = time.time() - startTime 47 | sys.stderr.write("duration time = %f\n" % durationTime) 48 | -------------------------------------------------------------------------------- /make_lmdb.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <string.h> 4 | #include <stdint.h> 5 | #include <limits.h> 6 | #include <unistd.h> 7 | #include <time.h> 8 | #include <sys/time.h> 9 | 10 | #include "lmdb.h" 11 | 12 | #define LINE_SIZE 10240 13 | 14 | int main(int argc, char *argv[]) 15 | { 16 | int size; 17 | char string[LINE_SIZE+1]; 18 | char s_key[LINE_SIZE+1]; 19 | char s_value[LINE_SIZE+1]; 20 | char* token; 21 | char* save; 22 | int cnt_line; 23 | 24 | int rc; 25 | MDB_env* env; 26 | MDB_txn* txn; 27 | MDB_cursor* mc; 28 | MDB_dbi dbi; 29 | MDB_val key, data; 30 | char* envname; 31 | int envflags=0; 32 | int putflags=0; 33 | char* subname; 34 | char* prog = argv[0]; 35 | size_t map_size = (SIZE_MAX / (1024*1024*1024) / 4)*6; // 4giga * 6
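/* a rough sanity check on the map_size arithmetic above, assuming 64-bit
   size_t: SIZE_MAX / 1 GiB = 2^34, / 4 = 2^32 (about 4 G), * 6 = about
   24 GiB. LMDB keeps the whole environment in a single memory map, so
   map_size must be at least as large as the database will ever grow. */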
36 | int batch; 37 | 38 | struct timeval tv1, tv2; 39 | 40 | if(argc != 3) { 41 | fprintf(stderr,"%s <envname> <subname>\n",prog); 42 | exit(1); 43 | } 44 | 45 | gettimeofday(&tv1, NULL); 46 | 47 | envflags = MDB_NOSUBDIR | MDB_NOLOCK; 48 | envname = argv[1]; 49 | rc = mdb_env_create(&env); 50 | if(rc) { 51 | fprintf(stderr, "mdb_env_create failed, error %d %s\n", rc, mdb_strerror(rc)); 52 | return EXIT_FAILURE; 53 | } 54 | mdb_env_set_maxdbs(env, 2); 55 | mdb_env_set_mapsize(env, map_size); 56 | rc = mdb_env_open(env, envname, envflags, 0664); 57 | if(rc) { 58 | fprintf(stderr, "mdb_env_open failed, error %d %s\n", rc, mdb_strerror(rc)); 59 | goto env_close; 60 | } 61 | rc = mdb_txn_begin(env, NULL, 0, &txn); 62 | if(rc) { 63 | fprintf(stderr, "mdb_txn_begin failed, error %d %s\n", rc, mdb_strerror(rc)); 64 | goto env_close; 65 | } 66 | subname = argv[2]; 67 | rc = mdb_open(txn, subname, MDB_CREATE, &dbi); 68 | if (rc) { 69 | fprintf(stderr, "mdb_open failed, error %d %s\n", rc, mdb_strerror(rc)); 70 | goto txn_abort; 71 | } 72 | rc = mdb_cursor_open(txn, dbi, &mc); 73 | if (rc) { 74 | fprintf(stderr, "mdb_cursor_open failed, error %d %s\n", rc, mdb_strerror(rc)); 75 | goto txn_abort; 76 | } 77 | 78 | batch = 0; 79 | cnt_line = 0; 80 | while(fgets(string, LINE_SIZE, stdin) != NULL) { 81 | size = strlen(string); 82 | if(string[size-1] == '\n'){ 83 | string[size-1] = '\0'; 84 | --size; 85 | } 86 | if(size > 1 && string[size-1] == '\r'){ 87 | string[size-1] = '\0'; 88 | --size; 89 | } 90 | if(string[0] == '\0') 91 | continue; 92 | 93 | if(cnt_line % 10000 == 0) 94 | fprintf(stderr,"[linecount]\t%d\n",cnt_line); 95 | 96 | token = strtok_r(string, "\t", &save); 97 | if(token != NULL) { 98 | strcpy(s_key, token); 99 | token = strtok_r(NULL, "\t", &save); 100 | if(token != NULL) { 101 | strcpy(s_value, token); 102 | } else continue; 103 | } else continue; 104 | 105 | key.mv_data = s_key; 106 | key.mv_size = strlen(s_key) + 1; 107 | data.mv_data = s_value; 108 | data.mv_size = strlen(s_value) + 1; 109 | 110 | 111 | rc = mdb_cursor_put(mc, &key, &data, putflags); 112 | if(rc == MDB_KEYEXIST) 113 | continue; 114 | 115 | if(batch % 100000000 == 0) { // commit every 100M puts and reopen the txn/cursor; note this also fires once at batch == 0 116 | rc = mdb_txn_commit(txn); 117 | if(rc) { 118 | fprintf(stderr, "%s: line %d: txn_commit: %s\n", prog, cnt_line, mdb_strerror(rc)); 119 | goto env_close; 120 | } 121 | rc = mdb_txn_begin(env, NULL, 0, &txn); 122 | if(rc) { 123 | fprintf(stderr, "mdb_txn_begin failed, error %d %s\n", rc, mdb_strerror(rc)); 124 | goto env_close; 125 | } 126 | rc = mdb_cursor_open(txn, dbi, &mc); 127 | if(rc) { 128 | fprintf(stderr, "mdb_cursor_open failed, error %d %s\n", rc, mdb_strerror(rc)); 129 | goto txn_abort; 130 | } 131 | } 132 | 133 | cnt_line++; 134 | batch++; 135 | } 136 | 137 | rc = mdb_txn_commit(txn); 138 | txn = NULL; 139 | if(rc) { 140 | fprintf(stderr, "%s: txn_commit fail: %s\n", prog, mdb_strerror(rc)); 141 | goto env_close; 142 | } 143 | mdb_dbi_close(env, dbi); 144 | 145 | txn_abort: 146 | mdb_txn_abort(txn); 147 | env_close: 148 | mdb_env_close(env); 149 | 150 | gettimeofday(&tv2, NULL); 151 | fprintf(stderr, "<-end > : t2.sec = %d t2.usec = %d\n",(int)tv2.tv_sec,(int)tv2.tv_usec); 152 | fprintf(stderr, "<+time> : sec = %d usec = %d\n",(int)(tv2.tv_sec-tv1.tv_sec),(int)(tv2.tv_usec-tv1.tv_usec)); 153 | 154 | return rc ? EXIT_FAILURE : EXIT_SUCCESS;
155 | 156 | } 157 | -------------------------------------------------------------------------------- /make_lmdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | import os 5 | import sys 6 | import re 7 | from optparse import OptionParser 8 | import time 9 | import lmdb 10 | 11 | if __name__ == '__main__': 12 | 13 | parser = OptionParser() 14 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 15 | parser.add_option("-d", "--db", dest="dbpath",help="db path", metavar="DB") 16 | (options, args) = parser.parse_args() 17 | 18 | if options.verbose == 1 : VERBOSE = 1 19 | 20 | db_path = options.dbpath 21 | if db_path == None : 22 | parser.print_help() 23 | sys.exit(1) 24 | 25 | startTime = time.time() 26 | 27 | # env == db coz max_dbs=0 28 | env = lmdb.Environment(db_path,map_size=24*(1024**3),subdir=False,readonly=False,create=False,max_dbs=0,lock=False) # 24 GiB map 29 | txn = lmdb.Transaction(env,db=None,write=True) 30 | 31 | linecount = 0 32 | while 1 : 33 | try : line = sys.stdin.readline() 34 | except KeyboardInterrupt : break 35 | if not line : break 36 | try : line = line.strip() 37 | except : continue 38 | if not line : continue 39 | linecount += 1 40 | if linecount % 1000 == 0 : 41 | sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n") 42 | 43 | key,value = line.split('\t',1) 44 | if not key or not value : continue 45 | 46 | try : txn.put(key,value) 47 | except Exception, e : 48 | sys.stderr.write(str(e) + '\n') 49 | continue 50 | 51 | durationTime = time.time() - startTime 52 | sys.stderr.write("duration time = %f\n" % durationTime) 53 | 54 | txn.commit() 55 | env.close() 56 | -------------------------------------------------------------------------------- /multiplexing.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "os" 7 | "runtime" 8 | "sync" 9 | "time" 10 | ) 11 | 12 | func worker(jobs chan string, outs chan string, jobs_wg *sync.WaitGroup, jobs_shutdown chan bool) { 13 | // jobs -> outs 14 | defer jobs_wg.Done() 15 | for { 16 | select { 17 | case line := <-jobs: 18 | out := line // "do something" with the input line here 19 | outs <- out 20 | case _ = <-jobs_shutdown: 21 | fmt.Fprintf(os.Stderr, "shutdown worker\n") 22 | return 23 | } 24 | } 25 | } 26 | 27 | func outputer(outs chan string, outs_wg *sync.WaitGroup, outs_shutdown chan bool) { 28 | // outs -> stdout 29 | // synchronize standard out 30 | defer outs_wg.Done() 31 | for { 32 | select { 33 | case _ = <-outs_shutdown: 34 | fmt.Fprintf(os.Stderr, "shutdown outputer\n") 35 | return 36 | case out := <-outs: 37 | fmt.Printf("out = %s\n", out) 38 | } 39 | } 40 | } 41 | 42 | func prepare_workers(n_worker int, jobs chan string, outs chan string, jobs_wg_list *[]*sync.WaitGroup, jobs_shutdown_list *[]chan bool) { 43 | for i := 0; i < n_worker; i++ { 44 | jobs_wg := &sync.WaitGroup{} 45 | jobs_wg.Add(1) 46 | *jobs_wg_list = append(*jobs_wg_list, jobs_wg) 47 | jobs_shutdown := make(chan bool) 48 | *jobs_shutdown_list = append(*jobs_shutdown_list, jobs_shutdown) 49 | go worker(jobs, outs, jobs_wg, jobs_shutdown) 50 | } 51 | } 52 | 53 | func prepare_outputer(outs chan string, outs_wg *sync.WaitGroup, outs_shutdown chan bool) { 54 | go outputer(outs, outs_wg, outs_shutdown) 55 | } 56 | 57 | func main() { 58 | const n_worker = 10 59 | const n_core = 10 60 | const size_buff = 100 61 |
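// pipeline sketch: main fans stdin lines out over the buffered jobs
// channel to n_worker goroutines, their results are funneled into outs,
// and a single outputer goroutine serializes writes to stdout; each
// worker gets its own shutdown channel plus WaitGroup so main can stop
// and drain them one at a time once stdin is exhausted.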
62 | runtime.GOMAXPROCS(n_core) 63 | 64 | var jobs = make(chan string, size_buff) 65 | var outs = make(chan string, size_buff) 66 | var jobs_wg_list []*sync.WaitGroup 67 | var jobs_shutdown_list []chan bool 68 | outs_wg := &sync.WaitGroup{} 69 | outs_wg.Add(1) 70 | outs_shutdown := make(chan bool) 71 | 72 | // prepare workers, outputer 73 | prepare_workers(n_worker, jobs, outs, &jobs_wg_list, &jobs_shutdown_list) 74 | prepare_outputer(outs, outs_wg, outs_shutdown) 75 | 76 | start := time.Now() // get current time 77 | scanner := bufio.NewScanner(os.Stdin) 78 | for scanner.Scan() { 79 | line := scanner.Text() 80 | jobs <- line 81 | } 82 | 83 | // shutdown all workers 84 | fmt.Fprintf(os.Stderr, "jobs_shutdown_list size : %v\n", len(jobs_shutdown_list)) 85 | fmt.Fprintf(os.Stderr, "jobs_wg_list size : %v\n", len(jobs_wg_list)) 86 | for i, jobs_shutdown := range jobs_shutdown_list { 87 | fmt.Fprintf(os.Stderr, "close jobs_shutdown : %v\n", i) 88 | close(jobs_shutdown) 89 | // wait until finish job 90 | fmt.Fprintf(os.Stderr, "wait jobs_wg : %v\n", i) 91 | jobs_wg := jobs_wg_list[i] 92 | jobs_wg.Wait() 93 | fmt.Fprintf(os.Stderr, "done jobs_wg\n") 94 | } 95 | 96 | // shutdown outputer 97 | fmt.Fprintf(os.Stderr, "close outs_shutdown\n") 98 | close(outs_shutdown) 99 | // wait until outputer ends 100 | fmt.Fprintf(os.Stderr, "wait outs_wg\n") 101 | outs_wg.Wait() 102 | fmt.Fprintf(os.Stderr, "done outs_wg\n") 103 | 104 | elapsed := time.Since(start) 105 | fmt.Fprintf(os.Stderr, "elapsed time = %s\n", elapsed) 106 | } 107 | -------------------------------------------------------------------------------- /ngram.cc: -------------------------------------------------------------------------------- 1 | #include <cassert> 2 | #include <cstdint> 3 | #include <iostream> 4 | #include <string> 5 | #include <vector> 6 | static const int32_t MAX_VOCAB_SIZE = 30000000; 7 | static const int32_t BUCKET_SIZE = 2000000; 8 | static const int32_t MIN_NGRAM_SIZE = 3; 9 | static const int32_t MAX_NGRAM_SIZE = 6; 10 | static std::string PREFIX_LABEL = "_label_"; 11 | static const std::string EOS = "</s>"; 12 | static const std::string BOW = "<"; 13 | static const std::string EOW = ">"; 14 | 15 | enum class entry_type : int8_t {word=0, label=1}; 16 | struct entry { 17 | std::string word; 18 | int64_t count; 19 | entry_type type; 20 | std::vector<int32_t> subwords; 21 | }; 22 | 23 | std::vector<entry> words_; 24 | int32_t size_ = 0; 25 | int32_t nwords_ = 0; 26 | int32_t nlabels_ = 0; 27 | int32_t ntokens_ = 0; 28 | std::vector<int32_t> word2int_; 29 | 30 | static void init() { 31 | size_ = 0; 32 | nwords_ = 0; 33 | nlabels_ = 0; 34 | ntokens_ = 0; 35 | word2int_.resize(MAX_VOCAB_SIZE); 36 | for (int32_t i = 0; i < MAX_VOCAB_SIZE; i++) { 37 | word2int_[i] = -1; 38 | } 39 | } 40 | 41 | static uint32_t hash(const std::string& str) { // FNV-1a 32-bit hash 42 | uint32_t h = 2166136261; 43 | for (size_t i = 0; i < str.size(); i++) { 44 | h = h ^ uint32_t(str[i]); 45 | h = h * 16777619; 46 | } 47 | return h; 48 | } 49 | 50 | static int32_t find(const std::string& w) { // open addressing with linear probing 51 | int32_t h = hash(w) % MAX_VOCAB_SIZE; 52 | while (word2int_[h] != -1 && words_[word2int_[h]].word != w) { 53 | h = (h + 1) % MAX_VOCAB_SIZE; 54 | } 55 | return h; 56 | } 57 | 58 | static void add(const std::string& w) { 59 | int32_t h = find(w); 60 | ntokens_++; 61 | if (word2int_[h] == -1) { 62 | entry e; 63 | e.word = w; 64 | e.count = 1; 65 | e.type = (w.find(PREFIX_LABEL) == 0) ? entry_type::label : entry_type::word;
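// a token whose text starts with PREFIX_LABEL is treated as a class
// label rather than an ordinary word, mirroring fastText's convention
// for supervised training data.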
66 | if (e.type == entry_type::word) nwords_++; 67 | if (e.type == entry_type::label) nlabels_++; 68 | words_.push_back(e); 69 | word2int_[h] = size_++; 70 | } else { 71 | words_[word2int_[h]].count++; 72 | } 73 | } 74 | 75 | static int32_t getId(const std::string& w) { 76 | int32_t h = find(w); 77 | return word2int_[h]; 78 | } 79 | 80 | static entry_type getType(int32_t id) { 81 | assert(id >= 0); 82 | assert(id < size_); 83 | return words_[id].type; 84 | } 85 | 86 | static std::string getWord(int32_t id) { 87 | assert(id >= 0); 88 | assert(id < size_); 89 | return words_[id].word; 90 | } 91 | 92 | static void computeNgrams(const std::string& word, 93 | std::vector<int32_t>& ngrams) { 94 | for (size_t i = 0; i < word.size(); i++) { 95 | std::string ngram; 96 | if ((word[i] & 0xC0) == 0x80) continue; // skip UTF-8 continuation bytes so n-grams start on character boundaries 97 | for (size_t j = i, n = 1; j < word.size() && n <= MAX_NGRAM_SIZE; n++) { 98 | ngram.push_back(word[j++]); 99 | while (j < word.size() && (word[j] & 0xC0) == 0x80) { 100 | ngram.push_back(word[j++]); 101 | } 102 | if (n >= MIN_NGRAM_SIZE && !(n == 1 && (i == 0 || j == word.size()))) { 103 | int32_t h = hash(ngram) % BUCKET_SIZE; 104 | std::cout << ngram << "\t" << h << std::endl; 105 | ngrams.push_back(nwords_ + h); 106 | } 107 | } 108 | } 109 | } 110 | 111 | static void initNgrams() { 112 | for (size_t i = 0; i < size_; i++) { 113 | std::string word = BOW + words_[i].word + EOW; 114 | words_[i].subwords.push_back(i); 115 | computeNgrams(word, words_[i].subwords); 116 | } 117 | } 118 | 119 | static const std::vector<int32_t>& getNgrams(int32_t i) { 120 | assert(i >= 0); 121 | assert(i < nwords_); 122 | return words_[i].subwords; 123 | } 124 | 125 | static const std::vector<int32_t> getNgrams(const std::string& word) { 126 | int32_t i = getId(word); 127 | if (i >= 0) { 128 | return getNgrams(i); 129 | } 130 | std::vector<int32_t> ngrams; 131 | computeNgrams(BOW + word + EOW, ngrams); 132 | return ngrams; 133 | } 134 | 135 | int main(int argc, char** argv) { 136 | 137 | init(); 138 | 139 | std::string word1 = "카카오12검색"; 140 | std::string word2 = "ab네이버구글34"; 141 | 142 | add(word1); 143 | add(word2); 144 | 145 | initNgrams(); 146 | 147 | return 0; 148 | } 149 | -------------------------------------------------------------------------------- /queue.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | type Node struct { 8 | Value int 9 | } 10 | 11 | /* 12 | func (n *Node) String() string { 13 | return fmt.Sprint(n.Value) 14 | } 15 | */ 16 | 17 | // NewQueue returns a new queue with the given initial size. 18 | func NewQueue(size int) *Queue { 19 | return &Queue{ 20 | nodes: make([]*Node, size), 21 | size: size, 22 | } 23 | } 24 | 25 | // Queue is a basic FIFO queue based on a circular list that resizes as needed. 26 | type Queue struct { 27 | nodes []*Node 28 | size int 29 | head int 30 | tail int 31 | count int 32 | } 33 | 34 | // Push adds a node to the queue. 35 | func (q *Queue) Push(n *Node) { 36 | if q.head == q.tail && q.count > 0 { 37 | nodes := make([]*Node, len(q.nodes)+q.size) 38 | copy(nodes, q.nodes[q.head:]) 39 | copy(nodes[len(q.nodes)-q.head:], q.nodes[:q.head]) 40 | q.head = 0 41 | q.tail = len(q.nodes) 42 | q.nodes = nodes 43 | } 44 | q.nodes[q.tail] = n 45 | q.tail = (q.tail + 1) % len(q.nodes) 46 | q.count++ 47 | } 48 | 49 | // Pop removes and returns a node from the queue in first to last order.
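// Push and Pop wrap their indices with modular arithmetic, e.g.
// (q.tail + 1) % len(q.nodes), so the backing slice is reused as a
// ring; reallocation happens only when the ring is full (head == tail
// with count > 0), keeping both operations amortized O(1).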
50 | func (q *Queue) Pop() *Node { 51 | if q.count == 0 { 52 | return nil 53 | } 54 | node := q.nodes[q.head] 55 | q.head = (q.head + 1) % len(q.nodes) 56 | q.count-- 57 | return node 58 | } 59 | 60 | func main() { 61 | q := NewQueue(1) 62 | q.Push(&Node{4}) 63 | q.Push(&Node{5}) 64 | q.Push(&Node{6}) 65 | fmt.Println(q.Pop(), q.Pop(), q.Pop()) 66 | } 67 | -------------------------------------------------------------------------------- /search_bdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | import os 5 | import sys 6 | import re 7 | from optparse import OptionParser 8 | import time 9 | from bsddb3 import db 10 | 11 | if __name__ == '__main__': 12 | 13 | parser = OptionParser() 14 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 15 | parser.add_option("-d", "--dir", dest="dir",help="home directory", metavar="DIR") 16 | parser.add_option("-b", "--bdb", dest="bdbfile",help="bdb file name", metavar="BDB") 17 | (options, args) = parser.parse_args() 18 | 19 | if options.verbose == 1 : VERBOSE = 1 20 | 21 | dir_path = options.dir 22 | if dir_path == None : 23 | parser.print_help() 24 | sys.exit(1) 25 | 26 | bdb_file = options.bdbfile 27 | if bdb_file == None : 28 | parser.print_help() 29 | sys.exit(1) 30 | 31 | dbenv = db.DBEnv() 32 | if dbenv.open(dir_path, db.DB_CREATE | db.DB_INIT_MPOOL) : 33 | sys.stderr.write("DBEnv.open() fail\n") 34 | sys.exit(1) 35 | d = db.DB(dbenv) 36 | if d.open(bdb_file, db.DB_BTREE, db.DB_RDONLY) : 37 | sys.stderr.write("DB.open() fail\n") 38 | sys.exit(1) 39 | 40 | startTime = time.time() 41 | 42 | linecount = 0 43 | while 1 : 44 | try : line = sys.stdin.readline() 45 | except KeyboardInterrupt : break 46 | if not line : break 47 | try : line = line.strip() 48 | except : continue 49 | if not line : continue 50 | linecount += 1 51 | if linecount % 1000 == 0 : 52 | sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n") 53 | 54 | key,value = line.split('\t',1) 55 | if not key or not value : continue 56 | 57 | v = d.get(key) 58 | if v : 59 | print v 60 | 61 | durationTime = time.time() - startTime 62 | sys.stderr.write("duration time = %f\n" % durationTime) 63 | 64 | d.close() 65 | dbenv.close() 66 | -------------------------------------------------------------------------------- /search_leveldb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | import os 5 | import sys 6 | import re 7 | from optparse import OptionParser 8 | import time 9 | import leveldb 10 | 11 | if __name__ == '__main__': 12 | 13 | parser = OptionParser() 14 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 15 | parser.add_option("-d", "--db", dest="dbdir",help="db dir path", metavar="DB") 16 | (options, args) = parser.parse_args() 17 | 18 | if options.verbose == 1 : VERBOSE = 1 19 | 20 | db_dir = options.dbdir 21 | if db_dir == None : 22 | parser.print_help() 23 | sys.exit(1) 24 | 25 | db = leveldb.LevelDB(db_dir) 26 | lock_file = db_dir + '/LOCK' 27 | if os.path.exists(lock_file) : 28 | try : os.remove(lock_file) 29 | except OSError : 30 | sys.stderr.write("remove lock file(%s) fail\n" % (lock_file)) 31 | sys.exit(1) 32 | 33 | startTime = time.time() 34 | 35 | linecount = 0 36 | while 1 : 37 | try : line = sys.stdin.readline() 38 | except KeyboardInterrupt : break 39 | if not line : break 40 | try 
: line = line.strip() 41 | except : continue 42 | if not line : continue 43 | linecount += 1 44 | if linecount % 1000 == 0 : 45 | sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n") 46 | 47 | key,value = line.split('\t',1) 48 | if not key or not value : continue 49 | 50 | ret = db.Get(key) 51 | if ret : print ret 52 | 53 | durationTime = time.time() - startTime 54 | sys.stderr.write("duration time = %f\n" % durationTime) 55 | -------------------------------------------------------------------------------- /search_lmdb.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <errno.h> 4 | #include <string.h> 5 | #include <ctype.h> 6 | #include <unistd.h> 7 | #include <stdint.h> 8 | #include <sys/time.h> 9 | 10 | #include "lmdb.h" 11 | 12 | #define LINE_SIZE 10240 13 | 14 | int main(int argc, char *argv[]) 15 | { 16 | int size; 17 | char string[LINE_SIZE+1]; 18 | char s_key[LINE_SIZE+1]; 19 | char s_value[LINE_SIZE+1]; 20 | char* token; 21 | char* save; 22 | int cnt_line; 23 | 24 | int rc; 25 | MDB_env* env; 26 | MDB_txn* txn; 27 | MDB_cursor* mc; 28 | MDB_dbi dbi; 29 | MDB_val key, data; 30 | char* envname; 31 | int envflags=0; 32 | char* subname; 33 | char* prog = argv[0]; 34 | size_t map_size = (SIZE_MAX / (1024*1024*1024) / 4)*6; // 4GiB * 6 = 24GiB on a 64-bit size_t 35 | 36 | struct timeval tv1, tv2; 37 | 38 | if(argc != 3) { 39 | fprintf(stderr,"%s <envname> <subname>\n",prog); 40 | exit(1); 41 | } 42 | 43 | envflags = MDB_NOSUBDIR | MDB_NOLOCK; 44 | envname = argv[1]; 45 | rc = mdb_env_create(&env); 46 | if(rc) { 47 | fprintf(stderr, "mdb_env_create failed, error %d %s\n", rc, mdb_strerror(rc)); 48 | return EXIT_FAILURE; 49 | } 50 | mdb_env_set_maxdbs(env, 2); 51 | mdb_env_set_mapsize(env, map_size); 52 | rc = mdb_env_open(env, envname, envflags, 0664); 53 | if(rc) { 54 | fprintf(stderr, "mdb_env_open failed, error %d %s\n", rc, mdb_strerror(rc)); 55 | goto env_close; 56 | } 57 | rc = mdb_txn_begin(env, NULL, 0, &txn); 58 | if(rc) { 59 | fprintf(stderr, "mdb_txn_begin failed, error %d %s\n", rc, mdb_strerror(rc)); 60 | goto env_close; 61 | } 62 | subname = argv[2]; 63 | rc = mdb_open(txn, subname, MDB_CREATE, &dbi); 64 | if (rc) { 65 | fprintf(stderr, "mdb_open failed, error %d %s\n", rc, mdb_strerror(rc)); 66 | goto txn_abort; 67 | } 68 | rc = mdb_cursor_open(txn, dbi, &mc); 69 | if (rc) { 70 | fprintf(stderr, "mdb_cursor_open failed, error %d %s\n", rc, mdb_strerror(rc)); 71 | goto txn_abort; 72 | } 73 | 74 | gettimeofday(&tv1, NULL); 75 | 76 | cnt_line = 0; 77 | while(fgets(string, LINE_SIZE, stdin) != NULL) { 78 | size = strlen(string); 79 | if(string[size-1] == '\n'){ 80 | string[size-1] = '\0'; 81 | --size; 82 | } 83 | if(size > 1 && string[size-1] == '\r'){ 84 | string[size-1] = '\0'; 85 | --size; 86 | } 87 | if(string[0] == '\0') 88 | continue; 89 | 90 | if(cnt_line % 10000 == 0) 91 | fprintf(stderr,"[linecount]\t%d\n",cnt_line); 92 | 93 | /* 94 | token = strtok_r(string, "\t", &save); 95 | if(token != NULL) { 96 | strcpy(s_key, token); 97 | token = strtok_r(NULL, "\t", &save); 98 | if(token != NULL) { 99 | strcpy(s_value, token); 100 | } else continue; 101 | } else continue; 102 | */ 103 | token = strtok_r(string, "\t", &save); 104 | if(token != NULL) { 105 | strcpy(s_key, token); 106 | } else continue; 107 | 108 | key.mv_data = s_key; 109 | key.mv_size = strlen(s_key) + 1; 110 | 111 | rc = mdb_get(txn, dbi, &key, &data); 112 | if(!rc) { 113 | fprintf(stdout, "%s\t%s\n", s_key, (char*)data.mv_data); 114 | } 115 | 116 | cnt_line++; 117 | } 118 | gettimeofday(&tv2, NULL); 119 | fprintf(stderr, "<-end > : 
t2.sec = %d t2.usec = %d\n",(int)tv2.tv_sec,(int)tv2.tv_usec); 120 | fprintf(stderr, "<+time> : sec = %d usec = %d\n",(int)(tv2.tv_sec-tv1.tv_sec),(int)(tv2.tv_usec-tv1.tv_usec)); /* note: the usec diff alone may be negative; the sec and usec diffs together give the elapsed time */ 121 | 122 | rc = mdb_txn_commit(txn); 123 | txn = NULL; 124 | if(rc) { 125 | fprintf(stderr, "%s: txn_commit fail: %s\n", prog, mdb_strerror(rc)); 126 | goto env_close; 127 | } 128 | mdb_dbi_close(env, dbi); 129 | 130 | txn_abort: 131 | mdb_txn_abort(txn); 132 | env_close: 133 | mdb_env_close(env); 134 | 135 | return rc ? EXIT_FAILURE : EXIT_SUCCESS; 136 | 137 | } 138 | -------------------------------------------------------------------------------- /search_lmdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | import os 5 | import sys 6 | import re 7 | from optparse import OptionParser 8 | import time 9 | import lmdb 10 | 11 | if __name__ == '__main__': 12 | 13 | parser = OptionParser() 14 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 15 | parser.add_option("-d", "--db", dest="dbpath",help="db path", metavar="DB") 16 | (options, args) = parser.parse_args() 17 | 18 | if options.verbose == 1 : VERBOSE = 1 19 | 20 | db_path = options.dbpath 21 | if db_path == None : 22 | parser.print_help() 23 | sys.exit(1) 24 | 25 | 26 | # env == db because max_dbs=0 27 | env = lmdb.Environment(db_path,map_size=24*(1024**3),subdir=False,readonly=True,create=False,max_dbs=0,lock=False) 28 | txn = lmdb.Transaction(env,db=None,write=False) 29 | 30 | startTime = time.time() 31 | 32 | linecount = 0 33 | while 1 : 34 | try : line = sys.stdin.readline() 35 | except KeyboardInterrupt : break 36 | if not line : break 37 | try : line = line.strip() 38 | except : continue 39 | if not line : continue 40 | linecount += 1 41 | if linecount % 1000 == 0 : 42 | sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n") 43 | 44 | key,value = line.split('\t',1) 45 | if not key or not value : continue 46 | 47 | ret = txn.get(key,default=None) 48 | if ret : 49 | print ret 50 | 51 | durationTime = time.time() - startTime 52 | sys.stderr.write("duration time = %f\n" % durationTime) 53 | 54 | txn.abort() 55 | env.close() 56 | -------------------------------------------------------------------------------- /search_word2vec.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | reload(sys) 4 | sys.setdefaultencoding('utf-8') 5 | import re 6 | from optparse import OptionParser 7 | import time 8 | from gensim.models import word2vec,phrases 9 | import logging 10 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 11 | 12 | def load_model(model_path) : 13 | model = word2vec.Word2Vec.load(model_path) 14 | return model 15 | 16 | ''' 17 | python2.7 search_word2vec.py -m corpus.txt.model 18 | ''' 19 | if __name__ == '__main__': 20 | 21 | parser = OptionParser() 22 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 23 | parser.add_option("-m", "--model", dest="model",help="model path, input file", metavar="MODEL") 24 | (options, args) = parser.parse_args() 25 | 26 | if options.verbose == 1 : VERBOSE = 1 27 | 28 | model_path = options.model 29 | if model_path == None : 30 | parser.print_help() 31 | sys.exit(1) 32 | 33 | model = load_model(model_path) 34 | 35 | linecount = 0 36 | while 1 : 37 | try : line = sys.stdin.readline() 38 | except KeyboardInterrupt : break
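        # (added note) most_similar below takes the mean of the (normalized)
        # vectors of the positive tokens and ranks the vocabulary by cosine
        # similarity against that mean, so a multi-token query behaves like a
        # soft conjunction of its tokens.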
39 | if not line : break 40 | try : line = line.strip() 41 | except : continue 42 | if not line : continue 43 | linecount += 1 44 | if linecount % 1000 == 0 : 45 | sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n") 46 | 47 | # convert to unicode 48 | line_unicode = line.decode('utf-8') 49 | tokens = [] 50 | for token in line_unicode.split() : 51 | if token in model : tokens.append(token) 52 | if len(tokens) >= 1 : 53 | ret = model.most_similar(positive=tokens) 54 | for word,sim in ret : 55 | print word + "\t" + str(sim) 56 | else : 57 | print "not in vocab" 58 | print "==================================" 59 | -------------------------------------------------------------------------------- /similarity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | ''' 5 | read http://radimrehurek.com/gensim/tut3.html 6 | here is test code 7 | ''' 8 | 9 | import os 10 | import sys 11 | reload(sys) 12 | sys.setdefaultencoding('utf-8') 13 | import re 14 | from optparse import OptionParser 15 | from gensim import corpora, models, similarities, matutils 16 | import logging 17 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 18 | 19 | def construct_dictionary(documents_path, filter=None) : 20 | # collect statistics about all tokens 21 | dictionary = corpora.Dictionary(line.lower().split() for line in open(documents_path)) 22 | 23 | if filter : 24 | # remove stop words and words that appear only once 25 | stoplist = set('for a of the and to in'.split()) 26 | stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id] 27 | once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1] 28 | dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once 29 | dictionary.compactify() # remove gaps in id sequence after words that were removed 30 | 31 | return dictionary 32 | 33 | def save_dictionary(dictionary, dictionary_path) : 34 | dictionary.save(dictionary_path) 35 | 36 | def load_dictionary(dictionary_path) : 37 | dictionary = corpora.Dictionary().load(dictionary_path,mmap='r') 38 | return dictionary 39 | 40 | def save_corpus(corpus, corpus_path, format=None) : 41 | if format == 'svmlight' : # Joachim’s SVMlight format 42 | corpora.SvmLightCorpus.serialize(corpus_path, corpus) 43 | if format == 'lda-c' : # Blei’s LDA-C format 44 | corpora.BleiCorpus.serialize(corpus_path, corpus) 45 | if format == 'low' : # GibbsLDA++ format 46 | corpora.LowCorpus.serialize(corpus_path, corpus) 47 | if not format : # Matrix Market format 48 | corpora.MmCorpus.serialize(corpus_path, corpus) 49 | 50 | def load_corpus(corpus_path) : 51 | corpus = corpora.MmCorpus(corpus_path) 52 | return corpus 53 | 54 | def corpus_to_tfidf(corpus) : 55 | tfidf = models.TfidfModel(corpus, normalize=True) # step 1 -- initialize a model 56 | ''' 57 | corpus_tfidf = tfidf[corpus] 58 | for doc in corpus_tfidf: 59 | print doc 60 | ''' 61 | return tfidf 62 | 63 | def save_tfidf(tfidf, tfidf_path) : 64 | tfidf.save(tfidf_path) 65 | 66 | def load_tfidf(tfidf_path) : 67 | tfidf = models.TfidfModel.load(tfidf_path) 68 | return tfidf 69 | 70 | def corpus_to_lsi(corpus, tfidf, dictionary, topic_number) : 71 | corpus_tfidf = tfidf[corpus] 72 | lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topic_number) # initialize an LSI transformation 73 | ''' 74 | corpus_lsi = lsi[corpus_tfidf] # create 
a double wrapper over the original corpus: bow->tfidf->fold-in-lsi 75 | lsi.print_topics(3) 76 | for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly 77 | print doc 78 | ''' 79 | return lsi 80 | 81 | def save_lsi(lsi, lsi_path) : 82 | lsi.save(lsi_path) 83 | 84 | def load_lsi(lsi_path) : 85 | lsi = models.LsiModel.load(lsi_path) 86 | return lsi 87 | 88 | def corpus_to_lda(corpus, dictionary, topic_number) : 89 | model = models.LdaModel(corpus, id2word=dictionary, num_topics=topic_number) 90 | return model 91 | 92 | def save_lda(lda, lda_path) : 93 | lda.save(lda_path) 94 | 95 | def load_lda(lda_path) : 96 | lda = models.LdaModel.load(lda_path) 97 | return lda 98 | 99 | def corpus_to_simmat(corpus, model) : 100 | simmat = similarities.MatrixSimilarity(model[corpus]) 101 | return simmat 102 | 103 | def save_simmat(simmat, simmat_path) : 104 | simmat.save(simmat_path) 105 | 106 | def load_simmat(simmat_path) : 107 | simmat = similarities.MatrixSimilarity.load(simmat_path) 108 | return simmat 109 | 110 | ''' 111 | python2.7 similarity.py --dictionary=document.txt.dict --corpus=document.txt.mm --tfidf=document.txt.tfidf --lsi=document.txt.lsi --lda=document.txt.lda --simmat=document.txt.simmat 112 | ''' 113 | if __name__ == '__main__': 114 | 115 | parser = OptionParser() 116 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 117 | parser.add_option("-d", "--dictionary", dest="dictionary",help="dictionary", metavar="DICT") 118 | parser.add_option("-c", "--corpus", dest="corpus",help="corpus", metavar="CORPUS") 119 | parser.add_option("-t", "--tfidf", dest="tfidf",help="tfidf", metavar="TFIDF") 120 | parser.add_option("-l", "--lsi", dest="lsi",help="lsi", metavar="LSI") 121 | parser.add_option("-a", "--lda", dest="lda",help="lda", metavar="LDA") 122 | parser.add_option("-s", "--simmat", dest="simmat",help="similarity matrix, output file", metavar="SIMMAT") 123 | (options, args) = parser.parse_args() 124 | 125 | if options.verbose == 1 : VERBOSE = 1 126 | 127 | dictionary_path = options.dictionary 128 | if dictionary_path == None : 129 | parser.print_help() 130 | sys.exit(1) 131 | 132 | corpus_path = options.corpus 133 | if corpus_path == None : 134 | parser.print_help() 135 | sys.exit(1) 136 | 137 | tfidf_path = options.tfidf 138 | if tfidf_path == None : 139 | parser.print_help() 140 | sys.exit(1) 141 | 142 | lsi_path = options.lsi 143 | if lsi_path == None : 144 | parser.print_help() 145 | sys.exit(1) 146 | 147 | lda_path = options.lda 148 | if lda_path == None : 149 | parser.print_help() 150 | sys.exit(1) 151 | 152 | simmat_path = options.simmat 153 | if simmat_path == None : 154 | parser.print_help() 155 | sys.exit(1) 156 | 157 | dictionary = load_dictionary(dictionary_path) 158 | corpus = load_corpus(corpus_path) 159 | tfidf = load_tfidf(tfidf_path) 160 | lsi = load_lsi(lsi_path) 161 | lda = load_lda(lda_path) 162 | 163 | simmat = corpus_to_simmat(corpus, tfidf) 164 | save_simmat(simmat, simmat_path) 165 | simmat = load_simmat(simmat_path) 166 | 167 | linecount = 0 168 | while 1 : 169 | try : line = sys.stdin.readline() 170 | except KeyboardInterrupt : break 171 | if not line : break 172 | try : line = line.strip() 173 | except : continue 174 | if not line : continue 175 | linecount += 1 176 | if linecount % 1000 == 0 : 177 | sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n") 178 | vector = dictionary.doc2bow(line.lower().split()) 179 | vec_tfidf = tfidf[vector] 180 | 
vec_lsi = lsi[vector] 181 | vec_lda = lda[vector] 182 | 183 | sims = simmat[vec_tfidf] # perform a similarity query against the corpus 184 | sims = sorted(enumerate(sims), key=lambda item: -item[1]) 185 | idx = 0 186 | for docid, similarity in sims : 187 | if idx >= 5 : break 188 | output = [str(similarity)] 189 | for termid, freq in corpus[docid] : 190 | term = dictionary.get(termid) 191 | output.append(term + "/" + str(freq)) 192 | print "\t".join(output) 193 | idx += 1 194 | -------------------------------------------------------------------------------- /stack.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | type Node struct { 8 | Value int 9 | } 10 | 11 | /* 12 | func (n *Node) String() string { 13 | return fmt.Sprint(n.Value) 14 | } 15 | */ 16 | 17 | // NewStack returns a new stack. 18 | func NewStack() *Stack { 19 | return &Stack{} 20 | } 21 | 22 | // Stack is a basic LIFO stack that resizes as needed. 23 | type Stack struct { 24 | nodes []*Node 25 | count int 26 | } 27 | 28 | // Push adds a node to the stack. 29 | func (s *Stack) Push(n *Node) { 30 | s.nodes = append(s.nodes[:s.count], n) 31 | s.count++ 32 | } 33 | 34 | // Pop removes and returns a node from the stack in last to first order. 35 | func (s *Stack) Pop() *Node { 36 | if s.count == 0 { 37 | return nil 38 | } 39 | s.count-- 40 | return s.nodes[s.count] 41 | } 42 | 43 | func main() { 44 | s := NewStack() 45 | s.Push(&Node{1}) 46 | s.Push(&Node{2}) 47 | s.Push(&Node{3}) 48 | fmt.Println(s.Pop(), s.Pop(), s.Pop()) 49 | } 50 | -------------------------------------------------------------------------------- /test_numpy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | #-*- coding: utf8 -*- 3 | 4 | import os 5 | import sys 6 | import re 7 | from optparse import OptionParser 8 | 9 | import theano 10 | import theano.tensor as T 11 | import numpy as np 12 | import scipy 13 | 14 | 15 | # --verbose 16 | VERBOSE = 0 17 | 18 | def open_file(filename, mode) : 19 | try : fid = open(filename, mode) 20 | except : 21 | sys.stderr.write("open_file(), file open error : %s\n" % (filename)) 22 | exit(1) 23 | else : 24 | return fid 25 | 26 | def close_file(fid) : 27 | fid.close() 28 | 29 | def type_test() : 30 | m = np.asarray([[1., 2], [3, 4], [5, 6]]) 31 | print m 32 | print m.shape # shape is tuple (3,2) 33 | print m[2,0] 34 | 35 | x = np.float32(1.0) 36 | print x 37 | y = np.int_([1,2,4]) 38 | print y 39 | z = np.array([1,2,3], dtype=np.int8) 40 | print z 41 | print z.dtype 42 | z = np.float16(z) 43 | print z 44 | print z.dtype 45 | z = z.astype(np.int_) # or z.astype(int) 46 | print z 47 | print z.dtype 48 | print np.issubdtype(z.dtype,float) 49 | 50 | def array_test() : 51 | x = np.array([2, 3, 1, 0]) 52 | print x 53 | x = np.array([[1,2.0],[0,0],(1+1j,3.)]) 54 | print x 55 | x = np.array([[ 1.+0.j, 2.+0.j], [ 0.+0.j, 0.+0.j], [ 1.+1.j, 3.+0.j]]) 56 | print x 57 | x = np.zeros((2, 3)) 58 | print x 59 | x = np.ones((2, 3)) 60 | print x 61 | print np.arange(10) 62 | print np.arange(2, 10, dtype=np.float) 63 | print np.arange(2, 3, 0.1) 64 | print np.linspace(1., 4., 6) 65 | print np.indices((3,3)) 66 | 67 | if __name__ == '__main__': 68 | 69 | parser = OptionParser() 70 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 71 | (options, args) = parser.parse_args() 72 | 73 | if options.verbose == 1 : VERBOSE = 1 74 | 75 | type_test() 
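    # (added note) in type_test above, astype(np.int_) truncates toward zero
    # rather than rounding: np.float16([1.7, -1.7]).astype(np.int_) -> [1, -1].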
76 | array_test() 77 | -------------------------------------------------------------------------------- /test_theano.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | #-*- coding: utf8 -*- 3 | 4 | import os 5 | import sys 6 | import re 7 | from optparse import OptionParser 8 | 9 | import cPickle, gzip 10 | import theano 11 | import theano.tensor as T 12 | import numpy as np 13 | import scipy 14 | 15 | 16 | # --verbose 17 | VERBOSE = 0 18 | 19 | def open_file(filename, mode) : 20 | try : fid = open(filename, mode) 21 | except : 22 | sys.stderr.write("open_file(), file open error : %s\n" % (filename)) 23 | exit(1) 24 | else : 25 | return fid 26 | 27 | def close_file(fid) : 28 | fid.close() 29 | 30 | def shared_dataset(data_xy): 31 | data_x, data_y = data_xy 32 | shared_x = theano.shared(np.asarray(data_x, dtype=theano.config.floatX)) 33 | shared_y = theano.shared(np.asarray(data_y, dtype=theano.config.floatX)) 34 | return shared_x, T.cast(shared_y, 'int32') 35 | 36 | if __name__ == '__main__': 37 | 38 | parser = OptionParser() 39 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 40 | (options, args) = parser.parse_args() 41 | 42 | if options.verbose == 1 : VERBOSE = 1 43 | 44 | f = gzip.open('mnist.pkl.gz', 'rb') 45 | train_set, valid_set, test_set = cPickle.load(f) 46 | f.close() 47 | 48 | test_set_x, test_set_y = shared_dataset(test_set) 49 | valid_set_x, valid_set_y = shared_dataset(valid_set) 50 | train_set_x, train_set_y = shared_dataset(train_set) 51 | batch_size = 500 # size of the minibatch 52 | 53 | # accessing the third minibatch of the training set 54 | data = train_set_x[2 * batch_size: 3 * batch_size] 55 | label = train_set_y[2 * batch_size: 3 * batch_size] 56 | -------------------------------------------------------------------------------- /transform.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | ''' 5 | read http://radimrehurek.com/gensim/tut2.html 6 | here is test code 7 | ''' 8 | 9 | import os 10 | import sys 11 | reload(sys) 12 | sys.setdefaultencoding('utf-8') 13 | import re 14 | from optparse import OptionParser 15 | from gensim import corpora, models, similarities, matutils 16 | import logging 17 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 18 | 19 | def construct_dictionary(documents_path, filter=None) : 20 | # collect statistics about all tokens 21 | dictionary = corpora.Dictionary(line.lower().split() for line in open(documents_path)) 22 | 23 | if filter : 24 | # remove stop words and words that appear only once 25 | stoplist = set('for a of the and to in'.split()) 26 | stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id] 27 | once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1] 28 | dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once 29 | dictionary.compactify() # remove gaps in id sequence after words that were removed 30 | 31 | return dictionary 32 | 33 | def save_dictionary(dictionary, dictionary_path) : 34 | dictionary.save(dictionary_path) 35 | 36 | def load_dictionary(dictionary_path) : 37 | dictionary = corpora.Dictionary().load(dictionary_path,mmap='r') 38 | return dictionary 39 | 40 | def save_corpus(corpus, corpus_path, format=None) : 41 | if format == 'svmlight' : # Joachim’s SVMlight
format 42 | corpora.SvmLightCorpus.serialize(corpus_path, corpus) 43 | if format == 'lda-c' : # Blei’s LDA-C format 44 | corpora.BleiCorpus.serialize(corpus_path, corpus) 45 | if format == 'low' : # GibbsLDA++ format 46 | corpora.LowCorpus.serialize(corpus_path, corpus) 47 | if not format : # Matrix Market format 48 | corpora.MmCorpus.serialize(corpus_path, corpus) 49 | 50 | def load_corpus(corpus_path) : 51 | corpus = corpora.MmCorpus(corpus_path) 52 | return corpus 53 | 54 | def corpus_to_tfidf(corpus) : 55 | tfidf = models.TfidfModel(corpus, normalize=True) # step 1 -- initialize a model 56 | ''' 57 | corpus_tfidf = tfidf[corpus] 58 | for doc in corpus_tfidf: 59 | print doc 60 | ''' 61 | return tfidf 62 | 63 | def save_tfidf(tfidf, tfidf_path) : 64 | tfidf.save(tfidf_path) 65 | 66 | def load_tfidf(tfidf_path) : 67 | tfidf = models.TfidfModel.load(tfidf_path) 68 | return tfidf 69 | 70 | def corpus_to_lsi(corpus, tfidf, dictionary, topic_number) : 71 | corpus_tfidf = tfidf[corpus] 72 | lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topic_number) # initialize an LSI transformation 73 | ''' 74 | corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi 75 | lsi.print_topics(3) 76 | for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly 77 | print doc 78 | ''' 79 | return lsi 80 | 81 | def save_lsi(lsi, lsi_path) : 82 | lsi.save(lsi_path) 83 | 84 | def load_lsi(lsi_path) : 85 | lsi = models.LsiModel.load(lsi_path) 86 | return lsi 87 | 88 | def corpus_to_lda(corpus, dictionary, topic_number) : 89 | model = models.LdaModel(corpus, id2word=dictionary, num_topics=topic_number) 90 | return model 91 | 92 | def save_lda(lda, lda_path) : 93 | lda.save(lda_path) 94 | 95 | def load_lda(lda_path) : 96 | lda = models.LdaModel.load(lda_path) 97 | return lda 98 | 99 | ''' 100 | python2.7 transform.py --dictionary=document.txt.dict --corpus=document.txt.mm --tfidf=document.txt.tfidf --lsi=document.txt.lsi --lda=document.txt.lda 101 | ''' 102 | if __name__ == '__main__': 103 | 104 | parser = OptionParser() 105 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 106 | parser.add_option("-d", "--dictionary", dest="dictionary",help="dictionary", metavar="DICT") 107 | parser.add_option("-c", "--corpus", dest="corpus",help="corpus", metavar="CORPUS") 108 | parser.add_option("-t", "--tfidf", dest="tfidf",help="tfidf, output file", metavar="TFIDF") 109 | parser.add_option("-l", "--lsi", dest="lsi",help="lsi, output file", metavar="LSI") 110 | parser.add_option("-a", "--lda", dest="lda",help="lda, output file", metavar="LDA") 111 | (options, args) = parser.parse_args() 112 | 113 | if options.verbose == 1 : VERBOSE = 1 114 | 115 | dictionary_path = options.dictionary 116 | if dictionary_path == None : 117 | parser.print_help() 118 | sys.exit(1) 119 | 120 | corpus_path = options.corpus 121 | if corpus_path == None : 122 | parser.print_help() 123 | sys.exit(1) 124 | 125 | tfidf_path = options.tfidf 126 | if tfidf_path == None : 127 | parser.print_help() 128 | sys.exit(1) 129 | 130 | lsi_path = options.lsi 131 | if lsi_path == None : 132 | parser.print_help() 133 | sys.exit(1) 134 | 135 | lda_path = options.lda 136 | if lda_path == None : 137 | parser.print_help() 138 | sys.exit(1) 139 | 140 | dictionary = load_dictionary(dictionary_path) 141 | corpus = load_corpus(corpus_path) 142 | 143 | tfidf = corpus_to_tfidf(corpus) 144 | save_tfidf(tfidf, 
tfidf_path) 145 | 146 | lsi = corpus_to_lsi(corpus, tfidf, dictionary, 10) 147 | save_lsi(lsi, lsi_path) 148 | 149 | lda = corpus_to_lda(corpus, dictionary, 10) 150 | save_lda(lda, lda_path) 151 | -------------------------------------------------------------------------------- /wordcount_spark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | import os 5 | import sys 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') 8 | from optparse import OptionParser 9 | 10 | from pyspark import SparkContext 11 | 12 | VERBOSE = 0 13 | 14 | def open_file(filename, mode) : 15 | try : fid = open(filename, mode) 16 | except : 17 | sys.stderr.write("open_file(), file open error : %s\n" % (filename)) 18 | exit(1) 19 | else : 20 | return fid 21 | 22 | def close_file(fid) : 23 | fid.close() 24 | 25 | def map_func(line) : 26 | words = line.split(' ') 27 | return map(lambda x: (x, 1), words) 28 | 29 | def reduce_func(a,b) : 30 | return a+b 31 | 32 | def map_func2(entry) : 33 | key,value = entry 34 | return (key,reduce(lambda a,b: a+b,value)) 35 | 36 | ''' 37 | usage : spark-submit --master yarn-client --total-executor-cores 100 --executor-memory 512M wordcount_spark.py -f input_file_on_hdfs 38 | ''' 39 | if __name__ == "__main__": 40 | parser = OptionParser() 41 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 42 | parser.add_option("-f", "--file", dest="file",help="file path in HDFS", metavar="FILE") 43 | (options, args) = parser.parse_args() 44 | 45 | if options.verbose == 1 : VERBOSE = 1 46 | 47 | file_path = options.file 48 | if file_path == None : 49 | parser.print_help() 50 | sys.exit(1) 51 | 52 | sc = SparkContext(appName="PythonWordCount") 53 | 54 | ''' 55 | # read from hdfs directory 56 | lines = sc.wholeTextFiles(file_path, 1) 57 | counts = lines.values().flatMap(lambda x: x.split(' ')) \ 58 | .map(lambda x: (x, 1)) \ 59 | .reduceByKey(lambda a, b: a + b) \ 60 | .sortBy(lambda x: x[1],ascending=False) 61 | counts.saveAsHadoopFile("gensim/output","org.apache.hadoop.mapred.TextOutputFormat") 62 | ''' 63 | 64 | lines = sc.textFile(file_path, 1) 65 | 66 | # save to hdfs 67 | counts = lines.flatMap(lambda x: x.split(' ')) \ 68 | .map(lambda x: (x, 1)) \ 69 | .reduceByKey(lambda a, b: a + b) \ 70 | .sortBy(lambda x: x[1],ascending=False) 71 | counts.saveAsHadoopFile("gensim/output","org.apache.hadoop.mapred.TextOutputFormat") 72 | 73 | ''' 74 | lines = sc.textFile(file_path, 1) 75 | # user defined map,reduce 76 | # map : string -> [(a,1),(b,1),..],[(a,1),(c,1),...],.... 77 | # flatMap : list of list -> [(a,1),(b,1),....,(a,1),(c,1),....] 78 | # reduceByKey : group by key -> [(a,(1,1,1,....)),(b,(1,1,1)),(c,(1,1,1,1,...)),...] 79 | # : reduce value list -> [(a,10),(b,3),(c,17),....] 80 | # sortBy : [(a,10),(b,3),(c,17),....] -> [(c,17),(a,10),(b,3),....]
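# (added note) reduceByKey here, as in the live pipeline above, does a map-side
# combine: each partition first merges its own values per key, so only one
# partial count per key is shuffled. The groupByKey variant further below ships
# every (word, 1) pair across the network before map_func2 sums the grouped
# values, which is why reduceByKey is the cheaper of the two for word counting.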
81 | counts = lines.map(map_func) \ 82 | .flatMap(lambda x: x) \ 83 | .reduceByKey(reduce_func) \ 84 | .sortBy(lambda x: x[1],ascending=False) 85 | counts.saveAsHadoopFile("gensim/output","org.apache.hadoop.mapred.TextOutputFormat") 86 | ''' 87 | 88 | ''' 89 | lines = sc.textFile(file_path, 1) 90 | # user defined map,reduce 91 | counts = lines.map(map_func) \ 92 | .flatMap(lambda x: x) \ 93 | .groupByKey() \ 94 | .map(map_func2) \ 95 | .sortBy(lambda x: x[1],ascending=False) 96 | output = counts.collect() 97 | for key,value in output : 98 | print key + "\t" + str(value) 99 | ''' 100 | 101 | ''' 102 | lines = sc.textFile(file_path, 1) 103 | # save to local 104 | counts = lines.flatMap(lambda x: x.split(' ')) \ 105 | .map(lambda x: (x, 1)) \ 106 | .reduceByKey(lambda a, b: a + b) 107 | output = counts.collect() 108 | fd = open_file("output.txt",'w') 109 | for (word, count) in output: 110 | fd.write("%s\t%s\n" % (word,count)) 111 | close_file(fd) 112 | ''' 113 | 114 | ''' 115 | lines = sc.textFile(file_path, 1) 116 | # test groupByKey 117 | group = lines.flatMap(lambda x: x.split(' ')).map(lambda x: (x, 1)).groupByKey() 118 | output = group.collect() 119 | for (word,count_list) in output : 120 | print word + "\t" + ','.join(map(lambda x: str(x),count_list)) 121 | ''' 122 | 123 | sc.stop() 124 | --------------------------------------------------------------------------------