├── README.md ├── build_lm.sh ├── build_word2vec.py ├── c2d.py ├── dist_lda.py ├── dist_lsi.py ├── document_to_corpus.py ├── env.sh ├── fetch_url.go ├── hello.go ├── images ├── Bernoulli_distribution_estimation_1.png ├── Bernoulli_distribution_estimation_2.png ├── Bernoulli_distribution_estimation_3.png ├── Bernoulli_distribution_estimation_4.png ├── DNC.png ├── GLU.png ├── MindMeld_MessagingInterfacesDemystified.pdf ├── SGNS_1.png ├── SGNS_2.png ├── additive_multiplicative_attention.png ├── aho-corasick.png ├── allocation-weighting.png ├── alpha_beta.png ├── attention_def1.png ├── attention_def2.png ├── attention_def3.png ├── backward-beta.jpeg ├── binomial_estimation_1.png ├── binomial_estimation_2.png ├── binomial_estimation_3.png ├── binomial_estimation_4.png ├── bm.jpg ├── brat_sejong.png ├── cmp_1.png ├── cmp_10.png ├── cmp_11.png ├── cmp_12.png ├── cmp_13.png ├── cmp_14.png ├── cmp_15.png ├── cmp_2.png ├── cmp_3.png ├── cmp_4.png ├── cmp_5.png ├── cmp_6.png ├── cmp_7.png ├── cmp_8.png ├── cmp_9.png ├── cnn_embedding.png ├── content-based-addressing.png ├── conv_1.jpeg ├── cross_entropy_loss.png ├── cross_entropy_loss_many_output.png ├── deptree.png ├── distribution_function.png ├── du_1.png ├── du_2.png ├── entropy_1.jpg ├── entropy_10.jpg ├── entropy_11.jpg ├── entropy_12.jpg ├── entropy_2.jpg ├── entropy_3.jpg ├── entropy_4.jpg ├── entropy_5.jpg ├── entropy_6.jpg ├── entropy_7.jpg ├── entropy_8.jpg ├── entropy_9.jpg ├── expectation.png ├── forward-alpha.jpeg ├── forward_backward_var.png ├── four_equation.png ├── hierarchical_attention.png ├── hmm_1.png ├── hmm_2.png ├── hmm_3.png ├── hmm_4.png ├── hmm_5.png ├── hmm_6.png ├── hmm_7.png ├── kmp.jpg ├── layer_norm_timesteps.png ├── me_1.png ├── me_2.png ├── ml_1.png ├── ml_2.png ├── ml_3.png ├── ml_4.png ├── mult_head_self_attention.png ├── multi_dimensional_self_attention.png ├── multi_headed_attention_1.png ├── multi_headed_attention_2.png ├── multinomial_estimation_1.png ├── multinomial_estimation_2.png ├── multinomial_estimation_3.png ├── ner_attention.jpg ├── ner_attention_math1.jpg ├── ner_attention_math2.jpg ├── ngram_cnn_highway_1.png ├── ngram_cnn_highway_2.png ├── nn_1.jpeg ├── nn_2.jpeg ├── nn_3.jpeg ├── nn_4.jpeg ├── nn_5.jpeg ├── ntm-addressing.png ├── ntm-content-addressing.png ├── ntm-interface-vector.png ├── ntm-interpolation.png ├── ntm-lstm.png ├── ntm-pseudocode.png ├── ntm-sharpen.png ├── ntm-shift.png ├── ntm-test.png ├── ntm-train.png ├── ntm.png ├── p-value.png ├── partition.png ├── pstree.png ├── re_attention_1.png ├── re_attention_2.png ├── read-vector.png ├── regularization.jpeg ├── retention-vector.png ├── scaled_dot_product_attention.png ├── sejong_entry.png ├── self-attention-map.png ├── self-attention.png ├── self_attention_with_fnn.png ├── seq2seq_attention_machanism.jpg ├── seq2seq_attention_machanism.png ├── seq2seq_autoencoder.jpeg ├── time_invariant_self_attention.png ├── time_invariant_self_attention_full.png ├── transformer_model.png ├── traversal_london.png ├── url_sejong.png ├── usage-vector.png ├── variance.png ├── vbox_port.png ├── viterbi.png ├── wor2vec_visualizer.png ├── word2vec_1.jpeg ├── word2vec_2.jpeg ├── word2vec_3.jpeg ├── word2vec_4.jpeg ├── word2vec_5.jpeg ├── workbench_fatal.png ├── write-operation.png └── write-weight-vector.png ├── keras_mlp.py ├── make_bdb.py ├── make_leveldb.py ├── make_lmdb.c ├── make_lmdb.py ├── multiplexing.go ├── ngram.cc ├── queue.go ├── search_bdb.py ├── search_leveldb.py ├── search_lmdb.c ├── search_lmdb.py ├── search_word2vec.py ├── similarity.py ├── 
stack.go ├── test_numpy.py ├── test_theano.py ├── transform.py └── wordcount_spark.py
/README.md:
--------------------------------------------------------------------------------
1 | ### WIKI
2 | - Natural Language Processing
3 | - Development
4 | - Algorithm
5 | - Machine Learning
6 | - details in [Wiki](https://github.com/dsindex/blog/wiki)
7 | 
8 | ### Sources
9 | - sources referenced by the wiki
10 | 
11 | ### Pretty Viewer
12 | - [dsindex.github.io](http://dsindex.github.io/)
13 | 
--------------------------------------------------------------------------------
/build_lm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | set -o nounset
4 | set -o errexit
5 | 
6 | VERBOSE_MODE=0
7 | 
8 | function error_handler()
9 | {
10 | local STATUS=${1:-1}
11 | [ ${VERBOSE_MODE} == 0 ] && exit ${STATUS}
12 | echo "Exits abnormally at line "`caller 0`
13 | exit ${STATUS}
14 | }
15 | trap "error_handler" ERR
16 | 
17 | PROGNAME=`basename ${BASH_SOURCE}`
18 | DRY_RUN_MODE=0
19 | 
20 | function print_usage_and_exit()
21 | {
22 | set +x
23 | local STATUS=$1
24 | echo "Usage: ${PROGNAME} [-v] [-v] [--dry-run] [-h] [--help]"
25 | echo ""
26 | echo " Options -"
27 | echo " -v enables verbose mode 1"
28 | echo " -v -v enables verbose mode 2"
29 | echo " --dry-run show what would have been dumped"
30 | echo " -h, --help shows this help message"
31 | exit ${STATUS:-0}
32 | }
33 | 
34 | function debug()
35 | {
36 | if [ "$VERBOSE_MODE" != 0 ]; then
37 | echo $@
38 | fi
39 | }
40 | 
41 | GETOPT=`getopt -o vh --long dry-run,help -n "${PROGNAME}" -- "$@"`
42 | if [ $? != 0 ] ; then print_usage_and_exit 1; fi
43 | 
44 | eval set -- "${GETOPT}"
45 | 
46 | while true
47 | do case "$1" in
48 | -v) let VERBOSE_MODE+=1; shift;;
49 | --dry-run) DRY_RUN_MODE=1; shift;;
50 | -h|--help) print_usage_and_exit 0;;
51 | --) shift; break;;
52 | *) echo "Internal error!"; exit 1;;
53 | esac
54 | done
55 | 
56 | if (( VERBOSE_MODE > 1 )); then
57 | set -x
58 | fi
59 | 
60 | 
61 | # end of template area.
62 | # -----------------------------------------------------------------------------
63 | if [ ${#} != 0 ]; then print_usage_and_exit 1; fi
64 | 
65 | # current dir of this script
66 | CDIR=$(readlink -f $(dirname $(readlink -f ${BASH_SOURCE[0]})))
67 | 
68 | [[ -f ${CDIR}/env.sh ]] && . ${CDIR}/env.sh || exit
69 | 
70 | # -----------------------------------------------------------------------------
71 | # functions
72 | 
73 | 
74 | 
75 | # end functions
76 | # -----------------------------------------------------------------------------
77 | 
78 | # -----------------------------------------------------------------------------
79 | # main
80 | 
81 | make_calmness
82 | child_verbose=""
83 | if (( VERBOSE_MODE > 1 )); then
84 | revert_calmness
85 | child_verbose="-v -v"
86 | fi
87 | 
88 | ${IRSTLM}/dict -InputFile=${DOC} -OutputFile=${DICT} -Freq=yes -sort=no
89 | ${IRSTLM}/split-dict.pl --input ${DICT} --output ${DICT}.
--parts ${SPLIT} 90 | for subdict in `ls ${DICT}.*` 91 | do 92 | filename=$(basename "$subdict") 93 | extension="${filename##*.}" 94 | ${IRSTLM}/ngt -InputFile=${DOC} -FilterDict=${filename} -NgramSize=${NGRAM_SIZE} -OutputFile=${NGRAM}.${extension} -OutputGoogleFormat=yes 95 | done 96 | 97 | for subngram in `ls ${NGRAM}.*` 98 | do 99 | filename=$(basename "$subngram") 100 | extension="${filename##*.}" 101 | ${IRSTLM}/build-sublm.pl --size ${NGRAM_SIZE} --ngrams ${subngram} --sublm ${LM}.${extension} 102 | done 103 | 104 | ${IRSTLM}/merge-sublm.pl --size ${NGRAM_SIZE} --sublm ${LM} -lm ${iARPA}.gz 105 | 106 | function optional { 107 | ${IRSTLM}/quantize-lm ${iARPA} ${qARPA} 108 | } 109 | 110 | gunzip ${iARPA}.gz 111 | ${IRSTLM}/compile-lm --text=yes ${iARPA} ${ARPA} 112 | 113 | ${KENLM}/build_binary -s -i -w mmap ${ARPA} ${ARPA}.mmap 114 | 115 | close_fd 116 | 117 | # end main 118 | # ----------------------------------------------------------------------------- 119 | -------------------------------------------------------------------------------- /build_word2vec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | ''' 5 | read http://radimrehurek.com/gensim/models/word2vec.html 6 | here is test code 7 | ''' 8 | 9 | import os 10 | import sys 11 | reload(sys) 12 | sys.setdefaultencoding('utf-8') 13 | import re 14 | from optparse import OptionParser 15 | import time 16 | from gensim.models import word2vec,phrases 17 | import logging 18 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 19 | 20 | def build_model(corpus_path, detect_phrase=False) : 21 | startTime = time.time() 22 | 23 | sentences = word2vec.LineSentence(corpus_path) 24 | if detect_phrase : 25 | bigram_transformer = phrases.Phrases(sentences) 26 | model = word2vec.Word2Vec(bigram_transformer[sentences], size=100, alpha=0.025, window=5, min_count=5, sample=1e-5, workers=4, sg=1) 27 | else : 28 | model = word2vec.Word2Vec(sentences, size=100, alpha=0.025, window=5, min_count=5, sample=1e-5, workers=4, sg=1) 29 | # no more training 30 | model.init_sims(replace=True) 31 | durationTime = time.time() - startTime 32 | sys.stderr.write("duration time = %f\n" % durationTime) 33 | return model 34 | 35 | def save_model(model, model_path) : 36 | model.save(model_path) 37 | 38 | ''' 39 | python2.7 build_word2vec.py -c corpus.txt -m corpus.txt.model 40 | ''' 41 | if __name__ == '__main__': 42 | 43 | parser = OptionParser() 44 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 45 | parser.add_option("-c", "--corpus", dest="corpus",help="corpus path", metavar="CORPUS") 46 | parser.add_option("-m", "--model", dest="model",help="model path, output file", metavar="MODEL") 47 | (options, args) = parser.parse_args() 48 | 49 | if options.verbose == 1 : VERBOSE = 1 50 | 51 | corpus_path = options.corpus 52 | if corpus_path == None : 53 | parser.print_help() 54 | sys.exit(1) 55 | 56 | model_path = options.model 57 | if model_path == None : 58 | parser.print_help() 59 | sys.exit(1) 60 | 61 | model = build_model(corpus_path) 62 | save_model(model, model_path) 63 | -------------------------------------------------------------------------------- /c2d.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | import os 5 | from optparse import OptionParser 6 | 7 | # global variable 8 | 
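# c2d.py : reads Sejong-treebank-style constituency parses from stdin
# (blank-line-separated blocks: a '; ...' sentence line followed by a
# parenthesized tree) and prints each one back as a constituent tree (mode 0),
# a dependency tree (mode 1), or an embedded phrase/clause structure (mode 2).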
VERBOSE = 0
9 | 
10 | import sys
11 | reload(sys)
12 | sys.setdefaultencoding('utf-8')
13 | 
14 | # -------------------------------------------------------------------------
15 | # build tree
16 | # -------------------------------------------------------------------------
17 | def next_paren(tokens, i) :
18 | '''
19 | starting from tokens[i], search for the position of the next '(' or ')'
20 | returns -1 if not found
21 | '''
22 | j = 0
23 | found = False
24 | for token in tokens[i:] :
25 | if token == '(' or token == ')' :
26 | found = True
27 | break
28 | j += 1
29 | if found : return i + j
30 | return -1
31 | 
32 | def node_string(node, enable_eoj=True) :
33 | if node['leaf'] :
34 | if enable_eoj :
35 | return '(' + node['label'] + ' ' + node['eoj'] + '/' + str(node['eoj_idx']) + ' ' + node['morphs'] + ')'
36 | else :
37 | return '(' + node['label'] + ' ' + node['morphs'] + ')'
38 | else :
39 | return '(' + node['label'] + ')'
40 | 
41 | def create_node(tokens, i, j) :
42 | '''
43 | the span i ~ j holds the label and morphs
44 | i + 1 = j : label
45 | ex) '( NP ('
46 | i j
47 | i + 1 < j : label,morphs
48 | ex) '( NP_MOD 프랑스/NNP+의/JKG )'
49 | i j
50 | '''
51 | node = {'lchild':{}, 'rchild':{}, 'parent':{}, 'sibling':{}}
52 | if i + 1 == j :
53 | node['label'] = tokens[i]
54 | node['leaf'] = False
55 | return node
56 | elif i + 1 < j :
57 | node['label'] = tokens[i]
58 | node['morphs'] = tokens[i+1]
59 | node['leaf'] = True
60 | node['nleaf'] = {}
61 | node['pleaf'] = {}
62 | return node
63 | else :
64 | return None
65 | 
66 | def make_edge(top, node) :
67 | if not top['lchild'] : # link to left child
68 | top['lchild'] = node
69 | node['parent'] = top
70 | if VERBOSE : print node_string(top) + '-[left]->' + node_string(node)
71 | elif not top['rchild'] : # link to right child
72 | top['rchild'] = node
73 | node['parent'] = top
74 | top['lchild']['sibling'] = node
75 | if VERBOSE : print node_string(top) + '-[right]->' + node_string(node)
76 | else :
77 | return False
78 | return True
79 | 
80 | def make_leaf_edge(node, history, depth=0) :
81 | '''
82 | connect next/prev links between the leaves of the tree,
83 | i.e., set node['nleaf'] and node['pleaf']
84 | '''
85 | if node['leaf'] :
86 | length = len(history)
87 | if length != 0 :
88 | prev = history[-1]
89 | prev['nleaf'] = node
90 | node['pleaf'] = prev
91 | history.append(node)
92 | 
93 | if node['lchild'] :
94 | make_leaf_edge(node['lchild'], history, depth+1)
95 | if node['rchild'] :
96 | make_leaf_edge(node['rchild'], history, depth+1)
97 | 
98 | def build_tree(sent, tokens) :
99 | '''
100 | sent = ; 프랑스의 세계적인 의상 디자이너 엠마누엘 웅가로가 실내 장식용 직물 디자이너로 나섰다.
101 | tokens = ( S ( NP_SBJ ( NP ( NP_MOD 프랑스/NNP+의/JKG ) \
102 | ( NP ( VNP_MOD 세계/NNG+적/XSN+이/VCP+ᆫ/ETM ) ( NP ( NP 의상/NNG ) ( NP 디자이너/NNG ) ) ) ) \
103 | ( NP_SBJ ( NP 엠마누엘/NNP ) ( NP_SBJ 웅가로/NNP+가/JKS ) ) ) \
104 | ( VP ( NP_AJT ( NP ( NP ( NP 실내/NNG ) ( NP 장식/NNG+용/XSN ) ) ( NP 직물/NNG ) ) \
105 | ( NP_AJT 디자이너/NNG+로/JKB ) ) ( VP 나서/VV+었/EP+다/EF+./SF ) ) )
106 | '''
107 | err = ' '.join(tokens)
108 | root = {'lchild':{}, 'rchild':{}, 'parent':{}, 'sibling':{}, 'leaf':False, 'label':'ROOT'}
109 | stack = []
110 | stack.append(root)
111 | max = len(tokens)
112 | i = 0
113 | eoj_idx = 1
114 | eoj_max = len(sent)
115 | while i < max :
116 | token = tokens[i]
117 | if token == '(' : # create node and push
118 | j = next_paren(tokens, i+1)
119 | if j == -1 or i+1 == j :
120 | sys.stderr.write("ill-formed parentheses[1] : %s\n" % (err))
121 | return None
122 | node = create_node(tokens, i+1, j)
123 | if not node : return None
124 | # assign eoj/eoj_idx to leaf node
125 | if node['leaf'] :
126 | if eoj_idx >= eoj_max :
127 | sys.stderr.write("not aligned sentence %s : %s\n" % (' '.join(sent), err))
128 | return None
129 | node['eoj'] = sent[eoj_idx]
130 | node['eoj_idx'] = eoj_idx
131 | eoj_idx += 1
132 | if VERBOSE : print node_string(node)
133 | # push to stack
134 | stack.append(node)
135 | if token == ')' :
136 | # pop and make edge
137 | if len(stack) == 0 :
138 | sys.stderr.write("ill-formed parentheses[2] : %s\n" % (err))
139 | return None
140 | node = stack.pop()
141 | if len(stack) == 0 :
142 | sys.stderr.write("ill-formed parentheses[3] : %s\n" % (err))
143 | return None
144 | top = stack[-1]
145 | if not make_edge(top, node) :
146 | sys.stderr.write("can't make edge : %s\n" % (err))
147 | return None
148 | i += 1
149 | 
150 | if len(stack) == 1 and stack[-1]['label'] == 'ROOT' :
151 | history = []
152 | make_leaf_edge(root['lchild'], history, depth=0)
153 | return root
154 | else :
155 | sys.stderr.write("build failure : %s\n" % (err))
156 | return None
157 | # -------------------------------------------------------------------------
158 | 
159 | # -------------------------------------------------------------------------
160 | # preprocessing
161 | # -------------------------------------------------------------------------
162 | def modify_illformed_1(tokens) :
163 | # ex) '( NP ( NP ( NP ( NP+포로/NNG )'
164 | # the token right after '(' should be a label; if it contains '+', split only at the first '+' (label before it, morphs after it)
165 | n_tokens = []
166 | max = len(tokens)
167 | i = 0
168 | while i < max :
169 | token = tokens[i]
170 | if token == '(' :
171 | n_tokens.append(token)
172 | if '+' in tokens[i+1] :
173 | t_list = tokens[i+1].split('+')
174 | n_tokens.append(t_list[0]) # label
175 | n_tokens.append(''.join(t_list[1:])) # morphs
176 | i += 1
177 | else :
178 | n_tokens.append(token)
179 | i += 1
180 | return n_tokens
181 | 
182 | def tokenize(bucket) :
183 | '''
184 | * convert the input into whitespace-separated tokens so it is easier to handle.
185 | e.g.) bucket
186 | ; 프랑스의 세계적인 의상 디자이너 엠마누엘 웅가로가 실내 장식용 직물 디자이너로 나섰다.
187 | (S (NP_SBJ (NP (NP_MOD 프랑스/NNP + 의/JKG)
188 | (NP (VNP_MOD 세계/NNG + 적/XSN + 이/VCP + ᆫ/ETM)
189 | (NP (NP 의상/NNG)
190 | (NP 디자이너/NNG))))
191 | (NP_SBJ (NP 엠마누엘/NNP)
192 | (NP_SBJ 웅가로/NNP + 가/JKS)))
193 | (VP (NP_AJT (NP (NP (NP 실내/NNG)
194 | (NP 장식/NNG + 용/XSN))
195 | (NP 직물/NNG))
196 | (NP_AJT 디자이너/NNG + 로/JKB))
197 | (VP 나서/VV + 었/EP + 다/EF + ./SF)))
198 | '''
199 | sent = bucket[0].split()
200 | if sent[0] != ';' : return None,None
201 | paren_parse = ' '.join([s.strip('\t').replace('\t',' ') for s in bucket[1:]])
202 | paren_parse = paren_parse.replace(' + ','+')
203 | paren_parse = paren_parse.replace('(/','^[/').replace(')/','^]/')
204 | paren_parse = paren_parse.replace('(',' ( ').replace(')',' ) ')
205 | paren_parse = paren_parse.replace('^[/','(/').replace('^]/',')/')
206 | paren_parse = paren_parse.replace('+ ','+')
207 | tokens = paren_parse.split()
208 | tokens = modify_illformed_1(tokens)
209 | 
210 | if VERBOSE : print ' '.join(tokens)
211 | return sent, tokens
212 | # -------------------------------------------------------------------------
213 | 
214 | # -------------------------------------------------------------------------
215 | # tree traversal
216 | # -------------------------------------------------------------------------
217 | def tree2tokens(node, tokens, depth=0) :
218 | '''
219 | the input was tokenized before being turned into a tree;
220 | here we go the other way and rebuild that tokenization from the tree.
221 | '''
222 | if node['leaf'] :
223 | tokens.append('(')
224 | tokens.append(node['label'])
225 | tokens.append(node['morphs'])
226 | tokens.append(')')
227 | else :
228 | tokens.append('(')
229 | tokens.append(node['label'])
230 | 
231 | if node['lchild'] :
232 | tree2tokens(node['lchild'], tokens, depth=depth+1)
233 | if not node['rchild'] :
234 | tokens.append(')') # closed
235 | if node['rchild'] :
236 | tree2tokens(node['rchild'], tokens, depth=depth+1)
237 | tokens.append(')') # closed
238 | 
239 | def modify_morphs(morphs) :
240 | try :
241 | t_morphs = morphs.replace('++/','+\t/') # + -> tab
242 | t_morphs = t_morphs.replace('+',' + ')
243 | t_morphs = t_morphs.replace('\t','+') # tab -> +
244 | except :
245 | return morphs
246 | return t_morphs
247 | 
248 | def tree2con(node, tokens, history, depth=0) :
249 | '''
250 | once the input has been turned into a tree, this prints the tree
251 | back out in the same form as the input (constituent / phrase structure).
252 | '''
253 | if depth == 0 : prev_node = None
254 | else : prev_node = history[-1]
255 | if prev_node and prev_node['leaf'] : # if the node printed just before was a leaf
256 | tokens.append('\n')
257 | for i in xrange(depth) :
258 | tokens.append('\t')
259 | 
260 | if node['leaf'] :
261 | tokens.append('(' + node['label'] + ' ' + modify_morphs(node['morphs']) + ')')
262 | else :
263 | tokens.append('(' + node['label'] + '\t')
264 | history.append(node)
265 | 
266 | if node['lchild'] :
267 | tree2con(node['lchild'], tokens, history, depth+1)
268 | if not node['rchild'] :
269 | tokens.append(')') # closed
270 | if node['rchild'] :
271 | tree2con(node['rchild'], tokens, history, depth+1)
272 | tokens.append(')') # closed
273 | 
274 | def is_vx(gov_node) :
275 | morphs = gov_node['morphs']
276 | tokens = morphs.split('+')
277 | if '/VX' in tokens[0] : return True
278 | # verbs that are not VX but behave like VX, ex) '지니게 되다'
279 | if '되/' in tokens[0] :
280 | pleaf = None
281 | if gov_node['pleaf'] : pleaf = gov_node['pleaf']
282 | if pleaf :
283 | morphs = pleaf['morphs']
284 | tokens = morphs.split('+')
285 | if '게/EC' in tokens[-1] : return True
286 | if '면/EC' in tokens[-1] : return True
287 | if '아도/EC' in tokens[-1] : return True
288 | if '않/' in tokens[0] :
289 | pleaf = None
290 | if gov_node['pleaf'] : pleaf = gov_node['pleaf']
291 | if pleaf :
292 | morphs = pleaf['morphs']
293 | tokens = morphs.split('+')
294 | if '지/EC' in tokens[-1] : return True
295 | return False
296 | 
297 | def is_vnp(morphs) :
298 | tokens = morphs.split('+')
299 | if len(tokens) <= 2 : return False
300 | if '/NNB' in tokens[0] and '/VCP' in tokens[1] : return True
301 | return False
302 | 
303 | def is_va(morphs) :
304 | tokens = morphs.split('+')
305 | # also covers cases mistagged as '/VV'
306 | if '있/VA' in tokens[0] or \
307 | '있/VV' in tokens[0] or \
308 | '없/VA' in tokens[0] or \
309 | '없/VV' in tokens[0] or \
310 | '같/VA' in tokens[0] : return True
311 | else : return False
312 | 
313 | def is_nnb(morphs) :
314 | tokens = morphs.split('+')
315 | if '/NNB' in tokens[0] : return True
316 | return False
317 | 
318 | def is_etm(morphs) :
319 | tokens = morphs.split('+')
320 | if 'ᆫ/ETM' in tokens[-1] : return True
321 | if '는/ETM' in tokens[-1] : return True
322 | if 'ᆯ/ETM' in tokens[-1] : return True
323 | if '을/ETM' in tokens[-1] : return True
324 | if '를/ETM' in tokens[-1] : return True
325 | return False
326 | 
327 | def check_vx_rule(gov_node) :
328 | if not gov_node['parent'] : return False
329 | if not gov_node['parent']['lchild'] : return False
330 | if not is_vx(gov_node) : return False
331 | return True
332 | 
333 | def check_vnp_rule(gov_node) :
334 | if not gov_node['parent'] : return False
335 | if not gov_node['parent']['lchild'] : return False
336 | # check for the 'VNP 것/NNB + 이/VCP + 다/EF' pattern
337 | if not is_vnp(gov_node['morphs']) : return False
338 | return True
339 | 
340 | def check_va_rule(gov_node) :
341 | if not gov_node['parent'] : return False
342 | if not gov_node['parent']['lchild'] : return False
343 | # check for the 'ㄹ NNB 있다/없다/같다' pattern
344 | # for 'NNB' it is enough that the eojeol starts with NNB, i.e., forms like '~ㄹ 수가 없다' are also allowed
345 | if is_va(gov_node['morphs']) :
346 | pleaf = None
347 | if gov_node['pleaf'] : pleaf = gov_node['pleaf']
348 | if pleaf and is_nnb(pleaf['morphs']) :
349 | ppleaf = None
350 | if pleaf['pleaf'] :
351 | ppleaf = pleaf['pleaf']
352 | if ppleaf and is_etm(ppleaf['morphs']) :
353 | return True
354 | return False
355 | 
356 | def find_for_vx_rule(node, gov_node) :
357 | found = None
358 | t_next = gov_node['parent']
359 | while t_next :
360 | # the new governor must not come before, or be the same as, the current node
361 | if t_next['leaf'] and ('VP' in t_next['label'] or 'VNP' in t_next['label']) and t_next['eoj_idx'] > node['eoj_idx'] :
362 | found = t_next
363 | break
364 | if t_next['lchild'] :
365 | if 'S' in t_next['lchild']['label'] or 'VP' in t_next['lchild']['label'] or 'VNP' in t_next['lchild']['label'] :
366 | t_next = t_next['lchild']
367 | continue
368 | if t_next['rchild'] :
369 | if 'VP' in t_next['rchild']['label'] or 'VNP' in t_next['rchild']['label'] :
370 | t_next = t_next['rchild']
371 | continue
372 | t_next = t_next['lchild']
373 | return found
374 | 
375 | def find_for_vnp_rule(node, gov_node) :
376 | found = None
377 | t_next = gov_node['parent']
378 | while t_next :
379 | # the new governor must not come before, or be the same as, the current node
380 | if t_next['leaf'] and ('VP' in t_next['label'] or 'VNP' in t_next['label']) and t_next['eoj_idx'] > node['eoj_idx'] :
381 | # and the new governor must not be too far from the old one
382 | if abs(gov_node['eoj_idx'] - t_next['eoj_idx']) <= 3 :
383 | found = t_next
384 | break
385 | if t_next['lchild'] :
386 | if 'S' in t_next['lchild']['label'] or 'VP' in t_next['lchild']['label'] or 'VNP' in t_next['lchild']['label'] :
387 | t_next = t_next['lchild']
388 | continue
389 | if t_next['rchild'] :
390 | if 'VP' in t_next['rchild']['label'] or 'VNP' in t_next['rchild']['label'] :
391 | t_next = t_next['rchild']
392 | continue
393 | t_next = t_next['lchild']
394 | return found
395 | 
396 | def find_for_va_rule(node, gov_node, search_mode=1) :
397 | found = None
398 | if search_mode == 2 : # when the search has to start from parent->parent
399 | t_next = gov_node['parent']
400 | if t_next and t_next['parent'] :
401 | t_next = t_next['parent']
402 | else : # the usual case
403 | t_next = gov_node['parent']
404 | while t_next :
405 | # the new governor must not come before, or be the same as, the current node
406 | if t_next['leaf'] and ('VP' in t_next['label'] or 'VNP' in t_next['label']) and t_next['eoj_idx'] > node['eoj_idx'] :
407 | # and the new governor must not be too far from the old one
408 | if abs(gov_node['eoj_idx'] - t_next['eoj_idx']) <= 3 :
409 | found = t_next
410 | break
411 | t_next = t_next['lchild']
412 | return found
413 | 
414 | def find_gov(node) :
415 | '''
416 | * node = leaf node
417 | 
418 | 1. head final rule
419 | - follow parent links from the current node;
420 | at the first node that has a right child,
421 | descend along its right children down to a leaf node
422 | 2. VX rule
423 | - if the governor is an auxiliary verb, replace it with the main verb.
424 | - verbs that are not auxiliaries but behave like them are handled similarly. ex) '지니게 되다'
425 | 3. VNP rule
426 | - if the governor has the form 'VNP 것/NNB + 이/VCP + 다/EF', replace it with the preceding verb.
427 | 4. VA rule
428 | - if the governor is '있/VA, 없/VA, 같/VA' and an 'ㄹ NNB' pattern precedes it, replace it with the preceding verb.
429 | the node['pleaf'] links are used for this.
430 | '''
431 | # find the first node that has a right child,
432 | # using the sibling links.
433 | next = node
434 | found = None
435 | while next :
436 | if next['sibling'] :
437 | found = next['sibling']['parent']
438 | break
439 | next = next['parent']
440 | 
441 | gov_node = None
442 | if found :
443 | # follow right children down to a leaf node
444 | next = found
445 | while next :
446 | if next['leaf'] :
447 | gov_node = next
448 | # -----------------------------------------------------------------
449 | # if gov_node satisfies the vx rule, follow parent->lchild.
450 | if check_vx_rule(gov_node) :
451 | new_gov_node = find_for_vx_rule(node, gov_node)
452 | if new_gov_node : gov_node = new_gov_node
453 | # if gov_node satisfies the vnp rule, follow parent->lchild.
454 | if check_vnp_rule(gov_node) :
455 | new_gov_node = find_for_vnp_rule(node, gov_node)
456 | if new_gov_node :
457 | gov_node = new_gov_node
458 | # if the new governor is '있다,없다,같다',
459 | # run it through check_va_rule once more.
460 | if check_va_rule(gov_node) :
461 | new_gov_node = find_for_va_rule(node, gov_node, search_mode=2)
462 | if new_gov_node : gov_node = new_gov_node
463 | # if gov_node satisfies the va rule, follow parent->lchild.
464 | if check_va_rule(gov_node) :
465 | new_gov_node = find_for_va_rule(node, gov_node, search_mode=1)
466 | if new_gov_node : gov_node = new_gov_node
467 | # -----------------------------------------------------------------
468 | break
469 | next = next['rchild']
470 | if gov_node :
471 | return gov_node['eoj_idx']
472 | return 0
473 | 
474 | 
475 | def tree2dep(node, depth=0) :
476 | '''
477 | extract the dependency structure from the tree.
478 | '''
479 | if node['leaf'] :
480 | eoj_idx = node['eoj_idx']
481 | eoj = node['eoj']
482 | morphs = modify_morphs(node['morphs'])
483 | label = node['label']
484 | gov = find_gov(node)
485 | out = [eoj_idx, eoj, morphs, label, gov]
486 | print '\t'.join([str(e) for e in out])
487 | if node['lchild'] :
488 | tree2dep(node['lchild'], depth+1)
489 | if node['rchild'] :
490 | tree2dep(node['rchild'], depth+1)
491 | 
492 | def find_ep(node) :
493 | '''
494 | follow parent links to the first node that is not VP_MOD, VNP_MOD, or S_MOD;
495 | that node's leftmost leaf = ep begin
496 | that node's rightmost leaf = ep end
497 | '''
498 | next = node
499 | found = None
500 | while next :
501 | if next['label'] not in ['VP_MOD','VNP_MOD','S_MOD'] :
502 | found = next
503 | break
504 | next = next['parent']
505 | 
506 | left_ep = None
507 | right_ep = None
508 | if found :
509 | # follow left children down to a leaf node
510 | next = found
511 | while next :
512 | if next['leaf'] :
513 | left_ep = next
514 | break
515 | next = next['lchild']
516 | # follow right children down to a leaf node
517 | next = found
518 | while next :
519 | if next['leaf'] :
520 | right_ep = next
521 | break
522 | next = next['rchild']
523 | if left_ep and right_ep :
524 | return left_ep['eoj_idx'], right_ep['eoj_idx']
525 | return 0,0
526 | 
527 | def is_ec(morphs) :
528 | tokens = morphs.split('+')
529 | if '/EC' in tokens[-1] : return True
530 | if '/SP' in tokens[-1] and len(tokens) >= 2 and '/EC' in tokens[-2] : return True
531 | return False
532 | 
533 | def find_sp(node) :
534 | '''
535 | follow parent links to the first node that is not VP, S, or VNP_CMP;
536 | however, the current node must be its parent's right child.
537 | for the node just before the stop,
538 | its leftmost leaf = sp begin
539 | '''
540 | next = node
541 | prev = None
542 | found = None
543 | while next :
544 | if next['label'] not in ['VP','S','VNP_CMP'] :
545 | found = prev
546 | break
547 | if next['sibling'] :
548 | found = next
549 | break
550 | prev = next
551 | next = next['parent']
552 | 
553 | left_sp = None
554 | if found :
555 | # follow left children down to a leaf node
556 | next = found
557 | while next :
558 | if next['leaf'] :
559 | left_sp = next
560 | break
561 | next = next['lchild']
562 | if left_sp :
563 | return left_sp['eoj_idx']
564 | return 0
565 | 
566 | def tree2embedded(node, depth=0) :
567 | '''
568 | extract the embedded phrase/clause structure from the tree.
569 | '''
570 | if node['leaf'] :
571 | eoj_idx = node['eoj_idx']
572 | eoj = node['eoj']
573 | morphs = modify_morphs(node['morphs'])
574 | label = node['label']
575 | gov = find_gov(node)
576 | ep_begin = 0
577 | ep_end = 0
578 | if label in ['VP_MOD','VNP_MOD'] :
579 | ep_begin,ep_end = find_ep(node)
580 | sp_begin = 0
581 | sp_end = 0
582 | if label in ['VP','VNP','VNP_CMP'] and is_ec(node['morphs']) :
583 | sp_begin = find_sp(node)
584 | if sp_begin != 0 :
585 | sp_end = eoj_idx
586 | if sp_begin == sp_end : # meaningless when they are the same
587 | sp_begin = 0
588 | sp_end = 0
589 | out = [eoj_idx, eoj, morphs, label, gov, ep_begin, ep_end, sp_begin, sp_end]
590 | print '\t'.join([str(e) for e in out])
591 | if node['lchild'] :
592 | tree2embedded(node['lchild'], depth+1)
593 | if node['rchild'] :
594 | tree2embedded(node['rchild'], depth+1)
595 | # -------------------------------------------------------------------------
596 | 
597 | def spill(bucket, mode) :
598 | 
599 | # --------------------------------------------------------------
600 | # ill-formed filtering and build tree
601 | sent, tokens = tokenize(bucket)
602 | if not sent : return False
603 | tree = build_tree(sent, tokens)
604 | if not tree : return False
605 | # begin with tree['lchild'] (excluding ROOT)
606 | t_tokens = []
607 | tree2tokens(tree['lchild'], t_tokens, depth=0)
608 | if tokens != t_tokens :
609 | sys.stderr.write("input parentheses != tree2tokens\n")
610 | sys.stderr.write("input = %s\n" % (' '.join(tokens)))
611 | sys.stderr.write("tree2tokens = %s\n" % (' '.join(t_tokens)))
612 | return False
613 | # --------------------------------------------------------------
614 | 
615 | if mode == 0 : # print constituent tree
616 | print ' '.join(sent)
617 | t_tokens = []
618 | history = []
619 | tree2con(tree['lchild'], t_tokens, history, depth=0)
620 | print ''.join(t_tokens).strip()
621 | if mode == 1 : # print dependency tree
622 | tree2dep(tree['lchild'], depth=0)
623 | if mode == 2 : # print embedded phrase/clause tagged tree
624 | tree2embedded(tree['lchild'], depth=0)
625 | 
626 | print '\n',
627 | return True
628 | 
629 | if __name__ == '__main__':
630 | 
631 | parser = OptionParser()
632 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode")
633 | parser.add_option("-m", "--mode", dest="mode", help="mode : 0(constituent), 1(dependency), 2(embedded phrase/clause)", metavar="mode")
634 | (options, args) = parser.parse_args()
635 | 
636 | if options.verbose : VERBOSE = 1
637 | 
638 | mode = options.mode
639 | if mode == None : mode = 0
640 | else : mode = int(mode)
641 | 
642 | bucket = []
643 | while 1:
644 | try:
645 | line = sys.stdin.readline()
646 | except KeyboardInterrupt:
647 | break
648 | if not line:
649 | break
650 | line = line.strip()
651 | 
652 | if not line and len(bucket) >= 1 :
653 | ret =
spill(bucket, mode) 654 | bucket = [] 655 | continue 656 | 657 | if line : bucket.append(line) 658 | 659 | if len(bucket) != 0 : 660 | ret = spill(bucket, mode) 661 | 662 | -------------------------------------------------------------------------------- /dist_lda.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | ''' 5 | read http://radimrehurek.com/gensim/dist_lda.html 6 | here is test code 7 | ''' 8 | 9 | import os 10 | import sys 11 | reload(sys) 12 | sys.setdefaultencoding('utf-8') 13 | import re 14 | from optparse import OptionParser 15 | from gensim import corpora, models, similarities, matutils 16 | import logging 17 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 18 | 19 | def construct_dictionary(documents_path, filter=None) : 20 | # collect statistics about all tokens 21 | dictionary = corpora.Dictionary(line.lower().split() for line in open(documents_path)) 22 | 23 | if filter : 24 | # remove stop words and words that appear only once 25 | stoplist = set('for a of the and to in'.split()) 26 | stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id] 27 | once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1] 28 | dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once 29 | dictionary.compactify() # remove gaps in id sequence after words that were removed 30 | 31 | return dictionary 32 | 33 | def save_dictionary(dictionary, dictionary_path) : 34 | dictionary.save(dictionary_path) 35 | 36 | def load_dictionary(dictionary_path) : 37 | dictionary = corpora.Dictionary().load(dictionary_path,mmap='r') 38 | return dictionary 39 | 40 | def save_corpus(corpus, corpus_path, format=None) : 41 | if format == 'svmlight' : # Joachim’s SVMlight format 42 | corpora.SvmLightCorpus.serialize(corpus_path, corpus) 43 | if format == 'lda-c' : # Blei’s LDA-C format 44 | corpora.BleiCorpus.serialize(corpus_path, corpus) 45 | if format == 'low' : # GibbsLDA++ format 46 | corpora.LowCorpus.serialize(corpus_path, corpus) 47 | if not format : # Matrix Market format 48 | corpora.MmCorpus.serialize(corpus_path, corpus) 49 | 50 | def load_corpus(corpus_path) : 51 | corpus = corpora.MmCorpus(corpus_path) 52 | return corpus 53 | 54 | def corpus_to_tfidf(corpus) : 55 | tfidf = models.TfidfModel(corpus, normalize=True) # step 1 -- initialize a model 56 | ''' 57 | corpus_tfidf = tfidf[corpus] 58 | for doc in corpus_tfidf: 59 | print doc 60 | ''' 61 | return tfidf 62 | 63 | def save_tfidf(tfidf, tfidf_path) : 64 | tfidf.save(tfidf_path) 65 | 66 | def load_tfidf(tfidf_path) : 67 | tfidf = models.TfidfModel.load(tfidf_path) 68 | return tfidf 69 | 70 | def corpus_to_lsi(corpus, tfidf, dictionary, topic_number) : 71 | corpus_tfidf = tfidf[corpus] 72 | lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topic_number) # initialize an LSI transformation 73 | ''' 74 | corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi 75 | lsi.print_topics(3) 76 | for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly 77 | print doc 78 | ''' 79 | return lsi 80 | 81 | def save_lsi(lsi, lsi_path) : 82 | lsi.save(lsi_path) 83 | 84 | def load_lsi(lsi_path) : 85 | lsi = models.LsiModel.load(lsi_path) 86 | return lsi 87 | 88 | def corpus_to_lda(corpus, dictionary, 
topic_number) : 89 | lda = models.LdaModel(corpus, id2word=dictionary, num_topics=topic_number) 90 | return lda 91 | 92 | def save_lda(lda, lda_path) : 93 | lda.save(lda_path) 94 | 95 | def load_lda(lda_path) : 96 | lda = models.LdaModel.load(lda_path) 97 | return lda 98 | 99 | def corpus_to_lsi_dist(corpus, dictionary, topic_number) : 100 | lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=topic_number, chunksize=10000, distributed=True) 101 | return lsi 102 | 103 | def corpus_to_lda_dist(corpus, dictionary, topic_number) : 104 | lda = models.LdaModel(corpus, id2word=dictionary, num_topics=topic_number, update_every=1, chunksize=10000, passes=1, distributed=True) 105 | return lda 106 | 107 | ''' 108 | python2.7 dist_lda.py --dictionary=document.txt.dict --corpus=document.txt.mm --lda=document.txt.lda 109 | ''' 110 | if __name__ == '__main__': 111 | 112 | parser = OptionParser() 113 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 114 | parser.add_option("-d", "--dictionary", dest="dictionary",help="dictionary", metavar="DICT") 115 | parser.add_option("-c", "--corpus", dest="corpus",help="corpus", metavar="CORPUS") 116 | parser.add_option("-a", "--lda", dest="lda",help="lda, output file", metavar="LDA") 117 | (options, args) = parser.parse_args() 118 | 119 | if options.verbose == 1 : VERBOSE = 1 120 | 121 | dictionary_path = options.dictionary 122 | if dictionary_path == None : 123 | parser.print_help() 124 | sys.exit(1) 125 | 126 | corpus_path = options.corpus 127 | if corpus_path == None : 128 | parser.print_help() 129 | sys.exit(1) 130 | 131 | lda_path = options.lda 132 | if lda_path == None : 133 | parser.print_help() 134 | sys.exit(1) 135 | 136 | dictionary = load_dictionary(dictionary_path) 137 | corpus = load_corpus(corpus_path) 138 | 139 | lda = corpus_to_lda_dist(corpus, dictionary, 200) 140 | save_lda(lda, lda_path) 141 | -------------------------------------------------------------------------------- /dist_lsi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | ''' 5 | read http://radimrehurek.com/gensim/dist_lsi.html 6 | here is test code 7 | ''' 8 | 9 | import os 10 | import sys 11 | reload(sys) 12 | sys.setdefaultencoding('utf-8') 13 | import re 14 | from optparse import OptionParser 15 | from gensim import corpora, models, similarities, matutils 16 | import logging 17 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 18 | 19 | def construct_dictionary(documents_path, filter=None) : 20 | # collect statistics about all tokens 21 | dictionary = corpora.Dictionary(line.lower().split() for line in open(documents_path)) 22 | 23 | if filter : 24 | # remove stop words and words that appear only once 25 | stoplist = set('for a of the and to in'.split()) 26 | stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id] 27 | once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1] 28 | dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once 29 | dictionary.compactify() # remove gaps in id sequence after words that were removed 30 | 31 | return dictionary 32 | 33 | def save_dictionary(dictionary, dictionary_path) : 34 | dictionary.save(dictionary_path) 35 | 36 | def load_dictionary(dictionary_path) : 37 | dictionary = corpora.Dictionary().load(dictionary_path,mmap='r') 38 | 
return dictionary 39 | 40 | def save_corpus(corpus, corpus_path, format=None) : 41 | if format == 'svmlight' : # Joachim’s SVMlight format 42 | corpora.SvmLightCorpus.serialize(corpus_path, corpus) 43 | if format == 'lda-c' : # Blei’s LDA-C format 44 | corpora.BleiCorpus.serialize(corpus_path, corpus) 45 | if format == 'low' : # GibbsLDA++ format 46 | corpora.LowCorpus.serialize(corpus_path, corpus) 47 | if not format : # Matrix Market format 48 | corpora.MmCorpus.serialize(corpus_path, corpus) 49 | 50 | def load_corpus(corpus_path) : 51 | corpus = corpora.MmCorpus(corpus_path) 52 | return corpus 53 | 54 | def corpus_to_tfidf(corpus) : 55 | tfidf = models.TfidfModel(corpus, normalize=True) # step 1 -- initialize a model 56 | ''' 57 | corpus_tfidf = tfidf[corpus] 58 | for doc in corpus_tfidf: 59 | print doc 60 | ''' 61 | return tfidf 62 | 63 | def save_tfidf(tfidf, tfidf_path) : 64 | tfidf.save(tfidf_path) 65 | 66 | def load_tfidf(tfidf_path) : 67 | tfidf = models.TfidfModel.load(tfidf_path) 68 | return tfidf 69 | 70 | def corpus_to_lsi(corpus, tfidf, dictionary, topic_number) : 71 | corpus_tfidf = tfidf[corpus] 72 | lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topic_number) # initialize an LSI transformation 73 | ''' 74 | corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi 75 | lsi.print_topics(3) 76 | for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly 77 | print doc 78 | ''' 79 | return lsi 80 | 81 | def save_lsi(lsi, lsi_path) : 82 | lsi.save(lsi_path) 83 | 84 | def load_lsi(lsi_path) : 85 | lsi = models.LsiModel.load(lsi_path) 86 | return lsi 87 | 88 | def corpus_to_lda(corpus, dictionary, topic_number) : 89 | lda = models.LdaModel(corpus, id2word=dictionary, num_topics=topic_number) 90 | return lda 91 | 92 | def save_lda(lda, lda_path) : 93 | lda.save(lda_path) 94 | 95 | def load_lda(lda_path) : 96 | lda = models.LdaModel.load(lda_path) 97 | return lda 98 | 99 | def corpus_to_lsi_dist(corpus, dictionary, topic_number) : 100 | lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=topic_number, chunksize=10000, distributed=True) 101 | return lsi 102 | 103 | ''' 104 | python2.7 dist_lsi.py --dictionary=document.txt.dict --corpus=document.txt.mm --lsi=document.txt.lsi 105 | ''' 106 | if __name__ == '__main__': 107 | 108 | parser = OptionParser() 109 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 110 | parser.add_option("-d", "--dictionary", dest="dictionary",help="dictionary", metavar="DICT") 111 | parser.add_option("-c", "--corpus", dest="corpus",help="corpus", metavar="CORPUS") 112 | parser.add_option("-l", "--lsi", dest="lsi",help="lsi, output file", metavar="LSI") 113 | (options, args) = parser.parse_args() 114 | 115 | if options.verbose == 1 : VERBOSE = 1 116 | 117 | dictionary_path = options.dictionary 118 | if dictionary_path == None : 119 | parser.print_help() 120 | sys.exit(1) 121 | 122 | corpus_path = options.corpus 123 | if corpus_path == None : 124 | parser.print_help() 125 | sys.exit(1) 126 | 127 | lsi_path = options.lsi 128 | if lsi_path == None : 129 | parser.print_help() 130 | sys.exit(1) 131 | 132 | dictionary = load_dictionary(dictionary_path) 133 | corpus = load_corpus(corpus_path) 134 | 135 | lsi = corpus_to_lsi_dist(corpus, dictionary, 200) 136 | save_lsi(lsi, lsi_path) 137 | -------------------------------------------------------------------------------- 
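Note: the gensim scripts above import the similarities module but never use it. A minimal query sketch follows — the file name and paths are illustrative, assuming the outputs of document_to_corpus.py and dist_lsi.py:

# query_lsi.py: rank documents against a query in LSI space (sketch)
from gensim import corpora, models, similarities

dictionary = corpora.Dictionary().load('document.txt.dict')
corpus = corpora.MmCorpus('document.txt.mm')
lsi = models.LsiModel.load('document.txt.lsi')

index = similarities.MatrixSimilarity(lsi[corpus]) # index every document in LSI space
query_bow = dictionary.doc2bow('some query text'.lower().split())
sims = index[lsi[query_bow]] # cosine similarity against every document
print sorted(enumerate(sims), key=lambda x: -x[1])[:10] # top 10 (doc_id, score) pairs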
/document_to_corpus.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding: utf8 -*-
3 | 
4 | '''
5 | read http://radimrehurek.com/gensim/tut1.html
6 | here is test code
7 | '''
8 | 
9 | import os
10 | import sys
11 | reload(sys)
12 | sys.setdefaultencoding('utf-8')
13 | import re
14 | from optparse import OptionParser
15 | from gensim import corpora, models, similarities, matutils
16 | import logging
17 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
18 | import numpy
19 | import scipy
20 | 
21 | def construct_dictionary(documents_path, filter=None) :
22 | # collect statistics about all tokens
23 | dictionary = corpora.Dictionary(line.lower().split() for line in open(documents_path))
24 | 
25 | if filter :
26 | # remove stop words and words that appear only once
27 | stoplist = set('for a of the and to in'.split())
28 | stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
29 | once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1]
30 | dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once
31 | dictionary.compactify() # remove gaps in id sequence after words that were removed
32 | 
33 | return dictionary
34 | 
35 | def save_dictionary(dictionary, dictionary_path) :
36 | dictionary.save(dictionary_path)
37 | 
38 | def load_dictionary(dictionary_path) :
39 | dictionary = corpora.Dictionary().load(dictionary_path,mmap='r')
40 | return dictionary
41 | 
42 | def save_corpus(corpus, corpus_path, format=None) :
43 | if format == 'svmlight' : # Joachim’s SVMlight format
44 | corpora.SvmLightCorpus.serialize(corpus_path, corpus)
45 | if format == 'lda-c' : # Blei’s LDA-C format
46 | corpora.BleiCorpus.serialize(corpus_path, corpus)
47 | if format == 'low' : # GibbsLDA++ format
48 | corpora.LowCorpus.serialize(corpus_path, corpus)
49 | if not format : # Matrix Market format
50 | corpora.MmCorpus.serialize(corpus_path, corpus)
51 | 
52 | def load_corpus(corpus_path) :
53 | corpus = corpora.MmCorpus(corpus_path)
54 | return corpus
55 | 
56 | def corpus_to_dense(corpus, dictionary) :
57 | num_terms = len(dictionary.token2id)
58 | numpy_matrix = matutils.corpus2dense(corpus, num_terms)
59 | return numpy_matrix
60 | 
61 | def dense_to_corpus(numpy_matrix) :
62 | corpus = matutils.Dense2Corpus(numpy_matrix)
63 | return corpus
64 | 
65 | def corpus_to_sparse(corpus) :
66 | scipy_csc_matrix = matutils.corpus2csc(corpus)
67 | return scipy_csc_matrix
68 | 
69 | def sparse_to_corpus(scipy_csc_matrix) :
70 | corpus = matutils.Sparse2Corpus(scipy_csc_matrix)
71 | return corpus
72 | 
73 | '''
74 | python2.7 document_to_corpus.py -d documents.txt < documents.txt
75 | '''
76 | if __name__ == '__main__':
77 | 
78 | parser = OptionParser()
79 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode")
80 | parser.add_option("-d", "--documents", dest="documents",help="documents", metavar="DOCS")
81 | (options, args) = parser.parse_args()
82 | 
83 | if options.verbose == 1 : VERBOSE = 1
84 | 
85 | documents_path = options.documents
86 | if documents_path == None :
87 | parser.print_help()
88 | sys.exit(1)
89 | 
90 | dictionary = construct_dictionary(documents_path)
91 | 
92 | corpus = []
93 | linecount = 0
94 | while 1 :
95 | try : line = sys.stdin.readline()
96 | except KeyboardInterrupt : break
97 | if not line : break
98 | try : line = line.strip()
99 | 
except : continue
100 | if not line : continue
101 | linecount += 1
102 | if linecount % 1000 == 0 :
103 | sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n")
104 | 
105 | vector = dictionary.doc2bow(line.lower().split())
106 | '''
107 | for id,tf in vector :
108 | print dictionary.get(id) + "\t" + str(tf)
109 | '''
110 | corpus.append(vector)
111 | 
112 | dictionary_path = documents_path + '.dict'
113 | save_dictionary(dictionary, dictionary_path)
114 | corpus_path = documents_path + '.mm'
115 | save_corpus(corpus, corpus_path)
116 | 
--------------------------------------------------------------------------------
/env.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | set -o errexit
4 | 
5 | export LC_ALL=ko_KR.UTF-8
6 | export LANG=ko_KR.UTF-8
7 | 
8 | # directory
9 | ## current dir of this script
10 | CDIR=$(readlink -f $(dirname $(readlink -f ${BASH_SOURCE[0]})))
11 | PDIR=$(readlink -f $(dirname $(readlink -f ${BASH_SOURCE[0]}))/..)
12 | 
13 | IRSTLM=/usr/local/irstlm/bin
14 | DOC=../doc.txt
15 | DICT=dict
16 | NGRAM=ngram
17 | LM=lm
18 | iARPA=iarpa_lm
19 | qARPA=qarpa_lm
20 | ARPA=arpa_lm
21 | SPLIT=8
22 | NGRAM_SIZE=2
23 | KENLM=../package/kenlm/bin
24 | 
25 | # command setting
26 | python='/usr/local/bin/python2.7'
27 | pig='pig'
28 | hls='hadoop fs -ls'
29 | hget='hadoop fs -get'
30 | hmkdir='hadoop fs -mkdir'
31 | hrm='hadoop fs -rm -skipTrash'
32 | hrmr='hadoop fs -rm -r -skipTrash'
33 | hmv='hadoop fs -mv'
34 | hcp='hadoop fs -cp'
35 | hcat='hadoop fs -cat'
36 | hput='hadoop fs -copyFromLocal'
37 | htest='hadoop fs -test -e'
38 | htestd='hadoop fs -test -d'
39 | hmerge='hadoop fs -getmerge'
40 | hdu='hadoop fs -du'
41 | 
42 | # functions
43 | 
44 | function make_calmness()
45 | {
46 | exec 3>&2 # save 2 to 3
47 | exec 2> /dev/null
48 | }
49 | 
50 | function revert_calmness()
51 | {
52 | exec 2>&3 # restore 2 from previous saved 3(originally 2)
53 | }
54 | 
55 | function close_fd()
56 | {
57 | exec 3>&-
58 | }
59 | 
60 | function jumpto
61 | {
62 | label=$1
63 | cmd=$(sed -n "/$label:/{:a;n;p;ba};" $0 | grep -v ':$')
64 | eval "$cmd"
65 | exit
66 | }
67 | 
--------------------------------------------------------------------------------
/fetch_url.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | // NOTE: the Query type, the API constant, and the utils package (utils.HttpGet)
4 | // are assumed to be defined elsewhere in this project.
5 | import (
6 | "net/url"
7 | "encoding/json"
8 | "log"
9 | )
10 | 
11 | func fetch(q *Query) bool {
12 | ok, res := utils.HttpGet(API + url.QueryEscape(q.query))
13 | if !ok {
14 | return false
15 | }
16 | var f map[string]interface{}
17 | if err := json.Unmarshal([]byte(res), &f); err != nil {
18 | return false
19 | }
20 | m := f["output"].(map[string]interface{})
21 | if val := m["val"]; val != "" {
22 | q.result = val.(string)
23 | log.Printf("[FETCH] %s", q.result)
24 | return true
25 | }
26 | return false
27 | }
28 | 
--------------------------------------------------------------------------------
/hello.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | import (
4 | "fmt"
5 | "math"
6 | "math/cmplx"
7 | "math/rand"
8 | "runtime"
9 | "time"
10 | )
11 | 
12 | func add(x int, y int) int {
13 | return x + y
14 | }
15 | 
16 | func swap(x, y string) (string, string) {
17 | return y, x
18 | }
19 | 
20 | func split(sum int) (x, y int) {
21 | x = sum * 4 / 9
22 | y = sum - x
23 | return
24 | }
25 | 
26 | func variable_test() {
27 | fmt.Println("Welcome to the playground!")
28 | fmt.Println("The time is", time.Now())
29 | fmt.Println("My favorite
number is", rand.Intn(10))
30 | fmt.Printf("Now you have %g problems.\n", math.Nextafter(2, 4)) // Printf, not Println: Println does not interpret format verbs
31 | fmt.Println(math.Pi)
32 | fmt.Println(add(42, 13))
33 | a, b := swap("hello", "world")
34 | fmt.Println(a, b)
35 | fmt.Println(split(17))
36 | 
37 | //var c, python, java bool
38 | var i, j int = 1, 2
39 | var c, python, java = true, false, "no!"
40 | k := 3
41 | fmt.Println(i, j, k, c, python, java)
42 | 
43 | var (
44 | ToBe bool = false
45 | MaxInt uint64 = 1<<64 - 1
46 | z complex128 = cmplx.Sqrt(-5 + 12i)
47 | )
48 | const f = "%T(%v)\n"
49 | fmt.Printf(f, ToBe, ToBe)
50 | fmt.Printf(f, MaxInt, MaxInt)
51 | fmt.Printf(f, z, z)
52 | 
53 | var m int
54 | var n float64
55 | var e bool
56 | var s string
57 | fmt.Printf("%v %v %v %q\n", m, n, e, s)
58 | }
59 | 
60 | func needInt(x int) int {
61 | return x*10 + 1
62 | }
63 | 
64 | func needFloat(x float64) float64 {
65 | return x * 0.1
66 | }
67 | 
68 | func const_test() {
69 | var x, y int = 3, 4
70 | var f float64 = math.Sqrt(float64(x*x + y*y))
71 | var z int = int(f)
72 | fmt.Printf("%d %d %f %d\n", x, y, f, z)
73 | fmt.Printf("f is of type %T\n", f)
74 | 
75 | const Pi = 3.14
76 | const World = "世界"
77 | fmt.Println("Hello", World)
78 | fmt.Println("Happy", Pi, "Day")
79 | const Truth = true
80 | fmt.Println("Go rules?", Truth)
81 | const (
82 | Big = 1 << 100
83 | Small = Big >> 99
84 | )
85 | fmt.Println(needInt(Small))
86 | fmt.Println(needFloat(Small))
87 | fmt.Println(needFloat(Big))
88 | }
89 | 
90 | func for_test() {
91 | sum := 0
92 | for i := 0; i < 10; i++ {
93 | sum += i
94 | }
95 | fmt.Println(sum)
96 | }
97 | 
98 | func pow(x, n, lim float64) float64 {
99 | if v := math.Pow(x, n); v < lim {
100 | return v
101 | } else {
102 | fmt.Printf("%g >= %g\n", v, lim)
103 | }
104 | // can't use v here, though
105 | return lim
106 | }
107 | 
108 | func if_test() {
109 | fmt.Println(
110 | pow(3, 2, 10),
111 | pow(3, 3, 20),
112 | )
113 | }
114 | 
115 | func switch_test() {
116 | fmt.Print("Go runs on ")
117 | switch os := runtime.GOOS; os {
118 | case "darwin":
119 | fmt.Println("OS X.")
120 | case "linux":
121 | fmt.Println("Linux.")
122 | default:
123 | // freebsd, openbsd,
124 | // plan9, windows...
125 | fmt.Printf("%s.", os)
126 | }
127 | fmt.Println("When's Saturday?")
128 | today := time.Now().Weekday()
129 | fmt.Println("today is", today)
130 | fmt.Println("today + 2 is", today+2)
131 | if today == time.Saturday-2 {
132 | fmt.Println("Saturday - 2 is today")
133 | }
134 | switch time.Saturday {
135 | case today + 0:
136 | fmt.Println("Today.")
137 | case today + 1:
138 | fmt.Println("Tomorrow.")
139 | case today + 2:
140 | fmt.Println("In two days.")
141 | default:
142 | fmt.Println("Too far away.")
143 | }
144 | t := time.Now()
145 | switch {
146 | case t.Hour() < 12:
147 | fmt.Println("Good morning!")
148 | case t.Hour() < 17:
149 | fmt.Println("Good afternoon.")
150 | default:
151 | fmt.Println("Good evening.")
152 | }
153 | }
154 | 
155 | func defer_test_1() {
156 | defer fmt.Println("world!") // 2
157 | fmt.Println("hello") // 1
158 | }
159 | 
160 | func defer_test_2() {
161 | defer_test_1()
162 | 
163 | fmt.Println("counting")
164 | 
165 | for i := 0; i < 10; i++ {
166 | defer fmt.Println(i) // reverse order
167 | }
168 | 
169 | fmt.Println("done")
170 | }
171 | 
172 | func pointer_test() {
173 | 
174 | i, j := 42, 2701
175 | 
176 | p := &i // point to i
177 | fmt.Println(*p) // read i through the pointer
178 | *p = 21 // set i through the pointer
179 | fmt.Println(i) // see the new value of i
180 | 
181 | p = &j // point to j
182 | *p = *p / 37 // divide j through the pointer
183 | fmt.Println(j) // see the new value of j
184 | }
185 | 
186 | type Vertex struct {
187 | X int
188 | Y int
189 | }
190 | 
191 | func struct_test() {
192 | //var v Vertex = Vertex{1, 2}
193 | v := Vertex{1, 2}
194 | p := &v
195 | fmt.Println(p)
196 | p.X = 1e9 // 1000000000
197 | fmt.Printf("%d %d\n", p.X, p.Y)
198 | fmt.Printf("%d %d\n", v.X, v.Y)
199 | 
200 | var (
201 | v1 = Vertex{1, 2} // has type Vertex
202 | v2 = Vertex{X: 1} // Y:0 is implicit
203 | v3 = Vertex{} // X:0 and Y:0
204 | q = &Vertex{1, 2} // has type *Vertex
205 | )
206 | fmt.Println(v1, q, v2, v3)
207 | }
208 | 
209 | func array_test() {
210 | var a [2]string // [n]T, static size of array
211 | a[0] = "Hello"
212 | a[1] = "World"
213 | fmt.Println(a[0], a[1])
214 | fmt.Println(a)
215 | 
216 | var v = [3]int{1, 2, 3}
217 | fmt.Println(v)
218 | }
219 | 
220 | func slice_test_1() {
221 | s := []int{2, 3, 5, 7, 11, 13} // initialize slice
222 | fmt.Println("s ==", s)
223 | 
224 | for i := 0; i < len(s); i++ {
225 | fmt.Printf("s[%d] == %d\n", i, s[i])
226 | }
227 | 
228 | w := []int{2, 3, 5, 7, 11, 13}
229 | fmt.Println("w ==", w)
230 | fmt.Println("w[1:4] ==", w[1:4]) // index 1 ~ index 4-1
231 | 
232 | // missing low index implies 0
233 | fmt.Println("w[:3] ==", w[:3]) // index 0 ~ index 3-1
234 | 
235 | // missing high index implies len(s)
236 | fmt.Println("w[4:] ==", w[4:]) // index 4 ~ end
237 | }
238 | 
239 | func printSlice(s string, x []int) {
240 | fmt.Printf("%s len=%d cap=%d %v\n",
241 | s, len(x), cap(x), x)
242 | }
243 | 
244 | func slice_test_2() {
245 | // slice is reference to an array
246 | // for example
247 | // var v = [3]int{1,2,3} is an array, its type is [3]int
248 | // var v = []int{1,2,3} is a slice
249 | a := make([]int, 5) // slice refers to int array, size 5, cap 5(==size), zero initialized
250 | printSlice("a", a)
251 | b := make([]int, 0, 5) // slice refers to int array, size 0, cap 5
252 | printSlice("b", b)
253 | 
254 | c := b[:2] // size = 2, cap 5 is copied <---- XXX why?
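// (answer: slicing never copies the elements -- c shares b's backing array, and
// capacity is counted from the slice's first element to the end of that array,
// so cap(c) is still 5; d := c[2:5] below starts 2 elements in, hence cap(d) == 3)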
255 | printSlice("c", c)
256 | d := c[2:5] // size = 3, cap 3
257 | printSlice("d", d)
258 | 
259 | var z []int
260 | fmt.Println(z, len(z), cap(z))
261 | if z == nil { // nil slice
262 | fmt.Println("nil!")
263 | }
264 | }
265 | 
266 | 
--------------------------------------------------------------------------------
/images/Bernoulli_distribution_estimation_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/Bernoulli_distribution_estimation_1.png
--------------------------------------------------------------------------------
/images/Bernoulli_distribution_estimation_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/Bernoulli_distribution_estimation_2.png
--------------------------------------------------------------------------------
/images/Bernoulli_distribution_estimation_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/Bernoulli_distribution_estimation_3.png
--------------------------------------------------------------------------------
/images/Bernoulli_distribution_estimation_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/Bernoulli_distribution_estimation_4.png
--------------------------------------------------------------------------------
/images/DNC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/DNC.png
--------------------------------------------------------------------------------
/images/GLU.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/GLU.png
--------------------------------------------------------------------------------
/images/MindMeld_MessagingInterfacesDemystified.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/MindMeld_MessagingInterfacesDemystified.pdf
--------------------------------------------------------------------------------
/images/SGNS_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/SGNS_1.png
--------------------------------------------------------------------------------
/images/SGNS_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/SGNS_2.png
--------------------------------------------------------------------------------
/images/additive_multiplicative_attention.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/additive_multiplicative_attention.png
--------------------------------------------------------------------------------
/images/aho-corasick.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/aho-corasick.png -------------------------------------------------------------------------------- /images/allocation-weighting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/allocation-weighting.png -------------------------------------------------------------------------------- /images/alpha_beta.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/alpha_beta.png -------------------------------------------------------------------------------- /images/attention_def1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/attention_def1.png -------------------------------------------------------------------------------- /images/attention_def2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/attention_def2.png -------------------------------------------------------------------------------- /images/attention_def3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/attention_def3.png -------------------------------------------------------------------------------- /images/backward-beta.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/backward-beta.jpeg -------------------------------------------------------------------------------- /images/binomial_estimation_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/binomial_estimation_1.png -------------------------------------------------------------------------------- /images/binomial_estimation_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/binomial_estimation_2.png -------------------------------------------------------------------------------- /images/binomial_estimation_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/binomial_estimation_3.png -------------------------------------------------------------------------------- /images/binomial_estimation_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/binomial_estimation_4.png -------------------------------------------------------------------------------- /images/bm.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/bm.jpg -------------------------------------------------------------------------------- /images/brat_sejong.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/brat_sejong.png -------------------------------------------------------------------------------- /images/cmp_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_1.png -------------------------------------------------------------------------------- /images/cmp_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_10.png -------------------------------------------------------------------------------- /images/cmp_11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_11.png -------------------------------------------------------------------------------- /images/cmp_12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_12.png -------------------------------------------------------------------------------- /images/cmp_13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_13.png -------------------------------------------------------------------------------- /images/cmp_14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_14.png -------------------------------------------------------------------------------- /images/cmp_15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_15.png -------------------------------------------------------------------------------- /images/cmp_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_2.png -------------------------------------------------------------------------------- /images/cmp_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_3.png -------------------------------------------------------------------------------- /images/cmp_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_4.png -------------------------------------------------------------------------------- /images/cmp_5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_5.png -------------------------------------------------------------------------------- /images/cmp_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_6.png -------------------------------------------------------------------------------- /images/cmp_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_7.png -------------------------------------------------------------------------------- /images/cmp_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_8.png -------------------------------------------------------------------------------- /images/cmp_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cmp_9.png -------------------------------------------------------------------------------- /images/cnn_embedding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cnn_embedding.png -------------------------------------------------------------------------------- /images/content-based-addressing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/content-based-addressing.png -------------------------------------------------------------------------------- /images/conv_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/conv_1.jpeg -------------------------------------------------------------------------------- /images/cross_entropy_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cross_entropy_loss.png -------------------------------------------------------------------------------- /images/cross_entropy_loss_many_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/cross_entropy_loss_many_output.png -------------------------------------------------------------------------------- /images/deptree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/deptree.png -------------------------------------------------------------------------------- /images/distribution_function.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/distribution_function.png -------------------------------------------------------------------------------- /images/du_1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/du_1.png -------------------------------------------------------------------------------- /images/du_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/du_2.png -------------------------------------------------------------------------------- /images/entropy_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_1.jpg -------------------------------------------------------------------------------- /images/entropy_10.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_10.jpg -------------------------------------------------------------------------------- /images/entropy_11.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_11.jpg -------------------------------------------------------------------------------- /images/entropy_12.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_12.jpg -------------------------------------------------------------------------------- /images/entropy_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_2.jpg -------------------------------------------------------------------------------- /images/entropy_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_3.jpg -------------------------------------------------------------------------------- /images/entropy_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_4.jpg -------------------------------------------------------------------------------- /images/entropy_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_5.jpg -------------------------------------------------------------------------------- /images/entropy_6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_6.jpg -------------------------------------------------------------------------------- /images/entropy_7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_7.jpg -------------------------------------------------------------------------------- /images/entropy_8.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_8.jpg -------------------------------------------------------------------------------- /images/entropy_9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/entropy_9.jpg -------------------------------------------------------------------------------- /images/expectation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/expectation.png -------------------------------------------------------------------------------- /images/forward-alpha.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/forward-alpha.jpeg -------------------------------------------------------------------------------- /images/forward_backward_var.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/forward_backward_var.png -------------------------------------------------------------------------------- /images/four_equation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/four_equation.png -------------------------------------------------------------------------------- /images/hierarchical_attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/hierarchical_attention.png -------------------------------------------------------------------------------- /images/hmm_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/hmm_1.png -------------------------------------------------------------------------------- /images/hmm_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/hmm_2.png -------------------------------------------------------------------------------- /images/hmm_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/hmm_3.png -------------------------------------------------------------------------------- /images/hmm_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/hmm_4.png -------------------------------------------------------------------------------- /images/hmm_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/hmm_5.png -------------------------------------------------------------------------------- /images/hmm_6.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/hmm_6.png -------------------------------------------------------------------------------- /images/hmm_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/hmm_7.png -------------------------------------------------------------------------------- /images/kmp.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/kmp.jpg -------------------------------------------------------------------------------- /images/layer_norm_timesteps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/layer_norm_timesteps.png -------------------------------------------------------------------------------- /images/me_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/me_1.png -------------------------------------------------------------------------------- /images/me_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/me_2.png -------------------------------------------------------------------------------- /images/ml_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ml_1.png -------------------------------------------------------------------------------- /images/ml_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ml_2.png -------------------------------------------------------------------------------- /images/ml_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ml_3.png -------------------------------------------------------------------------------- /images/ml_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ml_4.png -------------------------------------------------------------------------------- /images/mult_head_self_attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/mult_head_self_attention.png -------------------------------------------------------------------------------- /images/multi_dimensional_self_attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/multi_dimensional_self_attention.png -------------------------------------------------------------------------------- 
/images/multi_headed_attention_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/multi_headed_attention_1.png -------------------------------------------------------------------------------- /images/multi_headed_attention_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/multi_headed_attention_2.png -------------------------------------------------------------------------------- /images/multinomial_estimation_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/multinomial_estimation_1.png -------------------------------------------------------------------------------- /images/multinomial_estimation_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/multinomial_estimation_2.png -------------------------------------------------------------------------------- /images/multinomial_estimation_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/multinomial_estimation_3.png -------------------------------------------------------------------------------- /images/ner_attention.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ner_attention.jpg -------------------------------------------------------------------------------- /images/ner_attention_math1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ner_attention_math1.jpg -------------------------------------------------------------------------------- /images/ner_attention_math2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ner_attention_math2.jpg -------------------------------------------------------------------------------- /images/ngram_cnn_highway_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ngram_cnn_highway_1.png -------------------------------------------------------------------------------- /images/ngram_cnn_highway_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ngram_cnn_highway_2.png -------------------------------------------------------------------------------- /images/nn_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/nn_1.jpeg -------------------------------------------------------------------------------- /images/nn_2.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/nn_2.jpeg -------------------------------------------------------------------------------- /images/nn_3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/nn_3.jpeg -------------------------------------------------------------------------------- /images/nn_4.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/nn_4.jpeg -------------------------------------------------------------------------------- /images/nn_5.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/nn_5.jpeg -------------------------------------------------------------------------------- /images/ntm-addressing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm-addressing.png -------------------------------------------------------------------------------- /images/ntm-content-addressing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm-content-addressing.png -------------------------------------------------------------------------------- /images/ntm-interface-vector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm-interface-vector.png -------------------------------------------------------------------------------- /images/ntm-interpolation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm-interpolation.png -------------------------------------------------------------------------------- /images/ntm-lstm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm-lstm.png -------------------------------------------------------------------------------- /images/ntm-pseudocode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm-pseudocode.png -------------------------------------------------------------------------------- /images/ntm-sharpen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm-sharpen.png -------------------------------------------------------------------------------- /images/ntm-shift.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm-shift.png 
-------------------------------------------------------------------------------- /images/ntm-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm-test.png -------------------------------------------------------------------------------- /images/ntm-train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm-train.png -------------------------------------------------------------------------------- /images/ntm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/ntm.png -------------------------------------------------------------------------------- /images/p-value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/p-value.png -------------------------------------------------------------------------------- /images/partition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/partition.png -------------------------------------------------------------------------------- /images/pstree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/pstree.png -------------------------------------------------------------------------------- /images/re_attention_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/re_attention_1.png -------------------------------------------------------------------------------- /images/re_attention_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/re_attention_2.png -------------------------------------------------------------------------------- /images/read-vector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/read-vector.png -------------------------------------------------------------------------------- /images/regularization.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/regularization.jpeg -------------------------------------------------------------------------------- /images/retention-vector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/retention-vector.png -------------------------------------------------------------------------------- /images/scaled_dot_product_attention.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/scaled_dot_product_attention.png -------------------------------------------------------------------------------- /images/sejong_entry.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/sejong_entry.png -------------------------------------------------------------------------------- /images/self-attention-map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/self-attention-map.png -------------------------------------------------------------------------------- /images/self-attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/self-attention.png -------------------------------------------------------------------------------- /images/self_attention_with_fnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/self_attention_with_fnn.png -------------------------------------------------------------------------------- /images/seq2seq_attention_machanism.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/seq2seq_attention_machanism.jpg -------------------------------------------------------------------------------- /images/seq2seq_attention_machanism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/seq2seq_attention_machanism.png -------------------------------------------------------------------------------- /images/seq2seq_autoencoder.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/seq2seq_autoencoder.jpeg -------------------------------------------------------------------------------- /images/time_invariant_self_attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/time_invariant_self_attention.png -------------------------------------------------------------------------------- /images/time_invariant_self_attention_full.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/time_invariant_self_attention_full.png -------------------------------------------------------------------------------- /images/transformer_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/transformer_model.png -------------------------------------------------------------------------------- /images/traversal_london.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/traversal_london.png -------------------------------------------------------------------------------- /images/url_sejong.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/url_sejong.png -------------------------------------------------------------------------------- /images/usage-vector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/usage-vector.png -------------------------------------------------------------------------------- /images/variance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/variance.png -------------------------------------------------------------------------------- /images/vbox_port.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/vbox_port.png -------------------------------------------------------------------------------- /images/viterbi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/viterbi.png -------------------------------------------------------------------------------- /images/wor2vec_visualizer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/wor2vec_visualizer.png -------------------------------------------------------------------------------- /images/word2vec_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/word2vec_1.jpeg -------------------------------------------------------------------------------- /images/word2vec_2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/word2vec_2.jpeg -------------------------------------------------------------------------------- /images/word2vec_3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/word2vec_3.jpeg -------------------------------------------------------------------------------- /images/word2vec_4.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/word2vec_4.jpeg -------------------------------------------------------------------------------- /images/word2vec_5.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/word2vec_5.jpeg -------------------------------------------------------------------------------- /images/workbench_fatal.png: 
https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/workbench_fatal.png -------------------------------------------------------------------------------- /images/write-operation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/write-operation.png -------------------------------------------------------------------------------- /images/write-weight-vector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsindex/blog/9dabe2a2bd418096359ed3d73dcf52bfce550981/images/write-weight-vector.png -------------------------------------------------------------------------------- /keras_mlp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from keras.models import Sequential 4 | from keras.layers.core import Dense, Dropout, Activation 5 | from keras.optimizers import SGD 6 | import numpy as np 7 | 8 | ## model configuration 9 | model = Sequential() 10 | # Dense(64) is a fully-connected layer with 64 hidden units. 11 | # in the first layer, you must specify the expected input data shape: 12 | # here, 20-dimensional vectors. 13 | model.add(Dense(64, input_dim=20, init='uniform')) 14 | model.add(Activation('tanh')) 15 | model.add(Dropout(0.5)) 16 | model.add(Dense(64, init='uniform')) 17 | model.add(Activation('tanh')) 18 | model.add(Dropout(0.5)) 19 | model.add(Dense(10, init='uniform')) 20 | model.add(Activation('softmax')) 21 | sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True) 22 | model.compile(loss='categorical_crossentropy', 23 | optimizer=sgd) 24 | 25 | ## generate train/test data 26 | X_train = [] 27 | y_train = [] 28 | for i in xrange(10000) : 29 | x = np.random.uniform(-1, 1, size=20) 30 | y = np.eye(10)[i % 10] # one-hot label: categorical_crossentropy expects one-hot targets, not class indices 31 | X_train.append(x) 32 | y_train.append(y) 33 | X_train = np.array(X_train) 34 | y_train = np.array(y_train) 35 | print "X_train shape = " + str(X_train.shape) 36 | print "y_train shape = " + str(y_train.shape) 37 | X_test = [] 38 | y_test = [] 39 | for i in xrange(1000) : 40 | x = np.random.uniform(-1, 1, size=20) 41 | y = np.eye(10)[i % 10] # one-hot label 42 | X_test.append(x) 43 | y_test.append(y) 44 | X_test = np.array(X_test) 45 | y_test = np.array(y_test) 46 | print "X_test shape = " + str(X_test.shape) 47 | print "y_test shape = " + str(y_test.shape) 48 | 49 | ## training and evaluation 50 | model.fit(X_train, y_train, 51 | nb_epoch=20, 52 | batch_size=100, 53 | show_accuracy=True) 54 | score = model.evaluate(X_test, y_test, batch_size=100, show_accuracy=True) 55 | print('Test score:', score[0]) 56 | print('Test accuracy:', score[1]) 57 | -------------------------------------------------------------------------------- /make_bdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | import os 5 | import sys 6 | import re 7 | from optparse import OptionParser 8 | import time 9 | from bsddb3 import db 10 | 11 | if __name__ == '__main__': 12 | 13 | parser = OptionParser() 14 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 15 | parser.add_option("-d", "--dir", dest="dir",help="home directory", metavar="DIR") 16 | parser.add_option("-b", "--bdb", dest="bdbfile",help="bdb file name", metavar="BDB")
17 | (options, args) = parser.parse_args() 18 | 19 | if options.verbose == 1 : VERBOSE = 1 20 | 21 | dir_path = options.dir 22 | if dir_path == None : 23 | parser.print_help() 24 | sys.exit(1) 25 | 26 | bdb_file = options.bdbfile 27 | if bdb_file == None : 28 | parser.print_help() 29 | sys.exit(1) 30 | 31 | startTime = time.time() 32 | 33 | dbenv = db.DBEnv() 34 | if dbenv.open(dir_path, db.DB_CREATE | db.DB_INIT_MPOOL) : 35 | sys.stderr.write("DBEnv.open() fail\n") 36 | sys.exit(1) 37 | d = db.DB(dbenv) 38 | if d.open(bdb_file, db.DB_BTREE, db.DB_CREATE | db.DB_TRUNCATE, 0666) : 39 | sys.stderr.write("DB.open() fail\n") 40 | sys.exit(1) 41 | 42 | linecount = 0 43 | while 1 : 44 | try : line = sys.stdin.readline() 45 | except KeyboardInterrupt : break 46 | if not line : break 47 | try : line = line.strip() 48 | except : continue 49 | if not line : continue 50 | linecount += 1 51 | if linecount % 1000 == 0 : 52 | sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n") 53 | 54 | key,value = line.split('\t',1) 55 | if not key or not value : continue 56 | 57 | d.put(key,value) 58 | 59 | d.close() 60 | dbenv.close() 61 | 62 | durationTime = time.time() - startTime 63 | sys.stderr.write("duration time = %f\n" % durationTime) 64 | -------------------------------------------------------------------------------- /make_leveldb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | import os 5 | import sys 6 | import re 7 | from optparse import OptionParser 8 | import time 9 | import leveldb 10 | 11 | if __name__ == '__main__': 12 | 13 | parser = OptionParser() 14 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 15 | parser.add_option("-d", "--db", dest="dbdir",help="db dir path", metavar="DB") 16 | (options, args) = parser.parse_args() 17 | 18 | if options.verbose == 1 : VERBOSE = 1 19 | 20 | db_dir = options.dbdir 21 | if db_dir == None : 22 | parser.print_help() 23 | sys.exit(1) 24 | 25 | startTime = time.time() 26 | 27 | db = leveldb.LevelDB(db_dir) 28 | 29 | linecount = 0 30 | while 1 : 31 | try : line = sys.stdin.readline() 32 | except KeyboardInterrupt : break 33 | if not line : break 34 | try : line = line.strip() 35 | except : continue 36 | if not line : continue 37 | linecount += 1 38 | if linecount % 1000 == 0 : 39 | sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n") 40 | 41 | key,value = line.split('\t',1) 42 | if not key or not value : continue 43 | 44 | db.Put(key,value) 45 | 46 | durationTime = time.time() - startTime 47 | sys.stderr.write("duration time = %f\n" % durationTime) 48 | -------------------------------------------------------------------------------- /make_lmdb.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <string.h> 4 | #include <stdint.h> 5 | #include <limits.h> 6 | #include <unistd.h> 7 | #include <time.h> 8 | #include <sys/time.h> 9 | 10 | #include "lmdb.h" 11 | 12 | #define LINE_SIZE 10240 13 | 14 | int main(int argc, char *argv[]) 15 | { 16 | int size; 17 | char string[LINE_SIZE+1]; 18 | char s_key[LINE_SIZE+1]; 19 | char s_value[LINE_SIZE+1]; 20 | char* token; 21 | char* save; 22 | int cnt_line; 23 | 24 | int rc; 25 | MDB_env* env; 26 | MDB_txn* txn; 27 | MDB_cursor* mc; 28 | MDB_dbi dbi; 29 | MDB_val key, data; 30 | char* envname; 31 | int envflags=0; 32 | int putflags=0; 33 | char* subname; 34 | char* prog = argv[0]; 35 | size_t map_size = (SIZE_MAX / (1024*1024*1024) / 4)*6; // 4giga * 6
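/* a rough sanity check on the map_size arithmetic above, assuming 64-bit
   size_t: SIZE_MAX / 1 GiB = 2^34, / 4 = 2^32 (about 4 G), * 6 = about
   24 GiB. LMDB keeps the whole environment in a single memory map, so
   map_size must be at least as large as the database will ever grow. */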
36 | int batch; 37 | 38 | struct timeval tv1, tv2; 39 | 40 | if(argc != 3) { 41 | fprintf(stderr,"%s <envname> <subname>\n",prog); 42 | exit(1); 43 | } 44 | 45 | gettimeofday(&tv1, NULL); 46 | 47 | envflags = MDB_NOSUBDIR | MDB_NOLOCK; 48 | envname = argv[1]; 49 | rc = mdb_env_create(&env); 50 | if(rc) { 51 | fprintf(stderr, "mdb_env_create failed, error %d %s\n", rc, mdb_strerror(rc)); 52 | return EXIT_FAILURE; 53 | } 54 | mdb_env_set_maxdbs(env, 2); 55 | mdb_env_set_mapsize(env, map_size); 56 | rc = mdb_env_open(env, envname, envflags, 0664); 57 | if(rc) { 58 | fprintf(stderr, "mdb_env_open failed, error %d %s\n", rc, mdb_strerror(rc)); 59 | goto env_close; 60 | } 61 | rc = mdb_txn_begin(env, NULL, 0, &txn); 62 | if(rc) { 63 | fprintf(stderr, "mdb_txn_begin failed, error %d %s\n", rc, mdb_strerror(rc)); 64 | goto env_close; 65 | } 66 | subname = argv[2]; 67 | rc = mdb_open(txn, subname, MDB_CREATE, &dbi); 68 | if (rc) { 69 | fprintf(stderr, "mdb_open failed, error %d %s\n", rc, mdb_strerror(rc)); 70 | goto txn_abort; 71 | } 72 | rc = mdb_cursor_open(txn, dbi, &mc); 73 | if (rc) { 74 | fprintf(stderr, "mdb_cursor_open failed, error %d %s\n", rc, mdb_strerror(rc)); 75 | goto txn_abort; 76 | } 77 | 78 | batch = 0; 79 | cnt_line = 0; 80 | while(fgets(string, LINE_SIZE, stdin) != NULL) { 81 | size = strlen(string); 82 | if(string[size-1] == '\n'){ 83 | string[size-1] = '\0'; 84 | --size; 85 | } 86 | if(size > 1 && string[size-1] == '\r'){ 87 | string[size-1] = '\0'; 88 | --size; 89 | } 90 | if(string[0] == '\0') 91 | continue; 92 | 93 | if(cnt_line % 10000 == 0) 94 | fprintf(stderr,"[linecount]\t%d\n",cnt_line); 95 | 96 | token = strtok_r(string, "\t", &save); 97 | if(token != NULL) { 98 | strcpy(s_key, token); 99 | token = strtok_r(NULL, "\t", &save); 100 | if(token != NULL) { 101 | strcpy(s_value, token); 102 | } else continue; 103 | } else continue; 104 | 105 | key.mv_data = s_key; 106 | key.mv_size = strlen(s_key) + 1; 107 | data.mv_data = s_value; 108 | data.mv_size = strlen(s_value) + 1; 109 | 110 | 111 | rc = mdb_cursor_put(mc, &key, &data, putflags); 112 | if(rc == MDB_KEYEXIST) 113 | continue; 114 | 115 | if(batch % 100000000 == 0) { // commit every 100M puts and reopen the txn/cursor; note this also fires once at batch == 0 116 | rc = mdb_txn_commit(txn); 117 | if(rc) { 118 | fprintf(stderr, "%s: line %d: txn_commit: %s\n", prog, cnt_line, mdb_strerror(rc)); 119 | goto env_close; 120 | } 121 | rc = mdb_txn_begin(env, NULL, 0, &txn); 122 | if(rc) { 123 | fprintf(stderr, "mdb_txn_begin failed, error %d %s\n", rc, mdb_strerror(rc)); 124 | goto env_close; 125 | } 126 | rc = mdb_cursor_open(txn, dbi, &mc); 127 | if(rc) { 128 | fprintf(stderr, "mdb_cursor_open failed, error %d %s\n", rc, mdb_strerror(rc)); 129 | goto txn_abort; 130 | } 131 | } 132 | 133 | cnt_line++; 134 | batch++; 135 | } 136 | 137 | rc = mdb_txn_commit(txn); 138 | txn = NULL; 139 | if(rc) { 140 | fprintf(stderr, "%s: txn_commit fail: %s\n", prog, mdb_strerror(rc)); 141 | goto env_close; 142 | } 143 | mdb_dbi_close(env, dbi); 144 | 145 | txn_abort: 146 | mdb_txn_abort(txn); 147 | env_close: 148 | mdb_env_close(env); 149 | 150 | gettimeofday(&tv2, NULL); 151 | fprintf(stderr, "<-end > : t2.sec = %d t2.usec = %d\n",(int)tv2.tv_sec,(int)tv2.tv_usec); 152 | fprintf(stderr, "<+time> : sec = %d usec = %d\n",(int)(tv2.tv_sec-tv1.tv_sec),(int)(tv2.tv_usec-tv1.tv_usec)); 153 | 154 | return rc ? EXIT_FAILURE : EXIT_SUCCESS;
155 | 156 | } 157 | -------------------------------------------------------------------------------- /make_lmdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | import os 5 | import sys 6 | import re 7 | from optparse import OptionParser 8 | import time 9 | import lmdb 10 | 11 | if __name__ == '__main__': 12 | 13 | parser = OptionParser() 14 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 15 | parser.add_option("-d", "--db", dest="dbpath",help="db path", metavar="DB") 16 | (options, args) = parser.parse_args() 17 | 18 | if options.verbose == 1 : VERBOSE = 1 19 | 20 | db_path = options.dbpath 21 | if db_path == None : 22 | parser.print_help() 23 | sys.exit(1) 24 | 25 | startTime = time.time() 26 | 27 | # env == db coz max_dbs=0 28 | env = lmdb.Environment(db_path,map_size=24*(1024**3),subdir=False,readonly=False,create=False,max_dbs=0,lock=False) # 24 GiB map 29 | txn = lmdb.Transaction(env,db=None,write=True) 30 | 31 | linecount = 0 32 | while 1 : 33 | try : line = sys.stdin.readline() 34 | except KeyboardInterrupt : break 35 | if not line : break 36 | try : line = line.strip() 37 | except : continue 38 | if not line : continue 39 | linecount += 1 40 | if linecount % 1000 == 0 : 41 | sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n") 42 | 43 | key,value = line.split('\t',1) 44 | if not key or not value : continue 45 | 46 | try : txn.put(key,value) 47 | except Exception, e : 48 | sys.stderr.write(str(e) + '\n') 49 | continue 50 | 51 | durationTime = time.time() - startTime 52 | sys.stderr.write("duration time = %f\n" % durationTime) 53 | 54 | txn.commit() 55 | env.close() 56 | -------------------------------------------------------------------------------- /multiplexing.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "os" 7 | "runtime" 8 | "sync" 9 | "time" 10 | ) 11 | 12 | func worker(jobs chan string, outs chan string, jobs_wg *sync.WaitGroup, jobs_shutdown chan bool) { 13 | // jobs -> outs 14 | defer jobs_wg.Done() 15 | for { 16 | select { 17 | case line := <-jobs: 18 | out := line // "do something" with the input line here 19 | outs <- out 20 | case _ = <-jobs_shutdown: 21 | fmt.Fprintf(os.Stderr, "shutdown worker\n") 22 | return 23 | } 24 | } 25 | } 26 | 27 | func outputer(outs chan string, outs_wg *sync.WaitGroup, outs_shutdown chan bool) { 28 | // outs -> stdout 29 | // synchronize standard out 30 | defer outs_wg.Done() 31 | for { 32 | select { 33 | case _ = <-outs_shutdown: 34 | fmt.Fprintf(os.Stderr, "shutdown outputer\n") 35 | return 36 | case out := <-outs: 37 | fmt.Printf("out = %s\n", out) 38 | } 39 | } 40 | } 41 | 42 | func prepare_workers(n_worker int, jobs chan string, outs chan string, jobs_wg_list *[]*sync.WaitGroup, jobs_shutdown_list *[]chan bool) { 43 | for i := 0; i < n_worker; i++ { 44 | jobs_wg := &sync.WaitGroup{} 45 | jobs_wg.Add(1) 46 | *jobs_wg_list = append(*jobs_wg_list, jobs_wg) 47 | jobs_shutdown := make(chan bool) 48 | *jobs_shutdown_list = append(*jobs_shutdown_list, jobs_shutdown) 49 | go worker(jobs, outs, jobs_wg, jobs_shutdown) 50 | } 51 | } 52 | 53 | func prepare_outputer(outs chan string, outs_wg *sync.WaitGroup, outs_shutdown chan bool) { 54 | go outputer(outs, outs_wg, outs_shutdown) 55 | } 56 | 57 | func main() { 58 | const n_worker = 10 59 | const n_core = 10 60 | const size_buff = 100 61 |
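// pipeline sketch: main fans stdin lines out over the buffered jobs
// channel to n_worker goroutines, their results are funneled into outs,
// and a single outputer goroutine serializes writes to stdout; each
// worker gets its own shutdown channel plus WaitGroup so main can stop
// and drain them one at a time once stdin is exhausted.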
62 | runtime.GOMAXPROCS(n_core) 63 | 64 | var jobs = make(chan string, size_buff) 65 | var outs = make(chan string, size_buff) 66 | var jobs_wg_list []*sync.WaitGroup 67 | var jobs_shutdown_list []chan bool 68 | outs_wg := &sync.WaitGroup{} 69 | outs_wg.Add(1) 70 | outs_shutdown := make(chan bool) 71 | 72 | // prepare workers, outputer 73 | prepare_workers(n_worker, jobs, outs, &jobs_wg_list, &jobs_shutdown_list) 74 | prepare_outputer(outs, outs_wg, outs_shutdown) 75 | 76 | start := time.Now() // get current time 77 | scanner := bufio.NewScanner(os.Stdin) 78 | for scanner.Scan() { 79 | line := scanner.Text() 80 | jobs <- line 81 | } 82 | 83 | // shutdown all workers 84 | fmt.Fprintf(os.Stderr, "jobs_shutdown_list size : %v\n", len(jobs_shutdown_list)) 85 | fmt.Fprintf(os.Stderr, "jobs_wg_list size : %v\n", len(jobs_wg_list)) 86 | for i, jobs_shutdown := range jobs_shutdown_list { 87 | fmt.Fprintf(os.Stderr, "close jobs_shutdown : %v\n", i) 88 | close(jobs_shutdown) 89 | // wait until finish job 90 | fmt.Fprintf(os.Stderr, "wait jobs_wg : %v\n", i) 91 | jobs_wg := jobs_wg_list[i] 92 | jobs_wg.Wait() 93 | fmt.Fprintf(os.Stderr, "done jobs_wg\n") 94 | } 95 | 96 | // shutdown outputer 97 | fmt.Fprintf(os.Stderr, "close outs_shutdown\n") 98 | close(outs_shutdown) 99 | // wait until outputer ends 100 | fmt.Fprintf(os.Stderr, "wait outs_wg\n") 101 | outs_wg.Wait() 102 | fmt.Fprintf(os.Stderr, "done outs_wg\n") 103 | 104 | elapsed := time.Since(start) 105 | fmt.Fprintf(os.Stderr, "elapsed time = %s\n", elapsed) 106 | } 107 | -------------------------------------------------------------------------------- /ngram.cc: -------------------------------------------------------------------------------- 1 | #include <cassert> 2 | #include <cstdint> 3 | #include <iostream> 4 | #include <string> 5 | #include <vector> 6 | static const int32_t MAX_VOCAB_SIZE = 30000000; 7 | static const int32_t BUCKET_SIZE = 2000000; 8 | static const int32_t MIN_NGRAM_SIZE = 3; 9 | static const int32_t MAX_NGRAM_SIZE = 6; 10 | static std::string PREFIX_LABEL = "_label_"; 11 | static const std::string EOS = "</s>"; 12 | static const std::string BOW = "<"; 13 | static const std::string EOW = ">"; 14 | 15 | enum class entry_type : int8_t {word=0, label=1}; 16 | struct entry { 17 | std::string word; 18 | int64_t count; 19 | entry_type type; 20 | std::vector<int32_t> subwords; 21 | }; 22 | 23 | std::vector<entry> words_; 24 | int32_t size_ = 0; 25 | int32_t nwords_ = 0; 26 | int32_t nlabels_ = 0; 27 | int32_t ntokens_ = 0; 28 | std::vector<int32_t> word2int_; 29 | 30 | static void init() { 31 | size_ = 0; 32 | nwords_ = 0; 33 | nlabels_ = 0; 34 | ntokens_ = 0; 35 | word2int_.resize(MAX_VOCAB_SIZE); 36 | for (int32_t i = 0; i < MAX_VOCAB_SIZE; i++) { 37 | word2int_[i] = -1; 38 | } 39 | } 40 | 41 | static uint32_t hash(const std::string& str) { // FNV-1a 32-bit hash 42 | uint32_t h = 2166136261; 43 | for (size_t i = 0; i < str.size(); i++) { 44 | h = h ^ uint32_t(str[i]); 45 | h = h * 16777619; 46 | } 47 | return h; 48 | } 49 | 50 | static int32_t find(const std::string& w) { // open addressing with linear probing 51 | int32_t h = hash(w) % MAX_VOCAB_SIZE; 52 | while (word2int_[h] != -1 && words_[word2int_[h]].word != w) { 53 | h = (h + 1) % MAX_VOCAB_SIZE; 54 | } 55 | return h; 56 | } 57 | 58 | static void add(const std::string& w) { 59 | int32_t h = find(w); 60 | ntokens_++; 61 | if (word2int_[h] == -1) { 62 | entry e; 63 | e.word = w; 64 | e.count = 1; 65 | e.type = (w.find(PREFIX_LABEL) == 0) ? entry_type::label : entry_type::word;
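// a token whose text starts with PREFIX_LABEL is treated as a class
// label rather than an ordinary word, mirroring fastText's convention
// for supervised training data.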
66 | if (e.type == entry_type::word) nwords_++; 67 | if (e.type == entry_type::label) nlabels_++; 68 | words_.push_back(e); 69 | word2int_[h] = size_++; 70 | } else { 71 | words_[word2int_[h]].count++; 72 | } 73 | } 74 | 75 | static int32_t getId(const std::string& w) { 76 | int32_t h = find(w); 77 | return word2int_[h]; 78 | } 79 | 80 | static entry_type getType(int32_t id) { 81 | assert(id >= 0); 82 | assert(id < size_); 83 | return words_[id].type; 84 | } 85 | 86 | static std::string getWord(int32_t id) { 87 | assert(id >= 0); 88 | assert(id < size_); 89 | return words_[id].word; 90 | } 91 | 92 | static void computeNgrams(const std::string& word, 93 | std::vector<int32_t>& ngrams) { 94 | for (size_t i = 0; i < word.size(); i++) { 95 | std::string ngram; 96 | if ((word[i] & 0xC0) == 0x80) continue; // skip UTF-8 continuation bytes so n-grams start on character boundaries 97 | for (size_t j = i, n = 1; j < word.size() && n <= MAX_NGRAM_SIZE; n++) { 98 | ngram.push_back(word[j++]); 99 | while (j < word.size() && (word[j] & 0xC0) == 0x80) { 100 | ngram.push_back(word[j++]); 101 | } 102 | if (n >= MIN_NGRAM_SIZE && !(n == 1 && (i == 0 || j == word.size()))) { 103 | int32_t h = hash(ngram) % BUCKET_SIZE; 104 | std::cout << ngram << "\t" << h << std::endl; 105 | ngrams.push_back(nwords_ + h); 106 | } 107 | } 108 | } 109 | } 110 | 111 | static void initNgrams() { 112 | for (size_t i = 0; i < size_; i++) { 113 | std::string word = BOW + words_[i].word + EOW; 114 | words_[i].subwords.push_back(i); 115 | computeNgrams(word, words_[i].subwords); 116 | } 117 | } 118 | 119 | static const std::vector<int32_t>& getNgrams(int32_t i) { 120 | assert(i >= 0); 121 | assert(i < nwords_); 122 | return words_[i].subwords; 123 | } 124 | 125 | static const std::vector<int32_t> getNgrams(const std::string& word) { 126 | int32_t i = getId(word); 127 | if (i >= 0) { 128 | return getNgrams(i); 129 | } 130 | std::vector<int32_t> ngrams; 131 | computeNgrams(BOW + word + EOW, ngrams); 132 | return ngrams; 133 | } 134 | 135 | int main(int argc, char** argv) { 136 | 137 | init(); 138 | 139 | std::string word1 = "카카오12검색"; 140 | std::string word2 = "ab네이버구글34"; 141 | 142 | add(word1); 143 | add(word2); 144 | 145 | initNgrams(); 146 | 147 | return 0; 148 | } 149 | -------------------------------------------------------------------------------- /queue.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | type Node struct { 8 | Value int 9 | } 10 | 11 | /* 12 | func (n *Node) String() string { 13 | return fmt.Sprint(n.Value) 14 | } 15 | */ 16 | 17 | // NewQueue returns a new queue with the given initial size. 18 | func NewQueue(size int) *Queue { 19 | return &Queue{ 20 | nodes: make([]*Node, size), 21 | size: size, 22 | } 23 | } 24 | 25 | // Queue is a basic FIFO queue based on a circular list that resizes as needed. 26 | type Queue struct { 27 | nodes []*Node 28 | size int 29 | head int 30 | tail int 31 | count int 32 | } 33 | 34 | // Push adds a node to the queue. 35 | func (q *Queue) Push(n *Node) { 36 | if q.head == q.tail && q.count > 0 { 37 | nodes := make([]*Node, len(q.nodes)+q.size) 38 | copy(nodes, q.nodes[q.head:]) 39 | copy(nodes[len(q.nodes)-q.head:], q.nodes[:q.head]) 40 | q.head = 0 41 | q.tail = len(q.nodes) 42 | q.nodes = nodes 43 | } 44 | q.nodes[q.tail] = n 45 | q.tail = (q.tail + 1) % len(q.nodes) 46 | q.count++ 47 | } 48 | 49 | // Pop removes and returns a node from the queue in first to last order.
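// Push and Pop wrap their indices with modular arithmetic, e.g.
// (q.tail + 1) % len(q.nodes), so the backing slice is reused as a
// ring; reallocation happens only when the ring is full (head == tail
// with count > 0), keeping both operations amortized O(1).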
50 | func (q *Queue) Pop() *Node { 51 | if q.count == 0 { 52 | return nil 53 | } 54 | node := q.nodes[q.head] 55 | q.head = (q.head + 1) % len(q.nodes) 56 | q.count-- 57 | return node 58 | } 59 | 60 | func main() { 61 | q := NewQueue(1) 62 | q.Push(&Node{4}) 63 | q.Push(&Node{5}) 64 | q.Push(&Node{6}) 65 | fmt.Println(q.Pop(), q.Pop(), q.Pop()) 66 | } 67 | -------------------------------------------------------------------------------- /search_bdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | import os 5 | import sys 6 | import re 7 | from optparse import OptionParser 8 | import time 9 | from bsddb3 import db 10 | 11 | if __name__ == '__main__': 12 | 13 | parser = OptionParser() 14 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 15 | parser.add_option("-d", "--dir", dest="dir",help="home directory", metavar="DIR") 16 | parser.add_option("-b", "--bdb", dest="bdbfile",help="bdb file name", metavar="BDB") 17 | (options, args) = parser.parse_args() 18 | 19 | if options.verbose == 1 : VERBOSE = 1 20 | 21 | dir_path = options.dir 22 | if dir_path == None : 23 | parser.print_help() 24 | sys.exit(1) 25 | 26 | bdb_file = options.bdbfile 27 | if bdb_file == None : 28 | parser.print_help() 29 | sys.exit(1) 30 | 31 | dbenv = db.DBEnv() 32 | if dbenv.open(dir_path, db.DB_CREATE | db.DB_INIT_MPOOL) : 33 | sys.stderr.write("DBEnv.open() fail\n") 34 | sys.exit(1) 35 | d = db.DB(dbenv) 36 | if d.open(bdb_file, db.DB_BTREE, db.DB_RDONLY) : 37 | sys.stderr.write("DB.open() fail\n") 38 | sys.exit(1) 39 | 40 | startTime = time.time() 41 | 42 | linecount = 0 43 | while 1 : 44 | try : line = sys.stdin.readline() 45 | except KeyboardInterrupt : break 46 | if not line : break 47 | try : line = line.strip() 48 | except : continue 49 | if not line : continue 50 | linecount += 1 51 | if linecount % 1000 == 0 : 52 | sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n") 53 | 54 | key,value = line.split('\t',1) 55 | if not key or not value : continue 56 | 57 | v = d.get(key) 58 | if v : 59 | print v 60 | 61 | durationTime = time.time() - startTime 62 | sys.stderr.write("duration time = %f\n" % durationTime) 63 | 64 | d.close() 65 | dbenv.close() 66 | -------------------------------------------------------------------------------- /search_leveldb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | import os 5 | import sys 6 | import re 7 | from optparse import OptionParser 8 | import time 9 | import leveldb 10 | 11 | if __name__ == '__main__': 12 | 13 | parser = OptionParser() 14 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 15 | parser.add_option("-d", "--db", dest="dbdir",help="db dir path", metavar="DB") 16 | (options, args) = parser.parse_args() 17 | 18 | if options.verbose == 1 : VERBOSE = 1 19 | 20 | db_dir = options.dbdir 21 | if db_dir == None : 22 | parser.print_help() 23 | sys.exit(1) 24 | 25 | db = leveldb.LevelDB(db_dir) 26 | lock_file = db_dir + '/LOCK' 27 | if os.path.exists(lock_file) : 28 | try : os.remove(lock_file) 29 | except OSError : 30 | sys.stderr.write("remove lock file(%s) fail\n" % (lock_file)) 31 | sys.exit(1) 32 | 33 | startTime = time.time() 34 | 35 | linecount = 0 36 | while 1 : 37 | try : line = sys.stdin.readline() 38 | except KeyboardInterrupt : break 39 | if not line : break 40 | try 
: line = line.strip() 41 | except : continue 42 | if not line : continue 43 | linecount += 1 44 | if linecount % 1000 == 0 : 45 | sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n") 46 | 47 | key,value = line.split('\t',1) 48 | if not key or not value : continue 49 | 50 | ret = db.Get(key) 51 | if ret : print ret 52 | 53 | durationTime = time.time() - startTime 54 | sys.stderr.write("duration time = %f\n" % durationTime) 55 | -------------------------------------------------------------------------------- /search_lmdb.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <errno.h> 4 | #include <string.h> 5 | #include <ctype.h> 6 | #include <unistd.h> 7 | #include <stdint.h> 8 | #include <sys/time.h> 9 | 10 | #include "lmdb.h" 11 | 12 | #define LINE_SIZE 10240 13 | 14 | int main(int argc, char *argv[]) 15 | { 16 | int size; 17 | char string[LINE_SIZE+1]; 18 | char s_key[LINE_SIZE+1]; 19 | char s_value[LINE_SIZE+1]; 20 | char* token; 21 | char* save; 22 | int cnt_line; 23 | 24 | int rc; 25 | MDB_env* env; 26 | MDB_txn* txn; 27 | MDB_cursor* mc; 28 | MDB_dbi dbi; 29 | MDB_val key, data; 30 | char* envname; 31 | int envflags=0; 32 | char* subname; 33 | char* prog = argv[0]; 34 | size_t map_size = (SIZE_MAX / (1024*1024*1024) / 4)*6; // 4GiB * 6 = 24GiB on a 64-bit size_t 35 | 36 | struct timeval tv1, tv2; 37 | 38 | if(argc != 3) { 39 | fprintf(stderr,"%s <envname> <subname>\n",prog); 40 | exit(1); 41 | } 42 | 43 | envflags = MDB_NOSUBDIR | MDB_NOLOCK; 44 | envname = argv[1]; 45 | rc = mdb_env_create(&env); 46 | if(rc) { 47 | fprintf(stderr, "mdb_env_create failed, error %d %s\n", rc, mdb_strerror(rc)); 48 | return EXIT_FAILURE; 49 | } 50 | mdb_env_set_maxdbs(env, 2); 51 | mdb_env_set_mapsize(env, map_size); 52 | rc = mdb_env_open(env, envname, envflags, 0664); 53 | if(rc) { 54 | fprintf(stderr, "mdb_env_open failed, error %d %s\n", rc, mdb_strerror(rc)); 55 | goto env_close; 56 | } 57 | rc = mdb_txn_begin(env, NULL, 0, &txn); 58 | if(rc) { 59 | fprintf(stderr, "mdb_txn_begin failed, error %d %s\n", rc, mdb_strerror(rc)); 60 | goto env_close; 61 | } 62 | subname = argv[2]; 63 | rc = mdb_open(txn, subname, MDB_CREATE, &dbi); 64 | if (rc) { 65 | fprintf(stderr, "mdb_open failed, error %d %s\n", rc, mdb_strerror(rc)); 66 | goto txn_abort; 67 | } 68 | rc = mdb_cursor_open(txn, dbi, &mc); 69 | if (rc) { 70 | fprintf(stderr, "mdb_cursor_open failed, error %d %s\n", rc, mdb_strerror(rc)); 71 | goto txn_abort; 72 | } 73 | 74 | gettimeofday(&tv1, NULL); 75 | 76 | cnt_line = 0; 77 | while(fgets(string, LINE_SIZE, stdin) != NULL) { 78 | size = strlen(string); 79 | if(string[size-1] == '\n'){ 80 | string[size-1] = '\0'; 81 | --size; 82 | } 83 | if(size > 1 && string[size-1] == '\r'){ 84 | string[size-1] = '\0'; 85 | --size; 86 | } 87 | if(string[0] == '\0') 88 | continue; 89 | 90 | if(cnt_line % 10000 == 0) 91 | fprintf(stderr,"[linecount]\t%d\n",cnt_line); 92 | 93 | /* 94 | token = strtok_r(string, "\t", &save); 95 | if(token != NULL) { 96 | strcpy(s_key, token); 97 | token = strtok_r(NULL, "\t", &save); 98 | if(token != NULL) { 99 | strcpy(s_value, token); 100 | } else continue; 101 | } else continue; 102 | */ 103 | token = strtok_r(string, "\t", &save); 104 | if(token != NULL) { 105 | strcpy(s_key, token); 106 | } else continue; 107 | 108 | key.mv_data = s_key; 109 | key.mv_size = strlen(s_key) + 1; 110 | 111 | rc = mdb_get(txn, dbi, &key, &data); 112 | if(!rc) { 113 | fprintf(stdout, "%s\t%s\n", s_key, (char*)data.mv_data); 114 | } 115 | 116 | cnt_line++; 117 | } 118 | gettimeofday(&tv2, NULL); 119 | fprintf(stderr, "<-end > : 
t2.sec = %d t2.usec = %d\n",(int)tv2.tv_sec,(int)tv2.tv_usec); 120 | fprintf(stderr, "<+time> : sec = %d usec = %d\n",(int)(tv2.tv_sec-tv1.tv_sec),(int)(tv2.tv_usec-tv1.tv_usec)); /* note: the usec diff alone may be negative; the sec and usec diffs together give the elapsed time */ 121 | 122 | rc = mdb_txn_commit(txn); 123 | txn = NULL; 124 | if(rc) { 125 | fprintf(stderr, "%s: txn_commit fail: %s\n", prog, mdb_strerror(rc)); 126 | goto env_close; 127 | } 128 | mdb_dbi_close(env, dbi); 129 | 130 | txn_abort: 131 | mdb_txn_abort(txn); 132 | env_close: 133 | mdb_env_close(env); 134 | 135 | return rc ? EXIT_FAILURE : EXIT_SUCCESS; 136 | 137 | } 138 | -------------------------------------------------------------------------------- /search_lmdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | import os 5 | import sys 6 | import re 7 | from optparse import OptionParser 8 | import time 9 | import lmdb 10 | 11 | if __name__ == '__main__': 12 | 13 | parser = OptionParser() 14 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 15 | parser.add_option("-d", "--db", dest="dbpath",help="db path", metavar="DB") 16 | (options, args) = parser.parse_args() 17 | 18 | if options.verbose == 1 : VERBOSE = 1 19 | 20 | db_path = options.dbpath 21 | if db_path == None : 22 | parser.print_help() 23 | sys.exit(1) 24 | 25 | 26 | # env == db because max_dbs=0 27 | env = lmdb.Environment(db_path,map_size=24*(1024**3),subdir=False,readonly=True,create=False,max_dbs=0,lock=False) 28 | txn = lmdb.Transaction(env,db=None,write=False) 29 | 30 | startTime = time.time() 31 | 32 | linecount = 0 33 | while 1 : 34 | try : line = sys.stdin.readline() 35 | except KeyboardInterrupt : break 36 | if not line : break 37 | try : line = line.strip() 38 | except : continue 39 | if not line : continue 40 | linecount += 1 41 | if linecount % 1000 == 0 : 42 | sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n") 43 | 44 | key,value = line.split('\t',1) 45 | if not key or not value : continue 46 | 47 | ret = txn.get(key,default=None) 48 | if ret : 49 | print ret 50 | 51 | durationTime = time.time() - startTime 52 | sys.stderr.write("duration time = %f\n" % durationTime) 53 | 54 | txn.abort() 55 | env.close() 56 | -------------------------------------------------------------------------------- /search_word2vec.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | reload(sys) 4 | sys.setdefaultencoding('utf-8') 5 | import re 6 | from optparse import OptionParser 7 | import time 8 | from gensim.models import word2vec,phrases 9 | import logging 10 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 11 | 12 | def load_model(model_path) : 13 | model = word2vec.Word2Vec.load(model_path) 14 | return model 15 | 16 | ''' 17 | python2.7 search_word2vec.py -m corpus.txt.model 18 | ''' 19 | if __name__ == '__main__': 20 | 21 | parser = OptionParser() 22 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 23 | parser.add_option("-m", "--model", dest="model",help="model path, input file", metavar="MODEL") 24 | (options, args) = parser.parse_args() 25 | 26 | if options.verbose == 1 : VERBOSE = 1 27 | 28 | model_path = options.model 29 | if model_path == None : 30 | parser.print_help() 31 | sys.exit(1) 32 | 33 | model = load_model(model_path) 34 | 35 | linecount = 0 36 | while 1 : 37 | try : line = sys.stdin.readline() 38 | except KeyboardInterrupt : break
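        # (added note) most_similar below takes the mean of the (normalized)
        # vectors of the positive tokens and ranks the vocabulary by cosine
        # similarity against that mean, so a multi-token query behaves like a
        # soft conjunction of its tokens.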
39 | if not line : break 40 | try : line = line.strip() 41 | except : continue 42 | if not line : continue 43 | linecount += 1 44 | if linecount % 1000 == 0 : 45 | sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n") 46 | 47 | # convert to unicode 48 | line_unicode = line.decode('utf-8') 49 | tokens = [] 50 | for token in line_unicode.split() : 51 | if token in model : tokens.append(token) 52 | if len(tokens) >= 1 : 53 | ret = model.most_similar(positive=tokens) 54 | for word,sim in ret : 55 | print word + "\t" + str(sim) 56 | else : 57 | print "not in vocab" 58 | print "==================================" 59 | -------------------------------------------------------------------------------- /similarity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | ''' 5 | read http://radimrehurek.com/gensim/tut3.html 6 | here is test code 7 | ''' 8 | 9 | import os 10 | import sys 11 | reload(sys) 12 | sys.setdefaultencoding('utf-8') 13 | import re 14 | from optparse import OptionParser 15 | from gensim import corpora, models, similarities, matutils 16 | import logging 17 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 18 | 19 | def construct_dictionary(documents_path, filter=None) : 20 | # collect statistics about all tokens 21 | dictionary = corpora.Dictionary(line.lower().split() for line in open(documents_path)) 22 | 23 | if filter : 24 | # remove stop words and words that appear only once 25 | stoplist = set('for a of the and to in'.split()) 26 | stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id] 27 | once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1] 28 | dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once 29 | dictionary.compactify() # remove gaps in id sequence after words that were removed 30 | 31 | return dictionary 32 | 33 | def save_dictionary(dictionary, dictionary_path) : 34 | dictionary.save(dictionary_path) 35 | 36 | def load_dictionary(dictionary_path) : 37 | dictionary = corpora.Dictionary().load(dictionary_path,mmap='r') 38 | return dictionary 39 | 40 | def save_corpus(corpus, corpus_path, format=None) : 41 | if format == 'svmlight' : # Joachim’s SVMlight format 42 | corpora.SvmLightCorpus.serialize(corpus_path, corpus) 43 | if format == 'lda-c' : # Blei’s LDA-C format 44 | corpora.BleiCorpus.serialize(corpus_path, corpus) 45 | if format == 'low' : # GibbsLDA++ format 46 | corpora.LowCorpus.serialize(corpus_path, corpus) 47 | if not format : # Matrix Market format 48 | corpora.MmCorpus.serialize(corpus_path, corpus) 49 | 50 | def load_corpus(corpus_path) : 51 | corpus = corpora.MmCorpus(corpus_path) 52 | return corpus 53 | 54 | def corpus_to_tfidf(corpus) : 55 | tfidf = models.TfidfModel(corpus, normalize=True) # step 1 -- initialize a model 56 | ''' 57 | corpus_tfidf = tfidf[corpus] 58 | for doc in corpus_tfidf: 59 | print doc 60 | ''' 61 | return tfidf 62 | 63 | def save_tfidf(tfidf, tfidf_path) : 64 | tfidf.save(tfidf_path) 65 | 66 | def load_tfidf(tfidf_path) : 67 | tfidf = models.TfidfModel.load(tfidf_path) 68 | return tfidf 69 | 70 | def corpus_to_lsi(corpus, tfidf, dictionary, topic_number) : 71 | corpus_tfidf = tfidf[corpus] 72 | lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topic_number) # initialize an LSI transformation 73 | ''' 74 | corpus_lsi = lsi[corpus_tfidf] # create 
a double wrapper over the original corpus: bow->tfidf->fold-in-lsi 75 | lsi.print_topics(3) 76 | for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly 77 | print doc 78 | ''' 79 | return lsi 80 | 81 | def save_lsi(lsi, lsi_path) : 82 | lsi.save(lsi_path) 83 | 84 | def load_lsi(lsi_path) : 85 | lsi = models.LsiModel.load(lsi_path) 86 | return lsi 87 | 88 | def corpus_to_lda(corpus, dictionary, topic_number) : 89 | model = models.LdaModel(corpus, id2word=dictionary, num_topics=topic_number) 90 | return model 91 | 92 | def save_lda(lda, lda_path) : 93 | lda.save(lda_path) 94 | 95 | def load_lda(lda_path) : 96 | lda = models.LdaModel.load(lda_path) 97 | return lda 98 | 99 | def corpus_to_simmat(corpus, model) : 100 | simmat = similarities.MatrixSimilarity(model[corpus]) 101 | return simmat 102 | 103 | def save_simmat(simmat, simmat_path) : 104 | simmat.save(simmat_path) 105 | 106 | def load_simmat(simmat_path) : 107 | simmat = similarities.MatrixSimilarity.load(simmat_path) 108 | return simmat 109 | 110 | ''' 111 | python2.7 similarity.py --dictionary=document.txt.dict --corpus=document.txt.mm --tfidf=document.txt.tfidf --lsi=document.txt.lsi --lda=document.txt.lda --simmat=document.txt.simmat 112 | ''' 113 | if __name__ == '__main__': 114 | 115 | parser = OptionParser() 116 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 117 | parser.add_option("-d", "--dictionary", dest="dictionary",help="dictionary", metavar="DICT") 118 | parser.add_option("-c", "--corpus", dest="corpus",help="corpus", metavar="CORPUS") 119 | parser.add_option("-t", "--tfidf", dest="tfidf",help="tfidf", metavar="TFIDF") 120 | parser.add_option("-l", "--lsi", dest="lsi",help="lsi", metavar="LSI") 121 | parser.add_option("-a", "--lda", dest="lda",help="lda", metavar="LDA") 122 | parser.add_option("-s", "--simmat", dest="simmat",help="similarity matrix, output file", metavar="SIMMAT") 123 | (options, args) = parser.parse_args() 124 | 125 | if options.verbose == 1 : VERBOSE = 1 126 | 127 | dictionary_path = options.dictionary 128 | if dictionary_path == None : 129 | parser.print_help() 130 | sys.exit(1) 131 | 132 | corpus_path = options.corpus 133 | if corpus_path == None : 134 | parser.print_help() 135 | sys.exit(1) 136 | 137 | tfidf_path = options.tfidf 138 | if tfidf_path == None : 139 | parser.print_help() 140 | sys.exit(1) 141 | 142 | lsi_path = options.lsi 143 | if lsi_path == None : 144 | parser.print_help() 145 | sys.exit(1) 146 | 147 | lda_path = options.lda 148 | if lda_path == None : 149 | parser.print_help() 150 | sys.exit(1) 151 | 152 | simmat_path = options.simmat 153 | if simmat_path == None : 154 | parser.print_help() 155 | sys.exit(1) 156 | 157 | dictionary = load_dictionary(dictionary_path) 158 | corpus = load_corpus(corpus_path) 159 | tfidf = load_tfidf(tfidf_path) 160 | lsi = load_lsi(lsi_path) 161 | lda = load_lda(lda_path) 162 | 163 | simmat = corpus_to_simmat(corpus, tfidf) 164 | save_simmat(simmat, simmat_path) 165 | simmat = load_simmat(simmat_path) 166 | 167 | linecount = 0 168 | while 1 : 169 | try : line = sys.stdin.readline() 170 | except KeyboardInterrupt : break 171 | if not line : break 172 | try : line = line.strip() 173 | except : continue 174 | if not line : continue 175 | linecount += 1 176 | if linecount % 1000 == 0 : 177 | sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n") 178 | vector = dictionary.doc2bow(line.lower().split()) 179 | vec_tfidf = tfidf[vector] 180 | 
vec_lsi = lsi[vector] 181 | vec_lda = lda[vector] 182 | 183 | sims = simmat[vec_tfidf] # perform a similarity query against the corpus 184 | sims = sorted(enumerate(sims), key=lambda item: -item[1]) 185 | idx = 0 186 | for docid, similarity in sims : 187 | if idx >= 5 : break 188 | output = [str(similarity)] 189 | for termid, freq in corpus[docid] : 190 | term = dictionary.get(termid) 191 | output.append(term + "/" + str(freq)) 192 | print "\t".join(output) 193 | idx += 1 194 | -------------------------------------------------------------------------------- /stack.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | type Node struct { 8 | Value int 9 | } 10 | 11 | /* 12 | func (n *Node) String() string { 13 | return fmt.Sprint(n.Value) 14 | } 15 | */ 16 | 17 | // NewStack returns a new stack. 18 | func NewStack() *Stack { 19 | return &Stack{} 20 | } 21 | 22 | // Stack is a basic LIFO stack that resizes as needed. 23 | type Stack struct { 24 | nodes []*Node 25 | count int 26 | } 27 | 28 | // Push adds a node to the stack. 29 | func (s *Stack) Push(n *Node) { 30 | s.nodes = append(s.nodes[:s.count], n) 31 | s.count++ 32 | } 33 | 34 | // Pop removes and returns a node from the stack in last to first order. 35 | func (s *Stack) Pop() *Node { 36 | if s.count == 0 { 37 | return nil 38 | } 39 | s.count-- 40 | return s.nodes[s.count] 41 | } 42 | 43 | func main() { 44 | s := NewStack() 45 | s.Push(&Node{1}) 46 | s.Push(&Node{2}) 47 | s.Push(&Node{3}) 48 | fmt.Println(s.Pop(), s.Pop(), s.Pop()) 49 | } 50 | -------------------------------------------------------------------------------- /test_numpy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | #-*- coding: utf8 -*- 3 | 4 | import os 5 | import sys 6 | import re 7 | from optparse import OptionParser 8 | 9 | import theano 10 | import theano.tensor as T 11 | import numpy as np 12 | import scipy 13 | 14 | 15 | # --verbose 16 | VERBOSE = 0 17 | 18 | def open_file(filename, mode) : 19 | try : fid = open(filename, mode) 20 | except : 21 | sys.stderr.write("open_file(), file open error : %s\n" % (filename)) 22 | exit(1) 23 | else : 24 | return fid 25 | 26 | def close_file(fid) : 27 | fid.close() 28 | 29 | def type_test() : 30 | m = np.asarray([[1., 2], [3, 4], [5, 6]]) 31 | print m 32 | print m.shape # shape is tuple (3,2) 33 | print m[2,0] 34 | 35 | x = np.float32(1.0) 36 | print x 37 | y = np.int_([1,2,4]) 38 | print y 39 | z = np.array([1,2,3], dtype=np.int8) 40 | print z 41 | print z.dtype 42 | z = np.float16(z) 43 | print z 44 | print z.dtype 45 | z = z.astype(np.int_) # or z.astype(int) 46 | print z 47 | print z.dtype 48 | print np.issubdtype(z.dtype,float) 49 | 50 | def array_test() : 51 | x = np.array([2, 3, 1, 0]) 52 | print x 53 | x = np.array([[1,2.0],[0,0],(1+1j,3.)]) 54 | print x 55 | x = np.array([[ 1.+0.j, 2.+0.j], [ 0.+0.j, 0.+0.j], [ 1.+1.j, 3.+0.j]]) 56 | print x 57 | x = np.zeros((2, 3)) 58 | print x 59 | x = np.ones((2, 3)) 60 | print x 61 | print np.arange(10) 62 | print np.arange(2, 10, dtype=np.float) 63 | print np.arange(2, 3, 0.1) 64 | print np.linspace(1., 4., 6) 65 | print np.indices((3,3)) 66 | 67 | if __name__ == '__main__': 68 | 69 | parser = OptionParser() 70 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 71 | (options, args) = parser.parse_args() 72 | 73 | if options.verbose == 1 : VERBOSE = 1 74 | 75 | type_test() 
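    # (added note) in type_test above, astype(np.int_) truncates toward zero
    # rather than rounding: np.float16([1.7, -1.7]).astype(np.int_) -> [1, -1].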
76 | array_test() 77 | -------------------------------------------------------------------------------- /test_theano.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | #-*- coding: utf8 -*- 3 | 4 | import os 5 | import sys 6 | import re 7 | from optparse import OptionParser 8 | 9 | import cPickle, gzip 10 | import theano 11 | import theano.tensor as T 12 | import numpy as np 13 | import scipy 14 | 15 | 16 | # --verbose 17 | VERBOSE = 0 18 | 19 | def open_file(filename, mode) : 20 | try : fid = open(filename, mode) 21 | except : 22 | sys.stderr.write("open_file(), file open error : %s\n" % (filename)) 23 | exit(1) 24 | else : 25 | return fid 26 | 27 | def close_file(fid) : 28 | fid.close() 29 | 30 | def shared_dataset(data_xy): 31 | data_x, data_y = data_xy 32 | shared_x = theano.shared(np.asarray(data_x, dtype=theano.config.floatX)) 33 | shared_y = theano.shared(np.asarray(data_y, dtype=theano.config.floatX)) 34 | return shared_x, T.cast(shared_y, 'int32') 35 | 36 | if __name__ == '__main__': 37 | 38 | parser = OptionParser() 39 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 40 | (options, args) = parser.parse_args() 41 | 42 | if options.verbose == 1 : VERBOSE = 1 43 | 44 | f = gzip.open('mnist.pkl.gz', 'rb') 45 | train_set, valid_set, test_set = cPickle.load(f) 46 | f.close() 47 | 48 | test_set_x, test_set_y = shared_dataset(test_set) 49 | valid_set_x, valid_set_y = shared_dataset(valid_set) 50 | train_set_x, train_set_y = shared_dataset(train_set) 51 | batch_size = 500 # size of the minibatch 52 | 53 | # accessing the third minibatch of the training set 54 | data = train_set_x[2 * batch_size: 3 * batch_size] 55 | label = train_set_y[2 * batch_size: 3 * batch_size] 56 | -------------------------------------------------------------------------------- /transform.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | ''' 5 | read http://radimrehurek.com/gensim/tut2.html 6 | here is test code 7 | ''' 8 | 9 | import os 10 | import sys 11 | reload(sys) 12 | sys.setdefaultencoding('utf-8') 13 | import re 14 | from optparse import OptionParser 15 | from gensim import corpora, models, similarities, matutils 16 | import logging 17 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 18 | 19 | def construct_dictionary(documents_path, filter=None) : 20 | # collect statistics about all tokens 21 | dictionary = corpora.Dictionary(line.lower().split() for line in open(documents_path)) 22 | 23 | if filter : 24 | # remove stop words and words that appear only once 25 | stoplist = set('for a of the and to in'.split()) 26 | stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id] 27 | once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1] 28 | dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once 29 | dictionary.compactify() # remove gaps in id sequence after words that were removed 30 | 31 | return dictionary 32 | 33 | def save_dictionary(dictionary, dictionary_path) : 34 | dictionary.save(dictionary_path) 35 | 36 | def load_dictionary(dictionary_path) : 37 | dictionary = corpora.Dictionary().load(dictionary_path,mmap='r') 38 | return dictionary 39 | 40 | def save_corpus(corpus, corpus_path, format=None) : 41 | if format == 'svmlight' : # Joachim’s SVMlight
format 42 | corpora.SvmLightCorpus.serialize(corpus_path, corpus) 43 | if format == 'lda-c' : # Blei’s LDA-C format 44 | corpora.BleiCorpus.serialize(corpus_path, corpus) 45 | if format == 'low' : # GibbsLDA++ format 46 | corpora.LowCorpus.serialize(corpus_path, corpus) 47 | if not format : # Matrix Market format 48 | corpora.MmCorpus.serialize(corpus_path, corpus) 49 | 50 | def load_corpus(corpus_path) : 51 | corpus = corpora.MmCorpus(corpus_path) 52 | return corpus 53 | 54 | def corpus_to_tfidf(corpus) : 55 | tfidf = models.TfidfModel(corpus, normalize=True) # step 1 -- initialize a model 56 | ''' 57 | corpus_tfidf = tfidf[corpus] 58 | for doc in corpus_tfidf: 59 | print doc 60 | ''' 61 | return tfidf 62 | 63 | def save_tfidf(tfidf, tfidf_path) : 64 | tfidf.save(tfidf_path) 65 | 66 | def load_tfidf(tfidf_path) : 67 | tfidf = models.TfidfModel.load(tfidf_path) 68 | return tfidf 69 | 70 | def corpus_to_lsi(corpus, tfidf, dictionary, topic_number) : 71 | corpus_tfidf = tfidf[corpus] 72 | lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topic_number) # initialize an LSI transformation 73 | ''' 74 | corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi 75 | lsi.print_topics(3) 76 | for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly 77 | print doc 78 | ''' 79 | return lsi 80 | 81 | def save_lsi(lsi, lsi_path) : 82 | lsi.save(lsi_path) 83 | 84 | def load_lsi(lsi_path) : 85 | lsi = models.LsiModel.load(lsi_path) 86 | return lsi 87 | 88 | def corpus_to_lda(corpus, dictionary, topic_number) : 89 | model = models.LdaModel(corpus, id2word=dictionary, num_topics=topic_number) 90 | return model 91 | 92 | def save_lda(lda, lda_path) : 93 | lda.save(lda_path) 94 | 95 | def load_lda(lda_path) : 96 | lda = models.LdaModel.load(lda_path) 97 | return lda 98 | 99 | ''' 100 | python2.7 transform.py --dictionary=document.txt.dict --corpus=document.txt.mm --tfidf=document.txt.tfidf --lsi=document.txt.lsi --lda=document.txt.lda 101 | ''' 102 | if __name__ == '__main__': 103 | 104 | parser = OptionParser() 105 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 106 | parser.add_option("-d", "--dictionary", dest="dictionary",help="dictionary", metavar="DICT") 107 | parser.add_option("-c", "--corpus", dest="corpus",help="corpus", metavar="CORPUS") 108 | parser.add_option("-t", "--tfidf", dest="tfidf",help="tfidf, output file", metavar="TFIDF") 109 | parser.add_option("-l", "--lsi", dest="lsi",help="lsi, output file", metavar="LSI") 110 | parser.add_option("-a", "--lda", dest="lda",help="lda, output file", metavar="LDA") 111 | (options, args) = parser.parse_args() 112 | 113 | if options.verbose == 1 : VERBOSE = 1 114 | 115 | dictionary_path = options.dictionary 116 | if dictionary_path == None : 117 | parser.print_help() 118 | sys.exit(1) 119 | 120 | corpus_path = options.corpus 121 | if corpus_path == None : 122 | parser.print_help() 123 | sys.exit(1) 124 | 125 | tfidf_path = options.tfidf 126 | if tfidf_path == None : 127 | parser.print_help() 128 | sys.exit(1) 129 | 130 | lsi_path = options.lsi 131 | if lsi_path == None : 132 | parser.print_help() 133 | sys.exit(1) 134 | 135 | lda_path = options.lda 136 | if lda_path == None : 137 | parser.print_help() 138 | sys.exit(1) 139 | 140 | dictionary = load_dictionary(dictionary_path) 141 | corpus = load_corpus(corpus_path) 142 | 143 | tfidf = corpus_to_tfidf(corpus) 144 | save_tfidf(tfidf, 
tfidf_path) 145 | 146 | lsi = corpus_to_lsi(corpus, tfidf, dictionary, 10) 147 | save_lsi(lsi, lsi_path) 148 | 149 | lda = corpus_to_lda(corpus, dictionary, 10) 150 | save_lda(lda, lda_path) 151 | -------------------------------------------------------------------------------- /wordcount_spark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | import os 5 | import sys 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') 8 | from optparse import OptionParser 9 | 10 | from pyspark import SparkContext 11 | 12 | VERBOSE = 0 13 | 14 | def open_file(filename, mode) : 15 | try : fid = open(filename, mode) 16 | except : 17 | sys.stderr.write("open_file(), file open error : %s\n" % (filename)) 18 | exit(1) 19 | else : 20 | return fid 21 | 22 | def close_file(fid) : 23 | fid.close() 24 | 25 | def map_func(line) : 26 | words = line.split(' ') 27 | return map(lambda x: (x, 1), words) 28 | 29 | def reduce_func(a,b) : 30 | return a+b 31 | 32 | def map_func2(entry) : 33 | key,value = entry 34 | return (key,reduce(lambda a,b: a+b,value)) 35 | 36 | ''' 37 | usage : spark-submit --master yarn-client --total-executor-cores 100 --executor-memory 512M wordcount_spark.py -f input_file_on_hdfs 38 | ''' 39 | if __name__ == "__main__": 40 | parser = OptionParser() 41 | parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode") 42 | parser.add_option("-f", "--file", dest="file",help="file path in HDFS", metavar="FILE") 43 | (options, args) = parser.parse_args() 44 | 45 | if options.verbose == 1 : VERBOSE = 1 46 | 47 | file_path = options.file 48 | if file_path == None : 49 | parser.print_help() 50 | sys.exit(1) 51 | 52 | sc = SparkContext(appName="PythonWordCount") 53 | 54 | ''' 55 | # read from hdfs directory 56 | lines = sc.wholeTextFiles(file_path, 1) 57 | counts = lines.values().flatMap(lambda x: x.split(' ')) \ 58 | .map(lambda x: (x, 1)) \ 59 | .reduceByKey(lambda a, b: a + b) \ 60 | .sortBy(lambda x: x[1],ascending=False) 61 | counts.saveAsHadoopFile("gensim/output","org.apache.hadoop.mapred.TextOutputFormat") 62 | ''' 63 | 64 | lines = sc.textFile(file_path, 1) 65 | 66 | # save to hdfs 67 | counts = lines.flatMap(lambda x: x.split(' ')) \ 68 | .map(lambda x: (x, 1)) \ 69 | .reduceByKey(lambda a, b: a + b) \ 70 | .sortBy(lambda x: x[1],ascending=False) 71 | counts.saveAsHadoopFile("gensim/output","org.apache.hadoop.mapred.TextOutputFormat") 72 | 73 | ''' 74 | lines = sc.textFile(file_path, 1) 75 | # user defined map,reduce 76 | # map : string -> [(a,1),(b,1),..],[(a,1),(c,1),...],.... 77 | # flatMap : list of list -> [(a,1),(b,1),....,(a,1),(c,1),....] 78 | # reduceByKey : group by key -> [(a,(1,1,1,....)),(b,(1,1,1)),(c,(1,1,1,1,...)),...] 79 | # : reduce value list -> [(a,10),(b,3),(c,17),....] 80 | # sortBy : [(a,10),(b,3),(c,17),....] -> [(c,17),(a,10),(b,3),....]
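# (added note) reduceByKey here, as in the live pipeline above, does a map-side
# combine: each partition first merges its own values per key, so only one
# partial count per key is shuffled. The groupByKey variant further below ships
# every (word, 1) pair across the network before map_func2 sums the grouped
# values, which is why reduceByKey is the cheaper of the two for word counting.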
81 | counts = lines.map(map_func) \ 82 | .flatMap(lambda x: x) \ 83 | .reduceByKey(reduce_func) \ 84 | .sortBy(lambda x: x[1],ascending=False) 85 | counts.saveAsHadoopFile("gensim/output","org.apache.hadoop.mapred.TextOutputFormat") 86 | ''' 87 | 88 | ''' 89 | lines = sc.textFile(file_path, 1) 90 | # user defined map,reduce 91 | counts = lines.map(map_func) \ 92 | .flatMap(lambda x: x) \ 93 | .groupByKey() \ 94 | .map(map_func2) \ 95 | .sortBy(lambda x: x[1],ascending=False) 96 | output = counts.collect() 97 | for key,value in output : 98 | print key + "\t" + str(value) 99 | ''' 100 | 101 | ''' 102 | lines = sc.textFile(file_path, 1) 103 | # save to local 104 | counts = lines.flatMap(lambda x: x.split(' ')) \ 105 | .map(lambda x: (x, 1)) \ 106 | .reduceByKey(lambda a, b: a + b) 107 | output = counts.collect() 108 | fd = open_file("output.txt",'w') 109 | for (word, count) in output: 110 | fd.write("%s\t%s\n" % (word,count)) 111 | close_file(fd) 112 | ''' 113 | 114 | ''' 115 | lines = sc.textFile(file_path, 1) 116 | # test groupByKey 117 | group = lines.flatMap(lambda x: x.split(' ')).map(lambda x: (x, 1)).groupByKey() 118 | output = group.collect() 119 | for (word,count_list) in output : 120 | print word + "\t" + ','.join(map(lambda x: str(x),count_list)) 121 | ''' 122 | 123 | sc.stop() 124 | --------------------------------------------------------------------------------