├── src
│   ├── Makefile
│   ├── biterm.h
│   ├── infer.h
│   ├── doc.h
│   ├── sampler.h
│   ├── str_util.h
│   ├── model.h
│   ├── main.cpp
│   ├── model.cpp
│   ├── infer.cpp
│   ├── pvec.h
│   └── pmat.h
├── data
│   ├── tag_title_translate.py
│   └── tag_title.txt
├── script
│   ├── runExample.sh
│   ├── indexDocs.py
│   └── topicDisplay.py
├── README.md
└── LICENSE.txt

--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
CC=g++
CFLAGS=-c -pipe -O3
MODEL_OBJ=model.o infer.o main.o
MODEL_EXE=btm

all: $(MODEL_EXE)

$(MODEL_EXE): $(MODEL_OBJ)
	$(CC) $(MODEL_OBJ) -o $@

main.o: model.h
model.o: model.h biterm.h doc.h
infer.o: infer.h doc.h

%.o: %.cpp
	$(CC) $(INCLUDE) $(CFLAGS) $< -o $@

clean:
	rm -rf $(MODEL_OBJ) $(MODEL_EXE)

--------------------------------------------------------------------------------
/data/tag_title_translate.py:
--------------------------------------------------------------------------------
# Map the tags and words in tag_title.txt to integer ids.
tags = {}
voca = {}

wf = open('tag_title_ids.txt', 'w')

for l in open('tag_title.txt'):
    tag, ws = l.strip().split('\t')[1].split('|')
    ws = ws.split()
    if tag not in tags:
        tags[tag] = len(tags)

    for w in ws:
        if w not in voca:
            voca[w] = len(voca)

    s = ' '.join([str(voca[i]) for i in ws])
    wf.write('%d\t%s\n' % (tags[tag], s))
wf.close()

wf = open('voca.txt', 'w')
wf.write(''.join(['%d\t%s\n' % (v, k) for k, v in sorted(voca.items(), key=lambda d: d[1])]))
wf.close()

wf = open('tags.txt', 'w')
wf.write(''.join(['%d\t%s\n' % (v, k) for k, v in sorted(tags.items(), key=lambda d: d[1])]))
wf.close()
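For reference, tag_title_translate.py expects each line of tag_title.txt to hold a tab-separated record whose second field has the form "tag|title words". A hypothetical input line and the outputs it would produce on empty dictionaries (the sample text and id below are made up for illustration):

    # tag_title.txt (hypothetical input line)
    12345	music|i love this song
    # tag_title_ids.txt (resulting line)
    0	0 1 2 3
    # voca.txt would then contain, one pair per line: 0 i, 1 love, 2 this, 3 song
    # tags.txt would contain: 0 music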
--------------------------------------------------------------------------------
/src/biterm.h:
--------------------------------------------------------------------------------
#ifndef _BITERM_H
#define _BITERM_H

#include <iostream>
#include <string>
#include <sstream>
#include <algorithm>

using namespace std;

class Biterm {
 private:
  int wi;
  int wj;
  int z;      // topic assignment

 public:
  Biterm(int w1, int w2): z(-1) {
    wi = min(w1, w2);
    wj = max(w1, w2);
  }

  // s format: wi wj z
  Biterm(string s) {
    istringstream iss(s);
    iss >> wi >> wj >> z;
  }

  int get_wi() const {return wi;}
  int get_wj() const {return wj;}

  int get_z() const {return z;}
  void set_z(int k) {z = k;}
  void reset_z() {z = -1;}

  string str() const {
    ostringstream os;
    os << wi << '\t' << wj << '\t' << z;
    return os.str();
  }
};

#endif

--------------------------------------------------------------------------------
/src/infer.h:
--------------------------------------------------------------------------------
#ifndef _INFERLDA_H
#define _INFERLDA_H

#include <string>

#include "pvec.h"
#include "pmat.h"

#include "doc.h"

using namespace std;

class Infer {
 private:
  int K;
  string type;          // inference type

  string dfile;         // docs to be inferred
  Pvec<double> pz;      // p(z) = theta
  Pmat<double> pw_z;    // p(w|z) = phi, size K * W

 private:
  void load_para(string model_dir);

  void doc_infer(const Doc& doc, Pvec<double>& pz_d);
  void doc_infer_sum_b(const Doc& doc, Pvec<double>& pz_d);
  void doc_infer_sum_w(const Doc& doc, Pvec<double>& pz_d);
  void doc_infer_mix(const Doc& doc, Pvec<double>& pz_d);

  // compute the conditional distribution p(z|w, d) with p(w|z) fixed
  void compute_pz_dw(int w, const Pvec<double>& pz_d, Pvec<double>& p);

 public:
  Infer(string type, int K): K(K), type(type) {}

  void run(string docs_pt, string model_dir);
};

#endif

--------------------------------------------------------------------------------
/src/doc.h:
--------------------------------------------------------------------------------
#ifndef _DOC_H
#define _DOC_H

#include <cassert>
#include <string>
#include <vector>
#include <sstream>
#include <algorithm>

#include "biterm.h"

using namespace std;

class Doc {
 private:
  vector<int> ws;   // word sequence

 public:
  Doc(const string& s) {read_doc(s);}

  int size() const {return ws.size();}

  const vector<int>& get_ws() const {return ws;}

  int get_w(int i) const {
    assert(i < (int)ws.size());
    return ws[i];
  }

  /**
   * Extract biterms from a document
   *   `bs`:  the output biterms
   *   `win`: window size for biterm extraction
   */
  void gen_biterms(vector<Biterm>& bs, int win = 15) const {
    if (ws.size() < 2) return;

    for (int i = 0; i < (int)ws.size() - 1; ++i)
      for (int j = i + 1; j < min(i + win, int(ws.size())); ++j)
        bs.push_back( Biterm(ws[i], ws[j]) );
  }

 private:
  void read_doc(const string& s) {
    istringstream iss(s);
    int w;
    while (iss >> w) ws.push_back(w);
  }
};

#endif
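To make the window semantics of gen_biterms concrete, here is a minimal Python mirror of the same extraction (the function and example are illustrative, not part of the toolkit):

    def gen_biterms(ws, win=15):
        """Mirror of Doc::gen_biterms: pair each word with the
        following words inside a sliding window of size `win`."""
        bs = []
        for i in range(len(ws) - 1):
            for j in range(i + 1, min(i + win, len(ws))):
                bs.append((min(ws[i], ws[j]), max(ws[i], ws[j])))
        return bs

    # a 4-word doc yields C(4,2) = 6 biterms when win >= 4
    print(gen_biterms([0, 1, 2, 3]))
    # [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]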
--------------------------------------------------------------------------------
/script/runExample.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# run a toy example for BTM

K=20   # number of topics

alpha=`echo "scale=3;50/$K"|bc`
beta=0.005
niter=5
save_step=501

input_dir=../sample-data/
output_dir=../output/
model_dir=${output_dir}model/
mkdir -p $output_dir/model

# the input docs for training
doc_pt=${input_dir}doc_info.txt

echo "=============== Index Docs ============="
# docs after indexing
dwid_pt=${output_dir}doc_wids.txt
# vocabulary file
voca_pt=${output_dir}voca.txt
python indexDocs.py $doc_pt $dwid_pt $voca_pt

## learn the parameters p(z) and p(w|z)
echo "=============== Topic Learning ============="
W=`wc -l < $voca_pt` # vocabulary size
make -C ../src
echo "../src/btm est $K $W $alpha $beta $niter $save_step $dwid_pt $model_dir"
../src/btm est $K $W $alpha $beta $niter $save_step $dwid_pt $model_dir

## infer p(z|d) for each doc
echo "================ Infer P(z|d)==============="
echo "../src/btm inf sum_b $K $dwid_pt $model_dir"
../src/btm inf sum_b $K $dwid_pt $model_dir

## output the top words of each topic
echo "================ Topic Display ============="
python topicDisplay.py $model_dir $K $voca_pt

--------------------------------------------------------------------------------
/script/indexDocs.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#coding=utf-8
# translate words into ids in documents
import sys

w2id = {}

def indexFile(pt, res_pt):
    print('index file: ' + str(pt))
    wf = open(res_pt, 'w')
    for l in open(pt):
        ws = l.strip().split()
        for w in ws:
            if w not in w2id:
                w2id[w] = len(w2id)

        wids = [w2id[w] for w in ws]
        print(' '.join(map(str, wids)), file=wf)

    print('write file: ' + str(res_pt))


def write_w2id(res_pt):
    print('write:' + str(res_pt))
    wf = open(res_pt, 'w')
    for w, wid in sorted(w2id.items(), key=lambda d: d[1]):
        print('%d\t%s' % (wid, w), file=wf)

if __name__ == '__main__':
    if len(sys.argv) < 4:
        print('Usage: python %s <doc_pt> <dwid_pt> <voca_pt>' % sys.argv[0])
        print('\tdoc_pt   input docs to be indexed, each line is a doc with the format "word word ..."')
        print('\tdwid_pt  output docs after indexing, each line is a doc with the format "wordId wordId ..."')
        print('\tvoca_pt  output vocabulary file, each line is a word with the format "wordId word"')
        exit(1)

    doc_pt = sys.argv[1]
    dwid_pt = sys.argv[2]
    voca_pt = sys.argv[3]
    indexFile(doc_pt, dwid_pt)
    print('n(w)=' + str(len(w2id)))
    write_w2id(voca_pt)

--------------------------------------------------------------------------------
/script/topicDisplay.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#coding=utf-8
# Function: display the topics learned by BTM
# Input: model_dir/k20.pw_z (suppose K=20)

import sys

# return: {wid:w, ...}
def read_voca(pt):
    voca = {}
    for l in open(pt):
        wid, w = l.strip().split('\t')[:2]
        voca[int(wid)] = w
    return voca

def read_pz(pt):
    return [float(p) for p in open(pt).readline().split()]

# voca = {wid:w,...}
def dispTopics(pt, voca, pz):
    k = 0
    topics = []
    for l in open(pt):
        vs = [float(v) for v in l.split()]
        wvs = zip(range(len(vs)), vs)
        wvs = sorted(wvs, key=lambda d: d[1], reverse=True)
        #tmps = ' '.join(['%s' % voca[w] for w, v in wvs[:10]])
        tmps = ' '.join(['%s:%f' % (voca[w], v) for w, v in wvs[:10]])
        topics.append((pz[k], tmps))
        k += 1

    print('p(z)\t\tTop words')
    for pz, s in sorted(topics, reverse=True):
        print('%f\t%s' % (pz, s))

if __name__ == '__main__':
    if len(sys.argv) < 4:
        print('Usage: python %s <model_dir> <K> <voca_pt>' % sys.argv[0])
        print('\tmodel_dir  the output dir of BTM')
        print('\tK          the number of topics')
        print('\tvoca_pt    the vocabulary file')
        exit(1)

    model_dir = sys.argv[1]
    K = int(sys.argv[2])
    voca_pt = sys.argv[3]
    voca = read_voca(voca_pt)
    W = len(voca)
    print('K:%d, n(W):%d' % (K, W))

    pz_pt = model_dir + 'k%d.pz' % K
    pz = read_pz(pz_pt)

    zw_pt = model_dir + 'k%d.pw_z' % K
    dispTopics(zw_pt, voca, pz)
--------------------------------------------------------------------------------
/src/sampler.h:
--------------------------------------------------------------------------------
#ifndef _SAMPLER_H
#define _SAMPLER_H
#include <cstdlib>
#include <ctime>
#include <vector>

using namespace std;

namespace Sampler {

  inline void init() {
    srand(1);     // srand(time(NULL)); for a non-fixed seed
  }

  // uniform sample from Mult(1/K), result in [0, K-1]
  inline int uni_sample(int K) {
    int k = rand() % K;
    return k;
  }

  // sample from [0, 1)
  inline double uni_sample() {
    double t = rand() % 10000 / 10000.0;   // 10000.0: avoid integer division
    return t;
  }

  // sample from Mult(p); p needs not be normalized
  inline int mult_sample(vector<double> p) {
    int K = p.size();
    // build the cumulative sums in place
    for (int i = 1; i < K; i++)
      p[i] += p[i - 1];

    double u = double(rand()) / RAND_MAX;
    int k;   // sampled index
    for (k = 0; k < K; k++)
      if (p[k] >= u * p[K - 1])
        break;

    if (k == K)
      --k;

    return k;
  }

  // p is p(1)
  inline bool Bern_sample(float p) {
    double u = double(rand()) / RAND_MAX;
    return (u < p);
  }

  // return counts drawn from the multinomial distribution p by
  // systematic sampling; the counts sum to N
  inline void systematic_sample(const vector<double>& p, int N,
                                vector<int>& counts) {
    int K = p.size();
    counts.assign(K, 0);

    // N evenly spaced points in [0,1) with a common random offset
    double u0 = uni_sample() / N;

    int i = 0;        // index over the N points
    double cum = 0;   // cumulative probability
    for (int k = 0; k < K; ++k) {
      cum += p[k];
      while (i < N && u0 + double(i) / N < cum) {
        ++counts[k];
        ++i;
      }
    }
    // points left over by numerical round-off go to the last bin
    counts[K - 1] += N - i;
  }
}
#endif
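mult_sample draws from an unnormalized multinomial by accumulating the weights into a CDF and inverting a single uniform draw. A minimal Python equivalent, with a quick empirical check (illustrative only):

    import random

    def mult_sample(p):
        """Draw an index from (possibly unnormalized) weights p by
        building the cumulative sums and inverting one uniform draw,
        as Sampler::mult_sample does."""
        cum = []
        s = 0.0
        for v in p:
            s += v
            cum.append(s)
        u = random.random() * s
        for k, c in enumerate(cum):
            if c >= u:
                return k
        return len(p) - 1

    counts = [0, 0, 0]
    for _ in range(10000):
        counts[mult_sample([1.0, 2.0, 7.0])] += 1
    print(counts)   # roughly 1000 : 2000 : 7000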
--------------------------------------------------------------------------------
/src/str_util.h:
--------------------------------------------------------------------------------
#ifndef _STR_UTIL_H
#define _STR_UTIL_H

#include <iostream>
#include <string>
#include <vector>
#include <sstream>

using namespace std;

namespace str_util {
  // Trims spaces off the end and the beginning of the given string.
  inline string trim(string& str) {
    string::size_type pos = str.find_last_not_of(" \n\t");
    if (pos != string::npos) {
      str.erase(pos + 1);
      pos = str.find_first_not_of(" \n\t");
      if (pos != string::npos) {
        str.erase(0, pos);
      }
    }
    else {
      // there is nothing but whitespace in the string
      str.clear();
    }
    return str;
  }

  // split a string by a delimiter
  inline vector<string> split(string str, char delim = ' ') {
    vector<string> vec;
    istringstream iss(str);
    string tmps;

    while (getline(iss, tmps, delim))
      vec.push_back(tmps);

    return vec;
  }

  // convert an int to a string
  inline string itos(int n) {
    ostringstream ss;
    ss << n;
    return ss.str();
  }

  // convert a string to an int
  inline int stoi(string s) {
    int n;
    istringstream ss(s);
    ss >> n;
    return n;
  }

  inline double stof(string s) {
    double n;
    istringstream ss(s);
    ss >> n;
    return n;
  }

  inline string ftos(double n) {
    ostringstream ss;
    ss << n;
    return ss.str();
  }

  template<class T>
  inline string vec2str(const vector<T>& vec) {
    ostringstream ss;
    for (size_t i = 0; i < vec.size(); ++i)
      ss << vec[i] << ' ';
    return ss.str();
  }
}

#endif

--------------------------------------------------------------------------------
/src/model.h:
--------------------------------------------------------------------------------
/**
 * Biterm topic model (BTM) with Gibbs sampling
 * Author: Xiaohui Yan (xhcloud@gmail.com)
 * 2012-9-25
 */
#ifndef _MODEL_H
#define _MODEL_H

#include <string>
#include <vector>
#include "biterm.h"
#include "doc.h"
#include "pvec.h"
#include "pmat.h"

using namespace std;

class Model {
 public:
  vector<Biterm> bs;

 protected:
  int W;            // vocabulary size
  int K;            // number of topics
  int n_iter;       // maximum number of iterations of Gibbs sampling
  int save_step;

  double alpha;     // hyperparameter of p(z)
  double beta;      // hyperparameter of p(w|z)

  // sample recorders
  Pvec<double> nb_z;   // n(b|z), size K*1
  Pmat<double> nwz;    // n(w,z), size K*W

  Pvec<double> pw_b;   // the background word distribution

  // If true, topic 0 is set to a background topic that equals
  // the empirical word distribution. It can filter out common words.
  bool has_background;

 public:
  Model(int K, int W, double a, double b, int n_iter, int save_step,
        bool has_b = false):
    K(K), W(W), alpha(a), beta(b),
    n_iter(n_iter), has_background(has_b),
    save_step(save_step) {
    pw_b.resize(W);
    nwz.resize(K, W);
    nb_z.resize(K);
  }

  // run the estimation procedure
  void run(string docs_pt, string res_dir);

 private:
  // initialize member variables and biterms
  void model_init();   // load from docs
  void load_docs(string docs_pt);

  // update the topic assignment of a biterm
  void update_biterm(Biterm& bi);

  // reset the topic assignment of biterm b
  void reset_biterm_topic(Biterm& bi);

  // assign a topic to biterm b
  void assign_biterm_topic(Biterm& bi, int k);

  // compute the conditional distribution p(z|b)
  void compute_pz_b(Biterm& bi, Pvec<double>& p);

  void save_res(string res_dir);
  void save_pz(string pt);
  void save_pw_z(string pt);
};

#endif
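The estimation loop behind model.h is ordinary collapsed Gibbs sampling, only over biterms instead of word tokens. A condensed Python sketch of one sweep, assuming count arrays shaped like the class members above (a minimal sketch, not the toolkit's API):

    import random

    def gibbs_pass(biterms, z, nb_z, nwz, W, alpha, beta):
        """One sweep of collapsed Gibbs sampling over all biterms.
        biterms: list of (wi, wj) pairs; z[b]: current topic of biterm b;
        nb_z[k]: number of biterms in topic k; nwz[k][w]: word-topic counts."""
        K = len(nb_z)
        B = len(biterms)
        for b, (wi, wj) in enumerate(biterms):
            k = z[b]
            # remove the current assignment from the counts
            nb_z[k] -= 1; nwz[k][wi] -= 1; nwz[k][wj] -= 1
            # p(z=k | b, rest), as in Model::compute_pz_b
            p = []
            for t in range(K):
                pk = (nb_z[t] + alpha) / (B + K * alpha)
                p1 = (nwz[t][wi] + beta) / (2 * nb_z[t] + W * beta)
                p2 = (nwz[t][wj] + beta) / (2 * nb_z[t] + 1 + W * beta)
                p.append(pk * p1 * p2)
            # sample a new topic and add the counts back
            k = random.choices(range(K), weights=p)[0]
            z[b] = k
            nb_z[k] += 1; nwz[k][wi] += 1; nwz[k][wj] += 1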
--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <string>
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include <ctime>
#include <fstream>
#include <vector>

#include "model.h"
#include "infer.h"

using namespace std;

void usage() {
  cout << "Training Usage:" << endl
       << "btm est <K> <W> <alpha> <beta> <n_iter> <save_step> <docs_pt> <model_dir>\n"
       << "\tK  int, number of topics, like 20" << endl
       << "\tW  int, size of vocabulary" << endl
       << "\talpha  double, symmetric Dirichlet prior of P(z), like 1.0" << endl
       << "\tbeta  double, symmetric Dirichlet prior of P(w|z), like 0.01" << endl
       << "\tn_iter  int, number of iterations of Gibbs sampling" << endl
       << "\tsave_step  int, steps to save the results" << endl
       << "\tdocs_pt  string, path of training docs" << endl
       << "\tmodel_dir  string, output directory" << endl
       << "Inference Usage:" << endl
       << "btm inf <type> <K> <docs_pt> <model_dir>" << endl
       << "\ttype  string, inference type: sum_b, sum_w or mix" << endl
       << "\tK  int, number of topics, like 20" << endl
       << "\tdocs_pt  string, path of docs to be inferred" << endl
       << "\tmodel_dir  string, model directory" << endl;
}

int main(int argc, char* argv[]) {
  if (argc < 4) {
    usage();
    return 1;
  }

  //// load parameters from the command line
  int i = 1;
  if (strcmp(argv[i++], "est") == 0) {
    int K = atoi(argv[i++]);          // topic number
    int W = atoi(argv[i++]);
    double alpha = atof(argv[i++]);   // hyperparameter of p(z)
    double beta = atof(argv[i++]);    // hyperparameter of p(w|z)
    int n_iter = atoi(argv[i++]);
    int save_step = atoi(argv[i++]);
    string docs_pt(argv[i++]);
    string dir(argv[i++]);

    cout << "Run BTM, K=" << K << ", W=" << W << ", alpha=" << alpha
         << ", beta=" << beta << ", n_iter=" << n_iter
         << ", save_step=" << save_step << " ====" << endl;
    // load training data from file
    clock_t start = clock();
    Model model(K, W, alpha, beta, n_iter, save_step);
    model.run(docs_pt, dir);
    clock_t end = clock();
    printf("cost %fs\n", double(end - start) / CLOCKS_PER_SEC);
  } else if (strcmp(argv[1], "inf") == 0) {
    string type(argv[2]);
    int K = atoi(argv[3]);            // topic number
    string docs_pt(argv[4]);
    string dir(argv[5]);
    cout << "Run inference:K=" << K << ", type " << type << " ====" << endl;
    Infer inf(type, K);
    inf.run(docs_pt, dir);
  } else {
    cout << "Wrong command:" << argv[0] << " " << argv[1] << endl;
    usage();
    return 1;
  }
}
--------------------------------------------------------------------------------
/src/model.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <fstream>
#include <string>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cassert>

#include "sampler.h"
#include "str_util.h"
#include "model.h"

void Model::run(string doc_pt, string res_dir) {
  load_docs(doc_pt);

  model_init();

  cout << "Begin iteration" << endl;
  string out_dir = res_dir + "k" + str_util::itos(K) + ".";
  for (int it = 1; it < n_iter + 1; ++it) {
    cout << "\riter " << it << '/' << n_iter;
    fflush(stdout);
    for (int b = 0; b < (int)bs.size(); ++b) {
      update_biterm(bs[b]);
    }

    if (it % save_step == 0)
      save_res(out_dir);
  }

  save_res(out_dir);
}

void Model::model_init() {
  srand(time(NULL));
  // random initialization
  for (vector<Biterm>::iterator b = bs.begin(); b != bs.end(); ++b) {
    int k = Sampler::uni_sample(K);
    assign_biterm_topic(*b, k);
  }
}

// input, each line is a doc
// format: wid wid wid ...
void Model::load_docs(string dfile) {
  cout << "load docs: " << dfile << endl;
  ifstream rf( dfile.c_str() );
  if (!rf) {
    cout << "file not found:" << dfile << endl;
    exit(-1);
  }

  string line;
  while (getline(rf, line)) {
    Doc doc(line);
    doc.gen_biterms(bs);
    // accumulate the empirical word distribution
    for (int i = 0; i < doc.size(); ++i) {
      int w = doc.get_w(i);
      pw_b[w] += 1;
    }
  }

  pw_b.normalize();
}

// sampling procedure for one biterm
void Model::update_biterm(Biterm& bi) {
  reset_biterm_topic(bi);

  // compute p(z|b)
  Pvec<double> pz;
  compute_pz_b(bi, pz);

  // sample a topic for biterm b
  int k = Sampler::mult_sample(pz.to_vector());
  assign_biterm_topic(bi, k);
}

// reset the topic assignment of a biterm
void Model::reset_biterm_topic(Biterm& bi) {
  int k = bi.get_z();
  int w1 = bi.get_wi();
  int w2 = bi.get_wj();

  nb_z[k] -= 1;      // update the number of biterms in topic k
  nwz[k][w1] -= 1;   // update w1's occurrence count in topic k
  nwz[k][w2] -= 1;
  assert(nb_z[k] > -10e-7 && nwz[k][w1] > -10e-7 && nwz[k][w2] > -10e-7);
  bi.reset_z();
}

// compute p(z|w_i, w_j)
void Model::compute_pz_b(Biterm& bi, Pvec<double>& pz) {
  pz.resize(K);
  int w1 = bi.get_wi();
  int w2 = bi.get_wj();

  double pw1k, pw2k, pk;
  for (int k = 0; k < K; ++k) {
    if (has_background && k == 0) {
      // topic 0 is fixed to the background word distribution
      pw1k = pw_b[w1];
      pw2k = pw_b[w2];
    }
    else {
      pw1k = (nwz[k][w1] + beta) / (2 * nb_z[k] + W * beta);
      pw2k = (nwz[k][w2] + beta) / (2 * nb_z[k] + 1 + W * beta);
    }
    pk = (nb_z[k] + alpha) / (bs.size() + K * alpha);
    pz[k] = pk * pw1k * pw2k;
  }

  //pz.normalize();
}

// assign topic k to a biterm
void Model::assign_biterm_topic(Biterm& bi, int k) {
  bi.set_z(k);
  int w1 = bi.get_wi();
  int w2 = bi.get_wj();
  nb_z[k] += 1;
  nwz[k][w1] += 1;
  nwz[k][w2] += 1;
}

void Model::save_res(string dir) {
  string pt = dir + "pz";
  cout << "\nwrite p(z): " << pt << endl;
  save_pz(pt);

  string pt2 = dir + "pw_z";
  cout << "write p(w|z): " << pt2 << endl;
  save_pw_z(pt2);
}

// p(z) is determined by the overall proportion of biterms in each topic
void Model::save_pz(string pt) {
  Pvec<double> pz(nb_z);
  pz.normalize(alpha);
  pz.write(pt);
}

void Model::save_pw_z(string pt) {
  Pmat<double> pw_z(K, W);   // p(w|z) = phi, size K * W
  ofstream wf(pt.c_str());
  for (int k = 0; k < K; k++) {
    for (int w = 0; w < W; w++)
      pw_z[k][w] = (nwz[k][w] + beta) / (nb_z[k] * 2 + W * beta);

    wf << pw_z[k].str() << endl;
  }
}
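Written out, the conditional distribution that compute_pz_b implements for a biterm b = (w_i, w_j), with n_k the number of biterms currently assigned to topic k, n_{w|k} the word-topic counts, and |B| the total number of biterms, is:

    P(z_b = k \mid \mathbf{z}_{\neg b}, B) \propto
        \frac{n_k + \alpha}{|B| + K\alpha} \cdot
        \frac{(n_{w_i|k} + \beta)\,(n_{w_j|k} + \beta)}
             {(2n_k + W\beta)\,(2n_k + 1 + W\beta)}

The two denominators differ by one because the second word of a biterm is drawn after the first from the same topic; when has_background is set, topic 0 instead uses the fixed empirical distribution pw_b.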
--------------------------------------------------------------------------------
/src/infer.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <fstream>
#include <cassert>
#include <cmath>

#include "str_util.h"
#include "infer.h"

void Infer::run(string docs_pt, string model_dir) {
  load_para(model_dir);

  cout << "Infer p(z|d) for docs in: " << docs_pt << endl;
  ifstream rf(docs_pt.c_str());
  assert(rf);

  string pt = model_dir + "k" + str_util::itos(K) + ".pz_d";
  ofstream wf(pt.c_str());
  assert(wf);
  cout << "write p(z|d): " << pt << endl;

  string line;
  while (getline(rf, line)) {
    Doc doc(line);
    Pvec<double> pz_d(K);
    doc_infer(doc, pz_d);
    // write p(z|d) for d, one doc per line
    wf << pz_d.str() << endl;
  }
}

void Infer::load_para(string model_dir) {
  string pt = model_dir + "k" + str_util::itos(K) + ".pz";
  cout << "load p(z):" << pt << endl;
  pz.loadFile(pt);
  assert(abs(pz.sum() - 1) < 1e-4);

  string pt2 = model_dir + "k" + str_util::itos(K) + ".pw_z";
  cout << "load p(w|z):" << pt2 << endl;
  pw_z.load(pt2);
  assert(pw_z.rows() > 0 && abs(pw_z[0].sum() - 1) < 1e-4);
}

void Infer::doc_infer(const Doc& doc, Pvec<double>& pz_d) {
  if (type == "sum_b")
    doc_infer_sum_b(doc, pz_d);
  else if (type == "sum_w")
    doc_infer_sum_w(doc, pz_d);
  else if (type == "mix")
    doc_infer_mix(doc, pz_d);
  else {
    cout << "[Err] unknown infer type:" << type << endl;
    exit(1);
  }
}


// p(z|d) = \sum_b{ p(z|b)p(b|d) }
void Infer::doc_infer_sum_b(const Doc& doc, Pvec<double>& pz_d) {
  pz_d.assign(K, 0);

  if (doc.size() == 1) {
    // the doc is a single word: p(z|d) = p(z|w) \propto p(z)p(w|z)
    for (int k = 0; k < K; ++k)
      pz_d[k] = pz[k] * pw_z[k][doc.get_w(0)];
  }
  else {
    // more than one word
    vector<Biterm> bs;
    doc.gen_biterms(bs);

    int W = pw_z.cols();
    for (int b = 0; b < (int)bs.size(); ++b) {
      int w1 = bs[b].get_wi();
      int w2 = bs[b].get_wj();

      // filter out-of-vocabulary words (w1 <= w2, so checking w2 suffices)
      if (w2 >= W) continue;

      // compute p(z|b) \propto p(w1|z)p(w2|z)p(z)
      Pvec<double> pz_b(K);
      for (int k = 0; k < K; ++k) {
        assert(pw_z[k][w1] > 0 && pw_z[k][w2] > 0);
        pz_b[k] = pz[k] * pw_z[k][w1] * pw_z[k][w2];
      }
      pz_b.normalize();

      // sum over b; p(b|d) is uniform
      for (int k = 0; k < K; ++k)
        pz_d[k] += pz_b[k];
    }
  }

  pz_d.normalize();
}

// p(z|d) = \sum_w{ p(z|w)p(w|d) }
void Infer::doc_infer_sum_w(const Doc& doc, Pvec<double>& pz_d) {
  pz_d.assign(K, 0);

  int W = pw_z.cols();
  const vector<int>& ws = doc.get_ws();

  for (int i = 0; i < (int)ws.size(); ++i) {
    int w = ws[i];
    if (w >= W) continue;

    // compute p(z|w) \propto p(w|z)p(z)
    Pvec<double> pz_w(K);
    for (int k = 0; k < K; ++k)
      pz_w[k] = pz[k] * pw_z[k][w];

    pz_w.normalize();

    // sum over w; p(w|d) is uniform
    for (int k = 0; k < K; ++k)
      pz_d[k] += pz_w[k];
  }
  pz_d.normalize();
}

void Infer::doc_infer_mix(const Doc& doc, Pvec<double>& pz_d) {
  pz_d.resize(K);
  for (int k = 0; k < K; ++k)
    pz_d[k] = pz[k];

  const vector<int>& ws = doc.get_ws();
  int W = pw_z.cols();
  for (int i = 0; i < (int)ws.size(); ++i) {
    int w = ws[i];
    if (w >= W) continue;

    for (int k = 0; k < K; ++k)
      pz_d[k] *= (pw_z[k][w] * W);   // multiply by W to avoid numerical underflow
  }

  pz_d.normalize();
}

// compute p(z|d, w) \propto p(w|z)p(z|d)
void Infer::compute_pz_dw(int w, const Pvec<double>& pz_d, Pvec<double>& p) {
  p.resize(K);

  for (int k = 0; k < K; ++k)
    p[k] = pw_z[k][w] * pz_d[k];

  p.normalize();
}
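The sum_b rule used in the paper averages the per-biterm posteriors. A compact Python rendering of doc_infer_sum_b under the same window convention as doc.h (an illustrative sketch; pz and pw_z are plain lists, not the toolkit's types):

    def infer_sum_b(ws, pz, pw_z, win=15):
        """p(z|d) as the uniform mixture of per-biterm posteriors,
        with p(z|b) proportional to p(z)p(w1|z)p(w2|z)."""
        K = len(pz)
        pz_d = [0.0] * K
        for i in range(len(ws) - 1):
            for j in range(i + 1, min(i + win, len(ws))):
                w1, w2 = ws[i], ws[j]
                pz_b = [pz[k] * pw_z[k][w1] * pw_z[k][w2] for k in range(K)]
                s = sum(pz_b)
                for k in range(K):
                    pz_d[k] += pz_b[k] / s
        s = sum(pz_d)
        return [v / s for v in pz_d] if s > 0 else pz_d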
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Code of Biterm Topic Model

Biterm Topic Model (BTM) is a word co-occurrence based topic model that learns topics by modeling word-word co-occurrence patterns (i.e., biterms).
(In contrast, LDA and PLSA are word-document co-occurrence topic models, since they model word-document co-occurrences.)

A biterm consists of two words co-occurring in the same context, for example, in the same short text window. Unlike LDA, which models word occurrences, BTM models the biterm occurrences in a corpus. In the generative procedure, a biterm is generated by drawing two words independently from the same topic. In other words, the distribution of a biterm b=(wi,wj) is defined as:

P(b) = \sum_k{P(wi|z)*P(wj|z)*P(z)}.

With a Gibbs sampling algorithm, we can learn the topics by estimating P(w|z) and P(z).

More details can be found in the following paper:

> Xiaohui Yan, Jiafeng Guo, Yanyan Lan, Xueqi Cheng. A Biterm Topic Model For Short Text. WWW2013.

## Usage ##
The code has been tested on Linux. If you are on Windows, please install
cygwin (with bc, wc, make).

The code includes a runnable example; you can run it *on Linux* by:

    $ cd script
    $ sh runExample.sh

It trains BTM over the documents in *sample-data/doc\_info.txt* and outputs the topics. doc\_info.txt contains all the training documents, where each line represents one document with words separated by spaces:
> word1 word2 word3 ....

(*Note: the sample data is only used to illustrate the usage of the code. It is not the data set used in the paper.*)

You can change the paths of the data files and the parameters in *script/runExample.sh* to run over your own data.

Indeed, *runExample.sh* processes the input documents in 4 steps.

**1. Index the words in the documents**
To simplify the main code, we provide a python script to map each word to a unique ID (starting from 0) in the documents.

    $ python script/indexDocs.py <doc_pt> <dwid_pt> <voca_pt>
    doc_pt    input docs to be indexed, each line is a doc with the format "word word ..."
    dwid_pt   output docs after indexing, each line is a doc with the format "wordId wordId ..."
    voca_pt   output vocabulary file, each line is a word with the format "wordId word"

**2. Topic learning**
The next step is to train the model over the documents represented by word ids.

    $ ./src/btm est <K> <W> <alpha> <beta> <n_iter> <save_step> <docs_pt> <model_dir>
    K          int, number of topics
    W          int, size of vocabulary
    alpha      double, symmetric Dirichlet prior of P(z), like 1
    beta       double, symmetric Dirichlet prior of P(w|z), like 0.01
    n_iter     int, number of iterations of Gibbs sampling
    save_step  int, steps to save the results
    docs_pt    string, path of training docs
    model_dir  string, output directory

The results will be written into the directory "model\_dir":
- k20.pw_z: a K*W matrix for P(w|z), supposing K=20
- k20.pz: a K*1 vector for P(z), supposing K=20

**3. Infer topic proportions for documents, i.e., P(z|d)**
If you need to analyze the topic proportions of the documents, just run the following command to infer them using the estimated model.

    $ ./src/btm inf <type> <K> <docs_pt> <model_dir>
    type       string, 3 choices: sum_b, sum_w, mix. sum_b is used in our paper.
    K          int, number of topics, like 20
    docs_pt    string, path of docs to be inferred
    model_dir  string, model directory

The result will be output to "model_dir":
- k20.pz_d: an N*K matrix for P(z|d), supposing K=20

**4. Results display**
Finally, we also provide a python script to show the top words of each topic and its proportion in the collection.

    $ python script/topicDisplay.py <model_dir> <K> <voca_pt>
    model_dir  the output dir of BTM
    K          the number of topics
    voca_pt    the vocabulary file

## Related codes ##
- [Online BTM](https://github.com/xiaohuiyan/OnlineBTM)
- [Bursty BTM](https://github.com/xiaohuiyan/BurstyBTM)

## History ##
- 2015-01-12, v0.5, improve the usability of the code
- 2012-09-25, v0.1

If there is any question, feel free to contact: [Xiaohui Yan](http://xiaohuiyan.github.io "Xiaohui Yan") (xhcloud@gmail.com).
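As a quick sanity check of these outputs, both files can be loaded and verified to be proper distributions. A minimal sketch, assuming numpy is installed, K=20, and the output directory used by runExample.sh:

    import numpy as np

    K = 20
    model_dir = '../output/model/'   # as set in runExample.sh

    pz = np.loadtxt(model_dir + 'k%d.pz' % K)       # shape (K,)
    pw_z = np.loadtxt(model_dir + 'k%d.pw_z' % K)   # shape (K, W)

    assert abs(pz.sum() - 1) < 1e-4                     # P(z) sums to 1
    assert np.allclose(pw_z.sum(axis=1), 1, atol=1e-4)  # each P(w|z) row sums to 1
    print('W =', pw_z.shape[1])
    print('top word ids of topic 0:', pw_z[0].argsort()[::-1][:10])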
--------------------------------------------------------------------------------
/src/pvec.h:
--------------------------------------------------------------------------------
/*
 * Positive vector
 */
#ifndef _PVEC_H
#define _PVEC_H
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <algorithm>
#include <cmath>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <ctime>

#define EXIT_ERR( s1, s2 ) {printf("[Error] %s%s\n", s1, s2); \
    exit(EXIT_FAILURE);}

using namespace std;

template<class T>
class Pvec {
 private:
  vector<T> p;

 public:
  Pvec(){}
  Pvec(size_t n): p(n) {}
  Pvec(size_t n, T v): p(n, v) {}

  template<class T2>
  Pvec(const vector<T2>& v) {
    p.resize(v.size());   // resize before copying
    for (size_t i = 0; i < v.size(); ++i)
      p[i] = v[i];
  }

  template<class T2>
  Pvec(const Pvec<T2>& v) {
    p.resize(v.size());
    for (size_t i = 0; i < v.size(); ++i)
      p[i] = v[i];
  }

  Pvec(const string& line) {
    loadString(line);
  }

  size_t size() const {return p.size();}
  void resize(size_t n) {p.resize(n);}
  void resize(size_t n, const T& v) {p.resize(n, v);}
  void assign(size_t n, const T& v) {p.assign(n, v);}

  void rand_init() {
    srand(time(NULL));
    for (size_t i = 0; i < p.size(); ++i)
      p[i] = rand() % 100 + 1;

    normalize();
  }

  void fill(T v) {
    for (size_t i = 0; i < p.size(); ++i)
      p[i] = v;
  }

  void uniform_init() {
    for (size_t i = 0; i < p.size(); ++i)
      p[i] = double(1) / p.size();
  }

  // p[0] = v, the others are (1-v)/(p.size() - 1)
  void bias_init(double v) {
    assert(v < 1);
    p[0] = v;
    for (size_t i = 1; i < p.size(); ++i)
      p[i] = (double)((1 - v) / ((int)p.size() - 1));
  }

  void push_back(T v) {p.push_back(v);}

  void extend(const Pvec<T>& vec) {
    p.insert(p.end(), vec.p.begin(), vec.p.end());
  }

  void loadString(const string& line) {
    p.clear();
    istringstream iss(line);
    T v;
    while (iss >> v) {
      p.push_back(v);
    }
  }

  // load one value per line; make sure there are no empty lines,
  // since the number of values read determines the dimension of the vector
  void loadFile(const string& inf) {
    p.clear();
    ifstream rf(inf.c_str());
    if (!rf)
      EXIT_ERR("file not found:", inf.c_str());
    loadFileStream(rf);
  }

  void loadFileStream(ifstream& rf) {
    p.clear();
    T v;
    while (rf >> v) {
      p.push_back(v);
    }
  }

  T sum() const {
    T s = 0;
    for (size_t i = 0; i < p.size(); ++i)
      s += p[i];
    return s;
  }

  T norm() const {
    T s = 0;
    for (size_t i = 0; i < p.size(); ++i)
      s += p[i] * p[i];
    return sqrt(s);
  }

  void normalize(double smoother = 0.0) {
    T s = sum();
    assert(s >= 0);

    int K = p.size();
    // the smoother avoids numerical problems when s is small
    for (int i = 0; i < K; ++i) {
      p[i] = (p[i] + smoother) / (s + K * smoother);
    }
  }

  // normalize so that the exponents of the entries sum to 1
  void exp_normalize() {
    vector<T> tmp(p);
    for (size_t i = 0; i < p.size(); ++i) {
      double s = 0.0;
      for (size_t j = 0; j < p.size(); ++j)
        s += exp(tmp[j] - tmp[i]);

      assert(s >= 1);
      p[i] = 1 / s;
    }
  }

  void smooth(double smoother) {
    for (size_t i = 0; i < p.size(); ++i)
      if (p[i] < smoother)
        p[i] = smoother;
  }

  template<class T2>
  Pvec<T>& operator=(const vector<T2>& v) {
    p.resize(v.size());
    copy(v.begin(), v.end(), p.begin());
    return *this;
  }

  T& operator[](int i) {
    if (i >= (int)p.size())
      cout << "ERR: index=" << i << ", size=" << p.size() << endl;
    assert(i < (int)p.size());
    return p[i];
  }

  const T& operator[](int i) const {
    assert(i < (int)p.size());
    return p[i];
  }

  Pvec<T> operator+(const T& v) {
    Pvec<T> tp(p.size());
    for (int i = 0; i < (int)p.size(); ++i) tp[i] = p[i] + v;
    return tp;
  }

  Pvec<T> operator+(const Pvec<T>& v) {
    Pvec<T> tp(p.size());
    for (int i = 0; i < (int)p.size(); ++i) tp[i] = p[i] + v[i];
    return tp;
  }

  Pvec<T>& operator+=(const T& v) {
    for (int i = 0; i < (int)p.size(); ++i) p[i] += v;
    return *this;
  }

  Pvec<T>& operator+=(const Pvec<T>& v) {
    for (int i = 0; i < (int)p.size(); ++i) p[i] += v[i];
    return *this;
  }

  Pvec<T> operator-(const T& v) {
    Pvec<T> tp(p.size());
    for (int i = 0; i < (int)p.size(); ++i) tp[i] = p[i] - v;
    return tp;
  }

  Pvec<T> operator-(const Pvec<T>& v) {
    Pvec<T> tp(p.size());
    for (int i = 0; i < (int)p.size(); ++i) tp[i] = p[i] - v[i];
    return tp;
  }

  Pvec<T>& operator-=(const T& v) {
    for (int i = 0; i < (int)p.size(); ++i) p[i] -= v;
    return *this;
  }

  Pvec<T>& operator-=(const Pvec<T>& v) {
    for (int i = 0; i < (int)p.size(); ++i) p[i] -= v[i];
    return *this;
  }

  Pvec<T> operator*(const T& v) {
    Pvec<T> tp(p.size());
    for (int i = 0; i < (int)p.size(); ++i) tp[i] = p[i] * v;
    return tp;
  }

  Pvec<T>& operator*=(const T& v) {
    for (int i = 0; i < (int)p.size(); ++i) p[i] *= v;
    return *this;
  }

  Pvec<T> operator/(const T& v) {
    Pvec<T> tp(p.size());
    for (int i = 0; i < (int)p.size(); ++i) tp[i] = p[i] / v;
    return tp;
  }

  Pvec<T>& operator/=(const T& v) {
    assert(v > 0);
    for (int i = 0; i < (int)p.size(); ++i) p[i] /= v;
    return *this;
  }

  T max() const {
    T max_v = -10000000;
    for (int i = 0; i < (int)p.size(); ++i) {
      if (p[i] > max_v) {
        max_v = p[i];
      }
    }
    return max_v;
  }

  int max_idx() const {
    T max_v = -10000000;
    int idx = 0;
    for (int i = 0; i < (int)p.size(); ++i) {
      if (p[i] > max_v) {
        max_v = p[i];
        idx = i;
      }
    }
    return idx;
  }

  // erase v[start:end)
  void erase(int start, int end) {
    assert(end >= start && end <= (int)p.size());
    p.erase(p.begin() + start, p.begin() + end);
  }

  void clear() {p.clear();}

  vector<T> to_vector() {return p;}
  Pvec<double> toDouble() {return Pvec<double>(*this);}

  string str(char delim = ' ') const {
    ostringstream os;
    for (size_t i = 0; i < p.size(); ++i)
      os << p[i] << delim;

    return os.str();
  }

  // keep only the values greater than v
  string sparse_str(double v) const {
    ostringstream os;
    for (size_t i = 0; i < p.size(); ++i) {
      if (p[i] > v)
        os << i << ':' << p[i] << ' ';
    }

    return os.str();
  }

  void write(const string& pt, char delim = ' ') {
    ofstream wf(pt.c_str());
    if (!wf) {
      cout << "Path does not exist:" << pt << endl;
      exit(-1);
    }

    wf << str(delim);
  }
};

#endif
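For reference, normalize(smoother) implements additive (Dirichlet) smoothing, which is how Model::save_pz turns the raw topic counts into P(z) with smoother = alpha:

    p_i = \frac{x_i + \lambda}{\sum_j x_j + K\lambda}

With x = n(b|z) and \lambda = \alpha this yields P(z=k) = (n_k + \alpha)/(|B| + K\alpha).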
--------------------------------------------------------------------------------
/src/pmat.h:
--------------------------------------------------------------------------------
/*
 * (dense) Positive Matrix, each row is a Pvec
 *
 *  Created on: 2012-7-31
 *      Author: xhcloud@gmail.com
 */
#ifndef _PMAT_H
#define _PMAT_H

#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <algorithm>
#include <cmath>
#include <cassert>
#include <cstdio>
#include <cstdlib>

#include "pvec.h"

#define EXIT_ERR(s1, s2) {printf("[Error] %s%s\n", s1, s2); \
    exit(EXIT_FAILURE);}

using namespace std;

template<class T>
class Pmat
{
 private:
  vector<Pvec<T> > array;

 public:
  Pmat(){}

  Pmat(int M, int N): array(M) {
    for (int i = 0; i < M; i++)
      array[i].resize(N);
  }

  Pmat(int M, int N, T v): array(M) {
    for (int i = 0; i < M; i++)
      array[i].resize(N, v);
  }

  template<class T2>
  Pmat(const Pmat<T2>& ma) {
    resize(ma.rows(), ma.cols());
    for (int m = 0; m < ma.rows(); ++m)
      for (int n = 0; n < ma.cols(); ++n)
        array[m][n] = ma[m][n];
  }
  template<class T2>
  Pmat(const vector<vector<T2> >& ma) {
    array.resize(ma.size());
    for (size_t r = 0; r < ma.size(); ++r) {
      array[r].resize(ma[r].size());
      for (size_t c = 0; c < ma[r].size(); ++c) {
        array[r][c] = ma[r][c];
      }
    }
  }

  void resize(int M, int N) {
    array.resize(M);
    for (int i = 0; i < M; i++)
      array[i].resize(N);
  }

  void resize(int M, int N, T v) {
    array.resize(M);
    for (int i = 0; i < M; i++)
      array[i].resize(N, v);
  }

  void rand_init() {
    for (int i = 0; i < rows(); ++i)
      array[i].rand_init();
  }

  void fill(T v) {
    for (size_t i = 0; i < array.size(); ++i)
      array[i].fill(v);
  }

  // input format: v v v ...
  // the dimensions of the matrix are determined by the input data
  void load(const string& inf) {
    ifstream rf(inf.c_str());
    if (!rf)
      EXIT_ERR("file not found:", inf.c_str());

    loadFileStream(rf);
  }

  void loadFileStream(ifstream& rf) {
    string line;
    while (getline(rf, line)) {
      // add a new row
      Pvec<T> r(line);
      add_row(r);
    }
  }

  // load the transpose of the matrix stored in a file
  void load_tmat(const string& inf) {
    ifstream rf(inf.c_str());
    if (!rf)
      EXIT_ERR("file not found:", inf.c_str());

    try {
      string line;
      while (getline(rf, line)) {
        // add a new column
        Pvec<T> r(line);
        add_col(r);
      }
    }
    catch (...) {
      EXIT_ERR("Err file:", inf.c_str());
    }
  }

  int rows() const {return array.size();}
  int size() const {return rows();}
  int cols() const {return rows() ? array[0].size() : 0;}

  Pvec<T>& operator[](int m) {
    if (m >= (int)array.size())
      cout << "ERR Row(i):" << m << ' ' << array.size() << endl;
    return array[m];
  }

  const Pvec<T>& operator[](int m) const {return array[m];}

  template<class T2>
  Pmat<T>& operator=(const vector<vector<T2> >& ma) {
    array.resize(ma.size());
    for (size_t i = 0; i < array.size(); ++i) {
      array[i].resize(ma[i].size());
      for (size_t j = 0; j < array[i].size(); ++j)
        array[i][j] = ma[i][j];
    }
    return *this;
  }

  Pmat<T> operator+(const T& v) {
    Pmat<T> tp(rows(), cols());
    for (int i = 0; i < rows(); ++i)
      tp[i] = array[i] + v;
    return tp;
  }

  Pmat<T>& operator+=(const T& v) {
    for (int i = 0; i < rows(); ++i)
      array[i] += v;
    return *this;
  }

  Pmat<T> operator+(const Pmat<T>& v) {
    Pmat<T> tp(rows(), cols());
    for (int i = 0; i < rows(); ++i)
      tp[i] = array[i] + v[i];
    return tp;
  }

  Pmat<T>& operator+=(const Pmat<T>& v) {
    for (int i = 0; i < rows(); ++i)
      array[i] += v[i];
    return *this;
  }

  Pmat<T> operator-(const T& v) {
    Pmat<T> tp(rows(), cols());
    for (int i = 0; i < rows(); ++i)
      tp[i] = array[i] - v;
    return tp;
  }

  Pmat<T>& operator-=(const T& v) {
    for (int i = 0; i < rows(); ++i)
      array[i] -= v;
    return *this;
  }

  Pmat<T> operator-(const Pmat<T>& v) {
    Pmat<T> tp(rows(), cols());
    for (int i = 0; i < rows(); ++i)
      tp[i] = array[i] - v[i];
    return tp;
  }

  Pmat<T>& operator-=(const Pmat<T>& v) {
    for (int i = 0; i < rows(); ++i)
      array[i] -= v[i];
    return *this;
  }

  Pmat<T>
  operator*(const T& v) {
    Pmat<T> tp(rows(), cols());
    for (int i = 0; i < rows(); ++i)
      tp[i] = array[i] * v;
    return tp;
  }

  Pmat<T>& operator*=(const T& v) {
    for (int i = 0; i < rows(); ++i)
      array[i] *= v;
    return *this;
  }

  Pmat<T> operator/(const T& v) {
    Pmat<T> tp(rows(), cols());
    for (int i = 0; i < rows(); ++i)
      tp[i] = array[i] / v;
    return tp;
  }

  Pmat<T>& operator/=(const T& v) {
    for (int i = 0; i < rows(); ++i)
      array[i] /= v;
    return *this;
  }

  void add_row(const Pvec<T>& v) {array.push_back(v);}
  void push_back(const Pvec<T>& v) {array.push_back(v);}

  void add_col(const Pvec<T>& v) {
    if (array.size() == 0)
      array.resize(v.size());
    else
      assert(array.size() == v.size());

    for (size_t i = 0; i < v.size(); ++i)
      array[i].push_back(v[i]);
  }

  void erase_row(int start, int end) {
    assert(end >= start && end <= (int)array.size());
    array.erase(array.begin() + start, array.begin() + end);
  }

  void erase_col(int start, int end) {
    for (size_t r = 0; r < array.size(); ++r) {
      assert(end >= start && end <= (int)array[r].size());
      array[r].erase(start, end);
    }
  }

  void clear() {array.clear();}

  T sum() {
    T s = 0;
    for (int i = 0; i < rows(); ++i)
      s += array[i].sum();
    return s;
  }

  Pvec<T> rowSum() {
    Pvec<T> s(array.size());
    for (int i = 0; i < rows(); ++i)
      s[i] = array[i].sum();
    return s;
  }

  T norm() {
    T s = 0;
    for (int i = 0; i < rows(); ++i)
      for (int j = 0; j < cols(); ++j)
        s += array[i][j] * array[i][j];
    return sqrt(s);
  }

  // normalize so that all entries sum to 1
  void normalize() {
    double eps = 1e-30;
    double smoother = eps * rows() * cols();

    T s = this->sum();
    for (int i = 0; i < rows(); ++i)
      for (int j = 0; j < cols(); ++j)
        array[i][j] = (array[i][j] + eps) / (s + smoother);
  }

  // normalize each row to sum to 1
  void normalize_r(double smoother = 0.0) {
    for (int i = 0; i < rows(); ++i)
      array[i].normalize(smoother);
  }

  // transpose
  Pmat<T> transpose() const {
    int N = rows();
    int M = cols();
    Pmat<T> tmat(M, N);

    for (int i = 0; i < M; ++i)
      for (int j = 0; j < N; ++j)
        tmat[i][j] = array[j][i];

    return tmat;
  }

  Pmat<double> toDouble() {
    return Pmat<double>(*this);
  }

  string str() {
    ostringstream os;
    for (int i = 0; i < rows(); ++i)
      os << array[i].str() << endl;

    return os.str();
  }
};

#endif
--------------------------------------------------------------------------------