├── Debug
│   ├── objects.mk
│   ├── sources.mk
│   ├── src
│   │   └── subdir.mk
│   └── makefile
├── training.properties
├── test.properties
├── src
│   ├── polya_fit_simple.h
│   ├── strtokenizer.h
│   ├── constants.h
│   ├── math_func.h
│   ├── strtokenizer.cpp
│   ├── map_type.h
│   ├── dataset.h
│   ├── utils.h
│   ├── document.h
│   ├── polya_fit_simple.cpp
│   ├── main.cpp
│   ├── model.h
│   ├── inference.h
│   ├── cokus.h
│   ├── utils.cpp
│   ├── dataset.cpp
│   ├── model.cpp
│   ├── math_func.cpp
│   └── inference.cpp
└── readme.txt
/Debug/objects.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | USER_OBJS := 6 | 7 | LIBS := 8 | 9 | -------------------------------------------------------------------------------- /training.properties: -------------------------------------------------------------------------------- 1 | nsentiLabs=3 2 | ntopics=1 3 | niters=800 4 | savestep=100 5 | updateParaStep=50 6 | twords=20 7 | data_dir=/Users/chenghualin/Documents/workspace/JST-release/data/ 8 | datasetFile=MR.dat 9 | result_dir=/Volumes/CHENGHUA-2T/JST-release/result/train/t1 10 | sentiFile=/Users/chenghualin/Documents/workspace/JST-release/data/mpqa.constraint 11 | beta=0.01 12 | -------------------------------------------------------------------------------- /test.properties: -------------------------------------------------------------------------------- 1 | niters=60 2 | savestep=20 3 | twords=30 4 | data_dir=/Users/chenghualin/Documents/workspace/JST-release/data/ 5 | datasetFile=test.dat 6 | result_dir=/Volumes/CHENGHUA-2T/JST-release/result/test/t1 7 | sentiFile=/Users/chenghualin/Documents/workspace/JST-release/data/mpqa.constraint 8 | beta=0.01 9 | model_dir=/Volumes/CHENGHUA-2T/JST-release/result/train/t1 10 | model=00100 11 | -------------------------------------------------------------------------------- /Debug/sources.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | O_SRCS := 6 | CPP_SRCS := 7 | C_UPPER_SRCS := 8 | C_SRCS := 9 | S_UPPER_SRCS := 10 | OBJ_SRCS := 11 | ASM_SRCS := 12 | CXX_SRCS := 13 | C++_SRCS := 14 | CC_SRCS := 15 | C++_DEPS := 16 | OBJS := 17 | C_DEPS := 18 | CC_DEPS := 19 | CPP_DEPS := 20 | EXECUTABLES := 21 | CXX_DEPS := 22 | C_UPPER_DEPS := 23 | 24 | # Every subdirectory with source files must be described here 25 | SUBDIRS := \ 26 | src \ 27 | 28 | -------------------------------------------------------------------------------- /Debug/src/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit!
3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CPP_SRCS += \ 7 | ../src/dataset.cpp \ 8 | ../src/inference.cpp \ 9 | ../src/main.cpp \ 10 | ../src/math_func.cpp \ 11 | ../src/model.cpp \ 12 | ../src/polya_fit_simple.cpp \ 13 | ../src/strtokenizer.cpp \ 14 | ../src/utils.cpp 15 | 16 | OBJS += \ 17 | ./src/dataset.o \ 18 | ./src/inference.o \ 19 | ./src/main.o \ 20 | ./src/math_func.o \ 21 | ./src/model.o \ 22 | ./src/polya_fit_simple.o \ 23 | ./src/strtokenizer.o \ 24 | ./src/utils.o 25 | 26 | CPP_DEPS += \ 27 | ./src/dataset.d \ 28 | ./src/inference.d \ 29 | ./src/main.d \ 30 | ./src/math_func.d \ 31 | ./src/model.d \ 32 | ./src/polya_fit_simple.d \ 33 | ./src/strtokenizer.d \ 34 | ./src/utils.d 35 | 36 | 37 | # Each subdirectory must supply rules for building sources it contributes 38 | src/%.o: ../src/%.cpp 39 | @echo 'Building file: $<' 40 | @echo 'Invoking: Cross G++ Compiler' 41 | g++ -O0 -g3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o "$@" "$<" 42 | @echo 'Finished building: $<' 43 | @echo ' ' 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/polya_fit_simple.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | #ifndef _POLYA_FIT_SIMPLE_H 30 | #define _POLYA_FIT_SIMPLE_H 31 | 32 | int polya_fit_simple(int ** data, double * alpha, int _K, int _nSample); 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /src/strtokenizer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2007 by 3 | * 4 | * Xuan-Hieu Phan 5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com 6 | * Graduate School of Information Sciences 7 | * Tohoku University 8 | * 9 | * GibbsLDA++ is a free software; you can redistribute it and/or modify 10 | * it under the terms of the GNU General Public License as published 11 | * by the Free Software Foundation; either version 2 of the License, 12 | * or (at your option) any later version.
13 | * 14 | * GibbsLDA++ is distributed in the hope that it will be useful, but 15 | * WITHOUT ANY WARRANTY; without even the implied warranty of 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | * GNU General Public License for more details. 18 | * 19 | * You should have received a copy of the GNU General Public License 20 | * along with GibbsLDA++; if not, write to the Free Software Foundation, 21 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 22 | */ 23 | 24 | #ifndef _STRTOKENIZER_H 25 | #define _STRTOKENIZER_H 26 | 27 | #include <string> 28 | #include <vector> 29 | 30 | using namespace std; 31 | 32 | class strtokenizer { 33 | protected: 34 | vector<string> tokens; 35 | int idx; 36 | 37 | public: 38 | strtokenizer(string str, string seperators = " "); 39 | 40 | void parse(string str, string seperators); 41 | 42 | int count_tokens(); 43 | string next_token(); 44 | void start_scan(); 45 | 46 | string token(int i); 47 | }; 48 | 49 | #endif 50 | 51 | -------------------------------------------------------------------------------- /Debug/makefile: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | -include ../makefile.init 6 | 7 | RM := rm -rf 8 | 9 | # All of the sources participating in the build are defined here 10 | -include sources.mk 11 | -include src/subdir.mk 12 | -include subdir.mk 13 | -include objects.mk 14 | 15 | ifneq ($(MAKECMDGOALS),clean) 16 | ifneq ($(strip $(C++_DEPS)),) 17 | -include $(C++_DEPS) 18 | endif 19 | ifneq ($(strip $(C_DEPS)),) 20 | -include $(C_DEPS) 21 | endif 22 | ifneq ($(strip $(CC_DEPS)),) 23 | -include $(CC_DEPS) 24 | endif 25 | ifneq ($(strip $(CPP_DEPS)),) 26 | -include $(CPP_DEPS) 27 | endif 28 | ifneq ($(strip $(CXX_DEPS)),) 29 | -include $(CXX_DEPS) 30 | endif 31 | ifneq ($(strip $(C_UPPER_DEPS)),) 32 | -include $(C_UPPER_DEPS) 33 | endif 34 | endif 35 | 36 | -include ../makefile.defs 37 | 38 | # Add inputs and outputs from these tool invocations to the build variables 39 | 40 | # All Target 41 | all: jst 42 | 43 | # Tool invocations 44 | jst: $(OBJS) $(USER_OBJS) 45 | @echo 'Building target: $@' 46 | @echo 'Invoking: Cross G++ Linker' 47 | g++ -o "jst" $(OBJS) $(USER_OBJS) $(LIBS) 48 | @echo 'Finished building target: $@' 49 | @echo ' ' 50 | 51 | # Other Targets 52 | clean: 53 | -$(RM) $(C++_DEPS)$(OBJS)$(C_DEPS)$(CC_DEPS)$(CPP_DEPS)$(EXECUTABLES)$(CXX_DEPS)$(C_UPPER_DEPS) jst 54 | -@echo ' ' 55 | 56 | .PHONY: all clean dependents 57 | .SECONDARY: 58 | 59 | -include ../makefile.targets 60 | -------------------------------------------------------------------------------- /src/constants.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation.
11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | 30 | #ifndef _CONSTANTS_H 31 | #define _CONSTANTS_H 32 | 33 | #define BUFF_SIZE_LONG 1000000 34 | #define BUFF_SIZE_SHORT 512 35 | 36 | #define MODEL_STATUS_UNKNOWN 0 37 | #define MODEL_STATUS_EST 1 38 | #define MODEL_STATUS_ESTC 2 39 | #define MODEL_STATUS_INF 3 40 | 41 | #define MODE_NONE 0 42 | #define MODE_SLIDING 1 43 | #define MODE_SKIP 2 44 | #define MODE_MULTISCALE 3 45 | 46 | #define MAX_ITERATION 100000 47 | 48 | #endif 49 | 50 | -------------------------------------------------------------------------------- /src/math_func.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 
21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | 30 | #ifndef _MATH_FUNC_H 31 | #define _MATH_FUNC_H 32 | 33 | 34 | //************************* asa032.h ************************************// 35 | double alngam ( double xvalue, int *ifault ); 36 | double gamain ( double x, double p, int *ifault ); 37 | void gamma_inc_values ( int *n_data, double *a, double *x, double *fx ); 38 | double r8_abs ( double x ); 39 | void timestamp ( void ); 40 | 41 | 42 | //************************* asa103.cpp ************************************// 43 | double digama ( double x, int *ifault ); 44 | void psi_values ( int *n_data, double *x, double *fx ); 45 | //void timestamp ( void ); 46 | 47 | 48 | //************************* asa121.cpp ************************************// 49 | //void timestamp ( void ); 50 | double trigam ( double x, int *ifault ); 51 | void trigamma_values ( int *n_data, double *x, double *fx ); 52 | 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /src/strtokenizer.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2007 by 3 | * 4 | * Xuan-Hieu Phan 5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com 6 | * Graduate School of Information Sciences 7 | * Tohoku University 8 | * 9 | * GibbsLDA++ is a free software; you can redistribute it and/or modify 10 | * it under the terms of the GNU General Public License as published 11 | * by the Free Software Foundation; either version 2 of the License, 12 | * or (at your option) any later version. 13 | * 14 | * GibbsLDA++ is distributed in the hope that it will be useful, but 15 | * WITHOUT ANY WARRANTY; without even the implied warranty of 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | * GNU General Public License for more details. 18 | * 19 | * You should have received a copy of the GNU General Public License 20 | * along with GibbsLDA++; if not, write to the Free Software Foundation, 21 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
22 | */ 23 | 24 | #include <string> 25 | #include <vector> 26 | #include "strtokenizer.h" 27 | 28 | using namespace std; 29 | 30 | strtokenizer::strtokenizer(string str, string seperators) { 31 | parse(str, seperators); 32 | } 33 | 34 | void strtokenizer::parse(string str, string seperators) { 35 | int n = str.length(); 36 | int start, stop; 37 | 38 | start = str.find_first_not_of(seperators); 39 | while (start >= 0 && start < n) { 40 | stop = str.find_first_of(seperators, start); 41 | if (stop < 0 || stop > n) { 42 | stop = n; 43 | } 44 | 45 | tokens.push_back(str.substr(start, stop - start)); 46 | start = str.find_first_not_of(seperators, stop + 1); 47 | } 48 | 49 | start_scan(); 50 | } 51 | 52 | int strtokenizer::count_tokens() { 53 | return tokens.size(); 54 | } 55 | 56 | void strtokenizer::start_scan() { 57 | idx = 0; 58 | } 59 | 60 | string strtokenizer::next_token() { 61 | if (idx >= 0 && idx < (int)tokens.size()) 62 | return tokens[idx++]; 63 | else 64 | return ""; 65 | } 66 | 67 | string strtokenizer::token(int i) { 68 | if (i >= 0 && i < (int)tokens.size()) 69 | return tokens[i]; 70 | else 71 | return ""; 72 | } 73 | 74 | -------------------------------------------------------------------------------- /src/map_type.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details.
21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | 30 | #ifndef _MAP_TYPE_H 31 | #define _MAP_TYPE_H 32 | #include <map> 33 | #include <string> 34 | using namespace std; 35 | 36 | 37 | struct Word_atr { 38 | int id; // vocabulary index 39 | int polarity; // sentiment label 40 | }; 41 | 42 | struct Word_Prior_Attr { 43 | int id; // prior sentiment label 44 | vector<double> labDist; // label distribution 45 | }; 46 | 47 | // map of words/terms [string => int] 48 | typedef map<string, int> mapword2id; 49 | 50 | // map of words/terms [int => string] 51 | typedef map<int, string> mapid2word; 52 | 53 | // map of words/attributes_of_words [string => word_attr] 54 | typedef map<string, Word_atr> mapword2atr; 55 | 56 | // map of word / word prior info [string => sentiment lab ID, sentiment label distribution] 57 | typedef map<string, Word_Prior_Attr> mapword2prior; 58 | 59 | // map of doc / doc label distribution [string => doc label distribution] 60 | typedef map<string, vector<double> > mapname2labs; 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /src/dataset.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details.
21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | 30 | #ifndef _DATASET_H 31 | #define _DATASET_H 32 | 33 | #include "constants.h" 34 | #include "document.h" 35 | #include "map_type.h" 36 | #include <fstream> 37 | #include <iostream> 38 | #include <map> 39 | #include <string> 40 | #include <vector> 41 | using namespace std; 42 | 43 | 44 | class dataset { 45 | 46 | public: 47 | mapword2atr word2atr; 48 | mapid2word id2word; 49 | mapword2prior sentiLex; // word => prior sentiment information 50 | 51 | document ** pdocs; // store training data vocab ID 52 | document ** _pdocs; // only use for inference, i.e., for storing the new/test vocab ID 53 | ifstream fin; 54 | 55 | string data_dir; 56 | string result_dir; 57 | string wordmapfile; 58 | 59 | int numDocs; 60 | int aveDocLength; // average document length 61 | int vocabSize; 62 | int corpusSize; 63 | 64 | vector<string> docs; // for buffering dataset 65 | vector<string> newWords; 66 | 67 | // functions 68 | dataset(); 69 | dataset(string result_dir); 70 | ~dataset(void); 71 | 72 | int read_dataStream(ifstream& fin); 73 | int read_newData(string filename); 74 | int read_senti_lexicon(string sentiLexiconFileDir); 75 | int analyzeCorpus(vector<string>& docs); 76 | 77 | static int write_wordmap(string wordmapfile, mapword2atr& pword2atr); 78 | static int read_wordmap(string wordmapfile, mapid2word& pid2word); 79 | static int read_wordmap(string wordmapfile, mapword2id& pword2id); 80 | 81 | int init_parameter(); 82 | void deallocate(); 83 | void add_doc(document * doc, int idx); 84 | void _add_doc(document * doc, int idx); 85 | 86 | }; 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /src/utils.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details.
21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | #ifndef _UTILS_H 30 | #define _UTILS_H 31 | 32 | #include "dataset.h" 33 | #include <string> 34 | #include <vector> 35 | using namespace std; 36 | 37 | // for sorting word probabilities 38 | struct sort_pred { 39 | bool operator()(const std::pair<int, double> &left, const std::pair<int, double> &right) { 40 | return left.second > right.second; 41 | } 42 | }; 43 | 44 | class model; 45 | class Inference; 46 | 47 | class utils { 48 | private: 49 | int model_status; 50 | string model_dir; 51 | string data_dir; 52 | string result_dir; 53 | string model_name; 54 | string wordmapfile; 55 | string sentiLexFile; 56 | string datasetFile; 57 | string configfile; 58 | 59 | int numSentiLabs; 60 | int numTopics; 61 | int niters; 62 | int savestep; 63 | int twords; 64 | int updateParaStep; 65 | double alpha; 66 | double beta; 67 | double gamma; 68 | 69 | 70 | public: 71 | utils(); 72 | 73 | // parse command line arguments 74 | int parse_args(int argc, char ** argv, int& model_status); 75 | int parse_args_est(int argc, char ** argv, model * pmodel); 76 | int parse_args_inf(int argc, char ** argv, Inference * pmodel_inf); 77 | 78 | // read configuration file 79 | int read_config_file(string configfile); 80 | 81 | // generate the model name for the current iteration 82 | string generate_model_name(int iter); 83 | 84 | // make directory 85 | int make_dir(string strPath); 86 | 87 | // sort 88 | void sort(vector<double> & probs, vector<int> & words); 89 | }; 90 | 91 | #endif 92 | 93 | -------------------------------------------------------------------------------- /readme.txt: -------------------------------------------------------------------------------- 1 | ***************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | ***************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk, part of code 8 | is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | 28 | ------------------------------------------------------------------------ 29 | 30 | This is a C++ implementation of the joint sentiment-topic (JST) model for 31 | sentiment classification and extracting sentiment-bearing topics from text corpora. 32 | 33 | ------------------------------------------------------------------------ 34 | 35 | 36 | TABLE OF CONTENTS 37 | 38 | 39 | A. COMPILING 40 | 41 | B. ESTIMATION 42 | 43 | C. INFERENCE 44 | 45 | D. Data format 46 | 47 | E.
References 48 | 49 | 50 | ------------------------------------------------------------------------ 51 | 52 | A. COMPILING 53 | 54 | Type "make" in a shell. 55 | 56 | 57 | ------------------------------------------------------------------------ 58 | 59 | B. ESTIMATION 60 | 61 | Estimate the model by executing: 62 | 63 | jst -est -config YOUR-PATH/training.properties 64 | 65 | Outputs of jst estimation include the following files: 66 | .others // contains model parameter settings 67 | .pi // contains the per-document sentiment distributions 68 | .phi // contains the sentiment specific topic-word distributions 69 | .theta // contains the per-document sentiment specific topic proportions 70 | .tassign // contains the sentiment label and topic assignments for words in training data 71 | ------------------------------------------------------------------------ 72 | 73 | C. INFERENCE 74 | 75 | To perform inference on a different set of data (in the same format as 76 | for estimation), execute: 77 | 78 | jst -inf -config YOUR-PATH/test.properties 79 | 80 | Outputs of jst inference include the following files: 81 | .newothers 82 | .newpi 83 | .newphi 84 | .newtheta 85 | .newtassign 86 | 87 | ------------------------------------------------------------------------ 88 | 89 | D. Data format 90 | 91 | (1) The input data format for estimation/inference is as follows, where each line is one document, preceded by the document ID. 92 | 93 | [Doc_1 name] [token_1] [token_2] ... [token_N] 94 | : 95 | : 96 | [Doc_M name] [token_1] [token_2] ... [token_N] 97 | 98 | (2) Sentiment lexicon (mpqa.constraint) 99 | 100 | [word] [neu prior prob.] [pos prior prob.] [neg prior prob.] 101 | 102 | 103 | ------------------------------------------------------------------------ 104 | 105 | E. References 106 | 107 | [1] Lin, C., He, Y., Everson, R. and Rueger, S. Weakly-supervised Joint Sentiment-Topic Detection from Text, IEEE Transactions on Knowledge and Data Engineering (TKDE), 2011. 108 | 109 | [2] Lin, C. and He, Y. Joint Sentiment/Topic Model for Sentiment Analysis, In Proceedings of the 18th ACM Conference on Information and Knowledge Management (CIKM), Hong Kong, China, 2009. 110 | 111 | 112 | -------------------------------------------------------------------------------- /src/document.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details.
21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | 30 | #ifndef _DOCUMENT_H 31 | #define _DOCUMENT_H 32 | 33 | #include <string> 34 | #include <vector> 35 | using namespace std; 36 | 37 | 38 | 39 | class document { 40 | 41 | public: 42 | int * words; 43 | int * priorSentiLabels; 44 | string docID; 45 | string rawstr; 46 | int length; 47 | 48 | document() { 49 | words = NULL; 50 | priorSentiLabels = NULL; 51 | docID = ""; 52 | rawstr = ""; 53 | length = 0; 54 | } 55 | 56 | // Constructor. Retrieve the length of the document and allocate memory for storing the documents 57 | document(int length) { 58 | this->length = length; 59 | docID = ""; 60 | rawstr = ""; 61 | words = new int[length]; // words stores the word token ID, which is integer 62 | priorSentiLabels = new int[length]; 63 | } 64 | 65 | // Constructor. Retrieve the length of the document and store the element of words into the array 66 | document(int length, int * words) { 67 | this->length = length; 68 | docID = ""; 69 | rawstr = ""; 70 | this->words = new int[length]; 71 | for (int i = 0; i < length; i++) { 72 | this->words[i] = words[i]; 73 | } 74 | priorSentiLabels = new int[length]; 75 | } 76 | 77 | document(int length, int * words, string rawstr) { 78 | this->length = length; 79 | docID = ""; 80 | this->rawstr = rawstr; 81 | this->words = new int[length]; 82 | for (int i = 0; i < length; i++) { 83 | this->words[i] = words[i]; 84 | } 85 | priorSentiLabels = new int[length]; 86 | } 87 | 88 | 89 | document(vector<int> & doc) { 90 | this->length = doc.size(); 91 | docID = ""; 92 | rawstr = ""; 93 | this->words = new int[length]; 94 | for (int i = 0; i < length; i++) { 95 | this->words[i] = doc[i]; 96 | } 97 | priorSentiLabels = new int[length]; 98 | } 99 | 100 | 101 | document(vector<int> & doc, string rawstr) { 102 | this->length = doc.size(); 103 | docID = ""; 104 | this->rawstr = rawstr; 105 | this->words = new int[length]; 106 | for (int i = 0; i < length; i++) { 107 | this->words[i] = doc[i]; 108 | } 109 | priorSentiLabels = new int[length]; 110 | } 111 | 112 | document(vector<int> & doc, vector<int> &priorSentiLab, string rawstr) { 113 | this->length = doc.size(); 114 | docID = ""; 115 | this->rawstr = rawstr; 116 | this->words = new int[length]; 117 | this->priorSentiLabels = new int[length]; 118 | for (int i = 0; i < length; i++) { 119 | this->words[i] = doc[i]; 120 | this->priorSentiLabels[i] = priorSentiLab[i]; 121 | } 122 | } 123 | 124 | ~document() { 125 | if (words != NULL){ 126 | delete [] words; 127 | words = NULL; 128 | } 129 | 130 | if (priorSentiLabels != NULL){ 131 | delete [] priorSentiLabels; 132 | priorSentiLabels = NULL; 133 | } 134 | } 135 | }; 136 | 137 | #endif 138 | -------------------------------------------------------------------------------- /src/polya_fit_simple.cpp: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation.
11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | #include "polya_fit_simple.h" 30 | #include "math_func.h" 31 | #include <stdio.h> 32 | #include <stdlib.h> 33 | #include <math.h> 34 | #include <iostream> 35 | 36 | using namespace std; 37 | 38 | 39 | int polya_fit_simple(int ** data, double * alpha, int _K, int _nSample) { 40 | int K = _K; // hyperparameter dimension 41 | int nSample = _nSample; // total number of samples, i.e. documents 42 | int polya_iter = 100000; // maximum number of fixed point iterations 43 | int ifault1, ifault2; 44 | 45 | double sum_alpha_old; 46 | double * old_alpha = NULL; 47 | double sum_g = 0; // sum_g = sum_digama(data[i][k] + old_alpha[k]), 48 | double sum_h = 0; // sum_h = sum_digama(data[i] + sum_alpha_old), where data[i] = sum_data[i][k] for all k, 49 | double * data_row_sum = NULL; // the sum of the counts of each data sample P = {P_1, P_2,...,P_k} 50 | bool sat_state = false; 51 | int i, k, j; 52 | 53 | old_alpha = new double[K]; 54 | for (k = 0; k < K; k++) { 55 | old_alpha[k] = 0; 56 | } 57 | 58 | data_row_sum = new double[nSample]; 59 | for (i = 0; i < nSample; i++) { 60 | data_row_sum[i] = 0; 61 | } 62 | 63 | // data_row_sum 64 | for (i = 0; i < nSample; i++) { 65 | for (k = 0; k < K; k++) { 66 | data_row_sum[i] += *(*(data+k)+i) ; 67 | } 68 | } 69 | 70 | // simple fixed point iteration 71 | printf("Optimising parameters...\n"); 72 | for (i = 0; i < polya_iter; i++) { // reset sum_alpha_old 73 | sum_alpha_old = 0; 74 | // update old_alpha after each iteration 75 | for (j = 0; j < K; j++) { 76 | old_alpha[j] = *(alpha+j); 77 | } 78 | 79 | // calculate sum_alpha_old 80 | for (j = 0; j < K; j++) { 81 | sum_alpha_old += old_alpha[j]; 82 | } 83 | 84 | for (k = 0; k < K; k++) { 85 | sum_g = 0; 86 | sum_h = 0; 87 | 88 | // calculate sum_g[k] 89 | for (j = 0; j < nSample; j++) { 90 | sum_g += digama( *(*(data+k)+j) + old_alpha[k], &ifault1); 91 | } 92 | 93 | // calculate sum_h 94 | for (j = 0; j < nSample; j++) { 95 | sum_h += digama(data_row_sum[j] + sum_alpha_old, &ifault1); 96 | } 97 | 98 | // update alpha (new) 99 | *(alpha+k) = old_alpha[k] * (sum_g - nSample * digama(old_alpha[k], &ifault1)) / (sum_h - nSample * digama(sum_alpha_old, &ifault2)); 100 | } 101 | 102 | // terminate iteration ONLY if each dimension of {alpha_1, alpha_2, ...
alpha_k} satisfies the termination criteria, 103 | for (j = 0; j < K; j++) { 104 | if (fabs( *(alpha+j) - old_alpha[j]) > 0.000001) break; 105 | if ( j == K-1) { 106 | sat_state = true; 107 | } 108 | } 109 | 110 | // check whether to terminate the whole iteration 111 | if(sat_state) { 112 | cout << "Terminated at iteration: " << i << endl; 113 | break; 114 | } 115 | } 116 | 117 | delete [] old_alpha; 118 | delete [] data_row_sum; 119 | 120 | return 0; 121 | } 122 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | 30 | #include "model.h" 31 | #include "inference.h" 32 | #include "utils.h" 33 | #include "constants.h" 34 | 35 | 36 | #include <stdio.h> 37 | #include <stdlib.h> 38 | using namespace std; 39 | 40 | void show_help(); 41 | 42 | 43 | int main(int argc, char ** argv) { 44 | 45 | int model_status = MODEL_STATUS_UNKNOWN; 46 | utils *putils = new utils(); 47 | model_status = putils->parse_args(argc, argv, model_status); 48 | 49 | if (putils) 50 | delete putils; 51 | 52 | if (model_status == MODEL_STATUS_UNKNOWN) { 53 | printf("Please specify the task you would like to perform, training (-est) or inference (-inf)!\n"); 54 | show_help(); 55 | return 1; 56 | } 57 | else if (model_status == MODEL_STATUS_EST){ 58 | model jst; 59 | 60 | if (jst.init(argc, argv)) { 61 | show_help(); 62 | return 1; 63 | } 64 | 65 | if(jst.excute_model()) return 1; 66 | } 67 | else if (model_status == MODEL_STATUS_INF) { 68 | Inference jst; 69 | 70 | if (jst.init(argc, argv)) { 71 | show_help(); 72 | return 1; 73 | } 74 | } 75 | 76 | return 0; 77 | } 78 | 79 | 80 | void show_help() { 81 | 82 | printf("Command line usage:\n"); 83 | printf("jst -est|-inf [options]\n"); 84 | printf("-est \t Estimate the JST model from scratch.\n"); 85 | printf("-inf \t Perform inference on unseen (new) data using a trained model.\n"); 86 | 87 | printf("\n-----------------------------------------------------------\n"); 88 | printf("Command line options:\n\n"); 89 | 90 | printf("-nsentiLabs \t The number of sentiment labels. The default is 3.\n"); 91 | printf("-ntopics \t The number of topics. The default is 50.\n"); 92 | printf("-niters \t The number of Gibbs sampling iterations. The default is 1000.\n"); 93 | printf("-savestep \t The step (counted by the number of Gibbs sampling iterations) at which the model is saved to hard disk. The default is 200.\n"); 94 | printf("-updateParaStep The step (counted by the number of Gibbs sampling iterations) at which the hyperparameters are updated. The default is 40.\n"); 95 | printf("-twords \t The number of most likely words to be printed for each topic. The default is 20.\n"); 96 | printf("-data_dir \t The directory where the input training data is stored.\n"); 97 | printf("-result_dir \t The directory where the output models and parameters will be stored.\n"); 98 | printf("-datasetFile \t The input training data file.\n"); 99 | printf("-sentiFile \t The sentiment lexicon file.\n"); 100 | printf("-vocab \t\t The vocabulary file.\n"); 101 | printf("-alpha \t\t The hyperparameter of the per-document sentiment specific topic proportion. The default is avgDocLength*0.05/(numSentiLabs*numTopics).\n"); 102 | printf("-beta \t\t The hyperparameter of the per-corpus sentiment specific topic-word distribution. The default is 0.01.\n"); 103 | printf("-gamma \t\t The hyperparameter of the per-document sentiment proportion. The default is avgDocLength*0.05/numSentiLabs.\n"); 104 | printf("-model_dir \t\t The directory of the previously trained model. (for inference only).\n"); 105 | printf("-model \t\t The name of the previously trained model.
(for inference only).\n"); 106 | } 107 | 108 | -------------------------------------------------------------------------------- /src/model.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | 30 | #ifndef _MODEL_H 31 | #define _MODEL_H 32 | 33 | #include <stdio.h> 34 | #include <stdlib.h> 35 | #include <string> 36 | #include <vector> 37 | #include <map> 38 | #include <fstream> 39 | #include <iostream> 40 | #include <cmath> 41 | #include <ctime> 42 | 43 | #include "dataset.h" 44 | #include "document.h" 45 | #include "map_type.h" 46 | #include "utils.h" 47 | #include "math_func.h" 48 | #include "polya_fit_simple.h" 49 | #include "strtokenizer.h" 50 | 51 | using namespace std; 52 | 53 | 54 | class model { 55 | 56 | public: 57 | model(void); 58 | ~model(void); 59 | 60 | mapword2atr word2atr; 61 | mapid2word id2word; 62 | mapword2prior sentiLex; // word => prior sentiment information 63 | 64 | string data_dir; 65 | string datasetFile; 66 | string result_dir; 67 | string sentiLexFile; 68 | string wordmapfile; 69 | string tassign_suffix; 70 | string pi_suffix; 71 | string theta_suffix; 72 | string phi_suffix; 73 | string others_suffix; 74 | string twords_suffix; 75 | 76 | int numTopics; 77 | int numSentiLabs; 78 | int niters; 79 | int liter; 80 | int twords; 81 | int savestep; 82 | int updateParaStep; 83 | double _alpha; 84 | double _beta; 85 | double _gamma; 86 | 87 | // init functions 88 | int init(int argc, char ** argv); 89 | int excute_model(); 90 | 91 | 92 | private: 93 | 94 | int numDocs; 95 | int vocabSize; 96 | int corpusSize; 97 | int aveDocLength; 98 | 99 | ifstream fin; 100 | dataset * pdataset; 101 | utils * putils; 102 | 103 | // model counts 104 | vector<int> nd; 105 | vector<vector<int> > ndl; 106 | vector<vector<vector<int> > > ndlz; 107 | vector<vector<vector<int> > > nlzw; 108 | vector<vector<int> > nlz; 109 | 110 | // topic and label assignments 111 | vector<vector<double> > p; 112 | vector<vector<int> > z; 113 | vector<vector<int> > l; 114 | 115 | // model parameters 116 | vector<vector<double> > pi_dl; // size: (numDocs x L) 117 | vector<vector<vector<double> > > theta_dlz; // size: (numDocs x L x T) 118 | vector<vector<vector<double> > > phi_lzw; // size: (L x T x V) 119 | 120 | // hyperparameters 121 | vector<vector<double> > alpha_lz; // \alpha_tlz size: (L x T) 122 | vector<double> alphaSum_l; 123 | vector<vector<vector<double> > > beta_lzw; // size: (L x T x V) 124 | vector<vector<double> > betaSum_lz; 125 | vector<vector<double> > gamma_dl; // size: (numDocs x L) 126 | vector<double> gammaSum_d; 127 | vector<vector<double> > lambda_lw; // size: (L x V) -- for encoding prior
sentiment information 128 | 129 | vector<vector<double> > opt_alpha_lz; //optimal value, size:(L x T) -- for storing the optimal value of alpha_lz after fixed point iteration 130 | 131 | /************************* Functions ***************************/ 132 | int set_gamma(); 133 | int init_model_parameters(); 134 | inline int delete_model_parameters() { 135 | numDocs = 0; 136 | vocabSize = 0; 137 | corpusSize = 0; 138 | aveDocLength = 0; 139 | 140 | if (pdataset != NULL) { 141 | delete pdataset; 142 | pdataset = NULL; 143 | } 144 | 145 | return 0; 146 | } 147 | 148 | int init_estimate(); 149 | int estimate(); 150 | int prior2beta(); 151 | int sampling(int m, int n, int& sentiLab, int& topic); 152 | 153 | // compute parameter functions 154 | void compute_pi_dl(); 155 | void compute_theta_dlz(); 156 | void compute_phi_lzw(); 157 | 158 | // update parameter functions 159 | void init_parameters(); 160 | int update_Parameters(); 161 | 162 | // save model parameter functions 163 | int save_model(string model_name); 164 | int save_model_tassign(string filename); 165 | int save_model_pi_dl(string filename); 166 | int save_model_theta_dlz(string filename); 167 | int save_model_phi_lzw(string filename); 168 | int save_model_others(string filename); 169 | int save_model_twords(string filename); 170 | }; 171 | 172 | #endif 173 | -------------------------------------------------------------------------------- /src/inference.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details.
21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | 30 | #ifndef _INFERENCE_H 31 | #define _INFERENCE_H 32 | 33 | #include <fstream> 34 | #include <iostream> 35 | #include <string> 36 | #include <vector> 37 | #include "constants.h" 38 | #include "document.h" 39 | #include "dataset.h" 40 | #include "utils.h" 41 | #include "strtokenizer.h" 42 | 43 | using namespace std; 44 | 45 | 46 | class Inference { 47 | 48 | public: 49 | Inference(void); 50 | ~Inference(void); 51 | 52 | int numSentiLabs; 53 | int numTopics; 54 | int numDocs; // for trained model 55 | int vocabSize; // for trained model 56 | int newNumDocs; // for test set 57 | int newVocabSize; // for test set 58 | 59 | vector<vector<vector<int> > > nlzw; // for trained model 60 | vector<vector<int> > nlz; // for trained model 61 | mapword2atr word2atr; 62 | mapword2id word2id; 63 | mapid2word id2word; 64 | map<int, int> id2_id; 65 | map<int, int> _id2id; 66 | mapword2prior sentiLex; // word => prior sentiment information 67 | vector<string> newWords; 68 | 69 | string model_dir; 70 | string model_name; 71 | string data_dir; 72 | string datasetFile; 73 | string result_dir; 74 | string sentiLexFile; 75 | string wordmapfile; 76 | string betaFile; 77 | 78 | string tassign_suffix; 79 | string pi_suffix; 80 | string theta_suffix; 81 | string phi_suffix; 82 | string others_suffix; 83 | string twords_suffix; 84 | 85 | dataset * pmodelData; // pointer to trained model object 86 | dataset * pnewData; // pointer to new/test dataset object 87 | utils * putils; 88 | 89 | int niters; 90 | int liter; 91 | int twords; 92 | int savestep; 93 | int updateParaStep; 94 | 95 | double _alpha; 96 | double _beta; 97 | double _gamma; 98 | 99 | vector<vector<double> > new_p; // for posterior 100 | vector<vector<int> > new_z; 101 | vector<vector<int> > new_l; 102 | vector<vector<int> > z; // for trained model 103 | vector<vector<int> > l; // for trained model 104 | 105 | 106 | // from NEW/test documents 107 | vector<int> new_nd; 108 | vector<vector<int> > new_ndl; 109 | vector<vector<vector<int> > > new_ndlz; 110 | vector<vector<vector<int> > > new_nlzw; 111 | vector<vector<int> > new_nlz; 112 | 113 | // hyperparameters 114 | vector<vector<double> > alpha_lz; // size: (L x T) 115 | vector<double> alphaSum_l; 116 | vector<vector<vector<double> > > beta_lzw; // size: (L x T x V) 117 | vector<vector<double> > betaSum_lz; 118 | vector<double> gamma_l; // size: (L) 119 | double gammaSum; 120 | vector<vector<double> > lambda_lw; // size: (L x V) -- for encoding prior sentiment information 121 | 122 | // model parameters 123 | vector<vector<double> > newpi_dl; // size: (numDocs x L) 124 | vector<vector<vector<double> > > newtheta_dlz; // size: (numDocs x L x T) 125 | vector<vector<vector<double> > > newphi_lzw; // size: (L x T x V) 126 | 127 | // functions 128 | int init(int argc, char ** argv); 129 | int init_inf(); 130 | int inference(); // inference for new (unseen) data based on previously trained model 131 | int inf_sampling(int m, int n, int& sentiLab, int& topic); 132 | int init_parameters(); 133 | 134 | int read_newData(string filename); 135 | int read_model_setting(string filename); 136 | int load_model(string model_name); 137 | int prior2beta(); // for incorporating prior information 138 | 139 | // compute model parameters 140 | void compute_newpi(); 141 | void compute_newtheta(); 142 | int compute_newphi(); 143 | 144 | // save new data models 145 | int save_model(string model_name); 146 | int save_model_newtassign(string filename); 147 | int save_model_newpi_dl(string filename); 148 | int save_model_newtheta_dlz(string filename); 149 | int save_model_newphi_lzw(string filename); 150 | int
save_model_newothers(string filename); 151 | int save_model_newtwords(string filename); 152 | }; 153 | 154 | #endif 155 | -------------------------------------------------------------------------------- /src/cokus.h: -------------------------------------------------------------------------------- 1 | // This is the Mersenne Twister random number generator MT19937, which 2 | // generates pseudorandom integers uniformly distributed in 0..(2^32 - 1) 3 | // starting from any odd seed in 0..(2^32 - 1). This version is a recode 4 | // by Shawn Cokus (Cokus@math.washington.edu) on March 8, 1998 of a version by 5 | // Takuji Nishimura (who had suggestions from Topher Cooper and Marc Rieffel in 6 | // July-August 1997). 7 | // 8 | // Effectiveness of the recoding (on Goedel2.math.washington.edu, a DEC Alpha 9 | // running OSF/1) using GCC -O3 as a compiler: before recoding: 51.6 sec. to 10 | // generate 300 million random numbers; after recoding: 24.0 sec. for the same 11 | // (i.e., 46.5% of original time), so speed is now about 12.5 million random 12 | // number generations per second on this machine. 13 | // 14 | // According to the URL <http://www.math.keio.ac.jp/~matumoto/emt.html> 15 | // (and paraphrasing a bit in places), the Mersenne Twister is ``designed 16 | // with consideration of the flaws of various existing generators,'' has 17 | // a period of 2^19937 - 1, gives a sequence that is 623-dimensionally 18 | // equidistributed, and ``has passed many stringent tests, including the 19 | // die-hard test of G. Marsaglia and the load test of P. Hellekalek and 20 | // S. Wegenkittl.'' It is efficient in memory usage (typically using 2506 21 | // to 5012 bytes of static data, depending on data type sizes, and the code 22 | // is quite short as well). It generates random numbers in batches of 624 23 | // at a time, so the caching and pipelining of modern systems is exploited. 24 | // It is also divide- and mod-free. 25 | // 26 | // This library is free software; you can redistribute it and/or modify it 27 | // under the terms of the GNU Library General Public License as published by 28 | // the Free Software Foundation (either version 2 of the License or, at your 29 | // option, any later version). This library is distributed in the hope that 30 | // it will be useful, but WITHOUT ANY WARRANTY, without even the implied 31 | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See 32 | // the GNU Library General Public License for more details. You should have 33 | // received a copy of the GNU Library General Public License along with this 34 | // library; if not, write to the Free Software Foundation, Inc., 59 Temple 35 | // Place, Suite 330, Boston, MA 02111-1307, USA. 36 | // 37 | // The code as Shawn received it included the following notice: 38 | // 39 | // Copyright (C) 1997 Makoto Matsumoto and Takuji Nishimura. When 40 | // you use this, send an e-mail to <matumoto@math.keio.ac.jp> with 41 | // an appropriate reference to your work. 42 | // 43 | // It would be nice to CC: <Cokus@math.washington.edu> when you write.
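// [Editor's note -- not part of the original source] A minimal usage sketch
// of this generator, assuming the header is included in exactly one
// translation unit (it defines functions and static state directly in the
// header):
//
//     #include "cokus.h"
//     #include <cstdio>
//     int main() {
//         seedMT(4357U);                         // any seed; forced odd internally
//         double u = randomMT() / 4294967296.0;  // uniform double in [0, 1); 2^32 = 4294967296
//         printf("%f\n", u);
//         return 0;
//     }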
44 | // 45 | 46 | //#include <stdio.h> 47 | //#include <stdlib.h> 48 | 49 | // 50 | // uint32 must be an unsigned integer type capable of holding at least 32 51 | // bits; exactly 32 should be fastest, but 64 is better on an Alpha with 52 | // GCC at -O3 optimization so try your options and see what's best for you 53 | // 54 | 55 | #ifndef _COKUS_H 56 | #define _COKUS_H 57 | 58 | 59 | typedef unsigned long uint32; 60 | 61 | #define N (624) // length of state vector 62 | #define M (397) // a period parameter 63 | #define K (0x9908B0DFU) // a magic constant 64 | #define hiBit(u) ((u) & 0x80000000U) // mask all but highest bit of u 65 | #define loBit(u) ((u) & 0x00000001U) // mask all but lowest bit of u 66 | #define loBits(u) ((u) & 0x7FFFFFFFU) // mask the highest bit of u 67 | #define mixBits(u, v) (hiBit(u)|loBits(v)) // move hi bit of u to hi bit of v 68 | 69 | static uint32 state[N+1]; // state vector + 1 extra to not violate ANSI C 70 | static uint32 *next; // next random value is computed from here 71 | static int left = -1; // can *next++ this many times before reloading 72 | 73 | 74 | void seedMT(uint32 seed) 75 | { 76 | // 77 | // We initialize state[0..(N-1)] via the generator 78 | // 79 | // x_new = (69069 * x_old) mod 2^32 80 | // 81 | // from Line 15 of Table 1, p. 106, Sec. 3.3.4 of Knuth's 82 | // _The Art of Computer Programming_, Volume 2, 3rd ed. 83 | // 84 | // Notes (SJC): I do not know what the initial state requirements 85 | // of the Mersenne Twister are, but it seems this seeding generator 86 | // could be better. It achieves the maximum period for its modulus 87 | // (2^30) iff x_initial is odd (p. 20-21, Sec. 3.2.1.2, Knuth); if 88 | // x_initial can be even, you have sequences like 0, 0, 0, ...; 89 | // 2^31, 2^31, 2^31, ...; 2^30, 2^30, 2^30, ...; 2^29, 2^29 + 2^31, 90 | // 2^29, 2^29 + 2^31, ..., etc. so I force seed to be odd below. 91 | // 92 | // Even if x_initial is odd, if x_initial is 1 mod 4 then 93 | // 94 | // the lowest bit of x is always 1, 95 | // the next-to-lowest bit of x is always 0, 96 | // the 2nd-from-lowest bit of x alternates ... 0 1 0 1 0 1 0 1 ... , 97 | // the 3rd-from-lowest bit of x 4-cycles ... 0 1 1 0 0 1 1 0 ... , 98 | // the 4th-from-lowest bit of x has the 8-cycle ... 0 0 0 1 1 1 1 0 ... , 99 | // ... 100 | // 101 | // and if x_initial is 3 mod 4 then 102 | // 103 | // the lowest bit of x is always 1, 104 | // the next-to-lowest bit of x is always 1, 105 | // the 2nd-from-lowest bit of x alternates ... 0 1 0 1 0 1 0 1 ... , 106 | // the 3rd-from-lowest bit of x 4-cycles ... 0 0 1 1 0 0 1 1 ... , 107 | // the 4th-from-lowest bit of x has the 8-cycle ... 0 0 1 1 1 1 0 0 ... , 108 | // ... 109 | // 110 | // The generator's potency (min. s>=0 with (69069-1)^s = 0 mod 2^32) is 111 | // 16, which seems to be alright by p. 25, Sec. 3.2.1.3 of Knuth. It 112 | // also does well in the dimension 2..5 spectral tests, but it could be 113 | // better in dimension 6 (Line 15, Table 1, p. 106, Sec. 3.3.4, Knuth). 114 | // 115 | // Note that the random number user does not see the values generated 116 | // here directly since reloadMT() will always munge them first, so maybe 117 | // none of all of this matters. In fact, the seed values made here could 118 | // even be extra-special desirable if the Mersenne Twister theory says 119 | // so-- that's why the only change I made is to restrict to odd seeds.
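// [Editor's note -- not part of the original source] A small worked example
// of the seeding recurrence above, assuming seed = 4357: x_0 = (4357 | 1) = 4357
// (already odd), x_1 = (69069 * 4357) mod 2^32 = 300933633,
// x_2 = (69069 * 300933633) mod 2^32, and so on; the "mod 2^32" step is the
// "& 0xFFFFFFFFU" mask in the loop below.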
120 | // 121 | 122 | register uint32 x = (seed | 1U) & 0xFFFFFFFFU, *s = state; 123 | register int j; 124 | 125 | for(left=0, *s++=x, j=N; --j; 126 | *s++ = (x*=69069U) & 0xFFFFFFFFU); 127 | } 128 | 129 | 130 | uint32 reloadMT(void) 131 | { 132 | register uint32 *p0=state, *p2=state+2, *pM=state+M, s0, s1; 133 | register int j; 134 | 135 | if(left < -1) 136 | seedMT(4357U); 137 | 138 | left=N-1, next=state+1; 139 | 140 | for(s0=state[0], s1=state[1], j=N-M+1; --j; s0=s1, s1=*p2++) 141 | *p0++ = *pM++ ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U); 142 | 143 | for(pM=state, j=M; --j; s0=s1, s1=*p2++) 144 | *p0++ = *pM++ ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U); 145 | 146 | s1=state[0], *p0 = *pM ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U); 147 | s1 ^= (s1 >> 11); 148 | s1 ^= (s1 << 7) & 0x9D2C5680U; 149 | s1 ^= (s1 << 15) & 0xEFC60000U; 150 | return(s1 ^ (s1 >> 18)); 151 | } 152 | 153 | 154 | uint32 randomMT(void) 155 | { 156 | uint32 y; 157 | 158 | if(--left < 0) 159 | return(reloadMT()); 160 | 161 | y = *next++; 162 | y ^= (y >> 11); 163 | y ^= (y << 7) & 0x9D2C5680U; 164 | y ^= (y << 15) & 0xEFC60000U; 165 | y ^= (y >> 18); 166 | return(y); 167 | } 168 | 169 | #endif 170 | 171 | 172 | -------------------------------------------------------------------------------- /src/utils.cpp: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | #include <stdio.h> 30 | #include <stdlib.h> 31 | #include <string> 32 | #include <iostream> 33 | #include <fstream> 34 | #include "strtokenizer.h" 35 | #include "utils.h" 36 | #include "model.h" 37 | #include "inference.h" 38 | #include "dataset.h" 39 | #include <sys/types.h> 40 | #include <sys/stat.h> 41 | 42 | using namespace std; 43 | 44 | #undef WINDOWS 45 | #ifdef _WIN32 46 | #define WINDOWS 47 | #endif 48 | #ifdef __WIN32__ 49 | #define WINDOWS 50 | #endif 51 | 52 | #ifdef WINDOWS 53 | #include <direct.h> // For _mkdir(). 54 | #include <io.h> // For access(). 55 | #else 56 | #include <unistd.h> // For access().
57 | #endif 58 | 59 | 60 | utils::utils() { 61 | model_status = MODEL_STATUS_UNKNOWN; 62 | model_dir = ""; 63 | data_dir = ""; 64 | result_dir = ""; 65 | model_name = ""; 66 | configfile = ""; 67 | 68 | wordmapfile = ""; 69 | sentiLexFile = ""; 70 | datasetFile = ""; 71 | configfile = ""; 72 | numSentiLabs = 0; 73 | numTopics = 0; 74 | niters = 0; 75 | savestep = 0; 76 | twords = 0; 77 | updateParaStep = -1; 78 | 79 | alpha = -1.0; 80 | beta = -1.0; 81 | gamma = -1.0; 82 | } 83 | 84 | 85 | int utils::parse_args(int argc, char ** argv, int& model_status) { 86 | int i = 1; 87 | while (i < argc) { 88 | string arg = argv[i]; 89 | if (arg == "-est") { 90 | model_status = MODEL_STATUS_EST; 91 | break; 92 | } 93 | else if (arg == "-estc") { 94 | model_status = MODEL_STATUS_ESTC; 95 | break; 96 | } 97 | else if (arg == "-inf") { 98 | model_status = MODEL_STATUS_INF; 99 | break; 100 | } 101 | i++; 102 | } 103 | 104 | this->model_status = model_status; 105 | cout << "model_status = " << this->model_status<< endl; 106 | return (model_status); 107 | } 108 | 109 | 110 | 111 | int utils::parse_args_est(int argc, char ** argv, model * pmodel) { 112 | 113 | int i = 1; 114 | while (i < argc) { 115 | string arg = argv[i]; 116 | if (arg == "-config") { 117 | configfile = argv[++i]; 118 | break; 119 | } 120 | i++; 121 | } 122 | 123 | if (configfile != "") { 124 | if (read_config_file(configfile)) { 125 | return 1; 126 | } 127 | } 128 | 129 | if (wordmapfile != "") 130 | pmodel->wordmapfile = wordmapfile; 131 | 132 | if (sentiLexFile != "") 133 | pmodel->sentiLexFile = sentiLexFile; 134 | 135 | if (datasetFile != "") { 136 | pmodel->datasetFile = datasetFile; 137 | } 138 | 139 | if (numSentiLabs > 0) pmodel->numSentiLabs = numSentiLabs; 140 | if (numTopics > 0) pmodel->numTopics = numTopics; 141 | if (niters > 0) pmodel->niters = niters; 142 | if (savestep > 0) pmodel->savestep = savestep; 143 | if (twords > 0) pmodel->twords = twords; 144 | pmodel->updateParaStep = updateParaStep; // -1: no parameter optimization 145 | 146 | if (alpha > 0.0) pmodel->_alpha = alpha; 147 | if (beta > 0.0) pmodel->_beta = beta; 148 | if (gamma > 0.0) pmodel->_gamma = gamma; 149 | 150 | if (data_dir != "") { 151 | if (data_dir[data_dir.size() - 1] != '/') { 152 | data_dir += "/"; 153 | } 154 | pmodel->data_dir = data_dir; 155 | } 156 | else { 157 | printf("Please specify input data dir!\n"); 158 | return 1; 159 | } 160 | 161 | if (result_dir != "") { 162 | if (make_dir(result_dir)) return 1; 163 | if (result_dir[result_dir.size() - 1] != '/') { 164 | result_dir += "/"; 165 | } 166 | pmodel->result_dir = result_dir; 167 | } 168 | else { 169 | printf("Please specify output dir!\n"); 170 | return 1; 171 | } 172 | 173 | return 0; 174 | } 175 | 176 | 177 | int utils::parse_args_inf(int argc, char ** argv, Inference * pmodel_inf) { 178 | 179 | int i = 1; 180 | while (i < argc) { 181 | string arg = argv[i]; 182 | printf("arg=%s\n", arg.c_str()); 183 | if (arg == "-config") { 184 | configfile = argv[++i]; 185 | break; 186 | } 187 | i++; 188 | } 189 | if (configfile != "") { 190 | if (read_config_file(configfile)) return 1; 191 | } 192 | 193 | if (wordmapfile != "") 194 | pmodel_inf->wordmapfile = wordmapfile; 195 | 196 | if (sentiLexFile != "") 197 | pmodel_inf->sentiLexFile = sentiLexFile; 198 | 199 | if (datasetFile != "") 200 | pmodel_inf->datasetFile = datasetFile; 201 | else { 202 | printf("Please specify input dataset file!\n"); 203 | return 1; 204 | } 205 | 206 | if (model_dir != "") { 207 | if (model_dir[model_dir.size() - 1] != 
'/') model_dir += "/"; 208 | pmodel_inf->model_dir = model_dir; 209 | } 210 | 211 | if (data_dir != "") { 212 | if (data_dir[data_dir.size() - 1] != '/') data_dir += "/"; 213 | pmodel_inf->data_dir = data_dir; 214 | } 215 | else { 216 | printf("Please specify input data dir!\n"); 217 | return 1; 218 | } 219 | 220 | if (result_dir != "") { 221 | if (make_dir(result_dir)) return 1; 222 | if (result_dir[result_dir.size() - 1] != '/') result_dir += "/"; 223 | pmodel_inf->result_dir = result_dir; 224 | } 225 | else { 226 | printf("Please specify output dir!\n"); 227 | return 1; 228 | } 229 | 230 | if (model_name != "") 231 | pmodel_inf->model_name = model_name; 232 | else { 233 | printf("Please specify the trained dJST model name!\n"); 234 | return 1; 235 | } 236 | 237 | if (niters > 0) pmodel_inf->niters = niters; 238 | 239 | 240 | if (twords > 0) pmodel_inf->twords = twords; 241 | if (savestep > 0) pmodel_inf->savestep = savestep; 242 | if (updateParaStep > 0) pmodel_inf->updateParaStep = updateParaStep; 243 | if (alpha > 0.0) pmodel_inf->_alpha = alpha; 244 | if (beta > 0.0) pmodel_inf->_beta = beta; 245 | if (gamma > 0.0) pmodel_inf->_gamma = gamma; 246 | 247 | return 0; 248 | } 249 | 250 | 251 | int utils::read_config_file(string filename) { 252 | 253 | char buff[BUFF_SIZE_SHORT]; 254 | string line; 255 | 256 | FILE * fin = fopen(filename.c_str(), "r"); 257 | if (!fin) { 258 | printf("Cannot read file %s\n", filename.c_str()); 259 | return 1; 260 | } 261 | 262 | while (fgets(buff, BUFF_SIZE_SHORT - 1, fin)) { 263 | line = buff; 264 | strtokenizer strtok(line, "= \t\r\n"); 265 | int count = strtok.count_tokens(); 266 | 267 | // line invalid, ignore 268 | if (count != 2) { 269 | continue; 270 | } 271 | 272 | string optstr = strtok.token(0); 273 | string optval = strtok.token(1); 274 | 275 | if(optstr == "nsentiLabs") 276 | numSentiLabs = atoi(optval.c_str()); 277 | else if(optstr == "ntopics") 278 | numTopics = atoi(optval.c_str()); 279 | else if(optstr == "niters") 280 | niters = atoi(optval.c_str()); 281 | else if(optstr == "savestep") 282 | savestep = atoi(optval.c_str()); 283 | else if (optstr == "updateParaStep") 284 | updateParaStep = atoi(optval.c_str()); 285 | else if(optstr == "twords") 286 | twords = atoi(optval.c_str()); 287 | else if(optstr == "data_dir") 288 | data_dir = optval; 289 | else if (optstr == "model_dir") 290 | model_dir = optval; 291 | else if(optstr == "result_dir") 292 | result_dir = optval; 293 | else if(optstr == "datasetFile") 294 | datasetFile = optval; 295 | else if(optstr == "sentiFile") 296 | sentiLexFile = optval; 297 | else if (optstr == "vocabFile") 298 | wordmapfile = optval; 299 | else if (optstr == "alpha") 300 | alpha = atof(optval.c_str()); 301 | else if (optstr == "beta") 302 | beta = atof(optval.c_str()); 303 | else if (optstr == "gamma") 304 | gamma = atof(optval.c_str()); 305 | else if (optstr == "model") 306 | model_name = optval; 307 | } 308 | 309 | fclose(fin); 310 | 311 | return 0; 312 | } 313 | 314 | 315 | string utils::generate_model_name(int iter) { 316 | 317 | string model_name; 318 | std::stringstream out; 319 | char buff[BUFF_SIZE_SHORT]; 320 | 321 | sprintf(buff, "%05d", iter); 322 | 323 | if (iter >= 0) 324 | model_name = buff; 325 | else 326 | model_name = "final"; 327 | 328 | return model_name; 329 | } 330 | 331 | 332 | #ifdef WINDOWS 333 | int utils::make_dir(string strPath) { 334 | if(_access(strPath.c_str(), 0) == 0) 335 | return 0; 336 | else if(_mkdir(strPath.c_str()) == 0) 337 | return 0; 338 | else { 339 | printf("Throw 
exception in creating directory %s !\n",strPath.c_str());
340 | return 1;
341 | }
342 | }
343 | #else
344 | int utils::make_dir(string strPath) {
345 | if(access(strPath.c_str(), 0) == 0)
346 | return 0;
347 | else if(mkdir(strPath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH) == 0)
348 | return 0;
349 | else {
350 | cout << "Throw exception in creating directory " << strPath << " !" << endl;
351 | return 1;
352 | }
353 | }
354 | #endif
355 |
--------------------------------------------------------------------------------
/src/dataset.cpp:
--------------------------------------------------------------------------------
1 | /**********************************************************************
2 | Joint Sentiment-Topic (JST) Model
3 | ***********************************************************************
4 |
5 | (C) Copyright 2013, Chenghua Lin and Yulan He
6 |
7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk.
8 | Part of code is from http://gibbslda.sourceforge.net/.
9 |
10 | This file is part of JST implementation.
11 |
12 | JST is free software; you can redistribute it and/or modify it under
13 | the terms of the GNU General Public License as published by the Free
14 | Software Foundation; either version 2 of the License, or (at your
15 | option) any later version.
16 |
17 | JST is distributed in the hope that it will be useful, but WITHOUT
18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
20 | for more details.
21 |
22 | You should have received a copy of the GNU General Public License
23 | along with this program; if not, write to the Free Software
24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
25 | USA
26 |
27 | ***********************************************************************/
28 |
29 |
30 | #include "dataset.h"
31 | #include "document.h"
32 | #include "strtokenizer.h"
33 | #include "map_type.h"
34 | #include <stdio.h>
35 | #include <stdlib.h>
36 | #include <string>
37 | #include <vector>
38 | #include <fstream>
39 | using namespace std;
40 |
41 |
42 | dataset::dataset() {
43 | pdocs = NULL;
44 | _pdocs = NULL;
45 | word2atr.clear();
46 | result_dir = ".";
47 | wordmapfile = "wordmap.txt";
48 |
49 | numDocs = 0;
50 | aveDocLength = 0;
51 | vocabSize = 0;
52 | corpusSize = 0;
53 | }
54 |
55 | dataset::dataset(string result_dir) {
56 | pdocs = NULL;
57 | _pdocs = NULL;
58 | word2atr.clear();
59 | this->result_dir = result_dir;
60 | wordmapfile = "wordmap.txt";
61 |
62 | numDocs = 0;
63 | aveDocLength = 0;
64 | vocabSize = 0;
65 | corpusSize = 0;
66 | }
67 |
68 |
69 | dataset::~dataset(void) {
70 | deallocate();
71 | }
72 |
73 |
74 | int dataset::read_dataStream(ifstream& fin) {
75 | string line;
76 | char buff[BUFF_SIZE_LONG];
77 | docs.clear();
78 | numDocs = 0;
79 |
80 | while (fin.getline(buff, BUFF_SIZE_LONG)) {
81 | line = buff;
82 | if(!line.empty()) {
83 | docs.push_back(line);
84 | numDocs++;
85 | }
86 | }
87 |
88 | if (numDocs > 0) {
89 | this->analyzeCorpus(docs);
90 | }
91 |
92 | return 0;
93 | }
94 |
95 |
96 | int dataset::analyzeCorpus(vector<string>& docs) {
97 |
98 | mapword2atr::iterator it;
99 | mapword2id::iterator vocabIt;
100 | mapword2prior::iterator sentiIt;
101 | map<int, int>::iterator idIt;
102 |
103 | string line;
104 | numDocs = docs.size();
105 | vocabSize = 0;
106 | corpusSize = 0;
107 | aveDocLength = 0;
108 |
109 | // allocate memory for corpus
110 | if (pdocs) {
111 | deallocate();
112 | pdocs = new document*[numDocs];
113 | }
114 | else {
115 | pdocs = new document*[numDocs];
116 | }
117 |
118 | for (int i = 0; i < (int)docs.size(); ++i) {
119 | line = docs.at(i);
120 | strtokenizer strtok(line, " \t\r\n"); // \t\r\n are the separators
121 | int docLength = strtok.count_tokens();
122 |
123 | if (docLength <= 0) {
124 | printf("Invalid (empty) document!\n");
125 | deallocate();
126 | numDocs = vocabSize = 0;
127 | return 1;
128 | }
129 |
130 | corpusSize += docLength - 1; // the first word is document name/id
131 |
132 | // allocate memory for the new document_i
133 | document * pdoc = new document(docLength-1);
134 | pdoc->docID = strtok.token(0).c_str();
135 |
136 | // generate ID for the tokens in the corpus, and assign each word token the corresponding vocabulary ID.
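// Each first-time token is assigned the next free vocabulary index,
// word2atr.size(), together with its prior polarity from the sentiment
// lexicon (or -1 if the word is not in the lexicon); a token seen before
// simply reuses the ID and prior polarity already stored in word2atr.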
137 | for (int k = 0; k < docLength-1; k++) {
138 | int priorSenti = -1;
139 | it = word2atr.find(strtok.token(k+1).c_str());
140 |
141 | if (it == word2atr.end()) { // i.e., new word
142 | pdoc->words[k] = word2atr.size();
143 | sentiIt = sentiLex.find(strtok.token(k+1).c_str()); // check whether the word token can be found in the sentiment lexicon
144 | // incorporate sentiment lexicon
145 | if (sentiIt != sentiLex.end()) {
146 | priorSenti = sentiIt->second.id;
147 | }
148 |
149 | // insert sentiment info into word2atr
150 | Word_atr temp = {word2atr.size(), priorSenti}; // vocabulary index; word polarity
151 | word2atr.insert(pair<string, Word_atr>(strtok.token(k+1), temp));
152 | pdoc->priorSentiLabels[k] = priorSenti;
153 |
154 | }
155 | else { // word seen before
156 | pdoc->words[k] = it->second.id;
157 | pdoc->priorSentiLabels[k] = it->second.polarity;
158 | }
159 | }
160 |
161 | add_doc(pdoc, i);
162 | }
163 |
164 |
165 | // update number of words
166 | vocabSize = word2atr.size();
167 | aveDocLength = corpusSize/numDocs;
168 |
169 | if (write_wordmap(result_dir + wordmapfile, word2atr)) {
170 | printf("ERROR! Cannot write wordmap file %s!\n", wordmapfile.c_str());
171 | return 1;
172 | }
173 | if (read_wordmap(result_dir + wordmapfile, id2word)) {
174 | printf("ERROR! Cannot read wordmap file %s!\n", wordmapfile.c_str());
175 | return 1;
176 | }
177 |
178 | docs.clear();
179 | return 0;
180 | }
181 |
182 |
183 |
184 | void dataset::deallocate()
185 | {
186 | if (pdocs) {
187 | for (int i = 0; i < numDocs; i++)
188 | delete pdocs[i];
189 | delete [] pdocs;
190 | pdocs = NULL;
191 | }
192 |
193 | if (_pdocs) {
194 | for (int i = 0; i < numDocs; i++)
195 | delete _pdocs[i];
196 | delete [] _pdocs;
197 | _pdocs = NULL;
198 | }
199 | }
200 |
201 |
202 | void dataset::add_doc(document * doc, int idx) {
203 | if (0 <= idx && idx < numDocs)
204 | pdocs[idx] = doc;
205 | }
206 |
207 | void dataset::_add_doc(document * doc, int idx) {
208 | if (0 <= idx && idx < numDocs) {
209 | _pdocs[idx] = doc;
210 | }
211 | }
212 |
213 |
214 | int dataset::read_senti_lexicon(string sentiLexiconFile) {
215 | sentiLex.clear();
216 | char buff[BUFF_SIZE_SHORT];
217 | string line;
218 | vector<double> wordPrior;
219 | int labID;
220 | double tmp, val;
221 | int numSentiLabs;
222 |
223 | FILE * fin = fopen(sentiLexiconFile.c_str(), "r");
224 | if (!fin) {
225 | printf("Cannot read file %s!\n", sentiLexiconFile.c_str());
226 | return 1;
227 | }
228 |
229 | while (fgets(buff, BUFF_SIZE_SHORT - 1, fin) != NULL) {
230 | line = buff;
231 | strtokenizer strtok(line, " \t\r\n");
232 |
233 | if (strtok.count_tokens() < 2) {
234 | printf("Warning! The strtok count in the lexicon line [%s] is smaller than 2!\n", line.c_str());
235 | }
236 | else {
237 | tmp = 0.0;
238 | labID = 0;
239 | wordPrior.clear();
240 | numSentiLabs = strtok.count_tokens();
241 | for (int k = 1; k < strtok.count_tokens(); k++) {
242 | val = atof(strtok.token(k).c_str());
243 | if (tmp < val) {
244 | tmp = val;
245 | labID = k-1;
246 | }
247 | wordPrior.push_back(val);
248 | }
249 | Word_Prior_Attr temp = {labID, wordPrior}; // sentiment label ID, sentiment label distribution
250 | sentiLex.insert(pair<string, Word_Prior_Attr>(strtok.token(0), temp));
251 | }
252 | }
253 |
254 | if (sentiLex.size() <= 0) {
255 | printf("Cannot find any sentiment lexicon in file %s!\n", sentiLexiconFile.c_str());
256 | return 1;
257 | }
258 |
259 | fclose(fin);
260 | return 0;
261 | }
262 |
263 |
264 | int dataset::write_wordmap(string wordmapfile, mapword2atr &pword2atr) {
265 |
266 | FILE * fout = fopen(wordmapfile.c_str(), "w");
267 | if (!fout) {
268 | printf("Cannot open file %s to write!\n", wordmapfile.c_str());
269 | return 1;
270 | }
271 |
272 | mapword2atr::iterator it;
273 | fprintf(fout, "%d\n", (int)(pword2atr.size()));
274 | for (it = pword2atr.begin(); it != pword2atr.end(); it++) {
275 | fprintf(fout, "%s %d\n", (it->first).c_str(), it->second.id);
276 | }
277 |
278 | fclose(fout);
279 | return 0;
280 | }
281 |
282 |
283 | int dataset::read_wordmap(string wordmapfile, mapid2word &pid2word) {
284 | pid2word.clear();
285 |
286 | FILE * fin = fopen(wordmapfile.c_str(), "r");
287 | if (!fin) {
288 | printf("Cannot open file %s to read!\n", wordmapfile.c_str());
289 | return 1;
290 | }
291 |
292 | char buff[BUFF_SIZE_SHORT];
293 | string line;
294 |
295 | fgets(buff, BUFF_SIZE_SHORT - 1, fin);
296 | int nwords = atoi(buff);
297 |
298 | for (int i = 0; i < nwords; i++) {
299 | fgets(buff, BUFF_SIZE_SHORT - 1, fin);
300 | line = buff;
301 | strtokenizer strtok(line, " \t\r\n");
302 | if (strtok.count_tokens() != 2) {
303 | printf("Warning! Line %d in %s contains fewer than 2 tokens!\n", i+1, wordmapfile.c_str());
304 | continue;
305 | }
306 |
307 | pid2word.insert(pair<int, string>(atoi(strtok.token(1).c_str()), strtok.token(0)));
308 | }
309 |
310 | fclose(fin);
311 | return 0;
312 | }
313 |
314 |
315 | int dataset::read_wordmap(string wordmapfile, mapword2id& pword2id) {
316 | pword2id.clear();
317 | char buff[BUFF_SIZE_SHORT];
318 | string line;
319 |
320 |
321 | FILE * fin = fopen(wordmapfile.c_str(), "r");
322 | if (!fin) {
323 | printf("Cannot read file %s!\n", wordmapfile.c_str());
324 | return 1;
325 | }
326 |
327 | fgets(buff, BUFF_SIZE_SHORT - 1, fin);
328 | int nwords = atoi(buff);
329 |
330 | for (int i = 0; i < nwords; i++) {
331 | fgets(buff, BUFF_SIZE_SHORT - 1, fin);
332 | line = buff;
333 | strtokenizer strtok(line, " \t\r\n");
334 | if (strtok.count_tokens() != 2) {
335 | continue;
336 | }
337 | pword2id.insert(pair<string, int>(strtok.token(0), atoi(strtok.token(1).c_str())));
338 | }
339 |
340 | fclose(fin);
341 | return 0;
342 | }
343 |
--------------------------------------------------------------------------------
/src/model.cpp:
--------------------------------------------------------------------------------
1 | /**********************************************************************
2 | Joint Sentiment-Topic (JST) Model
3 | ***********************************************************************
4 |
5 | (C) Copyright 2013, Chenghua Lin and Yulan He
6 |
7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk.
8 | Part of code is from http://gibbslda.sourceforge.net/.
9 |
10 | This file is part of JST implementation.
11 |
12 | JST is free software; you can redistribute it and/or modify it under
13 | the terms of the GNU General Public License as published by the Free
14 | Software Foundation; either version 2 of the License, or (at your
15 | option) any later version.
16 |
17 | JST is distributed in the hope that it will be useful, but WITHOUT
18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
20 | for more details.
21 |
22 | You should have received a copy of the GNU General Public License
23 | along with this program; if not, write to the Free Software
24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
25 | USA
26 |
27 | ***********************************************************************/
28 |
29 |
30 | #include "model.h"
31 | using namespace std;
32 |
33 |
34 | model::model(void) {
35 |
36 | wordmapfile = "wordmap.txt";
37 | tassign_suffix = ".tassign";
38 | pi_suffix = ".pi";
39 | theta_suffix = ".theta";
40 | phi_suffix = ".phi";
41 | others_suffix = ".others";
42 | twords_suffix = ".twords";
43 |
44 | numTopics = 50;
45 | numSentiLabs = 3;
46 | vocabSize = 0;
47 | numDocs = 0;
48 | corpusSize = 0;
49 | aveDocLength = 0;
50 |
51 | niters = 1000;
52 | liter = 0;
53 | savestep = 200;
54 | twords = 20;
55 | updateParaStep = 40;
56 |
57 | _alpha = -1.0;
58 | _beta = -1.0;
59 | _gamma = -1.0;
60 |
61 | putils = new utils();
62 | }
63 |
64 |
65 | model::~model(void) {
66 | if (putils) delete putils;
67 | }
68 |
69 |
70 | int model::init(int argc, char ** argv) {
71 |
72 | if (putils->parse_args_est(argc, argv, this)) {
73 | return 1;
74 | }
75 |
76 | cout << "data_dir = " << data_dir << endl;
77 | cout << "datasetFile = " << datasetFile << endl;
78 | cout << "result_dir = " << result_dir << endl;
79 | cout << "sentiLexFile = " << sentiLexFile << endl;
80 | cout << "wordmapfile = " << wordmapfile << endl;
81 | cout << "numSentiLabs = " << numSentiLabs << endl;
82 | cout << "numTopics = " << numTopics << endl;
83 | cout << "niters = " << niters << endl;
84 | cout << "savestep = " << savestep << endl;
85 | cout << "twords = " << twords << endl;
86 | cout << "updateParaStep = " << updateParaStep << endl;
87 | cout << "alpha = " << _alpha << endl;
88 | cout << "beta = " << _beta << endl;
89 | cout << "gamma = " << _gamma << endl;
90 |
91 | return 0;
92 | }
93 |
94 |
95 | int model::excute_model() {
96 |
97 | ifstream fin;
98 | pdataset = new dataset(result_dir);
99 |
100 | // read sentiment lexicon file
101 | if (sentiLexFile != "") {
102 | if (pdataset->read_senti_lexicon((sentiLexFile).c_str())) {
103 | printf("Error! Cannot read sentiFile %s!\n", (sentiLexFile).c_str());
104 | delete pdataset;
105 | return 1;
106 | }
107 | this->sentiLex = pdataset->sentiLex;
108 | }
109 |
110 | // read training data
111 | fin.open((data_dir+datasetFile).c_str(), ifstream::in);
112 | if(!fin) {
113 | printf("Error! Cannot read dataset %s!\n", (data_dir+datasetFile).c_str());
114 | return 1;
115 | }
116 |
117 | if(pdataset->read_dataStream(fin)) {
118 | printf("Throw exception in function read_dataStream()! 
\n"); 119 | delete pdataset; 120 | return 1; 121 | } 122 | 123 | word2atr = pdataset->word2atr; 124 | id2word = pdataset->id2word; 125 | init_model_parameters(); 126 | if (init_estimate()) return 1; 127 | if(estimate()) return 1; 128 | delete_model_parameters(); 129 | fin.close(); 130 | 131 | return 0; 132 | } 133 | 134 | 135 | int model::init_model_parameters() 136 | { 137 | numDocs = pdataset->numDocs; 138 | vocabSize = pdataset->vocabSize; 139 | corpusSize = pdataset->corpusSize; 140 | aveDocLength = pdataset->aveDocLength; 141 | 142 | // model counts 143 | nd.resize(numDocs); 144 | for (int m = 0; m < numDocs; m++) { 145 | nd[m] = 0; 146 | } 147 | 148 | ndl.resize(numDocs); 149 | for (int m = 0; m < numDocs; m++) { 150 | ndl[m].resize(numSentiLabs); 151 | for (int l = 0; l < numSentiLabs; l++) 152 | ndl[m][l] = 0; 153 | } 154 | 155 | ndlz.resize(numDocs); 156 | for (int m = 0; m < numDocs; m++) { 157 | ndlz[m].resize(numSentiLabs); 158 | for (int l = 0; l < numSentiLabs; l++) { 159 | ndlz[m][l].resize(numTopics); 160 | for (int z = 0; z < numTopics; z++) 161 | ndlz[m][l][z] = 0; 162 | } 163 | } 164 | 165 | nlzw.resize(numSentiLabs); 166 | for (int l = 0; l < numSentiLabs; l++) { 167 | nlzw[l].resize(numTopics); 168 | for (int z = 0; z < numTopics; z++) { 169 | nlzw[l][z].resize(vocabSize); 170 | for (int r = 0; r < vocabSize; r++) 171 | nlzw[l][z][r] = 0; 172 | } 173 | } 174 | 175 | nlz.resize(numSentiLabs); 176 | for (int l = 0; l < numSentiLabs; l++) { 177 | nlz[l].resize(numTopics); 178 | for (int z = 0; z < numTopics; z++) { 179 | nlz[l][z] = 0; 180 | } 181 | } 182 | 183 | // posterior P 184 | p.resize(numSentiLabs); 185 | for (int l = 0; l < numSentiLabs; l++) { 186 | p[l].resize(numTopics); 187 | } 188 | 189 | // model parameters 190 | pi_dl.resize(numDocs); 191 | for (int m = 0; m < numDocs; m++) { 192 | pi_dl[m].resize(numSentiLabs); 193 | } 194 | 195 | theta_dlz.resize(numDocs); 196 | for (int m = 0; m < numDocs; m++) { 197 | theta_dlz[m].resize(numSentiLabs); 198 | for (int l = 0; l < numSentiLabs; l++) { 199 | theta_dlz[m][l].resize(numTopics); 200 | } 201 | } 202 | 203 | phi_lzw.resize(numSentiLabs); 204 | for (int l = 0; l < numSentiLabs; l++) { 205 | phi_lzw[l].resize(numTopics); 206 | for (int z = 0; z < numTopics; z++) { 207 | phi_lzw[l][z].resize(vocabSize); 208 | } 209 | } 210 | 211 | // init hyperparameters 212 | alpha_lz.resize(numSentiLabs); 213 | for (int l = 0; l < numSentiLabs; l++) { 214 | alpha_lz[l].resize(numTopics); 215 | } 216 | 217 | alphaSum_l.resize(numSentiLabs); 218 | 219 | if (_alpha <= 0) { 220 | _alpha = (double)aveDocLength * 0.05 / (double)(numSentiLabs * numTopics); 221 | } 222 | 223 | for (int l = 0; l < numSentiLabs; l++) { 224 | alphaSum_l[l] = 0.0; 225 | for (int z = 0; z < numTopics; z++) { 226 | alpha_lz[l][z] = _alpha; 227 | alphaSum_l[l] += alpha_lz[l][z]; 228 | } 229 | } 230 | 231 | opt_alpha_lz.resize(numSentiLabs); 232 | for (int l = 0; l < numSentiLabs; l++) { 233 | opt_alpha_lz[l].resize(numTopics); 234 | } 235 | 236 | //beta 237 | if (_beta <= 0) _beta = 0.01; 238 | 239 | beta_lzw.resize(numSentiLabs); 240 | betaSum_lz.resize(numSentiLabs); 241 | for (int l = 0; l < numSentiLabs; l++) { 242 | beta_lzw[l].resize(numTopics); 243 | betaSum_lz[l].resize(numTopics); 244 | for (int z = 0; z < numTopics; z++) { 245 | betaSum_lz[l][z] = 0.0; 246 | beta_lzw[l][z].resize(vocabSize); 247 | for (int r = 0; r < vocabSize; r++) { 248 | beta_lzw[l][z][r] = _beta; 249 | } 250 | } 251 | } 252 | 253 | // word prior transformation matrix 254 | 
lambda_lw.resize(numSentiLabs); 255 | for (int l = 0; l < numSentiLabs; l++) { 256 | lambda_lw[l].resize(vocabSize); 257 | for (int r = 0; r < vocabSize; r++) { 258 | lambda_lw[l][r] = 1; 259 | } 260 | } 261 | 262 | // incorporate prior information into beta 263 | this->prior2beta(); 264 | this->set_gamma(); 265 | 266 | return 0; 267 | } 268 | 269 | 270 | int model::set_gamma() { 271 | 272 | mapname2labs::iterator it; 273 | 274 | if (_gamma <= 0 ) { 275 | _gamma = (double)aveDocLength * 0.05 / (double)numSentiLabs; 276 | } 277 | 278 | gamma_dl.resize(numDocs); 279 | gammaSum_d.resize(numDocs); 280 | 281 | for (int d = 0; d < numDocs; d++) { 282 | gamma_dl[d].resize(numSentiLabs); 283 | gammaSum_d[d] = 0.0; 284 | for (int l = 0; l < numSentiLabs; l++) { 285 | gamma_dl[d][l] = _gamma; 286 | gammaSum_d[d] += gamma_dl[d][l]; 287 | } 288 | } 289 | 290 | return 0; 291 | } 292 | 293 | 294 | int model::prior2beta() { 295 | 296 | mapword2atr::iterator wordIt; 297 | mapword2prior::iterator sentiIt; 298 | 299 | for (sentiIt = sentiLex.begin(); sentiIt != sentiLex.end(); sentiIt++) { 300 | wordIt = word2atr.find(sentiIt->first); 301 | if (wordIt != word2atr.end()) { 302 | for (int j = 0; j < numSentiLabs; j++) { 303 | lambda_lw[j][wordIt->second.id] = sentiIt->second.labDist[j]; 304 | } 305 | } 306 | } 307 | 308 | for (int l = 0; l < numSentiLabs; l++) { 309 | for (int z = 0; z < numTopics; z++) { 310 | betaSum_lz[l][z] = 0.0; 311 | for (int r = 0; r < vocabSize; r++) { 312 | beta_lzw[l][z][r] = beta_lzw[l][z][r] * lambda_lw[l][r]; 313 | betaSum_lz[l][z] += beta_lzw[l][z][r]; 314 | } 315 | } 316 | } 317 | 318 | return 0; 319 | } 320 | 321 | 322 | void model::compute_phi_lzw() { 323 | 324 | for (int l = 0; l < numSentiLabs; l++) { 325 | for (int z = 0; z < numTopics; z++) { 326 | for(int r = 0; r < vocabSize; r++) { 327 | phi_lzw[l][z][r] = (nlzw[l][z][r] + beta_lzw[l][z][r]) / (nlz[l][z] + betaSum_lz[l][z]); 328 | } 329 | } 330 | } 331 | } 332 | 333 | 334 | 335 | void model::compute_pi_dl() { 336 | 337 | for (int m = 0; m < numDocs; m++) { 338 | for (int l = 0; l < numSentiLabs; l++) { 339 | pi_dl[m][l] = (ndl[m][l] + gamma_dl[m][l]) / (nd[m] + gammaSum_d[m]); 340 | } 341 | } 342 | } 343 | 344 | void model::compute_theta_dlz() { 345 | 346 | for (int m = 0; m < numDocs; m++) { 347 | for (int l = 0; l < numSentiLabs; l++) { 348 | for (int z = 0; z < numTopics; z++) { 349 | theta_dlz[m][l][z] = (ndlz[m][l][z] + alpha_lz[l][z]) / (ndl[m][l] + alphaSum_l[l]); 350 | } 351 | } 352 | } 353 | } 354 | 355 | 356 | int model::save_model(string model_name) { 357 | 358 | if (save_model_tassign(result_dir + model_name + tassign_suffix)) 359 | return 1; 360 | 361 | if (save_model_twords(result_dir + model_name + twords_suffix)) 362 | return 1; 363 | 364 | if (save_model_pi_dl(result_dir + model_name + pi_suffix)) 365 | return 1; 366 | 367 | if (save_model_theta_dlz(result_dir + model_name + theta_suffix)) 368 | return 1; 369 | 370 | if (save_model_phi_lzw(result_dir + model_name + phi_suffix)) 371 | return 1; 372 | 373 | if (save_model_others(result_dir + model_name + others_suffix)) 374 | return 1; 375 | 376 | return 0; 377 | } 378 | 379 | 380 | int model::save_model_tassign(string filename) { 381 | 382 | FILE * fout = fopen(filename.c_str(), "w"); 383 | if (!fout) { 384 | printf("Cannot save file %s!\n", filename.c_str()); 385 | return 1; 386 | } 387 | 388 | for (int m = 0; m < pdataset->numDocs; m++) { 389 | fprintf(fout, "%s \n", pdataset->pdocs[m]->docID.c_str()); 390 | for (int n = 0; n < 
pdataset->pdocs[m]->length; n++) {
391 | fprintf(fout, "%d:%d:%d ", pdataset->pdocs[m]->words[n], l[m][n], z[m][n]); // wordID:sentiLab:topic
392 | }
393 | fprintf(fout, "\n");
394 | }
395 |
396 | fclose(fout);
397 | return 0;
398 | }
399 |
400 |
401 | int model::save_model_twords(string filename)
402 | {
403 | FILE * fout = fopen(filename.c_str(), "w");
404 | if (!fout) {
405 | printf("Cannot save file %s!\n", filename.c_str());
406 | return 1;
407 | }
408 |
409 | if (twords > vocabSize) {
410 | twords = vocabSize; // print out the entire vocab list
411 | }
412 |
413 | mapid2word::iterator it;
414 |
415 | for (int l = 0; l < numSentiLabs; l++) {
416 | for (int k = 0; k < numTopics; k++) {
417 | vector<pair<int, double> > words_probs;
418 | pair<int, double> word_prob;
419 | for (int w = 0; w < vocabSize; w++) {
420 | word_prob.first = w; // w: word id/index
421 | word_prob.second = phi_lzw[l][k][w]; // topic-word probability
422 | words_probs.push_back(word_prob);
423 | }
424 |
425 | std::sort(words_probs.begin(), words_probs.end(), sort_pred());
426 |
427 | fprintf(fout, "Label%d_Topic%d\n", l, k);
428 | for (int i = 0; i < twords; i++) {
429 | it = id2word.find(words_probs[i].first);
430 | if (it != id2word.end())
431 | fprintf(fout, "%s %15f\n", (it->second).c_str(), words_probs[i].second);
432 | }
433 | }
434 | }
435 |
436 | fclose(fout);
437 | return 0;
438 | }
439 |
440 |
441 |
442 | int model::save_model_pi_dl(string filename) {
443 |
444 | FILE * fout = fopen(filename.c_str(), "w");
445 | if (!fout) {
446 | printf("Cannot save file %s!\n", filename.c_str());
447 | return 1;
448 | }
449 |
450 | for (int m = 0; m < numDocs; m++) {
451 | fprintf(fout, "d_%d %s ", m, pdataset->pdocs[m]->docID.c_str());
452 | for (int l = 0; l < numSentiLabs; l++) {
453 | fprintf(fout, "%f ", pi_dl[m][l]);
454 | }
455 | fprintf(fout, "\n");
456 | }
457 |
458 | fclose(fout);
459 | return 0;
460 | }
461 |
462 |
463 | int model::save_model_theta_dlz(string filename) {
464 |
465 | FILE * fout = fopen(filename.c_str(), "w");
466 | if (!fout) {
467 | printf("Cannot save file %s!\n", filename.c_str());
468 | return 1;
469 | }
470 |
471 | for(int m = 0; m < numDocs; m++) {
472 | fprintf(fout, "Document %d\n", m);
473 | for (int l = 0; l < numSentiLabs; l++) {
474 | for (int z = 0; z < numTopics; z++) {
475 | fprintf(fout, "%f ", theta_dlz[m][l][z]);
476 | }
477 | fprintf(fout, "\n");
478 | }
479 | }
480 |
481 | fclose(fout);
482 | return 0;
483 | }
484 |
485 |
486 | int model::save_model_phi_lzw(string filename) {
487 |
488 | FILE * fout = fopen(filename.c_str(), "w");
489 | if (!fout) {
490 | printf("Cannot save file %s!\n", filename.c_str());
491 | return 1;
492 | }
493 |
494 | for (int l = 0; l < numSentiLabs; l++) {
495 | for (int z = 0; z < numTopics; z++) {
496 | fprintf(fout, "Label:%d Topic:%d\n", l, z);
497 | for (int r = 0; r < vocabSize; r++) {
498 | fprintf(fout, "%.15f ", phi_lzw[l][z][r]);
499 | }
500 | fprintf(fout, "\n");
501 | }
502 | }
503 |
504 | fclose(fout);
505 | return 0;
506 | }
507 |
508 |
509 |
510 | int model::save_model_others(string filename) {
511 |
512 | FILE * fout = fopen(filename.c_str(), "w");
513 | if (!fout) {
514 | printf("Cannot save file %s!\n", filename.c_str());
515 | return 1;
516 | }
517 |
518 | fprintf(fout, "data_dir=%s\n", this->data_dir.c_str());
519 | fprintf(fout, "datasetFile=%s\n", this->datasetFile.c_str());
520 | fprintf(fout, "result_dir=%s\n", this->result_dir.c_str());
521 | fprintf(fout, "sentiLexFile=%s\n", this->sentiLexFile.c_str());
522 |
523 | fprintf(fout, "\n-------------------- Corpus statistics -----------------------\n");
524 | fprintf(fout, "numDocs=%d\n", numDocs);
525 | fprintf(fout, "corpusSize=%d\n", corpusSize);
526 | fprintf(fout, "aveDocLength=%d\n", aveDocLength);
527 | fprintf(fout, "vocabSize=%d\n", vocabSize);
528 |
529 | fprintf(fout, "\n---------------------- Model settings -----------------------\n");
530 | fprintf(fout, "numSentiLabs=%d\n", numSentiLabs);
531 | fprintf(fout, "numTopics=%d\n", numTopics);
532 | fprintf(fout, "liter=%d\n", liter);
533 | fprintf(fout, "savestep=%d\n", savestep);
534 | fprintf(fout, "updateParaStep=%d\n", updateParaStep);
535 |
536 | fprintf(fout, "_alpha=%f\n", _alpha);
537 | fprintf(fout, "_beta=%f\n", _beta);
538 | fprintf(fout, "_gamma=%f\n", _gamma);
539 |
540 | fclose(fout);
541 | return 0;
542 | }
543 |
544 |
545 | int model::init_estimate() {
546 |
547 | int sentiLab, topic;
548 | srand(time(0)); // initialize for random number generation
549 | z.resize(numDocs);
550 | l.resize(numDocs);
551 |
552 | for (int m = 0; m < numDocs; m++) {
553 | int docLength = pdataset->pdocs[m]->length;
554 | z[m].resize(docLength);
555 | l[m].resize(docLength);
556 |
557 | for (int t = 0; t < docLength; t++) {
558 | if (pdataset->pdocs[m]->words[t] < 0) {
559 | printf("ERROR! word token %d has index smaller than 0 at doc[%d][%d]\n", pdataset->pdocs[m]->words[t], m, t);
560 | return 1;
561 | }
562 |
563 | if ((pdataset->pdocs[m]->priorSentiLabels[t] > -1) && (pdataset->pdocs[m]->priorSentiLabels[t] < numSentiLabs)) {
564 | sentiLab = pdataset->pdocs[m]->priorSentiLabels[t]; // incorporate prior information into the model
565 |
566 | }
567 | else {
568 | sentiLab = (int)(((double)rand() / RAND_MAX) * numSentiLabs);
569 | if (sentiLab == numSentiLabs) sentiLab = numSentiLabs - 1; // to avoid going over the array boundary
570 | }
571 | l[m][t] = sentiLab;
572 |
573 | // randomly initialize the topic assignment
574 | topic = (int)(((double)rand() / RAND_MAX) * numTopics);
575 | if (topic == numTopics) topic = numTopics - 1; // to avoid going over the array boundary
576 | z[m][t] = topic;
577 |
578 | // model count assignments
579 | nd[m]++;
580 | ndl[m][sentiLab]++;
581 | ndlz[m][sentiLab][topic]++;
582 | nlzw[sentiLab][topic][pdataset->pdocs[m]->words[t]]++;
583 | nlz[sentiLab][topic]++;
584 | }
585 | }
586 |
587 | return 0;
588 | }
589 |
590 |
591 |
592 | int model::estimate() {
593 |
594 | int sentiLab, topic;
595 | mapname2labs::iterator it;
596 |
597 | printf("Sampling %d iterations!\n", niters);
598 | for (liter = 1; liter <= niters; liter++) {
599 | printf("Iteration %d ...\n", liter);
600 | for (int m = 0; m < numDocs; m++) {
601 | for (int n = 0; n < pdataset->pdocs[m]->length; n++) {
602 | sampling(m, n, sentiLab, topic);
603 | l[m][n] = sentiLab;
604 | z[m][n] = topic;
605 | }
606 | }
607 |
608 | if (updateParaStep > 0 && liter % updateParaStep == 0) {
609 | this->update_Parameters();
610 | }
611 |
612 | if (savestep > 0 && liter % savestep == 0) {
613 | if (liter == niters) break;
614 |
615 | printf("Saving the model at iteration %d ...\n", liter);
616 | compute_pi_dl();
617 | compute_theta_dlz();
618 | compute_phi_lzw();
619 | save_model(putils->generate_model_name(liter));
620 | }
621 | }
622 |
623 | printf("Gibbs sampling completed!\n");
624 | printf("Saving the final model!\n");
625 | compute_pi_dl();
626 | compute_theta_dlz();
627 | compute_phi_lzw();
628 | save_model(putils->generate_model_name(-1));
629 |
630 | return 0;
631 | }
632 |
633 |
634 | int model::sampling(int m, int n, int& sentiLab, int& topic) {
635 |
636 | sentiLab = l[m][n];
637 | topic = z[m][n];
638 | int w = pdataset->pdocs[m]->words[n]; // the ID/index of the current word token in vocabulary
639 | double u;
640 |
641 | nd[m]--;
642 | ndl[m][sentiLab]--;
643 | ndlz[m][sentiLab][topic]--;
644 | nlzw[sentiLab][topic][pdataset->pdocs[m]->words[n]]--;
645 | nlz[sentiLab][topic]--;
646 |
647 | // do multinomial sampling via the cumulative method: p[l][k] is the unnormalized joint conditional of (sentiLab=l, topic=k), a product of the word-given-(label,topic), topic-given-(label,doc) and label-given-doc terms
648 | for (int l = 0; l < numSentiLabs; l++) {
649 | for (int k = 0; k < numTopics; k++) {
650 | p[l][k] = (nlzw[l][k][w] + beta_lzw[l][k][w]) / (nlz[l][k] + betaSum_lz[l][k]) *
651 | (ndlz[m][l][k] + alpha_lz[l][k]) / (ndl[m][l] + alphaSum_l[l]) *
652 | (ndl[m][l] + gamma_dl[m][l]) / (nd[m] + gammaSum_d[m]);
653 | }
654 | }
655 |
656 | // accumulate multinomial parameters
657 | for (int l = 0; l < numSentiLabs; l++) {
658 | for (int k = 0; k < numTopics; k++) {
659 | if (k==0) {
660 | if (l==0) continue;
661 | else p[l][k] += p[l-1][numTopics-1]; // accumulate the sum of the previous array
662 | }
663 | else p[l][k] += p[l][k-1];
664 | }
665 | }
666 |
667 | // draw a uniform sample scaled to the total (unnormalized) probability mass
668 | u = ((double)rand() / RAND_MAX) * p[numSentiLabs-1][numTopics-1];
669 |
670 | // sample sentiment label l, where l \in [0, S-1]
671 | bool loopBreak=false;
672 | for (sentiLab = 0; sentiLab < numSentiLabs; sentiLab++) {
673 | for (topic = 0; topic < numTopics; topic++) {
674 | if (p[sentiLab][topic] > u) {
675 | loopBreak = true;
676 | break;
677 | }
678 | }
679 | if (loopBreak == true) {
680 | break;
681 | }
682 | }
683 |
684 | if (sentiLab == numSentiLabs) sentiLab = numSentiLabs - 1; // to avoid going over the array boundary
685 | if (topic == numTopics) topic = numTopics - 1;
686 |
687 | // add estimated 'z' and 'l' to count variables
688 | nd[m]++;
689 | ndl[m][sentiLab]++;
690 | ndlz[m][sentiLab][topic]++;
691 | nlzw[sentiLab][topic][pdataset->pdocs[m]->words[n]]++;
692 | nlz[sentiLab][topic]++;
693 |
694 | return 0;
695 | }
696 |
697 |
698 | int model::update_Parameters() {
699 |
700 | int ** data; // temporary variable for exporting the 3-dimensional count array to a 2-dimensional one
701 | double * alpha_temp;
702 | data = new int*[numTopics];
703 | for (int k = 0; k < numTopics; k++) {
704 | data[k] = new int[numDocs];
705 | for (int m = 0; m < numDocs; m++) {
706 | data[k][m] = 0;
707 | }
708 | }
709 |
710 | alpha_temp = new double[numTopics];
711 | for (int k = 0; k < numTopics; k++){
712 | alpha_temp[k] = 0.0;
713 | }
714 |
715 | // update alpha
716 | for (int j = 0; j < numSentiLabs; j++) {
717 | for (int k = 0; k < numTopics; k++) {
718 | for (int m = 0; m < numDocs; m++) {
719 | data[k][m] = ndlz[m][j][k]; // ntldsum[j][k][m];
720 | }
721 | }
722 |
723 | for (int k = 0; k < numTopics; k++) {
724 | alpha_temp[k] = alpha_lz[j][k]; //alpha[j][k];
725 | }
726 |
727 | polya_fit_simple(data, alpha_temp, numTopics, numDocs);
728 |
729 | // update alpha
730 | alphaSum_l[j] = 0.0;
731 | for (int k = 0; k < numTopics; k++) {
732 | alpha_lz[j][k] = alpha_temp[k];
733 | alphaSum_l[j] += alpha_lz[j][k];
734 | }
735 | }
736 |
737 | // free the temporary buffers to avoid leaking memory on every update step
738 | for (int k = 0; k < numTopics; k++) {
739 | delete [] data[k];
740 | }
741 | delete [] data;
742 | delete [] alpha_temp;
743 |
744 | return 0;
745 | }
--------------------------------------------------------------------------------
/src/math_func.cpp:
--------------------------------------------------------------------------------
1 | #include "math_func.h"
2 | #include <cstdlib>
3 | #include <iostream>
4 | #include <iomanip>
5 | #include <cmath>
6 | #include <ctime>
7 |
8 | using namespace std;
9 |
10 | //************************* asa032.cpp ************************************//
11 | //****************************************************************************80
12 |
13 | double alngam ( double
xvalue, int *ifault ) 14 | 15 | //****************************************************************************80 16 | // 17 | // Purpose: 18 | // 19 | // ALNGAM computes the logarithm of the gamma function. 20 | // 21 | // Modified: 22 | // 23 | // 13 January 2008 24 | // 25 | // Author: 26 | // 27 | // Original FORTRAN77 version by Allan Macleod 28 | // C++ version by John Burkardt 29 | // 30 | // Reference: 31 | // 32 | // Allan Macleod, 33 | // Algorithm AS 245, 34 | // A Robust and Reliable Algorithm for the Logarithm of the Gamma Function, 35 | // Applied Statistics, 36 | // Volume 38, Number 2, 1989, pages 397-402. 37 | // 38 | // Parameters: 39 | // 40 | // Input, double XVALUE, the argument of the Gamma function. 41 | // 42 | // Output, int IFAULT, error flag. 43 | // 0, no error occurred. 44 | // 1, XVALUE is less than or equal to 0. 45 | // 2, XVALUE is too big. 46 | // 47 | // Output, double ALNGAM, the logarithm of the gamma function of X. 48 | // 49 | { 50 | double alr2pi = 0.918938533204673; 51 | double r1[9] = { 52 | -2.66685511495, 53 | -24.4387534237, 54 | -21.9698958928, 55 | 11.1667541262, 56 | 3.13060547623, 57 | 0.607771387771, 58 | 11.9400905721, 59 | 31.4690115749, 60 | 15.2346874070 }; 61 | double r2[9] = { 62 | -78.3359299449, 63 | -142.046296688, 64 | 137.519416416, 65 | 78.6994924154, 66 | 4.16438922228, 67 | 47.0668766060, 68 | 313.399215894, 69 | 263.505074721, 70 | 43.3400022514 }; 71 | double r3[9] = { 72 | -2.12159572323E+05, 73 | 2.30661510616E+05, 74 | 2.74647644705E+04, 75 | -4.02621119975E+04, 76 | -2.29660729780E+03, 77 | -1.16328495004E+05, 78 | -1.46025937511E+05, 79 | -2.42357409629E+04, 80 | -5.70691009324E+02 }; 81 | double r4[5] = { 82 | 0.279195317918525, 83 | 0.4917317610505968, 84 | 0.0692910599291889, 85 | 3.350343815022304, 86 | 6.012459259764103 }; 87 | double value; 88 | double x; 89 | double x1; 90 | double x2; 91 | double xlge = 510000.0; 92 | double xlgst = 1.0E+30; 93 | double y; 94 | 95 | x = xvalue; 96 | value = 0.0; 97 | // 98 | // Check the input. 99 | // 100 | if ( xlgst <= x ) 101 | { 102 | *ifault = 2; 103 | return value; 104 | } 105 | 106 | if ( x <= 0.0 ) 107 | { 108 | *ifault = 1; 109 | return value; 110 | } 111 | 112 | *ifault = 0; 113 | // 114 | // Calculation for 0 < X < 0.5 and 0.5 <= X < 1.5 combined. 115 | // 116 | if ( x < 1.5 ) 117 | { 118 | if ( x < 0.5 ) 119 | { 120 | value = - log ( x ); 121 | y = x + 1.0; 122 | // 123 | // Test whether X < machine epsilon. 124 | // 125 | if ( y == 1.0 ) 126 | { 127 | return value; 128 | } 129 | } 130 | else 131 | { 132 | value = 0.0; 133 | y = x; 134 | x = ( x - 0.5 ) - 0.5; 135 | } 136 | 137 | value = value + x * (((( 138 | r1[4] * y 139 | + r1[3] ) * y 140 | + r1[2] ) * y 141 | + r1[1] ) * y 142 | + r1[0] ) / (((( 143 | y 144 | + r1[8] ) * y 145 | + r1[7] ) * y 146 | + r1[6] ) * y 147 | + r1[5] ); 148 | 149 | return value; 150 | } 151 | // 152 | // Calculation for 1.5 <= X < 4.0. 153 | // 154 | if ( x < 4.0 ) 155 | { 156 | y = ( x - 1.0 ) - 1.0; 157 | 158 | value = y * (((( 159 | r2[4] * x 160 | + r2[3] ) * x 161 | + r2[2] ) * x 162 | + r2[1] ) * x 163 | + r2[0] ) / (((( 164 | x 165 | + r2[8] ) * x 166 | + r2[7] ) * x 167 | + r2[6] ) * x 168 | + r2[5] ); 169 | } 170 | // 171 | // Calculation for 4.0 <= X < 12.0. 
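// (As with the earlier branches, this evaluates a fixed rational
// approximation: here a ratio of two quartic polynomials in x built from
// the r3 coefficients above.)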
172 | // 173 | else if ( x < 12.0 ) 174 | { 175 | value = (((( 176 | r3[4] * x 177 | + r3[3] ) * x 178 | + r3[2] ) * x 179 | + r3[1] ) * x 180 | + r3[0] ) / (((( 181 | x 182 | + r3[8] ) * x 183 | + r3[7] ) * x 184 | + r3[6] ) * x 185 | + r3[5] ); 186 | } 187 | // 188 | // Calculation for 12.0 <= X. 189 | // 190 | else 191 | { 192 | y = log ( x ); 193 | value = x * ( y - 1.0 ) - 0.5 * y + alr2pi; 194 | 195 | if ( x <= xlge ) 196 | { 197 | x1 = 1.0 / x; 198 | x2 = x1 * x1; 199 | 200 | value = value + x1 * ( ( 201 | r4[2] * 202 | x2 + r4[1] ) * 203 | x2 + r4[0] ) / ( ( 204 | x2 + r4[4] ) * 205 | x2 + r4[3] ); 206 | } 207 | } 208 | 209 | return value; 210 | } 211 | //****************************************************************************80 212 | 213 | double gamain ( double x, double p, int *ifault ) 214 | 215 | //****************************************************************************80 216 | // 217 | // Purpose: 218 | // 219 | // GAMAIN computes the incomplete gamma ratio. 220 | // 221 | // Discussion: 222 | // 223 | // A series expansion is used if P > X or X <= 1. Otherwise, a 224 | // continued fraction approximation is used. 225 | // 226 | // Modified: 227 | // 228 | // 17 January 2008 229 | // 230 | // Author: 231 | // 232 | // Original FORTRAN77 version by G Bhattacharjee 233 | // C++ version by John Burkardt 234 | // 235 | // Reference: 236 | // 237 | // G Bhattacharjee, 238 | // Algorithm AS 32: 239 | // The Incomplete Gamma Integral, 240 | // Applied Statistics, 241 | // Volume 19, Number 3, 1970, pages 285-287. 242 | // 243 | // Parameters: 244 | // 245 | // Input, double X, P, the parameters of the incomplete 246 | // gamma ratio. 0 <= X, and 0 < P. 247 | // 248 | // Output, int *IFAULT, error flag. 249 | // 0, no errors. 250 | // 1, P <= 0. 251 | // 2, X < 0. 252 | // 3, underflow. 253 | // 4, error return from the Log Gamma routine. 254 | // 255 | // Output, double GAMAIN, the value of the incomplete gamma ratio. 256 | // 257 | { 258 | double a; 259 | double acu = 1.0E-08; 260 | double an; 261 | double arg; 262 | double b; 263 | double dif; 264 | double factor; 265 | double g; 266 | double gin; 267 | int i; 268 | double oflo = 1.0E+37; 269 | double pn[6]; 270 | double rn; 271 | double term; 272 | double uflo = 1.0E-37; 273 | double value; 274 | // 275 | // Check the input. 276 | // 277 | if ( p <= 0.0 ) 278 | { 279 | *ifault = 1; 280 | value = 0.0; 281 | return value; 282 | } 283 | 284 | if ( x < 0.0 ) 285 | { 286 | *ifault = 2; 287 | value = 0.0; 288 | return value; 289 | } 290 | 291 | if ( x == 0.0 ) 292 | { 293 | *ifault = 0; 294 | value = 0.0; 295 | return value; 296 | } 297 | 298 | g = alngam ( p, ifault ); 299 | 300 | if ( *ifault != 0 ) 301 | { 302 | *ifault = 4; 303 | value = 0.0; 304 | return value; 305 | } 306 | 307 | arg = p * log ( x ) - x - g; 308 | 309 | if ( arg < log ( uflo ) ) 310 | { 311 | *ifault = 3; 312 | value = 0.0; 313 | return value; 314 | } 315 | 316 | *ifault = 0; 317 | factor = exp ( arg ); 318 | // 319 | // Calculation by series expansion. 320 | // 321 | if ( x <= 1.0 || x < p ) 322 | { 323 | gin = 1.0; 324 | term = 1.0; 325 | rn = p; 326 | 327 | for ( ; ; ) 328 | { 329 | rn = rn + 1.0; 330 | term = term * x / rn; 331 | gin = gin + term; 332 | 333 | if ( term <= acu ) 334 | { 335 | break; 336 | } 337 | } 338 | 339 | value = gin * factor / p; 340 | return value; 341 | } 342 | // 343 | // Calculation by continued fraction. 
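// The loop below forms successive convergents rn = pn[4]/pn[5] of the
// continued fraction for the upper tail, so gin approaches
// exp(x) * x^(-p) * Gamma(p,x); since factor = x^p * exp(-x) / Gamma(p),
// the result value = 1 - factor * gin is the lower incomplete gamma ratio.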
344 | // 345 | a = 1.0 - p; 346 | b = a + x + 1.0; 347 | term = 0.0; 348 | 349 | pn[0] = 1.0; 350 | pn[1] = x; 351 | pn[2] = x + 1.0; 352 | pn[3] = x * b; 353 | 354 | gin = pn[2] / pn[3]; 355 | 356 | for ( ; ; ) 357 | { 358 | a = a + 1.0; 359 | b = b + 2.0; 360 | term = term + 1.0; 361 | an = a * term; 362 | for ( i = 0; i <= 1; i++ ) 363 | { 364 | pn[i+4] = b * pn[i+2] - an * pn[i]; 365 | } 366 | 367 | if ( pn[5] != 0.0 ) 368 | { 369 | rn = pn[4] / pn[5]; 370 | dif = r8_abs ( gin - rn ); 371 | // 372 | // Absolute error tolerance satisfied? 373 | // 374 | if ( dif <= acu ) 375 | { 376 | // 377 | // Relative error tolerance satisfied? 378 | // 379 | if ( dif <= acu * rn ) 380 | { 381 | value = 1.0 - factor * gin; 382 | break; 383 | } 384 | } 385 | gin = rn; 386 | } 387 | 388 | for ( i = 0; i < 4; i++ ) 389 | { 390 | pn[i] = pn[i+2]; 391 | } 392 | 393 | if ( oflo <= r8_abs ( pn[4] ) ) 394 | { 395 | for ( i = 0; i < 4; i++ ) 396 | { 397 | pn[i] = pn[i] / oflo; 398 | } 399 | } 400 | } 401 | 402 | return value; 403 | } 404 | //****************************************************************************80 405 | 406 | void gamma_inc_values ( int *n_data, double *a, double *x, double *fx ) 407 | 408 | //****************************************************************************80 409 | // 410 | // Purpose: 411 | // 412 | // GAMMA_INC_VALUES returns some values of the incomplete Gamma function. 413 | // 414 | // Discussion: 415 | // 416 | // The (normalized) incomplete Gamma function P(A,X) is defined as: 417 | // 418 | // PN(A,X) = 1/Gamma(A) * Integral ( 0 <= T <= X ) T**(A-1) * exp(-T) dT. 419 | // 420 | // With this definition, for all A and X, 421 | // 422 | // 0 <= PN(A,X) <= 1 423 | // 424 | // and 425 | // 426 | // PN(A,INFINITY) = 1.0 427 | // 428 | // In Mathematica, the function can be evaluated by: 429 | // 430 | // 1 - GammaRegularized[A,X] 431 | // 432 | // Modified: 433 | // 434 | // 20 November 2004 435 | // 436 | // Author: 437 | // 438 | // John Burkardt 439 | // 440 | // Reference: 441 | // 442 | // Milton Abramowitz, Irene Stegun, 443 | // Handbook of Mathematical Functions, 444 | // National Bureau of Standards, 1964, 445 | // ISBN: 0-486-61272-4, 446 | // LC: QA47.A34. 447 | // 448 | // Stephen Wolfram, 449 | // The Mathematica Book, 450 | // Fourth Edition, 451 | // Cambridge University Press, 1999, 452 | // ISBN: 0-521-64314-7, 453 | // LC: QA76.95.W65. 454 | // 455 | // Parameters: 456 | // 457 | // Input/output, int *N_DATA. The user sets N_DATA to 0 before the 458 | // first call. On each call, the routine increments N_DATA by 1, and 459 | // returns the corresponding data; when there is no more data, the 460 | // output value of N_DATA will be 0 again. 461 | // 462 | // Output, double *A, the parameter of the function. 463 | // 464 | // Output, double *X, the argument of the function. 465 | // 466 | // Output, double *FX, the value of the function. 
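// For example, PN(1,X) = 1 - exp(-X), so PN(1,1) = 1 - exp(-1)
// = 0.6321205588..., which matches the A = 1.0, X = 1.0 entry in the
// tables below.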
467 | // 468 | { 469 | # define N_MAX 20 470 | 471 | double a_vec[N_MAX] = { 472 | 0.10E+00, 473 | 0.10E+00, 474 | 0.10E+00, 475 | 0.50E+00, 476 | 0.50E+00, 477 | 0.50E+00, 478 | 0.10E+01, 479 | 0.10E+01, 480 | 0.10E+01, 481 | 0.11E+01, 482 | 0.11E+01, 483 | 0.11E+01, 484 | 0.20E+01, 485 | 0.20E+01, 486 | 0.20E+01, 487 | 0.60E+01, 488 | 0.60E+01, 489 | 0.11E+02, 490 | 0.26E+02, 491 | 0.41E+02 }; 492 | 493 | double fx_vec[N_MAX] = { 494 | 0.7382350532339351E+00, 495 | 0.9083579897300343E+00, 496 | 0.9886559833621947E+00, 497 | 0.3014646416966613E+00, 498 | 0.7793286380801532E+00, 499 | 0.9918490284064973E+00, 500 | 0.9516258196404043E-01, 501 | 0.6321205588285577E+00, 502 | 0.9932620530009145E+00, 503 | 0.7205974576054322E-01, 504 | 0.5891809618706485E+00, 505 | 0.9915368159845525E+00, 506 | 0.1018582711118352E-01, 507 | 0.4421745996289254E+00, 508 | 0.9927049442755639E+00, 509 | 0.4202103819530612E-01, 510 | 0.9796589705830716E+00, 511 | 0.9226039842296429E+00, 512 | 0.4470785799755852E+00, 513 | 0.7444549220718699E+00 }; 514 | 515 | double x_vec[N_MAX] = { 516 | 0.30E-01, 517 | 0.30E+00, 518 | 0.15E+01, 519 | 0.75E-01, 520 | 0.75E+00, 521 | 0.35E+01, 522 | 0.10E+00, 523 | 0.10E+01, 524 | 0.50E+01, 525 | 0.10E+00, 526 | 0.10E+01, 527 | 0.50E+01, 528 | 0.15E+00, 529 | 0.15E+01, 530 | 0.70E+01, 531 | 0.25E+01, 532 | 0.12E+02, 533 | 0.16E+02, 534 | 0.25E+02, 535 | 0.45E+02 }; 536 | 537 | if ( *n_data < 0 ) 538 | { 539 | *n_data = 0; 540 | } 541 | 542 | *n_data = *n_data + 1; 543 | 544 | if ( N_MAX < *n_data ) 545 | { 546 | *n_data = 0; 547 | *a = 0.0; 548 | *x = 0.0; 549 | *fx = 0.0; 550 | } 551 | else 552 | { 553 | *a = a_vec[*n_data-1]; 554 | *x = x_vec[*n_data-1]; 555 | *fx = fx_vec[*n_data-1]; 556 | } 557 | 558 | return; 559 | # undef N_MAX 560 | } 561 | //****************************************************************************80 562 | 563 | double r8_abs ( double x ) 564 | 565 | //****************************************************************************80 566 | // 567 | // Purpose: 568 | // 569 | // R8_ABS returns the absolute value of an R8. 570 | // 571 | // Modified: 572 | // 573 | // 17 January 2008 574 | // 575 | // Author: 576 | // 577 | // John Burkardt 578 | // 579 | // Parameters: 580 | // 581 | // Input, double X, the argument. 582 | // 583 | // Output, double R8_ABS, the absolute value of the argument. 584 | // 585 | { 586 | if ( 0.0 <= x ) 587 | { 588 | return x; 589 | } 590 | else 591 | { 592 | return ( - x ); 593 | } 594 | } 595 | //****************************************************************************80 596 | 597 | void timestamp ( void ) 598 | 599 | //****************************************************************************80 600 | // 601 | // Purpose: 602 | // 603 | // TIMESTAMP prints the current YMDHMS date as a time stamp. 
604 | // 605 | // Example: 606 | // 607 | // 31 May 2001 09:45:54 AM 608 | // 609 | // Modified: 610 | // 611 | // 24 September 2003 612 | // 613 | // Author: 614 | // 615 | // John Burkardt 616 | // 617 | // Parameters: 618 | // 619 | // None 620 | // 621 | { 622 | # define TIME_SIZE 40 623 | 624 | static char time_buffer[TIME_SIZE]; 625 | const struct tm *tm; 626 | size_t len; 627 | time_t now; 628 | 629 | now = time ( NULL ); 630 | tm = localtime ( &now ); 631 | 632 | len = strftime ( time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm ); 633 | 634 | cout << time_buffer << "\n"; 635 | 636 | return; 637 | # undef TIME_SIZE 638 | } 639 | 640 | 641 | 642 | //************************* asa103.cpp ************************************// 643 | //*****************************************************************************// 644 | 645 | //****************************************************************************80 646 | 647 | double digama ( double x, int *ifault ) 648 | 649 | //****************************************************************************80 650 | // 651 | // Purpose: 652 | // 653 | // DIGAMA calculates DIGAMMA ( X ) = d ( LOG ( GAMMA ( X ) ) ) / dX 654 | // 655 | // Modified: 656 | // 657 | // 18 January 2008 658 | // 659 | // Author: 660 | // 661 | // Jose Bernardo 662 | // FORTRAN90 version by John Burkardt 663 | // 664 | // Reference: 665 | // 666 | // Jose Bernardo, 667 | // Algorithm AS 103: 668 | // Psi ( Digamma ) Function, 669 | // Applied Statistics, 670 | // Volume 25, Number 3, 1976, pages 315-317. 671 | // 672 | // Parameters: 673 | // 674 | // Input, double X, the argument of the digamma function. 675 | // 0 < X. 676 | // 677 | // Output, int *IFAULT, error flag. 678 | // 0, no error. 679 | // 1, X <= 0. 680 | // 681 | // Output, double DIGAMA, the value of the digamma function at X. 682 | // 683 | { 684 | double c = 8.5; 685 | double d1 = -0.5772156649; 686 | double r; 687 | double s = 0.00001; 688 | double s3 = 0.08333333333; 689 | double s4 = 0.0083333333333; 690 | double s5 = 0.003968253968; 691 | double value; 692 | double y; 693 | // 694 | // Check the input. 695 | // 696 | if ( x <= 0.0 ) 697 | { 698 | value = 0.0; 699 | *ifault = 1; 700 | return value; 701 | } 702 | // 703 | // Initialize. 704 | // 705 | *ifault = 0; 706 | y = x; 707 | value = 0.0; 708 | // 709 | // Use approximation if argument <= S. 710 | // 711 | if ( y <= s ) 712 | { 713 | value = d1 - 1.0 / y; 714 | return value; 715 | } 716 | // 717 | // Reduce to DIGAMA(X + N) where (X + N) >= C. 718 | // 719 | while ( y < c ) 720 | { 721 | value = value - 1.0 / y; 722 | y = y + 1.0; 723 | } 724 | // 725 | // Use Stirling's (actually de Moivre's) expansion if argument > C. 726 | // 727 | r = 1.0 / y; 728 | value = value + log ( y ) - 0.5 * r; 729 | r = r * r; 730 | value = value - r * ( s3 - r * ( s4 - r * s5 ) ); 731 | 732 | return value; 733 | } 734 | //****************************************************************************80 735 | 736 | void psi_values ( int *n_data, double *x, double *fx ) 737 | 738 | //****************************************************************************80 739 | // 740 | // Purpose: 741 | // 742 | // PSI_VALUES returns some values of the Psi or Digamma function. 743 | // 744 | // Discussion: 745 | // 746 | // In Mathematica, the function can be evaluated by: 747 | // 748 | // PolyGamma[x] 749 | // 750 | // or 751 | // 752 | // Polygamma[0,x] 753 | // 754 | // PSI(X) = d ln ( Gamma ( X ) ) / d X = Gamma'(X) / Gamma(X) 755 | // 756 | // PSI(1) = -Euler's constant. 
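// (Numerically PSI(1) = -0.5772156649..., the first entry of fx_vec
// below; the recurrence stated next then gives PSI(2) = PSI(1) + 1
// = 0.4227843350..., the last entry.)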
757 | // 758 | // PSI(X+1) = PSI(X) + 1 / X. 759 | // 760 | // Modified: 761 | // 762 | // 17 August 2004 763 | // 764 | // Author: 765 | // 766 | // John Burkardt 767 | // 768 | // Reference: 769 | // 770 | // Milton Abramowitz, Irene Stegun, 771 | // Handbook of Mathematical Functions, 772 | // National Bureau of Standards, 1964, 773 | // ISBN: 0-486-61272-4, 774 | // LC: QA47.A34. 775 | // 776 | // Stephen Wolfram, 777 | // The Mathematica Book, 778 | // Fourth Edition, 779 | // Cambridge University Press, 1999, 780 | // ISBN: 0-521-64314-7, 781 | // LC: QA76.95.W65. 782 | // 783 | // Parameters: 784 | // 785 | // Input/output, int *N_DATA. The user sets N_DATA to 0 before the 786 | // first call. On each call, the routine increments N_DATA by 1, and 787 | // returns the corresponding data; when there is no more data, the 788 | // output value of N_DATA will be 0 again. 789 | // 790 | // Output, double *X, the argument of the function. 791 | // 792 | // Output, double *FX, the value of the function. 793 | // 794 | { 795 | # define N_MAX 11 796 | 797 | double fx_vec[N_MAX] = { 798 | -0.5772156649015329E+00, 799 | -0.4237549404110768E+00, 800 | -0.2890398965921883E+00, 801 | -0.1691908888667997E+00, 802 | -0.6138454458511615E-01, 803 | 0.3648997397857652E-01, 804 | 0.1260474527734763E+00, 805 | 0.2085478748734940E+00, 806 | 0.2849914332938615E+00, 807 | 0.3561841611640597E+00, 808 | 0.4227843350984671E+00 }; 809 | 810 | double x_vec[N_MAX] = { 811 | 1.0E+00, 812 | 1.1E+00, 813 | 1.2E+00, 814 | 1.3E+00, 815 | 1.4E+00, 816 | 1.5E+00, 817 | 1.6E+00, 818 | 1.7E+00, 819 | 1.8E+00, 820 | 1.9E+00, 821 | 2.0E+00 }; 822 | 823 | if ( *n_data < 0 ) 824 | { 825 | *n_data = 0; 826 | } 827 | 828 | *n_data = *n_data + 1; 829 | 830 | if ( N_MAX < *n_data ) 831 | { 832 | *n_data = 0; 833 | *x = 0.0; 834 | *fx = 0.0; 835 | } 836 | else 837 | { 838 | *x = x_vec[*n_data-1]; 839 | *fx = fx_vec[*n_data-1]; 840 | } 841 | 842 | return; 843 | # undef N_MAX 844 | } 845 | //****************************************************************************80 846 | 847 | //void timestamp ( void ) 848 | // 849 | ////****************************************************************************80 850 | //// 851 | //// Purpose: 852 | //// 853 | //// TIMESTAMP prints the current YMDHMS date as a time stamp. 
854 | //// 855 | //// Example: 856 | //// 857 | //// 31 May 2001 09:45:54 AM 858 | //// 859 | //// Modified: 860 | //// 861 | //// 24 September 2003 862 | //// 863 | //// Author: 864 | //// 865 | //// John Burkardt 866 | //// 867 | //// Parameters: 868 | //// 869 | //// None 870 | //// 871 | //{ 872 | //# define TIME_SIZE 40 873 | // 874 | // static char time_buffer[TIME_SIZE]; 875 | // const struct tm *tm; 876 | // size_t len; 877 | // time_t now; 878 | // 879 | // now = time ( NULL ); 880 | // tm = localtime ( &now ); 881 | // 882 | // len = strftime ( time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm ); 883 | // 884 | // cout << time_buffer << "\n"; 885 | // 886 | // return; 887 | //# undef TIME_SIZE 888 | //} 889 | 890 | 891 | 892 | //************************* asa121.cpp ************************************// 893 | //*****************************************************************************// 894 | 895 | //****************************************************************************80 896 | 897 | //void timestamp ( void ) 898 | // 899 | ////****************************************************************************80 900 | //// 901 | //// Purpose: 902 | //// 903 | //// TIMESTAMP prints the current YMDHMS date as a time stamp. 904 | //// 905 | //// Example: 906 | //// 907 | //// 31 May 2001 09:45:54 AM 908 | //// 909 | //// Modified: 910 | //// 911 | //// 24 September 2003 912 | //// 913 | //// Author: 914 | //// 915 | //// John Burkardt 916 | //// 917 | //// Parameters: 918 | //// 919 | //// None 920 | //// 921 | //{ 922 | //# define TIME_SIZE 40 923 | // 924 | // static char time_buffer[TIME_SIZE]; 925 | // const struct tm *tm; 926 | // size_t len; 927 | // time_t now; 928 | // 929 | // now = time ( NULL ); 930 | // tm = localtime ( &now ); 931 | // 932 | // len = strftime ( time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm ); 933 | // 934 | // cout << time_buffer << "\n"; 935 | // 936 | // return; 937 | //# undef TIME_SIZE 938 | //} 939 | //****************************************************************************80 940 | 941 | double trigam ( double x, int *ifault ) 942 | 943 | //****************************************************************************80 944 | // 945 | // Purpose: 946 | // 947 | // TRIGAM calculates trigamma(x) = d**2 log(gamma(x)) / dx**2 948 | // 949 | // Modified: 950 | // 951 | // 19 January 2008 952 | // 953 | // Author: 954 | // 955 | // BE Schneider 956 | // Modifications by John Burkardt 957 | // 958 | // Reference: 959 | // 960 | // BE Schneider, 961 | // Algorithm AS 121: 962 | // Trigamma Function, 963 | // Applied Statistics, 964 | // Volume 27, Number 1, pages 97-99, 1978. 965 | // 966 | // Parameters: 967 | // 968 | // Input, double X, the argument of the trigamma function. 969 | // 0 < X. 970 | // 971 | // Output, int *IFAULT, error flag. 972 | // 0, no error. 973 | // 1, X <= 0. 974 | // 975 | // Output, double TRIGAM, the value of the trigamma function at X. 976 | // 977 | { 978 | double a = 0.0001; 979 | double b = 5.0; 980 | double b2 = 0.1666666667; 981 | double b4 = -0.03333333333; 982 | double b6 = 0.02380952381; 983 | double b8 = -0.03333333333; 984 | double value; 985 | double y; 986 | double z; 987 | // 988 | // Check the input. 989 | // 990 | if ( x <= 0.0 ) 991 | { 992 | *ifault = 1; 993 | value = 0.0; 994 | return value; 995 | } 996 | 997 | *ifault = 0; 998 | z = x; 999 | // 1000 | // Use small value approximation if X <= A. 
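// (Since trigamma(x) = 1/x^2 + trigamma(x+1) and a = 0.0001, the dropped
// remainder trigamma(x+1) <= trigamma(1) ~= 1.6449 while 1/x^2 >= 10^8,
// so the relative error of this truncation is below about 2e-8.)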
1001 | // 1002 | if ( x <= a ) 1003 | { 1004 | value = 1.0 / x / x; 1005 | return value; 1006 | } 1007 | // 1008 | // Increase argument to ( X + I ) >= B. 1009 | // 1010 | value = 0.0; 1011 | 1012 | while ( z < b ) 1013 | { 1014 | value = value + 1.0 / z / z; 1015 | z = z + 1.0; 1016 | } 1017 | // 1018 | // Apply asymptotic formula if argument is B or greater. 1019 | // 1020 | y = 1.0 / z / z; 1021 | 1022 | value = value + 0.5 * 1023 | y + ( 1.0 1024 | + y * ( b2 1025 | + y * ( b4 1026 | + y * ( b6 1027 | + y * b8 )))) / z; 1028 | 1029 | return value; 1030 | } 1031 | //****************************************************************************80 1032 | 1033 | void trigamma_values ( int *n_data, double *x, double *fx ) 1034 | 1035 | //****************************************************************************80 1036 | // 1037 | // Purpose: 1038 | // 1039 | // TRIGAMMA_VALUES returns some values of the TriGamma function. 1040 | // 1041 | // Discussion: 1042 | // 1043 | // In Mathematica, the function can be evaluated by: 1044 | // 1045 | // PolyGamma[1,x] 1046 | // 1047 | // TriGamma(X) = d^2 ln ( Gamma ( X ) ) / d X^2 1048 | // 1049 | // Modified: 1050 | // 1051 | // 16 September 2004 1052 | // 1053 | // Author: 1054 | // 1055 | // John Burkardt 1056 | // 1057 | // Reference: 1058 | // 1059 | // Milton Abramowitz, Irene Stegun, 1060 | // Handbook of Mathematical Functions, 1061 | // National Bureau of Standards, 1964, 1062 | // ISBN: 0-486-61272-4, 1063 | // LC: QA47.A34. 1064 | // 1065 | // Stephen Wolfram, 1066 | // The Mathematica Book, 1067 | // Fourth Edition, 1068 | // Cambridge University Press, 1999, 1069 | // ISBN: 0-521-64314-7, 1070 | // LC: QA76.95.W65. 1071 | // 1072 | // Parameters: 1073 | // 1074 | // Input/output, int *N_DATA. The user sets N_DATA to 0 before the 1075 | // first call. On each call, the routine increments N_DATA by 1, and 1076 | // returns the corresponding data; when there is no more data, the 1077 | // output value of N_DATA will be 0 again. 1078 | // 1079 | // Output, double *X, the argument of the function. 1080 | // 1081 | // Output, double *FX, the value of the function. 
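// [Annotation added for clarity] A minimal sketch of the N_DATA iteration
// convention documented above, assuming caller-owned locals:
//
//   int n_data = 0;
//   double x, fx;
//   for ( ; ; )
//   {
//     trigamma_values ( &n_data, &x, &fx );
//     if ( n_data == 0 ) break;  // table exhausted
//     // use x and fx, e.g. to check trigam ( x, &ifault ) against fx
//   }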
1082 | // 1083 | { 1084 | # define N_MAX 11 1085 | 1086 | double fx_vec[N_MAX] = { 1087 | 0.1644934066848226E+01, 1088 | 0.1433299150792759E+01, 1089 | 0.1267377205423779E+01, 1090 | 0.1134253434996619E+01, 1091 | 0.1025356590529597E+01, 1092 | 0.9348022005446793E+00, 1093 | 0.8584318931245799E+00, 1094 | 0.7932328301639984E+00, 1095 | 0.7369741375017002E+00, 1096 | 0.6879720582426356E+00, 1097 | 0.6449340668482264E+00 }; 1098 | 1099 | double x_vec[N_MAX] = { 1100 | 1.0E+00, 1101 | 1.1E+00, 1102 | 1.2E+00, 1103 | 1.3E+00, 1104 | 1.4E+00, 1105 | 1.5E+00, 1106 | 1.6E+00, 1107 | 1.7E+00, 1108 | 1.8E+00, 1109 | 1.9E+00, 1110 | 2.0E+00 }; 1111 | 1112 | if ( *n_data < 0 ) 1113 | { 1114 | *n_data = 0; 1115 | } 1116 | 1117 | *n_data = *n_data + 1; 1118 | 1119 | if ( N_MAX < *n_data ) 1120 | { 1121 | *n_data = 0; 1122 | *x = 0.0; 1123 | *fx = 0.0; 1124 | } 1125 | else 1126 | { 1127 | *x = x_vec[*n_data-1]; 1128 | *fx = fx_vec[*n_data-1]; 1129 | } 1130 | 1131 | return; 1132 | # undef N_MAX 1133 | } 1134 | -------------------------------------------------------------------------------- /src/inference.cpp: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 
21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | #include "inference.h" 30 | using namespace std; 31 | 32 | Inference::Inference(void) { 33 | 34 | numSentiLabs = 0; 35 | numTopics = 0; 36 | numDocs = 0; 37 | vocabSize = 0; 38 | newNumDocs = 0; 39 | newVocabSize = 0; 40 | _beta = -1.0; 41 | 42 | wordmapfile = "wordmap.txt"; 43 | tassign_suffix = ".newtassign"; 44 | pi_suffix = ".newpi"; 45 | theta_suffix = ".newtheta"; 46 | phi_suffix = ".newphi"; 47 | others_suffix = ".newothers"; 48 | twords_suffix = ".newtwords"; 49 | model_name = ""; 50 | data_dir = ""; 51 | datasetFile = ""; 52 | result_dir = ""; 53 | sentiLexFile = ""; 54 | 55 | updateParaStep = -1; 56 | savestep = 20; 57 | twords = 20; 58 | niters = 40; 59 | 60 | putils = new utils(); 61 | pmodelData = NULL; 62 | pnewData = NULL; 63 | } 64 | 65 | 66 | Inference::~Inference(void) { 67 | 68 | if (putils) 69 | delete putils; 70 | 71 | if (pmodelData) 72 | delete pmodelData; 73 | 74 | if (pnewData) 75 | delete pnewData; 76 | } 77 | 78 | 79 | int Inference::init(int argc, char ** argv) { 80 | 81 | if (putils->parse_args_inf(argc, argv, this)) { 82 | return 1; 83 | } 84 | 85 | if(init_inf()) { 86 | printf("Throw exception in init_inf()! \n"); 87 | return 1; 88 | } 89 | 90 | if(inference()) { 91 | printf("Throw exception in inference()! \n"); 92 | return 1; 93 | } 94 | 95 | return 0; 96 | } 97 | 98 | 99 | // read '.others' file 100 | int Inference::read_model_setting(string filename) { 101 | 102 | char buff[BUFF_SIZE_LONG]; 103 | string line; 104 | numSentiLabs = 0; 105 | numTopics = 0; 106 | numDocs = 0; 107 | vocabSize = 0; 108 | 109 | FILE * fin = fopen(filename.c_str(), "r"); 110 | if (!fin) { 111 | printf("Cannot read file %s!\n", filename.c_str()); 112 | return 1; 113 | } 114 | 115 | while (fgets(buff, BUFF_SIZE_LONG - 1, fin) != NULL) { 116 | line = buff; 117 | strtokenizer values(line, ": \t\r\n={}[]"); // ':', '=', brackets, and whitespace are separators 118 | 119 | if (values.token(0) == "numSentiLabs") { 120 | numSentiLabs = atoi(values.token(1).c_str()); 121 | } 122 | else if (values.token(0) == "numTopics") { 123 | numTopics = atoi(values.token(1).c_str()); 124 | } 125 | else if (values.token(0) == "numDocs") { 126 | numDocs = atoi(values.token(1).c_str()); 127 | } 128 | else if (values.token(0) == "vocabSize") { 129 | vocabSize = atoi(values.token(1).c_str()); 130 | } 131 | if (numSentiLabs > 0 && numTopics > 0 && numDocs > 0 && vocabSize > 0) { 132 | break; 133 | } 134 | } 135 | 136 | fclose(fin); 137 | 138 | if (numSentiLabs == 0 || numTopics == 0 || numDocs == 0 || vocabSize == 0) { 139 | cout << "Throw exception in reading model parameter settings!\n" << filename << endl; 140 | return 1; 141 | } 142 | else { 143 | cout<<"data_dir = "<pdocs = new document*[numDocs]; 176 | pmodelData->vocabSize= vocabSize; 177 | pmodelData->numDocs= numDocs; 178 | l.resize(pmodelData->numDocs); 179 | z.resize(pmodelData->numDocs); 180 | 181 | for (int m = 0; m < numDocs; m++) { 182 | fgets(buff, BUFF_SIZE_LONG - 1, fin); // first line - ignore the document ID 183 | fgets(buff, BUFF_SIZE_LONG - 1, fin); // second line - read the sentiment label / topic assignments 184 | line = buff; 185 | strtokenizer strtok(line, " \t\r\n"); 186 | int length = strtok.count_tokens(); 187 | 188 | vector<int> words; 189 | 
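// [Annotation added for clarity] Input format parsed by this loop: each document
// in the '.tassign' model file occupies two lines -- a document ID line, then
// space-separated "wordID:sentiLab:topic" triples, the same layout written by
// save_model_newtassign() further below. Each triple is split on ':' into the
// three parallel vectors that follow.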
vector<int> sentiLabs; 190 | vector<int> topics; 191 | 192 | for (int j = 0; j < length; j++) { 193 | string token = strtok.token(j); 194 | strtokenizer tok(token, ":"); 195 | if (tok.count_tokens() != 3) { 196 | printf("Invalid word-sentiment-topic assignment format!\n"); 197 | return 1; 198 | } 199 | 200 | words.push_back(atoi(tok.token(0).c_str())); 201 | sentiLabs.push_back(atoi(tok.token(1).c_str())); 202 | topics.push_back(atoi(tok.token(2).c_str())); 203 | } 204 | 205 | // allocate and add training document to the corpus 206 | document * pdoc = new document(words); 207 | pmodelData->add_doc(pdoc, m); 208 | 209 | l[m].resize(sentiLabs.size()); 210 | for (int j = 0; j < (int)sentiLabs.size(); j++) { 211 | l[m][j] = sentiLabs[j]; 212 | } 213 | 214 | z[m].resize(topics.size()); 215 | for (int j = 0; j < (int)topics.size(); j++) { 216 | z[m][j] = topics[j]; 217 | } 218 | } 219 | fclose(fin); 220 | 221 | // init model counts 222 | nlzw.resize(numSentiLabs); 223 | for (int l = 0; l < numSentiLabs; l++) { 224 | nlzw[l].resize(numTopics); 225 | for (int z = 0; z < numTopics; z++) { 226 | nlzw[l][z].resize(vocabSize); 227 | for (int r = 0; r < vocabSize; r++) { 228 | nlzw[l][z][r] = 0; 229 | } 230 | } 231 | } 232 | 233 | nlz.resize(numSentiLabs); 234 | for (int l = 0; l < numSentiLabs; l++) { 235 | nlz[l].resize(numTopics); 236 | for (int z = 0; z < numTopics; z++) { 237 | nlz[l][z] = 0; 238 | } 239 | } 240 | 241 | // recover count values from trained model 242 | for (int m = 0; m < pmodelData->numDocs; m++) { 243 | int docLength = pmodelData->pdocs[m]->length; 244 | for (int n = 0; n < docLength; n++) { 245 | int w = pmodelData->pdocs[m]->words[n]; 246 | int sentiLab = this->l[m][n]; 247 | int topic = this->z[m][n]; 248 | 249 | nlzw[sentiLab][topic][w]++; 250 | nlz[sentiLab][topic]++; 251 | } 252 | } 253 | 254 | return 0; 255 | } 256 | 257 | 258 | 259 | int Inference::init_inf() { 260 | 261 | pmodelData = new dataset(); 262 | pnewData = new dataset(result_dir); 263 | 264 | if(read_model_setting(model_dir + model_name + ".others")) { 265 | printf("Throw exception in read_model_setting()!\n"); 266 | return 1; 267 | } 268 | 269 | // load model 270 | if(load_model(model_dir + model_name + ".tassign")) { 271 | printf("Throw exception in load_model()!\n"); 272 | return 1; 273 | } 274 | 275 | // *** TODO move the function to dataset class 276 | if(read_newData(data_dir + datasetFile)) { 277 | printf("Throw exception in read_newData()!\n"); 278 | return 1; 279 | } 280 | 281 | if(init_parameters()) { 282 | printf("Throw exception in init_parameters()!\n"); 283 | return 1; 284 | } 285 | 286 | printf("Testset statistics: \n"); 287 | printf("numDocs = %d\n", pnewData->numDocs); 288 | printf("vocabSize = %d\n", pnewData->vocabSize); 289 | printf("numNew_word = %d\n", (int)(pnewData->newWords.size())); 290 | 291 | // init inf 292 | int sentiLab, topic; 293 | new_z.resize(pnewData->numDocs); 294 | new_l.resize(pnewData->numDocs); 295 | 296 | for (int m = 0; m < pnewData->numDocs; m++) { 297 | int docLength = pnewData->_pdocs[m]->length; 298 | new_z[m].resize(docLength); 299 | new_l[m].resize(docLength); 300 | for (int t = 0; t < docLength; t++) { 301 | if (pnewData->_pdocs[m]->words[t] < 0) { 302 | printf("ERROR! 
word token %d has index smaller than 0 in doc[%d][%d]\n", pnewData->_pdocs[m]->words[t], m, t); 303 | return 1; 304 | } 305 | 306 | // sample sentiment label 307 | if ((pnewData->pdocs[m]->priorSentiLabels[t] > -1) && (pnewData->pdocs[m]->priorSentiLabels[t] < numSentiLabs)) { 308 | sentiLab = pnewData->pdocs[m]->priorSentiLabels[t]; // incorporate prior information into the model 309 | } 310 | else { 311 | sentiLab = (int)(((double)rand() / RAND_MAX) * numSentiLabs); 312 | if (sentiLab == numSentiLabs) sentiLab = numSentiLabs -1; 313 | } 314 | new_l[m][t] = sentiLab; 315 | 316 | // sample topic label 317 | topic = (int)(((double)rand() / RAND_MAX) * numTopics); 318 | if (topic == numTopics) topic = numTopics - 1; 319 | new_z[m][t] = topic; 320 | 321 | new_nd[m]++; 322 | new_ndl[m][sentiLab]++; 323 | new_ndlz[m][sentiLab][topic]++; 324 | new_nlzw[sentiLab][topic][pnewData->_pdocs[m]->words[t]]++; 325 | new_nlz[sentiLab][topic]++; 326 | } 327 | } 328 | 329 | return 0; 330 | } 331 | 332 | 333 | int Inference::inference() { 334 | 335 | int sentiLab, topic; 336 | printf("Sampling %d iterations for inference!\n", niters); 337 | 338 | liter = 0; 339 | for (liter = 1; liter <= niters; liter++) { 340 | printf("Iteration %d ...\n", liter); 341 | for (int m = 0; m < pnewData->numDocs; m++) { 342 | for (int n = 0; n < pnewData->pdocs[m]->length; n++) { 343 | inf_sampling(m, n, sentiLab, topic); 344 | new_l[m][n] = sentiLab; 345 | new_z[m][n] = topic; 346 | } 347 | } 348 | 349 | if (savestep > 0 && liter % savestep == 0) { 350 | if (liter == niters) break; 351 | 352 | printf("Saving the model at iteration %d ...\n", liter); 353 | compute_newpi(); 354 | compute_newtheta(); 355 | compute_newphi(); 356 | save_model(model_name + "_" + putils->generate_model_name(liter)); 357 | } 358 | } 359 | 360 | printf("Gibbs sampling completed!\n"); 361 | printf("Saving the final model!\n"); 362 | compute_newpi(); 363 | compute_newtheta(); 364 | compute_newphi(); 365 | save_model(model_name + "_" + putils->generate_model_name(-1)); 366 | 367 | return 0; 368 | } 369 | 370 | 371 | int Inference::init_parameters() { 372 | 373 | // model counts 374 | new_p.resize(numSentiLabs); 375 | for (int l = 0; l < numSentiLabs; l++) { 376 | new_p[l].resize(numTopics); 377 | for (int z = 0; z < numTopics; z++) { 378 | new_p[l][z] = 0.0; 379 | } 380 | } 381 | 382 | new_nd.resize(pnewData->numDocs); 383 | for (int m = 0; m < pnewData->numDocs; m++) { 384 | new_nd[m] = 0; 385 | } 386 | 387 | new_ndl.resize(pnewData->numDocs); 388 | for (int m = 0; m < pnewData->numDocs; m++) { 389 | new_ndl[m].resize(numSentiLabs); 390 | for (int l = 0; l < numSentiLabs; l++) { 391 | new_ndl[m][l] = 0; 392 | } 393 | } 394 | 395 | new_ndlz.resize(pnewData->numDocs); 396 | for (int m = 0; m < pnewData->numDocs; m++) { 397 | new_ndlz[m].resize(numSentiLabs); 398 | for (int l = 0; l < numSentiLabs; l++) { 399 | new_ndlz[m][l].resize(numTopics); 400 | for (int z = 0; z < numTopics; z++) { 401 | new_ndlz[m][l][z] = 0; 402 | } 403 | } 404 | } 405 | 406 | new_nlzw.resize(numSentiLabs); 407 | for (int l = 0; l < numSentiLabs; l++) { 408 | new_nlzw[l].resize(numTopics); 409 | for (int z = 0; z < numTopics; z++) { 410 | new_nlzw[l][z].resize(pnewData->vocabSize); 411 | for (int r = 0; r < pnewData->vocabSize; r++) { 412 | new_nlzw[l][z][r] = 0; 413 | } 414 | } 415 | } 416 | 417 | new_nlz.resize(numSentiLabs); 418 | for (int l = 0; l < numSentiLabs; l++) { 419 | new_nlz[l].resize(numTopics); 420 | for (int z = 0; z < numTopics; z++) { 421 | new_nlz[l][z] = 0; 422 | 
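// [Annotation added for clarity] Count bookkeeping used by the sampler:
// new_nd[m] -- number of tokens in test document m;
// new_ndl[m][l] -- tokens in document m assigned sentiment label l;
// new_ndlz[m][l][z] -- tokens in document m assigned label l and topic z;
// new_nlzw[l][z][w] -- times test-vocabulary word w is assigned (l, z);
// new_nlz[l][z] -- total test tokens assigned (l, z).
// The trained-model counterparts nlzw/nlz are restored by load_model() and
// stay fixed during inference.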
} 423 | } 424 | 425 | // model parameters 426 | newpi_dl.resize(pnewData->numDocs); 427 | for (int m = 0; m < pnewData->numDocs; m++) { 428 | newpi_dl[m].resize(numSentiLabs); 429 | } 430 | 431 | newtheta_dlz.resize(pnewData->numDocs); 432 | for (int m = 0; m < pnewData->numDocs; m++) { 433 | newtheta_dlz[m].resize(numSentiLabs); 434 | for (int l = 0; l < numSentiLabs; l++) { 435 | newtheta_dlz[m][l].resize(numTopics); 436 | } 437 | } 438 | 439 | newphi_lzw.resize(numSentiLabs); 440 | for (int l = 0; l < numSentiLabs; l++) { 441 | newphi_lzw[l].resize(numTopics); 442 | for (int z = 0; z < numTopics; z++) { 443 | newphi_lzw[l][z].resize(pnewData->vocabSize); 444 | } 445 | } 446 | 447 | // hyperparameters 448 | _alpha = (double)pnewData->aveDocLength * 0.05 / (double)(numSentiLabs * numTopics); 449 | alpha_lz.resize(numSentiLabs); 450 | alphaSum_l.resize(numSentiLabs); 451 | for (int l = 0; l < numSentiLabs; l++) { 452 | alphaSum_l[l] = 0.0; 453 | alpha_lz[l].resize(numTopics); 454 | for (int z = 0; z < numTopics; z++) { 455 | alpha_lz[l][z] = _alpha; 456 | alphaSum_l[l] += alpha_lz[l][z]; 457 | } 458 | } 459 | 460 | // gamma 461 | gamma_l.resize(numSentiLabs); 462 | gammaSum = 0.0; 463 | for (int l = 0; l < numSentiLabs; l++) { 464 | gamma_l[l] = (double)pnewData->aveDocLength * 0.05 / (double)numSentiLabs; 465 | gammaSum += gamma_l[l]; 466 | } 467 | 468 | //beta 469 | if (_beta <= 0) { 470 | _beta = 0.01; 471 | } 472 | beta_lzw.resize(numSentiLabs); 473 | betaSum_lz.resize(numSentiLabs); 474 | for (int l = 0; l < numSentiLabs; l++) { 475 | beta_lzw[l].resize(numTopics); 476 | betaSum_lz[l].resize(numTopics); 477 | for (int z = 0; z < numTopics; z++) { 478 | beta_lzw[l][z].resize(pnewData->vocabSize); 479 | for (int r = 0; r < pnewData->vocabSize; r++) { 480 | beta_lzw[l][z][r] = _beta; 481 | betaSum_lz[l][z] += beta_lzw[l][z][r]; 482 | } 483 | } 484 | } 485 | 486 | // incorporate prior knowledge into beta 487 | if (sentiLexFile != "") { 488 | // word prior transformation matrix 489 | lambda_lw.resize(numSentiLabs); 490 | for (int l = 0; l < numSentiLabs; l++) { 491 | lambda_lw[l].resize(pnewData->vocabSize); 492 | for (int r = 0; r < pnewData->vocabSize; r++) 493 | lambda_lw[l][r] = 1; 494 | } 495 | // MUST init beta_lzw first before incorporating prior information into beta 496 | this->prior2beta(); 497 | } 498 | 499 | return 0; 500 | } 501 | 502 | 503 | 504 | int Inference::inf_sampling(int m, int n, int& sentiLab, int& topic) { 505 | sentiLab = new_l[m][n]; 506 | topic = new_z[m][n]; 507 | int w = pnewData->pdocs[m]->words[n]; // word index of trained model 508 | int _w = pnewData->_pdocs[m]->words[n]; // word index of test data 509 | double u; 510 | 511 | new_nd[m]--; 512 | new_ndl[m][sentiLab]--; 513 | new_ndlz[m][sentiLab][topic]--; 514 | new_nlzw[sentiLab][topic][_w]--; 515 | new_nlz[sentiLab][topic]--; 516 | 517 | // do multinomial sampling via cumulative method 518 | for (int l = 0; l < numSentiLabs; l++) { 519 | for (int k = 0; k < numTopics; k++) { 520 | new_p[l][k] = (nlzw[l][k][w] + new_nlzw[l][k][_w] + beta_lzw[l][k][_w]) / (nlz[l][k] + new_nlz[l][k] + betaSum_lz[l][k]) * 521 | (new_ndlz[m][l][k] + alpha_lz[l][k]) / (new_ndl[m][l] + alphaSum_l[l]) * 522 | (new_ndl[m][l] + gamma_l[l]) / (new_nd[m] + gammaSum); 523 | } 524 | } 525 | 526 | // accumulate multinomial parameters 527 | for (int l = 0; l < numSentiLabs; l++) { 528 | for (int k = 0; k < numTopics; k++) { 529 | if (k==0) { 530 | if (l==0) continue; 531 | else new_p[l][k] += new_p[l-1][numTopics-1]; 532 | } 533 | else 
new_p[l][k] += new_p[l][k-1]; 534 | } 535 | } 536 | // scaled sample because of unnormalized new_p[] 537 | u = ((double)rand() / RAND_MAX) * new_p[numSentiLabs-1][numTopics-1]; 538 | 539 | for (sentiLab = 0; sentiLab < numSentiLabs; sentiLab++) { 540 | for (topic = 0; topic < numTopics; topic++) { 541 | if (new_p[sentiLab][topic] > u) { 542 | goto stop; 543 | } 544 | } 545 | } 546 | 547 | stop: 548 | if (sentiLab == numSentiLabs) sentiLab = numSentiLabs - 1; // the max value of label is (S - 1) 549 | if (topic == numTopics) topic = numTopics - 1; 550 | 551 | // add estimated 'z' and 'l' to counts 552 | new_nd[m]++; 553 | new_ndl[m][sentiLab]++; 554 | new_ndlz[m][sentiLab][topic]++; 555 | new_nlzw[sentiLab][topic][_w]++; 556 | new_nlz[sentiLab][topic]++; 557 | 558 | return 0; 559 | } 560 | 561 | 562 | int Inference::read_newData(string filename) { 563 | 564 | mapword2id::iterator it; 565 | map<int, int>::iterator _it; 566 | mapword2atr::iterator itatr; 567 | mapword2prior::iterator sentiIt; 568 | string line; 569 | char buff[BUFF_SIZE_LONG]; 570 | 571 | pmodelData->read_wordmap(model_dir + "wordmap.txt", word2id); // map word2id 572 | pmodelData->read_wordmap(model_dir + "wordmap.txt", id2word); // map id2word 573 | 574 | // read sentiment lexicon file 575 | if (sentiLexFile != "") { 576 | if (pnewData->read_senti_lexicon((sentiLexFile).c_str())) { 577 | printf("Error! Cannot read sentiFile %s!\n", sentiLexFile.c_str()); 578 | delete pnewData; 579 | return 1; 580 | } 581 | else { 582 | this->sentiLex = pnewData->sentiLex; 583 | } 584 | } 585 | 586 | if (word2id.size() <= 0) { 587 | printf("Invalid wordmap!\n"); 588 | return 1; 589 | } 590 | 591 | // read test data 592 | ifstream fin; 593 | fin.open(filename.c_str(), ifstream::in); 594 | if(!fin) { 595 | printf("Cannot read file %s!\n", filename.c_str()); 596 | return 1; 597 | } 598 | 599 | vector<string> docs; 600 | int numDocs = 0; 601 | 602 | while (fin.getline(buff, BUFF_SIZE_LONG)) { 603 | line = buff; 604 | if(!line.empty()) { 605 | docs.push_back(line); 606 | numDocs++; 607 | } 608 | } 609 | fin.close(); 610 | 611 | if (numDocs <= 0) { 612 | printf("Error! No documents found in test data %s.\n", filename.c_str()); 613 | return 1; 614 | } 615 | 616 | pnewData->numDocs = numDocs; 617 | // allocate memory 618 | if (pnewData->pdocs) { 619 | pnewData->deallocate(); 620 | } 621 | else { 622 | pnewData->pdocs = new document*[pnewData->numDocs]; 623 | } 624 | pnewData->_pdocs = new document*[pnewData->numDocs]; 625 | pnewData->vocabSize = 0; 626 | pnewData->corpusSize = 0; 627 | 628 | // process each document 629 | for (int i = 0; i < pnewData->numDocs; i++) { 630 | line = docs.at(i); 631 | strtokenizer strtok(line, " \t\r\n"); // \t\r\n are separators 632 | int docLength = strtok.count_tokens(); 633 | if (docLength <= 0) { 634 | printf("Invalid (empty) document!\n"); 635 | pnewData->deallocate(); 636 | pnewData->numDocs = 0; 637 | pnewData->vocabSize = 0; 638 | return 1; 639 | } 640 | 641 | pnewData->corpusSize += docLength - 1; 642 | vector<int> doc; 643 | vector<int> _doc; 644 | vector<int> priorSentiLabels; 645 | 646 | // process each token in the document 647 | for (int k = 1; k < docLength; k++) { 648 | it = word2id.find(strtok.token(k)); 649 | if (it == word2id.end()) { 650 | pnewData->newWords.push_back(strtok.token(k).c_str()); 651 | // word not found, i.e., word unseen in training data 652 | // do anything? 
(future decision) 653 | } 654 | else { 655 | int _id; 656 | _it = id2_id.find(it->second); 657 | if (_it == id2_id.end()) { 658 | _id = id2_id.size(); 659 | id2_id.insert(pair<int, int>(it->second, _id)); 660 | _id2id.insert(pair<int, int>(_id, it->second)); 661 | } 662 | else { 663 | _id = _it->second; 664 | } 665 | 666 | doc.push_back(it->second); 667 | _doc.push_back(_id); 668 | 669 | // 'word2atr' is specific to new/test dataset 670 | itatr = word2atr.find(strtok.token(k).c_str()); 671 | int priorSenti = -1; 672 | if (itatr == word2atr.end()) { 673 | sentiIt = sentiLex.find(strtok.token(k).c_str()); // check whether the word token can be found in the sentiment lexicon 674 | if (sentiIt != sentiLex.end()) { 675 | priorSenti = sentiIt->second.id; 676 | } 677 | // encode sentiment info into word2atr 678 | Word_atr temp = {_id, priorSenti}; // vocabulary index; word polarity 679 | word2atr.insert(pair<string, Word_atr>(strtok.token(k), temp)); 680 | priorSentiLabels.push_back(priorSenti); 681 | } 682 | else { 683 | priorSentiLabels.push_back(itatr->second.polarity); 684 | } 685 | 686 | } 687 | } 688 | 689 | // allocate memory for new doc 690 | document * pdoc = new document(doc, priorSentiLabels, "inference"); 691 | document * _pdoc = new document(_doc, priorSentiLabels, "inference"); 692 | 693 | pdoc->docID = strtok.token(0).c_str(); 694 | _pdoc->docID = strtok.token(0).c_str(); 695 | 696 | // add new doc 697 | pnewData->add_doc(pdoc, i); 698 | pnewData->_add_doc(_pdoc, i); 699 | } 700 | 701 | // update number of new words 702 | pnewData->vocabSize = id2_id.size(); 703 | pnewData->aveDocLength = pnewData->corpusSize / pnewData->numDocs; 704 | this->newNumDocs = pnewData->numDocs; 705 | this->newVocabSize = pnewData->vocabSize; 706 | 707 | if (newVocabSize == 0) { 708 | printf("ERROR! Vocabulary size of test set after removing unseen words is 0! \n"); 709 | return 1; 710 | } 711 | 712 | return 0; 713 | } 714 | 715 | 716 | void Inference::compute_newpi() { 717 | 718 | for (int m = 0; m < pnewData->numDocs; m++) { 719 | for (int l = 0; l < numSentiLabs; l++) { 720 | newpi_dl[m][l] = (new_ndl[m][l] + gamma_l[l]) / (new_nd[m] + gammaSum); 721 | } 722 | } 723 | } 724 | 725 | 726 | void Inference::compute_newtheta() { 727 | 728 | for (int m = 0; m < pnewData->numDocs; m++) { 729 | for (int l = 0; l < numSentiLabs; l++) { 730 | for (int z = 0; z < numTopics; z++) { 731 | newtheta_dlz[m][l][z] = (new_ndlz[m][l][z] + alpha_lz[l][z]) / (new_ndl[m][l] + alphaSum_l[l]); 732 | } 733 | } 734 | } 735 | } 736 | 737 | 738 | int Inference::compute_newphi() { 739 | map<int, int>::iterator it; 740 | 741 | for (int l = 0; l < numSentiLabs; l++) { 742 | for (int z = 0; z < numTopics; z++) { 743 | for(int r = 0; r < pnewData->vocabSize; r++) { 744 | it = _id2id.find(r); 745 | if (it != _id2id.end()) { 746 | newphi_lzw[l][z][r] = (nlzw[l][z][it->second] + new_nlzw[l][z][r] + beta_lzw[l][z][r]) / (nlz[l][z] + new_nlz[l][z] + betaSum_lz[l][z]); 747 | } 748 | else { 749 | printf("Error! 
Cannot find word [%d] !\n", r); 750 | return 1; 751 | } 752 | } 753 | } 754 | } 755 | 756 | return 0; 757 | } 758 | 759 | 760 | int Inference::save_model(string model_name) { 761 | 762 | if (save_model_newtassign(result_dir + model_name + tassign_suffix)) 763 | return 1; 764 | 765 | if (save_model_newtwords(result_dir + model_name + twords_suffix)) 766 | return 1; 767 | 768 | if (save_model_newpi_dl(result_dir + model_name + pi_suffix)) 769 | return 1; 770 | 771 | if (save_model_newtheta_dlz(result_dir + model_name + theta_suffix)) 772 | return 1; 773 | 774 | if (save_model_newphi_lzw(result_dir + model_name + phi_suffix)) 775 | return 1; 776 | 777 | if (save_model_newothers(result_dir + model_name + others_suffix)) 778 | return 1; 779 | 780 | return 0; 781 | } 782 | 783 | 784 | 785 | int Inference::save_model_newpi_dl(string filename) { 786 | 787 | FILE * fout = fopen(filename.c_str(), "w"); 788 | if (!fout) { 789 | printf("Cannot save file %s!\n", filename.c_str()); 790 | return 1; 791 | } 792 | 793 | for (int m = 0; m < pnewData->numDocs; m++) { 794 | fprintf(fout, "d_%d %s ", m, pnewData->pdocs[m]->docID.c_str()); 795 | for (int l = 0; l < numSentiLabs; l++) { 796 | fprintf(fout, "%f ", newpi_dl[m][l]); 797 | } 798 | fprintf(fout, "\n"); 799 | } 800 | 801 | fclose(fout); 802 | return 0; 803 | } 804 | 805 | 806 | 807 | int Inference::save_model_newtheta_dlz(string filename) { 808 | 809 | FILE * fout = fopen(filename.c_str(), "w"); 810 | if (!fout) { 811 | printf("Cannot save file %s!\n", filename.c_str()); 812 | return 1; 813 | } 814 | 815 | for(int m = 0; m < pnewData->numDocs; m++) { 816 | fprintf(fout, "Document %d\n", m); 817 | for (int l = 0; l < numSentiLabs; l++) { 818 | for (int z = 0; z < numTopics; z++) { 819 | fprintf(fout, "%f ", newtheta_dlz[m][l][z]); 820 | } 821 | fprintf(fout, "\n"); 822 | } 823 | } 824 | 825 | fclose(fout); 826 | return 0; 827 | } 828 | 829 | 830 | 831 | int Inference::save_model_newphi_lzw(string filename) { 832 | 833 | FILE * fout = fopen(filename.c_str(), "w"); 834 | if (!fout) { 835 | printf("Cannot save file %s!\n", filename.c_str()); 836 | return 1; 837 | } 838 | 839 | for (int l = 0; l < numSentiLabs; l++) { 840 | for (int z = 0; z < numTopics; z++) { 841 | fprintf(fout, "Label:%d Topic:%d\n", l, z); 842 | for (int r = 0; r < pnewData->vocabSize; r++) { 843 | fprintf(fout, "%.15f ", newphi_lzw[l][z][r]); 844 | } 845 | fprintf(fout, "\n"); 846 | } 847 | } 848 | 849 | fclose(fout); 850 | return 0; 851 | } 852 | 853 | 854 | int Inference::save_model_newothers(string filename) { 855 | 856 | FILE * fout = fopen(filename.c_str(), "w"); 857 | if (!fout) { 858 | printf("Cannot save file %s!\n", filename.c_str()); 859 | return 1; 860 | } 861 | 862 | fprintf(fout, "model_dir=%s\n", model_dir.c_str()); 863 | fprintf(fout, "model_name=%s\n", model_name.c_str()); 864 | fprintf(fout, "data_dir=%s\n", data_dir.c_str()); 865 | fprintf(fout, "datasetFile=%s\n", datasetFile.c_str()); 866 | fprintf(fout, "result_dir=%s\n", result_dir.c_str()); 867 | fprintf(fout, "niters-inf=%d\n", niters); 868 | fprintf(fout, "savestep-inf=%d\n", savestep); 869 | 870 | fprintf(fout, "\n------------------ Testset ** %s ** statistics ----------------------\n", datasetFile.c_str()); 871 | fprintf(fout, "newNumDocs=%d\n", pnewData->numDocs); 872 | fprintf(fout, "newCorpusSize=%d\n", pnewData->corpusSize); 873 | fprintf(fout, "newVocabSize=%d\n", pnewData->vocabSize); 874 | fprintf(fout, "numNewWords=%d\n", (int)(pnewData->newWords.size())); 875 | fprintf(fout, "aveDocLength=%d\n", 
pnewData->aveDocLength); 876 | fprintf(fout, "\n------------------ Loaded model settings ----------------------\n"); 877 | fprintf(fout, "numSentiLabs=%d\n", numSentiLabs); 878 | fprintf(fout, "numTopics=%d\n", numTopics); 879 | fprintf(fout, "numDocs=%d\n", pmodelData->numDocs); 880 | fprintf(fout, "corpusSize=%d\n", pmodelData->corpusSize); 881 | fprintf(fout, "vocabSize=%d\n", pmodelData->vocabSize); 882 | 883 | fclose(fout); 884 | return 0; 885 | } 886 | 887 | 888 | 889 | int Inference::save_model_newtwords(string filename) { 890 | 891 | mapid2word::iterator it; // typedef map<int, string> mapid2word 892 | map<int, int>::iterator _it; 893 | 894 | FILE * fout = fopen(filename.c_str(), "w"); 895 | if (!fout) { 896 | printf("Cannot save file %s!\n", filename.c_str()); 897 | return 1; 898 | } 899 | 900 | if (twords > pnewData->vocabSize) { 901 | twords = pnewData->vocabSize; 902 | } 903 | 904 | for (int l = 0; l < numSentiLabs; l++) { 905 | fprintf(fout, "Label %dth\n", l); 906 | for (int k = 0; k < numTopics; k++) { 907 | vector<pair<int, double> > words_probs; 908 | pair<int, double> word_prob; 909 | for (int w = 0; w < pnewData->vocabSize; w++) { 910 | word_prob.first = w; 911 | word_prob.second = newphi_lzw[l][k][w]; 912 | words_probs.push_back(word_prob); 913 | } 914 | 915 | std::sort(words_probs.begin(), words_probs.end(), sort_pred()); 916 | 917 | fprintf(fout, "Topic %dth:\n", k); 918 | for (int i = 0; i < twords; i++) { 919 | _it = _id2id.find(words_probs[i].first); 920 | if (_it == _id2id.end()) { 921 | continue; 922 | } 923 | it = id2word.find(_it->second); 924 | if (it != id2word.end()) { 925 | fprintf(fout, "\t%s %f\n", (it->second).c_str(), words_probs[i].second); 926 | } 927 | } 928 | } // for topic 929 | } // for label 930 | 931 | fclose(fout); 932 | return 0; 933 | } 934 | 935 | 936 | int Inference::save_model_newtassign(string filename) { 937 | 938 | FILE * fout = fopen(filename.c_str(), "w"); 939 | if (!fout) { 940 | printf("Cannot save file %s!\n", filename.c_str()); 941 | return 1; 942 | } 943 | 944 | for (int m = 0; m < pnewData->numDocs; m++) { 945 | fprintf(fout, "%s \n", pnewData->pdocs[m]->docID.c_str()); 946 | for (int n = 0; n < pnewData->pdocs[m]->length; n++) { 947 | fprintf(fout, "%d:%d:%d ", pnewData->pdocs[m]->words[n], new_l[m][n], new_z[m][n]); // wordID:sentiLab:topic 948 | } 949 | fprintf(fout, "\n"); 950 | } 951 | 952 | fclose(fout); 953 | return 0; 954 | } 955 | 956 | 957 | 958 | int Inference::prior2beta() { 959 | mapword2atr::iterator wordIt; 960 | mapword2prior::iterator sentiIt; 961 | 962 | for (sentiIt = sentiLex.begin(); sentiIt != sentiLex.end(); sentiIt++) { 963 | wordIt = word2atr.find(sentiIt->first); 964 | if (wordIt != word2atr.end()) { 965 | for (int j = 0; j < numSentiLabs; j++) { 966 | lambda_lw[j][wordIt->second.id] = sentiIt->second.labDist[j]; 967 | } 968 | } 969 | } 970 | 971 | // Note: the 'r' index of lambda_lw[j][r] corresponds to the vocabulary ID. 972 | // Therefore the correct prior information can be incorporated into the corresponding word count nlzw, 973 | // as 'w' also corresponds to the vocabulary ID. 974 | for (int l = 0; l < numSentiLabs; l++) { 975 | for (int z = 0; z < numTopics; z++) { 976 | betaSum_lz[l][z] = 0.0; 977 | for (int r = 0; r < pnewData->vocabSize; r++) { 978 | beta_lzw[l][z][r] = beta_lzw[l][z][r] * lambda_lw[l][r]; 979 | betaSum_lz[l][z] += beta_lzw[l][z][r]; 980 | } 981 | } 982 | } 983 | 984 | return 0; 985 | } 986 | --------------------------------------------------------------------------------
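Note on the Gibbs sampling step in Inference::inf_sampling() above (an added annotation, restating the code in standard JST notation; all counts exclude the current token). The unnormalized weight accumulated in new_p[l][k] is the product of three ratios:

  P(l_t = l, z_t = k \mid \cdot) \;\propto\;
    \frac{n_{lkw} + n^{new}_{lk\tilde{w}} + \beta_{lk\tilde{w}}}
         {n_{lk} + n^{new}_{lk} + \sum_{r} \beta_{lkr}}
    \cdot
    \frac{n^{new}_{mlk} + \alpha_{lk}}
         {n^{new}_{ml} + \sum_{z} \alpha_{lz}}
    \cdot
    \frac{n^{new}_{ml} + \gamma_{l}}
         {n^{new}_{m} + \sum_{l'} \gamma_{l'}}

Here w and \tilde{w} are the token's training- and test-vocabulary indices (the code's w and _w); n_{lkw} and n_{lk} are the trained-model counts nlzw and nlz; and the n^{new} terms are the test-corpus counts new_nlzw, new_nlz, new_ndlz, new_ndl and new_nd. The nested loop then converts new_p into a cumulative table, a uniform draw u scaled by the total mass selects the (sentiment, topic) pair, and the counts are re-incremented with the sampled assignment.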