├── Debug
│   ├── objects.mk
│   ├── sources.mk
│   ├── src
│   │   └── subdir.mk
│   └── makefile
├── training.properties
├── test.properties
├── src
│   ├── polya_fit_simple.h
│   ├── strtokenizer.h
│   ├── constants.h
│   ├── math_func.h
│   ├── strtokenizer.cpp
│   ├── map_type.h
│   ├── dataset.h
│   ├── utils.h
│   ├── document.h
│   ├── polya_fit_simple.cpp
│   ├── main.cpp
│   ├── model.h
│   ├── inference.h
│   ├── cokus.h
│   ├── utils.cpp
│   ├── dataset.cpp
│   ├── model.cpp
│   ├── math_func.cpp
│   └── inference.cpp
└── readme.txt
/Debug/objects.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | USER_OBJS := 6 | 7 | LIBS := 8 | 9 | -------------------------------------------------------------------------------- /training.properties: -------------------------------------------------------------------------------- 1 | nsentiLabs=3 2 | ntopics=1 3 | niters=800 4 | savestep=100 5 | updateParaStep=50 6 | twords=20 7 | data_dir=/Users/chenghualin/Documents/workspace/JST-release/data/ 8 | datasetFile=MR.dat 9 | result_dir=/Volumes/CHENGHUA-2T/JST-release/result/train/t1 10 | sentiFile=/Users/chenghualin/Documents/workspace/JST-release/data/mpqa.constraint 11 | beta=0.01 12 | -------------------------------------------------------------------------------- /test.properties: -------------------------------------------------------------------------------- 1 | niters=60 2 | savestep=20 3 | twords=30 4 | data_dir=/Users/chenghualin/Documents/workspace/JST-release/data/ 5 | datasetFile=test.dat 6 | result_dir=/Volumes/CHENGHUA-2T/JST-release/result/test/t1 7 | sentiFile=/Users/chenghualin/Documents/workspace/JST-release/data/mpqa.constraint 8 | beta=0.01 9 | model_dir=/Volumes/CHENGHUA-2T/JST-release/result/train/t1 10 | model=00100 11 | -------------------------------------------------------------------------------- /Debug/sources.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | O_SRCS := 6 | CPP_SRCS := 7 | C_UPPER_SRCS := 8 | C_SRCS := 9 | S_UPPER_SRCS := 10 | OBJ_SRCS := 11 | ASM_SRCS := 12 | CXX_SRCS := 13 | C++_SRCS := 14 | CC_SRCS := 15 | C++_DEPS := 16 | OBJS := 17 | C_DEPS := 18 | CC_DEPS := 19 | CPP_DEPS := 20 | EXECUTABLES := 21 | CXX_DEPS := 22 | C_UPPER_DEPS := 23 | 24 | # Every subdirectory with source files must be described here 25 | SUBDIRS := \ 26 | src \ 27 | 28 | -------------------------------------------------------------------------------- /Debug/src/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit!
3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CPP_SRCS += \ 7 | ../src/dataset.cpp \ 8 | ../src/inference.cpp \ 9 | ../src/main.cpp \ 10 | ../src/math_func.cpp \ 11 | ../src/model.cpp \ 12 | ../src/polya_fit_simple.cpp \ 13 | ../src/strtokenizer.cpp \ 14 | ../src/utils.cpp 15 | 16 | OBJS += \ 17 | ./src/dataset.o \ 18 | ./src/inference.o \ 19 | ./src/main.o \ 20 | ./src/math_func.o \ 21 | ./src/model.o \ 22 | ./src/polya_fit_simple.o \ 23 | ./src/strtokenizer.o \ 24 | ./src/utils.o 25 | 26 | CPP_DEPS += \ 27 | ./src/dataset.d \ 28 | ./src/inference.d \ 29 | ./src/main.d \ 30 | ./src/math_func.d \ 31 | ./src/model.d \ 32 | ./src/polya_fit_simple.d \ 33 | ./src/strtokenizer.d \ 34 | ./src/utils.d 35 | 36 | 37 | # Each subdirectory must supply rules for building sources it contributes 38 | src/%.o: ../src/%.cpp 39 | @echo 'Building file: $<' 40 | @echo 'Invoking: Cross G++ Compiler' 41 | g++ -O0 -g3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o "$@" "$<" 42 | @echo 'Finished building: $<' 43 | @echo ' ' 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/polya_fit_simple.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | #ifndef _POLYA_FIT_SIMPLE_H 30 | #define _POLYA_FIT_SIMPLE_H 31 | 32 | int polya_fit_simple(int ** data, double * alpha, int _K, int _nSample); 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /src/strtokenizer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2007 by 3 | * 4 | * Xuan-Hieu Phan 5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com 6 | * Graduate School of Information Sciences 7 | * Tohoku University 8 | * 9 | * GibbsLDA++ is a free software; you can redistribute it and/or modify 10 | * it under the terms of the GNU General Public License as published 11 | * by the Free Software Foundation; either version 2 of the License, 12 | * or (at your option) any later version.
13 | * 14 | * GibbsLDA++ is distributed in the hope that it will be useful, but 15 | * WITHOUT ANY WARRANTY; without even the implied warranty of 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | * GNU General Public License for more details. 18 | * 19 | * You should have received a copy of the GNU General Public License 20 | * along with GibbsLDA++; if not, write to the Free Software Foundation, 21 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 22 | */ 23 | 24 | #ifndef _STRTOKENIZER_H 25 | #define _STRTOKENIZER_H 26 | 27 | #include <string> 28 | #include <vector> 29 | 30 | using namespace std; 31 | 32 | class strtokenizer { 33 | protected: 34 | vector<string> tokens; 35 | int idx; 36 | 37 | public: 38 | strtokenizer(string str, string seperators = " "); 39 | 40 | void parse(string str, string seperators); 41 | 42 | int count_tokens(); 43 | string next_token(); 44 | void start_scan(); 45 | 46 | string token(int i); 47 | }; 48 | 49 | #endif 50 | 51 | -------------------------------------------------------------------------------- /Debug/makefile: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | -include ../makefile.init 6 | 7 | RM := rm -rf 8 | 9 | # All of the sources participating in the build are defined here 10 | -include sources.mk 11 | -include src/subdir.mk 12 | -include subdir.mk 13 | -include objects.mk 14 | 15 | ifneq ($(MAKECMDGOALS),clean) 16 | ifneq ($(strip $(C++_DEPS)),) 17 | -include $(C++_DEPS) 18 | endif 19 | ifneq ($(strip $(C_DEPS)),) 20 | -include $(C_DEPS) 21 | endif 22 | ifneq ($(strip $(CC_DEPS)),) 23 | -include $(CC_DEPS) 24 | endif 25 | ifneq ($(strip $(CPP_DEPS)),) 26 | -include $(CPP_DEPS) 27 | endif 28 | ifneq ($(strip $(CXX_DEPS)),) 29 | -include $(CXX_DEPS) 30 | endif 31 | ifneq ($(strip $(C_UPPER_DEPS)),) 32 | -include $(C_UPPER_DEPS) 33 | endif 34 | endif 35 | 36 | -include ../makefile.defs 37 | 38 | # Add inputs and outputs from these tool invocations to the build variables 39 | 40 | # All Target 41 | all: jst 42 | 43 | # Tool invocations 44 | jst: $(OBJS) $(USER_OBJS) 45 | @echo 'Building target: $@' 46 | @echo 'Invoking: Cross G++ Linker' 47 | g++ -o "jst" $(OBJS) $(USER_OBJS) $(LIBS) 48 | @echo 'Finished building target: $@' 49 | @echo ' ' 50 | 51 | # Other Targets 52 | clean: 53 | -$(RM) $(C++_DEPS)$(OBJS)$(C_DEPS)$(CC_DEPS)$(CPP_DEPS)$(EXECUTABLES)$(CXX_DEPS)$(C_UPPER_DEPS) jst 54 | -@echo ' ' 55 | 56 | .PHONY: all clean dependents 57 | .SECONDARY: 58 | 59 | -include ../makefile.targets 60 | -------------------------------------------------------------------------------- /src/constants.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation.
11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | 30 | #ifndef _CONSTANTS_H 31 | #define _CONSTANTS_H 32 | 33 | #define BUFF_SIZE_LONG 1000000 34 | #define BUFF_SIZE_SHORT 512 35 | 36 | #define MODEL_STATUS_UNKNOWN 0 37 | #define MODEL_STATUS_EST 1 38 | #define MODEL_STATUS_ESTC 2 39 | #define MODEL_STATUS_INF 3 40 | 41 | #define MODE_NONE 0 42 | #define MODE_SLIDING 1 43 | #define MODE_SKIP 2 44 | #define MODE_MULTISCALE 3 45 | 46 | #define MAX_ITERATION 100000 47 | 48 | #endif 49 | 50 | -------------------------------------------------------------------------------- /src/math_func.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 
21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | 30 | #ifndef _MATH_FUNC_H 31 | #define _MATH_FUNC_H 32 | 33 | 34 | //************************* asa032.h ************************************// 35 | double alngam ( double xvalue, int *ifault ); 36 | double gamain ( double x, double p, int *ifault ); 37 | void gamma_inc_values ( int *n_data, double *a, double *x, double *fx ); 38 | double r8_abs ( double x ); 39 | void timestamp ( void ); 40 | 41 | 42 | //************************* asa103.cpp ************************************// 43 | double digama ( double x, int *ifault ); 44 | void psi_values ( int *n_data, double *x, double *fx ); 45 | //void timestamp ( void ); 46 | 47 | 48 | //************************* asa121.cpp ************************************// 49 | //void timestamp ( void ); 50 | double trigam ( double x, int *ifault ); 51 | void trigamma_values ( int *n_data, double *x, double *fx ); 52 | 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /src/strtokenizer.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2007 by 3 | * 4 | * Xuan-Hieu Phan 5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com 6 | * Graduate School of Information Sciences 7 | * Tohoku University 8 | * 9 | * GibbsLDA++ is a free software; you can redistribute it and/or modify 10 | * it under the terms of the GNU General Public License as published 11 | * by the Free Software Foundation; either version 2 of the License, 12 | * or (at your option) any later version. 13 | * 14 | * GibbsLDA++ is distributed in the hope that it will be useful, but 15 | * WITHOUT ANY WARRANTY; without even the implied warranty of 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | * GNU General Public License for more details. 18 | * 19 | * You should have received a copy of the GNU General Public License 20 | * along with GibbsLDA++; if not, write to the Free Software Foundation, 21 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
22 | */ 23 | 24 | #include <string> 25 | #include <vector> 26 | #include "strtokenizer.h" 27 | 28 | using namespace std; 29 | 30 | strtokenizer::strtokenizer(string str, string seperators) { 31 | parse(str, seperators); 32 | } 33 | 34 | void strtokenizer::parse(string str, string seperators) { 35 | int n = str.length(); 36 | int start, stop; 37 | 38 | start = str.find_first_not_of(seperators); 39 | while (start >= 0 && start < n) { 40 | stop = str.find_first_of(seperators, start); 41 | if (stop < 0 || stop > n) { 42 | stop = n; 43 | } 44 | 45 | tokens.push_back(str.substr(start, stop - start)); 46 | start = str.find_first_not_of(seperators, stop + 1); 47 | } 48 | 49 | start_scan(); 50 | } 51 | 52 | int strtokenizer::count_tokens() { 53 | return tokens.size(); 54 | } 55 | 56 | void strtokenizer::start_scan() { 57 | idx = 0; 58 | } 59 | 60 | string strtokenizer::next_token() { 61 | if (idx >= 0 && idx < (int)tokens.size()) 62 | return tokens[idx++]; 63 | else 64 | return ""; 65 | } 66 | 67 | string strtokenizer::token(int i) { 68 | if (i >= 0 && i < (int)tokens.size()) 69 | return tokens[i]; 70 | else 71 | return ""; 72 | } 73 | 74 | -------------------------------------------------------------------------------- /src/map_type.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details.
21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | 30 | #ifndef _MAP_TYPE_H 31 | #define _MAP_TYPE_H 32 | #include <map> 33 | #include <string> 34 | using namespace std; 35 | 36 | 37 | struct Word_atr { 38 | int id; // vocabulary index 39 | int polarity; // sentiment label 40 | }; 41 | 42 | struct Word_Prior_Attr { 43 | int id; // prior sentiment label 44 | vector<double> labDist; // label distribution 45 | }; 46 | 47 | // map of words/terms [string => int] 48 | typedef map<string, int> mapword2id; 49 | 50 | // map of words/terms [int => string] 51 | typedef map<int, string> mapid2word; 52 | 53 | // map of words/attributes_of_words [string => word_attr] 54 | typedef map<string, Word_atr> mapword2atr; 55 | 56 | // map of word / word prior info [string => sentiment lab ID, sentiment label distribution] 57 | typedef map<string, Word_Prior_Attr> mapword2prior; 58 | 59 | // map of doc / doc label distribution [string => doc label distribution] 60 | typedef map<string, vector<double> > mapname2labs; 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /src/dataset.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details.
21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | 30 | #ifndef _DATASET_H 31 | #define _DATASET_H 32 | 33 | #include "constants.h" 34 | #include "document.h" 35 | #include "map_type.h" 36 | #include <fstream> 37 | #include <iostream> 38 | #include <map> 39 | #include <string> 40 | #include <vector> 41 | using namespace std; 42 | 43 | 44 | class dataset { 45 | 46 | public: 47 | mapword2atr word2atr; 48 | mapid2word id2word; 49 | mapword2prior sentiLex; // word => prior sentiment information 50 | 51 | document ** pdocs; // store training data vocab ID 52 | document ** _pdocs; // only use for inference, i.e., for storing the new/test vocab ID 53 | ifstream fin; 54 | 55 | string data_dir; 56 | string result_dir; 57 | string wordmapfile; 58 | 59 | int numDocs; 60 | int aveDocLength; // average document length 61 | int vocabSize; 62 | int corpusSize; 63 | 64 | vector<string> docs; // for buffering dataset 65 | vector<string> newWords; 66 | 67 | // functions 68 | dataset(); 69 | dataset(string result_dir); 70 | ~dataset(void); 71 | 72 | int read_dataStream(ifstream& fin); 73 | int read_newData(string filename); 74 | int read_senti_lexicon(string sentiLexiconFileDir); 75 | int analyzeCorpus(vector<string>& docs); 76 | 77 | static int write_wordmap(string wordmapfile, mapword2atr& pword2atr); 78 | static int read_wordmap(string wordmapfile, mapid2word& pid2word); 79 | static int read_wordmap(string wordmapfile, mapword2id& pword2id); 80 | 81 | int init_parameter(); 82 | void deallocate(); 83 | void add_doc(document * doc, int idx); 84 | void _add_doc(document * doc, int idx); 85 | 86 | }; 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /src/utils.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details.
21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | #ifndef _UTILS_H 30 | #define _UTILS_H 31 | 32 | #include "dataset.h" 33 | #include <string> 34 | #include <vector> 35 | using namespace std; 36 | 37 | // for sorting word probabilities 38 | struct sort_pred { 39 | bool operator()(const std::pair<int, double> &left, const std::pair<int, double> &right) { 40 | return left.second > right.second; 41 | } 42 | }; 43 | 44 | class model; 45 | class Inference; 46 | 47 | class utils { 48 | private: 49 | int model_status; 50 | string model_dir; 51 | string data_dir; 52 | string result_dir; 53 | string model_name; 54 | string wordmapfile; 55 | string sentiLexFile; 56 | string datasetFile; 57 | string configfile; 58 | 59 | int numSentiLabs; 60 | int numTopics; 61 | int niters; 62 | int savestep; 63 | int twords; 64 | int updateParaStep; 65 | double alpha; 66 | double beta; 67 | double gamma; 68 | 69 | 70 | public: 71 | utils(); 72 | 73 | // parse command line arguments 74 | int parse_args(int argc, char ** argv, int& model_status); 75 | int parse_args_est(int argc, char ** argv, model * pmodel); 76 | int parse_args_inf(int argc, char ** argv, Inference * pmodel_inf); 77 | 78 | // read configuration file 79 | int read_config_file(string configfile); 80 | 81 | // generate the model name for the current iteration 82 | string generate_model_name(int iter); 83 | 84 | // make directory 85 | int make_dir(string strPath); 86 | 87 | // sort 88 | void sort(vector<double> & probs, vector<int> & words); 89 | }; 90 | 91 | #endif 92 | 93 | -------------------------------------------------------------------------------- /readme.txt: -------------------------------------------------------------------------------- 1 | ***************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | ***************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk, part of code 8 | is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | 28 | ------------------------------------------------------------------------ 29 | 30 | This is a C++ implementation of the joint sentiment-topic (JST) model for 31 | sentiment classification and extracting sentiment-bearing topics from text corpora. 32 | 33 | ------------------------------------------------------------------------ 34 | 35 | 36 | TABLE OF CONTENTS 37 | 38 | 39 | A. COMPILING 40 | 41 | B. ESTIMATION 42 | 43 | C. INFERENCE 44 | 45 | D. Data format 46 | 47 | E.
References 48 | 49 | 50 | ------------------------------------------------------------------------ 51 | 52 | A. COMPILING 53 | 54 | Type "make" in a shell. 55 | 56 | 57 | ------------------------------------------------------------------------ 58 | 59 | B. ESTIMATION 60 | 61 | Estimate the model by executing: 62 | 63 | jst -est -config YOUR-PATH/training.properties 64 | 65 | Outputs of jst estimation include the following files: 66 | .others // contains model parameter settings 67 | .pi // contains the per-document sentiment distributions 68 | .phi // contains the sentiment specific topic-word distributions 69 | .theta // contains the per-document sentiment specific topic proportions 70 | .tassign // contains the sentiment label and topic assignments for words in training data 71 | ------------------------------------------------------------------------ 72 | 73 | C. INFERENCE 74 | 75 | To perform inference on a different set of data (in the same format as 76 | for estimation), execute: 77 | 78 | jst -inf -config YOUR-PATH/test.properties 79 | 80 | Outputs of jst inference include the following files: 81 | .newothers 82 | .newpi 83 | .newphi 84 | .newtheta 85 | .newtassign 86 | 87 | ------------------------------------------------------------------------ 88 | 89 | D. Data format 90 | 91 | (1) The input data format for estimation/inference is as follows, where each line is one document, preceded by the document ID. 92 | 93 | [Doc_1 name] [token_1] [token_2] ... [token_N] 94 | : 95 | : 96 | [Doc_M name] [token_1] [token_2] ... [token_N] 97 | 98 | (2) Sentiment lexicon (mpqa.constraint) 99 | 100 | [word] [neu prior prob.] [pos prior prob.] [neg prior prob.] 101 | 102 | 103 | ------------------------------------------------------------------------ 104 | 105 | E. References 106 | 107 | [1] Lin, C., He, Y., Everson, R. and Rueger, S. Weakly-supervised Joint Sentiment-Topic Detection from Text, IEEE Transactions on Knowledge and Data Engineering (TKDE), 2011. 108 | 109 | [2] Lin, C. and He, Y. Joint Sentiment/Topic Model for Sentiment Analysis, In Proceedings of the 18th ACM Conference on Information and Knowledge Management (CIKM), Hong Kong, China, 2009. 110 | 111 | 112 | -------------------------------------------------------------------------------- /src/document.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details.
21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | 30 | #ifndef _DOCUMENT_H 31 | #define _DOCUMENT_H 32 | 33 | #include <string> 34 | #include <vector> 35 | using namespace std; 36 | 37 | 38 | 39 | class document { 40 | 41 | public: 42 | int * words; 43 | int * priorSentiLabels; 44 | string docID; 45 | string rawstr; 46 | int length; 47 | 48 | document() { 49 | words = NULL; 50 | priorSentiLabels = NULL; 51 | docID = ""; 52 | rawstr = ""; 53 | length = 0; 54 | } 55 | 56 | // Constructor. Retrieve the length of the document and allocate memory for storing the documents 57 | document(int length) { 58 | this->length = length; 59 | docID = ""; 60 | rawstr = ""; 61 | words = new int[length]; // words stores the word token ID, which is integer 62 | priorSentiLabels = new int[length]; 63 | } 64 | 65 | // Constructor. Retrieve the length of the document and store the element of words into the array 66 | document(int length, int * words) { 67 | this->length = length; 68 | docID = ""; 69 | rawstr = ""; 70 | this->words = new int[length]; 71 | for (int i = 0; i < length; i++) { 72 | this->words[i] = words[i]; 73 | } 74 | priorSentiLabels = new int[length]; 75 | } 76 | 77 | document(int length, int * words, string rawstr) { 78 | this->length = length; 79 | docID = ""; 80 | this->rawstr = rawstr; 81 | this->words = new int[length]; 82 | for (int i = 0; i < length; i++) { 83 | this->words[i] = words[i]; 84 | } 85 | priorSentiLabels = new int[length]; 86 | } 87 | 88 | 89 | document(vector<int> & doc) { 90 | this->length = doc.size(); 91 | docID = ""; 92 | rawstr = ""; 93 | this->words = new int[length]; 94 | for (int i = 0; i < length; i++) { 95 | this->words[i] = doc[i]; 96 | } 97 | priorSentiLabels = new int[length]; 98 | } 99 | 100 | 101 | document(vector<int> & doc, string rawstr) { 102 | this->length = doc.size(); 103 | docID = ""; 104 | this->rawstr = rawstr; 105 | this->words = new int[length]; 106 | for (int i = 0; i < length; i++) { 107 | this->words[i] = doc[i]; 108 | } 109 | priorSentiLabels = new int[length]; 110 | } 111 | 112 | document(vector<int> & doc, vector<int> &priorSentiLab, string rawstr) { 113 | this->length = doc.size(); 114 | docID = ""; 115 | this->rawstr = rawstr; 116 | this->words = new int[length]; 117 | this->priorSentiLabels = new int[length]; 118 | for (int i = 0; i < length; i++) { 119 | this->words[i] = doc[i]; 120 | this->priorSentiLabels[i] = priorSentiLab[i]; 121 | } 122 | } 123 | 124 | ~document() { 125 | if (words != NULL){ 126 | delete [] words; 127 | words = NULL; 128 | } 129 | 130 | if (priorSentiLabels != NULL){ 131 | delete [] priorSentiLabels; 132 | priorSentiLabels = NULL; 133 | } 134 | } 135 | }; 136 | 137 | #endif 138 | -------------------------------------------------------------------------------- /src/polya_fit_simple.cpp: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation.
11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | #include "polya_fit_simple.h" 30 | #include "math_func.h" 31 | #include <stdio.h> 32 | #include <stdlib.h> 33 | #include <math.h> 34 | #include <iostream> 35 | 36 | using namespace std; 37 | 38 | 39 | int polya_fit_simple(int ** data, double * alpha, int _K, int _nSample) { 40 | int K = _K; // hyperparameter dimension 41 | int nSample = _nSample; // total number of samples, i.e. documents 42 | int polya_iter = 100000; // maximum number of fixed point iterations 43 | int ifault1, ifault2; 44 | 45 | double sum_alpha_old; 46 | double * old_alpha = NULL; 47 | double sum_g = 0; // sum_g = sum_digama(data[i][k] + old_alpha[k]), 48 | double sum_h = 0; // sum_h = sum_digama(data[i] + sum_alpha_old), where data[i] = sum_data[i][k] for all k, 49 | double * data_row_sum = NULL; // the sum of the counts of each data sample P = {P_1, P_2,...,P_k} 50 | bool sat_state = false; 51 | int i, k, j; 52 | 53 | old_alpha = new double[K]; 54 | for (k = 0; k < K; k++) { 55 | old_alpha[k] = 0; 56 | } 57 | 58 | data_row_sum = new double[nSample]; 59 | for (i = 0; i < nSample; i++) { 60 | data_row_sum[i] = 0; 61 | } 62 | 63 | // data_row_sum 64 | for (i = 0; i < nSample; i++) { 65 | for (k = 0; k < K; k++) { 66 | data_row_sum[i] += *(*(data+k)+i) ; 67 | } 68 | } 69 | 70 | // simple fixed point iteration 71 | printf("Optimising parameters...\n"); 72 | for (i = 0; i < polya_iter; i++) { // reset sum_alpha_old 73 | sum_alpha_old = 0; 74 | // update old_alpha after each iteration 75 | for (j = 0; j < K; j++) { 76 | old_alpha[j] = *(alpha+j); 77 | } 78 | 79 | // calculate sum_alpha_old 80 | for (j = 0; j < K; j++) { 81 | sum_alpha_old += old_alpha[j]; 82 | } 83 | 84 | for (k = 0; k < K; k++) { 85 | sum_g = 0; 86 | sum_h = 0; 87 | 88 | // calculate sum_g[k] 89 | for (j = 0; j < nSample; j++) { 90 | sum_g += digama( *(*(data+k)+j) + old_alpha[k], &ifault1); 91 | } 92 | 93 | // calculate sum_h 94 | for (j = 0; j < nSample; j++) { 95 | sum_h += digama(data_row_sum[j] + sum_alpha_old, &ifault1); 96 | } 97 | 98 | // update alpha (new) 99 | *(alpha+k) = old_alpha[k] * (sum_g - nSample * digama(old_alpha[k], &ifault1)) / (sum_h - nSample * digama(sum_alpha_old, &ifault2)); 100 | } 101 | 102 | // terminate iteration ONLY if each dimension of {alpha_1, alpha_2, ...
alpha_k} satisfies the termination criteria, 103 | for (j = 0; j < K; j++) { 104 | if (fabs( *(alpha+j) - old_alpha[j]) > 0.000001) break; 105 | if ( j == K-1) { 106 | sat_state = true; 107 | } 108 | } 109 | 110 | // check whether to terminate the whole iteration 111 | if(sat_state) { 112 | cout << "Terminated at iteration: " << i << endl; 113 | break; 114 | } 115 | } 116 | 117 | delete [] old_alpha; 118 | delete [] data_row_sum; 119 | 120 | return 0; 121 | } 122 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | 30 | #include "model.h" 31 | #include "inference.h" 32 | #include "utils.h" 33 | #include "constants.h" 34 | 35 | 36 | #include <stdio.h> 37 | #include <stdlib.h> 38 | using namespace std; 39 | 40 | void show_help(); 41 | 42 | 43 | int main(int argc, char ** argv) { 44 | 45 | int model_status = MODEL_STATUS_UNKNOWN; 46 | utils *putils = new utils(); 47 | model_status = putils->parse_args(argc, argv, model_status); 48 | 49 | if (putils) 50 | delete putils; 51 | 52 | if (model_status == MODEL_STATUS_UNKNOWN) { 53 | printf("Please specify the task you would like to perform, training (-est) or inference (-inf)!\n"); 54 | show_help(); 55 | return 1; 56 | } 57 | else if (model_status == MODEL_STATUS_EST){ 58 | model jst; 59 | 60 | if (jst.init(argc, argv)) { 61 | show_help(); 62 | return 1; 63 | } 64 | 65 | if(jst.excute_model()) return 1; 66 | } 67 | else if (model_status == MODEL_STATUS_INF) { 68 | Inference jst; 69 | 70 | if (jst.init(argc, argv)) { 71 | show_help(); 72 | return 1; 73 | } 74 | } 75 | 76 | return 0; 77 | } 78 | 79 | 80 | void show_help() { 81 | 82 | printf("Command line usage:\n"); 83 | printf("jst -est|-inf [options]\n"); 84 | printf("-est \t Estimate the JST model from scratch.\n"); 85 | printf("-inf \t Perform inference on unseen (new) data using a trained model.\n"); 86 | 87 | printf("\n-----------------------------------------------------------\n"); 88 | printf("Command line options:\n\n"); 89 | 90 | printf("-nsentiLabs \t The number of sentiment labels. The default is 3.\n"); 91 | printf("-ntopics \t The number of topics. The default is 50.\n"); 92 | printf("-niters \t The number of Gibbs sampling iterations. The default is 1000.\n"); 93 | printf("-savestep \t The step (counted by the number of Gibbs sampling iterations) at which the model is saved to hard disk. The default is 200.\n"); 94 | printf("-updateParaStep The step (counted by the number of Gibbs sampling iterations) at which the hyperparameters are updated. The default is 40.\n"); 95 | printf("-twords \t The number of most likely words to be printed for each topic. The default is 20.\n"); 96 | printf("-data_dir \t The directory where the input training data is stored.\n"); 97 | printf("-result_dir \t The directory where the output models and parameters will be stored.\n"); 98 | printf("-datasetFile \t The input training data file.\n"); 99 | printf("-sentiFile \t The sentiment lexicon file.\n"); 100 | printf("-vocab \t\t The vocabulary file.\n"); 101 | printf("-alpha \t\t The hyperparameter of the per-document sentiment specific topic proportion. The default is avgDocLength*0.05/(numSentiLabs*numTopics).\n"); 102 | printf("-beta \t\t The hyperparameter of the per-corpus sentiment specific topic-word distribution. The default is 0.01.\n"); 103 | printf("-gamma \t\t The hyperparameter of the per-document sentiment proportion. The default is avgDocLength*0.05/numSentiLabs.\n"); 104 | printf("-model_dir \t\t The directory of the previously trained model. (for inference only).\n"); 105 | printf("-model \t\t The name of the previously trained model.
(for inference only).\n"); 106 | } 107 | 108 | -------------------------------------------------------------------------------- /src/model.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | 30 | #ifndef _MODEL_H 31 | #define _MODEL_H 32 | 33 | #include <stdio.h> 34 | #include <stdlib.h> 35 | #include <string> 36 | #include <vector> 37 | #include <map> 38 | #include <fstream> 39 | #include <iostream> 40 | #include <cmath> 41 | #include <ctime> 42 | 43 | #include "dataset.h" 44 | #include "document.h" 45 | #include "map_type.h" 46 | #include "utils.h" 47 | #include "math_func.h" 48 | #include "polya_fit_simple.h" 49 | #include "strtokenizer.h" 50 | 51 | using namespace std; 52 | 53 | 54 | class model { 55 | 56 | public: 57 | model(void); 58 | ~model(void); 59 | 60 | mapword2atr word2atr; 61 | mapid2word id2word; 62 | mapword2prior sentiLex; // word => prior sentiment information 63 | 64 | string data_dir; 65 | string datasetFile; 66 | string result_dir; 67 | string sentiLexFile; 68 | string wordmapfile; 69 | string tassign_suffix; 70 | string pi_suffix; 71 | string theta_suffix; 72 | string phi_suffix; 73 | string others_suffix; 74 | string twords_suffix; 75 | 76 | int numTopics; 77 | int numSentiLabs; 78 | int niters; 79 | int liter; 80 | int twords; 81 | int savestep; 82 | int updateParaStep; 83 | double _alpha; 84 | double _beta; 85 | double _gamma; 86 | 87 | // init functions 88 | int init(int argc, char ** argv); 89 | int excute_model(); 90 | 91 | 92 | private: 93 | 94 | int numDocs; 95 | int vocabSize; 96 | int corpusSize; 97 | int aveDocLength; 98 | 99 | ifstream fin; 100 | dataset * pdataset; 101 | utils * putils; 102 | 103 | // model counts 104 | vector<int> nd; 105 | vector<vector<int> > ndl; 106 | vector<vector<vector<int> > > ndlz; 107 | vector<vector<vector<int> > > nlzw; 108 | vector<vector<int> > nlz; 109 | 110 | // topic and label assignments 111 | vector<vector<double> > p; 112 | vector<vector<int> > z; 113 | vector<vector<int> > l; 114 | 115 | // model parameters 116 | vector<vector<double> > pi_dl; // size: (numDocs x L) 117 | vector<vector<vector<double> > > theta_dlz; // size: (numDocs x L x T) 118 | vector<vector<vector<double> > > phi_lzw; // size: (L x T x V) 119 | 120 | // hyperparameters 121 | vector<vector<double> > alpha_lz; // \alpha_tlz size: (L x T) 122 | vector<double> alphaSum_l; 123 | vector<vector<vector<double> > > beta_lzw; // size: (L x T x V) 124 | vector<vector<double> > betaSum_lz; 125 | vector<vector<double> > gamma_dl; // size: (numDocs x L) 126 | vector<double> gammaSum_d; 127 | vector<vector<double> > lambda_lw; // size: (L x V) -- for encoding prior
sentiment information 128 | 129 | vector<vector<double> > opt_alpha_lz; //optimal value, size:(L x T) -- for storing the optimal value of alpha_lz after fixed point iteration 130 | 131 | /************************* Functions ***************************/ 132 | int set_gamma(); 133 | int init_model_parameters(); 134 | inline int delete_model_parameters() { 135 | numDocs = 0; 136 | vocabSize = 0; 137 | corpusSize = 0; 138 | aveDocLength = 0; 139 | 140 | if (pdataset != NULL) { 141 | delete pdataset; 142 | pdataset = NULL; 143 | } 144 | 145 | return 0; 146 | } 147 | 148 | int init_estimate(); 149 | int estimate(); 150 | int prior2beta(); 151 | int sampling(int m, int n, int& sentiLab, int& topic); 152 | 153 | // compute parameter functions 154 | void compute_pi_dl(); 155 | void compute_theta_dlz(); 156 | void compute_phi_lzw(); 157 | 158 | // update parameter functions 159 | void init_parameters(); 160 | int update_Parameters(); 161 | 162 | // save model parameter functions 163 | int save_model(string model_name); 164 | int save_model_tassign(string filename); 165 | int save_model_pi_dl(string filename); 166 | int save_model_theta_dlz(string filename); 167 | int save_model_phi_lzw(string filename); 168 | int save_model_others(string filename); 169 | int save_model_twords(string filename); 170 | }; 171 | 172 | #endif 173 | -------------------------------------------------------------------------------- /src/inference.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details.
21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | 30 | #ifndef _INFERENCE_H 31 | #define _INFERENCE_H 32 | 33 | #include <fstream> 34 | #include <iostream> 35 | #include <string> 36 | #include <vector> 37 | #include "constants.h" 38 | #include "document.h" 39 | #include "dataset.h" 40 | #include "utils.h" 41 | #include "strtokenizer.h" 42 | 43 | using namespace std; 44 | 45 | 46 | class Inference { 47 | 48 | public: 49 | Inference(void); 50 | ~Inference(void); 51 | 52 | int numSentiLabs; 53 | int numTopics; 54 | int numDocs; // for trained model 55 | int vocabSize; // for trained model 56 | int newNumDocs; // for test set 57 | int newVocabSize; // for test set 58 | 59 | vector<vector<vector<int> > > nlzw; // for trained model 60 | vector<vector<int> > nlz; // for trained model 61 | mapword2atr word2atr; 62 | mapword2id word2id; 63 | mapid2word id2word; 64 | map<int, int> id2_id; 65 | map<int, int> _id2id; 66 | mapword2prior sentiLex; // word => prior sentiment information 67 | vector<string> newWords; 68 | 69 | string model_dir; 70 | string model_name; 71 | string data_dir; 72 | string datasetFile; 73 | string result_dir; 74 | string sentiLexFile; 75 | string wordmapfile; 76 | string betaFile; 77 | 78 | string tassign_suffix; 79 | string pi_suffix; 80 | string theta_suffix; 81 | string phi_suffix; 82 | string others_suffix; 83 | string twords_suffix; 84 | 85 | dataset * pmodelData; // pointer to trained model object 86 | dataset * pnewData; // pointer to new/test dataset object 87 | utils * putils; 88 | 89 | int niters; 90 | int liter; 91 | int twords; 92 | int savestep; 93 | int updateParaStep; 94 | 95 | double _alpha; 96 | double _beta; 97 | double _gamma; 98 | 99 | vector<vector<double> > new_p; // for posterior 100 | vector<vector<int> > new_z; 101 | vector<vector<int> > new_l; 102 | vector<vector<int> > z; // for trained model 103 | vector<vector<int> > l; // for trained model 104 | 105 | 106 | // from NEW/test documents 107 | vector<int> new_nd; 108 | vector<vector<int> > new_ndl; 109 | vector<vector<vector<int> > > new_ndlz; 110 | vector<vector<vector<int> > > new_nlzw; 111 | vector<vector<int> > new_nlz; 112 | 113 | // hyperparameters 114 | vector<vector<double> > alpha_lz; // size: (L x T) 115 | vector<double> alphaSum_l; 116 | vector<vector<vector<double> > > beta_lzw; // size: (L x T x V) 117 | vector<vector<double> > betaSum_lz; 118 | vector<double> gamma_l; // size: (L) 119 | double gammaSum; 120 | vector<vector<double> > lambda_lw; // size: (L x V) -- for encoding prior sentiment information 121 | 122 | // model parameters 123 | vector<vector<double> > newpi_dl; // size: (numDocs x L) 124 | vector<vector<vector<double> > > newtheta_dlz; // size: (numDocs x L x T) 125 | vector<vector<vector<double> > > newphi_lzw; // size: (L x T x V) 126 | 127 | // functions 128 | int init(int argc, char ** argv); 129 | int init_inf(); 130 | int inference(); // inference for new (unseen) data based on previously trained model 131 | int inf_sampling(int m, int n, int& sentiLab, int& topic); 132 | int init_parameters(); 133 | 134 | int read_newData(string filename); 135 | int read_model_setting(string filename); 136 | int load_model(string model_name); 137 | int prior2beta(); // for incorporating prior information 138 | 139 | // compute model parameters 140 | void compute_newpi(); 141 | void compute_newtheta(); 142 | int compute_newphi(); 143 | 144 | // save new data models 145 | int save_model(string model_name); 146 | int save_model_newtassign(string filename); 147 | int save_model_newpi_dl(string filename); 148 | int save_model_newtheta_dlz(string filename); 149 | int save_model_newphi_lzw(string filename); 150 | int
save_model_newothers(string filename); 151 | int save_model_newtwords(string filename); 152 | }; 153 | 154 | #endif 155 | -------------------------------------------------------------------------------- /src/cokus.h: -------------------------------------------------------------------------------- 1 | // This is the Mersenne Twister random number generator MT19937, which 2 | // generates pseudorandom integers uniformly distributed in 0..(2^32 - 1) 3 | // starting from any odd seed in 0..(2^32 - 1). This version is a recode 4 | // by Shawn Cokus (Cokus@math.washington.edu) on March 8, 1998 of a version by 5 | // Takuji Nishimura (who had suggestions from Topher Cooper and Marc Rieffel in 6 | // July-August 1997). 7 | // 8 | // Effectiveness of the recoding (on Goedel2.math.washington.edu, a DEC Alpha 9 | // running OSF/1) using GCC -O3 as a compiler: before recoding: 51.6 sec. to 10 | // generate 300 million random numbers; after recoding: 24.0 sec. for the same 11 | // (i.e., 46.5% of original time), so speed is now about 12.5 million random 12 | // number generations per second on this machine. 13 | // 14 | // According to the URL <http://www.math.keio.ac.jp/~matumoto/emt.html> 15 | // (and paraphrasing a bit in places), the Mersenne Twister is ``designed 16 | // with consideration of the flaws of various existing generators,'' has 17 | // a period of 2^19937 - 1, gives a sequence that is 623-dimensionally 18 | // equidistributed, and ``has passed many stringent tests, including the 19 | // die-hard test of G. Marsaglia and the load test of P. Hellekalek and 20 | // S. Wegenkittl.'' It is efficient in memory usage (typically using 2506 21 | // to 5012 bytes of static data, depending on data type sizes, and the code 22 | // is quite short as well). It generates random numbers in batches of 624 23 | // at a time, so the caching and pipelining of modern systems is exploited. 24 | // It is also divide- and mod-free. 25 | // 26 | // This library is free software; you can redistribute it and/or modify it 27 | // under the terms of the GNU Library General Public License as published by 28 | // the Free Software Foundation (either version 2 of the License or, at your 29 | // option, any later version). This library is distributed in the hope that 30 | // it will be useful, but WITHOUT ANY WARRANTY, without even the implied 31 | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See 32 | // the GNU Library General Public License for more details. You should have 33 | // received a copy of the GNU Library General Public License along with this 34 | // library; if not, write to the Free Software Foundation, Inc., 59 Temple 35 | // Place, Suite 330, Boston, MA 02111-1307, USA. 36 | // 37 | // The code as Shawn received it included the following notice: 38 | // 39 | // Copyright (C) 1997 Makoto Matsumoto and Takuji Nishimura. When 40 | // you use this, send an e-mail to <matumoto@math.keio.ac.jp> with 41 | // an appropriate reference to your work. 42 | // 43 | // It would be nice to CC: <Cokus@math.washington.edu> when you write.
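// [Editor's note -- not part of the original source] A minimal usage sketch
// of this generator, assuming the header is included in exactly one
// translation unit (it defines functions and static state directly in the
// header):
//
//     #include "cokus.h"
//     #include <cstdio>
//     int main() {
//         seedMT(4357U);                         // any seed; forced odd internally
//         double u = randomMT() / 4294967296.0;  // uniform double in [0, 1); 2^32 = 4294967296
//         printf("%f\n", u);
//         return 0;
//     }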
44 | // 45 | 46 | //#include <stdio.h> 47 | //#include <stdlib.h> 48 | 49 | // 50 | // uint32 must be an unsigned integer type capable of holding at least 32 51 | // bits; exactly 32 should be fastest, but 64 is better on an Alpha with 52 | // GCC at -O3 optimization so try your options and see what's best for you 53 | // 54 | 55 | #ifndef _COKUS_H 56 | #define _COKUS_H 57 | 58 | 59 | typedef unsigned long uint32; 60 | 61 | #define N (624) // length of state vector 62 | #define M (397) // a period parameter 63 | #define K (0x9908B0DFU) // a magic constant 64 | #define hiBit(u) ((u) & 0x80000000U) // mask all but highest bit of u 65 | #define loBit(u) ((u) & 0x00000001U) // mask all but lowest bit of u 66 | #define loBits(u) ((u) & 0x7FFFFFFFU) // mask the highest bit of u 67 | #define mixBits(u, v) (hiBit(u)|loBits(v)) // move hi bit of u to hi bit of v 68 | 69 | static uint32 state[N+1]; // state vector + 1 extra to not violate ANSI C 70 | static uint32 *next; // next random value is computed from here 71 | static int left = -1; // can *next++ this many times before reloading 72 | 73 | 74 | void seedMT(uint32 seed) 75 | { 76 | // 77 | // We initialize state[0..(N-1)] via the generator 78 | // 79 | // x_new = (69069 * x_old) mod 2^32 80 | // 81 | // from Line 15 of Table 1, p. 106, Sec. 3.3.4 of Knuth's 82 | // _The Art of Computer Programming_, Volume 2, 3rd ed. 83 | // 84 | // Notes (SJC): I do not know what the initial state requirements 85 | // of the Mersenne Twister are, but it seems this seeding generator 86 | // could be better. It achieves the maximum period for its modulus 87 | // (2^30) iff x_initial is odd (p. 20-21, Sec. 3.2.1.2, Knuth); if 88 | // x_initial can be even, you have sequences like 0, 0, 0, ...; 89 | // 2^31, 2^31, 2^31, ...; 2^30, 2^30, 2^30, ...; 2^29, 2^29 + 2^31, 90 | // 2^29, 2^29 + 2^31, ..., etc. so I force seed to be odd below. 91 | // 92 | // Even if x_initial is odd, if x_initial is 1 mod 4 then 93 | // 94 | // the lowest bit of x is always 1, 95 | // the next-to-lowest bit of x is always 0, 96 | // the 2nd-from-lowest bit of x alternates ... 0 1 0 1 0 1 0 1 ... , 97 | // the 3rd-from-lowest bit of x 4-cycles ... 0 1 1 0 0 1 1 0 ... , 98 | // the 4th-from-lowest bit of x has the 8-cycle ... 0 0 0 1 1 1 1 0 ... , 99 | // ... 100 | // 101 | // and if x_initial is 3 mod 4 then 102 | // 103 | // the lowest bit of x is always 1, 104 | // the next-to-lowest bit of x is always 1, 105 | // the 2nd-from-lowest bit of x alternates ... 0 1 0 1 0 1 0 1 ... , 106 | // the 3rd-from-lowest bit of x 4-cycles ... 0 0 1 1 0 0 1 1 ... , 107 | // the 4th-from-lowest bit of x has the 8-cycle ... 0 0 1 1 1 1 0 0 ... , 108 | // ... 109 | // 110 | // The generator's potency (min. s>=0 with (69069-1)^s = 0 mod 2^32) is 111 | // 16, which seems to be alright by p. 25, Sec. 3.2.1.3 of Knuth. It 112 | // also does well in the dimension 2..5 spectral tests, but it could be 113 | // better in dimension 6 (Line 15, Table 1, p. 106, Sec. 3.3.4, Knuth). 114 | // 115 | // Note that the random number user does not see the values generated 116 | // here directly since reloadMT() will always munge them first, so maybe 117 | // none of all of this matters. In fact, the seed values made here could 118 | // even be extra-special desirable if the Mersenne Twister theory says 119 | // so-- that's why the only change I made is to restrict to odd seeds.
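// [Editor's note -- not part of the original source] A small worked example
// of the seeding recurrence above, assuming seed = 4357: x_0 = (4357 | 1) = 4357
// (already odd), x_1 = (69069 * 4357) mod 2^32 = 300933633,
// x_2 = (69069 * 300933633) mod 2^32, and so on; the "mod 2^32" step is the
// "& 0xFFFFFFFFU" mask in the loop below.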
120 | // 121 | 122 | register uint32 x = (seed | 1U) & 0xFFFFFFFFU, *s = state; 123 | register int j; 124 | 125 | for(left=0, *s++=x, j=N; --j; 126 | *s++ = (x*=69069U) & 0xFFFFFFFFU); 127 | } 128 | 129 | 130 | uint32 reloadMT(void) 131 | { 132 | register uint32 *p0=state, *p2=state+2, *pM=state+M, s0, s1; 133 | register int j; 134 | 135 | if(left < -1) 136 | seedMT(4357U); 137 | 138 | left=N-1, next=state+1; 139 | 140 | for(s0=state[0], s1=state[1], j=N-M+1; --j; s0=s1, s1=*p2++) 141 | *p0++ = *pM++ ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U); 142 | 143 | for(pM=state, j=M; --j; s0=s1, s1=*p2++) 144 | *p0++ = *pM++ ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U); 145 | 146 | s1=state[0], *p0 = *pM ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U); 147 | s1 ^= (s1 >> 11); 148 | s1 ^= (s1 << 7) & 0x9D2C5680U; 149 | s1 ^= (s1 << 15) & 0xEFC60000U; 150 | return(s1 ^ (s1 >> 18)); 151 | } 152 | 153 | 154 | uint32 randomMT(void) 155 | { 156 | uint32 y; 157 | 158 | if(--left < 0) 159 | return(reloadMT()); 160 | 161 | y = *next++; 162 | y ^= (y >> 11); 163 | y ^= (y << 7) & 0x9D2C5680U; 164 | y ^= (y << 15) & 0xEFC60000U; 165 | y ^= (y >> 18); 166 | return(y); 167 | } 168 | 169 | #endif 170 | 171 | 172 | -------------------------------------------------------------------------------- /src/utils.cpp: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | #include <stdio.h> 30 | #include <stdlib.h> 31 | #include <string> 32 | #include <iostream> 33 | #include <fstream> 34 | #include "strtokenizer.h" 35 | #include "utils.h" 36 | #include "model.h" 37 | #include "inference.h" 38 | #include "dataset.h" 39 | #include <sys/types.h> 40 | #include <sys/stat.h> 41 | 42 | using namespace std; 43 | 44 | #undef WINDOWS 45 | #ifdef _WIN32 46 | #define WINDOWS 47 | #endif 48 | #ifdef __WIN32__ 49 | #define WINDOWS 50 | #endif 51 | 52 | #ifdef WINDOWS 53 | #include <direct.h> // For _mkdir(). 54 | #include <io.h> // For access(). 55 | #else 56 | #include <unistd.h> // For access().
57 | #endif 58 | 59 | 60 | utils::utils() { 61 | model_status = MODEL_STATUS_UNKNOWN; 62 | model_dir = ""; 63 | data_dir = ""; 64 | result_dir = ""; 65 | model_name = ""; 66 | configfile = ""; 67 | 68 | wordmapfile = ""; 69 | sentiLexFile = ""; 70 | datasetFile = ""; 71 | configfile = ""; 72 | numSentiLabs = 0; 73 | numTopics = 0; 74 | niters = 0; 75 | savestep = 0; 76 | twords = 0; 77 | updateParaStep = -1; 78 | 79 | alpha = -1.0; 80 | beta = -1.0; 81 | gamma = -1.0; 82 | } 83 | 84 | 85 | int utils::parse_args(int argc, char ** argv, int& model_status) { 86 | int i = 1; 87 | while (i < argc) { 88 | string arg = argv[i]; 89 | if (arg == "-est") { 90 | model_status = MODEL_STATUS_EST; 91 | break; 92 | } 93 | else if (arg == "-estc") { 94 | model_status = MODEL_STATUS_ESTC; 95 | break; 96 | } 97 | else if (arg == "-inf") { 98 | model_status = MODEL_STATUS_INF; 99 | break; 100 | } 101 | i++; 102 | } 103 | 104 | this->model_status = model_status; 105 | cout << "model_status = " << this->model_status<< endl; 106 | return (model_status); 107 | } 108 | 109 | 110 | 111 | int utils::parse_args_est(int argc, char ** argv, model * pmodel) { 112 | 113 | int i = 1; 114 | while (i < argc) { 115 | string arg = argv[i]; 116 | if (arg == "-config") { 117 | configfile = argv[++i]; 118 | break; 119 | } 120 | i++; 121 | } 122 | 123 | if (configfile != "") { 124 | if (read_config_file(configfile)) { 125 | return 1; 126 | } 127 | } 128 | 129 | if (wordmapfile != "") 130 | pmodel->wordmapfile = wordmapfile; 131 | 132 | if (sentiLexFile != "") 133 | pmodel->sentiLexFile = sentiLexFile; 134 | 135 | if (datasetFile != "") { 136 | pmodel->datasetFile = datasetFile; 137 | } 138 | 139 | if (numSentiLabs > 0) pmodel->numSentiLabs = numSentiLabs; 140 | if (numTopics > 0) pmodel->numTopics = numTopics; 141 | if (niters > 0) pmodel->niters = niters; 142 | if (savestep > 0) pmodel->savestep = savestep; 143 | if (twords > 0) pmodel->twords = twords; 144 | pmodel->updateParaStep = updateParaStep; // -1: no parameter optimization 145 | 146 | if (alpha > 0.0) pmodel->_alpha = alpha; 147 | if (beta > 0.0) pmodel->_beta = beta; 148 | if (gamma > 0.0) pmodel->_gamma = gamma; 149 | 150 | if (data_dir != "") { 151 | if (data_dir[data_dir.size() - 1] != '/') { 152 | data_dir += "/"; 153 | } 154 | pmodel->data_dir = data_dir; 155 | } 156 | else { 157 | printf("Please specify input data dir!\n"); 158 | return 1; 159 | } 160 | 161 | if (result_dir != "") { 162 | if (make_dir(result_dir)) return 1; 163 | if (result_dir[result_dir.size() - 1] != '/') { 164 | result_dir += "/"; 165 | } 166 | pmodel->result_dir = result_dir; 167 | } 168 | else { 169 | printf("Please specify output dir!\n"); 170 | return 1; 171 | } 172 | 173 | return 0; 174 | } 175 | 176 | 177 | int utils::parse_args_inf(int argc, char ** argv, Inference * pmodel_inf) { 178 | 179 | int i = 1; 180 | while (i < argc) { 181 | string arg = argv[i]; 182 | printf("arg=%s\n", arg.c_str()); 183 | if (arg == "-config") { 184 | configfile = argv[++i]; 185 | break; 186 | } 187 | i++; 188 | } 189 | if (configfile != "") { 190 | if (read_config_file(configfile)) return 1; 191 | } 192 | 193 | if (wordmapfile != "") 194 | pmodel_inf->wordmapfile = wordmapfile; 195 | 196 | if (sentiLexFile != "") 197 | pmodel_inf->sentiLexFile = sentiLexFile; 198 | 199 | if (datasetFile != "") 200 | pmodel_inf->datasetFile = datasetFile; 201 | else { 202 | printf("Please specify input dataset file!\n"); 203 | return 1; 204 | } 205 | 206 | if (model_dir != "") { 207 | if (model_dir[model_dir.size() - 1] != 
'/') model_dir += "/"; 208 | pmodel_inf->model_dir = model_dir; 209 | } 210 | 211 | if (data_dir != "") { 212 | if (data_dir[data_dir.size() - 1] != '/') data_dir += "/"; 213 | pmodel_inf->data_dir = data_dir; 214 | } 215 | else { 216 | printf("Please specify input data dir!\n"); 217 | return 1; 218 | } 219 | 220 | if (result_dir != "") { 221 | if (make_dir(result_dir)) return 1; 222 | if (result_dir[result_dir.size() - 1] != '/') result_dir += "/"; 223 | pmodel_inf->result_dir = result_dir; 224 | } 225 | else { 226 | printf("Please specify output dir!\n"); 227 | return 1; 228 | } 229 | 230 | if (model_name != "") 231 | pmodel_inf->model_name = model_name; 232 | else { 233 | printf("Please specify the trained dJST model name!\n"); 234 | return 1; 235 | } 236 | 237 | if (niters > 0) pmodel_inf->niters = niters; 238 | 239 | 240 | if (twords > 0) pmodel_inf->twords = twords; 241 | if (savestep > 0) pmodel_inf->savestep = savestep; 242 | if (updateParaStep > 0) pmodel_inf->updateParaStep = updateParaStep; 243 | if (alpha > 0.0) pmodel_inf->_alpha = alpha; 244 | if (beta > 0.0) pmodel_inf->_beta = beta; 245 | if (gamma > 0.0) pmodel_inf->_gamma = gamma; 246 | 247 | return 0; 248 | } 249 | 250 | 251 | int utils::read_config_file(string filename) { 252 | 253 | char buff[BUFF_SIZE_SHORT]; 254 | string line; 255 | 256 | FILE * fin = fopen(filename.c_str(), "r"); 257 | if (!fin) { 258 | printf("Cannot read file %s\n", filename.c_str()); 259 | return 1; 260 | } 261 | 262 | while (fgets(buff, BUFF_SIZE_SHORT - 1, fin)) { 263 | line = buff; 264 | strtokenizer strtok(line, "= \t\r\n"); 265 | int count = strtok.count_tokens(); 266 | 267 | // line invalid, ignore 268 | if (count != 2) { 269 | continue; 270 | } 271 | 272 | string optstr = strtok.token(0); 273 | string optval = strtok.token(1); 274 | 275 | if(optstr == "nsentiLabs") 276 | numSentiLabs = atoi(optval.c_str()); 277 | else if(optstr == "ntopics") 278 | numTopics = atoi(optval.c_str()); 279 | else if(optstr == "niters") 280 | niters = atoi(optval.c_str()); 281 | else if(optstr == "savestep") 282 | savestep = atoi(optval.c_str()); 283 | else if (optstr == "updateParaStep") 284 | updateParaStep = atoi(optval.c_str()); 285 | else if(optstr == "twords") 286 | twords = atoi(optval.c_str()); 287 | else if(optstr == "data_dir") 288 | data_dir = optval; 289 | else if (optstr == "model_dir") 290 | model_dir = optval; 291 | else if(optstr == "result_dir") 292 | result_dir = optval; 293 | else if(optstr == "datasetFile") 294 | datasetFile = optval; 295 | else if(optstr == "sentiFile") 296 | sentiLexFile = optval; 297 | else if (optstr == "vocabFile") 298 | wordmapfile = optval; 299 | else if (optstr == "alpha") 300 | alpha = atof(optval.c_str()); 301 | else if (optstr == "beta") 302 | beta = atof(optval.c_str()); 303 | else if (optstr == "gamma") 304 | gamma = atof(optval.c_str()); 305 | else if (optstr == "model") 306 | model_name = optval; 307 | } 308 | 309 | fclose(fin); 310 | 311 | return 0; 312 | } 313 | 314 | 315 | string utils::generate_model_name(int iter) { 316 | 317 | string model_name; 318 | std::stringstream out; 319 | char buff[BUFF_SIZE_SHORT]; 320 | 321 | sprintf(buff, "%05d", iter); 322 | 323 | if (iter >= 0) 324 | model_name = buff; 325 | else 326 | model_name = "final"; 327 | 328 | return model_name; 329 | } 330 | 331 | 332 | #ifdef WINDOWS 333 | int utils::make_dir(string strPath) { 334 | if(_access(strPath.c_str(), 0) == 0) 335 | return 0; 336 | else if(_mkdir(strPath.c_str()) == 0) 337 | return 0; 338 | else { 339 | printf("Throw 
exception in creating directory %s !\n",strPath.c_str());
340 | return 1;
341 | }
342 | }
343 | #else
344 | int utils::make_dir(string strPath) {
345 | if(access(strPath.c_str(), 0) == 0)
346 | return 0;
347 | else if(mkdir(strPath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH) == 0)
348 | return 0;
349 | else {
350 | cout << "Throw exception in creating directory " << strPath << " !" << endl;
351 | return 1;
352 | }
353 | }
354 | #endif
355 |
--------------------------------------------------------------------------------
/src/dataset.cpp:
--------------------------------------------------------------------------------
1 | /**********************************************************************
2 | Joint Sentiment-Topic (JST) Model
3 | ***********************************************************************
4 |
5 | (C) Copyright 2013, Chenghua Lin and Yulan He
6 |
7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk.
8 | Part of code is from http://gibbslda.sourceforge.net/.
9 |
10 | This file is part of JST implementation.
11 |
12 | JST is free software; you can redistribute it and/or modify it under
13 | the terms of the GNU General Public License as published by the Free
14 | Software Foundation; either version 2 of the License, or (at your
15 | option) any later version.
16 |
17 | JST is distributed in the hope that it will be useful, but WITHOUT
18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
20 | for more details.
21 |
22 | You should have received a copy of the GNU General Public License
23 | along with this program; if not, write to the Free Software
24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
25 | USA
26 |
27 | ***********************************************************************/
28 |
29 |
30 | #include "dataset.h"
31 | #include "document.h"
32 | #include "strtokenizer.h"
33 | #include "map_type.h"
34 | #include <stdio.h>
35 | #include <stdlib.h>
36 | #include <string>
37 | #include <vector>
38 | #include <fstream>
39 | using namespace std;
40 |
41 |
42 | dataset::dataset() {
43 | pdocs = NULL;
44 | _pdocs = NULL;
45 | word2atr.clear();
46 | result_dir = ".";
47 | wordmapfile = "wordmap.txt";
48 |
49 | numDocs = 0;
50 | aveDocLength = 0;
51 | vocabSize = 0;
52 | corpusSize = 0;
53 | }
54 |
55 | dataset::dataset(string result_dir) {
56 | pdocs = NULL;
57 | _pdocs = NULL;
58 | word2atr.clear();
59 | this->result_dir = result_dir;
60 | wordmapfile = "wordmap.txt";
61 |
62 | numDocs = 0;
63 | aveDocLength = 0;
64 | vocabSize = 0;
65 | corpusSize = 0;
66 | }
67 |
68 |
69 | dataset::~dataset(void) {
70 | deallocate();
71 | }
72 |
73 |
74 | int dataset::read_dataStream(ifstream& fin) {
75 | string line;
76 | char buff[BUFF_SIZE_LONG];
77 | docs.clear();
78 | numDocs = 0;
79 |
80 | while (fin.getline(buff, BUFF_SIZE_LONG)) {
81 | line = buff;
82 | if(!line.empty()) {
83 | docs.push_back(line);
84 | numDocs++;
85 | }
86 | }
87 |
88 | if (numDocs > 0) {
89 | this->analyzeCorpus(docs);
90 | }
91 |
92 | return 0;
93 | }
94 |
95 |
96 | int dataset::analyzeCorpus(vector<string>& docs) {
97 |
98 | mapword2atr::iterator it;
99 | mapword2id::iterator vocabIt;
100 | mapword2prior::iterator sentiIt;
101 | map<int, int>::iterator idIt;
102 |
103 | string line;
104 | numDocs = docs.size();
105 | vocabSize = 0;
106 | corpusSize = 0;
107 | aveDocLength = 0;
108 |
109 | // allocate memory for corpus
110 | if (pdocs) {
111 | deallocate();
112 | pdocs = new document*[numDocs];
113 | }
114 | else {
115 | pdocs = new document*[numDocs];
116 | }
117 |
118 | for (int i = 0; i < (int)docs.size(); ++i) {
119 | line = docs.at(i);
120 | strtokenizer strtok(line, " \t\r\n"); // \t\r\n are the separators
121 | int docLength = strtok.count_tokens();
122 |
123 | if (docLength <= 0) {
124 | printf("Invalid (empty) document!\n");
125 | deallocate();
126 | numDocs = vocabSize = 0;
127 | return 1;
128 | }
129 |
130 | corpusSize += docLength - 1; // the first word is document name/id
131 |
132 | // allocate memory for the new document_i
133 | document * pdoc = new document(docLength-1);
134 | pdoc->docID = strtok.token(0).c_str();
135 |
136 | // generate ID for the tokens in the corpus, and assign each word token the corresponding vocabulary ID.
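// Each first-time token is assigned the next free vocabulary index,
// word2atr.size(), together with its prior polarity from the sentiment
// lexicon (or -1 if the word is not in the lexicon); a token seen before
// simply reuses the ID and prior polarity already stored in word2atr.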
137 | for (int k = 0; k < docLength-1; k++) {
138 | int priorSenti = -1;
139 | it = word2atr.find(strtok.token(k+1).c_str());
140 |
141 | if (it == word2atr.end()) { // i.e., new word
142 | pdoc->words[k] = word2atr.size();
143 | sentiIt = sentiLex.find(strtok.token(k+1).c_str()); // check whether the word token can be found in the sentiment lexicon
144 | // incorporate sentiment lexicon
145 | if (sentiIt != sentiLex.end()) {
146 | priorSenti = sentiIt->second.id;
147 | }
148 |
149 | // insert sentiment info into word2atr
150 | Word_atr temp = {word2atr.size(), priorSenti}; // vocabulary index; word polarity
151 | word2atr.insert(pair<string, Word_atr>(strtok.token(k+1), temp));
152 | pdoc->priorSentiLabels[k] = priorSenti;
153 |
154 | }
155 | else { // word seen before
156 | pdoc->words[k] = it->second.id;
157 | pdoc->priorSentiLabels[k] = it->second.polarity;
158 | }
159 | }
160 |
161 | add_doc(pdoc, i);
162 | }
163 |
164 |
165 | // update number of words
166 | vocabSize = word2atr.size();
167 | aveDocLength = corpusSize/numDocs;
168 |
169 | if (write_wordmap(result_dir + wordmapfile, word2atr)) {
170 | printf("ERROR! Cannot write wordmap file %s!\n", wordmapfile.c_str());
171 | return 1;
172 | }
173 | if (read_wordmap(result_dir + wordmapfile, id2word)) {
174 | printf("ERROR! Cannot read wordmap file %s!\n", wordmapfile.c_str());
175 | return 1;
176 | }
177 |
178 | docs.clear();
179 | return 0;
180 | }
181 |
182 |
183 |
184 | void dataset::deallocate()
185 | {
186 | if (pdocs) {
187 | for (int i = 0; i < numDocs; i++)
188 | delete pdocs[i];
189 | delete [] pdocs;
190 | pdocs = NULL;
191 | }
192 |
193 | if (_pdocs) {
194 | for (int i = 0; i < numDocs; i++)
195 | delete _pdocs[i];
196 | delete [] _pdocs;
197 | _pdocs = NULL;
198 | }
199 | }
200 |
201 |
202 | void dataset::add_doc(document * doc, int idx) {
203 | if (0 <= idx && idx < numDocs)
204 | pdocs[idx] = doc;
205 | }
206 |
207 | void dataset::_add_doc(document * doc, int idx) {
208 | if (0 <= idx && idx < numDocs) {
209 | _pdocs[idx] = doc;
210 | }
211 | }
212 |
213 |
214 | int dataset::read_senti_lexicon(string sentiLexiconFile) {
215 | sentiLex.clear();
216 | char buff[BUFF_SIZE_SHORT];
217 | string line;
218 | vector<double> wordPrior;
219 | int labID;
220 | double tmp, val;
221 | int numSentiLabs;
222 |
223 | FILE * fin = fopen(sentiLexiconFile.c_str(), "r");
224 | if (!fin) {
225 | printf("Cannot read file %s!\n", sentiLexiconFile.c_str());
226 | return 1;
227 | }
228 |
229 | while (fgets(buff, BUFF_SIZE_SHORT - 1, fin) != NULL) {
230 | line = buff;
231 | strtokenizer strtok(line, " \t\r\n");
232 |
233 | if (strtok.count_tokens() < 2) {
234 | printf("Warning! The strtok count in the lexicon line [%s] is smaller than 2!\n", line.c_str());
235 | }
236 | else {
237 | tmp = 0.0;
238 | labID = 0;
239 | wordPrior.clear();
240 | numSentiLabs = strtok.count_tokens();
241 | for (int k = 1; k < strtok.count_tokens(); k++) {
242 | val = atof(strtok.token(k).c_str());
243 | if (tmp < val) {
244 | tmp = val;
245 | labID = k-1;
246 | }
247 | wordPrior.push_back(val);
248 | }
249 | Word_Prior_Attr temp = {labID, wordPrior}; // sentiment label ID, sentiment label distribution
250 | sentiLex.insert(pair<string, Word_Prior_Attr>(strtok.token(0), temp));
251 | }
252 | }
253 |
254 | if (sentiLex.size() <= 0) {
255 | printf("Cannot find any sentiment lexicon in file %s!\n", sentiLexiconFile.c_str());
256 | return 1;
257 | }
258 |
259 | fclose(fin);
260 | return 0;
261 | }
262 |
263 |
264 | int dataset::write_wordmap(string wordmapfile, mapword2atr &pword2atr) {
265 |
266 | FILE * fout = fopen(wordmapfile.c_str(), "w");
267 | if (!fout) {
268 | printf("Cannot open file %s to write!\n", wordmapfile.c_str());
269 | return 1;
270 | }
271 |
272 | mapword2atr::iterator it;
273 | fprintf(fout, "%d\n", (int)(pword2atr.size()));
274 | for (it = pword2atr.begin(); it != pword2atr.end(); it++) {
275 | fprintf(fout, "%s %d\n", (it->first).c_str(), it->second.id);
276 | }
277 |
278 | fclose(fout);
279 | return 0;
280 | }
281 |
282 |
283 | int dataset::read_wordmap(string wordmapfile, mapid2word &pid2word) {
284 | pid2word.clear();
285 |
286 | FILE * fin = fopen(wordmapfile.c_str(), "r");
287 | if (!fin) {
288 | printf("Cannot open file %s to read!\n", wordmapfile.c_str());
289 | return 1;
290 | }
291 |
292 | char buff[BUFF_SIZE_SHORT];
293 | string line;
294 |
295 | fgets(buff, BUFF_SIZE_SHORT - 1, fin);
296 | int nwords = atoi(buff);
297 |
298 | for (int i = 0; i < nwords; i++) {
299 | fgets(buff, BUFF_SIZE_SHORT - 1, fin);
300 | line = buff;
301 | strtokenizer strtok(line, " \t\r\n");
302 | if (strtok.count_tokens() != 2) {
303 | printf("Warning! Line %d in %s contains fewer than 2 tokens!\n", i+1, wordmapfile.c_str());
304 | continue;
305 | }
306 |
307 | pid2word.insert(pair<int, string>(atoi(strtok.token(1).c_str()), strtok.token(0)));
308 | }
309 |
310 | fclose(fin);
311 | return 0;
312 | }
313 |
314 |
315 | int dataset::read_wordmap(string wordmapfile, mapword2id& pword2id) {
316 | pword2id.clear();
317 | char buff[BUFF_SIZE_SHORT];
318 | string line;
319 |
320 |
321 | FILE * fin = fopen(wordmapfile.c_str(), "r");
322 | if (!fin) {
323 | printf("Cannot read file %s!\n", wordmapfile.c_str());
324 | return 1;
325 | }
326 |
327 | fgets(buff, BUFF_SIZE_SHORT - 1, fin);
328 | int nwords = atoi(buff);
329 |
330 | for (int i = 0; i < nwords; i++) {
331 | fgets(buff, BUFF_SIZE_SHORT - 1, fin);
332 | line = buff;
333 | strtokenizer strtok(line, " \t\r\n");
334 | if (strtok.count_tokens() != 2) {
335 | continue;
336 | }
337 | pword2id.insert(pair<string, int>(strtok.token(0), atoi(strtok.token(1).c_str())));
338 | }
339 |
340 | fclose(fin);
341 | return 0;
342 | }
343 |
--------------------------------------------------------------------------------
/src/model.cpp:
--------------------------------------------------------------------------------
1 | /**********************************************************************
2 | Joint Sentiment-Topic (JST) Model
3 | ***********************************************************************
4 |
5 | (C) Copyright 2013, Chenghua Lin and Yulan He
6 |
7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk.
8 | Part of code is from http://gibbslda.sourceforge.net/.
9 |
10 | This file is part of JST implementation.
11 |
12 | JST is free software; you can redistribute it and/or modify it under
13 | the terms of the GNU General Public License as published by the Free
14 | Software Foundation; either version 2 of the License, or (at your
15 | option) any later version.
16 |
17 | JST is distributed in the hope that it will be useful, but WITHOUT
18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
20 | for more details.
21 |
22 | You should have received a copy of the GNU General Public License
23 | along with this program; if not, write to the Free Software
24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
25 | USA
26 |
27 | ***********************************************************************/
28 |
29 |
30 | #include "model.h"
31 | using namespace std;
32 |
33 |
34 | model::model(void) {
35 |
36 | wordmapfile = "wordmap.txt";
37 | tassign_suffix = ".tassign";
38 | pi_suffix = ".pi";
39 | theta_suffix = ".theta";
40 | phi_suffix = ".phi";
41 | others_suffix = ".others";
42 | twords_suffix = ".twords";
43 |
44 | numTopics = 50;
45 | numSentiLabs = 3;
46 | vocabSize = 0;
47 | numDocs = 0;
48 | corpusSize = 0;
49 | aveDocLength = 0;
50 |
51 | niters = 1000;
52 | liter = 0;
53 | savestep = 200;
54 | twords = 20;
55 | updateParaStep = 40;
56 |
57 | _alpha = -1.0;
58 | _beta = -1.0;
59 | _gamma = -1.0;
60 |
61 | putils = new utils();
62 | }
63 |
64 |
65 | model::~model(void) {
66 | if (putils) delete putils;
67 | }
68 |
69 |
70 | int model::init(int argc, char ** argv) {
71 |
72 | if (putils->parse_args_est(argc, argv, this)) {
73 | return 1;
74 | }
75 |
76 | cout << "data_dir = " << data_dir << endl;
77 | cout << "datasetFile = " << datasetFile << endl;
78 | cout << "result_dir = " << result_dir << endl;
79 | cout << "sentiLexFile = " << sentiLexFile << endl;
80 | cout << "wordmapfile = " << wordmapfile << endl;
81 | cout << "numSentiLabs = " << numSentiLabs << endl;
82 | cout << "numTopics = " << numTopics << endl;
83 | cout << "niters = " << niters << endl;
84 | cout << "savestep = " << savestep << endl;
85 | cout << "twords = " << twords << endl;
86 | cout << "updateParaStep = " << updateParaStep << endl;
87 | cout << "alpha = " << _alpha << endl;
88 | cout << "beta = " << _beta << endl;
89 | cout << "gamma = " << _gamma << endl;
90 |
91 | return 0;
92 | }
93 |
94 |
95 | int model::excute_model() {
96 |
97 | ifstream fin;
98 | pdataset = new dataset(result_dir);
99 |
100 | // read sentiment lexicon file
101 | if (sentiLexFile != "") {
102 | if (pdataset->read_senti_lexicon((sentiLexFile).c_str())) {
103 | printf("Error! Cannot read sentiFile %s!\n", (sentiLexFile).c_str());
104 | delete pdataset;
105 | return 1;
106 | }
107 | this->sentiLex = pdataset->sentiLex;
108 | }
109 |
110 | // read training data
111 | fin.open((data_dir+datasetFile).c_str(), ifstream::in);
112 | if(!fin) {
113 | printf("Error! Cannot read dataset %s!\n", (data_dir+datasetFile).c_str());
114 | return 1;
115 | }
116 |
117 | if(pdataset->read_dataStream(fin)) {
118 | printf("Throw exception in function read_dataStream()! 
\n"); 119 | delete pdataset; 120 | return 1; 121 | } 122 | 123 | word2atr = pdataset->word2atr; 124 | id2word = pdataset->id2word; 125 | init_model_parameters(); 126 | if (init_estimate()) return 1; 127 | if(estimate()) return 1; 128 | delete_model_parameters(); 129 | fin.close(); 130 | 131 | return 0; 132 | } 133 | 134 | 135 | int model::init_model_parameters() 136 | { 137 | numDocs = pdataset->numDocs; 138 | vocabSize = pdataset->vocabSize; 139 | corpusSize = pdataset->corpusSize; 140 | aveDocLength = pdataset->aveDocLength; 141 | 142 | // model counts 143 | nd.resize(numDocs); 144 | for (int m = 0; m < numDocs; m++) { 145 | nd[m] = 0; 146 | } 147 | 148 | ndl.resize(numDocs); 149 | for (int m = 0; m < numDocs; m++) { 150 | ndl[m].resize(numSentiLabs); 151 | for (int l = 0; l < numSentiLabs; l++) 152 | ndl[m][l] = 0; 153 | } 154 | 155 | ndlz.resize(numDocs); 156 | for (int m = 0; m < numDocs; m++) { 157 | ndlz[m].resize(numSentiLabs); 158 | for (int l = 0; l < numSentiLabs; l++) { 159 | ndlz[m][l].resize(numTopics); 160 | for (int z = 0; z < numTopics; z++) 161 | ndlz[m][l][z] = 0; 162 | } 163 | } 164 | 165 | nlzw.resize(numSentiLabs); 166 | for (int l = 0; l < numSentiLabs; l++) { 167 | nlzw[l].resize(numTopics); 168 | for (int z = 0; z < numTopics; z++) { 169 | nlzw[l][z].resize(vocabSize); 170 | for (int r = 0; r < vocabSize; r++) 171 | nlzw[l][z][r] = 0; 172 | } 173 | } 174 | 175 | nlz.resize(numSentiLabs); 176 | for (int l = 0; l < numSentiLabs; l++) { 177 | nlz[l].resize(numTopics); 178 | for (int z = 0; z < numTopics; z++) { 179 | nlz[l][z] = 0; 180 | } 181 | } 182 | 183 | // posterior P 184 | p.resize(numSentiLabs); 185 | for (int l = 0; l < numSentiLabs; l++) { 186 | p[l].resize(numTopics); 187 | } 188 | 189 | // model parameters 190 | pi_dl.resize(numDocs); 191 | for (int m = 0; m < numDocs; m++) { 192 | pi_dl[m].resize(numSentiLabs); 193 | } 194 | 195 | theta_dlz.resize(numDocs); 196 | for (int m = 0; m < numDocs; m++) { 197 | theta_dlz[m].resize(numSentiLabs); 198 | for (int l = 0; l < numSentiLabs; l++) { 199 | theta_dlz[m][l].resize(numTopics); 200 | } 201 | } 202 | 203 | phi_lzw.resize(numSentiLabs); 204 | for (int l = 0; l < numSentiLabs; l++) { 205 | phi_lzw[l].resize(numTopics); 206 | for (int z = 0; z < numTopics; z++) { 207 | phi_lzw[l][z].resize(vocabSize); 208 | } 209 | } 210 | 211 | // init hyperparameters 212 | alpha_lz.resize(numSentiLabs); 213 | for (int l = 0; l < numSentiLabs; l++) { 214 | alpha_lz[l].resize(numTopics); 215 | } 216 | 217 | alphaSum_l.resize(numSentiLabs); 218 | 219 | if (_alpha <= 0) { 220 | _alpha = (double)aveDocLength * 0.05 / (double)(numSentiLabs * numTopics); 221 | } 222 | 223 | for (int l = 0; l < numSentiLabs; l++) { 224 | alphaSum_l[l] = 0.0; 225 | for (int z = 0; z < numTopics; z++) { 226 | alpha_lz[l][z] = _alpha; 227 | alphaSum_l[l] += alpha_lz[l][z]; 228 | } 229 | } 230 | 231 | opt_alpha_lz.resize(numSentiLabs); 232 | for (int l = 0; l < numSentiLabs; l++) { 233 | opt_alpha_lz[l].resize(numTopics); 234 | } 235 | 236 | //beta 237 | if (_beta <= 0) _beta = 0.01; 238 | 239 | beta_lzw.resize(numSentiLabs); 240 | betaSum_lz.resize(numSentiLabs); 241 | for (int l = 0; l < numSentiLabs; l++) { 242 | beta_lzw[l].resize(numTopics); 243 | betaSum_lz[l].resize(numTopics); 244 | for (int z = 0; z < numTopics; z++) { 245 | betaSum_lz[l][z] = 0.0; 246 | beta_lzw[l][z].resize(vocabSize); 247 | for (int r = 0; r < vocabSize; r++) { 248 | beta_lzw[l][z][r] = _beta; 249 | } 250 | } 251 | } 252 | 253 | // word prior transformation matrix 254 | 
lambda_lw.resize(numSentiLabs); 255 | for (int l = 0; l < numSentiLabs; l++) { 256 | lambda_lw[l].resize(vocabSize); 257 | for (int r = 0; r < vocabSize; r++) { 258 | lambda_lw[l][r] = 1; 259 | } 260 | } 261 | 262 | // incorporate prior information into beta 263 | this->prior2beta(); 264 | this->set_gamma(); 265 | 266 | return 0; 267 | } 268 | 269 | 270 | int model::set_gamma() { 271 | 272 | mapname2labs::iterator it; 273 | 274 | if (_gamma <= 0 ) { 275 | _gamma = (double)aveDocLength * 0.05 / (double)numSentiLabs; 276 | } 277 | 278 | gamma_dl.resize(numDocs); 279 | gammaSum_d.resize(numDocs); 280 | 281 | for (int d = 0; d < numDocs; d++) { 282 | gamma_dl[d].resize(numSentiLabs); 283 | gammaSum_d[d] = 0.0; 284 | for (int l = 0; l < numSentiLabs; l++) { 285 | gamma_dl[d][l] = _gamma; 286 | gammaSum_d[d] += gamma_dl[d][l]; 287 | } 288 | } 289 | 290 | return 0; 291 | } 292 | 293 | 294 | int model::prior2beta() { 295 | 296 | mapword2atr::iterator wordIt; 297 | mapword2prior::iterator sentiIt; 298 | 299 | for (sentiIt = sentiLex.begin(); sentiIt != sentiLex.end(); sentiIt++) { 300 | wordIt = word2atr.find(sentiIt->first); 301 | if (wordIt != word2atr.end()) { 302 | for (int j = 0; j < numSentiLabs; j++) { 303 | lambda_lw[j][wordIt->second.id] = sentiIt->second.labDist[j]; 304 | } 305 | } 306 | } 307 | 308 | for (int l = 0; l < numSentiLabs; l++) { 309 | for (int z = 0; z < numTopics; z++) { 310 | betaSum_lz[l][z] = 0.0; 311 | for (int r = 0; r < vocabSize; r++) { 312 | beta_lzw[l][z][r] = beta_lzw[l][z][r] * lambda_lw[l][r]; 313 | betaSum_lz[l][z] += beta_lzw[l][z][r]; 314 | } 315 | } 316 | } 317 | 318 | return 0; 319 | } 320 | 321 | 322 | void model::compute_phi_lzw() { 323 | 324 | for (int l = 0; l < numSentiLabs; l++) { 325 | for (int z = 0; z < numTopics; z++) { 326 | for(int r = 0; r < vocabSize; r++) { 327 | phi_lzw[l][z][r] = (nlzw[l][z][r] + beta_lzw[l][z][r]) / (nlz[l][z] + betaSum_lz[l][z]); 328 | } 329 | } 330 | } 331 | } 332 | 333 | 334 | 335 | void model::compute_pi_dl() { 336 | 337 | for (int m = 0; m < numDocs; m++) { 338 | for (int l = 0; l < numSentiLabs; l++) { 339 | pi_dl[m][l] = (ndl[m][l] + gamma_dl[m][l]) / (nd[m] + gammaSum_d[m]); 340 | } 341 | } 342 | } 343 | 344 | void model::compute_theta_dlz() { 345 | 346 | for (int m = 0; m < numDocs; m++) { 347 | for (int l = 0; l < numSentiLabs; l++) { 348 | for (int z = 0; z < numTopics; z++) { 349 | theta_dlz[m][l][z] = (ndlz[m][l][z] + alpha_lz[l][z]) / (ndl[m][l] + alphaSum_l[l]); 350 | } 351 | } 352 | } 353 | } 354 | 355 | 356 | int model::save_model(string model_name) { 357 | 358 | if (save_model_tassign(result_dir + model_name + tassign_suffix)) 359 | return 1; 360 | 361 | if (save_model_twords(result_dir + model_name + twords_suffix)) 362 | return 1; 363 | 364 | if (save_model_pi_dl(result_dir + model_name + pi_suffix)) 365 | return 1; 366 | 367 | if (save_model_theta_dlz(result_dir + model_name + theta_suffix)) 368 | return 1; 369 | 370 | if (save_model_phi_lzw(result_dir + model_name + phi_suffix)) 371 | return 1; 372 | 373 | if (save_model_others(result_dir + model_name + others_suffix)) 374 | return 1; 375 | 376 | return 0; 377 | } 378 | 379 | 380 | int model::save_model_tassign(string filename) { 381 | 382 | FILE * fout = fopen(filename.c_str(), "w"); 383 | if (!fout) { 384 | printf("Cannot save file %s!\n", filename.c_str()); 385 | return 1; 386 | } 387 | 388 | for (int m = 0; m < pdataset->numDocs; m++) { 389 | fprintf(fout, "%s \n", pdataset->pdocs[m]->docID.c_str()); 390 | for (int n = 0; n < 
pdataset->pdocs[m]->length; n++) {
391 | fprintf(fout, "%d:%d:%d ", pdataset->pdocs[m]->words[n], l[m][n], z[m][n]); // wordID:sentiLab:topic
392 | }
393 | fprintf(fout, "\n");
394 | }
395 |
396 | fclose(fout);
397 | return 0;
398 | }
399 |
400 |
401 | int model::save_model_twords(string filename)
402 | {
403 | FILE * fout = fopen(filename.c_str(), "w");
404 | if (!fout) {
405 | printf("Cannot save file %s!\n", filename.c_str());
406 | return 1;
407 | }
408 |
409 | if (twords > vocabSize) {
410 | twords = vocabSize; // print out the entire vocab list
411 | }
412 |
413 | mapid2word::iterator it;
414 |
415 | for (int l = 0; l < numSentiLabs; l++) {
416 | for (int k = 0; k < numTopics; k++) {
417 | vector<pair<int, double> > words_probs;
418 | pair<int, double> word_prob;
419 | for (int w = 0; w < vocabSize; w++) {
420 | word_prob.first = w; // w: word id/index
421 | word_prob.second = phi_lzw[l][k][w]; // topic-word probability
422 | words_probs.push_back(word_prob);
423 | }
424 |
425 | std::sort(words_probs.begin(), words_probs.end(), sort_pred());
426 |
427 | fprintf(fout, "Label%d_Topic%d\n", l, k);
428 | for (int i = 0; i < twords; i++) {
429 | it = id2word.find(words_probs[i].first);
430 | if (it != id2word.end())
431 | fprintf(fout, "%s %15f\n", (it->second).c_str(), words_probs[i].second);
432 | }
433 | }
434 | }
435 |
436 | fclose(fout);
437 | return 0;
438 | }
439 |
440 |
441 |
442 | int model::save_model_pi_dl(string filename) {
443 |
444 | FILE * fout = fopen(filename.c_str(), "w");
445 | if (!fout) {
446 | printf("Cannot save file %s!\n", filename.c_str());
447 | return 1;
448 | }
449 |
450 | for (int m = 0; m < numDocs; m++) {
451 | fprintf(fout, "d_%d %s ", m, pdataset->pdocs[m]->docID.c_str());
452 | for (int l = 0; l < numSentiLabs; l++) {
453 | fprintf(fout, "%f ", pi_dl[m][l]);
454 | }
455 | fprintf(fout, "\n");
456 | }
457 |
458 | fclose(fout);
459 | return 0;
460 | }
461 |
462 |
463 | int model::save_model_theta_dlz(string filename) {
464 |
465 | FILE * fout = fopen(filename.c_str(), "w");
466 | if (!fout) {
467 | printf("Cannot save file %s!\n", filename.c_str());
468 | return 1;
469 | }
470 |
471 | for(int m = 0; m < numDocs; m++) {
472 | fprintf(fout, "Document %d\n", m);
473 | for (int l = 0; l < numSentiLabs; l++) {
474 | for (int z = 0; z < numTopics; z++) {
475 | fprintf(fout, "%f ", theta_dlz[m][l][z]);
476 | }
477 | fprintf(fout, "\n");
478 | }
479 | }
480 |
481 | fclose(fout);
482 | return 0;
483 | }
484 |
485 |
486 | int model::save_model_phi_lzw(string filename) {
487 |
488 | FILE * fout = fopen(filename.c_str(), "w");
489 | if (!fout) {
490 | printf("Cannot save file %s!\n", filename.c_str());
491 | return 1;
492 | }
493 |
494 | for (int l = 0; l < numSentiLabs; l++) {
495 | for (int z = 0; z < numTopics; z++) {
496 | fprintf(fout, "Label:%d Topic:%d\n", l, z);
497 | for (int r = 0; r < vocabSize; r++) {
498 | fprintf(fout, "%.15f ", phi_lzw[l][z][r]);
499 | }
500 | fprintf(fout, "\n");
501 | }
502 | }
503 |
504 | fclose(fout);
505 | return 0;
506 | }
507 |
508 |
509 |
510 | int model::save_model_others(string filename) {
511 |
512 | FILE * fout = fopen(filename.c_str(), "w");
513 | if (!fout) {
514 | printf("Cannot save file %s!\n", filename.c_str());
515 | return 1;
516 | }
517 |
518 | fprintf(fout, "data_dir=%s\n", this->data_dir.c_str());
519 | fprintf(fout, "datasetFile=%s\n", this->datasetFile.c_str());
520 | fprintf(fout, "result_dir=%s\n", this->result_dir.c_str());
521 | fprintf(fout, "sentiLexFile=%s\n", this->sentiLexFile.c_str());
522 |
523 | fprintf(fout, "\n-------------------- Corpus statistics -----------------------\n");
524 | fprintf(fout, "numDocs=%d\n", numDocs);
525 | fprintf(fout, "corpusSize=%d\n", corpusSize);
526 | fprintf(fout, "aveDocLength=%d\n", aveDocLength);
527 | fprintf(fout, "vocabSize=%d\n", vocabSize);
528 |
529 | fprintf(fout, "\n---------------------- Model settings -----------------------\n");
530 | fprintf(fout, "numSentiLabs=%d\n", numSentiLabs);
531 | fprintf(fout, "numTopics=%d\n", numTopics);
532 | fprintf(fout, "liter=%d\n", liter);
533 | fprintf(fout, "savestep=%d\n", savestep);
534 | fprintf(fout, "updateParaStep=%d\n", updateParaStep);
535 |
536 | fprintf(fout, "_alpha=%f\n", _alpha);
537 | fprintf(fout, "_beta=%f\n", _beta);
538 | fprintf(fout, "_gamma=%f\n", _gamma);
539 |
540 | fclose(fout);
541 | return 0;
542 | }
543 |
544 |
545 | int model::init_estimate() {
546 |
547 | int sentiLab, topic;
548 | srand(time(0)); // initialize for random number generation
549 | z.resize(numDocs);
550 | l.resize(numDocs);
551 |
552 | for (int m = 0; m < numDocs; m++) {
553 | int docLength = pdataset->pdocs[m]->length;
554 | z[m].resize(docLength);
555 | l[m].resize(docLength);
556 |
557 | for (int t = 0; t < docLength; t++) {
558 | if (pdataset->pdocs[m]->words[t] < 0) {
559 | printf("ERROR! word token %d has index smaller than 0 at doc[%d][%d]\n", pdataset->pdocs[m]->words[t], m, t);
560 | return 1;
561 | }
562 |
563 | if ((pdataset->pdocs[m]->priorSentiLabels[t] > -1) && (pdataset->pdocs[m]->priorSentiLabels[t] < numSentiLabs)) {
564 | sentiLab = pdataset->pdocs[m]->priorSentiLabels[t]; // incorporate prior information into the model
565 |
566 | }
567 | else {
568 | sentiLab = (int)(((double)rand() / RAND_MAX) * numSentiLabs);
569 | if (sentiLab == numSentiLabs) sentiLab = numSentiLabs - 1; // to avoid going over the array boundary
570 | }
571 | l[m][t] = sentiLab;
572 |
573 | // randomly initialize the topic assignment
574 | topic = (int)(((double)rand() / RAND_MAX) * numTopics);
575 | if (topic == numTopics) topic = numTopics - 1; // to avoid going over the array boundary
576 | z[m][t] = topic;
577 |
578 | // model count assignments
579 | nd[m]++;
580 | ndl[m][sentiLab]++;
581 | ndlz[m][sentiLab][topic]++;
582 | nlzw[sentiLab][topic][pdataset->pdocs[m]->words[t]]++;
583 | nlz[sentiLab][topic]++;
584 | }
585 | }
586 |
587 | return 0;
588 | }
589 |
590 |
591 |
592 | int model::estimate() {
593 |
594 | int sentiLab, topic;
595 | mapname2labs::iterator it;
596 |
597 | printf("Sampling %d iterations!\n", niters);
598 | for (liter = 1; liter <= niters; liter++) {
599 | printf("Iteration %d ...\n", liter);
600 | for (int m = 0; m < numDocs; m++) {
601 | for (int n = 0; n < pdataset->pdocs[m]->length; n++) {
602 | sampling(m, n, sentiLab, topic);
603 | l[m][n] = sentiLab;
604 | z[m][n] = topic;
605 | }
606 | }
607 |
608 | if (updateParaStep > 0 && liter % updateParaStep == 0) {
609 | this->update_Parameters();
610 | }
611 |
612 | if (savestep > 0 && liter % savestep == 0) {
613 | if (liter == niters) break;
614 |
615 | printf("Saving the model at iteration %d ...\n", liter);
616 | compute_pi_dl();
617 | compute_theta_dlz();
618 | compute_phi_lzw();
619 | save_model(putils->generate_model_name(liter));
620 | }
621 | }
622 |
623 | printf("Gibbs sampling completed!\n");
624 | printf("Saving the final model!\n");
625 | compute_pi_dl();
626 | compute_theta_dlz();
627 | compute_phi_lzw();
628 | save_model(putils->generate_model_name(-1));
629 |
630 | return 0;
631 | }
632 |
633 |
634 | int model::sampling(int m, int n, int& sentiLab, int& topic) {
635 |
636 | sentiLab = l[m][n];
637 | topic = z[m][n];
638 | int w = pdataset->pdocs[m]->words[n]; // the ID/index of the current word token in vocabulary
639 | double u;
640 |
641 | nd[m]--;
642 | ndl[m][sentiLab]--;
643 | ndlz[m][sentiLab][topic]--;
644 | nlzw[sentiLab][topic][pdataset->pdocs[m]->words[n]]--;
645 | nlz[sentiLab][topic]--;
646 |
647 | // do multinomial sampling via the cumulative method: p[l][k] is the unnormalized joint conditional of (sentiLab=l, topic=k), a product of the word-given-(label,topic), topic-given-(label,doc) and label-given-doc terms
648 | for (int l = 0; l < numSentiLabs; l++) {
649 | for (int k = 0; k < numTopics; k++) {
650 | p[l][k] = (nlzw[l][k][w] + beta_lzw[l][k][w]) / (nlz[l][k] + betaSum_lz[l][k]) *
651 | (ndlz[m][l][k] + alpha_lz[l][k]) / (ndl[m][l] + alphaSum_l[l]) *
652 | (ndl[m][l] + gamma_dl[m][l]) / (nd[m] + gammaSum_d[m]);
653 | }
654 | }
655 |
656 | // accumulate multinomial parameters
657 | for (int l = 0; l < numSentiLabs; l++) {
658 | for (int k = 0; k < numTopics; k++) {
659 | if (k==0) {
660 | if (l==0) continue;
661 | else p[l][k] += p[l-1][numTopics-1]; // accumulate the sum of the previous array
662 | }
663 | else p[l][k] += p[l][k-1];
664 | }
665 | }
666 |
667 | // draw a uniform sample scaled to the total (unnormalized) probability mass
668 | u = ((double)rand() / RAND_MAX) * p[numSentiLabs-1][numTopics-1];
669 |
670 | // sample sentiment label l, where l \in [0, S-1]
671 | bool loopBreak=false;
672 | for (sentiLab = 0; sentiLab < numSentiLabs; sentiLab++) {
673 | for (topic = 0; topic < numTopics; topic++) {
674 | if (p[sentiLab][topic] > u) {
675 | loopBreak = true;
676 | break;
677 | }
678 | }
679 | if (loopBreak == true) {
680 | break;
681 | }
682 | }
683 |
684 | if (sentiLab == numSentiLabs) sentiLab = numSentiLabs - 1; // to avoid going over the array boundary
685 | if (topic == numTopics) topic = numTopics - 1;
686 |
687 | // add estimated 'z' and 'l' to count variables
688 | nd[m]++;
689 | ndl[m][sentiLab]++;
690 | ndlz[m][sentiLab][topic]++;
691 | nlzw[sentiLab][topic][pdataset->pdocs[m]->words[n]]++;
692 | nlz[sentiLab][topic]++;
693 |
694 | return 0;
695 | }
696 |
697 |
698 | int model::update_Parameters() {
699 |
700 | int ** data; // temporary variable for exporting the 3-dimensional count array to a 2-dimensional one
701 | double * alpha_temp;
702 | data = new int*[numTopics];
703 | for (int k = 0; k < numTopics; k++) {
704 | data[k] = new int[numDocs];
705 | for (int m = 0; m < numDocs; m++) {
706 | data[k][m] = 0;
707 | }
708 | }
709 |
710 | alpha_temp = new double[numTopics];
711 | for (int k = 0; k < numTopics; k++){
712 | alpha_temp[k] = 0.0;
713 | }
714 |
715 | // update alpha
716 | for (int j = 0; j < numSentiLabs; j++) {
717 | for (int k = 0; k < numTopics; k++) {
718 | for (int m = 0; m < numDocs; m++) {
719 | data[k][m] = ndlz[m][j][k]; // ntldsum[j][k][m];
720 | }
721 | }
722 |
723 | for (int k = 0; k < numTopics; k++) {
724 | alpha_temp[k] = alpha_lz[j][k]; //alpha[j][k];
725 | }
726 |
727 | polya_fit_simple(data, alpha_temp, numTopics, numDocs);
728 |
729 | // update alpha
730 | alphaSum_l[j] = 0.0;
731 | for (int k = 0; k < numTopics; k++) {
732 | alpha_lz[j][k] = alpha_temp[k];
733 | alphaSum_l[j] += alpha_lz[j][k];
734 | }
735 | }
736 |
737 | // free the temporary buffers to avoid leaking memory on every update step
738 | for (int k = 0; k < numTopics; k++) {
739 | delete [] data[k];
740 | }
741 | delete [] data;
742 | delete [] alpha_temp;
743 |
744 | return 0;
745 | }
--------------------------------------------------------------------------------
/src/math_func.cpp:
--------------------------------------------------------------------------------
1 | #include "math_func.h"
2 | #include <cstdlib>
3 | #include <iostream>
4 | #include <iomanip>
5 | #include <cmath>
6 | #include <ctime>
7 |
8 | using namespace std;
9 |
10 | //************************* asa032.cpp ************************************//
11 | //****************************************************************************80
12 |
13 | double alngam ( double
xvalue, int *ifault ) 14 | 15 | //****************************************************************************80 16 | // 17 | // Purpose: 18 | // 19 | // ALNGAM computes the logarithm of the gamma function. 20 | // 21 | // Modified: 22 | // 23 | // 13 January 2008 24 | // 25 | // Author: 26 | // 27 | // Original FORTRAN77 version by Allan Macleod 28 | // C++ version by John Burkardt 29 | // 30 | // Reference: 31 | // 32 | // Allan Macleod, 33 | // Algorithm AS 245, 34 | // A Robust and Reliable Algorithm for the Logarithm of the Gamma Function, 35 | // Applied Statistics, 36 | // Volume 38, Number 2, 1989, pages 397-402. 37 | // 38 | // Parameters: 39 | // 40 | // Input, double XVALUE, the argument of the Gamma function. 41 | // 42 | // Output, int IFAULT, error flag. 43 | // 0, no error occurred. 44 | // 1, XVALUE is less than or equal to 0. 45 | // 2, XVALUE is too big. 46 | // 47 | // Output, double ALNGAM, the logarithm of the gamma function of X. 48 | // 49 | { 50 | double alr2pi = 0.918938533204673; 51 | double r1[9] = { 52 | -2.66685511495, 53 | -24.4387534237, 54 | -21.9698958928, 55 | 11.1667541262, 56 | 3.13060547623, 57 | 0.607771387771, 58 | 11.9400905721, 59 | 31.4690115749, 60 | 15.2346874070 }; 61 | double r2[9] = { 62 | -78.3359299449, 63 | -142.046296688, 64 | 137.519416416, 65 | 78.6994924154, 66 | 4.16438922228, 67 | 47.0668766060, 68 | 313.399215894, 69 | 263.505074721, 70 | 43.3400022514 }; 71 | double r3[9] = { 72 | -2.12159572323E+05, 73 | 2.30661510616E+05, 74 | 2.74647644705E+04, 75 | -4.02621119975E+04, 76 | -2.29660729780E+03, 77 | -1.16328495004E+05, 78 | -1.46025937511E+05, 79 | -2.42357409629E+04, 80 | -5.70691009324E+02 }; 81 | double r4[5] = { 82 | 0.279195317918525, 83 | 0.4917317610505968, 84 | 0.0692910599291889, 85 | 3.350343815022304, 86 | 6.012459259764103 }; 87 | double value; 88 | double x; 89 | double x1; 90 | double x2; 91 | double xlge = 510000.0; 92 | double xlgst = 1.0E+30; 93 | double y; 94 | 95 | x = xvalue; 96 | value = 0.0; 97 | // 98 | // Check the input. 99 | // 100 | if ( xlgst <= x ) 101 | { 102 | *ifault = 2; 103 | return value; 104 | } 105 | 106 | if ( x <= 0.0 ) 107 | { 108 | *ifault = 1; 109 | return value; 110 | } 111 | 112 | *ifault = 0; 113 | // 114 | // Calculation for 0 < X < 0.5 and 0.5 <= X < 1.5 combined. 115 | // 116 | if ( x < 1.5 ) 117 | { 118 | if ( x < 0.5 ) 119 | { 120 | value = - log ( x ); 121 | y = x + 1.0; 122 | // 123 | // Test whether X < machine epsilon. 124 | // 125 | if ( y == 1.0 ) 126 | { 127 | return value; 128 | } 129 | } 130 | else 131 | { 132 | value = 0.0; 133 | y = x; 134 | x = ( x - 0.5 ) - 0.5; 135 | } 136 | 137 | value = value + x * (((( 138 | r1[4] * y 139 | + r1[3] ) * y 140 | + r1[2] ) * y 141 | + r1[1] ) * y 142 | + r1[0] ) / (((( 143 | y 144 | + r1[8] ) * y 145 | + r1[7] ) * y 146 | + r1[6] ) * y 147 | + r1[5] ); 148 | 149 | return value; 150 | } 151 | // 152 | // Calculation for 1.5 <= X < 4.0. 153 | // 154 | if ( x < 4.0 ) 155 | { 156 | y = ( x - 1.0 ) - 1.0; 157 | 158 | value = y * (((( 159 | r2[4] * x 160 | + r2[3] ) * x 161 | + r2[2] ) * x 162 | + r2[1] ) * x 163 | + r2[0] ) / (((( 164 | x 165 | + r2[8] ) * x 166 | + r2[7] ) * x 167 | + r2[6] ) * x 168 | + r2[5] ); 169 | } 170 | // 171 | // Calculation for 4.0 <= X < 12.0. 
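// (As with the earlier branches, this evaluates a fixed rational
// approximation: here a ratio of two quartic polynomials in x built from
// the r3 coefficients above.)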
172 | // 173 | else if ( x < 12.0 ) 174 | { 175 | value = (((( 176 | r3[4] * x 177 | + r3[3] ) * x 178 | + r3[2] ) * x 179 | + r3[1] ) * x 180 | + r3[0] ) / (((( 181 | x 182 | + r3[8] ) * x 183 | + r3[7] ) * x 184 | + r3[6] ) * x 185 | + r3[5] ); 186 | } 187 | // 188 | // Calculation for 12.0 <= X. 189 | // 190 | else 191 | { 192 | y = log ( x ); 193 | value = x * ( y - 1.0 ) - 0.5 * y + alr2pi; 194 | 195 | if ( x <= xlge ) 196 | { 197 | x1 = 1.0 / x; 198 | x2 = x1 * x1; 199 | 200 | value = value + x1 * ( ( 201 | r4[2] * 202 | x2 + r4[1] ) * 203 | x2 + r4[0] ) / ( ( 204 | x2 + r4[4] ) * 205 | x2 + r4[3] ); 206 | } 207 | } 208 | 209 | return value; 210 | } 211 | //****************************************************************************80 212 | 213 | double gamain ( double x, double p, int *ifault ) 214 | 215 | //****************************************************************************80 216 | // 217 | // Purpose: 218 | // 219 | // GAMAIN computes the incomplete gamma ratio. 220 | // 221 | // Discussion: 222 | // 223 | // A series expansion is used if P > X or X <= 1. Otherwise, a 224 | // continued fraction approximation is used. 225 | // 226 | // Modified: 227 | // 228 | // 17 January 2008 229 | // 230 | // Author: 231 | // 232 | // Original FORTRAN77 version by G Bhattacharjee 233 | // C++ version by John Burkardt 234 | // 235 | // Reference: 236 | // 237 | // G Bhattacharjee, 238 | // Algorithm AS 32: 239 | // The Incomplete Gamma Integral, 240 | // Applied Statistics, 241 | // Volume 19, Number 3, 1970, pages 285-287. 242 | // 243 | // Parameters: 244 | // 245 | // Input, double X, P, the parameters of the incomplete 246 | // gamma ratio. 0 <= X, and 0 < P. 247 | // 248 | // Output, int *IFAULT, error flag. 249 | // 0, no errors. 250 | // 1, P <= 0. 251 | // 2, X < 0. 252 | // 3, underflow. 253 | // 4, error return from the Log Gamma routine. 254 | // 255 | // Output, double GAMAIN, the value of the incomplete gamma ratio. 256 | // 257 | { 258 | double a; 259 | double acu = 1.0E-08; 260 | double an; 261 | double arg; 262 | double b; 263 | double dif; 264 | double factor; 265 | double g; 266 | double gin; 267 | int i; 268 | double oflo = 1.0E+37; 269 | double pn[6]; 270 | double rn; 271 | double term; 272 | double uflo = 1.0E-37; 273 | double value; 274 | // 275 | // Check the input. 276 | // 277 | if ( p <= 0.0 ) 278 | { 279 | *ifault = 1; 280 | value = 0.0; 281 | return value; 282 | } 283 | 284 | if ( x < 0.0 ) 285 | { 286 | *ifault = 2; 287 | value = 0.0; 288 | return value; 289 | } 290 | 291 | if ( x == 0.0 ) 292 | { 293 | *ifault = 0; 294 | value = 0.0; 295 | return value; 296 | } 297 | 298 | g = alngam ( p, ifault ); 299 | 300 | if ( *ifault != 0 ) 301 | { 302 | *ifault = 4; 303 | value = 0.0; 304 | return value; 305 | } 306 | 307 | arg = p * log ( x ) - x - g; 308 | 309 | if ( arg < log ( uflo ) ) 310 | { 311 | *ifault = 3; 312 | value = 0.0; 313 | return value; 314 | } 315 | 316 | *ifault = 0; 317 | factor = exp ( arg ); 318 | // 319 | // Calculation by series expansion. 320 | // 321 | if ( x <= 1.0 || x < p ) 322 | { 323 | gin = 1.0; 324 | term = 1.0; 325 | rn = p; 326 | 327 | for ( ; ; ) 328 | { 329 | rn = rn + 1.0; 330 | term = term * x / rn; 331 | gin = gin + term; 332 | 333 | if ( term <= acu ) 334 | { 335 | break; 336 | } 337 | } 338 | 339 | value = gin * factor / p; 340 | return value; 341 | } 342 | // 343 | // Calculation by continued fraction. 
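// The loop below forms successive convergents rn = pn[4]/pn[5] of the
// continued fraction for the upper tail, so gin approaches
// exp(x) * x^(-p) * Gamma(p,x); since factor = x^p * exp(-x) / Gamma(p),
// the result value = 1 - factor * gin is the lower incomplete gamma ratio.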
344 | // 345 | a = 1.0 - p; 346 | b = a + x + 1.0; 347 | term = 0.0; 348 | 349 | pn[0] = 1.0; 350 | pn[1] = x; 351 | pn[2] = x + 1.0; 352 | pn[3] = x * b; 353 | 354 | gin = pn[2] / pn[3]; 355 | 356 | for ( ; ; ) 357 | { 358 | a = a + 1.0; 359 | b = b + 2.0; 360 | term = term + 1.0; 361 | an = a * term; 362 | for ( i = 0; i <= 1; i++ ) 363 | { 364 | pn[i+4] = b * pn[i+2] - an * pn[i]; 365 | } 366 | 367 | if ( pn[5] != 0.0 ) 368 | { 369 | rn = pn[4] / pn[5]; 370 | dif = r8_abs ( gin - rn ); 371 | // 372 | // Absolute error tolerance satisfied? 373 | // 374 | if ( dif <= acu ) 375 | { 376 | // 377 | // Relative error tolerance satisfied? 378 | // 379 | if ( dif <= acu * rn ) 380 | { 381 | value = 1.0 - factor * gin; 382 | break; 383 | } 384 | } 385 | gin = rn; 386 | } 387 | 388 | for ( i = 0; i < 4; i++ ) 389 | { 390 | pn[i] = pn[i+2]; 391 | } 392 | 393 | if ( oflo <= r8_abs ( pn[4] ) ) 394 | { 395 | for ( i = 0; i < 4; i++ ) 396 | { 397 | pn[i] = pn[i] / oflo; 398 | } 399 | } 400 | } 401 | 402 | return value; 403 | } 404 | //****************************************************************************80 405 | 406 | void gamma_inc_values ( int *n_data, double *a, double *x, double *fx ) 407 | 408 | //****************************************************************************80 409 | // 410 | // Purpose: 411 | // 412 | // GAMMA_INC_VALUES returns some values of the incomplete Gamma function. 413 | // 414 | // Discussion: 415 | // 416 | // The (normalized) incomplete Gamma function P(A,X) is defined as: 417 | // 418 | // PN(A,X) = 1/Gamma(A) * Integral ( 0 <= T <= X ) T**(A-1) * exp(-T) dT. 419 | // 420 | // With this definition, for all A and X, 421 | // 422 | // 0 <= PN(A,X) <= 1 423 | // 424 | // and 425 | // 426 | // PN(A,INFINITY) = 1.0 427 | // 428 | // In Mathematica, the function can be evaluated by: 429 | // 430 | // 1 - GammaRegularized[A,X] 431 | // 432 | // Modified: 433 | // 434 | // 20 November 2004 435 | // 436 | // Author: 437 | // 438 | // John Burkardt 439 | // 440 | // Reference: 441 | // 442 | // Milton Abramowitz, Irene Stegun, 443 | // Handbook of Mathematical Functions, 444 | // National Bureau of Standards, 1964, 445 | // ISBN: 0-486-61272-4, 446 | // LC: QA47.A34. 447 | // 448 | // Stephen Wolfram, 449 | // The Mathematica Book, 450 | // Fourth Edition, 451 | // Cambridge University Press, 1999, 452 | // ISBN: 0-521-64314-7, 453 | // LC: QA76.95.W65. 454 | // 455 | // Parameters: 456 | // 457 | // Input/output, int *N_DATA. The user sets N_DATA to 0 before the 458 | // first call. On each call, the routine increments N_DATA by 1, and 459 | // returns the corresponding data; when there is no more data, the 460 | // output value of N_DATA will be 0 again. 461 | // 462 | // Output, double *A, the parameter of the function. 463 | // 464 | // Output, double *X, the argument of the function. 465 | // 466 | // Output, double *FX, the value of the function. 
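// For example, PN(1,X) = 1 - exp(-X), so PN(1,1) = 1 - exp(-1)
// = 0.6321205588..., which matches the A = 1.0, X = 1.0 entry in the
// tables below.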
467 | // 468 | { 469 | # define N_MAX 20 470 | 471 | double a_vec[N_MAX] = { 472 | 0.10E+00, 473 | 0.10E+00, 474 | 0.10E+00, 475 | 0.50E+00, 476 | 0.50E+00, 477 | 0.50E+00, 478 | 0.10E+01, 479 | 0.10E+01, 480 | 0.10E+01, 481 | 0.11E+01, 482 | 0.11E+01, 483 | 0.11E+01, 484 | 0.20E+01, 485 | 0.20E+01, 486 | 0.20E+01, 487 | 0.60E+01, 488 | 0.60E+01, 489 | 0.11E+02, 490 | 0.26E+02, 491 | 0.41E+02 }; 492 | 493 | double fx_vec[N_MAX] = { 494 | 0.7382350532339351E+00, 495 | 0.9083579897300343E+00, 496 | 0.9886559833621947E+00, 497 | 0.3014646416966613E+00, 498 | 0.7793286380801532E+00, 499 | 0.9918490284064973E+00, 500 | 0.9516258196404043E-01, 501 | 0.6321205588285577E+00, 502 | 0.9932620530009145E+00, 503 | 0.7205974576054322E-01, 504 | 0.5891809618706485E+00, 505 | 0.9915368159845525E+00, 506 | 0.1018582711118352E-01, 507 | 0.4421745996289254E+00, 508 | 0.9927049442755639E+00, 509 | 0.4202103819530612E-01, 510 | 0.9796589705830716E+00, 511 | 0.9226039842296429E+00, 512 | 0.4470785799755852E+00, 513 | 0.7444549220718699E+00 }; 514 | 515 | double x_vec[N_MAX] = { 516 | 0.30E-01, 517 | 0.30E+00, 518 | 0.15E+01, 519 | 0.75E-01, 520 | 0.75E+00, 521 | 0.35E+01, 522 | 0.10E+00, 523 | 0.10E+01, 524 | 0.50E+01, 525 | 0.10E+00, 526 | 0.10E+01, 527 | 0.50E+01, 528 | 0.15E+00, 529 | 0.15E+01, 530 | 0.70E+01, 531 | 0.25E+01, 532 | 0.12E+02, 533 | 0.16E+02, 534 | 0.25E+02, 535 | 0.45E+02 }; 536 | 537 | if ( *n_data < 0 ) 538 | { 539 | *n_data = 0; 540 | } 541 | 542 | *n_data = *n_data + 1; 543 | 544 | if ( N_MAX < *n_data ) 545 | { 546 | *n_data = 0; 547 | *a = 0.0; 548 | *x = 0.0; 549 | *fx = 0.0; 550 | } 551 | else 552 | { 553 | *a = a_vec[*n_data-1]; 554 | *x = x_vec[*n_data-1]; 555 | *fx = fx_vec[*n_data-1]; 556 | } 557 | 558 | return; 559 | # undef N_MAX 560 | } 561 | //****************************************************************************80 562 | 563 | double r8_abs ( double x ) 564 | 565 | //****************************************************************************80 566 | // 567 | // Purpose: 568 | // 569 | // R8_ABS returns the absolute value of an R8. 570 | // 571 | // Modified: 572 | // 573 | // 17 January 2008 574 | // 575 | // Author: 576 | // 577 | // John Burkardt 578 | // 579 | // Parameters: 580 | // 581 | // Input, double X, the argument. 582 | // 583 | // Output, double R8_ABS, the absolute value of the argument. 584 | // 585 | { 586 | if ( 0.0 <= x ) 587 | { 588 | return x; 589 | } 590 | else 591 | { 592 | return ( - x ); 593 | } 594 | } 595 | //****************************************************************************80 596 | 597 | void timestamp ( void ) 598 | 599 | //****************************************************************************80 600 | // 601 | // Purpose: 602 | // 603 | // TIMESTAMP prints the current YMDHMS date as a time stamp. 
604 | // 605 | // Example: 606 | // 607 | // 31 May 2001 09:45:54 AM 608 | // 609 | // Modified: 610 | // 611 | // 24 September 2003 612 | // 613 | // Author: 614 | // 615 | // John Burkardt 616 | // 617 | // Parameters: 618 | // 619 | // None 620 | // 621 | { 622 | # define TIME_SIZE 40 623 | 624 | static char time_buffer[TIME_SIZE]; 625 | const struct tm *tm; 626 | size_t len; 627 | time_t now; 628 | 629 | now = time ( NULL ); 630 | tm = localtime ( &now ); 631 | 632 | len = strftime ( time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm ); 633 | 634 | cout << time_buffer << "\n"; 635 | 636 | return; 637 | # undef TIME_SIZE 638 | } 639 | 640 | 641 | 642 | //************************* asa103.cpp ************************************// 643 | //*****************************************************************************// 644 | 645 | //****************************************************************************80 646 | 647 | double digama ( double x, int *ifault ) 648 | 649 | //****************************************************************************80 650 | // 651 | // Purpose: 652 | // 653 | // DIGAMA calculates DIGAMMA ( X ) = d ( LOG ( GAMMA ( X ) ) ) / dX 654 | // 655 | // Modified: 656 | // 657 | // 18 January 2008 658 | // 659 | // Author: 660 | // 661 | // Jose Bernardo 662 | // FORTRAN90 version by John Burkardt 663 | // 664 | // Reference: 665 | // 666 | // Jose Bernardo, 667 | // Algorithm AS 103: 668 | // Psi ( Digamma ) Function, 669 | // Applied Statistics, 670 | // Volume 25, Number 3, 1976, pages 315-317. 671 | // 672 | // Parameters: 673 | // 674 | // Input, double X, the argument of the digamma function. 675 | // 0 < X. 676 | // 677 | // Output, int *IFAULT, error flag. 678 | // 0, no error. 679 | // 1, X <= 0. 680 | // 681 | // Output, double DIGAMA, the value of the digamma function at X. 682 | // 683 | { 684 | double c = 8.5; 685 | double d1 = -0.5772156649; 686 | double r; 687 | double s = 0.00001; 688 | double s3 = 0.08333333333; 689 | double s4 = 0.0083333333333; 690 | double s5 = 0.003968253968; 691 | double value; 692 | double y; 693 | // 694 | // Check the input. 695 | // 696 | if ( x <= 0.0 ) 697 | { 698 | value = 0.0; 699 | *ifault = 1; 700 | return value; 701 | } 702 | // 703 | // Initialize. 704 | // 705 | *ifault = 0; 706 | y = x; 707 | value = 0.0; 708 | // 709 | // Use approximation if argument <= S. 710 | // 711 | if ( y <= s ) 712 | { 713 | value = d1 - 1.0 / y; 714 | return value; 715 | } 716 | // 717 | // Reduce to DIGAMA(X + N) where (X + N) >= C. 718 | // 719 | while ( y < c ) 720 | { 721 | value = value - 1.0 / y; 722 | y = y + 1.0; 723 | } 724 | // 725 | // Use Stirling's (actually de Moivre's) expansion if argument > C. 726 | // 727 | r = 1.0 / y; 728 | value = value + log ( y ) - 0.5 * r; 729 | r = r * r; 730 | value = value - r * ( s3 - r * ( s4 - r * s5 ) ); 731 | 732 | return value; 733 | } 734 | //****************************************************************************80 735 | 736 | void psi_values ( int *n_data, double *x, double *fx ) 737 | 738 | //****************************************************************************80 739 | // 740 | // Purpose: 741 | // 742 | // PSI_VALUES returns some values of the Psi or Digamma function. 743 | // 744 | // Discussion: 745 | // 746 | // In Mathematica, the function can be evaluated by: 747 | // 748 | // PolyGamma[x] 749 | // 750 | // or 751 | // 752 | // Polygamma[0,x] 753 | // 754 | // PSI(X) = d ln ( Gamma ( X ) ) / d X = Gamma'(X) / Gamma(X) 755 | // 756 | // PSI(1) = -Euler's constant. 
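// (Numerically PSI(1) = -0.5772156649..., the first entry of fx_vec
// below; the recurrence stated next then gives PSI(2) = PSI(1) + 1
// = 0.4227843350..., the last entry.)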
757 | // 758 | // PSI(X+1) = PSI(X) + 1 / X. 759 | // 760 | // Modified: 761 | // 762 | // 17 August 2004 763 | // 764 | // Author: 765 | // 766 | // John Burkardt 767 | // 768 | // Reference: 769 | // 770 | // Milton Abramowitz, Irene Stegun, 771 | // Handbook of Mathematical Functions, 772 | // National Bureau of Standards, 1964, 773 | // ISBN: 0-486-61272-4, 774 | // LC: QA47.A34. 775 | // 776 | // Stephen Wolfram, 777 | // The Mathematica Book, 778 | // Fourth Edition, 779 | // Cambridge University Press, 1999, 780 | // ISBN: 0-521-64314-7, 781 | // LC: QA76.95.W65. 782 | // 783 | // Parameters: 784 | // 785 | // Input/output, int *N_DATA. The user sets N_DATA to 0 before the 786 | // first call. On each call, the routine increments N_DATA by 1, and 787 | // returns the corresponding data; when there is no more data, the 788 | // output value of N_DATA will be 0 again. 789 | // 790 | // Output, double *X, the argument of the function. 791 | // 792 | // Output, double *FX, the value of the function. 793 | // 794 | { 795 | # define N_MAX 11 796 | 797 | double fx_vec[N_MAX] = { 798 | -0.5772156649015329E+00, 799 | -0.4237549404110768E+00, 800 | -0.2890398965921883E+00, 801 | -0.1691908888667997E+00, 802 | -0.6138454458511615E-01, 803 | 0.3648997397857652E-01, 804 | 0.1260474527734763E+00, 805 | 0.2085478748734940E+00, 806 | 0.2849914332938615E+00, 807 | 0.3561841611640597E+00, 808 | 0.4227843350984671E+00 }; 809 | 810 | double x_vec[N_MAX] = { 811 | 1.0E+00, 812 | 1.1E+00, 813 | 1.2E+00, 814 | 1.3E+00, 815 | 1.4E+00, 816 | 1.5E+00, 817 | 1.6E+00, 818 | 1.7E+00, 819 | 1.8E+00, 820 | 1.9E+00, 821 | 2.0E+00 }; 822 | 823 | if ( *n_data < 0 ) 824 | { 825 | *n_data = 0; 826 | } 827 | 828 | *n_data = *n_data + 1; 829 | 830 | if ( N_MAX < *n_data ) 831 | { 832 | *n_data = 0; 833 | *x = 0.0; 834 | *fx = 0.0; 835 | } 836 | else 837 | { 838 | *x = x_vec[*n_data-1]; 839 | *fx = fx_vec[*n_data-1]; 840 | } 841 | 842 | return; 843 | # undef N_MAX 844 | } 845 | //****************************************************************************80 846 | 847 | //void timestamp ( void ) 848 | // 849 | ////****************************************************************************80 850 | //// 851 | //// Purpose: 852 | //// 853 | //// TIMESTAMP prints the current YMDHMS date as a time stamp. 
854 | //// 855 | //// Example: 856 | //// 857 | //// 31 May 2001 09:45:54 AM 858 | //// 859 | //// Modified: 860 | //// 861 | //// 24 September 2003 862 | //// 863 | //// Author: 864 | //// 865 | //// John Burkardt 866 | //// 867 | //// Parameters: 868 | //// 869 | //// None 870 | //// 871 | //{ 872 | //# define TIME_SIZE 40 873 | // 874 | // static char time_buffer[TIME_SIZE]; 875 | // const struct tm *tm; 876 | // size_t len; 877 | // time_t now; 878 | // 879 | // now = time ( NULL ); 880 | // tm = localtime ( &now ); 881 | // 882 | // len = strftime ( time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm ); 883 | // 884 | // cout << time_buffer << "\n"; 885 | // 886 | // return; 887 | //# undef TIME_SIZE 888 | //} 889 | 890 | 891 | 892 | //************************* asa121.cpp ************************************// 893 | //*****************************************************************************// 894 | 895 | //****************************************************************************80 896 | 897 | //void timestamp ( void ) 898 | // 899 | ////****************************************************************************80 900 | //// 901 | //// Purpose: 902 | //// 903 | //// TIMESTAMP prints the current YMDHMS date as a time stamp. 904 | //// 905 | //// Example: 906 | //// 907 | //// 31 May 2001 09:45:54 AM 908 | //// 909 | //// Modified: 910 | //// 911 | //// 24 September 2003 912 | //// 913 | //// Author: 914 | //// 915 | //// John Burkardt 916 | //// 917 | //// Parameters: 918 | //// 919 | //// None 920 | //// 921 | //{ 922 | //# define TIME_SIZE 40 923 | // 924 | // static char time_buffer[TIME_SIZE]; 925 | // const struct tm *tm; 926 | // size_t len; 927 | // time_t now; 928 | // 929 | // now = time ( NULL ); 930 | // tm = localtime ( &now ); 931 | // 932 | // len = strftime ( time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm ); 933 | // 934 | // cout << time_buffer << "\n"; 935 | // 936 | // return; 937 | //# undef TIME_SIZE 938 | //} 939 | //****************************************************************************80 940 | 941 | double trigam ( double x, int *ifault ) 942 | 943 | //****************************************************************************80 944 | // 945 | // Purpose: 946 | // 947 | // TRIGAM calculates trigamma(x) = d**2 log(gamma(x)) / dx**2 948 | // 949 | // Modified: 950 | // 951 | // 19 January 2008 952 | // 953 | // Author: 954 | // 955 | // BE Schneider 956 | // Modifications by John Burkardt 957 | // 958 | // Reference: 959 | // 960 | // BE Schneider, 961 | // Algorithm AS 121: 962 | // Trigamma Function, 963 | // Applied Statistics, 964 | // Volume 27, Number 1, pages 97-99, 1978. 965 | // 966 | // Parameters: 967 | // 968 | // Input, double X, the argument of the trigamma function. 969 | // 0 < X. 970 | // 971 | // Output, int *IFAULT, error flag. 972 | // 0, no error. 973 | // 1, X <= 0. 974 | // 975 | // Output, double TRIGAM, the value of the trigamma function at X. 976 | // 977 | { 978 | double a = 0.0001; 979 | double b = 5.0; 980 | double b2 = 0.1666666667; 981 | double b4 = -0.03333333333; 982 | double b6 = 0.02380952381; 983 | double b8 = -0.03333333333; 984 | double value; 985 | double y; 986 | double z; 987 | // 988 | // Check the input. 989 | // 990 | if ( x <= 0.0 ) 991 | { 992 | *ifault = 1; 993 | value = 0.0; 994 | return value; 995 | } 996 | 997 | *ifault = 0; 998 | z = x; 999 | // 1000 | // Use small value approximation if X <= A. 
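// (Since trigamma(x) = 1/x^2 + trigamma(x+1) and a = 0.0001, the dropped
// remainder trigamma(x+1) <= trigamma(1) ~= 1.6449 while 1/x^2 >= 10^8,
// so the relative error of this truncation is below about 2e-8.)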
1001 | // 1002 | if ( x <= a ) 1003 | { 1004 | value = 1.0 / x / x; 1005 | return value; 1006 | } 1007 | // 1008 | // Increase argument to ( X + I ) >= B. 1009 | // 1010 | value = 0.0; 1011 | 1012 | while ( z < b ) 1013 | { 1014 | value = value + 1.0 / z / z; 1015 | z = z + 1.0; 1016 | } 1017 | // 1018 | // Apply asymptotic formula if argument is B or greater. 1019 | // 1020 | y = 1.0 / z / z; 1021 | 1022 | value = value + 0.5 * 1023 | y + ( 1.0 1024 | + y * ( b2 1025 | + y * ( b4 1026 | + y * ( b6 1027 | + y * b8 )))) / z; 1028 | 1029 | return value; 1030 | } 1031 | //****************************************************************************80 1032 | 1033 | void trigamma_values ( int *n_data, double *x, double *fx ) 1034 | 1035 | //****************************************************************************80 1036 | // 1037 | // Purpose: 1038 | // 1039 | // TRIGAMMA_VALUES returns some values of the TriGamma function. 1040 | // 1041 | // Discussion: 1042 | // 1043 | // In Mathematica, the function can be evaluated by: 1044 | // 1045 | // PolyGamma[1,x] 1046 | // 1047 | // TriGamma(X) = d^2 ln ( Gamma ( X ) ) / d X^2 1048 | // 1049 | // Modified: 1050 | // 1051 | // 16 September 2004 1052 | // 1053 | // Author: 1054 | // 1055 | // John Burkardt 1056 | // 1057 | // Reference: 1058 | // 1059 | // Milton Abramowitz, Irene Stegun, 1060 | // Handbook of Mathematical Functions, 1061 | // National Bureau of Standards, 1964, 1062 | // ISBN: 0-486-61272-4, 1063 | // LC: QA47.A34. 1064 | // 1065 | // Stephen Wolfram, 1066 | // The Mathematica Book, 1067 | // Fourth Edition, 1068 | // Cambridge University Press, 1999, 1069 | // ISBN: 0-521-64314-7, 1070 | // LC: QA76.95.W65. 1071 | // 1072 | // Parameters: 1073 | // 1074 | // Input/output, int *N_DATA. The user sets N_DATA to 0 before the 1075 | // first call. On each call, the routine increments N_DATA by 1, and 1076 | // returns the corresponding data; when there is no more data, the 1077 | // output value of N_DATA will be 0 again. 1078 | // 1079 | // Output, double *X, the argument of the function. 1080 | // 1081 | // Output, double *FX, the value of the function. 
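// [Annotation added for clarity] A minimal sketch of the N_DATA iteration
// convention documented above, assuming caller-owned locals:
//
//   int n_data = 0;
//   double x, fx;
//   for ( ; ; )
//   {
//     trigamma_values ( &n_data, &x, &fx );
//     if ( n_data == 0 ) break;  // table exhausted
//     // use x and fx, e.g. to check trigam ( x, &ifault ) against fx
//   }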
1082 | // 1083 | { 1084 | # define N_MAX 11 1085 | 1086 | double fx_vec[N_MAX] = { 1087 | 0.1644934066848226E+01, 1088 | 0.1433299150792759E+01, 1089 | 0.1267377205423779E+01, 1090 | 0.1134253434996619E+01, 1091 | 0.1025356590529597E+01, 1092 | 0.9348022005446793E+00, 1093 | 0.8584318931245799E+00, 1094 | 0.7932328301639984E+00, 1095 | 0.7369741375017002E+00, 1096 | 0.6879720582426356E+00, 1097 | 0.6449340668482264E+00 }; 1098 | 1099 | double x_vec[N_MAX] = { 1100 | 1.0E+00, 1101 | 1.1E+00, 1102 | 1.2E+00, 1103 | 1.3E+00, 1104 | 1.4E+00, 1105 | 1.5E+00, 1106 | 1.6E+00, 1107 | 1.7E+00, 1108 | 1.8E+00, 1109 | 1.9E+00, 1110 | 2.0E+00 }; 1111 | 1112 | if ( *n_data < 0 ) 1113 | { 1114 | *n_data = 0; 1115 | } 1116 | 1117 | *n_data = *n_data + 1; 1118 | 1119 | if ( N_MAX < *n_data ) 1120 | { 1121 | *n_data = 0; 1122 | *x = 0.0; 1123 | *fx = 0.0; 1124 | } 1125 | else 1126 | { 1127 | *x = x_vec[*n_data-1]; 1128 | *fx = fx_vec[*n_data-1]; 1129 | } 1130 | 1131 | return; 1132 | # undef N_MAX 1133 | } 1134 | -------------------------------------------------------------------------------- /src/inference.cpp: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | Joint Sentiment-Topic (JST) Model 3 | *********************************************************************** 4 | 5 | (C) Copyright 2013, Chenghua Lin and Yulan He 6 | 7 | Written by: Chenghua Lin, University of Aberdeen, chenghua.lin@abdn.ac.uk. 8 | Part of code is from http://gibbslda.sourceforge.net/. 9 | 10 | This file is part of JST implementation. 11 | 12 | JST is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | JST is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 
21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ***********************************************************************/ 28 | 29 | #include "inference.h" 30 | using namespace std; 31 | 32 | Inference::Inference(void) { 33 | 34 | numSentiLabs = 0; 35 | numTopics = 0; 36 | numDocs = 0; 37 | vocabSize = 0; 38 | newNumDocs = 0; 39 | newVocabSize = 0; 40 | _beta = -1.0; 41 | 42 | wordmapfile = "wordmap.txt"; 43 | tassign_suffix = ".newtassign"; 44 | pi_suffix = ".newpi"; 45 | theta_suffix = ".newtheta"; 46 | phi_suffix = ".newphi"; 47 | others_suffix = ".newothers"; 48 | twords_suffix = ".newtwords"; 49 | model_name = ""; 50 | data_dir = ""; 51 | datasetFile = ""; 52 | result_dir = ""; 53 | sentiLexFile = ""; 54 | 55 | updateParaStep = -1; 56 | savestep = 20; 57 | twords = 20; 58 | niters = 40; 59 | 60 | putils = new utils(); 61 | pmodelData = NULL; 62 | pnewData = NULL; 63 | } 64 | 65 | 66 | Inference::~Inference(void) { 67 | 68 | if (putils) 69 | delete putils; 70 | 71 | if (pmodelData) 72 | delete pmodelData; 73 | 74 | if (pnewData) 75 | delete pnewData; 76 | } 77 | 78 | 79 | int Inference::init(int argc, char ** argv) { 80 | 81 | if (putils->parse_args_inf(argc, argv, this)) { 82 | return 1; 83 | } 84 | 85 | if(init_inf()) { 86 | printf("Throw exception in init_inf()! \n"); 87 | return 1; 88 | } 89 | 90 | if(inference()) { 91 | printf("Throw exception in inference()! \n"); 92 | return 1; 93 | } 94 | 95 | return 0; 96 | } 97 | 98 | 99 | // read '.others' file 100 | int Inference::read_model_setting(string filename) { 101 | 102 | char buff[BUFF_SIZE_LONG]; 103 | string line; 104 | numSentiLabs = 0; 105 | numTopics = 0; 106 | numDocs = 0; 107 | vocabSize = 0; 108 | 109 | FILE * fin = fopen(filename.c_str(), "r"); 110 | if (!fin) { 111 | printf("Cannot read file %s!\n", filename.c_str()); 112 | return 1; 113 | } 114 | 115 | while (fgets(buff, BUFF_SIZE_LONG - 1, fin) != NULL) { 116 | line = buff; 117 | strtokenizer values(line, ": \t\r\n={}[]"); // ':', '=', brackets, and whitespace are separators 118 | 119 | if (values.token(0) == "numSentiLabs") { 120 | numSentiLabs = atoi(values.token(1).c_str()); 121 | } 122 | else if (values.token(0) == "numTopics") { 123 | numTopics = atoi(values.token(1).c_str()); 124 | } 125 | else if (values.token(0) == "numDocs") { 126 | numDocs = atoi(values.token(1).c_str()); 127 | } 128 | else if (values.token(0) == "vocabSize") { 129 | vocabSize = atoi(values.token(1).c_str()); 130 | } 131 | if (numSentiLabs > 0 && numTopics > 0 && numDocs > 0 && vocabSize > 0) { 132 | break; 133 | } 134 | } 135 | 136 | fclose(fin); 137 | 138 | if (numSentiLabs == 0 || numTopics == 0 || numDocs == 0 || vocabSize == 0) { 139 | cout << "Throw exception in reading model parameter settings!\n" << filename << endl; 140 | return 1; 141 | } 142 | else { 143 | cout<<"data_dir = "<pdocs = new document*[numDocs]; 176 | pmodelData->vocabSize= vocabSize; 177 | pmodelData->numDocs= numDocs; 178 | l.resize(pmodelData->numDocs); 179 | z.resize(pmodelData->numDocs); 180 | 181 | for (int m = 0; m < numDocs; m++) { 182 | fgets(buff, BUFF_SIZE_LONG - 1, fin); // first line - ignore the document ID 183 | fgets(buff, BUFF_SIZE_LONG - 1, fin); // second line - read the sentiment label / topic assignments 184 | line = buff; 185 | strtokenizer strtok(line, " \t\r\n"); 186 | int length = strtok.count_tokens(); 187 | 188 | vector<int> words; 189 | 
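// [Annotation added for clarity] Input format parsed by this loop: each document
// in the '.tassign' model file occupies two lines -- a document ID line, then
// space-separated "wordID:sentiLab:topic" triples, the same layout written by
// save_model_newtassign() further below. Each triple is split on ':' into the
// three parallel vectors that follow.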
vector<int> sentiLabs; 190 | vector<int> topics; 191 | 192 | for (int j = 0; j < length; j++) { 193 | string token = strtok.token(j); 194 | strtokenizer tok(token, ":"); 195 | if (tok.count_tokens() != 3) { 196 | printf("Invalid word-sentiment-topic assignment format!\n"); 197 | return 1; 198 | } 199 | 200 | words.push_back(atoi(tok.token(0).c_str())); 201 | sentiLabs.push_back(atoi(tok.token(1).c_str())); 202 | topics.push_back(atoi(tok.token(2).c_str())); 203 | } 204 | 205 | // allocate and add training document to the corpus 206 | document * pdoc = new document(words); 207 | pmodelData->add_doc(pdoc, m); 208 | 209 | l[m].resize(sentiLabs.size()); 210 | for (int j = 0; j < (int)sentiLabs.size(); j++) { 211 | l[m][j] = sentiLabs[j]; 212 | } 213 | 214 | z[m].resize(topics.size()); 215 | for (int j = 0; j < (int)topics.size(); j++) { 216 | z[m][j] = topics[j]; 217 | } 218 | } 219 | fclose(fin); 220 | 221 | // init model counts 222 | nlzw.resize(numSentiLabs); 223 | for (int l = 0; l < numSentiLabs; l++) { 224 | nlzw[l].resize(numTopics); 225 | for (int z = 0; z < numTopics; z++) { 226 | nlzw[l][z].resize(vocabSize); 227 | for (int r = 0; r < vocabSize; r++) { 228 | nlzw[l][z][r] = 0; 229 | } 230 | } 231 | } 232 | 233 | nlz.resize(numSentiLabs); 234 | for (int l = 0; l < numSentiLabs; l++) { 235 | nlz[l].resize(numTopics); 236 | for (int z = 0; z < numTopics; z++) { 237 | nlz[l][z] = 0; 238 | } 239 | } 240 | 241 | // recover count values from trained model 242 | for (int m = 0; m < pmodelData->numDocs; m++) { 243 | int docLength = pmodelData->pdocs[m]->length; 244 | for (int n = 0; n < docLength; n++) { 245 | int w = pmodelData->pdocs[m]->words[n]; 246 | int sentiLab = this->l[m][n]; 247 | int topic = this->z[m][n]; 248 | 249 | nlzw[sentiLab][topic][w]++; 250 | nlz[sentiLab][topic]++; 251 | } 252 | } 253 | 254 | return 0; 255 | } 256 | 257 | 258 | 259 | int Inference::init_inf() { 260 | 261 | pmodelData = new dataset(); 262 | pnewData = new dataset(result_dir); 263 | 264 | if(read_model_setting(model_dir + model_name + ".others")) { 265 | printf("Throw exception in read_model_setting()!\n"); 266 | return 1; 267 | } 268 | 269 | // load model 270 | if(load_model(model_dir + model_name + ".tassign")) { 271 | printf("Throw exception in load_model()!\n"); 272 | return 1; 273 | } 274 | 275 | // *** TODO move the function to dataset class 276 | if(read_newData(data_dir + datasetFile)) { 277 | printf("Throw exception in read_newData()!\n"); 278 | return 1; 279 | } 280 | 281 | if(init_parameters()) { 282 | printf("Throw exception in init_parameters()!\n"); 283 | return 1; 284 | } 285 | 286 | printf("Testset statistics: \n"); 287 | printf("numDocs = %d\n", pnewData->numDocs); 288 | printf("vocabSize = %d\n", pnewData->vocabSize); 289 | printf("numNew_word = %d\n", (int)(pnewData->newWords.size())); 290 | 291 | // init inf 292 | int sentiLab, topic; 293 | new_z.resize(pnewData->numDocs); 294 | new_l.resize(pnewData->numDocs); 295 | 296 | for (int m = 0; m < pnewData->numDocs; m++) { 297 | int docLength = pnewData->_pdocs[m]->length; 298 | new_z[m].resize(docLength); 299 | new_l[m].resize(docLength); 300 | for (int t = 0; t < docLength; t++) { 301 | if (pnewData->_pdocs[m]->words[t] < 0) { 302 | printf("ERROR! 
word token %d has index smaller than 0 in doc[%d][%d]\n", pnewData->_pdocs[m]->words[t], m, t); 303 | return 1; 304 | } 305 | 306 | // sample sentiment label 307 | if ((pnewData->pdocs[m]->priorSentiLabels[t] > -1) && (pnewData->pdocs[m]->priorSentiLabels[t] < numSentiLabs)) { 308 | sentiLab = pnewData->pdocs[m]->priorSentiLabels[t]; // incorporate prior information into the model 309 | } 310 | else { 311 | sentiLab = (int)(((double)rand() / RAND_MAX) * numSentiLabs); 312 | if (sentiLab == numSentiLabs) sentiLab = numSentiLabs -1; 313 | } 314 | new_l[m][t] = sentiLab; 315 | 316 | // sample topic label 317 | topic = (int)(((double)rand() / RAND_MAX) * numTopics); 318 | if (topic == numTopics) topic = numTopics - 1; 319 | new_z[m][t] = topic; 320 | 321 | new_nd[m]++; 322 | new_ndl[m][sentiLab]++; 323 | new_ndlz[m][sentiLab][topic]++; 324 | new_nlzw[sentiLab][topic][pnewData->_pdocs[m]->words[t]]++; 325 | new_nlz[sentiLab][topic]++; 326 | } 327 | } 328 | 329 | return 0; 330 | } 331 | 332 | 333 | int Inference::inference() { 334 | 335 | int sentiLab, topic; 336 | printf("Sampling %d iterations for inference!\n", niters); 337 | 338 | liter = 0; 339 | for (liter = 1; liter <= niters; liter++) { 340 | printf("Iteration %d ...\n", liter); 341 | for (int m = 0; m < pnewData->numDocs; m++) { 342 | for (int n = 0; n < pnewData->pdocs[m]->length; n++) { 343 | inf_sampling(m, n, sentiLab, topic); 344 | new_l[m][n] = sentiLab; 345 | new_z[m][n] = topic; 346 | } 347 | } 348 | 349 | if (savestep > 0 && liter % savestep == 0) { 350 | if (liter == niters) break; 351 | 352 | printf("Saving the model at iteration %d ...\n", liter); 353 | compute_newpi(); 354 | compute_newtheta(); 355 | compute_newphi(); 356 | save_model(model_name + "_" + putils->generate_model_name(liter)); 357 | } 358 | } 359 | 360 | printf("Gibbs sampling completed!\n"); 361 | printf("Saving the final model!\n"); 362 | compute_newpi(); 363 | compute_newtheta(); 364 | compute_newphi(); 365 | save_model(model_name + "_" + putils->generate_model_name(-1)); 366 | 367 | return 0; 368 | } 369 | 370 | 371 | int Inference::init_parameters() { 372 | 373 | // model counts 374 | new_p.resize(numSentiLabs); 375 | for (int l = 0; l < numSentiLabs; l++) { 376 | new_p[l].resize(numTopics); 377 | for (int z = 0; z < numTopics; z++) { 378 | new_p[l][z] = 0.0; 379 | } 380 | } 381 | 382 | new_nd.resize(pnewData->numDocs); 383 | for (int m = 0; m < pnewData->numDocs; m++) { 384 | new_nd[m] = 0; 385 | } 386 | 387 | new_ndl.resize(pnewData->numDocs); 388 | for (int m = 0; m < pnewData->numDocs; m++) { 389 | new_ndl[m].resize(numSentiLabs); 390 | for (int l = 0; l < numSentiLabs; l++) { 391 | new_ndl[m][l] = 0; 392 | } 393 | } 394 | 395 | new_ndlz.resize(pnewData->numDocs); 396 | for (int m = 0; m < pnewData->numDocs; m++) { 397 | new_ndlz[m].resize(numSentiLabs); 398 | for (int l = 0; l < numSentiLabs; l++) { 399 | new_ndlz[m][l].resize(numTopics); 400 | for (int z = 0; z < numTopics; z++) { 401 | new_ndlz[m][l][z] = 0; 402 | } 403 | } 404 | } 405 | 406 | new_nlzw.resize(numSentiLabs); 407 | for (int l = 0; l < numSentiLabs; l++) { 408 | new_nlzw[l].resize(numTopics); 409 | for (int z = 0; z < numTopics; z++) { 410 | new_nlzw[l][z].resize(pnewData->vocabSize); 411 | for (int r = 0; r < pnewData->vocabSize; r++) { 412 | new_nlzw[l][z][r] = 0; 413 | } 414 | } 415 | } 416 | 417 | new_nlz.resize(numSentiLabs); 418 | for (int l = 0; l < numSentiLabs; l++) { 419 | new_nlz[l].resize(numTopics); 420 | for (int z = 0; z < numTopics; z++) { 421 | new_nlz[l][z] = 0; 422 | 
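// [Annotation added for clarity] Count bookkeeping used by the sampler:
// new_nd[m] -- number of tokens in test document m;
// new_ndl[m][l] -- tokens in document m assigned sentiment label l;
// new_ndlz[m][l][z] -- tokens in document m assigned label l and topic z;
// new_nlzw[l][z][w] -- times test-vocabulary word w is assigned (l, z);
// new_nlz[l][z] -- total test tokens assigned (l, z).
// The trained-model counterparts nlzw/nlz are restored by load_model() and
// stay fixed during inference.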
} 423 | } 424 | 425 | // model parameters 426 | newpi_dl.resize(pnewData->numDocs); 427 | for (int m = 0; m < pnewData->numDocs; m++) { 428 | newpi_dl[m].resize(numSentiLabs); 429 | } 430 | 431 | newtheta_dlz.resize(pnewData->numDocs); 432 | for (int m = 0; m < pnewData->numDocs; m++) { 433 | newtheta_dlz[m].resize(numSentiLabs); 434 | for (int l = 0; l < numSentiLabs; l++) { 435 | newtheta_dlz[m][l].resize(numTopics); 436 | } 437 | } 438 | 439 | newphi_lzw.resize(numSentiLabs); 440 | for (int l = 0; l < numSentiLabs; l++) { 441 | newphi_lzw[l].resize(numTopics); 442 | for (int z = 0; z < numTopics; z++) { 443 | newphi_lzw[l][z].resize(pnewData->vocabSize); 444 | } 445 | } 446 | 447 | // hyperparameters 448 | _alpha = (double)pnewData->aveDocLength * 0.05 / (double)(numSentiLabs * numTopics); 449 | alpha_lz.resize(numSentiLabs); 450 | alphaSum_l.resize(numSentiLabs); 451 | for (int l = 0; l < numSentiLabs; l++) { 452 | alphaSum_l[l] = 0.0; 453 | alpha_lz[l].resize(numTopics); 454 | for (int z = 0; z < numTopics; z++) { 455 | alpha_lz[l][z] = _alpha; 456 | alphaSum_l[l] += alpha_lz[l][z]; 457 | } 458 | } 459 | 460 | // gamma 461 | gamma_l.resize(numSentiLabs); 462 | gammaSum = 0.0; 463 | for (int l = 0; l < numSentiLabs; l++) { 464 | gamma_l[l] = (double)pnewData->aveDocLength * 0.05 / (double)numSentiLabs; 465 | gammaSum += gamma_l[l]; 466 | } 467 | 468 | //beta 469 | if (_beta <= 0) { 470 | _beta = 0.01; 471 | } 472 | beta_lzw.resize(numSentiLabs); 473 | betaSum_lz.resize(numSentiLabs); 474 | for (int l = 0; l < numSentiLabs; l++) { 475 | beta_lzw[l].resize(numTopics); 476 | betaSum_lz[l].resize(numTopics); 477 | for (int z = 0; z < numTopics; z++) { 478 | beta_lzw[l][z].resize(pnewData->vocabSize); 479 | for (int r = 0; r < pnewData->vocabSize; r++) { 480 | beta_lzw[l][z][r] = _beta; 481 | betaSum_lz[l][z] += beta_lzw[l][z][r]; 482 | } 483 | } 484 | } 485 | 486 | // incorporate prior knowledge into beta 487 | if (sentiLexFile != "") { 488 | // word prior transformation matrix 489 | lambda_lw.resize(numSentiLabs); 490 | for (int l = 0; l < numSentiLabs; l++) { 491 | lambda_lw[l].resize(pnewData->vocabSize); 492 | for (int r = 0; r < pnewData->vocabSize; r++) 493 | lambda_lw[l][r] = 1; 494 | } 495 | // MUST init beta_lzw first before incorporating prior information into beta 496 | this->prior2beta(); 497 | } 498 | 499 | return 0; 500 | } 501 | 502 | 503 | 504 | int Inference::inf_sampling(int m, int n, int& sentiLab, int& topic) { 505 | sentiLab = new_l[m][n]; 506 | topic = new_z[m][n]; 507 | int w = pnewData->pdocs[m]->words[n]; // word index of trained model 508 | int _w = pnewData->_pdocs[m]->words[n]; // word index of test data 509 | double u; 510 | 511 | new_nd[m]--; 512 | new_ndl[m][sentiLab]--; 513 | new_ndlz[m][sentiLab][topic]--; 514 | new_nlzw[sentiLab][topic][_w]--; 515 | new_nlz[sentiLab][topic]--; 516 | 517 | // do multinomial sampling via cumulative method 518 | for (int l = 0; l < numSentiLabs; l++) { 519 | for (int k = 0; k < numTopics; k++) { 520 | new_p[l][k] = (nlzw[l][k][w] + new_nlzw[l][k][_w] + beta_lzw[l][k][_w]) / (nlz[l][k] + new_nlz[l][k] + betaSum_lz[l][k]) * 521 | (new_ndlz[m][l][k] + alpha_lz[l][k]) / (new_ndl[m][l] + alphaSum_l[l]) * 522 | (new_ndl[m][l] + gamma_l[l]) / (new_nd[m] + gammaSum); 523 | } 524 | } 525 | 526 | // accumulate multinomial parameters 527 | for (int l = 0; l < numSentiLabs; l++) { 528 | for (int k = 0; k < numTopics; k++) { 529 | if (k==0) { 530 | if (l==0) continue; 531 | else new_p[l][k] += new_p[l-1][numTopics-1]; 532 | } 533 | else 
new_p[l][k] += new_p[l][k-1]; 534 | } 535 | } 536 | // scaled sample because of unnormalized new_p[] 537 | u = ((double)rand() / RAND_MAX) * new_p[numSentiLabs-1][numTopics-1]; 538 | 539 | for (sentiLab = 0; sentiLab < numSentiLabs; sentiLab++) { 540 | for (topic = 0; topic < numTopics; topic++) { 541 | if (new_p[sentiLab][topic] > u) { 542 | goto stop; 543 | } 544 | } 545 | } 546 | 547 | stop: 548 | if (sentiLab == numSentiLabs) sentiLab = numSentiLabs - 1; // the max value of label is (S - 1) 549 | if (topic == numTopics) topic = numTopics - 1; 550 | 551 | // add estimated 'z' and 'l' to counts 552 | new_nd[m]++; 553 | new_ndl[m][sentiLab]++; 554 | new_ndlz[m][sentiLab][topic]++; 555 | new_nlzw[sentiLab][topic][_w]++; 556 | new_nlz[sentiLab][topic]++; 557 | 558 | return 0; 559 | } 560 | 561 | 562 | int Inference::read_newData(string filename) { 563 | 564 | mapword2id::iterator it; 565 | map<int, int>::iterator _it; 566 | mapword2atr::iterator itatr; 567 | mapword2prior::iterator sentiIt; 568 | string line; 569 | char buff[BUFF_SIZE_LONG]; 570 | 571 | pmodelData->read_wordmap(model_dir + "wordmap.txt", word2id); // map word2id 572 | pmodelData->read_wordmap(model_dir + "wordmap.txt", id2word); // map id2word 573 | 574 | // read sentiment lexicon file 575 | if (sentiLexFile != "") { 576 | if (pnewData->read_senti_lexicon((sentiLexFile).c_str())) { 577 | printf("Error! Cannot read sentiFile %s!\n", sentiLexFile.c_str()); 578 | delete pnewData; 579 | return 1; 580 | } 581 | else { 582 | this->sentiLex = pnewData->sentiLex; 583 | } 584 | } 585 | 586 | if (word2id.size() <= 0) { 587 | printf("Invalid wordmap!\n"); 588 | return 1; 589 | } 590 | 591 | // read test data 592 | ifstream fin; 593 | fin.open(filename.c_str(), ifstream::in); 594 | if(!fin) { 595 | printf("Cannot read file %s!\n", filename.c_str()); 596 | return 1; 597 | } 598 | 599 | vector<string> docs; 600 | int numDocs = 0; 601 | 602 | while (fin.getline(buff, BUFF_SIZE_LONG)) { 603 | line = buff; 604 | if(!line.empty()) { 605 | docs.push_back(line); 606 | numDocs++; 607 | } 608 | } 609 | fin.close(); 610 | 611 | if (numDocs <= 0) { 612 | printf("Error! No documents found in test data %s.\n", filename.c_str()); 613 | return 1; 614 | } 615 | 616 | pnewData->numDocs = numDocs; 617 | // allocate memory 618 | if (pnewData->pdocs) { 619 | pnewData->deallocate(); 620 | } 621 | else { 622 | pnewData->pdocs = new document*[pnewData->numDocs]; 623 | } 624 | pnewData->_pdocs = new document*[pnewData->numDocs]; 625 | pnewData->vocabSize = 0; 626 | pnewData->corpusSize = 0; 627 | 628 | // process each document 629 | for (int i = 0; i < pnewData->numDocs; i++) { 630 | line = docs.at(i); 631 | strtokenizer strtok(line, " \t\r\n"); // \t\r\n are separators 632 | int docLength = strtok.count_tokens(); 633 | if (docLength <= 0) { 634 | printf("Invalid (empty) document!\n"); 635 | pnewData->deallocate(); 636 | pnewData->numDocs = 0; 637 | pnewData->vocabSize = 0; 638 | return 1; 639 | } 640 | 641 | pnewData->corpusSize += docLength - 1; 642 | vector<int> doc; 643 | vector<int> _doc; 644 | vector<int> priorSentiLabels; 645 | 646 | // process each token in the document 647 | for (int k = 1; k < docLength; k++) { 648 | it = word2id.find(strtok.token(k)); 649 | if (it == word2id.end()) { 650 | pnewData->newWords.push_back(strtok.token(k).c_str()); 651 | // word not found, i.e., word unseen in training data 652 | // do anything? 
(future decision) 653 | } 654 | else { 655 | int _id; 656 | _it = id2_id.find(it->second); 657 | if (_it == id2_id.end()) { 658 | _id = id2_id.size(); 659 | id2_id.insert(pair<int, int>(it->second, _id)); 660 | _id2id.insert(pair<int, int>(_id, it->second)); 661 | } 662 | else { 663 | _id = _it->second; 664 | } 665 | 666 | doc.push_back(it->second); 667 | _doc.push_back(_id); 668 | 669 | // 'word2atr' is specific to new/test dataset 670 | itatr = word2atr.find(strtok.token(k).c_str()); 671 | int priorSenti = -1; 672 | if (itatr == word2atr.end()) { 673 | sentiIt = sentiLex.find(strtok.token(k).c_str()); // check whether the word token can be found in the sentiment lexicon 674 | if (sentiIt != sentiLex.end()) { 675 | priorSenti = sentiIt->second.id; 676 | } 677 | // encode sentiment info into word2atr 678 | Word_atr temp = {_id, priorSenti}; // vocabulary index; word polarity 679 | word2atr.insert(pair<string, Word_atr>(strtok.token(k), temp)); 680 | priorSentiLabels.push_back(priorSenti); 681 | } 682 | else { 683 | priorSentiLabels.push_back(itatr->second.polarity); 684 | } 685 | 686 | } 687 | } 688 | 689 | // allocate memory for new doc 690 | document * pdoc = new document(doc, priorSentiLabels, "inference"); 691 | document * _pdoc = new document(_doc, priorSentiLabels, "inference"); 692 | 693 | pdoc->docID = strtok.token(0).c_str(); 694 | _pdoc->docID = strtok.token(0).c_str(); 695 | 696 | // add new doc 697 | pnewData->add_doc(pdoc, i); 698 | pnewData->_add_doc(_pdoc, i); 699 | } 700 | 701 | // update number of new words 702 | pnewData->vocabSize = id2_id.size(); 703 | pnewData->aveDocLength = pnewData->corpusSize / pnewData->numDocs; 704 | this->newNumDocs = pnewData->numDocs; 705 | this->newVocabSize = pnewData->vocabSize; 706 | 707 | if (newVocabSize == 0) { 708 | printf("ERROR! Vocabulary size of test set after removing unseen words is 0! \n"); 709 | return 1; 710 | } 711 | 712 | return 0; 713 | } 714 | 715 | 716 | void Inference::compute_newpi() { 717 | 718 | for (int m = 0; m < pnewData->numDocs; m++) { 719 | for (int l = 0; l < numSentiLabs; l++) { 720 | newpi_dl[m][l] = (new_ndl[m][l] + gamma_l[l]) / (new_nd[m] + gammaSum); 721 | } 722 | } 723 | } 724 | 725 | 726 | void Inference::compute_newtheta() { 727 | 728 | for (int m = 0; m < pnewData->numDocs; m++) { 729 | for (int l = 0; l < numSentiLabs; l++) { 730 | for (int z = 0; z < numTopics; z++) { 731 | newtheta_dlz[m][l][z] = (new_ndlz[m][l][z] + alpha_lz[l][z]) / (new_ndl[m][l] + alphaSum_l[l]); 732 | } 733 | } 734 | } 735 | } 736 | 737 | 738 | int Inference::compute_newphi() { 739 | map<int, int>::iterator it; 740 | 741 | for (int l = 0; l < numSentiLabs; l++) { 742 | for (int z = 0; z < numTopics; z++) { 743 | for(int r = 0; r < pnewData->vocabSize; r++) { 744 | it = _id2id.find(r); 745 | if (it != _id2id.end()) { 746 | newphi_lzw[l][z][r] = (nlzw[l][z][it->second] + new_nlzw[l][z][r] + beta_lzw[l][z][r]) / (nlz[l][z] + new_nlz[l][z] + betaSum_lz[l][z]); 747 | } 748 | else { 749 | printf("Error! 
Cannot find word [%d] !\n", r); 750 | return 1; 751 | } 752 | } 753 | } 754 | } 755 | 756 | return 0; 757 | } 758 | 759 | 760 | int Inference::save_model(string model_name) { 761 | 762 | if (save_model_newtassign(result_dir + model_name + tassign_suffix)) 763 | return 1; 764 | 765 | if (save_model_newtwords(result_dir + model_name + twords_suffix)) 766 | return 1; 767 | 768 | if (save_model_newpi_dl(result_dir + model_name + pi_suffix)) 769 | return 1; 770 | 771 | if (save_model_newtheta_dlz(result_dir + model_name + theta_suffix)) 772 | return 1; 773 | 774 | if (save_model_newphi_lzw(result_dir + model_name + phi_suffix)) 775 | return 1; 776 | 777 | if (save_model_newothers(result_dir + model_name + others_suffix)) 778 | return 1; 779 | 780 | return 0; 781 | } 782 | 783 | 784 | 785 | int Inference::save_model_newpi_dl(string filename) { 786 | 787 | FILE * fout = fopen(filename.c_str(), "w"); 788 | if (!fout) { 789 | printf("Cannot save file %s!\n", filename.c_str()); 790 | return 1; 791 | } 792 | 793 | for (int m = 0; m < pnewData->numDocs; m++) { 794 | fprintf(fout, "d_%d %s ", m, pnewData->pdocs[m]->docID.c_str()); 795 | for (int l = 0; l < numSentiLabs; l++) { 796 | fprintf(fout, "%f ", newpi_dl[m][l]); 797 | } 798 | fprintf(fout, "\n"); 799 | } 800 | 801 | fclose(fout); 802 | return 0; 803 | } 804 | 805 | 806 | 807 | int Inference::save_model_newtheta_dlz(string filename) { 808 | 809 | FILE * fout = fopen(filename.c_str(), "w"); 810 | if (!fout) { 811 | printf("Cannot save file %s!\n", filename.c_str()); 812 | return 1; 813 | } 814 | 815 | for(int m = 0; m < pnewData->numDocs; m++) { 816 | fprintf(fout, "Document %d\n", m); 817 | for (int l = 0; l < numSentiLabs; l++) { 818 | for (int z = 0; z < numTopics; z++) { 819 | fprintf(fout, "%f ", newtheta_dlz[m][l][z]); 820 | } 821 | fprintf(fout, "\n"); 822 | } 823 | } 824 | 825 | fclose(fout); 826 | return 0; 827 | } 828 | 829 | 830 | 831 | int Inference::save_model_newphi_lzw(string filename) { 832 | 833 | FILE * fout = fopen(filename.c_str(), "w"); 834 | if (!fout) { 835 | printf("Cannot save file %s!\n", filename.c_str()); 836 | return 1; 837 | } 838 | 839 | for (int l = 0; l < numSentiLabs; l++) { 840 | for (int z = 0; z < numTopics; z++) { 841 | fprintf(fout, "Label:%d Topic:%d\n", l, z); 842 | for (int r = 0; r < pnewData->vocabSize; r++) { 843 | fprintf(fout, "%.15f ", newphi_lzw[l][z][r]); 844 | } 845 | fprintf(fout, "\n"); 846 | } 847 | } 848 | 849 | fclose(fout); 850 | return 0; 851 | } 852 | 853 | 854 | int Inference::save_model_newothers(string filename) { 855 | 856 | FILE * fout = fopen(filename.c_str(), "w"); 857 | if (!fout) { 858 | printf("Cannot save file %s!\n", filename.c_str()); 859 | return 1; 860 | } 861 | 862 | fprintf(fout, "model_dir=%s\n", model_dir.c_str()); 863 | fprintf(fout, "model_name=%s\n", model_name.c_str()); 864 | fprintf(fout, "data_dir=%s\n", data_dir.c_str()); 865 | fprintf(fout, "datasetFile=%s\n", datasetFile.c_str()); 866 | fprintf(fout, "result_dir=%s\n", result_dir.c_str()); 867 | fprintf(fout, "niters-inf=%d\n", niters); 868 | fprintf(fout, "savestep-inf=%d\n", savestep); 869 | 870 | fprintf(fout, "\n------------------ Testset ** %s ** statistics ----------------------\n", datasetFile.c_str()); 871 | fprintf(fout, "newNumDocs=%d\n", pnewData->numDocs); 872 | fprintf(fout, "newCorpusSize=%d\n", pnewData->corpusSize); 873 | fprintf(fout, "newVocabSize=%d\n", pnewData->vocabSize); 874 | fprintf(fout, "numNewWords=%d\n", (int)(pnewData->newWords.size())); 875 | fprintf(fout, "aveDocLength=%d\n", 
pnewData->aveDocLength); 876 | fprintf(fout, "\n------------------ Loaded model settings ----------------------\n"); 877 | fprintf(fout, "numSentiLabs=%d\n", numSentiLabs); 878 | fprintf(fout, "numTopics=%d\n", numTopics); 879 | fprintf(fout, "numDocs=%d\n", pmodelData->numDocs); 880 | fprintf(fout, "corpusSize=%d\n", pmodelData->corpusSize); 881 | fprintf(fout, "vocabSize=%d\n", pmodelData->vocabSize); 882 | 883 | fclose(fout); 884 | return 0; 885 | } 886 | 887 | 888 | 889 | int Inference::save_model_newtwords(string filename) { 890 | 891 | mapid2word::iterator it; // typedef map<int, string> mapid2word 892 | map<int, int>::iterator _it; 893 | 894 | FILE * fout = fopen(filename.c_str(), "w"); 895 | if (!fout) { 896 | printf("Cannot save file %s!\n", filename.c_str()); 897 | return 1; 898 | } 899 | 900 | if (twords > pnewData->vocabSize) { 901 | twords = pnewData->vocabSize; 902 | } 903 | 904 | for (int l = 0; l < numSentiLabs; l++) { 905 | fprintf(fout, "Label %dth\n", l); 906 | for (int k = 0; k < numTopics; k++) { 907 | vector<pair<int, double> > words_probs; 908 | pair<int, double> word_prob; 909 | for (int w = 0; w < pnewData->vocabSize; w++) { 910 | word_prob.first = w; 911 | word_prob.second = newphi_lzw[l][k][w]; 912 | words_probs.push_back(word_prob); 913 | } 914 | 915 | std::sort(words_probs.begin(), words_probs.end(), sort_pred()); 916 | 917 | fprintf(fout, "Topic %dth:\n", k); 918 | for (int i = 0; i < twords; i++) { 919 | _it = _id2id.find(words_probs[i].first); 920 | if (_it == _id2id.end()) { 921 | continue; 922 | } 923 | it = id2word.find(_it->second); 924 | if (it != id2word.end()) { 925 | fprintf(fout, "\t%s %f\n", (it->second).c_str(), words_probs[i].second); 926 | } 927 | } 928 | } // for topic 929 | } // for label 930 | 931 | fclose(fout); 932 | return 0; 933 | } 934 | 935 | 936 | int Inference::save_model_newtassign(string filename) { 937 | 938 | FILE * fout = fopen(filename.c_str(), "w"); 939 | if (!fout) { 940 | printf("Cannot save file %s!\n", filename.c_str()); 941 | return 1; 942 | } 943 | 944 | for (int m = 0; m < pnewData->numDocs; m++) { 945 | fprintf(fout, "%s \n", pnewData->pdocs[m]->docID.c_str()); 946 | for (int n = 0; n < pnewData->pdocs[m]->length; n++) { 947 | fprintf(fout, "%d:%d:%d ", pnewData->pdocs[m]->words[n], new_l[m][n], new_z[m][n]); // wordID:sentiLab:topic 948 | } 949 | fprintf(fout, "\n"); 950 | } 951 | 952 | fclose(fout); 953 | return 0; 954 | } 955 | 956 | 957 | 958 | int Inference::prior2beta() { 959 | mapword2atr::iterator wordIt; 960 | mapword2prior::iterator sentiIt; 961 | 962 | for (sentiIt = sentiLex.begin(); sentiIt != sentiLex.end(); sentiIt++) { 963 | wordIt = word2atr.find(sentiIt->first); 964 | if (wordIt != word2atr.end()) { 965 | for (int j = 0; j < numSentiLabs; j++) { 966 | lambda_lw[j][wordIt->second.id] = sentiIt->second.labDist[j]; 967 | } 968 | } 969 | } 970 | 971 | // Note: the 'r' index of lambda_lw[j][r] corresponds to the vocabulary ID. 972 | // Therefore the correct prior information can be incorporated into the corresponding word count nlzw, 973 | // as 'w' also corresponds to the vocabulary ID. 974 | for (int l = 0; l < numSentiLabs; l++) { 975 | for (int z = 0; z < numTopics; z++) { 976 | betaSum_lz[l][z] = 0.0; 977 | for (int r = 0; r < pnewData->vocabSize; r++) { 978 | beta_lzw[l][z][r] = beta_lzw[l][z][r] * lambda_lw[l][r]; 979 | betaSum_lz[l][z] += beta_lzw[l][z][r]; 980 | } 981 | } 982 | } 983 | 984 | return 0; 985 | } 986 | --------------------------------------------------------------------------------
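Note on the Gibbs sampling step in Inference::inf_sampling() above (an added annotation, restating the code in standard JST notation; all counts exclude the current token). The unnormalized weight accumulated in new_p[l][k] is the product of three ratios:

  P(l_t = l, z_t = k \mid \cdot) \;\propto\;
    \frac{n_{lkw} + n^{new}_{lk\tilde{w}} + \beta_{lk\tilde{w}}}
         {n_{lk} + n^{new}_{lk} + \sum_{r} \beta_{lkr}}
    \cdot
    \frac{n^{new}_{mlk} + \alpha_{lk}}
         {n^{new}_{ml} + \sum_{z} \alpha_{lz}}
    \cdot
    \frac{n^{new}_{ml} + \gamma_{l}}
         {n^{new}_{m} + \sum_{l'} \gamma_{l'}}

Here w and \tilde{w} are the token's training- and test-vocabulary indices (the code's w and _w); n_{lkw} and n_{lk} are the trained-model counts nlzw and nlz; and the n^{new} terms are the test-corpus counts new_nlzw, new_nlz, new_ndlz, new_ndl and new_nd. The nested loop then converts new_p into a cumulative table, a uniform draw u scaled by the total mass selects the (sentiment, topic) pair, and the counts are re-incremented with the sampled assignment.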