├── .gitignore ├── .gitmodules ├── README.md ├── doc ├── ctm-model.png └── ctm.png ├── scripts ├── ctm-test ├── ctm-train ├── make_test_data.sh ├── remote.py ├── shuffle_lines.py ├── split_test_data.py └── splitcorpus.py └── src ├── Makefile ├── Unigram_Model ├── Dump │ └── dumpJson.cc ├── Formatter │ ├── Unigram_Test_Data_Formatter.cc │ ├── Unigram_Test_Data_Formatter.h │ ├── Unigram_Train_Data_Formatter.cc │ └── Unigram_Train_Data_Formatter.h ├── Merge │ ├── Merge_Dictionaries.cc │ ├── Merge_Topic_Counts.cc │ └── Merge_Topic_Counts_Without_Server.cc ├── Perplexity │ └── perplexity.cc ├── Server │ ├── Unigram_Model_Server_Helper.cc │ └── Unigram_Model_Server_Helper.h └── TopicLearner │ ├── BIT.cc │ ├── BIT.h │ ├── Hadoop_Checkpointer.cc │ ├── Hadoop_Checkpointer.h │ ├── Local_Checkpointer.cc │ ├── Local_Checkpointer.h │ ├── TopKList.cc │ ├── TopKList.h │ ├── TopicCounts.cc │ ├── TopicCounts.h │ ├── TypeTopicCounts.cc │ ├── TypeTopicCounts.h │ ├── Unigram_Model.cc │ ├── Unigram_Model.h │ ├── Unigram_Model_Synchronized_Training_Builder.cc │ ├── Unigram_Model_Synchronized_Training_Builder.h │ ├── Unigram_Model_Synchronizer_Helper.cc │ ├── Unigram_Model_Synchronizer_Helper.h │ ├── Unigram_Model_Tester.cc │ ├── Unigram_Model_Tester.h │ ├── Unigram_Model_Testing_Builder.cc │ ├── Unigram_Model_Testing_Builder.h │ ├── Unigram_Model_Trainer.cc │ ├── Unigram_Model_Trainer.h │ ├── Unigram_Model_Training_Builder.cc │ ├── Unigram_Model_Training_Builder.h │ ├── atomic.hpp │ ├── atomic_ops.hpp │ ├── eff_small_map.cc │ ├── eff_small_map.h │ ├── sampler.cc │ └── sampler.h ├── architecture.h ├── commons ├── Client.h ├── Context.cc ├── Context.h ├── DocumentReader.cc ├── DocumentReader.h ├── DocumentWriter.cc ├── DocumentWriter.h ├── Formatter │ ├── Controller.cc │ ├── Data_Formatter.h │ └── FormatData_flags_define.h ├── LDAUtil.cc ├── LDAUtil.h ├── MVGaussian.cc ├── MVGaussian.h ├── MVGaussian2.cc ├── MVGaussian2.h ├── Server │ ├── DM_Server.cc │ ├── DM_Server.h │ ├── 
DistributedMap.ice │ ├── Hashmap_Array.h │ └── Server_Helper.h ├── TopicLearner │ ├── Checkpointer.h │ ├── Controller.cc │ ├── DM_Client.cc │ ├── DM_Client.h │ ├── Dirichlet.cc │ ├── Dirichlet.h │ ├── Execution_Strategy.h │ ├── Filter_Accumulator.cc │ ├── Filter_Accumulator.h │ ├── Filter_EtaSampler.cc │ ├── Filter_EtaSampler.h │ ├── Filter_Eval.cc │ ├── Filter_Eval.h │ ├── Filter_Optimizer.cc │ ├── Filter_Optimizer.h │ ├── Filter_Reader.cc │ ├── Filter_Reader.h │ ├── Filter_Sampler.cc │ ├── Filter_Sampler.h │ ├── Filter_Tester.cc │ ├── Filter_Tester.h │ ├── Filter_Updater.cc │ ├── Filter_Updater.h │ ├── Filter_Writer.cc │ ├── Filter_Writer.h │ ├── GenericTopKList.h │ ├── Main_flags_define.h │ ├── Model.h │ ├── Model_Builder.h │ ├── Model_Director.cc │ ├── Model_Director.h │ ├── Model_Refiner.h │ ├── PThread_Pipeline.cc │ ├── PThread_Pipeline.h │ ├── Parameter.cc │ ├── Parameter.h │ ├── Pipeline.h │ ├── Synchronized_Training_Execution_Strategy.cc │ ├── Synchronized_Training_Execution_Strategy.h │ ├── Synchronizer.cc │ ├── Synchronizer.h │ ├── Synchronizer_Helper.h │ ├── TBB_Pipeline.cc │ ├── TBB_Pipeline.h │ ├── Testing_Execution_Strategy.cc │ ├── Testing_Execution_Strategy.h │ ├── Training_Execution_Strategy.cc │ └── Training_Execution_Strategy.h ├── WordIndexDictionary.cc ├── WordIndexDictionary.h ├── ap.cc ├── ap.h ├── apvt.h ├── cholesky.cc ├── cholesky.h ├── comparator.cc ├── comparator.h ├── constants.h ├── defs.h ├── matrixIO.h ├── polyagamma.cc ├── polyagamma.h ├── random.cc ├── random.h ├── spdinverse.cc ├── spdinverse.h ├── testUtil │ ├── compare.cc │ ├── compare.h │ ├── remote.cc │ ├── remote.h │ ├── testgenerator.cc │ └── testgenerator.h ├── types.h ├── utils.cc └── utils.h ├── document.proto ├── mainpage.h ├── multi_mcahine_usage.h ├── single_machine_usage.h └── usage.h /.gitignore: -------------------------------------------------------------------------------- 1 | # C files 2 | *.o 3 | *.d 4 | 5 | # Binary 6 | bin 7 | 8 | # Third-party 9 | 
#third_party 10 | #!third_party/Makefile 11 | #!third_party/third_party/third_party.mk 12 | 13 | # Generated source 14 | document.pb.* 15 | DistributedMap.* 16 | scripts/setenv* 17 | 18 | # Python 19 | *.pyc 20 | 21 | # vim 22 | *.swp 23 | *.swo 24 | 25 | # emacs 26 | *~ 27 | 28 | # work 29 | work 30 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party"] 2 | path = third_party 3 | url = https://github.com/cjf00000/toolkit.git 4 | -------------------------------------------------------------------------------- /doc/ctm-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjf00000/ScaCTM/0503f892b4bf441a22b8b2dbcb122dc4343c14f4/doc/ctm-model.png -------------------------------------------------------------------------------- /doc/ctm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjf00000/ScaCTM/0503f892b4bf441a22b8b2dbcb122dc4343c14f4/doc/ctm.png -------------------------------------------------------------------------------- /scripts/make_test_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt "4" ] 4 | then 5 | echo "Usage: make_test_data " 6 | exit 0 7 | fi 8 | 9 | full_data=$1 10 | train_data=$2 11 | test_observed_data=$3 12 | proportion=$4 13 | test_heldout_data=$5 14 | intermediate_data=${train_data}.temp 15 | full_test_data=${train_data}.temp2 16 | script_root=`dirname $0` 17 | 18 | # First we shuffle the training data 19 | echo "Shuffling" 20 | python $script_root/shuffle_lines.py $full_data > $intermediate_data 21 | 22 | # We make train and test set 23 | ndocuments=`cat $intermediate_data | wc -l` 24 | 25 | ntrain=`printf "%.0f" $(echo "$ndocuments * $proportion" | bc)` 26 | ntest=`printf 
"%.0f" $(echo "$ndocuments - $ntrain" | bc)` 27 | 28 | head -n $ntrain $intermediate_data > $train_data 29 | tail -n $ntest $intermediate_data > $full_test_data 30 | 31 | if [ -n "$5" ] 32 | then 33 | # Then we make observed and heldout datasets 34 | echo "Splitting" 35 | python $script_root/split_test_data.py ${full_test_data} $test_observed_data $test_heldout_data 36 | else 37 | cp $full_test_data $test_observed_data 38 | fi 39 | 40 | # Clean up 41 | rm $intermediate_data 42 | rm $full_test_data 43 | -------------------------------------------------------------------------------- /scripts/shuffle_lines.py: -------------------------------------------------------------------------------- 1 | import sys, random 2 | 3 | fin = open(sys.argv[1], 'r') 4 | data = fin.readlines() 5 | fin.close() 6 | 7 | random.shuffle(data) 8 | 9 | sys.stdout.writelines(data) 10 | -------------------------------------------------------------------------------- /scripts/split_test_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # make a observed and heldout set 3 | # split each document into 2 halfs 4 | 5 | import sys, random 6 | 7 | f_full = sys.argv[1] 8 | f_observed = sys.argv[2] 9 | f_heldout = sys.argv[3] 10 | 11 | data = map(lambda x : x.replace('\r', '').replace('\n', '').split(), open(f_full).readlines()) 12 | 13 | observed = open(f_observed, 'w') 14 | heldout = open(f_heldout, 'w') 15 | 16 | for document in data: 17 | id = document[:2] 18 | document = document[2:] 19 | random.shuffle(document) 20 | 21 | observed_doc_len = len(document) / 2 22 | document_observed = document[:observed_doc_len] 23 | document_heldout = document[observed_doc_len:] 24 | 25 | observed.write(" ".join(id + document_observed) + "\n") 26 | heldout.write(" ".join(id + document_heldout) + "\n") 27 | 28 | observed.close() 29 | heldout.close() 30 | -------------------------------------------------------------------------------- 
"""Split a corpus file into m contiguous parts.

Part i (0-based) is written to "<corpus_file>.<i>".  Every part holds
ceil(n / m) lines except the trailing ones, which may be shorter or empty
when n is not a multiple of m.

Usage: python splitcorpus.py <corpus_file> <num_parts>
"""

import sys


def chunk_lines(lines, m):
    """Return a list of m consecutive slices of `lines`.

    The chunk size is ceil(len(lines) / m), computed with floor division so
    the result is an int on both Python 2 and Python 3.  Trailing chunks are
    empty when there are fewer lines than parts.
    """
    n = len(lines)
    per_part = (n - 1) // m + 1
    return [lines[i * per_part:min((i + 1) * per_part, n)] for i in range(m)]


def main(argv):
    """Parse the command line, read the corpus and write the m part files."""
    if len(argv) != 3:
        sys.stderr.write('Usage: python %s <corpus_file> <num_parts>\n' % argv[0])
        return 1

    m = int(argv[2])
    if m < 1:
        # m == 0 used to die with ZeroDivisionError, m < 0 silently did nothing.
        sys.stderr.write('num_parts must be a positive integer\n')
        return 1

    with open(argv[1]) as corpus:
        lines = corpus.readlines()

    for i, part in enumerate(chunk_lines(lines, m)):
        with open('%s.%d' % (argv[1], i), 'w') as out:
            out.writelines(part)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
17 | ******************************************************************************/ 18 | /* 19 | * Unigram_Test_Data_Formatter.cpp 20 | * 21 | * Created on: 28-Jan-2011 22 | * 23 | */ 24 | 25 | #include "Unigram_Test_Data_Formatter.h" 26 | #include "commons/Context.h" 27 | 28 | Unigram_Test_Data_Formatter::Unigram_Test_Data_Formatter() { 29 | Context& context = Context::get_instance(); 30 | string dumpfile = context.get_string("dumpfile"); 31 | LOG(WARNING) << "Initializing Dictionary from " << dumpfile; 32 | _dict.initialize_from_dump(dumpfile); 33 | LOG(WARNING) << "Num of unique words: " << _dict.get_num_words(); 34 | } 35 | 36 | Unigram_Test_Data_Formatter::~Unigram_Test_Data_Formatter() { 37 | } 38 | 39 | int Unigram_Test_Data_Formatter::insert_word_to_dict(string word) { 40 | int windex = _dict.get_index(word); 41 | return windex; 42 | } 43 | -------------------------------------------------------------------------------- /src/Unigram_Model/Formatter/Unigram_Test_Data_Formatter.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * Unigram_Test_Data_Formatter.h 20 | * 21 | * Created on: 28-Jan-2011 22 | * 23 | */ 24 | 25 | #ifndef UNIGRAM_TEST_DATA_FORMATTER_H_ 26 | #define UNIGRAM_TEST_DATA_FORMATTER_H_ 27 | 28 | #include "Unigram_Train_Data_Formatter.h" 29 | 30 | class Unigram_Test_Data_Formatter: public Unigram_Train_Data_Formatter { 31 | public: 32 | Unigram_Test_Data_Formatter(); 33 | virtual ~Unigram_Test_Data_Formatter(); 34 | 35 | protected: 36 | int insert_word_to_dict(std::string word); 37 | 38 | private: 39 | WordIndexDictionary _global_dict; 40 | }; 41 | 42 | #endif /* UNIGRAM_TEST_DATA_FORMATTER_H_ */ 43 | -------------------------------------------------------------------------------- /src/Unigram_Model/Formatter/Unigram_Train_Data_Formatter.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
/**
 * Data formatter for unigram training data, derived from Data_Formatter.
 *
 * Only the declaration is visible here; the notes below describe what the
 * signatures show and mark everything else as something to confirm.
 */
class Unigram_Train_Data_Formatter: public Data_Formatter {
public:
	Unigram_Train_Data_Formatter();
	virtual ~Unigram_Train_Data_Formatter();

	// Entry point of the formatter.
	// NOTE(review): presumably reads the input and writes documents through
	// _doc_writer - confirm in the .cc file.
	void format();

	// Returns a reference to a WordIndexDictionary (presumably _dict).
	WordIndexDictionary& get_dictionary();

	// Presumably returns _num_docs - TODO confirm.
	int get_num_docs();

	// Presumably returns _num_words_in_all_docs - TODO confirm.
	int get_total_num_words();

protected:
	// Virtual hook mapping a word to an int index; derived formatters may
	// replace it.
	virtual int insert_word_to_dict(std::string word);
	// Fills `wdoc` from the stream `inp`; meaning of the int result is not
	// visible here.
	int read_from_inp(LDA::unigram_document & wdoc, std::istream& inp);

protected:
	WordIndexDictionary _dict;
	int _num_docs, _num_words_in_all_docs;
	// NOTE(review): boost::unordered_set needs a template argument; it seems
	// to have been lost from this copy (presumably <std::string>) - confirm
	// against the real header.
	boost::unordered_set _stopWords;
	std::ifstream _in;
	DocumentWriter* _doc_writer;
};
13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Merge_Dictionaries.cpp 20 | * 21 | * Created on: 31-Jan-2011 22 | * 23 | */ 24 | 25 | #include "gflags/gflags.h" 26 | #include "glog/logging.h" 27 | #include "WordIndexDictionary.h" 28 | 29 | using namespace std; 30 | 31 | DEFINE_string(dumpprefix,"specify", "The dump of the dictionary to be used instead of creating afresh"); 32 | DEFINE_int32(dictionaries,-1,"The number of dictionaries present"); 33 | DEFINE_string(outputprefix,"lda", "A prefix that will be used with all files output by the program"); 34 | 35 | int main(int argc, char *argv[]) { 36 | google::InitGoogleLogging(argv[0]); 37 | google::InstallFailureSignalHandler(); 38 | 39 | string 40 | usage( 41 | "\n\n This program takes a set of dictionary dumps and merges into one single dictionary\n" 42 | "It is used to merge a set of local dictionaries found in the current directory in the multi-machine version\n\n"); 43 | 44 | string 45 | cmd_usage( 46 | (string) argv[0] 47 | + " --dumpprefix= --dictionaries= [--outputprefix=]"); 48 | 49 | usage += cmd_usage; 50 | 51 | google::SetUsageMessage(usage); 52 | for (int i = 0; i < argc; i++) { 53 | std::string arg = argv[i]; 54 | if (strcmp(argv[i], "--help") == 0) { 55 | google::ShowUsageWithFlagsRestrict(argv[0], "Merge_Dictionaries"); 56 | exit(0); 57 | } 58 | } 59 | 60 | google::ParseCommandLineFlags(&argc, &argv, true); 61 | 62 | google::SetCommandLineOptionWithMode("minloglevel", "1", 63 | google::SET_FLAG_IF_DEFAULT); 64 | google::SetCommandLineOptionWithMode("stderrthreshold", "1", 65 | google::SET_FLAG_IF_DEFAULT); 66 | 67 | const char* pwd = google::StringFromEnv("PWD", "/tmp"); 68 | google::SetCommandLineOptionWithMode("log_dir", pwd, 69 | 
google::SET_FLAG_IF_DEFAULT); 70 | 71 | LOG(WARNING) 72 | << "----------------------------------------------------------------------"; 73 | LOG(WARNING) << "Log files are being stored at " << pwd 74 | << "/formatter.*"; 75 | LOG(WARNING) 76 | << "----------------------------------------------------------------------"; 77 | 78 | string flagsInp = google::CommandlineFlagsIntoString(); 79 | 80 | LOG(INFO) << flagsInp << endl; 81 | 82 | if (google::GetCommandLineFlagInfoOrDie("dictionaries").is_default 83 | || google::GetCommandLineFlagInfoOrDie("dumpprefix").is_default) 84 | LOG(FATAL) 85 | << "You need to specify the number of dictionaries to merge" 86 | << " and the dictionary prefix via dumpprefix flag"; 87 | 88 | WordIndexDictionary* dict = new WordIndexDictionary(); 89 | string dict_dump = FLAGS_outputprefix + ".dict" + ".dump"; 90 | 91 | LOG(WARNING) << "Initializing merge of " << FLAGS_dictionaries 92 | << " dictionaries using " << FLAGS_dumpprefix << " as prefix"; 93 | dict->initialize_from_dumps(FLAGS_dumpprefix, FLAGS_dictionaries); 94 | LOG(WARNING) << "Dictionaries merged"; 95 | 96 | LOG(WARNING) << "Num of unique words: " << dict->get_num_words(); 97 | LOG(WARNING) 98 | << "Dumping global dictionary for later use by learnTopics into " 99 | << dict_dump << endl; 100 | dict->dump(dict_dump); 101 | LOG(WARNING) << "Finished dictionary dump" << endl; 102 | 103 | delete dict; 104 | } 105 | -------------------------------------------------------------------------------- /src/Unigram_Model/Perplexity/perplexity.cc: -------------------------------------------------------------------------------- 1 | #include "WordIndexDictionary.h" 2 | #include "Unigram_Model/TopicLearner/TypeTopicCounts.h" 3 | #include "Unigram_Model/TopicLearner/TopicCounts.h" 4 | #include "document.pb.h" 5 | #include "DocumentReader.h" 6 | #include 7 | #include 8 | #include 9 | #include "matrixIO.h" 10 | #include "types.h" 11 | #include "utils.h" 12 | #include 
"commons/TopicLearner/Parameter.h" 13 | using namespace std; 14 | 15 | int main(int argc, char **argv) 16 | { 17 | if (argc != 8) 18 | { 19 | printf("Usage: \n"); 20 | return 0; 21 | } 22 | 23 | int num_topics = atoi(argv[1]); 24 | string dictionary_path = string(argv[2]); 25 | string ttc_path = string(argv[3]); 26 | string document_file_prefix = string(argv[4]); 27 | string topic_file_prefix = string(argv[5]); 28 | double beta = atof(argv[6]); 29 | int num = atoi(argv[7]); 30 | 31 | // Read dictionary 32 | WordIndexDictionary dict; 33 | dict.initialize_from_dump(dictionary_path); 34 | // dict.print(); 35 | int num_words = dict.get_num_words(); 36 | 37 | TypeTopicCounts ttc(num_words, num_topics); 38 | ttc.initialize_from_dump(ttc_path, &dict); 39 | // ttc.print(); 40 | 41 | cerr << num_topics << " Topics." << endl; 42 | cerr << num_words << " Words." << endl; 43 | cerr << "Beta = " << beta << endl; 44 | 45 | double **phi_wk = alloc2D(num_words, num_topics); 46 | atomic *tokens_per_topic = new atomic [num_topics]; //Storage for n(t) 47 | ttc.get_counts(tokens_per_topic); 48 | for (int w=0; wread(&wdoc) != -1) 86 | { 87 | double doc_likelihood = 0; 88 | 89 | trdr->read(&tdoc); 90 | 91 | int num_words_in_document = wdoc.body_size(); 92 | total_words += num_words_in_document; 93 | 94 | for (int k=0; k 3 | #include 4 | 5 | BIT::BIT(int size) 6 | { 7 | _size = size; 8 | _capacity = 1; 9 | while (_capacity < _size) 10 | { 11 | _capacity *= 2; 12 | } 13 | 14 | // Start from 0 15 | __bit = new double[_capacity]; 16 | values = new double[_capacity]; 17 | 18 | // Start from 1 19 | _bit = __bit - 1; 20 | _values = values - 1; 21 | memset(__bit, 0, sizeof(double)*_capacity); 22 | memset(values, 0, sizeof(double)*_capacity); 23 | sum = 0; 24 | } 25 | 26 | BIT::~BIT() 27 | { 28 | delete[] values; 29 | delete[] __bit; 30 | } 31 | 32 | void BIT::initialize(double *values) 33 | { 34 | // From 0 35 | for (int i=0; i<_size; ++i) 36 | { 37 | this->values[i] = values[i]; 38 | for (int 
j=i+1; j<=_capacity; j+=(j&-j)) 39 | _bit[j] += values[i]; 40 | } 41 | 42 | for (int i=_size; i<_capacity; ++i) 43 | { 44 | this->values[i] = 0; 45 | } 46 | 47 | sum = _bit[_capacity]; 48 | } 49 | 50 | -------------------------------------------------------------------------------- /src/Unigram_Model/TopicLearner/BIT.h: -------------------------------------------------------------------------------- 1 | #ifndef __BIT_H 2 | #define __BIT_H 3 | 4 | #include 5 | 6 | // binary indexed tree 7 | class BIT 8 | { 9 | public: 10 | BIT(int size); 11 | ~BIT(); 12 | 13 | void initialize(double *values); 14 | 15 | void update(int index, double new_value) 16 | { 17 | int n = index + 1; 18 | 19 | double delta = new_value - _values[n]; 20 | _values[n] = new_value; 21 | 22 | sum += delta; 23 | // TODO update BIT, O(1)-->O(log n) 24 | } 25 | 26 | int upper_bound_sum(double x) const 27 | { 28 | // TODO search BIT, O(n)-->O(log n) 29 | double sum = 0; 30 | for (int i=1; i<=_size; ++i) 31 | { 32 | sum += _values[i]; 33 | 34 | if (sum >= x) 35 | return i-1; 36 | } 37 | 38 | return _size-1; 39 | } 40 | 41 | double *values; 42 | double sum; 43 | 44 | private: 45 | double *_bit; 46 | double *_values; 47 | double *__bit; 48 | int _size; 49 | int _capacity; 50 | }; 51 | 52 | #endif 53 | -------------------------------------------------------------------------------- /src/Unigram_Model/TopicLearner/Hadoop_Checkpointer.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Hadoop_Checkpointer.cpp 20 | * 21 | * Created on: 08-Mar-2011 22 | * 23 | */ 24 | 25 | #include "Hadoop_Checkpointer.h" 26 | #include "Context.h" 27 | #include "glog/logging.h" 28 | 29 | Hadoop_Checkpointer::Hadoop_Checkpointer() { 30 | } 31 | 32 | Hadoop_Checkpointer::~Hadoop_Checkpointer() { 33 | } 34 | 35 | void Hadoop_Checkpointer::checkpoint() { 36 | Context& context = Context::get_instance(); 37 | using namespace std; 38 | string chkpt_dir = context.get_string("chkptdir"); 39 | string chkpt_file = context.get_string("chkpt_file"); 40 | string input_t = context.get_string("input_t"); 41 | 42 | { 43 | //copy the topic assignments to chkpt_dir 44 | string dest_file(chkpt_dir + "/" + input_t); 45 | string top_cp_cmd( 46 | "hadoop dfs -put " + input_t + " " + dest_file + ".tmp"); 47 | LOG(WARNING) << top_cp_cmd; 48 | system(top_cp_cmd.c_str()); 49 | string rm_cmd("hadoop dfs -rmr " + dest_file); 50 | LOG(WARNING) << rm_cmd; 51 | system(rm_cmd.c_str()); 52 | string mv_cmd("hadoop dfs -mv " + dest_file + ".tmp " + dest_file); 53 | LOG(WARNING) << mv_cmd; 54 | system(mv_cmd.c_str()); 55 | } 56 | 57 | { 58 | //copy the metadata to chkpt_dir 59 | string dest_file(chkpt_dir + "/" + chkpt_file); 60 | string chk_cp_cmd( 61 | "hadoop dfs -put " + chkpt_file + " " + dest_file + ".tmp"); 62 | system(chk_cp_cmd.c_str()); 63 | string rm_cmd("hadoop 
dfs -rmr " + dest_file); 64 | LOG(WARNING) << rm_cmd; 65 | system(rm_cmd.c_str()); 66 | string mv_cmd("hadoop dfs -mv " + dest_file + ".tmp " + dest_file); 67 | LOG(WARNING) << mv_cmd; 68 | system(mv_cmd.c_str()); 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /src/Unigram_Model/TopicLearner/Hadoop_Checkpointer.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * Hadoop_Checkpointer.h 20 | * 21 | * Created on: 08-Mar-2011 22 | * 23 | */ 24 | 25 | #ifndef HADOOP_CHECKPOINTER_H_ 26 | #define HADOOP_CHECKPOINTER_H_ 27 | 28 | #include "Local_Checkpointer.h" 29 | 30 | class Hadoop_Checkpointer: public Local_Checkpointer { 31 | public: 32 | Hadoop_Checkpointer(); 33 | virtual ~Hadoop_Checkpointer(); 34 | 35 | void checkpoint(); 36 | 37 | }; 38 | 39 | #endif /* HADOOP_CHECKPOINTER_H_ */ 40 | -------------------------------------------------------------------------------- /src/Unigram_Model/TopicLearner/Local_Checkpointer.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * Local_Checkpointer.cpp 20 | * 21 | * Created on: 07-Mar-2011 22 | * 23 | */ 24 | #include "Context.h" 25 | #include "Local_Checkpointer.h" 26 | #include 27 | #include 28 | 29 | Local_Checkpointer::Local_Checkpointer() { 30 | } 31 | 32 | Local_Checkpointer::~Local_Checkpointer() { 33 | } 34 | 35 | void Local_Checkpointer::save_metadata(std::string& state) { 36 | std::string chkpt_file = Context::get_instance().get_string("chkpt_file"); 37 | std::string new_chkpt_file = chkpt_file + ".new"; 38 | std::ofstream chkpt_output(new_chkpt_file.c_str()); 39 | chkpt_output << state; 40 | chkpt_output.flush(); 41 | chkpt_output.close(); 42 | std::string cmd = "mv " + new_chkpt_file + " " + chkpt_file; 43 | system(cmd.c_str()); 44 | } 45 | 46 | std::string Local_Checkpointer::load_metadata() { 47 | std::string chkpt_file = Context::get_instance().get_string("chkpt_file"); 48 | std::ifstream chkpt_input(chkpt_file.c_str()); 49 | //std::string empty_str(""); 50 | if (!chkpt_input.is_open()) 51 | return ""; 52 | std::stringbuf state; 53 | chkpt_input.get(state); 54 | return state.str(); 55 | } 56 | 57 | void Local_Checkpointer::checkpoint() { 58 | } 59 | -------------------------------------------------------------------------------- /src/Unigram_Model/TopicLearner/Local_Checkpointer.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
/**
 * Checkpointer that keeps its metadata in a local file.
 * All three operations are virtual so that derived checkpointers
 * (e.g. Hadoop_Checkpointer) can replace individual steps.
 */
class Local_Checkpointer: public Checkpointer {
public:
	Local_Checkpointer();
	virtual ~Local_Checkpointer();

	// Stores `state` as the current checkpoint metadata.
	virtual void save_metadata(std::string& state);
	// Returns the stored metadata as a string.
	virtual std::string load_metadata();
	// Checkpoint step; the implementation in Local_Checkpointer.cc is empty.
	virtual void checkpoint();
};
13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * TopKList.cpp 20 | * 21 | * Created on: 14 May, 2009 22 | * 23 | */ 24 | 25 | #include "TopKList.h" 26 | #include 27 | #include 28 | 29 | /** 30 | * Constructs a TopKList supporting 31 | * top K_ elements 32 | */ 33 | TopKList::TopKList(int K_) { 34 | K = K_; 35 | num_elements = 0; 36 | array = (cnt_word_t*) calloc(K, sizeof(cnt_word_t)); 37 | min = -1; 38 | } 39 | 40 | TopKList::~TopKList() { 41 | free(array); 42 | } 43 | 44 | /** 45 | * inserts into the sorted array. Find pos by binary search, move the elements 46 | * to the right and insert at pos. The only assumption is that all elements are 47 | * filled and elements smaller than min are not sent as arguments. Since this 48 | * is the usual case, tried to keep it short and optimized (no conditional checks) 49 | */ 50 | void TopKList::insert_into_array(const cnt_word_t& cnt_word) { 51 | packed_t* src = std::upper_bound(&array[0].cnt_top, &array[K].cnt_top, 52 | cnt_word.cnt_top, cnt_cmp); 53 | memmove(src + 1, src, (&array[K - 1].cnt_top - src) * sizeof(packed_t)); 54 | *src = cnt_word.cnt_top; 55 | } 56 | 57 | void TopKList::insert_word(const cnt_word_t& cnt_word) { 58 | LOG_IF(FATAL,K==0)<< "Cannot work with Top0Lists. Quitting"; 59 | 60 | if(num_elementsmin) { 77 | insert_into_array(cnt_word); 78 | min = array[K-1].choose.cnt; 79 | } 80 | } 81 | } 82 | 83 | /** 84 | * Used for test convenience. Just to check the 85 | * array is always sorted and has at most K elems. 
86 | * Should return true at any point of time 87 | */ 88 | bool TopKList::is_sorted() { 89 | for (int i = 0; i < num_elements - 1; i++) { 90 | // if(!cnt_cmp(array[i].cnt_top,array[i+1].cnt_top)) 91 | if (!(array[i].cnt_top > array[i + 1].cnt_top)) 92 | return false; 93 | } 94 | return true && (num_elements <= K); 95 | } 96 | 97 | /** 98 | * Returns the max element which is the first elem 99 | */ 100 | cnt_word_t TopKList::get_max() { 101 | return array[0]; 102 | } 103 | 104 | /** 105 | * Iterator methods. Return an iterator to the 106 | * beginning of the list 107 | */ 108 | TopKList::iterator TopKList::get_beg() { 109 | return &array[0]; 110 | } 111 | 112 | /** 113 | * Iterator methods. Return an iterator to the 114 | * end of the list 115 | */ 116 | TopKList::iterator TopKList::get_end() { 117 | return &array[num_elements]; 118 | } 119 | 120 | /** 121 | * Print the list to log(INFO) 122 | */ 123 | void TopKList::print() { 124 | std::stringstream ss; 125 | for (int i = 0; i < num_elements; i++) { 126 | ss << "(" << array[i].choose.top << ", " << array[i].choose.cnt << ") "; 127 | } 128 | ss << std::endl; 129 | LOG(INFO) << ss.str(); 130 | ss.str(""); 131 | } 132 | 133 | /** 134 | * Clears the list 135 | */ 136 | void TopKList::clear() { 137 | memset(array, 0, num_elements * sizeof(cnt_word_t)); 138 | num_elements = 0; 139 | min = -1; 140 | } 141 | -------------------------------------------------------------------------------- /src/Unigram_Model/TopicLearner/TopKList.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * TopKList.h 20 | * A list that maintains top K records 21 | * Each record is assumed to be the 22 | * cnt_topic_t union. The abstraction 23 | * of the record is (word,cnt) pair 24 | * So top K records indicate K unique 25 | * records having the highest count 26 | * 27 | * The main assumption is that the 28 | * records inserted should be unique 29 | * wrt to the words. Same word with 30 | * different counts should not be 31 | * inserted. Results are undefined 32 | * 33 | * Created on: 14 May, 2009 34 | * 35 | */ 36 | 37 | #ifndef TOPKLIST_H_ 38 | #define TOPKLIST_H_ 39 | #include "constants.h" 40 | #include 41 | #include "comparator.h" 42 | 43 | typedef cnt_topic_t cnt_word_t; 44 | 45 | class TopKList { 46 | public: 47 | TopKList(int K_); 48 | virtual ~TopKList(); 49 | //Assumes that the words inserted are unique and doesn't check this explicitly 50 | void insert_word(const cnt_word_t& cnt_word); 51 | bool is_sorted(); 52 | cnt_word_t get_max(); 53 | typedef cnt_topic_t* iterator; 54 | iterator get_beg(); 55 | iterator get_end(); 56 | void print(); 57 | void clear(); 58 | 59 | private: 60 | int K, //The val K in top K 61 | num_elements; //The actual num of elements stored; can be lesser than K 62 | 63 | cnt_word_t* array; //The array which stores the elements. Its always kept sorted 64 | 65 | int32_t min; //The min count. 
Used to chk if an incoming element should be inserted 66 | 67 | void insert_into_array(const cnt_word_t& cnt_word); 68 | }; 69 | 70 | #endif /* TOPKLIST_H_ */ 71 | -------------------------------------------------------------------------------- /src/Unigram_Model/TopicLearner/TopicCounts.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * TopicCounts.h 20 | * The main structure that stores the sparse 21 | * topic counts vector. It uses a blocked 22 | * allocation scheme in that memory is allocated 23 | * and deallocated in blocks. Blocked allocation 24 | * is used to reduce heap fragmentation. 25 | * 26 | * The elements are of the type cnt_topic_t 27 | * which packs both the topic and the count 28 | * into a single 64bit integer. 29 | * 30 | * Supports a map-view of the sparse vector, 31 | * but you don't have order information in 32 | * this view. So use with discretion. 33 | * 34 | * Its a struct for speed. So everything is 35 | * public. 
Don't mess with the items, length 36 | * and origLenth if you aren't sure about 37 | * what you are doing. Use the methods 38 | * provided making sure the assumptions hold 39 | * 40 | * Created on: 14 Oct, 2009 41 | * 42 | */ 43 | 44 | #ifndef TOPICCOUNTS_H_ 45 | #define TOPICCOUNTS_H_ 46 | 47 | #include "types.h" 48 | #include 49 | #include "boost/unordered_map.hpp" 50 | #include "tbb/atomic.h" 51 | #include "comparator.h" 52 | #include 53 | 54 | class simple_map; 55 | 56 | typedef struct TopicCounts { 57 | cnt_topic_t* items; //The actual array holding data 58 | //which is dynamically reshaped 59 | //This is always sorted in descending 60 | //order and only has non-zero entries 61 | //Methods do not check for uniqueness 62 | //but assume uniqueness. 63 | //Responsiblity of user to ensure 64 | //uniqueness 65 | 66 | topic_t length; //The number of elements stored in the array 67 | 68 | topic_t origLength; //The size of the allocated array 69 | 70 | std::vector vec_items; 71 | 72 | //mapped_vec tmp_map; //A temporary map to hasten some 73 | //internal update operations 74 | int frequency; 75 | bool QUIT; 76 | 77 | /***** Init *****/ 78 | TopicCounts(); 79 | TopicCounts(int length); 80 | TopicCounts(cnt_topic_t* it, int len); 81 | TopicCounts(const std::string& counts); 82 | void init(cnt_topic_t* it, int len); 83 | void init(const std::string& counts); 84 | ~TopicCounts(); 85 | void assign(int length, bool setLen = true); 86 | void setLength(int length_); 87 | /***** Init *****/ 88 | /***** Getters *****/ 89 | void findOldnNew(topic_t oldTopic, topic_t newTopic, topic_t** oldTop, 90 | topic_t** newTop); 91 | int get_frequency(); 92 | cnt_t get_counts(topic_t topic); 93 | int convertTo(mapped_vec& map, int mult = 1) const; 94 | void convertTo(simple_map& map, int mult = 1) const; 95 | void convertTo(std::string& counts) const; 96 | int convertTo_d(mapped_vec& map, double mult) const; 97 | //int computeFrequency(); 98 | //bool matchFrequency(); 99 | /***** Getters 
*****/ 100 | 101 | /***** Setters *****/ 102 | bool findAndIncrement(topic_t topic); 103 | bool findAndDecrement(topic_t topic); 104 | //void setFrequency(); 105 | 106 | void compact(); 107 | 108 | void addNewTop(topic_t topic, cnt_t count = 1); 109 | void addNewTopAftChk(topic_t topic, cnt_t count = 1); 110 | void upd_count(mapped_vec& delta, tbb::atomic* t = NULL); 111 | 112 | //Convenience updates 113 | void operator+=(TopicCounts& inp); 114 | void operator-=(TopicCounts& inp); 115 | 116 | void removeOldTop(topic_t ind, cnt_topic_t& ct); 117 | 118 | void replace(TopicCounts& tc); 119 | 120 | void decrement(topic_t ind, topic_t** newTop); 121 | void increment(topic_t ind); 122 | /***** Setters *****/ 123 | 124 | //Test & Debug 125 | std::string print(); 126 | TopicCounts(mapped_vec& map); 127 | bool equal(const TopicCounts& expected); 128 | } topicCounts; 129 | 130 | #endif /* TOPICCOUNTS_H_ */ 131 | -------------------------------------------------------------------------------- /src/Unigram_Model/TopicLearner/Unigram_Model.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * Unigram_Model.h 20 | * 21 | * Created on: 05-Jan-2011 22 | * 23 | */ 24 | 25 | #ifndef UNIGRAM_MODEL_H_ 26 | #define UNIGRAM_MODEL_H_ 27 | 28 | #include "TopicLearner/Model.h" 29 | #include 30 | #include "TopicLearner/Parameter.h" 31 | #include "TypeTopicCounts.h" 32 | #include "TopicLearner/GenericTopKList.h" 33 | #include 34 | 35 | using namespace std; 36 | 37 | class Unigram_Model: public Model { 38 | public: 39 | const static int ALPHA = 1; 40 | const static int BETA = 2; 41 | const static int MU_0 = 3; 42 | const static int RHO = 4; 43 | const static int WISHART = 5; 44 | const static int MU = 6; 45 | const static int COV = 7; 46 | 47 | public: 48 | Unigram_Model(int, int); 49 | virtual ~Unigram_Model(); 50 | 51 | Parameter& get_parameter(int); 52 | arma::mat& get_mat(int); 53 | void set_parameter(int, Parameter&); 54 | void set_parameter(int, arma::mat&); 55 | int& get_kappa(); 56 | 57 | TypeTopicCounts& get_ttc(); 58 | 59 | double get_eval(); 60 | 61 | bool save(); 62 | 63 | void write_statistics(WordIndexDictionary&); 64 | 65 | private: 66 | TypeTopicCounts* _ttc; 67 | param _alpha, _beta; 68 | 69 | arma::mat _mu_0; 70 | param _rho; 71 | arma::mat _wishart; 72 | int _kappa; 73 | 74 | arma::mat _mu; 75 | arma::mat _cov; 76 | 77 | typedef GenericTopKList topK_word_prop_t; 78 | topK_word_prop_t** top_words_per_topic; 79 | bool _top_words_empty; 80 | 81 | int _num_words; 82 | int _num_topics; 83 | }; 84 | 85 | #endif /* UNIGRAM_MODEL_H_ */ 86 | -------------------------------------------------------------------------------- /src/Unigram_Model/TopicLearner/Unigram_Model_Synchronized_Training_Builder.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Unigram_Model_Synchronized_Training_Builder.cpp 20 | * 21 | * Created on: 14-Jan-2011 22 | * 23 | */ 24 | 25 | #include "Unigram_Model_Synchronized_Training_Builder.h" 26 | #include "Unigram_Model_Synchronizer_Helper.h" 27 | #include "TopicLearner/Synchronized_Training_Execution_Strategy.h" 28 | #include "Hadoop_Checkpointer.h" 29 | #include "Context.h" 30 | 31 | Unigram_Model_Synchronized_Training_Builder::Unigram_Model_Synchronized_Training_Builder() { 32 | _sync_helper = NULL; 33 | } 34 | 35 | Unigram_Model_Synchronized_Training_Builder::~Unigram_Model_Synchronized_Training_Builder() { 36 | if (_sync_helper) 37 | delete _sync_helper; 38 | } 39 | 40 | Execution_Strategy& Unigram_Model_Synchronized_Training_Builder::create_execution_strategy( 41 | Pipeline& pipeline) { 42 | _sync_helper = new Unigram_Model_Synchronizer_Helper(_model->get_ttc(), 43 | get_dict()); 44 | string chkpt_dir = Context::get_instance().get_string("chkptdir"); 45 | if (chkpt_dir == "") 46 | LOG(FATAL) << "HDFS Checkpoint Directory missing"; 47 | _checkpointer = new Hadoop_Checkpointer(); 48 | _strategy = new Synchronized_Training_Execution_Strategy(pipeline, *_model, 49 | *_checkpointer, *_sync_helper); 50 | return 
*_strategy; 51 | } 52 | -------------------------------------------------------------------------------- /src/Unigram_Model/TopicLearner/Unigram_Model_Synchronized_Training_Builder.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * Unigram_Model_Synchronized_Training_Builder.h 20 | * 21 | * Created on: 14-Jan-2011 22 | * 23 | */ 24 | 25 | #ifndef UNIGRAM_MODEL_SYNCHRONIZED_TRAINING_BUILDER_H_ 26 | #define UNIGRAM_MODEL_SYNCHRONIZED_TRAINING_BUILDER_H_ 27 | 28 | #include "Unigram_Model_Training_Builder.h" 29 | #include "TopicLearner/Synchronizer_Helper.h" 30 | 31 | class Unigram_Model_Synchronized_Training_Builder: public Unigram_Model_Training_Builder { 32 | public: 33 | Unigram_Model_Synchronized_Training_Builder(); 34 | virtual ~Unigram_Model_Synchronized_Training_Builder(); 35 | 36 | virtual Execution_Strategy& create_execution_strategy(Pipeline&); 37 | 38 | protected: 39 | Synchronizer_Helper* _sync_helper; 40 | }; 41 | 42 | #endif /* UNIGRAM_MODEL_SYNCHRONIZED_TRAINING_BUILDER_H_ */ 43 | -------------------------------------------------------------------------------- /src/Unigram_Model/TopicLearner/Unigram_Model_Synchronizer_Helper.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * Unigram_Model_Synchronizer_Helper.h 20 | * 21 | * Created on: 13-Jan-2011 22 | * 23 | */ 24 | 25 | #ifndef UNIGRAM_MODEL_SYNCHRONIZER_HELPER_H_ 26 | #define UNIGRAM_MODEL_SYNCHRONIZER_HELPER_H_ 27 | 28 | #include "TopicLearner/Synchronizer_Helper.h" 29 | #include "TypeTopicCounts.h" 30 | #include "WordIndexDictionary.h" 31 | #include "Client.h" 32 | 33 | class Unigram_Model_Synchronizer_Helper: public Synchronizer_Helper { 34 | public: 35 | Unigram_Model_Synchronizer_Helper(TypeTopicCounts& ttc, 36 | WordIndexDictionary& dict); 37 | virtual ~Unigram_Model_Synchronizer_Helper(); 38 | 39 | void initialize(); 40 | bool has_to_synchronize(); 41 | void reset_to_synchronize(); 42 | void synchronize(); 43 | void end_putNget(const std::string& word, const std::string& counts); 44 | 45 | private: 46 | TypeTopicCounts& _ttc; 47 | WordIndexDictionary& _dict; 48 | 49 | TypeTopicCounts* _prev_ttc; 50 | Client* _client; 51 | 52 | int _num_words, _num_topics; 53 | 54 | int _cur_word; 55 | 56 | topicCounts tc_client, tc_old; 57 | }; 58 | 59 | #endif /* UNIGRAM_MODEL_SYNCHRONIZER_HELPER_H_ */ 60 | -------------------------------------------------------------------------------- /src/Unigram_Model/TopicLearner/Unigram_Model_Tester.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Unigram_Model_Tester.h 20 | * 21 | * Created on: 06-Jan-2011 22 | * 23 | */ 24 | 25 | #ifndef UNIGRAM_MODEL_TESTER_H_ 26 | #define UNIGRAM_MODEL_TESTER_H_ 27 | 28 | #include "TopicLearner/Model_Refiner.h" 29 | #include "TypeTopicCounts.h" 30 | #include "TopicLearner/Parameter.h" 31 | #include "DocumentReader.h" 32 | #include "DocumentWriter.h" 33 | #include 34 | #include 35 | #include "WordIndexDictionary.h" 36 | #include 37 | 38 | using namespace boost; 39 | using namespace std; 40 | 41 | class Random; 42 | 43 | class Unigram_Model_Tester: public Model_Refiner { 44 | public: 45 | Unigram_Model_Tester(TypeTopicCounts&, Parameter&, Parameter&, arma::mat&, arma::mat&, 46 | WordIndexDictionary&, bool no_init = false); 47 | virtual ~Unigram_Model_Tester(); 48 | 49 | google::protobuf::Message* allocate_document_buffer(size_t); 50 | void deallocate_document_buffer(google::protobuf::Message*); 51 | google::protobuf::Message* get_nth_document( 52 | google::protobuf::Message* docs, size_t n); 53 | void* read(google::protobuf::Message&); 54 | void* sample(void*); 55 | void* update(void*); 56 | void* sampleEta(void*); 57 | void* accumulateEta(void*); 58 | void sampleGauss(); 59 | void* optimize(void*); 60 | void* eval(void*, double&); 61 | void write(void*); 62 | void iteration_done(); 63 | 64 | void* test(void*); 65 | 66 | static long doc_index; 
//Running count of all the documents processed by the optimizer 67 | 68 | private: 69 | void set_up_io(string, string); 70 | void release_io(); 71 | 72 | void sampleEta(LDA::unigram_document& doc, Random *random); 73 | 74 | protected: 75 | TypeTopicCounts& _ttc; 76 | Parameter& _alpha; 77 | Parameter& _beta; 78 | arma::mat& _mu; 79 | arma::mat& _cov; 80 | arma::mat _prec; 81 | bool ignore_old_topic; 82 | int _num_words, _num_topics; 83 | //Reader 84 | DocumentReader *_wdoc_rdr; 85 | DocumentWriter *_tdoc_writer; 86 | }; 87 | 88 | #endif /* UNIGRAM_MODEL_TESTER_H_ */ 89 | -------------------------------------------------------------------------------- /src/Unigram_Model/TopicLearner/Unigram_Model_Testing_Builder.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * Unigram_Model_Builder.cpp 20 | * 21 | * Created on: 06-Jan-2011 22 | * 23 | */ 24 | 25 | #include 26 | #include "Unigram_Model_Testing_Builder.h" 27 | #include "Context.h" 28 | #include "DocumentReader.h" 29 | #include "DocumentWriter.h" 30 | #include "Unigram_Model_Tester.h" 31 | #include "TopicLearner/TBB_Pipeline.h" 32 | #include "TopicLearner/Testing_Execution_Strategy.h" 33 | #include 34 | using namespace arma; 35 | 36 | Unigram_Model_Testing_Builder::Unigram_Model_Testing_Builder() { 37 | } 38 | 39 | Unigram_Model_Testing_Builder::~Unigram_Model_Testing_Builder() { 40 | } 41 | 42 | Model_Refiner& Unigram_Model_Testing_Builder::create_model_refiner() { 43 | int rank; 44 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 45 | ostringstream sout; 46 | sout << "." << rank; 47 | string dotrank = sout.str(); 48 | 49 | Context& context = Context::get_instance(); 50 | string input_prefix = context.get_string("inputprefix"); 51 | string input_w = input_prefix + ".wor" + dotrank; 52 | int num_topics = context.get_int("topics"); 53 | 54 | context.put_string("input_w", input_w); 55 | 56 | //The following is needed so that the create_output 57 | //method which is not overridded works. It uses input_t 58 | //as the topic assignments file and in testing we do 59 | //not need that. It also deletes the output_t file. 
60 | //So we switch back to creating a tmp out file 61 | string input_t = input_prefix + ".top" + dotrank; 62 | context.put_string("input_t", input_t); 63 | string output_t = input_prefix + ".top.tmp" + dotrank; 64 | context.put_string("output_t", output_t); 65 | 66 | string ttc_dumpfile = context.get_string("dumpprefix") + ".ttc.dump"; 67 | string param_dumpfile = context.get_string("dumpprefix") + ".par.dump"; 68 | string global_dict_dump = context.get_string("inputprefix") 69 | + ".dict.dump.global"; 70 | 71 | context.put_string("ttc_dumpfile", ttc_dumpfile); 72 | context.put_string("param_dumpfile", param_dumpfile); 73 | context.put_string("global_dict_dump", global_dict_dump); 74 | 75 | string mudump = context.get_string("dumpprefix") + ".mu" + ".dump"; 76 | string covdump = context.get_string("dumpprefix") + ".cov" + ".dump"; 77 | context.put_string("mudump", mudump); 78 | context.put_string("covdump", covdump); 79 | 80 | TypeTopicCounts& ttc = _model->get_ttc(); 81 | Parameter& alpha = _model->get_parameter(Unigram_Model::ALPHA); 82 | Parameter& beta = _model->get_parameter(Unigram_Model::BETA); 83 | mat& mu = _model->get_mat(Unigram_Model::MU); 84 | mat& cov = _model->get_mat(Unigram_Model::COV); 85 | 86 | _refiner = new Unigram_Model_Tester(ttc, alpha, beta, mu, cov, get_dict()); 87 | return *_refiner; 88 | } 89 | 90 | Pipeline& Unigram_Model_Testing_Builder::create_pipeline( 91 | Model_Refiner& refiner) { 92 | _pipeline = new TBB_Pipeline(refiner); 93 | return *_pipeline; 94 | } 95 | 96 | Execution_Strategy& Unigram_Model_Testing_Builder::create_execution_strategy( 97 | Pipeline& pipeline) { 98 | _strategy = new Testing_Execution_Strategy(pipeline, *_model); 99 | return *_strategy; 100 | } 101 | -------------------------------------------------------------------------------- /src/Unigram_Model/TopicLearner/Unigram_Model_Testing_Builder.h: -------------------------------------------------------------------------------- 1 | 
/******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Unigram_Model_Builder.h 20 | * 21 | * Created on: 06-Jan-2011 22 | * 23 | */ 24 | 25 | #ifndef UNIGRAMMODELTESTINGBUILDER_H_ 26 | #define UNIGRAMMODELTESTINGBUILDER_H_ 27 | 28 | #include "Unigram_Model_Training_Builder.h" 29 | 30 | class Unigram_Model_Testing_Builder: public Unigram_Model_Training_Builder { 31 | public: 32 | Unigram_Model_Testing_Builder(); 33 | virtual ~Unigram_Model_Testing_Builder(); 34 | virtual Model_Refiner& create_model_refiner(); 35 | virtual Execution_Strategy& create_execution_strategy(Pipeline&); 36 | virtual Pipeline& create_pipeline(Model_Refiner&); 37 | }; 38 | 39 | #endif /* UNIGRAMMODELTESTINGBUILDER_H_ */ 40 | -------------------------------------------------------------------------------- /src/Unigram_Model/TopicLearner/Unigram_Model_Training_Builder.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Unigram_Model_Builder.h 20 | * 21 | * Created on: 06-Jan-2011 22 | * 23 | */ 24 | 25 | #ifndef UNIGRAMMODELTRAININGBUILDER_H_ 26 | #define UNIGRAMMODELTRAININGBUILDER_H_ 27 | 28 | #include "TopicLearner/Model_Builder.h" 29 | #include "Unigram_Model.h" 30 | #include "WordIndexDictionary.h" 31 | #include "TopicLearner/Checkpointer.h" 32 | 33 | class Unigram_Model_Training_Builder: public Model_Builder { 34 | public: 35 | Unigram_Model_Training_Builder(); 36 | virtual ~Unigram_Model_Training_Builder(); 37 | virtual Model_Refiner& create_model_refiner(); 38 | virtual Pipeline& create_pipeline(Model_Refiner&); 39 | virtual Execution_Strategy& create_execution_strategy(Pipeline&); 40 | 41 | void create_output(); 42 | Model& get_model(); 43 | 44 | protected: 45 | void init_dict(); 46 | WordIndexDictionary& get_dict(); 47 | void initialize_topics(string, string, int); 48 | 49 | protected: 50 | Unigram_Model* _model; 51 | WordIndexDictionary* _dict; 52 | Model_Refiner* _refiner; 53 | Pipeline* _pipeline; 54 | Checkpointer* _checkpointer; 55 | Execution_Strategy* _strategy; 56 | }; 57 | 58 | #endif /* UNIGRAMMODELTRAININGBUILDER_H_ */ 59 | 
-------------------------------------------------------------------------------- /src/Unigram_Model/TopicLearner/eff_small_map.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * eff_small_map.cpp 20 | * 21 | * Created on: 15-Jul-2010 22 | * 23 | */ 24 | 25 | #include "eff_small_map.h" 26 | 27 | simple_map::simple_map(int N) { 28 | _N = N; 29 | buckets = new topicCounts*[N + 1]; 30 | for (int i = 0; i < N + 1; i++) 31 | buckets[i] = new topicCounts(INIT_TC_SIZE); 32 | } 33 | 34 | simple_map::~simple_map() { 35 | if (buckets != NULL) { 36 | for (int i = 0; i < _N + 1; i++) 37 | delete buckets[i]; 38 | delete[] buckets; 39 | buckets = NULL; 40 | } 41 | } 42 | 43 | cnt_t simple_map::get(topic_t key) { 44 | topicCounts* bucket = buckets[hash(key)]; 45 | return bucket->get_counts(key); 46 | } 47 | 48 | void simple_map::put(topic_t key, cnt_t val) { 49 | topicCounts* bucket = buckets[hash(key)]; 50 | bucket->addNewTop(key, val); 51 | } 52 | 53 | void simple_map::clear() { 54 | for (int i = 0; i < _N + 1; i++) 55 | buckets[i]->length = 0; 56 | } 57 | 58 | std::string simple_map::print() { 59 | std::stringstream ss; 60 | for (int i = 0; i < _N + 1; i++) { 61 | ss << i << ": " << buckets[i]->print() << std::endl; 62 | } 63 | return ss.str(); 64 | } 65 | -------------------------------------------------------------------------------- /src/Unigram_Model/TopicLearner/eff_small_map.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * eff_small_map.h 20 | * 21 | * Created on: 15-Jul-2010 22 | * 23 | */ 24 | 25 | #ifndef EFF_SMALL_MAP_H_ 26 | #define EFF_SMALL_MAP_H_ 27 | 28 | #include "TopicCounts.h" 29 | 30 | class simple_map { 31 | private: 32 | topicCounts** buckets; 33 | int _N; 34 | 35 | public: 36 | simple_map(int N = 0x7F); 37 | virtual ~simple_map(); 38 | 39 | inline int hash(topic_t key) { 40 | return key & _N; 41 | } 42 | 43 | cnt_t get(topic_t key); 44 | void put(topic_t key, cnt_t val); 45 | 46 | std::string print(); 47 | 48 | void clear(); 49 | }; 50 | 51 | #endif /* EFF_SMALL_MAP_H_ */ 52 | -------------------------------------------------------------------------------- /src/Unigram_Model/TopicLearner/sampler.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
******************************************************************************/
/*
 * sampler.h
 *
 * The sampling function which samples the
 * new topic assignment using the collapsed
 * Gibbs Sampler approach. It takes in the
 * topic counts for the current word, the
 * local topic counts for the document being
 * processed and Abar, Bbar & Ccached
 *
 * The procedure is simple. It computes C(t)
 * and Cbar. It then generates a
 * random number using the uniform RNG passed
 * in. It scales the number by Abar + Bbar + Cbar
 * It checks to which probability mass the number
 * generated belongs. Based on that samples the
 * topic responsible for generating this number
 *
 *  Created on: 24 Apr, 2009
 *
 */

#ifndef SAMPLER_H_
#define SAMPLER_H_

#include "constants.h"
// NOTE(review): the header names of the next two #include directives were
// lost in extraction (angle-bracket contents stripped). Presumably they are
// the Boost headers that declare variate_generator and its distribution --
// restore from the upstream file.
#include
#include
#include "tbb/atomic.h"
#include "TopicCounts.h"

using namespace boost;
using namespace tbb;
namespace sampler {

/**
 * Samples a topic for one token (see the file header for the procedure).
 *
 * Only the declaration is visible here, so the parameter roles below are
 * taken from the file header and the parameter names -- TODO confirm
 * against sampler.cc:
 *  - currentTypeTopicCounts: topic counts for the current word
 *  - loca_topic_counts / loca_topic_index / non_zero_topics: presumably the
 *    local topic counts of the document being processed
 *  - smoothingOnlyMass, topicBetaMass, cachedCoefficients: presumably the
 *    Abar, Bbar and Ccached quantities named in the file header
 *  - unif01: the uniform random number generator used to draw the sample
 */
topic_t
sample(
        const topicCounts* currentTypeTopicCounts,
        const topic_t old_topic,
        // NOTE(review): the template argument list of atomic appears to have
        // been stripped in extraction -- restore from upstream.
        const atomic* tokens_per_topic,
        const topic_t* loca_topic_counts,
        const topic_t* loca_topic_index,
        const int& non_zero_topics,
        const double& smoothingOnlyMass,
        const double& topicBetaMass,
        const double* cachedCoefficients,
        const double& betaSum,
        const double* alpha,
        double* topic_term_scores,
        const topic_t& numTopics,
        // NOTE(review): the template argument list of variate_generator was
        // stripped in extraction (only the closing '>' survived) -- restore
        // from upstream.
        variate_generator >* unif01);

} // namespace sampler
#endif /* SAMPLER_H_ */

--------------------------------------------------------------------------------
/src/commons/Client.h:
--------------------------------------------------------------------------------
/*******************************************************************************
    Copyright (c) 2011 Yahoo! Inc. All rights reserved.
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Client.h 20 | * 21 | * 22 | * 23 | * Created on: 07-Jan-2011 24 | * 25 | */ 26 | 27 | #ifndef CLIENT_H_ 28 | #define CLIENT_H_ 29 | 30 | #include 31 | 32 | using namespace std; 33 | 34 | /** 35 | * The interface to be implemented by any client for the 36 | * Distributed Map service 37 | */ 38 | class Client { 39 | public: 40 | //!Gets the serialized val stored for the key in 41 | //!the Distributed Map. Returns true if key exists 42 | //!and false otherwise 43 | virtual bool get(const string& key, string& val) = 0; 44 | 45 | //!Sets val as the serialized value for the key in 46 | //!the Distributed Map. This has replace semantics 47 | virtual void set(const string& key, const string& val) = 0; 48 | 49 | //!Adds delta to the serialized value for the key in 50 | //!the Distributed Map. This has accumulator semantics. 51 | //!The accumulator logic is provided by the Server_Helper. 52 | virtual void put(const string& key, const string& delta) = 0; 53 | 54 | //!Asynchronous put and get operation. It begins with 55 | //!a put. 
The call back in Synchronizer_Helper is called 56 | //!which creates the effect of a get 57 | virtual void begin_putNget(const string& key, const string& val) = 0; 58 | 59 | //!Remove the key from the Distributed Map returning 60 | //!the serialized value stored as val. Returns true if key exists 61 | //!and false otherwise 62 | virtual bool remove(const string& key, string& val) = 0; 63 | 64 | //!Provides barrier functionality 65 | virtual void wait_for_all() = 0; 66 | 67 | //!Provides functionality to wait for any asynchronous communication 68 | //!to end 69 | virtual void wait_till_done() = 0; 70 | }; 71 | 72 | #endif /* CLIENT_H_ */ 73 | -------------------------------------------------------------------------------- /src/commons/Context.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * Context.cpp 20 | * 21 | * 22 | * Created on: 23-Dec-2010 23 | * 24 | */ 25 | 26 | #include "Context.h" 27 | #include "gflags/gflags.h" 28 | #include 29 | #include 30 | #include "glog/logging.h" 31 | 32 | Context::Context() { 33 | std::vector flags; 34 | google::GetAllFlags(&flags); 35 | for (size_t i = 0; i < flags.size(); i++) { 36 | google::CommandLineFlagInfo& flag = flags[i]; 37 | _flags[flag.name] = flag.is_default ? flag.default_value 38 | : flag.current_value; 39 | } 40 | } 41 | 42 | Context::~Context() { 43 | } 44 | 45 | Context& Context::get_instance() { 46 | if (_instance) 47 | return *_instance; 48 | _instance = new Context(); 49 | return *_instance; 50 | } 51 | 52 | string Context::get_string(string key) { 53 | if (_flags.count(key)) 54 | return _flags[key]; 55 | string err = key + " not found. Invalid access"; 56 | LOG(WARNING) << err; 57 | throw err; 58 | } 59 | 60 | int Context::get_int(string key) { 61 | return atoi(get_string(key).c_str()); 62 | } 63 | 64 | double Context::get_double(string key) { 65 | return atof(get_string(key).c_str()); 66 | } 67 | 68 | bool Context::get_bool(string key) { 69 | if (get_string(key).compare("true") == 0) 70 | return true; 71 | else 72 | return false; 73 | } 74 | 75 | void Context::put_string(string key, string val) { 76 | _flags[key] = val; 77 | } 78 | 79 | Context* Context::_instance = NULL; 80 | -------------------------------------------------------------------------------- /src/commons/Context.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Context.h 20 | * 21 | * 22 | * Created on: 23-Dec-2010 23 | * 24 | */ 25 | 26 | #ifndef CONTEXT_H_ 27 | #define CONTEXT_H_ 28 | 29 | #include "boost/unordered_map.hpp" 30 | #include 31 | 32 | using namespace std; 33 | using namespace boost; 34 | 35 | //!An object that maintains the context for the executing code 36 | /** 37 | * This is a Singleton object that maintains a context 38 | * for the executing code. It contains a list of properties 39 | * stored as a key-value map 40 | * 41 | * All the flags defined are made available through this 42 | * object. It can also be used as a mechanism for message 43 | * passing. This reduces coupling between the code and 44 | * gflags. 
45 | */ 46 | class Context { 47 | public: 48 | static Context& get_instance(); 49 | 50 | //!Get an integer valued property named key 51 | int get_int(string key); 52 | //!Get a string valued property named key 53 | string get_string(string key); 54 | //!Get a double valued property named key 55 | double get_double(string key); 56 | //!Get a bool valued property named key 57 | bool get_bool(string key); 58 | 59 | //!Put a string value for property key 60 | //!into the map 61 | void put_string(string key, string val); 62 | 63 | private: 64 | Context(); 65 | virtual ~Context(); 66 | 67 | static Context* _instance; 68 | unordered_map _flags; 69 | }; 70 | 71 | #endif /* CONTEXT_H_ */ 72 | -------------------------------------------------------------------------------- /src/commons/DocumentReader.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * DocumentReader.cpp 20 | * 21 | * Created on: 7 May, 2009 22 | * 23 | */ 24 | 25 | #ifndef O_LARGEFILE 26 | #define O_LARGEFILE 0 27 | #endif /* O_LARGEFILE */ 28 | 29 | #include "DocumentReader.h" 30 | 31 | /** 32 | * Constructs a DocumentReader object to read msgs 33 | * from w_fname and optionally topics from t_fname 34 | */ 35 | DocumentReader::DocumentReader(string w_fname_) { 36 | w_fname = w_fname_; 37 | w_input.open(w_fname.c_str(), ios::in | ios::binary); 38 | LOG_IF(FATAL,w_fname!="" && !w_input.is_open())<<"Unable to open input file: " << w_fname; 39 | 40 | c_size = new char [sizeof(size_int)]; 41 | c_msg = new char [MAX_MSG_SIZE]; 42 | } 43 | 44 | DocumentReader::~DocumentReader() { 45 | w_input.close(); 46 | delete[] c_size; 47 | delete[] c_msg; 48 | } 49 | 50 | /** 51 | * The default method to read a message from w_fname which 52 | * is in (size of serialized msg,msg serialized as string)* 53 | * format 54 | */ 55 | int DocumentReader::read(google::protobuf::Message* msg) { 56 | try { 57 | read_base(w_input, w_fname, msg); 58 | return 0; 59 | } catch (int e) { 60 | return -1; 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/commons/DocumentReader.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * DocumentReader.h 20 | * 21 | * 22 | * Created on: 7 May, 2009 23 | * 24 | */ 25 | 26 | #ifndef DOCUMENTREADER_H_ 27 | #define DOCUMENTREADER_H_ 28 | 29 | #include 30 | #include "google/protobuf/message.h" 31 | #include "constants.h" 32 | 33 | using namespace std; 34 | using namespace LDA; 35 | 36 | /** 37 | * Wrapper around protobuf messages for convenient 38 | * reading of words, topics & (word,index) pairs 39 | * from word, topic, dictionary dump files respectively. 
40 | * Assumes that each msg is in a binary file in record* format 41 | * where record=(size of serialized msg,msg serialized as string) 42 | */ 43 | class DocumentReader { 44 | public: 45 | DocumentReader(string w_fname_); 46 | virtual ~DocumentReader(); 47 | 48 | int read(google::protobuf::Message* msg); 49 | 50 | private: 51 | 52 | string w_fname; //The input file to read various msgs from 53 | 54 | ifstream w_input; //The input stream to read from w_fname 55 | 56 | char* c_size; //Array to store size of serialized msg 57 | char* c_msg; //Array to store the serialized msg 58 | 59 | /** 60 | * The main function that reads records from input and stores them c_size & c_msg 61 | * fname is used for logging 62 | */ 63 | inline int read_sized_record_from(ifstream& input, string fname) 64 | throw (int) { 65 | if (input.read(c_size, sizeof(size_int)).eof()) { 66 | LOG_IF(FATAL,input.bad())<< "Unable to read from input file: " <=MAX_MSG_SIZE) << "Reading input from " << fname << "Message size " << size << " exceeds " << MAX_MSG_SIZE << ". Quitting..."; 71 | 72 | input.read(c_msg,size); 73 | return size; 74 | } 75 | 76 | /** 77 | * The base method to read into msg from inp_str 78 | * inp_fname is used for logging 79 | */ 80 | inline void read_base(ifstream& inp_str, string& inp_fname, google::protobuf::Message* msg) throw(int) { 81 | size_int size = read_sized_record_from(inp_str, inp_fname); 82 | string str_message(c_msg, size); 83 | msg->Clear(); 84 | msg->ParseFromString(str_message); 85 | } 86 | }; 87 | 88 | #endif /* DOCUMENTREADER_H_ */ 89 | -------------------------------------------------------------------------------- /src/commons/DocumentWriter.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * DocumentWriter.cpp 20 | * 21 | * Created on: 8 May, 2009 22 | * 23 | */ 24 | 25 | #ifndef O_LARGEFILE 26 | #define O_LARGEFILE 0 27 | #endif /* O_LARGEFILE */ 28 | 29 | #include "DocumentWriter.h" 30 | 31 | /** 32 | * Constructs a DocumentWriter object to write msgs 33 | * into w_fname and optionally topics to t_fname. 34 | * You can however choose to ignore the words file 35 | * if you want to write only topics by setting 36 | * w_fname_="" & using write_topics() method. 
37 | */ 38 | DocumentWriter::DocumentWriter(string w_fname_) { 39 | w_fname = w_fname_; 40 | w_output.open(w_fname.c_str(), ios::out | ios::trunc | ios::binary); 41 | LOG_IF(FATAL,w_fname!="" && !w_output.is_open())<<"Unable to open output file: " << w_fname; 42 | } 43 | 44 | DocumentWriter::~DocumentWriter() { 45 | w_output.close(); 46 | } 47 | 48 | /** 49 | * The default method to write a message to w_fname 50 | * in (size of serialized msg,msg serialized as string)* 51 | * format 52 | */ 53 | bool DocumentWriter::write(const google::protobuf::Message& msg) { 54 | return write_base(w_output, msg); 55 | } 56 | -------------------------------------------------------------------------------- /src/commons/DocumentWriter.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * DocumentWriter.h 20 | * 21 | * Created on: 8 May, 2009 22 | * 23 | */ 24 | 25 | #ifndef DOCUMENTWRITER_H_ 26 | #define DOCUMENTWRITER_H_ 27 | 28 | #include 29 | #include "google/protobuf/message.h" 30 | #include "constants.h" 31 | 32 | using namespace std; 33 | using namespace LDA; 34 | 35 | /** 36 | * Wrapper around protobuf msgs for convenient 37 | * writing of words, topics & (word,index) pairs 38 | * into word, topic, dictionary dump files respectively. 39 | * Each msg is written into a binary file in record* format 40 | * where record=(size of serialized msg,msg serialized as string) 41 | */ 42 | class DocumentWriter { 43 | public: 44 | DocumentWriter(string w_fname_); 45 | virtual ~DocumentWriter(); 46 | bool write(const google::protobuf::Message& msg); 47 | 48 | private: 49 | string w_fname; //The output file to write various msgs from 50 | 51 | ofstream w_output; //The output stream to write to w_fname 52 | 53 | string serialized; //The string to which msgs are serialized to 54 | 55 | /** 56 | * The main function that writes records to output 57 | * Assumes that the msg has been serialized by the 58 | * caller into 'seialized' 59 | */ 60 | inline bool write_sized_record_to(ofstream& output) { 61 | LOG_IF(FATAL,serialized.size()>size_t(MAX_MSG_SIZE))<<"Writing file: Message size " << serialized.size() << " exceeds " << MAX_MSG_SIZE << ". 
Quitting..."; 62 | 63 | size_int size = (size_int)serialized.size(); 64 | output.write((char*)&size,sizeof(size_int)); 65 | output.write(serialized.c_str(),size); 66 | output.flush(); 67 | return !output.bad(); 68 | } 69 | 70 | //The base method to write msg into out_str 71 | inline bool write_base(ofstream& out_str, const google::protobuf::Message& msg) { 72 | msg.SerializeToString(&serialized); 73 | return write_sized_record_to(out_str); 74 | } 75 | }; 76 | 77 | #endif /* DOCUMENTWRITER_H_ */ 78 | -------------------------------------------------------------------------------- /src/commons/Formatter/Data_Formatter.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * Data_Formatter.h 20 | * 21 | * 22 | * Created on: 11-Jan-2011 23 | * 24 | */ 25 | 26 | #ifndef DATA_FORMATTER_H_ 27 | #define DATA_FORMATTER_H_ 28 | 29 | #include "WordIndexDictionary.h" 30 | 31 | //!An interface for formatter objects 32 | /** A formatter is an object that 33 | * converts raw text corpus into 34 | * binary so that its disk footprint 35 | * is low and there is no parsing 36 | * involved while reading it back 37 | */ 38 | class Data_Formatter { 39 | public: 40 | //!Perform the actual formatting 41 | virtual void format() = 0; 42 | 43 | //!Return the dictionary being used by the formatter 44 | virtual WordIndexDictionary& get_dictionary() = 0; 45 | 46 | //!The number of documents formatted 47 | virtual int get_num_docs() = 0; 48 | 49 | //!The total number of words found 50 | virtual int get_total_num_words() = 0; 51 | }; 52 | 53 | #endif /* DATA_FORMATTER_H_ */ 54 | -------------------------------------------------------------------------------- /src/commons/Formatter/FormatData_flags_define.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 

    The Initial Developer of the Original Code is Shravan Narayanamurthy.
******************************************************************************/
/*
 * FormatData_flags_define.h
 *
 * Contains definitions of formatter related flags
 *
 *  Created on: 19 Jul, 2009
 *
 */

#ifndef FORMATDATA_FLAGS_DEFINE_H_
#define FORMATDATA_FLAGS_DEFINE_H_

#include "gflags/gflags.h"

// These are flag *definitions*, not declarations.
// NOTE(review): presumably this header is meant to be included by exactly
// one translation unit per program; including it twice in one binary would
// define each flag twice -- confirm against the build.

// Selects the model to format data for (1 = Unigram)
DEFINE_int32(model,1,"Unigram-1 or some other model");
// Input corpus; the default "specify" is a placeholder to be overridden
DEFINE_string(corpusprefix,"specify","Corpus file in libSVM format");
// Prefix prepended to the name of every file the program writes
DEFINE_string(outputprefix,"lda", "A prefix that will be used with all files output by the program");
// Existing dictionary dump to reuse; the default "specify" is a placeholder
DEFINE_string(dumpfile,"specify", "The dump of the dictionary to be used instead of creating afresh");

#endif /* FORMATDATA_FLAGS_DEFINE_H_ */

--------------------------------------------------------------------------------
/src/commons/LDAUtil.cc:
--------------------------------------------------------------------------------
/*******************************************************************************
    Copyright (c) 2011 Yahoo! Inc. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License. See accompanying LICENSE file.

    The Initial Developer of the Original Code is Shravan Narayanamurthy.
17 | ******************************************************************************/ 18 | /* 19 | * StringTokenizer.cpp 20 | * 21 | * Created on: 20-Oct-2010 22 | * 23 | */ 24 | 25 | #include "LDAUtil.h" 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | std::string LDAUtil::Itoa::_cache[100] = { "0", "1", "2", "3", "4", "5", "6", 32 | "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", 33 | "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", 34 | "31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", "42", 35 | "43", "44", "45", "46", "47", "48", "49", "50", "51", "52", "53", "54", 36 | "55", "56", "57", "58", "59", "60", "61", "62", "63", "64", "65", "66", 37 | "67", "68", "69", "70", "71", "72", "73", "74", "75", "76", "77", "78", 38 | "79", "80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90", 39 | "91", "92", "93", "94", "95", "96", "97", "98", "99" }; 40 | std::stringstream LDAUtil::Itoa::_ss; 41 | 42 | namespace LDAUtil { 43 | 44 | void StringTokenizer::tokenize(const string& s, char delim, 45 | vector& ret_vec) { 46 | ret_vec.clear(); 47 | size_t start_pos = 0; 48 | size_t sz = s.size(); 49 | while (start_pos < sz) { 50 | size_t pos = s.find_first_of(delim, start_pos); 51 | if (pos == string::npos) 52 | pos = sz; 53 | string sub_str = s.substr(start_pos, pos - start_pos); 54 | StringTrimmer::trim(sub_str); 55 | ret_vec.push_back(sub_str); 56 | start_pos = pos + 1; 57 | } 58 | } 59 | 60 | StringTokenizer::StringTokenizer() { 61 | } 62 | 63 | StringTokenizer::~StringTokenizer() { 64 | } 65 | 66 | void StringTrimmer::trim(string& s, char trim_char) { 67 | size_t start_pos = 0; 68 | while (true) { 69 | size_t pos = s.find_first_of(trim_char, start_pos); 70 | if (pos == string::npos) 71 | break; 72 | s.erase(pos, 1); 73 | start_pos = pos; 74 | } 75 | } 76 | 77 | StringTrimmer::StringTrimmer() { 78 | } 79 | 80 | StringTrimmer::~StringTrimmer() { 81 | } 82 | 83 | string 
DM_Server_Names::get_servant_name(const int server_id) { 84 | return "DM_Server_" + Itoa::get_string(server_id); 85 | } 86 | 87 | string DM_Server_Names::get_server_endpoint(const string& host_port) { 88 | vector host; 89 | StringTokenizer::tokenize(host_port, ':', host); 90 | return "default -h " + host[0] + " -p " + host[1]; 91 | } 92 | 93 | Itoa::Itoa() { 94 | } 95 | 96 | Itoa::~Itoa() { 97 | } 98 | 99 | string Itoa::get_string(const int i) { 100 | if (0 <= i && i <= 99) 101 | return _cache[i]; 102 | else { 103 | _ss.str(""); 104 | _ss << i; 105 | return _ss.str(); 106 | } 107 | } 108 | 109 | } 110 | /*using namespace std; 111 | 112 | int main(int argc, char* argv[]){ 113 | string test = "123,45,6789,"; 114 | vector tokens; 115 | LDAUtil::StringTokenizer::tokenize(test,',',tokens); 116 | ostream_iterator out_it(cout, " "); 117 | copy(tokens.begin(),tokens.end(),out_it); 118 | cout << endl; 119 | test = " 123 45 6789 "; 120 | LDAUtil::StringTrimmer::trim(test); 121 | cout << test; 122 | }*/ 123 | -------------------------------------------------------------------------------- /src/commons/LDAUtil.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 
15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * LDAUtil.h 20 | * 21 | * 22 | * 23 | * Created on: 20-Oct-2010 24 | * 25 | */ 26 | 27 | #ifndef LDAUTIL_H_ 28 | #define LDAUTIL_H_ 29 | 30 | #include 31 | #include 32 | #include 33 | 34 | namespace LDAUtil { 35 | using namespace std; 36 | 37 | /** 38 | * Utility class for tokenization and such other 39 | * commonly used functions 40 | */ 41 | class StringTokenizer { 42 | public: 43 | static void tokenize(const string& s, char delim, vector& ret_vec); 44 | 45 | private: 46 | StringTokenizer(); 47 | virtual ~StringTokenizer(); 48 | }; 49 | 50 | class StringTrimmer { 51 | public: 52 | static void trim(string& s, char trim_char = ' '); 53 | private: 54 | StringTrimmer(); 55 | virtual ~StringTrimmer(); 56 | }; 57 | 58 | class Itoa { 59 | public: 60 | static string get_string(const int i); 61 | private: 62 | static string _cache[100]; 63 | static stringstream _ss; 64 | 65 | Itoa(); 66 | virtual ~Itoa(); 67 | }; 68 | 69 | class DM_Server_Names { 70 | public: 71 | static string get_servant_name(const int server_id); 72 | static string get_server_endpoint(const string& host_port); 73 | 74 | private: 75 | DM_Server_Names(); 76 | virtual ~DM_Server_Names(); 77 | }; 78 | } 79 | 80 | #endif /* LDAUTIL_H_ */ 81 | -------------------------------------------------------------------------------- /src/commons/MVGaussian.cc: -------------------------------------------------------------------------------- 1 | #include "MVGaussian.h" 2 | #include "utils.h" 3 | #include "defs.h" 4 | #include "utils.h" 5 | 6 | MVGaussian::MVGaussian(uniform_random_t *uniform) 7 | { 8 | m_iSet = 0; 9 | this->uniform = uniform; 10 | } 11 | 12 | MVGaussian::~MVGaussian(void) 13 | { 14 | } 15 | 16 | void MVGaussian::nextMVGaussian(double *mean, double **precision, double *res, const int &n) 17 | { 18 | double 
**precisionLowerTriangular = alloc2D(n, n); 19 | choleskydec(precision, precisionLowerTriangular, n, false); 20 | 21 | nextMVGaussianWithCholesky(mean, precisionLowerTriangular, res, n); 22 | free2D(precisionLowerTriangular, n, n); 23 | } 24 | 25 | void MVGaussian::nextMVGaussian(double *mean, double **precision, double **res, const int &n, const int &m) 26 | { 27 | double **precisionLowerTriangular = alloc2D(n, n); 28 | choleskydec(precision, precisionLowerTriangular, n, false); 29 | 30 | for (int i=0; i(precisionLowerTriangular, n, n); 33 | } 34 | 35 | void MVGaussian::nextMVGaussianWithCholesky(double *mean, double **precisionLowerTriangular, double *res, const int &n) 36 | { 37 | // Initialize vector z to standard normals 38 | // [NB: using the same array for z and x] 39 | for (int i = 0; i < n; i++) { 40 | res[i] = nextGaussian(); 41 | } 42 | 43 | // Now solve trans(L) x = z using back substitution 44 | double innerProduct; 45 | 46 | for (int i = n-1; i >= 0; i--) { 47 | innerProduct = 0; 48 | for (int j = i+1; j < n; j++) { 49 | // the cholesky decomp got us the precisionLowerTriangular triangular 50 | // matrix, but we really want the transpose. 
51 | innerProduct += res[j] * precisionLowerTriangular[j][i]; 52 | } 53 | 54 | res[i] = (res[i] - innerProduct) / precisionLowerTriangular[i][i]; 55 | } 56 | 57 | for (int i = 0; i < n; i++) { 58 | res[i] += mean[i]; 59 | } 60 | } 61 | 62 | double MVGaussian::nextGaussian() 63 | { 64 | if ( m_iSet == 0 ) { 65 | double dRsq = 0; 66 | double v1, v2; 67 | do { 68 | v1 = 2.0 * (*uniform)() - 1.0; 69 | v2 = 2.0 * (*uniform)() - 1.0; 70 | dRsq = v1 * v1 + v2 * v2; 71 | } while (dRsq > 1.0 || dRsq < 1e-300); 72 | 73 | double dFac = sqrt(-2.0 * log(dRsq) / dRsq); 74 | m_dGset = v1 * dFac; 75 | m_iSet = 1; 76 | return v2 * dFac; 77 | } else { 78 | m_iSet = 0; 79 | return m_dGset; 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/commons/MVGaussian.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "random.h" 4 | 5 | class MVGaussian 6 | { 7 | public: 8 | MVGaussian(uniform_random_t *uniform); 9 | ~MVGaussian(void); 10 | 11 | void nextMVGaussian(double *mean, double **precision, double *res, const int &n); 12 | void nextMVGaussian(double *mean, double **precision, double **res, const int &n, const int &m); 13 | 14 | void nextMVGaussianWithCholesky(double *mean, double **precisionLowerTriangular, double *res, const int &n) ; 15 | 16 | private: 17 | double nextGaussian(); 18 | 19 | private: 20 | // for Gaussian random variable 21 | int m_iSet; 22 | double m_dGset; 23 | uniform_random_t *uniform; 24 | }; 25 | -------------------------------------------------------------------------------- /src/commons/MVGaussian2.cc: -------------------------------------------------------------------------------- 1 | #include "MVGaussian2.h" 2 | #include "utils.h" 3 | #include "defs.h" 4 | #include "utils.h" 5 | #include 6 | #include 7 | #include 8 | using namespace std; 9 | using namespace arma; 10 | 11 | #define tic timer.tic() 12 | #define toc cerr << timer.toc() << endl; 
13 | 14 | class MatGaussianFiller 15 | { 16 | public: 17 | MatGaussianFiller() 18 | { 19 | m_iSet = 0; 20 | create_generator(generator, uni_dist, uniform); 21 | 22 | int tmp = rand()&1023; 23 | for (int i=0; in_rows; 37 | double *mem = res->memptr(); 38 | 39 | for (int i=beginCol; i 1.0 || dRsq < 1e-300); 56 | 57 | double dFac = sqrt(-2.0 * log(dRsq) / dRsq); 58 | m_dGset = v1 * dFac; 59 | m_iSet = 1; 60 | return v2 * dFac; 61 | } else { 62 | m_iSet = 0; 63 | return m_dGset; 64 | } 65 | } 66 | 67 | int m_iSet; 68 | double m_dGset; 69 | 70 | base_generator_type *generator; 71 | uniform_real<> *uni_dist; 72 | uniform_random_t *uniform; 73 | }; 74 | 75 | 76 | MVGaussian2::MVGaussian2() 77 | { 78 | //For load balancing 79 | //TODO Ugly, use mutex to assign tasks dynamically 80 | nthreads = 240; 81 | //nthreads = 1; 82 | filler = new MatGaussianFiller[nthreads]; 83 | } 84 | 85 | MVGaussian2::~MVGaussian2(void) 86 | { 87 | delete[] filler; 88 | } 89 | 90 | struct FillTask 91 | { 92 | int begin; 93 | int end; 94 | arma::mat *y; 95 | MatGaussianFiller *filler; 96 | }; 97 | 98 | void* fillthread(void *args) 99 | { 100 | FillTask* task = (FillTask*)args; 101 | 102 | (*(task->filler))(task->y, task->begin, task->end); 103 | 104 | pthread_exit(NULL); 105 | } 106 | 107 | void MVGaussian2::parallelFill(arma::mat *y) 108 | { 109 | int ncols = y->n_cols; 110 | 111 | int chuckSize = (ncols-1) / nthreads + 1; 112 | 113 | int lastEnd = 0; 114 | 115 | pthread_t *threads = new pthread_t[nthreads]; 116 | FillTask *tasks = new FillTask[nthreads]; 117 | for (int k=0; k 5 | 6 | class MatGaussianFiller; 7 | 8 | class MVGaussian2 9 | { 10 | public: 11 | MVGaussian2(); 12 | ~MVGaussian2(void); 13 | 14 | // mu : dim x 1 15 | // precision : dim x dim 16 | // return : dim x n 17 | arma::mat nextMVGaussian(arma::mat &mu, 18 | arma::mat &precision, 19 | const int n); 20 | 21 | private: 22 | void parallelFill(arma::mat *y); 23 | 24 | int nthreads; 25 | MatGaussianFiller *filler; 26 | }; 27 | 
-------------------------------------------------------------------------------- /src/commons/Server/DistributedMap.ice: -------------------------------------------------------------------------------- 1 | module GlobalTable{ 2 | interface DistributedMap{ 3 | void put(string s, string delta); 4 | void set(string s, string counts); 5 | bool remove(string s, out string counts); 6 | ["amd"] void putNget(string s, string delta, out string counts); 7 | bool get(string s, out string counts); 8 | ["amd"] void waitForAllClients(); 9 | }; 10 | }; 11 | -------------------------------------------------------------------------------- /src/commons/Server/Server_Helper.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Server_Helper.h 20 | * 21 | * 22 | * 23 | * Created on: 07-Jan-2011 24 | * 25 | */ 26 | 27 | #ifndef SERVER_HELPER_H_ 28 | #define SERVER_HELPER_H_ 29 | 30 | #include 31 | 32 | using namespace std; 33 | 34 | 35 | 36 | //! 
Server Helper interface 37 | /** 38 | * Users of the Distributed Map need to provide 39 | * semantics for the put operation using this 40 | * helper class. 41 | * 42 | * The put operation calls helper.combine() 43 | * to perform the accumulate operation 44 | * 45 | * All server helpers must implement this interface 46 | */ 47 | class Server_Helper { 48 | public: 49 | virtual void combine(const string entity, const string& old, 50 | const string& delta, string& combined) = 0; 51 | }; 52 | 53 | #endif /* SERVER_HELPER_H_ */ 54 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Checkpointer.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Checkpointer.h 20 | * 21 | * 22 | * 23 | * Created on: 07-Mar-2011 24 | * 25 | */ 26 | 27 | #ifndef CHECKPOINTER_H_ 28 | #define CHECKPOINTER_H_ 29 | 30 | #include 31 | 32 | //!Used to implement failure recovery 33 | /** 34 | * The checkpointer interface. 
35 | * A checkpointer object helps create a checkpoint and 36 | * also to load from a checkpoint 37 | * 38 | * A checkpointer has two things: 39 | * 1. Metadata - (Ex.: The iteration at which this checkpoint was created) 40 | * 2. The method to serialize the necessary data structures to disk 41 | * 42 | */ 43 | class Checkpointer { 44 | public: 45 | //!Serialize the metadata 46 | virtual void save_metadata(std::string& state) = 0; 47 | //!Load the metadata 48 | virtual std::string load_metadata() = 0; 49 | //!Serialize other necessary data structures 50 | virtual void checkpoint() = 0; 51 | }; 52 | 53 | #endif /* CHECKPOINTER_H_ */ 54 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/DM_Client.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * DM_Client.cpp 20 | * 21 | * Created on: 19-Oct-2010 22 | * 23 | */ 24 | 25 | #include "DM_Client.h" 26 | #include "LDAUtil.h" 27 | #include 28 | #include "boost/unordered_map.hpp" 29 | 30 | DM_Client::DM_Client(int num_entities, const string& servers, 31 | Synchronizer_Helper& sync_helper) { 32 | _num_entities = num_entities; 33 | _max_msgs = min(_num_entities, MAX_MSGS); 34 | _cur_msg_id = 0; 35 | _png_cb = new PNGCallback(sync_helper, _max_msgs); 36 | vector tokens; 37 | LDAUtil::StringTokenizer::tokenize(servers, ',', tokens); 38 | for (size_t i = 0; i < tokens.size(); ++i) { 39 | add_server(LDAUtil::DM_Server_Names::get_servant_name(i), 40 | LDAUtil::DM_Server_Names::get_server_endpoint(tokens[i])); 41 | } 42 | _cb = Ice::newCallback(_png_cb, &PNGCallback::finished); 43 | } 44 | 45 | DM_Client::~DM_Client() { 46 | if (_ic) 47 | _ic->destroy(); 48 | } 49 | 50 | void DM_Client::add_server(const string& servant_name, 51 | const string& server_endpoint) { 52 | try { 53 | if (!_ic) { 54 | int argc = 2; 55 | char* argv[2] = { (char*) "learnTopics", 56 | (char*) "--Ice.ThreadPool.Client.SizeMax=2" }; 57 | _ic = Ice::initialize(argc, argv); 58 | } 59 | string proxy(servant_name + ":" + server_endpoint); 60 | Ice::ObjectPrx base = _ic->stringToProxy(proxy); 61 | DistributedMapPrx dist_map = DistributedMapPrx::checkedCast(base); 62 | if (!dist_map) 63 | throw "Invalid Proxy: " + proxy; 64 | _dist_maps.push_back(dist_map); 65 | NUM_SERVERS = _dist_maps.size(); 66 | } catch (const Ice::Exception& ex) { 67 | cerr << ex << endl; 68 | throw ex; 69 | } catch (const char* msg) { 70 | cerr << msg << endl; 71 | throw msg; 72 | } 73 | } 74 | 75 | int DM_Client::get_num_servers() { 76 | return NUM_SERVERS; 77 | } 78 | 79 | size_t DM_Client::get_server(const string& s) { 80 | return _hasher(s) % NUM_SERVERS; 81 | } 82 | 83 | void DM_Client::begin_putNget(const string& entity, const 
string& delta) { 84 | _png_cb->wait_till_done(_cur_msg_id); 85 | CookiePtr cookie = new Cookie; 86 | cookie->entity = entity; 87 | cookie->msg_id = _cur_msg_id; 88 | _png_cb->set_done(_cur_msg_id, false); 89 | _dist_maps[get_server(entity)]->begin_putNget(entity, delta, _cb, cookie); 90 | if (++_cur_msg_id % _max_msgs == 0) { 91 | wait_till_done(); 92 | } 93 | } 94 | 95 | void DM_Client::wait_till_done() { 96 | if (_cur_msg_id == 0) 97 | return; 98 | _png_cb->wait_till_done(_cur_msg_id - 1); 99 | _cur_msg_id = 0; 100 | } 101 | 102 | void DM_Client::put(const string& entity, const string& delta) { 103 | _dist_maps[get_server(entity)]->put(entity, delta); 104 | } 105 | 106 | void DM_Client::set(const string& entity, const string& counts) { 107 | _dist_maps[get_server(entity)]->set(entity, counts); 108 | } 109 | 110 | bool DM_Client::get(const string& entity, string& counts) { 111 | return _dist_maps[get_server(entity)]->get(entity, counts); 112 | } 113 | 114 | bool DM_Client::remove(const string& entity, string& counts) { 115 | return _dist_maps[get_server(entity)]->remove(entity, counts); 116 | } 117 | 118 | void DM_Client::wait_for_all() { 119 | _dist_maps[0]->waitForAllClients(); 120 | } 121 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Dirichlet.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Dirichlet.cpp 20 | * 21 | * Has code that computes the log_gamma & digamma functions 22 | * Parts of the code are related to the cc/mallet/types/Dirichlet.java 23 | * file in the Mallet code base 24 | * Created on: 13 May, 2009 25 | * 26 | */ 27 | 28 | #include 29 | #include "Dirichlet.h" 30 | 31 | static const double NEG_EULER_MASCHERONI = -0.5772156649015328606065121; 32 | static const double HALF_LOG_TWO_PI = log(2 * M_PI) / 2; 33 | 34 | static const double DIGAMMA_COEFF_1 = 1 / 12; 35 | static const double DIGAMMA_COEFF_2 = 1 / 120; 36 | static const double DIGAMMA_COEFF_3 = 1 / 252; 37 | static const double DIGAMMA_COEFF_4 = 1 / 240; 38 | static const double DIGAMMA_COEFF_5 = 1 / 132; 39 | static const double DIGAMMA_COEFF_6 = 691 / 32760; 40 | static const double DIGAMMA_COEFF_7 = 1 / 12; 41 | 42 | static const double DIGAMMA_LARGE = 9.5; 43 | static const double DIGAMMA_SMALL = .000001; 44 | 45 | double log_gamma(double z) { 46 | if (z < 0) { 47 | return 0; 48 | } 49 | 50 | int shift = (z<2) ? 
ceil(2-z) : 0; 51 | z += shift; 52 | 53 | double result = HALF_LOG_TWO_PI + ((z - 0.5) * log(z)) - z + (1 / (12 * z)) 54 | - (1 / (360 * z * z * z)) + (1 / (1260 * z * z * z * z * z)); 55 | 56 | while (shift-- > 0) { 57 | z--; 58 | result -= log(z); 59 | } 60 | 61 | return result; 62 | } 63 | 64 | double digamma(double z) { 65 | if (z < 0) { 66 | return 0; 67 | } 68 | double psi = 0; 69 | 70 | if (z < DIGAMMA_SMALL) { 71 | psi = NEG_EULER_MASCHERONI - (1 / z); 72 | return psi; 73 | } 74 | 75 | while (z < DIGAMMA_LARGE) { 76 | psi -= 1 / z; 77 | z++; 78 | } 79 | 80 | double inv_z = 1 / z; 81 | double inv_z_squared = inv_z * inv_z; 82 | 83 | psi += log(z) - .5 * inv_z - inv_z_squared * (DIGAMMA_COEFF_1 - inv_z_squared 84 | * (DIGAMMA_COEFF_2 - inv_z_squared * (DIGAMMA_COEFF_3 - inv_z_squared 85 | * (DIGAMMA_COEFF_4 - inv_z_squared * (DIGAMMA_COEFF_5 86 | - inv_z_squared * (DIGAMMA_COEFF_6 - inv_z_squared 87 | * DIGAMMA_COEFF_7)))))); 88 | 89 | return psi; 90 | } 91 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Dirichlet.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 
15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Dirichlet.h 20 | * The log_gamma digamma functions declarations 21 | * Created on: 13 May, 2009 22 | * 23 | */ 24 | 25 | #ifndef DIRICHLET_H_ 26 | #define DIRICHLET_H_ 27 | 28 | #include "constants.h" 29 | 30 | using namespace boost; 31 | 32 | double log_gamma(double z); 33 | double digamma(double z); 34 | 35 | #endif /* DIRICHLET_H_ */ 36 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Execution_Strategy.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * Execution_Strategy.h 20 | * 21 | * 22 | * 23 | * Created on: 23-Dec-2010 24 | * 25 | */ 26 | 27 | #ifndef EXECUTION_STRATEGY_H_ 28 | #define EXECUTION_STRATEGY_H_ 29 | 30 | //!Interface for strategy objects 31 | /** 32 | * Implement this interface to define 33 | * a strategy detailing what filters 34 | * are added to the pipeline and how 35 | * to run the pipeline 36 | */ 37 | class Execution_Strategy { 38 | public: 39 | virtual void execute() = 0; 40 | }; 41 | 42 | #endif /* EXECUTION_STRATEGY_H_ */ 43 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Filter_Accumulator.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | #include "Filter_Accumulator.h" 19 | 20 | Filter_Accumulator::Filter_Accumulator(Model_Refiner& refiner) : 21 | filter(serial), _refiner(refiner) { 22 | } 23 | 24 | Filter_Accumulator::~Filter_Accumulator() { 25 | 26 | } 27 | 28 | void* Filter_Accumulator::operator ()(void *token) { 29 | return _refiner.accumulateEta(token); 30 | } 31 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Filter_Accumulator.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Filter_Accumulator.cpp 20 | * 21 | * 22 | * 23 | * Created on: 24 Apr, 2013 24 | * 25 | */ 26 | 27 | #ifndef FILTER_ACCUMULATOR_H_ 28 | #define FILTER_ACCUMULATOR_H_ 29 | 30 | #include "tbb/pipeline.h" 31 | #include "Model_Refiner.h" 32 | 33 | using namespace std; 34 | using namespace tbb; 35 | 36 | //!A filter in the TBB pipeline. 
37 | /** 38 | * Delegates the task to be done to 39 | * refiner.accumulateEta() 40 | */ 41 | class Filter_Accumulator: public filter { 42 | private: 43 | Model_Refiner& _refiner; 44 | 45 | public: 46 | Filter_Accumulator(Model_Refiner&); 47 | virtual ~Filter_Accumulator(); 48 | 49 | void* operator ()(void *); 50 | }; 51 | 52 | #endif /* FILTER_ACCUMULATOR_H_ */ 53 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Filter_EtaSampler.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | #include "Filter_EtaSampler.h" 19 | 20 | Filter_EtaSampler::Filter_EtaSampler(Model_Refiner& refiner) : 21 | filter(parallel), _refiner(refiner) { 22 | } 23 | 24 | Filter_EtaSampler::~Filter_EtaSampler() { 25 | 26 | } 27 | 28 | void* Filter_EtaSampler::operator ()(void *token) { 29 | return _refiner.sampleEta(token); 30 | } 31 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Filter_EtaSampler.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Filter_EtaSampler.cpp 20 | * 21 | * 22 | * 23 | * Created on: 24 Apr, 2013 24 | * 25 | */ 26 | 27 | #ifndef FILTER_ETASAMPLER_H_ 28 | #define FILTER_ETASAMPLER_H_ 29 | 30 | #include "tbb/pipeline.h" 31 | #include "Model_Refiner.h" 32 | 33 | using namespace std; 34 | using namespace tbb; 35 | 36 | //!A filter in the TBB pipeline. 
37 | /** 38 | * Delegates the task to be done to 39 | * refiner.accumulateEta() 40 | */ 41 | class Filter_EtaSampler: public filter { 42 | private: 43 | Model_Refiner& _refiner; 44 | 45 | public: 46 | Filter_EtaSampler(Model_Refiner&); 47 | virtual ~Filter_EtaSampler(); 48 | 49 | void* operator ()(void *); 50 | }; 51 | 52 | #endif /* FILTER_ETASAMPLER_H_ */ 53 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Filter_Eval.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | #include "Filter_Eval.h" 19 | 20 | Filter_Eval::Filter_Eval(Model_Refiner& refiner) : 21 | filter(serial), _refiner(refiner) { 22 | _value = 0; 23 | } 24 | 25 | Filter_Eval::~Filter_Eval() { 26 | } 27 | 28 | void* Filter_Eval::operator ()(void *token) { 29 | double value = 0; 30 | void* ret_token = _refiner.eval(token, value); 31 | _value += value; 32 | return ret_token; 33 | } 34 | 35 | double Filter_Eval::get_eval() { 36 | return _value; 37 | } 38 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Filter_Eval.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Filter_Eval.cpp 20 | * 21 | * 22 | * 23 | * Created on: 24 Mar, 2009 24 | * 25 | */ 26 | 27 | #ifndef FILTER_EVAL_H_ 28 | #define FILTER_EVAL_H_ 29 | 30 | #include "tbb/pipeline.h" 31 | #include "Model_Refiner.h" 32 | 33 | using namespace tbb; 34 | using namespace std; 35 | 36 | //!A filter in the TBB pipeline. 
37 | /** 38 | * Delegates the task to be done to 39 | * refiner.eval. The output from the eval step is accumulated and 40 | * can be queried using get_eval() 41 | */ 42 | class Filter_Eval: public filter { 43 | private: 44 | Model_Refiner& _refiner; 45 | double _value; 46 | 47 | public: 48 | Filter_Eval(Model_Refiner&); 49 | virtual ~Filter_Eval(); 50 | 51 | void* operator()(void*); 52 | double get_eval(); 53 | }; 54 | 55 | #endif /* FILTER_EVAL_H_ */ 56 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Filter_Optimizer.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | #include "Filter_Optimizer.h" 19 | 20 | //Constructs an optimizer filter 21 | Filter_Optimizer::Filter_Optimizer(Model_Refiner& refiner) : 22 | filter(serial), _refiner(refiner) { 23 | } 24 | 25 | Filter_Optimizer::~Filter_Optimizer() { 26 | } 27 | 28 | /** 29 | * Each document is passed through the filter using this method 30 | * If we have read in tau documents, we update the global alphas 31 | * Else we compute gradients & accumulate them in the local alphas 32 | * 33 | */ 34 | void* Filter_Optimizer::operator ()(void *token) { 35 | return _refiner.optimize(token); 36 | } 37 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Filter_Optimizer.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * Filter_Optimizer.cpp 20 | * 21 | * 22 | * 23 | * Created on: 11 Jun, 2009 24 | * 25 | */ 26 | 27 | #ifndef FILTER_OPTIMIZER_H_ 28 | #define FILTER_OPTIMIZER_H_ 29 | 30 | #include "tbb/pipeline.h" 31 | #include "Model_Refiner.h" 32 | 33 | using namespace tbb; 34 | 35 | //!A filter in the TBB pipeline. 36 | /** 37 | * Delegates the task to be done to 38 | * refiner.optimize() 39 | */ 40 | class Filter_Optimizer: public filter { 41 | private: 42 | Model_Refiner& _refiner; 43 | 44 | public: 45 | 46 | Filter_Optimizer(Model_Refiner&); 47 | virtual ~Filter_Optimizer(); 48 | 49 | void* operator ()(void *); 50 | }; 51 | 52 | #endif /* FILTER_OPTIMIZER_H_ */ 53 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Filter_Reader.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | #include "Filter_Reader.h" 19 | #include "Context.h" 20 | 21 | Filter_Reader::Filter_Reader(Model_Refiner& refiner) : 22 | filter(serial), _refiner(refiner) { 23 | int livetokens = Context::get_instance().get_int("livetokens"); 24 | docs = _refiner.allocate_document_buffer(livetokens); 25 | next_doc = 0; 26 | } 27 | 28 | Filter_Reader::~Filter_Reader() { 29 | _refiner.deallocate_document_buffer(docs); 30 | } 31 | 32 | void* Filter_Reader::operator ()(void *) { 33 | int livetokens = Context::get_instance().get_int("livetokens"); 34 | google::protobuf::Message* doc = _refiner.get_nth_document(docs, next_doc); 35 | next_doc = (next_doc + 1) % livetokens; 36 | return _refiner.read(*doc); 37 | } 38 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Filter_Reader.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * Filter_Reader.cpp 20 | * 21 | * 22 | * 23 | * Created on: 24 Mar, 2009 24 | * 25 | */ 26 | 27 | #ifndef FILTER_READER_H_ 28 | #define FILTER_READER_H_ 29 | 30 | #include "tbb/pipeline.h" 31 | #include "Model_Refiner.h" 32 | #include "google/protobuf/message.h" 33 | 34 | using namespace tbb; 35 | 36 | //!A filter in the TBB pipeline. 37 | /** 38 | * Delegates the task to be done to 39 | * refiner.read() 40 | */ 41 | class Filter_Reader: public filter { 42 | private: 43 | size_t next_doc; 44 | google::protobuf::Message* docs; 45 | Model_Refiner& _refiner; 46 | 47 | public: 48 | Filter_Reader(Model_Refiner&); 49 | virtual ~Filter_Reader(); 50 | 51 | void* operator()(void*); 52 | }; 53 | 54 | #endif /* FILTER_READER_H_ */ 55 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Filter_Sampler.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | #include "Filter_Sampler.h" 19 | 20 | Filter_Sampler::Filter_Sampler(Model_Refiner& refiner) : 21 | filter(parallel), _refiner(refiner) { 22 | } 23 | 24 | Filter_Sampler::~Filter_Sampler() { 25 | 26 | } 27 | 28 | void* Filter_Sampler::operator ()(void *token) { 29 | return _refiner.sample(token); 30 | } 31 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Filter_Sampler.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Filter_Sampler.cpp 20 | * 21 | * 22 | * 23 | * Created on: 5 Apr, 2009 24 | * 25 | */ 26 | 27 | #ifndef FILTER_SAMPLER_H_ 28 | #define FILTER_SAMPLER_H_ 29 | 30 | #include "tbb/pipeline.h" 31 | #include "Model_Refiner.h" 32 | 33 | using namespace std; 34 | using namespace tbb; 35 | 36 | //!A filter in the TBB pipeline. 
37 | /** 38 | * Delegates the task to be done to 39 | * refiner.sample() 40 | */ 41 | class Filter_Sampler: public filter { 42 | private: 43 | Model_Refiner& _refiner; 44 | 45 | public: 46 | Filter_Sampler(Model_Refiner&); 47 | virtual ~Filter_Sampler(); 48 | 49 | void* operator ()(void *); 50 | }; 51 | 52 | #endif /* FILTER_SAMPLER_H_ */ 53 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Filter_Tester.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | #include "Filter_Tester.h" 19 | 20 | Filter_Tester::Filter_Tester(Model_Refiner& refiner) : 21 | filter(parallel), _refiner(refiner) { 22 | } 23 | 24 | Filter_Tester::~Filter_Tester() { 25 | } 26 | 27 | void* Filter_Tester::operator ()(void *token) { 28 | return _refiner.test(token); 29 | } 30 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Filter_Tester.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Filter_Tester.cpp 20 | * 21 | * 22 | * 23 | * Created on: 6 Apr, 2009 24 | * 25 | */ 26 | 27 | #ifndef FILTER_TESTER_H_ 28 | #define FILTER_TESTER_H_ 29 | 30 | #include "tbb/pipeline.h" 31 | #include "Model_Refiner.h" 32 | 33 | using namespace tbb; 34 | //!A filter in the TBB pipeline. 
35 | /** 36 | * Delegates the task to be done to 37 | * refiner.test() 38 | */ 39 | class Filter_Tester: public filter { 40 | private: 41 | Model_Refiner& _refiner; 42 | 43 | public: 44 | Filter_Tester(Model_Refiner&); 45 | virtual ~Filter_Tester(); 46 | 47 | void* operator ()(void *); 48 | }; 49 | 50 | #endif /* FILTER_TESTER_H_ */ 51 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Filter_Updater.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | #include "Filter_Updater.h" 19 | 20 | Filter_Updater::Filter_Updater(Model_Refiner& refiner) : 21 | filter(parallel), _refiner(refiner) { 22 | } 23 | 24 | Filter_Updater::~Filter_Updater() { 25 | } 26 | 27 | void* Filter_Updater::operator ()(void *token) { 28 | return _refiner.update(token); 29 | } 30 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Filter_Updater.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Filter_Updater.cpp 20 | * 21 | * A filter in the TBB pipeline. 
Delegates the task to be done to 22 | * refiner.update() 23 | * 24 | * Created on: 6 Apr, 2009 25 | * 26 | */ 27 | 28 | #ifndef FILTER_UPDATER_H_ 29 | #define FILTER_UPDATER_H_ 30 | 31 | #include "tbb/pipeline.h" 32 | #include "Model_Refiner.h" 33 | 34 | using namespace tbb; 35 | 36 | class Filter_Updater: public filter { 37 | private: 38 | Model_Refiner& _refiner; 39 | 40 | public: 41 | Filter_Updater(Model_Refiner&); 42 | virtual ~Filter_Updater(); 43 | 44 | void* operator ()(void *); 45 | }; 46 | 47 | #endif /* FILTER_UPDATER_H_ */ 48 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Filter_Writer.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | #include "Filter_Writer.h" 19 | 20 | Filter_Writer::Filter_Writer(Model_Refiner& refiner) : 21 | filter(serial), _refiner(refiner) { 22 | } 23 | 24 | Filter_Writer::~Filter_Writer() { 25 | } 26 | 27 | /** 28 | * We receive the document to write to disk. This contains both words & their 29 | * topic assignments. 
We use write_topics() to only write 30 | * the topics which internally clears the words. 31 | */ 32 | void* Filter_Writer::operator ()(void *token) { 33 | _refiner.write(token); 34 | return NULL; 35 | } 36 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Filter_Writer.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Filter_Writer.cpp 20 | * 21 | * 22 | * 23 | * Created on: 24 Mar, 2009 24 | * 25 | */ 26 | 27 | #ifndef FILTER_WRITER_H_ 28 | #define FILTER_WRITER_H_ 29 | 30 | #include "tbb/pipeline.h" 31 | #include "Model_Refiner.h" 32 | 33 | using namespace tbb; 34 | 35 | //!A filter in the TBB pipeline. 
36 | /** 37 | * Delegates the task to be done to 38 | * refiner.write() 39 | */ 40 | class Filter_Writer: public filter { 41 | private: 42 | Model_Refiner& _refiner; 43 | 44 | public: 45 | Filter_Writer(Model_Refiner&); 46 | virtual ~Filter_Writer(); 47 | 48 | void* operator()(void*); 49 | }; 50 | 51 | #endif /* FILTER_WRITER_H_ */ 52 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/GenericTopKList.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
/*******************************************************************************
    Copyright (c) 2011 Yahoo! Inc. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License. See accompanying LICENSE file.

    The Initial Developer of the Original Code is Shravan Narayanamurthy.
******************************************************************************/
/*
 * GenericTopKList.h
 *
 * Created on: 22-Jul-2010
 */

#ifndef GENERICTOPKLIST_H_
#define GENERICTOPKLIST_H_

#include <iostream>
#include <queue>
#include <stack>
#include <vector>

using namespace std;

//!A list that maintains top K elements
/**
 * Keeps at most K elements, namely the K "greatest" ones pushed so far
 * according to the GreaterThan functor.
 *
 * T            element type (must be copyable)
 * GreaterThan  default-constructible functor; gt(a, b) is true when a
 *              ranks above b. It is also used as the heap comparator, so
 *              top() is always the lowest-ranked element currently kept.
 */
template<class T, class GreaterThan>
class GenericTopKList {
private:
    priority_queue<T, vector<T>, GreaterThan> pq;
    size_t K;

public:
    //! K_ is the maximum number of elements retained (default 10).
    GenericTopKList(size_t K_ = 10) :
        K(K_) {
    }

    virtual ~GenericTopKList() {
    }

    //! Offers elem to the list; it is kept only if it belongs to the top K.
    void push(T elem) {
        // A capacity of zero can never hold anything. Without this guard
        // the else-branch below would call top() on an empty queue.
        if (K == 0)
            return;

        if (pq.size() < K) {
            pq.push(elem);
        } else {
            // List is full: replace the weakest kept element only if the
            // newcomer ranks strictly above it.
            GreaterThan gt;
            if (gt(elem, pq.top())) {
                pq.pop();
                pq.push(elem);
            }
        }
    }

    //! Lowest-ranked element currently kept. Precondition: !empty().
    T top() {
        return pq.top();
    }

    //! Removes the lowest-ranked element; no-op on an empty list.
    void pop() {
        if (!pq.empty())
            pq.pop();
    }

    bool empty() {
        return pq.empty();
    }

    //! Writes the elements best-first, each followed by ",".
    /**
     * The list is drained while printing: it is empty afterwards.
     */
    void print(ostream& out) {
        // The heap yields weakest-first; a stack reverses that order.
        stack<T> st;
        while (!pq.empty()) {
            st.push(pq.top());
            pq.pop();
        }
        while (!st.empty()) {
            out << st.top() << ",";
            st.pop();
        }
    }

    //! Removes all elements.
    void clear() {
        while (!pq.empty())
            pq.pop();
    }
};

#endif /* GENERICTOPKLIST_H_ */
/*******************************************************************************
    Copyright (c) 2011 Yahoo! Inc. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License. See accompanying LICENSE file.

    The Initial Developer of the Original Code is Shravan Narayanamurthy.
******************************************************************************/
/*
 * Main_flags_define.h
 *
 * Flags for learntopics
 *
 * Each DEFINE_* below declares one gflags command-line flag as
 * (name, default value, help text). Because this header contains
 * definitions, it is presumably meant to be included from exactly one
 * translation unit -- TODO confirm against the includer.
 *
 * NOTE(review): a few help strings read as if a placeholder was lost
 * (e.g. "after every iterations"); the literals are left untouched here
 * because they are user-visible runtime text.
 *
 * Created on: 19 Jul, 2009
 *
 */

#ifndef MAIN_FLAGS_DEFINE_H_
#define MAIN_FLAGS_DEFINE_H_

#include "gflags/gflags.h"
#include "constants.h"
#include "tbb/task_scheduler_init.h"

// Iteration schedule
DEFINE_int32(iter,1000,"Number of iterations the topic modeller should be run");
DEFINE_int32(burnin,299,"Number of iterations after which alpha optimization should be to be run after every iterations");
DEFINE_int32(optimizestats,25,"Optimize hyper parameters every these many iterations");
DEFINE_int32(printloglikelihood,25,"Print log likelihood after every iterations after burn-in");
DEFINE_int32(topics,100,"The number of topics to be used by LDA.");

//dataflow
DEFINE_string(inputprefix,"lda","The output prefix used for the FormatData routine");
DEFINE_string(dumpprefix,"","The word-topic counts are initialized from this file which is generated by the preprocessing step or at the end of an iteration");

//parameters
DEFINE_int32(subiter,10,"Number of sub-iteration");
DEFINE_bool(restart,false,"Indicates use of failure recovery mode. The iteration to start with should also be specified");
DEFINE_bool(online,false,"Uses online initialization instead of random");
DEFINE_bool(skipiniteta,false,"Don't random initialize eta and lambda, read from topic document. (debug)");
DEFINE_bool(skipinitz,false,"Don't random initialize z, read from topic document. (debug)");
DEFINE_bool(testml,false,"Test Multinomial Logistic (debug)");
DEFINE_int32(startiter,1,"This the iteration at which failure recovery should start");
DEFINE_bool(test,false,"Run the test pipeline. No updates are done & requires an earlier dump of the word-topic counts table");
DEFINE_bool(teststream,false,"Run the test pipeline in streaming mode. Formatting is a part of the pipeline. No updates are done & requires an earlier dump of the word-topic counts table & dictionary");
// ALPHA_SUM and BETA are not defined in this header; presumably they come
// from constants.h -- verify.
DEFINE_double(alpha,ALPHA_SUM,"Weight of the Dirichlet conjugate for topics");
DEFINE_double(rho, 1, "Parameter of Normal-Inverse-Wishart prior");
DEFINE_int32(prior, 30, "Kappa and Rho of NIW distribution");
DEFINE_double(beta,BETA,"Weight of the Dirichlet conjugate for words");
DEFINE_int32(chkptinterval,25,"The topic assignments are saved every these many iterations");
DEFINE_int32(lag, -1, "Model data saved every these many iterations.");
DEFINE_string(chkptdir,"","The directory to which the checkpoints need to written");
DEFINE_string(servers,"specify","The set of all memcached servers that are storing the state. E.g. 192.168.0.1, 192.168.0.3:44, 200.132.12.34");
DEFINE_int32(numdumps,1,"Number of word-topic count dumps in the training data");
DEFINE_int32(maxmemory,2048,"The max memory that can be used");
DEFINE_string(dictionary,"specify","The dump of the global dictionary produced in the training run. To be use for teststream");
DEFINE_int32(minibatchsize, 256, "Size of mini batch.");
//hidden
// Filter_Reader reads a "livetokens" value from the Context to size its
// document buffer; presumably that value originates from this flag --
// verify where the Context is populated.
DEFINE_int32(livetokens,500,"Max Live Tokens in pipeline");
DEFINE_int32(model,1,"Unigram-1");
// Earlier variant that defaulted to the TBB automatic thread count; kept
// for reference, superseded by the fixed default of 12 below.
//DEFINE_int32(samplerthreads,tbb::task_scheduler_init::automatic,"The number of foreground threads that run actual LDA pipeline. Default is to figure out automatically");
DEFINE_int32(samplerthreads,12,"The number of foreground threads that run actual LDA pipeline. Default is 12");
DEFINE_int32(pgsamples,1,"Number of Polya-Gamma samples, -1=full sampling");
DEFINE_string(samplemode, "gaussian", "Method for Polya-Gamma sampling, can be precise/pg1/truncated/gaussian");
#endif /* MAIN_FLAGS_DEFINE_H_ */
17 | ******************************************************************************/ 18 | /* 19 | * Model.h 20 | * 21 | * 22 | * Created on: 03-Jan-2011 23 | * 24 | */ 25 | 26 | #ifndef MODEL_H_ 27 | #define MODEL_H_ 28 | 29 | #include "WordIndexDictionary.h" 30 | 31 | //!A marker interface for the LDA graphical model 32 | //!and its extensions. 33 | /** 34 | * A model should be able to compute its contribution to 35 | * the log-likelihood, serialize to disk & also explain 36 | * itself by writing the word mixtures that represent the 37 | * latent topics to disk 38 | */ 39 | class Model { 40 | public: 41 | const static int UNIGRAM = 1; 42 | 43 | public: 44 | //!Model's contribution of log-likelihood 45 | virtual double get_eval() = 0; 46 | 47 | //!Serialize to disk 48 | virtual bool save() = 0; 49 | 50 | //!Explain: word mixtures for the latent topics 51 | virtual void write_statistics(WordIndexDictionary&) = 0; 52 | }; 53 | 54 | #endif /* MODEL_H_ */ 55 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Model_Builder.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * Model_Builder.h 20 | * 21 | * 22 | * 23 | * 24 | * 25 | * Created on: 11-May-2010 26 | * 27 | */ 28 | 29 | #ifndef MODELBUILDER_H_ 30 | #define MODELBUILDER_H_ 31 | 32 | #include "Model_Refiner.h" 33 | #include "Pipeline.h" 34 | #include "Execution_Strategy.h" 35 | #include "Model.h" 36 | 37 | //!The builder class which builds the different components 38 | //!needed to create the model. 39 | /** 40 | * We use the Builder pattern. 41 | * 42 | * The components are: 43 | * 1. Model_Refiner: This main component that describes how 44 | * the model should be refined as documents 45 | * pass through the pipeline 46 | * 2. Pipeline: A pipeline of filters that perform the various 47 | * refinements defined by the refiner 48 | * 3. Execution_Strategy: A strategy that defines how the pipeline 49 | * of filters has to be executed 50 | * 51 | * Usually, different modes demand different builders and similar 52 | * or different components. So keep in mind that inheritance can 53 | * be used here to maximize code reuse 54 | */ 55 | class Model_Builder { 56 | 57 | public: 58 | virtual Model_Refiner& create_model_refiner() = 0; 59 | virtual Pipeline& create_pipeline(Model_Refiner&) = 0; 60 | virtual Execution_Strategy& create_execution_strategy(Pipeline&) = 0; 61 | virtual void create_output() = 0; 62 | 63 | virtual Model& get_model() = 0; 64 | }; 65 | 66 | #endif /* MODELBUILDER_H_ */ 67 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Model_Director.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Model_Director.cpp 20 | * 21 | * Created on: 28-Dec-2010 22 | * 23 | */ 24 | 25 | #include "Model_Director.h" 26 | #include "Model_Builder.h" 27 | 28 | Model_Director::Model_Director() { 29 | } 30 | 31 | Model_Director::~Model_Director() { 32 | } 33 | 34 | Model& Model_Director::build_model(Model_Builder& builder) { 35 | Model_Refiner& refiner = builder.create_model_refiner(); 36 | Pipeline& pipeline = builder.create_pipeline(refiner); 37 | Execution_Strategy& strategy = builder.create_execution_strategy(pipeline); 38 | strategy.execute(); 39 | return builder.get_model(); 40 | } 41 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Model_Director.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Model_Director.h 20 | * 21 | * 22 | * 23 | * 24 | * Created on: 28-Dec-2010 25 | * 26 | */ 27 | 28 | #ifndef MODEL_DIRECTOR_H_ 29 | #define MODEL_DIRECTOR_H_ 30 | 31 | #include "Model_Builder.h" 32 | #include "Model.h" 33 | 34 | //!The Director class of the Builder pattern 35 | /** 36 | * Simple steps: Use the builder to 37 | * 1. Create Refiner 38 | * 2. Create a pipeline of filters to perform the 39 | * various refinements defined by the Refiner 40 | * 3. Create a strategy to execute the pipeline of filters 41 | * 4. Execute the strategy 42 | * 5. Return the model inside the builder that has been refined 43 | * by the strategy 44 | */ 45 | class Model_Director { 46 | public: 47 | Model_Director(); 48 | virtual ~Model_Director(); 49 | 50 | Model& build_model(Model_Builder& builder); 51 | }; 52 | 53 | #endif /* MODEL_DIRECTOR_H_ */ 54 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Model_Refiner.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Model_Refiner.h 20 | * 21 | * 22 | * 23 | * Created on: 23-Dec-2010 24 | * 25 | */ 26 | 27 | #ifndef MODEL_REFINER_H_ 28 | #define MODEL_REFINER_H_ 29 | 30 | #include "google/protobuf/message.h" 31 | 32 | //!An interface that defines the necessary refinements 33 | //!for refining the LDA graphical model and its extensions. 34 | /** 35 | * A refinement is defined as an operation done to improve 36 | * the model like sampling. The definition is also extended 37 | * to operations that enable a refinement like reading documents 38 | * and evaluating log-likelihoods. 
39 | * 40 | * The provider of a model's implementation has to implement 41 | * this interface suitably 42 | */ 43 | class Model_Refiner { 44 | public: 45 | virtual google::protobuf::Message* allocate_document_buffer(size_t) = 0; 46 | virtual void deallocate_document_buffer(google::protobuf::Message*) = 0; 47 | virtual google::protobuf::Message* get_nth_document( 48 | google::protobuf::Message* docs, size_t n) = 0; 49 | virtual void* read(google::protobuf::Message&) = 0; 50 | virtual void* sample(void*) = 0; 51 | virtual void* test(void*) = 0; 52 | virtual void* update(void*) = 0; 53 | virtual void* sampleEta(void*) = 0; 54 | virtual void* accumulateEta(void*) = 0; 55 | virtual void sampleGauss() = 0; 56 | virtual void* optimize(void*) = 0; 57 | virtual void* eval(void*, double&) = 0; 58 | virtual void write(void*) = 0; 59 | virtual void iteration_done() = 0; 60 | }; 61 | 62 | #endif /* MODEL_REFINER_H_ */ 63 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/PThread_Pipeline.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
******************************************************************************/
/*
 * PThread_Pipeline.h
 *
 *
 *  Created on: 14-May-2013
 *
 */

#ifndef PThread_PIPELINE_H_
#define PThread_PIPELINE_H_

#include "Pipeline.h"
#include "Unigram_Model/TopicLearner/Unigram_Model_Trainer.h"
#include "types.h"
// NOTE(review): the header name of the next #include was lost when this
// text was extracted (anything written between angle brackets is gone).
// Presumably <vector>, since std::vector is used below - confirm against
// the real file.
#include
#include "tbb/atomic.h"
#include "tbb/spin_mutex.h"

//!An implementation of the Pipeline interface using
//!PThread. It is only a dummy pipeline to use yahoo-lda's interface.
class PThread_Pipeline: public Pipeline {
public:
    PThread_Pipeline(Unigram_Model_Trainer&);
    virtual ~PThread_Pipeline();

    // Pipeline interface
    void init();
    void add_reader();
    void add_sampler();
    void add_updater();
    void add_etasampler();
    void add_accumulator();
    void add_gausssampler();
    void add_optimizer();
    void add_eval();
    void add_writer();
    void add_tester();
    void clear();
    void destroy();
    void run();
    void dump(int lag);

    Model_Refiner& get_refiner();
    double get_eval();

public:
    // NOTE(review): presumably the thread bodies used by run() - one
    // reader/writer and one sampler per thread_id. Confirm in the .cc file.
    void readNwrite(void);
    void sample(int thread_id);

protected:
    Unigram_Model_Trainer& _refiner;
    // NOTE(review): the template argument of std::vector was lost in
    // extraction; the element type cannot be recovered from this header.
    std::vector *_readSet, *_writeSet, *_workingSet;

    // NOTE(review): presumably set by the matching add_*() calls, the way
    // TBB_Pipeline::add_gausssampler() sets its flag - confirm in the .cc.
    bool _ifSampleTopic;
    bool _ifUpdate;
    bool _ifSampleEta;
    bool _ifAccumulate;
    bool _ifSampleGauss;
    bool _ifEval;
    int _num_threads;

    // count down
    // NOTE(review): the template arguments of both tbb::atomic members
    // below were lost in extraction.
    tbb::atomic _current_document;
    tbb::atomic _eval_words;
    double _eval_likelihood;
    tbb::spin_mutex _eval_likelihood_m;

    double time_sample;
    double time_gauss;
    double time_accumulate;
};

#endif /* PThread_PIPELINE_H_ */

--------------------------------------------------------------------------------
/src/commons/TopicLearner/Parameter.h:
-------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Parameter.h 20 | * 21 | * 22 | * 23 | * Created on: 14-May-2010 24 | * 25 | */ 26 | 27 | #ifndef PARAMETER_H_ 28 | #define PARAMETER_H_ 29 | 30 | #include 31 | #include 32 | 33 | using namespace std; 34 | 35 | /** 36 | * A class to represent the various parameters like 37 | * the dirichlet conjugate weights and such 38 | * 39 | * Can be vector valued parameters 40 | * 41 | * This class additionally stores the sum and provides 42 | * functionality to dump to and initialize from disk 43 | */ 44 | typedef struct Parameter { 45 | int length; 46 | double* values; 47 | double sum; 48 | 49 | Parameter(); 50 | Parameter(const Parameter ¶m); 51 | virtual ~Parameter(); 52 | 53 | Parameter& operator=(const Parameter ¶m); 54 | virtual void dump(string fname); 55 | virtual void initialize_from_dump(string fname); 56 | virtual void initialize_from_values(int length_, const double* values); 57 | virtual void initialize_from_values(int length_, const double* values_, 58 | float sum_); 59 | double 
norm() const; 60 | 61 | // If the parameter is an unrolled matrix, use this 62 | // to get a pointer to a matrix to manipulate it. 63 | // ==============CAUTION============== 64 | // It is user's duty to delete this handler, delete[] handler; 65 | virtual double** getMatrixHandler(int rows, int cols); 66 | friend ostream& operator << (ostream& out, const Parameter& param); 67 | } param; 68 | 69 | Parameter operator + (const Parameter& a, const Parameter& b); 70 | Parameter operator - (const Parameter& a, const Parameter& b); 71 | Parameter operator * (const Parameter& a, double factor); 72 | bool operator == (const Parameter &a, const Parameter &b); 73 | bool operator != (const Parameter &a, const Parameter &b); 74 | 75 | 76 | #endif /* PARAMETER_H_ */ 77 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Pipeline.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * Pipeline_Builder.h 20 | * 21 | * Created on: 03-Jan-2011 22 | * 23 | */ 24 | 25 | #ifndef PIPELINE_BUILDER_H_ 26 | #define PIPELINE_BUILDER_H_ 27 | 28 | #include "TopicLearner/Model_Refiner.h" 29 | 30 | //!An interface that all pipeline objects must implement 31 | /** 32 | * A pipeline is a sequence of computation steps 33 | * that can be performed on some data passing through 34 | * it. Its similar to an assembly pipeline. 35 | */ 36 | class Pipeline { 37 | public: 38 | virtual void init() = 0; 39 | virtual void add_reader() = 0; 40 | virtual void add_sampler() = 0; 41 | virtual void add_updater() = 0; 42 | virtual void add_etasampler() = 0; 43 | virtual void add_accumulator() = 0; 44 | virtual void add_gausssampler() = 0; 45 | virtual void add_optimizer() = 0; 46 | virtual void add_eval() = 0; 47 | virtual void add_writer() = 0; 48 | virtual void add_tester() = 0; 49 | virtual void clear() = 0; 50 | virtual void destroy() = 0; 51 | virtual void dump(int lag) = 0; 52 | virtual void run() = 0; 53 | virtual Model_Refiner& get_refiner() = 0; 54 | virtual double get_eval() = 0; 55 | }; 56 | 57 | #endif /* PIPELINE_BUILDER_H_ */ 58 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Synchronized_Training_Execution_Strategy.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Synchronized_Training_Execution_Strategy.cpp 20 | * 21 | * Created on: 07-Jan-2011 22 | * 23 | */ 24 | 25 | #include "Synchronized_Training_Execution_Strategy.h" 26 | #include "Synchronizer.h" 27 | #include 28 | #include 29 | 30 | Synchronized_Training_Execution_Strategy::Synchronized_Training_Execution_Strategy( 31 | Pipeline& pipeline, Model& model, Checkpointer& checkpointer, 32 | Synchronizer_Helper& sync_helper) : 33 | Training_Execution_Strategy(pipeline, model, checkpointer), 34 | _sync_helper(sync_helper) { 35 | } 36 | 37 | Synchronized_Training_Execution_Strategy::~Synchronized_Training_Execution_Strategy() { 38 | } 39 | 40 | void* synchronize(void* inp) { 41 | Synchronizer* synchronizer = (Synchronizer*) (inp); 42 | synchronizer->synchronize(); 43 | return NULL; 44 | } 45 | 46 | //Start a synchronizer thread that performs background 47 | //synchronization. 
Run the usual execution strategy 48 | //Once that is done, indicate the synchronizer to stop 49 | void Synchronized_Training_Execution_Strategy::execute() { 50 | Synchronizer synchronizer(_sync_helper); 51 | pthread_t synch_thread; 52 | pthread_create(&synch_thread, NULL, ::synchronize, &synchronizer); 53 | Training_Execution_Strategy::execute(); 54 | LOG(WARNING) << "All Iters Done"; 55 | synchronizer.set_all_iters_done(); 56 | void *status; 57 | pthread_join(synch_thread, &status); 58 | LOG(WARNING) << "All threads joined"; 59 | } 60 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Synchronized_Training_Execution_Strategy.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * Synchronized_Training_Execution_Strategy.h 20 | * 21 | * Created on: 07-Jan-2011 22 | * 23 | */ 24 | 25 | #ifndef SYNCHRONIZED_TRAINING_EXECUTION_STRATEGY_H_ 26 | #define SYNCHRONIZED_TRAINING_EXECUTION_STRATEGY_H_ 27 | 28 | #include "Training_Execution_Strategy.h" 29 | #include "Synchronizer_Helper.h" 30 | 31 | //!Default implementation of the Execution_Strategy interface 32 | /** 33 | * Runs the Training_Execution_Strategy at its 34 | * core and also provides for synchronization of the model 35 | */ 36 | class Synchronized_Training_Execution_Strategy: public Training_Execution_Strategy { 37 | public: 38 | Synchronized_Training_Execution_Strategy(Pipeline&, Model&, Checkpointer&, 39 | Synchronizer_Helper&); 40 | virtual ~Synchronized_Training_Execution_Strategy(); 41 | 42 | void execute(); 43 | 44 | private: 45 | Synchronizer_Helper& _sync_helper; 46 | }; 47 | 48 | #endif /* SYNCHRONIZED_TRAINING_EXECUTION_STRATEGY_H_ */ 49 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Synchronizer.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 
15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Synchronizer.cpp 20 | * 21 | * Created on: 07-Jan-2011 22 | * 23 | */ 24 | 25 | #include "Synchronizer.h" 26 | #include "tbb/tick_count.h" 27 | #include "types.h" 28 | #include "glog/logging.h" 29 | 30 | Synchronizer::Synchronizer(Synchronizer_Helper& sync_helper) : 31 | _sync_helper(sync_helper) { 32 | _all_iters_done = false; 33 | _sync_helper.initialize(); 34 | } 35 | 36 | Synchronizer::~Synchronizer() { } 37 | 38 | void Synchronizer::set_all_iters_done() { 39 | _all_iters_done = true; 40 | } 41 | 42 | bool Synchronizer::is_all_iters_done() { 43 | return _all_iters_done; 44 | } 45 | 46 | void Synchronizer::synchronize() { 47 | LOG(WARNING) << "Starting synchronization"; 48 | int sync_time_cnt = 0; 49 | double sync_time = 0.0; 50 | do { 51 | using namespace tbb; 52 | TIME(t1); 53 | while (_sync_helper.has_to_synchronize()) { 54 | _sync_helper.synchronize(); 55 | } 56 | TIME(t2); 57 | sync_time += (t2 - t1).seconds(); 58 | ++sync_time_cnt; 59 | 60 | LOG(WARNING) << "Synch pass " << sync_time_cnt << " done. Took " 61 | << (t2 - t1).seconds() << " seconds"; 62 | 63 | _sync_helper.reset_to_synchronize(); 64 | } while (!is_all_iters_done()); 65 | LOG(WARNING) 66 | << "Stopping synchronization since all iterations are done"; 67 | } 68 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Synchronizer.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Synchronizer.h 20 | * 21 | * 22 | * Created on: 07-Jan-2011 23 | * 24 | */ 25 | 26 | #ifndef SYNCHRONIZER_H_ 27 | #define SYNCHRONIZER_H_ 28 | 29 | #include "Synchronizer_Helper.h" 30 | 31 | //!The synchronization strategy that 32 | //!uses the Synchronizer_Helper to perform the 33 | //!concrete synchronization steps. 34 | /** 35 | * It provides slots, so to say, for synchronization 36 | * and calls the Synchronizer_Helper to perform its 37 | * duties for that slot. 38 | * 39 | * This greatly reduces the burden on the model writer 40 | * 41 | * So whenever a model needs to be scaled by providing 42 | * a distributed inferencing solution, all the model writer 43 | * has to do is write a Synchronizer_Helper. 
44 | * 45 | * This is a default synchronization strategy provided and 46 | * users can implement their own strategy by extending this 47 | * 48 | * The main aim is to run the inferencing locally while 49 | * keeping the model in sync globally through the use of 50 | * this Synchronizer and a Distributed Map 51 | */ 52 | class Synchronizer { 53 | public: 54 | Synchronizer(Synchronizer_Helper&); 55 | virtual ~Synchronizer(); 56 | 57 | //!The synchronization strategy 58 | void synchronize(); 59 | 60 | //!Set that all iterations of the Execution_Strategy 61 | //!have been done 62 | void set_all_iters_done(); 63 | 64 | //!Check if the Execution_Strategy has finished 65 | bool is_all_iters_done(); 66 | 67 | private: 68 | Synchronizer_Helper& _sync_helper; 69 | bool _all_iters_done; 70 | 71 | }; 72 | 73 | #endif /* SYNCHRONIZER_H_ */ 74 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Synchronizer_Helper.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * Synchronizer_Helper.h 20 | * 21 | * 22 | * Created on: 07-Jan-2011 23 | * 24 | */ 25 | 26 | #ifndef SYNCHRONIZER_HELPER_H_ 27 | #define SYNCHRONIZER_HELPER_H_ 28 | 29 | #include 30 | 31 | //!A helper class for the synchronizer 32 | /** 33 | * Each helper object implements synchronization algorithms 34 | * for maintaining the data structures in sync 35 | * 36 | * The Synchronizer class depends on this interface to enable 37 | * synchronization of a multi-machine LDA set-up 38 | */ 39 | class Synchronizer_Helper { 40 | public: 41 | virtual void initialize() = 0; 42 | virtual void synchronize() = 0; 43 | 44 | //!Returns true as long as all items to be 45 | //!synchronized are not synchronized 46 | virtual bool has_to_synchronize() = 0; 47 | 48 | //!After this call, has_to_synchronize() 49 | //!should return true 50 | virtual void reset_to_synchronize() = 0; 51 | 52 | //!This is a callback from the Client 53 | //!when an async_putNget is used on the Client 54 | //!So when a Client is instantiated, you need 55 | //!to pass a reference of (*this) 56 | virtual void 57 | end_putNget(const std::string& word, const std::string& counts) = 0; 58 | }; 59 | 60 | #endif /* SYNCHRONIZER_HELPER_H_ */ 61 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/TBB_Pipeline.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * TBB_Pipeline.cpp 20 | * 21 | * Created on: 04-Jan-2011 22 | * 23 | */ 24 | 25 | #include "TBB_Pipeline.h" 26 | #include "Filter_Reader.h" 27 | #include "Filter_Sampler.h" 28 | #include "Filter_Updater.h" 29 | #include "Filter_Optimizer.h" 30 | #include "Filter_Eval.h" 31 | #include "Filter_Writer.h" 32 | #include "Filter_Tester.h" 33 | #include "Filter_Accumulator.h" 34 | #include "Filter_EtaSampler.h" 35 | #include "Context.h" 36 | #include "glog/logging.h" 37 | 38 | TBB_Pipeline::TBB_Pipeline(Model_Refiner& refiner) : 39 | _refiner(refiner) { 40 | Context& context = Context::get_instance(); 41 | int num_threads = context.get_int("samplerthreads"); 42 | _init.initialize(num_threads); 43 | } 44 | 45 | TBB_Pipeline::~TBB_Pipeline() { } 46 | 47 | void TBB_Pipeline::init() { 48 | _pipeline = new tbb::pipeline; 49 | _reader = new Filter_Reader(_refiner); 50 | _sampler = new Filter_Sampler(_refiner); 51 | _updater = new Filter_Updater(_refiner); 52 | _etasampler = new Filter_EtaSampler(_refiner); 53 | _accumulator = new Filter_Accumulator(_refiner); 54 | _optimizer = new Filter_Optimizer(_refiner); 55 | _eval = new Filter_Eval(_refiner); 56 | _writer = new Filter_Writer(_refiner); 57 | _tester = new Filter_Tester(_refiner); 58 | 59 | _ifSampleGauss = false; 60 | } 61 | 62 | void TBB_Pipeline::destroy() { 63 | delete _pipeline; 64 | delete 
_reader; 65 | delete _sampler; 66 | delete _updater; 67 | delete _etasampler; 68 | delete _accumulator; 69 | delete _optimizer; 70 | delete _eval; 71 | delete _writer; 72 | delete _tester; 73 | } 74 | 75 | void TBB_Pipeline::clear() { 76 | _refiner.iteration_done(); 77 | _pipeline->clear(); 78 | } 79 | 80 | void TBB_Pipeline::add_reader() { 81 | _pipeline->add_filter(*_reader); 82 | } 83 | 84 | void TBB_Pipeline::add_sampler() { 85 | _pipeline->add_filter(*_sampler); 86 | } 87 | 88 | void TBB_Pipeline::add_updater() { 89 | _pipeline->add_filter(*_updater); 90 | } 91 | 92 | void TBB_Pipeline::add_optimizer() { 93 | _pipeline->add_filter(*_optimizer); 94 | } 95 | 96 | void TBB_Pipeline::add_etasampler() 97 | { 98 | _pipeline->add_filter(*_etasampler); 99 | } 100 | 101 | void TBB_Pipeline::add_accumulator() { 102 | _pipeline->add_filter(*_accumulator); 103 | } 104 | 105 | void TBB_Pipeline::add_gausssampler() 106 | { 107 | _ifSampleGauss = true; 108 | } 109 | 110 | void TBB_Pipeline::add_eval() { 111 | _pipeline->add_filter(*_eval); 112 | } 113 | 114 | void TBB_Pipeline::add_writer() { 115 | _pipeline->add_filter(*_writer); 116 | } 117 | 118 | void TBB_Pipeline::add_tester() { 119 | _pipeline->add_filter(*_tester); 120 | } 121 | 122 | Model_Refiner& TBB_Pipeline::get_refiner() { 123 | return _refiner; 124 | } 125 | 126 | double TBB_Pipeline::get_eval() { 127 | Filter_Eval* eval = dynamic_cast (_eval); 128 | return eval->get_eval(); 129 | } 130 | 131 | void TBB_Pipeline::run() { 132 | _pipeline->run(Context::get_instance().get_int("livetokens")); 133 | 134 | if (_ifSampleGauss) 135 | { 136 | _refiner.sampleGauss(); 137 | } 138 | } 139 | 140 | void TBB_Pipeline::dump(int lag) 141 | { 142 | LOG(FATAL) << "Unsupported method TBB_Pipeline::dump"; 143 | } 144 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/TBB_Pipeline.h: -------------------------------------------------------------------------------- 1 | 
/******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * TBB_Pipeline.h 20 | * 21 | * 22 | * Created on: 04-Jan-2011 23 | * 24 | */ 25 | 26 | #ifndef TBB_PIPELINE_H_ 27 | #define TBB_PIPELINE_H_ 28 | 29 | #include "Pipeline.h" 30 | #include "tbb/task_scheduler_init.h" 31 | #include "tbb/pipeline.h" 32 | #include "Model_Refiner.h" 33 | 34 | using namespace tbb; 35 | 36 | //!An implementation of the Pipeline interface using 37 | //!Intel's Threading Building Blocks. 38 | /** 39 | * TBB::filter is the basic unit of computation and TBB::Pipeline 40 | * puts the filters together. 
41 | * 42 | * What filters exist in a pipeline and how the pipeline 43 | * is executed is implemented via an Execution Strategy 44 | * 45 | * Calling each of the add methods adds that particular 46 | * filter to the pipeline 47 | */ 48 | class TBB_Pipeline: public Pipeline { 49 | public: 50 | TBB_Pipeline(Model_Refiner&); 51 | virtual ~TBB_Pipeline(); 52 | void init(); 53 | void add_reader(); 54 | void add_sampler(); 55 | void add_updater(); 56 | void add_etasampler(); 57 | void add_accumulator(); 58 | void add_gausssampler(); 59 | void add_optimizer(); 60 | void add_eval(); 61 | void add_writer(); 62 | void add_tester(); 63 | void clear(); 64 | void destroy(); 65 | void run(); 66 | void dump(int lag); 67 | 68 | Model_Refiner& get_refiner(); 69 | double get_eval(); 70 | 71 | protected: 72 | task_scheduler_init _init; 73 | tbb::pipeline* _pipeline; 74 | Model_Refiner& _refiner; 75 | 76 | filter *_reader, *_sampler, *_updater, *_optimizer, *_eval, *_writer, 77 | *_tester, *_accumulator, *_etasampler; 78 | 79 | bool _ifSampleGauss; 80 | }; 81 | 82 | #endif /* TBB_PIPELINE_H_ */ 83 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Testing_Execution_Strategy.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * Simple_Execution_Strategy.cpp 20 | * 21 | * Created on: 28-Dec-2010 22 | * 23 | */ 24 | 25 | #include "Testing_Execution_Strategy.h" 26 | #include "Context.h" 27 | #include "Pipeline.h" 28 | #include "tbb/tick_count.h" 29 | #include "glog/logging.h" 30 | 31 | Testing_Execution_Strategy::Testing_Execution_Strategy(Pipeline& pipeline, 32 | Model& model) : 33 | _pipeline(pipeline), _model(model) { 34 | } 35 | 36 | Testing_Execution_Strategy::~Testing_Execution_Strategy() { 37 | } 38 | 39 | void Testing_Execution_Strategy::execute() { 40 | Context& context = Context::get_instance(); 41 | int start_iter = 0; 42 | int end_iter = 1; 43 | int loglikelihood_interval = context.get_int("printloglikelihood"); 44 | 45 | LOG(WARNING) << "Starting Parallel testing Pipeline"; 46 | for (int iter = start_iter; iter < end_iter; ++iter) { 47 | bool compute_loglikelihood = (iter == start_iter) || (iter 48 | % loglikelihood_interval == 0); 49 | _pipeline.init(); 50 | _pipeline.add_reader(); 51 | _pipeline.add_tester(); 52 | // if (compute_loglikelihood) 53 | // _pipeline.add_eval(); 54 | _pipeline.add_writer(); 55 | tbb::tick_count t0 = tbb::tick_count::now(); 56 | _pipeline.run(); 57 | tbb::tick_count t1 = tbb::tick_count::now(); 58 | LOG(WARNING) << "Iteration " << iter << " done. 
Took " 59 | << (t1 - t0).seconds() / 60 << " mins" << endl; 60 | if (compute_loglikelihood) { 61 | double word_loglikelihood = _model.get_eval(); 62 | double doc_loglikelihood = _pipeline.get_eval(); 63 | LOG(WARNING) 64 | << ">>>>>>>>>> Log-Likelihood (model, doc, total): " 65 | << word_loglikelihood << " , " << doc_loglikelihood 66 | << " , " << word_loglikelihood + doc_loglikelihood; 67 | } 68 | _pipeline.clear(); 69 | _pipeline.destroy(); 70 | } 71 | LOG(WARNING) << "Parallel testing Pipeline done"; 72 | } 73 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Testing_Execution_Strategy.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * Testing_Execution_Strategy.h 20 | * 21 | * 22 | * Created on: 28-Dec-2010 23 | * 24 | */ 25 | 26 | #ifndef TESTING_EXECUTION_STRATEGY_H_ 27 | #define TESTING_EXECUTION_STRATEGY_H_ 28 | 29 | #include "Execution_Strategy.h" 30 | #include "Pipeline.h" 31 | #include "Model.h" 32 | 33 | /** 34 | * This is a default implementation for the Execution_Strategy 35 | * interface for testing using a model. execute() adds the reader, tester and writer filters to the pipeline and runs it once. 36 | */ 37 | class Testing_Execution_Strategy: public Execution_Strategy { 38 | public: 39 | Testing_Execution_Strategy(Pipeline&, Model&); 40 | virtual ~Testing_Execution_Strategy(); 41 | 42 | //!Defines which filters need to be added and when 43 | //!Runs the assembled pipeline once 44 | void execute(); 45 | 46 | private: 47 | Pipeline& _pipeline; 48 | Model& _model; 49 | }; 50 | 51 | #endif /* TESTING_EXECUTION_STRATEGY_H_ */ 52 | -------------------------------------------------------------------------------- /src/commons/TopicLearner/Training_Execution_Strategy.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy.
17 | ******************************************************************************/ 18 | /* 19 | * Training_Execution_Strategy.h 20 | * 21 | * 22 | * 23 | * Created on: 28-Dec-2010 24 | * 25 | */ 26 | 27 | #ifndef TRAINING_EXECUTION_STRATEGY_H_ 28 | #define TRAINING_EXECUTION_STRATEGY_H_ 29 | 30 | #include "Execution_Strategy.h" 31 | #include "Pipeline.h" 32 | #include "Model.h" 33 | #include "Checkpointer.h" 34 | 35 | /** 36 | * This is a default implementation for the Execution_Strategy 37 | * interface for training a model. 38 | * 39 | * A passed-in Checkpointer can be used for failure recovery. 40 | * 41 | * A pipeline needs to be passed. 42 | */ 43 | class Training_Execution_Strategy: public Execution_Strategy { 44 | public: 45 | Training_Execution_Strategy(Pipeline&, Model&, Checkpointer&); 46 | virtual ~Training_Execution_Strategy(); 47 | 48 | //!Defines which filters need to be added and when 49 | //!Runs the pipeline for 'iter' iterations 50 | void execute(); 51 | 52 | private: 53 | Pipeline& _pipeline; 54 | Model& _model; 55 | Checkpointer& _checkpointer; 56 | }; 57 | 58 | #endif /* TRAINING_EXECUTION_STRATEGY_H_ */ 59 | -------------------------------------------------------------------------------- /src/commons/WordIndexDictionary.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 17 | ******************************************************************************/ 18 | /* 19 | * WordIndexDictionary.h 20 | * 21 | * 22 | * 23 | * Created on: 7 May, 2009 24 | * 25 | */ 26 | 27 | #ifndef WORDINDEXDICTIONARY_H_ 28 | #define WORDINDEXDICTIONARY_H_ 29 | 30 | #include "boost/unordered_map.hpp" 31 | #include "types.h" 32 | #include "DocumentReader.h" 33 | #include 34 | 35 | using namespace std; 36 | using namespace boost; 37 | //!A two way dictionary of words to indices 38 | /** 39 | * Provides a two way dictionary mapping 40 | * words as strings to a unique int index 41 | * and vice versa. The hashtable implementation 42 | * of boost/unordered_map is used. 43 | * 44 | */ 45 | class WordIndexDictionary { 46 | public: 47 | WordIndexDictionary(); 48 | virtual ~WordIndexDictionary(); 49 | int get_index(string word); 50 | string get_word(int index); 51 | int insert_word(string word); 52 | int get_num_words() const; 53 | void print(); 54 | bool match_word_index(); 55 | void dump(string fname); 56 | void initialize_from_dict(WordIndexDictionary* dict, bool sort = false); 57 | void initialize_from_dump(string fname, int num_words = INT_MAX, bool sort = 58 | false); 59 | void initialize_from_dumps(string prefix, int dumps); 60 | size_t size(); 61 | int get_prev_index(int new_id); 62 | int get_freq(int index); 63 | 64 | vector frequencies; 65 | 66 | private: 67 | typedef unordered_map wimap; 68 | unordered_map word_ind_map; 69 | unordered_map ind_word_map; 70 | 71 | wimap::iterator wi_end; 72 | int current_index; 73 | int insert_word(string word, int index_); 74 | int verify_header(DocumentReader & doc_rdr); 75 | string get_suffix(int n); 76 | void sort_on_freq(); 77 | string suffices[100]; 78 | }; 79 | 80 | #endif /* WORDINDEXDICTIONARY_H_ */ 
81 | -------------------------------------------------------------------------------- /src/commons/cholesky.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | Copyright (c) 1992-2007 The University of Tennessee. All rights reserved. 3 | 4 | Contributors: 5 | * Sergey Bochkanov (ALGLIB project). Translation from FORTRAN to 6 | pseudocode. 7 | 8 | See subroutines comments for additional copyrights. 9 | 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions are 12 | met: 13 | 14 | - Redistributions of source code must retain the above copyright 15 | notice, this list of conditions and the following disclaimer. 16 | 17 | - Redistributions in binary form must reproduce the above copyright 18 | notice, this list of conditions and the following disclaimer listed 19 | in this license in the documentation and/or other materials 20 | provided with the distribution. 21 | 22 | - Neither the name of the copyright holders nor the names of its 23 | contributors may be used to endorse or promote products derived from 24 | this software without specific prior written permission. 25 | 26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 27 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 28 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 29 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 30 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 31 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 32 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 33 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 34 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 35 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 36 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | *************************************************************************/ 38 | 39 | #ifndef _cholesky_h 40 | #define _cholesky_h 41 | 42 | #include "ap.h" 43 | 44 | /************************************************************************* 45 | Cholesky decomposition 46 | 47 | The algorithm computes Cholesky decomposition of a symmetric 48 | positive-definite matrix. 49 | 50 | The result of an algorithm is a representation of matrix A as A = U'*U or 51 | A = L*L'. 52 | 53 | Input parameters: 54 | A - upper or lower triangle of a factorized matrix. 55 | array with elements [0..N-1, 0..N-1]. 56 | N - size of matrix A. 57 | IsUpper - if IsUpper=True, then A contains an upper triangle of 58 | a symmetric matrix, otherwise A contains a lower one. 59 | 60 | Output parameters: 61 | A - the result of factorization. If IsUpper=True, then 62 | the upper triangle contains matrix U, so that A = U'*U, 63 | and the elements below the main diagonal are not modified. 64 | Similarly, if IsUpper = False. 65 | 66 | Result: 67 | If the matrix is positive-definite, the function returns True. 68 | Otherwise, the function returns False. This means that the 69 | factorization could not be carried out. 70 | 71 | -- LAPACK routine (version 3.0) -- 72 | Univ. of Tennessee, Univ. 
of California Berkeley, NAG Ltd., 73 | Courant Institute, Argonne National Lab, and Rice University 74 | February 29, 1992 75 | *************************************************************************/ 76 | bool spdmatrixcholesky(ap::real_2d_array& a, int n, bool isupper); 77 | 78 | 79 | /************************************************************************* 80 | Obsolete 1-based subroutine. 81 | *************************************************************************/ 82 | bool choleskydecomposition(ap::real_2d_array& a, int n, bool isupper); 83 | 84 | 85 | #endif 86 | -------------------------------------------------------------------------------- /src/commons/comparator.cc: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy. 
17 | ******************************************************************************/ 18 | /* 19 | * comparator.cc 20 | * 21 | * Provides various comparator implementations; each returns true when its first argument ranks above the second, i.e. descending order when used as an STL comparator 22 | * 23 | * Created on: 16 Apr, 2009 24 | * 25 | */ 26 | #include "types.h" 27 | /* True when i is greater than j (compares the packed values directly). */ 28 | bool cnt_cmp(packed_t i, packed_t j) { 29 | return (i > j); 30 | } 31 | /* True when i's choose.cnt field is greater than j's. */ 32 | bool cnt_cmp_ttc(cnt_topic_t i, cnt_topic_t j) { 33 | return (i.choose.cnt > j.choose.cnt); 34 | } 35 | /* True when v1.second is greater than v2.second. */ 36 | bool prob_cmp(tppair v1, tppair v2) { 37 | return (v1.second > v2.second); 38 | } 39 | /* True when v1.second is greater than v2.second. */ 40 | bool freq_cmp(id2freq_t v1, id2freq_t v2) { 41 | return (v1.second > v2.second); 42 | } 43 | -------------------------------------------------------------------------------- /src/commons/comparator.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2011 Yahoo! Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. See accompanying LICENSE file. 15 | 16 | The Initial Developer of the Original Code is Shravan Narayanamurthy.
17 | ******************************************************************************/ 18 | /* 19 | * comparator.h 20 | * 21 | * Contains various comparator functions used as comparators in STL algorithms 22 | * All of them implement a descending order on the elements 23 | * 24 | * Created on: 15 Oct, 2009 25 | * 26 | */ 27 | 28 | #ifndef COMPARATOR_H_ 29 | #define COMPARATOR_H_ 30 | /* NOTE: this header includes nothing itself; packed_t, cnt_topic_t, tppair, id2freq_t, wppair and bigppair must already be declared by the includer (comparator.cc takes the first four from types.h). */ 31 | bool cnt_cmp(packed_t i, packed_t j); 32 | 33 | bool cnt_cmp_ttc(cnt_topic_t i, cnt_topic_t j); 34 | 35 | bool prob_cmp(tppair v1, tppair v2); 36 | 37 | bool freq_cmp(id2freq_t v1, id2freq_t v2); 38 | /* Function object: true when i.second is greater than j.second. Takes const references and is const-callable so it also works with const elements and const comparator objects. */ 39 | struct wppair_gt { 40 | bool operator()(const wppair& i, const wppair& j) const { 41 | return i.second > j.second; 42 | } 43 | }; 44 | /* Same ordering as wppair_gt, for bigppair elements. */ 45 | struct bigppair_gt { 46 | bool operator()(const bigppair& i, const bigppair& j) const { 47 | return i.second > j.second; 48 | } 49 | }; 50 | 51 | #endif /* COMPARATOR_H_ */ 52 | -------------------------------------------------------------------------------- /src/commons/defs.h: -------------------------------------------------------------------------------- 1 | #ifndef _DEFS_H 2 | #define _DEFS_H 3 | 4 | #define NUMERICAL_ZERO 1e-10 5 | #define PI 3.141592653589793238462643 6 | 7 | #endif 8 | -------------------------------------------------------------------------------- /src/commons/matrixIO.h: -------------------------------------------------------------------------------- 1 | #ifndef __MATRIX_IO 2 | #define __MATRIX_IO 3 | #include 4 | #include 5 | #include 6 | #include 7 | using std::ifstream; 8 | using std::ofstream; 9 | using std::endl; 10 | using std::cout; 11 | using std::setw; 12 | using std::setiosflags; 13 | 14 | template 15 | void save(const char* file, T *v, int n, double factor = 1) 16 | { 17 | ofstream fout(file); 18 | for (int i=0; i 24 | void save(const char* file, T **m, int r, int c, double factor = 1) 25 | { 26 | ofstream fout(file); 27 | for (int i=0; i 36 | void load(T* res, const char* file, int n) 37 | { 38 | ifstream fin(file); 39 | for (int i=0; i>
res[i]; 41 | } 42 | 43 | template 44 | void load(T** res, const char* file, int n, int m) 45 | { 46 | ifstream fin(file); 47 | for (int i=0; i> res[i][j]; 50 | } 51 | 52 | template 53 | void print(T *a, int n) 54 | { 55 | for (int i=0; i 61 | void print(T **a, int n, int m) 62 | { 63 | for (int i=0; i 6 | #include 7 | #include 8 | using namespace boost; 9 | 10 | class MVGaussian; 11 | class MVGaussian2; 12 | class PolyaGamma; 13 | 14 | class Random 15 | { 16 | public: 17 | Random(uniform_random_t *uniform); 18 | ~Random(); 19 | 20 | void randvector(double *a, const int n, double factor); 21 | void randpdmatrix(double **a, const int n, const int dof, double factor); 22 | 23 | double gammarnd(double shape, double scale); 24 | void dirichletrnd(double *a, const int N, double *theta); 25 | double nextPG(int n, double z); 26 | double rnorm(); 27 | // double rand(); 28 | // unsigned long long longRand(); 29 | 30 | int multRnd(double *theta, int D); 31 | void nextMVGaussian(double *mean, double **precision, double *res, const int &n); 32 | void nextMVGaussian(double *mean, double **precision, double **res, const int &n, const int &m); 33 | arma::mat nextMVGaussian(arma::mat &mu, 34 | arma::mat &precision, 35 | const int n); 36 | 37 | void rinvertwishart (double **wishart, double **LAMDA, int n, int m); 38 | void rinvertwishart (double **wishart, double **LAMDA, int n, int m, int rank, int size); 39 | arma::mat rinvertwishart(arma::mat &LAMBDA, int m, int rank, int size); 40 | 41 | // Generate normal-inverse-wishart 42 | // E[cov] = wishart / (kappa-N-1) 43 | void NIWrnd(double *mu, double **cov, 44 | double rho, int kappa, double *mu_0, double **wishart, 45 | int N); 46 | void NIWrnd(double *mu, double **cov, 47 | double rho, int kappa, double *mu_0, double **wishart, 48 | int N, int rank, int size); 49 | 50 | // mu, cov are empty matrix of dim dimensions 51 | void NIWrnd(arma::mat &mu, arma::mat &cov, 52 | double rho, int kappa, 53 | arma::mat &mu_0, arma::mat &wishart, 
54 | int rank, int size); 55 | 56 | private: 57 | void mpi_iw_xtx(double **temp2, double **LAMDA, int dim, int m, int rank, int size); 58 | arma::mat mpi_iw_xtx(arma::mat &LAMBDA, int m, int rank, int size); 59 | MVGaussian2 *fast_gaussian; 60 | MVGaussian *gaussian; 61 | PolyaGamma *pg; 62 | uniform_random_t *uniform; 63 | }; 64 | 65 | #endif 66 | -------------------------------------------------------------------------------- /src/commons/testUtil/compare.cc: -------------------------------------------------------------------------------- 1 | #include "compare.h" 2 | #include 3 | #include "../TopicLearner/Parameter.h" 4 | #include "glog/logging.h" 5 | #include 6 | #include "matrixIO.h" 7 | #include "utils.h" 8 | 9 | double meanTest(int n, const Parameter *samples, const Parameter& expectedMean) 10 | { 11 | if (n>0) 12 | LOG_IF(FATAL, samples[0].length!=expectedMean.length) 13 | << "vector dimensions must agree"; 14 | 15 | // norm(samples-expectedMean) / norm(samples+expectedMean) 16 | Parameter sampleMean; 17 | sampleMean.initialize_from_values(expectedMean.length, NULL, 0); 18 | for (int i=0; i bestCost) 47 | return; 48 | if (i==K) 49 | { 50 | bestCost = currentCost; 51 | memcpy(bestPerm, perm, sizeof(int)*K); 52 | return; 53 | } 54 | for (int j=0; j(K, K); 70 | for (int i=0; i(cost, K, K); 82 | delete[] temp_perm; 83 | } 84 | -------------------------------------------------------------------------------- /src/commons/testUtil/compare.h: -------------------------------------------------------------------------------- 1 | #ifndef __COMPARE 2 | #define __COMPARE 3 | 4 | class Parameter; 5 | 6 | double norm(const Parameter& parameter); 7 | 8 | double meanTest(int n, const Parameter *samples, const Parameter& expectedMean); 9 | 10 | double averageDifferenceMatrixRows(double **expected, 11 | double **realvalue, 12 | int nrows, int ncols); 13 | 14 | void bestPermutation(double **Phi, double **inf_Phi, int *perm, int K, int W); 15 | 16 | #endif 17 | 
-------------------------------------------------------------------------------- /src/commons/testUtil/remote.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "remote.h" 5 | #include "DocumentReader.h" 6 | #include "DocumentWriter.h" 7 | #include "document.pb.h" 8 | #include 9 | using namespace std; 10 | /* Reads every document from the file named by prefix, asserts that the count divides evenly by nnodes, and opens one writer per node on "PREFIX.i" (i = 1..nnodes); presumably each writer receives n/nnodes documents. */ 11 | void spreadDocument(const char *prefix, int nnodes) 12 | { 13 | DocumentReader reader(prefix); 14 | 15 | unigram_document doc; 16 | vector docs; 17 | 18 | while (reader.read(&doc) != -1) 19 | { 20 | docs.push_back(doc); 21 | } 22 | 23 | int n = docs.size(); 24 | assert( n % nnodes == 0 ); 25 | int p = n/nnodes; 26 | 27 | int cnt = 0; 28 | for (int i=1; i<=nnodes; ++i) 29 | { 30 | char path[256]; 31 | snprintf(path, sizeof(path), "%s.%d", prefix, i); /* bounded: a long prefix can no longer overrun path */ 32 | DocumentWriter writer(path); 33 | 34 | for (; cnt collectDocuments(const char *prefix, int nnodes) 47 | { /* For each node i = 1..nnodes: copies the remote file prefix from host junclusterI to the local file "PREFIX.i" with scp, then appends every document read from that local file to the returned vector. The running document count is printed after each node. NOTE: prefix is interpolated unquoted into a shell command; pass only trusted, shell-safe names. */ 48 | vector docs; 49 | 50 | for (int i=1; i<=nnodes; ++i) 51 | { 52 | ostringstream path_out; 53 | path_out << prefix << "." << i; const string path = path_out.str(); 54 | ostringstream cmd_out; /* built as a string so no fixed-size buffer can overflow or truncate the command */ 55 | cmd_out << "scp juncluster" << i << ":/home/jianfei/git/yahoo-lda/" << prefix << " " << path; 56 | const string cmd = cmd_out.str(); puts(cmd.c_str()); 57 | system(cmd.c_str()); 58 | 59 | DocumentReader reader(path.c_str()); 60 | unigram_document doc; 61 | 62 | while (reader.read(&doc) != -1) 63 | { 64 | docs.push_back(doc); 65 | } 66 | 67 | printf("%lu\n", (unsigned long) docs.size()); /* size() is not an int; cast to match the format */ 68 | } 69 | 70 | return docs; 71 | } 72 | /* Copies the local file path to the same relative path on every host juncluster1..junclusterN with scp, echoing each command first. NOTE: path is interpolated unquoted into a shell command; pass only trusted, shell-safe names. */ 73 | void broadcastFile(const char *path, int nnodes) 74 | { 75 | for (int i=1; i<=nnodes; ++i) 76 | { 77 | ostringstream cmd_out; 78 | cmd_out << "scp " << path << " juncluster" << i << ":/home/jianfei/git/yahoo-lda/" << path; 79 | const string cmd = cmd_out.str(); puts(cmd.c_str()); 80 | system(cmd.c_str()); 81 | } 82 | } 83 | /* Removes path from every host juncluster1..junclusterN over ssh, echoing each command first. The command is built as a string because a truncated rm target would delete the wrong file. NOTE: path is interpolated unquoted into a shell command; pass only trusted, shell-safe names. */ 84 | void removeAll(const char *path, int nnodes) 85 | { 86 | for (int i=1; i<=nnodes; ++i) 87 | { 88 | ostringstream cmd_out; 89 | cmd_out << "ssh juncluster" << i << " rm /home/jianfei/git/yahoo-lda/" << path; 90 | const string cmd = cmd_out.str(); puts(cmd.c_str()); 91 | system(cmd.c_str()); 92 | } 93 | } 94 | 95 | void
startParameterServer(int nnodes) 96 | { 97 | ostringstream sout; 98 | sout << "nohup mpiexec -f hostfile"; 99 | 100 | for (int i=0; i 5 | #include "document.pb.h" 6 | using namespace std; 7 | 8 | void spreadDocument(const char *prefix, int nnodes); 9 | vector collectDocuments(const char *prefix, int nnodes); 10 | void broadcastFile(const char *path, int nnodes); 11 | void removeAll(const char *path, int nnodes); 12 | 13 | void startParameterServer(int nnodes); 14 | void endParameterServer(int nnodes); 15 | 16 | void mergeDict(int nnodes); 17 | void mergeTTC(int ntopics, const char *serverlist); 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /src/commons/testUtil/testgenerator.cc: -------------------------------------------------------------------------------- 1 | #include "testgenerator.h" 2 | #include "random.h" 3 | #include "utils.h" 4 | 5 | void generateNIW(int dim, double *mu_0, double **wishart, double &rho, int &kappa) 6 | { 7 | int prior = 50; 8 | rho = prior; 9 | kappa = prior; 10 | 11 | clear(mu_0, dim); 12 | clear2D(wishart, dim, dim); 13 | for (int i=0; iNIWrnd(mu, cov, rho, kappa, mu_0, wishart, dim); 20 | } 21 | 22 | void generateGaussSample(int dim, double *eta, double *mu, double **cov, Random *random) 23 | { 24 | double **prec = alloc2D(dim, dim); 25 | inverse(cov, prec, dim); 26 | 27 | random->nextMVGaussian(mu, prec, eta, dim); 28 | 29 | free2D(prec, dim, dim); 30 | } 31 | 32 | 33 | -------------------------------------------------------------------------------- /src/commons/testUtil/testgenerator.h: -------------------------------------------------------------------------------- 1 | #ifndef __TESTGEN 2 | #define __TESTGEN 3 | 4 | class Random; 5 | 6 | void generateNIW(int dim, double *mu_0, double **wishart, double &rho, int &kappa); 7 | 8 | void generateGauss(int dim, double *mu, double **cov, double *mu_0, double **wishart, double rho, int kappa, Random *random); 9 | 10 | void 
generateGaussSample(int dim, double *eta, double *mu, double **cov, Random *random); 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /src/document.proto: -------------------------------------------------------------------------------- 1 | package LDA; 2 | 3 | option optimize_for = SPEED; 4 | 5 | message unigram_document { 6 | optional string docID = 1; 7 | optional string url = 2; 8 | repeated uint32 body = 3 [packed=true]; 9 | repeated uint32 topic_assignment = 4 [packed=true]; 10 | repeated uint32 topic_counts = 5 [packed=true]; 11 | repeated uint64 cnt_topics = 6 [packed=true]; 12 | repeated float topic_distribution = 7 [packed=true]; 13 | repeated float eta = 8 [packed=true]; 14 | repeated float lambda = 9 [packed=true]; 15 | } 16 | 17 | message unigram_counts{ 18 | repeated uint64 counts = 1 [packed=true]; 19 | repeated uint32 tokens_per_topic = 2 [packed=true]; 20 | } 21 | 22 | message word_ind_pair { 23 | repeated string word = 1; 24 | repeated uint32 index = 2 [packed=true]; 25 | repeated uint32 frequency = 3 [packed=true]; 26 | } 27 | 28 | message header { 29 | required double version = 1; 30 | optional uint32 num_words = 2; 31 | optional uint32 num_topics = 3; 32 | } 33 | 34 | message parameters { 35 | optional double alphasum = 1; 36 | repeated double alphas = 2 [packed=true]; 37 | } 38 | --------------------------------------------------------------------------------