├── .gitignore ├── Scrapper ├── article.py ├── ldac2uci.py ├── removeInfrequentWords.py ├── getarticle.py ├── __init__.py ├── text2ldac.py └── multitext2ldac.py ├── Evaluation ├── getFinalPerplexities.py ├── WordTrends.py ├── getWordVariation.py └── lookupWords.py ├── makefile ├── README.md ├── TopicChains ├── GenerateChains.cpp ├── GetData.cpp └── TopicChains.cpp ├── SCVB0_Evaluation └── scvb.cpp └── SCVB0 └── scvb0.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled source # 2 | ################### 3 | *.o 4 | *.pyc 5 | #Data 6 | *.txt 7 | /Data 8 | #eclipse files 9 | .settings/ 10 | *~ 11 | .cproject 12 | /Debug 13 | -------------------------------------------------------------------------------- /Scrapper/article.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Mar 29, 2014 3 | 4 | @author: vspathak 5 | ''' 6 | 7 | class article(object): 8 | def __init__(self, title, date, text, url, ID): 9 | self.Title = title 10 | self.Date = date 11 | self.Text = text 12 | self.URL = url 13 | self.id = ID 14 | 15 | -------------------------------------------------------------------------------- /Scrapper/ldac2uci.py: -------------------------------------------------------------------------------- 1 | infile = open('jan.dat') 2 | outfile = open('JanUCI.txt', 'w') 3 | 4 | doclines = infile.readlines() 5 | infile.close() 6 | 7 | for i in range(len(doclines)): 8 | line = doclines[i].strip().split(' ')[1:] 9 | for elt in line: 10 | pieces = elt.split(':') 11 | outfile.write(str(i + 1) + ' ' + str(int(pieces[0]) + 1) + ' ' + pieces[1] + '\n') 12 | 13 | outfile.close() 14 | -------------------------------------------------------------------------------- /Evaluation/getFinalPerplexities.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | if len(sys.argv) >= 2: 5 | perpfilename = sys.argv[1] 6 | num_iter = int(sys.argv[2]) 7 | 8 | perpfile = open(perpfilename) 9 | 10 | finalperps = [perp for perp in perpfile.readlines() if perp[1] != ',' and int(perp[:2]) == num_iter] 11 | 12 | # for perp in finalperps: 13 | # print perp.split(',')[1] 14 | 15 | #print len(finalperps) 16 | for i in range(len(finalperps)): 17 | if i % 3 == 0: 18 | print i/3, finalperps[i].split(',')[1] 19 | -------------------------------------------------------------------------------- /Evaluation/WordTrends.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | topTenFile = open('/home/vspathak/git/DynamicLDA/TopTen.txt', 'r') 4 | for input in topTenFile: 5 | words = input.split(',') 6 | topSet = set(words) 7 | 8 | wordEvolution = open('/home/vspathak/git/DynamicLDA/WordEvolution.txt', 'w') 9 | 10 | files = os.listdir('/home/vspathak/git/DynamicLDA/Scrapper/Pi') 11 | 12 | for fileName in files: 13 | for topic in open('output/' + fileName, 'r'): 14 | topic = topic.strip() 15 | wordProb = topic.split(',') 16 | wordEvolution.write(wordProb[word]) 17 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | all: SCVB0/scvb0.cpp 2 | g++ -g -std=c++0x -fopenmp SCVB0/scvb0.cpp -o fastLDA 3 | 4 | GenerateChains: TopicChains/GenerateChains.cpp 5 | g++ -g -std=c++0x -fopenmp TopicChains/GenerateChains.cpp -o GenerateChains 6 | 7 | GetData: TopicChains/GetData.cpp 8 | g++ -g -std=c++0x -fopenmp 
TopicChains/GetData.cpp -o GetData 9 | 10 | serial: SCVB0/scvb0.cpp 11 | g++ -g -std=c++0x SCVB0/scvb0.cpp -o fastLDA 12 | 13 | scvb: SCVB0_Evaluation/scvb.cpp 14 | g++ -g -std=c++0x -fopenmp SCVB0_Evaluation/scvb.cpp -o scvb 15 | 16 | clean: 17 | rm -f *.o fastLDA GetData GenerateChains 18 | -------------------------------------------------------------------------------- /Scrapper/removeInfrequentWords.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | vocabDict = {} 5 | 6 | files = os.listdir('/home/vspathak/git/DynamicLDA/Scrapper/output') 7 | 8 | for fileName in files: 9 | for line in open('output/'+fileName, 'r'): 10 | line = line.strip() 11 | words = line.split() 12 | for key in words: 13 | if key in vocabDict: 14 | vocabDict[key] += 1 15 | else: 16 | vocabDict[key] = 1 17 | 18 | stopwordsFile = open('stopwords', 'a') 19 | for term in vocabDict: 20 | if vocabDict[term] < 26: 21 | # print term 22 | stopwordsFile.write(term.encode('utf-8') + '\n') 23 | if vocabDict[term] > 100000: 24 | print term 25 | stopwordsFile.close() 26 | -------------------------------------------------------------------------------- /Scrapper/getarticle.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from lxml import html 3 | import requests 4 | # http://www.reuters.com/article/2012/12/02/us-space-france-russia-idUSBRE8B101L20121202 5 | # http://www.reuters.com/article/2007/01/02/music-jazz-chicago-dc-idUSN2927338620070102 6 | # http://www.reuters.com/article/2014/03/28/us-microsoft-office-ipad-idUSBREA2Q1MV20140328 7 | page = requests.get('http://www.reuters.com/article/2014/01/02/walmart-china-idUSL3N0KC0LH20140102') 8 | tree = html.fromstring(page.text) 9 | 10 | # This will create a list of article URLs: 11 | # URL = tree.xpath('//div[@class="headlineMed"]/a/@href') 12 | # Title = tree.xpath('//div[@class="headlineMed"]/a/text()' 13 | Title = tree.xpath('//*[@id="content"]/div[4]/div/div[3]/div[1]/h1/text()') 14 | 15 | Location = tree.xpath('//*[@id="articleInfo"]/p[2]/span[1]/text()') 16 | 17 | Paragraphs = tree.xpath('//*[@id="articleText"]/p/text()') 18 | 19 | print 'Paragraphs: ', Paragraphs 20 | print 'Location: ' , Location 21 | print 'Title:' , Title 22 | -------------------------------------------------------------------------------- /Evaluation/getWordVariation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | if len(sys.argv) >= 3: 5 | topic_id = int(sys.argv[1]) 6 | word_id = int(sys.argv[2]) 7 | filepath = sys.argv[3] 8 | startYear = int(sys.argv[4]) 9 | endYear = int(sys.argv[5]) 10 | 11 | files = os.listdir(filepath) 12 | 13 | for year in range(startYear, endYear + 1): 14 | for month in range(1, 13): 15 | if(month > 9): 16 | fileName = filepath + '/' + 'topics_' + str(year) + str(month) + '.txt' 17 | if os.path.isfile(fileName): 18 | # print 'reading file: ' + fileName 19 | monthFile = open(fileName, 'r') 20 | lines = monthFile.readlines() 21 | topic = lines[topic_id] 22 | prob = float(topic.split(',')[word_id]) 23 | print prob 24 | else: 25 | fileName = filepath + '/' + 'topics_' + str(year) + '0' + str(month) + '.txt' 26 | if os.path.isfile(fileName): 27 | # print 'reading file: ' + fileName 28 | monthFile = open(fileName, 'r') 29 | lines = monthFile.readlines() 30 | topic = lines[topic_id] 31 | prob = float(topic.split(',')[word_id]) 32 | print prob 33 | 
-------------------------------------------------------------------------------- /Evaluation/lookupWords.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | def buildtriple(idprob,vocab): 6 | # if (len(idprob) == 0): 7 | # print 'working' 8 | word_Prob = idprob.split(':') 9 | word = vocab[int(word_Prob[0])] 10 | return (word, word_Prob[0], word_Prob[1]) 11 | 12 | #input: directory of input files, directory for output files, vocab file 13 | if len(sys.argv) >= 3: 14 | infolder = sys.argv[1] 15 | outfolder = sys.argv[2] 16 | vocabfilename = sys.argv[3] 17 | 18 | vocabfile = open(vocabfilename, 'r') 19 | vocab = [word.strip() for word in vocabfile.readlines()] 20 | vocabfile.close() 21 | 22 | #iterate through files of infolder 23 | for filename in os.listdir(infolder): 24 | infile = open(infolder + '/' + filename) 25 | outfile = open(outfolder + '/' + filename, 'w') 26 | for topic in infile.readlines(): 27 | temp = topic[:-2].split(',') 28 | word_Prob = temp[0].split(':') 29 | if not (word_Prob[0].isdigit()): 30 | continue 31 | outline = [buildtriple(idprob,vocab) for idprob in topic[:-2].split(',')] 32 | map(lambda x: outfile.write(str(x)), outline) 33 | outfile.write('\n') 34 | outfile.close() 35 | infile.close() 36 | -------------------------------------------------------------------------------- /Scrapper/__init__.py: -------------------------------------------------------------------------------- 1 | from lxml import html 2 | import requests 3 | import article as ac 4 | import sys 5 | import random 6 | import nltk 7 | import re 8 | from stemming.porter2 import stem 9 | 10 | 11 | docId = 0 12 | if len(sys.argv) >= 4: 13 | theyear = int(sys.argv[1]) 14 | firstmonth = int(sys.argv[2]) 15 | num_months = int(sys.argv[3]) 16 | seqfile = open('seq-' + str(theyear) + '-' + str(firstmonth) + '-' + str(num_months) + '.txt', 'w') 17 | 18 | else: 19 | print 'usage: python __init__.py year firstmonth num_months' 20 | sys.exit(0) 21 | 22 | for yr in range(theyear, theyear + 1): 23 | year = 'http://www.reuters.com/resources/archive/us/' + str(yr) 24 | for mnth in range(firstmonth, firstmonth + num_months): 25 | if(mnth < 10): 26 | month = '0' + str(mnth) 27 | else: 28 | month = str(mnth) 29 | 30 | monthdocs = 0 31 | for day in range(1, 32): 32 | if(day < 10): 33 | URL = year + month + '0' + str(day) + '.html' 34 | else: 35 | URL = year + month + str(day) + '.html' 36 | 37 | page = requests.get(URL) 38 | tree = html.fromstring(page.text) 39 | URLs = tree.xpath('//div[@class="headlineMed"]/a/@href') 40 | date = URL[-13:-5] 41 | 42 | f = open('output/' + str(date) + '.txt', 'w') 43 | # generate the random vector(python generate a sample without 44 | # replacement from a range of numbers) 45 | 46 | for num in random.sample(range(0, len(URLs)), int(len(URLs))): 47 | doc = ac.article('', date, '', URLs[num], -1) 48 | curpage = requests.get(doc.URL) 49 | curtree = html.fromstring(curpage.text) 50 | Title = curtree.xpath('//*[@id="content"]/div[4]/div/div[3]/div[1]/h1/text()') 51 | Paragraphs = curtree.xpath('//*[@id="articleText"]/p/text()') 52 | if len(Title) > 0: 53 | doc.Title = Title[0].replace('\"', '') 54 | Paragraphs.append(Title[0]) 55 | doc.Text = " ".join(Paragraphs) 56 | doc.Text = doc.Text.replace('\n', ' ') 57 | doc.Text = doc.Text.replace('\"', '') 58 | 59 | if(len(doc.Text.split()) > 100): 60 | docId = docId + 1 61 | doc.id = docId 62 | print doc.id 63 | monthdocs = monthdocs + 1 64 | 65 | docText = re.sub('[^A-Za-z]+', ' 
', doc.Text) 66 | docTitle = re.sub('[^A-Za-z]+', ' ', doc.Title) 67 | docText = docTitle + ' ' + docText 68 | docText = docText.lower() 69 | tokens = docText.split() 70 | 71 | docText = " ".join([stem(t) for t in tokens]) 72 | 73 | f.write(docText.encode('utf-8') + '\n') 74 | 75 | f.close() 76 | seqfile.write(str(theyear) + '-' + str(mnth) + ':' + str(monthdocs) + '\n') 77 | seqfile.close() 78 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | DynamicLDA 2 | ========== 3 | 4 | Dynamic Topic Model of Reuters News Articles between 2007-2013 5 | -------------------------------------------------------------------------------------------- 6 |

We have implemented a fast version of the Dynamic Topic Model proposed by David Blei and John Lafferty in 2006.

7 |

This version takes advantage of recent advances in LDA inference. We implemented the LDA part of DTM using SCVB0, proposed by Foulds et al. in 2013, and parallelized the SCVB0 implementation with OpenMP.
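For orientation, here is a minimal sketch of the per-token SCVB0 update that the implementation applies in its burn-in and main passes (variable names follow SCVB0_Evaluation/scvb.cpp; the full code additionally accumulates minibatch topic-word statistics, omitted here):

```cpp
// Sketch of one SCVB0 token update: compute the variational topic distribution
// for word w in document j, then take a stochastic step on the document counts.
#include <cmath>
#include <vector>

// nPi (W x K) and nTheta (D x K) hold expected topic counts, Nz (K) the per-topic
// totals; alpha and eta are the Dirichlet priors, rhoTheta the step size, m the
// number of times word w occurs in document j, and Cj the length of document j.
void scvb0_token_update(std::vector<std::vector<double>>& nPi,
                        std::vector<std::vector<double>>& nTheta,
                        std::vector<double>& Nz,
                        int j, int w, int m, double Cj,
                        double alpha, double eta, double rhoTheta) {
    const std::size_t K = Nz.size();
    const std::size_t W = nPi.size();
    std::vector<double> gamma(K);
    double normSum = 0.0;

    // gamma[k] is proportional to (nPi[w][k] + eta) * (nTheta[j][k] + alpha) / (Nz[k] + eta * W)
    for (std::size_t k = 0; k < K; ++k) {
        gamma[k] = (nPi[w][k] + eta) * (nTheta[j][k] + alpha) / (Nz[k] + eta * W);
        normSum += gamma[k];
    }
    for (std::size_t k = 0; k < K; ++k) gamma[k] /= normSum;

    // Online update of the per-document topic counts with step size rhoTheta.
    for (std::size_t k = 0; k < K; ++k) {
        nTheta[j][k] = std::pow(1.0 - rhoTheta, m) * nTheta[j][k]
                     + (1.0 - std::pow(1.0 - rhoTheta, m)) * Cj * gamma[k];
    }
}
```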

8 |

In our evaluation, even the serial version gives a 36x speedup, and the parallel version, run on a 2 GHz Core 2 Duo machine with 2 GB of RAM, gives a 53x speedup.
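A condensed sketch of how the parallel version distributes the work (mirroring the OpenMP structure in SCVB0_Evaluation/scvb.cpp): minibatches are split across threads, each thread accumulates its own minibatch statistics, and the shared counts are then merged. The repo merges with atomic updates on the topic totals; the sketch below simply serializes the merge to keep it race-free.

```cpp
// Illustrative OpenMP skeleton of one SCVB0 pass over the minibatches.
#include <omp.h>
#include <vector>

void parallel_minibatch_pass(std::vector<std::vector<double>>& nPi, // W x K shared topic-word counts
                             std::vector<double>& Nz,               // K shared topic totals
                             int numBatches, double rhoPhi) {
    const std::size_t W = nPi.size();
    const std::size_t K = Nz.size();

    #pragma omp parallel
    {
        // Thread-local minibatch accumulators (nPhiHat / nzHat in the repo's code).
        std::vector<std::vector<double>> nPhiHat(W, std::vector<double>(K, 0.0));
        std::vector<double> nzHat(K, 0.0);

        // Each thread processes whole minibatches, running the burn-in and main
        // token updates for its documents and accumulating into nPhiHat / nzHat.
        #pragma omp for
        for (int b = 0; b < numBatches; ++b) {
            // ... per-document SCVB0 updates for minibatch b go here ...
        }

        // Merge this thread's estimates into the shared statistics.
        #pragma omp critical
        for (std::size_t k = 0; k < K; ++k) {
            for (std::size_t w = 0; w < W; ++w) {
                nPi[w][k] = (1.0 - rhoPhi) * nPi[w][k] + rhoPhi * nPhiHat[w][k];
            }
            Nz[k] = (1.0 - rhoPhi) * Nz[k] + rhoPhi * nzHat[k];
        }
    }
}
```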

9 |

(Report with detailed evaluation)
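For reference, the per-word perplexity reported by the evaluation code (SCVB0_Evaluation/scvb.cpp) for each time slice is computed from the estimated document-topic distribution theta and topic-word distribution Pi as

$$\mathrm{perplexity} = 2^{\,-\frac{1}{C}\sum_{j}\sum_{i=1}^{|d_j|}\log_2\Big(\sum_{k=1}^{K}\theta_{jk}\,\pi_{w_{ji}k}\Big)}$$

where $C$ is the total number of tokens in the time slice and $w_{ji}$ is the $i$-th token of document $j$.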

10 | 11 | Reuters News Dataset Details 12 | ---------------------------- 13 | Timestamped news articles published by Reuters between 2007 and 2013. After preprocessing, the corpus contains 161,989 documents with a vocabulary of 32,468 words. The following preprocessing steps were performed (scripts are available in the Scrapper folder): 14 | 15 | - From the Reuters data we removed all documents shorter than 100 words 16 | - We scraped a random 10% of the articles from each day, purely to keep the corpus size manageable; the assumption is that randomly sampled data will not hinder finding the long-running, major topics. 17 | - We removed all punctuation marks and performed stemming with the Porter2 stemmer 18 | - We also removed words that occur fewer than 25 times or more than 100,000 times (an example run of text2ldac/multitext2ldac is shown under Execution Commands below) 19 | 20 | Topic Chains 21 | ------------ 22 | We have investigated Topic Chains, a solution to the topic birth-death problem in dynamic LDA proposed by Kim et al. in 2013. 23 | - We use the same Reuters dataset and measure topic similarity with the Jensen-Shannon (JS) divergence. 24 | - We evaluate performance at different similarity thresholds and window sizes and obtain results similar to those reported in the original paper. 25 | - We identify some issues in the method and propose solutions to them (please refer to the report for details). 26 | 27 | Execution Commands 28 | ------------------ 29 | 30 | - Scrape data from the Reuters archive website for num_of_months starting at firstmonth of the given year
31 | python __init__.py year firstmonth num_months 32 | - Get stopwords: python removeInfrequentWords.py 33 | - Convert the text data to the lda-c format used by Blei's implementation
34 | python multitext2ldac.py data_folder --stopwords stopwords_file 35 | - Convert the data to UCI format: 36 | python ldac2uci.py 37 | - Compile Dynamic LDA: make 38 | - Execute dynamic topic modeling on the UCI-format dataset
39 | ./fastLDA UCIFormat_data_file iterations NumOfTopics MiniBatchSize Vocab_file GeneratePi 40 | - Get the word trend in a topic
41 | python getWordVariation.py TopicId WordId PiFolderPath StartYear EndYear 42 | - Compile the Topic Chains GetData tool, which extracts all topics in the dataset for every time slice: 43 | make GetData 44 | - Execute GetData for Topic Chains:
./GetData UCIFormat_data_file iterations NumOfTopics MiniBatchSize Vocab_file GeneratePi 45 | - Compile GenerateChains for Topic Chains make GenerateChains 46 | - Execute GenerateChains ./GenerateChains Pi_folder num_topics WindowSize SimilarityThreshold 47 | -------------------------------------------------------------------------------- /TopicChains/GenerateChains.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * TopicChains.cpp 3 | * 4 | * Created on: Apr 22, 2014 5 | * Author: vspathak 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | using namespace std; 26 | using namespace boost; 27 | 28 | typedef adjacency_list Graph; 29 | 30 | // Initialize number of documents, topics and words in vocabulary 31 | unsigned int W, D, K; 32 | 33 | double KLDivergence(double*** Pi, int t, int k, double* M) { 34 | double result = 0.0; 35 | for (unsigned int w = 0; w < W; ++w) { 36 | result += log(Pi[t][w][k] / M[w]) * Pi[t][w][k]; 37 | } 38 | return result; 39 | } 40 | 41 | double JSsimilarity(double*** Pi, int t1, int k1, int t2, int k2) { 42 | double result = 0.0; 43 | double* M = new double[W]; 44 | for (unsigned int w = 0; w < W; ++w) { 45 | M[w] = (Pi[t1][w][k1] + Pi[t2][w][k2]) / 2; 46 | } 47 | result = KLDivergence(Pi, t1, k1, M) + KLDivergence(Pi, t2, k2, M); 48 | result = result / 2; 49 | return result; 50 | } 51 | 52 | void generateTopicLinks(Graph &G, double*** Pi, int timeSlice, int topic, 53 | int numTopics, int windowSize, double threshold) { 54 | for (int w = 0; w < windowSize; w++) { 55 | int numLinks = 0; 56 | for (int k = 0; k < numTopics; k++) { 57 | if ((timeSlice - 1 - w >= 0) && JSsimilarity(Pi, timeSlice, topic, timeSlice - 1 - w, k) > threshold) { 58 | //add edge to graph structure here 59 | int e1 = (timeSlice * numTopics) + topic; 60 | int e2 = ((timeSlice - 1 - w) * numTopics) + k; 61 | 62 | cout << "Adding edge " << e1 << ", " << e2 << endl; 63 | add_edge(e1, e2, G); 64 | numLinks++; 65 | } 66 | } 67 | if (numLinks > 0) { 68 | break; 69 | } 70 | } 71 | } 72 | 73 | void generateAllLinks(Graph &G, double*** Pi, int numTimeSlices, int numTopics, 74 | int windowSize, double threshold) { 75 | for (int t = 0; t < numTimeSlices; t++) { 76 | for (int k = 0; k < numTopics; k++) { 77 | generateTopicLinks(G, Pi, t, k, numTopics, windowSize, threshold); 78 | } 79 | } 80 | } 81 | 82 | int main(int argc, char* argv[]) { 83 | if (argc < 4) { 84 | printf("Usage: ./fastLDA Pi_folder num_topics WindowSize SimilarityThreshold\n"); 85 | return 1; 86 | } 87 | string piFolder = argv[1]; 88 | cout << "Input Pi folder: " << piFolder << endl; 89 | 90 | double ***Pi; 91 | int windowSize = 0; 92 | double similarityThreshold = 0; 93 | 94 | ifstream seqfile; 95 | seqfile.open("Data/seqfile.txt"); 96 | string newline = ""; 97 | vector* months = new vector(); 98 | vector* numOfDocs = new vector(); 99 | vector* monthFirstIdx = new vector(); 100 | vector* monthLastIdx = new vector(); 101 | int curIdx = 0; 102 | 103 | while (seqfile >> newline) { 104 | const char * ptr = strchr(newline.c_str(), ':'); 105 | int count = atoi(ptr + 1); 106 | ptr = "\0"; 107 | int yearMonth = atoi(newline.c_str()); 108 | months->push_back(yearMonth); 109 | numOfDocs->push_back(count); 110 | monthFirstIdx->push_back(curIdx); 111 | monthLastIdx->push_back(curIdx + count); 112 | curIdx += 
count; 113 | } 114 | seqfile.close(); 115 | 116 | K = atoi(argv[2]); 117 | windowSize = atoi(argv[3]); 118 | similarityThreshold = atof(argv[4]); 119 | W = 32468; 120 | 121 | 122 | printf("Number of topics: %d\n", K); 123 | printf("Window Size: %d\n", windowSize); 124 | printf("Similarity Threshold: %f\n", similarityThreshold); 125 | 126 | // Dynamically allocate Pi 127 | Pi = new double**[months->size()]; 128 | for (unsigned int m = 0; m < months->size(); ++m) { 129 | Pi[m] = new double*[W]; 130 | for (unsigned int word = 0; word < W; word++) { 131 | Pi[m][word] = new double[K]; 132 | for(unsigned int k = 0; k < K; k++) { 133 | Pi[m][word][k] = 0; 134 | } 135 | } 136 | } 137 | 138 | //Read Pi files in Memory 139 | for (int timeSlice = 0; timeSlice < (int)months->size(); timeSlice++) { 140 | string fileName = piFolder + "/topics_" + to_string(months->at(timeSlice)) + ".txt"; 141 | cout << "Reading File: " << fileName << endl; 142 | ifstream pifile; 143 | pifile.open(fileName); 144 | int topic = 0; 145 | while (pifile >> newline) { 146 | std::istringstream ss(newline); 147 | std::string token; 148 | 149 | int wordId = 0; 150 | while (std::getline(ss, token, ',')) { 151 | Pi[timeSlice][wordId][topic] = stod(token); 152 | wordId++; 153 | } 154 | topic++; 155 | } 156 | pifile.close(); 157 | 158 | } //All timeSlices finished 159 | // for (int timeSlice = 0; timeSlice < (int) months->size(); timeSlice++) { 160 | // for (int k = 0; k < K; k++) { 161 | // for (int w = 0; w < W; w++) { 162 | // cout << Pi[timeSlice][w][k] << ","; 163 | // } 164 | // cout << endl; 165 | // } 166 | // } 167 | 168 | // MAKE CHAINS 169 | Graph G; 170 | //K is unsigned -- is this a problem? 171 | // generateAllLinks(G, Pi, months->size(), K, windowSize, similarityThreshold); 172 | generateAllLinks(G, Pi, 10, K, windowSize, similarityThreshold); 173 | 174 | vector component(num_vertices(G)); 175 | int num = connected_components(G, &component[0]); 176 | 177 | vector::size_type p; 178 | cout << "Total number of components: " << num << endl; 179 | for (p = 0; p != component.size(); ++p) { 180 | cout << "Vertex " << p << " is in component " << component[p] << endl; 181 | } 182 | 183 | return (0); 184 | 185 | } // End of main 186 | -------------------------------------------------------------------------------- /Scrapper/text2ldac.py: -------------------------------------------------------------------------------- 1 | #This file is part of text2ldac. 2 | 3 | #text2ldac is free software: you can redistribute it and/or modify 4 | #it under the terms of the GNU General Public License as published by 5 | #the Free Software Foundation, either version 3 of the License, or 6 | #(at your option) any later version. 7 | 8 | #text2ldac is distributed in the hope that it will be useful, 9 | #but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | #GNU General Public License for more details. 12 | 13 | #You should have received a copy of the GNU General Public License 14 | #along with text2ldac. If not, see . 15 | 16 | 17 | 18 | import argparse 19 | import codecs 20 | import os 21 | import operator 22 | import string 23 | import sys 24 | 25 | __doc = \ 26 | ''' 27 | This is a program to convert documents into the file format used by David 28 | Blei's lda-c (and hlda-c) implementation. It generates the .dat, .vocab and 29 | .dmap files from .txt files in a given directory. 30 | 31 | cf. 
http://www.cs.princeton.edu/~blei/lda-c/readme.txt 32 | ''' 33 | __author__ = 'Johannes Knopp ' 34 | 35 | def init_parser(): 36 | ''' 37 | Returns an argument parser configured with options for this program 38 | ''' 39 | parser = argparse.ArgumentParser( 40 | description='A program to convert documents to .dat, .vocap and .dmap files' 41 | ) 42 | 43 | #positional argument 44 | parser.add_argument('dirname', action='store', 45 | help='directory containing .txt files (files must be encoded in utf-8)') 46 | 47 | #options 48 | parser.add_argument('-o', '--output', action='store', dest='outdir', 49 | help='directory to store the resulting files') 50 | parser.add_argument('-e', '--extension', action='store', dest='extension', 51 | default='.txt', 52 | help='extension of the files you are looking for. Default: %(default)s') 53 | #TODO minoccurrence should work for the overall occurrence 54 | parser.add_argument('--minoccurrence', action='store', 55 | dest='minoccurrence', type=int, default=1, 56 | help='Minimum occurrences a word needs at least once in one document to be taken into account.') 57 | parser.add_argument('--minlength', action='store', 58 | dest='minlength', type=int, default=1, 59 | help='Minimum length a word needs to be taken into account.') 60 | #stopwords 61 | parser.add_argument('--stopwords', action='store', dest='stopword_file', 62 | help='Remove the stopwords given in the stopword file (one line per stopword).') 63 | 64 | #TODO 65 | parser.add_argument('--mallet', action='store_true', 66 | help='convert data that exists in the format used by mallet. NOT SUPPORTED YET') 67 | 68 | return parser.parse_args() 69 | 70 | 71 | def get_filenames(directory, extension): 72 | ''' 73 | Search for files in the directory ending in EXTENSION and return the full 74 | paths as a list. 
75 | ''' 76 | all_fnames = [] 77 | for dirpath,dirnames,filenames in os.walk(directory): 78 | all_fnames += [os.path.join(dirpath,f) for f in filenames if 79 | f.endswith(extension)] 80 | return all_fnames 81 | 82 | 83 | def clean_word(word): 84 | ''' 85 | returns the word in lowercase without punctuation at the start or end 86 | ''' 87 | return word.rstrip(string.punctuation).lstrip(string.punctuation).lower() 88 | 89 | def load_stopwords(stopword_filename): 90 | ''' 91 | returns a set of stopwords found line by line in the stopwords file 92 | ''' 93 | stopwords = set() 94 | 95 | with codecs.open(stopword_filename, 'r', 'utf-8') as sf: 96 | for line in sf: 97 | if len(line.split()) != 1: 98 | print('ignoring line with more than one stopword:\n"{0}"'.format( 99 | line)) 100 | continue 101 | stopwords.add(line.strip()) 102 | 103 | return stopwords 104 | 105 | def write_document_map_file(fnames, dmap_fname): 106 | """ 107 | Save document's names in the order they were processed 108 | """ 109 | with codecs.open(dmap_fname,'w','utf-8') as d_file: 110 | for title in fnames: 111 | d_file.write(title + '\n') 112 | 113 | def reindex(word_id_dict, min_index): 114 | """ 115 | re-index the word_id for word_id pairs to guarantee that the max 116 | index of the word matches/reflects number of words in word dict 117 | """ 118 | num_word_shifts = 0 119 | for word in word_id_dict: 120 | cur_index = word_id_dict[word] 121 | 122 | if cur_index > min_index: 123 | word_id_dict[word] = min_index + num_word_shifts 124 | num_word_shifts += 1 125 | 126 | def generate_dat_lines_and_word_ids(fnames, config): 127 | dat_lines = [] #.dat file output 128 | word_id_dict = dict() 129 | used_docs = [] #needed to generate .dmap file 130 | 131 | for docname in fnames: 132 | freq_dict = dict() 133 | new_words = set() 134 | 135 | try: 136 | with codecs.open(docname, 'r', 'utf-8') as doc: 137 | for line in doc: 138 | for word in line.split(): 139 | word = clean_word(word) 140 | 141 | if len(word) < config['minlength'] or word in config['stopwords']: 142 | continue 143 | 144 | #word occurrs for the first time 145 | if not word_id_dict.has_key(word): 146 | freq_dict[word] = 1 147 | word_id_dict[word] = len(word_id_dict) 148 | new_words.add(word) 149 | #word may be in word_id_dict but not yet in freq_dict 150 | else: 151 | freq = freq_dict.setdefault(word, 0) 152 | freq_dict[word] = freq + 1 153 | except UnicodeDecodeError as u_error: 154 | print('Document "{0}" has encoding errors and is ignored!\n{1}'.format( 155 | docname, u_error)) 156 | 157 | 158 | if len(freq_dict)==0: #did the document contribute anything? 
159 | print('Document "{0}" (#{1}) seems to be empty and is ignored!'.format( 160 | docname,fnames.index(docname))) 161 | continue 162 | else: 163 | used_docs.append(docname) 164 | 165 | #remove words that do not reach minoccurrence 166 | remove_list = [word for word in freq_dict.iterkeys() if\ 167 | freq_dict[word] < config['minoccurrence']] 168 | #smallest index of a word that is removed 169 | remove_word_min_index = len(word_id_dict) 170 | 171 | for word in remove_list: 172 | freq_dict.pop(word) 173 | #if they are new also remove them from word_id_dict 174 | if word in new_words: 175 | word_index = word_id_dict[word] 176 | if word_index < remove_word_min_index: 177 | remove_word_min_index = word_index 178 | word_id_dict.pop(word) 179 | reindex(word_id_dict, remove_word_min_index) 180 | 181 | dat_line = '' #line for the .dat file 182 | 183 | for word in freq_dict.iterkeys(): 184 | dat_line += str(word_id_dict[word]) + ':' + str(freq_dict[word]) + ' ' 185 | 186 | #last blank in dat_line is removed 187 | dat_lines.append(str(len(freq_dict)) + ' ' + dat_line[:-1] + '\n') 188 | 189 | write_document_map_file(used_docs, config['dmapname']) 190 | 191 | return dat_lines, word_id_dict 192 | 193 | 194 | def generate_dat_and_vocab_files(fnames, config): 195 | 196 | with codecs.open(config['datname'], 'w', 'utf-8') as datfile: 197 | dat_lines, word_id_dict = generate_dat_lines_and_word_ids(fnames, 198 | config) 199 | datfile.writelines(dat_lines) 200 | 201 | #sort word_id_dict ascending by value und write the words in that 202 | #order to a .vocab file 203 | with codecs.open(config['vocabname'], 'w', 'utf-8') as vocabfile: 204 | for item in sorted(word_id_dict.iteritems(), key=operator.itemgetter(1)): 205 | vocabfile.write(item[0]+'\n') 206 | 207 | print('Found {0} unique words in {1} files.'.format( 208 | len(word_id_dict), len(fnames))) 209 | print('Results can be found in "{0}" and "{1}"'.format( 210 | config['datname'], config['vocabname'])) 211 | 212 | 213 | if __name__=='__main__': 214 | 215 | parser = init_parser() 216 | 217 | #directory with document files 218 | dirname = parser.dirname 219 | dirname = dirname + os.sep if not dirname.endswith(os.sep) else dirname 220 | #directory for results 221 | outdir_name = parser.outdir if parser.outdir else dirname 222 | outdir_name = outdir_name + os.sep if not outdir_name.endswith(os.sep) else outdir_name 223 | #prefix of the .dat and .vocab files 224 | basename = os.path.dirname(dirname).split('/')[-1] 225 | 226 | 227 | if not os.path.exists(outdir_name): 228 | os.mkdir(outdir_name) 229 | 230 | #store configuration 231 | config = dict() 232 | config['datname'] = outdir_name + basename + '.dat' 233 | config['vocabname'] = outdir_name + basename + '.vocab' 234 | config['dmapname'] = outdir_name + basename + '.dmap' 235 | config['minlength'] = parser.minlength 236 | config['minoccurrence'] = parser.minoccurrence 237 | if parser.stopword_file: 238 | config['stopwords'] = load_stopwords(parser.stopword_file) 239 | else: 240 | config['stopwords'] = set() 241 | 242 | fnames = get_filenames(dirname, parser.extension) 243 | 244 | try: 245 | generate_dat_and_vocab_files(fnames, config) 246 | except IOError as ioe: 247 | print(ioe) 248 | sys.exit(1) 249 | -------------------------------------------------------------------------------- /Scrapper/multitext2ldac.py: -------------------------------------------------------------------------------- 1 | #This file is part of text2ldac. 
2 | 3 | #text2ldac is free software: you can redistribute it and/or modify 4 | #it under the terms of the GNU General Public License as published by 5 | #the Free Software Foundation, either version 3 of the License, or 6 | #(at your option) any later version. 7 | 8 | #text2ldac is distributed in the hope that it will be useful, 9 | #but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | #GNU General Public License for more details. 12 | 13 | #You should have received a copy of the GNU General Public License 14 | #along with text2ldac. If not, see . 15 | 16 | 17 | 18 | import argparse 19 | import codecs 20 | import os 21 | import operator 22 | import string 23 | import sys 24 | 25 | __doc = \ 26 | ''' 27 | This is a program to convert documents into the file format used by David 28 | Blei's lda-c (and hlda-c) implementation. It generates the .dat, .vocab and 29 | .dmap files from .txt files in a given directory. 30 | 31 | cf. http://www.cs.princeton.edu/~blei/lda-c/readme.txt 32 | ''' 33 | __author__ = 'Johannes Knopp ' 34 | 35 | def init_parser(): 36 | ''' 37 | Returns an argument parser configured with options for this program 38 | ''' 39 | parser = argparse.ArgumentParser( 40 | description='A program to convert documents to .dat, .vocap and .dmap files' 41 | ) 42 | 43 | #positional argument 44 | parser.add_argument('dirname', action='store', 45 | help='directory containing .txt files (files must be encoded in utf-8)') 46 | 47 | #options 48 | parser.add_argument('-o', '--output', action='store', dest='outdir', 49 | help='directory to store the resulting files') 50 | parser.add_argument('-e', '--extension', action='store', dest='extension', 51 | default='.txt', 52 | help='extension of the files you are looking for. Default: %(default)s') 53 | #TODO minoccurrence should work for the overall occurrence 54 | parser.add_argument('--minoccurrence', action='store', 55 | dest='minoccurrence', type=int, default=1, 56 | help='Minimum occurrences a word needs at least once in one document to be taken into account.') 57 | parser.add_argument('--minlength', action='store', 58 | dest='minlength', type=int, default=1, 59 | help='Minimum length a word needs to be taken into account.') 60 | #stopwords 61 | parser.add_argument('--stopwords', action='store', dest='stopword_file', 62 | help='Remove the stopwords given in the stopword file (one line per stopword).') 63 | 64 | #TODO 65 | parser.add_argument('--mallet', action='store_true', 66 | help='convert data that exists in the format used by mallet. NOT SUPPORTED YET') 67 | 68 | return parser.parse_args() 69 | 70 | 71 | def get_filenames(directory, extension): 72 | ''' 73 | Search for files in the directory ending in EXTENSION and return the full 74 | paths as a list. 
75 | ''' 76 | all_fnames = [] 77 | for dirpath,dirnames,filenames in os.walk(directory): 78 | all_fnames += [os.path.join(dirpath,f) for f in filenames if 79 | f.endswith(extension)] 80 | return all_fnames 81 | 82 | 83 | def clean_word(word): 84 | ''' 85 | returns the word in lowercase without punctuation at the start or end 86 | ''' 87 | return word.rstrip(string.punctuation).lstrip(string.punctuation).lower() 88 | 89 | def load_stopwords(stopword_filename): 90 | ''' 91 | returns a set of stopwords found line by line in the stopwords file 92 | ''' 93 | stopwords = set() 94 | 95 | with codecs.open(stopword_filename, 'r', 'utf-8') as sf: 96 | for line in sf: 97 | if len(line.split()) != 1: 98 | print('ignoring line with more than one stopword:\n"{0}"'.format( 99 | line)) 100 | continue 101 | stopwords.add(line.strip()) 102 | 103 | return stopwords 104 | 105 | def write_document_map_file(fnames, dmap_fname): 106 | """ 107 | Save document's names in the order they were processed 108 | """ 109 | with codecs.open(dmap_fname,'w','utf-8') as d_file: 110 | for title in fnames: 111 | d_file.write(title + '\n') 112 | 113 | def reindex(word_id_dict, min_index): 114 | """ 115 | re-index the word_id for word_id pairs to guarantee that the max 116 | index of the word matches/reflects number of words in word dict 117 | """ 118 | num_word_shifts = 0 119 | for word in word_id_dict: 120 | cur_index = word_id_dict[word] 121 | 122 | if cur_index > min_index: 123 | word_id_dict[word] = min_index + num_word_shifts 124 | num_word_shifts += 1 125 | 126 | def generate_dat_lines_and_word_ids(fnames, config): 127 | dat_lines = [] #.dat file output 128 | word_id_dict = dict() 129 | used_docs = [] #needed to generate .dmap file 130 | 131 | for dayname in fnames: 132 | print dayname 133 | try: 134 | with codecs.open(dayname, 'r', 'utf-8') as day: 135 | for line in day: 136 | freq_dict = dict() 137 | new_words = set() 138 | 139 | for word in line.split(): 140 | word = clean_word(word) 141 | 142 | if len(word) < config['minlength'] or word in config['stopwords']: 143 | continue 144 | 145 | #word occurrs for the first time 146 | if not word_id_dict.has_key(word): 147 | freq_dict[word] = 1 148 | word_id_dict[word] = len(word_id_dict) 149 | new_words.add(word) 150 | #word may be in word_id_dict but not yet in freq_dict 151 | else: 152 | freq = freq_dict.setdefault(word, 0) 153 | freq_dict[word] = freq + 1 154 | 155 | if len(freq_dict)==0: #did the document contribute anything? 
156 | print('Document "{0}" (#{1}) seems to be empty and is ignored!'.format( 157 | dayname,fnames.index(docname))) 158 | continue 159 | else: 160 | used_docs.append(dayname) 161 | 162 | #remove words that do not reach minoccurrence 163 | remove_list = [word for word in freq_dict.iterkeys() if\ 164 | freq_dict[word] < config['minoccurrence']] 165 | #smallest index of a word that is removed 166 | remove_word_min_index = len(word_id_dict) 167 | 168 | for word in remove_list: 169 | freq_dict.pop(word) 170 | #if they are new also remove them from word_id_dict 171 | if word in new_words: 172 | word_index = word_id_dict[word] 173 | if word_index < remove_word_min_index: 174 | remove_word_min_index = word_index 175 | word_id_dict.pop(word) 176 | reindex(word_id_dict, remove_word_min_index) 177 | 178 | dat_line = '' #line for the .dat file 179 | 180 | for word in freq_dict.iterkeys(): 181 | dat_line += str(word_id_dict[word]) + ':' + str(freq_dict[word]) + ' ' 182 | 183 | #last blank in dat_line is removed 184 | dat_lines.append(str(len(freq_dict)) + ' ' + dat_line[:-1] + '\n') 185 | 186 | except UnicodeDecodeError as u_error: 187 | print('Document "{0}" has encoding errors and is ignored!\n{1}'.format(dayname, u_error)) 188 | 189 | #write_document_map_file(used_docs, config['dmapname']) 190 | 191 | return dat_lines, word_id_dict 192 | 193 | def generate_dat_and_vocab_files(fnames, config): 194 | 195 | with codecs.open(config['datname'], 'w', 'utf-8') as datfile: 196 | dat_lines, word_id_dict = generate_dat_lines_and_word_ids(fnames, 197 | config) 198 | datfile.writelines(dat_lines) 199 | 200 | #sort word_id_dict ascending by value und write the words in that 201 | #order to a .vocab file 202 | with codecs.open(config['vocabname'], 'w', 'utf-8') as vocabfile: 203 | for item in sorted(word_id_dict.iteritems(), key=operator.itemgetter(1)): 204 | vocabfile.write(item[0]+'\n') 205 | 206 | print('Found {0} unique words in {1} files.'.format( 207 | len(word_id_dict), len(fnames))) 208 | print('Results can be found in "{0}" and "{1}"'.format( 209 | config['datname'], config['vocabname'])) 210 | 211 | 212 | if __name__=='__main__': 213 | 214 | parser = init_parser() 215 | 216 | #directory with document files 217 | dirname = parser.dirname 218 | dirname = dirname + os.sep if not dirname.endswith(os.sep) else dirname 219 | #directory for results 220 | outdir_name = parser.outdir if parser.outdir else dirname 221 | outdir_name = outdir_name + os.sep if not outdir_name.endswith(os.sep) else outdir_name 222 | #prefix of the .dat and .vocab files 223 | basename = os.path.dirname(dirname).split('/')[-1] 224 | 225 | 226 | if not os.path.exists(outdir_name): 227 | os.mkdir(outdir_name) 228 | 229 | #store configuration 230 | config = dict() 231 | config['datname'] = outdir_name + basename + '.dat' 232 | config['vocabname'] = outdir_name + basename + '.vocab' 233 | config['dmapname'] = outdir_name + basename + '.dmap' 234 | config['minlength'] = parser.minlength 235 | config['minoccurrence'] = parser.minoccurrence 236 | if parser.stopword_file: 237 | config['stopwords'] = load_stopwords(parser.stopword_file) 238 | else: 239 | config['stopwords'] = set() 240 | 241 | fnames = get_filenames(dirname, parser.extension) 242 | 243 | try: 244 | generate_dat_and_vocab_files(fnames, config) 245 | except IOError as ioe: 246 | print(ioe) 247 | sys.exit(1) 248 | -------------------------------------------------------------------------------- /SCVB0_Evaluation/scvb.cpp: 
-------------------------------------------------------------------------------- 1 | /* 2 | * scvb.cpp 3 | * 4 | * Created on: May 3, 2014 5 | * Author: vspathak 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | using namespace std; 22 | 23 | unsigned int W, D, K; 24 | 25 | int main(int argc, char **argv) { 26 | 27 | if (argc < 4) { 28 | printf("Usage: ./scvb inputfile num_iterations num_topics miniBatchSize vocabFile anything\n"); 29 | return 1; 30 | } 31 | 32 | // Initlialize expected topic counts per document 33 | double **nTheta; 34 | // Dynamically 35 | double **nPi; 36 | double *N_z; 37 | // Initialize estimates from each minibatch 38 | // Initialize step sizes 39 | double rhoTheta = 0; 40 | double rhoPhi = 0; 41 | double **Pi; 42 | double **theta; 43 | double **perplexities; 44 | // Initlalize dirichlet prior parameters 45 | double alpha, eta; 46 | double M; // Number of documents in each minibatch 47 | int Cj = 0; 48 | unsigned int i, j, k, w, MAXITER; 49 | int batch_idx = 0; 50 | int C = 0; 51 | int iter = 0; 52 | int NNZ; 53 | double perplexityval, innerval; 54 | ofstream pfile; 55 | pfile.open("perplexity.txt"); 56 | 57 | M = 100; //343 works for KOS and only for KOS 58 | eta = 0.01; // was 0.01 59 | alpha = 0.1; 60 | 61 | ifstream seqfile; 62 | seqfile.open("Data/seqfile.txt"); 63 | string newline = ""; 64 | vector* months = new vector(); 65 | vector* numOfDocs = new vector(); 66 | vector* monthFirstIdx = new vector(); 67 | vector* monthLastIdx = new vector(); 68 | int curIdx = 0; 69 | 70 | while (seqfile >> newline) { 71 | const char * ptr = strchr(newline.c_str(), ':'); 72 | int count = atoi(ptr + 1); 73 | ptr = "\0"; 74 | int yearMonth = atoi(newline.c_str()); 75 | months->push_back(yearMonth); 76 | numOfDocs->push_back(count); 77 | monthFirstIdx->push_back(curIdx); 78 | monthLastIdx->push_back(curIdx + count); 79 | curIdx += count; 80 | } 81 | seqfile.close(); 82 | 83 | //if user also specified a minibatch size 84 | if (argc == 5 || argc == 6) { 85 | M = atof(argv[4]); 86 | } 87 | 88 | MAXITER = atoi(argv[2]); 89 | K = atoi(argv[3]); 90 | 91 | printf("Input file: %s\n", argv[1]); 92 | printf("Number of iterations: %d\n", MAXITER); 93 | printf("Number of topics: %d\n", K); 94 | printf("Minibatch size: %f\n", M); 95 | printf("alpha: %f\n", alpha); 96 | printf("eta: %f\n", eta); 97 | 98 | // Read the file and store it in DATA 99 | FILE* fptr; 100 | unsigned int docnum, wnum; 101 | unsigned char countnum; 102 | 103 | fptr = fopen(argv[1], "rt"); 104 | 105 | fscanf(fptr, "%d\n", &D); 106 | fscanf(fptr, "%d\n", &W); 107 | fscanf(fptr, "%d\n", &NNZ); 108 | 109 | printf("Number of documents: %d\n", D); 110 | printf("Vocabulary size: %d\n", W); 111 | 112 | // Dynamically allocate phi 113 | Pi = new double*[W]; 114 | //#pragma omp parallel for 115 | for (w = 0; w < W; w++) { 116 | Pi[w] = new double[K]; 117 | } 118 | 119 | printf("allocated phi\n"); 120 | 121 | // Dynamically allocate theta 122 | 123 | theta = new double*[D]; 124 | //#pragma omp parallel for 125 | for (i = 0; i < D; i++) { 126 | theta[i] = new double[K]; 127 | } 128 | 129 | printf("allocated theta\n"); 130 | 131 | vector > corpus; 132 | vector corpus_size(D, 0); 133 | corpus.resize(D); 134 | vector > corpus_expanded; 135 | corpus_expanded.resize(D); 136 | 137 | while (!feof(fptr)) { 138 | fscanf(fptr, "%d %d %hhu\n", &docnum, &wnum, &countnum); 139 | 140 | corpus[docnum - 
1].push_back(wnum - 1); 141 | corpus[docnum - 1].push_back(countnum); 142 | 143 | corpus_size[docnum - 1] += countnum; 144 | 145 | for (i = 0; i < countnum; i++) { 146 | corpus_expanded[docnum - 1].push_back(wnum - 1); 147 | } 148 | } 149 | fclose(fptr); 150 | 151 | 152 | // Initialize phi_est and all other arrays 153 | nPi = new double*[W]; 154 | 155 | for (i = 0; i < W; i++) { 156 | nPi[i] = new double[K]; 157 | } 158 | 159 | for (i = 0; i < W; i++) { 160 | for (k = 0; k < K; k++) { 161 | nPi[i][k] = rand() % 10; 162 | } 163 | } 164 | 165 | // Initialize n_z and n_z_est and other arrays 166 | N_z = new double[K]; 167 | for (k = 0; k < K; k++) { 168 | N_z[k] = 0; 169 | } 170 | 171 | nTheta = new double*[D]; 172 | for (i = 0; i < D; i++) { 173 | nTheta[i] = new double[K]; 174 | } 175 | 176 | for (i = 0; i < D; i++) { 177 | for (k = 0; k < K; k++) { 178 | nTheta[i][k] = rand() % 10; 179 | } 180 | } 181 | 182 | perplexities = new double*[months->size()]; 183 | for (i = 0; i < months->size(); i++) { 184 | perplexities[i] = new double[MAXITER]; 185 | for (unsigned int a = 0; a < MAXITER; ++a) { 186 | perplexities[i][a] = 0; 187 | } 188 | } 189 | 190 | int*** topwords; 191 | double** maxval; 192 | topwords = new int**[months->size()]; 193 | 194 | for (int timeSlice = 0; timeSlice < (int) months->size(); timeSlice++) { 195 | cout << endl << (*months)[timeSlice] << " " << (*numOfDocs)[timeSlice] << endl; 196 | 197 | for (i = 0; i < W; i++) { 198 | for (k = 0; k < K; k++) { 199 | nPi[i][k] = rand() % 10; 200 | } 201 | } 202 | 203 | //if parallelizing this, make sure to avoid race condition (most likely use reduction) 204 | for (k = 0; k < K; k++) { 205 | N_z[k] = 0; 206 | for (w = 0; w < W; w++) { 207 | N_z[k] += nPi[w][k]; 208 | } 209 | } 210 | 211 | 212 | // Find the total number of word in the document 213 | int monthFirstDoc = monthFirstIdx->at(timeSlice); 214 | int monthLastDoc = monthLastIdx->at(timeSlice); 215 | 216 | int monthD = monthLastDoc - monthFirstDoc; 217 | 218 | C = 0; 219 | 220 | for (j = monthFirstDoc; j < (unsigned)monthLastDoc; j++) { 221 | C += corpus_size[j]; 222 | } 223 | 224 | printf("Number of words in corpus: %d\n", C); 225 | 226 | int firstdoc = 0; 227 | int lastdoc = 0; 228 | int DM = monthD / M; 229 | 230 | for (iter = 0; iter < (int)MAXITER; iter++) { 231 | // Decide rho_phi and rho_theta 232 | rhoPhi = 10 / pow((1000 + iter), 0.9); 233 | rhoTheta = 1 / pow((10 + iter), 0.9); 234 | 235 | #pragma omp parallel private(batch_idx,j,k,i,w,firstdoc,lastdoc) 236 | { 237 | double *gamma = new double[K]; 238 | double *nzHat = new double[K]; 239 | double **nPhiHat = new double *[W]; 240 | for (k = 0; k < K; k++) { 241 | gamma[k] = 0; 242 | nzHat[k] = 0; 243 | } 244 | for (i = 0; i < W; i++) { 245 | nPhiHat[i] = new double[K]; 246 | for (k = 0; k < K; k++) { 247 | nPhiHat[i][k] = 0; 248 | } 249 | } 250 | 251 | #pragma omp for 252 | for (batch_idx = 0; batch_idx < DM+1; batch_idx++) { 253 | 254 | // Decide the document indices which go in each minibatch 255 | firstdoc = monthFirstDoc + (batch_idx * M); 256 | lastdoc = monthFirstDoc + ((batch_idx + 1) * M); 257 | 258 | if (batch_idx == DM) { 259 | lastdoc = monthLastDoc; 260 | } 261 | for (j = (unsigned)firstdoc; j < (unsigned)lastdoc; j++) { 262 | 263 | // First perform the burn-in passes 264 | // Iteration of burn in passes 265 | 266 | // Store size of corpus in Cj 267 | Cj = corpus_size[j]; 268 | 269 | for (i = 0; i < (corpus[j].size() / 2); i++) {// indexing is very different here! 
270 | 271 | int w_aj = corpus[j][2 * i]; 272 | int m_aj = corpus[j][(2 * i) + 1]; 273 | // Update gamma_ij and N_theta 274 | double normSum = 0; 275 | 276 | for (k = 0; k < K; k++) { 277 | gamma[k] = (nPi[w_aj][k] + eta) * (nTheta[j][k] + alpha) / (N_z[k] + (eta * W)); 278 | normSum += gamma[k]; 279 | } 280 | 281 | for (k = 0; k < K; k++) { 282 | gamma[k] = gamma[k] / normSum; 283 | } 284 | 285 | for (k = 0; k < K; k++) { 286 | 287 | nTheta[j][k] = (pow((1 - rhoTheta), m_aj) * nTheta[j][k]) 288 | + ((1 - pow((1 - rhoTheta), m_aj)) * Cj * gamma[k]); 289 | } 290 | 291 | } 292 | 293 | // Iteration of the main loop 294 | for (i = 0; i < (corpus[j].size() / 2); i++) { // indexing is very different here! 295 | 296 | int w_aj = corpus[j][2 * i]; 297 | int m_aj = corpus[j][(2 * i) + 1]; 298 | double normSum = 0; 299 | for (k = 0; k < K; k++) { 300 | gamma[k] = (nPi[w_aj][k] + eta) * (nTheta[j][k] + alpha) / (N_z[k] + (eta * W)); 301 | normSum += gamma[k]; 302 | } 303 | 304 | for (k = 0; k < K; k++) { 305 | gamma[k] = gamma[k] / normSum; 306 | } 307 | 308 | // Update N_theta estimates 309 | for (k = 0; k < K; k++) { 310 | nTheta[j][k] = (pow((1 - rhoTheta), m_aj) * nTheta[j][k]) 311 | + ((1 - pow((1 - rhoTheta), m_aj)) * Cj * gamma[k]); 312 | 313 | nPhiHat[w_aj][k] = nPhiHat[w_aj][k] + (C * gamma[k] / M); 314 | 315 | nzHat[k] = nzHat[k] + (C * gamma[k] / M); 316 | } 317 | } 318 | 319 | } // End of j 320 | 321 | // Update the estimates matrix 322 | for (k = 0; k < K; k++) { 323 | for (w = 0; w < W; w++) { 324 | nPi[w][k] = (1 - rhoPhi) * nPi[w][k] + rhoPhi * nPhiHat[w][k]; 325 | } 326 | #pragma omp atomic 327 | N_z[k] *= (1 - rhoPhi); 328 | #pragma omp atomic 329 | N_z[k] += rhoPhi * nzHat[k]; 330 | } 331 | 332 | } // End of batch_idx 333 | 334 | // Compute phi 335 | #pragma omp for 336 | for (k = 0; k < K; k++) { 337 | double normSum = 0; 338 | for (w = 0; w < W; w++) { 339 | nPi[w][k] += eta; 340 | normSum += nPi[w][k]; 341 | } 342 | for (w = 0; w < W; w++) { 343 | Pi[w][k] = (double) nPi[w][k] / normSum; 344 | } 345 | } 346 | 347 | // Compute theta 348 | #pragma omp for 349 | for (i = monthFirstDoc; i < (unsigned)monthLastDoc; i++) { 350 | double normSum = 0; 351 | for (k = 0; k < K; k++) { 352 | nTheta[i][k] += alpha; 353 | normSum += nTheta[i][k]; 354 | } 355 | for (k = 0; k < K; k++) { 356 | theta[i][k] = (double) nTheta[i][k] / normSum; 357 | } 358 | } 359 | 360 | delete[] gamma; 361 | delete[] nzHat; 362 | 363 | for (i = 0; i < W; i++) { 364 | delete[] nPhiHat[i]; 365 | } 366 | 367 | delete[] nPhiHat; 368 | 369 | } 370 | 371 | // Calculate the perplexity here 372 | // Compute posterior means here 373 | // Iterate over the corpus here 374 | perplexityval = 0; 375 | #pragma omp parallel for private(j,i,k) reduction(+:innerval) reduction(+:perplexityval) 376 | for (j = monthFirstDoc; j < (unsigned)monthLastDoc; j++) { 377 | for (i = 0; i < corpus_expanded[j].size(); i++) { 378 | innerval = 0; 379 | for (k = 0; k < K; k++) { 380 | innerval += (theta[j][k] * Pi[corpus_expanded[j][i]][k]); 381 | } 382 | perplexityval += (log(innerval) / log(2)); 383 | } 384 | } 385 | printf("%d,%f\n", iter, pow(2, -perplexityval / C)); 386 | perplexities[timeSlice][iter] = pow(2, -perplexityval / C); 387 | 388 | pfile << iter + 1 << "," << perplexities[timeSlice][iter] << endl; 389 | pfile.flush(); 390 | 391 | } // End of iter 392 | 393 | if (argc == 7) { 394 | ofstream pifile; 395 | pifile.open("Pi/topics_" + to_string(months->at(timeSlice)) + ".txt"); 396 | for (k = 0; k < K; k++) { 397 | for (w = 0; w < W; w++) 
{ 398 | pifile << Pi[w][k] << ","; 399 | } 400 | pifile << endl; 401 | } 402 | pifile.close(); 403 | } 404 | 405 | //compute the top 100 words for each topic 406 | 407 | topwords[timeSlice] = new int*[K]; 408 | maxval = new double*[K]; 409 | for (k = 0; k < K; k++) { 410 | topwords[timeSlice][k] = new int[100]; 411 | maxval[k] = new double[100]; 412 | } 413 | for (k = 0; k < K; k++) { 414 | double oldMax = std::numeric_limits::max(); 415 | for (i = 0; i < 100; i++) { 416 | double max = -1; 417 | int max_idx = -1; 418 | for (w = 0; w < W; w++) { 419 | if ((oldMax > Pi[w][k]) && (Pi[w][k] > max)) { 420 | max = Pi[w][k]; 421 | max_idx = w; 422 | } 423 | } 424 | oldMax = Pi[max_idx][k]; 425 | topwords[timeSlice][k][i] = max_idx; 426 | maxval[k][i] = max; 427 | } 428 | } 429 | 430 | } // End of TimeSlice Loop 431 | string *dict; 432 | dict = new string[W]; 433 | // char word; 434 | //retrieve the words from the file 435 | w = 0; 436 | string line; 437 | ifstream vocabFile(argv[5]); 438 | if (vocabFile.is_open()) { 439 | while (getline(vocabFile, line)) { 440 | dict[w] = line; 441 | w++; 442 | } 443 | vocabFile.close(); 444 | } 445 | 446 | // write topics file 447 | for (int timeSlice = 0; timeSlice < (int) months->size(); timeSlice++) { 448 | ofstream tfile; 449 | tfile.open("output/topics_" + to_string(months->at(timeSlice)) + ".txt"); 450 | for (k = 0; k < K; k++) { 451 | for (w = 0; w < 100; w++) { 452 | tfile << topwords[timeSlice][k][w] << ":" << maxval[k][w] << ","; 453 | } 454 | tfile << endl; 455 | 456 | for (w = 0; w < 100; w++) { 457 | tfile << dict[topwords[timeSlice][k][w]] << ","; 458 | } 459 | tfile << endl; 460 | } 461 | tfile.close(); 462 | } 463 | 464 | ofstream topTenfile; 465 | topTenfile.open("TopTen.txt"); 466 | for (k = 0; k < K; k++) { 467 | for (int timeSlice = 0; timeSlice < (int) months->size(); timeSlice++) { 468 | for (w = 0; w < 10; w++) { 469 | topTenfile << topwords[timeSlice][k][w] << ","; 470 | } 471 | } 472 | topTenfile << endl; 473 | } 474 | topTenfile.close(); 475 | 476 | return (0); 477 | } 478 | -------------------------------------------------------------------------------- /TopicChains/GetData.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * TopicChains.cpp 3 | * 4 | * Created on: Apr 22, 2014 5 | * Author: vspathak 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | using namespace std; 28 | using namespace boost; 29 | 30 | typedef adjacency_list Graph; 31 | 32 | // Initialize number of documents, topics and words in vocabulary 33 | unsigned int W, D, K; 34 | 35 | int main(int argc, char* argv[]) { 36 | if (argc < 4) { 37 | printf("Usage: ./fastLDA inputfile num_iterations num_topics\n"); 38 | return 1; 39 | } 40 | 41 | // Initlialize expected topic counts per document 42 | double **nTheta; 43 | // Dynamically 44 | double **nPi; 45 | double *N_z; 46 | // Initialize estimates from each minibatch 47 | // Initialize step sizes 48 | double rhoTheta = 0; 49 | double rhoPhi = 0; 50 | double ***Pi; 51 | double **theta; 52 | double **perplexities; 53 | // Initlalize dirichlet prior parameters 54 | double alpha, eta; 55 | double M; // Number of documents in each minibatch 56 | int Cj = 0; 57 | unsigned int i, j, k, w, MAXITER; 58 | int batch_idx = 0; 59 | int C = 0; 60 | int iter = 0; 61 | int NNZ; 
62 | double perplexityval, innerval; 63 | ofstream pfile; 64 | pfile.open("perplexity.txt"); 65 | 66 | M = 100; //343 works for KOS and only for KOS 67 | eta = 0.01; // was 0.01 68 | alpha = 0.1; 69 | 70 | ifstream seqfile; 71 | seqfile.open("Data/seqfile.txt"); 72 | string newline = ""; 73 | vector* months = new vector(); 74 | vector* numOfDocs = new vector(); 75 | vector* monthFirstIdx = new vector(); 76 | vector* monthLastIdx = new vector(); 77 | int curIdx = 0; 78 | 79 | while (seqfile >> newline) { 80 | const char * ptr = strchr(newline.c_str(), ':'); 81 | int count = atoi(ptr + 1); 82 | ptr = "\0"; 83 | int yearMonth = atoi(newline.c_str()); 84 | months->push_back(yearMonth); 85 | numOfDocs->push_back(count); 86 | monthFirstIdx->push_back(curIdx); 87 | monthLastIdx->push_back(curIdx+count); 88 | curIdx += count; 89 | } 90 | seqfile.close(); 91 | 92 | //if user also specified a minibatch size 93 | if (argc > 4) { 94 | M = atof(argv[4]); 95 | } 96 | 97 | MAXITER = atoi(argv[2]); 98 | K = atoi(argv[3]); 99 | 100 | printf("Input file: %s\n", argv[1]); 101 | printf("Number of iterations: %d\n", MAXITER); 102 | printf("Number of topics: %d\n", K); 103 | printf("Minibatch size: %f\n", M); 104 | printf("alpha: %f\n", alpha); 105 | printf("eta: %f\n", eta); 106 | 107 | // Read the file and store it in DATA 108 | FILE* fptr; 109 | unsigned int docnum, wnum; 110 | unsigned char countnum; 111 | 112 | fptr = fopen(argv[1], "rt"); 113 | 114 | fscanf(fptr, "%d\n", &D); 115 | fscanf(fptr, "%d\n", &W); 116 | fscanf(fptr, "%d\n", &NNZ); 117 | 118 | printf("Number of documents: %d\n", D); 119 | printf("Vocabulary size: %d\n", W); 120 | 121 | // Dynamically allocate phi 122 | Pi = new double**[months->size()]; 123 | for (unsigned int m = 0; m < months->size(); ++m) { 124 | Pi[m] = new double*[W]; 125 | for (unsigned int word = 0; word < W; word++) { 126 | Pi[m][word] = new double[K]; 127 | } 128 | } 129 | //#pragma omp parallel for 130 | 131 | 132 | printf("allocated phi\n"); 133 | 134 | // Dynamically allocate theta 135 | 136 | theta = new double*[D]; 137 | //#pragma omp parallel for 138 | for (i = 0; i < D; i++) { 139 | theta[i] = new double[K]; 140 | } 141 | 142 | printf("allocated theta\n"); 143 | 144 | vector > corpus; 145 | vector corpus_size(D, 0); 146 | corpus.resize(D); 147 | vector > corpus_expanded; 148 | corpus_expanded.resize(D); 149 | 150 | while (!feof(fptr)) { 151 | fscanf(fptr, "%d %d %hhu\n", &docnum, &wnum, &countnum); 152 | 153 | corpus[docnum - 1].push_back(wnum - 1); 154 | corpus[docnum - 1].push_back(countnum); 155 | 156 | corpus_size[docnum - 1] += countnum; 157 | 158 | for (i = 0; i < countnum; i++) { 159 | corpus_expanded[docnum - 1].push_back(wnum - 1); 160 | } 161 | } 162 | fclose(fptr); 163 | 164 | 165 | // Initialize phi_est and all other arrays 166 | nPi = new double*[W]; 167 | 168 | for (i = 0; i < W; i++) { 169 | nPi[i] = new double[K]; 170 | } 171 | 172 | // Initialize n_z and n_z_est and other arrays 173 | N_z = new double[K]; 174 | for (k = 0; k < K; k++) { 175 | N_z[k] = 0; 176 | } 177 | 178 | nTheta = new double*[D]; 179 | for (i = 0; i < D; i++) { 180 | nTheta[i] = new double[K]; 181 | } 182 | 183 | perplexities = new double*[months->size()]; 184 | for (i = 0; i < months->size(); i++) { 185 | perplexities[i] = new double[MAXITER]; 186 | for (unsigned int a = 0; a < MAXITER; ++a) { 187 | perplexities[i][a] = 0; 188 | } 189 | } 190 | 191 | int*** topwords; 192 | topwords = new int**[months->size()]; 193 | 194 | //Generate Numbers according to Gaussian Distribution 
195 | 196 | for (int timeSlice = 0; timeSlice < months->size(); timeSlice++) { 197 | cout << (*months)[timeSlice] << " " << (*numOfDocs)[timeSlice] << endl; 198 | 199 | 200 | for (int doc = 0; doc < D; doc++) { 201 | for (int top = 0; top < K; top++) { 202 | nTheta[doc][top] = rand() % 10; 203 | } 204 | } 205 | 206 | for (int word = 0; word < W; word++) { 207 | for (int top = 0; top < K; top++) { 208 | nPi[word][top] = rand() % 10; 209 | } 210 | } 211 | 212 | //if parallelizing this, make sure to avoid race condition (most likely use reduction) 213 | for (k = 0; k < K; k++) { 214 | N_z[k] = 0; 215 | for (w = 0; w < W; w++) { 216 | N_z[k] += nPi[w][k]; 217 | } 218 | } 219 | 220 | // Find the total number of word in the document 221 | int monthFirstDoc = monthFirstIdx->at(timeSlice); 222 | int monthLastDoc = monthLastIdx->at(timeSlice); 223 | 224 | int monthD = monthLastDoc - monthFirstDoc; 225 | 226 | C = 0; 227 | 228 | for (int var = monthFirstDoc; var < monthLastDoc; var++) { 229 | C += corpus_size[var]; 230 | } 231 | 232 | printf("Number of words in corpus: %d\n", C); 233 | 234 | int firstdoc = 0; 235 | int lastdoc = 0; 236 | int DM = monthD / M; 237 | 238 | for (iter = 0; iter < (int)MAXITER; iter++) { 239 | // Decide rho_phi and rho_theta 240 | rhoPhi = 10 / pow((1000 + iter), 0.9); 241 | rhoTheta = 1 / pow((10 + iter), 0.9); 242 | 243 | #pragma omp parallel private(batch_idx,j,k,i,w,firstdoc,lastdoc) 244 | { 245 | double *gamma = new double[K]; 246 | double *nzHat = new double[K]; 247 | double **nPhiHat = new double *[W]; 248 | for (k = 0; k < K; k++) { 249 | gamma[k] = 0; 250 | nzHat[k] = 0; 251 | } 252 | for (i = 0; i < W; i++) { 253 | nPhiHat[i] = new double[K]; 254 | for (k = 0; k < K; k++) { 255 | nPhiHat[i][k] = 0; 256 | } 257 | } 258 | 259 | #pragma omp for 260 | for (batch_idx = 0; batch_idx < DM+1; batch_idx++) { 261 | 262 | // Decide the document indices which go in each minibatch 263 | firstdoc = monthFirstDoc + (batch_idx * M); 264 | lastdoc = monthFirstDoc + ((batch_idx + 1) * M); 265 | 266 | if (batch_idx == DM) { 267 | lastdoc = monthLastDoc; 268 | } 269 | for (j = (unsigned)firstdoc; j < (unsigned)lastdoc; j++) { 270 | 271 | // First perform the burn-in passes 272 | // Iteration of burn in passes 273 | 274 | // Store size of corpus in Cj 275 | Cj = corpus_size[j]; 276 | 277 | for (i = 0; i < (corpus[j].size() / 2); i++) {// indexing is very different here! 278 | 279 | int w_aj = corpus[j][2 * i]; 280 | int m_aj = corpus[j][(2 * i) + 1]; 281 | // Update gamma_ij and N_theta 282 | double normSum = 0; 283 | 284 | for (k = 0; k < K; k++) { 285 | gamma[k] = (nPi[w_aj][k] + eta) * (nTheta[j][k] + alpha) / (N_z[k] + (eta * W)); 286 | normSum += gamma[k]; 287 | } 288 | 289 | for (k = 0; k < K; k++) { 290 | gamma[k] = gamma[k] / normSum; 291 | } 292 | 293 | for (k = 0; k < K; k++) { 294 | 295 | nTheta[j][k] = (pow((1 - rhoTheta), m_aj) * nTheta[j][k]) 296 | + ((1 - pow((1 - rhoTheta), m_aj)) * Cj * gamma[k]); 297 | } 298 | 299 | } 300 | 301 | // Iteration of the main loop 302 | for (i = 0; i < (corpus[j].size() / 2); i++) { // indexing is very different here! 
303 | 304 | int w_aj = corpus[j][2 * i]; 305 | int m_aj = corpus[j][(2 * i) + 1]; 306 | double normSum = 0; 307 | for (k = 0; k < K; k++) { 308 | gamma[k] = (nPi[w_aj][k] + eta) * (nTheta[j][k] + alpha) / (N_z[k] + (eta * W)); 309 | normSum += gamma[k]; 310 | } 311 | 312 | for (k = 0; k < K; k++) { 313 | gamma[k] = gamma[k] / normSum; 314 | } 315 | 316 | // Update N_theta estimates 317 | for (k = 0; k < K; k++) { 318 | nTheta[j][k] = (pow((1 - rhoTheta), m_aj) * nTheta[j][k]) 319 | + ((1 - pow((1 - rhoTheta), m_aj)) * Cj * gamma[k]); 320 | 321 | nPhiHat[w_aj][k] = nPhiHat[w_aj][k] + (C * gamma[k] / M); 322 | 323 | nzHat[k] = nzHat[k] + (C * gamma[k] / M); 324 | } 325 | } 326 | 327 | } // End of j 328 | 329 | // Update the estimates matrix 330 | for (k = 0; k < K; k++) { 331 | for (w = 0; w < W; w++) { 332 | nPi[w][k] = (1 - rhoPhi) * nPi[w][k] + rhoPhi * nPhiHat[w][k]; 333 | } 334 | #pragma omp atomic 335 | N_z[k] *= (1 - rhoPhi); 336 | #pragma omp atomic 337 | N_z[k] += rhoPhi * nzHat[k]; 338 | } 339 | 340 | } // End of batch_idx 341 | 342 | // Compute phi 343 | #pragma omp for 344 | for (k = 0; k < K; k++) { 345 | double normSum = 0; 346 | for (w = 0; w < W; w++) { 347 | nPi[w][k] += eta; 348 | normSum += nPi[w][k]; 349 | } 350 | 351 | for (w = 0; w < W; w++) { 352 | Pi[timeSlice][w][k] = (double) nPi[w][k] / normSum; 353 | } 354 | } 355 | 356 | // Compute theta 357 | #pragma omp for 358 | for (int var = monthFirstDoc; var < monthLastDoc; var++) { 359 | double normSum = 0; 360 | for (k = 0; k < K; k++) { 361 | nTheta[var][k] += alpha; 362 | normSum += nTheta[var][k]; 363 | } 364 | 365 | for (k = 0; k < K; k++) { 366 | theta[var][k] = (double) nTheta[var][k] / normSum; 367 | } 368 | } 369 | 370 | delete[] gamma; 371 | delete[] nzHat; 372 | 373 | for (i = 0; i < W; i++) { 374 | delete[] nPhiHat[i]; 375 | } 376 | 377 | delete[] nPhiHat; 378 | 379 | } 380 | 381 | // Calculate the perplexity here 382 | perplexityval = 0; 383 | #pragma omp parallel for private(j,i,k) reduction(+:innerval) reduction(+:perplexityval) 384 | for (j = monthFirstDoc; j < monthLastDoc; j++) { 385 | for (i = 0; i < corpus_expanded[j].size(); i++) { 386 | innerval = 0; 387 | for (k = 0; k < K; k++) { 388 | innerval += (theta[j][k] * Pi[timeSlice][corpus_expanded[j][i]][k]); 389 | } 390 | perplexityval += (log(innerval) / log(2)); 391 | } 392 | } 393 | printf("%d,%f\n", iter, pow(2, -perplexityval / C)); 394 | perplexities[timeSlice][iter] = pow(2, -perplexityval / C); 395 | 396 | pfile << iter + 1 << "," << perplexities[timeSlice][iter] << endl; 397 | pfile.flush(); 398 | 399 | } // End of iter 400 | 401 | //compute the top 100 words for each topic 402 | 403 | double** maxval; 404 | topwords[timeSlice] = new int*[K]; 405 | maxval = new double*[K]; 406 | for (k = 0; k < K; k++) { 407 | topwords[timeSlice][k] = new int[100]; 408 | maxval[k] = new double[100]; 409 | } 410 | for (k = 0; k < K; k++) { 411 | double oldMax = std::numeric_limits::max(); 412 | for (i = 0; i < 100; i++) { 413 | double max = -1; 414 | int max_idx = -1; 415 | for (w = 0; w < W; w++) { 416 | if (oldMax > Pi[timeSlice][w][k] && Pi[timeSlice][w][k] > max) { 417 | max = Pi[timeSlice][w][k]; 418 | max_idx = w; 419 | } 420 | } 421 | oldMax = Pi[timeSlice][max_idx][k]; 422 | topwords[timeSlice][k][i] = max_idx; 423 | maxval[k][i] = max; 424 | } 425 | } 426 | 427 | ofstream pifile; 428 | pifile.open("Pi/topics_" + to_string(months->at(timeSlice)) + ".txt"); 429 | for (k = 0; k < K; k++) { 430 | for (w = 0; w < W-1; w++) { 431 | pifile << 
Pi[timeSlice][w][k] << ","; 432 | } 433 | pifile << Pi[timeSlice][W-1][k]; 434 | pifile << endl; 435 | } 436 | pifile.close(); 437 | }//All timeSlices finished 438 | 439 | string *dict; 440 | dict = new string[W]; 441 | // char word; 442 | //retrieve the words from the file 443 | w = 0; 444 | string line; 445 | ifstream vocabFile(argv[5]); 446 | if (vocabFile.is_open()) { 447 | while (getline(vocabFile, line)) { 448 | dict[w] = line; 449 | w++; 450 | } 451 | vocabFile.close(); 452 | } 453 | 454 | // write topics file 455 | for (int timeSlice = 0; timeSlice < (int) months->size(); timeSlice++) { 456 | ofstream tfile; 457 | tfile.open("output/topics_" + to_string(months->at(timeSlice)) + ".txt"); 458 | for (k = 0; k < K; k++) { 459 | for (w = 0; w < 100; w++) { 460 | tfile << topwords[timeSlice][k][w];// << ":" << maxval[k][w] << ","; 461 | 462 | } 463 | tfile << endl; 464 | 465 | for (w = 0; w < 100; w++) { 466 | tfile << dict[topwords[timeSlice][k][w]] << ","; 467 | } 468 | 469 | tfile << endl; 470 | } 471 | tfile.close(); 472 | } 473 | 474 | return (0); 475 | 476 | } // End of main 477 | -------------------------------------------------------------------------------- /TopicChains/TopicChains.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * TopicChains.cpp 3 | * 4 | * Created on: Apr 22, 2014 5 | * Author: vspathak 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | using namespace std; 28 | using namespace boost; 29 | 30 | typedef adjacency_list Graph; 31 | 32 | 33 | // Initialize number of documents, topics and words in vocabulary 34 | unsigned int W, D, K; 35 | 36 | double diffclock(clock_t clock1, clock_t clock2) { 37 | double diffticks = clock1 - clock2; 38 | double diffms = (diffticks * 1000) / CLOCKS_PER_SEC; 39 | return diffms; 40 | } 41 | 42 | double KLDivergence(double*** Pi, int t, int k, double* M) { 43 | double result = 0.0; 44 | for (unsigned int w = 0; w < W; ++w) { 45 | result += log(Pi[t][w][k] / M[w]) * Pi[t][w][k]; 46 | } 47 | return result; 48 | } 49 | 50 | double JSsimilarity(double*** Pi, int t1, int k1, int t2, int k2) { 51 | double result = 0.0; 52 | double* M = new double[W]; 53 | for (unsigned int w = 0; w < W; ++w) { 54 | M[w] = (Pi[t1][w][k1] + Pi[t2][w][k2]) / 2; 55 | } 56 | result = KLDivergence(Pi, t1, k1, M) + KLDivergence(Pi, t2, k2, M); 57 | result = result / 2; 58 | return result; 59 | } 60 | 61 | void generateTopicLinks(Graph &G, double*** Pi, int timeSlice, int topic, int numTopics, int windowSize, double threshold) { 62 | for (int w = 0; w < windowSize; w++) { 63 | int numLinks = 0; 64 | for (int k = 0; k < numTopics; k++) { 65 | if ((timeSlice - 1 - w >= 0) 66 | && JSsimilarity(Pi, timeSlice, topic, timeSlice - 1 - w, k) > threshold) { 67 | //add edge to graph structure here 68 | int e1 = (timeSlice * numTopics) + topic; 69 | int e2 = ((timeSlice - 1 - w) * numTopics) + k; 70 | 71 | cout<<"Adding edge "< 0) { 77 | break; 78 | } 79 | } 80 | } 81 | 82 | void generateAllLinks(Graph &G, double*** Pi, int numTimeSlices, int numTopics, int windowSize, double threshold) { 83 | for (int t = 0; t < numTimeSlices; t++) { 84 | for (int k = 0; k < numTopics; k++) { 85 | generateTopicLinks(G, Pi, t, k, numTopics, windowSize, threshold); 86 | } 87 | } 88 | } 89 | 90 | int main(int argc, char* argv[]) 
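// Expected arguments, as read further down (the usage message only mentions
// the first three): argv[1] input file, argv[2] num_iterations,
// argv[3] num_topics, argv[4] minibatch size, argv[6] chain window size,
// argv[7] similarity threshold; argv[5] is the vocabulary file used by the
// currently commented-out topic-printing code.
// Caution: JSsimilarity() above returns the Jensen-Shannon divergence
// (smaller means more similar), so the "> threshold" test in
// generateTopicLinks() links the most dissimilar topic pairs; if similarity
// is intended, the comparison or the quantity may need to be inverted.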
{ 91 | if (argc < 4) { 92 | printf("Usage: ./fastLDA inputfile num_iterations num_topics\n"); 93 | return 1; 94 | } 95 | 96 | // Initlialize expected topic counts per document 97 | double **nTheta; 98 | // Dynamically 99 | double **nPi; 100 | double *N_z; 101 | // Initialize estimates from each minibatch 102 | // Initialize step sizes 103 | double rhoTheta = 0; 104 | double rhoPhi = 0; 105 | double ***Pi; 106 | double **theta; 107 | double **perplexities; 108 | // Initlalize dirichlet prior parameters 109 | double alpha, eta; 110 | double M; // Number of documents in each minibatch 111 | int Cj = 0; 112 | unsigned int i, j, k, w, MAXITER; 113 | int batch_idx = 0; 114 | int C = 0; 115 | int iter = 0; 116 | int NNZ; 117 | 118 | int windowSize = 0; 119 | double similarityThreshold = 0; 120 | 121 | ofstream pfile; 122 | pfile.open("perplexity.txt"); 123 | 124 | M = 100; //343 works for KOS and only for KOS 125 | eta = 0.01; // was 0.01 126 | alpha = 0.1; 127 | 128 | ifstream seqfile; 129 | seqfile.open("Data/seqfile.txt"); 130 | string newline = ""; 131 | vector* months = new vector(); 132 | vector* numOfDocs = new vector(); 133 | vector* monthFirstIdx = new vector(); 134 | vector* monthLastIdx = new vector(); 135 | int curIdx = 0; 136 | 137 | while (seqfile >> newline) { 138 | const char * ptr = strchr(newline.c_str(), ':'); 139 | int count = atoi(ptr + 1); 140 | ptr = "\0"; 141 | int yearMonth = atoi(newline.c_str()); 142 | months->push_back(yearMonth); 143 | numOfDocs->push_back(count); 144 | monthFirstIdx->push_back(curIdx); 145 | monthLastIdx->push_back(curIdx+count); 146 | curIdx += count; 147 | } 148 | seqfile.close(); 149 | 150 | //if user also specified a minibatch size 151 | if (argc > 4) { 152 | M = atof(argv[4]); 153 | windowSize = atoi(argv[6]); 154 | similarityThreshold = atof(argv[7]); 155 | } 156 | 157 | MAXITER = atoi(argv[2]); 158 | K = atoi(argv[3]); 159 | 160 | printf("Input file: %s\n", argv[1]); 161 | printf("Number of iterations: %d\n", MAXITER); 162 | printf("Number of topics: %d\n", K); 163 | printf("Minibatch size: %f\n", M); 164 | printf("alpha: %f\n", alpha); 165 | printf("eta: %f\n", eta); 166 | 167 | // Read the file and store it in DATA 168 | FILE* fptr; 169 | unsigned int docnum, wnum; 170 | unsigned char countnum; 171 | 172 | fptr = fopen(argv[1], "rt"); 173 | 174 | fscanf(fptr, "%d\n", &D); 175 | fscanf(fptr, "%d\n", &W); 176 | fscanf(fptr, "%d\n", &NNZ); 177 | 178 | printf("Number of documents: %d\n", D); 179 | printf("Vocabulary size: %d\n", W); 180 | 181 | // Dynamically allocate phi 182 | Pi = new double**[months->size()]; 183 | for (unsigned int m = 0; m < months->size(); ++m) { 184 | Pi[m] = new double*[W]; 185 | for (unsigned int word = 0; word < W; word++) { 186 | Pi[m][word] = new double[K]; 187 | } 188 | } 189 | //#pragma omp parallel for 190 | 191 | 192 | printf("allocated phi\n"); 193 | 194 | // Dynamically allocate theta 195 | 196 | theta = new double*[D]; 197 | //#pragma omp parallel for 198 | for (i = 0; i < D; i++) { 199 | theta[i] = new double[K]; 200 | } 201 | 202 | printf("allocated theta\n"); 203 | 204 | vector > corpus; 205 | vector corpus_size(D, 0); 206 | corpus.resize(D); 207 | vector > corpus_expanded; 208 | corpus_expanded.resize(D); 209 | 210 | while (!feof(fptr)) { 211 | fscanf(fptr, "%d %d %hhu\n", &docnum, &wnum, &countnum); 212 | 213 | corpus[docnum - 1].push_back(wnum - 1); 214 | corpus[docnum - 1].push_back(countnum); 215 | 216 | corpus_size[docnum - 1] += countnum; 217 | 218 | for (i = 0; i < countnum; i++) { 219 | 
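// corpus_expanded[j] lists each token of document j individually (the word id
// repeated count times). The SCVB0 programs use it when computing perplexity;
// in this file it is built but does not appear to be used afterwards.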
corpus_expanded[docnum - 1].push_back(wnum - 1); 220 | } 221 | } 222 | fclose(fptr); 223 | 224 | 225 | // Initialize phi_est and all other arrays 226 | nPi = new double*[W]; 227 | 228 | for (i = 0; i < W; i++) { 229 | nPi[i] = new double[K]; 230 | } 231 | 232 | // Initialize n_z and n_z_est and other arrays 233 | N_z = new double[K]; 234 | for (k = 0; k < K; k++) { 235 | N_z[k] = 0; 236 | } 237 | 238 | nTheta = new double*[D]; 239 | for (i = 0; i < D; i++) { 240 | nTheta[i] = new double[K]; 241 | } 242 | 243 | for (i = 0; i < D; i++) { 244 | for (k = 0; k < K; k++) { 245 | nTheta[i][k] = rand() % 10; 246 | } 247 | } 248 | 249 | perplexities = new double*[months->size()]; 250 | for (i = 0; i < months->size(); i++) { 251 | perplexities[i] = new double[K]; 252 | for (unsigned int a = 0; a < K; ++a) { 253 | perplexities[i][a] = 0; 254 | } 255 | } 256 | 257 | int*** topwords; 258 | topwords = new int**[months->size()]; 259 | 260 | //Generate Numbers according to Gaussian Distribution 261 | 262 | for (int timeSlice = 0; timeSlice < 10; timeSlice++) { 263 | cout << (*months)[timeSlice] << " " << (*numOfDocs)[timeSlice] << endl; 264 | 265 | //if parallelizing this, make sure to avoid race condition (most likely use reduction) 266 | for (k = 0; k < K; k++) { 267 | for (w = 0; w < W; w++) { 268 | N_z[k] += nPi[w][k]; 269 | } 270 | } 271 | 272 | // Find the total number of word in the document 273 | int monthFirstDoc = monthFirstIdx->at(timeSlice); 274 | int monthLastDoc = monthLastIdx->at(timeSlice); 275 | 276 | int monthD = monthLastDoc - monthFirstDoc; 277 | 278 | C = 0; 279 | 280 | for (j = monthFirstDoc; j < monthLastDoc; j++) { 281 | C += corpus_size[j]; 282 | } 283 | 284 | printf("Number of words in corpus: %d\n", C); 285 | 286 | int firstdoc = 0; 287 | int lastdoc = 0; 288 | int DM = monthD / M; 289 | 290 | for (iter = 0; iter < (int)MAXITER; iter++) { 291 | // Decide rho_phi and rho_theta 292 | rhoPhi = 10 / pow((1000 + iter), 0.9); 293 | rhoTheta = 1 / pow((10 + iter), 0.9); 294 | 295 | #pragma omp parallel private(batch_idx,j,k,i,w,firstdoc,lastdoc) 296 | { 297 | double *gamma = new double[K]; 298 | double *nzHat = new double[K]; 299 | double **nPhiHat = new double *[W]; 300 | for (k = 0; k < K; k++) { 301 | gamma[k] = 0; 302 | nzHat[k] = 0; 303 | } 304 | for (i = 0; i < W; i++) { 305 | nPhiHat[i] = new double[K]; 306 | for (k = 0; k < K; k++) { 307 | nPhiHat[i][k] = 0; 308 | } 309 | } 310 | 311 | #pragma omp for 312 | for (batch_idx = 0; batch_idx < DM+1; batch_idx++) { 313 | 314 | // Decide the document indices which go in each minibatch 315 | firstdoc = monthFirstDoc + (batch_idx * M); 316 | lastdoc = monthFirstDoc + ((batch_idx + 1) * M); 317 | 318 | if (batch_idx == DM) { 319 | lastdoc = monthLastDoc; 320 | } 321 | for (j = (unsigned)firstdoc; j < (unsigned)lastdoc; j++) { 322 | 323 | // First perform the burn-in passes 324 | // Iteration of burn in passes 325 | 326 | // Store size of corpus in Cj 327 | Cj = corpus_size[j]; 328 | 329 | for (i = 0; i < (corpus[j].size() / 2); i++) {// indexing is very different here! 
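// Burn-in pass: only the per-document counts nTheta are refreshed here, via
// the clumped update raised to the word's count m_aj; the topic-word
// statistics nPhiHat and nzHat are accumulated in the main pass that follows.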
330 | 331 | int w_aj = corpus[j][2 * i]; 332 | int m_aj = corpus[j][(2 * i) + 1]; 333 | // Update gamma_ij and N_theta 334 | double normSum = 0; 335 | 336 | for (k = 0; k < K; k++) { 337 | gamma[k] = (nPi[w_aj][k] + eta) * (nTheta[j][k] + alpha) / (N_z[k] + (eta * W)); 338 | normSum += gamma[k]; 339 | } 340 | 341 | for (k = 0; k < K; k++) { 342 | gamma[k] = gamma[k] / normSum; 343 | } 344 | 345 | for (k = 0; k < K; k++) { 346 | 347 | nTheta[j][k] = (pow((1 - rhoTheta), m_aj) * nTheta[j][k]) 348 | + ((1 - pow((1 - rhoTheta), m_aj)) * Cj * gamma[k]); 349 | } 350 | 351 | } 352 | 353 | // Iteration of the main loop 354 | for (i = 0; i < (corpus[j].size() / 2); i++) { // indexing is very different here! 355 | 356 | int w_aj = corpus[j][2 * i]; 357 | int m_aj = corpus[j][(2 * i) + 1]; 358 | double normSum = 0; 359 | for (k = 0; k < K; k++) { 360 | gamma[k] = (nPi[w_aj][k] + eta) * (nTheta[j][k] + alpha) / (N_z[k] + (eta * W)); 361 | normSum += gamma[k]; 362 | } 363 | 364 | for (k = 0; k < K; k++) { 365 | gamma[k] = gamma[k] / normSum; 366 | } 367 | 368 | // Update N_theta estimates 369 | for (k = 0; k < K; k++) { 370 | nTheta[j][k] = (pow((1 - rhoTheta), m_aj) * nTheta[j][k]) 371 | + ((1 - pow((1 - rhoTheta), m_aj)) * Cj * gamma[k]); 372 | 373 | nPhiHat[w_aj][k] = nPhiHat[w_aj][k] + (C * gamma[k] / M); 374 | 375 | nzHat[k] = nzHat[k] + (C * gamma[k] / M); 376 | } 377 | } 378 | 379 | } // End of j 380 | 381 | // Update the estimates matrix 382 | for (k = 0; k < K; k++) { 383 | for (w = 0; w < W; w++) { 384 | nPi[w][k] = (1 - rhoPhi) * nPi[w][k] + rhoPhi * nPhiHat[w][k]; 385 | } 386 | #pragma omp atomic 387 | N_z[k] *= (1 - rhoPhi); 388 | #pragma omp atomic 389 | N_z[k] += rhoPhi * nzHat[k]; 390 | } 391 | 392 | } // End of batch_idx 393 | 394 | // Compute phi 395 | #pragma omp for 396 | for (k = 0; k < K; k++) { 397 | double normSum = 0; 398 | for (w = 0; w < W; w++) { 399 | nPi[w][k] += eta; 400 | normSum += nPi[w][k]; 401 | } 402 | // cout << normSum << endl; 403 | for (w = 0; w < W; w++) { 404 | Pi[timeSlice][w][k] = (double) nPi[w][k] / normSum; 405 | } 406 | } 407 | 408 | // Compute theta 409 | #pragma omp for 410 | for (i = monthFirstDoc; i < monthLastDoc; i++) { 411 | double normSum = 0; 412 | for (k = 0; k < K; k++) { 413 | nTheta[i][k] += alpha; 414 | normSum += nTheta[i][k]; 415 | } 416 | for (k = 0; k < K; k++) { 417 | theta[i][k] = (double) nTheta[i][k] / normSum; 418 | } 419 | } 420 | 421 | delete[] gamma; 422 | delete[] nzHat; 423 | 424 | for (i = 0; i < W; i++) { 425 | delete[] nPhiHat[i]; 426 | } 427 | 428 | delete[] nPhiHat; 429 | 430 | } 431 | 432 | } // End of iter 433 | 434 | //write doctopics file 435 | /*ofstream dtfile; 436 | dtfile.open("output/doctopic_" + to_string(months->at(timeSlice)) + ".txt"); 437 | for (i = monthFirstDoc; i < monthLastDoc; i++) { 438 | for (k = 0; k < K; k++) { 439 | dtfile << theta[i][k] << ","; 440 | } 441 | dtfile << endl; 442 | } 443 | dtfile.close();*/ 444 | 445 | //compute the top 100 words for each topic 446 | 447 | // double** maxval; 448 | // topwords[timeSlice] = new int*[K]; 449 | // maxval = new double*[K]; 450 | // for (k = 0; k < K; k++) { 451 | // topwords[timeSlice][k] = new int[100]; 452 | // maxval[k] = new double[100]; 453 | // } 454 | // for (k = 0; k < K; k++) { 455 | // double oldMax = std::numeric_limits::max(); 456 | // for (i = 0; i < 100; i++) { 457 | // double max = -1; 458 | // int max_idx = -1; 459 | // for (w = 0; w < W; w++) { 460 | // if (oldMax > Pi[timeSlice][w][k] && Pi[timeSlice][w][k] > max) { 461 | // max = 
Pi[timeSlice][w][k]; 462 | // max_idx = w; 463 | // } 464 | // } 465 | // oldMax = Pi[timeSlice][max_idx][k]; 466 | // topwords[timeSlice][k][i] = max_idx; 467 | // maxval[k][i] = max; 468 | // } 469 | // } 470 | }//All timeSlices finished 471 | 472 | // MAKE CHAINS 473 | Graph G; 474 | //K is unsigned -- is this a problem? 475 | // generateAllLinks(G, Pi, months->size(), K, windowSize, similarityThreshold); 476 | generateAllLinks(G, Pi, 10, K, windowSize, similarityThreshold); 477 | 478 | vector component(num_vertices(G)); 479 | int num = connected_components(G, &component[0]); 480 | 481 | vector::size_type p; 482 | cout << "Total number of components: " << num << endl; 483 | for (p = 0; p != component.size(); ++p) 484 | cout << "Vertex " << p <<" is in component " << component[p] << endl; 485 | 486 | // string *dict; 487 | // dict = new string[W]; 488 | //// char word; 489 | // //retrieve the words from the file 490 | // w = 0; 491 | // string line; 492 | // ifstream vocabFile(argv[5]); 493 | // if (vocabFile.is_open()) { 494 | // while (getline(vocabFile, line)) { 495 | // dict[w] = line; 496 | // w++; 497 | // } 498 | // vocabFile.close(); 499 | // } 500 | 501 | // write topics file 502 | // for (int timeSlice = 0; timeSlice < (int) months->size(); timeSlice++) { 503 | // for (int timeSlice = 0; timeSlice < 10; timeSlice++) { 504 | // ofstream tfile; 505 | // tfile.open("output/topics_" + to_string(months->at(timeSlice)) + ".txt"); 506 | // for (k = 0; k < K; k++) { 507 | // for (w = 0; w < 100; w++) { 508 | // tfile << topwords[timeSlice][k][w];// << ":" << maxval[k][w] << ","; 509 | // 510 | // } 511 | // tfile << endl; 512 | // 513 | // for (w = 0; w < 100; w++) { 514 | // tfile << dict[topwords[timeSlice][k][w]] << ","; 515 | // } 516 | // 517 | // tfile << endl; 518 | // } 519 | // tfile.close(); 520 | // } 521 | 522 | return (0); 523 | 524 | } // End of main 525 | -------------------------------------------------------------------------------- /SCVB0/scvb0.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | using namespace std; 14 | 15 | // Initialize number of documents, topics and words in vocabulary 16 | unsigned int W, D, K; 17 | 18 | void Transform(double** beta_t, double** npi) { 19 | double* Beta_Total = new double[K]; 20 | for (unsigned int var = 0; var < K; ++var) { 21 | Beta_Total[var] = 0; 22 | } 23 | for (unsigned int q = 0; q < K; ++q) { 24 | for (unsigned int p = 0; p < W; ++p) { 25 | Beta_Total[q] += exp(beta_t[p][q]); 26 | } 27 | cout << "BetaTol: " << Beta_Total[q] << endl; 28 | } 29 | for (unsigned int p = 0; p < W; ++p) { 30 | for (unsigned int q = 0; q < K; ++q) { 31 | npi[p][q] = exp(beta_t[p][q]) / Beta_Total[q]; 32 | } 33 | } 34 | delete [] Beta_Total; 35 | } 36 | 37 | void InverseTransform(double** pi, double** beta_t) { 38 | double* Pi_Total = new double[K]; 39 | for (unsigned int var = 0; var < K; ++var) { 40 | Pi_Total[var] = 0; 41 | } 42 | for (unsigned int q = 0; q < K; ++q) { 43 | for (unsigned int p = 0; p < W; ++p) { 44 | Pi_Total[q] += pi[p][q]; 45 | } 46 | } 47 | for (unsigned int p = 0; p < W; ++p) { 48 | for (unsigned int q = 0; q < K; ++q) { 49 | beta_t[p][q] = log(pi[p][q] / Pi_Total[q]); 50 | } 51 | } 52 | delete [] Pi_Total; 53 | } 54 | 55 | void runRegularSCVB(double** nPi,vector > &corpus, vector &corpus_size, int MAXITER, double M){ 56 | 57 | 
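// Plain (non-dynamic) SCVB0 over the whole corpus. main() calls this once,
// with MAXITER = 5, before the first time slice to warm-start the topic-word
// counts nPi and reduce the bias of the purely random initialization.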
double **nTheta; 58 | double *N_z; 59 | 60 | double rhoTheta = 0; 61 | double rhoPhi = 0; 62 | double **Pi; 63 | // Initlalize dirichlet prior parameters 64 | double alpha, eta; 65 | //double M; // Number of documents in each minibatch 66 | int Cj = 0; 67 | unsigned int i, j, k, w; 68 | int batch_idx = 0; 69 | int C = 0; 70 | int iter = 0; 71 | ofstream pfile; 72 | pfile.open("perplexity.txt"); 73 | 74 | //M = 100; //343 works for KOS and only for KOS 75 | eta = 0.01; // was 0.01 76 | alpha = 0.1; 77 | 78 | // Dynamically allocate phi 79 | Pi = new double*[W]; 80 | //#pragma omp parallel for 81 | for (w = 0; w < W; w++) { 82 | Pi[w] = new double[K]; 83 | } 84 | 85 | // Initialize phi_est and all other arrays 86 | N_z = new double[K]; 87 | 88 | for (unsigned int var = 0; var < W; ++var) { 89 | for (unsigned int var2 = 0; var2 < K; ++var2) { 90 | nPi[var][var2] = rand() % 10; 91 | } 92 | } 93 | nTheta = new double*[D]; 94 | for (i = 0; i < D; i++) { 95 | nTheta[i] = new double[K]; 96 | } 97 | 98 | for (i = 0; i < D; i++) { 99 | for (k = 0; k < K; k++) { 100 | nTheta[i][k] = rand() % 10; 101 | } 102 | } 103 | 104 | for (k = 0; k < K; k++) { 105 | N_z[k] = 0; 106 | for (w = 0; w < W; w++) { 107 | N_z[k] += nPi[w][k]; 108 | } 109 | } 110 | 111 | 112 | // Find the total number of word in the document 113 | C = 0; 114 | for(j=0;j* months = new vector(); 295 | vector* numOfDocs = new vector(); 296 | vector* monthFirstIdx = new vector(); 297 | vector* monthLastIdx = new vector(); 298 | int curIdx = 0; 299 | 300 | while (seqfile >> newline) { 301 | const char * ptr = strchr(newline.c_str(), ':'); 302 | int count = atoi(ptr + 1); 303 | ptr = "\0"; 304 | int yearMonth = atoi(newline.c_str()); 305 | months->push_back(yearMonth); 306 | numOfDocs->push_back(count); 307 | monthFirstIdx->push_back(curIdx); 308 | monthLastIdx->push_back(curIdx + count); 309 | curIdx += count; 310 | } 311 | seqfile.close(); 312 | 313 | //if user also specified a minibatch size 314 | //if (argc == 5 || argc == 6) { 315 | M = atof(argv[4]); 316 | //} 317 | 318 | MAXITER = atoi(argv[2]); 319 | K = atoi(argv[3]); 320 | 321 | printf("Input file: %s\n", argv[1]); 322 | printf("Number of iterations: %d\n", MAXITER); 323 | printf("Number of topics: %d\n", K); 324 | printf("Minibatch size: %f\n", M); 325 | printf("alpha: %f\n", alpha); 326 | printf("eta: %f\n", eta); 327 | 328 | // Read the file and store it in DATA 329 | FILE* fptr; 330 | unsigned int docnum, wnum; 331 | unsigned char countnum; 332 | 333 | fptr = fopen(argv[1], "rt"); 334 | 335 | fscanf(fptr, "%d\n", &D); 336 | fscanf(fptr, "%d\n", &W); 337 | fscanf(fptr, "%d\n", &NNZ); 338 | 339 | printf("Number of documents: %d\n", D); 340 | printf("Vocabulary size: %d\n", W); 341 | 342 | // Dynamically allocate phi 343 | Pi = new double*[W]; 344 | //#pragma omp parallel for 345 | for (w = 0; w < W; w++) { 346 | Pi[w] = new double[K]; 347 | } 348 | 349 | printf("allocated phi\n"); 350 | 351 | // Dynamically allocate theta 352 | 353 | theta = new double*[D]; 354 | //#pragma omp parallel for 355 | for (i = 0; i < D; i++) { 356 | theta[i] = new double[K]; 357 | } 358 | 359 | printf("allocated theta\n"); 360 | 361 | vector > corpus; 362 | vector corpus_size(D, 0); 363 | corpus.resize(D); 364 | vector > corpus_expanded; 365 | corpus_expanded.resize(D); 366 | 367 | while (!feof(fptr)) { 368 | fscanf(fptr, "%d %d %hhu\n", &docnum, &wnum, &countnum); 369 | 370 | corpus[docnum - 1].push_back(wnum - 1); 371 | corpus[docnum - 1].push_back(countnum); 372 | 373 | corpus_size[docnum - 1] += 
countnum; 374 | 375 | for (i = 0; i < countnum; i++) { 376 | corpus_expanded[docnum - 1].push_back(wnum - 1); 377 | } 378 | } 379 | fclose(fptr); 380 | 381 | 382 | // Initialize phi_est and all other arrays 383 | nPi = new double*[W]; 384 | 385 | for (i = 0; i < W; i++) { 386 | nPi[i] = new double[K]; 387 | } 388 | 389 | // Initialize n_z and n_z_est and other arrays 390 | N_z = new double[K]; 391 | for (k = 0; k < K; k++) { 392 | N_z[k] = 0; 393 | } 394 | 395 | nTheta = new double*[D]; 396 | for (i = 0; i < D; i++) { 397 | nTheta[i] = new double[K]; 398 | } 399 | 400 | for (i = 0; i < D; i++) { 401 | for (k = 0; k < K; k++) { 402 | nTheta[i][k] = rand() % 10; 403 | } 404 | } 405 | 406 | perplexities = new double*[months->size()]; 407 | for (i = 0; i < months->size(); i++) { 408 | perplexities[i] = new double[MAXITER]; 409 | for (unsigned int a = 0; a < MAXITER; ++a) { 410 | perplexities[i][a] = 0; 411 | } 412 | } 413 | 414 | int*** topwords; 415 | double** maxval; 416 | topwords = new int**[months->size()]; 417 | 418 | //Generate Numbers according to Gaussian Distribution 419 | std::default_random_engine generator; 420 | double **Beta_t_1 = new double*[W]; 421 | double **Beta_t = new double*[W]; 422 | for (i = 0; i < W; i++) { 423 | Beta_t_1[i] = new double[K]; 424 | Beta_t[i] = new double[K]; 425 | } 426 | 427 | for (int timeSlice = 0; timeSlice < (int) months->size(); timeSlice++) { 428 | cout << (*months)[timeSlice] << " " << (*numOfDocs)[timeSlice] << endl; 429 | 430 | //We are initializing nPi from 5 runs of regular SCVB 431 | if (timeSlice == 0) { 432 | //Run SCVB to initialize betas and thus reduce initial bias 433 | runRegularSCVB(nPi, corpus, corpus_size, 5, M); 434 | } else { 435 | for (unsigned int word = 0; word < W; ++word) { 436 | for (unsigned int topic = 0; topic < K; ++topic) { 437 | normal_distribution distribution(Beta_t_1[word][topic], 0.005); 438 | Beta_t[word][topic] = distribution(generator); 439 | } 440 | } 441 | Transform(Beta_t, nPi); 442 | } 443 | 444 | //if parallelizing this, make sure to avoid race condition (most likely use reduction) 445 | for (k = 0; k < K; k++) { 446 | N_z[k] = 0; 447 | for (w = 0; w < W; w++) { 448 | N_z[k] += nPi[w][k]; 449 | } 450 | } 451 | 452 | 453 | // Find the total number of word in the document 454 | int monthFirstDoc = monthFirstIdx->at(timeSlice); 455 | int monthLastDoc = monthLastIdx->at(timeSlice); 456 | 457 | int monthD = monthLastDoc - monthFirstDoc; 458 | 459 | C = 0; 460 | 461 | for (j = monthFirstDoc; j < (unsigned)monthLastDoc; j++) { 462 | C += corpus_size[j]; 463 | } 464 | 465 | printf("Number of words in corpus: %d\n", C); 466 | 467 | int firstdoc = 0; 468 | int lastdoc = 0; 469 | int DM = monthD / M; 470 | 471 | for (iter = 0; iter < (int)MAXITER; iter++) { 472 | // Decide rho_phi and rho_theta 473 | rhoPhi = 10 / pow((1000 + iter), 0.9); 474 | rhoTheta = 1 / pow((10 + iter), 0.9); 475 | 476 | #pragma omp parallel private(batch_idx,j,k,i,w,firstdoc,lastdoc) 477 | { 478 | double *gamma = new double[K]; 479 | double *nzHat = new double[K]; 480 | double **nPhiHat = new double *[W]; 481 | for (k = 0; k < K; k++) { 482 | gamma[k] = 0; 483 | nzHat[k] = 0; 484 | } 485 | for (i = 0; i < W; i++) { 486 | nPhiHat[i] = new double[K]; 487 | for (k = 0; k < K; k++) { 488 | nPhiHat[i][k] = 0; 489 | } 490 | } 491 | 492 | #pragma omp for 493 | for (batch_idx = 0; batch_idx < DM+1; batch_idx++) { 494 | 495 | // Decide the document indices which go in each minibatch 496 | firstdoc = monthFirstDoc + (batch_idx * M); 497 | lastdoc = 
monthFirstDoc + ((batch_idx + 1) * M); 498 | 499 | if (batch_idx == DM) { 500 | lastdoc = monthLastDoc; 501 | } 502 | for (j = (unsigned)firstdoc; j < (unsigned)lastdoc; j++) { 503 | 504 | // First perform the burn-in passes 505 | // Iteration of burn in passes 506 | 507 | // Store size of corpus in Cj 508 | Cj = corpus_size[j]; 509 | 510 | for (i = 0; i < (corpus[j].size() / 2); i++) {// indexing is very different here! 511 | 512 | int w_aj = corpus[j][2 * i]; 513 | int m_aj = corpus[j][(2 * i) + 1]; 514 | // Update gamma_ij and N_theta 515 | double normSum = 0; 516 | 517 | for (k = 0; k < K; k++) { 518 | gamma[k] = (nPi[w_aj][k] + eta) * (nTheta[j][k] + alpha) / (N_z[k] + (eta * W)); 519 | normSum += gamma[k]; 520 | } 521 | 522 | for (k = 0; k < K; k++) { 523 | gamma[k] = gamma[k] / normSum; 524 | } 525 | 526 | for (k = 0; k < K; k++) { 527 | 528 | nTheta[j][k] = (pow((1 - rhoTheta), m_aj) * nTheta[j][k]) 529 | + ((1 - pow((1 - rhoTheta), m_aj)) * Cj * gamma[k]); 530 | } 531 | 532 | } 533 | 534 | // Iteration of the main loop 535 | for (i = 0; i < (corpus[j].size() / 2); i++) { // indexing is very different here! 536 | 537 | int w_aj = corpus[j][2 * i]; 538 | int m_aj = corpus[j][(2 * i) + 1]; 539 | double normSum = 0; 540 | for (k = 0; k < K; k++) { 541 | gamma[k] = (nPi[w_aj][k] + eta) * (nTheta[j][k] + alpha) / (N_z[k] + (eta * W)); 542 | normSum += gamma[k]; 543 | } 544 | 545 | for (k = 0; k < K; k++) { 546 | gamma[k] = gamma[k] / normSum; 547 | } 548 | 549 | // Update N_theta estimates 550 | for (k = 0; k < K; k++) { 551 | nTheta[j][k] = (pow((1 - rhoTheta), m_aj) * nTheta[j][k]) 552 | + ((1 - pow((1 - rhoTheta), m_aj)) * Cj * gamma[k]); 553 | 554 | nPhiHat[w_aj][k] = nPhiHat[w_aj][k] + (C * gamma[k] / M); 555 | 556 | nzHat[k] = nzHat[k] + (C * gamma[k] / M); 557 | } 558 | } 559 | 560 | } // End of j 561 | 562 | // Update the estimates matrix 563 | for (k = 0; k < K; k++) { 564 | for (w = 0; w < W; w++) { 565 | nPi[w][k] = (1 - rhoPhi) * nPi[w][k] + rhoPhi * nPhiHat[w][k]; 566 | } 567 | #pragma omp atomic 568 | N_z[k] *= (1 - rhoPhi); 569 | #pragma omp atomic 570 | N_z[k] += rhoPhi * nzHat[k]; 571 | } 572 | 573 | } // End of batch_idx 574 | 575 | // Compute phi 576 | #pragma omp for 577 | for (k = 0; k < K; k++) { 578 | double normSum = 0; 579 | for (w = 0; w < W; w++) { 580 | nPi[w][k] += eta; 581 | normSum += nPi[w][k]; 582 | } 583 | for (w = 0; w < W; w++) { 584 | Pi[w][k] = (double) nPi[w][k] / normSum; 585 | } 586 | } 587 | 588 | // Compute theta 589 | #pragma omp for 590 | for (i = monthFirstDoc; i < (unsigned)monthLastDoc; i++) { 591 | double normSum = 0; 592 | for (k = 0; k < K; k++) { 593 | nTheta[i][k] += alpha; 594 | normSum += nTheta[i][k]; 595 | } 596 | for (k = 0; k < K; k++) { 597 | theta[i][k] = (double) nTheta[i][k] / normSum; 598 | } 599 | } 600 | 601 | delete[] gamma; 602 | delete[] nzHat; 603 | 604 | for (i = 0; i < W; i++) { 605 | delete[] nPhiHat[i]; 606 | } 607 | 608 | delete[] nPhiHat; 609 | 610 | } 611 | 612 | // Calculate the perplexity here 613 | // Compute posterior means here 614 | // Iterate over the corpus here 615 | perplexityval = 0; 616 | #pragma omp parallel for private(j,i,k) reduction(+:innerval) reduction(+:perplexityval) 617 | for (j = monthFirstDoc; j < (unsigned)monthLastDoc; j++) { 618 | for (i = 0; i < corpus_expanded[j].size(); i++) { 619 | innerval = 0; 620 | for (k = 0; k < K; k++) { 621 | innerval += (theta[j][k] * Pi[corpus_expanded[j][i]][k]); 622 | } 623 | perplexityval += (log(innerval) / log(2)); 624 | } 625 | } 626 | 
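// Per-slice perplexity over this month's C tokens:
//   perplexity = 2^( -(1/C) * sum over tokens of log2( sum_k theta[j][k] * Pi[w][k] ) )
// which is what pow(2, -perplexityval / C) evaluates below; lower is better.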
printf("%d,%f\n", iter, pow(2, -perplexityval / C)); 627 | perplexities[timeSlice][iter] = pow(2, -perplexityval / C); 628 | 629 | pfile << iter + 1 << "," << perplexities[timeSlice][iter] << endl; 630 | pfile.flush(); 631 | 632 | } // End of iter 633 | 634 | if (argc == 7) { 635 | ofstream pifile; 636 | pifile.open("Pi/topics_" + to_string(months->at(timeSlice)) + ".txt"); 637 | for (k = 0; k < K; k++) { 638 | for (w = 0; w < W; w++) { 639 | if(w == 23701){ 640 | cout<::max(); 659 | for (i = 0; i < 100; i++) { 660 | double max = -1; 661 | int max_idx = -1; 662 | for (w = 0; w < W; w++) { 663 | if ((oldMax > Pi[w][k]) && (Pi[w][k] > max)) { 664 | max = Pi[w][k]; 665 | max_idx = w; 666 | } 667 | } 668 | oldMax = Pi[max_idx][k]; 669 | topwords[timeSlice][k][i] = max_idx; 670 | maxval[k][i] = max; 671 | } 672 | } 673 | 674 | InverseTransform(Pi, Beta_t); 675 | for (unsigned int word = 0; word < W; ++word) { 676 | for (unsigned int topic = 0; topic < K; ++topic) { 677 | Beta_t_1[word][topic] = Beta_t[word][topic]; 678 | } 679 | } 680 | } // End of TimeSlice Loop 681 | string *dict; 682 | dict = new string[W]; 683 | // char word; 684 | //retrieve the words from the file 685 | w = 0; 686 | string line; 687 | ifstream vocabFile(argv[5]); 688 | if (vocabFile.is_open()) { 689 | while (getline(vocabFile, line)) { 690 | dict[w] = line; 691 | w++; 692 | } 693 | vocabFile.close(); 694 | } 695 | 696 | // write topics file 697 | for (int timeSlice = 0; timeSlice < (int) months->size(); timeSlice++) { 698 | ofstream tfile; 699 | tfile.open("output/topics_" + to_string(months->at(timeSlice)) + ".txt"); 700 | for (k = 0; k < K; k++) { 701 | for (w = 0; w < 100; w++) { 702 | tfile << topwords[timeSlice][k][w] << ":" << maxval[k][w] << ","; 703 | } 704 | tfile << endl; 705 | 706 | for (w = 0; w < 100; w++) { 707 | tfile << dict[topwords[timeSlice][k][w]] << ","; 708 | } 709 | tfile << endl; 710 | } 711 | tfile.close(); 712 | } 713 | 714 | ofstream topTenfile; 715 | topTenfile.open("TopTen.txt"); 716 | for (k = 0; k < K; k++) { 717 | for (int timeSlice = 0; timeSlice < (int) months->size(); timeSlice++) { 718 | for (w = 0; w < 10; w++) { 719 | topTenfile << topwords[timeSlice][k][w] << ","; 720 | } 721 | } 722 | topTenfile << endl; 723 | } 724 | topTenfile.close(); 725 | 726 | return (0); 727 | 728 | } // End of main 729 | --------------------------------------------------------------------------------