├── .gitignore
├── Scrapper
│   ├── article.py
│   ├── ldac2uci.py
│   ├── removeInfrequentWords.py
│   ├── getarticle.py
│   ├── __init__.py
│   ├── text2ldac.py
│   └── multitext2ldac.py
├── Evaluation
│   ├── getFinalPerplexities.py
│   ├── WordTrends.py
│   ├── getWordVariation.py
│   └── lookupWords.py
├── makefile
├── README.md
├── TopicChains
│   ├── GenerateChains.cpp
│   ├── GetData.cpp
│   └── TopicChains.cpp
├── SCVB0_Evaluation
│   └── scvb.cpp
└── SCVB0
    └── scvb0.cpp
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled source #
2 | ###################
3 | *.o
4 | *.pyc
5 | #Data
6 | *.txt
7 | /Data
8 | #eclipse files
9 | .settings/
10 | *~
11 | .cproject
12 | /Debug
13 |
--------------------------------------------------------------------------------
/Scrapper/article.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Mar 29, 2014
3 |
4 | @author: vspathak
5 | '''
6 |
7 | class article(object):
8 | def __init__(self, title, date, text, url, ID):
9 | self.Title = title
10 | self.Date = date
11 | self.Text = text
12 | self.URL = url
13 | self.id = ID
14 |
15 |
--------------------------------------------------------------------------------
/Scrapper/ldac2uci.py:
--------------------------------------------------------------------------------
1 | infile = open('jan.dat')
2 | outfile = open('JanUCI.txt', 'w')
3 |
4 | doclines = infile.readlines()
5 | infile.close()
6 |
7 | for i in range(len(doclines)):
8 | line = doclines[i].strip().split(' ')[1:]
9 | for elt in line:
10 | pieces = elt.split(':')
11 | outfile.write(str(i + 1) + ' ' + str(int(pieces[0]) + 1) + ' ' + pieces[1] + '\n')
12 |
13 | outfile.close()
14 |
--------------------------------------------------------------------------------
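Note on the conversion performed by ldac2uci.py above: each line of an LDA-C .dat file such as jan.dat has the form "count wordId:freq wordId:freq ...", and the script rewrites it as UCI-style "docId wordId count" triples with 1-based document and word indices. For example, if the first line of jan.dat were "3 0:2 5:1 7:4" (an illustrative line, not taken from the actual data), the script would emit:

    1 1 2
    1 6 1
    1 8 4
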
/Evaluation/getFinalPerplexities.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | if len(sys.argv) >= 3:
5 | perpfilename = sys.argv[1]
6 | num_iter = int(sys.argv[2])
7 |
8 | perpfile = open(perpfilename)
9 |
10 | finalperps = [perp for perp in perpfile.readlines() if perp[1] != ',' and int(perp[:2]) == num_iter]
11 |
12 | # for perp in finalperps:
13 | # print perp.split(',')[1]
14 |
15 | #print len(finalperps)
16 | for i in range(len(finalperps)):
17 | if i % 3 == 0:
18 | print i/3, finalperps[i].split(',')[1]
19 |
--------------------------------------------------------------------------------
/Evaluation/WordTrends.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | # Collect the top-ten word IDs for every topic (comma-separated per line)
4 | # into a single set of words to track.
5 | topTenFile = open('/home/vspathak/git/DynamicLDA/TopTen.txt', 'r')
6 | topSet = set()
7 | for line in topTenFile:
8 |     topSet.update(w for w in line.strip().split(',') if w)
9 | topTenFile.close()
10 |
11 | wordEvolution = open('/home/vspathak/git/DynamicLDA/WordEvolution.txt', 'w')
12 |
13 | piDir = '/home/vspathak/git/DynamicLDA/Scrapper/Pi'
14 | files = os.listdir(piDir)
15 |
16 | # For every time slice (one Pi file per month), write out the probability of
17 | # each tracked word in each topic. Assumes TopTen.txt holds word IDs that
18 | # index into the comma-separated probabilities of a topic line.
19 | for fileName in sorted(files):
20 |     for topic in open(os.path.join(piDir, fileName), 'r'):
21 |         topic = topic.strip()
22 |         wordProb = topic.split(',')
23 |         for word in sorted(topSet):
24 |             wordEvolution.write(wordProb[int(word)] + ',')
25 |         wordEvolution.write('\n')
26 | wordEvolution.close()
27 |
--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
1 | all: SCVB0/scvb0.cpp
2 | g++ -g -std=c++0x -fopenmp SCVB0/scvb0.cpp -o fastLDA
3 |
4 | GenerateChains: TopicChains/GenerateChains.cpp
5 | g++ -g -std=c++0x -fopenmp TopicChains/GenerateChains.cpp -o GenerateChains
6 |
7 | GetData: TopicChains/GetData.cpp
8 | g++ -g -std=c++0x -fopenmp TopicChains/GetData.cpp -o GetData
9 |
10 | serial: SCVB0/scvb0.cpp
11 | g++ -g -std=c++0x SCVB0/scvb0.cpp -o fastLDA
12 |
13 | scvb: SCVB0_Evaluation/scvb.cpp
14 | g++ -g -std=c++0x -fopenmp SCVB0_Evaluation/scvb.cpp -o scvb
15 |
16 | clean:
17 | rm -f *.o fastLDA GetData GenerateChains
18 |
--------------------------------------------------------------------------------
/Scrapper/removeInfrequentWords.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
4 | vocabDict = {}
5 |
6 | files = os.listdir('/home/vspathak/git/DynamicLDA/Scrapper/output')
7 |
8 | for fileName in files:
9 | for line in open('output/'+fileName, 'r'):
10 | line = line.strip()
11 | words = line.split()
12 | for key in words:
13 | if key in vocabDict:
14 | vocabDict[key] += 1
15 | else:
16 | vocabDict[key] = 1
17 |
18 | stopwordsFile = open('stopwords', 'a')
19 | for term in vocabDict:
20 | if vocabDict[term] < 26:
21 | # print term
22 | stopwordsFile.write(term.encode('utf-8') + '\n')
23 | if vocabDict[term] > 100000:
24 | print term
25 | stopwordsFile.close()
26 |
--------------------------------------------------------------------------------
/Scrapper/getarticle.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from lxml import html
3 | import requests
4 | # http://www.reuters.com/article/2012/12/02/us-space-france-russia-idUSBRE8B101L20121202
5 | # http://www.reuters.com/article/2007/01/02/music-jazz-chicago-dc-idUSN2927338620070102
6 | # http://www.reuters.com/article/2014/03/28/us-microsoft-office-ipad-idUSBREA2Q1MV20140328
7 | page = requests.get('http://www.reuters.com/article/2014/01/02/walmart-china-idUSL3N0KC0LH20140102')
8 | tree = html.fromstring(page.text)
9 |
10 | # This will create a list of article URLs:
11 | # URL = tree.xpath('//div[@class="headlineMed"]/a/@href')
12 | # Title = tree.xpath('//div[@class="headlineMed"]/a/text()'
13 | Title = tree.xpath('//*[@id="content"]/div[4]/div/div[3]/div[1]/h1/text()')
14 |
15 | Location = tree.xpath('//*[@id="articleInfo"]/p[2]/span[1]/text()')
16 |
17 | Paragraphs = tree.xpath('//*[@id="articleText"]/p/text()')
18 |
19 | print 'Paragraphs: ', Paragraphs
20 | print 'Location: ' , Location
21 | print 'Title:' , Title
22 |
--------------------------------------------------------------------------------
/Evaluation/getWordVariation.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | # Print the probability of word_id in topic_id for every monthly Pi file
5 | # (topics_YYYYMM.txt) between startYear and endYear.
6 | if len(sys.argv) >= 6:
7 |     topic_id = int(sys.argv[1])
8 |     word_id = int(sys.argv[2])
9 |     filepath = sys.argv[3]
10 |     startYear = int(sys.argv[4])
11 |     endYear = int(sys.argv[5])
12 | else:
13 |     print 'usage: python getWordVariation.py TopicId WordId PiFolderPath StartYear EndYear'
14 |     sys.exit(0)
15 |
16 | for year in range(startYear, endYear + 1):
17 |     for month in range(1, 13):
18 |         fileName = filepath + '/topics_' + str(year) + str(month).zfill(2) + '.txt'
19 |         if os.path.isfile(fileName):
20 |             # print 'reading file: ' + fileName
21 |             monthFile = open(fileName, 'r')
22 |             lines = monthFile.readlines()
23 |             monthFile.close()
24 |             topic = lines[topic_id]
25 |             prob = float(topic.split(',')[word_id])
26 |             print prob
27 |
--------------------------------------------------------------------------------
/Evaluation/lookupWords.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 |
5 | def buildtriple(idprob,vocab):
6 | # if (len(idprob) == 0):
7 | # print 'working'
8 | word_Prob = idprob.split(':')
9 | word = vocab[int(word_Prob[0])]
10 | return (word, word_Prob[0], word_Prob[1])
11 |
12 | #input: directory of input files, directory for output files, vocab file
13 | if len(sys.argv) >= 4:
14 | infolder = sys.argv[1]
15 | outfolder = sys.argv[2]
16 | vocabfilename = sys.argv[3]
17 |
18 | vocabfile = open(vocabfilename, 'r')
19 | vocab = [word.strip() for word in vocabfile.readlines()]
20 | vocabfile.close()
21 |
22 | #iterate through files of infolder
23 | for filename in os.listdir(infolder):
24 | infile = open(infolder + '/' + filename)
25 | outfile = open(outfolder + '/' + filename, 'w')
26 | for topic in infile.readlines():
27 | temp = topic[:-2].split(',')
28 | word_Prob = temp[0].split(':')
29 | if not (word_Prob[0].isdigit()):
30 | continue
31 | outline = [buildtriple(idprob,vocab) for idprob in topic[:-2].split(',')]
32 | map(lambda x: outfile.write(str(x)), outline)
33 | outfile.write('\n')
34 | outfile.close()
35 | infile.close()
36 |
--------------------------------------------------------------------------------
/Scrapper/__init__.py:
--------------------------------------------------------------------------------
1 | from lxml import html
2 | import requests
3 | import article as ac
4 | import sys
5 | import random
6 | import nltk
7 | import re
8 | from stemming.porter2 import stem
9 |
10 |
11 | docId = 0
12 | if len(sys.argv) >= 4:
13 | theyear = int(sys.argv[1])
14 | firstmonth = int(sys.argv[2])
15 | num_months = int(sys.argv[3])
16 | seqfile = open('seq-' + str(theyear) + '-' + str(firstmonth) + '-' + str(num_months) + '.txt', 'w')
17 |
18 | else:
19 | print 'usage: python __init__.py year firstmonth num_months'
20 | sys.exit(0)
21 |
22 | for yr in range(theyear, theyear + 1):
23 | year = 'http://www.reuters.com/resources/archive/us/' + str(yr)
24 | for mnth in range(firstmonth, firstmonth + num_months):
25 | if(mnth < 10):
26 | month = '0' + str(mnth)
27 | else:
28 | month = str(mnth)
29 |
30 | monthdocs = 0
31 | for day in range(1, 32):
32 | if(day < 10):
33 | URL = year + month + '0' + str(day) + '.html'
34 | else:
35 | URL = year + month + str(day) + '.html'
36 |
37 | page = requests.get(URL)
38 | tree = html.fromstring(page.text)
39 | URLs = tree.xpath('//div[@class="headlineMed"]/a/@href')
40 | date = URL[-13:-5]
41 |
42 | f = open('output/' + str(date) + '.txt', 'w')
43 | # generate the random vector(python generate a sample without
44 | # replacement from a range of numbers)
45 |
46 | for num in random.sample(range(0, len(URLs)), int(len(URLs))):
47 | doc = ac.article('', date, '', URLs[num], -1)
48 | curpage = requests.get(doc.URL)
49 | curtree = html.fromstring(curpage.text)
50 | Title = curtree.xpath('//*[@id="content"]/div[4]/div/div[3]/div[1]/h1/text()')
51 | Paragraphs = curtree.xpath('//*[@id="articleText"]/p/text()')
52 | if len(Title) > 0:
53 | doc.Title = Title[0].replace('\"', '')
54 | Paragraphs.append(Title[0])
55 | doc.Text = " ".join(Paragraphs)
56 | doc.Text = doc.Text.replace('\n', ' ')
57 | doc.Text = doc.Text.replace('\"', '')
58 |
59 | if(len(doc.Text.split()) > 100):
60 | docId = docId + 1
61 | doc.id = docId
62 | print doc.id
63 | monthdocs = monthdocs + 1
64 |
65 | docText = re.sub('[^A-Za-z]+', ' ', doc.Text)
66 | docTitle = re.sub('[^A-Za-z]+', ' ', doc.Title)
67 | docText = docTitle + ' ' + docText
68 | docText = docText.lower()
69 | tokens = docText.split()
70 |
71 | docText = " ".join([stem(t) for t in tokens])
72 |
73 | f.write(docText.encode('utf-8') + '\n')
74 |
75 | f.close()
76 | seqfile.write(str(theyear) + '-' + str(mnth) + ':' + str(monthdocs) + '\n')
77 | seqfile.close()
78 |
--------------------------------------------------------------------------------
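Note on the sequence file written by __init__.py above: for each scraped month it appends one line of the form year-month:documentCount to seq-<year>-<firstmonth>-<num_months>.txt, e.g. 2012-1:534 (the count is made up for illustration). GenerateChains.cpp later reads a file of this shape from Data/seqfile.txt, splitting each line at the ':' to recover the month label and its document count.
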
/README.md:
--------------------------------------------------------------------------------
1 | DynamicLDA
2 | ==========
3 |
4 | Dynamic Topic Model of Reuters News Articles between 2007-2013
5 | --------------------------------------------------------------------------------------------
6 | We have implemented a fast version of the Dynamic Topic Model (DTM) proposed by David Blei and John Lafferty in 2006.
7 | This version takes advantage of recent advances in LDA inference: the LDA part of DTM is implemented using SCVB0, proposed by Foulds et al. in 2013, and the implementation is parallelized using OpenMP.
8 | In our evaluation, even the serial version gives a 36x speedup, and the parallel version gives a 53x speedup when run on a 2 GHz Core 2 Duo machine with 2 GB of RAM.
9 | (See the report for a detailed evaluation.)
10 |
11 | Reuters News Dataset Details
12 | ----------------------------
13 | Timestamped news articles published by Reuters between 2007 and 2013: a corpus of 161,989 documents with a vocabulary of 32,468 words after preprocessing. The following preprocessing steps were performed (scripts are available in the Scrapper folder):
14 |
15 | - From the Reuters data we removed all documents shorter than 100 words
16 | - We scraped a random 10% of the articles from each day, purely to keep the corpus size manageable. The assumption is that randomly selected data will not hurt the discovery of long-running, major topics.
17 | - We removed all punctuation marks and performed stemming using the Porter2 stemmer
18 | - We also removed words that occur fewer than 26 times or more than 100,000 times in the corpus
19 | Example run of text2ldac (converts a folder of .txt files to .dat, .vocab and .dmap files): python text2ldac.py data_folder --stopwords stopwords_file
20 |
21 | Topic Chains
22 | ------------
23 | We have investigated Topic Chains, a solution to the topic birth-death problem in dynamic LDA proposed by Kim et al. in 2013.
24 | - We use the same Reuters dataset and the Jensen-Shannon (JS) divergence to measure similarity between topics.
25 | - We evaluate performance at different similarity thresholds and window sizes and find results similar to those reported in the original paper
26 | - We identify some issues in the method and propose solutions (please refer to the report for more details)
27 |
28 | Execution Commands
29 | ------------------
30 | - Scrape data from the Reuters archive website, starting at firstmonth of the given year, for num_months months
31 | python __init__.py year firstmonth num_months
32 | - Get stopwords
33 | python removeInfrequentWords.py
34 | - Convert the text data to the ldac format used by Blei's implementation
35 | python multitext2ldac.py data_folder --stopwords stopwords_file
36 | - Convert the data to UCI format
37 | python ldac2uci.py
38 | - Compile Dynamic LDA
39 | make
40 | - Run Dynamic Topic Modeling on the UCI dataset
41 | ./fastLDA UCIFormat_data_file iterations NumOfTopics MiniBatchSize Vocab_file GeneratePi
42 | - Get the trend of a word in a topic
43 | python getWordVariation.py TopicId WordId PiFolderPath StartYear EndYear
44 | - Compile Topic Chains GetData, which extracts all topics in the dataset for all time slices
45 | make GetData
46 | - Run GetData for Topic Chains
47 | ./GetData UCIFormat_data_file iterations NumOfTopics MiniBatchSize Vocab_file GeneratePi
48 | - Compile GenerateChains for Topic Chains
49 | make GenerateChains
50 | - Run GenerateChains
51 | ./GenerateChains Pi_folder num_topics WindowSize SimilarityThreshold
52 |
--------------------------------------------------------------------------------
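For reference alongside the README's SCVB0 discussion: the per-token CVB0 update that SCVB0 (Foulds et al., 2013) applies in stochastic, minibatched form looks roughly as follows, where N^Theta, N^Phi and N^Z are the document-topic, word-topic and topic count statistics, w_ij is the i-th token of document j, W is the vocabulary size, and alpha, eta are symmetric Dirichlet hyperparameters. This is a sketch of the update, not a transcription of SCVB0/scvb0.cpp:

    \gamma_{ijk} \propto \frac{N^{\Phi}_{w_{ij}k} + \eta}{N^{Z}_{k} + W\eta} \left( N^{\Theta}_{jk} + \alpha \right)

SCVB0 then maintains these count statistics as online averages updated from minibatches, rather than storing per-token responsibilities.
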
/TopicChains/GenerateChains.cpp:
--------------------------------------------------------------------------------
1 | /*
2 |  * GenerateChains.cpp
3 | *
4 | * Created on: Apr 22, 2014
5 | * Author: vspathak
6 | */
7 |
8 | #include <iostream>
9 | #include <fstream>
10 | #include <sstream>
11 | #include <string>
12 | #include <vector>
13 | #include <cstring>
14 | #include <cstdio>
15 | #include <cstdlib>
16 | #include <cmath>
17 | #include <boost/graph/adjacency_list.hpp>
18 | #include <boost/graph/connected_components.hpp>
24 |
25 | using namespace std;
26 | using namespace boost;
27 |
28 | typedef adjacency_list<vecS, vecS, undirectedS> Graph;
29 |
30 | // Initialize number of documents, topics and words in vocabulary
31 | unsigned int W, D, K;
32 |
33 | double KLDivergence(double*** Pi, int t, int k, double* M) {
34 | double result = 0.0;
35 | for (unsigned int w = 0; w < W; ++w) {
36 | result += log(Pi[t][w][k] / M[w]) * Pi[t][w][k];
37 | }
38 | return result;
39 | }
40 |
41 | double JSsimilarity(double*** Pi, int t1, int k1, int t2, int k2) {
42 | double result = 0.0;
43 | double* M = new double[W];
44 | for (unsigned int w = 0; w < W; ++w) {
45 | M[w] = (Pi[t1][w][k1] + Pi[t2][w][k2]) / 2;
46 | }
47 | 	result = (KLDivergence(Pi, t1, k1, M) + KLDivergence(Pi, t2, k2, M)) / 2;
48 | 	delete[] M;
49 | 	return result;
50 | }
51 |
52 | void generateTopicLinks(Graph &G, double*** Pi, int timeSlice, int topic,
53 | int numTopics, int windowSize, double threshold) {
54 | for (int w = 0; w < windowSize; w++) {
55 | int numLinks = 0;
56 | for (int k = 0; k < numTopics; k++) {
57 | if ((timeSlice - 1 - w >= 0) && JSsimilarity(Pi, timeSlice, topic, timeSlice - 1 - w, k) > threshold) {
58 | //add edge to graph structure here
59 | int e1 = (timeSlice * numTopics) + topic;
60 | int e2 = ((timeSlice - 1 - w) * numTopics) + k;
61 |
62 | cout << "Adding edge " << e1 << ", " << e2 << endl;
63 | add_edge(e1, e2, G);
64 | numLinks++;
65 | }
66 | }
67 | if (numLinks > 0) {
68 | break;
69 | }
70 | }
71 | }
72 |
73 | void generateAllLinks(Graph &G, double*** Pi, int numTimeSlices, int numTopics,
74 | int windowSize, double threshold) {
75 | for (int t = 0; t < numTimeSlices; t++) {
76 | for (int k = 0; k < numTopics; k++) {
77 | generateTopicLinks(G, Pi, t, k, numTopics, windowSize, threshold);
78 | }
79 | }
80 | }
81 |
82 | int main(int argc, char* argv[]) {
83 | 	if (argc < 5) {
84 | 		printf("Usage: ./GenerateChains Pi_folder num_topics WindowSize SimilarityThreshold\n");
85 | return 1;
86 | }
87 | string piFolder = argv[1];
88 | cout << "Input Pi folder: " << piFolder << endl;
89 |
90 | double ***Pi;
91 | int windowSize = 0;
92 | double similarityThreshold = 0;
93 |
94 | ifstream seqfile;
95 | seqfile.open("Data/seqfile.txt");
96 | string newline = "";
97 | 	vector<int>* months = new vector<int>();
98 | 	vector<int>* numOfDocs = new vector<int>();
99 | 	vector<int>* monthFirstIdx = new vector<int>();
100 | 	vector<int>* monthLastIdx = new vector<int>();
101 | int curIdx = 0;
102 |
103 | while (seqfile >> newline) {
104 | const char * ptr = strchr(newline.c_str(), ':');
105 | int count = atoi(ptr + 1);
106 | ptr = "\0";
107 | int yearMonth = atoi(newline.c_str());
108 | months->push_back(yearMonth);
109 | numOfDocs->push_back(count);
110 | monthFirstIdx->push_back(curIdx);
111 | monthLastIdx->push_back(curIdx + count);
112 | curIdx += count;
113 | }
114 | seqfile.close();
115 |
116 | K = atoi(argv[2]);
117 | windowSize = atoi(argv[3]);
118 | similarityThreshold = atof(argv[4]);
119 | W = 32468;
120 |
121 |
122 | printf("Number of topics: %d\n", K);
123 | printf("Window Size: %d\n", windowSize);
124 | printf("Similarity Threshold: %f\n", similarityThreshold);
125 |
126 | // Dynamically allocate Pi
127 | Pi = new double**[months->size()];
128 | for (unsigned int m = 0; m < months->size(); ++m) {
129 | Pi[m] = new double*[W];
130 | for (unsigned int word = 0; word < W; word++) {
131 | Pi[m][word] = new double[K];
132 | for(unsigned int k = 0; k < K; k++) {
133 | Pi[m][word][k] = 0;
134 | }
135 | }
136 | }
137 |
138 | //Read Pi files in Memory
139 | for (int timeSlice = 0; timeSlice < (int)months->size(); timeSlice++) {
140 | string fileName = piFolder + "/topics_" + to_string(months->at(timeSlice)) + ".txt";
141 | cout << "Reading File: " << fileName << endl;
142 | ifstream pifile;
143 | pifile.open(fileName);
144 | int topic = 0;
145 | while (pifile >> newline) {
146 | std::istringstream ss(newline);
147 | std::string token;
148 |
149 | int wordId = 0;
150 | while (std::getline(ss, token, ',')) {
151 | Pi[timeSlice][wordId][topic] = stod(token);
152 | wordId++;
153 | }
154 | topic++;
155 | }
156 | pifile.close();
157 |
158 | } //All timeSlices finished
159 | // for (int timeSlice = 0; timeSlice < (int) months->size(); timeSlice++) {
160 | // for (int k = 0; k < K; k++) {
161 | // for (int w = 0; w < W; w++) {
162 | // cout << Pi[timeSlice][w][k] << ",";
163 | // }
164 | // cout << endl;
165 | // }
166 | // }
167 |
168 | // MAKE CHAINS
169 | Graph G;
170 | //K is unsigned -- is this a problem?
171 | // generateAllLinks(G, Pi, months->size(), K, windowSize, similarityThreshold);
172 | generateAllLinks(G, Pi, 10, K, windowSize, similarityThreshold);
173 |
174 | 	vector<int> component(num_vertices(G));
175 | int num = connected_components(G, &component[0]);
176 |
177 | 	vector<int>::size_type p;
178 | cout << "Total number of components: " << num << endl;
179 | for (p = 0; p != component.size(); ++p) {
180 | cout << "Vertex " << p << " is in component " << component[p] << endl;
181 | }
182 |
183 | return (0);
184 |
185 | } // End of main
186 |
--------------------------------------------------------------------------------
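The KLDivergence and JSsimilarity functions in GenerateChains.cpp above compute the Jensen-Shannon divergence between two topic-word distributions, which is then compared against SimilarityThreshold to decide whether topics in nearby time slices are linked into a chain. A minimal Python sketch of the same quantity, for two topics given as plain probability lists p and q (toy inputs, not tied to the Pi file parsing above):

    import math

    def js_divergence(p, q):
        # Mixture distribution M = (P + Q) / 2
        m = [(pi + qi) / 2.0 for pi, qi in zip(p, q)]

        # KL(A || M); zero-probability entries contribute nothing
        def kl(a):
            return sum(ai * math.log(ai / mi) for ai, mi in zip(a, m) if ai > 0)

        return (kl(p) + kl(q)) / 2.0

    # Example with two toy 4-word topics
    print(js_divergence([0.7, 0.1, 0.1, 0.1], [0.1, 0.1, 0.1, 0.7]))
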
/Scrapper/text2ldac.py:
--------------------------------------------------------------------------------
1 | #This file is part of text2ldac.
2 |
3 | #text2ldac is free software: you can redistribute it and/or modify
4 | #it under the terms of the GNU General Public License as published by
5 | #the Free Software Foundation, either version 3 of the License, or
6 | #(at your option) any later version.
7 |
8 | #text2ldac is distributed in the hope that it will be useful,
9 | #but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | #GNU General Public License for more details.
12 |
13 | #You should have received a copy of the GNU General Public License
14 | #along with text2ldac. If not, see <http://www.gnu.org/licenses/>.
15 |
16 |
17 |
18 | import argparse
19 | import codecs
20 | import os
21 | import operator
22 | import string
23 | import sys
24 |
25 | __doc = \
26 | '''
27 | This is a program to convert documents into the file format used by David
28 | Blei's lda-c (and hlda-c) implementation. It generates the .dat, .vocab and
29 | .dmap files from .txt files in a given directory.
30 |
31 | cf. http://www.cs.princeton.edu/~blei/lda-c/readme.txt
32 | '''
33 | __author__ = 'Johannes Knopp '
34 |
35 | def init_parser():
36 | '''
37 | Returns an argument parser configured with options for this program
38 | '''
39 | parser = argparse.ArgumentParser(
40 |         description='A program to convert documents to .dat, .vocab and .dmap files'
41 | )
42 |
43 | #positional argument
44 | parser.add_argument('dirname', action='store',
45 | help='directory containing .txt files (files must be encoded in utf-8)')
46 |
47 | #options
48 | parser.add_argument('-o', '--output', action='store', dest='outdir',
49 | help='directory to store the resulting files')
50 | parser.add_argument('-e', '--extension', action='store', dest='extension',
51 | default='.txt',
52 | help='extension of the files you are looking for. Default: %(default)s')
53 | #TODO minoccurrence should work for the overall occurrence
54 | parser.add_argument('--minoccurrence', action='store',
55 | dest='minoccurrence', type=int, default=1,
56 | help='Minimum occurrences a word needs at least once in one document to be taken into account.')
57 | parser.add_argument('--minlength', action='store',
58 | dest='minlength', type=int, default=1,
59 | help='Minimum length a word needs to be taken into account.')
60 | #stopwords
61 | parser.add_argument('--stopwords', action='store', dest='stopword_file',
62 | help='Remove the stopwords given in the stopword file (one line per stopword).')
63 |
64 | #TODO
65 | parser.add_argument('--mallet', action='store_true',
66 | help='convert data that exists in the format used by mallet. NOT SUPPORTED YET')
67 |
68 | return parser.parse_args()
69 |
70 |
71 | def get_filenames(directory, extension):
72 | '''
73 | Search for files in the directory ending in EXTENSION and return the full
74 | paths as a list.
75 | '''
76 | all_fnames = []
77 | for dirpath,dirnames,filenames in os.walk(directory):
78 | all_fnames += [os.path.join(dirpath,f) for f in filenames if
79 | f.endswith(extension)]
80 | return all_fnames
81 |
82 |
83 | def clean_word(word):
84 | '''
85 | returns the word in lowercase without punctuation at the start or end
86 | '''
87 | return word.rstrip(string.punctuation).lstrip(string.punctuation).lower()
88 |
89 | def load_stopwords(stopword_filename):
90 | '''
91 | returns a set of stopwords found line by line in the stopwords file
92 | '''
93 | stopwords = set()
94 |
95 | with codecs.open(stopword_filename, 'r', 'utf-8') as sf:
96 | for line in sf:
97 | if len(line.split()) != 1:
98 | print('ignoring line with more than one stopword:\n"{0}"'.format(
99 | line))
100 | continue
101 | stopwords.add(line.strip())
102 |
103 | return stopwords
104 |
105 | def write_document_map_file(fnames, dmap_fname):
106 | """
107 | Save document's names in the order they were processed
108 | """
109 | with codecs.open(dmap_fname,'w','utf-8') as d_file:
110 | for title in fnames:
111 | d_file.write(title + '\n')
112 |
113 | def reindex(word_id_dict, min_index):
114 | """
115 | re-index the word_id for word_id pairs to guarantee that the max
116 | index of the word matches/reflects number of words in word dict
117 | """
118 | num_word_shifts = 0
119 | for word in word_id_dict:
120 | cur_index = word_id_dict[word]
121 |
122 | if cur_index > min_index:
123 | word_id_dict[word] = min_index + num_word_shifts
124 | num_word_shifts += 1
125 |
126 | def generate_dat_lines_and_word_ids(fnames, config):
127 | dat_lines = [] #.dat file output
128 | word_id_dict = dict()
129 | used_docs = [] #needed to generate .dmap file
130 |
131 | for docname in fnames:
132 | freq_dict = dict()
133 | new_words = set()
134 |
135 | try:
136 | with codecs.open(docname, 'r', 'utf-8') as doc:
137 | for line in doc:
138 | for word in line.split():
139 | word = clean_word(word)
140 |
141 | if len(word) < config['minlength'] or word in config['stopwords']:
142 | continue
143 |
144 | #word occurrs for the first time
145 | if not word_id_dict.has_key(word):
146 | freq_dict[word] = 1
147 | word_id_dict[word] = len(word_id_dict)
148 | new_words.add(word)
149 | #word may be in word_id_dict but not yet in freq_dict
150 | else:
151 | freq = freq_dict.setdefault(word, 0)
152 | freq_dict[word] = freq + 1
153 | except UnicodeDecodeError as u_error:
154 | print('Document "{0}" has encoding errors and is ignored!\n{1}'.format(
155 | docname, u_error))
156 |
157 |
158 | if len(freq_dict)==0: #did the document contribute anything?
159 | print('Document "{0}" (#{1}) seems to be empty and is ignored!'.format(
160 | docname,fnames.index(docname)))
161 | continue
162 | else:
163 | used_docs.append(docname)
164 |
165 | #remove words that do not reach minoccurrence
166 | remove_list = [word for word in freq_dict.iterkeys() if\
167 | freq_dict[word] < config['minoccurrence']]
168 | #smallest index of a word that is removed
169 | remove_word_min_index = len(word_id_dict)
170 |
171 | for word in remove_list:
172 | freq_dict.pop(word)
173 | #if they are new also remove them from word_id_dict
174 | if word in new_words:
175 | word_index = word_id_dict[word]
176 | if word_index < remove_word_min_index:
177 | remove_word_min_index = word_index
178 | word_id_dict.pop(word)
179 | reindex(word_id_dict, remove_word_min_index)
180 |
181 | dat_line = '' #line for the .dat file
182 |
183 | for word in freq_dict.iterkeys():
184 | dat_line += str(word_id_dict[word]) + ':' + str(freq_dict[word]) + ' '
185 |
186 | #last blank in dat_line is removed
187 | dat_lines.append(str(len(freq_dict)) + ' ' + dat_line[:-1] + '\n')
188 |
189 | write_document_map_file(used_docs, config['dmapname'])
190 |
191 | return dat_lines, word_id_dict
192 |
193 |
194 | def generate_dat_and_vocab_files(fnames, config):
195 |
196 | with codecs.open(config['datname'], 'w', 'utf-8') as datfile:
197 | dat_lines, word_id_dict = generate_dat_lines_and_word_ids(fnames,
198 | config)
199 | datfile.writelines(dat_lines)
200 |
201 | #sort word_id_dict ascending by value und write the words in that
202 | #order to a .vocab file
203 | with codecs.open(config['vocabname'], 'w', 'utf-8') as vocabfile:
204 | for item in sorted(word_id_dict.iteritems(), key=operator.itemgetter(1)):
205 | vocabfile.write(item[0]+'\n')
206 |
207 | print('Found {0} unique words in {1} files.'.format(
208 | len(word_id_dict), len(fnames)))
209 | print('Results can be found in "{0}" and "{1}"'.format(
210 | config['datname'], config['vocabname']))
211 |
212 |
213 | if __name__=='__main__':
214 |
215 | parser = init_parser()
216 |
217 | #directory with document files
218 | dirname = parser.dirname
219 | dirname = dirname + os.sep if not dirname.endswith(os.sep) else dirname
220 | #directory for results
221 | outdir_name = parser.outdir if parser.outdir else dirname
222 | outdir_name = outdir_name + os.sep if not outdir_name.endswith(os.sep) else outdir_name
223 | #prefix of the .dat and .vocab files
224 | basename = os.path.dirname(dirname).split('/')[-1]
225 |
226 |
227 | if not os.path.exists(outdir_name):
228 | os.mkdir(outdir_name)
229 |
230 | #store configuration
231 | config = dict()
232 | config['datname'] = outdir_name + basename + '.dat'
233 | config['vocabname'] = outdir_name + basename + '.vocab'
234 | config['dmapname'] = outdir_name + basename + '.dmap'
235 | config['minlength'] = parser.minlength
236 | config['minoccurrence'] = parser.minoccurrence
237 | if parser.stopword_file:
238 | config['stopwords'] = load_stopwords(parser.stopword_file)
239 | else:
240 | config['stopwords'] = set()
241 |
242 | fnames = get_filenames(dirname, parser.extension)
243 |
244 | try:
245 | generate_dat_and_vocab_files(fnames, config)
246 | except IOError as ioe:
247 | print(ioe)
248 | sys.exit(1)
249 |
--------------------------------------------------------------------------------
/Scrapper/multitext2ldac.py:
--------------------------------------------------------------------------------
1 | #This file is part of text2ldac.
2 |
3 | #text2ldac is free software: you can redistribute it and/or modify
4 | #it under the terms of the GNU General Public License as published by
5 | #the Free Software Foundation, either version 3 of the License, or
6 | #(at your option) any later version.
7 |
8 | #text2ldac is distributed in the hope that it will be useful,
9 | #but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | #GNU General Public License for more details.
12 |
13 | #You should have received a copy of the GNU General Public License
14 | #along with text2ldac. If not, see <http://www.gnu.org/licenses/>.
15 |
16 |
17 |
18 | import argparse
19 | import codecs
20 | import os
21 | import operator
22 | import string
23 | import sys
24 |
25 | __doc = \
26 | '''
27 | This is a program to convert documents into the file format used by David
28 | Blei's lda-c (and hlda-c) implementation. It generates the .dat, .vocab and
29 | .dmap files from .txt files in a given directory.
30 |
31 | cf. http://www.cs.princeton.edu/~blei/lda-c/readme.txt
32 | '''
33 | __author__ = 'Johannes Knopp '
34 |
35 | def init_parser():
36 | '''
37 | Returns an argument parser configured with options for this program
38 | '''
39 | parser = argparse.ArgumentParser(
40 |         description='A program to convert documents to .dat, .vocab and .dmap files'
41 | )
42 |
43 | #positional argument
44 | parser.add_argument('dirname', action='store',
45 | help='directory containing .txt files (files must be encoded in utf-8)')
46 |
47 | #options
48 | parser.add_argument('-o', '--output', action='store', dest='outdir',
49 | help='directory to store the resulting files')
50 | parser.add_argument('-e', '--extension', action='store', dest='extension',
51 | default='.txt',
52 | help='extension of the files you are looking for. Default: %(default)s')
53 | #TODO minoccurrence should work for the overall occurrence
54 | parser.add_argument('--minoccurrence', action='store',
55 | dest='minoccurrence', type=int, default=1,
56 | help='Minimum occurrences a word needs at least once in one document to be taken into account.')
57 | parser.add_argument('--minlength', action='store',
58 | dest='minlength', type=int, default=1,
59 | help='Minimum length a word needs to be taken into account.')
60 | #stopwords
61 | parser.add_argument('--stopwords', action='store', dest='stopword_file',
62 | help='Remove the stopwords given in the stopword file (one line per stopword).')
63 |
64 | #TODO
65 | parser.add_argument('--mallet', action='store_true',
66 | help='convert data that exists in the format used by mallet. NOT SUPPORTED YET')
67 |
68 | return parser.parse_args()
69 |
70 |
71 | def get_filenames(directory, extension):
72 | '''
73 | Search for files in the directory ending in EXTENSION and return the full
74 | paths as a list.
75 | '''
76 | all_fnames = []
77 | for dirpath,dirnames,filenames in os.walk(directory):
78 | all_fnames += [os.path.join(dirpath,f) for f in filenames if
79 | f.endswith(extension)]
80 | return all_fnames
81 |
82 |
83 | def clean_word(word):
84 | '''
85 | returns the word in lowercase without punctuation at the start or end
86 | '''
87 | return word.rstrip(string.punctuation).lstrip(string.punctuation).lower()
88 |
89 | def load_stopwords(stopword_filename):
90 | '''
91 | returns a set of stopwords found line by line in the stopwords file
92 | '''
93 | stopwords = set()
94 |
95 | with codecs.open(stopword_filename, 'r', 'utf-8') as sf:
96 | for line in sf:
97 | if len(line.split()) != 1:
98 | print('ignoring line with more than one stopword:\n"{0}"'.format(
99 | line))
100 | continue
101 | stopwords.add(line.strip())
102 |
103 | return stopwords
104 |
105 | def write_document_map_file(fnames, dmap_fname):
106 | """
107 | Save document's names in the order they were processed
108 | """
109 | with codecs.open(dmap_fname,'w','utf-8') as d_file:
110 | for title in fnames:
111 | d_file.write(title + '\n')
112 |
113 | def reindex(word_id_dict, min_index):
114 | """
115 | re-index the word_id for word_id pairs to guarantee that the max
116 | index of the word matches/reflects number of words in word dict
117 | """
118 | num_word_shifts = 0
119 | for word in word_id_dict:
120 | cur_index = word_id_dict[word]
121 |
122 | if cur_index > min_index:
123 | word_id_dict[word] = min_index + num_word_shifts
124 | num_word_shifts += 1
125 |
126 | def generate_dat_lines_and_word_ids(fnames, config):
127 | dat_lines = [] #.dat file output
128 | word_id_dict = dict()
129 | used_docs = [] #needed to generate .dmap file
130 |
131 | for dayname in fnames:
132 | print dayname
133 | try:
134 | with codecs.open(dayname, 'r', 'utf-8') as day:
135 | for line in day:
136 | freq_dict = dict()
137 | new_words = set()
138 |
139 | for word in line.split():
140 | word = clean_word(word)
141 |
142 | if len(word) < config['minlength'] or word in config['stopwords']:
143 | continue
144 |
145 | #word occurrs for the first time
146 | if not word_id_dict.has_key(word):
147 | freq_dict[word] = 1
148 | word_id_dict[word] = len(word_id_dict)
149 | new_words.add(word)
150 | #word may be in word_id_dict but not yet in freq_dict
151 | else:
152 | freq = freq_dict.setdefault(word, 0)
153 | freq_dict[word] = freq + 1
154 |
155 | if len(freq_dict)==0: #did the document contribute anything?
156 | print('Document "{0}" (#{1}) seems to be empty and is ignored!'.format(
157 |                             dayname,fnames.index(dayname)))
158 | continue
159 | else:
160 | used_docs.append(dayname)
161 |
162 | #remove words that do not reach minoccurrence
163 | remove_list = [word for word in freq_dict.iterkeys() if\
164 | freq_dict[word] < config['minoccurrence']]
165 | #smallest index of a word that is removed
166 | remove_word_min_index = len(word_id_dict)
167 |
168 | for word in remove_list:
169 | freq_dict.pop(word)
170 | #if they are new also remove them from word_id_dict
171 | if word in new_words:
172 | word_index = word_id_dict[word]
173 | if word_index < remove_word_min_index:
174 | remove_word_min_index = word_index
175 | word_id_dict.pop(word)
176 | reindex(word_id_dict, remove_word_min_index)
177 |
178 | dat_line = '' #line for the .dat file
179 |
180 | for word in freq_dict.iterkeys():
181 | dat_line += str(word_id_dict[word]) + ':' + str(freq_dict[word]) + ' '
182 |
183 | #last blank in dat_line is removed
184 | dat_lines.append(str(len(freq_dict)) + ' ' + dat_line[:-1] + '\n')
185 |
186 | except UnicodeDecodeError as u_error:
187 | print('Document "{0}" has encoding errors and is ignored!\n{1}'.format(dayname, u_error))
188 |
189 | #write_document_map_file(used_docs, config['dmapname'])
190 |
191 | return dat_lines, word_id_dict
192 |
193 | def generate_dat_and_vocab_files(fnames, config):
194 |
195 | with codecs.open(config['datname'], 'w', 'utf-8') as datfile:
196 | dat_lines, word_id_dict = generate_dat_lines_and_word_ids(fnames,
197 | config)
198 | datfile.writelines(dat_lines)
199 |
200 | #sort word_id_dict ascending by value und write the words in that
201 | #order to a .vocab file
202 | with codecs.open(config['vocabname'], 'w', 'utf-8') as vocabfile:
203 | for item in sorted(word_id_dict.iteritems(), key=operator.itemgetter(1)):
204 | vocabfile.write(item[0]+'\n')
205 |
206 | print('Found {0} unique words in {1} files.'.format(
207 | len(word_id_dict), len(fnames)))
208 | print('Results can be found in "{0}" and "{1}"'.format(
209 | config['datname'], config['vocabname']))
210 |
211 |
212 | if __name__=='__main__':
213 |
214 | parser = init_parser()
215 |
216 | #directory with document files
217 | dirname = parser.dirname
218 | dirname = dirname + os.sep if not dirname.endswith(os.sep) else dirname
219 | #directory for results
220 | outdir_name = parser.outdir if parser.outdir else dirname
221 | outdir_name = outdir_name + os.sep if not outdir_name.endswith(os.sep) else outdir_name
222 | #prefix of the .dat and .vocab files
223 | basename = os.path.dirname(dirname).split('/')[-1]
224 |
225 |
226 | if not os.path.exists(outdir_name):
227 | os.mkdir(outdir_name)
228 |
229 | #store configuration
230 | config = dict()
231 | config['datname'] = outdir_name + basename + '.dat'
232 | config['vocabname'] = outdir_name + basename + '.vocab'
233 | config['dmapname'] = outdir_name + basename + '.dmap'
234 | config['minlength'] = parser.minlength
235 | config['minoccurrence'] = parser.minoccurrence
236 | if parser.stopword_file:
237 | config['stopwords'] = load_stopwords(parser.stopword_file)
238 | else:
239 | config['stopwords'] = set()
240 |
241 | fnames = get_filenames(dirname, parser.extension)
242 |
243 | try:
244 | generate_dat_and_vocab_files(fnames, config)
245 | except IOError as ioe:
246 | print(ioe)
247 | sys.exit(1)
248 |
--------------------------------------------------------------------------------
/SCVB0_Evaluation/scvb.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * scvb.cpp
3 | *
4 | * Created on: May 3, 2014
5 | * Author: vspathak
6 | */
7 |
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include
14 | #include
15 | #include
16 | #include
17 | #include
18 | #include
19 | #include