├── .gitignore
├── Scrapper
│   ├── article.py
│   ├── ldac2uci.py
│   ├── removeInfrequentWords.py
│   ├── getarticle.py
│   ├── __init__.py
│   ├── text2ldac.py
│   └── multitext2ldac.py
├── Evaluation
│   ├── getFinalPerplexities.py
│   ├── WordTrends.py
│   ├── getWordVariation.py
│   └── lookupWords.py
├── makefile
├── README.md
├── TopicChains
│   ├── GenerateChains.cpp
│   ├── GetData.cpp
│   └── TopicChains.cpp
├── SCVB0_Evaluation
│   └── scvb.cpp
└── SCVB0
    └── scvb0.cpp
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled source #
2 | ###################
3 | *.o
4 | *.pyc
5 | #Data
6 | *.txt
7 | /Data
8 | #eclipse files
9 | .settings/
10 | *~
11 | .cproject
12 | /Debug
13 |
--------------------------------------------------------------------------------
/Scrapper/article.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Mar 29, 2014
3 |
4 | @author: vspathak
5 | '''
6 |
7 | class article(object):
8 | def __init__(self, title, date, text, url, ID):
9 | self.Title = title
10 | self.Date = date
11 | self.Text = text
12 | self.URL = url
13 | self.id = ID
14 |
15 |
--------------------------------------------------------------------------------
/Scrapper/ldac2uci.py:
--------------------------------------------------------------------------------
1 | infile = open('jan.dat')
2 | outfile = open('JanUCI.txt', 'w')
3 |
4 | doclines = infile.readlines()
5 | infile.close()
6 |
7 | for i in range(len(doclines)):
8 | line = doclines[i].strip().split(' ')[1:]
9 | for elt in line:
10 | pieces = elt.split(':')
11 | outfile.write(str(i + 1) + ' ' + str(int(pieces[0]) + 1) + ' ' + pieces[1] + '\n')
12 |
13 | outfile.close()
14 |
--------------------------------------------------------------------------------
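Note on the conversion performed by ldac2uci.py above: each line of an LDA-C .dat file such as jan.dat has the form "count wordId:freq wordId:freq ...", and the script rewrites it as UCI-style "docId wordId count" triples with 1-based document and word indices. For example, if the first line of jan.dat were "3 0:2 5:1 7:4" (an illustrative line, not taken from the actual data), the script would emit:

    1 1 2
    1 6 1
    1 8 4
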
/Evaluation/getFinalPerplexities.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | if len(sys.argv) >= 3:
5 | perpfilename = sys.argv[1]
6 | num_iter = int(sys.argv[2])
7 |
8 | perpfile = open(perpfilename)
9 |
10 | finalperps = [perp for perp in perpfile.readlines() if perp[1] != ',' and int(perp[:2]) == num_iter]
11 |
12 | # for perp in finalperps:
13 | # print perp.split(',')[1]
14 |
15 | #print len(finalperps)
16 | for i in range(len(finalperps)):
17 | if i % 3 == 0:
18 | print i/3, finalperps[i].split(',')[1]
19 |
--------------------------------------------------------------------------------
/Evaluation/WordTrends.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | # Collect the top-ten word IDs for every topic (comma-separated per line)
4 | # into a single set of words to track.
5 | topTenFile = open('/home/vspathak/git/DynamicLDA/TopTen.txt', 'r')
6 | topSet = set()
7 | for line in topTenFile:
8 |     topSet.update(w for w in line.strip().split(',') if w)
9 | topTenFile.close()
10 |
11 | wordEvolution = open('/home/vspathak/git/DynamicLDA/WordEvolution.txt', 'w')
12 |
13 | piDir = '/home/vspathak/git/DynamicLDA/Scrapper/Pi'
14 | files = os.listdir(piDir)
15 |
16 | # For every time slice (one Pi file per month), write out the probability of
17 | # each tracked word in each topic. Assumes TopTen.txt holds word IDs that
18 | # index into the comma-separated probabilities of a topic line.
19 | for fileName in sorted(files):
20 |     for topic in open(os.path.join(piDir, fileName), 'r'):
21 |         topic = topic.strip()
22 |         wordProb = topic.split(',')
23 |         for word in sorted(topSet):
24 |             wordEvolution.write(wordProb[int(word)] + ',')
25 |         wordEvolution.write('\n')
26 | wordEvolution.close()
27 |
--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
1 | all: SCVB0/scvb0.cpp
2 | g++ -g -std=c++0x -fopenmp SCVB0/scvb0.cpp -o fastLDA
3 |
4 | GenerateChains: TopicChains/GenerateChains.cpp
5 | g++ -g -std=c++0x -fopenmp TopicChains/GenerateChains.cpp -o GenerateChains
6 |
7 | GetData: TopicChains/GetData.cpp
8 | g++ -g -std=c++0x -fopenmp TopicChains/GetData.cpp -o GetData
9 |
10 | serial: SCVB0/scvb0.cpp
11 | g++ -g -std=c++0x SCVB0/scvb0.cpp -o fastLDA
12 |
13 | scvb: SCVB0_Evaluation/scvb.cpp
14 | g++ -g -std=c++0x -fopenmp SCVB0_Evaluation/scvb.cpp -o scvb
15 |
16 | clean:
17 | rm -f *.o fastLDA GetData GenerateChains
18 |
--------------------------------------------------------------------------------
/Scrapper/removeInfrequentWords.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
4 | vocabDict = {}
5 |
6 | files = os.listdir('/home/vspathak/git/DynamicLDA/Scrapper/output')
7 |
8 | for fileName in files:
9 | for line in open('output/'+fileName, 'r'):
10 | line = line.strip()
11 | words = line.split()
12 | for key in words:
13 | if key in vocabDict:
14 | vocabDict[key] += 1
15 | else:
16 | vocabDict[key] = 1
17 |
18 | stopwordsFile = open('stopwords', 'a')
19 | for term in vocabDict:
20 | if vocabDict[term] < 26:
21 | # print term
22 | stopwordsFile.write(term.encode('utf-8') + '\n')
23 | if vocabDict[term] > 100000:
24 | print term
25 | stopwordsFile.close()
26 |
--------------------------------------------------------------------------------
/Scrapper/getarticle.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from lxml import html
3 | import requests
4 | # http://www.reuters.com/article/2012/12/02/us-space-france-russia-idUSBRE8B101L20121202
5 | # http://www.reuters.com/article/2007/01/02/music-jazz-chicago-dc-idUSN2927338620070102
6 | # http://www.reuters.com/article/2014/03/28/us-microsoft-office-ipad-idUSBREA2Q1MV20140328
7 | page = requests.get('http://www.reuters.com/article/2014/01/02/walmart-china-idUSL3N0KC0LH20140102')
8 | tree = html.fromstring(page.text)
9 |
10 | # This will create a list of article URLs:
11 | # URL = tree.xpath('//div[@class="headlineMed"]/a/@href')
12 | # Title = tree.xpath('//div[@class="headlineMed"]/a/text()'
13 | Title = tree.xpath('//*[@id="content"]/div[4]/div/div[3]/div[1]/h1/text()')
14 |
15 | Location = tree.xpath('//*[@id="articleInfo"]/p[2]/span[1]/text()')
16 |
17 | Paragraphs = tree.xpath('//*[@id="articleText"]/p/text()')
18 |
19 | print 'Paragraphs: ', Paragraphs
20 | print 'Location: ' , Location
21 | print 'Title:' , Title
22 |
--------------------------------------------------------------------------------
/Evaluation/getWordVariation.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | # Print the probability of word_id in topic_id for every monthly Pi file
5 | # (topics_YYYYMM.txt) between startYear and endYear.
6 | if len(sys.argv) >= 6:
7 |     topic_id = int(sys.argv[1])
8 |     word_id = int(sys.argv[2])
9 |     filepath = sys.argv[3]
10 |     startYear = int(sys.argv[4])
11 |     endYear = int(sys.argv[5])
12 | else:
13 |     print 'usage: python getWordVariation.py TopicId WordId PiFolderPath StartYear EndYear'
14 |     sys.exit(0)
15 |
16 | for year in range(startYear, endYear + 1):
17 |     for month in range(1, 13):
18 |         fileName = filepath + '/topics_' + str(year) + str(month).zfill(2) + '.txt'
19 |         if os.path.isfile(fileName):
20 |             # print 'reading file: ' + fileName
21 |             monthFile = open(fileName, 'r')
22 |             lines = monthFile.readlines()
23 |             monthFile.close()
24 |             topic = lines[topic_id]
25 |             prob = float(topic.split(',')[word_id])
26 |             print prob
27 |
--------------------------------------------------------------------------------
/Evaluation/lookupWords.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 |
5 | def buildtriple(idprob,vocab):
6 | # if (len(idprob) == 0):
7 | # print 'working'
8 | word_Prob = idprob.split(':')
9 | word = vocab[int(word_Prob[0])]
10 | return (word, word_Prob[0], word_Prob[1])
11 |
12 | #input: directory of input files, directory for output files, vocab file
13 | if len(sys.argv) >= 4:
14 | infolder = sys.argv[1]
15 | outfolder = sys.argv[2]
16 | vocabfilename = sys.argv[3]
17 |
18 | vocabfile = open(vocabfilename, 'r')
19 | vocab = [word.strip() for word in vocabfile.readlines()]
20 | vocabfile.close()
21 |
22 | #iterate through files of infolder
23 | for filename in os.listdir(infolder):
24 | infile = open(infolder + '/' + filename)
25 | outfile = open(outfolder + '/' + filename, 'w')
26 | for topic in infile.readlines():
27 | temp = topic[:-2].split(',')
28 | word_Prob = temp[0].split(':')
29 | if not (word_Prob[0].isdigit()):
30 | continue
31 | outline = [buildtriple(idprob,vocab) for idprob in topic[:-2].split(',')]
32 | map(lambda x: outfile.write(str(x)), outline)
33 | outfile.write('\n')
34 | outfile.close()
35 | infile.close()
36 |
--------------------------------------------------------------------------------
/Scrapper/__init__.py:
--------------------------------------------------------------------------------
1 | from lxml import html
2 | import requests
3 | import article as ac
4 | import sys
5 | import random
6 | import nltk
7 | import re
8 | from stemming.porter2 import stem
9 |
10 |
11 | docId = 0
12 | if len(sys.argv) >= 4:
13 | theyear = int(sys.argv[1])
14 | firstmonth = int(sys.argv[2])
15 | num_months = int(sys.argv[3])
16 | seqfile = open('seq-' + str(theyear) + '-' + str(firstmonth) + '-' + str(num_months) + '.txt', 'w')
17 |
18 | else:
19 | print 'usage: python __init__.py year firstmonth num_months'
20 | sys.exit(0)
21 |
22 | for yr in range(theyear, theyear + 1):
23 | year = 'http://www.reuters.com/resources/archive/us/' + str(yr)
24 | for mnth in range(firstmonth, firstmonth + num_months):
25 | if(mnth < 10):
26 | month = '0' + str(mnth)
27 | else:
28 | month = str(mnth)
29 |
30 | monthdocs = 0
31 | for day in range(1, 32):
32 | if(day < 10):
33 | URL = year + month + '0' + str(day) + '.html'
34 | else:
35 | URL = year + month + str(day) + '.html'
36 |
37 | page = requests.get(URL)
38 | tree = html.fromstring(page.text)
39 | URLs = tree.xpath('//div[@class="headlineMed"]/a/@href')
40 | date = URL[-13:-5]
41 |
42 | f = open('output/' + str(date) + '.txt', 'w')
43 | # generate the random vector(python generate a sample without
44 | # replacement from a range of numbers)
45 |
46 | for num in random.sample(range(0, len(URLs)), int(len(URLs))):
47 | doc = ac.article('', date, '', URLs[num], -1)
48 | curpage = requests.get(doc.URL)
49 | curtree = html.fromstring(curpage.text)
50 | Title = curtree.xpath('//*[@id="content"]/div[4]/div/div[3]/div[1]/h1/text()')
51 | Paragraphs = curtree.xpath('//*[@id="articleText"]/p/text()')
52 | if len(Title) > 0:
53 | doc.Title = Title[0].replace('\"', '')
54 | Paragraphs.append(Title[0])
55 | doc.Text = " ".join(Paragraphs)
56 | doc.Text = doc.Text.replace('\n', ' ')
57 | doc.Text = doc.Text.replace('\"', '')
58 |
59 | if(len(doc.Text.split()) > 100):
60 | docId = docId + 1
61 | doc.id = docId
62 | print doc.id
63 | monthdocs = monthdocs + 1
64 |
65 | docText = re.sub('[^A-Za-z]+', ' ', doc.Text)
66 | docTitle = re.sub('[^A-Za-z]+', ' ', doc.Title)
67 | docText = docTitle + ' ' + docText
68 | docText = docText.lower()
69 | tokens = docText.split()
70 |
71 | docText = " ".join([stem(t) for t in tokens])
72 |
73 | f.write(docText.encode('utf-8') + '\n')
74 |
75 | f.close()
76 | seqfile.write(str(theyear) + '-' + str(mnth) + ':' + str(monthdocs) + '\n')
77 | seqfile.close()
78 |
--------------------------------------------------------------------------------
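Note on the sequence file written by __init__.py above: for each scraped month it appends one line of the form year-month:documentCount to seq-<year>-<firstmonth>-<num_months>.txt, e.g. 2012-1:534 (the count is made up for illustration). GenerateChains.cpp later reads a file of this shape from Data/seqfile.txt, splitting each line at the ':' to recover the month label and its document count.
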
/README.md:
--------------------------------------------------------------------------------
1 | DynamicLDA
2 | ==========
3 |
4 | Dynamic Topic Model of Reuters News Articles between 2007-2013
5 | --------------------------------------------------------------------------------------------
6 | We have implemented a fast version of the Dynamic Topic Model (DTM) proposed by David Blei and John Lafferty in 2006.
7 | This version takes advantage of recent advances in LDA inference: the LDA part of DTM is implemented using SCVB0, proposed by Foulds et al. in 2013, and the implementation is parallelized using OpenMP.
8 | In our evaluation, even the serial version gives a 36x speedup, and the parallel version gives a 53x speedup when run on a 2 GHz Core 2 Duo machine with 2 GB of RAM.
9 | (See the report for a detailed evaluation.)
10 |
11 | Reuters News Dataset Details
12 | ----------------------------
13 | Timestamped news articles published by Reuters between 2007 and 2013: a corpus of 161,989 documents with a vocabulary of 32,468 words after preprocessing. The following preprocessing steps were performed (scripts are available in the Scrapper folder):
14 |
15 | - From the Reuters data we removed all documents shorter than 100 words
16 | - We scraped a random 10% of the articles from each day, purely to keep the corpus size manageable. The assumption is that randomly selected data will not hurt the discovery of long-running, major topics.
17 | - We removed all punctuation marks and performed stemming using the Porter2 stemmer
18 | - We also removed words that occur fewer than 26 times or more than 100,000 times in the corpus
19 | Example run of text2ldac (converts a folder of .txt files to .dat, .vocab and .dmap files): python text2ldac.py data_folder --stopwords stopwords_file
20 |
21 | Topic Chains
22 | ------------
23 | We have investigated Topic Chains, a solution to the topic birth-death problem in dynamic LDA proposed by Kim et al. in 2013.
24 | - We use the same Reuters dataset and the Jensen-Shannon (JS) divergence to measure similarity between topics.
25 | - We evaluate performance at different similarity thresholds and window sizes and find results similar to those reported in the original paper
26 | - We identify some issues in the method and propose solutions (please refer to the report for more details)
27 |
28 | Execution Commands
29 | ------------------
30 | - Scrape data from the Reuters archive website, starting at firstmonth of the given year, for num_months months
31 | python __init__.py year firstmonth num_months
32 | - Get stopwords
33 | python removeInfrequentWords.py
34 | - Convert the text data to the ldac format used by Blei's implementation
35 | python multitext2ldac.py data_folder --stopwords stopwords_file
36 | - Convert the data to UCI format
37 | python ldac2uci.py
38 | - Compile Dynamic LDA
39 | make
40 | - Run Dynamic Topic Modeling on the UCI dataset
41 | ./fastLDA UCIFormat_data_file iterations NumOfTopics MiniBatchSize Vocab_file GeneratePi
42 | - Get the trend of a word in a topic
43 | python getWordVariation.py TopicId WordId PiFolderPath StartYear EndYear
44 | - Compile Topic Chains GetData, which extracts all topics in the dataset for all time slices
45 | make GetData
46 | - Run GetData for Topic Chains
47 | ./GetData UCIFormat_data_file iterations NumOfTopics MiniBatchSize Vocab_file GeneratePi
48 | - Compile GenerateChains for Topic Chains
49 | make GenerateChains
50 | - Run GenerateChains
51 | ./GenerateChains Pi_folder num_topics WindowSize SimilarityThreshold
52 |
--------------------------------------------------------------------------------
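For reference alongside the README's SCVB0 discussion: the per-token CVB0 update that SCVB0 (Foulds et al., 2013) applies in stochastic, minibatched form looks roughly as follows, where N^Theta, N^Phi and N^Z are the document-topic, word-topic and topic count statistics, w_ij is the i-th token of document j, W is the vocabulary size, and alpha, eta are symmetric Dirichlet hyperparameters. This is a sketch of the update, not a transcription of SCVB0/scvb0.cpp:

    \gamma_{ijk} \propto \frac{N^{\Phi}_{w_{ij}k} + \eta}{N^{Z}_{k} + W\eta} \left( N^{\Theta}_{jk} + \alpha \right)

SCVB0 then maintains these count statistics as online averages updated from minibatches, rather than storing per-token responsibilities.
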
/TopicChains/GenerateChains.cpp:
--------------------------------------------------------------------------------
1 | /*
2 |  * GenerateChains.cpp
3 | *
4 | * Created on: Apr 22, 2014
5 | * Author: vspathak
6 | */
7 |
8 | #include <iostream>
9 | #include <fstream>
10 | #include <sstream>
11 | #include <string>
12 | #include <vector>
13 | #include <cstring>
14 | #include <cstdio>
15 | #include <cstdlib>
16 | #include <cmath>
17 | #include <boost/graph/adjacency_list.hpp>
18 | #include <boost/graph/connected_components.hpp>
24 |
25 | using namespace std;
26 | using namespace boost;
27 |
28 | typedef adjacency_list<vecS, vecS, undirectedS> Graph;
29 |
30 | // Initialize number of documents, topics and words in vocabulary
31 | unsigned int W, D, K;
32 |
33 | double KLDivergence(double*** Pi, int t, int k, double* M) {
34 | double result = 0.0;
35 | for (unsigned int w = 0; w < W; ++w) {
36 | result += log(Pi[t][w][k] / M[w]) * Pi[t][w][k];
37 | }
38 | return result;
39 | }
40 |
41 | double JSsimilarity(double*** Pi, int t1, int k1, int t2, int k2) {
42 | double result = 0.0;
43 | double* M = new double[W];
44 | for (unsigned int w = 0; w < W; ++w) {
45 | M[w] = (Pi[t1][w][k1] + Pi[t2][w][k2]) / 2;
46 | }
47 | 	result = (KLDivergence(Pi, t1, k1, M) + KLDivergence(Pi, t2, k2, M)) / 2;
48 | 	delete[] M;
49 | 	return result;
50 | }
51 |
52 | void generateTopicLinks(Graph &G, double*** Pi, int timeSlice, int topic,
53 | int numTopics, int windowSize, double threshold) {
54 | for (int w = 0; w < windowSize; w++) {
55 | int numLinks = 0;
56 | for (int k = 0; k < numTopics; k++) {
57 | if ((timeSlice - 1 - w >= 0) && JSsimilarity(Pi, timeSlice, topic, timeSlice - 1 - w, k) > threshold) {
58 | //add edge to graph structure here
59 | int e1 = (timeSlice * numTopics) + topic;
60 | int e2 = ((timeSlice - 1 - w) * numTopics) + k;
61 |
62 | cout << "Adding edge " << e1 << ", " << e2 << endl;
63 | add_edge(e1, e2, G);
64 | numLinks++;
65 | }
66 | }
67 | if (numLinks > 0) {
68 | break;
69 | }
70 | }
71 | }
72 |
73 | void generateAllLinks(Graph &G, double*** Pi, int numTimeSlices, int numTopics,
74 | int windowSize, double threshold) {
75 | for (int t = 0; t < numTimeSlices; t++) {
76 | for (int k = 0; k < numTopics; k++) {
77 | generateTopicLinks(G, Pi, t, k, numTopics, windowSize, threshold);
78 | }
79 | }
80 | }
81 |
82 | int main(int argc, char* argv[]) {
83 | 	if (argc < 5) {
84 | 		printf("Usage: ./GenerateChains Pi_folder num_topics WindowSize SimilarityThreshold\n");
85 | return 1;
86 | }
87 | string piFolder = argv[1];
88 | cout << "Input Pi folder: " << piFolder << endl;
89 |
90 | double ***Pi;
91 | int windowSize = 0;
92 | double similarityThreshold = 0;
93 |
94 | ifstream seqfile;
95 | seqfile.open("Data/seqfile.txt");
96 | string newline = "";
97 | 	vector<int>* months = new vector<int>();
98 | 	vector<int>* numOfDocs = new vector<int>();
99 | 	vector<int>* monthFirstIdx = new vector<int>();
100 | 	vector<int>* monthLastIdx = new vector<int>();
101 | int curIdx = 0;
102 |
103 | while (seqfile >> newline) {
104 | const char * ptr = strchr(newline.c_str(), ':');
105 | int count = atoi(ptr + 1);
106 | ptr = "\0";
107 | int yearMonth = atoi(newline.c_str());
108 | months->push_back(yearMonth);
109 | numOfDocs->push_back(count);
110 | monthFirstIdx->push_back(curIdx);
111 | monthLastIdx->push_back(curIdx + count);
112 | curIdx += count;
113 | }
114 | seqfile.close();
115 |
116 | K = atoi(argv[2]);
117 | windowSize = atoi(argv[3]);
118 | similarityThreshold = atof(argv[4]);
119 | W = 32468;
120 |
121 |
122 | printf("Number of topics: %d\n", K);
123 | printf("Window Size: %d\n", windowSize);
124 | printf("Similarity Threshold: %f\n", similarityThreshold);
125 |
126 | // Dynamically allocate Pi
127 | Pi = new double**[months->size()];
128 | for (unsigned int m = 0; m < months->size(); ++m) {
129 | Pi[m] = new double*[W];
130 | for (unsigned int word = 0; word < W; word++) {
131 | Pi[m][word] = new double[K];
132 | for(unsigned int k = 0; k < K; k++) {
133 | Pi[m][word][k] = 0;
134 | }
135 | }
136 | }
137 |
138 | //Read Pi files in Memory
139 | for (int timeSlice = 0; timeSlice < (int)months->size(); timeSlice++) {
140 | string fileName = piFolder + "/topics_" + to_string(months->at(timeSlice)) + ".txt";
141 | cout << "Reading File: " << fileName << endl;
142 | ifstream pifile;
143 | pifile.open(fileName);
144 | int topic = 0;
145 | while (pifile >> newline) {
146 | std::istringstream ss(newline);
147 | std::string token;
148 |
149 | int wordId = 0;
150 | while (std::getline(ss, token, ',')) {
151 | Pi[timeSlice][wordId][topic] = stod(token);
152 | wordId++;
153 | }
154 | topic++;
155 | }
156 | pifile.close();
157 |
158 | } //All timeSlices finished
159 | // for (int timeSlice = 0; timeSlice < (int) months->size(); timeSlice++) {
160 | // for (int k = 0; k < K; k++) {
161 | // for (int w = 0; w < W; w++) {
162 | // cout << Pi[timeSlice][w][k] << ",";
163 | // }
164 | // cout << endl;
165 | // }
166 | // }
167 |
168 | // MAKE CHAINS
169 | Graph G;
170 | //K is unsigned -- is this a problem?
171 | // generateAllLinks(G, Pi, months->size(), K, windowSize, similarityThreshold);
172 | generateAllLinks(G, Pi, 10, K, windowSize, similarityThreshold);
173 |
174 | 	vector<int> component(num_vertices(G));
175 | int num = connected_components(G, &component[0]);
176 |
177 | 	vector<int>::size_type p;
178 | cout << "Total number of components: " << num << endl;
179 | for (p = 0; p != component.size(); ++p) {
180 | cout << "Vertex " << p << " is in component " << component[p] << endl;
181 | }
182 |
183 | return (0);
184 |
185 | } // End of main
186 |
--------------------------------------------------------------------------------
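The KLDivergence and JSsimilarity functions in GenerateChains.cpp above compute the Jensen-Shannon divergence between two topic-word distributions, which is then compared against SimilarityThreshold to decide whether topics in nearby time slices are linked into a chain. A minimal Python sketch of the same quantity, for two topics given as plain probability lists p and q (toy inputs, not tied to the Pi file parsing above):

    import math

    def js_divergence(p, q):
        # Mixture distribution M = (P + Q) / 2
        m = [(pi + qi) / 2.0 for pi, qi in zip(p, q)]

        # KL(A || M); zero-probability entries contribute nothing
        def kl(a):
            return sum(ai * math.log(ai / mi) for ai, mi in zip(a, m) if ai > 0)

        return (kl(p) + kl(q)) / 2.0

    # Example with two toy 4-word topics
    print(js_divergence([0.7, 0.1, 0.1, 0.1], [0.1, 0.1, 0.1, 0.7]))
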
/Scrapper/text2ldac.py:
--------------------------------------------------------------------------------
1 | #This file is part of text2ldac.
2 |
3 | #text2ldac is free software: you can redistribute it and/or modify
4 | #it under the terms of the GNU General Public License as published by
5 | #the Free Software Foundation, either version 3 of the License, or
6 | #(at your option) any later version.
7 |
8 | #text2ldac is distributed in the hope that it will be useful,
9 | #but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | #GNU General Public License for more details.
12 |
13 | #You should have received a copy of the GNU General Public License
14 | #along with text2ldac. If not, see <http://www.gnu.org/licenses/>.
15 |
16 |
17 |
18 | import argparse
19 | import codecs
20 | import os
21 | import operator
22 | import string
23 | import sys
24 |
25 | __doc = \
26 | '''
27 | This is a program to convert documents into the file format used by David
28 | Blei's lda-c (and hlda-c) implementation. It generates the .dat, .vocab and
29 | .dmap files from .txt files in a given directory.
30 |
31 | cf. http://www.cs.princeton.edu/~blei/lda-c/readme.txt
32 | '''
33 | __author__ = 'Johannes Knopp '
34 |
35 | def init_parser():
36 | '''
37 | Returns an argument parser configured with options for this program
38 | '''
39 | parser = argparse.ArgumentParser(
40 |         description='A program to convert documents to .dat, .vocab and .dmap files'
41 | )
42 |
43 | #positional argument
44 | parser.add_argument('dirname', action='store',
45 | help='directory containing .txt files (files must be encoded in utf-8)')
46 |
47 | #options
48 | parser.add_argument('-o', '--output', action='store', dest='outdir',
49 | help='directory to store the resulting files')
50 | parser.add_argument('-e', '--extension', action='store', dest='extension',
51 | default='.txt',
52 | help='extension of the files you are looking for. Default: %(default)s')
53 | #TODO minoccurrence should work for the overall occurrence
54 | parser.add_argument('--minoccurrence', action='store',
55 | dest='minoccurrence', type=int, default=1,
56 | help='Minimum occurrences a word needs at least once in one document to be taken into account.')
57 | parser.add_argument('--minlength', action='store',
58 | dest='minlength', type=int, default=1,
59 | help='Minimum length a word needs to be taken into account.')
60 | #stopwords
61 | parser.add_argument('--stopwords', action='store', dest='stopword_file',
62 | help='Remove the stopwords given in the stopword file (one line per stopword).')
63 |
64 | #TODO
65 | parser.add_argument('--mallet', action='store_true',
66 | help='convert data that exists in the format used by mallet. NOT SUPPORTED YET')
67 |
68 | return parser.parse_args()
69 |
70 |
71 | def get_filenames(directory, extension):
72 | '''
73 | Search for files in the directory ending in EXTENSION and return the full
74 | paths as a list.
75 | '''
76 | all_fnames = []
77 | for dirpath,dirnames,filenames in os.walk(directory):
78 | all_fnames += [os.path.join(dirpath,f) for f in filenames if
79 | f.endswith(extension)]
80 | return all_fnames
81 |
82 |
83 | def clean_word(word):
84 | '''
85 | returns the word in lowercase without punctuation at the start or end
86 | '''
87 | return word.rstrip(string.punctuation).lstrip(string.punctuation).lower()
88 |
89 | def load_stopwords(stopword_filename):
90 | '''
91 | returns a set of stopwords found line by line in the stopwords file
92 | '''
93 | stopwords = set()
94 |
95 | with codecs.open(stopword_filename, 'r', 'utf-8') as sf:
96 | for line in sf:
97 | if len(line.split()) != 1:
98 | print('ignoring line with more than one stopword:\n"{0}"'.format(
99 | line))
100 | continue
101 | stopwords.add(line.strip())
102 |
103 | return stopwords
104 |
105 | def write_document_map_file(fnames, dmap_fname):
106 | """
107 | Save document's names in the order they were processed
108 | """
109 | with codecs.open(dmap_fname,'w','utf-8') as d_file:
110 | for title in fnames:
111 | d_file.write(title + '\n')
112 |
113 | def reindex(word_id_dict, min_index):
114 | """
115 | re-index the word_id for word_id pairs to guarantee that the max
116 | index of the word matches/reflects number of words in word dict
117 | """
118 | num_word_shifts = 0
119 | for word in word_id_dict:
120 | cur_index = word_id_dict[word]
121 |
122 | if cur_index > min_index:
123 | word_id_dict[word] = min_index + num_word_shifts
124 | num_word_shifts += 1
125 |
126 | def generate_dat_lines_and_word_ids(fnames, config):
127 | dat_lines = [] #.dat file output
128 | word_id_dict = dict()
129 | used_docs = [] #needed to generate .dmap file
130 |
131 | for docname in fnames:
132 | freq_dict = dict()
133 | new_words = set()
134 |
135 | try:
136 | with codecs.open(docname, 'r', 'utf-8') as doc:
137 | for line in doc:
138 | for word in line.split():
139 | word = clean_word(word)
140 |
141 | if len(word) < config['minlength'] or word in config['stopwords']:
142 | continue
143 |
144 | #word occurrs for the first time
145 | if not word_id_dict.has_key(word):
146 | freq_dict[word] = 1
147 | word_id_dict[word] = len(word_id_dict)
148 | new_words.add(word)
149 | #word may be in word_id_dict but not yet in freq_dict
150 | else:
151 | freq = freq_dict.setdefault(word, 0)
152 | freq_dict[word] = freq + 1
153 | except UnicodeDecodeError as u_error:
154 | print('Document "{0}" has encoding errors and is ignored!\n{1}'.format(
155 | docname, u_error))
156 |
157 |
158 | if len(freq_dict)==0: #did the document contribute anything?
159 | print('Document "{0}" (#{1}) seems to be empty and is ignored!'.format(
160 | docname,fnames.index(docname)))
161 | continue
162 | else:
163 | used_docs.append(docname)
164 |
165 | #remove words that do not reach minoccurrence
166 | remove_list = [word for word in freq_dict.iterkeys() if\
167 | freq_dict[word] < config['minoccurrence']]
168 | #smallest index of a word that is removed
169 | remove_word_min_index = len(word_id_dict)
170 |
171 | for word in remove_list:
172 | freq_dict.pop(word)
173 | #if they are new also remove them from word_id_dict
174 | if word in new_words:
175 | word_index = word_id_dict[word]
176 | if word_index < remove_word_min_index:
177 | remove_word_min_index = word_index
178 | word_id_dict.pop(word)
179 | reindex(word_id_dict, remove_word_min_index)
180 |
181 | dat_line = '' #line for the .dat file
182 |
183 | for word in freq_dict.iterkeys():
184 | dat_line += str(word_id_dict[word]) + ':' + str(freq_dict[word]) + ' '
185 |
186 | #last blank in dat_line is removed
187 | dat_lines.append(str(len(freq_dict)) + ' ' + dat_line[:-1] + '\n')
188 |
189 | write_document_map_file(used_docs, config['dmapname'])
190 |
191 | return dat_lines, word_id_dict
192 |
193 |
194 | def generate_dat_and_vocab_files(fnames, config):
195 |
196 | with codecs.open(config['datname'], 'w', 'utf-8') as datfile:
197 | dat_lines, word_id_dict = generate_dat_lines_and_word_ids(fnames,
198 | config)
199 | datfile.writelines(dat_lines)
200 |
201 | #sort word_id_dict ascending by value und write the words in that
202 | #order to a .vocab file
203 | with codecs.open(config['vocabname'], 'w', 'utf-8') as vocabfile:
204 | for item in sorted(word_id_dict.iteritems(), key=operator.itemgetter(1)):
205 | vocabfile.write(item[0]+'\n')
206 |
207 | print('Found {0} unique words in {1} files.'.format(
208 | len(word_id_dict), len(fnames)))
209 | print('Results can be found in "{0}" and "{1}"'.format(
210 | config['datname'], config['vocabname']))
211 |
212 |
213 | if __name__=='__main__':
214 |
215 | parser = init_parser()
216 |
217 | #directory with document files
218 | dirname = parser.dirname
219 | dirname = dirname + os.sep if not dirname.endswith(os.sep) else dirname
220 | #directory for results
221 | outdir_name = parser.outdir if parser.outdir else dirname
222 | outdir_name = outdir_name + os.sep if not outdir_name.endswith(os.sep) else outdir_name
223 | #prefix of the .dat and .vocab files
224 | basename = os.path.dirname(dirname).split('/')[-1]
225 |
226 |
227 | if not os.path.exists(outdir_name):
228 | os.mkdir(outdir_name)
229 |
230 | #store configuration
231 | config = dict()
232 | config['datname'] = outdir_name + basename + '.dat'
233 | config['vocabname'] = outdir_name + basename + '.vocab'
234 | config['dmapname'] = outdir_name + basename + '.dmap'
235 | config['minlength'] = parser.minlength
236 | config['minoccurrence'] = parser.minoccurrence
237 | if parser.stopword_file:
238 | config['stopwords'] = load_stopwords(parser.stopword_file)
239 | else:
240 | config['stopwords'] = set()
241 |
242 | fnames = get_filenames(dirname, parser.extension)
243 |
244 | try:
245 | generate_dat_and_vocab_files(fnames, config)
246 | except IOError as ioe:
247 | print(ioe)
248 | sys.exit(1)
249 |
--------------------------------------------------------------------------------
/Scrapper/multitext2ldac.py:
--------------------------------------------------------------------------------
1 | #This file is part of text2ldac.
2 |
3 | #text2ldac is free software: you can redistribute it and/or modify
4 | #it under the terms of the GNU General Public License as published by
5 | #the Free Software Foundation, either version 3 of the License, or
6 | #(at your option) any later version.
7 |
8 | #text2ldac is distributed in the hope that it will be useful,
9 | #but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | #GNU General Public License for more details.
12 |
13 | #You should have received a copy of the GNU General Public License
14 | #along with text2ldac. If not, see <http://www.gnu.org/licenses/>.
15 |
16 |
17 |
18 | import argparse
19 | import codecs
20 | import os
21 | import operator
22 | import string
23 | import sys
24 |
25 | __doc = \
26 | '''
27 | This is a program to convert documents into the file format used by David
28 | Blei's lda-c (and hlda-c) implementation. It generates the .dat, .vocab and
29 | .dmap files from .txt files in a given directory.
30 |
31 | cf. http://www.cs.princeton.edu/~blei/lda-c/readme.txt
32 | '''
33 | __author__ = 'Johannes Knopp '
34 |
35 | def init_parser():
36 | '''
37 | Returns an argument parser configured with options for this program
38 | '''
39 | parser = argparse.ArgumentParser(
40 |         description='A program to convert documents to .dat, .vocab and .dmap files'
41 | )
42 |
43 | #positional argument
44 | parser.add_argument('dirname', action='store',
45 | help='directory containing .txt files (files must be encoded in utf-8)')
46 |
47 | #options
48 | parser.add_argument('-o', '--output', action='store', dest='outdir',
49 | help='directory to store the resulting files')
50 | parser.add_argument('-e', '--extension', action='store', dest='extension',
51 | default='.txt',
52 | help='extension of the files you are looking for. Default: %(default)s')
53 | #TODO minoccurrence should work for the overall occurrence
54 | parser.add_argument('--minoccurrence', action='store',
55 | dest='minoccurrence', type=int, default=1,
56 | help='Minimum occurrences a word needs at least once in one document to be taken into account.')
57 | parser.add_argument('--minlength', action='store',
58 | dest='minlength', type=int, default=1,
59 | help='Minimum length a word needs to be taken into account.')
60 | #stopwords
61 | parser.add_argument('--stopwords', action='store', dest='stopword_file',
62 | help='Remove the stopwords given in the stopword file (one line per stopword).')
63 |
64 | #TODO
65 | parser.add_argument('--mallet', action='store_true',
66 | help='convert data that exists in the format used by mallet. NOT SUPPORTED YET')
67 |
68 | return parser.parse_args()
69 |
70 |
71 | def get_filenames(directory, extension):
72 | '''
73 | Search for files in the directory ending in EXTENSION and return the full
74 | paths as a list.
75 | '''
76 | all_fnames = []
77 | for dirpath,dirnames,filenames in os.walk(directory):
78 | all_fnames += [os.path.join(dirpath,f) for f in filenames if
79 | f.endswith(extension)]
80 | return all_fnames
81 |
82 |
83 | def clean_word(word):
84 | '''
85 | returns the word in lowercase without punctuation at the start or end
86 | '''
87 | return word.rstrip(string.punctuation).lstrip(string.punctuation).lower()
88 |
89 | def load_stopwords(stopword_filename):
90 | '''
91 | returns a set of stopwords found line by line in the stopwords file
92 | '''
93 | stopwords = set()
94 |
95 | with codecs.open(stopword_filename, 'r', 'utf-8') as sf:
96 | for line in sf:
97 | if len(line.split()) != 1:
98 | print('ignoring line with more than one stopword:\n"{0}"'.format(
99 | line))
100 | continue
101 | stopwords.add(line.strip())
102 |
103 | return stopwords
104 |
105 | def write_document_map_file(fnames, dmap_fname):
106 | """
107 | Save document's names in the order they were processed
108 | """
109 | with codecs.open(dmap_fname,'w','utf-8') as d_file:
110 | for title in fnames:
111 | d_file.write(title + '\n')
112 |
113 | def reindex(word_id_dict, min_index):
114 | """
115 | re-index the word_id for word_id pairs to guarantee that the max
116 | index of the word matches/reflects number of words in word dict
117 | """
118 | num_word_shifts = 0
119 | for word in word_id_dict:
120 | cur_index = word_id_dict[word]
121 |
122 | if cur_index > min_index:
123 | word_id_dict[word] = min_index + num_word_shifts
124 | num_word_shifts += 1
125 |
126 | def generate_dat_lines_and_word_ids(fnames, config):
127 | dat_lines = [] #.dat file output
128 | word_id_dict = dict()
129 | used_docs = [] #needed to generate .dmap file
130 |
131 | for dayname in fnames:
132 | print dayname
133 | try:
134 | with codecs.open(dayname, 'r', 'utf-8') as day:
135 | for line in day:
136 | freq_dict = dict()
137 | new_words = set()
138 |
139 | for word in line.split():
140 | word = clean_word(word)
141 |
142 | if len(word) < config['minlength'] or word in config['stopwords']:
143 | continue
144 |
145 | #word occurrs for the first time
146 | if not word_id_dict.has_key(word):
147 | freq_dict[word] = 1
148 | word_id_dict[word] = len(word_id_dict)
149 | new_words.add(word)
150 | #word may be in word_id_dict but not yet in freq_dict
151 | else:
152 | freq = freq_dict.setdefault(word, 0)
153 | freq_dict[word] = freq + 1
154 |
155 | if len(freq_dict)==0: #did the document contribute anything?
156 | print('Document "{0}" (#{1}) seems to be empty and is ignored!'.format(
157 |                             dayname,fnames.index(dayname)))
158 | continue
159 | else:
160 | used_docs.append(dayname)
161 |
162 | #remove words that do not reach minoccurrence
163 | remove_list = [word for word in freq_dict.iterkeys() if\
164 | freq_dict[word] < config['minoccurrence']]
165 | #smallest index of a word that is removed
166 | remove_word_min_index = len(word_id_dict)
167 |
168 | for word in remove_list:
169 | freq_dict.pop(word)
170 | #if they are new also remove them from word_id_dict
171 | if word in new_words:
172 | word_index = word_id_dict[word]
173 | if word_index < remove_word_min_index:
174 | remove_word_min_index = word_index
175 | word_id_dict.pop(word)
176 | reindex(word_id_dict, remove_word_min_index)
177 |
178 | dat_line = '' #line for the .dat file
179 |
180 | for word in freq_dict.iterkeys():
181 | dat_line += str(word_id_dict[word]) + ':' + str(freq_dict[word]) + ' '
182 |
183 | #last blank in dat_line is removed
184 | dat_lines.append(str(len(freq_dict)) + ' ' + dat_line[:-1] + '\n')
185 |
186 | except UnicodeDecodeError as u_error:
187 | print('Document "{0}" has encoding errors and is ignored!\n{1}'.format(dayname, u_error))
188 |
189 | #write_document_map_file(used_docs, config['dmapname'])
190 |
191 | return dat_lines, word_id_dict
192 |
193 | def generate_dat_and_vocab_files(fnames, config):
194 |
195 | with codecs.open(config['datname'], 'w', 'utf-8') as datfile:
196 | dat_lines, word_id_dict = generate_dat_lines_and_word_ids(fnames,
197 | config)
198 | datfile.writelines(dat_lines)
199 |
200 | #sort word_id_dict ascending by value und write the words in that
201 | #order to a .vocab file
202 | with codecs.open(config['vocabname'], 'w', 'utf-8') as vocabfile:
203 | for item in sorted(word_id_dict.iteritems(), key=operator.itemgetter(1)):
204 | vocabfile.write(item[0]+'\n')
205 |
206 | print('Found {0} unique words in {1} files.'.format(
207 | len(word_id_dict), len(fnames)))
208 | print('Results can be found in "{0}" and "{1}"'.format(
209 | config['datname'], config['vocabname']))
210 |
211 |
212 | if __name__=='__main__':
213 |
214 | parser = init_parser()
215 |
216 | #directory with document files
217 | dirname = parser.dirname
218 | dirname = dirname + os.sep if not dirname.endswith(os.sep) else dirname
219 | #directory for results
220 | outdir_name = parser.outdir if parser.outdir else dirname
221 | outdir_name = outdir_name + os.sep if not outdir_name.endswith(os.sep) else outdir_name
222 | #prefix of the .dat and .vocab files
223 | basename = os.path.dirname(dirname).split('/')[-1]
224 |
225 |
226 | if not os.path.exists(outdir_name):
227 | os.mkdir(outdir_name)
228 |
229 | #store configuration
230 | config = dict()
231 | config['datname'] = outdir_name + basename + '.dat'
232 | config['vocabname'] = outdir_name + basename + '.vocab'
233 | config['dmapname'] = outdir_name + basename + '.dmap'
234 | config['minlength'] = parser.minlength
235 | config['minoccurrence'] = parser.minoccurrence
236 | if parser.stopword_file:
237 | config['stopwords'] = load_stopwords(parser.stopword_file)
238 | else:
239 | config['stopwords'] = set()
240 |
241 | fnames = get_filenames(dirname, parser.extension)
242 |
243 | try:
244 | generate_dat_and_vocab_files(fnames, config)
245 | except IOError as ioe:
246 | print(ioe)
247 | sys.exit(1)
248 |
--------------------------------------------------------------------------------
/SCVB0_Evaluation/scvb.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * scvb.cpp
3 | *
4 | * Created on: May 3, 2014
5 | * Author: vspathak
6 | */
7 |
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include
14 | #include
15 | #include
16 | #include
17 | #include
18 | #include
19 | #include